一些自动格式更改,添加一些注释等
This commit is contained in:
parent
f588745484
commit
ad177f2165
|
|
@ -1,3 +1,5 @@
|
||||||
|
*.paf
|
||||||
|
*.sam
|
||||||
.cproject
|
.cproject
|
||||||
.project
|
.project
|
||||||
.*.swp
|
.*.swp
|
||||||
|
|
|
||||||
|
|
@ -5,7 +5,7 @@
|
||||||
"version": "0.2.0",
|
"version": "0.2.0",
|
||||||
"configurations": [
|
"configurations": [
|
||||||
{
|
{
|
||||||
"name": "Launch",
|
"name": "read overlap",
|
||||||
"preLaunchTask": "Build",
|
"preLaunchTask": "Build",
|
||||||
"type": "cppdbg",
|
"type": "cppdbg",
|
||||||
"request": "launch",
|
"request": "launch",
|
||||||
|
|
@ -14,13 +14,31 @@
|
||||||
"-x",
|
"-x",
|
||||||
"ava-ont",
|
"ava-ont",
|
||||||
"-t",
|
"-t",
|
||||||
"1",
|
"4",
|
||||||
"/public/home/zzh/work/3gseq/TGM-2021YFF/Acinetobacter_pittii.fastq",
|
"/public/home/zzh/work/3gseq/TGM-2021YFF/Acinetobacter_pittii.fastq",
|
||||||
"/public/home/zzh/work/3gseq/TGM-2021YFF/Acinetobacter_pittii.fastq",
|
"/public/home/zzh/work/3gseq/TGM-2021YFF/Acinetobacter_pittii.fastq",
|
||||||
"-o",
|
"-o",
|
||||||
"reads.paf"
|
"reads.paf"
|
||||||
],
|
],
|
||||||
"cwd": "${workspaceFolder}", // 当前工作路径:当前文件所在的工作空间
|
"cwd": "${workspaceFolder}", // 当前工作路径:当前文件所在的工作空间
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "mapping",
|
||||||
|
"preLaunchTask": "Build",
|
||||||
|
"type": "cppdbg",
|
||||||
|
"request": "launch",
|
||||||
|
"program": "${workspaceRoot}/minimap2",
|
||||||
|
"args": [
|
||||||
|
"-ax",
|
||||||
|
"map-ont",
|
||||||
|
"-t",
|
||||||
|
"1",
|
||||||
|
"/public/home/zzh/work/3gseq/TGM-2021YFF/reads.fasta",
|
||||||
|
"/public/home/zzh/work/3gseq/TGM-2021YFF/Acinetobacter_pittii.fastq",
|
||||||
|
"-o",
|
||||||
|
"aln.sam"
|
||||||
|
],
|
||||||
|
"cwd": "${workspaceFolder}", // 当前工作路径:当前文件所在的工作空间
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
{
|
{
|
||||||
"files.associations": {
|
"files.associations": {
|
||||||
"minimap.h": "c",
|
"minimap.h": "c",
|
||||||
"time.h": "c"
|
"time.h": "c",
|
||||||
|
"kalloc.h": "c"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
512
index.c
512
index.c
|
|
@ -15,7 +15,7 @@
|
||||||
#include "kvec.h"
|
#include "kvec.h"
|
||||||
#include "khash.h"
|
#include "khash.h"
|
||||||
|
|
||||||
#ifdef ANALYSIS_PERF
|
#ifdef SHOW_PERF
|
||||||
extern int64_t get_mseconds();
|
extern int64_t get_mseconds();
|
||||||
extern int64_t time_mm_idx_reader_read;
|
extern int64_t time_mm_idx_reader_read;
|
||||||
#endif
|
#endif
|
||||||
|
|
@ -29,19 +29,22 @@ KHASH_MAP_INIT_STR(str, uint32_t)
|
||||||
|
|
||||||
#define kroundup64(x) (--(x), (x) |= (x) >> 1, (x) |= (x) >> 2, (x) |= (x) >> 4, (x) |= (x) >> 8, (x) |= (x) >> 16, (x) |= (x) >> 32, ++(x))
|
#define kroundup64(x) (--(x), (x) |= (x) >> 1, (x) |= (x) >> 2, (x) |= (x) >> 4, (x) |= (x) >> 8, (x) |= (x) >> 16, (x) |= (x) >> 32, ++(x))
|
||||||
|
|
||||||
typedef struct mm_idx_bucket_s {
|
typedef struct mm_idx_bucket_s
|
||||||
|
{
|
||||||
mm128_v a; // (minimizer, position) array
|
mm128_v a; // (minimizer, position) array
|
||||||
int32_t n; // size of the _p_ array
|
int32_t n; // size of the _p_ array
|
||||||
uint64_t *p; // position array for minimizers appearing >1 times
|
uint64_t *p; // position array for minimizers appearing >1 times
|
||||||
void *h; // hash table indexing _p_ and minimizers appearing once
|
void *h; // hash table indexing _p_ and minimizers appearing once
|
||||||
} mm_idx_bucket_t;
|
} mm_idx_bucket_t;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct
|
||||||
|
{
|
||||||
int32_t st, en, max; // max is not used for now
|
int32_t st, en, max; // max is not used for now
|
||||||
int32_t score : 30, strand : 2;
|
int32_t score : 30, strand : 2;
|
||||||
} mm_idx_intv1_t;
|
} mm_idx_intv1_t;
|
||||||
|
|
||||||
typedef struct mm_idx_intv_s {
|
typedef struct mm_idx_intv_s
|
||||||
|
{
|
||||||
int32_t n, m;
|
int32_t n, m;
|
||||||
mm_idx_intv1_t *a;
|
mm_idx_intv1_t *a;
|
||||||
} mm_idx_intv_t;
|
} mm_idx_intv_t;
|
||||||
|
|
@ -49,38 +52,51 @@ typedef struct mm_idx_intv_s {
|
||||||
mm_idx_t *mm_idx_init(int w, int k, int b, int flag)
|
mm_idx_t *mm_idx_init(int w, int k, int b, int flag)
|
||||||
{
|
{
|
||||||
mm_idx_t *mi;
|
mm_idx_t *mi;
|
||||||
if (k*2 < b) b = k * 2;
|
if (k * 2 < b)
|
||||||
if (w < 1) w = 1;
|
b = k * 2;
|
||||||
|
if (w < 1)
|
||||||
|
w = 1;
|
||||||
mi = (mm_idx_t *)calloc(1, sizeof(mm_idx_t));
|
mi = (mm_idx_t *)calloc(1, sizeof(mm_idx_t));
|
||||||
mi->w = w, mi->k = k, mi->b = b, mi->flag = flag;
|
mi->w = w, mi->k = k, mi->b = b, mi->flag = flag;
|
||||||
mi->B = (mm_idx_bucket_t *)calloc(1 << b, sizeof(mm_idx_bucket_t));
|
mi->B = (mm_idx_bucket_t *)calloc(1 << b, sizeof(mm_idx_bucket_t));
|
||||||
if (!(mm_dbg_flag & 1)) mi->km = km_init();
|
if (!(mm_dbg_flag & 1))
|
||||||
|
mi->km = km_init();
|
||||||
return mi;
|
return mi;
|
||||||
}
|
}
|
||||||
|
|
||||||
void mm_idx_destroy(mm_idx_t *mi)
|
void mm_idx_destroy(mm_idx_t *mi)
|
||||||
{
|
{
|
||||||
uint32_t i;
|
uint32_t i;
|
||||||
if (mi == 0) return;
|
if (mi == 0)
|
||||||
if (mi->h) kh_destroy(str, (khash_t(str)*)mi->h);
|
return;
|
||||||
if (mi->B) {
|
if (mi->h)
|
||||||
for (i = 0; i < 1U<<mi->b; ++i) {
|
kh_destroy(str, (khash_t(str) *)mi->h);
|
||||||
|
if (mi->B)
|
||||||
|
{
|
||||||
|
for (i = 0; i < 1U << mi->b; ++i)
|
||||||
|
{
|
||||||
free(mi->B[i].p);
|
free(mi->B[i].p);
|
||||||
free(mi->B[i].a.a);
|
free(mi->B[i].a.a);
|
||||||
kh_destroy(idx, (idxhash_t *)mi->B[i].h);
|
kh_destroy(idx, (idxhash_t *)mi->B[i].h);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (mi->I) {
|
if (mi->I)
|
||||||
|
{
|
||||||
for (i = 0; i < mi->n_seq; ++i)
|
for (i = 0; i < mi->n_seq; ++i)
|
||||||
free(mi->I[i].a);
|
free(mi->I[i].a);
|
||||||
free(mi->I);
|
free(mi->I);
|
||||||
}
|
}
|
||||||
if (!mi->km) {
|
if (!mi->km)
|
||||||
|
{
|
||||||
for (i = 0; i < mi->n_seq; ++i)
|
for (i = 0; i < mi->n_seq; ++i)
|
||||||
free(mi->seq[i].name);
|
free(mi->seq[i].name);
|
||||||
free(mi->seq);
|
free(mi->seq);
|
||||||
} else km_destroy(mi->km);
|
}
|
||||||
free(mi->B); free(mi->S); free(mi);
|
else
|
||||||
|
km_destroy(mi->km);
|
||||||
|
free(mi->B);
|
||||||
|
free(mi->S);
|
||||||
|
free(mi);
|
||||||
}
|
}
|
||||||
|
|
||||||
const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n)
|
const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n)
|
||||||
|
|
@ -90,13 +106,18 @@ const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n)
|
||||||
mm_idx_bucket_t *b = &mi->B[minier & mask];
|
mm_idx_bucket_t *b = &mi->B[minier & mask];
|
||||||
idxhash_t *h = (idxhash_t *)b->h;
|
idxhash_t *h = (idxhash_t *)b->h;
|
||||||
*n = 0;
|
*n = 0;
|
||||||
if (h == 0) return 0;
|
if (h == 0)
|
||||||
|
return 0;
|
||||||
k = kh_get(idx, h, minier >> mi->b << 1);
|
k = kh_get(idx, h, minier >> mi->b << 1);
|
||||||
if (k == kh_end(h)) return 0;
|
if (k == kh_end(h))
|
||||||
if (kh_key(h, k)&1) { // special casing when there is only one k-mer
|
return 0;
|
||||||
|
if (kh_key(h, k) & 1)
|
||||||
|
{ // special casing when there is only one k-mer
|
||||||
*n = 1;
|
*n = 1;
|
||||||
return &kh_val(h, k);
|
return &kh_val(h, k);
|
||||||
} else {
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
*n = (uint32_t)kh_val(h, k);
|
*n = (uint32_t)kh_val(h, k);
|
||||||
return &b->p[kh_val(h, k) >> 32];
|
return &b->p[kh_val(h, k) >> 32];
|
||||||
}
|
}
|
||||||
|
|
@ -111,15 +132,20 @@ void mm_idx_stat(const mm_idx_t *mi)
|
||||||
for (i = 0; i < mi->n_seq; ++i)
|
for (i = 0; i < mi->n_seq; ++i)
|
||||||
len += mi->seq[i].len;
|
len += mi->seq[i].len;
|
||||||
for (i = 0; i < 1U << mi->b; ++i)
|
for (i = 0; i < 1U << mi->b; ++i)
|
||||||
if (mi->B[i].h) n += kh_size((idxhash_t*)mi->B[i].h);
|
if (mi->B[i].h)
|
||||||
for (i = 0; i < 1U<<mi->b; ++i) {
|
n += kh_size((idxhash_t *)mi->B[i].h);
|
||||||
|
for (i = 0; i < 1U << mi->b; ++i)
|
||||||
|
{
|
||||||
idxhash_t *h = (idxhash_t *)mi->B[i].h;
|
idxhash_t *h = (idxhash_t *)mi->B[i].h;
|
||||||
khint_t k;
|
khint_t k;
|
||||||
if (h == 0) continue;
|
if (h == 0)
|
||||||
|
continue;
|
||||||
for (k = 0; k < kh_end(h); ++k)
|
for (k = 0; k < kh_end(h); ++k)
|
||||||
if (kh_exist(h, k)) {
|
if (kh_exist(h, k))
|
||||||
|
{
|
||||||
sum += kh_key(h, k) & 1 ? 1 : (uint32_t)kh_val(h, k);
|
sum += kh_key(h, k) & 1 ? 1 : (uint32_t)kh_val(h, k);
|
||||||
if (kh_key(h, k)&1) ++n1;
|
if (kh_key(h, k) & 1)
|
||||||
|
++n1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fprintf(stderr, "[M::%s::%.3f*%.2f] distinct minimizers: %d (%.2f%% are singletons); average occurrences: %.3lf; average spacing: %.3lf; total length: %ld\n",
|
fprintf(stderr, "[M::%s::%.3f*%.2f] distinct minimizers: %d (%.2f%% are singletons); average occurrences: %.3lf; average spacing: %.3lf; total length: %ld\n",
|
||||||
|
|
@ -131,13 +157,17 @@ int mm_idx_index_name(mm_idx_t *mi)
|
||||||
khash_t(str) * h;
|
khash_t(str) * h;
|
||||||
uint32_t i;
|
uint32_t i;
|
||||||
int has_dup = 0, absent;
|
int has_dup = 0, absent;
|
||||||
if (mi->h) return 0;
|
if (mi->h)
|
||||||
|
return 0;
|
||||||
h = kh_init(str);
|
h = kh_init(str);
|
||||||
for (i = 0; i < mi->n_seq; ++i) {
|
for (i = 0; i < mi->n_seq; ++i)
|
||||||
|
{
|
||||||
khint_t k;
|
khint_t k;
|
||||||
k = kh_put(str, h, mi->seq[i].name, &absent);
|
k = kh_put(str, h, mi->seq[i].name, &absent);
|
||||||
if (absent) kh_val(h, k) = i;
|
if (absent)
|
||||||
else has_dup = 1;
|
kh_val(h, k) = i;
|
||||||
|
else
|
||||||
|
has_dup = 1;
|
||||||
}
|
}
|
||||||
mi->h = h;
|
mi->h = h;
|
||||||
if (has_dup && mm_verbose >= 2)
|
if (has_dup && mm_verbose >= 2)
|
||||||
|
|
@ -149,7 +179,8 @@ int mm_idx_name2id(const mm_idx_t *mi, const char *name)
|
||||||
{
|
{
|
||||||
khash_t(str) *h = (khash_t(str) *)mi->h;
|
khash_t(str) *h = (khash_t(str) *)mi->h;
|
||||||
khint_t k;
|
khint_t k;
|
||||||
if (h == 0) return -2;
|
if (h == 0)
|
||||||
|
return -2;
|
||||||
k = kh_get(str, h, name);
|
k = kh_get(str, h, name);
|
||||||
return k == kh_end(h) ? -1 : kh_val(h, k);
|
return k == kh_end(h) ? -1 : kh_val(h, k);
|
||||||
}
|
}
|
||||||
|
|
@ -157,8 +188,10 @@ int mm_idx_name2id(const mm_idx_t *mi, const char *name)
|
||||||
int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq)
|
int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq)
|
||||||
{
|
{
|
||||||
uint64_t i, st1, en1;
|
uint64_t i, st1, en1;
|
||||||
if (rid >= mi->n_seq || st >= mi->seq[rid].len) return -1;
|
if (rid >= mi->n_seq || st >= mi->seq[rid].len)
|
||||||
if (en > mi->seq[rid].len) en = mi->seq[rid].len;
|
return -1;
|
||||||
|
if (en > mi->seq[rid].len)
|
||||||
|
en = mi->seq[rid].len;
|
||||||
st1 = mi->seq[rid].offset + st;
|
st1 = mi->seq[rid].offset + st;
|
||||||
en1 = mi->seq[rid].offset + en;
|
en1 = mi->seq[rid].offset + en;
|
||||||
for (i = st1; i < en1; ++i)
|
for (i = st1; i < en1; ++i)
|
||||||
|
|
@ -170,12 +203,15 @@ int mm_idx_getseq_rev(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en
|
||||||
{
|
{
|
||||||
uint64_t i, st1, en1;
|
uint64_t i, st1, en1;
|
||||||
const mm_idx_seq_t *s;
|
const mm_idx_seq_t *s;
|
||||||
if (rid >= mi->n_seq || st >= mi->seq[rid].len) return -1;
|
if (rid >= mi->n_seq || st >= mi->seq[rid].len)
|
||||||
|
return -1;
|
||||||
s = &mi->seq[rid];
|
s = &mi->seq[rid];
|
||||||
if (en > s->len) en = s->len;
|
if (en > s->len)
|
||||||
|
en = s->len;
|
||||||
st1 = s->offset + (s->len - en);
|
st1 = s->offset + (s->len - en);
|
||||||
en1 = s->offset + (s->len - st);
|
en1 = s->offset + (s->len - st);
|
||||||
for (i = st1; i < en1; ++i) {
|
for (i = st1; i < en1; ++i)
|
||||||
|
{
|
||||||
uint8_t c = mm_seq4_get(mi->S, i);
|
uint8_t c = mm_seq4_get(mi->S, i);
|
||||||
seq[en1 - i - 1] = c < 4 ? 3 - c : c;
|
seq[en1 - i - 1] = c < 4 ? 3 - c : c;
|
||||||
}
|
}
|
||||||
|
|
@ -184,8 +220,10 @@ int mm_idx_getseq_rev(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en
|
||||||
|
|
||||||
int mm_idx_getseq2(const mm_idx_t *mi, int is_rev, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq)
|
int mm_idx_getseq2(const mm_idx_t *mi, int is_rev, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq)
|
||||||
{
|
{
|
||||||
if (is_rev) return mm_idx_getseq_rev(mi, rid, st, en, seq);
|
if (is_rev)
|
||||||
else return mm_idx_getseq(mi, rid, st, en, seq);
|
return mm_idx_getseq_rev(mi, rid, st, en, seq);
|
||||||
|
else
|
||||||
|
return mm_idx_getseq(mi, rid, st, en, seq);
|
||||||
}
|
}
|
||||||
|
|
||||||
int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f)
|
int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f)
|
||||||
|
|
@ -194,15 +232,21 @@ int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f)
|
||||||
size_t n = 0;
|
size_t n = 0;
|
||||||
uint32_t thres;
|
uint32_t thres;
|
||||||
khint_t *a, k;
|
khint_t *a, k;
|
||||||
if (f <= 0.) return INT32_MAX;
|
if (f <= 0.)
|
||||||
|
return INT32_MAX;
|
||||||
for (i = 0; i < 1 << mi->b; ++i)
|
for (i = 0; i < 1 << mi->b; ++i)
|
||||||
if (mi->B[i].h) n += kh_size((idxhash_t*)mi->B[i].h);
|
if (mi->B[i].h)
|
||||||
|
n += kh_size((idxhash_t *)mi->B[i].h);
|
||||||
a = (uint32_t *)malloc(n * 4);
|
a = (uint32_t *)malloc(n * 4);
|
||||||
for (i = n = 0; i < 1<<mi->b; ++i) {
|
for (i = n = 0; i < 1 << mi->b; ++i)
|
||||||
|
{
|
||||||
idxhash_t *h = (idxhash_t *)mi->B[i].h;
|
idxhash_t *h = (idxhash_t *)mi->B[i].h;
|
||||||
if (h == 0) continue;
|
if (h == 0)
|
||||||
for (k = 0; k < kh_end(h); ++k) {
|
continue;
|
||||||
if (!kh_exist(h, k)) continue;
|
for (k = 0; k < kh_end(h); ++k)
|
||||||
|
{
|
||||||
|
if (!kh_exist(h, k))
|
||||||
|
continue;
|
||||||
a[n++] = kh_key(h, k) & 1 ? 1 : (uint32_t)kh_val(h, k);
|
a[n++] = kh_key(h, k) & 1 ? 1 : (uint32_t)kh_val(h, k);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -222,35 +266,46 @@ static void worker_post(void *g, long i, int tid)
|
||||||
idxhash_t *h;
|
idxhash_t *h;
|
||||||
mm_idx_t *mi = (mm_idx_t *)g;
|
mm_idx_t *mi = (mm_idx_t *)g;
|
||||||
mm_idx_bucket_t *b = &mi->B[i];
|
mm_idx_bucket_t *b = &mi->B[i];
|
||||||
if (b->a.n == 0) return;
|
if (b->a.n == 0)
|
||||||
|
return;
|
||||||
|
|
||||||
// sort by minimizer
|
// sort by minimizer
|
||||||
radix_sort_128x(b->a.a, b->a.a + b->a.n);
|
radix_sort_128x(b->a.a, b->a.a + b->a.n);
|
||||||
|
|
||||||
// count and preallocate
|
// count and preallocate
|
||||||
for (j = 1, n = 1, n_keys = 0, b->n = 0; j <= b->a.n; ++j) {
|
for (j = 1, n = 1, n_keys = 0, b->n = 0; j <= b->a.n; ++j)
|
||||||
if (j == b->a.n || b->a.a[j].x>>8 != b->a.a[j-1].x>>8) {
|
{
|
||||||
|
if (j == b->a.n || b->a.a[j].x >> 8 != b->a.a[j - 1].x >> 8)
|
||||||
|
{
|
||||||
++n_keys;
|
++n_keys;
|
||||||
if (n > 1) b->n += n;
|
if (n > 1)
|
||||||
|
b->n += n;
|
||||||
n = 1;
|
n = 1;
|
||||||
} else ++n;
|
}
|
||||||
|
else
|
||||||
|
++n;
|
||||||
}
|
}
|
||||||
h = kh_init(idx);
|
h = kh_init(idx);
|
||||||
kh_resize(idx, h, n_keys);
|
kh_resize(idx, h, n_keys);
|
||||||
b->p = (uint64_t *)calloc(b->n, 8);
|
b->p = (uint64_t *)calloc(b->n, 8);
|
||||||
|
|
||||||
// create the hash table
|
// create the hash table
|
||||||
for (j = 1, n = 1, start_a = start_p = 0; j <= b->a.n; ++j) {
|
for (j = 1, n = 1, start_a = start_p = 0; j <= b->a.n; ++j)
|
||||||
if (j == b->a.n || b->a.a[j].x>>8 != b->a.a[j-1].x>>8) {
|
{
|
||||||
|
if (j == b->a.n || b->a.a[j].x >> 8 != b->a.a[j - 1].x >> 8)
|
||||||
|
{
|
||||||
khint_t itr;
|
khint_t itr;
|
||||||
int absent;
|
int absent;
|
||||||
mm128_t *p = &b->a.a[j - 1];
|
mm128_t *p = &b->a.a[j - 1];
|
||||||
itr = kh_put(idx, h, p->x >> 8 >> mi->b << 1, &absent);
|
itr = kh_put(idx, h, p->x >> 8 >> mi->b << 1, &absent);
|
||||||
assert(absent && j == start_a + n);
|
assert(absent && j == start_a + n);
|
||||||
if (n == 1) {
|
if (n == 1)
|
||||||
|
{
|
||||||
kh_key(h, itr) |= 1;
|
kh_key(h, itr) |= 1;
|
||||||
kh_val(h, itr) = p->y;
|
kh_val(h, itr) = p->y;
|
||||||
} else {
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
int k;
|
int k;
|
||||||
for (k = 0; k < n; ++k)
|
for (k = 0; k < n; ++k)
|
||||||
b->p[start_p + k] = b->a.a[start_a + k].y;
|
b->p[start_p + k] = b->a.a[start_a + k].y;
|
||||||
|
|
@ -259,7 +314,9 @@ static void worker_post(void *g, long i, int tid)
|
||||||
start_p += n;
|
start_p += n;
|
||||||
}
|
}
|
||||||
start_a = j, n = 1;
|
start_a = j, n = 1;
|
||||||
} else ++n;
|
}
|
||||||
|
else
|
||||||
|
++n;
|
||||||
}
|
}
|
||||||
b->h = h;
|
b->h = h;
|
||||||
assert(b->n == (int32_t)start_p);
|
assert(b->n == (int32_t)start_p);
|
||||||
|
|
@ -282,14 +339,16 @@ static void mm_idx_post(mm_idx_t *mi, int n_threads)
|
||||||
#include <zlib.h>
|
#include <zlib.h>
|
||||||
#include "bseq.h"
|
#include "bseq.h"
|
||||||
|
|
||||||
typedef struct {
|
typedef struct
|
||||||
|
{
|
||||||
int mini_batch_size;
|
int mini_batch_size;
|
||||||
uint64_t batch_size, sum_len;
|
uint64_t batch_size, sum_len;
|
||||||
mm_bseq_file_t *fp;
|
mm_bseq_file_t *fp;
|
||||||
mm_idx_t *mi;
|
mm_idx_t *mi;
|
||||||
} pipeline_t;
|
} pipeline_t;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct
|
||||||
|
{
|
||||||
int n_seq;
|
int n_seq;
|
||||||
mm_bseq1_t *seq;
|
mm_bseq1_t *seq;
|
||||||
mm128_v a;
|
mm128_v a;
|
||||||
|
|
@ -298,7 +357,8 @@ typedef struct {
|
||||||
static void mm_idx_add(mm_idx_t *mi, int n, const mm128_t *a)
|
static void mm_idx_add(mm_idx_t *mi, int n, const mm128_t *a)
|
||||||
{
|
{
|
||||||
int i, mask = (1 << mi->b) - 1;
|
int i, mask = (1 << mi->b) - 1;
|
||||||
for (i = 0; i < n; ++i) {
|
for (i = 0; i < n; ++i)
|
||||||
|
{
|
||||||
mm128_v *p = &mi->B[a[i].x >> 8 & mask].a;
|
mm128_v *p = &mi->B[a[i].x >> 8 & mask].a;
|
||||||
kv_push(mm128_t, 0, *p, a[i]);
|
kv_push(mm128_t, 0, *p, a[i]);
|
||||||
}
|
}
|
||||||
|
|
@ -308,45 +368,59 @@ static void *worker_pipeline(void *shared, int step, void *in)
|
||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
pipeline_t *p = (pipeline_t *)shared;
|
pipeline_t *p = (pipeline_t *)shared;
|
||||||
if (step == 0) { // step 0: read sequences
|
if (step == 0)
|
||||||
|
{ // step 0: read sequences
|
||||||
step_t *s;
|
step_t *s;
|
||||||
if (p->sum_len > p->batch_size) return 0;
|
if (p->sum_len > p->batch_size)
|
||||||
|
return 0;
|
||||||
s = (step_t *)calloc(1, sizeof(step_t));
|
s = (step_t *)calloc(1, sizeof(step_t));
|
||||||
s->seq = mm_bseq_read(p->fp, p->mini_batch_size, 0, &s->n_seq); // read a mini-batch
|
s->seq = mm_bseq_read(p->fp, p->mini_batch_size, 0, &s->n_seq); // read a mini-batch
|
||||||
if (s->seq) {
|
if (s->seq)
|
||||||
|
{
|
||||||
uint32_t old_m, m;
|
uint32_t old_m, m;
|
||||||
assert((uint64_t)p->mi->n_seq + s->n_seq <= UINT32_MAX); // to prevent integer overflow
|
assert((uint64_t)p->mi->n_seq + s->n_seq <= UINT32_MAX); // to prevent integer overflow
|
||||||
// make room for p->mi->seq
|
// make room for p->mi->seq
|
||||||
old_m = p->mi->n_seq, m = p->mi->n_seq + s->n_seq;
|
old_m = p->mi->n_seq, m = p->mi->n_seq + s->n_seq;
|
||||||
kroundup32(m); kroundup32(old_m);
|
kroundup32(m);
|
||||||
|
kroundup32(old_m);
|
||||||
if (old_m != m)
|
if (old_m != m)
|
||||||
p->mi->seq = (mm_idx_seq_t *)krealloc(p->mi->km, p->mi->seq, m * sizeof(mm_idx_seq_t));
|
p->mi->seq = (mm_idx_seq_t *)krealloc(p->mi->km, p->mi->seq, m * sizeof(mm_idx_seq_t));
|
||||||
// make room for p->mi->S
|
// make room for p->mi->S
|
||||||
if (!(p->mi->flag & MM_I_NO_SEQ)) {
|
if (!(p->mi->flag & MM_I_NO_SEQ))
|
||||||
|
{
|
||||||
uint64_t sum_len, old_max_len, max_len;
|
uint64_t sum_len, old_max_len, max_len;
|
||||||
for (i = 0, sum_len = 0; i < s->n_seq; ++i) sum_len += s->seq[i].l_seq;
|
for (i = 0, sum_len = 0; i < s->n_seq; ++i)
|
||||||
|
sum_len += s->seq[i].l_seq;
|
||||||
old_max_len = (p->sum_len + 7) / 8;
|
old_max_len = (p->sum_len + 7) / 8;
|
||||||
max_len = (p->sum_len + sum_len + 7) / 8;
|
max_len = (p->sum_len + sum_len + 7) / 8;
|
||||||
kroundup64(old_max_len); kroundup64(max_len);
|
kroundup64(old_max_len);
|
||||||
if (old_max_len != max_len) {
|
kroundup64(max_len);
|
||||||
|
if (old_max_len != max_len)
|
||||||
|
{
|
||||||
p->mi->S = (uint32_t *)realloc(p->mi->S, max_len * 4);
|
p->mi->S = (uint32_t *)realloc(p->mi->S, max_len * 4);
|
||||||
memset(&p->mi->S[old_max_len], 0, 4 * (max_len - old_max_len));
|
memset(&p->mi->S[old_max_len], 0, 4 * (max_len - old_max_len));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// populate p->mi->seq
|
// populate p->mi->seq
|
||||||
for (i = 0; i < s->n_seq; ++i) {
|
for (i = 0; i < s->n_seq; ++i)
|
||||||
|
{
|
||||||
mm_idx_seq_t *seq = &p->mi->seq[p->mi->n_seq];
|
mm_idx_seq_t *seq = &p->mi->seq[p->mi->n_seq];
|
||||||
uint32_t j;
|
uint32_t j;
|
||||||
if (!(p->mi->flag & MM_I_NO_NAME)) {
|
if (!(p->mi->flag & MM_I_NO_NAME))
|
||||||
|
{
|
||||||
seq->name = (char *)kmalloc(p->mi->km, strlen(s->seq[i].name) + 1);
|
seq->name = (char *)kmalloc(p->mi->km, strlen(s->seq[i].name) + 1);
|
||||||
strcpy(seq->name, s->seq[i].name);
|
strcpy(seq->name, s->seq[i].name);
|
||||||
} else seq->name = 0;
|
}
|
||||||
|
else
|
||||||
|
seq->name = 0;
|
||||||
seq->len = s->seq[i].l_seq;
|
seq->len = s->seq[i].l_seq;
|
||||||
seq->offset = p->sum_len;
|
seq->offset = p->sum_len;
|
||||||
seq->is_alt = 0;
|
seq->is_alt = 0;
|
||||||
// copy the sequence
|
// copy the sequence
|
||||||
if (!(p->mi->flag & MM_I_NO_SEQ)) {
|
if (!(p->mi->flag & MM_I_NO_SEQ))
|
||||||
for (j = 0; j < seq->len; ++j) { // TODO: this is not the fastest way, but let's first see if speed matters here
|
{
|
||||||
|
for (j = 0; j < seq->len; ++j)
|
||||||
|
{ // TODO: this is not the fastest way, but let's first see if speed matters here
|
||||||
uint64_t o = p->sum_len + j;
|
uint64_t o = p->sum_len + j;
|
||||||
int c = seq_nt4_table[(uint8_t)s->seq[i].seq[j]];
|
int c = seq_nt4_table[(uint8_t)s->seq[i].seq[j]];
|
||||||
mm_seq4_set(p->mi->S, o, c);
|
mm_seq4_set(p->mi->S, o, c);
|
||||||
|
|
@ -357,31 +431,43 @@ static void *worker_pipeline(void *shared, int step, void *in)
|
||||||
s->seq[i].rid = p->mi->n_seq++;
|
s->seq[i].rid = p->mi->n_seq++;
|
||||||
}
|
}
|
||||||
return s;
|
return s;
|
||||||
} else free(s);
|
}
|
||||||
} else if (step == 1) { // step 1: compute sketch
|
else
|
||||||
|
free(s);
|
||||||
|
}
|
||||||
|
else if (step == 1)
|
||||||
|
{ // step 1: compute sketch
|
||||||
step_t *s = (step_t *)in;
|
step_t *s = (step_t *)in;
|
||||||
for (i = 0; i < s->n_seq; ++i) {
|
for (i = 0; i < s->n_seq; ++i)
|
||||||
|
{
|
||||||
mm_bseq1_t *t = &s->seq[i];
|
mm_bseq1_t *t = &s->seq[i];
|
||||||
if (t->l_seq > 0)
|
if (t->l_seq > 0)
|
||||||
mm_sketch(0, t->seq, t->l_seq, p->mi->w, p->mi->k, t->rid, p->mi->flag & MM_I_HPC, &s->a);
|
mm_sketch(0, t->seq, t->l_seq, p->mi->w, p->mi->k, t->rid, p->mi->flag & MM_I_HPC, &s->a);
|
||||||
else if (mm_verbose >= 2)
|
else if (mm_verbose >= 2)
|
||||||
fprintf(stderr, "[WARNING] the length database sequence '%s' is 0\n", t->name);
|
fprintf(stderr, "[WARNING] the length database sequence '%s' is 0\n", t->name);
|
||||||
free(t->seq); free(t->name);
|
free(t->seq);
|
||||||
|
free(t->name);
|
||||||
}
|
}
|
||||||
free(s->seq); s->seq = 0;
|
free(s->seq);
|
||||||
|
s->seq = 0;
|
||||||
return s;
|
return s;
|
||||||
} else if (step == 2) { // dispatch sketch to buckets
|
}
|
||||||
|
else if (step == 2)
|
||||||
|
{ // dispatch sketch to buckets
|
||||||
step_t *s = (step_t *)in;
|
step_t *s = (step_t *)in;
|
||||||
mm_idx_add(p->mi, s->a.n, s->a.a);
|
mm_idx_add(p->mi, s->a.n, s->a.a);
|
||||||
kfree(0, s->a.a); free(s);
|
kfree(0, s->a.a);
|
||||||
|
free(s);
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 生成index
|
||||||
mm_idx_t *mm_idx_gen(mm_bseq_file_t *fp, int w, int k, int b, int flag, int mini_batch_size, int n_threads, uint64_t batch_size)
|
mm_idx_t *mm_idx_gen(mm_bseq_file_t *fp, int w, int k, int b, int flag, int mini_batch_size, int n_threads, uint64_t batch_size)
|
||||||
{
|
{
|
||||||
pipeline_t pl;
|
pipeline_t pl;
|
||||||
if (fp == 0 || mm_bseq_eof(fp)) return 0;
|
if (fp == 0 || mm_bseq_eof(fp))
|
||||||
|
return 0;
|
||||||
memset(&pl, 0, sizeof(pipeline_t));
|
memset(&pl, 0, sizeof(pipeline_t));
|
||||||
pl.mini_batch_size = (uint64_t)mini_batch_size < batch_size ? mini_batch_size : batch_size;
|
pl.mini_batch_size = (uint64_t)mini_batch_size < batch_size ? mini_batch_size : batch_size;
|
||||||
pl.batch_size = batch_size;
|
pl.batch_size = batch_size;
|
||||||
|
|
@ -404,7 +490,8 @@ mm_idx_t *mm_idx_build(const char *fn, int w, int k, int flag, int n_threads) //
|
||||||
mm_bseq_file_t *fp;
|
mm_bseq_file_t *fp;
|
||||||
mm_idx_t *mi;
|
mm_idx_t *mi;
|
||||||
fp = mm_bseq_open(fn);
|
fp = mm_bseq_open(fn);
|
||||||
if (fp == 0) return 0;
|
if (fp == 0)
|
||||||
|
return 0;
|
||||||
mi = mm_idx_gen(fp, w, k, 14, flag, 1 << 18, n_threads, UINT64_MAX);
|
mi = mm_idx_gen(fp, w, k, 14, flag, 1 << 18, n_threads, UINT64_MAX);
|
||||||
mm_bseq_close(fp);
|
mm_bseq_close(fp);
|
||||||
return mi;
|
return mi;
|
||||||
|
|
@ -418,22 +505,28 @@ mm_idx_t *mm_idx_str(int w, int k, int is_hpc, int bucket_bits, int n, const cha
|
||||||
khash_t(str) * h;
|
khash_t(str) * h;
|
||||||
int i, flag = 0;
|
int i, flag = 0;
|
||||||
|
|
||||||
if (n <= 0) return 0;
|
if (n <= 0)
|
||||||
|
return 0;
|
||||||
for (i = 0; i < n; ++i) // get the total length
|
for (i = 0; i < n; ++i) // get the total length
|
||||||
sum_len += strlen(seq[i]);
|
sum_len += strlen(seq[i]);
|
||||||
if (is_hpc) flag |= MM_I_HPC;
|
if (is_hpc)
|
||||||
if (name == 0) flag |= MM_I_NO_NAME;
|
flag |= MM_I_HPC;
|
||||||
if (bucket_bits < 0) bucket_bits = 14;
|
if (name == 0)
|
||||||
|
flag |= MM_I_NO_NAME;
|
||||||
|
if (bucket_bits < 0)
|
||||||
|
bucket_bits = 14;
|
||||||
mi = mm_idx_init(w, k, bucket_bits, flag);
|
mi = mm_idx_init(w, k, bucket_bits, flag);
|
||||||
mi->n_seq = n;
|
mi->n_seq = n;
|
||||||
mi->seq = (mm_idx_seq_t *)kcalloc(mi->km, n, sizeof(mm_idx_seq_t)); // ->seq is allocated from km
|
mi->seq = (mm_idx_seq_t *)kcalloc(mi->km, n, sizeof(mm_idx_seq_t)); // ->seq is allocated from km
|
||||||
mi->S = (uint32_t *)calloc((sum_len + 7) / 8, 4);
|
mi->S = (uint32_t *)calloc((sum_len + 7) / 8, 4);
|
||||||
mi->h = h = kh_init(str);
|
mi->h = h = kh_init(str);
|
||||||
for (i = 0, sum_len = 0; i < n; ++i) {
|
for (i = 0, sum_len = 0; i < n; ++i)
|
||||||
|
{
|
||||||
const char *s = seq[i];
|
const char *s = seq[i];
|
||||||
mm_idx_seq_t *p = &mi->seq[i];
|
mm_idx_seq_t *p = &mi->seq[i];
|
||||||
uint32_t j;
|
uint32_t j;
|
||||||
if (name && name[i]) {
|
if (name && name[i])
|
||||||
|
{
|
||||||
int absent;
|
int absent;
|
||||||
p->name = (char *)kmalloc(mi->km, strlen(name[i]) + 1);
|
p->name = (char *)kmalloc(mi->km, strlen(name[i]) + 1);
|
||||||
strcpy(p->name, name[i]);
|
strcpy(p->name, name[i]);
|
||||||
|
|
@ -443,13 +536,15 @@ mm_idx_t *mm_idx_str(int w, int k, int is_hpc, int bucket_bits, int n, const cha
|
||||||
p->offset = sum_len;
|
p->offset = sum_len;
|
||||||
p->len = strlen(s);
|
p->len = strlen(s);
|
||||||
p->is_alt = 0;
|
p->is_alt = 0;
|
||||||
for (j = 0; j < p->len; ++j) {
|
for (j = 0; j < p->len; ++j)
|
||||||
|
{
|
||||||
int c = seq_nt4_table[(uint8_t)s[j]];
|
int c = seq_nt4_table[(uint8_t)s[j]];
|
||||||
uint64_t o = sum_len + j;
|
uint64_t o = sum_len + j;
|
||||||
mm_seq4_set(mi->S, o, c);
|
mm_seq4_set(mi->S, o, c);
|
||||||
}
|
}
|
||||||
sum_len += p->len;
|
sum_len += p->len;
|
||||||
if (p->len > 0) {
|
if (p->len > 0)
|
||||||
|
{
|
||||||
a.n = 0;
|
a.n = 0;
|
||||||
mm_sketch(0, s, p->len, w, k, i, is_hpc, &a);
|
mm_sketch(0, s, p->len, w, k, i, is_hpc, &a);
|
||||||
mm_idx_add(mi, a.n, a.a);
|
mm_idx_add(mi, a.n, a.a);
|
||||||
|
|
@ -472,19 +567,24 @@ void mm_idx_dump(FILE *fp, const mm_idx_t *mi)
|
||||||
x[0] = mi->w, x[1] = mi->k, x[2] = mi->b, x[3] = mi->n_seq, x[4] = mi->flag;
|
x[0] = mi->w, x[1] = mi->k, x[2] = mi->b, x[3] = mi->n_seq, x[4] = mi->flag;
|
||||||
fwrite(MM_IDX_MAGIC, 1, 4, fp);
|
fwrite(MM_IDX_MAGIC, 1, 4, fp);
|
||||||
fwrite(x, 4, 5, fp);
|
fwrite(x, 4, 5, fp);
|
||||||
for (i = 0; i < mi->n_seq; ++i) {
|
for (i = 0; i < mi->n_seq; ++i)
|
||||||
if (mi->seq[i].name) {
|
{
|
||||||
|
if (mi->seq[i].name)
|
||||||
|
{
|
||||||
uint8_t l = strlen(mi->seq[i].name);
|
uint8_t l = strlen(mi->seq[i].name);
|
||||||
fwrite(&l, 1, 1, fp);
|
fwrite(&l, 1, 1, fp);
|
||||||
fwrite(mi->seq[i].name, 1, l, fp);
|
fwrite(mi->seq[i].name, 1, l, fp);
|
||||||
} else {
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
uint8_t l = 0;
|
uint8_t l = 0;
|
||||||
fwrite(&l, 1, 1, fp);
|
fwrite(&l, 1, 1, fp);
|
||||||
}
|
}
|
||||||
fwrite(&mi->seq[i].len, 4, 1, fp);
|
fwrite(&mi->seq[i].len, 4, 1, fp);
|
||||||
sum_len += mi->seq[i].len;
|
sum_len += mi->seq[i].len;
|
||||||
}
|
}
|
||||||
for (i = 0; i < 1<<mi->b; ++i) {
|
for (i = 0; i < 1 << mi->b; ++i)
|
||||||
|
{
|
||||||
mm_idx_bucket_t *b = &mi->B[i];
|
mm_idx_bucket_t *b = &mi->B[i];
|
||||||
khint_t k;
|
khint_t k;
|
||||||
idxhash_t *h = (idxhash_t *)b->h;
|
idxhash_t *h = (idxhash_t *)b->h;
|
||||||
|
|
@ -492,10 +592,13 @@ void mm_idx_dump(FILE *fp, const mm_idx_t *mi)
|
||||||
fwrite(&b->n, 4, 1, fp);
|
fwrite(&b->n, 4, 1, fp);
|
||||||
fwrite(b->p, 8, b->n, fp);
|
fwrite(b->p, 8, b->n, fp);
|
||||||
fwrite(&size, 4, 1, fp);
|
fwrite(&size, 4, 1, fp);
|
||||||
if (size == 0) continue;
|
if (size == 0)
|
||||||
for (k = 0; k < kh_end(h); ++k) {
|
continue;
|
||||||
|
for (k = 0; k < kh_end(h); ++k)
|
||||||
|
{
|
||||||
uint64_t x[2];
|
uint64_t x[2];
|
||||||
if (!kh_exist(h, k)) continue;
|
if (!kh_exist(h, k))
|
||||||
|
continue;
|
||||||
x[0] = kh_key(h, k), x[1] = kh_val(h, k);
|
x[0] = kh_key(h, k), x[1] = kh_val(h, k);
|
||||||
fwrite(x, 8, 2, fp);
|
fwrite(x, 8, 2, fp);
|
||||||
}
|
}
|
||||||
|
|
@ -512,17 +615,22 @@ mm_idx_t *mm_idx_load(FILE *fp)
|
||||||
uint64_t sum_len = 0;
|
uint64_t sum_len = 0;
|
||||||
mm_idx_t *mi;
|
mm_idx_t *mi;
|
||||||
|
|
||||||
if (fread(magic, 1, 4, fp) != 4) return 0;
|
if (fread(magic, 1, 4, fp) != 4)
|
||||||
if (strncmp(magic, MM_IDX_MAGIC, 4) != 0) return 0;
|
return 0;
|
||||||
if (fread(x, 4, 5, fp) != 5) return 0;
|
if (strncmp(magic, MM_IDX_MAGIC, 4) != 0)
|
||||||
|
return 0;
|
||||||
|
if (fread(x, 4, 5, fp) != 5)
|
||||||
|
return 0;
|
||||||
mi = mm_idx_init(x[0], x[1], x[2], x[4]);
|
mi = mm_idx_init(x[0], x[1], x[2], x[4]);
|
||||||
mi->n_seq = x[3];
|
mi->n_seq = x[3];
|
||||||
mi->seq = (mm_idx_seq_t *)kcalloc(mi->km, mi->n_seq, sizeof(mm_idx_seq_t));
|
mi->seq = (mm_idx_seq_t *)kcalloc(mi->km, mi->n_seq, sizeof(mm_idx_seq_t));
|
||||||
for (i = 0; i < mi->n_seq; ++i) {
|
for (i = 0; i < mi->n_seq; ++i)
|
||||||
|
{
|
||||||
uint8_t l;
|
uint8_t l;
|
||||||
mm_idx_seq_t *s = &mi->seq[i];
|
mm_idx_seq_t *s = &mi->seq[i];
|
||||||
fread(&l, 1, 1, fp);
|
fread(&l, 1, 1, fp);
|
||||||
if (l) {
|
if (l)
|
||||||
|
{
|
||||||
s->name = (char *)kmalloc(mi->km, l + 1);
|
s->name = (char *)kmalloc(mi->km, l + 1);
|
||||||
fread(s->name, 1, l, fp);
|
fread(s->name, 1, l, fp);
|
||||||
s->name[l] = 0;
|
s->name[l] = 0;
|
||||||
|
|
@ -532,7 +640,8 @@ mm_idx_t *mm_idx_load(FILE *fp)
|
||||||
s->is_alt = 0;
|
s->is_alt = 0;
|
||||||
sum_len += s->len;
|
sum_len += s->len;
|
||||||
}
|
}
|
||||||
for (i = 0; i < 1<<mi->b; ++i) {
|
for (i = 0; i < 1 << mi->b; ++i)
|
||||||
|
{
|
||||||
mm_idx_bucket_t *b = &mi->B[i];
|
mm_idx_bucket_t *b = &mi->B[i];
|
||||||
uint32_t j, size;
|
uint32_t j, size;
|
||||||
khint_t k;
|
khint_t k;
|
||||||
|
|
@ -541,10 +650,12 @@ mm_idx_t *mm_idx_load(FILE *fp)
|
||||||
b->p = (uint64_t *)malloc(b->n * 8);
|
b->p = (uint64_t *)malloc(b->n * 8);
|
||||||
fread(b->p, 8, b->n, fp);
|
fread(b->p, 8, b->n, fp);
|
||||||
fread(&size, 4, 1, fp);
|
fread(&size, 4, 1, fp);
|
||||||
if (size == 0) continue;
|
if (size == 0)
|
||||||
|
continue;
|
||||||
b->h = h = kh_init(idx);
|
b->h = h = kh_init(idx);
|
||||||
kh_resize(idx, h, size);
|
kh_resize(idx, h, size);
|
||||||
for (j = 0; j < size; ++j) {
|
for (j = 0; j < size; ++j)
|
||||||
|
{
|
||||||
uint64_t x[2];
|
uint64_t x[2];
|
||||||
int absent;
|
int absent;
|
||||||
fread(x, 8, 2, fp);
|
fread(x, 8, 2, fp);
|
||||||
|
|
@ -553,7 +664,8 @@ mm_idx_t *mm_idx_load(FILE *fp)
|
||||||
kh_val(h, k) = x[1];
|
kh_val(h, k) = x[1];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!(mi->flag & MM_I_NO_SEQ)) {
|
if (!(mi->flag & MM_I_NO_SEQ))
|
||||||
|
{
|
||||||
mi->S = (uint32_t *)malloc((sum_len + 7) / 8 * 4);
|
mi->S = (uint32_t *)malloc((sum_len + 7) / 8 * 4);
|
||||||
fread(mi->S, 4, (sum_len + 7) / 8, fp);
|
fread(mi->S, 4, (sum_len + 7) / 8, fp);
|
||||||
}
|
}
|
||||||
|
|
@ -566,14 +678,18 @@ int64_t mm_idx_is_idx(const char *fn)
|
||||||
int64_t ret, off_end;
|
int64_t ret, off_end;
|
||||||
char magic[4];
|
char magic[4];
|
||||||
|
|
||||||
if (strcmp(fn, "-") == 0) return 0; // read from pipe; not an index
|
if (strcmp(fn, "-") == 0)
|
||||||
|
return 0; // read from pipe; not an index
|
||||||
fd = open(fn, O_RDONLY);
|
fd = open(fn, O_RDONLY);
|
||||||
if (fd < 0) return -1; // error
|
if (fd < 0)
|
||||||
|
return -1; // error
|
||||||
#ifdef WIN32
|
#ifdef WIN32
|
||||||
if ((off_end = _lseeki64(fd, 0, SEEK_END)) >= 4) {
|
if ((off_end = _lseeki64(fd, 0, SEEK_END)) >= 4)
|
||||||
|
{
|
||||||
_lseeki64(fd, 0, SEEK_SET);
|
_lseeki64(fd, 0, SEEK_SET);
|
||||||
#else
|
#else
|
||||||
if ((off_end = lseek(fd, 0, SEEK_END)) >= 4) {
|
if ((off_end = lseek(fd, 0, SEEK_END)) >= 4)
|
||||||
|
{
|
||||||
lseek(fd, 0, SEEK_SET);
|
lseek(fd, 0, SEEK_SET);
|
||||||
#endif // WIN32
|
#endif // WIN32
|
||||||
ret = read(fd, magic, 4);
|
ret = read(fd, magic, 4);
|
||||||
|
|
@ -589,44 +705,58 @@ mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt, cons
|
||||||
int64_t is_idx;
|
int64_t is_idx;
|
||||||
mm_idx_reader_t *r;
|
mm_idx_reader_t *r;
|
||||||
is_idx = mm_idx_is_idx(fn);
|
is_idx = mm_idx_is_idx(fn);
|
||||||
if (is_idx < 0) return 0; // failed to open the index
|
if (is_idx < 0)
|
||||||
|
return 0; // failed to open the index
|
||||||
r = (mm_idx_reader_t *)calloc(1, sizeof(mm_idx_reader_t));
|
r = (mm_idx_reader_t *)calloc(1, sizeof(mm_idx_reader_t));
|
||||||
r->is_idx = is_idx;
|
r->is_idx = is_idx;
|
||||||
if (opt) r->opt = *opt;
|
if (opt)
|
||||||
else mm_idxopt_init(&r->opt);
|
r->opt = *opt;
|
||||||
if (r->is_idx) {
|
else
|
||||||
|
mm_idxopt_init(&r->opt);
|
||||||
|
if (r->is_idx)
|
||||||
|
{
|
||||||
r->fp.idx = fopen(fn, "rb");
|
r->fp.idx = fopen(fn, "rb");
|
||||||
r->idx_size = is_idx;
|
r->idx_size = is_idx;
|
||||||
} else r->fp.seq = mm_bseq_open(fn);
|
}
|
||||||
if (fn_out) r->fp_out = fopen(fn_out, "wb");
|
else
|
||||||
|
r->fp.seq = mm_bseq_open(fn);
|
||||||
|
if (fn_out)
|
||||||
|
r->fp_out = fopen(fn_out, "wb");
|
||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
|
|
||||||
void mm_idx_reader_close(mm_idx_reader_t *r)
|
void mm_idx_reader_close(mm_idx_reader_t *r)
|
||||||
{
|
{
|
||||||
if (r->is_idx) fclose(r->fp.idx);
|
if (r->is_idx)
|
||||||
else mm_bseq_close(r->fp.seq);
|
fclose(r->fp.idx);
|
||||||
if (r->fp_out) fclose(r->fp_out);
|
else
|
||||||
|
mm_bseq_close(r->fp.seq);
|
||||||
|
if (r->fp_out)
|
||||||
|
fclose(r->fp_out);
|
||||||
free(r);
|
free(r);
|
||||||
}
|
}
|
||||||
|
|
||||||
mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads)
|
mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads)
|
||||||
{
|
{
|
||||||
mm_idx_t *mi;
|
mm_idx_t *mi;
|
||||||
#ifdef ANALYSIS_PERF
|
#ifdef SHOW_PERF
|
||||||
int64_t tmp_cur_time = get_mseconds();
|
int64_t tmp_cur_time = get_mseconds();
|
||||||
#endif
|
#endif
|
||||||
if (r->is_idx) {
|
if (r->is_idx)
|
||||||
|
{
|
||||||
mi = mm_idx_load(r->fp.idx);
|
mi = mm_idx_load(r->fp.idx);
|
||||||
if (mi && mm_verbose >= 2 && (mi->k != r->opt.k || mi->w != r->opt.w || (mi->flag & MM_I_HPC) != (r->opt.flag & MM_I_HPC)))
|
if (mi && mm_verbose >= 2 && (mi->k != r->opt.k || mi->w != r->opt.w || (mi->flag & MM_I_HPC) != (r->opt.flag & MM_I_HPC)))
|
||||||
fprintf(stderr, "[WARNING]\033[1;31m Indexing parameters (-k, -w or -H) overridden by parameters used in the prebuilt index.\033[0m\n");
|
fprintf(stderr, "[WARNING]\033[1;31m Indexing parameters (-k, -w or -H) overridden by parameters used in the prebuilt index.\033[0m\n");
|
||||||
} else
|
}
|
||||||
|
else
|
||||||
mi = mm_idx_gen(r->fp.seq, r->opt.w, r->opt.k, r->opt.bucket_bits, r->opt.flag, r->opt.mini_batch_size, n_threads, r->opt.batch_size);
|
mi = mm_idx_gen(r->fp.seq, r->opt.w, r->opt.k, r->opt.bucket_bits, r->opt.flag, r->opt.mini_batch_size, n_threads, r->opt.batch_size);
|
||||||
if (mi) {
|
if (mi)
|
||||||
if (r->fp_out) mm_idx_dump(r->fp_out, mi);
|
{
|
||||||
|
if (r->fp_out)
|
||||||
|
mm_idx_dump(r->fp_out, mi);
|
||||||
mi->index = r->n_parts++;
|
mi->index = r->n_parts++;
|
||||||
}
|
}
|
||||||
#ifdef ANALYSIS_PERF
|
#ifdef SHOW_PERF
|
||||||
time_mm_idx_reader_read += get_mseconds() - tmp_cur_time;
|
time_mm_idx_reader_read += get_mseconds() - tmp_cur_time;
|
||||||
#endif
|
#endif
|
||||||
return mi;
|
return mi;
|
||||||
|
|
@ -650,16 +780,22 @@ int mm_idx_alt_read(mm_idx_t *mi, const char *fn)
|
||||||
kstream_t *ks;
|
kstream_t *ks;
|
||||||
kstring_t str = {0, 0, 0};
|
kstring_t str = {0, 0, 0};
|
||||||
fp = fn && strcmp(fn, "-") ? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
|
fp = fn && strcmp(fn, "-") ? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
|
||||||
if (fp == 0) return -1;
|
if (fp == 0)
|
||||||
|
return -1;
|
||||||
ks = ks_init(fp);
|
ks = ks_init(fp);
|
||||||
if (mi->h == 0) mm_idx_index_name(mi);
|
if (mi->h == 0)
|
||||||
while (ks_getuntil(ks, KS_SEP_LINE, &str, 0) >= 0) {
|
mm_idx_index_name(mi);
|
||||||
|
while (ks_getuntil(ks, KS_SEP_LINE, &str, 0) >= 0)
|
||||||
|
{
|
||||||
char *p;
|
char *p;
|
||||||
int id;
|
int id;
|
||||||
for (p = str.s; *p && !isspace(*p); ++p) { }
|
for (p = str.s; *p && !isspace(*p); ++p)
|
||||||
|
{
|
||||||
|
}
|
||||||
*p = 0;
|
*p = 0;
|
||||||
id = mm_idx_name2id(mi, str.s);
|
id = mm_idx_name2id(mi, str.s);
|
||||||
if (id >= 0) mi->seq[id].is_alt = 1, ++n_alt;
|
if (id >= 0)
|
||||||
|
mi->seq[id].is_alt = 1, ++n_alt;
|
||||||
}
|
}
|
||||||
mi->n_alt = n_alt;
|
mi->n_alt = n_alt;
|
||||||
if (mm_verbose >= 3)
|
if (mm_verbose >= 3)
|
||||||
|
|
@ -678,65 +814,102 @@ mm_idx_intv_t *mm_idx_read_bed(const mm_idx_t *mi, const char *fn, int read_junc
|
||||||
mm_idx_intv_t *I;
|
mm_idx_intv_t *I;
|
||||||
|
|
||||||
fp = fn && strcmp(fn, "-") ? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
|
fp = fn && strcmp(fn, "-") ? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
|
||||||
if (fp == 0) return 0;
|
if (fp == 0)
|
||||||
|
return 0;
|
||||||
I = (mm_idx_intv_t *)calloc(mi->n_seq, sizeof(*I));
|
I = (mm_idx_intv_t *)calloc(mi->n_seq, sizeof(*I));
|
||||||
ks = ks_init(fp);
|
ks = ks_init(fp);
|
||||||
while (ks_getuntil(ks, KS_SEP_LINE, &str, 0) >= 0) {
|
while (ks_getuntil(ks, KS_SEP_LINE, &str, 0) >= 0)
|
||||||
|
{
|
||||||
mm_idx_intv_t *r;
|
mm_idx_intv_t *r;
|
||||||
mm_idx_intv1_t t = {-1, -1, -1, -1, 0};
|
mm_idx_intv1_t t = {-1, -1, -1, -1, 0};
|
||||||
char *p, *q, *bl, *bs;
|
char *p, *q, *bl, *bs;
|
||||||
int32_t i, id = -1, n_blk = 0;
|
int32_t i, id = -1, n_blk = 0;
|
||||||
for (p = q = str.s, i = 0;; ++p) {
|
for (p = q = str.s, i = 0;; ++p)
|
||||||
if (*p == 0 || *p == '\t') {
|
{
|
||||||
|
if (*p == 0 || *p == '\t')
|
||||||
|
{
|
||||||
int32_t c = *p;
|
int32_t c = *p;
|
||||||
*p = 0;
|
*p = 0;
|
||||||
if (i == 0) { // chr
|
if (i == 0)
|
||||||
|
{ // chr
|
||||||
id = mm_idx_name2id(mi, q);
|
id = mm_idx_name2id(mi, q);
|
||||||
if (id < 0) break; // unknown name; TODO: throw a warning
|
if (id < 0)
|
||||||
} else if (i == 1) { // start
|
break; // unknown name; TODO: throw a warning
|
||||||
|
}
|
||||||
|
else if (i == 1)
|
||||||
|
{ // start
|
||||||
t.st = atol(q); // TODO: watch out integer overflow!
|
t.st = atol(q); // TODO: watch out integer overflow!
|
||||||
if (t.st < 0) break;
|
if (t.st < 0)
|
||||||
} else if (i == 2) { // end
|
break;
|
||||||
|
}
|
||||||
|
else if (i == 2)
|
||||||
|
{ // end
|
||||||
t.en = atol(q);
|
t.en = atol(q);
|
||||||
if (t.en < 0) break;
|
if (t.en < 0)
|
||||||
} else if (i == 4) { // BED score
|
break;
|
||||||
|
}
|
||||||
|
else if (i == 4)
|
||||||
|
{ // BED score
|
||||||
t.score = atol(q);
|
t.score = atol(q);
|
||||||
} else if (i == 5) { // strand
|
}
|
||||||
t.strand = *q == '+'? 1 : *q == '-'? -1 : 0;
|
else if (i == 5)
|
||||||
} else if (i == 9) {
|
{ // strand
|
||||||
if (!isdigit(*q)) break;
|
t.strand = *q == '+' ? 1 : *q == '-' ? -1
|
||||||
|
: 0;
|
||||||
|
}
|
||||||
|
else if (i == 9)
|
||||||
|
{
|
||||||
|
if (!isdigit(*q))
|
||||||
|
break;
|
||||||
n_blk = atol(q);
|
n_blk = atol(q);
|
||||||
} else if (i == 10) {
|
}
|
||||||
|
else if (i == 10)
|
||||||
|
{
|
||||||
bl = q;
|
bl = q;
|
||||||
} else if (i == 11) {
|
}
|
||||||
|
else if (i == 11)
|
||||||
|
{
|
||||||
bs = q;
|
bs = q;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (c == 0) break;
|
if (c == 0)
|
||||||
|
break;
|
||||||
++i, q = p + 1;
|
++i, q = p + 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (id < 0 || t.st < 0 || t.st >= t.en) continue;
|
if (id < 0 || t.st < 0 || t.st >= t.en)
|
||||||
|
continue;
|
||||||
r = &I[id];
|
r = &I[id];
|
||||||
if (i >= 11 && read_junc) { // BED12
|
if (i >= 11 && read_junc)
|
||||||
|
{ // BED12
|
||||||
int32_t st, sz, en;
|
int32_t st, sz, en;
|
||||||
st = strtol(bs, &bs, 10); ++bs;
|
st = strtol(bs, &bs, 10);
|
||||||
sz = strtol(bl, &bl, 10); ++bl;
|
++bs;
|
||||||
|
sz = strtol(bl, &bl, 10);
|
||||||
|
++bl;
|
||||||
en = t.st + st + sz;
|
en = t.st + st + sz;
|
||||||
for (i = 1; i < n_blk; ++i) {
|
for (i = 1; i < n_blk; ++i)
|
||||||
|
{
|
||||||
mm_idx_intv1_t s = t;
|
mm_idx_intv1_t s = t;
|
||||||
if (r->n == r->m) {
|
if (r->n == r->m)
|
||||||
|
{
|
||||||
r->m = r->m ? r->m + (r->m >> 1) : 16;
|
r->m = r->m ? r->m + (r->m >> 1) : 16;
|
||||||
r->a = (mm_idx_intv1_t *)realloc(r->a, sizeof(*r->a) * r->m);
|
r->a = (mm_idx_intv1_t *)realloc(r->a, sizeof(*r->a) * r->m);
|
||||||
}
|
}
|
||||||
st = strtol(bs, &bs, 10); ++bs;
|
st = strtol(bs, &bs, 10);
|
||||||
sz = strtol(bl, &bl, 10); ++bl;
|
++bs;
|
||||||
|
sz = strtol(bl, &bl, 10);
|
||||||
|
++bl;
|
||||||
s.st = en, s.en = t.st + st;
|
s.st = en, s.en = t.st + st;
|
||||||
en = t.st + st + sz;
|
en = t.st + st + sz;
|
||||||
if (s.en > s.st) r->a[r->n++] = s;
|
if (s.en > s.st)
|
||||||
|
r->a[r->n++] = s;
|
||||||
}
|
}
|
||||||
} else {
|
}
|
||||||
if (r->n == r->m) {
|
else
|
||||||
|
{
|
||||||
|
if (r->n == r->m)
|
||||||
|
{
|
||||||
r->m = r->m ? r->m + (r->m >> 1) : 16;
|
r->m = r->m ? r->m + (r->m >> 1) : 16;
|
||||||
r->a = (mm_idx_intv1_t *)realloc(r->a, sizeof(*r->a) * r->m);
|
r->a = (mm_idx_intv1_t *)realloc(r->a, sizeof(*r->a) * r->m);
|
||||||
}
|
}
|
||||||
|
|
@ -752,9 +925,11 @@ mm_idx_intv_t *mm_idx_read_bed(const mm_idx_t *mi, const char *fn, int read_junc
|
||||||
int mm_idx_bed_read(mm_idx_t *mi, const char *fn, int read_junc)
|
int mm_idx_bed_read(mm_idx_t *mi, const char *fn, int read_junc)
|
||||||
{
|
{
|
||||||
int32_t i;
|
int32_t i;
|
||||||
if (mi->h == 0) mm_idx_index_name(mi);
|
if (mi->h == 0)
|
||||||
|
mm_idx_index_name(mi);
|
||||||
mi->I = mm_idx_read_bed(mi, fn, read_junc);
|
mi->I = mm_idx_read_bed(mi, fn, read_junc);
|
||||||
if (mi->I == 0) return -1;
|
if (mi->I == 0)
|
||||||
|
return -1;
|
||||||
for (i = 0; i < mi->n_seq; ++i) // TODO: eliminate redundant intervals
|
for (i = 0; i < mi->n_seq; ++i) // TODO: eliminate redundant intervals
|
||||||
radix_sort_bed(mi->I[i].a, mi->I[i].a + mi->I[i].n);
|
radix_sort_bed(mi->I[i].a, mi->I[i].a + mi->I[i].n);
|
||||||
return 0;
|
return 0;
|
||||||
|
|
@ -765,19 +940,28 @@ int mm_idx_bed_junc(const mm_idx_t *mi, int32_t ctg, int32_t st, int32_t en, uin
|
||||||
int32_t i, left, right;
|
int32_t i, left, right;
|
||||||
mm_idx_intv_t *r;
|
mm_idx_intv_t *r;
|
||||||
memset(s, 0, en - st);
|
memset(s, 0, en - st);
|
||||||
if (mi->I == 0 || ctg < 0 || ctg >= mi->n_seq) return -1;
|
if (mi->I == 0 || ctg < 0 || ctg >= mi->n_seq)
|
||||||
|
return -1;
|
||||||
r = &mi->I[ctg];
|
r = &mi->I[ctg];
|
||||||
left = 0, right = r->n;
|
left = 0, right = r->n;
|
||||||
while (right > left) {
|
while (right > left)
|
||||||
|
{
|
||||||
int32_t mid = left + ((right - left) >> 1);
|
int32_t mid = left + ((right - left) >> 1);
|
||||||
if (r->a[mid].st >= st) right = mid;
|
if (r->a[mid].st >= st)
|
||||||
else left = mid + 1;
|
right = mid;
|
||||||
|
else
|
||||||
|
left = mid + 1;
|
||||||
}
|
}
|
||||||
for (i = left; i < r->n; ++i) {
|
for (i = left; i < r->n; ++i)
|
||||||
if (st <= r->a[i].st && en >= r->a[i].en && r->a[i].strand != 0) {
|
{
|
||||||
if (r->a[i].strand > 0) {
|
if (st <= r->a[i].st && en >= r->a[i].en && r->a[i].strand != 0)
|
||||||
|
{
|
||||||
|
if (r->a[i].strand > 0)
|
||||||
|
{
|
||||||
s[r->a[i].st - st] |= 1, s[r->a[i].en - 1 - st] |= 2;
|
s[r->a[i].st - st] |= 1, s[r->a[i].en - 1 - st] |= 2;
|
||||||
} else {
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
s[r->a[i].st - st] |= 8, s[r->a[i].en - 1 - st] |= 4;
|
s[r->a[i].st - st] |= 8, s[r->a[i].en - 1 - st] |= 4;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
10
lchain.c
10
lchain.c
|
|
@ -5,7 +5,7 @@
|
||||||
#include "mmpriv.h"
|
#include "mmpriv.h"
|
||||||
#include "kalloc.h"
|
#include "kalloc.h"
|
||||||
#include "krmq.h"
|
#include "krmq.h"
|
||||||
#ifdef ANALYSIS_PERF
|
#ifdef SHOW_PERF
|
||||||
extern int64_t get_mseconds();
|
extern int64_t get_mseconds();
|
||||||
extern int64_t time_mg_lchain_dp,
|
extern int64_t time_mg_lchain_dp,
|
||||||
time_mg_chain_backtrack;
|
time_mg_chain_backtrack;
|
||||||
|
|
@ -155,7 +155,7 @@ mm128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int
|
||||||
int32_t *f, *t, *v, n_u, n_v, mmax_f = 0, max_drop = bw;
|
int32_t *f, *t, *v, n_u, n_v, mmax_f = 0, max_drop = bw;
|
||||||
int64_t *p, i, j, max_ii, st = 0, n_iter = 0;
|
int64_t *p, i, j, max_ii, st = 0, n_iter = 0;
|
||||||
uint64_t *u;
|
uint64_t *u;
|
||||||
#ifdef ANALYSIS_PERF
|
#ifdef SHOW_PERF
|
||||||
int64_t tmp_cur_time = get_mseconds(), tmp_diff = 0;
|
int64_t tmp_cur_time = get_mseconds(), tmp_diff = 0;
|
||||||
#endif
|
#endif
|
||||||
if (_u) *_u = 0, *n_u_ = 0;
|
if (_u) *_u = 0, *n_u_ = 0;
|
||||||
|
|
@ -211,17 +211,17 @@ mm128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int
|
||||||
max_ii = i;
|
max_ii = i;
|
||||||
if (mmax_f < max_f) mmax_f = max_f;
|
if (mmax_f < max_f) mmax_f = max_f;
|
||||||
}
|
}
|
||||||
#ifdef ANALYSIS_PERF
|
#ifdef SHOW_PERF
|
||||||
int64_t tmp_inner_time = get_mseconds();
|
int64_t tmp_inner_time = get_mseconds();
|
||||||
#endif
|
#endif
|
||||||
u = mg_chain_backtrack(km, n, f, p, v, t, min_cnt, min_sc, max_drop, &n_u, &n_v);
|
u = mg_chain_backtrack(km, n, f, p, v, t, min_cnt, min_sc, max_drop, &n_u, &n_v);
|
||||||
#ifdef ANALYSIS_PERF
|
#ifdef SHOW_PERF
|
||||||
tmp_diff = get_mseconds() - tmp_inner_time;
|
tmp_diff = get_mseconds() - tmp_inner_time;
|
||||||
__sync_fetch_and_add(&time_mg_chain_backtrack, tmp_diff);
|
__sync_fetch_and_add(&time_mg_chain_backtrack, tmp_diff);
|
||||||
#endif
|
#endif
|
||||||
*n_u_ = n_u, *_u = u; // NB: note that u[] may not be sorted by score here
|
*n_u_ = n_u, *_u = u; // NB: note that u[] may not be sorted by score here
|
||||||
kfree(km, p); kfree(km, f); kfree(km, t);
|
kfree(km, p); kfree(km, f); kfree(km, t);
|
||||||
#ifdef ANALYSIS_PERF
|
#ifdef SHOW_PERF
|
||||||
tmp_diff = get_mseconds() - tmp_cur_time;
|
tmp_diff = get_mseconds() - tmp_cur_time;
|
||||||
__sync_fetch_and_add(&time_mg_lchain_dp, tmp_diff);
|
__sync_fetch_and_add(&time_mg_lchain_dp, tmp_diff);
|
||||||
#endif
|
#endif
|
||||||
|
|
|
||||||
10
main.c
10
main.c
|
|
@ -18,7 +18,7 @@ int64_t get_mseconds()
|
||||||
}
|
}
|
||||||
|
|
||||||
// 记录运行时间的变量
|
// 记录运行时间的变量
|
||||||
#ifdef ANALYSIS_PERF
|
#ifdef SHOW_PERF
|
||||||
|
|
||||||
int64_t time_mm_idx_reader_read,
|
int64_t time_mm_idx_reader_read,
|
||||||
time_mm_map_file_frag,
|
time_mm_map_file_frag,
|
||||||
|
|
@ -34,7 +34,8 @@ int64_t time_mm_idx_reader_read,
|
||||||
time_mg_lchain_dp = 0,
|
time_mg_lchain_dp = 0,
|
||||||
time_collect_seed_hits_heap = 0,
|
time_collect_seed_hits_heap = 0,
|
||||||
time_collect_seed_hits = 0,
|
time_collect_seed_hits = 0,
|
||||||
time_mg_chain_backtrack = 0;
|
time_mg_chain_backtrack = 0,
|
||||||
|
time_ksw_extd2_sse = 0;
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
//////////////////////////////////
|
//////////////////////////////////
|
||||||
|
|
@ -181,7 +182,7 @@ int main(int argc, char *argv[])
|
||||||
mm_realtime0 = realtime();
|
mm_realtime0 = realtime();
|
||||||
mm_set_opt(0, &ipt, &opt);
|
mm_set_opt(0, &ipt, &opt);
|
||||||
|
|
||||||
#ifdef ANALYSIS_PERF
|
#ifdef SHOW_PERF
|
||||||
|
|
||||||
time_mm_idx_reader_read = 0;
|
time_mm_idx_reader_read = 0;
|
||||||
time_mm_map_file_frag = 0;
|
time_mm_map_file_frag = 0;
|
||||||
|
|
@ -699,7 +700,7 @@ int main(int argc, char *argv[])
|
||||||
fprintf(stderr, " %s", argv[i]);
|
fprintf(stderr, " %s", argv[i]);
|
||||||
fprintf(stderr, "\n[M::%s] Real time: %.3f sec; CPU: %.3f sec; Peak RSS: %.3f GB\n", __func__, realtime() - mm_realtime0, cputime(), peakrss() / 1024.0 / 1024.0 / 1024.0);
|
fprintf(stderr, "\n[M::%s] Real time: %.3f sec; CPU: %.3f sec; Peak RSS: %.3f GB\n", __func__, realtime() - mm_realtime0, cputime(), peakrss() / 1024.0 / 1024.0 / 1024.0);
|
||||||
}
|
}
|
||||||
#ifdef ANALYSIS_PERF
|
#ifdef SHOW_PERF
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
|
|
||||||
fprintf(stderr, "time_mm_idx_reader_read: %f s\n", time_mm_idx_reader_read / 1000.0);
|
fprintf(stderr, "time_mm_idx_reader_read: %f s\n", time_mm_idx_reader_read / 1000.0);
|
||||||
|
|
@ -717,6 +718,7 @@ int main(int argc, char *argv[])
|
||||||
fprintf(stderr, "time_collect_seed_hits: %f s\n", time_collect_seed_hits / 1000.0 / n_threads);
|
fprintf(stderr, "time_collect_seed_hits: %f s\n", time_collect_seed_hits / 1000.0 / n_threads);
|
||||||
fprintf(stderr, "time_mg_lchain_dp: %f s\n", time_mg_lchain_dp / 1000.0 / n_threads);
|
fprintf(stderr, "time_mg_lchain_dp: %f s\n", time_mg_lchain_dp / 1000.0 / n_threads);
|
||||||
fprintf(stderr, "time_mg_chain_backtrack: %f s\n", time_mg_chain_backtrack / 1000.0 / n_threads);
|
fprintf(stderr, "time_mg_chain_backtrack: %f s\n", time_mg_chain_backtrack / 1000.0 / n_threads);
|
||||||
|
fprintf(stderr, "time_ksw_extd2_sse: %f s\n", time_ksw_extd2_sse / 1000.0 / n_threads);
|
||||||
|
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
#endif
|
#endif
|
||||||
|
|
|
||||||
44
map.c
44
map.c
|
|
@ -10,7 +10,7 @@
|
||||||
#include "bseq.h"
|
#include "bseq.h"
|
||||||
#include "khash.h"
|
#include "khash.h"
|
||||||
|
|
||||||
#ifdef ANALYSIS_PERF
|
#ifdef SHOW_PERF
|
||||||
extern int64_t get_mseconds();
|
extern int64_t get_mseconds();
|
||||||
extern int64_t time_mm_map_file_frag,
|
extern int64_t time_mm_map_file_frag,
|
||||||
time_map_work_for_block_1,
|
time_map_work_for_block_1,
|
||||||
|
|
@ -145,7 +145,7 @@ static mm128_t *collect_seed_hits_heap(void *km, const mm_mapopt_t *opt, int max
|
||||||
int64_t j, n_for = 0, n_rev = 0;
|
int64_t j, n_for = 0, n_rev = 0;
|
||||||
mm_seed_t *m;
|
mm_seed_t *m;
|
||||||
mm128_t *a, *heap;
|
mm128_t *a, *heap;
|
||||||
#ifdef ANALYSIS_PERF
|
#ifdef SHOW_PERF
|
||||||
int64_t tmp_cur_time = get_mseconds(), tmp_diff = 0;
|
int64_t tmp_cur_time = get_mseconds(), tmp_diff = 0;
|
||||||
#endif
|
#endif
|
||||||
m = mm_collect_matches(km, &n_m, qlen, max_occ, opt->max_max_occ, opt->occ_dist, mi, mv, n_a, rep_len, n_mini_pos, mini_pos);
|
m = mm_collect_matches(km, &n_m, qlen, max_occ, opt->max_max_occ, opt->occ_dist, mi, mv, n_a, rep_len, n_mini_pos, mini_pos);
|
||||||
|
|
@ -217,7 +217,7 @@ static mm128_t *collect_seed_hits_heap(void *km, const mm_mapopt_t *opt, int max
|
||||||
memmove(a + n_for, a + (*n_a) - n_rev, n_rev * sizeof(mm128_t));
|
memmove(a + n_for, a + (*n_a) - n_rev, n_rev * sizeof(mm128_t));
|
||||||
*n_a = n_for + n_rev;
|
*n_a = n_for + n_rev;
|
||||||
}
|
}
|
||||||
#ifdef ANALYSIS_PERF
|
#ifdef SHOW_PERF
|
||||||
tmp_diff = get_mseconds() - tmp_cur_time;
|
tmp_diff = get_mseconds() - tmp_cur_time;
|
||||||
__sync_fetch_and_add(&time_collect_seed_hits_heap, tmp_diff);
|
__sync_fetch_and_add(&time_collect_seed_hits_heap, tmp_diff);
|
||||||
#endif
|
#endif
|
||||||
|
|
@ -230,14 +230,10 @@ static mm128_t *collect_seed_hits(void *km, const mm_mapopt_t *opt, int max_occ,
|
||||||
int i, n_m;
|
int i, n_m;
|
||||||
mm_seed_t *m;
|
mm_seed_t *m;
|
||||||
mm128_t *a;
|
mm128_t *a;
|
||||||
#ifdef ANALYSIS_PERF
|
#ifdef SHOW_PERF
|
||||||
int64_t tmp_cur_time = get_mseconds(), tmp_diff = 0;
|
int64_t tmp_cur_time = get_mseconds(), tmp_diff = 0;
|
||||||
#endif
|
#endif
|
||||||
m = mm_collect_matches(km, &n_m, qlen, max_occ, opt->max_max_occ, opt->occ_dist, mi, mv, n_a, rep_len, n_mini_pos, mini_pos);
|
m = mm_collect_matches(km, &n_m, qlen, max_occ, opt->max_max_occ, opt->occ_dist, mi, mv, n_a, rep_len, n_mini_pos, mini_pos);
|
||||||
#ifdef ANALYSIS_PERF
|
|
||||||
tmp_diff = get_mseconds() - tmp_cur_time;
|
|
||||||
__sync_fetch_and_add(&time_collect_seed_hits, tmp_diff);
|
|
||||||
#endif
|
|
||||||
a = (mm128_t *)kmalloc(km, *n_a * sizeof(mm128_t));
|
a = (mm128_t *)kmalloc(km, *n_a * sizeof(mm128_t));
|
||||||
for (i = 0, *n_a = 0; i < n_m; ++i)
|
for (i = 0, *n_a = 0; i < n_m; ++i)
|
||||||
{
|
{
|
||||||
|
|
@ -276,6 +272,10 @@ static mm128_t *collect_seed_hits(void *km, const mm_mapopt_t *opt, int max_occ,
|
||||||
}
|
}
|
||||||
kfree(km, m);
|
kfree(km, m);
|
||||||
radix_sort_128x(a, a + (*n_a));
|
radix_sort_128x(a, a + (*n_a));
|
||||||
|
#ifdef SHOW_PERF
|
||||||
|
tmp_diff = get_mseconds() - tmp_cur_time;
|
||||||
|
__sync_fetch_and_add(&time_collect_seed_hits, tmp_diff);
|
||||||
|
#endif
|
||||||
return a;
|
return a;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -317,7 +317,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
|
||||||
mm_reg1_t *regs0;
|
mm_reg1_t *regs0;
|
||||||
km_stat_t kmst;
|
km_stat_t kmst;
|
||||||
float chn_pen_gap, chn_pen_skip;
|
float chn_pen_gap, chn_pen_skip;
|
||||||
#ifdef ANALYSIS_PERF
|
#ifdef SHOW_PERF
|
||||||
int64_t tmp_cur_time = get_mseconds(), tmp_diff = 0;
|
int64_t tmp_cur_time = get_mseconds(), tmp_diff = 0;
|
||||||
#endif
|
#endif
|
||||||
for (i = 0, qlen_sum = 0; i < n_segs; ++i)
|
for (i = 0, qlen_sum = 0; i < n_segs; ++i)
|
||||||
|
|
@ -333,13 +333,13 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
|
||||||
hash = __ac_Wang_hash(hash);
|
hash = __ac_Wang_hash(hash);
|
||||||
|
|
||||||
collect_minimizers(b->km, opt, mi, n_segs, qlens, seqs, &mv);
|
collect_minimizers(b->km, opt, mi, n_segs, qlens, seqs, &mv);
|
||||||
#ifdef ANALYSIS_PERF
|
#ifdef SHOW_PERF
|
||||||
tmp_diff = get_mseconds() - tmp_cur_time;
|
tmp_diff = get_mseconds() - tmp_cur_time;
|
||||||
__sync_fetch_and_add(&time_mm_map_frag_b1, tmp_diff);
|
__sync_fetch_and_add(&time_mm_map_frag_b1, tmp_diff);
|
||||||
tmp_cur_time = get_mseconds();
|
tmp_cur_time = get_mseconds();
|
||||||
#endif
|
#endif
|
||||||
if (opt->q_occ_frac > 0.0f)
|
if (opt->q_occ_frac > 0.0f)
|
||||||
mm_seed_mz_flt(b->km, &mv, opt->mid_occ, opt->q_occ_frac);
|
mm_seed_mz_flt(b->km, &mv, opt->mid_occ, opt->q_occ_frac); // 过滤掉出现次数太多的minimizer
|
||||||
if (opt->flag & MM_F_HEAP_SORT)
|
if (opt->flag & MM_F_HEAP_SORT)
|
||||||
a = collect_seed_hits_heap(b->km, opt, opt->mid_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos);
|
a = collect_seed_hits_heap(b->km, opt, opt->mid_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos);
|
||||||
else
|
else
|
||||||
|
|
@ -352,7 +352,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
|
||||||
fprintf(stderr, "SD\t%s\t%d\t%c\t%d\t%d\t%d\n", mi->seq[a[i].x << 1 >> 33].name, (int32_t)a[i].x, "+-"[a[i].x >> 63], (int32_t)a[i].y, (int32_t)(a[i].y >> 32 & 0xff),
|
fprintf(stderr, "SD\t%s\t%d\t%c\t%d\t%d\t%d\n", mi->seq[a[i].x << 1 >> 33].name, (int32_t)a[i].x, "+-"[a[i].x >> 63], (int32_t)a[i].y, (int32_t)(a[i].y >> 32 & 0xff),
|
||||||
i == 0 ? 0 : ((int32_t)a[i].y - (int32_t)a[i - 1].y) - ((int32_t)a[i].x - (int32_t)a[i - 1].x));
|
i == 0 ? 0 : ((int32_t)a[i].y - (int32_t)a[i - 1].y) - ((int32_t)a[i].x - (int32_t)a[i - 1].x));
|
||||||
}
|
}
|
||||||
#ifdef ANALYSIS_PERF
|
#ifdef SHOW_PERF
|
||||||
tmp_diff = get_mseconds() - tmp_cur_time;
|
tmp_diff = get_mseconds() - tmp_cur_time;
|
||||||
__sync_fetch_and_add(&time_mm_map_frag_b2, tmp_diff);
|
__sync_fetch_and_add(&time_mm_map_frag_b2, tmp_diff);
|
||||||
tmp_cur_time = get_mseconds();
|
tmp_cur_time = get_mseconds();
|
||||||
|
|
@ -387,7 +387,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
|
||||||
a = mg_lchain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score,
|
a = mg_lchain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score,
|
||||||
chn_pen_gap, chn_pen_skip, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km);
|
chn_pen_gap, chn_pen_skip, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km);
|
||||||
}
|
}
|
||||||
#ifdef ANALYSIS_PERF
|
#ifdef SHOW_PERF
|
||||||
tmp_diff = get_mseconds() - tmp_cur_time;
|
tmp_diff = get_mseconds() - tmp_cur_time;
|
||||||
__sync_fetch_and_add(&time_mm_map_frag_b3, tmp_diff);
|
__sync_fetch_and_add(&time_mm_map_frag_b3, tmp_diff);
|
||||||
tmp_cur_time = get_mseconds();
|
tmp_cur_time = get_mseconds();
|
||||||
|
|
@ -441,7 +441,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
|
||||||
}
|
}
|
||||||
b->frag_gap = max_chain_gap_ref;
|
b->frag_gap = max_chain_gap_ref;
|
||||||
b->rep_len = rep_len;
|
b->rep_len = rep_len;
|
||||||
#ifdef ANALYSIS_PERF
|
#ifdef SHOW_PERF
|
||||||
tmp_diff = get_mseconds() - tmp_cur_time;
|
tmp_diff = get_mseconds() - tmp_cur_time;
|
||||||
__sync_fetch_and_add(&time_mm_map_frag_b4, tmp_diff);
|
__sync_fetch_and_add(&time_mm_map_frag_b4, tmp_diff);
|
||||||
tmp_cur_time = get_mseconds();
|
tmp_cur_time = get_mseconds();
|
||||||
|
|
@ -465,7 +465,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
|
||||||
mm_est_err(mi, qlen_sum, n_regs0, regs0, a, n_mini_pos, mini_pos);
|
mm_est_err(mi, qlen_sum, n_regs0, regs0, a, n_mini_pos, mini_pos);
|
||||||
n_regs0 = mm_filter_strand_retained(n_regs0, regs0);
|
n_regs0 = mm_filter_strand_retained(n_regs0, regs0);
|
||||||
}
|
}
|
||||||
#ifdef ANALYSIS_PERF
|
#ifdef SHOW_PERF
|
||||||
tmp_diff = get_mseconds() - tmp_cur_time;
|
tmp_diff = get_mseconds() - tmp_cur_time;
|
||||||
__sync_fetch_and_add(&time_mm_map_frag_b5, tmp_diff);
|
__sync_fetch_and_add(&time_mm_map_frag_b5, tmp_diff);
|
||||||
tmp_cur_time = get_mseconds();
|
tmp_cur_time = get_mseconds();
|
||||||
|
|
@ -492,7 +492,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
|
||||||
if (n_segs == 2 && opt->pe_ori >= 0 && (opt->flag & MM_F_CIGAR))
|
if (n_segs == 2 && opt->pe_ori >= 0 && (opt->flag & MM_F_CIGAR))
|
||||||
mm_pair(b->km, max_chain_gap_ref, opt->pe_bonus, opt->a * 2 + opt->b, opt->a, qlens, n_regs, regs); // pairing
|
mm_pair(b->km, max_chain_gap_ref, opt->pe_bonus, opt->a * 2 + opt->b, opt->a, qlens, n_regs, regs); // pairing
|
||||||
}
|
}
|
||||||
#ifdef ANALYSIS_PERF
|
#ifdef SHOW_PERF
|
||||||
tmp_diff = get_mseconds() - tmp_cur_time;
|
tmp_diff = get_mseconds() - tmp_cur_time;
|
||||||
__sync_fetch_and_add(&time_mm_map_frag_b6, tmp_diff);
|
__sync_fetch_and_add(&time_mm_map_frag_b6, tmp_diff);
|
||||||
tmp_cur_time = get_mseconds();
|
tmp_cur_time = get_mseconds();
|
||||||
|
|
@ -566,7 +566,7 @@ static void worker_for(void *_data, long i, int tid) // kt_for() callback
|
||||||
fprintf(stderr, "QR\t%s\t%d\t%d\n", s->seq[off].name, tid, s->seq[off].l_seq);
|
fprintf(stderr, "QR\t%s\t%d\t%d\n", s->seq[off].name, tid, s->seq[off].l_seq);
|
||||||
t = realtime();
|
t = realtime();
|
||||||
}
|
}
|
||||||
#ifdef ANALYSIS_PERF
|
#ifdef SHOW_PERF
|
||||||
int64_t tmp_cur_time = get_mseconds(), tmp_diff = 0;
|
int64_t tmp_cur_time = get_mseconds(), tmp_diff = 0;
|
||||||
#endif
|
#endif
|
||||||
for (j = 0; j < s->n_seg[i]; ++j)
|
for (j = 0; j < s->n_seg[i]; ++j)
|
||||||
|
|
@ -576,7 +576,7 @@ static void worker_for(void *_data, long i, int tid) // kt_for() callback
|
||||||
qlens[j] = s->seq[off + j].l_seq;
|
qlens[j] = s->seq[off + j].l_seq;
|
||||||
qseqs[j] = s->seq[off + j].seq;
|
qseqs[j] = s->seq[off + j].seq;
|
||||||
}
|
}
|
||||||
#ifdef ANALYSIS_PERF
|
#ifdef SHOW_PERF
|
||||||
tmp_diff = get_mseconds() - tmp_cur_time;
|
tmp_diff = get_mseconds() - tmp_cur_time;
|
||||||
__sync_fetch_and_add(&time_map_work_for_block_1, tmp_diff);
|
__sync_fetch_and_add(&time_map_work_for_block_1, tmp_diff);
|
||||||
tmp_cur_time = get_mseconds();
|
tmp_cur_time = get_mseconds();
|
||||||
|
|
@ -599,7 +599,7 @@ static void worker_for(void *_data, long i, int tid) // kt_for() callback
|
||||||
s->frag_gap[off + j] = b->frag_gap;
|
s->frag_gap[off + j] = b->frag_gap;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#ifdef ANALYSIS_PERF
|
#ifdef SHOW_PERF
|
||||||
tmp_diff = get_mseconds() - tmp_cur_time;
|
tmp_diff = get_mseconds() - tmp_cur_time;
|
||||||
__sync_fetch_and_add(&time_map_work_for_block_2, tmp_diff);
|
__sync_fetch_and_add(&time_map_work_for_block_2, tmp_diff);
|
||||||
tmp_cur_time = get_mseconds();
|
tmp_cur_time = get_mseconds();
|
||||||
|
|
@ -618,7 +618,7 @@ static void worker_for(void *_data, long i, int tid) // kt_for() callback
|
||||||
r->rev = !r->rev;
|
r->rev = !r->rev;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#ifdef ANALYSIS_PERF
|
#ifdef SHOW_PERF
|
||||||
tmp_diff = get_mseconds() - tmp_cur_time;
|
tmp_diff = get_mseconds() - tmp_cur_time;
|
||||||
__sync_fetch_and_add(&time_map_work_for_block_3, tmp_diff);
|
__sync_fetch_and_add(&time_map_work_for_block_3, tmp_diff);
|
||||||
#endif
|
#endif
|
||||||
|
|
@ -851,7 +851,7 @@ static mm_bseq_file_t **open_bseqs(int n, const char **fn)
|
||||||
|
|
||||||
int mm_map_file_frag(const mm_idx_t *idx, int n_segs, const char **fn, const mm_mapopt_t *opt, int n_threads)
|
int mm_map_file_frag(const mm_idx_t *idx, int n_segs, const char **fn, const mm_mapopt_t *opt, int n_threads)
|
||||||
{
|
{
|
||||||
#ifdef ANALYSIS_PERF
|
#ifdef SHOW_PERF
|
||||||
int64_t tmp_cur_time = get_mseconds();
|
int64_t tmp_cur_time = get_mseconds();
|
||||||
#endif
|
#endif
|
||||||
int i, pl_threads;
|
int i, pl_threads;
|
||||||
|
|
@ -878,7 +878,7 @@ int mm_map_file_frag(const mm_idx_t *idx, int n_segs, const char **fn, const mm_
|
||||||
for (i = 0; i < pl.n_fp; ++i)
|
for (i = 0; i < pl.n_fp; ++i)
|
||||||
mm_bseq_close(pl.fp[i]);
|
mm_bseq_close(pl.fp[i]);
|
||||||
free(pl.fp);
|
free(pl.fp);
|
||||||
#ifdef ANALYSIS_PERF
|
#ifdef SHOW_PERF
|
||||||
time_mm_map_file_frag += get_mseconds() - tmp_cur_time;
|
time_mm_map_file_frag += get_mseconds() - tmp_cur_time;
|
||||||
#endif
|
#endif
|
||||||
return 0;
|
return 0;
|
||||||
|
|
|
||||||
43
minimap.h
43
minimap.h
|
|
@ -8,7 +8,7 @@
|
||||||
#define MM_VERSION "2.26-r1175"
|
#define MM_VERSION "2.26-r1175"
|
||||||
|
|
||||||
// 用来开关调试性能分析,运行时间等信息
|
// 用来开关调试性能分析,运行时间等信息
|
||||||
#define ANALYSIS_PERF 1
|
#define SHOW_PERF 1
|
||||||
|
|
||||||
#define MM_F_NO_DIAG (0x001LL) // no exact diagonal hit
|
#define MM_F_NO_DIAG (0x001LL) // no exact diagonal hit
|
||||||
#define MM_F_NO_DUAL (0x002LL) // skip pairs where query name is lexicographically larger than target name
|
#define MM_F_NO_DUAL (0x002LL) // skip pairs where query name is lexicographically larger than target name
|
||||||
|
|
@ -69,22 +69,32 @@
|
||||||
#define MM_CIGAR_STR "MIDNSHP=XB"
|
#define MM_CIGAR_STR "MIDNSHP=XB"
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C"
|
||||||
|
{
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// emulate 128-bit integers and arrays
|
// emulate 128-bit integers and arrays
|
||||||
typedef struct { uint64_t x, y; } mm128_t;
|
typedef struct
|
||||||
typedef struct { size_t n, m; mm128_t *a; } mm128_v;
|
{
|
||||||
|
uint64_t x, y;
|
||||||
|
} mm128_t;
|
||||||
|
typedef struct
|
||||||
|
{
|
||||||
|
size_t n, m;
|
||||||
|
mm128_t *a;
|
||||||
|
} mm128_v;
|
||||||
|
|
||||||
// minimap2 index
|
// minimap2 index
|
||||||
typedef struct {
|
typedef struct
|
||||||
|
{
|
||||||
char *name; // name of the db sequence
|
char *name; // name of the db sequence
|
||||||
uint64_t offset; // offset in mm_idx_t::S
|
uint64_t offset; // offset in mm_idx_t::S
|
||||||
uint32_t len; // length
|
uint32_t len; // length
|
||||||
uint32_t is_alt;
|
uint32_t is_alt;
|
||||||
} mm_idx_seq_t;
|
} mm_idx_seq_t;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct
|
||||||
|
{
|
||||||
int32_t b, w, k, flag;
|
int32_t b, w, k, flag;
|
||||||
uint32_t n_seq; // number of reference sequences
|
uint32_t n_seq; // number of reference sequences
|
||||||
int32_t index;
|
int32_t index;
|
||||||
|
|
@ -97,7 +107,8 @@ typedef struct {
|
||||||
} mm_idx_t;
|
} mm_idx_t;
|
||||||
|
|
||||||
// minimap2 alignment
|
// minimap2 alignment
|
||||||
typedef struct {
|
typedef struct
|
||||||
|
{
|
||||||
uint32_t capacity; // the capacity of cigar[]
|
uint32_t capacity; // the capacity of cigar[]
|
||||||
int32_t dp_score, dp_max, dp_max2; // DP score; score of the max-scoring segment; score of the best alternate mappings
|
int32_t dp_score, dp_max, dp_max2; // DP score; score of the max-scoring segment; score of the best alternate mappings
|
||||||
uint32_t n_ambi : 30, trans_strand : 2; // number of ambiguous bases; transcript strand: 0 for unknown, 1 for +, 2 for -
|
uint32_t n_ambi : 30, trans_strand : 2; // number of ambiguous bases; transcript strand: 0 for unknown, 1 for +, 2 for -
|
||||||
|
|
@ -105,7 +116,8 @@ typedef struct {
|
||||||
uint32_t cigar[];
|
uint32_t cigar[];
|
||||||
} mm_extra_t;
|
} mm_extra_t;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct
|
||||||
|
{
|
||||||
int32_t id; // ID for internal uses (see also parent below)
|
int32_t id; // ID for internal uses (see also parent below)
|
||||||
int32_t cnt; // number of minimizers; if on the reverse strand
|
int32_t cnt; // number of minimizers; if on the reverse strand
|
||||||
int32_t rid; // reference index; if this is an alignment from inversion rescue
|
int32_t rid; // reference index; if this is an alignment from inversion rescue
|
||||||
|
|
@ -123,13 +135,15 @@ typedef struct {
|
||||||
} mm_reg1_t;
|
} mm_reg1_t;
|
||||||
|
|
||||||
// indexing and mapping options
|
// indexing and mapping options
|
||||||
typedef struct {
|
typedef struct
|
||||||
|
{
|
||||||
short k, w, flag, bucket_bits;
|
short k, w, flag, bucket_bits;
|
||||||
int64_t mini_batch_size;
|
int64_t mini_batch_size;
|
||||||
uint64_t batch_size;
|
uint64_t batch_size;
|
||||||
} mm_idxopt_t;
|
} mm_idxopt_t;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct
|
||||||
|
{
|
||||||
int64_t flag; // see MM_F_* macros
|
int64_t flag; // see MM_F_* macros
|
||||||
int seed;
|
int seed;
|
||||||
int sdust_thres; // score threshold for SDUST; 0 to disable
|
int sdust_thres; // score threshold for SDUST; 0 to disable
|
||||||
|
|
@ -185,19 +199,22 @@ typedef struct {
|
||||||
} mm_mapopt_t;
|
} mm_mapopt_t;
|
||||||
|
|
||||||
// index reader
|
// index reader
|
||||||
typedef struct {
|
typedef struct
|
||||||
|
{
|
||||||
int is_idx, n_parts;
|
int is_idx, n_parts;
|
||||||
int64_t idx_size;
|
int64_t idx_size;
|
||||||
mm_idxopt_t opt;
|
mm_idxopt_t opt;
|
||||||
FILE *fp_out;
|
FILE *fp_out;
|
||||||
union {
|
union
|
||||||
|
{
|
||||||
struct mm_bseq_file_s *seq;
|
struct mm_bseq_file_s *seq;
|
||||||
FILE *idx;
|
FILE *idx;
|
||||||
} fp;
|
} fp;
|
||||||
} mm_idx_reader_t;
|
} mm_idx_reader_t;
|
||||||
|
|
||||||
// memory buffer for thread-local storage during mapping
|
// memory buffer for thread-local storage during mapping
|
||||||
struct mm_tbuf_s {
|
struct mm_tbuf_s
|
||||||
|
{
|
||||||
void *km;
|
void *km;
|
||||||
int rep_len, frag_gap;
|
int rep_len, frag_gap;
|
||||||
};
|
};
|
||||||
|
|
|
||||||
84
seed.c
84
seed.c
|
|
@ -6,13 +6,16 @@ void mm_seed_mz_flt(void *km, mm128_v *mv, int32_t q_occ_max, float q_occ_frac)
|
||||||
{
|
{
|
||||||
mm128_t *a;
|
mm128_t *a;
|
||||||
size_t i, j, st;
|
size_t i, j, st;
|
||||||
if (mv->n <= q_occ_max || q_occ_frac <= 0.0f || q_occ_max <= 0) return;
|
if (mv->n <= q_occ_max || q_occ_frac <= 0.0f || q_occ_max <= 0)
|
||||||
|
return;
|
||||||
a = Kmalloc(km, mm128_t, mv->n);
|
a = Kmalloc(km, mm128_t, mv->n);
|
||||||
for (i = 0; i < mv->n; ++i)
|
for (i = 0; i < mv->n; ++i)
|
||||||
a[i].x = mv->a[i].x, a[i].y = i;
|
a[i].x = mv->a[i].x, a[i].y = i;
|
||||||
radix_sort_128x(a, a + mv->n);
|
radix_sort_128x(a, a + mv->n);
|
||||||
for (st = 0, i = 1; i <= mv->n; ++i) {
|
for (st = 0, i = 1; i <= mv->n; ++i)
|
||||||
if (i == mv->n || a[i].x != a[st].x) {
|
{
|
||||||
|
if (i == mv->n || a[i].x != a[st].x)
|
||||||
|
{
|
||||||
int32_t cnt = i - st;
|
int32_t cnt = i - st;
|
||||||
if (cnt > q_occ_max && cnt > mv->n * q_occ_frac)
|
if (cnt > q_occ_max && cnt > mv->n * q_occ_frac)
|
||||||
for (j = st; j < i; ++j)
|
for (j = st; j < i; ++j)
|
||||||
|
|
@ -32,20 +35,24 @@ mm_seed_t *mm_seed_collect_all(void *km, const mm_idx_t *mi, const mm128_v *mv,
|
||||||
mm_seed_t *m;
|
mm_seed_t *m;
|
||||||
size_t i;
|
size_t i;
|
||||||
int32_t k;
|
int32_t k;
|
||||||
m = (mm_seed_t*)kmalloc(km, mv->n * sizeof(mm_seed_t));
|
m = (mm_seed_t *)kmalloc(km, mv->n * sizeof(mm_seed_t)); // 为每一个minimizer开辟一个mm_seed_t
|
||||||
for (i = k = 0; i < mv->n; ++i) {
|
for (i = k = 0; i < mv->n; ++i)
|
||||||
|
{
|
||||||
const uint64_t *cr;
|
const uint64_t *cr;
|
||||||
mm_seed_t *q;
|
mm_seed_t *q;
|
||||||
mm128_t *p = &mv->a[i];
|
mm128_t *p = &mv->a[i];
|
||||||
uint32_t q_pos = (uint32_t)p->y, q_span = p->x & 0xff;
|
uint32_t q_pos = (uint32_t)p->y, q_span = p->x & 0xff;
|
||||||
int t;
|
int t; // t表示hash值的低32位,表示啥?
|
||||||
cr = mm_idx_get(mi, p->x>>8, &t);
|
cr = mm_idx_get(mi, p->x >> 8, &t); // cr是hash值的高32位,代表位置
|
||||||
if (t == 0) continue;
|
if (t == 0)
|
||||||
|
continue;
|
||||||
q = &m[k++];
|
q = &m[k++];
|
||||||
q->q_pos = q_pos, q->q_span = q_span, q->cr = cr, q->n = t, q->seg_id = p->y >> 32;
|
q->q_pos = q_pos, q->q_span = q_span, q->cr = cr, q->n = t, q->seg_id = p->y >> 32;
|
||||||
q->is_tandem = q->flt = 0;
|
q->is_tandem = q->flt = 0;
|
||||||
if (i > 0 && p->x>>8 == mv->a[i - 1].x>>8) q->is_tandem = 1;
|
if (i > 0 && p->x >> 8 == mv->a[i - 1].x >> 8)
|
||||||
if (i < mv->n - 1 && p->x>>8 == mv->a[i + 1].x>>8) q->is_tandem = 1;
|
q->is_tandem = 1;
|
||||||
|
if (i < mv->n - 1 && p->x >> 8 == mv->a[i + 1].x >> 8)
|
||||||
|
q->is_tandem = 1;
|
||||||
}
|
}
|
||||||
*n_m_ = k;
|
*n_m_ = k;
|
||||||
return m;
|
return m;
|
||||||
|
|
@ -60,32 +67,43 @@ void mm_seed_select(int32_t n, mm_seed_t *a, int len, int max_occ, int max_max_o
|
||||||
int32_t i, last0, m;
|
int32_t i, last0, m;
|
||||||
uint64_t b[MAX_MAX_HIGH_OCC]; // this is to avoid a heap allocation
|
uint64_t b[MAX_MAX_HIGH_OCC]; // this is to avoid a heap allocation
|
||||||
|
|
||||||
if (n == 0 || n == 1) return;
|
if (n == 0 || n == 1)
|
||||||
|
return;
|
||||||
for (i = m = 0; i < n; ++i)
|
for (i = m = 0; i < n; ++i)
|
||||||
if (a[i].n > max_occ) ++m;
|
if (a[i].n > max_occ)
|
||||||
if (m == 0) return; // no high-frequency k-mers; do nothing
|
++m;
|
||||||
for (i = 0, last0 = -1; i <= n; ++i) {
|
if (m == 0)
|
||||||
if (i == n || a[i].n <= max_occ) {
|
return; // no high-frequency k-mers; do nothing
|
||||||
if (i - last0 > 1) {
|
for (i = 0, last0 = -1; i <= n; ++i)
|
||||||
|
{
|
||||||
|
if (i == n || a[i].n <= max_occ)
|
||||||
|
{
|
||||||
|
if (i - last0 > 1)
|
||||||
|
{
|
||||||
int32_t ps = last0 < 0 ? 0 : (uint32_t)a[last0].q_pos >> 1;
|
int32_t ps = last0 < 0 ? 0 : (uint32_t)a[last0].q_pos >> 1;
|
||||||
int32_t pe = i == n ? len : (uint32_t)a[i].q_pos >> 1;
|
int32_t pe = i == n ? len : (uint32_t)a[i].q_pos >> 1;
|
||||||
int32_t j, k, st = last0 + 1, en = i;
|
int32_t j, k, st = last0 + 1, en = i;
|
||||||
int32_t max_high_occ = (int32_t)((double)(pe - ps) / dist + .499);
|
int32_t max_high_occ = (int32_t)((double)(pe - ps) / dist + .499);
|
||||||
if (max_high_occ > 0) {
|
if (max_high_occ > 0)
|
||||||
|
{
|
||||||
if (max_high_occ > MAX_MAX_HIGH_OCC)
|
if (max_high_occ > MAX_MAX_HIGH_OCC)
|
||||||
max_high_occ = MAX_MAX_HIGH_OCC;
|
max_high_occ = MAX_MAX_HIGH_OCC;
|
||||||
for (j = st, k = 0; j < en && k < max_high_occ; ++j, ++k)
|
for (j = st, k = 0; j < en && k < max_high_occ; ++j, ++k)
|
||||||
b[k] = (uint64_t)a[j].n << 32 | j;
|
b[k] = (uint64_t)a[j].n << 32 | j;
|
||||||
ks_heapmake_uint64_t(k, b); // initialize the binomial heap
|
ks_heapmake_uint64_t(k, b); // initialize the binomial heap
|
||||||
for (; j < en; ++j) { // if there are more, choose top max_high_occ
|
for (; j < en; ++j)
|
||||||
if (a[j].n < (int32_t)(b[0]>>32)) { // then update the heap
|
{ // if there are more, choose top max_high_occ
|
||||||
|
if (a[j].n < (int32_t)(b[0] >> 32))
|
||||||
|
{ // then update the heap
|
||||||
b[0] = (uint64_t)a[j].n << 32 | j;
|
b[0] = (uint64_t)a[j].n << 32 | j;
|
||||||
ks_heapdown_uint64_t(0, k, b);
|
ks_heapdown_uint64_t(0, k, b);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (j = 0; j < k; ++j) a[(uint32_t)b[j]].flt = 1;
|
for (j = 0; j < k; ++j)
|
||||||
|
a[(uint32_t)b[j]].flt = 1;
|
||||||
}
|
}
|
||||||
for (j = st; j < en; ++j) a[j].flt ^= 1;
|
for (j = st; j < en; ++j)
|
||||||
|
a[j].flt ^= 1;
|
||||||
for (j = st; j < en; ++j)
|
for (j = st; j < en; ++j)
|
||||||
if (a[j].n > max_max_occ)
|
if (a[j].n > max_max_occ)
|
||||||
a[j].flt = 1;
|
a[j].flt = 1;
|
||||||
|
|
@ -103,23 +121,33 @@ mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int ma
|
||||||
*n_mini_pos = 0;
|
*n_mini_pos = 0;
|
||||||
*mini_pos = (uint64_t *)kmalloc(km, mv->n * sizeof(uint64_t));
|
*mini_pos = (uint64_t *)kmalloc(km, mv->n * sizeof(uint64_t));
|
||||||
m = mm_seed_collect_all(km, mi, mv, &n_m0);
|
m = mm_seed_collect_all(km, mi, mv, &n_m0);
|
||||||
if (dist > 0 && max_max_occ > max_occ) {
|
if (dist > 0 && max_max_occ > max_occ)
|
||||||
|
{
|
||||||
mm_seed_select(n_m0, m, qlen, max_occ, max_max_occ, dist);
|
mm_seed_select(n_m0, m, qlen, max_occ, max_max_occ, dist);
|
||||||
} else {
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
for (i = 0; i < n_m0; ++i)
|
for (i = 0; i < n_m0; ++i)
|
||||||
if (m[i].n > max_occ)
|
if (m[i].n > max_occ)
|
||||||
m[i].flt = 1;
|
m[i].flt = 1;
|
||||||
}
|
}
|
||||||
for (i = 0, n_m = 0, *rep_len = 0, *n_a = 0; i < n_m0; ++i) {
|
for (i = 0, n_m = 0, *rep_len = 0, *n_a = 0; i < n_m0; ++i)
|
||||||
|
{
|
||||||
mm_seed_t *q = &m[i];
|
mm_seed_t *q = &m[i];
|
||||||
// fprintf(stderr, "X\t%d\t%d\t%d\n", q->q_pos>>1, q->n, q->flt);
|
// fprintf(stderr, "X\t%d\t%d\t%d\n", q->q_pos>>1, q->n, q->flt);
|
||||||
if (q->flt) {
|
if (q->flt)
|
||||||
|
{
|
||||||
int en = (q->q_pos >> 1) + 1, st = en - q->q_span;
|
int en = (q->q_pos >> 1) + 1, st = en - q->q_span;
|
||||||
if (st > rep_en) {
|
if (st > rep_en)
|
||||||
|
{
|
||||||
*rep_len += rep_en - rep_st;
|
*rep_len += rep_en - rep_st;
|
||||||
rep_st = st, rep_en = en;
|
rep_st = st, rep_en = en;
|
||||||
} else rep_en = en;
|
}
|
||||||
} else {
|
else
|
||||||
|
rep_en = en;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
*n_a += q->n;
|
*n_a += q->n;
|
||||||
(*mini_pos)[(*n_mini_pos)++] = (uint64_t)q->q_span << 32 | q->q_pos >> 1;
|
(*mini_pos)[(*n_mini_pos)++] = (uint64_t)q->q_span << 32 | q->q_pos >> 1;
|
||||||
m[n_m++] = *q;
|
m[n_m++] = *q;
|
||||||
|
|
|
||||||
82
sketch.c
82
sketch.c
|
|
@ -22,8 +22,7 @@ unsigned char seq_nt4_table[256] = {
|
||||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
|
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
|
||||||
};
|
|
||||||
|
|
||||||
static inline uint64_t hash64(uint64_t key, uint64_t mask)
|
static inline uint64_t hash64(uint64_t key, uint64_t mask)
|
||||||
{
|
{
|
||||||
|
|
@ -37,7 +36,8 @@ static inline uint64_t hash64(uint64_t key, uint64_t mask)
|
||||||
return key;
|
return key;
|
||||||
}
|
}
|
||||||
|
|
||||||
typedef struct { // a simplified version of kdq
|
typedef struct
|
||||||
|
{ // a simplified version of kdq
|
||||||
int front, count;
|
int front, count;
|
||||||
int a[32];
|
int a[32];
|
||||||
} tiny_queue_t;
|
} tiny_queue_t;
|
||||||
|
|
@ -50,7 +50,8 @@ static inline void tq_push(tiny_queue_t *q, int x)
|
||||||
static inline int tq_shift(tiny_queue_t *q)
|
static inline int tq_shift(tiny_queue_t *q)
|
||||||
{
|
{
|
||||||
int x;
|
int x;
|
||||||
if (q->count == 0) return -1;
|
if (q->count == 0)
|
||||||
|
return -1;
|
||||||
x = q->a[q->front++];
|
x = q->a[q->front++];
|
||||||
q->front &= 0x1f;
|
q->front &= 0x1f;
|
||||||
--q->count;
|
--q->count;
|
||||||
|
|
@ -84,16 +85,20 @@ void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, i
|
||||||
assert(len > 0 && (w > 0 && w < 256) && (k > 0 && k <= 28)); // 56 bits for k-mer; could use long k-mers, but 28 enough in practice
|
assert(len > 0 && (w > 0 && w < 256) && (k > 0 && k <= 28)); // 56 bits for k-mer; could use long k-mers, but 28 enough in practice
|
||||||
memset(buf, 0xff, w * 16);
|
memset(buf, 0xff, w * 16);
|
||||||
memset(&tq, 0, sizeof(tiny_queue_t));
|
memset(&tq, 0, sizeof(tiny_queue_t));
|
||||||
kv_resize(mm128_t, km, *p, p->n + len/w);
|
kv_resize(mm128_t, km, *p, p->n + len / w); // 扩充p,将新生成len/w个minimizer
|
||||||
|
|
||||||
for (i = l = buf_pos = min_pos = 0; i < len; ++i) {
|
for (i = l = buf_pos = min_pos = 0; i < len; ++i)
|
||||||
|
{
|
||||||
int c = seq_nt4_table[(uint8_t)str[i]];
|
int c = seq_nt4_table[(uint8_t)str[i]];
|
||||||
mm128_t info = {UINT64_MAX, UINT64_MAX};
|
mm128_t info = {UINT64_MAX, UINT64_MAX};
|
||||||
if (c < 4) { // not an ambiguous base
|
if (c < 4)
|
||||||
|
{ // not an ambiguous base
|
||||||
int z;
|
int z;
|
||||||
if (is_hpc) {
|
if (is_hpc)
|
||||||
|
{
|
||||||
int skip_len = 1;
|
int skip_len = 1;
|
||||||
if (i + 1 < len && seq_nt4_table[(uint8_t)str[i + 1]] == c) {
|
if (i + 1 < len && seq_nt4_table[(uint8_t)str[i + 1]] == c)
|
||||||
|
{
|
||||||
for (skip_len = 2; i + skip_len < len; ++skip_len)
|
for (skip_len = 2; i + skip_len < len; ++skip_len)
|
||||||
if (seq_nt4_table[(uint8_t)str[i + skip_len]] != c)
|
if (seq_nt4_table[(uint8_t)str[i + skip_len]] != c)
|
||||||
break;
|
break;
|
||||||
|
|
@ -101,42 +106,63 @@ void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, i
|
||||||
}
|
}
|
||||||
tq_push(&tq, skip_len);
|
tq_push(&tq, skip_len);
|
||||||
kmer_span += skip_len;
|
kmer_span += skip_len;
|
||||||
if (tq.count > k) kmer_span -= tq_shift(&tq);
|
if (tq.count > k)
|
||||||
} else kmer_span = l + 1 < k? l + 1 : k;
|
kmer_span -= tq_shift(&tq);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
kmer_span = l + 1 < k ? l + 1 : k;
|
||||||
kmer[0] = (kmer[0] << 2 | c) & mask; // forward k-mer
|
kmer[0] = (kmer[0] << 2 | c) & mask; // forward k-mer
|
||||||
kmer[1] = (kmer[1] >> 2) | (3ULL ^ c) << shift1; // reverse k-mer
|
kmer[1] = (kmer[1] >> 2) | (3ULL ^ c) << shift1; // reverse k-mer
|
||||||
if (kmer[0] == kmer[1]) continue; // skip "symmetric k-mers" as we don't know it strand
|
if (kmer[0] == kmer[1])
|
||||||
z = kmer[0] < kmer[1]? 0 : 1; // strand // kmer的strand到底是什么意思?为什么通过比较就能确定正反?
|
continue; // skip "symmetric k-mers" as we don't know it strand
|
||||||
|
z = kmer[0] < kmer[1] ? 0 : 1; // strand // 选取小的那个kmer,kmer的strand到底是什么意思?为什么通过比较就能确定正反?
|
||||||
++l;
|
++l;
|
||||||
if (l >= k && kmer_span < 256) {
|
if (l >= k && kmer_span < 256)
|
||||||
|
{
|
||||||
info.x = hash64(kmer[z], mask) << 8 | kmer_span;
|
info.x = hash64(kmer[z], mask) << 8 | kmer_span;
|
||||||
info.y = (uint64_t)rid << 32 | (uint32_t)i << 1 | z;
|
info.y = (uint64_t)rid << 32 | (uint32_t)i << 1 | z;
|
||||||
}
|
}
|
||||||
} else l = 0, tq.count = tq.front = 0, kmer_span = 0;
|
}
|
||||||
|
else
|
||||||
|
l = 0, tq.count = tq.front = 0, kmer_span = 0;
|
||||||
buf[buf_pos] = info; // need to do this here as appropriate buf_pos and buf[buf_pos] are needed below
|
buf[buf_pos] = info; // need to do this here as appropriate buf_pos and buf[buf_pos] are needed below
|
||||||
if (l == w + k - 1 && min.x != UINT64_MAX) { // special case for the first window - because identical k-mers are not stored yet
|
if (l == w + k - 1 && min.x != UINT64_MAX)
|
||||||
|
{ // special case for the first window - because identical k-mers are not stored yet
|
||||||
for (j = buf_pos + 1; j < w; ++j)
|
for (j = buf_pos + 1; j < w; ++j)
|
||||||
if (min.x == buf[j].x && buf[j].y != min.y) kv_push(mm128_t, km, *p, buf[j]);
|
if (min.x == buf[j].x && buf[j].y != min.y)
|
||||||
|
kv_push(mm128_t, km, *p, buf[j]);
|
||||||
for (j = 0; j < buf_pos; ++j)
|
for (j = 0; j < buf_pos; ++j)
|
||||||
if (min.x == buf[j].x && buf[j].y != min.y) kv_push(mm128_t, km, *p, buf[j]);
|
if (min.x == buf[j].x && buf[j].y != min.y)
|
||||||
|
kv_push(mm128_t, km, *p, buf[j]);
|
||||||
}
|
}
|
||||||
if (info.x <= min.x) { // a new minimum; then write the old min
|
if (info.x <= min.x)
|
||||||
if (l >= w + k && min.x != UINT64_MAX) kv_push(mm128_t, km, *p, min);
|
{ // a new minimum; then write the old min
|
||||||
|
if (l >= w + k && min.x != UINT64_MAX)
|
||||||
|
kv_push(mm128_t, km, *p, min);
|
||||||
min = info, min_pos = buf_pos;
|
min = info, min_pos = buf_pos;
|
||||||
} else if (buf_pos == min_pos) { // old min has moved outside the window
|
}
|
||||||
if (l >= w + k - 1 && min.x != UINT64_MAX) kv_push(mm128_t, km, *p, min);
|
else if (buf_pos == min_pos)
|
||||||
|
{ // old min has moved outside the window
|
||||||
|
if (l >= w + k - 1 && min.x != UINT64_MAX)
|
||||||
|
kv_push(mm128_t, km, *p, min);
|
||||||
for (j = buf_pos + 1, min.x = UINT64_MAX; j < w; ++j) // the two loops are necessary when there are identical k-mers
|
for (j = buf_pos + 1, min.x = UINT64_MAX; j < w; ++j) // the two loops are necessary when there are identical k-mers
|
||||||
if (min.x >= buf[j].x) min = buf[j], min_pos = j; // >= is important s.t. min is always the closest k-mer
|
if (min.x >= buf[j].x)
|
||||||
|
min = buf[j], min_pos = j; // >= is important s.t. min is always the closest k-mer
|
||||||
for (j = 0; j <= buf_pos; ++j)
|
for (j = 0; j <= buf_pos; ++j)
|
||||||
if (min.x >= buf[j].x) min = buf[j], min_pos = j;
|
if (min.x >= buf[j].x)
|
||||||
if (l >= w + k - 1 && min.x != UINT64_MAX) { // write identical k-mers
|
min = buf[j], min_pos = j; // 如果有多个min相同,取离当前位置最近的
|
||||||
|
if (l >= w + k - 1 && min.x != UINT64_MAX) // 往回找相同值的kmer,放进p里
|
||||||
|
{ // write identical k-mers
|
||||||
for (j = buf_pos + 1; j < w; ++j) // these two loops make sure the output is sorted
|
for (j = buf_pos + 1; j < w; ++j) // these two loops make sure the output is sorted
|
||||||
if (min.x == buf[j].x && min.y != buf[j].y) kv_push(mm128_t, km, *p, buf[j]);
|
if (min.x == buf[j].x && min.y != buf[j].y)
|
||||||
|
kv_push(mm128_t, km, *p, buf[j]);
|
||||||
for (j = 0; j <= buf_pos; ++j)
|
for (j = 0; j <= buf_pos; ++j)
|
||||||
if (min.x == buf[j].x && min.y != buf[j].y) kv_push(mm128_t, km, *p, buf[j]);
|
if (min.x == buf[j].x && min.y != buf[j].y)
|
||||||
|
kv_push(mm128_t, km, *p, buf[j]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (++buf_pos == w) buf_pos = 0;
|
if (++buf_pos == w)
|
||||||
|
buf_pos = 0;
|
||||||
}
|
}
|
||||||
if (min.x != UINT64_MAX)
|
if (min.x != UINT64_MAX)
|
||||||
kv_push(mm128_t, km, *p, min);
|
kv_push(mm128_t, km, *p, min);
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue