r1025: seed rescuring
This commit is contained in:
parent
cdbd96be0c
commit
feb92d32ea
37
Makefile
37
Makefile
|
|
@ -1,7 +1,9 @@
|
|||
CFLAGS= -g -Wall -O2 -Wc++-compat #-Wextra
|
||||
CPPFLAGS= -DHAVE_KALLOC
|
||||
INCLUDES=
|
||||
OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o chain.o align.o hit.o map.o format.o pe.o esterr.o splitidx.o ksw2_ll_sse.o
|
||||
OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o \
|
||||
chain.o align.o hit.o seed.o map.o format.o pe.o esterr.o splitidx.o \
|
||||
ksw2_ll_sse.o
|
||||
PROG= minimap2
|
||||
PROG_EXTRA= sdust minimap2-lite
|
||||
LIBS= -lm -lz -lpthread
|
||||
|
|
@ -103,26 +105,29 @@ depend:
|
|||
|
||||
# DO NOT DELETE
|
||||
|
||||
align.o: minimap.h mmpriv.h bseq.h ksw2.h kalloc.h
|
||||
align.o: minimap.h mmpriv.h bseq.h kseq.h ksw2.h kalloc.h
|
||||
bseq.o: bseq.h kvec.h kalloc.h kseq.h
|
||||
chain.o: minimap.h mmpriv.h bseq.h kalloc.h
|
||||
esterr.o: mmpriv.h minimap.h bseq.h
|
||||
chain.o: minimap.h mmpriv.h bseq.h kseq.h kalloc.h
|
||||
esterr.o: mmpriv.h minimap.h bseq.h kseq.h
|
||||
example.o: minimap.h kseq.h
|
||||
format.o: kalloc.h mmpriv.h minimap.h bseq.h
|
||||
hit.o: mmpriv.h minimap.h bseq.h kalloc.h khash.h
|
||||
index.o: kthread.h bseq.h minimap.h mmpriv.h kvec.h kalloc.h khash.h
|
||||
format.o: kalloc.h mmpriv.h minimap.h bseq.h kseq.h
|
||||
hit.o: mmpriv.h minimap.h bseq.h kseq.h kalloc.h khash.h
|
||||
index.o: kthread.h bseq.h minimap.h mmpriv.h kseq.h kvec.h kalloc.h khash.h
|
||||
index.o: ksort.h
|
||||
kalloc.o: kalloc.h
|
||||
ksw2_extd2_sse.o: ksw2.h kalloc.h
|
||||
ksw2_exts2_sse.o: ksw2.h kalloc.h
|
||||
ksw2_extz2_sse.o: ksw2.h kalloc.h
|
||||
ksw2_ll_sse.o: ksw2.h kalloc.h
|
||||
kthread.o: kthread.h
|
||||
main.o: bseq.h minimap.h mmpriv.h ketopt.h
|
||||
map.o: kthread.h kvec.h kalloc.h sdust.h mmpriv.h minimap.h bseq.h khash.h
|
||||
map.o: ksort.h
|
||||
misc.o: mmpriv.h minimap.h bseq.h ksort.h
|
||||
options.o: mmpriv.h minimap.h bseq.h
|
||||
pe.o: mmpriv.h minimap.h bseq.h kvec.h kalloc.h ksort.h
|
||||
sdust.o: kalloc.h kdq.h kvec.h ketopt.h sdust.h
|
||||
sketch.o: kvec.h kalloc.h mmpriv.h minimap.h bseq.h
|
||||
splitidx.o: mmpriv.h minimap.h bseq.h
|
||||
main.o: bseq.h minimap.h mmpriv.h kseq.h ketopt.h
|
||||
map.o: kthread.h kvec.h kalloc.h sdust.h mmpriv.h minimap.h bseq.h kseq.h
|
||||
map.o: khash.h ksort.h
|
||||
misc.o: mmpriv.h minimap.h bseq.h kseq.h ksort.h
|
||||
options.o: mmpriv.h minimap.h bseq.h kseq.h
|
||||
pe.o: mmpriv.h minimap.h bseq.h kseq.h kvec.h kalloc.h ksort.h
|
||||
sdust.o: kalloc.h kdq.h kvec.h sdust.h
|
||||
seed.o: mmpriv.h minimap.h bseq.h kseq.h kalloc.h
|
||||
self-chain.o: minimap.h kseq.h
|
||||
sketch.o: kvec.h kalloc.h mmpriv.h minimap.h bseq.h kseq.h
|
||||
splitidx.o: mmpriv.h minimap.h bseq.h kseq.h
|
||||
|
|
|
|||
5
main.c
5
main.c
|
|
@ -7,7 +7,7 @@
|
|||
#include "mmpriv.h"
|
||||
#include "ketopt.h"
|
||||
|
||||
#define MM_VERSION "2.18-r1015"
|
||||
#define MM_VERSION "2.18-r1025-dirty"
|
||||
|
||||
#ifdef __linux__
|
||||
#include <sys/resource.h>
|
||||
|
|
@ -108,7 +108,7 @@ static inline void yes_or_no(mm_mapopt_t *opt, int flag, int long_idx, const cha
|
|||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:";
|
||||
const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:";
|
||||
ketopt_t o = KETOPT_INIT;
|
||||
mm_mapopt_t opt;
|
||||
mm_idxopt_t ipt;
|
||||
|
|
@ -171,6 +171,7 @@ int main(int argc, char *argv[])
|
|||
else if (c == 'C') opt.noncan = atoi(o.arg);
|
||||
else if (c == 'I') ipt.batch_size = mm_parse_num(o.arg);
|
||||
else if (c == 'K') opt.mini_batch_size = mm_parse_num(o.arg);
|
||||
else if (c == 'e') opt.occ_dist = mm_parse_num(o.arg);
|
||||
else if (c == 'R') rg = o.arg;
|
||||
else if (c == 'h') fp_help = stdout;
|
||||
else if (c == '2') opt.flag |= MM_F_2_IO_THREADS;
|
||||
|
|
|
|||
39
map.c
39
map.c
|
|
@ -80,41 +80,6 @@ static void collect_minimizers(void *km, const mm_mapopt_t *opt, const mm_idx_t
|
|||
#define heap_lt(a, b) ((a).x > (b).x)
|
||||
KSORT_INIT(heap, mm128_t, heap_lt)
|
||||
|
||||
static mm_seed_t *collect_matches(void *km, int *_n_m, int max_occ, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos)
|
||||
{
|
||||
int rep_st = 0, rep_en = 0, n_m;
|
||||
size_t i;
|
||||
mm_seed_t *m;
|
||||
*n_mini_pos = 0;
|
||||
*mini_pos = (uint64_t*)kmalloc(km, mv->n * sizeof(uint64_t));
|
||||
m = (mm_seed_t*)kmalloc(km, mv->n * sizeof(mm_seed_t));
|
||||
for (i = 0, n_m = 0, *rep_len = 0, *n_a = 0; i < mv->n; ++i) {
|
||||
const uint64_t *cr;
|
||||
mm128_t *p = &mv->a[i];
|
||||
uint32_t q_pos = (uint32_t)p->y, q_span = p->x & 0xff;
|
||||
int t;
|
||||
cr = mm_idx_get(mi, p->x>>8, &t);
|
||||
if (t >= max_occ) {
|
||||
int en = (q_pos >> 1) + 1, st = en - q_span;
|
||||
if (st > rep_en) {
|
||||
*rep_len += rep_en - rep_st;
|
||||
rep_st = st, rep_en = en;
|
||||
} else rep_en = en;
|
||||
} else {
|
||||
mm_seed_t *q = &m[n_m++];
|
||||
q->q_pos = q_pos, q->q_span = q_span, q->cr = cr, q->n = t, q->seg_id = p->y >> 32;
|
||||
q->is_tandem = 0;
|
||||
if (i > 0 && p->x>>8 == mv->a[i - 1].x>>8) q->is_tandem = 1;
|
||||
if (i < mv->n - 1 && p->x>>8 == mv->a[i + 1].x>>8) q->is_tandem = 1;
|
||||
*n_a += q->n;
|
||||
(*mini_pos)[(*n_mini_pos)++] = (uint64_t)q_span<<32 | q_pos>>1;
|
||||
}
|
||||
}
|
||||
*rep_len += rep_en - rep_st;
|
||||
*_n_m = n_m;
|
||||
return m;
|
||||
}
|
||||
|
||||
static inline int skip_seed(int flag, uint64_t r, const mm_seed_t *q, const char *qname, int qlen, const mm_idx_t *mi, int *is_self)
|
||||
{
|
||||
*is_self = 0;
|
||||
|
|
@ -147,7 +112,7 @@ static mm128_t *collect_seed_hits_heap(void *km, const mm_mapopt_t *opt, int max
|
|||
mm_seed_t *m;
|
||||
mm128_t *a, *heap;
|
||||
|
||||
m = collect_matches(km, &n_m, max_occ, mi, mv, n_a, rep_len, n_mini_pos, mini_pos);
|
||||
m = mm_collect_matches(km, &n_m, qlen, max_occ, opt->max_max_occ, opt->occ_dist, mi, mv, n_a, rep_len, n_mini_pos, mini_pos);
|
||||
|
||||
heap = (mm128_t*)kmalloc(km, n_m * sizeof(mm128_t));
|
||||
a = (mm128_t*)kmalloc(km, *n_a * sizeof(mm128_t));
|
||||
|
|
@ -211,7 +176,7 @@ static mm128_t *collect_seed_hits(void *km, const mm_mapopt_t *opt, int max_occ,
|
|||
int i, n_m;
|
||||
mm_seed_t *m;
|
||||
mm128_t *a;
|
||||
m = collect_matches(km, &n_m, max_occ, mi, mv, n_a, rep_len, n_mini_pos, mini_pos);
|
||||
m = mm_collect_matches(km, &n_m, qlen, max_occ, opt->max_max_occ, opt->occ_dist, mi, mv, n_a, rep_len, n_mini_pos, mini_pos);
|
||||
a = (mm128_t*)kmalloc(km, *n_a * sizeof(mm128_t));
|
||||
for (i = 0, *n_a = 0; i < n_m; ++i) {
|
||||
mm_seed_t *q = &m[i];
|
||||
|
|
|
|||
|
|
@ -148,7 +148,7 @@ typedef struct {
|
|||
float mid_occ_frac; // only used by mm_mapopt_update(); see below
|
||||
int32_t min_mid_occ;
|
||||
int32_t mid_occ; // ignore seeds with occurrences above this threshold
|
||||
int32_t max_occ;
|
||||
int32_t max_occ, max_max_occ, occ_dist;
|
||||
int64_t mini_batch_size; // size of a batch of query bases to process in parallel
|
||||
int64_t max_sw_mat;
|
||||
|
||||
|
|
|
|||
1
misc.c
1
misc.c
|
|
@ -159,3 +159,4 @@ KRADIX_SORT_INIT(128x, mm128_t, sort_key_128x, 8)
|
|||
KRADIX_SORT_INIT(64, uint64_t, sort_key_64, 8)
|
||||
|
||||
KSORT_INIT_GENERIC(uint32_t)
|
||||
KSORT_INIT_GENERIC(uint64_t)
|
||||
|
|
|
|||
2
mmpriv.h
2
mmpriv.h
|
|
@ -60,6 +60,8 @@ uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk);
|
|||
|
||||
void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p);
|
||||
|
||||
mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int max_max_occ, int dist, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos);
|
||||
|
||||
int mm_write_sam_hdr(const mm_idx_t *mi, const char *rg, const char *ver, int argc, char *argv[]);
|
||||
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag);
|
||||
void mm_write_paf3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag, int rep_len);
|
||||
|
|
|
|||
|
|
@ -26,6 +26,8 @@ void mm_mapopt_init(mm_mapopt_t *opt)
|
|||
opt->max_chain_skip = 25;
|
||||
opt->max_chain_iter = 5000;
|
||||
opt->chain_gap_scale = 1.0f;
|
||||
opt->max_max_occ = 5000;
|
||||
opt->occ_dist = 0;
|
||||
|
||||
opt->mask_level = 0.5f;
|
||||
opt->mask_len = INT_MAX;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,98 @@
|
|||
#include "mmpriv.h"
|
||||
#include "kalloc.h"
|
||||
|
||||
mm_seed_t *mm_seed_collect_all(void *km, const mm_idx_t *mi, const mm128_v *mv)
|
||||
{
|
||||
mm_seed_t *m;
|
||||
size_t i;
|
||||
m = (mm_seed_t*)kmalloc(km, mv->n * sizeof(mm_seed_t));
|
||||
for (i = 0; i < mv->n; ++i) {
|
||||
const uint64_t *cr;
|
||||
mm_seed_t *q = &m[i];
|
||||
mm128_t *p = &mv->a[i];
|
||||
uint32_t q_pos = (uint32_t)p->y, q_span = p->x & 0xff;
|
||||
int t;
|
||||
cr = mm_idx_get(mi, p->x>>8, &t);
|
||||
q->q_pos = q_pos, q->q_span = q_span, q->cr = cr, q->n = t, q->seg_id = p->y >> 32;
|
||||
q->is_tandem = q->flt = 0;
|
||||
if (i > 0 && p->x>>8 == mv->a[i - 1].x>>8) q->is_tandem = 1;
|
||||
if (i < mv->n - 1 && p->x>>8 == mv->a[i + 1].x>>8) q->is_tandem = 1;
|
||||
}
|
||||
return m;
|
||||
}
|
||||
|
||||
#define MAX_MAX_HIGH_OCC 128
|
||||
|
||||
void mm_seed_select(int32_t n, mm_seed_t *a, int len, int max_occ, int max_max_occ, int dist)
|
||||
{ // for high-occ minimizers, choose up to max_high_occ in each high-occ streak
|
||||
extern void ks_heapdown_uint64_t(size_t i, size_t n, uint64_t*);
|
||||
extern void ks_heapmake_uint64_t(size_t n, uint64_t*);
|
||||
int32_t i, last0, m;
|
||||
uint64_t b[MAX_MAX_HIGH_OCC]; // this is to avoid a heap allocation
|
||||
|
||||
if (n == 0 || n == 1) return;
|
||||
for (i = m = 0; i < n; ++i)
|
||||
if (a[i].n > max_occ) ++m;
|
||||
if (m == 0) return; // no high-frequency k-mers; do nothing
|
||||
for (i = 0, last0 = -1; i <= n; ++i) {
|
||||
if (i == n || a[i].n <= max_occ) {
|
||||
if (i - last0 > 1) {
|
||||
int32_t ps = last0 < 0? 0 : (uint32_t)a[last0].q_pos;
|
||||
int32_t pe = i == n? len : (uint32_t)a[i].q_pos;
|
||||
int32_t j, k, st = last0 + 1, en = i;
|
||||
int32_t max_high_occ = (int32_t)((double)(pe - ps) / dist + .499);
|
||||
if (max_high_occ > MAX_MAX_HIGH_OCC)
|
||||
max_high_occ = MAX_MAX_HIGH_OCC;
|
||||
for (j = st, k = 0; j < en && k < max_high_occ; ++j, ++k)
|
||||
b[k] = (uint64_t)a[j].n<<32 | j;
|
||||
ks_heapmake_uint64_t(k, b); // initialize the binomial heap
|
||||
for (; j < en; ++j) { // if there are more, choose top max_high_occ
|
||||
if (a[j].n < (uint32_t)b[0]) { // then update the heap
|
||||
b[0] = (b[0]&0xFFFFFFFF00000000ULL) | j;
|
||||
ks_heapdown_uint64_t(0, k, b);
|
||||
}
|
||||
}
|
||||
for (j = 0; j < k; ++j) a[(uint32_t)b[j]].flt = 1;
|
||||
for (j = st; j < en; ++j) a[j].flt ^= 1;
|
||||
for (j = st; j < en; ++j)
|
||||
if (a[j].n > max_max_occ)
|
||||
a[j].flt = 1;
|
||||
}
|
||||
last0 = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int max_max_occ, int dist, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos)
|
||||
{
|
||||
int rep_st = 0, rep_en = 0, n_m;
|
||||
size_t i;
|
||||
mm_seed_t *m;
|
||||
*n_mini_pos = 0;
|
||||
*mini_pos = (uint64_t*)kmalloc(km, mv->n * sizeof(uint64_t));
|
||||
m = mm_seed_collect_all(km, mi, mv);
|
||||
if (dist > 0 && max_max_occ > max_occ) {
|
||||
mm_seed_select(mv->n, m, qlen, max_occ, max_max_occ, dist);
|
||||
} else {
|
||||
for (i = 0; i < mv->n; ++i)
|
||||
if (m[i].n > max_occ)
|
||||
m[i].flt = 1;
|
||||
}
|
||||
for (i = 0, n_m = 0, *rep_len = 0, *n_a = 0; i < mv->n; ++i) {
|
||||
mm_seed_t *q = &m[i];
|
||||
if (q->flt) {
|
||||
int en = (q->q_pos >> 1) + 1, st = en - q->q_span;
|
||||
if (st > rep_en) {
|
||||
*rep_len += rep_en - rep_st;
|
||||
rep_st = st, rep_en = en;
|
||||
} else rep_en = en;
|
||||
} else {
|
||||
*n_a += q->n;
|
||||
(*mini_pos)[(*n_mini_pos)++] = (uint64_t)q->q_span<<32 | q->q_pos>>1;
|
||||
m[n_m++] = *q;
|
||||
}
|
||||
}
|
||||
*rep_len += rep_en - rep_st;
|
||||
*_n_m = n_m;
|
||||
return m;
|
||||
}
|
||||
Loading…
Reference in New Issue