r1025: seed rescuring
This commit is contained in:
parent
cdbd96be0c
commit
feb92d32ea
37
Makefile
37
Makefile
|
|
@ -1,7 +1,9 @@
|
||||||
CFLAGS= -g -Wall -O2 -Wc++-compat #-Wextra
|
CFLAGS= -g -Wall -O2 -Wc++-compat #-Wextra
|
||||||
CPPFLAGS= -DHAVE_KALLOC
|
CPPFLAGS= -DHAVE_KALLOC
|
||||||
INCLUDES=
|
INCLUDES=
|
||||||
OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o chain.o align.o hit.o map.o format.o pe.o esterr.o splitidx.o ksw2_ll_sse.o
|
OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o \
|
||||||
|
chain.o align.o hit.o seed.o map.o format.o pe.o esterr.o splitidx.o \
|
||||||
|
ksw2_ll_sse.o
|
||||||
PROG= minimap2
|
PROG= minimap2
|
||||||
PROG_EXTRA= sdust minimap2-lite
|
PROG_EXTRA= sdust minimap2-lite
|
||||||
LIBS= -lm -lz -lpthread
|
LIBS= -lm -lz -lpthread
|
||||||
|
|
@ -103,26 +105,29 @@ depend:
|
||||||
|
|
||||||
# DO NOT DELETE
|
# DO NOT DELETE
|
||||||
|
|
||||||
align.o: minimap.h mmpriv.h bseq.h ksw2.h kalloc.h
|
align.o: minimap.h mmpriv.h bseq.h kseq.h ksw2.h kalloc.h
|
||||||
bseq.o: bseq.h kvec.h kalloc.h kseq.h
|
bseq.o: bseq.h kvec.h kalloc.h kseq.h
|
||||||
chain.o: minimap.h mmpriv.h bseq.h kalloc.h
|
chain.o: minimap.h mmpriv.h bseq.h kseq.h kalloc.h
|
||||||
esterr.o: mmpriv.h minimap.h bseq.h
|
esterr.o: mmpriv.h minimap.h bseq.h kseq.h
|
||||||
example.o: minimap.h kseq.h
|
example.o: minimap.h kseq.h
|
||||||
format.o: kalloc.h mmpriv.h minimap.h bseq.h
|
format.o: kalloc.h mmpriv.h minimap.h bseq.h kseq.h
|
||||||
hit.o: mmpriv.h minimap.h bseq.h kalloc.h khash.h
|
hit.o: mmpriv.h minimap.h bseq.h kseq.h kalloc.h khash.h
|
||||||
index.o: kthread.h bseq.h minimap.h mmpriv.h kvec.h kalloc.h khash.h
|
index.o: kthread.h bseq.h minimap.h mmpriv.h kseq.h kvec.h kalloc.h khash.h
|
||||||
|
index.o: ksort.h
|
||||||
kalloc.o: kalloc.h
|
kalloc.o: kalloc.h
|
||||||
ksw2_extd2_sse.o: ksw2.h kalloc.h
|
ksw2_extd2_sse.o: ksw2.h kalloc.h
|
||||||
ksw2_exts2_sse.o: ksw2.h kalloc.h
|
ksw2_exts2_sse.o: ksw2.h kalloc.h
|
||||||
ksw2_extz2_sse.o: ksw2.h kalloc.h
|
ksw2_extz2_sse.o: ksw2.h kalloc.h
|
||||||
ksw2_ll_sse.o: ksw2.h kalloc.h
|
ksw2_ll_sse.o: ksw2.h kalloc.h
|
||||||
kthread.o: kthread.h
|
kthread.o: kthread.h
|
||||||
main.o: bseq.h minimap.h mmpriv.h ketopt.h
|
main.o: bseq.h minimap.h mmpriv.h kseq.h ketopt.h
|
||||||
map.o: kthread.h kvec.h kalloc.h sdust.h mmpriv.h minimap.h bseq.h khash.h
|
map.o: kthread.h kvec.h kalloc.h sdust.h mmpriv.h minimap.h bseq.h kseq.h
|
||||||
map.o: ksort.h
|
map.o: khash.h ksort.h
|
||||||
misc.o: mmpriv.h minimap.h bseq.h ksort.h
|
misc.o: mmpriv.h minimap.h bseq.h kseq.h ksort.h
|
||||||
options.o: mmpriv.h minimap.h bseq.h
|
options.o: mmpriv.h minimap.h bseq.h kseq.h
|
||||||
pe.o: mmpriv.h minimap.h bseq.h kvec.h kalloc.h ksort.h
|
pe.o: mmpriv.h minimap.h bseq.h kseq.h kvec.h kalloc.h ksort.h
|
||||||
sdust.o: kalloc.h kdq.h kvec.h ketopt.h sdust.h
|
sdust.o: kalloc.h kdq.h kvec.h sdust.h
|
||||||
sketch.o: kvec.h kalloc.h mmpriv.h minimap.h bseq.h
|
seed.o: mmpriv.h minimap.h bseq.h kseq.h kalloc.h
|
||||||
splitidx.o: mmpriv.h minimap.h bseq.h
|
self-chain.o: minimap.h kseq.h
|
||||||
|
sketch.o: kvec.h kalloc.h mmpriv.h minimap.h bseq.h kseq.h
|
||||||
|
splitidx.o: mmpriv.h minimap.h bseq.h kseq.h
|
||||||
|
|
|
||||||
5
main.c
5
main.c
|
|
@ -7,7 +7,7 @@
|
||||||
#include "mmpriv.h"
|
#include "mmpriv.h"
|
||||||
#include "ketopt.h"
|
#include "ketopt.h"
|
||||||
|
|
||||||
#define MM_VERSION "2.18-r1015"
|
#define MM_VERSION "2.18-r1025-dirty"
|
||||||
|
|
||||||
#ifdef __linux__
|
#ifdef __linux__
|
||||||
#include <sys/resource.h>
|
#include <sys/resource.h>
|
||||||
|
|
@ -108,7 +108,7 @@ static inline void yes_or_no(mm_mapopt_t *opt, int flag, int long_idx, const cha
|
||||||
|
|
||||||
int main(int argc, char *argv[])
|
int main(int argc, char *argv[])
|
||||||
{
|
{
|
||||||
const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:";
|
const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:";
|
||||||
ketopt_t o = KETOPT_INIT;
|
ketopt_t o = KETOPT_INIT;
|
||||||
mm_mapopt_t opt;
|
mm_mapopt_t opt;
|
||||||
mm_idxopt_t ipt;
|
mm_idxopt_t ipt;
|
||||||
|
|
@ -171,6 +171,7 @@ int main(int argc, char *argv[])
|
||||||
else if (c == 'C') opt.noncan = atoi(o.arg);
|
else if (c == 'C') opt.noncan = atoi(o.arg);
|
||||||
else if (c == 'I') ipt.batch_size = mm_parse_num(o.arg);
|
else if (c == 'I') ipt.batch_size = mm_parse_num(o.arg);
|
||||||
else if (c == 'K') opt.mini_batch_size = mm_parse_num(o.arg);
|
else if (c == 'K') opt.mini_batch_size = mm_parse_num(o.arg);
|
||||||
|
else if (c == 'e') opt.occ_dist = mm_parse_num(o.arg);
|
||||||
else if (c == 'R') rg = o.arg;
|
else if (c == 'R') rg = o.arg;
|
||||||
else if (c == 'h') fp_help = stdout;
|
else if (c == 'h') fp_help = stdout;
|
||||||
else if (c == '2') opt.flag |= MM_F_2_IO_THREADS;
|
else if (c == '2') opt.flag |= MM_F_2_IO_THREADS;
|
||||||
|
|
|
||||||
39
map.c
39
map.c
|
|
@ -80,41 +80,6 @@ static void collect_minimizers(void *km, const mm_mapopt_t *opt, const mm_idx_t
|
||||||
#define heap_lt(a, b) ((a).x > (b).x)
|
#define heap_lt(a, b) ((a).x > (b).x)
|
||||||
KSORT_INIT(heap, mm128_t, heap_lt)
|
KSORT_INIT(heap, mm128_t, heap_lt)
|
||||||
|
|
||||||
static mm_seed_t *collect_matches(void *km, int *_n_m, int max_occ, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos)
|
|
||||||
{
|
|
||||||
int rep_st = 0, rep_en = 0, n_m;
|
|
||||||
size_t i;
|
|
||||||
mm_seed_t *m;
|
|
||||||
*n_mini_pos = 0;
|
|
||||||
*mini_pos = (uint64_t*)kmalloc(km, mv->n * sizeof(uint64_t));
|
|
||||||
m = (mm_seed_t*)kmalloc(km, mv->n * sizeof(mm_seed_t));
|
|
||||||
for (i = 0, n_m = 0, *rep_len = 0, *n_a = 0; i < mv->n; ++i) {
|
|
||||||
const uint64_t *cr;
|
|
||||||
mm128_t *p = &mv->a[i];
|
|
||||||
uint32_t q_pos = (uint32_t)p->y, q_span = p->x & 0xff;
|
|
||||||
int t;
|
|
||||||
cr = mm_idx_get(mi, p->x>>8, &t);
|
|
||||||
if (t >= max_occ) {
|
|
||||||
int en = (q_pos >> 1) + 1, st = en - q_span;
|
|
||||||
if (st > rep_en) {
|
|
||||||
*rep_len += rep_en - rep_st;
|
|
||||||
rep_st = st, rep_en = en;
|
|
||||||
} else rep_en = en;
|
|
||||||
} else {
|
|
||||||
mm_seed_t *q = &m[n_m++];
|
|
||||||
q->q_pos = q_pos, q->q_span = q_span, q->cr = cr, q->n = t, q->seg_id = p->y >> 32;
|
|
||||||
q->is_tandem = 0;
|
|
||||||
if (i > 0 && p->x>>8 == mv->a[i - 1].x>>8) q->is_tandem = 1;
|
|
||||||
if (i < mv->n - 1 && p->x>>8 == mv->a[i + 1].x>>8) q->is_tandem = 1;
|
|
||||||
*n_a += q->n;
|
|
||||||
(*mini_pos)[(*n_mini_pos)++] = (uint64_t)q_span<<32 | q_pos>>1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
*rep_len += rep_en - rep_st;
|
|
||||||
*_n_m = n_m;
|
|
||||||
return m;
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline int skip_seed(int flag, uint64_t r, const mm_seed_t *q, const char *qname, int qlen, const mm_idx_t *mi, int *is_self)
|
static inline int skip_seed(int flag, uint64_t r, const mm_seed_t *q, const char *qname, int qlen, const mm_idx_t *mi, int *is_self)
|
||||||
{
|
{
|
||||||
*is_self = 0;
|
*is_self = 0;
|
||||||
|
|
@ -147,7 +112,7 @@ static mm128_t *collect_seed_hits_heap(void *km, const mm_mapopt_t *opt, int max
|
||||||
mm_seed_t *m;
|
mm_seed_t *m;
|
||||||
mm128_t *a, *heap;
|
mm128_t *a, *heap;
|
||||||
|
|
||||||
m = collect_matches(km, &n_m, max_occ, mi, mv, n_a, rep_len, n_mini_pos, mini_pos);
|
m = mm_collect_matches(km, &n_m, qlen, max_occ, opt->max_max_occ, opt->occ_dist, mi, mv, n_a, rep_len, n_mini_pos, mini_pos);
|
||||||
|
|
||||||
heap = (mm128_t*)kmalloc(km, n_m * sizeof(mm128_t));
|
heap = (mm128_t*)kmalloc(km, n_m * sizeof(mm128_t));
|
||||||
a = (mm128_t*)kmalloc(km, *n_a * sizeof(mm128_t));
|
a = (mm128_t*)kmalloc(km, *n_a * sizeof(mm128_t));
|
||||||
|
|
@ -211,7 +176,7 @@ static mm128_t *collect_seed_hits(void *km, const mm_mapopt_t *opt, int max_occ,
|
||||||
int i, n_m;
|
int i, n_m;
|
||||||
mm_seed_t *m;
|
mm_seed_t *m;
|
||||||
mm128_t *a;
|
mm128_t *a;
|
||||||
m = collect_matches(km, &n_m, max_occ, mi, mv, n_a, rep_len, n_mini_pos, mini_pos);
|
m = mm_collect_matches(km, &n_m, qlen, max_occ, opt->max_max_occ, opt->occ_dist, mi, mv, n_a, rep_len, n_mini_pos, mini_pos);
|
||||||
a = (mm128_t*)kmalloc(km, *n_a * sizeof(mm128_t));
|
a = (mm128_t*)kmalloc(km, *n_a * sizeof(mm128_t));
|
||||||
for (i = 0, *n_a = 0; i < n_m; ++i) {
|
for (i = 0, *n_a = 0; i < n_m; ++i) {
|
||||||
mm_seed_t *q = &m[i];
|
mm_seed_t *q = &m[i];
|
||||||
|
|
|
||||||
|
|
@ -148,7 +148,7 @@ typedef struct {
|
||||||
float mid_occ_frac; // only used by mm_mapopt_update(); see below
|
float mid_occ_frac; // only used by mm_mapopt_update(); see below
|
||||||
int32_t min_mid_occ;
|
int32_t min_mid_occ;
|
||||||
int32_t mid_occ; // ignore seeds with occurrences above this threshold
|
int32_t mid_occ; // ignore seeds with occurrences above this threshold
|
||||||
int32_t max_occ;
|
int32_t max_occ, max_max_occ, occ_dist;
|
||||||
int64_t mini_batch_size; // size of a batch of query bases to process in parallel
|
int64_t mini_batch_size; // size of a batch of query bases to process in parallel
|
||||||
int64_t max_sw_mat;
|
int64_t max_sw_mat;
|
||||||
|
|
||||||
|
|
|
||||||
1
misc.c
1
misc.c
|
|
@ -159,3 +159,4 @@ KRADIX_SORT_INIT(128x, mm128_t, sort_key_128x, 8)
|
||||||
KRADIX_SORT_INIT(64, uint64_t, sort_key_64, 8)
|
KRADIX_SORT_INIT(64, uint64_t, sort_key_64, 8)
|
||||||
|
|
||||||
KSORT_INIT_GENERIC(uint32_t)
|
KSORT_INIT_GENERIC(uint32_t)
|
||||||
|
KSORT_INIT_GENERIC(uint64_t)
|
||||||
|
|
|
||||||
2
mmpriv.h
2
mmpriv.h
|
|
@ -60,6 +60,8 @@ uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk);
|
||||||
|
|
||||||
void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p);
|
void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p);
|
||||||
|
|
||||||
|
mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int max_max_occ, int dist, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos);
|
||||||
|
|
||||||
int mm_write_sam_hdr(const mm_idx_t *mi, const char *rg, const char *ver, int argc, char *argv[]);
|
int mm_write_sam_hdr(const mm_idx_t *mi, const char *rg, const char *ver, int argc, char *argv[]);
|
||||||
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag);
|
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag);
|
||||||
void mm_write_paf3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag, int rep_len);
|
void mm_write_paf3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag, int rep_len);
|
||||||
|
|
|
||||||
|
|
@ -26,6 +26,8 @@ void mm_mapopt_init(mm_mapopt_t *opt)
|
||||||
opt->max_chain_skip = 25;
|
opt->max_chain_skip = 25;
|
||||||
opt->max_chain_iter = 5000;
|
opt->max_chain_iter = 5000;
|
||||||
opt->chain_gap_scale = 1.0f;
|
opt->chain_gap_scale = 1.0f;
|
||||||
|
opt->max_max_occ = 5000;
|
||||||
|
opt->occ_dist = 0;
|
||||||
|
|
||||||
opt->mask_level = 0.5f;
|
opt->mask_level = 0.5f;
|
||||||
opt->mask_len = INT_MAX;
|
opt->mask_len = INT_MAX;
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,98 @@
|
||||||
|
#include "mmpriv.h"
|
||||||
|
#include "kalloc.h"
|
||||||
|
|
||||||
|
mm_seed_t *mm_seed_collect_all(void *km, const mm_idx_t *mi, const mm128_v *mv)
|
||||||
|
{
|
||||||
|
mm_seed_t *m;
|
||||||
|
size_t i;
|
||||||
|
m = (mm_seed_t*)kmalloc(km, mv->n * sizeof(mm_seed_t));
|
||||||
|
for (i = 0; i < mv->n; ++i) {
|
||||||
|
const uint64_t *cr;
|
||||||
|
mm_seed_t *q = &m[i];
|
||||||
|
mm128_t *p = &mv->a[i];
|
||||||
|
uint32_t q_pos = (uint32_t)p->y, q_span = p->x & 0xff;
|
||||||
|
int t;
|
||||||
|
cr = mm_idx_get(mi, p->x>>8, &t);
|
||||||
|
q->q_pos = q_pos, q->q_span = q_span, q->cr = cr, q->n = t, q->seg_id = p->y >> 32;
|
||||||
|
q->is_tandem = q->flt = 0;
|
||||||
|
if (i > 0 && p->x>>8 == mv->a[i - 1].x>>8) q->is_tandem = 1;
|
||||||
|
if (i < mv->n - 1 && p->x>>8 == mv->a[i + 1].x>>8) q->is_tandem = 1;
|
||||||
|
}
|
||||||
|
return m;
|
||||||
|
}
|
||||||
|
|
||||||
|
#define MAX_MAX_HIGH_OCC 128
|
||||||
|
|
||||||
|
void mm_seed_select(int32_t n, mm_seed_t *a, int len, int max_occ, int max_max_occ, int dist)
|
||||||
|
{ // for high-occ minimizers, choose up to max_high_occ in each high-occ streak
|
||||||
|
extern void ks_heapdown_uint64_t(size_t i, size_t n, uint64_t*);
|
||||||
|
extern void ks_heapmake_uint64_t(size_t n, uint64_t*);
|
||||||
|
int32_t i, last0, m;
|
||||||
|
uint64_t b[MAX_MAX_HIGH_OCC]; // this is to avoid a heap allocation
|
||||||
|
|
||||||
|
if (n == 0 || n == 1) return;
|
||||||
|
for (i = m = 0; i < n; ++i)
|
||||||
|
if (a[i].n > max_occ) ++m;
|
||||||
|
if (m == 0) return; // no high-frequency k-mers; do nothing
|
||||||
|
for (i = 0, last0 = -1; i <= n; ++i) {
|
||||||
|
if (i == n || a[i].n <= max_occ) {
|
||||||
|
if (i - last0 > 1) {
|
||||||
|
int32_t ps = last0 < 0? 0 : (uint32_t)a[last0].q_pos;
|
||||||
|
int32_t pe = i == n? len : (uint32_t)a[i].q_pos;
|
||||||
|
int32_t j, k, st = last0 + 1, en = i;
|
||||||
|
int32_t max_high_occ = (int32_t)((double)(pe - ps) / dist + .499);
|
||||||
|
if (max_high_occ > MAX_MAX_HIGH_OCC)
|
||||||
|
max_high_occ = MAX_MAX_HIGH_OCC;
|
||||||
|
for (j = st, k = 0; j < en && k < max_high_occ; ++j, ++k)
|
||||||
|
b[k] = (uint64_t)a[j].n<<32 | j;
|
||||||
|
ks_heapmake_uint64_t(k, b); // initialize the binomial heap
|
||||||
|
for (; j < en; ++j) { // if there are more, choose top max_high_occ
|
||||||
|
if (a[j].n < (uint32_t)b[0]) { // then update the heap
|
||||||
|
b[0] = (b[0]&0xFFFFFFFF00000000ULL) | j;
|
||||||
|
ks_heapdown_uint64_t(0, k, b);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (j = 0; j < k; ++j) a[(uint32_t)b[j]].flt = 1;
|
||||||
|
for (j = st; j < en; ++j) a[j].flt ^= 1;
|
||||||
|
for (j = st; j < en; ++j)
|
||||||
|
if (a[j].n > max_max_occ)
|
||||||
|
a[j].flt = 1;
|
||||||
|
}
|
||||||
|
last0 = i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int max_max_occ, int dist, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos)
|
||||||
|
{
|
||||||
|
int rep_st = 0, rep_en = 0, n_m;
|
||||||
|
size_t i;
|
||||||
|
mm_seed_t *m;
|
||||||
|
*n_mini_pos = 0;
|
||||||
|
*mini_pos = (uint64_t*)kmalloc(km, mv->n * sizeof(uint64_t));
|
||||||
|
m = mm_seed_collect_all(km, mi, mv);
|
||||||
|
if (dist > 0 && max_max_occ > max_occ) {
|
||||||
|
mm_seed_select(mv->n, m, qlen, max_occ, max_max_occ, dist);
|
||||||
|
} else {
|
||||||
|
for (i = 0; i < mv->n; ++i)
|
||||||
|
if (m[i].n > max_occ)
|
||||||
|
m[i].flt = 1;
|
||||||
|
}
|
||||||
|
for (i = 0, n_m = 0, *rep_len = 0, *n_a = 0; i < mv->n; ++i) {
|
||||||
|
mm_seed_t *q = &m[i];
|
||||||
|
if (q->flt) {
|
||||||
|
int en = (q->q_pos >> 1) + 1, st = en - q->q_span;
|
||||||
|
if (st > rep_en) {
|
||||||
|
*rep_len += rep_en - rep_st;
|
||||||
|
rep_st = st, rep_en = en;
|
||||||
|
} else rep_en = en;
|
||||||
|
} else {
|
||||||
|
*n_a += q->n;
|
||||||
|
(*mini_pos)[(*n_mini_pos)++] = (uint64_t)q->q_span<<32 | q->q_pos>>1;
|
||||||
|
m[n_m++] = *q;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
*rep_len += rep_en - rep_st;
|
||||||
|
*_n_m = n_m;
|
||||||
|
return m;
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue