r1025: seed rescuring

This commit is contained in:
Heng Li 2021-04-30 17:33:16 -04:00
parent cdbd96be0c
commit feb92d32ea
8 changed files with 130 additions and 56 deletions

View File

@ -1,7 +1,9 @@
CFLAGS= -g -Wall -O2 -Wc++-compat #-Wextra
CPPFLAGS= -DHAVE_KALLOC
INCLUDES=
OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o chain.o align.o hit.o map.o format.o pe.o esterr.o splitidx.o ksw2_ll_sse.o
OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o \
chain.o align.o hit.o seed.o map.o format.o pe.o esterr.o splitidx.o \
ksw2_ll_sse.o
PROG= minimap2
PROG_EXTRA= sdust minimap2-lite
LIBS= -lm -lz -lpthread
@ -103,26 +105,29 @@ depend:
# DO NOT DELETE
align.o: minimap.h mmpriv.h bseq.h ksw2.h kalloc.h
align.o: minimap.h mmpriv.h bseq.h kseq.h ksw2.h kalloc.h
bseq.o: bseq.h kvec.h kalloc.h kseq.h
chain.o: minimap.h mmpriv.h bseq.h kalloc.h
esterr.o: mmpriv.h minimap.h bseq.h
chain.o: minimap.h mmpriv.h bseq.h kseq.h kalloc.h
esterr.o: mmpriv.h minimap.h bseq.h kseq.h
example.o: minimap.h kseq.h
format.o: kalloc.h mmpriv.h minimap.h bseq.h
hit.o: mmpriv.h minimap.h bseq.h kalloc.h khash.h
index.o: kthread.h bseq.h minimap.h mmpriv.h kvec.h kalloc.h khash.h
format.o: kalloc.h mmpriv.h minimap.h bseq.h kseq.h
hit.o: mmpriv.h minimap.h bseq.h kseq.h kalloc.h khash.h
index.o: kthread.h bseq.h minimap.h mmpriv.h kseq.h kvec.h kalloc.h khash.h
index.o: ksort.h
kalloc.o: kalloc.h
ksw2_extd2_sse.o: ksw2.h kalloc.h
ksw2_exts2_sse.o: ksw2.h kalloc.h
ksw2_extz2_sse.o: ksw2.h kalloc.h
ksw2_ll_sse.o: ksw2.h kalloc.h
kthread.o: kthread.h
main.o: bseq.h minimap.h mmpriv.h ketopt.h
map.o: kthread.h kvec.h kalloc.h sdust.h mmpriv.h minimap.h bseq.h khash.h
map.o: ksort.h
misc.o: mmpriv.h minimap.h bseq.h ksort.h
options.o: mmpriv.h minimap.h bseq.h
pe.o: mmpriv.h minimap.h bseq.h kvec.h kalloc.h ksort.h
sdust.o: kalloc.h kdq.h kvec.h ketopt.h sdust.h
sketch.o: kvec.h kalloc.h mmpriv.h minimap.h bseq.h
splitidx.o: mmpriv.h minimap.h bseq.h
main.o: bseq.h minimap.h mmpriv.h kseq.h ketopt.h
map.o: kthread.h kvec.h kalloc.h sdust.h mmpriv.h minimap.h bseq.h kseq.h
map.o: khash.h ksort.h
misc.o: mmpriv.h minimap.h bseq.h kseq.h ksort.h
options.o: mmpriv.h minimap.h bseq.h kseq.h
pe.o: mmpriv.h minimap.h bseq.h kseq.h kvec.h kalloc.h ksort.h
sdust.o: kalloc.h kdq.h kvec.h sdust.h
seed.o: mmpriv.h minimap.h bseq.h kseq.h kalloc.h
self-chain.o: minimap.h kseq.h
sketch.o: kvec.h kalloc.h mmpriv.h minimap.h bseq.h kseq.h
splitidx.o: mmpriv.h minimap.h bseq.h kseq.h

5
main.c
View File

@ -7,7 +7,7 @@
#include "mmpriv.h"
#include "ketopt.h"
#define MM_VERSION "2.18-r1015"
#define MM_VERSION "2.18-r1025-dirty"
#ifdef __linux__
#include <sys/resource.h>
@ -108,7 +108,7 @@ static inline void yes_or_no(mm_mapopt_t *opt, int flag, int long_idx, const cha
int main(int argc, char *argv[])
{
const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:";
const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:";
ketopt_t o = KETOPT_INIT;
mm_mapopt_t opt;
mm_idxopt_t ipt;
@ -171,6 +171,7 @@ int main(int argc, char *argv[])
else if (c == 'C') opt.noncan = atoi(o.arg);
else if (c == 'I') ipt.batch_size = mm_parse_num(o.arg);
else if (c == 'K') opt.mini_batch_size = mm_parse_num(o.arg);
else if (c == 'e') opt.occ_dist = mm_parse_num(o.arg);
else if (c == 'R') rg = o.arg;
else if (c == 'h') fp_help = stdout;
else if (c == '2') opt.flag |= MM_F_2_IO_THREADS;

39
map.c
View File

@ -80,41 +80,6 @@ static void collect_minimizers(void *km, const mm_mapopt_t *opt, const mm_idx_t
#define heap_lt(a, b) ((a).x > (b).x)
KSORT_INIT(heap, mm128_t, heap_lt)
static mm_seed_t *collect_matches(void *km, int *_n_m, int max_occ, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos)
{
int rep_st = 0, rep_en = 0, n_m;
size_t i;
mm_seed_t *m;
*n_mini_pos = 0;
*mini_pos = (uint64_t*)kmalloc(km, mv->n * sizeof(uint64_t));
m = (mm_seed_t*)kmalloc(km, mv->n * sizeof(mm_seed_t));
for (i = 0, n_m = 0, *rep_len = 0, *n_a = 0; i < mv->n; ++i) {
const uint64_t *cr;
mm128_t *p = &mv->a[i];
uint32_t q_pos = (uint32_t)p->y, q_span = p->x & 0xff;
int t;
cr = mm_idx_get(mi, p->x>>8, &t);
if (t >= max_occ) {
int en = (q_pos >> 1) + 1, st = en - q_span;
if (st > rep_en) {
*rep_len += rep_en - rep_st;
rep_st = st, rep_en = en;
} else rep_en = en;
} else {
mm_seed_t *q = &m[n_m++];
q->q_pos = q_pos, q->q_span = q_span, q->cr = cr, q->n = t, q->seg_id = p->y >> 32;
q->is_tandem = 0;
if (i > 0 && p->x>>8 == mv->a[i - 1].x>>8) q->is_tandem = 1;
if (i < mv->n - 1 && p->x>>8 == mv->a[i + 1].x>>8) q->is_tandem = 1;
*n_a += q->n;
(*mini_pos)[(*n_mini_pos)++] = (uint64_t)q_span<<32 | q_pos>>1;
}
}
*rep_len += rep_en - rep_st;
*_n_m = n_m;
return m;
}
static inline int skip_seed(int flag, uint64_t r, const mm_seed_t *q, const char *qname, int qlen, const mm_idx_t *mi, int *is_self)
{
*is_self = 0;
@ -147,7 +112,7 @@ static mm128_t *collect_seed_hits_heap(void *km, const mm_mapopt_t *opt, int max
mm_seed_t *m;
mm128_t *a, *heap;
m = collect_matches(km, &n_m, max_occ, mi, mv, n_a, rep_len, n_mini_pos, mini_pos);
m = mm_collect_matches(km, &n_m, qlen, max_occ, opt->max_max_occ, opt->occ_dist, mi, mv, n_a, rep_len, n_mini_pos, mini_pos);
heap = (mm128_t*)kmalloc(km, n_m * sizeof(mm128_t));
a = (mm128_t*)kmalloc(km, *n_a * sizeof(mm128_t));
@ -211,7 +176,7 @@ static mm128_t *collect_seed_hits(void *km, const mm_mapopt_t *opt, int max_occ,
int i, n_m;
mm_seed_t *m;
mm128_t *a;
m = collect_matches(km, &n_m, max_occ, mi, mv, n_a, rep_len, n_mini_pos, mini_pos);
m = mm_collect_matches(km, &n_m, qlen, max_occ, opt->max_max_occ, opt->occ_dist, mi, mv, n_a, rep_len, n_mini_pos, mini_pos);
a = (mm128_t*)kmalloc(km, *n_a * sizeof(mm128_t));
for (i = 0, *n_a = 0; i < n_m; ++i) {
mm_seed_t *q = &m[i];

View File

@ -148,7 +148,7 @@ typedef struct {
float mid_occ_frac; // only used by mm_mapopt_update(); see below
int32_t min_mid_occ;
int32_t mid_occ; // ignore seeds with occurrences above this threshold
int32_t max_occ;
int32_t max_occ, max_max_occ, occ_dist;
int64_t mini_batch_size; // size of a batch of query bases to process in parallel
int64_t max_sw_mat;

1
misc.c
View File

@ -159,3 +159,4 @@ KRADIX_SORT_INIT(128x, mm128_t, sort_key_128x, 8)
KRADIX_SORT_INIT(64, uint64_t, sort_key_64, 8)
KSORT_INIT_GENERIC(uint32_t)
KSORT_INIT_GENERIC(uint64_t)

View File

@ -60,6 +60,8 @@ uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk);
void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p);
mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int max_max_occ, int dist, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos);
int mm_write_sam_hdr(const mm_idx_t *mi, const char *rg, const char *ver, int argc, char *argv[]);
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag);
void mm_write_paf3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag, int rep_len);

View File

@ -26,6 +26,8 @@ void mm_mapopt_init(mm_mapopt_t *opt)
opt->max_chain_skip = 25;
opt->max_chain_iter = 5000;
opt->chain_gap_scale = 1.0f;
opt->max_max_occ = 5000;
opt->occ_dist = 0;
opt->mask_level = 0.5f;
opt->mask_len = INT_MAX;

98
seed.c 100644
View File

@ -0,0 +1,98 @@
#include "mmpriv.h"
#include "kalloc.h"
mm_seed_t *mm_seed_collect_all(void *km, const mm_idx_t *mi, const mm128_v *mv)
{
mm_seed_t *m;
size_t i;
m = (mm_seed_t*)kmalloc(km, mv->n * sizeof(mm_seed_t));
for (i = 0; i < mv->n; ++i) {
const uint64_t *cr;
mm_seed_t *q = &m[i];
mm128_t *p = &mv->a[i];
uint32_t q_pos = (uint32_t)p->y, q_span = p->x & 0xff;
int t;
cr = mm_idx_get(mi, p->x>>8, &t);
q->q_pos = q_pos, q->q_span = q_span, q->cr = cr, q->n = t, q->seg_id = p->y >> 32;
q->is_tandem = q->flt = 0;
if (i > 0 && p->x>>8 == mv->a[i - 1].x>>8) q->is_tandem = 1;
if (i < mv->n - 1 && p->x>>8 == mv->a[i + 1].x>>8) q->is_tandem = 1;
}
return m;
}
#define MAX_MAX_HIGH_OCC 128
void mm_seed_select(int32_t n, mm_seed_t *a, int len, int max_occ, int max_max_occ, int dist)
{ // for high-occ minimizers, choose up to max_high_occ in each high-occ streak
extern void ks_heapdown_uint64_t(size_t i, size_t n, uint64_t*);
extern void ks_heapmake_uint64_t(size_t n, uint64_t*);
int32_t i, last0, m;
uint64_t b[MAX_MAX_HIGH_OCC]; // this is to avoid a heap allocation
if (n == 0 || n == 1) return;
for (i = m = 0; i < n; ++i)
if (a[i].n > max_occ) ++m;
if (m == 0) return; // no high-frequency k-mers; do nothing
for (i = 0, last0 = -1; i <= n; ++i) {
if (i == n || a[i].n <= max_occ) {
if (i - last0 > 1) {
int32_t ps = last0 < 0? 0 : (uint32_t)a[last0].q_pos;
int32_t pe = i == n? len : (uint32_t)a[i].q_pos;
int32_t j, k, st = last0 + 1, en = i;
int32_t max_high_occ = (int32_t)((double)(pe - ps) / dist + .499);
if (max_high_occ > MAX_MAX_HIGH_OCC)
max_high_occ = MAX_MAX_HIGH_OCC;
for (j = st, k = 0; j < en && k < max_high_occ; ++j, ++k)
b[k] = (uint64_t)a[j].n<<32 | j;
ks_heapmake_uint64_t(k, b); // initialize the binomial heap
for (; j < en; ++j) { // if there are more, choose top max_high_occ
if (a[j].n < (uint32_t)b[0]) { // then update the heap
b[0] = (b[0]&0xFFFFFFFF00000000ULL) | j;
ks_heapdown_uint64_t(0, k, b);
}
}
for (j = 0; j < k; ++j) a[(uint32_t)b[j]].flt = 1;
for (j = st; j < en; ++j) a[j].flt ^= 1;
for (j = st; j < en; ++j)
if (a[j].n > max_max_occ)
a[j].flt = 1;
}
last0 = i;
}
}
}
mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int max_max_occ, int dist, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos)
{
int rep_st = 0, rep_en = 0, n_m;
size_t i;
mm_seed_t *m;
*n_mini_pos = 0;
*mini_pos = (uint64_t*)kmalloc(km, mv->n * sizeof(uint64_t));
m = mm_seed_collect_all(km, mi, mv);
if (dist > 0 && max_max_occ > max_occ) {
mm_seed_select(mv->n, m, qlen, max_occ, max_max_occ, dist);
} else {
for (i = 0; i < mv->n; ++i)
if (m[i].n > max_occ)
m[i].flt = 1;
}
for (i = 0, n_m = 0, *rep_len = 0, *n_a = 0; i < mv->n; ++i) {
mm_seed_t *q = &m[i];
if (q->flt) {
int en = (q->q_pos >> 1) + 1, st = en - q->q_span;
if (st > rep_en) {
*rep_len += rep_en - rep_st;
rep_st = st, rep_en = en;
} else rep_en = en;
} else {
*n_a += q->n;
(*mini_pos)[(*n_mini_pos)++] = (uint64_t)q->q_span<<32 | q->q_pos>>1;
m[n_m++] = *q;
}
}
*rep_len += rep_en - rep_st;
*_n_m = n_m;
return m;
}