From feb92d32ea16692c104fe351a579c2c1d4ada27d Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 30 Apr 2021 17:33:16 -0400 Subject: [PATCH] r1025: seed rescuring --- Makefile | 37 ++++++++++++--------- main.c | 5 +-- map.c | 39 ++-------------------- minimap.h | 2 +- misc.c | 1 + mmpriv.h | 2 ++ options.c | 2 ++ seed.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 130 insertions(+), 56 deletions(-) create mode 100644 seed.c diff --git a/Makefile b/Makefile index 18622f5..8bb0890 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,9 @@ CFLAGS= -g -Wall -O2 -Wc++-compat #-Wextra CPPFLAGS= -DHAVE_KALLOC INCLUDES= -OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o chain.o align.o hit.o map.o format.o pe.o esterr.o splitidx.o ksw2_ll_sse.o +OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o \ + chain.o align.o hit.o seed.o map.o format.o pe.o esterr.o splitidx.o \ + ksw2_ll_sse.o PROG= minimap2 PROG_EXTRA= sdust minimap2-lite LIBS= -lm -lz -lpthread @@ -103,26 +105,29 @@ depend: # DO NOT DELETE -align.o: minimap.h mmpriv.h bseq.h ksw2.h kalloc.h +align.o: minimap.h mmpriv.h bseq.h kseq.h ksw2.h kalloc.h bseq.o: bseq.h kvec.h kalloc.h kseq.h -chain.o: minimap.h mmpriv.h bseq.h kalloc.h -esterr.o: mmpriv.h minimap.h bseq.h +chain.o: minimap.h mmpriv.h bseq.h kseq.h kalloc.h +esterr.o: mmpriv.h minimap.h bseq.h kseq.h example.o: minimap.h kseq.h -format.o: kalloc.h mmpriv.h minimap.h bseq.h -hit.o: mmpriv.h minimap.h bseq.h kalloc.h khash.h -index.o: kthread.h bseq.h minimap.h mmpriv.h kvec.h kalloc.h khash.h +format.o: kalloc.h mmpriv.h minimap.h bseq.h kseq.h +hit.o: mmpriv.h minimap.h bseq.h kseq.h kalloc.h khash.h +index.o: kthread.h bseq.h minimap.h mmpriv.h kseq.h kvec.h kalloc.h khash.h +index.o: ksort.h kalloc.o: kalloc.h ksw2_extd2_sse.o: ksw2.h kalloc.h ksw2_exts2_sse.o: ksw2.h kalloc.h ksw2_extz2_sse.o: ksw2.h kalloc.h ksw2_ll_sse.o: ksw2.h kalloc.h kthread.o: kthread.h -main.o: bseq.h minimap.h mmpriv.h ketopt.h -map.o: kthread.h kvec.h kalloc.h sdust.h mmpriv.h minimap.h bseq.h khash.h -map.o: ksort.h -misc.o: mmpriv.h minimap.h bseq.h ksort.h -options.o: mmpriv.h minimap.h bseq.h -pe.o: mmpriv.h minimap.h bseq.h kvec.h kalloc.h ksort.h -sdust.o: kalloc.h kdq.h kvec.h ketopt.h sdust.h -sketch.o: kvec.h kalloc.h mmpriv.h minimap.h bseq.h -splitidx.o: mmpriv.h minimap.h bseq.h +main.o: bseq.h minimap.h mmpriv.h kseq.h ketopt.h +map.o: kthread.h kvec.h kalloc.h sdust.h mmpriv.h minimap.h bseq.h kseq.h +map.o: khash.h ksort.h +misc.o: mmpriv.h minimap.h bseq.h kseq.h ksort.h +options.o: mmpriv.h minimap.h bseq.h kseq.h +pe.o: mmpriv.h minimap.h bseq.h kseq.h kvec.h kalloc.h ksort.h +sdust.o: kalloc.h kdq.h kvec.h sdust.h +seed.o: mmpriv.h minimap.h bseq.h kseq.h kalloc.h +self-chain.o: minimap.h kseq.h +sketch.o: kvec.h kalloc.h mmpriv.h minimap.h bseq.h kseq.h +splitidx.o: mmpriv.h minimap.h bseq.h kseq.h diff --git a/main.c b/main.c index 395c0d9..0e95bbc 100644 --- a/main.c +++ b/main.c @@ -7,7 +7,7 @@ #include "mmpriv.h" #include "ketopt.h" -#define MM_VERSION "2.18-r1015" +#define MM_VERSION "2.18-r1025-dirty" #ifdef __linux__ #include @@ -108,7 +108,7 @@ static inline void yes_or_no(mm_mapopt_t *opt, int flag, int long_idx, const cha int main(int argc, char *argv[]) { - const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:"; + const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:"; ketopt_t o = KETOPT_INIT; mm_mapopt_t opt; mm_idxopt_t ipt; @@ -171,6 +171,7 @@ int main(int argc, char *argv[]) else if (c == 'C') opt.noncan = atoi(o.arg); else if (c == 'I') ipt.batch_size = mm_parse_num(o.arg); else if (c == 'K') opt.mini_batch_size = mm_parse_num(o.arg); + else if (c == 'e') opt.occ_dist = mm_parse_num(o.arg); else if (c == 'R') rg = o.arg; else if (c == 'h') fp_help = stdout; else if (c == '2') opt.flag |= MM_F_2_IO_THREADS; diff --git a/map.c b/map.c index 89f7818..b4b07d1 100644 --- a/map.c +++ b/map.c @@ -80,41 +80,6 @@ static void collect_minimizers(void *km, const mm_mapopt_t *opt, const mm_idx_t #define heap_lt(a, b) ((a).x > (b).x) KSORT_INIT(heap, mm128_t, heap_lt) -static mm_seed_t *collect_matches(void *km, int *_n_m, int max_occ, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos) -{ - int rep_st = 0, rep_en = 0, n_m; - size_t i; - mm_seed_t *m; - *n_mini_pos = 0; - *mini_pos = (uint64_t*)kmalloc(km, mv->n * sizeof(uint64_t)); - m = (mm_seed_t*)kmalloc(km, mv->n * sizeof(mm_seed_t)); - for (i = 0, n_m = 0, *rep_len = 0, *n_a = 0; i < mv->n; ++i) { - const uint64_t *cr; - mm128_t *p = &mv->a[i]; - uint32_t q_pos = (uint32_t)p->y, q_span = p->x & 0xff; - int t; - cr = mm_idx_get(mi, p->x>>8, &t); - if (t >= max_occ) { - int en = (q_pos >> 1) + 1, st = en - q_span; - if (st > rep_en) { - *rep_len += rep_en - rep_st; - rep_st = st, rep_en = en; - } else rep_en = en; - } else { - mm_seed_t *q = &m[n_m++]; - q->q_pos = q_pos, q->q_span = q_span, q->cr = cr, q->n = t, q->seg_id = p->y >> 32; - q->is_tandem = 0; - if (i > 0 && p->x>>8 == mv->a[i - 1].x>>8) q->is_tandem = 1; - if (i < mv->n - 1 && p->x>>8 == mv->a[i + 1].x>>8) q->is_tandem = 1; - *n_a += q->n; - (*mini_pos)[(*n_mini_pos)++] = (uint64_t)q_span<<32 | q_pos>>1; - } - } - *rep_len += rep_en - rep_st; - *_n_m = n_m; - return m; -} - static inline int skip_seed(int flag, uint64_t r, const mm_seed_t *q, const char *qname, int qlen, const mm_idx_t *mi, int *is_self) { *is_self = 0; @@ -147,7 +112,7 @@ static mm128_t *collect_seed_hits_heap(void *km, const mm_mapopt_t *opt, int max mm_seed_t *m; mm128_t *a, *heap; - m = collect_matches(km, &n_m, max_occ, mi, mv, n_a, rep_len, n_mini_pos, mini_pos); + m = mm_collect_matches(km, &n_m, qlen, max_occ, opt->max_max_occ, opt->occ_dist, mi, mv, n_a, rep_len, n_mini_pos, mini_pos); heap = (mm128_t*)kmalloc(km, n_m * sizeof(mm128_t)); a = (mm128_t*)kmalloc(km, *n_a * sizeof(mm128_t)); @@ -211,7 +176,7 @@ static mm128_t *collect_seed_hits(void *km, const mm_mapopt_t *opt, int max_occ, int i, n_m; mm_seed_t *m; mm128_t *a; - m = collect_matches(km, &n_m, max_occ, mi, mv, n_a, rep_len, n_mini_pos, mini_pos); + m = mm_collect_matches(km, &n_m, qlen, max_occ, opt->max_max_occ, opt->occ_dist, mi, mv, n_a, rep_len, n_mini_pos, mini_pos); a = (mm128_t*)kmalloc(km, *n_a * sizeof(mm128_t)); for (i = 0, *n_a = 0; i < n_m; ++i) { mm_seed_t *q = &m[i]; diff --git a/minimap.h b/minimap.h index a00869e..9b7b4ae 100644 --- a/minimap.h +++ b/minimap.h @@ -148,7 +148,7 @@ typedef struct { float mid_occ_frac; // only used by mm_mapopt_update(); see below int32_t min_mid_occ; int32_t mid_occ; // ignore seeds with occurrences above this threshold - int32_t max_occ; + int32_t max_occ, max_max_occ, occ_dist; int64_t mini_batch_size; // size of a batch of query bases to process in parallel int64_t max_sw_mat; diff --git a/misc.c b/misc.c index ddfa9de..f3a29f8 100644 --- a/misc.c +++ b/misc.c @@ -159,3 +159,4 @@ KRADIX_SORT_INIT(128x, mm128_t, sort_key_128x, 8) KRADIX_SORT_INIT(64, uint64_t, sort_key_64, 8) KSORT_INIT_GENERIC(uint32_t) +KSORT_INIT_GENERIC(uint64_t) diff --git a/mmpriv.h b/mmpriv.h index 7583001..d2d8641 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -60,6 +60,8 @@ uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk); void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p); +mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int max_max_occ, int dist, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos); + int mm_write_sam_hdr(const mm_idx_t *mi, const char *rg, const char *ver, int argc, char *argv[]); void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag); void mm_write_paf3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag, int rep_len); diff --git a/options.c b/options.c index d4eafaa..7b87f1c 100644 --- a/options.c +++ b/options.c @@ -26,6 +26,8 @@ void mm_mapopt_init(mm_mapopt_t *opt) opt->max_chain_skip = 25; opt->max_chain_iter = 5000; opt->chain_gap_scale = 1.0f; + opt->max_max_occ = 5000; + opt->occ_dist = 0; opt->mask_level = 0.5f; opt->mask_len = INT_MAX; diff --git a/seed.c b/seed.c new file mode 100644 index 0000000..27f7ffe --- /dev/null +++ b/seed.c @@ -0,0 +1,98 @@ +#include "mmpriv.h" +#include "kalloc.h" + +mm_seed_t *mm_seed_collect_all(void *km, const mm_idx_t *mi, const mm128_v *mv) +{ + mm_seed_t *m; + size_t i; + m = (mm_seed_t*)kmalloc(km, mv->n * sizeof(mm_seed_t)); + for (i = 0; i < mv->n; ++i) { + const uint64_t *cr; + mm_seed_t *q = &m[i]; + mm128_t *p = &mv->a[i]; + uint32_t q_pos = (uint32_t)p->y, q_span = p->x & 0xff; + int t; + cr = mm_idx_get(mi, p->x>>8, &t); + q->q_pos = q_pos, q->q_span = q_span, q->cr = cr, q->n = t, q->seg_id = p->y >> 32; + q->is_tandem = q->flt = 0; + if (i > 0 && p->x>>8 == mv->a[i - 1].x>>8) q->is_tandem = 1; + if (i < mv->n - 1 && p->x>>8 == mv->a[i + 1].x>>8) q->is_tandem = 1; + } + return m; +} + +#define MAX_MAX_HIGH_OCC 128 + +void mm_seed_select(int32_t n, mm_seed_t *a, int len, int max_occ, int max_max_occ, int dist) +{ // for high-occ minimizers, choose up to max_high_occ in each high-occ streak + extern void ks_heapdown_uint64_t(size_t i, size_t n, uint64_t*); + extern void ks_heapmake_uint64_t(size_t n, uint64_t*); + int32_t i, last0, m; + uint64_t b[MAX_MAX_HIGH_OCC]; // this is to avoid a heap allocation + + if (n == 0 || n == 1) return; + for (i = m = 0; i < n; ++i) + if (a[i].n > max_occ) ++m; + if (m == 0) return; // no high-frequency k-mers; do nothing + for (i = 0, last0 = -1; i <= n; ++i) { + if (i == n || a[i].n <= max_occ) { + if (i - last0 > 1) { + int32_t ps = last0 < 0? 0 : (uint32_t)a[last0].q_pos; + int32_t pe = i == n? len : (uint32_t)a[i].q_pos; + int32_t j, k, st = last0 + 1, en = i; + int32_t max_high_occ = (int32_t)((double)(pe - ps) / dist + .499); + if (max_high_occ > MAX_MAX_HIGH_OCC) + max_high_occ = MAX_MAX_HIGH_OCC; + for (j = st, k = 0; j < en && k < max_high_occ; ++j, ++k) + b[k] = (uint64_t)a[j].n<<32 | j; + ks_heapmake_uint64_t(k, b); // initialize the binomial heap + for (; j < en; ++j) { // if there are more, choose top max_high_occ + if (a[j].n < (uint32_t)b[0]) { // then update the heap + b[0] = (b[0]&0xFFFFFFFF00000000ULL) | j; + ks_heapdown_uint64_t(0, k, b); + } + } + for (j = 0; j < k; ++j) a[(uint32_t)b[j]].flt = 1; + for (j = st; j < en; ++j) a[j].flt ^= 1; + for (j = st; j < en; ++j) + if (a[j].n > max_max_occ) + a[j].flt = 1; + } + last0 = i; + } + } +} + +mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int max_max_occ, int dist, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos) +{ + int rep_st = 0, rep_en = 0, n_m; + size_t i; + mm_seed_t *m; + *n_mini_pos = 0; + *mini_pos = (uint64_t*)kmalloc(km, mv->n * sizeof(uint64_t)); + m = mm_seed_collect_all(km, mi, mv); + if (dist > 0 && max_max_occ > max_occ) { + mm_seed_select(mv->n, m, qlen, max_occ, max_max_occ, dist); + } else { + for (i = 0; i < mv->n; ++i) + if (m[i].n > max_occ) + m[i].flt = 1; + } + for (i = 0, n_m = 0, *rep_len = 0, *n_a = 0; i < mv->n; ++i) { + mm_seed_t *q = &m[i]; + if (q->flt) { + int en = (q->q_pos >> 1) + 1, st = en - q->q_span; + if (st > rep_en) { + *rep_len += rep_en - rep_st; + rep_st = st, rep_en = en; + } else rep_en = en; + } else { + *n_a += q->n; + (*mini_pos)[(*n_mini_pos)++] = (uint64_t)q->q_span<<32 | q->q_pos>>1; + m[n_m++] = *q; + } + } + *rep_len += rep_en - rep_st; + *_n_m = n_m; + return m; +}