r1105: avoid long running time occasionally (#771)
Caused by highly repetitive minimizers on a query sequence. The solution is to filter out these query minimizers.
This commit is contained in:
parent
cc14d1afdf
commit
05a8a45d44
4
main.c
4
main.c
|
|
@ -7,7 +7,7 @@
|
|||
#include "mmpriv.h"
|
||||
#include "ketopt.h"
|
||||
|
||||
#define MM_VERSION "2.22-r1101"
|
||||
#define MM_VERSION "2.22-r1105-dirty"
|
||||
|
||||
#ifdef __linux__
|
||||
#include <sys/resource.h>
|
||||
|
|
@ -74,6 +74,7 @@ static ko_longopt_t long_options[] = {
|
|||
{ "rmq", ko_optional_argument, 347 },
|
||||
{ "qstrand", ko_no_argument, 348 },
|
||||
{ "cap-kalloc", ko_required_argument, 349 },
|
||||
{ "q-occ-frac", ko_required_argument, 350 },
|
||||
{ "help", ko_no_argument, 'h' },
|
||||
{ "max-intron-len", ko_required_argument, 'G' },
|
||||
{ "version", ko_no_argument, 'V' },
|
||||
|
|
@ -229,6 +230,7 @@ int main(int argc, char *argv[])
|
|||
else if (c == 346) opt.mask_len = mm_parse_num(o.arg); // --mask-len
|
||||
else if (c == 348) opt.flag |= MM_F_QSTRAND | MM_F_NO_INV; // --qstrand
|
||||
else if (c == 349) opt.cap_kalloc = mm_parse_num(o.arg); // --cap-kalloc
|
||||
else if (c == 350) opt.q_occ_frac = atof(o.arg); // --q-occ-frac
|
||||
else if (c == 330) {
|
||||
fprintf(stderr, "[WARNING] \033[1;31m --lj-min-ratio has been deprecated.\033[0m\n");
|
||||
} else if (c == 314) { // --frag
|
||||
|
|
|
|||
1
map.c
1
map.c
|
|
@ -252,6 +252,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
|
|||
hash = __ac_Wang_hash(hash);
|
||||
|
||||
collect_minimizers(b->km, opt, mi, n_segs, qlens, seqs, &mv);
|
||||
if (opt->q_occ_frac > 0.0f) mm_seed_mz_flt(b->km, &mv, opt->mid_occ, opt->q_occ_frac);
|
||||
if (opt->flag & MM_F_HEAP_SORT) a = collect_seed_hits_heap(b->km, opt, opt->mid_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos);
|
||||
else a = collect_seed_hits(b->km, opt, opt->mid_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos);
|
||||
|
||||
|
|
|
|||
|
|
@ -163,6 +163,7 @@ typedef struct {
|
|||
int pe_ori, pe_bonus;
|
||||
|
||||
float mid_occ_frac; // only used by mm_mapopt_update(); see below
|
||||
float q_occ_frac;
|
||||
int32_t min_mid_occ, max_mid_occ;
|
||||
int32_t mid_occ; // ignore seeds with occurrences above this threshold
|
||||
int32_t max_occ, max_max_occ, occ_dist;
|
||||
|
|
|
|||
|
|
@ -151,10 +151,16 @@ Lower and upper bounds of k-mer occurrences [10,1000000]. The final k-mer occurr
|
|||
.BR -f }}.
|
||||
This option prevents excessively small or large
|
||||
.B -f
|
||||
estimated from the input reference. It deprecates
|
||||
estimated from the input reference. Available since r1034 and deprecating
|
||||
.B --min-occ-floor
|
||||
in earlier versions of minimap2.
|
||||
.TP
|
||||
.BI --q-occ-frac \ FLOAT
|
||||
Discard a query minimizer if its occurrence is higher than
|
||||
.I FLOAT
|
||||
fraction of query minimizers and than the reference occurrence threshold
|
||||
[0.02]. Set 0 to disable. Available since r1105.
|
||||
.TP
|
||||
.BI -e \ INT
|
||||
Sample a high-frequency minimizer every
|
||||
.I INT
|
||||
|
|
|
|||
1
mmpriv.h
1
mmpriv.h
|
|
@ -61,6 +61,7 @@ uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk);
|
|||
void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p);
|
||||
|
||||
mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int max_max_occ, int dist, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos);
|
||||
void mm_seed_mz_flt(void *km, mm128_v *mv, int32_t q_occ_max, float q_occ_frac);
|
||||
|
||||
double mm_event_identity(const mm_reg1_t *r);
|
||||
int mm_write_sam_hdr(const mm_idx_t *mi, const char *rg, const char *ver, int argc, char *argv[]);
|
||||
|
|
|
|||
|
|
@ -19,6 +19,7 @@ void mm_mapopt_init(mm_mapopt_t *opt)
|
|||
opt->min_mid_occ = 10;
|
||||
opt->max_mid_occ = 1000000;
|
||||
opt->sdust_thres = 0; // no SDUST masking
|
||||
opt->q_occ_frac = 0.02f;
|
||||
|
||||
opt->min_cnt = 3;
|
||||
opt->min_chain_score = 40;
|
||||
|
|
|
|||
|
|
@ -51,6 +51,7 @@ cdef extern from "minimap.h":
|
|||
int pe_ori, pe_bonus
|
||||
|
||||
float mid_occ_frac
|
||||
float q_occ_frac
|
||||
int32_t min_mid_occ
|
||||
int32_t mid_occ
|
||||
int32_t max_occ
|
||||
|
|
|
|||
25
seed.c
25
seed.c
|
|
@ -2,6 +2,31 @@
|
|||
#include "kalloc.h"
|
||||
#include "ksort.h"
|
||||
|
||||
void mm_seed_mz_flt(void *km, mm128_v *mv, int32_t q_occ_max, float q_occ_frac)
|
||||
{
|
||||
mm128_t *a;
|
||||
size_t i, j, st;
|
||||
if (mv->n <= q_occ_max || q_occ_frac <= 0.0f || q_occ_max <= 0) return;
|
||||
KMALLOC(km, a, mv->n);
|
||||
for (i = 0; i < mv->n; ++i)
|
||||
a[i].x = mv->a[i].x, a[i].y = i;
|
||||
radix_sort_128x(a, a + mv->n);
|
||||
for (st = 0, i = 1; i <= mv->n; ++i) {
|
||||
if (i == mv->n || a[i].x != a[st].x) {
|
||||
int32_t cnt = i - st;
|
||||
if (cnt > q_occ_max && cnt > mv->n * q_occ_frac)
|
||||
for (j = st; j < i; ++j)
|
||||
mv->a[a[j].y].x = 0;
|
||||
st = i;
|
||||
}
|
||||
}
|
||||
kfree(km, a);
|
||||
for (i = j = 0; i < mv->n; ++i)
|
||||
if (mv->a[i].x != 0)
|
||||
mv->a[j++] = mv->a[i];
|
||||
mv->n = j;
|
||||
}
|
||||
|
||||
mm_seed_t *mm_seed_collect_all(void *km, const mm_idx_t *mi, const mm128_v *mv, int32_t *n_m_)
|
||||
{
|
||||
mm_seed_t *m;
|
||||
|
|
|
|||
Loading…
Reference in New Issue