From 426c2975f64c4a10049d60cca43bde989dfb1c71 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 30 Jun 2017 22:15:45 -0400 Subject: [PATCH] r126: filter by fraction of seed coverage otherwise we may get too many poor overlap mappings. --- align.c | 16 ++-------------- hit.c | 25 +++++++++++++++++++++++++ main.c | 16 +++++++++------- map.c | 5 +++-- minimap.h | 1 + mmpriv.h | 1 + 6 files changed, 41 insertions(+), 23 deletions(-) diff --git a/align.c b/align.c index 0dc6dc8..d6b7dc5 100644 --- a/align.c +++ b/align.c @@ -304,21 +304,9 @@ mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *m ++n_regs; } } + *n_regs_ = n_regs; kfree(km, qseq0[0]); kfree(km, qseq0[1]); kfree(km, ez.cigar); - - // filter - for (r = i = 0; r < n_regs; ++r) { - mm_reg1_t *reg = ®s[r]; - int flt = 0; - if (reg->cnt < opt->min_cnt) flt = 1; - else if (reg->p->blen - reg->p->n_ambi - reg->p->n_diff < opt->min_chain_score) flt = 1; - else if (reg->p->dp_max < opt->min_dp_max) flt = 1; - if (flt) free(reg->p); - else if (i < r) regs[i++] = regs[r]; // NB: this also move the regs[r].p pointer - else ++i; - } - *n_regs_ = i; - mm_sync_regs(km, i, regs); + mm_filter_regs(km, opt, n_regs_, regs); return regs; } diff --git a/hit.c b/hit.c index f367ef1..5a3b3c7 100644 --- a/hit.c +++ b/hit.c @@ -147,6 +147,31 @@ void mm_select_sub(void *km, float mask_level, float pri_ratio, int *n_, mm_reg1 } } +void mm_filter_regs(void *km, const mm_mapopt_t *opt, int *n_regs, mm_reg1_t *regs) +{ + int i, k; + for (i = k = 0; i < *n_regs; ++i) { + mm_reg1_t *r = ®s[i]; + int flt = 0; + if (r->cnt < opt->min_cnt) flt = 1; + else { + int blen = r->qe - r->qs < r->re - r->rs? r->qe - r->qs : r->re - r->rs; + if (r->score < blen * opt->min_seedcov_ratio) flt = 1; + } + if (r->p) { + if (r->p->blen - r->p->n_ambi - r->p->n_diff < opt->min_chain_score) flt = 1; + else if (r->p->dp_max < opt->min_dp_max) flt = 1; + if (flt) free(r->p); + } + if (!flt) { + if (k < i) regs[k++] = regs[i]; + else ++k; + } + } + *n_regs = k; + mm_sync_regs(km, k, regs); +} + int mm_squeeze_a(void *km, int n_regs, mm_reg1_t *regs, mm128_t *a) { // squeeze out regions in a[] that are not referenced by regs[] int i, as = 0; diff --git a/main.c b/main.c index 8355d5c..ea3d6d6 100644 --- a/main.c +++ b/main.c @@ -10,7 +10,7 @@ #include "minimap.h" #include "mmpriv.h" -#define MM_VERSION "2.0-r125-pre" +#define MM_VERSION "2.0-r126-pre" void liftrlimit() { @@ -68,7 +68,7 @@ int main(int argc, char *argv[]) mm_realtime0 = realtime(); mm_mapopt_init(&opt); - while ((c = getopt_long(argc, argv, "bw:k:t:r:f:Vv:g:I:d:ST:s:x:Hcp:M:n:z:A:B:O:E:m:", long_options, &long_idx)) >= 0) { + while ((c = getopt_long(argc, argv, "bw:k:t:r:f:Vv:g:I:d:ST:s:x:Hcp:M:n:z:A:B:O:E:m:D:", long_options, &long_idx)) >= 0) { if (c == 'w') w = atoi(optarg); else if (c == 'k') k = atoi(optarg); else if (c == 'H') is_hpc = 1; @@ -79,6 +79,7 @@ int main(int argc, char *argv[]) else if (c == 'v') mm_verbose = atoi(optarg); else if (c == 'g') opt.max_gap = atoi(optarg); else if (c == 'p') opt.pri_ratio = atof(optarg); + else if (c == 'D') opt.min_seedcov_ratio = atof(optarg); else if (c == 'M') opt.mask_level = atof(optarg); else if (c == 'c') opt.flag |= MM_F_CIGAR; else if (c == 'S') opt.flag |= MM_F_AVA | MM_F_NO_SELF; @@ -109,7 +110,7 @@ int main(int argc, char *argv[]) } else if (c == 'x') { if (strcmp(optarg, "ava10k") == 0) { opt.flag |= MM_F_AVA | MM_F_NO_SELF; - opt.min_chain_score = 100, opt.pri_ratio = 0.0f; + opt.min_chain_score = 100, opt.pri_ratio = 0.0f, opt.min_seedcov_ratio = 0.05f; is_hpc = 1, k = 19, w = 5; } else if (strcmp(optarg, "map10k") == 0) { is_hpc = 1, k = 19; @@ -140,11 +141,12 @@ int main(int argc, char *argv[]) fprintf(stderr, " -m INT minimal chaining score (matching bases minus log gap penalty) [%d]\n", opt.min_chain_score); // fprintf(stderr, " -T INT SDUST threshold; 0 to disable SDUST [%d]\n", opt.sdust_thres); // TODO: this option is never used; might be buggy fprintf(stderr, " -S skip self and dual mappings (for the all-vs-all mode)\n"); - fprintf(stderr, " -p FLOAT threshold to output a mapping [%g]\n", opt.pri_ratio); + fprintf(stderr, " -p FLOAT min secondary-to-primary score ratio [%g]\n", opt.pri_ratio); + fprintf(stderr, " -D FLOAT min fraction of seed matches [%g]\n", opt.min_seedcov_ratio); fprintf(stderr, " -x STR preset (recommended to be applied before other options) []\n"); - fprintf(stderr, " ava10k: -Hk19 -Sw5 -p0 -m100 (PacBio/ONT all-vs-all read mapping)\n"); - fprintf(stderr, " map10k: -Hk19 (PacBio/ONT vs reference mapping)\n"); - fprintf(stderr, " asm1m: -k19 -w19 (intra-species assembly to ref mapping)\n"); + fprintf(stderr, " ava10k: -Hk19 -Sw5 -p0 -m100 -D.05 (PacBio/ONT all-vs-all read mapping)\n"); + fprintf(stderr, " map10k: -Hk19 (PacBio/ONT vs reference mapping)\n"); + fprintf(stderr, " asm1m: -k19 -w19 (intra-species assembly to ref mapping)\n"); fprintf(stderr, " Alignment:\n"); fprintf(stderr, " -A INT matching score [%d]\n", opt.a); fprintf(stderr, " -B INT mismatch penalty [%d]\n", opt.b); diff --git a/map.c b/map.c index 1f4f9c9..3e0d26b 100644 --- a/map.c +++ b/map.c @@ -20,6 +20,7 @@ void mm_mapopt_init(mm_mapopt_t *opt) opt->bw = 1000; opt->max_gap = 10000; opt->max_chain_skip = 15; + opt->min_seedcov_ratio = 0.0f; opt->pri_ratio = 2.0f; opt->mask_level = 0.5f; @@ -243,12 +244,12 @@ mm_reg1_t *mm_map_frag(const mm_mapopt_t *opt, const mm_idx_t *mi, mm_tbuf_t *b, mm_join_long(b->km, opt, qlen, *n_regs, regs, a); // TODO: this can be applied to all-vs-all in principle } if (opt->flag & MM_F_CIGAR) { - regs = mm_align_skeleton(b->km, opt, mi, qlen, seq, n_regs, regs, a); + regs = mm_align_skeleton(b->km, opt, mi, qlen, seq, n_regs, regs, a); // this calls mm_filter_regs() if (!(opt->flag & MM_F_AVA)) { mm_update_parent(b->km, opt->mask_level, *n_regs, regs); mm_select_sub(b->km, opt->mask_level, opt->pri_ratio, n_regs, regs); } - } + } else mm_filter_regs(b->km, opt, n_regs, regs); mm_set_mapq(*n_regs, regs); // free diff --git a/minimap.h b/minimap.h index 1844ba9..51e99e7 100644 --- a/minimap.h +++ b/minimap.h @@ -75,6 +75,7 @@ typedef struct { int max_chain_skip; int min_cnt; int min_chain_score; + float min_seedcov_ratio; float pri_ratio; float mask_level; diff --git a/mmpriv.h b/mmpriv.h index bcbb3c7..62d088a 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -42,6 +42,7 @@ void mm_sync_regs(void *km, int n_regs, mm_reg1_t *regs); void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r); void mm_update_parent(void *km, float mask_level, int n, mm_reg1_t *r); void mm_select_sub(void *km, float mask_level, float pri_ratio, int *n_, mm_reg1_t *r); +void mm_filter_regs(void *km, const mm_mapopt_t *opt, int *n_regs, mm_reg1_t *regs); void mm_join_long(void *km, const mm_mapopt_t *opt, int qlen, int n_regs, mm_reg1_t *regs, mm128_t *a); void mm_set_mapq(int n_regs, mm_reg1_t *regs);