From f995f55610fc67f98c5a56f4e4b55378bd5b5b00 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 21 Aug 2020 11:12:50 -0400 Subject: [PATCH] added --mask-len for #659 --- hit.c | 4 ++-- main.c | 2 ++ map.c | 8 ++++---- minimap.h | 1 + minimap2.1 | 9 ++++++++- mmpriv.h | 2 +- options.c | 2 ++ python/cmappy.pxd | 1 + 8 files changed, 21 insertions(+), 8 deletions(-) diff --git a/hit.c b/hit.c index 506464b..88bbc03 100644 --- a/hit.c +++ b/hit.c @@ -122,7 +122,7 @@ void mm_split_reg(mm_reg1_t *r, mm_reg1_t *r2, int n, int qlen, mm128_t *a) r->split |= 1, r2->split |= 2; } -void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r, int sub_diff, int hard_mask_level, float alt_diff_frac) // and compute mm_reg1_t::subsc +void mm_set_parent(void *km, float mask_level, int mask_len, int n, mm_reg1_t *r, int sub_diff, int hard_mask_level, float alt_diff_frac) // and compute mm_reg1_t::subsc { int i, j, k, *w; uint64_t *cov; @@ -162,7 +162,7 @@ skip_uncov: min = ej - sj < ei - si? ej - sj : ei - si; max = ej - sj > ei - si? ej - sj : ei - si; ol = si < sj? (ei < sj? 0 : ei < ej? ei - sj : ej - sj) : (ej < si? 0 : ej < ei? 
ej - si : ei - si); // overlap length; TODO: this can be simplified - if ((float)ol / min - (float)uncov_len / max > mask_level) { + if ((float)ol / min - (float)uncov_len / max > mask_level && uncov_len <= mask_len) { // then this is a secondary hit int cnt_sub = 0, sci = ri->score; ri->parent = rp->parent; if (!rp->is_alt && ri->is_alt) sci = mm_alt_score(sci, alt_diff_frac); diff --git a/main.c b/main.c index 6780f41..9b41ff6 100644 --- a/main.c +++ b/main.c @@ -70,6 +70,7 @@ static ko_longopt_t long_options[] = { { "chain-gap-scale",ko_required_argument, 343 }, { "alt", ko_required_argument, 344 }, { "alt-drop", ko_required_argument, 345 }, + { "mask-len", ko_required_argument, 346 }, { "help", ko_no_argument, 'h' }, { "max-intron-len", ko_required_argument, 'G' }, { "version", ko_no_argument, 'V' }, @@ -217,6 +218,7 @@ int main(int argc, char *argv[]) else if (c == 343) opt.chain_gap_scale = atof(o.arg); // --chain-gap-scale else if (c == 344) alt_list = o.arg; // --alt else if (c == 345) opt.alt_drop = atof(o.arg); // --alt-drop + else if (c == 346) opt.mask_len = mm_parse_num(o.arg); // --mask-len else if (c == 314) { // --frag yes_or_no(&opt, MM_F_FRAG_MODE, o.longidx, o.arg, 1); } else if (c == 315) { // --secondary diff --git a/map.c b/map.c index 01d323c..0812783 100644 --- a/map.c +++ b/map.c @@ -249,7 +249,7 @@ static mm128_t *collect_seed_hits(void *km, const mm_mapopt_t *opt, int max_occ, static void chain_post(const mm_mapopt_t *opt, int max_chain_gap_ref, const mm_idx_t *mi, void *km, int qlen, int n_segs, const int *qlens, int *n_regs, mm_reg1_t *regs, mm128_t *a) { if (!(opt->flag & MM_F_ALL_CHAINS)) { // don't choose primary mapping(s) - mm_set_parent(km, opt->mask_level, *n_regs, regs, opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop); + mm_set_parent(km, opt->mask_level, opt->mask_len, *n_regs, regs, opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop); if (n_segs <= 1) mm_select_sub(km, opt->pri_ratio, mi->k*2, 
opt->best_n, n_regs, regs); else mm_select_sub_multi(km, opt->pri_ratio, 0.2f, 0.7f, max_chain_gap_ref, mi->k*2, opt->best_n, n_segs, qlens, n_regs, regs); if (!(opt->flag & (MM_F_SPLICE|MM_F_SR|MM_F_NO_LJOIN))) // long join not working well without primary chains @@ -262,7 +262,7 @@ static mm_reg1_t *align_regs(const mm_mapopt_t *opt, const mm_idx_t *mi, void *k if (!(opt->flag & MM_F_CIGAR)) return regs; regs = mm_align_skeleton(km, opt, mi, qlen, seq, n_regs, regs, a); // this calls mm_filter_regs() if (!(opt->flag & MM_F_ALL_CHAINS)) { // don't choose primary mapping(s) - mm_set_parent(km, opt->mask_level, *n_regs, regs, opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop); + mm_set_parent(km, opt->mask_level, opt->mask_len, *n_regs, regs, opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop); mm_select_sub(km, opt->pri_ratio, mi->k*2, opt->best_n, n_regs, regs); mm_set_sam_pri(*n_regs, regs); } @@ -365,7 +365,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char ** seg = mm_seg_gen(b->km, hash, n_segs, qlens, n_regs0, regs0, n_regs, regs, a); // split fragment chain to separate segment chains free(regs0); for (i = 0; i < n_segs; ++i) { - mm_set_parent(b->km, opt->mask_level, n_regs[i], regs[i], opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop); // update mm_reg1_t::parent + mm_set_parent(b->km, opt->mask_level, opt->mask_len, n_regs[i], regs[i], opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop); // update mm_reg1_t::parent regs[i] = align_regs(opt, mi, b->km, qlens[i], seqs[i], &n_regs[i], regs[i], seg[i].a); mm_set_mapq(b->km, n_regs[i], regs[i], opt->min_chain_score, opt->a, rep_len, is_sr); } @@ -509,7 +509,7 @@ static void merge_hits(step_t *s) } } mm_hit_sort(km, &s->n_reg[k], s->reg[k], opt->alt_drop); - mm_set_parent(km, opt->mask_level, s->n_reg[k], s->reg[k], opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop); + mm_set_parent(km, opt->mask_level, opt->mask_len, 
s->n_reg[k], s->reg[k], opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop); if (!(opt->flag & MM_F_ALL_CHAINS)) { mm_select_sub(km, opt->pri_ratio, s->p->mi->k*2, opt->best_n, &s->n_reg[k], s->reg[k]); mm_set_sam_pri(s->n_reg[k], s->reg[k]); diff --git a/minimap.h b/minimap.h index 251879b..a00869e 100644 --- a/minimap.h +++ b/minimap.h @@ -122,6 +122,7 @@ typedef struct { float chain_gap_scale; float mask_level; + int mask_len; float pri_ratio; int best_n; // top best_n chains are subjected to DP alignment diff --git a/minimap2.1 b/minimap2.1 index 88d895c..aeb1492 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -237,7 +237,14 @@ or more of the shorter chain [0.5] .B --hard-mask-level Honor option .B -M -and disable a heurstic to save unmapped subsequences. +and disable a heuristic to save unmapped subsequences. This also disables +.BR --mask-len . +.TP +.BI --mask-len \ NUM +Keep an alignment if dropping it leaves an unaligned region on query longer than +.IR NUM +[inf]. Effective without +.BR --hard-mask-level . .TP .BI --max-chain-skip \ INT A heuristics that stops chaining early [25].
Minimap2 uses dynamic programming diff --git a/mmpriv.h b/mmpriv.h index ffba6eb..1eebb5c 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -71,7 +71,7 @@ void mm_split_reg(mm_reg1_t *r, mm_reg1_t *r2, int n, int qlen, mm128_t *a); void mm_sync_regs(void *km, int n_regs, mm_reg1_t *regs); int mm_squeeze_a(void *km, int n_regs, mm_reg1_t *regs, mm128_t *a); int mm_set_sam_pri(int n, mm_reg1_t *r); -void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r, int sub_diff, int hard_mask_level, float alt_diff_frac); +void mm_set_parent(void *km, float mask_level, int mask_len, int n, mm_reg1_t *r, int sub_diff, int hard_mask_level, float alt_diff_frac); void mm_select_sub(void *km, float pri_ratio, int min_diff, int best_n, int *n_, mm_reg1_t *r); void mm_select_sub_multi(void *km, float pri_ratio, float pri1, float pri2, int max_gap_ref, int min_diff, int best_n, int n_segs, const int *qlens, int *n_, mm_reg1_t *r); void mm_filter_regs(const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs); diff --git a/options.c b/options.c index 8276b24..d4eafaa 100644 --- a/options.c +++ b/options.c @@ -1,4 +1,5 @@ #include +#include <limits.h> #include "mmpriv.h" void mm_idxopt_init(mm_idxopt_t *opt) @@ -27,6 +28,7 @@ void mm_mapopt_init(mm_mapopt_t *opt) opt->chain_gap_scale = 1.0f; opt->mask_level = 0.5f; + opt->mask_len = INT_MAX; opt->pri_ratio = 0.8f; opt->best_n = 5; diff --git a/python/cmappy.pxd b/python/cmappy.pxd index 72a19d7..459b246 100644 --- a/python/cmappy.pxd +++ b/python/cmappy.pxd @@ -22,6 +22,7 @@ cdef extern from "minimap.h": int min_chain_score float chain_gap_scale float mask_level + int mask_len float pri_ratio int best_n int max_join_long, max_join_short