diff --git a/chain.c b/chain.c index 95abf5e..a2f7ac5 100644 --- a/chain.c +++ b/chain.c @@ -19,7 +19,7 @@ static inline int ilog2_32(uint32_t v) return (t = v>>8) ? 8 + LogTable256[t] : LogTable256[v]; } -mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km) +mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float gap_scale, int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km) { // TODO: make sure this works when n has more than 32 bits int32_t k, *f, *p, *t, *v, n_u, n_v; int64_t i, j, st = 0; @@ -52,7 +52,7 @@ mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int m if (i - st > max_iter) st = i - max_iter; for (j = i - 1; j >= st; --j) { int64_t dr = ri - a[j].x; - int32_t dq = qi - (int32_t)a[j].y, dd, sc, log_dd; + int32_t dq = qi - (int32_t)a[j].y, dd, sc, log_dd, gap_cost; int32_t sidj = (a[j].y & MM_SEED_SEG_MASK) >> MM_SEED_SEG_SHIFT; if ((sidi == sidj && dr == 0) || dq <= 0) continue; // don't skip if an anchor is used by multiple segments; see below if ((sidi == sidj && dq > max_dist_y) || dq > max_dist_x) continue; @@ -62,14 +62,16 @@ mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int m min_d = dq < dr? dq : dr; sc = min_d > q_span? q_span : dq < dr? dq : dr; log_dd = dd? ilog2_32(dd) : 0; + gap_cost = 0; if (is_cdna || sidi != sidj) { int c_log, c_lin; c_lin = (int)(dd * .01 * avg_qspan); c_log = log_dd; if (sidi != sidj && dr == 0) ++sc; // possibly due to overlapping paired ends; give a minor bonus - else if (dr > dq || sidi != sidj) sc -= c_lin < c_log? c_lin : c_log; - else sc -= c_lin + (c_log>>1); - } else sc -= (int)(dd * .01 * avg_qspan) + (log_dd>>1); + else if (dr > dq || sidi != sidj) gap_cost = c_lin < c_log? c_lin : c_log; + else gap_cost = c_lin + (c_log>>1); + } else gap_cost = (int)(dd * .01 * avg_qspan) + (log_dd>>1); + sc -= (int)((double)gap_cost * gap_scale + .499); sc += f[j]; if (sc > max_f) { max_f = sc, max_j = j; diff --git a/main.c b/main.c index 63730c1..d6535e7 100644 --- a/main.c +++ b/main.c @@ -7,7 +7,7 @@ #include "mmpriv.h" #include "ketopt.h" -#define MM_VERSION "2.17-r963-dirty" +#define MM_VERSION "2.17-r965-dirty" #ifdef __linux__ #include @@ -67,6 +67,7 @@ static ko_longopt_t long_options[] = { { "junc-bed", ko_required_argument, 340 }, { "junc-bonus", ko_required_argument, 341 }, { "sam-hit-only", ko_no_argument, 342 }, + { "chain-gap-scale",ko_required_argument, 343 }, { "help", ko_no_argument, 'h' }, { "max-intron-len", ko_required_argument, 'G' }, { "version", ko_no_argument, 'V' }, @@ -211,6 +212,7 @@ int main(int argc, char *argv[]) else if (c == 340) junc_bed = o.arg; // --junc-bed else if (c == 341) opt.junc_bonus = atoi(o.arg); // --junc-bonus else if (c == 342) opt.flag |= MM_F_SAM_HIT_ONLY; // --sam-hit-only + else if (c == 343) opt.chain_gap_scale = atof(o.arg); // --chain-gap-scale else if (c == 314) { // --frag yes_or_no(&opt, MM_F_FRAG_MODE, o.longidx, o.arg, 1); } else if (c == 315) { // --secondary diff --git a/map.c b/map.c index 58e74fc..8cfd8f4 100644 --- a/map.c +++ b/map.c @@ -313,7 +313,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char ** if (max_chain_gap_ref < opt->max_gap) max_chain_gap_ref = opt->max_gap; } else max_chain_gap_ref = opt->max_gap; - a = mm_chain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km); + a = mm_chain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, opt->chain_gap_scale, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km); if (opt->max_occ > opt->mid_occ && rep_len > 0) { int rechain = 0; @@ -335,7 +335,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char ** kfree(b->km, mini_pos); if (opt->flag & MM_F_HEAP_SORT) a = collect_seed_hits_heap(b->km, opt, opt->max_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos); else a = collect_seed_hits(b->km, opt, opt->max_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos); - a = mm_chain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km); + a = mm_chain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, opt->chain_gap_scale, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km); } } b->frag_gap = max_chain_gap_ref; diff --git a/minimap.h b/minimap.h index cdf955f..8837fdb 100644 --- a/minimap.h +++ b/minimap.h @@ -117,6 +117,7 @@ typedef struct { int max_chain_skip, max_chain_iter; int min_cnt; // min number of minimizers on each chain int min_chain_score; // min chaining score + float chain_gap_scale; float mask_level; float pri_ratio; diff --git a/minimap2.1 b/minimap2.1 index ff0b922..faa42f3 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -245,6 +245,9 @@ Check up to partial chains during chaining [5000]. This is a heuristic to avoid quadratic time complexity in the worst case. .TP +.BI --chain-gap-scale \ FLOAT +Scale of gap cost during chaining [1.0] +.TP .B --no-long-join Disable the long gap patching heuristic. When this option is applied, the maximum alignment gap is mostly controlled by diff --git a/mmpriv.h b/mmpriv.h index c1164e2..e42f0df 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -69,7 +69,7 @@ void mm_write_sam3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int se void mm_idxopt_init(mm_idxopt_t *opt); const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n); int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f); -mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km); +mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float gap_scale, int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km); mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a); mm_reg1_t *mm_gen_regs(void *km, uint32_t hash, int qlen, int n_u, uint64_t *u, mm128_t *a); diff --git a/options.c b/options.c index 2e6a43a..171e9bc 100644 --- a/options.c +++ b/options.c @@ -24,6 +24,7 @@ void mm_mapopt_init(mm_mapopt_t *opt) opt->max_gap_ref = -1; opt->max_chain_skip = 25; opt->max_chain_iter = 5000; + opt->chain_gap_scale = 1.0f; opt->mask_level = 0.5f; opt->pri_ratio = 0.8f; diff --git a/python/cmappy.pxd b/python/cmappy.pxd index 7545cb8..8564ac0 100644 --- a/python/cmappy.pxd +++ b/python/cmappy.pxd @@ -20,6 +20,7 @@ cdef extern from "minimap.h": int max_chain_skip, max_chain_iter int min_cnt int min_chain_score + float chain_gap_scale float mask_level float pri_ratio int best_n