r965: added --chain-gap-scale for #540

This commit is contained in:
Heng Li 2020-01-18 10:29:33 -05:00
parent cdb7857841
commit 040f74102c
8 changed files with 19 additions and 9 deletions

12
chain.c
View File

@ -19,7 +19,7 @@ static inline int ilog2_32(uint32_t v)
return (t = v>>8) ? 8 + LogTable256[t] : LogTable256[v];
}
mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km)
mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float gap_scale, int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km)
{ // TODO: make sure this works when n has more than 32 bits
int32_t k, *f, *p, *t, *v, n_u, n_v;
int64_t i, j, st = 0;
@ -52,7 +52,7 @@ mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int m
if (i - st > max_iter) st = i - max_iter;
for (j = i - 1; j >= st; --j) {
int64_t dr = ri - a[j].x;
int32_t dq = qi - (int32_t)a[j].y, dd, sc, log_dd;
int32_t dq = qi - (int32_t)a[j].y, dd, sc, log_dd, gap_cost;
int32_t sidj = (a[j].y & MM_SEED_SEG_MASK) >> MM_SEED_SEG_SHIFT;
if ((sidi == sidj && dr == 0) || dq <= 0) continue; // don't skip if an anchor is used by multiple segments; see below
if ((sidi == sidj && dq > max_dist_y) || dq > max_dist_x) continue;
@ -62,14 +62,16 @@ mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int m
min_d = dq < dr? dq : dr;
sc = min_d > q_span? q_span : dq < dr? dq : dr;
log_dd = dd? ilog2_32(dd) : 0;
gap_cost = 0;
if (is_cdna || sidi != sidj) {
int c_log, c_lin;
c_lin = (int)(dd * .01 * avg_qspan);
c_log = log_dd;
if (sidi != sidj && dr == 0) ++sc; // possibly due to overlapping paired ends; give a minor bonus
else if (dr > dq || sidi != sidj) sc -= c_lin < c_log? c_lin : c_log;
else sc -= c_lin + (c_log>>1);
} else sc -= (int)(dd * .01 * avg_qspan) + (log_dd>>1);
else if (dr > dq || sidi != sidj) gap_cost = c_lin < c_log? c_lin : c_log;
else gap_cost = c_lin + (c_log>>1);
} else gap_cost = (int)(dd * .01 * avg_qspan) + (log_dd>>1);
sc -= (int)((double)gap_cost * gap_scale + .499);
sc += f[j];
if (sc > max_f) {
max_f = sc, max_j = j;

4
main.c
View File

@ -7,7 +7,7 @@
#include "mmpriv.h"
#include "ketopt.h"
#define MM_VERSION "2.17-r963-dirty"
#define MM_VERSION "2.17-r965-dirty"
#ifdef __linux__
#include <sys/resource.h>
@ -67,6 +67,7 @@ static ko_longopt_t long_options[] = {
{ "junc-bed", ko_required_argument, 340 },
{ "junc-bonus", ko_required_argument, 341 },
{ "sam-hit-only", ko_no_argument, 342 },
{ "chain-gap-scale",ko_required_argument, 343 },
{ "help", ko_no_argument, 'h' },
{ "max-intron-len", ko_required_argument, 'G' },
{ "version", ko_no_argument, 'V' },
@ -211,6 +212,7 @@ int main(int argc, char *argv[])
else if (c == 340) junc_bed = o.arg; // --junc-bed
else if (c == 341) opt.junc_bonus = atoi(o.arg); // --junc-bonus
else if (c == 342) opt.flag |= MM_F_SAM_HIT_ONLY; // --sam-hit-only
else if (c == 343) opt.chain_gap_scale = atof(o.arg); // --chain-gap-scale
else if (c == 314) { // --frag
yes_or_no(&opt, MM_F_FRAG_MODE, o.longidx, o.arg, 1);
} else if (c == 315) { // --secondary

4
map.c
View File

@ -313,7 +313,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
if (max_chain_gap_ref < opt->max_gap) max_chain_gap_ref = opt->max_gap;
} else max_chain_gap_ref = opt->max_gap;
a = mm_chain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km);
a = mm_chain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, opt->chain_gap_scale, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km);
if (opt->max_occ > opt->mid_occ && rep_len > 0) {
int rechain = 0;
@ -335,7 +335,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
kfree(b->km, mini_pos);
if (opt->flag & MM_F_HEAP_SORT) a = collect_seed_hits_heap(b->km, opt, opt->max_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos);
else a = collect_seed_hits(b->km, opt, opt->max_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos);
a = mm_chain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km);
a = mm_chain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, opt->chain_gap_scale, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km);
}
}
b->frag_gap = max_chain_gap_ref;

View File

@ -117,6 +117,7 @@ typedef struct {
int max_chain_skip, max_chain_iter;
int min_cnt; // min number of minimizers on each chain
int min_chain_score; // min chaining score
float chain_gap_scale;
float mask_level;
float pri_ratio;

View File

@ -245,6 +245,9 @@ Check up to
partial chains during chaining [5000]. This is a heuristic to avoid quadratic
time complexity in the worst case.
.TP
.BI --chain-gap-scale \ FLOAT
Scale of gap cost during chaining [1.0]
.TP
.B --no-long-join
Disable the long gap patching heuristic. When this option is applied, the
maximum alignment gap is mostly controlled by

View File

@ -69,7 +69,7 @@ void mm_write_sam3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int se
void mm_idxopt_init(mm_idxopt_t *opt);
const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n);
int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f);
mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km);
mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float gap_scale, int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km);
mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a);
mm_reg1_t *mm_gen_regs(void *km, uint32_t hash, int qlen, int n_u, uint64_t *u, mm128_t *a);

View File

@ -24,6 +24,7 @@ void mm_mapopt_init(mm_mapopt_t *opt)
opt->max_gap_ref = -1;
opt->max_chain_skip = 25;
opt->max_chain_iter = 5000;
opt->chain_gap_scale = 1.0f;
opt->mask_level = 0.5f;
opt->pri_ratio = 0.8f;

View File

@ -20,6 +20,7 @@ cdef extern from "minimap.h":
int max_chain_skip, max_chain_iter
int min_cnt
int min_chain_score
float chain_gap_scale
float mask_level
float pri_ratio
int best_n