r965: added --chain-gap-scale for #540

This commit is contained in:
Heng Li 2020-01-18 10:29:33 -05:00
parent cdb7857841
commit 040f74102c
8 changed files with 19 additions and 9 deletions

12
chain.c
View File

@ -19,7 +19,7 @@ static inline int ilog2_32(uint32_t v)
return (t = v>>8) ? 8 + LogTable256[t] : LogTable256[v]; return (t = v>>8) ? 8 + LogTable256[t] : LogTable256[v];
} }
mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km) mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float gap_scale, int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km)
{ // TODO: make sure this works when n has more than 32 bits { // TODO: make sure this works when n has more than 32 bits
int32_t k, *f, *p, *t, *v, n_u, n_v; int32_t k, *f, *p, *t, *v, n_u, n_v;
int64_t i, j, st = 0; int64_t i, j, st = 0;
@ -52,7 +52,7 @@ mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int m
if (i - st > max_iter) st = i - max_iter; if (i - st > max_iter) st = i - max_iter;
for (j = i - 1; j >= st; --j) { for (j = i - 1; j >= st; --j) {
int64_t dr = ri - a[j].x; int64_t dr = ri - a[j].x;
int32_t dq = qi - (int32_t)a[j].y, dd, sc, log_dd; int32_t dq = qi - (int32_t)a[j].y, dd, sc, log_dd, gap_cost;
int32_t sidj = (a[j].y & MM_SEED_SEG_MASK) >> MM_SEED_SEG_SHIFT; int32_t sidj = (a[j].y & MM_SEED_SEG_MASK) >> MM_SEED_SEG_SHIFT;
if ((sidi == sidj && dr == 0) || dq <= 0) continue; // don't skip if an anchor is used by multiple segments; see below if ((sidi == sidj && dr == 0) || dq <= 0) continue; // don't skip if an anchor is used by multiple segments; see below
if ((sidi == sidj && dq > max_dist_y) || dq > max_dist_x) continue; if ((sidi == sidj && dq > max_dist_y) || dq > max_dist_x) continue;
@ -62,14 +62,16 @@ mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int m
min_d = dq < dr? dq : dr; min_d = dq < dr? dq : dr;
sc = min_d > q_span? q_span : dq < dr? dq : dr; sc = min_d > q_span? q_span : dq < dr? dq : dr;
log_dd = dd? ilog2_32(dd) : 0; log_dd = dd? ilog2_32(dd) : 0;
gap_cost = 0;
if (is_cdna || sidi != sidj) { if (is_cdna || sidi != sidj) {
int c_log, c_lin; int c_log, c_lin;
c_lin = (int)(dd * .01 * avg_qspan); c_lin = (int)(dd * .01 * avg_qspan);
c_log = log_dd; c_log = log_dd;
if (sidi != sidj && dr == 0) ++sc; // possibly due to overlapping paired ends; give a minor bonus if (sidi != sidj && dr == 0) ++sc; // possibly due to overlapping paired ends; give a minor bonus
else if (dr > dq || sidi != sidj) sc -= c_lin < c_log? c_lin : c_log; else if (dr > dq || sidi != sidj) gap_cost = c_lin < c_log? c_lin : c_log;
else sc -= c_lin + (c_log>>1); else gap_cost = c_lin + (c_log>>1);
} else sc -= (int)(dd * .01 * avg_qspan) + (log_dd>>1); } else gap_cost = (int)(dd * .01 * avg_qspan) + (log_dd>>1);
sc -= (int)((double)gap_cost * gap_scale + .499);
sc += f[j]; sc += f[j];
if (sc > max_f) { if (sc > max_f) {
max_f = sc, max_j = j; max_f = sc, max_j = j;

4
main.c
View File

@ -7,7 +7,7 @@
#include "mmpriv.h" #include "mmpriv.h"
#include "ketopt.h" #include "ketopt.h"
#define MM_VERSION "2.17-r963-dirty" #define MM_VERSION "2.17-r965-dirty"
#ifdef __linux__ #ifdef __linux__
#include <sys/resource.h> #include <sys/resource.h>
@ -67,6 +67,7 @@ static ko_longopt_t long_options[] = {
{ "junc-bed", ko_required_argument, 340 }, { "junc-bed", ko_required_argument, 340 },
{ "junc-bonus", ko_required_argument, 341 }, { "junc-bonus", ko_required_argument, 341 },
{ "sam-hit-only", ko_no_argument, 342 }, { "sam-hit-only", ko_no_argument, 342 },
{ "chain-gap-scale",ko_required_argument, 343 },
{ "help", ko_no_argument, 'h' }, { "help", ko_no_argument, 'h' },
{ "max-intron-len", ko_required_argument, 'G' }, { "max-intron-len", ko_required_argument, 'G' },
{ "version", ko_no_argument, 'V' }, { "version", ko_no_argument, 'V' },
@ -211,6 +212,7 @@ int main(int argc, char *argv[])
else if (c == 340) junc_bed = o.arg; // --junc-bed else if (c == 340) junc_bed = o.arg; // --junc-bed
else if (c == 341) opt.junc_bonus = atoi(o.arg); // --junc-bonus else if (c == 341) opt.junc_bonus = atoi(o.arg); // --junc-bonus
else if (c == 342) opt.flag |= MM_F_SAM_HIT_ONLY; // --sam-hit-only else if (c == 342) opt.flag |= MM_F_SAM_HIT_ONLY; // --sam-hit-only
else if (c == 343) opt.chain_gap_scale = atof(o.arg); // --chain-gap-scale
else if (c == 314) { // --frag else if (c == 314) { // --frag
yes_or_no(&opt, MM_F_FRAG_MODE, o.longidx, o.arg, 1); yes_or_no(&opt, MM_F_FRAG_MODE, o.longidx, o.arg, 1);
} else if (c == 315) { // --secondary } else if (c == 315) { // --secondary

4
map.c
View File

@ -313,7 +313,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
if (max_chain_gap_ref < opt->max_gap) max_chain_gap_ref = opt->max_gap; if (max_chain_gap_ref < opt->max_gap) max_chain_gap_ref = opt->max_gap;
} else max_chain_gap_ref = opt->max_gap; } else max_chain_gap_ref = opt->max_gap;
a = mm_chain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km); a = mm_chain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, opt->chain_gap_scale, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km);
if (opt->max_occ > opt->mid_occ && rep_len > 0) { if (opt->max_occ > opt->mid_occ && rep_len > 0) {
int rechain = 0; int rechain = 0;
@ -335,7 +335,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
kfree(b->km, mini_pos); kfree(b->km, mini_pos);
if (opt->flag & MM_F_HEAP_SORT) a = collect_seed_hits_heap(b->km, opt, opt->max_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos); if (opt->flag & MM_F_HEAP_SORT) a = collect_seed_hits_heap(b->km, opt, opt->max_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos);
else a = collect_seed_hits(b->km, opt, opt->max_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos); else a = collect_seed_hits(b->km, opt, opt->max_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos);
a = mm_chain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km); a = mm_chain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, opt->chain_gap_scale, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km);
} }
} }
b->frag_gap = max_chain_gap_ref; b->frag_gap = max_chain_gap_ref;

View File

@ -117,6 +117,7 @@ typedef struct {
int max_chain_skip, max_chain_iter; int max_chain_skip, max_chain_iter;
int min_cnt; // min number of minimizers on each chain int min_cnt; // min number of minimizers on each chain
int min_chain_score; // min chaining score int min_chain_score; // min chaining score
float chain_gap_scale;
float mask_level; float mask_level;
float pri_ratio; float pri_ratio;

View File

@ -245,6 +245,9 @@ Check up to
partial chains during chaining [5000]. This is a heuristic to avoid quadratic partial chains during chaining [5000]. This is a heuristic to avoid quadratic
time complexity in the worst case. time complexity in the worst case.
.TP .TP
.BI --chain-gap-scale \ FLOAT
Scale of gap cost during chaining [1.0]
.TP
.B --no-long-join .B --no-long-join
Disable the long gap patching heuristic. When this option is applied, the Disable the long gap patching heuristic. When this option is applied, the
maximum alignment gap is mostly controlled by maximum alignment gap is mostly controlled by

View File

@ -69,7 +69,7 @@ void mm_write_sam3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int se
void mm_idxopt_init(mm_idxopt_t *opt); void mm_idxopt_init(mm_idxopt_t *opt);
const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n); const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n);
int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f); int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f);
mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km); mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float gap_scale, int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km);
mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a); mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a);
mm_reg1_t *mm_gen_regs(void *km, uint32_t hash, int qlen, int n_u, uint64_t *u, mm128_t *a); mm_reg1_t *mm_gen_regs(void *km, uint32_t hash, int qlen, int n_u, uint64_t *u, mm128_t *a);

View File

@ -24,6 +24,7 @@ void mm_mapopt_init(mm_mapopt_t *opt)
opt->max_gap_ref = -1; opt->max_gap_ref = -1;
opt->max_chain_skip = 25; opt->max_chain_skip = 25;
opt->max_chain_iter = 5000; opt->max_chain_iter = 5000;
opt->chain_gap_scale = 1.0f;
opt->mask_level = 0.5f; opt->mask_level = 0.5f;
opt->pri_ratio = 0.8f; opt->pri_ratio = 0.8f;

View File

@ -20,6 +20,7 @@ cdef extern from "minimap.h":
int max_chain_skip, max_chain_iter int max_chain_skip, max_chain_iter
int min_cnt int min_cnt
int min_chain_score int min_chain_score
float chain_gap_scale
float mask_level float mask_level
float pri_ratio float pri_ratio
int best_n int best_n