r1049: removed the long-join heuristics

This commit is contained in:
Heng Li 2021-05-24 16:21:40 -04:00
parent 4f91558160
commit 379728726a
7 changed files with 5 additions and 78 deletions

58
hit.c
View File

@ -312,64 +312,6 @@ int mm_squeeze_a(void *km, int n_regs, mm_reg1_t *regs, mm128_t *a)
return as;
}
void mm_join_long(void *km, const mm_mapopt_t *opt, int qlen, int *n_regs_, mm_reg1_t *regs, mm128_t *a)
{
int i, n_aux, n_regs = *n_regs_, n_drop = 0;
uint64_t *aux;
if (n_regs < 2) return; // nothing to join
mm_squeeze_a(km, n_regs, regs, a);
aux = (uint64_t*)kmalloc(km, n_regs * 8);
for (i = n_aux = 0; i < n_regs; ++i)
if (regs[i].parent == i || regs[i].parent < 0)
aux[n_aux++] = (uint64_t)regs[i].as << 32 | i;
radix_sort_64(aux, aux + n_aux);
for (i = n_aux - 1; i >= 1; --i) {
mm_reg1_t *r0 = &regs[(int32_t)aux[i-1]], *r1 = &regs[(int32_t)aux[i]];
mm128_t *a0e, *a1s;
int max_gap, min_gap, sc_thres, min_flank_len;
// test
if (r0->as + r0->cnt != r1->as) continue; // not adjacent in a[]
if (r0->rid != r1->rid || r0->rev != r1->rev) continue; // make sure on the same target and strand
a0e = &a[r0->as + r0->cnt - 1];
a1s = &a[r1->as];
if (a1s->x <= a0e->x || (int32_t)a1s->y <= (int32_t)a0e->y) continue; // keep colinearity
max_gap = min_gap = (int32_t)a1s->y - (int32_t)a0e->y;
max_gap = a0e->x + max_gap > a1s->x? max_gap : a1s->x - a0e->x;
min_gap = a0e->x + min_gap < a1s->x? min_gap : a1s->x - a0e->x;
if (max_gap > opt->max_join_long || min_gap > opt->max_join_short) continue;
sc_thres = (int)((float)opt->min_join_flank_sc / opt->max_join_long * max_gap + .499);
if (r0->score < sc_thres || r1->score < sc_thres) continue; // require good flanking chains
min_flank_len = (int)(max_gap * opt->min_join_flank_ratio);
if (r0->re - r0->rs < min_flank_len || r0->qe - r0->qs < min_flank_len) continue; // require enough flanking length
if (r1->re - r1->rs < min_flank_len || r1->qe - r1->qs < min_flank_len) continue;
// all conditions satisfied; join
a[r1->as].y |= MM_SEED_LONG_JOIN;
r0->cnt += r1->cnt, r0->score += r1->score;
mm_reg_set_coor(r0, qlen, a);
r1->cnt = 0;
r1->parent = r0->id;
++n_drop;
}
kfree(km, aux);
if (n_drop > 0) { // then fix the hits hierarchy
for (i = 0; i < n_regs; ++i) { // adjust the mm_reg1_t::parent
mm_reg1_t *r = &regs[i];
if (r->parent >= 0 && r->id != r->parent) { // fix for secondary hits only
if (regs[r->parent].parent >= 0 && regs[r->parent].parent != r->parent)
r->parent = regs[r->parent].parent;
}
}
mm_filter_regs(opt, qlen, n_regs_, regs);
mm_sync_regs(km, *n_regs_, regs);
}
}
mm_seg_t *mm_seg_gen(void *km, uint32_t hash, int n_segs, const int *qlens, int n_regs0, const mm_reg1_t *regs0, int *n_regs, mm_reg1_t **regs, const mm128_t *a)
{
int s, i, j, acc_qlen[MM_MAX_SEG+1], qlen_sum = 0;

7
main.c
View File

@ -7,7 +7,7 @@
#include "mmpriv.h"
#include "ketopt.h"
#define MM_VERSION "2.18-r1048-dirty"
#define MM_VERSION "2.18-r1049-dirty"
#ifdef __linux__
#include <sys/resource.h>
@ -205,7 +205,6 @@ int main(int argc, char *argv[])
else if (c == 327) opt.max_clip_ratio = atof(o.arg); // --max-clip-ratio
else if (c == 328) opt.min_mid_occ = atoi(o.arg); // --min-occ-floor
else if (c == 329) opt.flag |= MM_F_OUT_MD; // --MD
else if (c == 330) opt.min_join_flank_ratio = atof(o.arg); // --lj-min-ratio
else if (c == 331) opt.sc_ambi = atoi(o.arg); // --score-N
else if (c == 332) opt.flag |= MM_F_EQX; // --eqx
else if (c == 333) opt.flag |= MM_F_PAF_NO_HIT; // --paf-no-hit
@ -221,7 +220,9 @@ int main(int argc, char *argv[])
else if (c == 344) alt_list = o.arg; // --alt
else if (c == 345) opt.alt_drop = atof(o.arg); // --alt-drop
else if (c == 346) opt.mask_len = mm_parse_num(o.arg); // --mask-len
else if (c == 314) { // --frag
else if (c == 330) {
fprintf(stderr, "[WARNING] \033[1;31m --lj-min-ratio has been deprecated.\033[0m\n");
} else if (c == 314) { // --frag
yes_or_no(&opt, MM_F_FRAG_MODE, o.longidx, o.arg, 1);
} else if (c == 315) { // --secondary
yes_or_no(&opt, MM_F_NO_PRINT_2ND, o.longidx, o.arg, 0);

4
map.c
View File

@ -210,8 +210,6 @@ static void chain_post(const mm_mapopt_t *opt, int max_chain_gap_ref, const mm_i
mm_set_parent(km, opt->mask_level, opt->mask_len, *n_regs, regs, opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop);
if (n_segs <= 1) mm_select_sub(km, opt->pri_ratio, mi->k*2, opt->best_n, n_regs, regs);
else mm_select_sub_multi(km, opt->pri_ratio, 0.2f, 0.7f, max_chain_gap_ref, mi->k*2, opt->best_n, n_segs, qlens, n_regs, regs);
if (!(opt->flag & (MM_F_SPLICE|MM_F_SR|MM_F_NO_LJOIN))) // long join not working well without primary chains
mm_join_long(km, opt, qlen, n_regs, regs, a);
}
}
@ -302,7 +300,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
a = mg_lchain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score,
opt->chain_gap_scale * 0.01 * mi->k, 0.0f, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km);
}
} else if (opt->bw_long > opt->bw && !(opt->flag & MM_F_RMQ) && n_segs == 1 && n_regs0 > 1) {
} else if (opt->bw_long > opt->bw && !(opt->flag & (MM_F_RMQ|MM_F_NO_LJOIN)) && n_segs == 1 && n_regs0 > 1) {
int32_t st = (int32_t)a[0].y, en = (int32_t)a[(int32_t)u[0] - 1].y;
if (qlen_sum - (en - st) > 1000 || en - st > qlen_sum * .1) {
int32_t i;

View File

@ -128,10 +128,6 @@ typedef struct {
float pri_ratio;
int best_n; // top best_n chains are subjected to DP alignment
int max_join_long, max_join_short;
int min_join_flank_sc;
float min_join_flank_ratio;
float alt_drop;
int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties

View File

@ -91,7 +91,6 @@ void mm_set_parent(void *km, float mask_level, int mask_len, int n, mm_reg1_t *r
void mm_select_sub(void *km, float pri_ratio, int min_diff, int best_n, int *n_, mm_reg1_t *r);
void mm_select_sub_multi(void *km, float pri_ratio, float pri1, float pri2, int max_gap_ref, int min_diff, int best_n, int n_segs, const int *qlens, int *n_, mm_reg1_t *r);
void mm_filter_regs(const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs);
void mm_join_long(void *km, const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs, mm128_t *a);
void mm_hit_sort(void *km, int *n_regs, mm_reg1_t *r, float alt_diff_frac);
void mm_set_mapq(void *km, int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len, int is_sr);

View File

@ -38,11 +38,6 @@ void mm_mapopt_init(mm_mapopt_t *opt)
opt->pri_ratio = 0.8f;
opt->best_n = 5;
opt->max_join_long = 20000;
opt->max_join_short = 2000;
opt->min_join_flank_sc = 1000;
opt->min_join_flank_ratio = 0.5f;
opt->alt_drop = 0.15f;
opt->a = 2, opt->b = 4, opt->q = 4, opt->e = 2, opt->q2 = 24, opt->e2 = 1;

View File

@ -30,10 +30,6 @@ cdef extern from "minimap.h":
float pri_ratio
int best_n
int max_join_long, max_join_short
int min_join_flank_sc
float min_join_flank_ratio
float alt_drop
int a, b, q, e, q2, e2