From 4f91558160be3b6595b18e013984a50ed3ffae67 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 24 May 2021 16:09:09 -0400 Subject: [PATCH] r1048: rescue long gaps --- align.c | 6 ++++-- main.c | 2 +- map.c | 10 ++++++++++ minimap.h | 2 +- options.c | 13 +++++++------ python/cmappy.pxd | 2 +- 6 files changed, 24 insertions(+), 11 deletions(-) diff --git a/align.c b/align.c index 6028abb..f2f154f 100644 --- a/align.c +++ b/align.c @@ -567,7 +567,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int int is_sr = !!(opt->flag & MM_F_SR), is_splice = !!(opt->flag & MM_F_SPLICE); int32_t rid = a[r->as].x<<1>>33, rev = a[r->as].x>>63, as1, cnt1; uint8_t *tseq, *qseq, *junc; - int32_t i, l, bw, dropped = 0, extra_flag = 0, rs0, re0, qs0, qe0; + int32_t i, l, bw, bw_long, dropped = 0, extra_flag = 0, rs0, re0, qs0, qe0; int32_t rs, re, qs, qe; int32_t rs1, qs1, re1, qe1; int8_t mat[25]; @@ -578,6 +578,8 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int if (r->cnt == 0) return; ksw_gen_simple_mat(5, mat, opt->a, opt->b, opt->sc_ambi); bw = (int)(opt->bw * 1.5 + 1.); + bw_long = (int)(opt->bw_long * 1.5 + 1.); + if (bw_long < bw) bw_long = bw; if (is_sr && !(mi->flag & MM_I_HPC)) { mm_max_stretch(r, a, &as1, &cnt1); @@ -714,7 +716,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int } else mm_adjust_minier(mi, qseq0, &a[as1 + i], &re, &qe); re1 = re, qe1 = qe; if (i == cnt1 - 1 || (a[as1+i].y&MM_SEED_LONG_JOIN) || (qe - qs >= opt->min_ksw_len && re - rs >= opt->min_ksw_len)) { - int j, bw1 = bw, zdrop_code; + int j, bw1 = bw_long, zdrop_code; if (a[as1+i].y & MM_SEED_LONG_JOIN) bw1 = qe - qs > re - rs? qe - qs : re - rs; // perform alignment diff --git a/main.c b/main.c index 390adf1..27470e7 100644 --- a/main.c +++ b/main.c @@ -7,7 +7,7 @@ #include "mmpriv.h" #include "ketopt.h" -#define MM_VERSION "2.18-r1047-dirty" +#define MM_VERSION "2.18-r1048-dirty" #ifdef __linux__ #include diff --git a/map.c b/map.c index b974155..42b0eaa 100644 --- a/map.c +++ b/map.c @@ -302,6 +302,16 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char ** a = mg_lchain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, opt->chain_gap_scale * 0.01 * mi->k, 0.0f, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km); } + } else if (opt->bw_long > opt->bw && !(opt->flag & MM_F_RMQ) && n_segs == 1 && n_regs0 > 1) { + int32_t st = (int32_t)a[0].y, en = (int32_t)a[(int32_t)u[0] - 1].y; + if (qlen_sum - (en - st) > 1000 || en - st > qlen_sum * .1) { + int32_t i; + for (i = 0, n_a = 0; i < n_regs0; ++i) n_a += (int32_t)u[i]; + kfree(b->km, u); + radix_sort_128x(a, a + n_a); + a = mg_lchain_rmq(opt->max_gap, opt->rmq_inner_dist, opt->bw_long, opt->max_chain_skip, opt->rmq_size_cap, opt->min_cnt, opt->min_chain_score, + opt->chain_gap_scale * 0.01 * mi->k, 0.0f, n_a, a, &n_regs0, &u, b->km); + } } b->frag_gap = max_chain_gap_ref; b->rep_len = rep_len; diff --git a/minimap.h b/minimap.h index 281ca3d..10a4b60 100644 --- a/minimap.h +++ b/minimap.h @@ -114,7 +114,7 @@ typedef struct { int max_qlen; // max query length - int bw; // bandwidth + int bw, bw_long; // bandwidth int max_gap, max_gap_ref; // break a chain if there are no minimizers in a max_gap window int max_frag_len; int max_chain_skip, max_chain_iter; diff --git a/options.c b/options.c index a64a071..6a3e780 100644 --- a/options.c +++ b/options.c @@ -22,14 +22,14 @@ void mm_mapopt_init(mm_mapopt_t *opt) opt->min_cnt = 3; opt->min_chain_score = 40; - opt->bw = 500; + opt->bw = 500, opt->bw_long = 20000; opt->max_gap = 10000; opt->max_gap_ref = -1; opt->max_chain_skip = 25; opt->max_chain_iter = 5000; opt->rmq_inner_dist = 1000; opt->rmq_size_cap = 100000; - opt->chain_gap_scale = 1.0f; + opt->chain_gap_scale = 0.8f; opt->max_max_occ = 4095; opt->occ_dist = 500; @@ -91,7 +91,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) io->flag = 0, io->k = 15, io->w = 5; mo->flag |= MM_F_ALL_CHAINS | MM_F_NO_DIAG | MM_F_NO_DUAL | MM_F_NO_LJOIN; mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_chain_skip = 25; - mo->bw = 2000; + mo->bw = mo->bw_long = 2000; mo->occ_dist = 0; } else if (strcmp(preset, "map10k") == 0 || strcmp(preset, "map-pb") == 0) { io->flag |= MM_I_HPC, io->k = 19; @@ -99,6 +99,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) io->flag |= MM_I_HPC, io->k = 19, io->w = 5; mo->flag |= MM_F_ALL_CHAINS | MM_F_NO_DIAG | MM_F_NO_DUAL | MM_F_NO_LJOIN; mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_chain_skip = 25; + mo->bw_long = mo->bw; mo->occ_dist = 0; } else if (strcmp(preset, "map-hifi") == 0 || strcmp(preset, "map-ccs") == 0) { io->flag = 0, io->k = 19, io->w = 19; @@ -108,7 +109,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) mo->min_dp_max = 200; } else if (strncmp(preset, "asm", 3) == 0) { io->flag = 0, io->k = 19, io->w = 19; - mo->bw = 100000; + mo->bw = mo->bw_long = 100000; mo->flag |= MM_F_RMQ | MM_F_NO_LJOIN; mo->min_mid_occ = 50, mo->max_mid_occ = 500; mo->min_dp_max = 200; @@ -130,7 +131,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) mo->end_bonus = 10; mo->max_frag_len = 800; mo->max_gap = 100; - mo->bw = 100; + mo->bw = mo->bw_long = 100; mo->pri_ratio = 0.5f; mo->min_cnt = 2; mo->min_chain_score = 25; @@ -143,7 +144,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) io->flag = 0, io->k = 15, io->w = 5; mo->flag |= MM_F_SPLICE | MM_F_SPLICE_FOR | MM_F_SPLICE_REV | MM_F_SPLICE_FLANK; mo->max_sw_mat = 0; - mo->max_gap = 2000, mo->max_gap_ref = mo->bw = 200000; + mo->max_gap = 2000, mo->max_gap_ref = mo->bw = mo->bw_long = 200000; mo->a = 1, mo->b = 2, mo->q = 2, mo->e = 1, mo->q2 = 32, mo->e2 = 0; mo->noncan = 9; mo->junc_bonus = 9; diff --git a/python/cmappy.pxd b/python/cmappy.pxd index c864f55..0c8c57f 100644 --- a/python/cmappy.pxd +++ b/python/cmappy.pxd @@ -16,7 +16,7 @@ cdef extern from "minimap.h": int max_qlen - int bw + int bw, bw_long int max_gap, max_gap_ref int max_frag_len int max_chain_skip, max_chain_iter