From 4d3768bf2627d70ce8e57cc681bdb40afb7dd972 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 14 Sep 2017 12:37:03 -0400 Subject: [PATCH] r364: improved the mapq heuristics * use repetitive seed lengths, not counts * compute n_sub to higher accuracy * use bwa-mem mapq heuristic as a backup For short single-end reads, minimap2's ROC is not as good as bwa-mem's, but is close. --- hit.c | 24 +++++++++++++++++------- main.c | 2 +- map.c | 17 +++++++++++------ mmpriv.h | 4 ++-- 4 files changed, 31 insertions(+), 16 deletions(-) diff --git a/hit.c b/hit.c index 9acb5e4..fa3e068 100644 --- a/hit.c +++ b/hit.c @@ -87,7 +87,7 @@ void mm_split_reg(mm_reg1_t *r, mm_reg1_t *r2, int n, int qlen, mm128_t *a) r->split |= 1, r2->split |= 2; } -void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r) // and compute mm_reg1_t::subsc +void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r, int sub_diff) // and compute mm_reg1_t::subsc { int i, j, k, *w; if (n <= 0) return; @@ -103,11 +103,15 @@ void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r) // and compu int min = ej - sj < ei - si? ej - sj : ei - si; int ol = si < sj? (ei < sj? 0 : ei < ej? ei - sj : ej - sj) : (ej < si? 0 : ej < ei? ej - si : ei - si); if (ol > mask_level * min) { + int cnt_sub = 0; ri->parent = rp->parent; rp->subsc = rp->subsc > ri->score? rp->subsc : ri->score; - if (ri->cnt >= rp->cnt) ++ri->n_sub; - if (rp->p && ri->p) + if (ri->cnt >= rp->cnt) cnt_sub = 1; + if (rp->p && ri->p) { rp->p->dp_max2 = rp->p->dp_max2 > ri->p->dp_max? rp->p->dp_max2 : ri->p->dp_max; + if (rp->p->dp_max - ri->p->dp_max <= sub_diff) cnt_sub = 1; + } + if (cnt_sub) ++rp->n_sub; break; } } @@ -289,9 +293,9 @@ void mm_join_long(void *km, const mm_mapopt_t *opt, int qlen, int *n_regs_, mm_r } } -void mm_set_mapq(int n_regs, mm_reg1_t *regs, int min_chain_sc, int n_rep_mini) +void mm_set_mapq(int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len) { - static const float q_coef = 30.0f; + static const float q_coef = 40.0f; int i; for (i = 0; i < n_regs; ++i) { mm_reg1_t *r = ®s[i]; @@ -301,13 +305,19 @@ void mm_set_mapq(int n_regs, mm_reg1_t *regs, int min_chain_sc, int n_rep_mini) int mapq, subsc; float pen_s1 = r->score > 100? 1.0f : 0.01f * r->score; float pen_cm = r->cnt > 10? 1.0f : 0.1f * r->cnt; + if (r->score <= 100 && rep_len > 0) { + pen_s1 = 0.01f * (r->score - rep_len); + pen_s1 = pen_s1 > 0.1f? pen_s1 : 0.1f; + } pen_cm = pen_s1 < pen_cm? pen_s1 : pen_cm; subsc = r->subsc > min_chain_sc? r->subsc : min_chain_sc; if (r->p && r->p->dp_max2 > 0 && r->p->dp_max > 0) { float identity = (float)(r->p->blen - r->p->n_diff - r->p->n_ambi) / (r->p->blen - r->p->n_ambi); - mapq = (int)(identity * pen_cm * q_coef * (1. - (float)r->p->dp_max2 * subsc / r->p->dp_max / r->score) * logf(r->score)); + int mapq_alt = (int)(6.02f * identity * identity * (r->p->dp_max - r->p->dp_max2) / match_sc + .499f); // BWA-MEM like mapQ, mostly for short reads + mapq = (int)(identity * pen_cm * q_coef * (1. - (float)r->p->dp_max2 * subsc / r->p->dp_max / r->score) * logf(r->score)); // more for long reads + mapq = mapq < mapq_alt? mapq : mapq_alt; // in case the long-read heuristic fails } else mapq = (int)(pen_cm * q_coef * (1. - (float)subsc / r->score) * logf(r->score)); - mapq -= (int)(4.343 * log(r->n_sub + 1) + .499); + mapq -= (int)(4.343f * logf(r->n_sub + 1) + .499f); mapq = mapq > 0? mapq : 0; r->mapq = mapq < 60? mapq : 60; } else r->mapq = 0; diff --git a/main.c b/main.c index 2653dae..1f6b354 100644 --- a/main.c +++ b/main.c @@ -6,7 +6,7 @@ #include "mmpriv.h" #include "getopt.h" -#define MM_VERSION "2.1.1-r363-dirty" +#define MM_VERSION "2.1.1-r364-dirty" #ifdef __linux__ #include diff --git a/map.c b/map.c index d2d615a..dad6423 100644 --- a/map.c +++ b/map.c @@ -107,7 +107,7 @@ static void mm_dust_minier(mm128_v *mini, int l_seq, const char *seq, int sdust_ mm_reg1_t *mm_map(const mm_idx_t *mi, int qlen, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname) { - int i, n, j, n_u, max_gap_ref, n_rep_mini = 0; + int i, n, j, n_u, max_gap_ref, rep_st = 0, rep_en = 0, rep_len = 0; int64_t n_a; uint64_t *u; mm_match_t *m; @@ -142,7 +142,11 @@ mm_reg1_t *mm_map(const mm_idx_t *mi, int qlen, const char *seq, int *n_regs, mm const uint64_t *r = q->x.cr; int k, q_span = p->x & 0xff, is_tandem = 0; if (q->n >= opt->mid_occ) { - ++n_rep_mini; + int en = (q->qpos>>1) + 1, st = en - q_span; + if (st > rep_en) { + rep_len += rep_en - rep_st; + rep_st = st, rep_en = en; + } else rep_en = en; continue; } if (i > 0 && p->x>>8 == b->mini.a[i - 1].x>>8) is_tandem = 1; @@ -168,12 +172,13 @@ mm_reg1_t *mm_map(const mm_idx_t *mi, int qlen, const char *seq, int *n_regs, mm if (is_tandem) p->y |= MM_SEED_TANDEM; } } + rep_len += rep_en - rep_st; n_a = j; radix_sort_128x(a, a + n_a); kfree(b->km, m); if (mm_dbg_flag & MM_DBG_PRINT_SEED) { - fprintf(stderr, "RS\t%d\n", n_rep_mini); + fprintf(stderr, "RS\t%d\n", rep_len); for (i = 0; i < n_a; ++i) fprintf(stderr, "SD\t%s\t%d\t%c\t%d\t%d\t%d\n", mi->seq[a[i].x<<1>>33].name, (int32_t)a[i].x, "+-"[a[i].x>>63], (int32_t)a[i].y, (int32_t)(a[i].y>>32&0xff), i == 0? 0 : ((int32_t)a[i].y - (int32_t)a[i-1].y) - ((int32_t)a[i].x - (int32_t)a[i-1].x)); @@ -191,7 +196,7 @@ mm_reg1_t *mm_map(const mm_idx_t *mi, int qlen, const char *seq, int *n_regs, mm i == regs[j].as? 0 : ((int32_t)a[i].y - (int32_t)a[i-1].y) - ((int32_t)a[i].x - (int32_t)a[i-1].x)); if (!(opt->flag & MM_F_AVA)) { // don't choose primary mapping(s) for read overlap - mm_set_parent(b->km, opt->mask_level, *n_regs, regs); + mm_set_parent(b->km, opt->mask_level, *n_regs, regs, opt->a * 2 + opt->b); mm_select_sub(b->km, opt->mask_level, opt->pri_ratio, mi->k*2, opt->best_n, n_regs, regs); if (!(opt->flag & MM_F_SPLICE)) mm_join_long(b->km, opt, qlen, n_regs, regs, a); // TODO: this can be applied to all-vs-all in principle @@ -199,12 +204,12 @@ mm_reg1_t *mm_map(const mm_idx_t *mi, int qlen, const char *seq, int *n_regs, mm if (opt->flag & MM_F_CIGAR) { regs = mm_align_skeleton(b->km, opt, mi, qlen, seq, n_regs, regs, a); // this calls mm_filter_regs() if (!(opt->flag & MM_F_AVA)) { - mm_set_parent(b->km, opt->mask_level, *n_regs, regs); + mm_set_parent(b->km, opt->mask_level, *n_regs, regs, opt->a * 2 + opt->b); mm_select_sub(b->km, opt->mask_level, opt->pri_ratio, mi->k*2, opt->best_n, n_regs, regs); mm_set_sam_pri(*n_regs, regs); } } - mm_set_mapq(*n_regs, regs, opt->min_chain_score, n_rep_mini); + mm_set_mapq(*n_regs, regs, opt->min_chain_score, opt->a, rep_len); // free kfree(b->km, a); diff --git a/mmpriv.h b/mmpriv.h index 6674535..11fba89 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -53,12 +53,12 @@ mm_reg1_t *mm_gen_regs(void *km, int qlen, int n_u, uint64_t *u, mm128_t *a); void mm_split_reg(mm_reg1_t *r, mm_reg1_t *r2, int n, int qlen, mm128_t *a); void mm_sync_regs(void *km, int n_regs, mm_reg1_t *regs); int mm_set_sam_pri(int n, mm_reg1_t *r); -void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r); +void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r, int sub_diff); void mm_select_sub(void *km, float mask_level, float pri_ratio, int min_diff, int best_n, int *n_, mm_reg1_t *r); void mm_filter_regs(void *km, const mm_mapopt_t *opt, int *n_regs, mm_reg1_t *regs); void mm_join_long(void *km, const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs, mm128_t *a); void mm_hit_sort_by_dp(void *km, int *n_regs, mm_reg1_t *r); -void mm_set_mapq(int n_regs, mm_reg1_t *regs, int min_chain_sc, int n_rep_mini); +void mm_set_mapq(int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len); #ifdef __cplusplus }