r364: improved the mapq heuristics

* use repetitive seed lengths, not counts
* compute n_sub to higher accuracy
* use bwa-mem mapq heuristic as a backup

For short single-end reads, minimap2's ROC is not as good as bwa-mem's, but is
close.
This commit is contained in:
Heng Li 2017-09-14 12:37:03 -04:00
parent 47e9d76ca1
commit 4d3768bf26
4 changed files with 31 additions and 16 deletions

24
hit.c
View File

@ -87,7 +87,7 @@ void mm_split_reg(mm_reg1_t *r, mm_reg1_t *r2, int n, int qlen, mm128_t *a)
r->split |= 1, r2->split |= 2;
}
void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r) // and compute mm_reg1_t::subsc
void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r, int sub_diff) // and compute mm_reg1_t::subsc
{
int i, j, k, *w;
if (n <= 0) return;
@ -103,11 +103,15 @@ void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r) // and compu
int min = ej - sj < ei - si? ej - sj : ei - si;
int ol = si < sj? (ei < sj? 0 : ei < ej? ei - sj : ej - sj) : (ej < si? 0 : ej < ei? ej - si : ei - si);
if (ol > mask_level * min) {
int cnt_sub = 0;
ri->parent = rp->parent;
rp->subsc = rp->subsc > ri->score? rp->subsc : ri->score;
if (ri->cnt >= rp->cnt) ++ri->n_sub;
if (rp->p && ri->p)
if (ri->cnt >= rp->cnt) cnt_sub = 1;
if (rp->p && ri->p) {
rp->p->dp_max2 = rp->p->dp_max2 > ri->p->dp_max? rp->p->dp_max2 : ri->p->dp_max;
if (rp->p->dp_max - ri->p->dp_max <= sub_diff) cnt_sub = 1;
}
if (cnt_sub) ++rp->n_sub;
break;
}
}
@ -289,9 +293,9 @@ void mm_join_long(void *km, const mm_mapopt_t *opt, int qlen, int *n_regs_, mm_r
}
}
void mm_set_mapq(int n_regs, mm_reg1_t *regs, int min_chain_sc, int n_rep_mini)
void mm_set_mapq(int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len)
{
static const float q_coef = 30.0f;
static const float q_coef = 40.0f;
int i;
for (i = 0; i < n_regs; ++i) {
mm_reg1_t *r = &regs[i];
@ -301,13 +305,19 @@ void mm_set_mapq(int n_regs, mm_reg1_t *regs, int min_chain_sc, int n_rep_mini)
int mapq, subsc;
float pen_s1 = r->score > 100? 1.0f : 0.01f * r->score;
float pen_cm = r->cnt > 10? 1.0f : 0.1f * r->cnt;
if (r->score <= 100 && rep_len > 0) {
pen_s1 = 0.01f * (r->score - rep_len);
pen_s1 = pen_s1 > 0.1f? pen_s1 : 0.1f;
}
pen_cm = pen_s1 < pen_cm? pen_s1 : pen_cm;
subsc = r->subsc > min_chain_sc? r->subsc : min_chain_sc;
if (r->p && r->p->dp_max2 > 0 && r->p->dp_max > 0) {
float identity = (float)(r->p->blen - r->p->n_diff - r->p->n_ambi) / (r->p->blen - r->p->n_ambi);
mapq = (int)(identity * pen_cm * q_coef * (1. - (float)r->p->dp_max2 * subsc / r->p->dp_max / r->score) * logf(r->score));
int mapq_alt = (int)(6.02f * identity * identity * (r->p->dp_max - r->p->dp_max2) / match_sc + .499f); // BWA-MEM like mapQ, mostly for short reads
mapq = (int)(identity * pen_cm * q_coef * (1. - (float)r->p->dp_max2 * subsc / r->p->dp_max / r->score) * logf(r->score)); // more for long reads
mapq = mapq < mapq_alt? mapq : mapq_alt; // in case the long-read heuristic fails
} else mapq = (int)(pen_cm * q_coef * (1. - (float)subsc / r->score) * logf(r->score));
mapq -= (int)(4.343 * log(r->n_sub + 1) + .499);
mapq -= (int)(4.343f * logf(r->n_sub + 1) + .499f);
mapq = mapq > 0? mapq : 0;
r->mapq = mapq < 60? mapq : 60;
} else r->mapq = 0;

2
main.c
View File

@ -6,7 +6,7 @@
#include "mmpriv.h"
#include "getopt.h"
#define MM_VERSION "2.1.1-r363-dirty"
#define MM_VERSION "2.1.1-r364-dirty"
#ifdef __linux__
#include <sys/resource.h>

17
map.c
View File

@ -107,7 +107,7 @@ static void mm_dust_minier(mm128_v *mini, int l_seq, const char *seq, int sdust_
mm_reg1_t *mm_map(const mm_idx_t *mi, int qlen, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname)
{
int i, n, j, n_u, max_gap_ref, n_rep_mini = 0;
int i, n, j, n_u, max_gap_ref, rep_st = 0, rep_en = 0, rep_len = 0;
int64_t n_a;
uint64_t *u;
mm_match_t *m;
@ -142,7 +142,11 @@ mm_reg1_t *mm_map(const mm_idx_t *mi, int qlen, const char *seq, int *n_regs, mm
const uint64_t *r = q->x.cr;
int k, q_span = p->x & 0xff, is_tandem = 0;
if (q->n >= opt->mid_occ) {
++n_rep_mini;
int en = (q->qpos>>1) + 1, st = en - q_span;
if (st > rep_en) {
rep_len += rep_en - rep_st;
rep_st = st, rep_en = en;
} else rep_en = en;
continue;
}
if (i > 0 && p->x>>8 == b->mini.a[i - 1].x>>8) is_tandem = 1;
@ -168,12 +172,13 @@ mm_reg1_t *mm_map(const mm_idx_t *mi, int qlen, const char *seq, int *n_regs, mm
if (is_tandem) p->y |= MM_SEED_TANDEM;
}
}
rep_len += rep_en - rep_st;
n_a = j;
radix_sort_128x(a, a + n_a);
kfree(b->km, m);
if (mm_dbg_flag & MM_DBG_PRINT_SEED) {
fprintf(stderr, "RS\t%d\n", n_rep_mini);
fprintf(stderr, "RS\t%d\n", rep_len);
for (i = 0; i < n_a; ++i)
fprintf(stderr, "SD\t%s\t%d\t%c\t%d\t%d\t%d\n", mi->seq[a[i].x<<1>>33].name, (int32_t)a[i].x, "+-"[a[i].x>>63], (int32_t)a[i].y, (int32_t)(a[i].y>>32&0xff),
i == 0? 0 : ((int32_t)a[i].y - (int32_t)a[i-1].y) - ((int32_t)a[i].x - (int32_t)a[i-1].x));
@ -191,7 +196,7 @@ mm_reg1_t *mm_map(const mm_idx_t *mi, int qlen, const char *seq, int *n_regs, mm
i == regs[j].as? 0 : ((int32_t)a[i].y - (int32_t)a[i-1].y) - ((int32_t)a[i].x - (int32_t)a[i-1].x));
if (!(opt->flag & MM_F_AVA)) { // don't choose primary mapping(s) for read overlap
mm_set_parent(b->km, opt->mask_level, *n_regs, regs);
mm_set_parent(b->km, opt->mask_level, *n_regs, regs, opt->a * 2 + opt->b);
mm_select_sub(b->km, opt->mask_level, opt->pri_ratio, mi->k*2, opt->best_n, n_regs, regs);
if (!(opt->flag & MM_F_SPLICE))
mm_join_long(b->km, opt, qlen, n_regs, regs, a); // TODO: this can be applied to all-vs-all in principle
@ -199,12 +204,12 @@ mm_reg1_t *mm_map(const mm_idx_t *mi, int qlen, const char *seq, int *n_regs, mm
if (opt->flag & MM_F_CIGAR) {
regs = mm_align_skeleton(b->km, opt, mi, qlen, seq, n_regs, regs, a); // this calls mm_filter_regs()
if (!(opt->flag & MM_F_AVA)) {
mm_set_parent(b->km, opt->mask_level, *n_regs, regs);
mm_set_parent(b->km, opt->mask_level, *n_regs, regs, opt->a * 2 + opt->b);
mm_select_sub(b->km, opt->mask_level, opt->pri_ratio, mi->k*2, opt->best_n, n_regs, regs);
mm_set_sam_pri(*n_regs, regs);
}
}
mm_set_mapq(*n_regs, regs, opt->min_chain_score, n_rep_mini);
mm_set_mapq(*n_regs, regs, opt->min_chain_score, opt->a, rep_len);
// free
kfree(b->km, a);

View File

@ -53,12 +53,12 @@ mm_reg1_t *mm_gen_regs(void *km, int qlen, int n_u, uint64_t *u, mm128_t *a);
void mm_split_reg(mm_reg1_t *r, mm_reg1_t *r2, int n, int qlen, mm128_t *a);
void mm_sync_regs(void *km, int n_regs, mm_reg1_t *regs);
int mm_set_sam_pri(int n, mm_reg1_t *r);
void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r);
void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r, int sub_diff);
void mm_select_sub(void *km, float mask_level, float pri_ratio, int min_diff, int best_n, int *n_, mm_reg1_t *r);
void mm_filter_regs(void *km, const mm_mapopt_t *opt, int *n_regs, mm_reg1_t *regs);
void mm_join_long(void *km, const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs, mm128_t *a);
void mm_hit_sort_by_dp(void *km, int *n_regs, mm_reg1_t *r);
void mm_set_mapq(int n_regs, mm_reg1_t *regs, int min_chain_sc, int n_rep_mini);
void mm_set_mapq(int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len);
#ifdef __cplusplus
}