r364: improved the mapq heuristics
* use repetitive seed lengths, not counts * compute n_sub to higher accuracy * use bwa-mem mapq heuristic as a backup For short single-end reads, minimap2's ROC is not as good as bwa-mem's, but is close.
This commit is contained in:
parent
47e9d76ca1
commit
4d3768bf26
24
hit.c
24
hit.c
|
|
@ -87,7 +87,7 @@ void mm_split_reg(mm_reg1_t *r, mm_reg1_t *r2, int n, int qlen, mm128_t *a)
|
|||
r->split |= 1, r2->split |= 2;
|
||||
}
|
||||
|
||||
void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r) // and compute mm_reg1_t::subsc
|
||||
void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r, int sub_diff) // and compute mm_reg1_t::subsc
|
||||
{
|
||||
int i, j, k, *w;
|
||||
if (n <= 0) return;
|
||||
|
|
@ -103,11 +103,15 @@ void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r) // and compu
|
|||
int min = ej - sj < ei - si? ej - sj : ei - si;
|
||||
int ol = si < sj? (ei < sj? 0 : ei < ej? ei - sj : ej - sj) : (ej < si? 0 : ej < ei? ej - si : ei - si);
|
||||
if (ol > mask_level * min) {
|
||||
int cnt_sub = 0;
|
||||
ri->parent = rp->parent;
|
||||
rp->subsc = rp->subsc > ri->score? rp->subsc : ri->score;
|
||||
if (ri->cnt >= rp->cnt) ++ri->n_sub;
|
||||
if (rp->p && ri->p)
|
||||
if (ri->cnt >= rp->cnt) cnt_sub = 1;
|
||||
if (rp->p && ri->p) {
|
||||
rp->p->dp_max2 = rp->p->dp_max2 > ri->p->dp_max? rp->p->dp_max2 : ri->p->dp_max;
|
||||
if (rp->p->dp_max - ri->p->dp_max <= sub_diff) cnt_sub = 1;
|
||||
}
|
||||
if (cnt_sub) ++rp->n_sub;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
|
@ -289,9 +293,9 @@ void mm_join_long(void *km, const mm_mapopt_t *opt, int qlen, int *n_regs_, mm_r
|
|||
}
|
||||
}
|
||||
|
||||
void mm_set_mapq(int n_regs, mm_reg1_t *regs, int min_chain_sc, int n_rep_mini)
|
||||
void mm_set_mapq(int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len)
|
||||
{
|
||||
static const float q_coef = 30.0f;
|
||||
static const float q_coef = 40.0f;
|
||||
int i;
|
||||
for (i = 0; i < n_regs; ++i) {
|
||||
mm_reg1_t *r = ®s[i];
|
||||
|
|
@ -301,13 +305,19 @@ void mm_set_mapq(int n_regs, mm_reg1_t *regs, int min_chain_sc, int n_rep_mini)
|
|||
int mapq, subsc;
|
||||
float pen_s1 = r->score > 100? 1.0f : 0.01f * r->score;
|
||||
float pen_cm = r->cnt > 10? 1.0f : 0.1f * r->cnt;
|
||||
if (r->score <= 100 && rep_len > 0) {
|
||||
pen_s1 = 0.01f * (r->score - rep_len);
|
||||
pen_s1 = pen_s1 > 0.1f? pen_s1 : 0.1f;
|
||||
}
|
||||
pen_cm = pen_s1 < pen_cm? pen_s1 : pen_cm;
|
||||
subsc = r->subsc > min_chain_sc? r->subsc : min_chain_sc;
|
||||
if (r->p && r->p->dp_max2 > 0 && r->p->dp_max > 0) {
|
||||
float identity = (float)(r->p->blen - r->p->n_diff - r->p->n_ambi) / (r->p->blen - r->p->n_ambi);
|
||||
mapq = (int)(identity * pen_cm * q_coef * (1. - (float)r->p->dp_max2 * subsc / r->p->dp_max / r->score) * logf(r->score));
|
||||
int mapq_alt = (int)(6.02f * identity * identity * (r->p->dp_max - r->p->dp_max2) / match_sc + .499f); // BWA-MEM like mapQ, mostly for short reads
|
||||
mapq = (int)(identity * pen_cm * q_coef * (1. - (float)r->p->dp_max2 * subsc / r->p->dp_max / r->score) * logf(r->score)); // more for long reads
|
||||
mapq = mapq < mapq_alt? mapq : mapq_alt; // in case the long-read heuristic fails
|
||||
} else mapq = (int)(pen_cm * q_coef * (1. - (float)subsc / r->score) * logf(r->score));
|
||||
mapq -= (int)(4.343 * log(r->n_sub + 1) + .499);
|
||||
mapq -= (int)(4.343f * logf(r->n_sub + 1) + .499f);
|
||||
mapq = mapq > 0? mapq : 0;
|
||||
r->mapq = mapq < 60? mapq : 60;
|
||||
} else r->mapq = 0;
|
||||
|
|
|
|||
2
main.c
2
main.c
|
|
@ -6,7 +6,7 @@
|
|||
#include "mmpriv.h"
|
||||
#include "getopt.h"
|
||||
|
||||
#define MM_VERSION "2.1.1-r363-dirty"
|
||||
#define MM_VERSION "2.1.1-r364-dirty"
|
||||
|
||||
#ifdef __linux__
|
||||
#include <sys/resource.h>
|
||||
|
|
|
|||
17
map.c
17
map.c
|
|
@ -107,7 +107,7 @@ static void mm_dust_minier(mm128_v *mini, int l_seq, const char *seq, int sdust_
|
|||
|
||||
mm_reg1_t *mm_map(const mm_idx_t *mi, int qlen, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname)
|
||||
{
|
||||
int i, n, j, n_u, max_gap_ref, n_rep_mini = 0;
|
||||
int i, n, j, n_u, max_gap_ref, rep_st = 0, rep_en = 0, rep_len = 0;
|
||||
int64_t n_a;
|
||||
uint64_t *u;
|
||||
mm_match_t *m;
|
||||
|
|
@ -142,7 +142,11 @@ mm_reg1_t *mm_map(const mm_idx_t *mi, int qlen, const char *seq, int *n_regs, mm
|
|||
const uint64_t *r = q->x.cr;
|
||||
int k, q_span = p->x & 0xff, is_tandem = 0;
|
||||
if (q->n >= opt->mid_occ) {
|
||||
++n_rep_mini;
|
||||
int en = (q->qpos>>1) + 1, st = en - q_span;
|
||||
if (st > rep_en) {
|
||||
rep_len += rep_en - rep_st;
|
||||
rep_st = st, rep_en = en;
|
||||
} else rep_en = en;
|
||||
continue;
|
||||
}
|
||||
if (i > 0 && p->x>>8 == b->mini.a[i - 1].x>>8) is_tandem = 1;
|
||||
|
|
@ -168,12 +172,13 @@ mm_reg1_t *mm_map(const mm_idx_t *mi, int qlen, const char *seq, int *n_regs, mm
|
|||
if (is_tandem) p->y |= MM_SEED_TANDEM;
|
||||
}
|
||||
}
|
||||
rep_len += rep_en - rep_st;
|
||||
n_a = j;
|
||||
radix_sort_128x(a, a + n_a);
|
||||
kfree(b->km, m);
|
||||
|
||||
if (mm_dbg_flag & MM_DBG_PRINT_SEED) {
|
||||
fprintf(stderr, "RS\t%d\n", n_rep_mini);
|
||||
fprintf(stderr, "RS\t%d\n", rep_len);
|
||||
for (i = 0; i < n_a; ++i)
|
||||
fprintf(stderr, "SD\t%s\t%d\t%c\t%d\t%d\t%d\n", mi->seq[a[i].x<<1>>33].name, (int32_t)a[i].x, "+-"[a[i].x>>63], (int32_t)a[i].y, (int32_t)(a[i].y>>32&0xff),
|
||||
i == 0? 0 : ((int32_t)a[i].y - (int32_t)a[i-1].y) - ((int32_t)a[i].x - (int32_t)a[i-1].x));
|
||||
|
|
@ -191,7 +196,7 @@ mm_reg1_t *mm_map(const mm_idx_t *mi, int qlen, const char *seq, int *n_regs, mm
|
|||
i == regs[j].as? 0 : ((int32_t)a[i].y - (int32_t)a[i-1].y) - ((int32_t)a[i].x - (int32_t)a[i-1].x));
|
||||
|
||||
if (!(opt->flag & MM_F_AVA)) { // don't choose primary mapping(s) for read overlap
|
||||
mm_set_parent(b->km, opt->mask_level, *n_regs, regs);
|
||||
mm_set_parent(b->km, opt->mask_level, *n_regs, regs, opt->a * 2 + opt->b);
|
||||
mm_select_sub(b->km, opt->mask_level, opt->pri_ratio, mi->k*2, opt->best_n, n_regs, regs);
|
||||
if (!(opt->flag & MM_F_SPLICE))
|
||||
mm_join_long(b->km, opt, qlen, n_regs, regs, a); // TODO: this can be applied to all-vs-all in principle
|
||||
|
|
@ -199,12 +204,12 @@ mm_reg1_t *mm_map(const mm_idx_t *mi, int qlen, const char *seq, int *n_regs, mm
|
|||
if (opt->flag & MM_F_CIGAR) {
|
||||
regs = mm_align_skeleton(b->km, opt, mi, qlen, seq, n_regs, regs, a); // this calls mm_filter_regs()
|
||||
if (!(opt->flag & MM_F_AVA)) {
|
||||
mm_set_parent(b->km, opt->mask_level, *n_regs, regs);
|
||||
mm_set_parent(b->km, opt->mask_level, *n_regs, regs, opt->a * 2 + opt->b);
|
||||
mm_select_sub(b->km, opt->mask_level, opt->pri_ratio, mi->k*2, opt->best_n, n_regs, regs);
|
||||
mm_set_sam_pri(*n_regs, regs);
|
||||
}
|
||||
}
|
||||
mm_set_mapq(*n_regs, regs, opt->min_chain_score, n_rep_mini);
|
||||
mm_set_mapq(*n_regs, regs, opt->min_chain_score, opt->a, rep_len);
|
||||
|
||||
// free
|
||||
kfree(b->km, a);
|
||||
|
|
|
|||
4
mmpriv.h
4
mmpriv.h
|
|
@ -53,12 +53,12 @@ mm_reg1_t *mm_gen_regs(void *km, int qlen, int n_u, uint64_t *u, mm128_t *a);
|
|||
void mm_split_reg(mm_reg1_t *r, mm_reg1_t *r2, int n, int qlen, mm128_t *a);
|
||||
void mm_sync_regs(void *km, int n_regs, mm_reg1_t *regs);
|
||||
int mm_set_sam_pri(int n, mm_reg1_t *r);
|
||||
void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r);
|
||||
void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r, int sub_diff);
|
||||
void mm_select_sub(void *km, float mask_level, float pri_ratio, int min_diff, int best_n, int *n_, mm_reg1_t *r);
|
||||
void mm_filter_regs(void *km, const mm_mapopt_t *opt, int *n_regs, mm_reg1_t *regs);
|
||||
void mm_join_long(void *km, const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs, mm128_t *a);
|
||||
void mm_hit_sort_by_dp(void *km, int *n_regs, mm_reg1_t *r);
|
||||
void mm_set_mapq(int n_regs, mm_reg1_t *regs, int min_chain_sc, int n_rep_mini);
|
||||
void mm_set_mapq(int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue