From fb8a1b55367d554f65b50e73d496b04183ee10bc Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 31 Oct 2017 14:25:09 -0400 Subject: [PATCH] r542: tuning mapQ calculation --- hit.c | 23 ++++++++++++++++++----- main.c | 2 +- map.c | 8 ++++---- mmpriv.h | 2 +- 4 files changed, 24 insertions(+), 11 deletions(-) diff --git a/hit.c b/hit.c index 0d4e86d..ed013ac 100644 --- a/hit.c +++ b/hit.c @@ -442,7 +442,7 @@ void mm_seg_free(void *km, int n_segs, mm_seg_t *segs) kfree(km, segs); } -void mm_set_mapq(int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len) +void mm_set_mapq(int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len, int is_sr) { static const float q_coef = 40.0f; int i; @@ -458,10 +458,23 @@ void mm_set_mapq(int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, in subsc = r->subsc > min_chain_sc? r->subsc : min_chain_sc; if (r->p && r->p->dp_max2 > 0 && r->p->dp_max > 0) { float identity = (float)r->mlen / r->blen; - int mapq_alt = (int)(6.02f * identity * identity * (r->p->dp_max - r->p->dp_max2) / match_sc + .499f); // BWA-MEM like mapQ, mostly for short reads - mapq = (int)(identity * pen_cm * q_coef * (1. - (float)r->p->dp_max2 * subsc / r->p->dp_max / r->score) * logf(r->score)); // more for long reads - mapq = mapq < mapq_alt? mapq : mapq_alt; // in case the long-read heuristic fails - } else mapq = (int)(pen_cm * q_coef * (1. - (float)subsc / r->score) * logf(r->score)); + float x = (float)r->p->dp_max2 * subsc / r->p->dp_max / r->score; + if (is_sr) { + mapq = (int)(identity * pen_cm * q_coef * (1.0f - x * x) * logf((float)r->p->dp_max / match_sc)); + } else { + int mapq_alt = (int)(6.02f * identity * identity * (r->p->dp_max - r->p->dp_max2) / match_sc + .499f); // BWA-MEM like mapQ, mostly for short reads + mapq = (int)(identity * pen_cm * q_coef * (1.0f - x) * logf(r->score)); // more for long reads + mapq = mapq < mapq_alt? mapq : mapq_alt; // in case the long-read heuristic fails + } + } else { + float x = (float)subsc / r->score; + if (is_sr && r->p) { + float identity = (float)r->mlen / r->blen; + mapq = (int)(identity * pen_cm * q_coef * (1.0f - x) * logf((float)r->p->dp_max / match_sc)); + } else { + mapq = (int)(pen_cm * q_coef * (1.0f - x) * logf(r->score)); + } + } mapq -= (int)(4.343f * logf(r->n_sub + 1) + .499f); mapq = mapq > 0? mapq : 0; r->mapq = mapq < 60? mapq : 60; diff --git a/main.c b/main.c index e1ae7fe..b919958 100644 --- a/main.c +++ b/main.c @@ -6,7 +6,7 @@ #include "mmpriv.h" #include "getopt.h" -#define MM_VERSION "2.3-r540-dirty" +#define MM_VERSION "2.3-r542-dirty" #ifdef __linux__ #include diff --git a/map.c b/map.c index e78975e..eea9690 100644 --- a/map.c +++ b/map.c @@ -270,7 +270,7 @@ static mm_reg1_t *align_regs(const mm_mapopt_t *opt, const mm_idx_t *mi, void *k void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **seqs, const char **quals, int *n_regs, mm_reg1_t **regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname) { int i, j, rep_len, qlen_sum, n_regs0; - int max_chain_gap_qry, max_chain_gap_ref, is_splice = !!(opt->flag & MM_F_SPLICE); + int max_chain_gap_qry, max_chain_gap_ref, is_splice = !!(opt->flag & MM_F_SPLICE), is_sr = !!(opt->flag & MM_F_SR); uint32_t hash; int64_t n_a; uint64_t *u; @@ -298,7 +298,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char ** } // set max chaining gap on the query and the reference sequence - if (opt->flag & MM_F_SR) + if (is_sr) max_chain_gap_qry = qlen_sum > opt->max_gap? qlen_sum : opt->max_gap; else max_chain_gap_qry = opt->max_gap; if (opt->max_gap_ref > 0) { @@ -345,7 +345,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char ** if (n_segs == 1) { // uni-segment regs0 = align_regs(opt, mi, b->km, qlens[0], seqs[0], quals? quals[0] : 0, &n_regs0, regs0, a); - mm_set_mapq(n_regs0, regs0, opt->min_chain_score, opt->a, rep_len); + mm_set_mapq(n_regs0, regs0, opt->min_chain_score, opt->a, rep_len, is_sr); n_regs[0] = n_regs0, regs[0] = regs0; } else { // multi-segment mm_seg_t *seg; @@ -354,7 +354,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char ** for (i = 0; i < n_segs; ++i) { mm_set_parent(b->km, opt->mask_level, n_regs[i], regs[i], opt->a * 2 + opt->b); // update mm_reg1_t::parent regs[i] = align_regs(opt, mi, b->km, qlens[i], seqs[i], quals? quals[i] : 0, &n_regs[i], regs[i], seg[i].a); - mm_set_mapq(n_regs[i], regs[i], opt->min_chain_score, opt->a, rep_len); + mm_set_mapq(n_regs[i], regs[i], opt->min_chain_score, opt->a, rep_len, is_sr); } mm_seg_free(b->km, n_segs, seg); if (n_segs == 2 && opt->pe_ori >= 0 && (opt->flag&MM_F_CIGAR)) diff --git a/mmpriv.h b/mmpriv.h index 7fe23e3..bbf54e5 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -78,7 +78,7 @@ void mm_filter_regs(void *km, const mm_mapopt_t *opt, int *n_regs, mm_reg1_t *re void mm_filter_by_identity(void *km, int n_regs, mm_reg1_t *regs, float min_iden, int qlen, const char *qual); void mm_join_long(void *km, const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs, mm128_t *a); void mm_hit_sort_by_dp(void *km, int *n_regs, mm_reg1_t *r); -void mm_set_mapq(int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len); +void mm_set_mapq(int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len, int is_sr); mm_seg_t *mm_seg_gen(void *km, uint32_t hash, int n_segs, const int *qlens, int n_regs0, const mm_reg1_t *regs0, int *n_regs, mm_reg1_t **regs, const mm128_t *a); void mm_seg_free(void *km, int n_segs, mm_seg_t *segs);