From 6a82a21deedee7eeab11132c145e17d184961143 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 13 Sep 2017 15:32:39 -0400 Subject: [PATCH] r361: improved mapq for short reads --- hit.c | 13 +++++++++---- main.c | 4 ++-- map.c | 17 +++++++++++------ misc/sim-eval.js | 4 ++-- mmpriv.h | 2 +- 5 files changed, 25 insertions(+), 15 deletions(-) diff --git a/hit.c b/hit.c index 41c3b8e..d1a81cb 100644 --- a/hit.c +++ b/hit.c @@ -105,12 +105,13 @@ void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r) // and compu if (ol > mask_level * min) { ri->parent = rp->parent; rp->subsc = rp->subsc > ri->score? rp->subsc : ri->score; + if (ri->cnt >= rp->cnt) ++ri->n_sub; if (rp->p && ri->p) rp->p->dp_max2 = rp->p->dp_max2 > ri->p->dp_max? rp->p->dp_max2 : ri->p->dp_max; break; } } - if (j == k) w[k++] = i, ri->parent = i; + if (j == k) w[k++] = i, ri->parent = i, ri->n_sub = 0; } kfree(km, w); } @@ -288,7 +289,7 @@ void mm_join_long(void *km, const mm_mapopt_t *opt, int qlen, int *n_regs_, mm_r } } -void mm_set_mapq(int n_regs, mm_reg1_t *regs, int min_chain_sc) +void mm_set_mapq(int n_regs, mm_reg1_t *regs, int min_chain_sc, int n_rep_mini) { static const float q_coef = 30.0f; int i; @@ -298,12 +299,16 @@ void mm_set_mapq(int n_regs, mm_reg1_t *regs, int min_chain_sc) r->mapq = 0; } else if (r->parent == r->id) { int mapq, subsc; - float pen_cm = r->cnt >= 10? 1.0f : 0.1f * r->cnt; + float pen_cm = 1.0f; + if (r->cnt < 10) + pen_cm = 0.1f * r->cnt * (r->cnt > n_rep_mini? 1.0f : (1.0f + r->cnt) / (1.0f + n_rep_mini)); subsc = r->subsc > min_chain_sc? r->subsc : min_chain_sc; if (r->p && r->p->dp_max2 > 0 && r->p->dp_max > 0) { float identity = (float)(r->p->blen - r->p->n_diff - r->p->n_ambi) / (r->p->blen - r->p->n_ambi); - mapq = (int)(identity * pen_cm * q_coef * (1. - (float)r->p->dp_max2 * subsc / r->p->dp_max / r->score) * logf(r->score)); + float chain_ratio = subsc > r->score? (float)subsc / r->score : 1.0f; + mapq = (int)(identity * pen_cm * q_coef * (1. - (float)chain_ratio * r->p->dp_max2 / r->p->dp_max) * logf(r->score)); } else mapq = (int)(pen_cm * q_coef * (1. - (float)subsc / r->score) * logf(r->score)); + mapq -= (int)(4.343 * log(r->n_sub + 1) + .499); mapq = mapq > 0? mapq : 0; r->mapq = mapq < 60? mapq : 60; } else r->mapq = 0; diff --git a/main.c b/main.c index da019b2..1f0ac56 100644 --- a/main.c +++ b/main.c @@ -6,7 +6,7 @@ #include "mmpriv.h" #include "getopt.h" -#define MM_VERSION "2.1.1-r360-dirty" +#define MM_VERSION "2.1.1-r361-dirty" #ifdef __linux__ #include @@ -29,7 +29,7 @@ static struct option long_options[] = { { "no-kalloc", no_argument, 0, 0 }, { "print-qname", no_argument, 0, 0 }, { "no-self", no_argument, 0, 0 }, - { "print-seed", no_argument, 0, 0 }, + { "print-seeds", no_argument, 0, 0 }, { "max-chain-skip", required_argument, 0, 0 }, { "min-dp-len", required_argument, 0, 0 }, { "print-aln-seq", no_argument, 0, 0 }, diff --git a/map.c b/map.c index 5510de6..d2d615a 100644 --- a/map.c +++ b/map.c @@ -107,7 +107,7 @@ static void mm_dust_minier(mm128_v *mini, int l_seq, const char *seq, int sdust_ mm_reg1_t *mm_map(const mm_idx_t *mi, int qlen, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname) { - int i, n, j, n_u, max_gap_ref; + int i, n, j, n_u, max_gap_ref, n_rep_mini = 0; int64_t n_a; uint64_t *u; mm_match_t *m; @@ -141,7 +141,10 @@ mm_reg1_t *mm_map(const mm_idx_t *mi, int qlen, const char *seq, int *n_regs, mm mm_match_t *q = &m[i]; const uint64_t *r = q->x.cr; int k, q_span = p->x & 0xff, is_tandem = 0; - if (q->n >= opt->mid_occ) continue; + if (q->n >= opt->mid_occ) { + ++n_rep_mini; + continue; + } if (i > 0 && p->x>>8 == b->mini.a[i - 1].x>>8) is_tandem = 1; if (i < n - 1 && p->x>>8 == b->mini.a[i + 1].x>>8) is_tandem = 1; for (k = 0; k < q->n; ++k) { @@ -156,10 +159,10 @@ mm_reg1_t *mm_map(const mm_idx_t *mi, int qlen, const char *seq, int *n_regs, mm } p = &a[j++]; if ((r[k]&1) == (q->qpos&1)) { // forward strand - p->x = (r[k]&0xffffffff00000000ULL) | (uint32_t)r[k]>>1; + p->x = (r[k]&0xffffffff00000000ULL) | rpos; p->y = (uint64_t)q_span << 32 | q->qpos >> 1; } else { // reverse strand - p->x = 1ULL<<63 | (r[k]&0xffffffff00000000ULL) | (uint32_t)r[k]>>1; + p->x = 1ULL<<63 | (r[k]&0xffffffff00000000ULL) | rpos; p->y = (uint64_t)q_span << 32 | (qlen - ((q->qpos>>1) + 1 - q_span) - 1); } if (is_tandem) p->y |= MM_SEED_TANDEM; @@ -169,10 +172,12 @@ mm_reg1_t *mm_map(const mm_idx_t *mi, int qlen, const char *seq, int *n_regs, mm radix_sort_128x(a, a + n_a); kfree(b->km, m); - if (mm_dbg_flag & MM_DBG_PRINT_SEED) + if (mm_dbg_flag & MM_DBG_PRINT_SEED) { + fprintf(stderr, "RS\t%d\n", n_rep_mini); for (i = 0; i < n_a; ++i) fprintf(stderr, "SD\t%s\t%d\t%c\t%d\t%d\t%d\n", mi->seq[a[i].x<<1>>33].name, (int32_t)a[i].x, "+-"[a[i].x>>63], (int32_t)a[i].y, (int32_t)(a[i].y>>32&0xff), i == 0? 0 : ((int32_t)a[i].y - (int32_t)a[i-1].y) - ((int32_t)a[i].x - (int32_t)a[i-1].x)); + } max_gap_ref = opt->max_gap_ref >= 0? opt->max_gap_ref : opt->max_gap; n_u = mm_chain_dp(max_gap_ref, opt->max_gap, opt->bw, opt->max_chain_skip, opt->min_cnt, opt->min_chain_score, !!(opt->flag&MM_F_SPLICE), n_a, a, &u, b->km); @@ -199,7 +204,7 @@ mm_reg1_t *mm_map(const mm_idx_t *mi, int qlen, const char *seq, int *n_regs, mm mm_set_sam_pri(*n_regs, regs); } } - mm_set_mapq(*n_regs, regs, opt->min_chain_score); + mm_set_mapq(*n_regs, regs, opt->min_chain_score, n_rep_mini); // free kfree(b->km, a); diff --git a/misc/sim-eval.js b/misc/sim-eval.js index 07e9256..79edd57 100644 --- a/misc/sim-eval.js +++ b/misc/sim-eval.js @@ -181,11 +181,11 @@ var sum_tot = 0, sum_err = 0, q_out = -1, sum_tot2 = 0, sum_err2 = 0; for (var q = max_mapq; q >= 0; --q) { if (tot[q] == 0) continue; if (q_out < 0 || err[q] > 0) { - if (q_out >= 0) print('Q', q_out, sum_tot, sum_err, (sum_err2/sum_tot2).toFixed(9)); + if (q_out >= 0) print('Q', q_out, sum_tot, sum_err, (sum_err2/sum_tot2).toFixed(9), sum_tot2); sum_tot = sum_err = 0, q_out = q; } sum_tot += tot[q], sum_err += err[q]; sum_tot2 += tot[q], sum_err2 += err[q]; } -print('Q', q_out, sum_tot, sum_err, (sum_err2/sum_tot2).toFixed(9)); +print('Q', q_out, sum_tot, sum_err, (sum_err2/sum_tot2).toFixed(9), sum_tot2); if (n_unmapped != null) print('U', n_unmapped); diff --git a/mmpriv.h b/mmpriv.h index 3f87686..6674535 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -58,7 +58,7 @@ void mm_select_sub(void *km, float mask_level, float pri_ratio, int min_diff, in void mm_filter_regs(void *km, const mm_mapopt_t *opt, int *n_regs, mm_reg1_t *regs); void mm_join_long(void *km, const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs, mm128_t *a); void mm_hit_sort_by_dp(void *km, int *n_regs, mm_reg1_t *r); -void mm_set_mapq(int n_regs, mm_reg1_t *regs, int min_chain_sc); +void mm_set_mapq(int n_regs, mm_reg1_t *regs, int min_chain_sc, int n_rep_mini); #ifdef __cplusplus }