From 623da055e1aa0a673286bb64777a7155bead490b Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 6 Sep 2013 12:31:47 -0400 Subject: [PATCH] alternative way to estimate mapQ the old mapQ estimate is too conservative --- bwamem.c | 15 +++++++++++++-- bwamem.h | 2 ++ fastmap.c | 8 ++++++-- 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/bwamem.c b/bwamem.c index 06fc8ac..e0aa3e4 100644 --- a/bwamem.c +++ b/bwamem.c @@ -63,6 +63,8 @@ mem_opt_t *mem_opt_init() o->chunk_size = 10000000; o->n_threads = 1; o->max_matesw = 100; + o->mapQ_coef_len = 100; + o->mapQ_coef_fac = log(o->mapQ_coef_len); bwa_fill_scmat(o->a, o->b, o->mat); return o; } @@ -768,9 +770,18 @@ int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) sub = a->csub > sub? a->csub : sub; if (sub >= a->score) return 0; l = a->qe - a->qb > a->re - a->rb? a->qe - a->qb : a->re - a->rb; - mapq = a->score? (int)(MEM_MAPQ_COEF * (1. - (double)sub / a->score) * log(a->seedcov) + .499) : 0; identity = 1. - (double)(l * opt->a - a->score) / (opt->a + opt->b) / l; - mapq = identity < 0.95? (int)(mapq * identity * identity + .499) : mapq; + if (a->score == 0) { + mapq = 0; + } else if (opt->mapQ_coef_len > 0) { + double tmp; + tmp = 6.02 * (a->score - sub) / opt->a * identity; + if (l > opt->mapQ_coef_len) tmp *= log(l) / opt->mapQ_coef_fac; + mapq = (int)(tmp + .499); + } else { + mapq = (int)(MEM_MAPQ_COEF * (1. - (double)sub / a->score) * log(a->seedcov) + .499); + mapq = identity < 0.95? (int)(mapq * identity * identity + .499) : mapq; + } if (a->sub_n > 0) mapq -= (int)(4.343 * log(a->sub_n+1) + .499); if (mapq > 60) mapq = 60; if (mapq < 0) mapq = 0; diff --git a/bwamem.h b/bwamem.h index a81c92b..01b690c 100644 --- a/bwamem.h +++ b/bwamem.h @@ -35,6 +35,8 @@ typedef struct { int chunk_size; // process chunk_size-bp sequences in a batch float mask_level; // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits float chain_drop_ratio; // drop a chain if its seed coverage is below chain_drop_ratio times the seed coverage of a better chain overlapping with the small chain + float mapQ_coef_len; + int mapQ_coef_fac; int max_ins; // when estimating insert size distribution, skip pairs with insert longer than this value int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset diff --git a/fastmap.c b/fastmap.c index 598194b..21e47e0 100644 --- a/fastmap.c +++ b/fastmap.c @@ -3,6 +3,7 @@ #include #include #include +#include #include "bwa.h" #include "bwamem.h" #include "kvec.h" @@ -28,7 +29,7 @@ int main_mem(int argc, char *argv[]) void *ko = 0, *ko2 = 0; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "paMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:")) >= 0) { + while ((c = getopt(argc, argv, "paMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 'w') opt->w = atoi(optarg); else if (c == 'A') opt->a = atoi(optarg); @@ -48,7 +49,10 @@ int main_mem(int argc, char *argv[]) else if (c == 'v') bwa_verbose = atoi(optarg); else if (c == 'r') opt->split_factor = atof(optarg); else if (c == 'C') copy_comment = 1; - else if (c == 'L') { + else if (c == 'Q') { + opt->mapQ_coef_len = atoi(optarg); + opt->mapQ_coef_fac = opt->mapQ_coef_len > 0? log(opt->mapQ_coef_len) : 0; + } else if (c == 'L') { char *p; opt->pen_clip5 = opt->pen_clip3 = strtol(optarg, &p, 10); if (*p != 0 && ispunct(*p) && isdigit(p[1]))