alternative way to estimate mapQ

the old mapQ estimate is too conservative
This commit is contained in:
Heng Li 2013-09-06 12:31:47 -04:00
parent ed78df9184
commit 623da055e1
3 changed files with 21 additions and 4 deletions

View File

@ -63,6 +63,8 @@ mem_opt_t *mem_opt_init()
o->chunk_size = 10000000;
o->n_threads = 1;
o->max_matesw = 100;
o->mapQ_coef_len = 100;
o->mapQ_coef_fac = log(o->mapQ_coef_len);
bwa_fill_scmat(o->a, o->b, o->mat);
return o;
}
@ -768,9 +770,18 @@ int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a)
sub = a->csub > sub? a->csub : sub;
if (sub >= a->score) return 0;
l = a->qe - a->qb > a->re - a->rb? a->qe - a->qb : a->re - a->rb;
mapq = a->score? (int)(MEM_MAPQ_COEF * (1. - (double)sub / a->score) * log(a->seedcov) + .499) : 0;
identity = 1. - (double)(l * opt->a - a->score) / (opt->a + opt->b) / l;
mapq = identity < 0.95? (int)(mapq * identity * identity + .499) : mapq;
if (a->score == 0) {
mapq = 0;
} else if (opt->mapQ_coef_len > 0) {
double tmp;
tmp = 6.02 * (a->score - sub) / opt->a * identity;
if (l > opt->mapQ_coef_len) tmp *= log(l) / opt->mapQ_coef_fac;
mapq = (int)(tmp + .499);
} else {
mapq = (int)(MEM_MAPQ_COEF * (1. - (double)sub / a->score) * log(a->seedcov) + .499);
mapq = identity < 0.95? (int)(mapq * identity * identity + .499) : mapq;
}
if (a->sub_n > 0) mapq -= (int)(4.343 * log(a->sub_n+1) + .499);
if (mapq > 60) mapq = 60;
if (mapq < 0) mapq = 0;

View File

@ -35,6 +35,8 @@ typedef struct {
int chunk_size; // process chunk_size-bp sequences in a batch
float mask_level; // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits
float chain_drop_ratio; // drop a chain if its seed coverage is below chain_drop_ratio times the seed coverage of a better chain overlapping with the small chain
float mapQ_coef_len;
int mapQ_coef_fac;
int max_ins; // when estimating insert size distribution, skip pairs with insert longer than this value
int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end
int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset

View File

@ -3,6 +3,7 @@
#include <unistd.h>
#include <stdlib.h>
#include <ctype.h>
#include <math.h>
#include "bwa.h"
#include "bwamem.h"
#include "kvec.h"
@ -28,7 +29,7 @@ int main_mem(int argc, char *argv[])
void *ko = 0, *ko2 = 0;
opt = mem_opt_init();
while ((c = getopt(argc, argv, "paMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:")) >= 0) {
while ((c = getopt(argc, argv, "paMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:")) >= 0) {
if (c == 'k') opt->min_seed_len = atoi(optarg);
else if (c == 'w') opt->w = atoi(optarg);
else if (c == 'A') opt->a = atoi(optarg);
@ -48,7 +49,10 @@ int main_mem(int argc, char *argv[])
else if (c == 'v') bwa_verbose = atoi(optarg);
else if (c == 'r') opt->split_factor = atof(optarg);
else if (c == 'C') copy_comment = 1;
else if (c == 'L') {
else if (c == 'Q') {
opt->mapQ_coef_len = atoi(optarg);
opt->mapQ_coef_fac = opt->mapQ_coef_len > 0? log(opt->mapQ_coef_len) : 0;
} else if (c == 'L') {
char *p;
opt->pen_clip5 = opt->pen_clip3 = strtol(optarg, &p, 10);
if (*p != 0 && ispunct(*p) && isdigit(p[1]))