r279: for PE mapq, consider the number of pairs

If there are a lot of proper pairs, it is more likely that the best pair is
wrong.
This commit is contained in:
Heng Li 2013-02-25 13:00:35 -05:00
parent 9957e04590
commit 20aa848b3c
4 changed files with 27 additions and 23 deletions

View File

@ -604,7 +604,7 @@ int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a)
mapq = a->score? (int)(MEM_MAPQ_COEF * (1. - (double)sub / a->score) * log(a->seedcov) + .499) : 0;
identity = 1. - (double)(l * opt->a - a->score) / (opt->a + opt->b) / l;
mapq = identity < 0.95? (int)(mapq * identity * identity + .499) : mapq;
if (a->sub_n) mapq -= (int)(4.343 * log(a->sub_n) + .499);
if (a->sub_n > 0) mapq -= (int)(4.343 * log(a->sub_n+1) + .499);
if (mapq > 60) mapq = 60;
if (mapq < 0) mapq = 0;
return mapq;

View File

@ -5,7 +5,7 @@
#include "bntseq.h"
#include "bwa.h"
#define MEM_MAPQ_COEF 40.0
#define MEM_MAPQ_COEF 30.0
#define MEM_MAPQ_MAX 60
struct __smem_i;

View File

@ -167,13 +167,12 @@ int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const me
return n;
}
int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, int *sub, int z[2])
int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, int *sub, int *n_sub, int z[2])
{
extern void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h);
pair64_v v;
pair64_t o, subo; // .x: score<<32 | raw_score<<8 | hash; .y: pair
int r, i, k, y[4]; // y[] keeps the last hit
kv_init(v);
pair64_v v, u;
int r, i, k, y[4], ret; // y[] keeps the last hit
kv_init(v); kv_init(u);
for (r = 0; r < 2; ++r) { // loop through read number
for (i = 0; i < a[r].n; ++i) {
pair64_t key;
@ -185,7 +184,6 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_
}
ks_introsort_128(v.n, v.a);
y[0] = y[1] = y[2] = y[3] = -1;
o.x = subo.x = o.y = subo.y = 0;
//for (i = 0; i < v.n; ++i) printf("[%d]\t%d\t%c%ld\n", i, (int)(v.a[i].y&1)+1, "+-"[v.a[i].y>>1&1], (long)v.a[i].x);
for (i = 0; i < v.n; ++i) {
for (r = 0; r < 2; ++r) { // loop through direction
@ -197,7 +195,7 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_
int64_t dist;
int q;
double ns;
uint64_t x, pair;
pair64_t *p;
if ((v.a[k].y&3) != which) continue;
dist = (int64_t)v.a[i].x - v.a[k].x;
//printf("%d: %lld\n", k, dist);
@ -206,23 +204,27 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_
ns = (dist - pes[dir].avg) / pes[dir].std;
q = (int)((v.a[i].y>>32) + (v.a[k].y>>32) + .721 * log(2. * erfc(fabs(ns) * M_SQRT1_2)) + .499); // .721 = 1/log(4)
if (q < 0) q = 0;
pair = (uint64_t)k<<32 | i;
x = (uint64_t)q<<32 | (hash_64(pair ^ id<<8) & 0xffffffffU);
p = kv_pushp(pair64_t, u);
p->y = (uint64_t)k<<32 | i;
p->x = (uint64_t)q<<32 | (hash_64(p->y ^ id<<8) & 0xffffffffU);
//printf("[%lld,%lld]\t%d\tdist=%ld\n", v.a[k].x, v.a[i].x, q, (long)dist);
if (x > o.x) subo = o, o.x = x, o.y = pair;
else if (x > subo.x) subo.x = x, subo.y = pair;
}
}
y[v.a[i].y&3] = i;
}
if (o.x > 0) {
i = o.y >> 32; k = o.y << 32 >> 32;
z[v.a[i].y&1] = v.a[i].y<<32>>34;
if (u.n) { // found at least one proper pair
int tmp = opt->a + opt->b > opt->q + opt->r? opt->a + opt->b : opt->q + opt->r;
ks_introsort_128(u.n, u.a);
i = u.a[u.n-1].y >> 32; k = u.a[u.n-1].y << 32 >> 32;
z[v.a[i].y&1] = v.a[i].y<<32>>34; // index of the best pair
z[v.a[k].y&1] = v.a[k].y<<32>>34;
}
free(v.a);
*sub = subo.x>>32;
return o.x>>32;
ret = u.a[u.n-1].x >> 32;
*sub = u.n > 1? u.a[u.n-2].x>>32 : 0;
for (i = (long)u.n - 2, *n_sub = 0; i >= 0; --i)
if (*sub - (int)(u.a[i].x>>32) <= tmp) ++*n_sub;
} else ret = 0, *sub = 0, *n_sub = 0;
free(u.a); free(v.a);
return ret;
}
int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2])
@ -233,7 +235,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
extern void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h);
extern void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, const bwahit_t *p, int is_hard, const bwahit_t *m);
int n = 0, i, j, z[2], o, subo;
int n = 0, i, j, z[2], o, subo, n_sub;
kstring_t str;
mem_alnreg_v b[2];
bwahit_t h[2];
@ -253,7 +255,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
mem_mark_primary_se(opt, a[1].n, a[1].a);
if (opt->flag&MEM_F_NOPAIRING) goto no_pairing;
// pairing single-end hits
if (a[0].n && a[1].n && (o = mem_pair(opt, bns->l_pac, pac, pes, s, a, id, &subo, z)) > 0) {
if (a[0].n && a[1].n && (o = mem_pair(opt, bns->l_pac, pac, pes, s, a, id, &subo, &n_sub, z)) > 0) {
int is_multi[2], q_pe, extra_flag = 1, score_un, q_se[2];
// check if an end has multiple hits even after mate-SW
for (i = 0; i < 2; ++i) {
@ -267,6 +269,8 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
//q_pe = o && subo < o? (int)(MEM_MAPQ_COEF * (1. - (double)subo / o) * log(a[0].a[z[0]].seedcov + a[1].a[z[1]].seedcov) + .499) : 0;
subo = subo > score_un? subo : score_un;
q_pe = (o - subo) * 6;
if (n_sub > 0) q_pe -= (int)(4.343 * log(n_sub+1) + .499);
if (q_pe < 0) q_pe = 0;
if (q_pe > 60) q_pe = 60;
// the following assumes no split hits
if (o > score_un) { // paired alignment is preferred

2
main.c
View File

@ -4,7 +4,7 @@
#include "utils.h"
#ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "0.6.2-r278-beta"
#define PACKAGE_VERSION "0.6.2-r279-beta"
#endif
static int usage()