From 20aa848b3c4b48382dc24112e6da6bfad1c991ba Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 25 Feb 2013 13:00:35 -0500 Subject: [PATCH] r279: for PE mapq, consider the number of pairs If there are a lot of proper pairs, it is more likely that the best pair is wrong. --- bwamem.c | 2 +- bwamem.h | 2 +- bwamem_pair.c | 44 ++++++++++++++++++++++++-------------------- main.c | 2 +- 4 files changed, 27 insertions(+), 23 deletions(-) diff --git a/bwamem.c b/bwamem.c index 88086ee..a1f25b7 100644 --- a/bwamem.c +++ b/bwamem.c @@ -604,7 +604,7 @@ int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) mapq = a->score? (int)(MEM_MAPQ_COEF * (1. - (double)sub / a->score) * log(a->seedcov) + .499) : 0; identity = 1. - (double)(l * opt->a - a->score) / (opt->a + opt->b) / l; mapq = identity < 0.95? (int)(mapq * identity * identity + .499) : mapq; - if (a->sub_n) mapq -= (int)(4.343 * log(a->sub_n) + .499); + if (a->sub_n > 0) mapq -= (int)(4.343 * log(a->sub_n+1) + .499); if (mapq > 60) mapq = 60; if (mapq < 0) mapq = 0; return mapq; diff --git a/bwamem.h b/bwamem.h index d99a9da..7fc2c85 100644 --- a/bwamem.h +++ b/bwamem.h @@ -5,7 +5,7 @@ #include "bntseq.h" #include "bwa.h" -#define MEM_MAPQ_COEF 40.0 +#define MEM_MAPQ_COEF 30.0 #define MEM_MAPQ_MAX 60 struct __smem_i; diff --git a/bwamem_pair.c b/bwamem_pair.c index 3ef71ea..3fbdec7 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -167,13 +167,12 @@ int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const me return n; } -int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, int *sub, int z[2]) +int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, int *sub, int *n_sub, int z[2]) { extern void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h); - pair64_v v; - pair64_t o, subo; // .x: score<<32 | raw_score<<8 | hash; .y: pair - int r, i, k, y[4]; // y[] keeps the last hit - kv_init(v); + pair64_v v, u; + int r, i, k, y[4], ret; // y[] keeps the last hit + kv_init(v); kv_init(u); for (r = 0; r < 2; ++r) { // loop through read number for (i = 0; i < a[r].n; ++i) { pair64_t key; @@ -185,7 +184,6 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_ } ks_introsort_128(v.n, v.a); y[0] = y[1] = y[2] = y[3] = -1; - o.x = subo.x = o.y = subo.y = 0; //for (i = 0; i < v.n; ++i) printf("[%d]\t%d\t%c%ld\n", i, (int)(v.a[i].y&1)+1, "+-"[v.a[i].y>>1&1], (long)v.a[i].x); for (i = 0; i < v.n; ++i) { for (r = 0; r < 2; ++r) { // loop through direction @@ -197,7 +195,7 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_ int64_t dist; int q; double ns; - uint64_t x, pair; + pair64_t *p; if ((v.a[k].y&3) != which) continue; dist = (int64_t)v.a[i].x - v.a[k].x; //printf("%d: %lld\n", k, dist); @@ -206,23 +204,27 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_ ns = (dist - pes[dir].avg) / pes[dir].std; q = (int)((v.a[i].y>>32) + (v.a[k].y>>32) + .721 * log(2. * erfc(fabs(ns) * M_SQRT1_2)) + .499); // .721 = 1/log(4) if (q < 0) q = 0; - pair = (uint64_t)k<<32 | i; - x = (uint64_t)q<<32 | (hash_64(pair ^ id<<8) & 0xffffffffU); + p = kv_pushp(pair64_t, u); + p->y = (uint64_t)k<<32 | i; + p->x = (uint64_t)q<<32 | (hash_64(p->y ^ id<<8) & 0xffffffffU); //printf("[%lld,%lld]\t%d\tdist=%ld\n", v.a[k].x, v.a[i].x, q, (long)dist); - if (x > o.x) subo = o, o.x = x, o.y = pair; - else if (x > subo.x) subo.x = x, subo.y = pair; } } y[v.a[i].y&3] = i; } - if (o.x > 0) { - i = o.y >> 32; k = o.y << 32 >> 32; - z[v.a[i].y&1] = v.a[i].y<<32>>34; + if (u.n) { // found at least one proper pair + int tmp = opt->a + opt->b > opt->q + opt->r? opt->a + opt->b : opt->q + opt->r; + ks_introsort_128(u.n, u.a); + i = u.a[u.n-1].y >> 32; k = u.a[u.n-1].y << 32 >> 32; + z[v.a[i].y&1] = v.a[i].y<<32>>34; // index of the best pair z[v.a[k].y&1] = v.a[k].y<<32>>34; - } - free(v.a); - *sub = subo.x>>32; - return o.x>>32; + ret = u.a[u.n-1].x >> 32; + *sub = u.n > 1? u.a[u.n-2].x>>32 : 0; + for (i = (long)u.n - 2, *n_sub = 0; i >= 0; --i) + if (*sub - (int)(u.a[i].x>>32) <= tmp) ++*n_sub; + } else ret = 0, *sub = 0, *n_sub = 0; + free(u.a); free(v.a); + return ret; } int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]) @@ -233,7 +235,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co extern void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h); extern void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, const bwahit_t *p, int is_hard, const bwahit_t *m); - int n = 0, i, j, z[2], o, subo; + int n = 0, i, j, z[2], o, subo, n_sub; kstring_t str; mem_alnreg_v b[2]; bwahit_t h[2]; @@ -253,7 +255,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co mem_mark_primary_se(opt, a[1].n, a[1].a); if (opt->flag&MEM_F_NOPAIRING) goto no_pairing; // pairing single-end hits - if (a[0].n && a[1].n && (o = mem_pair(opt, bns->l_pac, pac, pes, s, a, id, &subo, z)) > 0) { + if (a[0].n && a[1].n && (o = mem_pair(opt, bns->l_pac, pac, pes, s, a, id, &subo, &n_sub, z)) > 0) { int is_multi[2], q_pe, extra_flag = 1, score_un, q_se[2]; // check if an end has multiple hits even after mate-SW for (i = 0; i < 2; ++i) { @@ -267,6 +269,8 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co //q_pe = o && subo < o? (int)(MEM_MAPQ_COEF * (1. - (double)subo / o) * log(a[0].a[z[0]].seedcov + a[1].a[z[1]].seedcov) + .499) : 0; subo = subo > score_un? subo : score_un; q_pe = (o - subo) * 6; + if (n_sub > 0) q_pe -= (int)(4.343 * log(n_sub+1) + .499); + if (q_pe < 0) q_pe = 0; if (q_pe > 60) q_pe = 60; // the following assumes no split hits if (o > score_un) { // paired alignment is preferred diff --git a/main.c b/main.c index 9ef33fa..a0e8ec6 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r278-beta" +#define PACKAGE_VERSION "0.6.2-r279-beta" #endif static int usage()