From ee80fb8bd07451f0eba4d7dc9f76d507ed325a13 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 26 Feb 2013 22:55:44 -0500 Subject: [PATCH] Test each seed to see if extension is needed The old version wastefully extends many seeds contained in an aligned region found before. While this wastes little time for short reads, it becomes a serious defect for long query sequences. This is an attempt to fix this problem, but more tuning are needed. --- bwamem.c | 83 +++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 52 insertions(+), 31 deletions(-) diff --git a/bwamem.c b/bwamem.c index 4471682..8b2216e 100644 --- a/bwamem.c +++ b/bwamem.c @@ -13,6 +13,7 @@ #include "ksw.h" #include "kvec.h" #include "ksort.h" +#include "utils.h" /* Theory on probability and scoring *ungapped* alignment * @@ -417,6 +418,21 @@ void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) // IMPORT * Construct the alignment from a chain * ****************************************/ +static const char LogTable256[256] = { +#define LT(n) n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n + -1, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, + LT(4), LT(5), LT(5), LT(6), LT(6), LT(6), LT(6), + LT(7), LT(7), LT(7), LT(7), LT(7), LT(7), LT(7), LT(7) +}; +#undef LT + +static inline int ilog2(uint32_t v) +{ + register uint32_t t, tt; + if ((tt = (v >> 16))) return (t = (tt >> 8)) ? 24 + LogTable256[t] : 16 + LogTable256[tt]; + return (t = (v >> 8)) ? 8 + LogTable256[t] : LogTable256[v]; +} + static inline int cal_max_gap(const mem_opt_t *opt, int qlen) { int l = (int)((double)(qlen * opt->a - opt->q) / opt->r + 1.); @@ -429,8 +445,9 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int int64_t rlen, rmax[2], tmp, max = 0; const mem_seed_t *s; uint8_t *rseq = 0; + uint64_t *srt; - av->n = 0; + if (c->n == 0) return; // get the max possible span rmax[0] = l_pac<<1; rmax[1] = 0; for (i = 0; i < c->n; ++i) { @@ -446,11 +463,31 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int rseq = bns_get_seq(l_pac, pac, rmax[0], rmax[1], &rlen); if (rlen != rmax[1] - rmax[0]) return; - for (k = 0; k < c->n;) { + srt = malloc(c->n * 8); + for (i = 0; i < c->n; ++i) + srt[i] = (uint64_t)c->seeds[i].len<<32 | i; + ks_introsort_64(c->n, srt); + + for (k = c->n - 1; k >= 0; --k) { mem_alnreg_t *a; + s = &c->seeds[(uint32_t)srt[k]]; + + for (i = 0; i < av->n; ++i) { // test whether extension has been made before + mem_alnreg_t *p = &av->a[i]; + int64_t rd; + int qd, w; + if (s->qbeg < p->qb || s->qbeg + s->len > p->qe || s->rbeg < p->rb || s->rbeg + s->len > p->re) continue; + qd = s->qbeg - p->qb; + rd = s->rbeg - p->rb; + w = ilog2(p->re - p->rb)<<1; // heuristic band width: small size for short hits + w = w < opt->w? w : opt->w; + if (qd - rd < w && rd - qd < w) break; // the seed is "around" a previous hit + } + if (i < av->n) continue; + a = kv_pushp(mem_alnreg_t, *av); - s = &c->seeds[k]; memset(a, 0, sizeof(mem_alnreg_t)); + if (s->qbeg) { // left extension uint8_t *rs, *qs; int qle, tle; @@ -464,7 +501,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int free(qs); free(rs); } else a->score = s->len * opt->a, a->qb = 0, a->rb = s->rbeg; - if (s->qbeg + s->len != l_query) { // right extension of the first seed + if (s->qbeg + s->len != l_query) { // right extension int qle, tle, qe, re; qe = s->qbeg + s->len; re = s->rbeg + s->len - rmax[0]; @@ -472,21 +509,15 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int a->qe = qe + qle; a->re = rmax[0] + re + tle; } else a->qe = l_query, a->re = s->rbeg + s->len; if (bwa_verbose >= 4) printf("[%d] score=%d\t[%d,%d) <=> [%ld,%ld)\n", k, a->score, a->qb, a->qe, (long)a->rb, (long)a->re); + // compute seedcov for (i = 0, a->seedcov = 0; i < c->n; ++i) { const mem_seed_t *t = &c->seeds[i]; if (t->qbeg >= a->qb && t->qbeg + t->len <= a->qe && t->rbeg >= a->rb && t->rbeg + t->len <= a->re) // seed fully contained a->seedcov += t->len; // this is not very accurate, but for approx. mapQ, this is good enough } - // jump to the next seed that: 1) has no >7bp overlap with the previous seed, or 2) is not fully contained in the alignment - for (i = k + 1; i < c->n; ++i) { - const mem_seed_t *t = &c->seeds[i]; - if ((t-1)->rbeg + (t-1)->len >= t->rbeg + 7 || (t-1)->qbeg + (t-1)->len >= t->qbeg + 7) break; - if (t->rbeg + t->len > a->re || t->qbeg + t->len > a->qe) break; - } - k = i; } - free(rseq); + free(srt); free(rseq); } /***************************** @@ -648,34 +679,24 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq) { - int i, j, k; + int i; mem_chain_v chn; - mem_alnreg_v regs, tmp; - for (i = 0; i < l_seq; ++i) + mem_alnreg_v regs; + + for (i = 0; i < l_seq; ++i) // convert to 2-bit encoding if we have not done so seq[i] = seq[i] < 4? seq[i] : nst_nt4_table[(int)seq[i]]; + chn = mem_chain(opt, bwt, l_seq, (uint8_t*)seq); chn.n = mem_chain_flt(opt, chn.n, chn.a); if (bwa_verbose >= 4) mem_print_chain(bns, &chn); - kv_init(regs); kv_init(tmp); + + kv_init(regs); for (i = 0; i < chn.n; ++i) { mem_chain_t *p = &chn.a[i]; - for (j = 0; j < regs.n; ++j) { // check if all the seeds are contained in alnreg found previously - mem_alnreg_t *q = ®s.a[j]; - for (k = 0; k < p->n; ++k) { - mem_seed_t *s = &p->seeds[k]; - if (!(s->qbeg >= q->qb && s->qbeg + s->len <= q->qe && s->rbeg >= q->rb && s->rbeg + s->len <= q->re)) - break; // stop if seed is not contained - } - if (k == p->n) break; // if all seeds are contained, stop - } - if (j == regs.n) { - mem_chain2aln(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, &tmp); - for (j = 0; j < tmp.n; ++j) - kv_push(mem_alnreg_t, regs, tmp.a[j]); - } + mem_chain2aln(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, ®s); free(chn.a[i].seeds); } - free(chn.a); free(tmp.a); + free(chn.a); regs.n = mem_sort_and_dedup(regs.n, regs.a); return regs; }