From 7d33acbccce15597947875289e5b97e335e5587e Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 16 Jun 2017 12:17:23 -0400 Subject: [PATCH] diagonal global almost working one shot, though gaps not left-aligned at the boundary --- ksw2.c | 55 ++++++++++++++++++++++++++++++++++++++------------ ksw2.h | 2 ++ tex/aln-dp.tex | 11 +++++++--- 3 files changed, 52 insertions(+), 16 deletions(-) diff --git a/ksw2.c b/ksw2.c index 268b458..a680919 100644 --- a/ksw2.c +++ b/ksw2.c @@ -278,7 +278,7 @@ int ksw_global(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t #include int ksw_global2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int8_t q, int8_t e, int w, int *n_cigar_, uint32_t **cigar_) { - int qe = q + e, qe2 = qe + qe, r, t, n_col, *off; + int qe = q + e, qe2 = qe + qe, r, t, n_col, *off, score; int8_t *u, *v, *x, *y, *s; uint8_t *p, *qr; @@ -286,25 +286,37 @@ int ksw_global2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_ v = (int8_t*)kcalloc(km, tlen + 1, 1); x = (int8_t*)kcalloc(km, tlen + 1, 1); y = (int8_t*)kcalloc(km, tlen + 1, 1); - s = (int8_t*)kmalloc(km, tlen, 1); - qr = (uint8_t*)kmalloc(km, qlen, 1); - n_col = 2 * w < tlen + 1? 2 * w : tlen + 1; + s = (int8_t*)kmalloc(km, tlen); + qr = (uint8_t*)kmalloc(km, qlen); + n_col = w + 1 < tlen? w + 1 : tlen; p = (uint8_t*)kmalloc(km, (qlen + tlen) * n_col); + off = (int*)kmalloc(km, (qlen + tlen) * sizeof(int)); for (t = 0; t < qlen; ++t) qr[t] = query[qlen - 1 - t]; - for (r = 0; r <= qlen + tlen; ++r) { - int st = 0, en = tlen; - int8_t x1 = 0, v1 = 0; + for (r = 0; r < qlen + tlen - 1; ++r) { + int st = 0, en = tlen - 1; + int8_t x1, v1; uint8_t *pr = p + r * n_col; - if (st < r - qlen) st = r - qlen; + // find the boundaries + if (st < r - qlen + 1) st = r - qlen + 1; if (en > r) en = r; if (st < (r-w+1)>>1) st = (r-w+1)>>1; // take the ceil if (en > (r+w)>>1) en = (r+w)>>1; // take the floor off[r] = st; + // set boundary conditions + if (st != 0) { + if (r > st + st + w - 1) x1 = v1 = 0; + else x1 = x[st-1], v1 = v[st-1]; // (r-1, st-1) in the band + } else x1 = 0, v1 = r? q : 0; + if (en != r) { + if (r < en + en - w - 1) y[en] = u[en] = 0; // (r-1,en) out of the band; TODO: is this line necessary? + } else y[r] = 0, u[r] = r? q : 0; + // loop fission: set scores first for (t = st; t <= en; ++t) s[t] = mat[target[t] * m + qr[t + qlen - 1 - r]]; + // core loop for (t = st; t <= en; ++t) { /* At the beginning of the loop, v1=v(r-1,t-1), x1=x(r-1,t-1), u[t]=u(r-1,t), v[t]=v(r-1,t), x[t]=x(r-1,t), y[t]=y(r-1,t) a = x(r-1,t-1) + v(r-1,t-1) @@ -339,10 +351,11 @@ int ksw_global2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_ pr[t - st] = d; } } - if (n_cigar_ && cigar_) { - int n_cigar = 0, m_cigar = 0, which = 0, i, j; + kfree(km, u); kfree(km, v); kfree(km, x); kfree(km, y); kfree(km, s); kfree(km, qr); + { // backtrack + int n_cigar = 0, m_cigar = 0, which = 0, i, j, k, l; uint32_t *cigar = 0, tmp; - i = tlen - 1, j = qlen - 1; // (i,k) points to the last cell; + i = tlen - 1, j = qlen - 1; while (i >= 0 && j >= 0) { r = i + j; tmp = p[r * n_col + i - off[r]]; @@ -358,7 +371,23 @@ int ksw_global2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_ for (i = 0; i < n_cigar>>1; ++i) // reverse CIGAR tmp = cigar[i], cigar[i] = cigar[n_cigar-1-i], cigar[n_cigar-1-i] = tmp; *n_cigar_ = n_cigar, *cigar_ = cigar; + + // compute score + for (k = 0, score = 0, i = j = 0; k < n_cigar; ++k) { + int op = cigar[k] & 0xf, len = cigar[k] >> 4; + if (op == 0) { + for (l = 0; l < len; ++l) + score += mat[target[i + l] * m + query[j + l]]; + i += len, j += len; + } else if (op == 1) { + score -= q + len * e; + j += len; + } else if (op == 2) { + score -= q + len * e; + i += len; + } + } } - kfree(km, u); kfree(km, v); kfree(km, x); kfree(km, y); kfree(km, s); kfree(km, qr); kfree(km, p); - return 0; + kfree(km, p); kfree(km, off); + return score; } diff --git a/ksw2.h b/ksw2.h index fdb2109..cbc2f39 100644 --- a/ksw2.h +++ b/ksw2.h @@ -27,6 +27,8 @@ extern "C" { */ int ksw_global(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar, uint32_t **cigar); +int ksw_global2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int8_t q, int8_t e, int w, int *n_cigar_, uint32_t **cigar_); + void ksw_gen_simple_mat(int m, int8_t *mat, int8_t a, int8_t b); #ifdef __cplusplus diff --git a/tex/aln-dp.tex b/tex/aln-dp.tex index 262488b..8d0e637 100644 --- a/tex/aln-dp.tex +++ b/tex/aln-dp.tex @@ -144,8 +144,8 @@ x_{rt}&=&\max\{0,x_{r-1,t-1}+v_{r-1,t-1}-z_{rt}+q\}\\ y_{rt}&=&\max\{0,y_{r-1,t}+u_{r-1,t}-z_{rt}+q\} \end{eqnarray*} Due to the definition of $r$ and $t$, the following inequation must stand: -\[0\le r-t<\ell_q\] -\[0\le t<\ell_t\] +\[0\le r-t \le\ell_q-1\] +\[0\le t \le\ell_t-1\] where $\ell_t$ is the length of the sequence indexed by $i$ and $\ell_q$ the length indexed by $j$. In case of banded alignment with a fixed diagonal band of size $w$, @@ -153,14 +153,19 @@ of size $w$, In the $(r,t)$ coordinate, it is: \[\frac{r-w}{2}\le t\le \frac{r+w}{2}\] Putting these together: -\[0\le r< \ell_q+\ell_t\] +\[0\le r\le \ell_q+\ell_t-2\] \[\max\left\{0,r-\ell_q+1,\frac{r-w}{2}\right\}\le t\le\min\left\{\ell_t-1,r,\frac{r+w}{2}\right\}\] \subsubsection{Initial conditions} \[x_{r-1,-1}=x''_{-1,r}=E'_{-1,r}-H_{-1,r}+q+e=0\] +\[y_{r-1,r}=y''_{r,-1}=0\] \[v_{r-1,-1}=v''_{-1,r}=H_{-1,r}-H_{-1,r-1}+q+e=\left\{\begin{array}{ll} q & (r>0) \\ 0 & (r=0) \end{array}\right.\] +\[u_{r-1,r}=u''_{r,-1}=H_{r,-1}-H_{r-1,-1}+q+e=\left\{\begin{array}{ll} + q & (r>0) \\ + 0 & (r=0) +\end{array}\right.\] \end{document}