diff --git a/ksw2.c b/ksw2.c index 5a50ddf..268b458 100644 --- a/ksw2.c +++ b/ksw2.c @@ -275,3 +275,90 @@ int ksw_global(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t } return score; } +#include +int ksw_global2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int8_t q, int8_t e, int w, int *n_cigar_, uint32_t **cigar_) +{ + int qe = q + e, qe2 = qe + qe, r, t, n_col, *off; + int8_t *u, *v, *x, *y, *s; + uint8_t *p, *qr; + + u = (int8_t*)kcalloc(km, tlen + 1, 1); + v = (int8_t*)kcalloc(km, tlen + 1, 1); + x = (int8_t*)kcalloc(km, tlen + 1, 1); + y = (int8_t*)kcalloc(km, tlen + 1, 1); + s = (int8_t*)kmalloc(km, tlen, 1); + qr = (uint8_t*)kmalloc(km, qlen, 1); + n_col = 2 * w < tlen + 1? 2 * w : tlen + 1; + p = (uint8_t*)kmalloc(km, (qlen + tlen) * n_col); + + for (t = 0; t < qlen; ++t) + qr[t] = query[qlen - 1 - t]; + + for (r = 0; r <= qlen + tlen; ++r) { + int st = 0, en = tlen; + int8_t x1 = 0, v1 = 0; + uint8_t *pr = p + r * n_col; + if (st < r - qlen) st = r - qlen; + if (en > r) en = r; + if (st < (r-w+1)>>1) st = (r-w+1)>>1; // take the ceil + if (en > (r+w)>>1) en = (r+w)>>1; // take the floor + off[r] = st; + for (t = st; t <= en; ++t) + s[t] = mat[target[t] * m + qr[t + qlen - 1 - r]]; + for (t = st; t <= en; ++t) { + /* At the beginning of the loop, v1=v(r-1,t-1), x1=x(r-1,t-1), u[t]=u(r-1,t), v[t]=v(r-1,t), x[t]=x(r-1,t), y[t]=y(r-1,t) + a = x(r-1,t-1) + v(r-1,t-1) + b = y(r-1,t) + u(r-1,t) + z = max{ S(t,r-t) + 2q + 2r, a, b } + u(r,t) = z - v(r-1,t-1) + v(r,t) = z - u(r-1,t) + x(r,t) = max{ 0, a - z + q } + y(r,t) = max{ 0, b - z + q } + */ + uint8_t d; + int8_t u1; + int8_t z = s[t] + qe2; + int8_t a = x1 + v1; + int8_t b = y[t] + u[t]; + d = z >= a? 0 : 1; + z = z >= a? z : a; + d = z >= b? d : 2; + z = z >= b? z : b; + u1 = u[t]; // u1 = u(r-1,t) (temporary variable) + u[t] = z - v1; // u[t] = u(r,t) + v1 = v[t]; // v1 = v(r-1,t) (set for the next iteration) + v[t] = z - u1; // v[t] = v(r,t) + z -= q; + a -= z; + b -= z; + x1 = x[t]; // x1 = x(r-1,t) (set for the next iteration) + d |= a > 0? 1<<2 : 0; + x[t] = a > 0? a : 0; // x[t] = x(r,t) + d |= b > 0? 2<<4 : 0; + y[t] = b > 0? b : 0; // y[t] = y(r,t) + pr[t - st] = d; + } + } + if (n_cigar_ && cigar_) { + int n_cigar = 0, m_cigar = 0, which = 0, i, j; + uint32_t *cigar = 0, tmp; + i = tlen - 1, j = qlen - 1; // (i,k) points to the last cell; + while (i >= 0 && j >= 0) { + r = i + j; + tmp = p[r * n_col + i - off[r]]; + which = tmp >> (which << 1) & 3; + if (which == 0 && tmp>>6) break; + if (which == 0) which = tmp & 3; + if (which == 0) cigar = push_cigar(km, &n_cigar, &m_cigar, cigar, 0, 1), --i, --j; // match + else if (which == 1) cigar = push_cigar(km, &n_cigar, &m_cigar, cigar, 2, 1), --i; // deletion + else cigar = push_cigar(km, &n_cigar, &m_cigar, cigar, 1, 1), --j; // insertion + } + if (i >= 0) cigar = push_cigar(km, &n_cigar, &m_cigar, cigar, 2, i + 1); // first deletion + if (j >= 0) cigar = push_cigar(km, &n_cigar, &m_cigar, cigar, 1, j + 1); // first insertion + for (i = 0; i < n_cigar>>1; ++i) // reverse CIGAR + tmp = cigar[i], cigar[i] = cigar[n_cigar-1-i], cigar[n_cigar-1-i] = tmp; + *n_cigar_ = n_cigar, *cigar_ = cigar; + } + kfree(km, u); kfree(km, v); kfree(km, x); kfree(km, y); kfree(km, s); kfree(km, qr); kfree(km, p); + return 0; +} diff --git a/tex/aln-dp.tex b/tex/aln-dp.tex new file mode 100644 index 0000000..262488b --- /dev/null +++ b/tex/aln-dp.tex @@ -0,0 +1,166 @@ +\documentclass[10pt]{article} + +\title{Alignment with dynamic programming} +\author{Heng Li} + +\begin{document} + +\maketitle + +\section{General notations} + +Suppose we have two sequences: a \emph{target} sequence and a \emph{query} +sequence. The length of the target sequence is $\ell_t$ with each residue +indexed by $i$. The length of query is $\ell_q$ with each residue indexed by +$j$. Gaps on the target sequence are \emph{deletions} and gaps on the query are +\emph{insertions}. Function $S(i,j)$ gives the score between two residues on +the target and the query, respectively. $q>0$ is the gap open/initiation +penalty and $e>0$ the gap extension penalty. A gap of length $k$ costs +$q+k\cdot e$. + +\section{Global alignment with affine-gap penalties} + +\subsection{Durbin's formulation} + +The original Durbin's formulation is: +\begin{eqnarray*} +M_{ij}&=&\max\{M_{i-1,j-1}, E_{i-1,j-1}, F_{i-1,j-1}\} + S(i,j)\\ +E_{ij}&=&\max\{M_{i-1,j}-q, E_{i-1,j}\} - e\\ +F_{ij}&=&\max\{M_{i,j-1}-q, F_{i,j-1}\} - e +\end{eqnarray*} +This formulation disallows a deletion immediately followed an insertion, or +vice versa. A more general form is: +\begin{eqnarray*} +M_{ij}&=&\max\{M_{i-1,j-1}, E_{i-1,j-1}, F_{i-1,j-1}\} + S(i,j)\\ +E_{ij}&=&\max\{M_{i-1,j}-q, E_{i-1,j}, F_{i-1,j}-q\} - e\\ +F_{ij}&=&\max\{M_{i,j-1}-q, E_{i,j-1}-q, F_{i,j-1}\} - e +\end{eqnarray*} + +\subsection{Green's formulation} + +If we define: +\[H_{ij}=\max\{M_{ij},E_{ij},F_{ij}\}\] +the Durbin's formulation can be transformed to +\begin{eqnarray*} +E_{ij} &=& \max\{H_{i-1,j}-q, E_{i-1,j}\} - e \\ +F_{ij} &=& \max\{H_{i,j-1}-q, F_{i,j-1}\} - e \\ +H_{ij} &=& \max\{H_{i-1,j-1}+S(i,j), E_{ij}, F_{ij}\} +\end{eqnarray*} +I first saw this formulation in Phrap developed by Phil Green, though it may +have been used earlier. If we further introduce +\begin{eqnarray*} +E'_{ij}&=&E_{i+1,j}\\ +F'_{ij}&=&F_{i,j+1} +\end{eqnarray*} +we have +\begin{eqnarray*} +H_{ij} &=& \max\{H_{i-1,j-1}+S(i,j),E'_{i-1,j},F'_{i,j-1}\}\\ +E'_{ij}&=& \max\{H_{ij}-q,E'_{i-1,j}\}-e\\ +F'_{ij}&=& \max\{H_{ij}-q,F'_{i,j-1}\}-e +\end{eqnarray*} +In fact, we more often use this set of equations in practical implementations. +The initial conditions are +\begin{eqnarray*} +H_{-1,j}&=& + \left\{\begin{array}{ll} + 0 & (j=-1)\\ + -q-(j+1)\cdot e & (0\le j<\ell_q) + \end{array}\right.\\ +H_{i,-1}&=& + \left\{\begin{array}{ll} + 0 & (i=-1)\\ + -q-(i+1)\cdot e & (0\le i<\ell_t) + \end{array}\right.\\ +E'_{-1,j}&=&E_{0,j}=H_{-1,j}-q-e=-2q-(j+2)\cdot e\\ +F'_{i,-1}&=&F_{i,0}=-2q-(i+2)\cdot e +\end{eqnarray*} + +\subsection{Suzuki's formulation} + +\subsubsection{Standard coordinate} +Now let +\begin{eqnarray*} +u'_{ij}&=&H_{ij}-H_{i-1,j}\\ +v'_{ij}&=&H_{ij}-H_{i,j-1}\\ +x'_{ij}&=&E'_{ij}-H_{ij}\\ +y'_{ij}&=&F'_{ij}-H_{ij} +\end{eqnarray*} +We have +\begin{eqnarray}\label{eq:x} +x'_{ij}&=&\max\{-q,E'_{i-1,j}-H_{i-1,j}+H_{i-1,j}-H_{ij}\}-e\\\nonumber +&=&\max\{-q,x'_{i-1,j}-u'_{ij}\}-e +\end{eqnarray} +Similarly +\begin{equation} +y'_{ij}=\max\{-q,y'_{i,j-1}-v'_{ij}\}-e +\end{equation} +To derive the equation to compute $u'(i,j)$ and $v'(i,j)$, we note that +\begin{eqnarray*} +H_{ij}-H_{i-1,j-1} +&=&\max\{S(i,j),E'_{i-1,j}-H_{i-1,j-1},F'_{i,j-1}-H_{i-1,j-1}\}\\ +&=&\max\{S(i,j),x'_{i-1,j}+v'_{i-1,j},y'_{i,j-1}+u'_{i,j-1}\} +\end{eqnarray*} +and +\[H_{ij}-H_{i-1,j-1}=u'_{ij}+v'_{i-1,j}=v'_{ij}+u'_{i,j-1}\] +We can derive the recursive equation for $u'_{ij}$ and $v'_{ij}$. + +From eq.~(\ref{eq:x}) we can infer that $x'_{ij}\ge-q-e$ and similarly +$y'_{ij}\ge-q-e$. We further have: +\[ +u'_{ij}=H_{ij}-H_{i-1,j-1}-v'_{i-1,j}\ge x'_{i-1,j}\ge-q-e +\] +Therefore, we have a lower bound $-q-e$ for $u'$, $v'$, $x'$ and $y'$. +This motivates us to redefine the four variables as: +\begin{eqnarray*} +u''_{ij}&=&H_{ij}-H_{i-1,j}+q+e\\ +v''_{ij}&=&H_{ij}-H_{i,j-1}+q+e\\ +x''_{ij}&=&E'_{ij}-H_{ij}+q+e\\ +y''_{ij}&=&F'_{ij}-H_{ij}+q+e +\end{eqnarray*} +The recursion becomes +\begin{eqnarray*} +z''_{ij}&=&\max\{S(i,j)+2q+2e,x''_{i-1,j}+v''_{i-1,j},y''_{i,j-1}+u''_{i,j-1}\}\\ +u''_{ij}&=&z''_{ij}-v''_{i-1,j}\\ +v''_{ij}&=&z''_{ij}-u''_{i,j-1}\\ +x''_{ij}&=&\max\{0,x''_{i-1,j}-u''_{ij}+q\}=\max\{0,x''_{i-1,j}+v''_{i-1,j}-z''_{ij}+q\}\\ +y''_{ij}&=&\max\{0,y''_{i,j-1}-v''_{ij}+q\}=\max\{0,y''_{i,j-1}+u''_{i,j-1}-z''_{ij}+q\} +\end{eqnarray*} +Here $z_{ij}$ is a temporary variable. $u''$, $v''$, $x''$ and $y''$ are all +non-negtive. + +\subsubsection{Rotated coordinate} + +We let +\begin{eqnarray*} +r&=&i+j\\ +t&=&i +\end{eqnarray*} +We have +\begin{eqnarray*} +z_{rt}&=&\max\{S(t,r-t)+2q+2e,x_{r-1,t-1}+v_{r-1,t-1},y_{r-1,t}+u_{r-1,t}\}\\ +u_{rt}&=&z_{rt}-v_{r-1,t-1}\\ +v_{rt}&=&z_{rt}-u_{r-1,t}\\ +x_{rt}&=&\max\{0,x_{r-1,t-1}+v_{r-1,t-1}-z_{rt}+q\}\\ +y_{rt}&=&\max\{0,y_{r-1,t}+u_{r-1,t}-z_{rt}+q\} +\end{eqnarray*} +Due to the definition of $r$ and $t$, the following inequation must stand: +\[0\le r-t<\ell_q\] +\[0\le t<\ell_t\] +where $\ell_t$ is the length of the sequence indexed by $i$ and $\ell_q$ the +length indexed by $j$. In case of banded alignment with a fixed diagonal band +of size $w$, +\[-w\le j-i\le w\] +In the $(r,t)$ coordinate, it is: +\[\frac{r-w}{2}\le t\le \frac{r+w}{2}\] +Putting these together: +\[0\le r< \ell_q+\ell_t\] +\[\max\left\{0,r-\ell_q+1,\frac{r-w}{2}\right\}\le t\le\min\left\{\ell_t-1,r,\frac{r+w}{2}\right\}\] + +\subsubsection{Initial conditions} +\[x_{r-1,-1}=x''_{-1,r}=E'_{-1,r}-H_{-1,r}+q+e=0\] +\[v_{r-1,-1}=v''_{-1,r}=H_{-1,r}-H_{-1,r-1}+q+e=\left\{\begin{array}{ll} + q & (r>0) \\ + 0 & (r=0) +\end{array}\right.\] + +\end{document}