diff --git a/ksw2.c b/ksw2.c index f494ccc..e4b72bf 100644 --- a/ksw2.c +++ b/ksw2.c @@ -302,6 +302,7 @@ int ksw_global2_sse(void *km, int qlen, const uint8_t *query, int tlen, const ui w = (w + 1 + 15) / 16 * 16 - 1; tlen16 = (tlen + 15) / 16 * 16; n_col = w + 1 < tlen16? w + 1 : tlen16; // number of columns in the backtrack matrix + n_col += 16, tlen16 += 16; // leave enough space at the end mem = (uint8_t*)kcalloc(km, tlen16 * 5 + 15, 1); u = (int8_t*)(((size_t)mem + 15) >> 4 << 4); // 16-byte aligned (though not necessary) diff --git a/tex/aln-dp.tex b/tex/aln-dp.tex deleted file mode 100644 index 8d0e637..0000000 --- a/tex/aln-dp.tex +++ /dev/null @@ -1,171 +0,0 @@ -\documentclass[10pt]{article} - -\title{Alignment with dynamic programming} -\author{Heng Li} - -\begin{document} - -\maketitle - -\section{General notations} - -Suppose we have two sequences: a \emph{target} sequence and a \emph{query} -sequence. The length of the target sequence is $\ell_t$ with each residue -indexed by $i$. The length of query is $\ell_q$ with each residue indexed by -$j$. Gaps on the target sequence are \emph{deletions} and gaps on the query are -\emph{insertions}. Function $S(i,j)$ gives the score between two residues on -the target and the query, respectively. $q>0$ is the gap open/initiation -penalty and $e>0$ the gap extension penalty. A gap of length $k$ costs -$q+k\cdot e$. - -\section{Global alignment with affine-gap penalties} - -\subsection{Durbin's formulation} - -The original Durbin's formulation is: -\begin{eqnarray*} -M_{ij}&=&\max\{M_{i-1,j-1}, E_{i-1,j-1}, F_{i-1,j-1}\} + S(i,j)\\ -E_{ij}&=&\max\{M_{i-1,j}-q, E_{i-1,j}\} - e\\ -F_{ij}&=&\max\{M_{i,j-1}-q, F_{i,j-1}\} - e -\end{eqnarray*} -This formulation disallows a deletion immediately followed an insertion, or -vice versa. A more general form is: -\begin{eqnarray*} -M_{ij}&=&\max\{M_{i-1,j-1}, E_{i-1,j-1}, F_{i-1,j-1}\} + S(i,j)\\ -E_{ij}&=&\max\{M_{i-1,j}-q, E_{i-1,j}, F_{i-1,j}-q\} - e\\ -F_{ij}&=&\max\{M_{i,j-1}-q, E_{i,j-1}-q, F_{i,j-1}\} - e -\end{eqnarray*} - -\subsection{Green's formulation} - -If we define: -\[H_{ij}=\max\{M_{ij},E_{ij},F_{ij}\}\] -the Durbin's formulation can be transformed to -\begin{eqnarray*} -E_{ij} &=& \max\{H_{i-1,j}-q, E_{i-1,j}\} - e \\ -F_{ij} &=& \max\{H_{i,j-1}-q, F_{i,j-1}\} - e \\ -H_{ij} &=& \max\{H_{i-1,j-1}+S(i,j), E_{ij}, F_{ij}\} -\end{eqnarray*} -I first saw this formulation in Phrap developed by Phil Green, though it may -have been used earlier. If we further introduce -\begin{eqnarray*} -E'_{ij}&=&E_{i+1,j}\\ -F'_{ij}&=&F_{i,j+1} -\end{eqnarray*} -we have -\begin{eqnarray*} -H_{ij} &=& \max\{H_{i-1,j-1}+S(i,j),E'_{i-1,j},F'_{i,j-1}\}\\ -E'_{ij}&=& \max\{H_{ij}-q,E'_{i-1,j}\}-e\\ -F'_{ij}&=& \max\{H_{ij}-q,F'_{i,j-1}\}-e -\end{eqnarray*} -In fact, we more often use this set of equations in practical implementations. -The initial conditions are -\begin{eqnarray*} -H_{-1,j}&=& - \left\{\begin{array}{ll} - 0 & (j=-1)\\ - -q-(j+1)\cdot e & (0\le j<\ell_q) - \end{array}\right.\\ -H_{i,-1}&=& - \left\{\begin{array}{ll} - 0 & (i=-1)\\ - -q-(i+1)\cdot e & (0\le i<\ell_t) - \end{array}\right.\\ -E'_{-1,j}&=&E_{0,j}=H_{-1,j}-q-e=-2q-(j+2)\cdot e\\ -F'_{i,-1}&=&F_{i,0}=-2q-(i+2)\cdot e -\end{eqnarray*} - -\subsection{Suzuki's formulation} - -\subsubsection{Standard coordinate} -Now let -\begin{eqnarray*} -u'_{ij}&=&H_{ij}-H_{i-1,j}\\ -v'_{ij}&=&H_{ij}-H_{i,j-1}\\ -x'_{ij}&=&E'_{ij}-H_{ij}\\ -y'_{ij}&=&F'_{ij}-H_{ij} -\end{eqnarray*} -We have -\begin{eqnarray}\label{eq:x} -x'_{ij}&=&\max\{-q,E'_{i-1,j}-H_{i-1,j}+H_{i-1,j}-H_{ij}\}-e\\\nonumber -&=&\max\{-q,x'_{i-1,j}-u'_{ij}\}-e -\end{eqnarray} -Similarly -\begin{equation} -y'_{ij}=\max\{-q,y'_{i,j-1}-v'_{ij}\}-e -\end{equation} -To derive the equation to compute $u'(i,j)$ and $v'(i,j)$, we note that -\begin{eqnarray*} -H_{ij}-H_{i-1,j-1} -&=&\max\{S(i,j),E'_{i-1,j}-H_{i-1,j-1},F'_{i,j-1}-H_{i-1,j-1}\}\\ -&=&\max\{S(i,j),x'_{i-1,j}+v'_{i-1,j},y'_{i,j-1}+u'_{i,j-1}\} -\end{eqnarray*} -and -\[H_{ij}-H_{i-1,j-1}=u'_{ij}+v'_{i-1,j}=v'_{ij}+u'_{i,j-1}\] -We can derive the recursive equation for $u'_{ij}$ and $v'_{ij}$. - -From eq.~(\ref{eq:x}) we can infer that $x'_{ij}\ge-q-e$ and similarly -$y'_{ij}\ge-q-e$. We further have: -\[ -u'_{ij}=H_{ij}-H_{i-1,j-1}-v'_{i-1,j}\ge x'_{i-1,j}\ge-q-e -\] -Therefore, we have a lower bound $-q-e$ for $u'$, $v'$, $x'$ and $y'$. -This motivates us to redefine the four variables as: -\begin{eqnarray*} -u''_{ij}&=&H_{ij}-H_{i-1,j}+q+e\\ -v''_{ij}&=&H_{ij}-H_{i,j-1}+q+e\\ -x''_{ij}&=&E'_{ij}-H_{ij}+q+e\\ -y''_{ij}&=&F'_{ij}-H_{ij}+q+e -\end{eqnarray*} -The recursion becomes -\begin{eqnarray*} -z''_{ij}&=&\max\{S(i,j)+2q+2e,x''_{i-1,j}+v''_{i-1,j},y''_{i,j-1}+u''_{i,j-1}\}\\ -u''_{ij}&=&z''_{ij}-v''_{i-1,j}\\ -v''_{ij}&=&z''_{ij}-u''_{i,j-1}\\ -x''_{ij}&=&\max\{0,x''_{i-1,j}-u''_{ij}+q\}=\max\{0,x''_{i-1,j}+v''_{i-1,j}-z''_{ij}+q\}\\ -y''_{ij}&=&\max\{0,y''_{i,j-1}-v''_{ij}+q\}=\max\{0,y''_{i,j-1}+u''_{i,j-1}-z''_{ij}+q\} -\end{eqnarray*} -Here $z_{ij}$ is a temporary variable. $u''$, $v''$, $x''$ and $y''$ are all -non-negtive. - -\subsubsection{Rotated coordinate} - -We let -\begin{eqnarray*} -r&=&i+j\\ -t&=&i -\end{eqnarray*} -We have -\begin{eqnarray*} -z_{rt}&=&\max\{S(t,r-t)+2q+2e,x_{r-1,t-1}+v_{r-1,t-1},y_{r-1,t}+u_{r-1,t}\}\\ -u_{rt}&=&z_{rt}-v_{r-1,t-1}\\ -v_{rt}&=&z_{rt}-u_{r-1,t}\\ -x_{rt}&=&\max\{0,x_{r-1,t-1}+v_{r-1,t-1}-z_{rt}+q\}\\ -y_{rt}&=&\max\{0,y_{r-1,t}+u_{r-1,t}-z_{rt}+q\} -\end{eqnarray*} -Due to the definition of $r$ and $t$, the following inequation must stand: -\[0\le r-t \le\ell_q-1\] -\[0\le t \le\ell_t-1\] -where $\ell_t$ is the length of the sequence indexed by $i$ and $\ell_q$ the -length indexed by $j$. In case of banded alignment with a fixed diagonal band -of size $w$, -\[-w\le j-i\le w\] -In the $(r,t)$ coordinate, it is: -\[\frac{r-w}{2}\le t\le \frac{r+w}{2}\] -Putting these together: -\[0\le r\le \ell_q+\ell_t-2\] -\[\max\left\{0,r-\ell_q+1,\frac{r-w}{2}\right\}\le t\le\min\left\{\ell_t-1,r,\frac{r+w}{2}\right\}\] - -\subsubsection{Initial conditions} -\[x_{r-1,-1}=x''_{-1,r}=E'_{-1,r}-H_{-1,r}+q+e=0\] -\[y_{r-1,r}=y''_{r,-1}=0\] -\[v_{r-1,-1}=v''_{-1,r}=H_{-1,r}-H_{-1,r-1}+q+e=\left\{\begin{array}{ll} - q & (r>0) \\ - 0 & (r=0) -\end{array}\right.\] -\[u_{r-1,r}=u''_{r,-1}=H_{r,-1}-H_{r-1,-1}+q+e=\left\{\begin{array}{ll} - q & (r>0) \\ - 0 & (r=0) -\end{array}\right.\] - -\end{document}