minimap2/tex/aln-dp.tex

172 lines
5.4 KiB
TeX

\documentclass[10pt]{article}
\title{Alignment with dynamic programming}
\author{Heng Li}
\begin{document}
\maketitle
\section{General notations}
Suppose we have two sequences: a \emph{target} sequence and a \emph{query}
sequence. The length of the target sequence is $\ell_t$ with each residue
indexed by $i$. The length of query is $\ell_q$ with each residue indexed by
$j$. Gaps on the target sequence are \emph{deletions} and gaps on the query are
\emph{insertions}. Function $S(i,j)$ gives the score between two residues on
the target and the query, respectively. $q>0$ is the gap open/initiation
penalty and $e>0$ the gap extension penalty. A gap of length $k$ costs
$q+k\cdot e$.
\section{Global alignment with affine-gap penalties}
\subsection{Durbin's formulation}
The original Durbin's formulation is:
\begin{eqnarray*}
M_{ij}&=&\max\{M_{i-1,j-1}, E_{i-1,j-1}, F_{i-1,j-1}\} + S(i,j)\\
E_{ij}&=&\max\{M_{i-1,j}-q, E_{i-1,j}\} - e\\
F_{ij}&=&\max\{M_{i,j-1}-q, F_{i,j-1}\} - e
\end{eqnarray*}
This formulation disallows a deletion immediately followed an insertion, or
vice versa. A more general form is:
\begin{eqnarray*}
M_{ij}&=&\max\{M_{i-1,j-1}, E_{i-1,j-1}, F_{i-1,j-1}\} + S(i,j)\\
E_{ij}&=&\max\{M_{i-1,j}-q, E_{i-1,j}, F_{i-1,j}-q\} - e\\
F_{ij}&=&\max\{M_{i,j-1}-q, E_{i,j-1}-q, F_{i,j-1}\} - e
\end{eqnarray*}
\subsection{Green's formulation}
If we define:
\[H_{ij}=\max\{M_{ij},E_{ij},F_{ij}\}\]
the Durbin's formulation can be transformed to
\begin{eqnarray*}
E_{ij} &=& \max\{H_{i-1,j}-q, E_{i-1,j}\} - e \\
F_{ij} &=& \max\{H_{i,j-1}-q, F_{i,j-1}\} - e \\
H_{ij} &=& \max\{H_{i-1,j-1}+S(i,j), E_{ij}, F_{ij}\}
\end{eqnarray*}
I first saw this formulation in Phrap developed by Phil Green, though it may
have been used earlier. If we further introduce
\begin{eqnarray*}
E'_{ij}&=&E_{i+1,j}\\
F'_{ij}&=&F_{i,j+1}
\end{eqnarray*}
we have
\begin{eqnarray*}
H_{ij} &=& \max\{H_{i-1,j-1}+S(i,j),E'_{i-1,j},F'_{i,j-1}\}\\
E'_{ij}&=& \max\{H_{ij}-q,E'_{i-1,j}\}-e\\
F'_{ij}&=& \max\{H_{ij}-q,F'_{i,j-1}\}-e
\end{eqnarray*}
In fact, we more often use this set of equations in practical implementations.
The initial conditions are
\begin{eqnarray*}
H_{-1,j}&=&
\left\{\begin{array}{ll}
0 & (j=-1)\\
-q-(j+1)\cdot e & (0\le j<\ell_q)
\end{array}\right.\\
H_{i,-1}&=&
\left\{\begin{array}{ll}
0 & (i=-1)\\
-q-(i+1)\cdot e & (0\le i<\ell_t)
\end{array}\right.\\
E'_{-1,j}&=&E_{0,j}=H_{-1,j}-q-e=-2q-(j+2)\cdot e\\
F'_{i,-1}&=&F_{i,0}=-2q-(i+2)\cdot e
\end{eqnarray*}
\subsection{Suzuki's formulation}
\subsubsection{Standard coordinate}
Now let
\begin{eqnarray*}
u'_{ij}&=&H_{ij}-H_{i-1,j}\\
v'_{ij}&=&H_{ij}-H_{i,j-1}\\
x'_{ij}&=&E'_{ij}-H_{ij}\\
y'_{ij}&=&F'_{ij}-H_{ij}
\end{eqnarray*}
We have
\begin{eqnarray}\label{eq:x}
x'_{ij}&=&\max\{-q,E'_{i-1,j}-H_{i-1,j}+H_{i-1,j}-H_{ij}\}-e\\\nonumber
&=&\max\{-q,x'_{i-1,j}-u'_{ij}\}-e
\end{eqnarray}
Similarly
\begin{equation}
y'_{ij}=\max\{-q,y'_{i,j-1}-v'_{ij}\}-e
\end{equation}
To derive the equation to compute $u'(i,j)$ and $v'(i,j)$, we note that
\begin{eqnarray*}
H_{ij}-H_{i-1,j-1}
&=&\max\{S(i,j),E'_{i-1,j}-H_{i-1,j-1},F'_{i,j-1}-H_{i-1,j-1}\}\\
&=&\max\{S(i,j),x'_{i-1,j}+v'_{i-1,j},y'_{i,j-1}+u'_{i,j-1}\}
\end{eqnarray*}
and
\[H_{ij}-H_{i-1,j-1}=u'_{ij}+v'_{i-1,j}=v'_{ij}+u'_{i,j-1}\]
We can derive the recursive equation for $u'_{ij}$ and $v'_{ij}$.
From eq.~(\ref{eq:x}) we can infer that $x'_{ij}\ge-q-e$ and similarly
$y'_{ij}\ge-q-e$. We further have:
\[
u'_{ij}=H_{ij}-H_{i-1,j-1}-v'_{i-1,j}\ge x'_{i-1,j}\ge-q-e
\]
Therefore, we have a lower bound $-q-e$ for $u'$, $v'$, $x'$ and $y'$.
This motivates us to redefine the four variables as:
\begin{eqnarray*}
u''_{ij}&=&H_{ij}-H_{i-1,j}+q+e\\
v''_{ij}&=&H_{ij}-H_{i,j-1}+q+e\\
x''_{ij}&=&E'_{ij}-H_{ij}+q+e\\
y''_{ij}&=&F'_{ij}-H_{ij}+q+e
\end{eqnarray*}
The recursion becomes
\begin{eqnarray*}
z''_{ij}&=&\max\{S(i,j)+2q+2e,x''_{i-1,j}+v''_{i-1,j},y''_{i,j-1}+u''_{i,j-1}\}\\
u''_{ij}&=&z''_{ij}-v''_{i-1,j}\\
v''_{ij}&=&z''_{ij}-u''_{i,j-1}\\
x''_{ij}&=&\max\{0,x''_{i-1,j}-u''_{ij}+q\}=\max\{0,x''_{i-1,j}+v''_{i-1,j}-z''_{ij}+q\}\\
y''_{ij}&=&\max\{0,y''_{i,j-1}-v''_{ij}+q\}=\max\{0,y''_{i,j-1}+u''_{i,j-1}-z''_{ij}+q\}
\end{eqnarray*}
Here $z_{ij}$ is a temporary variable. $u''$, $v''$, $x''$ and $y''$ are all
non-negtive.
\subsubsection{Rotated coordinate}
We let
\begin{eqnarray*}
r&=&i+j\\
t&=&i
\end{eqnarray*}
We have
\begin{eqnarray*}
z_{rt}&=&\max\{S(t,r-t)+2q+2e,x_{r-1,t-1}+v_{r-1,t-1},y_{r-1,t}+u_{r-1,t}\}\\
u_{rt}&=&z_{rt}-v_{r-1,t-1}\\
v_{rt}&=&z_{rt}-u_{r-1,t}\\
x_{rt}&=&\max\{0,x_{r-1,t-1}+v_{r-1,t-1}-z_{rt}+q\}\\
y_{rt}&=&\max\{0,y_{r-1,t}+u_{r-1,t}-z_{rt}+q\}
\end{eqnarray*}
Due to the definition of $r$ and $t$, the following inequation must stand:
\[0\le r-t \le\ell_q-1\]
\[0\le t \le\ell_t-1\]
where $\ell_t$ is the length of the sequence indexed by $i$ and $\ell_q$ the
length indexed by $j$. In case of banded alignment with a fixed diagonal band
of size $w$,
\[-w\le j-i\le w\]
In the $(r,t)$ coordinate, it is:
\[\frac{r-w}{2}\le t\le \frac{r+w}{2}\]
Putting these together:
\[0\le r\le \ell_q+\ell_t-2\]
\[\max\left\{0,r-\ell_q+1,\frac{r-w}{2}\right\}\le t\le\min\left\{\ell_t-1,r,\frac{r+w}{2}\right\}\]
\subsubsection{Initial conditions}
\[x_{r-1,-1}=x''_{-1,r}=E'_{-1,r}-H_{-1,r}+q+e=0\]
\[y_{r-1,r}=y''_{r,-1}=0\]
\[v_{r-1,-1}=v''_{-1,r}=H_{-1,r}-H_{-1,r-1}+q+e=\left\{\begin{array}{ll}
q & (r>0) \\
0 & (r=0)
\end{array}\right.\]
\[u_{r-1,r}=u''_{r,-1}=H_{r,-1}-H_{r-1,-1}+q+e=\left\{\begin{array}{ll}
q & (r>0) \\
0 & (r=0)
\end{array}\right.\]
\end{document}