explain DP
This commit is contained in:
parent
78fe89d1ab
commit
c4564c31a2
87
ksw2.c
87
ksw2.c
|
|
@ -275,3 +275,90 @@ int ksw_global(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t
|
|||
}
|
||||
return score;
|
||||
}
|
||||
#include <stdio.h>
|
||||
int ksw_global2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int8_t q, int8_t e, int w, int *n_cigar_, uint32_t **cigar_)
|
||||
{
|
||||
int qe = q + e, qe2 = qe + qe, r, t, n_col, *off;
|
||||
int8_t *u, *v, *x, *y, *s;
|
||||
uint8_t *p, *qr;
|
||||
|
||||
u = (int8_t*)kcalloc(km, tlen + 1, 1);
|
||||
v = (int8_t*)kcalloc(km, tlen + 1, 1);
|
||||
x = (int8_t*)kcalloc(km, tlen + 1, 1);
|
||||
y = (int8_t*)kcalloc(km, tlen + 1, 1);
|
||||
s = (int8_t*)kmalloc(km, tlen, 1);
|
||||
qr = (uint8_t*)kmalloc(km, qlen, 1);
|
||||
n_col = 2 * w < tlen + 1? 2 * w : tlen + 1;
|
||||
p = (uint8_t*)kmalloc(km, (qlen + tlen) * n_col);
|
||||
|
||||
for (t = 0; t < qlen; ++t)
|
||||
qr[t] = query[qlen - 1 - t];
|
||||
|
||||
for (r = 0; r <= qlen + tlen; ++r) {
|
||||
int st = 0, en = tlen;
|
||||
int8_t x1 = 0, v1 = 0;
|
||||
uint8_t *pr = p + r * n_col;
|
||||
if (st < r - qlen) st = r - qlen;
|
||||
if (en > r) en = r;
|
||||
if (st < (r-w+1)>>1) st = (r-w+1)>>1; // take the ceil
|
||||
if (en > (r+w)>>1) en = (r+w)>>1; // take the floor
|
||||
off[r] = st;
|
||||
for (t = st; t <= en; ++t)
|
||||
s[t] = mat[target[t] * m + qr[t + qlen - 1 - r]];
|
||||
for (t = st; t <= en; ++t) {
|
||||
/* At the beginning of the loop, v1=v(r-1,t-1), x1=x(r-1,t-1), u[t]=u(r-1,t), v[t]=v(r-1,t), x[t]=x(r-1,t), y[t]=y(r-1,t)
|
||||
a = x(r-1,t-1) + v(r-1,t-1)
|
||||
b = y(r-1,t) + u(r-1,t)
|
||||
z = max{ S(t,r-t) + 2q + 2r, a, b }
|
||||
u(r,t) = z - v(r-1,t-1)
|
||||
v(r,t) = z - u(r-1,t)
|
||||
x(r,t) = max{ 0, a - z + q }
|
||||
y(r,t) = max{ 0, b - z + q }
|
||||
*/
|
||||
uint8_t d;
|
||||
int8_t u1;
|
||||
int8_t z = s[t] + qe2;
|
||||
int8_t a = x1 + v1;
|
||||
int8_t b = y[t] + u[t];
|
||||
d = z >= a? 0 : 1;
|
||||
z = z >= a? z : a;
|
||||
d = z >= b? d : 2;
|
||||
z = z >= b? z : b;
|
||||
u1 = u[t]; // u1 = u(r-1,t) (temporary variable)
|
||||
u[t] = z - v1; // u[t] = u(r,t)
|
||||
v1 = v[t]; // v1 = v(r-1,t) (set for the next iteration)
|
||||
v[t] = z - u1; // v[t] = v(r,t)
|
||||
z -= q;
|
||||
a -= z;
|
||||
b -= z;
|
||||
x1 = x[t]; // x1 = x(r-1,t) (set for the next iteration)
|
||||
d |= a > 0? 1<<2 : 0;
|
||||
x[t] = a > 0? a : 0; // x[t] = x(r,t)
|
||||
d |= b > 0? 2<<4 : 0;
|
||||
y[t] = b > 0? b : 0; // y[t] = y(r,t)
|
||||
pr[t - st] = d;
|
||||
}
|
||||
}
|
||||
if (n_cigar_ && cigar_) {
|
||||
int n_cigar = 0, m_cigar = 0, which = 0, i, j;
|
||||
uint32_t *cigar = 0, tmp;
|
||||
i = tlen - 1, j = qlen - 1; // (i,k) points to the last cell;
|
||||
while (i >= 0 && j >= 0) {
|
||||
r = i + j;
|
||||
tmp = p[r * n_col + i - off[r]];
|
||||
which = tmp >> (which << 1) & 3;
|
||||
if (which == 0 && tmp>>6) break;
|
||||
if (which == 0) which = tmp & 3;
|
||||
if (which == 0) cigar = push_cigar(km, &n_cigar, &m_cigar, cigar, 0, 1), --i, --j; // match
|
||||
else if (which == 1) cigar = push_cigar(km, &n_cigar, &m_cigar, cigar, 2, 1), --i; // deletion
|
||||
else cigar = push_cigar(km, &n_cigar, &m_cigar, cigar, 1, 1), --j; // insertion
|
||||
}
|
||||
if (i >= 0) cigar = push_cigar(km, &n_cigar, &m_cigar, cigar, 2, i + 1); // first deletion
|
||||
if (j >= 0) cigar = push_cigar(km, &n_cigar, &m_cigar, cigar, 1, j + 1); // first insertion
|
||||
for (i = 0; i < n_cigar>>1; ++i) // reverse CIGAR
|
||||
tmp = cigar[i], cigar[i] = cigar[n_cigar-1-i], cigar[n_cigar-1-i] = tmp;
|
||||
*n_cigar_ = n_cigar, *cigar_ = cigar;
|
||||
}
|
||||
kfree(km, u); kfree(km, v); kfree(km, x); kfree(km, y); kfree(km, s); kfree(km, qr); kfree(km, p);
|
||||
return 0;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,166 @@
|
|||
\documentclass[10pt]{article}
|
||||
|
||||
\title{Alignment with dynamic programming}
|
||||
\author{Heng Li}
|
||||
|
||||
\begin{document}
|
||||
|
||||
\maketitle
|
||||
|
||||
\section{General notations}
|
||||
|
||||
Suppose we have two sequences: a \emph{target} sequence and a \emph{query}
|
||||
sequence. The length of the target sequence is $\ell_t$ with each residue
|
||||
indexed by $i$. The length of query is $\ell_q$ with each residue indexed by
|
||||
$j$. Gaps on the target sequence are \emph{deletions} and gaps on the query are
|
||||
\emph{insertions}. Function $S(i,j)$ gives the score between two residues on
|
||||
the target and the query, respectively. $q>0$ is the gap open/initiation
|
||||
penalty and $e>0$ the gap extension penalty. A gap of length $k$ costs
|
||||
$q+k\cdot e$.
|
||||
|
||||
\section{Global alignment with affine-gap penalties}
|
||||
|
||||
\subsection{Durbin's formulation}
|
||||
|
||||
The original Durbin's formulation is:
|
||||
\begin{eqnarray*}
|
||||
M_{ij}&=&\max\{M_{i-1,j-1}, E_{i-1,j-1}, F_{i-1,j-1}\} + S(i,j)\\
|
||||
E_{ij}&=&\max\{M_{i-1,j}-q, E_{i-1,j}\} - e\\
|
||||
F_{ij}&=&\max\{M_{i,j-1}-q, F_{i,j-1}\} - e
|
||||
\end{eqnarray*}
|
||||
This formulation disallows a deletion immediately followed an insertion, or
|
||||
vice versa. A more general form is:
|
||||
\begin{eqnarray*}
|
||||
M_{ij}&=&\max\{M_{i-1,j-1}, E_{i-1,j-1}, F_{i-1,j-1}\} + S(i,j)\\
|
||||
E_{ij}&=&\max\{M_{i-1,j}-q, E_{i-1,j}, F_{i-1,j}-q\} - e\\
|
||||
F_{ij}&=&\max\{M_{i,j-1}-q, E_{i,j-1}-q, F_{i,j-1}\} - e
|
||||
\end{eqnarray*}
|
||||
|
||||
\subsection{Green's formulation}
|
||||
|
||||
If we define:
|
||||
\[H_{ij}=\max\{M_{ij},E_{ij},F_{ij}\}\]
|
||||
the Durbin's formulation can be transformed to
|
||||
\begin{eqnarray*}
|
||||
E_{ij} &=& \max\{H_{i-1,j}-q, E_{i-1,j}\} - e \\
|
||||
F_{ij} &=& \max\{H_{i,j-1}-q, F_{i,j-1}\} - e \\
|
||||
H_{ij} &=& \max\{H_{i-1,j-1}+S(i,j), E_{ij}, F_{ij}\}
|
||||
\end{eqnarray*}
|
||||
I first saw this formulation in Phrap developed by Phil Green, though it may
|
||||
have been used earlier. If we further introduce
|
||||
\begin{eqnarray*}
|
||||
E'_{ij}&=&E_{i+1,j}\\
|
||||
F'_{ij}&=&F_{i,j+1}
|
||||
\end{eqnarray*}
|
||||
we have
|
||||
\begin{eqnarray*}
|
||||
H_{ij} &=& \max\{H_{i-1,j-1}+S(i,j),E'_{i-1,j},F'_{i,j-1}\}\\
|
||||
E'_{ij}&=& \max\{H_{ij}-q,E'_{i-1,j}\}-e\\
|
||||
F'_{ij}&=& \max\{H_{ij}-q,F'_{i,j-1}\}-e
|
||||
\end{eqnarray*}
|
||||
In fact, we more often use this set of equations in practical implementations.
|
||||
The initial conditions are
|
||||
\begin{eqnarray*}
|
||||
H_{-1,j}&=&
|
||||
\left\{\begin{array}{ll}
|
||||
0 & (j=-1)\\
|
||||
-q-(j+1)\cdot e & (0\le j<\ell_q)
|
||||
\end{array}\right.\\
|
||||
H_{i,-1}&=&
|
||||
\left\{\begin{array}{ll}
|
||||
0 & (i=-1)\\
|
||||
-q-(i+1)\cdot e & (0\le i<\ell_t)
|
||||
\end{array}\right.\\
|
||||
E'_{-1,j}&=&E_{0,j}=H_{-1,j}-q-e=-2q-(j+2)\cdot e\\
|
||||
F'_{i,-1}&=&F_{i,0}=-2q-(i+2)\cdot e
|
||||
\end{eqnarray*}
|
||||
|
||||
\subsection{Suzuki's formulation}
|
||||
|
||||
\subsubsection{Standard coordinate}
|
||||
Now let
|
||||
\begin{eqnarray*}
|
||||
u'_{ij}&=&H_{ij}-H_{i-1,j}\\
|
||||
v'_{ij}&=&H_{ij}-H_{i,j-1}\\
|
||||
x'_{ij}&=&E'_{ij}-H_{ij}\\
|
||||
y'_{ij}&=&F'_{ij}-H_{ij}
|
||||
\end{eqnarray*}
|
||||
We have
|
||||
\begin{eqnarray}\label{eq:x}
|
||||
x'_{ij}&=&\max\{-q,E'_{i-1,j}-H_{i-1,j}+H_{i-1,j}-H_{ij}\}-e\\\nonumber
|
||||
&=&\max\{-q,x'_{i-1,j}-u'_{ij}\}-e
|
||||
\end{eqnarray}
|
||||
Similarly
|
||||
\begin{equation}
|
||||
y'_{ij}=\max\{-q,y'_{i,j-1}-v'_{ij}\}-e
|
||||
\end{equation}
|
||||
To derive the equation to compute $u'(i,j)$ and $v'(i,j)$, we note that
|
||||
\begin{eqnarray*}
|
||||
H_{ij}-H_{i-1,j-1}
|
||||
&=&\max\{S(i,j),E'_{i-1,j}-H_{i-1,j-1},F'_{i,j-1}-H_{i-1,j-1}\}\\
|
||||
&=&\max\{S(i,j),x'_{i-1,j}+v'_{i-1,j},y'_{i,j-1}+u'_{i,j-1}\}
|
||||
\end{eqnarray*}
|
||||
and
|
||||
\[H_{ij}-H_{i-1,j-1}=u'_{ij}+v'_{i-1,j}=v'_{ij}+u'_{i,j-1}\]
|
||||
We can derive the recursive equation for $u'_{ij}$ and $v'_{ij}$.
|
||||
|
||||
From eq.~(\ref{eq:x}) we can infer that $x'_{ij}\ge-q-e$ and similarly
|
||||
$y'_{ij}\ge-q-e$. We further have:
|
||||
\[
|
||||
u'_{ij}=H_{ij}-H_{i-1,j-1}-v'_{i-1,j}\ge x'_{i-1,j}\ge-q-e
|
||||
\]
|
||||
Therefore, we have a lower bound $-q-e$ for $u'$, $v'$, $x'$ and $y'$.
|
||||
This motivates us to redefine the four variables as:
|
||||
\begin{eqnarray*}
|
||||
u''_{ij}&=&H_{ij}-H_{i-1,j}+q+e\\
|
||||
v''_{ij}&=&H_{ij}-H_{i,j-1}+q+e\\
|
||||
x''_{ij}&=&E'_{ij}-H_{ij}+q+e\\
|
||||
y''_{ij}&=&F'_{ij}-H_{ij}+q+e
|
||||
\end{eqnarray*}
|
||||
The recursion becomes
|
||||
\begin{eqnarray*}
|
||||
z''_{ij}&=&\max\{S(i,j)+2q+2e,x''_{i-1,j}+v''_{i-1,j},y''_{i,j-1}+u''_{i,j-1}\}\\
|
||||
u''_{ij}&=&z''_{ij}-v''_{i-1,j}\\
|
||||
v''_{ij}&=&z''_{ij}-u''_{i,j-1}\\
|
||||
x''_{ij}&=&\max\{0,x''_{i-1,j}-u''_{ij}+q\}=\max\{0,x''_{i-1,j}+v''_{i-1,j}-z''_{ij}+q\}\\
|
||||
y''_{ij}&=&\max\{0,y''_{i,j-1}-v''_{ij}+q\}=\max\{0,y''_{i,j-1}+u''_{i,j-1}-z''_{ij}+q\}
|
||||
\end{eqnarray*}
|
||||
Here $z_{ij}$ is a temporary variable. $u''$, $v''$, $x''$ and $y''$ are all
|
||||
non-negtive.
|
||||
|
||||
\subsubsection{Rotated coordinate}
|
||||
|
||||
We let
|
||||
\begin{eqnarray*}
|
||||
r&=&i+j\\
|
||||
t&=&i
|
||||
\end{eqnarray*}
|
||||
We have
|
||||
\begin{eqnarray*}
|
||||
z_{rt}&=&\max\{S(t,r-t)+2q+2e,x_{r-1,t-1}+v_{r-1,t-1},y_{r-1,t}+u_{r-1,t}\}\\
|
||||
u_{rt}&=&z_{rt}-v_{r-1,t-1}\\
|
||||
v_{rt}&=&z_{rt}-u_{r-1,t}\\
|
||||
x_{rt}&=&\max\{0,x_{r-1,t-1}+v_{r-1,t-1}-z_{rt}+q\}\\
|
||||
y_{rt}&=&\max\{0,y_{r-1,t}+u_{r-1,t}-z_{rt}+q\}
|
||||
\end{eqnarray*}
|
||||
Due to the definition of $r$ and $t$, the following inequation must stand:
|
||||
\[0\le r-t<\ell_q\]
|
||||
\[0\le t<\ell_t\]
|
||||
where $\ell_t$ is the length of the sequence indexed by $i$ and $\ell_q$ the
|
||||
length indexed by $j$. In case of banded alignment with a fixed diagonal band
|
||||
of size $w$,
|
||||
\[-w\le j-i\le w\]
|
||||
In the $(r,t)$ coordinate, it is:
|
||||
\[\frac{r-w}{2}\le t\le \frac{r+w}{2}\]
|
||||
Putting these together:
|
||||
\[0\le r< \ell_q+\ell_t\]
|
||||
\[\max\left\{0,r-\ell_q+1,\frac{r-w}{2}\right\}\le t\le\min\left\{\ell_t-1,r,\frac{r+w}{2}\right\}\]
|
||||
|
||||
\subsubsection{Initial conditions}
|
||||
\[x_{r-1,-1}=x''_{-1,r}=E'_{-1,r}-H_{-1,r}+q+e=0\]
|
||||
\[v_{r-1,-1}=v''_{-1,r}=H_{-1,r}-H_{-1,r-1}+q+e=\left\{\begin{array}{ll}
|
||||
q & (r>0) \\
|
||||
0 & (r=0)
|
||||
\end{array}\right.\]
|
||||
|
||||
\end{document}
|
||||
Loading…
Reference in New Issue