explain DP

2017-06-15 19:37:48 -04:00 · 2017-06-15 19:37:48 -04:00 · c4564c31a2
parent 78fe89d1ab
commit c4564c31a2
2 changed files with 253 additions and 0 deletions
--- a/ksw2.c
+++ b/ksw2.c
@ -275,3 +275,90 @@ int ksw_global(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t
 	}
 	return score;
 }
+#include <stdio.h>
+int ksw_global2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int8_t q, int8_t e, int w, int *n_cigar_, uint32_t **cigar_)
+{
+	int qe = q + e, qe2 = qe + qe, r, t, n_col, *off;
+	int8_t *u, *v, *x, *y, *s;
+	uint8_t *p, *qr;
+
+	u = (int8_t*)kcalloc(km, tlen + 1, 1);
+	v = (int8_t*)kcalloc(km, tlen + 1, 1);
+	x = (int8_t*)kcalloc(km, tlen + 1, 1);
+	y = (int8_t*)kcalloc(km, tlen + 1, 1);
+	s = (int8_t*)kmalloc(km, tlen, 1);
+	qr = (uint8_t*)kmalloc(km, qlen, 1);
+	n_col = 2 * w < tlen + 1? 2 * w : tlen + 1;
+	p = (uint8_t*)kmalloc(km, (qlen + tlen) * n_col);
+
+	for (t = 0; t < qlen; ++t)
+		qr[t] = query[qlen - 1 - t];
+
+	for (r = 0; r <= qlen + tlen; ++r) {
+		int st = 0, en = tlen;
+		int8_t x1 = 0, v1 = 0;
+		uint8_t *pr = p + r * n_col;
+		if (st < r - qlen) st = r - qlen;
+		if (en > r) en = r;
+		if (st < (r-w+1)>>1) st = (r-w+1)>>1; // take the ceil
+		if (en > (r+w)>>1) en = (r+w)>>1; // take the floor
+		off[r] = st;
+		for (t = st; t <= en; ++t)
+			s[t] = mat[target[t] * m + qr[t + qlen - 1 - r]];
+		for (t = st; t <= en; ++t) {
+			/* At the beginning of the loop, v1=v(r-1,t-1), x1=x(r-1,t-1), u[t]=u(r-1,t), v[t]=v(r-1,t), x[t]=x(r-1,t), y[t]=y(r-1,t)
+			   a      = x(r-1,t-1) + v(r-1,t-1)
+			   b      = y(r-1,t)   + u(r-1,t)
+			   z      = max{ S(t,r-t) + 2q + 2r, a, b }
+			   u(r,t) = z - v(r-1,t-1)
+			   v(r,t) = z - u(r-1,t)
+			   x(r,t) = max{ 0, a - z + q }
+			   y(r,t) = max{ 0, b - z + q }
+			*/
+			uint8_t d;
+			int8_t u1;
+			int8_t z = s[t] + qe2;
+			int8_t a = x1   + v1;
+			int8_t b = y[t] + u[t];
+			d = z >= a? 0 : 1;
+			z = z >= a? z : a;
+			d = z >= b? d : 2;
+			z = z >= b? z : b;
+			u1 = u[t];              // u1   = u(r-1,t) (temporary variable)
+			u[t] = z - v1;          // u[t] = u(r,t)
+			v1 = v[t];              // v1   = v(r-1,t) (set for the next iteration)
+			v[t] = z - u1;          // v[t] = v(r,t)
+			z -= q;
+			a -= z;
+			b -= z;
+			x1 = x[t];              // x1   = x(r-1,t) (set for the next iteration)
+			d   |= a > 0? 1<<2 : 0;
+			x[t] = a > 0? a    : 0; // x[t] = x(r,t)
+			d   |= b > 0? 2<<4 : 0;
+			y[t] = b > 0? b    : 0; // y[t] = y(r,t)
+			pr[t - st] = d;
+		}
+	}
+	if (n_cigar_ && cigar_) {
+		int n_cigar = 0, m_cigar = 0, which = 0, i, j;
+		uint32_t *cigar = 0, tmp;
+		i = tlen - 1, j = qlen - 1; // (i,k) points to the last cell;
+		while (i >= 0 && j >= 0) {
+			r = i + j;
+			tmp = p[r * n_col + i - off[r]];
+			which = tmp >> (which << 1) & 3;
+			if (which == 0 && tmp>>6) break;
+			if (which == 0) which = tmp & 3;
+			if (which == 0)      cigar = push_cigar(km, &n_cigar, &m_cigar, cigar, 0, 1), --i, --j; // match
+			else if (which == 1) cigar = push_cigar(km, &n_cigar, &m_cigar, cigar, 2, 1), --i;      // deletion
+			else                 cigar = push_cigar(km, &n_cigar, &m_cigar, cigar, 1, 1), --j;      // insertion
+		}
+		if (i >= 0) cigar = push_cigar(km, &n_cigar, &m_cigar, cigar, 2, i + 1); // first deletion
+		if (j >= 0) cigar = push_cigar(km, &n_cigar, &m_cigar, cigar, 1, j + 1); // first insertion
+		for (i = 0; i < n_cigar>>1; ++i) // reverse CIGAR
+			tmp = cigar[i], cigar[i] = cigar[n_cigar-1-i], cigar[n_cigar-1-i] = tmp;
+		*n_cigar_ = n_cigar, *cigar_ = cigar;
+	}
+	kfree(km, u); kfree(km, v); kfree(km, x); kfree(km, y); kfree(km, s); kfree(km, qr); kfree(km, p);
+	return 0;
+}
--- a/tex/aln-dp.tex
+++ b/tex/aln-dp.tex
@ -0,0 +1,166 @@
+\documentclass[10pt]{article}
+
+\title{Alignment with dynamic programming}
+\author{Heng Li}
+
+\begin{document}
+
+\maketitle
+
+\section{General notations}
+
+Suppose we have two sequences: a \emph{target} sequence and a \emph{query}
+sequence. The length of the target sequence is $\ell_t$ with each residue
+indexed by $i$.  The length of query is $\ell_q$ with each residue indexed by
+$j$. Gaps on the target sequence are \emph{deletions} and gaps on the query are
+\emph{insertions}. Function $S(i,j)$ gives the score between two residues on
+the target and the query, respectively. $q>0$ is the gap open/initiation
+penalty and $e>0$ the gap extension penalty. A gap of length $k$ costs
+$q+k\cdot e$.
+
+\section{Global alignment with affine-gap penalties}
+
+\subsection{Durbin's formulation}
+
+The original Durbin's formulation is:
+\begin{eqnarray*}
+M_{ij}&=&\max\{M_{i-1,j-1}, E_{i-1,j-1}, F_{i-1,j-1}\} + S(i,j)\\
+E_{ij}&=&\max\{M_{i-1,j}-q, E_{i-1,j}\} - e\\
+F_{ij}&=&\max\{M_{i,j-1}-q, F_{i,j-1}\} - e
+\end{eqnarray*}
+This formulation disallows a deletion immediately followed an insertion, or
+vice versa. A more general form is:
+\begin{eqnarray*}
+M_{ij}&=&\max\{M_{i-1,j-1}, E_{i-1,j-1}, F_{i-1,j-1}\} + S(i,j)\\
+E_{ij}&=&\max\{M_{i-1,j}-q, E_{i-1,j}, F_{i-1,j}-q\} - e\\
+F_{ij}&=&\max\{M_{i,j-1}-q, E_{i,j-1}-q, F_{i,j-1}\} - e
+\end{eqnarray*}
+
+\subsection{Green's formulation}
+
+If we define:
+\[H_{ij}=\max\{M_{ij},E_{ij},F_{ij}\}\]
+the Durbin's formulation can be transformed to
+\begin{eqnarray*}
+E_{ij} &=& \max\{H_{i-1,j}-q, E_{i-1,j}\} - e \\
+F_{ij} &=& \max\{H_{i,j-1}-q, F_{i,j-1}\} - e \\
+H_{ij} &=& \max\{H_{i-1,j-1}+S(i,j), E_{ij}, F_{ij}\}
+\end{eqnarray*}
+I first saw this formulation in Phrap developed by Phil Green, though it may
+have been used earlier. If we further introduce
+\begin{eqnarray*}
+E'_{ij}&=&E_{i+1,j}\\
+F'_{ij}&=&F_{i,j+1}
+\end{eqnarray*}
+we have
+\begin{eqnarray*}
+H_{ij} &=& \max\{H_{i-1,j-1}+S(i,j),E'_{i-1,j},F'_{i,j-1}\}\\
+E'_{ij}&=& \max\{H_{ij}-q,E'_{i-1,j}\}-e\\
+F'_{ij}&=& \max\{H_{ij}-q,F'_{i,j-1}\}-e
+\end{eqnarray*}
+In fact, we more often use this set of equations in practical implementations.
+The initial conditions are
+\begin{eqnarray*}
+H_{-1,j}&=&
+  \left\{\begin{array}{ll}
+    0 & (j=-1)\\
+	-q-(j+1)\cdot e & (0\le j<\ell_q)
+  \end{array}\right.\\
+H_{i,-1}&=&
+  \left\{\begin{array}{ll}
+    0 & (i=-1)\\
+	-q-(i+1)\cdot e & (0\le i<\ell_t)
+  \end{array}\right.\\
+E'_{-1,j}&=&E_{0,j}=H_{-1,j}-q-e=-2q-(j+2)\cdot e\\
+F'_{i,-1}&=&F_{i,0}=-2q-(i+2)\cdot e
+\end{eqnarray*}
+
+\subsection{Suzuki's formulation}
+
+\subsubsection{Standard coordinate}
+Now let
+\begin{eqnarray*}
+u'_{ij}&=&H_{ij}-H_{i-1,j}\\
+v'_{ij}&=&H_{ij}-H_{i,j-1}\\
+x'_{ij}&=&E'_{ij}-H_{ij}\\
+y'_{ij}&=&F'_{ij}-H_{ij}
+\end{eqnarray*}
+We have
+\begin{eqnarray}\label{eq:x}
+x'_{ij}&=&\max\{-q,E'_{i-1,j}-H_{i-1,j}+H_{i-1,j}-H_{ij}\}-e\\\nonumber
+&=&\max\{-q,x'_{i-1,j}-u'_{ij}\}-e
+\end{eqnarray}
+Similarly
+\begin{equation}
+y'_{ij}=\max\{-q,y'_{i,j-1}-v'_{ij}\}-e
+\end{equation}
+To derive the equation to compute $u'(i,j)$ and $v'(i,j)$, we note that
+\begin{eqnarray*}
+H_{ij}-H_{i-1,j-1}
+&=&\max\{S(i,j),E'_{i-1,j}-H_{i-1,j-1},F'_{i,j-1}-H_{i-1,j-1}\}\\
+&=&\max\{S(i,j),x'_{i-1,j}+v'_{i-1,j},y'_{i,j-1}+u'_{i,j-1}\}
+\end{eqnarray*}
+and
+\[H_{ij}-H_{i-1,j-1}=u'_{ij}+v'_{i-1,j}=v'_{ij}+u'_{i,j-1}\]
+We can derive the recursive equation for $u'_{ij}$ and $v'_{ij}$.
+
+From eq.~(\ref{eq:x}) we can infer that $x'_{ij}\ge-q-e$ and similarly
+$y'_{ij}\ge-q-e$. We further have:
+\[
+u'_{ij}=H_{ij}-H_{i-1,j-1}-v'_{i-1,j}\ge x'_{i-1,j}\ge-q-e
+\]
+Therefore, we have a lower bound $-q-e$ for $u'$, $v'$, $x'$ and $y'$.
+This motivates us to redefine the four variables as:
+\begin{eqnarray*}
+u''_{ij}&=&H_{ij}-H_{i-1,j}+q+e\\
+v''_{ij}&=&H_{ij}-H_{i,j-1}+q+e\\
+x''_{ij}&=&E'_{ij}-H_{ij}+q+e\\
+y''_{ij}&=&F'_{ij}-H_{ij}+q+e
+\end{eqnarray*}
+The recursion becomes
+\begin{eqnarray*}
+z''_{ij}&=&\max\{S(i,j)+2q+2e,x''_{i-1,j}+v''_{i-1,j},y''_{i,j-1}+u''_{i,j-1}\}\\
+u''_{ij}&=&z''_{ij}-v''_{i-1,j}\\
+v''_{ij}&=&z''_{ij}-u''_{i,j-1}\\
+x''_{ij}&=&\max\{0,x''_{i-1,j}-u''_{ij}+q\}=\max\{0,x''_{i-1,j}+v''_{i-1,j}-z''_{ij}+q\}\\
+y''_{ij}&=&\max\{0,y''_{i,j-1}-v''_{ij}+q\}=\max\{0,y''_{i,j-1}+u''_{i,j-1}-z''_{ij}+q\}
+\end{eqnarray*}
+Here $z_{ij}$ is a temporary variable. $u''$, $v''$, $x''$ and $y''$ are all
+non-negtive.
+
+\subsubsection{Rotated coordinate}
+
+We let
+\begin{eqnarray*}
+r&=&i+j\\
+t&=&i
+\end{eqnarray*}
+We have
+\begin{eqnarray*}
+z_{rt}&=&\max\{S(t,r-t)+2q+2e,x_{r-1,t-1}+v_{r-1,t-1},y_{r-1,t}+u_{r-1,t}\}\\
+u_{rt}&=&z_{rt}-v_{r-1,t-1}\\
+v_{rt}&=&z_{rt}-u_{r-1,t}\\
+x_{rt}&=&\max\{0,x_{r-1,t-1}+v_{r-1,t-1}-z_{rt}+q\}\\
+y_{rt}&=&\max\{0,y_{r-1,t}+u_{r-1,t}-z_{rt}+q\}
+\end{eqnarray*}
+Due to the definition of $r$ and $t$, the following inequation must stand:
+\[0\le r-t<\ell_q\]
+\[0\le t<\ell_t\]
+where $\ell_t$ is the length of the sequence indexed by $i$ and $\ell_q$ the
+length indexed by $j$. In case of banded alignment with a fixed diagonal band
+of size $w$,
+\[-w\le j-i\le w\]
+In the $(r,t)$ coordinate, it is:
+\[\frac{r-w}{2}\le t\le \frac{r+w}{2}\]
+Putting these together:
+\[0\le r< \ell_q+\ell_t\]
+\[\max\left\{0,r-\ell_q+1,\frac{r-w}{2}\right\}\le t\le\min\left\{\ell_t-1,r,\frac{r+w}{2}\right\}\]
+
+\subsubsection{Initial conditions}
+\[x_{r-1,-1}=x''_{-1,r}=E'_{-1,r}-H_{-1,r}+q+e=0\]
+\[v_{r-1,-1}=v''_{-1,r}=H_{-1,r}-H_{-1,r-1}+q+e=\left\{\begin{array}{ll}
+  q & (r>0) \\
+  0 & (r=0)
+\end{array}\right.\]
+
+\end{document}