r290: in techrep, explain spliced alignment
This commit is contained in:
parent
a23df2dc91
commit
53b3265d84
4
main.c
4
main.c
|
|
@ -8,7 +8,7 @@
|
||||||
#include "minimap.h"
|
#include "minimap.h"
|
||||||
#include "mmpriv.h"
|
#include "mmpriv.h"
|
||||||
|
|
||||||
#define MM_VERSION "2.0-r289-dirty"
|
#define MM_VERSION "2.0-r290-dirty"
|
||||||
|
|
||||||
void liftrlimit()
|
void liftrlimit()
|
||||||
{
|
{
|
||||||
|
|
@ -197,7 +197,7 @@ int main(int argc, char *argv[])
|
||||||
fprintf(stderr, " asm10: -k19 -w19 -A1 -B9 -O16,41 -E2,1 -s200 -z200 (asm to ref mapping; break at 10%% div.)\n");
|
fprintf(stderr, " asm10: -k19 -w19 -A1 -B9 -O16,41 -E2,1 -s200 -z200 (asm to ref mapping; break at 10%% div.)\n");
|
||||||
fprintf(stderr, " ava-pb: -Hk19 -w5 -Xp0 -m100 -g10000 -K500m --max-chain-skip 25 (PacBio read overlap)\n");
|
fprintf(stderr, " ava-pb: -Hk19 -w5 -Xp0 -m100 -g10000 -K500m --max-chain-skip 25 (PacBio read overlap)\n");
|
||||||
fprintf(stderr, " ava-ont: -k15 -w5 -Xp0 -m100 -g10000 -K500m --max-chain-skip 25 (ONT read overlap)\n");
|
fprintf(stderr, " ava-ont: -k15 -w5 -Xp0 -m100 -g10000 -K500m --max-chain-skip 25 (ONT read overlap)\n");
|
||||||
fprintf(stderr, " splice: -k15 -w5 --splice -g2000 -G100k -A1 -B2 -O2,32 -E1,0 -z200 (long-read spliced aln)\n");
|
fprintf(stderr, " splice: -k15 -w5 --splice -g2000 -G200k -A1 -B2 -O2,32 -E1,0 -z200 (long-read spliced aln)\n");
|
||||||
fprintf(stderr, "\nSee `man ./minimap2.1' for detailed description of command-line options.\n");
|
fprintf(stderr, "\nSee `man ./minimap2.1' for detailed description of command-line options.\n");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
.TH minimap2 1 "8 August 2017" "minimap2-2.0-r275" "Bioinformatics tools"
|
.TH minimap2 1 "12 August 2017" "minimap2-2.0-r290-dirty" "Bioinformatics tools"
|
||||||
.SH NAME
|
.SH NAME
|
||||||
.PP
|
.PP
|
||||||
minimap2 - mapping and alignment between collections of DNA sequences
|
minimap2 - mapping and alignment between collections of DNA sequences
|
||||||
|
|
|
||||||
|
|
@ -72,17 +72,17 @@ sorted by ending reference position $x$, let $f(i)$ be the maximal chaining
|
||||||
score up to the $i$-th anchor in the list. $f(i)$ can be calculated with
|
score up to the $i$-th anchor in the list. $f(i)$ can be calculated with
|
||||||
dynamic programming (DP):
|
dynamic programming (DP):
|
||||||
\begin{equation}\label{eq:chain}
|
\begin{equation}\label{eq:chain}
|
||||||
f(i)=\max\big\{\max_{i>j\ge 1} \{ f(j)+d(j,i)-\gamma(j,i) \},w_i\big\}
|
f(i)=\max\big\{\max_{i>j\ge 1} \{ f(j)+d(j,i)-\beta_c(j,i) \},w_i\big\}
|
||||||
\end{equation}
|
\end{equation}
|
||||||
where $d(j,i)=\min\big\{\min\{y_i-y_j,x_i-x_j\},w_i\big\}$ is the number of
|
where $d(j,i)=\min\big\{\min\{y_i-y_j,x_i-x_j\},w_i\big\}$ is the number of
|
||||||
matching bases between the two anchors. $\gamma(j,i)>0$ is the gap cost. It
|
matching bases between the two anchors. $\beta_c(j,i)>0$ is the gap cost. It
|
||||||
equals $\infty$ if $y_j\ge y_i$ or $\max\{y_i-y_j,x_i-x_j\}>G$ (i.e. the
|
equals $\infty$ if $y_j\ge y_i$ or $\max\{y_i-y_j,x_i-x_j\}>G$ (i.e. the
|
||||||
distance between two anchors is too large); otherwise
|
distance between two anchors is too large); otherwise
|
||||||
\[
|
\begin{equation}\label{eq:chain-gap}
|
||||||
\gamma(j,i)=\gamma'(\max\{y_i-y_j,x_i-x_j\}-\min\{y_i-y_j,x_i-x_j\})
|
\beta_c(j,i)=\gamma_c\big((y_i-y_j)-(x_i-x_j)\big)
|
||||||
\]
|
\end{equation}
|
||||||
In implementation, a gap of length $l$ costs $\gamma'(l)=\alpha\cdot
|
In implementation, a gap of length $l$ costs $\gamma_c(l)=0.01\cdot \bar{w}\cdot
|
||||||
l+\beta\log_2(l)$. For $m$ anchors, directly computing all $f(\cdot)$ with
|
|l|+0.5\log_2|l|$, where $\bar{w}$ is the average seed length. For $m$ anchors, directly computing all $f(\cdot)$ with
|
||||||
Eq.~(\ref{eq:chain}) takes $O(m^2)$ time. Although theoretically faster
|
Eq.~(\ref{eq:chain}) takes $O(m^2)$ time. Although theoretically faster
|
||||||
chaining algorithms exist~\citep{Abouelhoda:2005aa}, they
|
chaining algorithms exist~\citep{Abouelhoda:2005aa}, they
|
||||||
are inapplicable to generic gap cost, complex to implement and usually
|
are inapplicable to generic gap cost, complex to implement and usually
|
||||||
|
|
@ -134,13 +134,15 @@ but with a 1000bp band, it is considerably faster. When performing global
|
||||||
alignment between anchors, we expect the alignment to stay close to the
|
alignment between anchors, we expect the alignment to stay close to the
|
||||||
diagonal of the DP matrix. Banding is often applicable.
|
diagonal of the DP matrix. Banding is often applicable.
|
||||||
|
|
||||||
Minimap2 uses a 2-piece affine gap cost
|
Minimap2 uses a 2-piece affine gap cost~\citep{Gotoh:1990aa}:
|
||||||
$\gamma(l)=\min\{q+l\cdot e,\tilde{q}+l\cdot\tilde{e}\}$.
|
\begin{equation}\label{eq:2-piece}
|
||||||
|
\gamma_a(l)=\min\{q+|l|\cdot e,\tilde{q}+|l|\cdot\tilde{e}\}
|
||||||
|
\end{equation}
|
||||||
On the condition that $q+e<\tilde{q}+\tilde{e}$ and $e>\tilde{e}$, this
|
On the condition that $q+e<\tilde{q}+\tilde{e}$ and $e>\tilde{e}$, this
|
||||||
cost function is concave. It applies cost $q+l\cdot e$ to gaps shorter than
|
cost function is concave. It applies cost $q+|l|\cdot e$ to gaps shorter than
|
||||||
$\lceil(\tilde{q}-q)/(e-\tilde{e})\rceil$ and applies
|
$\lceil(\tilde{q}-q)/(e-\tilde{e})\rceil$ and applies
|
||||||
$\tilde{q}+l\cdot\tilde{e}$ to longer gaps. This scheme helps to recover
|
$\tilde{q}+|l|\cdot\tilde{e}$ to longer gaps. This scheme helps to recover
|
||||||
longer insertions and deletions~(INDEL; \citealp{Gotoh:1990aa}).
|
longer insertions and deletions~(INDEL).
|
||||||
|
|
||||||
With global alignment, minimap2 may force to align unrelated sequences between
|
With global alignment, minimap2 may force to align unrelated sequences between
|
||||||
two adjacent anchors. To avoid such an artifact, we compute accumulative
|
two adjacent anchors. To avoid such an artifact, we compute accumulative
|
||||||
|
|
@ -150,7 +152,7 @@ the alignment score along the alignment path ending at cell $(i,j)$ in the DP
|
||||||
matrix. We break the alignment if there exist $(i',j')$ and $(i,j)$, $i'<i$ and
|
matrix. We break the alignment if there exist $(i',j')$ and $(i,j)$, $i'<i$ and
|
||||||
$j'<j$, such that
|
$j'<j$, such that
|
||||||
\[
|
\[
|
||||||
S(i',j')-S(i,j)>Z+e\cdot(\max\{i-i',j-j'\}-\min\{i-i',j-j'\})
|
S(i',j')-S(i,j)>Z+e\cdot|(i-i')-(j-j')|
|
||||||
\]
|
\]
|
||||||
where $e$ is the gap extension cost and $Z$ is an arbitrary threshold.
|
where $e$ is the gap extension cost and $Z$ is an arbitrary threshold.
|
||||||
This strategy is similar to X-drop employed in BLAST~\citep{Altschul:1997vn}.
|
This strategy is similar to X-drop employed in BLAST~\citep{Altschul:1997vn}.
|
||||||
|
|
@ -162,6 +164,30 @@ alignment between the two subsequences involved in the global alignment, but
|
||||||
this time with the one subsequence reverse complemented. This additional
|
this time with the one subsequence reverse complemented. This additional
|
||||||
alignment step may identify short inversions that are missed during chaining.
|
alignment step may identify short inversions that are missed during chaining.
|
||||||
|
|
||||||
|
\subsection{Spliced alignment}
|
||||||
|
|
||||||
|
The algorithm described above can be adapted to spliced alignment. In this
|
||||||
|
mode, the chaining gap cost distinguishes insertions to and deletions from the
|
||||||
|
reference: $\gamma_c(l)$ in Eq.~(\ref{eq:chain-gap}) takes the form of
|
||||||
|
\[
|
||||||
|
\gamma_c(l)=\left\{\begin{array}{ll}
|
||||||
|
0.01\cdot\bar{w}\cdot l+0.5\log_2 l & (l>0) \\
|
||||||
|
\min\{0.01\cdot\bar{w}\cdot|l|,\log_2|l|\} & (l<0)
|
||||||
|
\end{array}\right.
|
||||||
|
\]
|
||||||
|
Similarly, the gap cost function used for DP-based alignment is changed to
|
||||||
|
\[
|
||||||
|
\gamma_a(l)=\left\{\begin{array}{ll}
|
||||||
|
q+l\cdot e & (l>0) \\
|
||||||
|
\min\{q+|l|\cdot e,\tilde{q}\} & (l<0)
|
||||||
|
\end{array}\right.
|
||||||
|
\]
|
||||||
|
In alignment, a deletion no shorter than $\lceil(\tilde{q}-q)/e\rceil$ is
|
||||||
|
regarded as an intron, which pays no cost to gap extensions. With these
|
||||||
|
modifications, minimap2 can retain multiple reasonably long introns in one
|
||||||
|
alignment, rather than fragment the alignment into local hits, which often
|
||||||
|
leads to the loss of small exons especially given noisy reads.
|
||||||
|
|
||||||
\end{methods}
|
\end{methods}
|
||||||
|
|
||||||
\section{Results}
|
\section{Results}
|
||||||
|
|
@ -232,7 +258,6 @@ issues.
|
||||||
|
|
||||||
\bibliography{minimap2}
|
\bibliography{minimap2}
|
||||||
|
|
||||||
\pagebreak
|
|
||||||
\appendix
|
\appendix
|
||||||
|
|
||||||
\begin{methods}
|
\begin{methods}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue