This commit is contained in:
Heng Li 2017-08-01 13:29:02 -04:00
parent 7542b4b40a
commit d933a263d7
2 changed files with 73 additions and 15 deletions

View File

@ -33,7 +33,7 @@
Journal = {arXiv:1303.3997},
Title = {Aligning sequence reads, clone sequences and assembly contigs with {BWA-MEM}},
archivePrefix = "arXiv",
eprint = {0902.0885},
eprint = {1303.3997},
primaryClass = "q-bio",
Year = {2013}}
@ -101,3 +101,34 @@
Title = {Parasail: SIMD C library for global, semi-global, and local pairwise sequence alignments},
Volume = {17},
Year = {2016}}
@article{Sedlazeck169557,
author = {Sedlazeck, Fritz J and Rescheneder, Philipp and Smolka, Moritz and Fang, Han and Nattestad, Maria and others},
title = {Accurate detection of complex structural variations using single molecule sequencing},
note = {doi:10.1101/169557},
journal = {bioRxiv},
year = {2017}}
@article{Altschul:1997vn,
Author = {Altschul, S F and Madden, T L and Sch{\"a}ffer, A A and Zhang, J and Zhang, Z and others},
Journal = {Nucleic Acids Res},
Pages = {3389-402},
Title = {Gapped BLAST and PSI-BLAST: a new generation of protein database search programs},
Volume = {25},
Year = {1997}}
@article{Sosic:2017aa,
Author = {{\v S}o{\v s}i\'{c}, Martin and {\v S}ikic, Mile},
Journal = {Bioinformatics},
Pages = {1394-1395},
Title = {Edlib: a {C/C++} library for fast, exact sequence alignment using edit distance},
Volume = {33},
Year = {2017}}
@article{Abouelhoda:2005aa,
Author = {Mohamed Ibrahim Abouelhoda and Enno Ohlebusch},
Journal = {J. Discrete Algorithms},
Pages = {321-41},
Title = {Chaining algorithms for multiple genome comparison},
Volume = {3},
Year = {2005}}

View File

@ -44,7 +44,7 @@ improved alignment around potential structural variations.
Single Molecule Real-Time (SMRT) sequencing technology and Oxford Nanopore
technologies (ONT) produce reads over 10kbp in length at an error rate
$\sim$15\%. Several aligners have been developed for such
data~\citep{Chaisson:2012aa,Li:2013aa,Liu:2016ab,Sovic:2016aa,Liu:2017aa,Lin:2017aa}.
data~\citep{Chaisson:2012aa,Li:2013aa,Liu:2016ab,Sovic:2016aa,Liu:2017aa,Lin:2017aa,Sedlazeck169557}.
They are usually five times as slow as mainstream short-read
aligners~\citep{Langmead:2012fk,Li:2013aa}. We speculated there could be
substantial room for speedup on the thought that 10kb long sequences should be
@ -84,25 +84,32 @@ distance between two anchors is too large); otherwise
\gamma(j,i)=\gamma'(\max\{y_i-y_j,x_i-x_j\}-\min\{y_i-y_j,x_i-x_j\})
\]
In implementation, a gap of length $l$ costs $\gamma'(l)=\alpha\cdot
l+\beta\log_2(l)$, which is concave. For $m$ anchors, computing all $f(\cdot)$
l+\beta\log_2(l)$. For $m$ anchors, computing all $f(\cdot)$
with Eq.~(\ref{eq:chain}) takes $O(m^2)$ time. We note that if anchor $i$ is
appended to $j$, appending $i$ to a predecessor of $j$ is likely to yield a
lower score. When evaluating Eq.~(\ref{eq:chain}), we start from anchor $i-1$
and stop the evaluation if we cannot find a better $\max$ after up to $h$
and stop the evaluation if we cannot find a better score after up to $h$
iterations. This heuristic reduces the average time to $O(h\cdot m)$. In
practical, we can almost always find the optimal chain with $h=50$; even if the
heuristic fails, the optimal chain often looks dubious.
%Although theoretically faster chaining algorithms exist for simple gap
%cost~\citep{Abouelhoda:2005aa}, they are not as flexible as DP and may not lead
%to better performance than our approach in practice. Furthermore, chaining
%takes much less computing time than alignment. It is not critical to the
%performance of minimap2.
\subsubsection{Backtracking}
Let $P(i)$ be the index of the best predecessor of anchor $i$. It equals 0 if
$f(i)=w_i$ or $\argmax_j\{f(j)+\eta(j,i)-\gamma(j,i)\}$ otherwise. For each
anchor $i$ in the descending order of $f(i)$, we mark $i$ as `used' and apply
$P(\cdot)$ repeatedly to find its predecessor. This process stops at $P(j)=0$
or at a `used' $j$. This way we find all chains with no anchors used in
more than one chains.
anchor $i$ in the descending order of $f(i)$, we apply $P(\cdot)$ repeatedly to
find its predecessor and mark each visited $i$ as `used'. This process stops at
$P(j)=0$ or at a `used' $j$. This way we find all chains with no anchors used
in more than one chains.
\subsection{Alignment}
Minimap2 performs DP to align sequences between adjacent anchors in a chain. It
Minimap2 performs global alignment between adjacent anchors in a chain. It
adopted difference-based formulation to derive
alignment~\citep{Wu:1996aa,Suzuki:2016}. When combined with SSE vectorization,
this formulation has two advantages. First, because each score in the DP matrix
@ -112,17 +119,37 @@ alignment. Second, filling the DP matrix along the diagonal, we can simplify
banded alignment, which is critical to performance. In practice, our
implementation is three times as fast as Parasail's 4-way
vectorization~\citep{Daily:2016aa} for global alignment.
Without banding, our implementation is slower than Edlib~\citep{Sosic:2017aa},
but with a 1000bp band, it is much faster. When performing global alignment
between anchors, we expect the alignment to stay close to the diagonal of the
DP matrix. Banding is applicable most of time.
Minimap2 uses a 2-piece affine gap cost~\citep{Gotoh:1990aa}:
\[
\gamma(l)=\min\{q+l\cdot e,\tilde{q}+l\cdot\tilde{e}\}
\]
Minimap2 uses a 2-piece affine gap cost
$\gamma(l)=\min\{q+l\cdot e,\tilde{q}+l\cdot\tilde{e}\}$.
On the condition that $q+e<\tilde{q}+\tilde{e}$ and $e>\tilde{e}$, this
cost function is concave. It applies cost $q+l\cdot e$ to gaps shorter than
$\lceil(\tilde{q}-q)/(e-\tilde{e})\rceil$ and applies
$\tilde{q}+l\cdot\tilde{e}$ to longer gaps. This scheme helps to recover
longer insertions and deletions.
longer insertions and deletions~\citep{Gotoh:1990aa}.
With global alignment, minimap2 may force to align unrelated sequences between
two adjacent anchors. To avoid such an artifact, we compute accumulative
alignment score along the alignment path and break the alignment where the
score drops too fast in the diagonal direction. More precisely, let $S(i,j)$ be
the alignment score along the alignment path ending at cell $(i,j)$ in the DP
matrix. We break the alignment if there exist $(i',j')$ and $(i,j)$, $i'<i$ and
$j'<j$, such that
\[
S(i',j')-S(i,j)>Z+e\cdot(\max\{i-i',j-j'\}-\min\{i-i',j-j'\})
\]
where $e$ is the gap extension penalty and $Z$ is an aribitrary threshold.
This strategy is similar to X-drop employed in BLAST~\citep{Altschul:1997vn}.
However, unlike X-drop, it would not break the alignment in the presence of a
single long gap. When minimap2 breaks a global alignment between two anchors,
it performs local alignment between the two subsequences involved in the global
alignment, but this time with the one subsequence reverse complemented. This
additional alignment step may identify short inversions that are missed during
chaining.
%\begin{equation}\label{eq:ae86}
%\left\{\begin{array}{l}