diff --git a/tex/minimap2.bib b/tex/minimap2.bib index 374a06d..2edb4a9 100644 --- a/tex/minimap2.bib +++ b/tex/minimap2.bib @@ -33,7 +33,7 @@ Journal = {arXiv:1303.3997}, Title = {Aligning sequence reads, clone sequences and assembly contigs with {BWA-MEM}}, archivePrefix = "arXiv", - eprint = {0902.0885}, + eprint = {1303.3997}, primaryClass = "q-bio", Year = {2013}} @@ -101,3 +101,34 @@ Title = {Parasail: SIMD C library for global, semi-global, and local pairwise sequence alignments}, Volume = {17}, Year = {2016}} + +@article{Sedlazeck169557, + author = {Sedlazeck, Fritz J and Rescheneder, Philipp and Smolka, Moritz and Fang, Han and Nattestad, Maria and others}, + title = {Accurate detection of complex structural variations using single molecule sequencing}, + note = {doi:10.1101/169557}, + journal = {bioRxiv}, + year = {2017}} + +@article{Altschul:1997vn, + Author = {Altschul, S F and Madden, T L and Sch{\"a}ffer, A A and Zhang, J and Zhang, Z and others}, + Journal = {Nucleic Acids Res}, + Pages = {3389-402}, + Title = {Gapped BLAST and PSI-BLAST: a new generation of protein database search programs}, + Volume = {25}, + Year = {1997}} + +@article{Sosic:2017aa, + Author = {{\v S}o{\v s}i\'{c}, Martin and {\v S}ikic, Mile}, + Journal = {Bioinformatics}, + Pages = {1394-1395}, + Title = {Edlib: a {C/C++} library for fast, exact sequence alignment using edit distance}, + Volume = {33}, + Year = {2017}} + +@article{Abouelhoda:2005aa, + Author = {Mohamed Ibrahim Abouelhoda and Enno Ohlebusch}, + Journal = {J. Discrete Algorithms}, + Pages = {321-41}, + Title = {Chaining algorithms for multiple genome comparison}, + Volume = {3}, + Year = {2005}} diff --git a/tex/minimap2.tex b/tex/minimap2.tex index 659ebda..08abf75 100644 --- a/tex/minimap2.tex +++ b/tex/minimap2.tex @@ -44,7 +44,7 @@ improved alignment around potential structural variations. Single Molecule Real-Time (SMRT) sequencing technology and Oxford Nanopore technologies (ONT) produce reads over 10kbp in length at an error rate $\sim$15\%. Several aligners have been developed for such -data~\citep{Chaisson:2012aa,Li:2013aa,Liu:2016ab,Sovic:2016aa,Liu:2017aa,Lin:2017aa}. +data~\citep{Chaisson:2012aa,Li:2013aa,Liu:2016ab,Sovic:2016aa,Liu:2017aa,Lin:2017aa,Sedlazeck169557}. They are usually five times as slow as mainstream short-read aligners~\citep{Langmead:2012fk,Li:2013aa}. We speculated there could be substantial room for speedup on the thought that 10kb long sequences should be @@ -84,25 +84,32 @@ distance between two anchors is too large); otherwise \gamma(j,i)=\gamma'(\max\{y_i-y_j,x_i-x_j\}-\min\{y_i-y_j,x_i-x_j\}) \] In implementation, a gap of length $l$ costs $\gamma'(l)=\alpha\cdot -l+\beta\log_2(l)$, which is concave. For $m$ anchors, computing all $f(\cdot)$ +l+\beta\log_2(l)$. For $m$ anchors, computing all $f(\cdot)$ with Eq.~(\ref{eq:chain}) takes $O(m^2)$ time. We note that if anchor $i$ is appended to $j$, appending $i$ to a predecessor of $j$ is likely to yield a lower score. When evaluating Eq.~(\ref{eq:chain}), we start from anchor $i-1$ -and stop the evaluation if we cannot find a better $\max$ after up to $h$ +and stop the evaluation if we cannot find a better score after up to $h$ iterations. This heuristic reduces the average time to $O(h\cdot m)$. In practical, we can almost always find the optimal chain with $h=50$; even if the heuristic fails, the optimal chain often looks dubious. +%Although theoretically faster chaining algorithms exist for simple gap +%cost~\citep{Abouelhoda:2005aa}, they are not as flexible as DP and may not lead +%to better performance than our approach in practice. Furthermore, chaining +%takes much less computing time than alignment. It is not critical to the +%performance of minimap2. + \subsubsection{Backtracking} Let $P(i)$ be the index of the best predecessor of anchor $i$. It equals 0 if $f(i)=w_i$ or $\argmax_j\{f(j)+\eta(j,i)-\gamma(j,i)\}$ otherwise. For each -anchor $i$ in the descending order of $f(i)$, we mark $i$ as `used' and apply -$P(\cdot)$ repeatedly to find its predecessor. This process stops at $P(j)=0$ -or at a `used' $j$. This way we find all chains with no anchors used in -more than one chains. +anchor $i$ in the descending order of $f(i)$, we apply $P(\cdot)$ repeatedly to +find its predecessor and mark each visited $i$ as `used'. This process stops at +$P(j)=0$ or at a `used' $j$. This way we find all chains with no anchors used +in more than one chains. \subsection{Alignment} -Minimap2 performs DP to align sequences between adjacent anchors in a chain. It + +Minimap2 performs global alignment between adjacent anchors in a chain. It adopted difference-based formulation to derive alignment~\citep{Wu:1996aa,Suzuki:2016}. When combined with SSE vectorization, this formulation has two advantages. First, because each score in the DP matrix @@ -112,17 +119,37 @@ alignment. Second, filling the DP matrix along the diagonal, we can simplify banded alignment, which is critical to performance. In practice, our implementation is three times as fast as Parasail's 4-way vectorization~\citep{Daily:2016aa} for global alignment. +Without banding, our implementation is slower than Edlib~\citep{Sosic:2017aa}, +but with a 1000bp band, it is much faster. When performing global alignment +between anchors, we expect the alignment to stay close to the diagonal of the +DP matrix. Banding is applicable most of time. - -Minimap2 uses a 2-piece affine gap cost~\citep{Gotoh:1990aa}: -\[ -\gamma(l)=\min\{q+l\cdot e,\tilde{q}+l\cdot\tilde{e}\} -\] +Minimap2 uses a 2-piece affine gap cost +$\gamma(l)=\min\{q+l\cdot e,\tilde{q}+l\cdot\tilde{e}\}$. On the condition that $q+e<\tilde{q}+\tilde{e}$ and $e>\tilde{e}$, this cost function is concave. It applies cost $q+l\cdot e$ to gaps shorter than $\lceil(\tilde{q}-q)/(e-\tilde{e})\rceil$ and applies $\tilde{q}+l\cdot\tilde{e}$ to longer gaps. This scheme helps to recover -longer insertions and deletions. +longer insertions and deletions~\citep{Gotoh:1990aa}. + +With global alignment, minimap2 may force to align unrelated sequences between +two adjacent anchors. To avoid such an artifact, we compute accumulative +alignment score along the alignment path and break the alignment where the +score drops too fast in the diagonal direction. More precisely, let $S(i,j)$ be +the alignment score along the alignment path ending at cell $(i,j)$ in the DP +matrix. We break the alignment if there exist $(i',j')$ and $(i,j)$, $i'Z+e\cdot(\max\{i-i',j-j'\}-\min\{i-i',j-j'\}) +\] +where $e$ is the gap extension penalty and $Z$ is an aribitrary threshold. +This strategy is similar to X-drop employed in BLAST~\citep{Altschul:1997vn}. +However, unlike X-drop, it would not break the alignment in the presence of a +single long gap. When minimap2 breaks a global alignment between two anchors, +it performs local alignment between the two subsequences involved in the global +alignment, but this time with the one subsequence reverse complemented. This +additional alignment step may identify short inversions that are missed during +chaining. %\begin{equation}\label{eq:ae86} %\left\{\begin{array}{l}