From 240f6caaff8848320d00fb55103c1a26ea2ac3cc Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 24 Aug 2017 22:05:14 +0800 Subject: [PATCH] backup manuscript --- misc/mapstat.js | 2 +- tex/minimap2.bib | 8 ++ tex/minimap2.tex | 199 ++++++++++++++++++++++++++++------------------- 3 files changed, 129 insertions(+), 80 deletions(-) diff --git a/misc/mapstat.js b/misc/mapstat.js index 2f80bb4..98e42c3 100644 --- a/misc/mapstat.js +++ b/misc/mapstat.js @@ -94,7 +94,7 @@ while (file.readline(buf) >= 0) { ori_qlen = parseInt(t[1]); } else { // SAM var flag = parseInt(t[1]); - if (flag & 4) continue; + if ((flag & 4) || t[2] == '*' || t[5] == '*') continue; if (flag & 0x100) { ++n_2nd; continue; diff --git a/tex/minimap2.bib b/tex/minimap2.bib index eb49c9c..f6f03ec 100644 --- a/tex/minimap2.bib +++ b/tex/minimap2.bib @@ -251,3 +251,11 @@ Title = {A cross-species alignment tool {(CAT)}}, Volume = {8}, Year = {2007}} + +@article{Farrar:2007hs, + Author = {Farrar, Michael}, + Journal = {Bioinformatics}, + Pages = {156-61}, + Title = {{Striped Smith-Waterman speeds database searches six times over other SIMD implementations}}, + Volume = {23}, + Year = {2007}} diff --git a/tex/minimap2.tex b/tex/minimap2.tex index 7481206..9036a27 100644 --- a/tex/minimap2.tex +++ b/tex/minimap2.tex @@ -49,16 +49,24 @@ Single Molecule Real-Time (SMRT) sequencing technology and Oxford Nanopore technologies (ONT) produce reads over 10kbp in length at an error rate $\sim$15\%. Several aligners have been developed for such data~\citep{Chaisson:2012aa,Li:2013aa,Liu:2016ab,Sovic:2016aa,Liu:2017aa,Lin:2017aa,Sedlazeck169557}. -They are usually five times as slow as mainstream short-read -aligners~\citep{Langmead:2012fk,Li:2013aa}. We speculated there could be -substantial room for speedup on the thought that 10kb long sequences should be -easier to map than 100bp reads because we can more effectively skip repetitive -regions, which are often the bottleneck of short-read alignment. We confirmed -our speculation by achieving approximate mapping 50 times faster than -BWA-MEM~\citep{Li:2016aa}. \citet{Suzuki:2016} extended our work with a fast -and novel algorithm on generating base-level alignment, which in turn inspired -us to develop minimap2 towards higher accuracy and more practical -functionality. +Most of them were five times as slow as mainstream short-read +aligners~\citep{Langmead:2012fk,Li:2013aa} in terms of the number of bases +mapped per second. We speculated there could be substantial room for speedup on +the thought that 10kb long sequences should be easier to map than 100bp reads +because we can more effectively skip repetitive regions, which are often the +bottleneck of short-read alignment. We confirmed our speculation by achieving +approximate mapping 50 times faster than BWA-MEM~\citep{Li:2016aa}. +\citet{Suzuki:2016} extended our work with a fast and novel algorithm on +generating base-level alignment, which in turn inspired us to develop minimap2 +towards higher accuracy and more practical functionality. + +Both SMRT and ONT have been applied to sequence spliced mRNAs (RNA-seq). While +traditional mRNA aligners work~\citep{Wu:2005vn,Iwata:2012aa}, they are not +optimized for long noisy sequence reads and are tens of times slower than +dedicated long-read aligners. When developing minimap2 initially for aligning +genomic DNA only, we realized minor modifications could make it competitive for +aligning mRNAs as well. Minimap2 is a first RNA-seq aligner specifically +designed for long noisy reads. \begin{methods} \section{Methods} @@ -83,7 +91,8 @@ spliced alignment. An \emph{anchor} is a 3-tuple $(x,y,w)$, indicating interval $[x-w+1,x]$ on the reference matching interval $[y-w+1,y]$ on the query. Given a list of anchors sorted by ending reference position $x$, let $f(i)$ be the maximal chaining -score up to the $i$-th anchor in the list. $f(i)$ can be calculated with DP: +score up to the $i$-th anchor in the list. $f(i)$ can be calculated with +dynamic programming: \begin{equation}\label{eq:chain} f(i)=\max\big\{\max_{i>j\ge 1} \{ f(j)+\alpha(j,i)-\beta(j,i) \},w_i\big\} \end{equation} @@ -161,11 +170,15 @@ gap cost~\citep{Gotoh:1982aa,Altschul:1986aa}. \subsubsection{Suzuki's formulation} -To efficiently align long sequences, minimap2 did not directly implement -Eq.~(\ref{eq:ae86}). It instead adopted a difference-based -formulation first proposed by \citet{Wu:1996aa} and later adapted by -\citet{Suzuki:2016} for affine gap cost. In case of 2-piece affine gap cost in -Eq.~(\ref{eq:2-piece}), define +When we allow gaps longer than several hundred base pairs, nucleotide-level +alignment is much slower than chaining. SSE acceleration is critical to the +performance of minimap2. Traditional SSE implementations~\citep{Farrar:2007hs} +based on Eq.~(\ref{eq:ae86}) can achieve 16-way parallelization for short +sequences, but only 4-way parallelization when the peak alignment score reaches +32767. Long sequence alignment may exceed this threshold. Inspired by +\citet{Wu:1996aa} and the following work, \citet{Suzuki:2016} proposed a +difference-based formulation that lifted this limitation. In case of 2-piece +gap cost, define \[ \left\{\begin{array}{ll} u_{ij}\triangleq H_{ij}-H_{i-1,j} & v_{ij}\triangleq H_{ij}-H_{i,j-1} \\ @@ -186,9 +199,9 @@ y_{ij}&=&\max\{0,y_{i,j-1}+u_{i,j-1}-z_{ij}+q\}-q-e\\ \tilde{y}_{ij}&=&\max\{0,\tilde{y}_{i,j-1}+u_{i,j-1}-z_{ij}+\tilde{q}\}-\tilde{q}-\tilde{e} \end{array}\right. \end{equation} -where $z_{ij}$ is a temporary variable that does not need to be stored. -This is Suzuki's formulation for 2-piece affine gap cost. An important property -of this formulation is that all values are bounded. To see that, +where $z_{ij}$ is a temporary variable that does not need to be stored. An +important property of Eq.~(\ref{eq:suzuki}) is that all values are bounded. To +see that, \[ x_{ij}=E_{i+1,j}-H_{ij}=\max\{-q,E_{ij}-H_{ij}\}-e \] @@ -229,10 +242,11 @@ y_{rt}&=&\max\{0,y_{r-1,t}+u_{r-1,t}-z_{rt}+q\}-q-e\\ \end{equation*} In this formulation, cells with the same diagonal index $r$ are independent of each other. This allows us to fully vectorize the computation of all cells on -the same anti-diagonal in one inner loop. +the same anti-diagonal in one inner loop. It also simplifies banded alignment, +which would be difficult with striped vectorization~\citep{Farrar:2007hs}. On the condition that $q+e<\tilde{q}+\tilde{e}$ and $e>\tilde{e}$, the boundary -condition of this equation in the diagonal-antidiagonal coordinate is +condition of the equation above is \[ \left\{\begin{array}{l} x_{r-1,-1}=y_{r-1,r}=-q-e\\ @@ -249,6 +263,7 @@ r\cdot(e-\tilde{e})-(\tilde{q}-q)-\tilde{e} & (r=\lceil\frac{\tilde{q}-q}{e-\til -\tilde{e} & (r>\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil) \end{array}\right. \] +These can be derived from the initial conditions of Eq.~(\ref{eq:ae86}). In practice, our 16-way vectorized implementation of global alignment is three times as fast as Parasail's 4-way vectorization~\citep{Daily:2016aa}. Without @@ -298,8 +313,10 @@ q+l\cdot e & (l>0) \\ \end{array}\right. \] In alignment, a deletion no shorter than $\lceil(\tilde{q}-q)/e\rceil$ is -regarded as an intron, which pays no cost to gap extensions. Minimap2 further -introduces reference-dependent cost to penalize non-canonical splicing: +regarded as an intron, which pays no cost to gap extensions. + +To pinpoint precise splicing junctions, minimap2 introduces reference-dependent +cost to penalize non-canonical splicing: \begin{equation}\label{eq:splice} \left\{\begin{array}{l} H_{ij} = \max\{H_{i-1,j-1}+s(i,j),E_{ij},F_{ij},\tilde{E}_{ij}-a(i)\}\\ @@ -312,10 +329,9 @@ Let $T$ be the reference sequence. $d(i)$ is the cost of a non-canonical donor site, which takes 0 if $T[i+1,i+2]={\tt GT}$, or a postive number $p$ otherwise. Similarly, $a(i)$ is the cost of a non-canonical acceptor site, which takes 0 if $T[i-1,i]={\tt AG}$, or $p$ otherwise. Eq.~(\ref{eq:splice}) is -almost the same as the equation used by EXALIN~\citep{Zhang:2006aa} and -CAT~\citep{Li:2007aa} except that we allow insertions immediately followed by -deletions and vice versa; in addition, we use Suzuki's diagonal formulation in -actual implementation. +almost equivalent to the equation used by EXALIN~\citep{Zhang:2006aa} except +that we allow insertions immediately followed by deletions and vice versa; in +addition, we use Suzuki's diagonal formulation in actual implementation. %Given that $d_i$ and $a_i$ %are a function of the reference sequence, it is possible to incorporate @@ -345,60 +361,66 @@ alignment. \centering \includegraphics[width=.5\textwidth]{roc-color.pdf} \caption{Evaluation on simulated SMRT reads aligned against human genome -GRCh38. (a) ROC-like curve. (b) Accumulative mapping error rate as a function -of mapping quality. 33,088 $\ge$1000bp reads were simulated using -pbsim~\citep{Ono:2013aa} with error profile sampled from file -`m131017\_060208\_42213\_*.1.*' downloaded at -\href{http://bit.ly/chm1p5c3}{http://bit.ly/chm1p5c3}. The N50 read length is -11,628. A read is considered correctly mapped if the true position overlaps +GRCh38. (a) ROC-like curve. Alignments are sorted by mapping quality in the +descending order. For each mapping quality threshold, the fraction of +alignments with mapping quality above the threshold and their error rate +are plotted. (b) Accumulative mapping error rate as a function of mapping +quality. 33,088 $\ge$1000bp reads were simulated using pbsim~\citep{Ono:2013aa} +with error profile sampled from file `m131017\_060208\_42213\_*.1.*' downloaded +at \href{http://bit.ly/chm1p5c3}{http://bit.ly/chm1p5c3}. The N50 read length +is 11,628. A read is considered correctly mapped if the true position overlaps with the best mapping position by 10\% of the read length. All aligners were -run under the default setting for SMRT reads.}\label{fig:eval} +run under the default setting for SMRT reads. Kart outputted all alignments at +mapping quality 60, so is not shown in the figure. It mapped nearly all reads +with 4.1\% of alignments being wrong, less accurate than others.}\label{fig:eval} \end{figure} As a sanity check, we evaluated minimap2 on simulated human reads along with -BLASR~\citep{Chaisson:2012aa}, -BWA-MEM~\citep{Li:2013aa}, -GraphMap~\citep{Sovic:2016aa}, -minialign~\citep{Suzuki:2016} and -NGMLR~\citep{Sedlazeck169557}. We excluded rHAT~\citep{Liu:2016ab}, -LAMSA~\citep{Liu:2017aa} and Kart~\citep{Lin:2017aa} because they either +BLASR~(v1.MC.rc64; \citealp{Chaisson:2012aa}), +BWA-MEM~(v0.7.15; \citealp{Li:2013aa}), +GraphMap~(v0.5.2; \citealp{Sovic:2016aa}), +Kart~(v2.2.5; \citealp{Lin:2017aa}), +minialign~(v0.5.3; \citealp{Suzuki:2016}) and +NGMLR~(v0.2.5; \citealp{Sedlazeck169557}). We excluded rHAT~\citep{Liu:2016ab} +and LAMSA~\citep{Liu:2017aa} because they either crashed or produced malformatted output. In this evaluation, Minimap2 has higher power to distinguish unique and repetitive hits, and achieves overall higher mapping accuracy (Fig.~\ref{fig:eval}a). It is still the most accurate -even if we skip DP-based alignment (data not shown), suggesting chaining alone +even if we skip DP-based alignment (data not shown), confirming chaining alone is sufficient to achieve high accuracy for approximate mapping. Minimap2 and NGMLR provide better mapping quality estimate: they rarely give repetitive hits high mapping quality (Fig.~\ref{fig:eval}b). Apparently, other aligners may occasionally miss close suboptimal hits and be overconfident in wrong mappings. -On run time, minialign is slightly faster than minimap2. They are over 30 times -faster than the rest. Minimap2 consumed 6.1GB memory at the peak, more than -BWA-MEM but less than others. +On run time, minialign is slightly faster than minimap2 and Kart. They are over +30 times faster than the rest. Minimap2 consumed 6.1GB memory at the peak, +more than BWA-MEM but less than others. On real human SMRT reads, the relative performance and sensitivity of -these aligners are broadly similar to those on simulated data. We are unable to -provide a good estimate of mapping error rate due to the lack of the truth. On -ONT ultra-long human reads~\citep{Jain128835}, BWA-MEM failed. Minialign and -minimap2 are over 70 times faster than others. We have also examined tens of -$\ge$100bp INDELs in IGV~\citep{Robinson:2011aa} and can confirm the -observation by~\citet{Sedlazeck169557} that BWA-MEM often breaks them into -shorter gaps. Minimap2 does not have this issue. +these aligners are broadly similar to the metrics on simulated data. We are +unable to provide a good estimate of mapping error rate due to the lack of the +truth. On ONT $\sim$100kb human reads~\citep{Jain128835}, BWA-MEM failed. +Kart, minialign and minimap2 are over 70 times faster than others. We have also +examined tens of $\ge$100bp INDELs in IGV~\citep{Robinson:2011aa} and can +confirm the observation by~\citet{Sedlazeck169557} that BWA-MEM often breaks +them into shorter gaps. The issue is much alleviated with minimap2, thanks +to the 2-piece affine gap cost. \subsection{Aligning spliced reads} -We first evaluated minimap2 on SIRV control data~(AC:SRR5286959; +We evaluated minimap2 on SIRV control data~(AC:SRR5286959; \citealp{Byrne:2017aa}) where the truth is known. Minimap2 predicted 59\,916 -introns, 93.0\% of which are precise. We examined wrongly predicted introns and -found the majority were caused by clustered splicing signals (e.g. two adjacent -${\tt GT}$ sites). When INDEL sequencing errors are frequent, it is difficult -to found precise splicing sites in this case. If we allow up to 10bp distance -from true splicing sites, 98.4\% of aligned introns are approximately correct. -Given this observation, we might be able to improve boundary detection by -initializing $d(\cdot)$ and $a(\cdot)$ in Eq.~(\ref{eq:splice}) with -position-specific scoring matrices or more sophisticated models. We have -not tried this approach. +introns from 11\,017 reads. 93.0\% of splice juctions are precise. We examined +wrongly predicted junctions and found the majority were caused by clustered +splicing signals (e.g. two adjacent ${\tt GT}$ sites). When INDEL sequencing +errors are frequent, it is difficult to find precise splicing sites in this +case. If we allow up to 10bp distance from true splicing sites, 98.4\% of +aligned introns are approximately correct. Given this observation, we might be +able to improve boundary detection by initializing $d(\cdot)$ and $a(\cdot)$ in +Eq.~(\ref{eq:splice}) with position-specific scoring matrices or more +sophisticated models. We have not tried this approach. \begin{table}[!tb] -\processtable{Evaluation of splicing accuracy on 2D ONT reads} +\processtable{Evaluation of junction accuracy on 2D ONT reads} {\footnotesize\label{tab:intron} \begin{tabular}{p{3.1cm}rrrr} \toprule @@ -431,26 +453,45 @@ We next aligned real mouse reads~\citep{Byrne:2017aa} with GMAP~(v2017-06-20; \citealp{Wu:2005vn}), minimap2, SpAln~(v2.3.1; \citealp{Iwata:2012aa}) and STAR~(v2.5.3a; \citealp{Dobin:2013kx}). In general, minimap2 is more consistent with existing annotations (Table~\ref{tab:intron}): it finds -more splicing with a higher percentage being exactly or approximately correct. -We noted that GMAP and SpAln have not been optimized for noisy reads. We have -tried different settings, but their developers should be able to improve the -accuracy further. On run time, minimap2 is over 40 times faster than GMAP and -SpAln. While STAR is close to minimap2 in speed, it does not work well with -noisy reads. +more junctions with a higher percentage being exactly or approximately correct. +Minimap2 is over 40 times faster than GMAP and SpAln. While STAR is close to +minimap2 in speed, it does not work well with noisy reads. We have also +evaluated spliced aligners on public Iso-Seq data (human Alzheimer brain +from \href{http://bit.ly/isoseqpub}{http://bit.ly/isoseqpub}). The observation +is similar: minimap2 is faster at higher junction accuracy. -\section{Discussions} +We noted that GMAP and SpAln have not been optimized for noisy reads. We are +showing the best setting we have experimented, but their developers should be +able to improve their accuracy further. -Minialign and minimap2 are fast because a) with chaining, they can quickly -filter out most false seed hits~\citep{Li:2016aa} and reduce unsuccessful but -costly DP-based alignments; b) they implemented so far the fastest DP-based -alignment algorithm for long sequences~\citep{Suzuki:2016}. It is possible to -further accelerate minimap2 with a few other tweaks such as adaptive -banding~\citep{Suzuki130633} or incremental banding. +%\begin{table}[!tb] +%\processtable{Evaluation of junction accuracy on SMRT Iso-Seq reads} +%{\footnotesize +%\begin{tabular}{lrrrr} +%\toprule +%& GMAP & minimap2 & SpAln & STAR\\ +%\midrule +%Run time (CPU min) & & 243 & 2\,352 & 1\,647 \\ +%\# aligned reads & & 1\,123\,025 & 1\,094\,092 & 682\,452\\ +%\# chimeric alignments & & 33\,091 & 0 & 0\\ +%\# non-spliced alignments & & 339\,081 & 291\,447 & 272\,536\vspace{1em}\\ +%\# aligned introns & & 9\,071\,755 & 9\,208\,564 & 3\,029\,121 \\ +%\# novel introns & & 42\,773 & 82\,230 & 17\,791 \\ +%\% exact introns & & 94.9\% & 91.7\% & 84.7\% \\ +%\% approx. introns&& 96.9\% & 93.4\% & 93.8\% \\ +%\botrule +%\end{tabular} +%}{} +%\end{table} -In addition to reference-based read mapping, minimap2 inherits minimap's -functionality to search against huge multi-species databases and to find read -overlaps. On a few test data sets, minimap2 appears to yield slightly better -miniasm assembly~\citep{Li:2016aa}. Minimap2 can also align closely related + +\section{Conclusion} + +Minimap2 is a fast, accurate and versatile aligner for long nucleotide +sequences. In addition to reference-based read mapping, minimap2 inherits +minimap's functionality to search against huge multi-species databases and to +find read overlaps. On a few test data sets, minimap2 appears to yield slightly +better miniasm assembly~\citep{Li:2016aa}. Minimap2 can also align similar genomes or different assemblies of the same species. However, full-genome alignment is an intricate research topic. More thorough evaluations would be necessary to justify the use of minimap2 for such applications.