From 6c9390b54af0fd1df475eb52829d8ff1a70533e4 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 2 Aug 2017 19:52:31 -0400 Subject: [PATCH] backup --- tex/Makefile | 5 +- tex/bioinfo.cls | 2 +- tex/eval2roc.pl | 17 +++- tex/minimap2.bib | 47 +++++++-- tex/minimap2.tex | 241 +++++++++++++++++++++++++++++++++-------------- tex/roc.gp | 64 +++++++++---- 6 files changed, 272 insertions(+), 104 deletions(-) diff --git a/tex/Makefile b/tex/Makefile index 3614548..418a962 100644 --- a/tex/Makefile +++ b/tex/Makefile @@ -11,7 +11,10 @@ all:minimap2.pdf -minimap2.pdf:minimap2.tex minimap2.bib +roc-color.eps:roc.gp + gnuplot roc.gp + +minimap2.pdf:minimap2.tex minimap2.bib roc-color.pdf pdflatex minimap2; bibtex minimap2; pdflatex minimap2; pdflatex minimap2; clean: diff --git a/tex/bioinfo.cls b/tex/bioinfo.cls index 4ec8ce7..8d6b8c8 100644 --- a/tex/bioinfo.cls +++ b/tex/bioinfo.cls @@ -455,7 +455,7 @@ \vspace*{\aboveskipchk}% \vspace{\dropfromtop}% \hbox to \textwidth{% - {\helvetica\itshape\bfseries\fontsize{19}{12}\selectfont {\color{gray}TECHNICAL NOTES} + {\helvetica\itshape\bfseries\fontsize{19}{12}\selectfont {\color{gray}TECHNICAL REPORT} \hfil \if@appnotes APPLICATIONS NOTE\hfil\fi }% diff --git a/tex/eval2roc.pl b/tex/eval2roc.pl index 66307a3..9c32e38 100755 --- a/tex/eval2roc.pl +++ b/tex/eval2roc.pl @@ -4,17 +4,30 @@ use strict; use warnings; use Getopt::Std; -my %opts = (n=>33088); +my %opts = (n=>33088, s=>100); getopts('n:', \%opts); my $pseudo = .5; my $tot = $pseudo; my $err = $pseudo; +my $tot_last_out = -$opts{s}; +my $state = 0; +my $mapq = 0; while (<>) { chomp; if (/^Q\t(\d+)\t(\d+)\t(\d+)/) { $tot += $2; $err += $3; - print join("\t", $1, $err/$tot, $tot / $opts{n}), "\n"; + if ($tot - $tot_last_out >= $opts{s}) { + print join("\t", $1, $err/$tot, $tot / $opts{n}), "\n"; + $tot_last_out = $tot; + $state = 0; + } else { + $state = 1; + $mapq = $1; + } } } +if ($state) { + print join("\t", $mapq, $err/$tot, $tot / $opts{n}), "\n"; +} diff --git a/tex/minimap2.bib b/tex/minimap2.bib index 2edb4a9..f9204f3 100644 --- a/tex/minimap2.bib +++ b/tex/minimap2.bib @@ -7,7 +7,7 @@ Year = {2012}} @article{Liu:2016ab, - Author = {Liu, Bo and Guan, Dengfeng and Teng, Mingxiang and Wang, Yadong}, + Author = {Liu, Bo and others}, Journal = {Bioinformatics}, Pages = {1625-31}, Title = {{rHAT}: fast alignment of noisy long reads with regional hashing}, @@ -15,7 +15,7 @@ Year = {2016}} @article{Liu:2017aa, - Author = {Liu, Bo and Gao, Yan and Wang, Yadong}, + Author = {Liu, Bo and others}, Journal = {Bioinformatics}, Pages = {192-201}, Title = {{LAMSA}: fast split read alignment with long approximate matches}, @@ -38,7 +38,7 @@ Year = {2013}} @article{Sovic:2016aa, - Author = {Sovi{\'c}, Ivan and {\v S}iki{\'c}, Mile and Wilm, Andreas and Fenlon, Shannon Nicole and Chen, Swaine and others}, + Author = {Sovi{\'c}, Ivan and others}, Journal = {Nat Commun}, Pages = {11307}, Title = {Fast and sensitive mapping of nanopore sequencing reads with {GraphMap}}, @@ -68,6 +68,13 @@ howpublished = {\href{https://github.com/ocxtal/minialign}{https://github.com/ocxtal/minialign}}, year = {2016}} +@misc{Ruan:2016, + title = {Ultra-fast de novo assembler using long noisy reads}, + author = {Jue Ruan}, + journal = {Unpulished}, + howpublished = {\href{https://github.com/ruanjue/smartdenovo}{https://github.com/ruanjue/smartdenovo}}, + year = {2016}} + @article{Miller:1988aa, Author = {Miller, W and Myers, E W}, Journal = {Bull Math Biol}, @@ -86,7 +93,7 @@ Year = {1990}} @article{Wu:1996aa, - Author = {Wu, Sun and Manber, U and Myers, Gene}, + Author = {Wu, Sun and others}, Journal = {Algorithmica}, Pages = {50-67}, Title = {A subquadratic algorithm for approximate limited expression matching}, @@ -98,22 +105,22 @@ Journal = {BMC Bioinformatics}, Month = {Feb}, Pages = {81}, - Title = {Parasail: SIMD C library for global, semi-global, and local pairwise sequence alignments}, + Title = {Parasail: {SIMD C} library for global, semi-global, and local pairwise sequence alignments}, Volume = {17}, Year = {2016}} @article{Sedlazeck169557, - author = {Sedlazeck, Fritz J and Rescheneder, Philipp and Smolka, Moritz and Fang, Han and Nattestad, Maria and others}, + author = {Sedlazeck, Fritz J and others}, title = {Accurate detection of complex structural variations using single molecule sequencing}, note = {doi:10.1101/169557}, journal = {bioRxiv}, year = {2017}} @article{Altschul:1997vn, - Author = {Altschul, S F and Madden, T L and Sch{\"a}ffer, A A and Zhang, J and Zhang, Z and others}, + Author = {Altschul, S F and others}, Journal = {Nucleic Acids Res}, Pages = {3389-402}, - Title = {Gapped BLAST and PSI-BLAST: a new generation of protein database search programs}, + Title = {Gapped {BLAST} and {PSI-BLAST}: a new generation of protein database search programs}, Volume = {25}, Year = {1997}} @@ -132,3 +139,27 @@ Title = {Chaining algorithms for multiple genome comparison}, Volume = {3}, Year = {2005}} + +@article{Ono:2013aa, + Author = {Ono, Yukiteru and others}, + Journal = {Bioinformatics}, + Pages = {119-21}, + Title = {{PBSIM}: {PacBio} reads simulator--toward accurate genome assembly}, + Volume = {29}, + Year = {2013}} + +@article {Jain128835, + author = {Jain, Miten and others}, + title = {Nanopore sequencing and assembly of a human genome with ultra-long reads}, + year = {2017}, + note = {doi:10.1101/128835}, + publisher = {Cold Spring Harbor Labs Journals}, + journal = {bioRxiv}} + +@article{Lau:2016aa, + Author = {Lau, Bayo and others}, + Journal = {Bioinformatics}, + Pages = {3829-3832}, + Title = {LongISLND: in silico sequencing of lengthy and noisy datatypes}, + Volume = {32}, + Year = {2016}} diff --git a/tex/minimap2.tex b/tex/minimap2.tex index 08abf75..e69fd87 100644 --- a/tex/minimap2.tex +++ b/tex/minimap2.tex @@ -2,6 +2,7 @@ \copyrightyear{2017} \pubyear{2017} +\usepackage{graphicx} \usepackage{hyperref} \usepackage{url} \usepackage{amsmath} @@ -19,7 +20,7 @@ \begin{document} \firstpage{1} -\title[Long-read and assembly alignment with minimap2]{Minimap2: fast sequence alignment for long noisy reads and assembly contigs} +\title[Long sequence alignment with minimap2]{Minimap2: fast pairwise alignment for long noisy sequences} \author[Li]{Heng Li} \address{Broad Institute, 415 Main Street, Cambridge, MA 02142, USA} @@ -29,7 +30,7 @@ \section{Summary:} Minimap2 is a program to align long noisy sequences against a large reference database. It targets query sequences of 1kb--100Mb in length with sequence divergence typically below 25\%. Minimap2 is $\sim$30 times -faster than most existing long-read aligners and achieves higher accuracy on +faster than many mainstream long-read aligners and achieves higher accuracy on simulated data. It also employs concave gap cost and rescues inversions for improved alignment around potential structural variations. @@ -60,10 +61,10 @@ towards higher accuracy. Minimap2 is the successor of minimap~\citep{Li:2016aa}. It uses similar indexing and seeding algorithms except that minimap2 optionally uses -homopolymer-compressed (HPC; cite) $k$-mers in addition to normal $k$-mers. -Indexing with HPC $k$-mers leads to higher mapping sensitivity for SMRT reads. -Minimap2 further implements a more accurate chaining algorithm and adds -the ability to produce detailed alignment. +homopolymer-compressed (HPC; \citealp{Ruan:2016,Lau:2016aa}) $k$-mers in +addition to normal $k$-mers. Indexing with HPC $k$-mers leads to higher +mapping sensitivity for SMRT reads. Minimap2 further implements a more +accurate chaining algorithm and adds the ability to produce detailed alignment. \subsection{Chaining} @@ -84,20 +85,19 @@ distance between two anchors is too large); otherwise \gamma(j,i)=\gamma'(\max\{y_i-y_j,x_i-x_j\}-\min\{y_i-y_j,x_i-x_j\}) \] In implementation, a gap of length $l$ costs $\gamma'(l)=\alpha\cdot -l+\beta\log_2(l)$. For $m$ anchors, computing all $f(\cdot)$ -with Eq.~(\ref{eq:chain}) takes $O(m^2)$ time. We note that if anchor $i$ is -appended to $j$, appending $i$ to a predecessor of $j$ is likely to yield a -lower score. When evaluating Eq.~(\ref{eq:chain}), we start from anchor $i-1$ -and stop the evaluation if we cannot find a better score after up to $h$ -iterations. This heuristic reduces the average time to $O(h\cdot m)$. In -practical, we can almost always find the optimal chain with $h=50$; even if the -heuristic fails, the optimal chain often looks dubious. +l+\beta\log_2(l)$. For $m$ anchors, directly computing all $f(\cdot)$ with +Eq.~(\ref{eq:chain}) takes $O(m^2)$ time. Although theoretically faster +chaining algorithms exist~\citep{Abouelhoda:2005aa}, they +are inapplicable to generic gap cost, complex to implement and usually +associated with a large constant. We introduced a simple heurstic to accelerate +chaining. -%Although theoretically faster chaining algorithms exist for simple gap -%cost~\citep{Abouelhoda:2005aa}, they are not as flexible as DP and may not lead -%to better performance than our approach in practice. Furthermore, chaining -%takes much less computing time than alignment. It is not critical to the -%performance of minimap2. +We note that if anchor $i$ is appended to $j$, appending $i$ to a predecessor +of $j$ is likely to yield a lower score. When evaluating Eq.~(\ref{eq:chain}), +we start from anchor $i-1$ and stop the evaluation if we cannot find a better +score after up to $h$ iterations. This heuristic reduces the average time to +$O(h\cdot m)$. In practice, we can almost always find the optimal chain with +$h=50$; even if the heuristic fails, the optimal chain often looks dubious. \subsubsection{Backtracking} Let $P(i)$ be the index of the best predecessor of anchor $i$. It equals 0 if @@ -120,9 +120,9 @@ banded alignment, which is critical to performance. In practice, our implementation is three times as fast as Parasail's 4-way vectorization~\citep{Daily:2016aa} for global alignment. Without banding, our implementation is slower than Edlib~\citep{Sosic:2017aa}, -but with a 1000bp band, it is much faster. When performing global alignment -between anchors, we expect the alignment to stay close to the diagonal of the -DP matrix. Banding is applicable most of time. +but with a 1000bp band, it is considerably faster. When performing global +alignment between anchors, we expect the alignment to stay close to the +diagonal of the DP matrix. Banding is often applicable. Minimap2 uses a 2-piece affine gap cost $\gamma(l)=\min\{q+l\cdot e,\tilde{q}+l\cdot\tilde{e}\}$. @@ -151,57 +151,154 @@ alignment, but this time with the one subsequence reverse complemented. This additional alignment step may identify short inversions that are missed during chaining. -%\begin{equation}\label{eq:ae86} -%\left\{\begin{array}{l} -%H_{ij} = \max\{H_{i-1,j-1}+s(i,j),E_{ij},F_{ij},\tilde{E}_{ij},\tilde{F}_{ij}\}\\ -%E_{i+1,j}= \max\{H_{ij}-q,E_{ij}\}-e\\ -%F_{i,j+1}= \max\{H_{ij}-q,F_{ij}\}-e\\ -%\tilde{E}_{i+1,j}= \max\{H_{ij}-\tilde{q},\tilde{E}_{ij}\}-\tilde{e}\\ -%\tilde{F}_{i,j+1}= \max\{H_{ij}-\tilde{q},\tilde{F}_{ij}\}-\tilde{e} -%\end{array}\right. -%\end{equation} -%where $s(i,j)$ is the score between the $i$-th reference base and $j$-th query -%base. If we define~\citep{Wu:1996aa,Suzuki:2016} -%\[ -%\left\{\begin{array}{ll} -%u_{ij}\triangleq H_{ij}-H_{i-1,j} & v_{ij}\triangleq H_{ij}-H_{i,j-1} \\ -%x_{ij}\triangleq E_{i+1,j}-H_{ij} & \tilde{x}_{ij}\triangleq \tilde{E}_{i+1,j}-\tilde{H}_{ij} \\ -%y_{ij}\triangleq F_{i,j+1}-H_{ij} & \tilde{y}_{ij}\triangleq \tilde{F}_{i,j+1}-\tilde{H}_{ij} -%\end{array}\right. -%\] -%we can transform Eq.~(\ref{eq:ae86}) to -%\[ -%\left\{\begin{array}{lll} -%z_{ij}&=&\max\{s(i,j),x_{i-1,j}+v_{i-1,j},y_{i,j-1}+u_{i,j-1},\\ -%&&\tilde{x}_{i-1,j}+v_{i-1,j},\tilde{y}_{i,j-1}+u_{i,j-1}\}\\ -%u_{ij}&=&z_{ij}-v_{i-1,j}\\ -%v_{ij}&=&z_{ij}-u_{i,j-1}\\ -%x_{ij}&=&\max\{0,x_{i-1,j}+v_{i-1,j}-z_{ij}+q\}-q-e\\ -%y_{ij}&=&\max\{0,y_{i,j-1}+u_{i,j-1}-z_{ij}+q\}-q-e\\ -%\tilde{x}_{ij}&=&\max\{0,\tilde{x}_{i-1,j}+v_{i-1,j}-z_{ij}+\tilde{q}\}-\tilde{q}-\tilde{e}\\ -%\tilde{y}_{ij}&=&\max\{0,\tilde{y}_{i,j-1}+u_{i,j-1}-z_{ij}+\tilde{q}\}-\tilde{q}-\tilde{e} -%\end{array}\right. -%\] -%with boundary conditions -%\[ -%\left\{\begin{array}{l} -%x_{-1,\cdot}=y_{\cdot,-1}=-q-e\\ -%\tilde{x}_{-1,\cdot}=\tilde{y}_{\cdot,-1}=-\tilde{q}-\tilde{e}\\ -%u_{i,-1}=\eta(i)\\ -%v_{-1,j}=\eta(j) -%\end{array}\right. -%\] -%where -%\[ -%\eta(k)=\left\{\begin{array}{ll} -%-q-e & (k=0) \\ -%-e & (k<\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil) \\ -%i\cdot(e-\tilde{e})-(\tilde{q}-q)-\tilde{e} & (k=\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil) \\ -%-\tilde{e} & (k>\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil) -%\end{array}\right. -%\] - \end{methods} +\section{Results and discussions} +\begin{figure}[!tb] +\centering +\includegraphics[width=.5\textwidth]{roc-color.pdf} +\caption{Evaluation on simulated SMRT reads aligned against human genome +GRCh38. (a) ROC-like curve. (b) Accumulative mapping error rate as a function +of mapping quality. 33,088 $\ge$1000bp reads were simulated using +pbsim~\citep{Ono:2013aa} with error profile sampled from file +`m131017\_060208\_42213\_*.1.*' downloaded at +\href{http://bit.ly/chm1p5c3}{http://bit.ly/chm1p5c3}. The N50 read length is +11,628. A read is considered correctly mapped if the true position overlaps +with the best mapping position by 10\% of the read length. All aligners were +run under the default setting for SMRT reads.}\label{fig:eval} +\end{figure} + +As a sanity check, we evaluated minimap2 on simulated human reads along with +BLASR~\citep{Chaisson:2012aa}, +BWA-MEM~\citep{Li:2013aa}, +GraphMap~\citep{Sovic:2016aa}, +minialign~\citep{Suzuki:2016} and +NGMLR~\citep{Sedlazeck169557}. We excluded rHAT~\citep{Liu:2016ab}, +LAMSA~\citep{Liu:2017aa} and Kart~\citep{Lin:2017aa} because they either +crashed or produced malformatted SAM. In this evaluation, Minimap2 has a +higher power to distinguish unique and repetitive hits, and achieves overall +higher mapping accuracy (Fig.~\ref{fig:eval}a). Minimap2 and NGMLR provide +better mapping quality estimate: they rarely give repetitive hits high mapping +quality (Fig.~\ref{fig:eval}b). Apparently, other aligners may occasionally +miss close suboptimal hits and be overconfident in wrong mappings. On run time, +minialign is slightly faster than minimap2. They are over 30 times faster than +the rest. + +On real SMRT reads from human, the relative performance and sensitivity of +these aligners are broadly similar to those on simulated data. We are unable to +provide a good estimate of mapping error rate due to the lack of the truth. On +ONT ultra-long human reads~\citep{Jain128835}, BWA-MEM failed. Minialign and +minimap2 are over 70 times faster than others. In addition to reference-based +read mapping, minimap2 can also find overlaps between long reads and align +long-read assemblies. + \bibliography{minimap2} + +\pagebreak + +\begin{methods} +\section*{Appendix} +A 2-piece gap cost function is +\[ +\gamma(l)=\min\{q+l\cdot e,\tilde{q}+l\cdot\tilde{e}\} +\] +Without losing generality, we assume $q+e\le\tilde{q}+\tilde{e}$. The equation +to compute the optimal alignment under such a gap cost is +\begin{equation}\label{eq:ae86} +\left\{\begin{array}{l} +H_{ij} = \max\{H_{i-1,j-1}+s(i,j),E_{ij},F_{ij},\tilde{E}_{ij},\tilde{F}_{ij}\}\\ +E_{i+1,j}= \max\{H_{ij}-q,E_{ij}\}-e\\ +F_{i,j+1}= \max\{H_{ij}-q,F_{ij}\}-e\\ +\tilde{E}_{i+1,j}= \max\{H_{ij}-\tilde{q},\tilde{E}_{ij}\}-\tilde{e}\\ +\tilde{F}_{i,j+1}= \max\{H_{ij}-\tilde{q},\tilde{F}_{ij}\}-\tilde{e} +\end{array}\right. +\end{equation} +where $s(i,j)$ is the score between the $i$-th reference base and $j$-th query +base. If we define +\[ +\left\{\begin{array}{ll} +u_{ij}\triangleq H_{ij}-H_{i-1,j} & v_{ij}\triangleq H_{ij}-H_{i,j-1} \\ +x_{ij}\triangleq E_{i+1,j}-H_{ij} & \tilde{x}_{ij}\triangleq \tilde{E}_{i+1,j}-\tilde{H}_{ij} \\ +y_{ij}\triangleq F_{i,j+1}-H_{ij} & \tilde{y}_{ij}\triangleq \tilde{F}_{i,j+1}-\tilde{H}_{ij} +\end{array}\right. +\] +we can transform Eq.~(\ref{eq:ae86}) to +\begin{equation}\label{eq:suzuki} +\left\{\begin{array}{lll} +z_{ij}&=&\max\{s(i,j),x_{i-1,j}+v_{i-1,j},y_{i,j-1}+u_{i,j-1},\\ +&&\tilde{x}_{i-1,j}+v_{i-1,j},\tilde{y}_{i,j-1}+u_{i,j-1}\}\\ +u_{ij}&=&z_{ij}-v_{i-1,j}\\ +v_{ij}&=&z_{ij}-u_{i,j-1}\\ +x_{ij}&=&\max\{0,x_{i-1,j}+v_{i-1,j}-z_{ij}+q\}-q-e\\ +y_{ij}&=&\max\{0,y_{i,j-1}+u_{i,j-1}-z_{ij}+q\}-q-e\\ +\tilde{x}_{ij}&=&\max\{0,\tilde{x}_{i-1,j}+v_{i-1,j}-z_{ij}+\tilde{q}\}-\tilde{q}-\tilde{e}\\ +\tilde{y}_{ij}&=&\max\{0,\tilde{y}_{i,j-1}+u_{i,j-1}-z_{ij}+\tilde{q}\}-\tilde{q}-\tilde{e} +\end{array}\right. +\end{equation} +where $z_{ij}$ is a temporary variable that does not need to be stored. We can +see that +\[ +x_{ij}=E_{i+1,j}-H_{ij}=\max\{-q,E_{ij}-H_{ij}\}-e +\] +With $E_{ij}\le H_{ij}$, we have +\[ +-q-e\le x_{ij}\le\max\{-q,0\}-e=-e +\] +and similar inequations for $y_{ij}$, $\tilde{x}_{ij}$ and $\tilde{y}_{ij}$. +In addition, +\[ +u_{ij}=z_{ij}-v_{i-1,j}\ge\max\{x_{i-1,j},\tilde{x}_{i-1,j}\}\ge-q-e +\] +We also note that the maximum possible $z_{ij}=H_{ij}-H_{i-1,j-1}$ is $M$, the +maximal matching score. As a result, +\[ +u_{ij}\le M-v_{i-1,j}\le M+q+e +\] +In conclusion, all values in Eq.~(\ref{eq:suzuki}) are bounded: $x$ and $y$ by +$[-q-e,-e]$ and $\tilde{x}$, $\tilde{y}$ by +$[-\tilde{q}-\tilde{e},-\tilde{e}]$, and $u$ and $v$ by $[-q-e,M+q+e]$. When +matching score and gap cost are small, each of them can be stored as a 8-bit +integer. This enables efficient SSE vectorization regardless of the peak score +of the alignment. + +For more efficient SSE implementation, we transform the row-column coordinate +to diagonal-anti-diagonal coordinate by letting $r\gets i+j$ and $t\gets i$. +Eq.~(\ref{eq:suzuki}) becomes: +\begin{equation*} +\left\{\begin{array}{lll} +z_{rt}&=&\max\{s(t,r-t),x_{r-1,t-1}+v_{r-1,t-1},y_{r-1,t}+u_{r-1,t},\\ +&&\tilde{x}_{r-1,t-1}+v_{r-1,t-1},\tilde{y}_{r-1,t}+u_{r-1,t}\}\\ +u_{rt}&=&z_{rt}-v_{r-1,t-1}\\ +v_{rt}&=&z_{rt}-u_{r-1,t}\\ +x_{rt}&=&\max\{0,x_{r-1,t-1}+v_{r-1,t-1}-z_{rt}+q\}-q-e\\ +y_{rt}&=&\max\{0,y_{r-1,t}+u_{r-1,t}-z_{rt}+q\}-q-e\\ +\tilde{x}_{rt}&=&\max\{0,\tilde{x}_{r-1,t-1}+v_{r-1,t-1}-z_{rt}+\tilde{q}\}-\tilde{q}-\tilde{e}\\ +\tilde{y}_{rt}&=&\max\{0,\tilde{y}_{r-1,t}+u_{r-1,t}-z_{rt}+\tilde{q}\}-\tilde{q}-\tilde{e} +\end{array}\right. +\end{equation*} +In this formulation, cells with the same row index $r$ are independent of each +other. This allows us to vectorize the computation of all cells on the same +anti-diagonal in one inner loop. + +On the condition that $q+e<\tilde{q}+\tilde{e}$ and $e>\tilde{e}$, the boundary +condition of this equation in the diagonal-anti-diagonal coordinate is +\[ +\left\{\begin{array}{l} +x_{r-1,-1}=y_{r-1,r}=-q-e\\ +\tilde{x}_{r-1,-1}=\tilde{y}_{r-1,r}=-\tilde{q}-\tilde{e}\\ +u_{r-1,r}=v_{r-1,-1}=\eta(r)\\ +\end{array}\right. +\] +where +\[ +\eta(r)=\left\{\begin{array}{ll} +-q-e & (r=0) \\ +-e & (r<\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil) \\ +r\cdot(e-\tilde{e})-(\tilde{q}-q)-\tilde{e} & (r=\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil) \\ +-\tilde{e} & (r>\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil) +\end{array}\right. +\] +\citet{Suzuki:2016} first derived a similar set of equations under affine gap +cost but with different notations. +\end{methods} \end{document} diff --git a/tex/roc.gp b/tex/roc.gp index 630b56f..a053db8 100644 --- a/tex/roc.gp +++ b/tex/roc.gp @@ -1,28 +1,52 @@ -set t po eps enh co so "Helvetica,18" +set t po eps enh co so "Helvetica,26" -set style line 1 lt 1 pt 1 lc rgb "#FF0000" lw 2; -set style line 2 lt 1 pt 2 lc rgb "#00C000" lw 2; -set style line 3 lt 1 pt 3 lc rgb "#0080FF" lw 2; -set style line 4 lt 1 pt 4 lc rgb "#C000FF" lw 2; -set style line 5 lt 1 pt 5 lc rgb "#00EEEE" lw 2; -set style line 6 lt 1 pt 6 lc rgb "#C04000" lw 2; -set style line 7 lt 1 lc rgb "#C8C800" lw 2; -set style line 8 lt 1 lc rgb "#FF80FF" lw 2; -set style line 9 lt 1 lc rgb "#4E642E" lw 2; -set style line 10 lt 1 lc rgb "#800000" lw 2; -set style line 11 lt 1 lc rgb "#67B7F7" lw 2; -set style line 12 lt 1 lc rgb "#FFC127" lw 2; - -set xlab "False positive rate" -set ylab "Sensitivity" -set yran [0.9:1] +set style line 1 lt 1 pt 1 lc rgb "#e41a1c" lw 2; +set style line 2 lt 1 pt 2 lc rgb "#377eb8" lw 2; +set style line 3 lt 1 pt 3 lc rgb "#4daf4a" lw 2; +set style line 4 lt 1 pt 4 lc rgb "#984ea3" lw 2; +set style line 5 lt 1 pt 6 lc rgb "#ff7f00" lw 2; +set style line 6 lt 1 pt 8 lc rgb "#f781bf" lw 2; set out "roc-color.eps" + +set pointsize 2.0 +set size 1.59,1.04 +set multiplot layout 1,2 + +set label "(a)" at graph -0.245,1.06 font "Helvetica-bold,40" +set xlab "Error rate of mapped reads" +set ylab "Fraction of mapped reads" off +1.8 +set ytics 0.02 +set yran [0.9:1] + +set size 0.8,1 set log x set format x "10^{%L}" set key bot right -plot "<./eval2roc.pl blasr-mc.eval" u 2:3 t "blasr-mc" w lp ls 1, \ +plot "<./eval2roc.pl blasr-mc.eval" u 2:3 t "blasr-mc" w lp ls 4, \ "<./eval2roc.pl bwa.eval" u 2:3 t "bwa-mem" w lp ls 2, \ - "<./eval2roc.pl minialign.eval" u 2:3 t "minialign" w lp ls 3, \ - "<./eval2roc.pl mm2.eval" u 2:3 t "minimap2" w lp ls 4, \ + "<./eval2roc.pl graphmap.eval" u 2:3 t "graphmap" w lp ls 3, \ + "<./eval2roc.pl minialign.eval" u 2:3 t "minialign" w lp ls 1, \ + "<./eval2roc.pl mm2.eval" u 2:3 t "minimap2" w lp ls 6, \ "<./eval2roc.pl ngmlr.eval" u 2:3 t "ngm-lr" w lp ls 5 +unset label + +set origin 0.8,0 +set size 0.79,1 +set label "(b)" at graph -0.245,1.06 font "Helvetica-bold,40" +unset log +unset format +unset key +set log y +set ylab "Accumulative mapping error rate" off +0 +set xlab "Mapping quality" +set yran [1e-5:0.1] +set ytics 1e-5,0.1 +set format y "10^{%L}" +set xran [60:0] reverse +plot "<./eval2roc.pl blasr-mc.eval" u 1:2 w lp ls 4, \ + "<./eval2roc.pl bwa.eval" u 1:2 t "bwa-mem" w lp ls 2, \ + "<./eval2roc.pl graphmap.eval" u 1:2 t "graphmap" w lp ls 3, \ + "<./eval2roc.pl minialign.eval" u 1:2 t "minialign" w lp ls 1, \ + "<./eval2roc.pl mm2.eval" u 1:2 t "minimap2" w lp ls 6, \ + "<./eval2roc.pl ngmlr.eval" u 1:2 t "ngm-lr" w lp ls 5