This commit is contained in:
Heng Li 2017-08-02 19:52:31 -04:00
parent 9306299e4d
commit 6c9390b54a
6 changed files with 272 additions and 104 deletions

View File

@ -11,7 +11,10 @@
all:minimap2.pdf
minimap2.pdf:minimap2.tex minimap2.bib
roc-color.eps:roc.gp
gnuplot roc.gp
minimap2.pdf:minimap2.tex minimap2.bib roc-color.pdf
pdflatex minimap2; bibtex minimap2; pdflatex minimap2; pdflatex minimap2;
clean:

View File

@ -455,7 +455,7 @@
\vspace*{\aboveskipchk}%
\vspace{\dropfromtop}%
\hbox to \textwidth{%
{\helvetica\itshape\bfseries\fontsize{19}{12}\selectfont {\color{gray}TECHNICAL NOTES}
{\helvetica\itshape\bfseries\fontsize{19}{12}\selectfont {\color{gray}TECHNICAL REPORT}
\hfil
\if@appnotes APPLICATIONS NOTE\hfil\fi
}%

View File

@ -4,17 +4,30 @@ use strict;
use warnings;
use Getopt::Std;
my %opts = (n=>33088);
my %opts = (n=>33088, s=>100);
getopts('n:', \%opts);
my $pseudo = .5;
my $tot = $pseudo;
my $err = $pseudo;
my $tot_last_out = -$opts{s};
my $state = 0;
my $mapq = 0;
while (<>) {
chomp;
if (/^Q\t(\d+)\t(\d+)\t(\d+)/) {
$tot += $2;
$err += $3;
print join("\t", $1, $err/$tot, $tot / $opts{n}), "\n";
if ($tot - $tot_last_out >= $opts{s}) {
print join("\t", $1, $err/$tot, $tot / $opts{n}), "\n";
$tot_last_out = $tot;
$state = 0;
} else {
$state = 1;
$mapq = $1;
}
}
}
if ($state) {
print join("\t", $mapq, $err/$tot, $tot / $opts{n}), "\n";
}

View File

@ -7,7 +7,7 @@
Year = {2012}}
@article{Liu:2016ab,
Author = {Liu, Bo and Guan, Dengfeng and Teng, Mingxiang and Wang, Yadong},
Author = {Liu, Bo and others},
Journal = {Bioinformatics},
Pages = {1625-31},
Title = {{rHAT}: fast alignment of noisy long reads with regional hashing},
@ -15,7 +15,7 @@
Year = {2016}}
@article{Liu:2017aa,
Author = {Liu, Bo and Gao, Yan and Wang, Yadong},
Author = {Liu, Bo and others},
Journal = {Bioinformatics},
Pages = {192-201},
Title = {{LAMSA}: fast split read alignment with long approximate matches},
@ -38,7 +38,7 @@
Year = {2013}}
@article{Sovic:2016aa,
Author = {Sovi{\'c}, Ivan and {\v S}iki{\'c}, Mile and Wilm, Andreas and Fenlon, Shannon Nicole and Chen, Swaine and others},
Author = {Sovi{\'c}, Ivan and others},
Journal = {Nat Commun},
Pages = {11307},
Title = {Fast and sensitive mapping of nanopore sequencing reads with {GraphMap}},
@ -68,6 +68,13 @@
howpublished = {\href{https://github.com/ocxtal/minialign}{https://github.com/ocxtal/minialign}},
year = {2016}}
@misc{Ruan:2016,
title = {Ultra-fast de novo assembler using long noisy reads},
author = {Jue Ruan},
journal = {Unpulished},
howpublished = {\href{https://github.com/ruanjue/smartdenovo}{https://github.com/ruanjue/smartdenovo}},
year = {2016}}
@article{Miller:1988aa,
Author = {Miller, W and Myers, E W},
Journal = {Bull Math Biol},
@ -86,7 +93,7 @@
Year = {1990}}
@article{Wu:1996aa,
Author = {Wu, Sun and Manber, U and Myers, Gene},
Author = {Wu, Sun and others},
Journal = {Algorithmica},
Pages = {50-67},
Title = {A subquadratic algorithm for approximate limited expression matching},
@ -98,22 +105,22 @@
Journal = {BMC Bioinformatics},
Month = {Feb},
Pages = {81},
Title = {Parasail: SIMD C library for global, semi-global, and local pairwise sequence alignments},
Title = {Parasail: {SIMD C} library for global, semi-global, and local pairwise sequence alignments},
Volume = {17},
Year = {2016}}
@article{Sedlazeck169557,
author = {Sedlazeck, Fritz J and Rescheneder, Philipp and Smolka, Moritz and Fang, Han and Nattestad, Maria and others},
author = {Sedlazeck, Fritz J and others},
title = {Accurate detection of complex structural variations using single molecule sequencing},
note = {doi:10.1101/169557},
journal = {bioRxiv},
year = {2017}}
@article{Altschul:1997vn,
Author = {Altschul, S F and Madden, T L and Sch{\"a}ffer, A A and Zhang, J and Zhang, Z and others},
Author = {Altschul, S F and others},
Journal = {Nucleic Acids Res},
Pages = {3389-402},
Title = {Gapped BLAST and PSI-BLAST: a new generation of protein database search programs},
Title = {Gapped {BLAST} and {PSI-BLAST}: a new generation of protein database search programs},
Volume = {25},
Year = {1997}}
@ -132,3 +139,27 @@
Title = {Chaining algorithms for multiple genome comparison},
Volume = {3},
Year = {2005}}
@article{Ono:2013aa,
Author = {Ono, Yukiteru and others},
Journal = {Bioinformatics},
Pages = {119-21},
Title = {{PBSIM}: {PacBio} reads simulator--toward accurate genome assembly},
Volume = {29},
Year = {2013}}
@article {Jain128835,
author = {Jain, Miten and others},
title = {Nanopore sequencing and assembly of a human genome with ultra-long reads},
year = {2017},
note = {doi:10.1101/128835},
publisher = {Cold Spring Harbor Labs Journals},
journal = {bioRxiv}}
@article{Lau:2016aa,
Author = {Lau, Bayo and others},
Journal = {Bioinformatics},
Pages = {3829-3832},
Title = {LongISLND: in silico sequencing of lengthy and noisy datatypes},
Volume = {32},
Year = {2016}}

View File

@ -2,6 +2,7 @@
\copyrightyear{2017}
\pubyear{2017}
\usepackage{graphicx}
\usepackage{hyperref}
\usepackage{url}
\usepackage{amsmath}
@ -19,7 +20,7 @@
\begin{document}
\firstpage{1}
\title[Long-read and assembly alignment with minimap2]{Minimap2: fast sequence alignment for long noisy reads and assembly contigs}
\title[Long sequence alignment with minimap2]{Minimap2: fast pairwise alignment for long noisy sequences}
\author[Li]{Heng Li}
\address{Broad Institute, 415 Main Street, Cambridge, MA 02142, USA}
@ -29,7 +30,7 @@
\section{Summary:} Minimap2 is a program to align long noisy sequences against
a large reference database. It targets query sequences of 1kb--100Mb in length
with sequence divergence typically below 25\%. Minimap2 is $\sim$30 times
faster than most existing long-read aligners and achieves higher accuracy on
faster than many mainstream long-read aligners and achieves higher accuracy on
simulated data. It also employs concave gap cost and rescues inversions for
improved alignment around potential structural variations.
@ -60,10 +61,10 @@ towards higher accuracy.
Minimap2 is the successor of minimap~\citep{Li:2016aa}. It uses similar
indexing and seeding algorithms except that minimap2 optionally uses
homopolymer-compressed (HPC; cite) $k$-mers in addition to normal $k$-mers.
Indexing with HPC $k$-mers leads to higher mapping sensitivity for SMRT reads.
Minimap2 further implements a more accurate chaining algorithm and adds
the ability to produce detailed alignment.
homopolymer-compressed (HPC; \citealp{Ruan:2016,Lau:2016aa}) $k$-mers in
addition to normal $k$-mers. Indexing with HPC $k$-mers leads to higher
mapping sensitivity for SMRT reads. Minimap2 further implements a more
accurate chaining algorithm and adds the ability to produce detailed alignment.
\subsection{Chaining}
@ -84,20 +85,19 @@ distance between two anchors is too large); otherwise
\gamma(j,i)=\gamma'(\max\{y_i-y_j,x_i-x_j\}-\min\{y_i-y_j,x_i-x_j\})
\]
In implementation, a gap of length $l$ costs $\gamma'(l)=\alpha\cdot
l+\beta\log_2(l)$. For $m$ anchors, computing all $f(\cdot)$
with Eq.~(\ref{eq:chain}) takes $O(m^2)$ time. We note that if anchor $i$ is
appended to $j$, appending $i$ to a predecessor of $j$ is likely to yield a
lower score. When evaluating Eq.~(\ref{eq:chain}), we start from anchor $i-1$
and stop the evaluation if we cannot find a better score after up to $h$
iterations. This heuristic reduces the average time to $O(h\cdot m)$. In
practical, we can almost always find the optimal chain with $h=50$; even if the
heuristic fails, the optimal chain often looks dubious.
l+\beta\log_2(l)$. For $m$ anchors, directly computing all $f(\cdot)$ with
Eq.~(\ref{eq:chain}) takes $O(m^2)$ time. Although theoretically faster
chaining algorithms exist~\citep{Abouelhoda:2005aa}, they
are inapplicable to generic gap cost, complex to implement and usually
associated with a large constant. We introduced a simple heurstic to accelerate
chaining.
%Although theoretically faster chaining algorithms exist for simple gap
%cost~\citep{Abouelhoda:2005aa}, they are not as flexible as DP and may not lead
%to better performance than our approach in practice. Furthermore, chaining
%takes much less computing time than alignment. It is not critical to the
%performance of minimap2.
We note that if anchor $i$ is appended to $j$, appending $i$ to a predecessor
of $j$ is likely to yield a lower score. When evaluating Eq.~(\ref{eq:chain}),
we start from anchor $i-1$ and stop the evaluation if we cannot find a better
score after up to $h$ iterations. This heuristic reduces the average time to
$O(h\cdot m)$. In practice, we can almost always find the optimal chain with
$h=50$; even if the heuristic fails, the optimal chain often looks dubious.
\subsubsection{Backtracking}
Let $P(i)$ be the index of the best predecessor of anchor $i$. It equals 0 if
@ -120,9 +120,9 @@ banded alignment, which is critical to performance. In practice, our
implementation is three times as fast as Parasail's 4-way
vectorization~\citep{Daily:2016aa} for global alignment.
Without banding, our implementation is slower than Edlib~\citep{Sosic:2017aa},
but with a 1000bp band, it is much faster. When performing global alignment
between anchors, we expect the alignment to stay close to the diagonal of the
DP matrix. Banding is applicable most of time.
but with a 1000bp band, it is considerably faster. When performing global
alignment between anchors, we expect the alignment to stay close to the
diagonal of the DP matrix. Banding is often applicable.
Minimap2 uses a 2-piece affine gap cost
$\gamma(l)=\min\{q+l\cdot e,\tilde{q}+l\cdot\tilde{e}\}$.
@ -151,57 +151,154 @@ alignment, but this time with the one subsequence reverse complemented. This
additional alignment step may identify short inversions that are missed during
chaining.
%\begin{equation}\label{eq:ae86}
%\left\{\begin{array}{l}
%H_{ij} = \max\{H_{i-1,j-1}+s(i,j),E_{ij},F_{ij},\tilde{E}_{ij},\tilde{F}_{ij}\}\\
%E_{i+1,j}= \max\{H_{ij}-q,E_{ij}\}-e\\
%F_{i,j+1}= \max\{H_{ij}-q,F_{ij}\}-e\\
%\tilde{E}_{i+1,j}= \max\{H_{ij}-\tilde{q},\tilde{E}_{ij}\}-\tilde{e}\\
%\tilde{F}_{i,j+1}= \max\{H_{ij}-\tilde{q},\tilde{F}_{ij}\}-\tilde{e}
%\end{array}\right.
%\end{equation}
%where $s(i,j)$ is the score between the $i$-th reference base and $j$-th query
%base. If we define~\citep{Wu:1996aa,Suzuki:2016}
%\[
%\left\{\begin{array}{ll}
%u_{ij}\triangleq H_{ij}-H_{i-1,j} & v_{ij}\triangleq H_{ij}-H_{i,j-1} \\
%x_{ij}\triangleq E_{i+1,j}-H_{ij} & \tilde{x}_{ij}\triangleq \tilde{E}_{i+1,j}-\tilde{H}_{ij} \\
%y_{ij}\triangleq F_{i,j+1}-H_{ij} & \tilde{y}_{ij}\triangleq \tilde{F}_{i,j+1}-\tilde{H}_{ij}
%\end{array}\right.
%\]
%we can transform Eq.~(\ref{eq:ae86}) to
%\[
%\left\{\begin{array}{lll}
%z_{ij}&=&\max\{s(i,j),x_{i-1,j}+v_{i-1,j},y_{i,j-1}+u_{i,j-1},\\
%&&\tilde{x}_{i-1,j}+v_{i-1,j},\tilde{y}_{i,j-1}+u_{i,j-1}\}\\
%u_{ij}&=&z_{ij}-v_{i-1,j}\\
%v_{ij}&=&z_{ij}-u_{i,j-1}\\
%x_{ij}&=&\max\{0,x_{i-1,j}+v_{i-1,j}-z_{ij}+q\}-q-e\\
%y_{ij}&=&\max\{0,y_{i,j-1}+u_{i,j-1}-z_{ij}+q\}-q-e\\
%\tilde{x}_{ij}&=&\max\{0,\tilde{x}_{i-1,j}+v_{i-1,j}-z_{ij}+\tilde{q}\}-\tilde{q}-\tilde{e}\\
%\tilde{y}_{ij}&=&\max\{0,\tilde{y}_{i,j-1}+u_{i,j-1}-z_{ij}+\tilde{q}\}-\tilde{q}-\tilde{e}
%\end{array}\right.
%\]
%with boundary conditions
%\[
%\left\{\begin{array}{l}
%x_{-1,\cdot}=y_{\cdot,-1}=-q-e\\
%\tilde{x}_{-1,\cdot}=\tilde{y}_{\cdot,-1}=-\tilde{q}-\tilde{e}\\
%u_{i,-1}=\eta(i)\\
%v_{-1,j}=\eta(j)
%\end{array}\right.
%\]
%where
%\[
%\eta(k)=\left\{\begin{array}{ll}
%-q-e & (k=0) \\
%-e & (k<\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil) \\
%i\cdot(e-\tilde{e})-(\tilde{q}-q)-\tilde{e} & (k=\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil) \\
%-\tilde{e} & (k>\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil)
%\end{array}\right.
%\]
\end{methods}
\section{Results and discussions}
\begin{figure}[!tb]
\centering
\includegraphics[width=.5\textwidth]{roc-color.pdf}
\caption{Evaluation on simulated SMRT reads aligned against human genome
GRCh38. (a) ROC-like curve. (b) Accumulative mapping error rate as a function
of mapping quality. 33,088 $\ge$1000bp reads were simulated using
pbsim~\citep{Ono:2013aa} with error profile sampled from file
`m131017\_060208\_42213\_*.1.*' downloaded at
\href{http://bit.ly/chm1p5c3}{http://bit.ly/chm1p5c3}. The N50 read length is
11,628. A read is considered correctly mapped if the true position overlaps
with the best mapping position by 10\% of the read length. All aligners were
run under the default setting for SMRT reads.}\label{fig:eval}
\end{figure}
As a sanity check, we evaluated minimap2 on simulated human reads along with
BLASR~\citep{Chaisson:2012aa},
BWA-MEM~\citep{Li:2013aa},
GraphMap~\citep{Sovic:2016aa},
minialign~\citep{Suzuki:2016} and
NGMLR~\citep{Sedlazeck169557}. We excluded rHAT~\citep{Liu:2016ab},
LAMSA~\citep{Liu:2017aa} and Kart~\citep{Lin:2017aa} because they either
crashed or produced malformatted SAM. In this evaluation, Minimap2 has a
higher power to distinguish unique and repetitive hits, and achieves overall
higher mapping accuracy (Fig.~\ref{fig:eval}a). Minimap2 and NGMLR provide
better mapping quality estimate: they rarely give repetitive hits high mapping
quality (Fig.~\ref{fig:eval}b). Apparently, other aligners may occasionally
miss close suboptimal hits and be overconfident in wrong mappings. On run time,
minialign is slightly faster than minimap2. They are over 30 times faster than
the rest.
On real SMRT reads from human, the relative performance and sensitivity of
these aligners are broadly similar to those on simulated data. We are unable to
provide a good estimate of mapping error rate due to the lack of the truth. On
ONT ultra-long human reads~\citep{Jain128835}, BWA-MEM failed. Minialign and
minimap2 are over 70 times faster than others. In addition to reference-based
read mapping, minimap2 can also find overlaps between long reads and align
long-read assemblies.
\bibliography{minimap2}
\pagebreak
\begin{methods}
\section*{Appendix}
A 2-piece gap cost function is
\[
\gamma(l)=\min\{q+l\cdot e,\tilde{q}+l\cdot\tilde{e}\}
\]
Without losing generality, we assume $q+e\le\tilde{q}+\tilde{e}$. The equation
to compute the optimal alignment under such a gap cost is
\begin{equation}\label{eq:ae86}
\left\{\begin{array}{l}
H_{ij} = \max\{H_{i-1,j-1}+s(i,j),E_{ij},F_{ij},\tilde{E}_{ij},\tilde{F}_{ij}\}\\
E_{i+1,j}= \max\{H_{ij}-q,E_{ij}\}-e\\
F_{i,j+1}= \max\{H_{ij}-q,F_{ij}\}-e\\
\tilde{E}_{i+1,j}= \max\{H_{ij}-\tilde{q},\tilde{E}_{ij}\}-\tilde{e}\\
\tilde{F}_{i,j+1}= \max\{H_{ij}-\tilde{q},\tilde{F}_{ij}\}-\tilde{e}
\end{array}\right.
\end{equation}
where $s(i,j)$ is the score between the $i$-th reference base and $j$-th query
base. If we define
\[
\left\{\begin{array}{ll}
u_{ij}\triangleq H_{ij}-H_{i-1,j} & v_{ij}\triangleq H_{ij}-H_{i,j-1} \\
x_{ij}\triangleq E_{i+1,j}-H_{ij} & \tilde{x}_{ij}\triangleq \tilde{E}_{i+1,j}-\tilde{H}_{ij} \\
y_{ij}\triangleq F_{i,j+1}-H_{ij} & \tilde{y}_{ij}\triangleq \tilde{F}_{i,j+1}-\tilde{H}_{ij}
\end{array}\right.
\]
we can transform Eq.~(\ref{eq:ae86}) to
\begin{equation}\label{eq:suzuki}
\left\{\begin{array}{lll}
z_{ij}&=&\max\{s(i,j),x_{i-1,j}+v_{i-1,j},y_{i,j-1}+u_{i,j-1},\\
&&\tilde{x}_{i-1,j}+v_{i-1,j},\tilde{y}_{i,j-1}+u_{i,j-1}\}\\
u_{ij}&=&z_{ij}-v_{i-1,j}\\
v_{ij}&=&z_{ij}-u_{i,j-1}\\
x_{ij}&=&\max\{0,x_{i-1,j}+v_{i-1,j}-z_{ij}+q\}-q-e\\
y_{ij}&=&\max\{0,y_{i,j-1}+u_{i,j-1}-z_{ij}+q\}-q-e\\
\tilde{x}_{ij}&=&\max\{0,\tilde{x}_{i-1,j}+v_{i-1,j}-z_{ij}+\tilde{q}\}-\tilde{q}-\tilde{e}\\
\tilde{y}_{ij}&=&\max\{0,\tilde{y}_{i,j-1}+u_{i,j-1}-z_{ij}+\tilde{q}\}-\tilde{q}-\tilde{e}
\end{array}\right.
\end{equation}
where $z_{ij}$ is a temporary variable that does not need to be stored. We can
see that
\[
x_{ij}=E_{i+1,j}-H_{ij}=\max\{-q,E_{ij}-H_{ij}\}-e
\]
With $E_{ij}\le H_{ij}$, we have
\[
-q-e\le x_{ij}\le\max\{-q,0\}-e=-e
\]
and similar inequations for $y_{ij}$, $\tilde{x}_{ij}$ and $\tilde{y}_{ij}$.
In addition,
\[
u_{ij}=z_{ij}-v_{i-1,j}\ge\max\{x_{i-1,j},\tilde{x}_{i-1,j}\}\ge-q-e
\]
We also note that the maximum possible $z_{ij}=H_{ij}-H_{i-1,j-1}$ is $M$, the
maximal matching score. As a result,
\[
u_{ij}\le M-v_{i-1,j}\le M+q+e
\]
In conclusion, all values in Eq.~(\ref{eq:suzuki}) are bounded: $x$ and $y$ by
$[-q-e,-e]$ and $\tilde{x}$, $\tilde{y}$ by
$[-\tilde{q}-\tilde{e},-\tilde{e}]$, and $u$ and $v$ by $[-q-e,M+q+e]$. When
matching score and gap cost are small, each of them can be stored as a 8-bit
integer. This enables efficient SSE vectorization regardless of the peak score
of the alignment.
For more efficient SSE implementation, we transform the row-column coordinate
to diagonal-anti-diagonal coordinate by letting $r\gets i+j$ and $t\gets i$.
Eq.~(\ref{eq:suzuki}) becomes:
\begin{equation*}
\left\{\begin{array}{lll}
z_{rt}&=&\max\{s(t,r-t),x_{r-1,t-1}+v_{r-1,t-1},y_{r-1,t}+u_{r-1,t},\\
&&\tilde{x}_{r-1,t-1}+v_{r-1,t-1},\tilde{y}_{r-1,t}+u_{r-1,t}\}\\
u_{rt}&=&z_{rt}-v_{r-1,t-1}\\
v_{rt}&=&z_{rt}-u_{r-1,t}\\
x_{rt}&=&\max\{0,x_{r-1,t-1}+v_{r-1,t-1}-z_{rt}+q\}-q-e\\
y_{rt}&=&\max\{0,y_{r-1,t}+u_{r-1,t}-z_{rt}+q\}-q-e\\
\tilde{x}_{rt}&=&\max\{0,\tilde{x}_{r-1,t-1}+v_{r-1,t-1}-z_{rt}+\tilde{q}\}-\tilde{q}-\tilde{e}\\
\tilde{y}_{rt}&=&\max\{0,\tilde{y}_{r-1,t}+u_{r-1,t}-z_{rt}+\tilde{q}\}-\tilde{q}-\tilde{e}
\end{array}\right.
\end{equation*}
In this formulation, cells with the same row index $r$ are independent of each
other. This allows us to vectorize the computation of all cells on the same
anti-diagonal in one inner loop.
On the condition that $q+e<\tilde{q}+\tilde{e}$ and $e>\tilde{e}$, the boundary
condition of this equation in the diagonal-anti-diagonal coordinate is
\[
\left\{\begin{array}{l}
x_{r-1,-1}=y_{r-1,r}=-q-e\\
\tilde{x}_{r-1,-1}=\tilde{y}_{r-1,r}=-\tilde{q}-\tilde{e}\\
u_{r-1,r}=v_{r-1,-1}=\eta(r)\\
\end{array}\right.
\]
where
\[
\eta(r)=\left\{\begin{array}{ll}
-q-e & (r=0) \\
-e & (r<\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil) \\
r\cdot(e-\tilde{e})-(\tilde{q}-q)-\tilde{e} & (r=\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil) \\
-\tilde{e} & (r>\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil)
\end{array}\right.
\]
\citet{Suzuki:2016} first derived a similar set of equations under affine gap
cost but with different notations.
\end{methods}
\end{document}

View File

@ -1,28 +1,52 @@
set t po eps enh co so "Helvetica,18"
set t po eps enh co so "Helvetica,26"
set style line 1 lt 1 pt 1 lc rgb "#FF0000" lw 2;
set style line 2 lt 1 pt 2 lc rgb "#00C000" lw 2;
set style line 3 lt 1 pt 3 lc rgb "#0080FF" lw 2;
set style line 4 lt 1 pt 4 lc rgb "#C000FF" lw 2;
set style line 5 lt 1 pt 5 lc rgb "#00EEEE" lw 2;
set style line 6 lt 1 pt 6 lc rgb "#C04000" lw 2;
set style line 7 lt 1 lc rgb "#C8C800" lw 2;
set style line 8 lt 1 lc rgb "#FF80FF" lw 2;
set style line 9 lt 1 lc rgb "#4E642E" lw 2;
set style line 10 lt 1 lc rgb "#800000" lw 2;
set style line 11 lt 1 lc rgb "#67B7F7" lw 2;
set style line 12 lt 1 lc rgb "#FFC127" lw 2;
set xlab "False positive rate"
set ylab "Sensitivity"
set yran [0.9:1]
set style line 1 lt 1 pt 1 lc rgb "#e41a1c" lw 2;
set style line 2 lt 1 pt 2 lc rgb "#377eb8" lw 2;
set style line 3 lt 1 pt 3 lc rgb "#4daf4a" lw 2;
set style line 4 lt 1 pt 4 lc rgb "#984ea3" lw 2;
set style line 5 lt 1 pt 6 lc rgb "#ff7f00" lw 2;
set style line 6 lt 1 pt 8 lc rgb "#f781bf" lw 2;
set out "roc-color.eps"
set pointsize 2.0
set size 1.59,1.04
set multiplot layout 1,2
set label "(a)" at graph -0.245,1.06 font "Helvetica-bold,40"
set xlab "Error rate of mapped reads"
set ylab "Fraction of mapped reads" off +1.8
set ytics 0.02
set yran [0.9:1]
set size 0.8,1
set log x
set format x "10^{%L}"
set key bot right
plot "<./eval2roc.pl blasr-mc.eval" u 2:3 t "blasr-mc" w lp ls 1, \
plot "<./eval2roc.pl blasr-mc.eval" u 2:3 t "blasr-mc" w lp ls 4, \
"<./eval2roc.pl bwa.eval" u 2:3 t "bwa-mem" w lp ls 2, \
"<./eval2roc.pl minialign.eval" u 2:3 t "minialign" w lp ls 3, \
"<./eval2roc.pl mm2.eval" u 2:3 t "minimap2" w lp ls 4, \
"<./eval2roc.pl graphmap.eval" u 2:3 t "graphmap" w lp ls 3, \
"<./eval2roc.pl minialign.eval" u 2:3 t "minialign" w lp ls 1, \
"<./eval2roc.pl mm2.eval" u 2:3 t "minimap2" w lp ls 6, \
"<./eval2roc.pl ngmlr.eval" u 2:3 t "ngm-lr" w lp ls 5
unset label
set origin 0.8,0
set size 0.79,1
set label "(b)" at graph -0.245,1.06 font "Helvetica-bold,40"
unset log
unset format
unset key
set log y
set ylab "Accumulative mapping error rate" off +0
set xlab "Mapping quality"
set yran [1e-5:0.1]
set ytics 1e-5,0.1
set format y "10^{%L}"
set xran [60:0] reverse
plot "<./eval2roc.pl blasr-mc.eval" u 1:2 w lp ls 4, \
"<./eval2roc.pl bwa.eval" u 1:2 t "bwa-mem" w lp ls 2, \
"<./eval2roc.pl graphmap.eval" u 1:2 t "graphmap" w lp ls 3, \
"<./eval2roc.pl minialign.eval" u 1:2 t "minialign" w lp ls 1, \
"<./eval2roc.pl mm2.eval" u 1:2 t "minimap2" w lp ls 6, \
"<./eval2roc.pl ngmlr.eval" u 1:2 t "ngm-lr" w lp ls 5