backup
This commit is contained in:
parent
9306299e4d
commit
6c9390b54a
|
|
@ -11,7 +11,10 @@
|
||||||
|
|
||||||
all:minimap2.pdf
|
all:minimap2.pdf
|
||||||
|
|
||||||
minimap2.pdf:minimap2.tex minimap2.bib
|
roc-color.eps:roc.gp
|
||||||
|
gnuplot roc.gp
|
||||||
|
|
||||||
|
minimap2.pdf:minimap2.tex minimap2.bib roc-color.pdf
|
||||||
pdflatex minimap2; bibtex minimap2; pdflatex minimap2; pdflatex minimap2;
|
pdflatex minimap2; bibtex minimap2; pdflatex minimap2; pdflatex minimap2;
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
|
|
|
||||||
|
|
@ -455,7 +455,7 @@
|
||||||
\vspace*{\aboveskipchk}%
|
\vspace*{\aboveskipchk}%
|
||||||
\vspace{\dropfromtop}%
|
\vspace{\dropfromtop}%
|
||||||
\hbox to \textwidth{%
|
\hbox to \textwidth{%
|
||||||
{\helvetica\itshape\bfseries\fontsize{19}{12}\selectfont {\color{gray}TECHNICAL NOTES}
|
{\helvetica\itshape\bfseries\fontsize{19}{12}\selectfont {\color{gray}TECHNICAL REPORT}
|
||||||
\hfil
|
\hfil
|
||||||
\if@appnotes APPLICATIONS NOTE\hfil\fi
|
\if@appnotes APPLICATIONS NOTE\hfil\fi
|
||||||
}%
|
}%
|
||||||
|
|
|
||||||
|
|
@ -4,17 +4,30 @@ use strict;
|
||||||
use warnings;
|
use warnings;
|
||||||
use Getopt::Std;
|
use Getopt::Std;
|
||||||
|
|
||||||
my %opts = (n=>33088);
|
my %opts = (n=>33088, s=>100);
|
||||||
getopts('n:', \%opts);
|
getopts('n:', \%opts);
|
||||||
|
|
||||||
my $pseudo = .5;
|
my $pseudo = .5;
|
||||||
my $tot = $pseudo;
|
my $tot = $pseudo;
|
||||||
my $err = $pseudo;
|
my $err = $pseudo;
|
||||||
|
my $tot_last_out = -$opts{s};
|
||||||
|
my $state = 0;
|
||||||
|
my $mapq = 0;
|
||||||
while (<>) {
|
while (<>) {
|
||||||
chomp;
|
chomp;
|
||||||
if (/^Q\t(\d+)\t(\d+)\t(\d+)/) {
|
if (/^Q\t(\d+)\t(\d+)\t(\d+)/) {
|
||||||
$tot += $2;
|
$tot += $2;
|
||||||
$err += $3;
|
$err += $3;
|
||||||
print join("\t", $1, $err/$tot, $tot / $opts{n}), "\n";
|
if ($tot - $tot_last_out >= $opts{s}) {
|
||||||
|
print join("\t", $1, $err/$tot, $tot / $opts{n}), "\n";
|
||||||
|
$tot_last_out = $tot;
|
||||||
|
$state = 0;
|
||||||
|
} else {
|
||||||
|
$state = 1;
|
||||||
|
$mapq = $1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if ($state) {
|
||||||
|
print join("\t", $mapq, $err/$tot, $tot / $opts{n}), "\n";
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@ -7,7 +7,7 @@
|
||||||
Year = {2012}}
|
Year = {2012}}
|
||||||
|
|
||||||
@article{Liu:2016ab,
|
@article{Liu:2016ab,
|
||||||
Author = {Liu, Bo and Guan, Dengfeng and Teng, Mingxiang and Wang, Yadong},
|
Author = {Liu, Bo and others},
|
||||||
Journal = {Bioinformatics},
|
Journal = {Bioinformatics},
|
||||||
Pages = {1625-31},
|
Pages = {1625-31},
|
||||||
Title = {{rHAT}: fast alignment of noisy long reads with regional hashing},
|
Title = {{rHAT}: fast alignment of noisy long reads with regional hashing},
|
||||||
|
|
@ -15,7 +15,7 @@
|
||||||
Year = {2016}}
|
Year = {2016}}
|
||||||
|
|
||||||
@article{Liu:2017aa,
|
@article{Liu:2017aa,
|
||||||
Author = {Liu, Bo and Gao, Yan and Wang, Yadong},
|
Author = {Liu, Bo and others},
|
||||||
Journal = {Bioinformatics},
|
Journal = {Bioinformatics},
|
||||||
Pages = {192-201},
|
Pages = {192-201},
|
||||||
Title = {{LAMSA}: fast split read alignment with long approximate matches},
|
Title = {{LAMSA}: fast split read alignment with long approximate matches},
|
||||||
|
|
@ -38,7 +38,7 @@
|
||||||
Year = {2013}}
|
Year = {2013}}
|
||||||
|
|
||||||
@article{Sovic:2016aa,
|
@article{Sovic:2016aa,
|
||||||
Author = {Sovi{\'c}, Ivan and {\v S}iki{\'c}, Mile and Wilm, Andreas and Fenlon, Shannon Nicole and Chen, Swaine and others},
|
Author = {Sovi{\'c}, Ivan and others},
|
||||||
Journal = {Nat Commun},
|
Journal = {Nat Commun},
|
||||||
Pages = {11307},
|
Pages = {11307},
|
||||||
Title = {Fast and sensitive mapping of nanopore sequencing reads with {GraphMap}},
|
Title = {Fast and sensitive mapping of nanopore sequencing reads with {GraphMap}},
|
||||||
|
|
@ -68,6 +68,13 @@
|
||||||
howpublished = {\href{https://github.com/ocxtal/minialign}{https://github.com/ocxtal/minialign}},
|
howpublished = {\href{https://github.com/ocxtal/minialign}{https://github.com/ocxtal/minialign}},
|
||||||
year = {2016}}
|
year = {2016}}
|
||||||
|
|
||||||
|
@misc{Ruan:2016,
|
||||||
|
title = {Ultra-fast de novo assembler using long noisy reads},
|
||||||
|
author = {Jue Ruan},
|
||||||
|
journal = {Unpublished},
|
||||||
|
howpublished = {\href{https://github.com/ruanjue/smartdenovo}{https://github.com/ruanjue/smartdenovo}},
|
||||||
|
year = {2016}}
|
||||||
|
|
||||||
@article{Miller:1988aa,
|
@article{Miller:1988aa,
|
||||||
Author = {Miller, W and Myers, E W},
|
Author = {Miller, W and Myers, E W},
|
||||||
Journal = {Bull Math Biol},
|
Journal = {Bull Math Biol},
|
||||||
|
|
@ -86,7 +93,7 @@
|
||||||
Year = {1990}}
|
Year = {1990}}
|
||||||
|
|
||||||
@article{Wu:1996aa,
|
@article{Wu:1996aa,
|
||||||
Author = {Wu, Sun and Manber, U and Myers, Gene},
|
Author = {Wu, Sun and others},
|
||||||
Journal = {Algorithmica},
|
Journal = {Algorithmica},
|
||||||
Pages = {50-67},
|
Pages = {50-67},
|
||||||
Title = {A subquadratic algorithm for approximate limited expression matching},
|
Title = {A subquadratic algorithm for approximate limited expression matching},
|
||||||
|
|
@ -98,22 +105,22 @@
|
||||||
Journal = {BMC Bioinformatics},
|
Journal = {BMC Bioinformatics},
|
||||||
Month = {Feb},
|
Month = {Feb},
|
||||||
Pages = {81},
|
Pages = {81},
|
||||||
Title = {Parasail: SIMD C library for global, semi-global, and local pairwise sequence alignments},
|
Title = {Parasail: {SIMD C} library for global, semi-global, and local pairwise sequence alignments},
|
||||||
Volume = {17},
|
Volume = {17},
|
||||||
Year = {2016}}
|
Year = {2016}}
|
||||||
|
|
||||||
@article{Sedlazeck169557,
|
@article{Sedlazeck169557,
|
||||||
author = {Sedlazeck, Fritz J and Rescheneder, Philipp and Smolka, Moritz and Fang, Han and Nattestad, Maria and others},
|
author = {Sedlazeck, Fritz J and others},
|
||||||
title = {Accurate detection of complex structural variations using single molecule sequencing},
|
title = {Accurate detection of complex structural variations using single molecule sequencing},
|
||||||
note = {doi:10.1101/169557},
|
note = {doi:10.1101/169557},
|
||||||
journal = {bioRxiv},
|
journal = {bioRxiv},
|
||||||
year = {2017}}
|
year = {2017}}
|
||||||
|
|
||||||
@article{Altschul:1997vn,
|
@article{Altschul:1997vn,
|
||||||
Author = {Altschul, S F and Madden, T L and Sch{\"a}ffer, A A and Zhang, J and Zhang, Z and others},
|
Author = {Altschul, S F and others},
|
||||||
Journal = {Nucleic Acids Res},
|
Journal = {Nucleic Acids Res},
|
||||||
Pages = {3389-402},
|
Pages = {3389-402},
|
||||||
Title = {Gapped BLAST and PSI-BLAST: a new generation of protein database search programs},
|
Title = {Gapped {BLAST} and {PSI-BLAST}: a new generation of protein database search programs},
|
||||||
Volume = {25},
|
Volume = {25},
|
||||||
Year = {1997}}
|
Year = {1997}}
|
||||||
|
|
||||||
|
|
@ -132,3 +139,27 @@
|
||||||
Title = {Chaining algorithms for multiple genome comparison},
|
Title = {Chaining algorithms for multiple genome comparison},
|
||||||
Volume = {3},
|
Volume = {3},
|
||||||
Year = {2005}}
|
Year = {2005}}
|
||||||
|
|
||||||
|
@article{Ono:2013aa,
|
||||||
|
Author = {Ono, Yukiteru and others},
|
||||||
|
Journal = {Bioinformatics},
|
||||||
|
Pages = {119-21},
|
||||||
|
Title = {{PBSIM}: {PacBio} reads simulator--toward accurate genome assembly},
|
||||||
|
Volume = {29},
|
||||||
|
Year = {2013}}
|
||||||
|
|
||||||
|
@article {Jain128835,
|
||||||
|
author = {Jain, Miten and others},
|
||||||
|
title = {Nanopore sequencing and assembly of a human genome with ultra-long reads},
|
||||||
|
year = {2017},
|
||||||
|
note = {doi:10.1101/128835},
|
||||||
|
publisher = {Cold Spring Harbor Labs Journals},
|
||||||
|
journal = {bioRxiv}}
|
||||||
|
|
||||||
|
@article{Lau:2016aa,
|
||||||
|
Author = {Lau, Bayo and others},
|
||||||
|
Journal = {Bioinformatics},
|
||||||
|
Pages = {3829-3832},
|
||||||
|
Title = {LongISLND: in silico sequencing of lengthy and noisy datatypes},
|
||||||
|
Volume = {32},
|
||||||
|
Year = {2016}}
|
||||||
|
|
|
||||||
241
tex/minimap2.tex
241
tex/minimap2.tex
|
|
@ -2,6 +2,7 @@
|
||||||
\copyrightyear{2017}
|
\copyrightyear{2017}
|
||||||
\pubyear{2017}
|
\pubyear{2017}
|
||||||
|
|
||||||
|
\usepackage{graphicx}
|
||||||
\usepackage{hyperref}
|
\usepackage{hyperref}
|
||||||
\usepackage{url}
|
\usepackage{url}
|
||||||
\usepackage{amsmath}
|
\usepackage{amsmath}
|
||||||
|
|
@ -19,7 +20,7 @@
|
||||||
\begin{document}
|
\begin{document}
|
||||||
\firstpage{1}
|
\firstpage{1}
|
||||||
|
|
||||||
\title[Long-read and assembly alignment with minimap2]{Minimap2: fast sequence alignment for long noisy reads and assembly contigs}
|
\title[Long sequence alignment with minimap2]{Minimap2: fast pairwise alignment for long noisy sequences}
|
||||||
\author[Li]{Heng Li}
|
\author[Li]{Heng Li}
|
||||||
\address{Broad Institute, 415 Main Street, Cambridge, MA 02142, USA}
|
\address{Broad Institute, 415 Main Street, Cambridge, MA 02142, USA}
|
||||||
|
|
||||||
|
|
@ -29,7 +30,7 @@
|
||||||
\section{Summary:} Minimap2 is a program to align long noisy sequences against
|
\section{Summary:} Minimap2 is a program to align long noisy sequences against
|
||||||
a large reference database. It targets query sequences of 1kb--100Mb in length
|
a large reference database. It targets query sequences of 1kb--100Mb in length
|
||||||
with sequence divergence typically below 25\%. Minimap2 is $\sim$30 times
|
with sequence divergence typically below 25\%. Minimap2 is $\sim$30 times
|
||||||
faster than most existing long-read aligners and achieves higher accuracy on
|
faster than many mainstream long-read aligners and achieves higher accuracy on
|
||||||
simulated data. It also employs concave gap cost and rescues inversions for
|
simulated data. It also employs concave gap cost and rescues inversions for
|
||||||
improved alignment around potential structural variations.
|
improved alignment around potential structural variations.
|
||||||
|
|
||||||
|
|
@ -60,10 +61,10 @@ towards higher accuracy.
|
||||||
|
|
||||||
Minimap2 is the successor of minimap~\citep{Li:2016aa}. It uses similar
|
Minimap2 is the successor of minimap~\citep{Li:2016aa}. It uses similar
|
||||||
indexing and seeding algorithms except that minimap2 optionally uses
|
indexing and seeding algorithms except that minimap2 optionally uses
|
||||||
homopolymer-compressed (HPC; cite) $k$-mers in addition to normal $k$-mers.
|
homopolymer-compressed (HPC; \citealp{Ruan:2016,Lau:2016aa}) $k$-mers in
|
||||||
Indexing with HPC $k$-mers leads to higher mapping sensitivity for SMRT reads.
|
addition to normal $k$-mers. Indexing with HPC $k$-mers leads to higher
|
||||||
Minimap2 further implements a more accurate chaining algorithm and adds
|
mapping sensitivity for SMRT reads. Minimap2 further implements a more
|
||||||
the ability to produce detailed alignment.
|
accurate chaining algorithm and adds the ability to produce detailed alignment.
|
||||||
|
|
||||||
\subsection{Chaining}
|
\subsection{Chaining}
|
||||||
|
|
||||||
|
|
@ -84,20 +85,19 @@ distance between two anchors is too large); otherwise
|
||||||
\gamma(j,i)=\gamma'(\max\{y_i-y_j,x_i-x_j\}-\min\{y_i-y_j,x_i-x_j\})
|
\gamma(j,i)=\gamma'(\max\{y_i-y_j,x_i-x_j\}-\min\{y_i-y_j,x_i-x_j\})
|
||||||
\]
|
\]
|
||||||
In implementation, a gap of length $l$ costs $\gamma'(l)=\alpha\cdot
|
In implementation, a gap of length $l$ costs $\gamma'(l)=\alpha\cdot
|
||||||
l+\beta\log_2(l)$. For $m$ anchors, computing all $f(\cdot)$
|
l+\beta\log_2(l)$. For $m$ anchors, directly computing all $f(\cdot)$ with
|
||||||
with Eq.~(\ref{eq:chain}) takes $O(m^2)$ time. We note that if anchor $i$ is
|
Eq.~(\ref{eq:chain}) takes $O(m^2)$ time. Although theoretically faster
|
||||||
appended to $j$, appending $i$ to a predecessor of $j$ is likely to yield a
|
chaining algorithms exist~\citep{Abouelhoda:2005aa}, they
|
||||||
lower score. When evaluating Eq.~(\ref{eq:chain}), we start from anchor $i-1$
|
are inapplicable to generic gap cost, complex to implement and usually
|
||||||
and stop the evaluation if we cannot find a better score after up to $h$
|
associated with a large constant. We introduced a simple heuristic to accelerate
|
||||||
iterations. This heuristic reduces the average time to $O(h\cdot m)$. In
|
chaining.
|
||||||
practical, we can almost always find the optimal chain with $h=50$; even if the
|
|
||||||
heuristic fails, the optimal chain often looks dubious.
|
|
||||||
|
|
||||||
%Although theoretically faster chaining algorithms exist for simple gap
|
We note that if anchor $i$ is appended to $j$, appending $i$ to a predecessor
|
||||||
%cost~\citep{Abouelhoda:2005aa}, they are not as flexible as DP and may not lead
|
of $j$ is likely to yield a lower score. When evaluating Eq.~(\ref{eq:chain}),
|
||||||
%to better performance than our approach in practice. Furthermore, chaining
|
we start from anchor $i-1$ and stop the evaluation if we cannot find a better
|
||||||
%takes much less computing time than alignment. It is not critical to the
|
score after up to $h$ iterations. This heuristic reduces the average time to
|
||||||
%performance of minimap2.
|
$O(h\cdot m)$. In practice, we can almost always find the optimal chain with
|
||||||
|
$h=50$; even if the heuristic fails, the optimal chain often looks dubious.
|
||||||
|
|
||||||
\subsubsection{Backtracking}
|
\subsubsection{Backtracking}
|
||||||
Let $P(i)$ be the index of the best predecessor of anchor $i$. It equals 0 if
|
Let $P(i)$ be the index of the best predecessor of anchor $i$. It equals 0 if
|
||||||
|
|
@ -120,9 +120,9 @@ banded alignment, which is critical to performance. In practice, our
|
||||||
implementation is three times as fast as Parasail's 4-way
|
implementation is three times as fast as Parasail's 4-way
|
||||||
vectorization~\citep{Daily:2016aa} for global alignment.
|
vectorization~\citep{Daily:2016aa} for global alignment.
|
||||||
Without banding, our implementation is slower than Edlib~\citep{Sosic:2017aa},
|
Without banding, our implementation is slower than Edlib~\citep{Sosic:2017aa},
|
||||||
but with a 1000bp band, it is much faster. When performing global alignment
|
but with a 1000bp band, it is considerably faster. When performing global
|
||||||
between anchors, we expect the alignment to stay close to the diagonal of the
|
alignment between anchors, we expect the alignment to stay close to the
|
||||||
DP matrix. Banding is applicable most of time.
|
diagonal of the DP matrix. Banding is often applicable.
|
||||||
|
|
||||||
Minimap2 uses a 2-piece affine gap cost
|
Minimap2 uses a 2-piece affine gap cost
|
||||||
$\gamma(l)=\min\{q+l\cdot e,\tilde{q}+l\cdot\tilde{e}\}$.
|
$\gamma(l)=\min\{q+l\cdot e,\tilde{q}+l\cdot\tilde{e}\}$.
|
||||||
|
|
@ -151,57 +151,154 @@ alignment, but this time with the one subsequence reverse complemented. This
|
||||||
additional alignment step may identify short inversions that are missed during
|
additional alignment step may identify short inversions that are missed during
|
||||||
chaining.
|
chaining.
|
||||||
|
|
||||||
%\begin{equation}\label{eq:ae86}
|
|
||||||
%\left\{\begin{array}{l}
|
|
||||||
%H_{ij} = \max\{H_{i-1,j-1}+s(i,j),E_{ij},F_{ij},\tilde{E}_{ij},\tilde{F}_{ij}\}\\
|
|
||||||
%E_{i+1,j}= \max\{H_{ij}-q,E_{ij}\}-e\\
|
|
||||||
%F_{i,j+1}= \max\{H_{ij}-q,F_{ij}\}-e\\
|
|
||||||
%\tilde{E}_{i+1,j}= \max\{H_{ij}-\tilde{q},\tilde{E}_{ij}\}-\tilde{e}\\
|
|
||||||
%\tilde{F}_{i,j+1}= \max\{H_{ij}-\tilde{q},\tilde{F}_{ij}\}-\tilde{e}
|
|
||||||
%\end{array}\right.
|
|
||||||
%\end{equation}
|
|
||||||
%where $s(i,j)$ is the score between the $i$-th reference base and $j$-th query
|
|
||||||
%base. If we define~\citep{Wu:1996aa,Suzuki:2016}
|
|
||||||
%\[
|
|
||||||
%\left\{\begin{array}{ll}
|
|
||||||
%u_{ij}\triangleq H_{ij}-H_{i-1,j} & v_{ij}\triangleq H_{ij}-H_{i,j-1} \\
|
|
||||||
%x_{ij}\triangleq E_{i+1,j}-H_{ij} & \tilde{x}_{ij}\triangleq \tilde{E}_{i+1,j}-\tilde{H}_{ij} \\
|
|
||||||
%y_{ij}\triangleq F_{i,j+1}-H_{ij} & \tilde{y}_{ij}\triangleq \tilde{F}_{i,j+1}-\tilde{H}_{ij}
|
|
||||||
%\end{array}\right.
|
|
||||||
%\]
|
|
||||||
%we can transform Eq.~(\ref{eq:ae86}) to
|
|
||||||
%\[
|
|
||||||
%\left\{\begin{array}{lll}
|
|
||||||
%z_{ij}&=&\max\{s(i,j),x_{i-1,j}+v_{i-1,j},y_{i,j-1}+u_{i,j-1},\\
|
|
||||||
%&&\tilde{x}_{i-1,j}+v_{i-1,j},\tilde{y}_{i,j-1}+u_{i,j-1}\}\\
|
|
||||||
%u_{ij}&=&z_{ij}-v_{i-1,j}\\
|
|
||||||
%v_{ij}&=&z_{ij}-u_{i,j-1}\\
|
|
||||||
%x_{ij}&=&\max\{0,x_{i-1,j}+v_{i-1,j}-z_{ij}+q\}-q-e\\
|
|
||||||
%y_{ij}&=&\max\{0,y_{i,j-1}+u_{i,j-1}-z_{ij}+q\}-q-e\\
|
|
||||||
%\tilde{x}_{ij}&=&\max\{0,\tilde{x}_{i-1,j}+v_{i-1,j}-z_{ij}+\tilde{q}\}-\tilde{q}-\tilde{e}\\
|
|
||||||
%\tilde{y}_{ij}&=&\max\{0,\tilde{y}_{i,j-1}+u_{i,j-1}-z_{ij}+\tilde{q}\}-\tilde{q}-\tilde{e}
|
|
||||||
%\end{array}\right.
|
|
||||||
%\]
|
|
||||||
%with boundary conditions
|
|
||||||
%\[
|
|
||||||
%\left\{\begin{array}{l}
|
|
||||||
%x_{-1,\cdot}=y_{\cdot,-1}=-q-e\\
|
|
||||||
%\tilde{x}_{-1,\cdot}=\tilde{y}_{\cdot,-1}=-\tilde{q}-\tilde{e}\\
|
|
||||||
%u_{i,-1}=\eta(i)\\
|
|
||||||
%v_{-1,j}=\eta(j)
|
|
||||||
%\end{array}\right.
|
|
||||||
%\]
|
|
||||||
%where
|
|
||||||
%\[
|
|
||||||
%\eta(k)=\left\{\begin{array}{ll}
|
|
||||||
%-q-e & (k=0) \\
|
|
||||||
%-e & (k<\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil) \\
|
|
||||||
%i\cdot(e-\tilde{e})-(\tilde{q}-q)-\tilde{e} & (k=\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil) \\
|
|
||||||
%-\tilde{e} & (k>\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil)
|
|
||||||
%\end{array}\right.
|
|
||||||
%\]
|
|
||||||
|
|
||||||
\end{methods}
|
\end{methods}
|
||||||
|
|
||||||
|
\section{Results and discussions}
|
||||||
|
\begin{figure}[!tb]
|
||||||
|
\centering
|
||||||
|
\includegraphics[width=.5\textwidth]{roc-color.pdf}
|
||||||
|
\caption{Evaluation on simulated SMRT reads aligned against human genome
|
||||||
|
GRCh38. (a) ROC-like curve. (b) Accumulative mapping error rate as a function
|
||||||
|
of mapping quality. 33,088 $\ge$1000bp reads were simulated using
|
||||||
|
pbsim~\citep{Ono:2013aa} with error profile sampled from file
|
||||||
|
`m131017\_060208\_42213\_*.1.*' downloaded at
|
||||||
|
\href{http://bit.ly/chm1p5c3}{http://bit.ly/chm1p5c3}. The N50 read length is
|
||||||
|
11,628. A read is considered correctly mapped if the true position overlaps
|
||||||
|
with the best mapping position by 10\% of the read length. All aligners were
|
||||||
|
run under the default setting for SMRT reads.}\label{fig:eval}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
As a sanity check, we evaluated minimap2 on simulated human reads along with
|
||||||
|
BLASR~\citep{Chaisson:2012aa},
|
||||||
|
BWA-MEM~\citep{Li:2013aa},
|
||||||
|
GraphMap~\citep{Sovic:2016aa},
|
||||||
|
minialign~\citep{Suzuki:2016} and
|
||||||
|
NGMLR~\citep{Sedlazeck169557}. We excluded rHAT~\citep{Liu:2016ab},
|
||||||
|
LAMSA~\citep{Liu:2017aa} and Kart~\citep{Lin:2017aa} because they either
|
||||||
|
crashed or produced malformed SAM. In this evaluation, Minimap2 has a
|
||||||
|
higher power to distinguish unique and repetitive hits, and achieves overall
|
||||||
|
higher mapping accuracy (Fig.~\ref{fig:eval}a). Minimap2 and NGMLR provide
|
||||||
|
better mapping quality estimate: they rarely give repetitive hits high mapping
|
||||||
|
quality (Fig.~\ref{fig:eval}b). Apparently, other aligners may occasionally
|
||||||
|
miss close suboptimal hits and be overconfident in wrong mappings. On run time,
|
||||||
|
minialign is slightly faster than minimap2. They are over 30 times faster than
|
||||||
|
the rest.
|
||||||
|
|
||||||
|
On real SMRT reads from human, the relative performance and sensitivity of
|
||||||
|
these aligners are broadly similar to those on simulated data. We are unable to
|
||||||
|
provide a good estimate of mapping error rate due to the lack of ground truth. On
|
||||||
|
ONT ultra-long human reads~\citep{Jain128835}, BWA-MEM failed. Minialign and
|
||||||
|
minimap2 are over 70 times faster than others. In addition to reference-based
|
||||||
|
read mapping, minimap2 can also find overlaps between long reads and align
|
||||||
|
long-read assemblies.
|
||||||
|
|
||||||
\bibliography{minimap2}
|
\bibliography{minimap2}
|
||||||
|
|
||||||
|
\pagebreak
|
||||||
|
|
||||||
|
\begin{methods}
|
||||||
|
\section*{Appendix}
|
||||||
|
A 2-piece gap cost function is
|
||||||
|
\[
|
||||||
|
\gamma(l)=\min\{q+l\cdot e,\tilde{q}+l\cdot\tilde{e}\}
|
||||||
|
\]
|
||||||
|
Without loss of generality, we assume $q+e\le\tilde{q}+\tilde{e}$. The equation
|
||||||
|
to compute the optimal alignment under such a gap cost is
|
||||||
|
\begin{equation}\label{eq:ae86}
|
||||||
|
\left\{\begin{array}{l}
|
||||||
|
H_{ij} = \max\{H_{i-1,j-1}+s(i,j),E_{ij},F_{ij},\tilde{E}_{ij},\tilde{F}_{ij}\}\\
|
||||||
|
E_{i+1,j}= \max\{H_{ij}-q,E_{ij}\}-e\\
|
||||||
|
F_{i,j+1}= \max\{H_{ij}-q,F_{ij}\}-e\\
|
||||||
|
\tilde{E}_{i+1,j}= \max\{H_{ij}-\tilde{q},\tilde{E}_{ij}\}-\tilde{e}\\
|
||||||
|
\tilde{F}_{i,j+1}= \max\{H_{ij}-\tilde{q},\tilde{F}_{ij}\}-\tilde{e}
|
||||||
|
\end{array}\right.
|
||||||
|
\end{equation}
|
||||||
|
where $s(i,j)$ is the score between the $i$-th reference base and $j$-th query
|
||||||
|
base. If we define
|
||||||
|
\[
|
||||||
|
\left\{\begin{array}{ll}
|
||||||
|
u_{ij}\triangleq H_{ij}-H_{i-1,j} & v_{ij}\triangleq H_{ij}-H_{i,j-1} \\
|
||||||
|
x_{ij}\triangleq E_{i+1,j}-H_{ij} & \tilde{x}_{ij}\triangleq \tilde{E}_{i+1,j}-H_{ij} \\
|
||||||
|
y_{ij}\triangleq F_{i,j+1}-H_{ij} & \tilde{y}_{ij}\triangleq \tilde{F}_{i,j+1}-H_{ij}
|
||||||
|
\end{array}\right.
|
||||||
|
\]
|
||||||
|
we can transform Eq.~(\ref{eq:ae86}) to
|
||||||
|
\begin{equation}\label{eq:suzuki}
|
||||||
|
\left\{\begin{array}{lll}
|
||||||
|
z_{ij}&=&\max\{s(i,j),x_{i-1,j}+v_{i-1,j},y_{i,j-1}+u_{i,j-1},\\
|
||||||
|
&&\tilde{x}_{i-1,j}+v_{i-1,j},\tilde{y}_{i,j-1}+u_{i,j-1}\}\\
|
||||||
|
u_{ij}&=&z_{ij}-v_{i-1,j}\\
|
||||||
|
v_{ij}&=&z_{ij}-u_{i,j-1}\\
|
||||||
|
x_{ij}&=&\max\{0,x_{i-1,j}+v_{i-1,j}-z_{ij}+q\}-q-e\\
|
||||||
|
y_{ij}&=&\max\{0,y_{i,j-1}+u_{i,j-1}-z_{ij}+q\}-q-e\\
|
||||||
|
\tilde{x}_{ij}&=&\max\{0,\tilde{x}_{i-1,j}+v_{i-1,j}-z_{ij}+\tilde{q}\}-\tilde{q}-\tilde{e}\\
|
||||||
|
\tilde{y}_{ij}&=&\max\{0,\tilde{y}_{i,j-1}+u_{i,j-1}-z_{ij}+\tilde{q}\}-\tilde{q}-\tilde{e}
|
||||||
|
\end{array}\right.
|
||||||
|
\end{equation}
|
||||||
|
where $z_{ij}$ is a temporary variable that does not need to be stored. We can
|
||||||
|
see that
|
||||||
|
\[
|
||||||
|
x_{ij}=E_{i+1,j}-H_{ij}=\max\{-q,E_{ij}-H_{ij}\}-e
|
||||||
|
\]
|
||||||
|
With $E_{ij}\le H_{ij}$, we have
|
||||||
|
\[
|
||||||
|
-q-e\le x_{ij}\le\max\{-q,0\}-e=-e
|
||||||
|
\]
|
||||||
|
and similar inequalities for $y_{ij}$, $\tilde{x}_{ij}$ and $\tilde{y}_{ij}$.
|
||||||
|
In addition,
|
||||||
|
\[
|
||||||
|
u_{ij}=z_{ij}-v_{i-1,j}\ge\max\{x_{i-1,j},\tilde{x}_{i-1,j}\}\ge-q-e
|
||||||
|
\]
|
||||||
|
We also note that the maximum possible $z_{ij}=H_{ij}-H_{i-1,j-1}$ is $M$, the
|
||||||
|
maximal matching score. As a result,
|
||||||
|
\[
|
||||||
|
u_{ij}\le M-v_{i-1,j}\le M+q+e
|
||||||
|
\]
|
||||||
|
In conclusion, all values in Eq.~(\ref{eq:suzuki}) are bounded: $x$ and $y$ by
|
||||||
|
$[-q-e,-e]$ and $\tilde{x}$, $\tilde{y}$ by
|
||||||
|
$[-\tilde{q}-\tilde{e},-\tilde{e}]$, and $u$ and $v$ by $[-q-e,M+q+e]$. When
|
||||||
|
matching score and gap cost are small, each of them can be stored as an 8-bit
|
||||||
|
integer. This enables efficient SSE vectorization regardless of the peak score
|
||||||
|
of the alignment.
|
||||||
|
|
||||||
|
For more efficient SSE implementation, we transform the row-column coordinate
|
||||||
|
to diagonal-anti-diagonal coordinate by letting $r\gets i+j$ and $t\gets i$.
|
||||||
|
Eq.~(\ref{eq:suzuki}) becomes:
|
||||||
|
\begin{equation*}
|
||||||
|
\left\{\begin{array}{lll}
|
||||||
|
z_{rt}&=&\max\{s(t,r-t),x_{r-1,t-1}+v_{r-1,t-1},y_{r-1,t}+u_{r-1,t},\\
|
||||||
|
&&\tilde{x}_{r-1,t-1}+v_{r-1,t-1},\tilde{y}_{r-1,t}+u_{r-1,t}\}\\
|
||||||
|
u_{rt}&=&z_{rt}-v_{r-1,t-1}\\
|
||||||
|
v_{rt}&=&z_{rt}-u_{r-1,t}\\
|
||||||
|
x_{rt}&=&\max\{0,x_{r-1,t-1}+v_{r-1,t-1}-z_{rt}+q\}-q-e\\
|
||||||
|
y_{rt}&=&\max\{0,y_{r-1,t}+u_{r-1,t}-z_{rt}+q\}-q-e\\
|
||||||
|
\tilde{x}_{rt}&=&\max\{0,\tilde{x}_{r-1,t-1}+v_{r-1,t-1}-z_{rt}+\tilde{q}\}-\tilde{q}-\tilde{e}\\
|
||||||
|
\tilde{y}_{rt}&=&\max\{0,\tilde{y}_{r-1,t}+u_{r-1,t}-z_{rt}+\tilde{q}\}-\tilde{q}-\tilde{e}
|
||||||
|
\end{array}\right.
|
||||||
|
\end{equation*}
|
||||||
|
In this formulation, cells with the same row index $r$ are independent of each
|
||||||
|
other. This allows us to vectorize the computation of all cells on the same
|
||||||
|
anti-diagonal in one inner loop.
|
||||||
|
|
||||||
|
On the condition that $q+e<\tilde{q}+\tilde{e}$ and $e>\tilde{e}$, the boundary
|
||||||
|
condition of this equation in the diagonal-anti-diagonal coordinate is
|
||||||
|
\[
|
||||||
|
\left\{\begin{array}{l}
|
||||||
|
x_{r-1,-1}=y_{r-1,r}=-q-e\\
|
||||||
|
\tilde{x}_{r-1,-1}=\tilde{y}_{r-1,r}=-\tilde{q}-\tilde{e}\\
|
||||||
|
u_{r-1,r}=v_{r-1,-1}=\eta(r)\\
|
||||||
|
\end{array}\right.
|
||||||
|
\]
|
||||||
|
where
|
||||||
|
\[
|
||||||
|
\eta(r)=\left\{\begin{array}{ll}
|
||||||
|
-q-e & (r=0) \\
|
||||||
|
-e & (r<\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil) \\
|
||||||
|
r\cdot(e-\tilde{e})-(\tilde{q}-q)-\tilde{e} & (r=\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil) \\
|
||||||
|
-\tilde{e} & (r>\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil)
|
||||||
|
\end{array}\right.
|
||||||
|
\]
|
||||||
|
\citet{Suzuki:2016} first derived a similar set of equations under affine gap
|
||||||
|
cost but with different notations.
|
||||||
|
\end{methods}
|
||||||
\end{document}
|
\end{document}
|
||||||
|
|
|
||||||
64
tex/roc.gp
64
tex/roc.gp
|
|
@ -1,28 +1,52 @@
|
||||||
set t po eps enh co so "Helvetica,18"
|
set t po eps enh co so "Helvetica,26"
|
||||||
|
|
||||||
set style line 1 lt 1 pt 1 lc rgb "#FF0000" lw 2;
|
set style line 1 lt 1 pt 1 lc rgb "#e41a1c" lw 2;
|
||||||
set style line 2 lt 1 pt 2 lc rgb "#00C000" lw 2;
|
set style line 2 lt 1 pt 2 lc rgb "#377eb8" lw 2;
|
||||||
set style line 3 lt 1 pt 3 lc rgb "#0080FF" lw 2;
|
set style line 3 lt 1 pt 3 lc rgb "#4daf4a" lw 2;
|
||||||
set style line 4 lt 1 pt 4 lc rgb "#C000FF" lw 2;
|
set style line 4 lt 1 pt 4 lc rgb "#984ea3" lw 2;
|
||||||
set style line 5 lt 1 pt 5 lc rgb "#00EEEE" lw 2;
|
set style line 5 lt 1 pt 6 lc rgb "#ff7f00" lw 2;
|
||||||
set style line 6 lt 1 pt 6 lc rgb "#C04000" lw 2;
|
set style line 6 lt 1 pt 8 lc rgb "#f781bf" lw 2;
|
||||||
set style line 7 lt 1 lc rgb "#C8C800" lw 2;
|
|
||||||
set style line 8 lt 1 lc rgb "#FF80FF" lw 2;
|
|
||||||
set style line 9 lt 1 lc rgb "#4E642E" lw 2;
|
|
||||||
set style line 10 lt 1 lc rgb "#800000" lw 2;
|
|
||||||
set style line 11 lt 1 lc rgb "#67B7F7" lw 2;
|
|
||||||
set style line 12 lt 1 lc rgb "#FFC127" lw 2;
|
|
||||||
|
|
||||||
set xlab "False positive rate"
|
|
||||||
set ylab "Sensitivity"
|
|
||||||
set yran [0.9:1]
|
|
||||||
|
|
||||||
set out "roc-color.eps"
|
set out "roc-color.eps"
|
||||||
|
|
||||||
|
set pointsize 2.0
|
||||||
|
set size 1.59,1.04
|
||||||
|
set multiplot layout 1,2
|
||||||
|
|
||||||
|
set label "(a)" at graph -0.245,1.06 font "Helvetica-bold,40"
|
||||||
|
set xlab "Error rate of mapped reads"
|
||||||
|
set ylab "Fraction of mapped reads" off +1.8
|
||||||
|
set ytics 0.02
|
||||||
|
set yran [0.9:1]
|
||||||
|
|
||||||
|
set size 0.8,1
|
||||||
set log x
|
set log x
|
||||||
set format x "10^{%L}"
|
set format x "10^{%L}"
|
||||||
set key bot right
|
set key bot right
|
||||||
plot "<./eval2roc.pl blasr-mc.eval" u 2:3 t "blasr-mc" w lp ls 1, \
|
plot "<./eval2roc.pl blasr-mc.eval" u 2:3 t "blasr-mc" w lp ls 4, \
|
||||||
"<./eval2roc.pl bwa.eval" u 2:3 t "bwa-mem" w lp ls 2, \
|
"<./eval2roc.pl bwa.eval" u 2:3 t "bwa-mem" w lp ls 2, \
|
||||||
"<./eval2roc.pl minialign.eval" u 2:3 t "minialign" w lp ls 3, \
|
"<./eval2roc.pl graphmap.eval" u 2:3 t "graphmap" w lp ls 3, \
|
||||||
"<./eval2roc.pl mm2.eval" u 2:3 t "minimap2" w lp ls 4, \
|
"<./eval2roc.pl minialign.eval" u 2:3 t "minialign" w lp ls 1, \
|
||||||
|
"<./eval2roc.pl mm2.eval" u 2:3 t "minimap2" w lp ls 6, \
|
||||||
"<./eval2roc.pl ngmlr.eval" u 2:3 t "ngm-lr" w lp ls 5
|
"<./eval2roc.pl ngmlr.eval" u 2:3 t "ngm-lr" w lp ls 5
|
||||||
|
unset label
|
||||||
|
|
||||||
|
set origin 0.8,0
|
||||||
|
set size 0.79,1
|
||||||
|
set label "(b)" at graph -0.245,1.06 font "Helvetica-bold,40"
|
||||||
|
unset log
|
||||||
|
unset format
|
||||||
|
unset key
|
||||||
|
set log y
|
||||||
|
set ylab "Accumulative mapping error rate" off +0
|
||||||
|
set xlab "Mapping quality"
|
||||||
|
set yran [1e-5:0.1]
|
||||||
|
set ytics 1e-5,0.1
|
||||||
|
set format y "10^{%L}"
|
||||||
|
set xran [60:0] reverse
|
||||||
|
plot "<./eval2roc.pl blasr-mc.eval" u 1:2 w lp ls 4, \
|
||||||
|
"<./eval2roc.pl bwa.eval" u 1:2 t "bwa-mem" w lp ls 2, \
|
||||||
|
"<./eval2roc.pl graphmap.eval" u 1:2 t "graphmap" w lp ls 3, \
|
||||||
|
"<./eval2roc.pl minialign.eval" u 1:2 t "minialign" w lp ls 1, \
|
||||||
|
"<./eval2roc.pl mm2.eval" u 1:2 t "minimap2" w lp ls 6, \
|
||||||
|
"<./eval2roc.pl ngmlr.eval" u 1:2 t "ngm-lr" w lp ls 5
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue