backup
This commit is contained in:
parent
9306299e4d
commit
6c9390b54a
|
|
@ -11,7 +11,10 @@
|
|||
|
||||
all:minimap2.pdf
|
||||
|
||||
minimap2.pdf:minimap2.tex minimap2.bib
|
||||
roc-color.eps:roc.gp
|
||||
gnuplot roc.gp
|
||||
|
||||
minimap2.pdf:minimap2.tex minimap2.bib roc-color.pdf
|
||||
pdflatex minimap2; bibtex minimap2; pdflatex minimap2; pdflatex minimap2;
|
||||
|
||||
clean:
|
||||
|
|
|
|||
|
|
@ -455,7 +455,7 @@
|
|||
\vspace*{\aboveskipchk}%
|
||||
\vspace{\dropfromtop}%
|
||||
\hbox to \textwidth{%
|
||||
{\helvetica\itshape\bfseries\fontsize{19}{12}\selectfont {\color{gray}TECHNICAL NOTES}
|
||||
{\helvetica\itshape\bfseries\fontsize{19}{12}\selectfont {\color{gray}TECHNICAL REPORT}
|
||||
\hfil
|
||||
\if@appnotes APPLICATIONS NOTE\hfil\fi
|
||||
}%
|
||||
|
|
|
|||
|
|
@ -4,17 +4,30 @@ use strict;
|
|||
use warnings;
|
||||
use Getopt::Std;
|
||||
|
||||
my %opts = (n=>33088);
|
||||
my %opts = (n=>33088, s=>100);
|
||||
getopts('n:', \%opts);
|
||||
|
||||
my $pseudo = .5;
|
||||
my $tot = $pseudo;
|
||||
my $err = $pseudo;
|
||||
my $tot_last_out = -$opts{s};
|
||||
my $state = 0;
|
||||
my $mapq = 0;
|
||||
while (<>) {
|
||||
chomp;
|
||||
if (/^Q\t(\d+)\t(\d+)\t(\d+)/) {
|
||||
$tot += $2;
|
||||
$err += $3;
|
||||
print join("\t", $1, $err/$tot, $tot / $opts{n}), "\n";
|
||||
if ($tot - $tot_last_out >= $opts{s}) {
|
||||
print join("\t", $1, $err/$tot, $tot / $opts{n}), "\n";
|
||||
$tot_last_out = $tot;
|
||||
$state = 0;
|
||||
} else {
|
||||
$state = 1;
|
||||
$mapq = $1;
|
||||
}
|
||||
}
|
||||
}
|
||||
if ($state) {
|
||||
print join("\t", $mapq, $err/$tot, $tot / $opts{n}), "\n";
|
||||
}
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@
|
|||
Year = {2012}}
|
||||
|
||||
@article{Liu:2016ab,
|
||||
Author = {Liu, Bo and Guan, Dengfeng and Teng, Mingxiang and Wang, Yadong},
|
||||
Author = {Liu, Bo and others},
|
||||
Journal = {Bioinformatics},
|
||||
Pages = {1625-31},
|
||||
Title = {{rHAT}: fast alignment of noisy long reads with regional hashing},
|
||||
|
|
@ -15,7 +15,7 @@
|
|||
Year = {2016}}
|
||||
|
||||
@article{Liu:2017aa,
|
||||
Author = {Liu, Bo and Gao, Yan and Wang, Yadong},
|
||||
Author = {Liu, Bo and others},
|
||||
Journal = {Bioinformatics},
|
||||
Pages = {192-201},
|
||||
Title = {{LAMSA}: fast split read alignment with long approximate matches},
|
||||
|
|
@ -38,7 +38,7 @@
|
|||
Year = {2013}}
|
||||
|
||||
@article{Sovic:2016aa,
|
||||
Author = {Sovi{\'c}, Ivan and {\v S}iki{\'c}, Mile and Wilm, Andreas and Fenlon, Shannon Nicole and Chen, Swaine and others},
|
||||
Author = {Sovi{\'c}, Ivan and others},
|
||||
Journal = {Nat Commun},
|
||||
Pages = {11307},
|
||||
Title = {Fast and sensitive mapping of nanopore sequencing reads with {GraphMap}},
|
||||
|
|
@ -68,6 +68,13 @@
|
|||
howpublished = {\href{https://github.com/ocxtal/minialign}{https://github.com/ocxtal/minialign}},
|
||||
year = {2016}}
|
||||
|
||||
@misc{Ruan:2016,
|
||||
title = {Ultra-fast de novo assembler using long noisy reads},
|
||||
author = {Jue Ruan},
|
||||
journal = {Unpulished},
|
||||
howpublished = {\href{https://github.com/ruanjue/smartdenovo}{https://github.com/ruanjue/smartdenovo}},
|
||||
year = {2016}}
|
||||
|
||||
@article{Miller:1988aa,
|
||||
Author = {Miller, W and Myers, E W},
|
||||
Journal = {Bull Math Biol},
|
||||
|
|
@ -86,7 +93,7 @@
|
|||
Year = {1990}}
|
||||
|
||||
@article{Wu:1996aa,
|
||||
Author = {Wu, Sun and Manber, U and Myers, Gene},
|
||||
Author = {Wu, Sun and others},
|
||||
Journal = {Algorithmica},
|
||||
Pages = {50-67},
|
||||
Title = {A subquadratic algorithm for approximate limited expression matching},
|
||||
|
|
@ -98,22 +105,22 @@
|
|||
Journal = {BMC Bioinformatics},
|
||||
Month = {Feb},
|
||||
Pages = {81},
|
||||
Title = {Parasail: SIMD C library for global, semi-global, and local pairwise sequence alignments},
|
||||
Title = {Parasail: {SIMD C} library for global, semi-global, and local pairwise sequence alignments},
|
||||
Volume = {17},
|
||||
Year = {2016}}
|
||||
|
||||
@article{Sedlazeck169557,
|
||||
author = {Sedlazeck, Fritz J and Rescheneder, Philipp and Smolka, Moritz and Fang, Han and Nattestad, Maria and others},
|
||||
author = {Sedlazeck, Fritz J and others},
|
||||
title = {Accurate detection of complex structural variations using single molecule sequencing},
|
||||
note = {doi:10.1101/169557},
|
||||
journal = {bioRxiv},
|
||||
year = {2017}}
|
||||
|
||||
@article{Altschul:1997vn,
|
||||
Author = {Altschul, S F and Madden, T L and Sch{\"a}ffer, A A and Zhang, J and Zhang, Z and others},
|
||||
Author = {Altschul, S F and others},
|
||||
Journal = {Nucleic Acids Res},
|
||||
Pages = {3389-402},
|
||||
Title = {Gapped BLAST and PSI-BLAST: a new generation of protein database search programs},
|
||||
Title = {Gapped {BLAST} and {PSI-BLAST}: a new generation of protein database search programs},
|
||||
Volume = {25},
|
||||
Year = {1997}}
|
||||
|
||||
|
|
@ -132,3 +139,27 @@
|
|||
Title = {Chaining algorithms for multiple genome comparison},
|
||||
Volume = {3},
|
||||
Year = {2005}}
|
||||
|
||||
@article{Ono:2013aa,
|
||||
Author = {Ono, Yukiteru and others},
|
||||
Journal = {Bioinformatics},
|
||||
Pages = {119-21},
|
||||
Title = {{PBSIM}: {PacBio} reads simulator--toward accurate genome assembly},
|
||||
Volume = {29},
|
||||
Year = {2013}}
|
||||
|
||||
@article {Jain128835,
|
||||
author = {Jain, Miten and others},
|
||||
title = {Nanopore sequencing and assembly of a human genome with ultra-long reads},
|
||||
year = {2017},
|
||||
note = {doi:10.1101/128835},
|
||||
publisher = {Cold Spring Harbor Labs Journals},
|
||||
journal = {bioRxiv}}
|
||||
|
||||
@article{Lau:2016aa,
|
||||
Author = {Lau, Bayo and others},
|
||||
Journal = {Bioinformatics},
|
||||
Pages = {3829-3832},
|
||||
Title = {LongISLND: in silico sequencing of lengthy and noisy datatypes},
|
||||
Volume = {32},
|
||||
Year = {2016}}
|
||||
|
|
|
|||
241
tex/minimap2.tex
241
tex/minimap2.tex
|
|
@ -2,6 +2,7 @@
|
|||
\copyrightyear{2017}
|
||||
\pubyear{2017}
|
||||
|
||||
\usepackage{graphicx}
|
||||
\usepackage{hyperref}
|
||||
\usepackage{url}
|
||||
\usepackage{amsmath}
|
||||
|
|
@ -19,7 +20,7 @@
|
|||
\begin{document}
|
||||
\firstpage{1}
|
||||
|
||||
\title[Long-read and assembly alignment with minimap2]{Minimap2: fast sequence alignment for long noisy reads and assembly contigs}
|
||||
\title[Long sequence alignment with minimap2]{Minimap2: fast pairwise alignment for long noisy sequences}
|
||||
\author[Li]{Heng Li}
|
||||
\address{Broad Institute, 415 Main Street, Cambridge, MA 02142, USA}
|
||||
|
||||
|
|
@ -29,7 +30,7 @@
|
|||
\section{Summary:} Minimap2 is a program to align long noisy sequences against
|
||||
a large reference database. It targets query sequences of 1kb--100Mb in length
|
||||
with sequence divergence typically below 25\%. Minimap2 is $\sim$30 times
|
||||
faster than most existing long-read aligners and achieves higher accuracy on
|
||||
faster than many mainstream long-read aligners and achieves higher accuracy on
|
||||
simulated data. It also employs concave gap cost and rescues inversions for
|
||||
improved alignment around potential structural variations.
|
||||
|
||||
|
|
@ -60,10 +61,10 @@ towards higher accuracy.
|
|||
|
||||
Minimap2 is the successor of minimap~\citep{Li:2016aa}. It uses similar
|
||||
indexing and seeding algorithms except that minimap2 optionally uses
|
||||
homopolymer-compressed (HPC; cite) $k$-mers in addition to normal $k$-mers.
|
||||
Indexing with HPC $k$-mers leads to higher mapping sensitivity for SMRT reads.
|
||||
Minimap2 further implements a more accurate chaining algorithm and adds
|
||||
the ability to produce detailed alignment.
|
||||
homopolymer-compressed (HPC; \citealp{Ruan:2016,Lau:2016aa}) $k$-mers in
|
||||
addition to normal $k$-mers. Indexing with HPC $k$-mers leads to higher
|
||||
mapping sensitivity for SMRT reads. Minimap2 further implements a more
|
||||
accurate chaining algorithm and adds the ability to produce detailed alignment.
|
||||
|
||||
\subsection{Chaining}
|
||||
|
||||
|
|
@ -84,20 +85,19 @@ distance between two anchors is too large); otherwise
|
|||
\gamma(j,i)=\gamma'(\max\{y_i-y_j,x_i-x_j\}-\min\{y_i-y_j,x_i-x_j\})
|
||||
\]
|
||||
In implementation, a gap of length $l$ costs $\gamma'(l)=\alpha\cdot
|
||||
l+\beta\log_2(l)$. For $m$ anchors, computing all $f(\cdot)$
|
||||
with Eq.~(\ref{eq:chain}) takes $O(m^2)$ time. We note that if anchor $i$ is
|
||||
appended to $j$, appending $i$ to a predecessor of $j$ is likely to yield a
|
||||
lower score. When evaluating Eq.~(\ref{eq:chain}), we start from anchor $i-1$
|
||||
and stop the evaluation if we cannot find a better score after up to $h$
|
||||
iterations. This heuristic reduces the average time to $O(h\cdot m)$. In
|
||||
practical, we can almost always find the optimal chain with $h=50$; even if the
|
||||
heuristic fails, the optimal chain often looks dubious.
|
||||
l+\beta\log_2(l)$. For $m$ anchors, directly computing all $f(\cdot)$ with
|
||||
Eq.~(\ref{eq:chain}) takes $O(m^2)$ time. Although theoretically faster
|
||||
chaining algorithms exist~\citep{Abouelhoda:2005aa}, they
|
||||
are inapplicable to generic gap cost, complex to implement and usually
|
||||
associated with a large constant. We introduced a simple heurstic to accelerate
|
||||
chaining.
|
||||
|
||||
%Although theoretically faster chaining algorithms exist for simple gap
|
||||
%cost~\citep{Abouelhoda:2005aa}, they are not as flexible as DP and may not lead
|
||||
%to better performance than our approach in practice. Furthermore, chaining
|
||||
%takes much less computing time than alignment. It is not critical to the
|
||||
%performance of minimap2.
|
||||
We note that if anchor $i$ is appended to $j$, appending $i$ to a predecessor
|
||||
of $j$ is likely to yield a lower score. When evaluating Eq.~(\ref{eq:chain}),
|
||||
we start from anchor $i-1$ and stop the evaluation if we cannot find a better
|
||||
score after up to $h$ iterations. This heuristic reduces the average time to
|
||||
$O(h\cdot m)$. In practice, we can almost always find the optimal chain with
|
||||
$h=50$; even if the heuristic fails, the optimal chain often looks dubious.
|
||||
|
||||
\subsubsection{Backtracking}
|
||||
Let $P(i)$ be the index of the best predecessor of anchor $i$. It equals 0 if
|
||||
|
|
@ -120,9 +120,9 @@ banded alignment, which is critical to performance. In practice, our
|
|||
implementation is three times as fast as Parasail's 4-way
|
||||
vectorization~\citep{Daily:2016aa} for global alignment.
|
||||
Without banding, our implementation is slower than Edlib~\citep{Sosic:2017aa},
|
||||
but with a 1000bp band, it is much faster. When performing global alignment
|
||||
between anchors, we expect the alignment to stay close to the diagonal of the
|
||||
DP matrix. Banding is applicable most of time.
|
||||
but with a 1000bp band, it is considerably faster. When performing global
|
||||
alignment between anchors, we expect the alignment to stay close to the
|
||||
diagonal of the DP matrix. Banding is often applicable.
|
||||
|
||||
Minimap2 uses a 2-piece affine gap cost
|
||||
$\gamma(l)=\min\{q+l\cdot e,\tilde{q}+l\cdot\tilde{e}\}$.
|
||||
|
|
@ -151,57 +151,154 @@ alignment, but this time with the one subsequence reverse complemented. This
|
|||
additional alignment step may identify short inversions that are missed during
|
||||
chaining.
|
||||
|
||||
%\begin{equation}\label{eq:ae86}
|
||||
%\left\{\begin{array}{l}
|
||||
%H_{ij} = \max\{H_{i-1,j-1}+s(i,j),E_{ij},F_{ij},\tilde{E}_{ij},\tilde{F}_{ij}\}\\
|
||||
%E_{i+1,j}= \max\{H_{ij}-q,E_{ij}\}-e\\
|
||||
%F_{i,j+1}= \max\{H_{ij}-q,F_{ij}\}-e\\
|
||||
%\tilde{E}_{i+1,j}= \max\{H_{ij}-\tilde{q},\tilde{E}_{ij}\}-\tilde{e}\\
|
||||
%\tilde{F}_{i,j+1}= \max\{H_{ij}-\tilde{q},\tilde{F}_{ij}\}-\tilde{e}
|
||||
%\end{array}\right.
|
||||
%\end{equation}
|
||||
%where $s(i,j)$ is the score between the $i$-th reference base and $j$-th query
|
||||
%base. If we define~\citep{Wu:1996aa,Suzuki:2016}
|
||||
%\[
|
||||
%\left\{\begin{array}{ll}
|
||||
%u_{ij}\triangleq H_{ij}-H_{i-1,j} & v_{ij}\triangleq H_{ij}-H_{i,j-1} \\
|
||||
%x_{ij}\triangleq E_{i+1,j}-H_{ij} & \tilde{x}_{ij}\triangleq \tilde{E}_{i+1,j}-\tilde{H}_{ij} \\
|
||||
%y_{ij}\triangleq F_{i,j+1}-H_{ij} & \tilde{y}_{ij}\triangleq \tilde{F}_{i,j+1}-\tilde{H}_{ij}
|
||||
%\end{array}\right.
|
||||
%\]
|
||||
%we can transform Eq.~(\ref{eq:ae86}) to
|
||||
%\[
|
||||
%\left\{\begin{array}{lll}
|
||||
%z_{ij}&=&\max\{s(i,j),x_{i-1,j}+v_{i-1,j},y_{i,j-1}+u_{i,j-1},\\
|
||||
%&&\tilde{x}_{i-1,j}+v_{i-1,j},\tilde{y}_{i,j-1}+u_{i,j-1}\}\\
|
||||
%u_{ij}&=&z_{ij}-v_{i-1,j}\\
|
||||
%v_{ij}&=&z_{ij}-u_{i,j-1}\\
|
||||
%x_{ij}&=&\max\{0,x_{i-1,j}+v_{i-1,j}-z_{ij}+q\}-q-e\\
|
||||
%y_{ij}&=&\max\{0,y_{i,j-1}+u_{i,j-1}-z_{ij}+q\}-q-e\\
|
||||
%\tilde{x}_{ij}&=&\max\{0,\tilde{x}_{i-1,j}+v_{i-1,j}-z_{ij}+\tilde{q}\}-\tilde{q}-\tilde{e}\\
|
||||
%\tilde{y}_{ij}&=&\max\{0,\tilde{y}_{i,j-1}+u_{i,j-1}-z_{ij}+\tilde{q}\}-\tilde{q}-\tilde{e}
|
||||
%\end{array}\right.
|
||||
%\]
|
||||
%with boundary conditions
|
||||
%\[
|
||||
%\left\{\begin{array}{l}
|
||||
%x_{-1,\cdot}=y_{\cdot,-1}=-q-e\\
|
||||
%\tilde{x}_{-1,\cdot}=\tilde{y}_{\cdot,-1}=-\tilde{q}-\tilde{e}\\
|
||||
%u_{i,-1}=\eta(i)\\
|
||||
%v_{-1,j}=\eta(j)
|
||||
%\end{array}\right.
|
||||
%\]
|
||||
%where
|
||||
%\[
|
||||
%\eta(k)=\left\{\begin{array}{ll}
|
||||
%-q-e & (k=0) \\
|
||||
%-e & (k<\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil) \\
|
||||
%i\cdot(e-\tilde{e})-(\tilde{q}-q)-\tilde{e} & (k=\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil) \\
|
||||
%-\tilde{e} & (k>\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil)
|
||||
%\end{array}\right.
|
||||
%\]
|
||||
|
||||
\end{methods}
|
||||
|
||||
\section{Results and discussions}
|
||||
\begin{figure}[!tb]
|
||||
\centering
|
||||
\includegraphics[width=.5\textwidth]{roc-color.pdf}
|
||||
\caption{Evaluation on simulated SMRT reads aligned against human genome
|
||||
GRCh38. (a) ROC-like curve. (b) Accumulative mapping error rate as a function
|
||||
of mapping quality. 33,088 $\ge$1000bp reads were simulated using
|
||||
pbsim~\citep{Ono:2013aa} with error profile sampled from file
|
||||
`m131017\_060208\_42213\_*.1.*' downloaded at
|
||||
\href{http://bit.ly/chm1p5c3}{http://bit.ly/chm1p5c3}. The N50 read length is
|
||||
11,628. A read is considered correctly mapped if the true position overlaps
|
||||
with the best mapping position by 10\% of the read length. All aligners were
|
||||
run under the default setting for SMRT reads.}\label{fig:eval}
|
||||
\end{figure}
|
||||
|
||||
As a sanity check, we evaluated minimap2 on simulated human reads along with
|
||||
BLASR~\citep{Chaisson:2012aa},
|
||||
BWA-MEM~\citep{Li:2013aa},
|
||||
GraphMap~\citep{Sovic:2016aa},
|
||||
minialign~\citep{Suzuki:2016} and
|
||||
NGMLR~\citep{Sedlazeck169557}. We excluded rHAT~\citep{Liu:2016ab},
|
||||
LAMSA~\citep{Liu:2017aa} and Kart~\citep{Lin:2017aa} because they either
|
||||
crashed or produced malformatted SAM. In this evaluation, Minimap2 has a
|
||||
higher power to distinguish unique and repetitive hits, and achieves overall
|
||||
higher mapping accuracy (Fig.~\ref{fig:eval}a). Minimap2 and NGMLR provide
|
||||
better mapping quality estimate: they rarely give repetitive hits high mapping
|
||||
quality (Fig.~\ref{fig:eval}b). Apparently, other aligners may occasionally
|
||||
miss close suboptimal hits and be overconfident in wrong mappings. On run time,
|
||||
minialign is slightly faster than minimap2. They are over 30 times faster than
|
||||
the rest.
|
||||
|
||||
On real SMRT reads from human, the relative performance and sensitivity of
|
||||
these aligners are broadly similar to those on simulated data. We are unable to
|
||||
provide a good estimate of mapping error rate due to the lack of the truth. On
|
||||
ONT ultra-long human reads~\citep{Jain128835}, BWA-MEM failed. Minialign and
|
||||
minimap2 are over 70 times faster than others. In addition to reference-based
|
||||
read mapping, minimap2 can also find overlaps between long reads and align
|
||||
long-read assemblies.
|
||||
|
||||
\bibliography{minimap2}
|
||||
|
||||
\pagebreak
|
||||
|
||||
\begin{methods}
|
||||
\section*{Appendix}
|
||||
A 2-piece gap cost function is
|
||||
\[
|
||||
\gamma(l)=\min\{q+l\cdot e,\tilde{q}+l\cdot\tilde{e}\}
|
||||
\]
|
||||
Without losing generality, we assume $q+e\le\tilde{q}+\tilde{e}$. The equation
|
||||
to compute the optimal alignment under such a gap cost is
|
||||
\begin{equation}\label{eq:ae86}
|
||||
\left\{\begin{array}{l}
|
||||
H_{ij} = \max\{H_{i-1,j-1}+s(i,j),E_{ij},F_{ij},\tilde{E}_{ij},\tilde{F}_{ij}\}\\
|
||||
E_{i+1,j}= \max\{H_{ij}-q,E_{ij}\}-e\\
|
||||
F_{i,j+1}= \max\{H_{ij}-q,F_{ij}\}-e\\
|
||||
\tilde{E}_{i+1,j}= \max\{H_{ij}-\tilde{q},\tilde{E}_{ij}\}-\tilde{e}\\
|
||||
\tilde{F}_{i,j+1}= \max\{H_{ij}-\tilde{q},\tilde{F}_{ij}\}-\tilde{e}
|
||||
\end{array}\right.
|
||||
\end{equation}
|
||||
where $s(i,j)$ is the score between the $i$-th reference base and $j$-th query
|
||||
base. If we define
|
||||
\[
|
||||
\left\{\begin{array}{ll}
|
||||
u_{ij}\triangleq H_{ij}-H_{i-1,j} & v_{ij}\triangleq H_{ij}-H_{i,j-1} \\
|
||||
x_{ij}\triangleq E_{i+1,j}-H_{ij} & \tilde{x}_{ij}\triangleq \tilde{E}_{i+1,j}-\tilde{H}_{ij} \\
|
||||
y_{ij}\triangleq F_{i,j+1}-H_{ij} & \tilde{y}_{ij}\triangleq \tilde{F}_{i,j+1}-\tilde{H}_{ij}
|
||||
\end{array}\right.
|
||||
\]
|
||||
we can transform Eq.~(\ref{eq:ae86}) to
|
||||
\begin{equation}\label{eq:suzuki}
|
||||
\left\{\begin{array}{lll}
|
||||
z_{ij}&=&\max\{s(i,j),x_{i-1,j}+v_{i-1,j},y_{i,j-1}+u_{i,j-1},\\
|
||||
&&\tilde{x}_{i-1,j}+v_{i-1,j},\tilde{y}_{i,j-1}+u_{i,j-1}\}\\
|
||||
u_{ij}&=&z_{ij}-v_{i-1,j}\\
|
||||
v_{ij}&=&z_{ij}-u_{i,j-1}\\
|
||||
x_{ij}&=&\max\{0,x_{i-1,j}+v_{i-1,j}-z_{ij}+q\}-q-e\\
|
||||
y_{ij}&=&\max\{0,y_{i,j-1}+u_{i,j-1}-z_{ij}+q\}-q-e\\
|
||||
\tilde{x}_{ij}&=&\max\{0,\tilde{x}_{i-1,j}+v_{i-1,j}-z_{ij}+\tilde{q}\}-\tilde{q}-\tilde{e}\\
|
||||
\tilde{y}_{ij}&=&\max\{0,\tilde{y}_{i,j-1}+u_{i,j-1}-z_{ij}+\tilde{q}\}-\tilde{q}-\tilde{e}
|
||||
\end{array}\right.
|
||||
\end{equation}
|
||||
where $z_{ij}$ is a temporary variable that does not need to be stored. We can
|
||||
see that
|
||||
\[
|
||||
x_{ij}=E_{i+1,j}-H_{ij}=\max\{-q,E_{ij}-H_{ij}\}-e
|
||||
\]
|
||||
With $E_{ij}\le H_{ij}$, we have
|
||||
\[
|
||||
-q-e\le x_{ij}\le\max\{-q,0\}-e=-e
|
||||
\]
|
||||
and similar inequations for $y_{ij}$, $\tilde{x}_{ij}$ and $\tilde{y}_{ij}$.
|
||||
In addition,
|
||||
\[
|
||||
u_{ij}=z_{ij}-v_{i-1,j}\ge\max\{x_{i-1,j},\tilde{x}_{i-1,j}\}\ge-q-e
|
||||
\]
|
||||
We also note that the maximum possible $z_{ij}=H_{ij}-H_{i-1,j-1}$ is $M$, the
|
||||
maximal matching score. As a result,
|
||||
\[
|
||||
u_{ij}\le M-v_{i-1,j}\le M+q+e
|
||||
\]
|
||||
In conclusion, all values in Eq.~(\ref{eq:suzuki}) are bounded: $x$ and $y$ by
|
||||
$[-q-e,-e]$ and $\tilde{x}$, $\tilde{y}$ by
|
||||
$[-\tilde{q}-\tilde{e},-\tilde{e}]$, and $u$ and $v$ by $[-q-e,M+q+e]$. When
|
||||
matching score and gap cost are small, each of them can be stored as a 8-bit
|
||||
integer. This enables efficient SSE vectorization regardless of the peak score
|
||||
of the alignment.
|
||||
|
||||
For more efficient SSE implementation, we transform the row-column coordinate
|
||||
to diagonal-anti-diagonal coordinate by letting $r\gets i+j$ and $t\gets i$.
|
||||
Eq.~(\ref{eq:suzuki}) becomes:
|
||||
\begin{equation*}
|
||||
\left\{\begin{array}{lll}
|
||||
z_{rt}&=&\max\{s(t,r-t),x_{r-1,t-1}+v_{r-1,t-1},y_{r-1,t}+u_{r-1,t},\\
|
||||
&&\tilde{x}_{r-1,t-1}+v_{r-1,t-1},\tilde{y}_{r-1,t}+u_{r-1,t}\}\\
|
||||
u_{rt}&=&z_{rt}-v_{r-1,t-1}\\
|
||||
v_{rt}&=&z_{rt}-u_{r-1,t}\\
|
||||
x_{rt}&=&\max\{0,x_{r-1,t-1}+v_{r-1,t-1}-z_{rt}+q\}-q-e\\
|
||||
y_{rt}&=&\max\{0,y_{r-1,t}+u_{r-1,t}-z_{rt}+q\}-q-e\\
|
||||
\tilde{x}_{rt}&=&\max\{0,\tilde{x}_{r-1,t-1}+v_{r-1,t-1}-z_{rt}+\tilde{q}\}-\tilde{q}-\tilde{e}\\
|
||||
\tilde{y}_{rt}&=&\max\{0,\tilde{y}_{r-1,t}+u_{r-1,t}-z_{rt}+\tilde{q}\}-\tilde{q}-\tilde{e}
|
||||
\end{array}\right.
|
||||
\end{equation*}
|
||||
In this formulation, cells with the same row index $r$ are independent of each
|
||||
other. This allows us to vectorize the computation of all cells on the same
|
||||
anti-diagonal in one inner loop.
|
||||
|
||||
On the condition that $q+e<\tilde{q}+\tilde{e}$ and $e>\tilde{e}$, the boundary
|
||||
condition of this equation in the diagonal-anti-diagonal coordinate is
|
||||
\[
|
||||
\left\{\begin{array}{l}
|
||||
x_{r-1,-1}=y_{r-1,r}=-q-e\\
|
||||
\tilde{x}_{r-1,-1}=\tilde{y}_{r-1,r}=-\tilde{q}-\tilde{e}\\
|
||||
u_{r-1,r}=v_{r-1,-1}=\eta(r)\\
|
||||
\end{array}\right.
|
||||
\]
|
||||
where
|
||||
\[
|
||||
\eta(r)=\left\{\begin{array}{ll}
|
||||
-q-e & (r=0) \\
|
||||
-e & (r<\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil) \\
|
||||
r\cdot(e-\tilde{e})-(\tilde{q}-q)-\tilde{e} & (r=\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil) \\
|
||||
-\tilde{e} & (r>\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil)
|
||||
\end{array}\right.
|
||||
\]
|
||||
\citet{Suzuki:2016} first derived a similar set of equations under affine gap
|
||||
cost but with different notations.
|
||||
\end{methods}
|
||||
\end{document}
|
||||
|
|
|
|||
64
tex/roc.gp
64
tex/roc.gp
|
|
@ -1,28 +1,52 @@
|
|||
set t po eps enh co so "Helvetica,18"
|
||||
set t po eps enh co so "Helvetica,26"
|
||||
|
||||
set style line 1 lt 1 pt 1 lc rgb "#FF0000" lw 2;
|
||||
set style line 2 lt 1 pt 2 lc rgb "#00C000" lw 2;
|
||||
set style line 3 lt 1 pt 3 lc rgb "#0080FF" lw 2;
|
||||
set style line 4 lt 1 pt 4 lc rgb "#C000FF" lw 2;
|
||||
set style line 5 lt 1 pt 5 lc rgb "#00EEEE" lw 2;
|
||||
set style line 6 lt 1 pt 6 lc rgb "#C04000" lw 2;
|
||||
set style line 7 lt 1 lc rgb "#C8C800" lw 2;
|
||||
set style line 8 lt 1 lc rgb "#FF80FF" lw 2;
|
||||
set style line 9 lt 1 lc rgb "#4E642E" lw 2;
|
||||
set style line 10 lt 1 lc rgb "#800000" lw 2;
|
||||
set style line 11 lt 1 lc rgb "#67B7F7" lw 2;
|
||||
set style line 12 lt 1 lc rgb "#FFC127" lw 2;
|
||||
|
||||
set xlab "False positive rate"
|
||||
set ylab "Sensitivity"
|
||||
set yran [0.9:1]
|
||||
set style line 1 lt 1 pt 1 lc rgb "#e41a1c" lw 2;
|
||||
set style line 2 lt 1 pt 2 lc rgb "#377eb8" lw 2;
|
||||
set style line 3 lt 1 pt 3 lc rgb "#4daf4a" lw 2;
|
||||
set style line 4 lt 1 pt 4 lc rgb "#984ea3" lw 2;
|
||||
set style line 5 lt 1 pt 6 lc rgb "#ff7f00" lw 2;
|
||||
set style line 6 lt 1 pt 8 lc rgb "#f781bf" lw 2;
|
||||
|
||||
set out "roc-color.eps"
|
||||
|
||||
set pointsize 2.0
|
||||
set size 1.59,1.04
|
||||
set multiplot layout 1,2
|
||||
|
||||
set label "(a)" at graph -0.245,1.06 font "Helvetica-bold,40"
|
||||
set xlab "Error rate of mapped reads"
|
||||
set ylab "Fraction of mapped reads" off +1.8
|
||||
set ytics 0.02
|
||||
set yran [0.9:1]
|
||||
|
||||
set size 0.8,1
|
||||
set log x
|
||||
set format x "10^{%L}"
|
||||
set key bot right
|
||||
plot "<./eval2roc.pl blasr-mc.eval" u 2:3 t "blasr-mc" w lp ls 1, \
|
||||
plot "<./eval2roc.pl blasr-mc.eval" u 2:3 t "blasr-mc" w lp ls 4, \
|
||||
"<./eval2roc.pl bwa.eval" u 2:3 t "bwa-mem" w lp ls 2, \
|
||||
"<./eval2roc.pl minialign.eval" u 2:3 t "minialign" w lp ls 3, \
|
||||
"<./eval2roc.pl mm2.eval" u 2:3 t "minimap2" w lp ls 4, \
|
||||
"<./eval2roc.pl graphmap.eval" u 2:3 t "graphmap" w lp ls 3, \
|
||||
"<./eval2roc.pl minialign.eval" u 2:3 t "minialign" w lp ls 1, \
|
||||
"<./eval2roc.pl mm2.eval" u 2:3 t "minimap2" w lp ls 6, \
|
||||
"<./eval2roc.pl ngmlr.eval" u 2:3 t "ngm-lr" w lp ls 5
|
||||
unset label
|
||||
|
||||
set origin 0.8,0
|
||||
set size 0.79,1
|
||||
set label "(b)" at graph -0.245,1.06 font "Helvetica-bold,40"
|
||||
unset log
|
||||
unset format
|
||||
unset key
|
||||
set log y
|
||||
set ylab "Accumulative mapping error rate" off +0
|
||||
set xlab "Mapping quality"
|
||||
set yran [1e-5:0.1]
|
||||
set ytics 1e-5,0.1
|
||||
set format y "10^{%L}"
|
||||
set xran [60:0] reverse
|
||||
plot "<./eval2roc.pl blasr-mc.eval" u 1:2 w lp ls 4, \
|
||||
"<./eval2roc.pl bwa.eval" u 1:2 t "bwa-mem" w lp ls 2, \
|
||||
"<./eval2roc.pl graphmap.eval" u 1:2 t "graphmap" w lp ls 3, \
|
||||
"<./eval2roc.pl minialign.eval" u 1:2 t "minialign" w lp ls 1, \
|
||||
"<./eval2roc.pl mm2.eval" u 1:2 t "minimap2" w lp ls 6, \
|
||||
"<./eval2roc.pl ngmlr.eval" u 1:2 t "ngm-lr" w lp ls 5
|
||||
|
|
|
|||
Loading…
Reference in New Issue