response to reviewers' comments, round 2
This commit is contained in:
parent
242ff4e91d
commit
d135feb1a5
33
minimap2.1
33
minimap2.1
|
|
@ -1,4 +1,4 @@
|
||||||
.TH minimap2 1 "24 February 2018" "minimap2-2.9 (r720)" "Bioinformatics tools"
|
.TH minimap2 1 "15 March 2018" "minimap2-2.9-dirty (r746)" "Bioinformatics tools"
|
||||||
.SH NAME
|
.SH NAME
|
||||||
.PP
|
.PP
|
||||||
minimap2 - mapping and alignment between collections of DNA sequences
|
minimap2 - mapping and alignment between collections of DNA sequences
|
||||||
|
|
@ -123,10 +123,27 @@ provided as the target sequences, options
|
||||||
will be effectively overridden by the options stored in the index file.
|
will be effectively overridden by the options stored in the index file.
|
||||||
.SS Mapping options
|
.SS Mapping options
|
||||||
.TP 10
|
.TP 10
|
||||||
.BI -f \ FLOAT
|
.BI -f \ FLOAT | INT1 [, INT2 ]
|
||||||
Ignore top
|
If fraction, ignore top
|
||||||
.I FLOAT
|
.I FLOAT
|
||||||
fraction of most frequent minimizers [0.0002]
|
fraction of most frequent minimizers [0.0002]. If integer,
|
||||||
|
ignore minimizers occuring more than
|
||||||
|
.I INT1
|
||||||
|
times.
|
||||||
|
.I INT2
|
||||||
|
is only effective in the
|
||||||
|
.B --sr
|
||||||
|
or
|
||||||
|
.B -xsr
|
||||||
|
mode, which sets the threshold for a second round of seeding.
|
||||||
|
.TP
|
||||||
|
.BI --min-occ-floor \ INT
|
||||||
|
Force minimap2 to always use k-mers occurring
|
||||||
|
.I INT
|
||||||
|
times or less [0]. In effect, the max occurrence threshold is set to
|
||||||
|
the
|
||||||
|
.RI max{ INT ,
|
||||||
|
.BR -f }.
|
||||||
.TP
|
.TP
|
||||||
.BI -g \ INT
|
.BI -g \ INT
|
||||||
Stop chain enlongation if there are no minimizers within
|
Stop chain enlongation if there are no minimizers within
|
||||||
|
|
@ -217,14 +234,6 @@ on chains. Set
|
||||||
.I INT
|
.I INT
|
||||||
to a large number to switch off this heurstics.
|
to a large number to switch off this heurstics.
|
||||||
.TP
|
.TP
|
||||||
.BI --min-occ-floor \ INT
|
|
||||||
Force minimap2 to always use k-mers occurring
|
|
||||||
.I INT
|
|
||||||
times or less [0]. In effect, the max occurrence threshold is set to
|
|
||||||
the
|
|
||||||
.RI max{ INT ,
|
|
||||||
.BR -f }.
|
|
||||||
.TP
|
|
||||||
.B --no-long-join
|
.B --no-long-join
|
||||||
Disable the long gap patching heuristic. When this option is applied, the
|
Disable the long gap patching heuristic. When this option is applied, the
|
||||||
maximum alignment gap is mostly controlled by
|
maximum alignment gap is mostly controlled by
|
||||||
|
|
|
||||||
|
|
@ -61,13 +61,6 @@
|
||||||
Volume = {32},
|
Volume = {32},
|
||||||
Year = {2016}}
|
Year = {2016}}
|
||||||
|
|
||||||
@misc{Suzuki:2016,
|
|
||||||
title = {Fast and accurate alignment tool for PacBio and Nanopore long reads},
|
|
||||||
author = {Hajime Suzuki},
|
|
||||||
journal = {Unpublished},
|
|
||||||
howpublished = {\href{https://github.com/ocxtal/minialign}{https://github.com/ocxtal/minialign}},
|
|
||||||
year = {2016}}
|
|
||||||
|
|
||||||
@misc{Ruan:2016,
|
@misc{Ruan:2016,
|
||||||
title = {Ultra-fast de novo assembler using long noisy reads},
|
title = {Ultra-fast de novo assembler using long noisy reads},
|
||||||
author = {Jue Ruan},
|
author = {Jue Ruan},
|
||||||
|
|
@ -172,14 +165,6 @@
|
||||||
Volume = {29},
|
Volume = {29},
|
||||||
Year = {2011}}
|
Year = {2011}}
|
||||||
|
|
||||||
@article {Suzuki130633,
|
|
||||||
author = {Suzuki, Hajime and Kasahara, Masahiro},
|
|
||||||
title = {Acceleration Of Nucleotide Semi-Global Alignment With Adaptive Banded Dynamic Programming},
|
|
||||||
year = {2017},
|
|
||||||
note = {doi:10.1101/130633},
|
|
||||||
publisher = {Cold Spring Harbor Labs Journals},
|
|
||||||
journal = {bioRxiv}}
|
|
||||||
|
|
||||||
@article{Gotoh:1982aa,
|
@article{Gotoh:1982aa,
|
||||||
Author = {Gotoh, O},
|
Author = {Gotoh, O},
|
||||||
Journal = {J Mol Biol},
|
Journal = {J Mol Biol},
|
||||||
|
|
@ -337,3 +322,19 @@
|
||||||
Title = {{MUMmer4}: A fast and versatile genome alignment system},
|
Title = {{MUMmer4}: A fast and versatile genome alignment system},
|
||||||
Volume = {14},
|
Volume = {14},
|
||||||
Year = {2018}}
|
Year = {2018}}
|
||||||
|
|
||||||
|
@article{Li:2009ys,
|
||||||
|
Author = {Li, Heng and others},
|
||||||
|
Journal = {Bioinformatics},
|
||||||
|
Pages = {2078-9},
|
||||||
|
Title = {The {Sequence Alignment/Map format and SAMtools}},
|
||||||
|
Volume = {25},
|
||||||
|
Year = {2009}}
|
||||||
|
|
||||||
|
@article{Suzuki:2018aa,
|
||||||
|
Author = {Suzuki, Hajime and Kasahara, Masahiro},
|
||||||
|
Journal = {BMC Bioinformatics},
|
||||||
|
Pages = {45},
|
||||||
|
Title = {Introducing difference recurrence relations for faster semi-global alignment of long sequences},
|
||||||
|
Volume = {19},
|
||||||
|
Year = {2018}}
|
||||||
|
|
|
||||||
|
|
@ -19,7 +19,7 @@
|
||||||
\begin{document}
|
\begin{document}
|
||||||
\firstpage{1}
|
\firstpage{1}
|
||||||
|
|
||||||
\title[Aligning nucleotide sequences with minimap2]{Minimap2: versatile pairwise alignment for nucleotide sequences}
|
\title[Aligning nucleotide sequences with minimap2]{Minimap2: pairwise alignment for nucleotide sequences}
|
||||||
\author[Li]{Heng Li}
|
\author[Li]{Heng Li}
|
||||||
\address{Broad Institute, 415 Main Street, Cambridge, MA 02142, USA}
|
\address{Broad Institute, 415 Main Street, Cambridge, MA 02142, USA}
|
||||||
|
|
||||||
|
|
@ -64,7 +64,7 @@ the thought that 10kb long sequences should be easier to map than 100bp reads
|
||||||
because we can more effectively skip repetitive regions, which are often the
|
because we can more effectively skip repetitive regions, which are often the
|
||||||
bottleneck of short-read alignment. We confirmed our speculation by achieving
|
bottleneck of short-read alignment. We confirmed our speculation by achieving
|
||||||
approximate mapping 50 times faster than BWA-MEM~\citep{Li:2016aa}.
|
approximate mapping 50 times faster than BWA-MEM~\citep{Li:2016aa}.
|
||||||
\citet{Suzuki130633} extended our work with a fast and novel algorithm on
|
\citet{Suzuki:2018aa} extended our work with a fast and novel algorithm on
|
||||||
generating base-level alignment, which in turn inspired us to develop minimap2
|
generating base-level alignment, which in turn inspired us to develop minimap2
|
||||||
with added functionality.
|
with added functionality.
|
||||||
|
|
||||||
|
|
@ -88,7 +88,9 @@ the versatility of minimap2.
|
||||||
|
|
||||||
Minimap2 follows a typical seed-chain-align procedure as is used by most
|
Minimap2 follows a typical seed-chain-align procedure as is used by most
|
||||||
full-genome aligners. It collects minimizers~\citep{Roberts:2004fv} of the
|
full-genome aligners. It collects minimizers~\citep{Roberts:2004fv} of the
|
||||||
reference sequences and indexes them in a hash table. Then for each query
|
reference sequences and indexes them in a hash table, with the key being the
|
||||||
|
hash of a minimizer and the value being a list of locations of the minimizer
|
||||||
|
copies. Then for each query
|
||||||
sequence, minimap2 takes query minimizers as \emph{seeds}, finds exact matches
|
sequence, minimap2 takes query minimizers as \emph{seeds}, finds exact matches
|
||||||
(i.e. \emph{anchors}) to the reference, and identifies sets of colinear anchors as
|
(i.e. \emph{anchors}) to the reference, and identifies sets of colinear anchors as
|
||||||
\emph{chains}. If base-level alignment is requested, minimap2 applies dynamic
|
\emph{chains}. If base-level alignment is requested, minimap2 applies dynamic
|
||||||
|
|
@ -118,9 +120,12 @@ distance between two anchors is too large); otherwise
|
||||||
\begin{equation}\label{eq:chain-gap}
|
\begin{equation}\label{eq:chain-gap}
|
||||||
\beta(j,i)=\gamma_c\big((y_i-y_j)-(x_i-x_j)\big)
|
\beta(j,i)=\gamma_c\big((y_i-y_j)-(x_i-x_j)\big)
|
||||||
\end{equation}
|
\end{equation}
|
||||||
In implementation, a gap of length $l\not=0$ costs
|
In implementation, a gap of length $l$ costs
|
||||||
\[
|
\[
|
||||||
\gamma_c(l)=0.01\cdot \bar{w}\cdot|l|+0.5\log_2|l|
|
\gamma_c(l)=\left\{\begin{array}{ll}
|
||||||
|
0.01\cdot \bar{w}\cdot|l|+0.5\log_2|l| & (l\not=0) \\
|
||||||
|
0 & (l=0)
|
||||||
|
\end{array}\right.
|
||||||
\]
|
\]
|
||||||
where $\bar{w}$ is the average seed length. For $N$ anchors, directly computing all $f(\cdot)$ with
|
where $\bar{w}$ is the average seed length. For $N$ anchors, directly computing all $f(\cdot)$ with
|
||||||
Eq.~(\ref{eq:chain}) takes $O(N^2)$ time. Although theoretically faster
|
Eq.~(\ref{eq:chain}) takes $O(N^2)$ time. Although theoretically faster
|
||||||
|
|
@ -164,7 +169,7 @@ empirical formula:
|
||||||
\[
|
\[
|
||||||
{\rm mapQ}=40\cdot (1-f_2/f_1)\cdot\min\{1,m/10\}\cdot\log f_1
|
{\rm mapQ}=40\cdot (1-f_2/f_1)\cdot\min\{1,m/10\}\cdot\log f_1
|
||||||
\]
|
\]
|
||||||
where $m$ is the number of anchors on the primary chain, $f_1$ is the chaining
|
where $\log$ denotes natural logarithm, $m$ is the number of anchors on the primary chain, $f_1$ is the chaining
|
||||||
score, and $f_2\le f_1$ is the score of the best chain that is secondary to the
|
score, and $f_2\le f_1$ is the score of the best chain that is secondary to the
|
||||||
primary chain. Intuitively, a chain is assigned to a higher mapping quality if
|
primary chain. Intuitively, a chain is assigned to a higher mapping quality if
|
||||||
it is long and its best secondary chain is weak.
|
it is long and its best secondary chain is weak.
|
||||||
|
|
@ -253,7 +258,7 @@ performance of minimap2. Traditional SSE implementations~\citep{Farrar:2007hs}
|
||||||
based on Eq.~(\ref{eq:ae86}) can achieve 16-way parallelization for short
|
based on Eq.~(\ref{eq:ae86}) can achieve 16-way parallelization for short
|
||||||
sequences, but only 4-way parallelization when the peak alignment score reaches
|
sequences, but only 4-way parallelization when the peak alignment score reaches
|
||||||
32767. Long sequence alignment may exceed this threshold. Inspired by
|
32767. Long sequence alignment may exceed this threshold. Inspired by
|
||||||
\citet{Wu:1996aa} and the following work, \citet{Suzuki130633} proposed a
|
\citet{Wu:1996aa} and the following work, \citet{Suzuki:2018aa} proposed a
|
||||||
difference-based formulation that lifted this limitation.
|
difference-based formulation that lifted this limitation.
|
||||||
In case of 2-piece gap cost, define
|
In case of 2-piece gap cost, define
|
||||||
\[
|
\[
|
||||||
|
|
@ -320,7 +325,7 @@ y_{rt}&=&\max\{0,y_{r-1,t}+u_{r-1,t}-z_{rt}+q\}-q-e\\
|
||||||
\end{equation*}
|
\end{equation*}
|
||||||
In this formulation, cells with the same diagonal index $r$ are independent of
|
In this formulation, cells with the same diagonal index $r$ are independent of
|
||||||
each other. This allows us to fully vectorize the computation of all cells on
|
each other. This allows us to fully vectorize the computation of all cells on
|
||||||
the same anti-diagonal in one inner loop. It also simplifies banded alignment,
|
the same anti-diagonal in one inner loop. It also simplifies banded alignment (500bp band width by default),
|
||||||
which would be difficult with striped vectorization~\citep{Farrar:2007hs}.
|
which would be difficult with striped vectorization~\citep{Farrar:2007hs}.
|
||||||
|
|
||||||
On the condition that $q+e<\tilde{q}+\tilde{e}$ and $e>\tilde{e}$, the initial
|
On the condition that $q+e<\tilde{q}+\tilde{e}$ and $e>\tilde{e}$, the initial
|
||||||
|
|
@ -355,7 +360,7 @@ times as fast as Parasail's 4-way vectorization~\citep{Daily:2016aa}. Without
|
||||||
banding, our implementation is slower than Edlib~\citep{Sosic:2017aa}, but with
|
banding, our implementation is slower than Edlib~\citep{Sosic:2017aa}, but with
|
||||||
a 1000bp band, it is considerably faster. When performing global alignment
|
a 1000bp band, it is considerably faster. When performing global alignment
|
||||||
between anchors, we expect the alignment to stay close to the diagonal of the
|
between anchors, we expect the alignment to stay close to the diagonal of the
|
||||||
DP matrix. Banding is applicable most of time.
|
DP matrix. Banding is applicable most of the time.
|
||||||
|
|
||||||
\subsubsection{The Z-drop heuristic}
|
\subsubsection{The Z-drop heuristic}
|
||||||
|
|
||||||
|
|
@ -478,7 +483,7 @@ both C and Python. It is distributed under the MIT license, free to both
|
||||||
commercial and academic uses. Minimap2 uses the same base algorithm for all
|
commercial and academic uses. Minimap2 uses the same base algorithm for all
|
||||||
applications, but it has to apply different sets of parameters depending on
|
applications, but it has to apply different sets of parameters depending on
|
||||||
input data types. Similar to BWA-MEM, minimap2 introduces `presets' that
|
input data types. Similar to BWA-MEM, minimap2 introduces `presets' that
|
||||||
modify multiple parameters with a simple invokation. Detailed settings
|
modify multiple parameters with a simple invocation. Detailed settings
|
||||||
and command-line options can be found in the minimap2 manpage. In addition to
|
and command-line options can be found in the minimap2 manpage. In addition to
|
||||||
the applications evaluated in the following sections, minimap2 also retains
|
the applications evaluated in the following sections, minimap2 also retains
|
||||||
minimap's functionality to find overlaps between long reads and to search
|
minimap's functionality to find overlaps between long reads and to search
|
||||||
|
|
@ -570,7 +575,7 @@ Peak RAM (GByte) & 8.9 & 14.5 & 3.2 & 29.2\vspace{1em}\\
|
||||||
\% approx. introns & 91.8\% & 96.9\% & 92.5\% & 82.4\% \\
|
\% approx. introns & 91.8\% & 96.9\% & 92.5\% & 82.4\% \\
|
||||||
\botrule
|
\botrule
|
||||||
\end{tabular}
|
\end{tabular}
|
||||||
}{Mouse reads (AC:SRR5286960; R9.4 chemistry) were mapped to the primary assembly of mouse
|
}{Mouse cDNA reads (AC:SRR5286960; R9.4 chemistry) were mapped to the primary assembly of mouse
|
||||||
genome GRCm38 with the following tools and command options: minimap2 (`-ax
|
genome GRCm38 with the following tools and command options: minimap2 (`-ax
|
||||||
splice'); GMAP (`-n 0 --min-intronlength 30 --cross-species'); SpAln (`-Q7 -LS
|
splice'); GMAP (`-n 0 --min-intronlength 30 --cross-species'); SpAln (`-Q7 -LS
|
||||||
-S3'); STARlong (according to
|
-S3'); STARlong (according to
|
||||||
|
|
@ -579,7 +584,7 @@ compared to the EnsEMBL gene annotation, release 89. A predicted intron
|
||||||
is \emph{novel} if it has no overlaps with any annotated introns. An intron
|
is \emph{novel} if it has no overlaps with any annotated introns. An intron
|
||||||
is \emph{exact} if it is identical to an annotated intron. An intron is
|
is \emph{exact} if it is identical to an annotated intron. An intron is
|
||||||
\emph{approximate} if both its 5'- and 3'-end are within 10bp around the ends
|
\emph{approximate} if both its 5'- and 3'-end are within 10bp around the ends
|
||||||
of an annotated intron.}
|
of an annotated intron. Chimeric alignments are defined in the SAM spec~\citep{Li:2009ys}.}
|
||||||
\end{table}
|
\end{table}
|
||||||
|
|
||||||
We next aligned real mouse reads~\citep{Byrne:2017aa} with GMAP~(v2017-06-20;
|
We next aligned real mouse reads~\citep{Byrne:2017aa} with GMAP~(v2017-06-20;
|
||||||
|
|
@ -687,7 +692,7 @@ involving $>$100kb introns, which was impractically slow ten years ago. The
|
||||||
minimap2 chaining algorithm is fast and highly accurate by itself. In fact,
|
minimap2 chaining algorithm is fast and highly accurate by itself. In fact,
|
||||||
chaining alone is more accurate than all the other long-read mappers in
|
chaining alone is more accurate than all the other long-read mappers in
|
||||||
Fig.~\ref{fig:eval}a (data not shown). This accuracy helps to reduce downstream
|
Fig.~\ref{fig:eval}a (data not shown). This accuracy helps to reduce downstream
|
||||||
base-level alignment of candidate chains, which is still times slower than
|
base-level alignment of candidate chains, which is still several times slower than
|
||||||
chaining even with the Suzuki-Kasahara improvement. In addition, taking a
|
chaining even with the Suzuki-Kasahara improvement. In addition, taking a
|
||||||
general form, minimap2 chaining can be adapted to non-typical data types such as
|
general form, minimap2 chaining can be adapted to non-typical data types such as
|
||||||
spliced reads and multiple reads per fragment. This gives us the opportunity to
|
spliced reads and multiple reads per fragment. This gives us the opportunity to
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue