This commit is contained in:
Heng Li 2017-08-20 22:10:03 +08:00
parent ce859dbe1c
commit 19e4b2aab0
2 changed files with 85 additions and 0 deletions

View File

@ -195,3 +195,35 @@
Title = {Optimal sequence alignment using affine gap costs},
Volume = {48},
Year = {1986}}
@article{Wu:2005vn,
Author = {Wu, Thomas D and Watanabe, Colin K},
Journal = {Bioinformatics},
Pages = {1859-75},
Title = {{GMAP}: a genomic mapping and alignment program for {mRNA} and {EST} sequences},
Volume = {21},
Year = {2005}}
@article{Iwata:2012aa,
Author = {Iwata, Hiroaki and Gotoh, Osamu},
Journal = {Nucleic Acids Res},
Pages = {e161},
Title = {Benchmarking spliced alignment programs including {Spaln2}, an extended version of {Spaln} that incorporates additional species-specific features},
Volume = {40},
Year = {2012}}
@article{Dobin:2013kx,
Author = {Dobin, Alexander and others},
Journal = {Bioinformatics},
Pages = {15-21},
Title = {{STAR}: ultrafast universal {RNA-seq} aligner},
Volume = {29},
Year = {2013}}
@article{Byrne:2017aa,
Author = {Byrne, Ashley and others},
Journal = {Nat Commun},
Pages = {16027},
Title = {Nanopore long-read {RNAseq} reveals widespread transcriptional variation among the surface receptors of individual {B} cells},
Volume = {8},
Year = {2017}}

View File

@ -316,6 +316,9 @@ alignment.
\end{methods}
\section{Results}
\subsection{Aligning genomic reads}
\begin{figure}[!tb]
\centering
\includegraphics[width=.5\textwidth]{roc-color.pdf}
@ -358,6 +361,56 @@ $\ge$100bp INDELs in IGV~\citep{Robinson:2011aa} and can confirm the
observation by~\citet{Sedlazeck169557} that BWA-MEM often breaks them into
shorter gaps. Minimap2 does not have this issue.
\subsection{Aligning spliced reads}
\begin{table}[!tb]
\processtable{Exon-level evaluation of 2D ONT reads from mouse}
{\footnotesize\label{tab:exon}
\begin{tabular}{p{3.1cm}rrrr}
\toprule
& GMAP & minimap2 & SpAln & STAR\\
\midrule
Run time (CPU min) & 631 & 15.5 & 2\,076 & 33.9 \\
Peak RAM (GByte) & 8.9 & 14.5 & 3.2 & 29.2\vspace{1em}\\
\# aligned reads & 103\,669 & 103\,917 & 103\,711 & 26\,479\\
\# chimeric alignments & 1\,904 & 1\,671 & 0 & 0\\
\# non-spliced alignments & 15\,854 & 14\,483 & 17\,033 & 10\,545\vspace{1em}\\
\# aligned introns & 692\,275 & 694\,237 & 692\,945 & 78\,603 \\
\# novel introns & 11\,239 & 3\,217 & 8\,550 & 1\,214 \\
\% exact introns & 83.8\% & 91.8\% & 87.9\% & 55.2\% \\
\% approx.\ introns & 91.8\% & 96.5\% & 92.5\% & 82.4\% \\
\botrule
\end{tabular}
}{Reads (AC:SRR5286960) were mapped to the primary assembly of mouse genome
GRCm38 with the following tools and command options: minimap2 (`-ax splice');
GMAP (`-n 0 --min-intronlength 30 --cross-species'); SpAln (`-Q7 -LS -S3');
STARlong (according to
\href{http://bit.ly/star-pb}{http://bit.ly/star-pb}). The alignments were
compared to the EnsEMBL gene annotation, release 89. A predicted intron
is \emph{novel} if it has no overlaps with any annotated introns. An intron
is \emph{exact} if it is identical to an annotated intron. An intron is
\emph{approximate} if both its 5'- and 3'-ends are within 10bp of those of an
annotated intron.}
\end{table}
We evaluated minimap2 along with GMAP~(v2017-06-20; \citealp{Wu:2005vn}),
SpAln~(v2.3.1; \citealp{Iwata:2012aa}) and STAR~(v2.5.3a;
\citealp{Dobin:2013kx}) on real RNA-seq reads~\citep{Byrne:2017aa}.
In general, minimap2 is more consistent with existing annotations
(Table~\ref{tab:exon}). It finds more annotated spliced exons and predicts
fewer novel exons. Most novel exons identified by GMAP and SpAln are
very short, partly because the two aligners implement special routines to
identify micro-exons. It should be possible to optimize GMAP and SpAln on this
data set to reduce such errors. On run time, minimap2 is over 40 times faster
than GMAP and SpAln. While STAR is close to minimap2 in speed, it does not work
well with noisy reads.
We have also run aligners on the SIRV spike-in control data (AC:SRR5286959;
\citealp{Byrne:2017aa}) where the truth is known. Minimap2 is still the most
accurate. 91.9\% of internal exons in the minimap2 alignment are exact.
The percentage increases to 97.4\% if we allow up to 10bp around the splicing
boundaries. The difference between the two percentages is mostly caused by
\section{Discussions}
Minialign and minimap2 are fast because a) with chaining, they can quickly