From 19e4b2aab0bbc01c646af1ca8ea37229e87447c6 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 20 Aug 2017 22:10:03 +0800 Subject: [PATCH] backup --- tex/minimap2.bib | 32 +++++++++++++++++++++++++++++ tex/minimap2.tex | 53 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+) diff --git a/tex/minimap2.bib b/tex/minimap2.bib index 3ca3e85..53fd4e2 100644 --- a/tex/minimap2.bib +++ b/tex/minimap2.bib @@ -195,3 +195,35 @@ Title = {Optimal sequence alignment using affine gap costs}, Volume = {48}, Year = {1986}} + +@article{Wu:2005vn, + Author = {Wu, Thomas D and Watanabe, Colin K}, + Journal = {Bioinformatics}, + Pages = {1859-75}, + Title = {{GMAP}: a genomic mapping and alignment program for {mRNA} and {EST} sequences}, + Volume = {21}, + Year = {2005}} + +@article{Iwata:2012aa, + Author = {Iwata, Hiroaki and Gotoh, Osamu}, + Journal = {Nucleic Acids Res}, + Pages = {e161}, + Title = {Benchmarking spliced alignment programs including {Spaln2}, an extended version of {Spaln} that incorporates additional species-specific features}, + Volume = {40}, + Year = {2012}} + +@article{Dobin:2013kx, + Author = {Dobin, Alexander and others}, + Journal = {Bioinformatics}, + Pages = {15-21}, + Title = {{STAR}: ultrafast universal {RNA-seq} aligner}, + Volume = {29}, + Year = {2013}} + +@article{Byrne:2017aa, + Author = {Byrne, Ashley and others}, + Journal = {Nat Commun}, + Pages = {16027}, + Title = {Nanopore long-read {RNAseq} reveals widespread transcriptional variation among the surface receptors of individual {B} cells}, + Volume = {8}, + Year = {2017}} diff --git a/tex/minimap2.tex b/tex/minimap2.tex index f66aa58..1081763 100644 --- a/tex/minimap2.tex +++ b/tex/minimap2.tex @@ -316,6 +316,9 @@ alignment. 
\end{methods} \section{Results} + +\subsection{Aligning genomic reads} + \begin{figure}[!tb] \centering \includegraphics[width=.5\textwidth]{roc-color.pdf} @@ -358,6 +361,56 @@ $\ge$100bp INDELs in IGV~\citep{Robinson:2011aa} and can confirm the observation by~\citet{Sedlazeck169557} that BWA-MEM often breaks them into shorter gaps. Minimap2 does not have this issue. +\subsection{Aligning spliced reads} + +\begin{table}[!tb] +\processtable{Exon-level evaluation of 2D ONT reads from mouse} +{\footnotesize\label{tab:exon} +\begin{tabular}{p{3.1cm}rrrr} +\toprule +& GMAP & minimap2 & SpAln & STAR\\ +\midrule +Run time (CPU min) & 631 & 15.5 & 2\,076 & 33.9 \\ +Peak RAM (GByte) & 8.9 & 14.5 & 3.2 & 29.2\vspace{1em}\\ +\# aligned reads & 103\,669 & 103\,917 & 103\,711 & 26\,479\\ +\# chimeric alignments & 1\,904 & 1\,671 & 0 & 0\\ +\# non-spliced alignments & 15\,854 & 14\,483 & 17\,033 & 10\,545\vspace{1em}\\ +\# aligned introns & 692\,275 & 694\,237 & 692\,945 & 78\,603 \\ +\# novel introns & 11\,239 & 3\,217 & 8\,550 & 1\,214 \\ +\% exact introns & 83.8\% & 91.8\% & 87.9\% & 55.2\% \\ +\% approx. introns & 91.8\% & 96.5\% & 92.5\% & 82.4\% \\ +\botrule +\end{tabular} +}{Reads (AC:SRR5286960) were mapped to the primary assembly of mouse genome +GRCm38 with the following tools and command options: minimap2 (`-ax splice'); +GMAP (`-n 0 --min-intronlength 30 --cross-species'); SpAln (`-Q7 -LS -S3'); +STARlong (according to +\href{http://bit.ly/star-pb}{http://bit.ly/star-pb}). The alignments were +compared to the EnsEMBL gene annotation, release 89. A predicted intron +is \emph{novel} if it has no overlaps with any annotated introns. An intron +is \emph{exact} if it is identical to an annotated intron. 
An intron is +\emph{approximate} if both of its 5'- and 3'-ends are within 10bp around an +annotated intron.} +\end{table} + +We evaluated minimap2 along with GMAP~(v2017-06-20; \citealp{Wu:2005vn}), +SpAln~(v2.3.1; \citealp{Iwata:2012aa}) and STAR~(v2.5.3a; +\citealp{Dobin:2013kx}) on real RNA-seq reads~\citep{Byrne:2017aa}. +In general, minimap2 is more consistent with existing annotations +(Table~\ref{tab:exon}). It finds more annotated spliced exons and predicts +fewer novel exons. Most novel exons identified by GMAP and SpAln are +very short, partly because the two aligners implement special routines to +identify micro-exons. It should be possible to optimize GMAP and SpAln on this +data set to reduce such errors. On run time, minimap2 is over 40 times faster +than GMAP and SpAln. While STAR is close to minimap2 in speed, it does not work +well with noisy reads. + +We have also run aligners on the SIRV spike-in control data (AC:SRR5286959; +\citealp{Byrne:2017aa}) where the truth is known. Minimap2 is still the most +accurate. 91.9\% of internal exons in the minimap2 alignment are exact. +The percentage increases to 97.4\% if we allow up to 10bp around the splicing +boundaries. The difference between the two percentages is mostly caused by + \section{Discussions} Minialign and minimap2 are fast because a) with chaining, they can quickly