From d0ac78ac086630af82aa834f86e6edd05b73ef50 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 2 Nov 2017 15:37:24 -0400 Subject: [PATCH] updated the tech note --- tex/minimap2.bib | 16 ++++++ tex/minimap2.tex | 102 +++++++++++++++++++++++-------------- tex/mm2-s3.sam.eval | 120 ++++++++++++++++++++++---------------------- tex/mm2.eval | 20 +++++--- 4 files changed, 153 insertions(+), 105 deletions(-) diff --git a/tex/minimap2.bib b/tex/minimap2.bib index e722516..bd85dc5 100644 --- a/tex/minimap2.bib +++ b/tex/minimap2.bib @@ -281,3 +281,19 @@ Journal = {arXiv:1111:5572}, Title = {Faster and More Accurate Sequence Alignment with SNAP}, Year = {2011}} + +@article{Irimia:2008aa, + Author = {Irimia, Manuel and Roy, Scott William}, + Journal = {PLoS Genet}, + Pages = {e1000148}, + Title = {Evolutionary convergence on highly-conserved 3' intron structures in intron-poor eukaryotes and insights into the ancestral eukaryotic genome}, + Volume = {4}, + Year = {2008}} + +@article{Depristo:2011vn, + Author = {Depristo, Mark A and others}, + Journal = {Nat Genet}, + Pages = {491-8}, + Title = {A framework for variation discovery and genotyping using next-generation {DNA} sequencing data}, + Volume = {43}, + Year = {2011}} diff --git a/tex/minimap2.tex b/tex/minimap2.tex index e973bf0..b14baac 100644 --- a/tex/minimap2.tex +++ b/tex/minimap2.tex @@ -31,10 +31,10 @@ \section{Motivation:} Recent advances in sequencing technologies promise ultra-long reads of $\sim$100 kilo bases (kb) in average, full-length mRNA or cDNA reads in high throughput and genomic contigs over 100 mega bases (Mb) in -length. Existing alignment tools are unable or inefficient to process such data +length. Existing alignment programs are unable or inefficient to process such data at scale, which presses for the development of new alignment algorithms. -\section{Results:} Minimap2 is a general-purpose aligner to map DNA or long +\section{Results:} Minimap2 is a general-purpose mapper to align DNA or long mRNA sequences against a large reference database. It works with accurate short reads of $\ge$100bp in length, $\ge$1kb genomic reads at error rate $\sim$15\%, full-length noisy Direct RNA or cDNA reads, and assembly contigs or closely @@ -64,7 +64,7 @@ the thought that 10kb long sequences should be easier to map than 100bp reads because we can more effectively skip repetitive regions, which are often the bottleneck of short-read alignment. We confirmed our speculation by achieving approximate mapping 50 times faster than BWA-MEM~\citep{Li:2016aa}. -\citet{Suzuki:2016} extended our work with a fast and novel algorithm on +\citet{Suzuki130633} extended our work with a fast and novel algorithm on generating base-level alignment, which in turn inspired us to develop minimap2 towards higher accuracy and more practical functionality. @@ -179,7 +179,7 @@ where $s(i,j)$ is the score between the $i$-th reference base and $j$-th query base. Eq.~(\ref{eq:ae86}) is a natural extension to the equation under affine gap cost~\citep{Gotoh:1982aa,Altschul:1986aa}. -\subsubsection{Suzuki's formulation} +\subsubsection{The Suzuki-Kasahara formulation} When we allow gaps longer than several hundred base pairs, nucleotide-level alignment is much slower than chaining. SSE acceleration is critical to the @@ -187,7 +187,7 @@ performance of minimap2. Traditional SSE implementations~\citep{Farrar:2007hs} based on Eq.~(\ref{eq:ae86}) can achieve 16-way parallelization for short sequences, but only 4-way parallelization when the peak alignment score reaches 32767. Long sequence alignment may exceed this threshold. Inspired by -\citet{Wu:1996aa} and the following work, \citet{Suzuki:2016} proposed a +\citet{Wu:1996aa} and the following work, \citet{Suzuki130633} proposed a difference-based formulation that lifted this limitation. In case of 2-piece gap cost, define \[ @@ -337,18 +337,24 @@ F_{i,j+1}= \max\{H_{ij}-q,F_{ij}\}-e\\ \tilde{E}_{i+1,j}= \max\{H_{ij}-d(i)-\tilde{q},\tilde{E}_{ij}\}\\ \end{array}\right. \end{equation} -Let $T$ be the reference sequence. $d(i)$ is the cost of a non-canonical donor -site, which takes 0 if $T[i+1,i+2]={\tt GT}$, or a positive number $p$ -otherwise. Similarly, $a(i)$ is the cost of a non-canonical acceptor site, which -takes 0 if $T[i-1,i]={\tt AG}$, or $p$ otherwise. Eq.~(\ref{eq:splice}) is -almost equivalent to the equation used by EXALIN~\citep{Zhang:2006aa} except -that we allow insertions immediately followed by deletions and vice versa; in -addition, we use Suzuki's diagonal formulation in actual implementation. - -%Given that $d_i$ and $a_i$ -%are a function of the reference sequence, it is possible to incorporate -%splicing signals with more sophisticated models, such as positional weight -%matrices. We have not tried this approach. +Let $T$ be the reference sequence. $d(i)$ is computed as +\[d(i)=\left\{\begin{array}{ll} +0 & \mbox{if $T[i+1,i+3]$ is ${\tt GTA}$ or ${\tt GTG}$} \\ +p/2 & \mbox{if $T[i+1,i+3]$ is ${\tt GTC}$ or ${\tt GTT}$} \\ +p & \mbox{otherwise} +\end{array}\right.\] +where $T[i,j]$ extracts a substring of $T$ between $i$ and $j$ inclusively. +$d(i)$ penalizes non-canonical donor sites with $p$ and less frequent Eukayotic +splicing signal ${\tt GT[C/T]}$ with $p/2$~\citep{Irimia:2008aa}. Similarly, +\[a(i)=\left\{\begin{array}{ll} +0 & \mbox{if $T[i-2,i]$ is ${\tt CAG}$ or ${\tt TAG}$} \\ +p/2 & \mbox{if $T[i-2,i]$ is ${\tt AAG}$ or ${\tt GAG}$} \\ +p & \mbox{otherwise} +\end{array}\right.\] +models the acceptor signal. Eq.~(\ref{eq:splice}) is close to an equation in +\citet{Zhang:2006aa} except that we allow insertions immediately followed by +deletions and vice versa; in addition, we use the Suzuki-Kasahara diagonal +formulation in actual implementation. If RNA-seq reads are not sequenced from stranded libraries, the read strand relative to the underlying transcript is unknown. By default, minimap2 aligns @@ -440,16 +446,16 @@ to the 2-piece affine gap cost. \subsection{Aligning long spliced reads} We evaluated minimap2 on SIRV control data~(AC:SRR5286959; -\citealp{Byrne:2017aa}) where the truth is known. Minimap2 predicted 59\,916 -introns from 11\,017 reads. 93.0\% of splice juctions are precise. We examined +\citealp{Byrne:2017aa}) where the truth is known. Minimap2 predicted 59\,918 +introns from 11\,018 reads. 93.8\% of splice juctions are precise. We examined wrongly predicted junctions and found the majority were caused by clustered splicing signals (e.g. two adjacent ${\tt GT}$ sites). When INDEL sequencing errors are frequent, it is difficult to find precise splicing sites in this case. If we allow up to 10bp distance from true splicing sites, 98.4\% of -aligned introns are approximately correct. Given this observation, we might be -able to improve boundary detection by initializing $d(\cdot)$ and $a(\cdot)$ in -Eq.~(\ref{eq:splice}) with position-specific scoring matrices or more -sophisticated models. We have not tried this approach. +aligned introns are approximately correct. It is worth noting that for SIRV, we +asked minimap2 to model the ${\tt GT..AG}$ splicing signal only without extra +bases. This is because SIRV does not honor the evolutionarily prevalent signal +${\tt GT[A/G]..[C/T]AG}$~\citep{Irimia:2008aa}. \begin{table}[!tb] \processtable{Evaluation of junction accuracy on 2D ONT reads} @@ -460,13 +466,13 @@ sophisticated models. We have not tried this approach. \midrule Run time (CPU min) & 631 & 15.9 & 2\,076 & 33.9 \\ Peak RAM (GByte) & 8.9 & 14.5 & 3.2 & 29.2\vspace{1em}\\ -\# aligned reads & 103\,669 & 104\,200 & 103\,711 & 26\,479 \\ +\# aligned reads & 103\,669 & 104\,199 & 103\,711 & 26\,479 \\ \# chimeric alignments & 1\,904 & 1\,488 & 0 & 0 \\ -\# non-spliced alignments & 15\,854 & 14\,639 & 17\,033 & 10\,545\vspace{1em}\\ -\# aligned introns & 692\,275 & 694\,103 & 692\,945 & 78\,603 \\ -\# novel introns & 11\,239 & 3\,207 & 8\,550 & 1\,214 \\ -\% exact introns & 83.8\% & 91.7\% & 87.9\% & 55.2\% \\ -\% approx. introns & 91.8\% & 96.5\% & 92.5\% & 82.4\% \\ +\# non-spliced alignments & 15\,854 & 14\,798 & 17\,033 & 10\,545\vspace{1em}\\ +\# aligned introns & 692\,275 & 693\,553 & 692\,945 & 78\,603 \\ +\# novel introns & 11\,239 & 3\,113 & 8\,550 & 1\,214 \\ +\% exact introns & 83.8\% & 94.0\% & 87.9\% & 55.2\% \\ +\% approx. introns & 91.8\% & 96.9\% & 92.5\% & 82.4\% \\ \botrule \end{tabular} }{Mouse reads (AC:SRR5286960) were mapped to the primary assembly of mouse @@ -487,10 +493,16 @@ STAR~(v2.5.3a; \citealp{Dobin:2013kx}). In general, minimap2 is more consistent with existing annotations (Table~\ref{tab:intron}): it finds more junctions with a higher percentage being exactly or approximately correct. Minimap2 is over 40 times faster than GMAP and SpAln. While STAR is close to -minimap2 in speed, it does not work well with noisy reads. We have also -evaluated spliced aligners on public Iso-Seq data (human Alzheimer brain -from \href{http://bit.ly/isoseqpub}{http://bit.ly/isoseqpub}). The observation -is similar: minimap2 is faster at higher junction accuracy. +minimap2 in speed, it does not work well with noisy reads. + +We have also evaluated spliced aligners on public Iso-Seq data (human Alzheimer +brain from \href{http://bit.ly/isoseqpub}{http://bit.ly/isoseqpub}). The +observation is similar: minimap2 is faster at higher junction accuracy. +On a private Nanopore Direct RNA data set with $>$20\% sequencing error rate +(M\"{u}ller et al, personal communication), minimap2 aligned 940,346 introns +from 239,976 mapped reads with 88.5\% of them consistent with human gene +annotations. In comparison, only 40.3\% of GMAP introns found in known gene +annotations. We noted that GMAP and SpAln have not been optimized for noisy reads. We are showing the best setting we have experimented, but their developers should be @@ -528,6 +540,20 @@ region close to its mate. If we disable this feature, BWA-MEM becomes slightly less accurate than minimap2. We might consider to implement a similar heuristic in minimap2 in future. +To evaluate the accuracy of minimap2 on real data, we aligned human reads +(AC:ERR1341796) with BWA-MEM and minimap2, and called SNPs and small INDELs +with GATK HaplotypeCaller v3.5~\citep{Depristo:2011vn}. This run was sequenced +from experimentally mixed CHM1 and CHM13 cell lines. Both them are homozygous +across the whole genome and have been \emph{de novo} assembled with SMRT reads +to high quality. This allowed us to construct an independent truth variant +data set +(\href{https://github.com/lh3/CHM-eval}{https://github.com/lh3/CHM-eval}) for +ERR1341796. In this evaluation, minimap2 has higher SNP false negative rate +(FNR; 2.5\% of minimap2 vs 2.2\% of BWA-MEM), but fewer false positive SNPs per +million bases (FPPM; 3.0 vs 3.9), lower INDEL FNR (7.3\% vs 7.5\%) and similar +INDEL FPPM (both 1.0). The difference between the two mappers is much smaller +than between BWA-MEM and Bowtie2. + \section{Conclusion} Minimap2 is a fast, accurate and versatile aligner for long nucleotide @@ -540,11 +566,11 @@ alignment is an intricate research topic. More thorough evaluations would be necessary to justify the use of minimap2 for such applications. \section*{Acknowledgements} -We owe a debt of gratitude to Hajime Suzuki for releasing his masterpiece and -insightful notes before formal publication. We thank M. Schatz, P. Rescheneder -and F. Sedlazeck for pointing out the limitation of BWA-MEM. We are also -grateful to early minimap2 testers who have greatly helped to suggest features -and to fix various issues. +We owe a debt of gratitude to H. Suzuki and M. Kasahara for releasing their +masterpiece and insightful notes before formal publication. We thank M. +Schatz, P. Rescheneder and F. Sedlazeck for pointing out the limitation of +BWA-MEM. We are also grateful to early minimap2 testers who have greatly helped +to suggest features and to fix various issues. \bibliography{minimap2} diff --git a/tex/mm2-s3.sam.eval b/tex/mm2-s3.sam.eval index 83b81cb..6732dd2 100644 --- a/tex/mm2-s3.sam.eval +++ b/tex/mm2-s3.sam.eval @@ -1,60 +1,62 @@ -Q 60 18345673 8 0.000000436 18345673 -Q 59 33966 4 0.000000653 18379639 -Q 58 34178 1 0.000000706 18413817 -Q 56 49138 1 0.000000758 18462955 -Q 54 22442 4 0.000000974 18485397 -Q 53 19070 2 0.000001081 18504467 -Q 52 14169 3 0.000001242 18518636 -Q 51 13233 4 0.000001457 18531869 -Q 50 12133 2 0.000001564 18544002 -Q 49 11138 4 0.000001778 18555140 -Q 48 11174 8 0.000002208 18566314 -Q 47 17139 4 0.000002422 18583453 -Q 46 20428 10 0.000002956 18603881 -Q 45 16503 3 0.000003115 18620384 -Q 44 11933 6 0.000003435 18632317 -Q 43 25392 11 0.000004020 18657709 -Q 42 16734 9 0.000004498 18674443 -Q 41 13826 10 0.000005030 18688269 -Q 40 13023 10 0.000005561 18701292 -Q 39 12686 10 0.000006092 18713978 -Q 38 17275 4 0.000006300 18731253 -Q 37 17241 2 0.000006401 18748494 -Q 36 12458 12 0.000007036 18760952 -Q 35 11981 5 0.000007298 18772933 -Q 34 12004 11 0.000007879 18784937 -Q 33 12111 7 0.000008246 18797048 -Q 32 11782 9 0.000008719 18808830 -Q 31 11811 7 0.000009086 18820641 -Q 30 33507 32 0.000010767 18854148 -Q 29 11243 21 0.000011874 18865391 -Q 28 10779 17 0.000012767 18876170 -Q 27 15733 24 0.000014027 18891903 -Q 26 16762 40 0.000016130 18908665 -Q 25 13811 49 0.000018708 18922476 -Q 24 14141 46 0.000021123 18936617 -Q 23 13429 55 0.000024010 18950046 -Q 22 13116 26 0.000025365 18963162 -Q 21 13436 46 0.000027771 18976598 -Q 20 13441 55 0.000030648 18990039 -Q 19 12988 53 0.000033416 19003027 -Q 18 13353 51 0.000036074 19016380 -Q 17 13782 77 0.000040094 19030162 -Q 16 14065 94 0.000045001 19044227 -Q 15 14044 124 0.000051474 19058271 -Q 14 14714 140 0.000058774 19072985 -Q 13 17459 197 0.000069040 19090444 -Q 12 17339 259 0.000082532 19107783 -Q 11 17381 280 0.000097097 19125164 -Q 10 17732 295 0.000112418 19142896 -Q 9 17959 416 0.000134023 19160855 -Q 8 18234 530 0.000161530 19179089 -Q 7 19048 514 0.000188143 19198137 -Q 6 19722 656 0.000222085 19217859 -Q 5 19753 775 0.000262143 19237612 -Q 4 19818 1030 0.000315359 19257430 -Q 3 17088 1100 0.000372149 19274518 -Q 2 43045 6708 0.000718569 19317563 -Q 1 126377 25255 0.002012761 19443940 -Q 0 554357 372087 0.020562901 19998297 +Q 60 18579866 27 0.000001453 18579866 +Q 59 27087 4 0.000001666 18606953 +Q 58 21435 1 0.000001718 18628388 +Q 57 45663 3 0.000001874 18674051 +Q 56 36031 2 0.000001978 18710082 +Q 55 18499 2 0.000002082 18728581 +Q 54 14754 2 0.000002187 18743335 +Q 53 25541 2 0.000002291 18768876 +Q 52 26397 5 0.000002554 18795273 +Q 51 15090 3 0.000002711 18810363 +Q 50 13425 11 0.000003294 18823788 +Q 49 15175 2 0.000003397 18838963 +Q 48 19407 4 0.000003606 18858370 +Q 47 11538 16 0.000004452 18869908 +Q 46 12558 17 0.000005349 18882466 +Q 45 40362 28 0.000006817 18922828 +Q 44 10465 13 0.000007500 18933293 +Q 43 10098 20 0.000008552 18943391 +Q 42 10682 19 0.000009549 18954073 +Q 41 9823 11 0.000010125 18963896 +Q 40 9685 16 0.000010963 18973581 +Q 39 10273 18 0.000011905 18983854 +Q 38 9515 18 0.000012847 18993369 +Q 37 9474 27 0.000014261 19002843 +Q 36 10430 25 0.000015568 19013273 +Q 35 9241 34 0.000017348 19022514 +Q 34 9162 31 0.000018968 19031676 +Q 33 10164 49 0.000021532 19041840 +Q 32 9152 55 0.000024408 19050992 +Q 31 9252 35 0.000026233 19060244 +Q 30 9872 55 0.000029103 19070116 +Q 29 8938 65 0.000032496 19079054 +Q 28 8951 73 0.000036306 19088005 +Q 27 9949 95 0.000041261 19097954 +Q 26 9784 97 0.000046316 19107738 +Q 25 10126 97 0.000051366 19117864 +Q 24 11260 123 0.000057765 19129124 +Q 23 10047 114 0.000063691 19139171 +Q 22 9661 123 0.000070083 19148832 +Q 21 10339 168 0.000078813 19159171 +Q 20 17928 193 0.000088804 19177099 +Q 19 9842 193 0.000098817 19186941 +Q 18 14737 247 0.000111605 19201678 +Q 17 10218 238 0.000123934 19211896 +Q 16 10271 242 0.000136457 19222167 +Q 15 12241 333 0.000153683 19234408 +Q 14 9189 336 0.000171070 19243597 +Q 13 9493 515 0.000197734 19253090 +Q 12 11502 743 0.000236185 19264592 +Q 11 8211 507 0.000262390 19272803 +Q 10 9133 606 0.000293695 19281936 +Q 9 10014 931 0.000341801 19291950 +Q 8 8436 698 0.000377816 19300386 +Q 7 8443 705 0.000414163 19308829 +Q 6 10203 944 0.000462808 19319032 +Q 5 6936 756 0.000501760 19325968 +Q 4 6732 843 0.000545190 19332700 +Q 3 8215 1104 0.000602040 19340915 +Q 2 21201 5440 0.000882342 19362116 +Q 1 82328 22186 0.002019600 19444444 +Q 0 553853 371953 0.020562901 19998297 U 1703 diff --git a/tex/mm2.eval b/tex/mm2.eval index 405d4e9..38736f2 100644 --- a/tex/mm2.eval +++ b/tex/mm2.eval @@ -1,9 +1,13 @@ -Q 60 32226 0 0.000000000 32226 -Q 20 267 1 0.000030776 32493 -Q 10 34 1 0.000061487 32527 -Q 9 118 1 0.000091898 32645 -Q 5 27 2 0.000153036 32672 -Q 4 68 2 0.000213806 32740 -Q 1 314 101 0.003267381 33054 +Q 60 32477 0 0.000000000 32477 +Q 22 16 1 0.000030776 32493 +Q 21 44 1 0.000061468 32537 +Q 19 73 1 0.000091996 32610 +Q 14 66 1 0.000122414 32676 +Q 10 26 3 0.000214054 32702 +Q 8 14 1 0.000244529 32716 +Q 7 13 2 0.000305539 32729 +Q 6 47 1 0.000335611 32776 +Q 3 10 1 0.000366010 32786 +Q 2 20 2 0.000426751 32806 +Q 1 248 94 0.003267381 33054 Q 0 31 17 0.003778147 33085 -U 3