From d0ac78ac086630af82aa834f86e6edd05b73ef50 Mon Sep 17 00:00:00 2001
From: Heng Li <lh3@me.com>
Date: Thu, 2 Nov 2017 15:37:24 -0400
Subject: [PATCH] updated the tech note

---
 tex/minimap2.bib    |  16 ++++++
 tex/minimap2.tex    | 102 +++++++++++++++++++++++--------------
 tex/mm2-s3.sam.eval | 120 ++++++++++++++++++++++----------------------
 tex/mm2.eval        |  20 +++++---
 4 files changed, 153 insertions(+), 105 deletions(-)

diff --git a/tex/minimap2.bib b/tex/minimap2.bib
index e722516..bd85dc5 100644
--- a/tex/minimap2.bib
+++ b/tex/minimap2.bib
@@ -281,3 +281,19 @@
 	Journal = {arXiv:1111:5572},
 	Title = {Faster and More Accurate Sequence Alignment with SNAP},
 	Year = {2011}}
+
+@article{Irimia:2008aa,
+	Author = {Irimia, Manuel and Roy, Scott William},
+	Journal = {PLoS Genet},
+	Pages = {e1000148},
+	Title = {Evolutionary convergence on highly-conserved 3' intron structures in intron-poor eukaryotes and insights into the ancestral eukaryotic genome},
+	Volume = {4},
+	Year = {2008}}
+
+@article{Depristo:2011vn,
+	Author = {Depristo, Mark A and others},
+	Journal = {Nat Genet},
+	Pages = {491-8},
+	Title = {A framework for variation discovery and genotyping using next-generation {DNA} sequencing data},
+	Volume = {43},
+	Year = {2011}}
diff --git a/tex/minimap2.tex b/tex/minimap2.tex
index e973bf0..b14baac 100644
--- a/tex/minimap2.tex
+++ b/tex/minimap2.tex
@@ -31,10 +31,10 @@
 \section{Motivation:} Recent advances in sequencing technologies promise
 ultra-long reads of $\sim$100 kilo bases (kb) in average, full-length mRNA or
 cDNA reads in high throughput and genomic contigs over 100 mega bases (Mb) in
-length. Existing alignment tools are unable or inefficient to process such data
+length. Existing alignment programs are unable or inefficient to process such data
 at scale, which presses for the development of new alignment algorithms.
 
-\section{Results:} Minimap2 is a general-purpose aligner to map DNA or long
+\section{Results:} Minimap2 is a general-purpose mapper to align DNA or long
 mRNA sequences against a large reference database. It works with accurate short
 reads of $\ge$100bp in length, $\ge$1kb genomic reads at error rate $\sim$15\%,
 full-length noisy Direct RNA or cDNA reads, and assembly contigs or closely
@@ -64,7 +64,7 @@ the thought that 10kb long sequences should be easier to map than 100bp reads
 because we can more effectively skip repetitive regions, which are often the
 bottleneck of short-read alignment. We confirmed our speculation by achieving
 approximate mapping 50 times faster than BWA-MEM~\citep{Li:2016aa}.
-\citet{Suzuki:2016} extended our work with a fast and novel algorithm on
+\citet{Suzuki130633} extended our work with a fast and novel algorithm on
 generating base-level alignment, which in turn inspired us to develop minimap2
 towards higher accuracy and more practical functionality.
 
@@ -179,7 +179,7 @@ where $s(i,j)$ is the score between the $i$-th reference base and $j$-th query
 base. Eq.~(\ref{eq:ae86}) is a natural extension to the equation under affine
 gap cost~\citep{Gotoh:1982aa,Altschul:1986aa}.
 
-\subsubsection{Suzuki's formulation}
+\subsubsection{The Suzuki-Kasahara formulation}
 
 When we allow gaps longer than several hundred base pairs, nucleotide-level
 alignment is much slower than chaining. SSE acceleration is critical to the
@@ -187,7 +187,7 @@ performance of minimap2. Traditional SSE implementations~\citep{Farrar:2007hs}
 based on Eq.~(\ref{eq:ae86}) can achieve 16-way parallelization for short
 sequences, but only 4-way parallelization when the peak alignment score reaches
 32767. Long sequence alignment may exceed this threshold. Inspired by
-\citet{Wu:1996aa} and the following work, \citet{Suzuki:2016} proposed a
+\citet{Wu:1996aa} and the following work, \citet{Suzuki130633} proposed a
 difference-based formulation that lifted this limitation.
 In case of 2-piece gap cost, define
 \[
@@ -337,18 +337,24 @@ F_{i,j+1}= \max\{H_{ij}-q,F_{ij}\}-e\\
 \tilde{E}_{i+1,j}= \max\{H_{ij}-d(i)-\tilde{q},\tilde{E}_{ij}\}\\
 \end{array}\right.
 \end{equation}
-Let $T$ be the reference sequence. $d(i)$ is the cost of a non-canonical donor
-site, which takes 0 if $T[i+1,i+2]={\tt GT}$, or a positive number $p$
-otherwise. Similarly, $a(i)$ is the cost of a non-canonical acceptor site, which
-takes 0 if $T[i-1,i]={\tt AG}$, or $p$ otherwise. Eq.~(\ref{eq:splice}) is
-almost equivalent to the equation used by EXALIN~\citep{Zhang:2006aa} except
-that we allow insertions immediately followed by deletions and vice versa; in
-addition, we use Suzuki's diagonal formulation in actual implementation.
-
-%Given that $d_i$ and $a_i$
-%are a function of the reference sequence, it is possible to incorporate
-%splicing signals with more sophisticated models, such as positional weight
-%matrices. We have not tried this approach.
+Let $T$ be the reference sequence. $d(i)$ is computed as
+\[d(i)=\left\{\begin{array}{ll}
+0 & \mbox{if $T[i+1,i+3]$ is ${\tt GTA}$ or ${\tt GTG}$} \\
+p/2 & \mbox{if $T[i+1,i+3]$ is ${\tt GTC}$ or ${\tt GTT}$} \\
+p & \mbox{otherwise}
+\end{array}\right.\]
+where $T[i,j]$ extracts a substring of $T$ between $i$ and $j$ inclusively.
+$d(i)$ penalizes non-canonical donor sites with $p$ and less frequent Eukayotic
+splicing signal ${\tt GT[C/T]}$ with $p/2$~\citep{Irimia:2008aa}. Similarly,
+\[a(i)=\left\{\begin{array}{ll}
+0 & \mbox{if $T[i-2,i]$ is ${\tt CAG}$ or ${\tt TAG}$} \\
+p/2 & \mbox{if $T[i-2,i]$ is ${\tt AAG}$ or ${\tt GAG}$} \\
+p & \mbox{otherwise}
+\end{array}\right.\]
+models the acceptor signal. Eq.~(\ref{eq:splice}) is close to an equation in
+\citet{Zhang:2006aa} except that we allow insertions immediately followed by
+deletions and vice versa; in addition, we use the Suzuki-Kasahara diagonal
+formulation in actual implementation.
 
 If RNA-seq reads are not sequenced from stranded libraries, the read strand
 relative to the underlying transcript is unknown. By default, minimap2 aligns
@@ -440,16 +446,16 @@ to the 2-piece affine gap cost.
 \subsection{Aligning long spliced reads}
 
 We evaluated minimap2 on SIRV control data~(AC:SRR5286959;
-\citealp{Byrne:2017aa}) where the truth is known. Minimap2 predicted 59\,916
-introns from 11\,017 reads. 93.0\% of splice juctions are precise. We examined
+\citealp{Byrne:2017aa}) where the truth is known. Minimap2 predicted 59\,918
+introns from 11\,018 reads. 93.8\% of splice juctions are precise. We examined
 wrongly predicted junctions and found the majority were caused by clustered
 splicing signals (e.g. two adjacent ${\tt GT}$ sites). When INDEL sequencing
 errors are frequent, it is difficult to find precise splicing sites in this
 case. If we allow up to 10bp distance from true splicing sites, 98.4\% of
-aligned introns are approximately correct.  Given this observation, we might be
-able to improve boundary detection by initializing $d(\cdot)$ and $a(\cdot)$ in
-Eq.~(\ref{eq:splice}) with position-specific scoring matrices or more
-sophisticated models. We have not tried this approach.
+aligned introns are approximately correct. It is worth noting that for SIRV, we
+asked minimap2 to model the ${\tt GT..AG}$ splicing signal only without extra
+bases. This is because SIRV does not honor the evolutionarily prevalent signal
+${\tt GT[A/G]..[C/T]AG}$~\citep{Irimia:2008aa}.
 
 \begin{table}[!tb]
 \processtable{Evaluation of junction accuracy on 2D ONT reads}
@@ -460,13 +466,13 @@ sophisticated models. We have not tried this approach.
 \midrule
 Run time (CPU min)        & 631      & 15.9     & 2\,076   & 33.9 \\
 Peak RAM (GByte)          & 8.9      & 14.5     & 3.2      & 29.2\vspace{1em}\\
-\# aligned reads          & 103\,669 & 104\,200 & 103\,711 & 26\,479 \\
+\# aligned reads          & 103\,669 & 104\,199 & 103\,711 & 26\,479 \\
 \# chimeric alignments    & 1\,904   & 1\,488   & 0        & 0 \\
-\# non-spliced alignments & 15\,854  & 14\,639  & 17\,033  & 10\,545\vspace{1em}\\
-\# aligned introns        & 692\,275 & 694\,103 & 692\,945 & 78\,603 \\
-\# novel introns          & 11\,239  & 3\,207   & 8\,550   & 1\,214 \\
-\% exact introns          & 83.8\%   & 91.7\%   & 87.9\%   & 55.2\% \\
-\% approx. introns        & 91.8\%   & 96.5\%   & 92.5\%   & 82.4\% \\
+\# non-spliced alignments & 15\,854  & 14\,798  & 17\,033  & 10\,545\vspace{1em}\\
+\# aligned introns        & 692\,275 & 693\,553 & 692\,945 & 78\,603 \\
+\# novel introns          & 11\,239  & 3\,113   & 8\,550   & 1\,214 \\
+\% exact introns          & 83.8\%   & 94.0\%   & 87.9\%   & 55.2\% \\
+\% approx. introns        & 91.8\%   & 96.9\%   & 92.5\%   & 82.4\% \\
 \botrule
 \end{tabular}
 }{Mouse reads (AC:SRR5286960) were mapped to the primary assembly of mouse
@@ -487,10 +493,16 @@ STAR~(v2.5.3a; \citealp{Dobin:2013kx}). In general, minimap2 is more
 consistent with existing annotations (Table~\ref{tab:intron}): it finds
 more junctions with a higher percentage being exactly or approximately correct.
 Minimap2 is over 40 times faster than GMAP and SpAln. While STAR is close to
-minimap2 in speed, it does not work well with noisy reads.  We have also
-evaluated spliced aligners on public Iso-Seq data (human Alzheimer brain
-from \href{http://bit.ly/isoseqpub}{http://bit.ly/isoseqpub}). The observation
-is similar: minimap2 is faster at higher junction accuracy.
+minimap2 in speed, it does not work well with noisy reads.
+
+We have also evaluated spliced aligners on public Iso-Seq data (human Alzheimer
+brain from \href{http://bit.ly/isoseqpub}{http://bit.ly/isoseqpub}). The
+observation is similar: minimap2 is faster at higher junction accuracy.
+On a private Nanopore Direct RNA data set with $>$20\% sequencing error rate
+(M\"{u}ller et al, personal communication), minimap2 aligned 940,346 introns
+from 239,976 mapped reads with 88.5\% of them consistent with human gene
+annotations. In comparison, only 40.3\% of GMAP introns found in known gene
+annotations.
 
 We noted that GMAP and SpAln have not been optimized for noisy reads. We are
 showing the best setting we have experimented, but their developers should be
@@ -528,6 +540,20 @@ region close to its mate. If we disable this feature, BWA-MEM becomes slightly
 less accurate than minimap2. We might consider to implement a similar heuristic
 in minimap2 in future.
 
+To evaluate the accuracy of minimap2 on real data, we aligned human reads
+(AC:ERR1341796) with BWA-MEM and minimap2, and called SNPs and small INDELs
+with GATK HaplotypeCaller v3.5~\citep{Depristo:2011vn}. This run was sequenced
+from experimentally mixed CHM1 and CHM13 cell lines. Both them are homozygous
+across the whole genome and have been \emph{de novo} assembled with SMRT reads
+to high quality.  This allowed us to construct an independent truth variant
+data set
+(\href{https://github.com/lh3/CHM-eval}{https://github.com/lh3/CHM-eval}) for
+ERR1341796. In this evaluation, minimap2 has higher SNP false negative rate
+(FNR; 2.5\% of minimap2 vs 2.2\% of BWA-MEM), but fewer false positive SNPs per
+million bases (FPPM; 3.0 vs 3.9), lower INDEL FNR (7.3\% vs 7.5\%) and similar
+INDEL FPPM (both 1.0). The difference between the two mappers is much smaller
+than between BWA-MEM and Bowtie2.
+
 \section{Conclusion}
 
 Minimap2 is a fast, accurate and versatile aligner for long nucleotide
@@ -540,11 +566,11 @@ alignment is an intricate research topic. More thorough evaluations would be
 necessary to justify the use of minimap2 for such applications.
 
 \section*{Acknowledgements}
-We owe a debt of gratitude to Hajime Suzuki for releasing his masterpiece and
-insightful notes before formal publication. We thank M. Schatz, P. Rescheneder
-and F.  Sedlazeck for pointing out the limitation of BWA-MEM. We are also
-grateful to early minimap2 testers who have greatly helped to suggest features
-and to fix various issues.
+We owe a debt of gratitude to H. Suzuki and M. Kasahara for releasing their
+masterpiece and insightful notes before formal publication. We thank M.
+Schatz, P. Rescheneder and F. Sedlazeck for pointing out the limitation of
+BWA-MEM. We are also grateful to early minimap2 testers who have greatly helped
+to suggest features and to fix various issues.
 
 \bibliography{minimap2}
 
diff --git a/tex/mm2-s3.sam.eval b/tex/mm2-s3.sam.eval
index 83b81cb..6732dd2 100644
--- a/tex/mm2-s3.sam.eval
+++ b/tex/mm2-s3.sam.eval
@@ -1,60 +1,62 @@
-Q	60	18345673	8	0.000000436	18345673
-Q	59	33966	4	0.000000653	18379639
-Q	58	34178	1	0.000000706	18413817
-Q	56	49138	1	0.000000758	18462955
-Q	54	22442	4	0.000000974	18485397
-Q	53	19070	2	0.000001081	18504467
-Q	52	14169	3	0.000001242	18518636
-Q	51	13233	4	0.000001457	18531869
-Q	50	12133	2	0.000001564	18544002
-Q	49	11138	4	0.000001778	18555140
-Q	48	11174	8	0.000002208	18566314
-Q	47	17139	4	0.000002422	18583453
-Q	46	20428	10	0.000002956	18603881
-Q	45	16503	3	0.000003115	18620384
-Q	44	11933	6	0.000003435	18632317
-Q	43	25392	11	0.000004020	18657709
-Q	42	16734	9	0.000004498	18674443
-Q	41	13826	10	0.000005030	18688269
-Q	40	13023	10	0.000005561	18701292
-Q	39	12686	10	0.000006092	18713978
-Q	38	17275	4	0.000006300	18731253
-Q	37	17241	2	0.000006401	18748494
-Q	36	12458	12	0.000007036	18760952
-Q	35	11981	5	0.000007298	18772933
-Q	34	12004	11	0.000007879	18784937
-Q	33	12111	7	0.000008246	18797048
-Q	32	11782	9	0.000008719	18808830
-Q	31	11811	7	0.000009086	18820641
-Q	30	33507	32	0.000010767	18854148
-Q	29	11243	21	0.000011874	18865391
-Q	28	10779	17	0.000012767	18876170
-Q	27	15733	24	0.000014027	18891903
-Q	26	16762	40	0.000016130	18908665
-Q	25	13811	49	0.000018708	18922476
-Q	24	14141	46	0.000021123	18936617
-Q	23	13429	55	0.000024010	18950046
-Q	22	13116	26	0.000025365	18963162
-Q	21	13436	46	0.000027771	18976598
-Q	20	13441	55	0.000030648	18990039
-Q	19	12988	53	0.000033416	19003027
-Q	18	13353	51	0.000036074	19016380
-Q	17	13782	77	0.000040094	19030162
-Q	16	14065	94	0.000045001	19044227
-Q	15	14044	124	0.000051474	19058271
-Q	14	14714	140	0.000058774	19072985
-Q	13	17459	197	0.000069040	19090444
-Q	12	17339	259	0.000082532	19107783
-Q	11	17381	280	0.000097097	19125164
-Q	10	17732	295	0.000112418	19142896
-Q	9	17959	416	0.000134023	19160855
-Q	8	18234	530	0.000161530	19179089
-Q	7	19048	514	0.000188143	19198137
-Q	6	19722	656	0.000222085	19217859
-Q	5	19753	775	0.000262143	19237612
-Q	4	19818	1030	0.000315359	19257430
-Q	3	17088	1100	0.000372149	19274518
-Q	2	43045	6708	0.000718569	19317563
-Q	1	126377	25255	0.002012761	19443940
-Q	0	554357	372087	0.020562901	19998297
+Q	60	18579866	27	0.000001453	18579866
+Q	59	27087	4	0.000001666	18606953
+Q	58	21435	1	0.000001718	18628388
+Q	57	45663	3	0.000001874	18674051
+Q	56	36031	2	0.000001978	18710082
+Q	55	18499	2	0.000002082	18728581
+Q	54	14754	2	0.000002187	18743335
+Q	53	25541	2	0.000002291	18768876
+Q	52	26397	5	0.000002554	18795273
+Q	51	15090	3	0.000002711	18810363
+Q	50	13425	11	0.000003294	18823788
+Q	49	15175	2	0.000003397	18838963
+Q	48	19407	4	0.000003606	18858370
+Q	47	11538	16	0.000004452	18869908
+Q	46	12558	17	0.000005349	18882466
+Q	45	40362	28	0.000006817	18922828
+Q	44	10465	13	0.000007500	18933293
+Q	43	10098	20	0.000008552	18943391
+Q	42	10682	19	0.000009549	18954073
+Q	41	9823	11	0.000010125	18963896
+Q	40	9685	16	0.000010963	18973581
+Q	39	10273	18	0.000011905	18983854
+Q	38	9515	18	0.000012847	18993369
+Q	37	9474	27	0.000014261	19002843
+Q	36	10430	25	0.000015568	19013273
+Q	35	9241	34	0.000017348	19022514
+Q	34	9162	31	0.000018968	19031676
+Q	33	10164	49	0.000021532	19041840
+Q	32	9152	55	0.000024408	19050992
+Q	31	9252	35	0.000026233	19060244
+Q	30	9872	55	0.000029103	19070116
+Q	29	8938	65	0.000032496	19079054
+Q	28	8951	73	0.000036306	19088005
+Q	27	9949	95	0.000041261	19097954
+Q	26	9784	97	0.000046316	19107738
+Q	25	10126	97	0.000051366	19117864
+Q	24	11260	123	0.000057765	19129124
+Q	23	10047	114	0.000063691	19139171
+Q	22	9661	123	0.000070083	19148832
+Q	21	10339	168	0.000078813	19159171
+Q	20	17928	193	0.000088804	19177099
+Q	19	9842	193	0.000098817	19186941
+Q	18	14737	247	0.000111605	19201678
+Q	17	10218	238	0.000123934	19211896
+Q	16	10271	242	0.000136457	19222167
+Q	15	12241	333	0.000153683	19234408
+Q	14	9189	336	0.000171070	19243597
+Q	13	9493	515	0.000197734	19253090
+Q	12	11502	743	0.000236185	19264592
+Q	11	8211	507	0.000262390	19272803
+Q	10	9133	606	0.000293695	19281936
+Q	9	10014	931	0.000341801	19291950
+Q	8	8436	698	0.000377816	19300386
+Q	7	8443	705	0.000414163	19308829
+Q	6	10203	944	0.000462808	19319032
+Q	5	6936	756	0.000501760	19325968
+Q	4	6732	843	0.000545190	19332700
+Q	3	8215	1104	0.000602040	19340915
+Q	2	21201	5440	0.000882342	19362116
+Q	1	82328	22186	0.002019600	19444444
+Q	0	553853	371953	0.020562901	19998297
 U	1703
diff --git a/tex/mm2.eval b/tex/mm2.eval
index 405d4e9..38736f2 100644
--- a/tex/mm2.eval
+++ b/tex/mm2.eval
@@ -1,9 +1,13 @@
-Q	60	32226	0	0.000000000	32226
-Q	20	267	1	0.000030776	32493
-Q	10	34	1	0.000061487	32527
-Q	9	118	1	0.000091898	32645
-Q	5	27	2	0.000153036	32672
-Q	4	68	2	0.000213806	32740
-Q	1	314	101	0.003267381	33054
+Q	60	32477	0	0.000000000	32477
+Q	22	16	1	0.000030776	32493
+Q	21	44	1	0.000061468	32537
+Q	19	73	1	0.000091996	32610
+Q	14	66	1	0.000122414	32676
+Q	10	26	3	0.000214054	32702
+Q	8	14	1	0.000244529	32716
+Q	7	13	2	0.000305539	32729
+Q	6	47	1	0.000335611	32776
+Q	3	10	1	0.000366010	32786
+Q	2	20	2	0.000426751	32806
+Q	1	248	94	0.003267381	33054
 Q	0	31	17	0.003778147	33085
-U	3