diff --git a/tex/minimap2.bib b/tex/minimap2.bib index 0fb22a7..27f4149 100644 --- a/tex/minimap2.bib +++ b/tex/minimap2.bib @@ -338,3 +338,96 @@ Title = {Introducing difference recurrence relations for faster semi-global alignment of long sequences}, Volume = {19}, Year = {2018}} + +@article{Li:2018ab, + Author = {Li, Heng}, + Journal = {Bioinformatics}, + Pages = {3094-3100}, + Title = {Minimap2: pairwise alignment for nucleotide sequences}, + Volume = {34}, + Year = {2018}} + +@article{Jain:2020aa, + Author = {Jain, Chirag and others}, + Journal = {Bioinformatics}, + Pages = {i111-i118}, + Title = {Weighted minimizer sampling improves long read mapping}, + Volume = {36}, + Year = {2020}} + +@article{Miga:2020aa, + Author = {Miga, Karen H and others}, + Journal = {Nature}, + Pages = {79-84}, + Title = {Telomere-to-telomere assembly of a complete human {X} chromosome}, + Volume = {585}, + Year = {2020}} + +@article {Jain2020.11.01.363887, + author = {Jain, Chirag and others}, + title = {A long read mapping method for highly repetitive reference sequences}, + elocation-id = {2020.11.01.363887}, + year = {2020}, + doi = {10.1101/2020.11.01.363887}, + publisher = {Cold Spring Harbor Laboratory}, + abstract = {About 5-10\% of the human genome remains inaccessible for functional analysis due to the presence of repetitive sequences such as segmental duplications and tandem repeat arrays. To enable high-quality resequencing of personal genomes, it is crucial to support end-to-end genome variant discovery using repeat-aware read mapping methods. In this study, we highlight the fact that existing long read mappers often yield incorrect alignments and variant calls within long, near-identical repeats, as they remain vulnerable to allelic bias. In the presence of a non-reference allele within a repeat, a read sampled from that region could be mapped to an incorrect repeat copy because the standard pairwise sequence alignment scoring system penalizes true variants.To address the above problem, we propose a novel, long read mapping method that addresses allelic bias by making use of minimal confidently alignable substrings (MCASs). MCASs are formulated as minimal length substrings of a read that have unique alignments to a reference locus with sufficient mapping confidence (i.e., a mapping quality score above a user-specified threshold). This approach treats each read mapping as a collection of confident sub-alignments, which is more tolerant of structural variation and more sensitive to paralog-specific variants (PSVs) within repeats. We mathematically define MCASs and discuss an exact algorithm as well as a practical heuristic to compute them. The proposed method, referred to as Winnowmap2, is evaluated using simulated as well as real long read benchmarks using the recently completed gapless assemblies of human chromosomes X and 8 as a reference. We show that Winnowmap2 successfully addresses the issue of allelic bias, enabling more accurate downstream variant calls in repetitive sequences. As an example, using simulated PacBio HiFi reads and structural variants in chromosome 8, Winnowmap2 alignments achieved the lowest false-negative and false-positive rates (1.89\%, 1.89\%) for calling structural variants within near-identical repeats compared to minimap2 (39.62\%, 5.88\%) and NGMLR (56.60\%, 36.11\%) respectively.Winnowmap2 code is accessible at https://github.com/marbl/WinnowmapCompeting Interest StatementThe authors have declared no competing interest.}, + URL = {https://www.biorxiv.org/content/early/2020/11/02/2020.11.01.363887}, + eprint = {https://www.biorxiv.org/content/early/2020/11/02/2020.11.01.363887.full.pdf}, + journal = {bioRxiv} +} + +@article{Li:2020aa, + Author = {Li, Heng and others}, + Journal = {Genome Biol}, + Pages = {265}, + Title = {The design and construction of reference pangenome graphs with minigraph}, + Volume = {21}, + Year = {2020}} + +@article{Ren:2021aa, + Author = {Ren, Jingwen and Chaisson, Mark J P}, + Journal = {PLoS Comput Biol}, + Pages = {e1009078}, + Title = {lra: A long read aligner for sequences and contigs}, + Volume = {17}, + Year = {2021}} + +@inproceedings{DBLP:conf/wabi/AbouelhodaO03, + Author = {Mohamed Ibrahim Abouelhoda and Enno Ohlebusch}, + Booktitle = {Algorithms in Bioinformatics, Third International Workshop, {WABI} 2003, Budapest, Hungary, September 15-20, 2003, Proceedings}, + Crossref = {DBLP:conf/wabi/2003}, + Pages = {1--16}, + Title = {A Local Chaining Algorithm and Its Applications in Comparative Genomics}, + Year = {2003}} + +@article{Ono:2021aa, + Author = {Ono, Yukiteru and others}, + Journal = {Bioinformatics}, + Pages = {589-595}, + Title = {{PBSIM2}: a simulator for long-read sequencers with a novel generative model of quality scores}, + Volume = {37}, + Year = {2021}} + +@article{Sedlazeck:2018ab, + Author = {Sedlazeck, Fritz J and others}, + Journal = {Nat Methods}, + Pages = {461-468}, + Title = {Accurate detection of complex structural variations using single-molecule sequencing}, + Volume = {15}, + Year = {2018}} + +@article{Jeffares:2017aa, + Author = {Jeffares, Daniel C and others}, + Journal = {Nat Commun}, + Pages = {14061}, + Title = {Transient structural variations have strong effects on quantitative traits and reproductive isolation in fission yeast}, + Volume = {8}, + Year = {2017}} + +@article{Zook:2020aa, + Author = {Zook, Justin M and others}, + Journal = {Nat Biotechnol}, + Pages = {1347-1355}, + Title = {A robust benchmark for detection of germline large deletions and insertions}, + Volume = {38}, + Year = {2020}} diff --git a/tex/mm2-update.tex b/tex/mm2-update.tex new file mode 100644 index 0000000..de999ac --- /dev/null +++ b/tex/mm2-update.tex @@ -0,0 +1,179 @@ +\documentclass{bioinfo} +\copyrightyear{2021} +\pubyear{2021} + +\usepackage{graphicx} +\usepackage{hyperref} +\usepackage{url} +\usepackage{amsmath} +\usepackage[ruled,vlined]{algorithm2e} +\newcommand\mycommfont[1]{\footnotesize\rmfamily{\it #1}} +\SetCommentSty{mycommfont} +\SetKwComment{Comment}{$\triangleright$\ }{} + +\usepackage{natbib} +\bibliographystyle{apalike} + +\DeclareMathOperator*{\argmax}{argmax} + +\begin{document} +\firstpage{1} + +\title[Improvements to minimap2]{New strategies to improve minimap2 alignment accuracy} +\author[Li]{Heng Li$^{1,2}$} +\address{$^1$Dana-Farber Cancer Institute, 450 Brookline Ave, Boston, MA 02215, USA, +$^2$Harvard Medical School, 10 Shattuck St, Boston, MA 02215, USA} + +\maketitle + +\begin{abstract} + +\section{Summary:} We present several recent improvements to minimap2, a +versatile pairwise aligner for nucleotide sequences. Now minimap2 v2.22 can +more accurately map long reads to highly repetitive regions and align through +insertions or deletions up to 100kb by default, addressing major weakness in +minimap2 v2.18 or earlier. + +\section{Availability and implementation:} +\href{https://github.com/lh3/minimap2}{https://github.com/lh3/minimap2} + +\section{Contact:} hli@ds.dfci.harvard.edu +\end{abstract} + +\section{Introduction} +Minimap2~\citep{Li:2018ab} is a widely used aligner for maping long sequence +reads and assembly contigs. \citet{Jain:2020aa} found minimap2 occasionally +misaligned reads from highly repetitive regions as minimap2 ignored seeds of +high occurrence. They also noticed minimap2 may misplace reads with structural +variations (SVs) in such regions~\citep{Jain2020.11.01.363887}. These +misalignments have become a pressing issue in the advent of +temolere-to-telomore human assembly~\citep{Miga:2020aa}. Meanwhile, minimap2 +was unable to efficiently align long insertions/deletions (INDELs) and often +breaks an alignment around variable-number tandem repeats (VNTRs). This has +inspired new chaining algorithms~\citep{Li:2020aa,Ren:2021aa} which are not +integrated into minimap2. Here we will describe recent improvements implemented +in v2.19 through v2.22. + +\begin{methods} +\section{Methods} + +\subsection{Rescuing high-occurrence $k$-mers} +Minimap2 keeps all $k$-mer minimizers during indexing. Its original +implementation only selected low-occurrence minimizers during mapping. The +cutoff is a few hundred for mapping long reads against a human genome. If a +read habors only a few or even no low-occurrence minimizers, it will fail +chaining due to insufficient anchors. + +To resolve this issue, we implemented a new heuristic to add additional +minimizers. Suppose we are looking at two adjacent low-occurence $k$-mers +located at position $x_1$ and $x_2$, respectively. If $|x_1-x_2|\ge500$, the +new minimap2 adds $\lfloor|x_1-x_2|/500\rfloor$ minimizers among +high-occurrence minimizers between $x_1$ and $x_2$. We use a binary heap data +structure to select minimizers of the lowest occurrence in this interval. +This strategy adds necessary anchors at the cost of increasing total alignment +time by a few percent on real data. + +\subsection{Aligning through longer INDELs} +The original minimap2 may fail to align long INDELs due to its chaining +heuristics. Briefly, minimap2 applies dynamic programming (DP) to chain +minimizer anchors. It is a quadratic algorithm, which is slow for chaining +contigs. For acceptable performance, the original minimap2 uses a 500bp band by +default. If there is an INDEL longer than 500bp and the two chains around it +have no overlaps on either the query or the reference sequence, minimap2 may +join the two short chains later as a postprocessing step. We call it the +long-join heuristic. This heuristic may fail around VNTRs because short chains +often have overlaps in VNTRs. More subtly, minimap2 may escape the inner DP +loop early, again for performance, if the chaining result is not improved for +50 iterations. When there is a copy number change in a long segmental +duplication, the early escape may break around the event even if users +specify a large band. + +In minigraph~\citep{Li:2020aa}, we developed a new chaining algorithm that +finds short INDELs with DP-based chaining and goes through long INDELs with a +subquadratic algorithm~\citep{DBLP:conf/wabi/AbouelhodaO03}. We ported the same +algorithm to minimap2 for contig mapping. For read mapping, the minigraph +algorithm is slower. The updated minimap2 still uses the DP-based algorithm to +find short chains and then uses the minigraph algorithm to rechain anchors in +these short chains. The rechaining steps achieves the same goal as long-join +but is more reliable as it can resolve overlaps between short chains. The old +long-join heuristic has since been removed. + +\subsection{Properly mapping long reads with SVs} +The original minimap2 ranks an alignment by its Smith-Waterman score and +outputs the best scoring alignment. However, when there are SVs on the read, +the best scoring alignment is sometimes not the correct alignment. +\citet{Jain2020.11.01.363887} resolved this dilemma by altering the mapping +algorithm. + +In our view, this problem is rooted in impropriate scoring: affine-gap penalty +over-penalizes a long INDEL that was often evolutionarily created in one event. +We should not penalize a SV linearly in its length. The new minimap2 rescores +an alignment with the following scoring function. Suppose an alignment consists +of $M$ matching bases, $N$ substitutions and $G$ gap opens, we empirically +score the alignment with +$$ +M-\frac{N+G}{2d}-\sum_{i=1}^G\log_2(1+g_i) +$$ +where $g_i\ge1$ is the length of the $i$-th gap and +$$ +d=\max\left\{\frac{N+G}{M+N+G},0.02\right\} +$$ +Here $d$ approximates per-base sequence divergence with the smallest value set +to 2\%. As an analogy to affine-gap scoring, the matching score in our scheme +is 1, the mismatch and gap open penalties are both $1/2d$ and the gap extension +penalty is a logarithm function of the gap length. Our scoring gives a long SV +a much milder penalty. In terms of time complexity, scoring an alignment is +linear in the length of the alignment. Time spent on rescoring is negligible in +practice. + +%If we assume sequences evolve under a duplication-mutation model, we may have a +%better way to choose the best alignment. If a long read can be mapped to $n$ +%loci, we can take the read as the template and build a +%pseudo-multi-sequence-alignment (pMSA) of $n+1$ sequences. In this pMSA, we say +%a site on the read is informative if the $n$ reference subsequences differ at +%the position. + +\end{methods} + +\section{Results} + +\begin{table} +\processtable{Evaluation of minimap2 v2.22} +{\footnotesize\begin{tabular}{p{4.2cm}rrrr} +\toprule +$[$Benchmark$]$ Metric & v2.22 & v2.18 & Winno & lra \\ +\midrule +$[$sim-map$]$ \% mapped reads at Q10 & 97.9 & 97.6 & {\bf 99.0} & 97.3 \\ +$[$sim-map$]$ err. rate at Q10 (phredQ) & {\bf 52} & {\bf 52} & 38 & 24 \\ +$[$winno-cmp$]$ rate of diff. (phredQ) & {\bf 41} & 37 & N/A & 18 \\ +$[$sim-sv$]$ \% false negative rate & {\bf 0.5} & 2.0 & {\bf 0.5} & 1.4 \\ +$[$sim-sv$]$ \% false discovery rate & {\bf 0.0} & 0.1 & {\bf 0.0} & 0.1 \\ +$[$real-sv-1k$]$ \% false negative rate & {\bf 7.3} & 20.0 & 13.0 & N/A \\ +$[$real-sv-1k$]$ \% false discovery rate & 2.7 & {\bf 2.4} & 2.7 & N/A \\ +\botrule +\end{tabular}} +{In $[$sim-map$]$, 152,713 reads were simulated from the CHM13 telomere-to-telomere assembly v1.1 +(AC: GCA\_009914755.3) with pbsim2~\citep{Ono:2021aa}: ``pbsim2 -{}-hmm\_model R94.model -{}-length-min +5000 -{}-length-mean 20000 -{}-accuracy-mean 0.95''. Alignments of mapping quality +10 or higher were evaluated by ``paftools.js mapeval''. The mapping error rate +is measured in the phred scale: if the error rate is $e$, $-10\log_{10}e$ is +reported in the table. In $[$winno-cmp$]$, 1.39 million CHM13 HiFi reads from +SRR11292121 were mapped against CHM13. 99.3\% of them were mapped by Winnowmap2 +at mapping quality 10 or higher and were taken as ground truth to evaluate +minimap2 and lra with ``paftools.js pafcmp''. $[$sim-sv$]$ simulated 1,000 +50bp to 1000bp INDELs from chr8 in CHM13 using SURVIVOR~\citep{Jeffares:2017aa} and simulated Nanopore +reads at 30 folds with the same pbsim2 command line. SVs were called with +``sniffles -q 10''~\citep{Sedlazeck:2018ab} and compared to the simulated truth with ``SURVIVOR eval +call.vcf truth.bed 50''. In $[$real-sv-1k$]$, small and long variants were +called by dipcall-0.3 for HG002 assemblies (AC: GCA\_018852605.1 and +GCA\_018852615.1) and compared to the GIAB truth~\citep{Zook:2020aa} using ``truvari -r 2000 -s +1000 -S 400 -{}-multimatch -{}-passonly'' which sets the minimum INDEL size to 1kb in evaluation. } +\end{table} + +\section*{Acknowledgements} + +\paragraph{Funding\textcolon} NHGRI R01HG010040 + +\bibliography{minimap2} + +\end{document}