diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
new file mode 100644
index 0000000..628acea
--- /dev/null
+++ b/.github/workflows/ci.yaml
@@ -0,0 +1,21 @@
+name: CI
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+
+jobs:
+ build:
+ runs-on: ubuntu-latest
+ strategy:
+ matrix:
+ compiler: [gcc, clang]
+
+ steps:
+ - name: Checkout minimap2
+ uses: actions/checkout@v2
+
+ - name: Compile with ${{ matrix.compiler }}
+ run: make CC=${{ matrix.compiler }}
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..a80f848
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "lib/simde"]
+ path = lib/simde
+ url = https://github.com/nemequ/simde.git
diff --git a/.travis.yml b/.travis.yml
index f0764f1..b43ca9c 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -19,6 +19,6 @@ matrix:
before_install: pip install cython
script: python setup.py build_ext
- language: python
- python: "3.6"
+ python: "3.9"
before_install: pip install cython
script: python setup.py build_ext
diff --git a/MANIFEST.in b/MANIFEST.in
index 599b046..3df1d19 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -4,7 +4,6 @@ include ksw2_dispatch.c
include main.c
include README.md
include sse2neon/emmintrin.h
-include python/mappy.c
include python/cmappy.h
include python/cmappy.pxd
include python/mappy.pyx
diff --git a/Makefile b/Makefile
index 18622f5..4118616 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,9 @@
CFLAGS= -g -Wall -O2 -Wc++-compat #-Wextra
CPPFLAGS= -DHAVE_KALLOC
INCLUDES=
-OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o chain.o align.o hit.o map.o format.o pe.o esterr.o splitidx.o ksw2_ll_sse.o
+OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o \
+ lchain.o align.o hit.o seed.o map.o format.o pe.o esterr.o splitidx.o \
+ ksw2_ll_sse.o
PROG= minimap2
PROG_EXTRA= sdust minimap2-lite
LIBS= -lm -lz -lpthread
@@ -103,26 +105,28 @@ depend:
# DO NOT DELETE
-align.o: minimap.h mmpriv.h bseq.h ksw2.h kalloc.h
+align.o: minimap.h mmpriv.h bseq.h kseq.h ksw2.h kalloc.h
bseq.o: bseq.h kvec.h kalloc.h kseq.h
-chain.o: minimap.h mmpriv.h bseq.h kalloc.h
-esterr.o: mmpriv.h minimap.h bseq.h
+esterr.o: mmpriv.h minimap.h bseq.h kseq.h
example.o: minimap.h kseq.h
-format.o: kalloc.h mmpriv.h minimap.h bseq.h
-hit.o: mmpriv.h minimap.h bseq.h kalloc.h khash.h
-index.o: kthread.h bseq.h minimap.h mmpriv.h kvec.h kalloc.h khash.h
+format.o: kalloc.h mmpriv.h minimap.h bseq.h kseq.h
+hit.o: mmpriv.h minimap.h bseq.h kseq.h kalloc.h khash.h
+index.o: kthread.h bseq.h minimap.h mmpriv.h kseq.h kvec.h kalloc.h khash.h
+index.o: ksort.h
kalloc.o: kalloc.h
ksw2_extd2_sse.o: ksw2.h kalloc.h
ksw2_exts2_sse.o: ksw2.h kalloc.h
ksw2_extz2_sse.o: ksw2.h kalloc.h
ksw2_ll_sse.o: ksw2.h kalloc.h
kthread.o: kthread.h
-main.o: bseq.h minimap.h mmpriv.h ketopt.h
-map.o: kthread.h kvec.h kalloc.h sdust.h mmpriv.h minimap.h bseq.h khash.h
-map.o: ksort.h
-misc.o: mmpriv.h minimap.h bseq.h ksort.h
-options.o: mmpriv.h minimap.h bseq.h
-pe.o: mmpriv.h minimap.h bseq.h kvec.h kalloc.h ksort.h
-sdust.o: kalloc.h kdq.h kvec.h ketopt.h sdust.h
-sketch.o: kvec.h kalloc.h mmpriv.h minimap.h bseq.h
-splitidx.o: mmpriv.h minimap.h bseq.h
+lchain.o: mmpriv.h minimap.h bseq.h kseq.h kalloc.h krmq.h
+main.o: bseq.h minimap.h mmpriv.h kseq.h ketopt.h
+map.o: kthread.h kvec.h kalloc.h sdust.h mmpriv.h minimap.h bseq.h kseq.h
+map.o: khash.h ksort.h
+misc.o: mmpriv.h minimap.h bseq.h kseq.h ksort.h
+options.o: mmpriv.h minimap.h bseq.h kseq.h
+pe.o: mmpriv.h minimap.h bseq.h kseq.h kvec.h kalloc.h ksort.h
+sdust.o: kalloc.h kdq.h kvec.h sdust.h
+seed.o: mmpriv.h minimap.h bseq.h kseq.h kalloc.h ksort.h
+sketch.o: kvec.h kalloc.h mmpriv.h minimap.h bseq.h kseq.h
+splitidx.o: mmpriv.h minimap.h bseq.h kseq.h
diff --git a/Makefile.simde b/Makefile.simde
new file mode 100644
index 0000000..381a668
--- /dev/null
+++ b/Makefile.simde
@@ -0,0 +1,97 @@
+CFLAGS= -g -Wall -O2 -Wc++-compat #-Wextra
+CPPFLAGS= -DHAVE_KALLOC -DUSE_SIMDE -DSIMDE_ENABLE_NATIVE_ALIASES
+INCLUDES= -Ilib/simde
+OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o chain.o align.o hit.o map.o format.o pe.o esterr.o splitidx.o \
+ ksw2_extz2_simde.o ksw2_extd2_simde.o ksw2_exts2_simde.o ksw2_ll_simde.o
+PROG= minimap2
+PROG_EXTRA= sdust minimap2-lite
+LIBS= -lm -lz -lpthread
+
+
+ifneq ($(arm_neon),) # if arm_neon is defined
+ifeq ($(aarch64),) #if aarch64 is not defined
+ CFLAGS+=-D_FILE_OFFSET_BITS=64 -mfpu=neon -fsigned-char
+else #if aarch64 is defined
+ CFLAGS+=-D_FILE_OFFSET_BITS=64 -fsigned-char
+endif
+endif
+
+ifneq ($(asan),)
+ CFLAGS+=-fsanitize=address
+ LIBS+=-fsanitize=address
+endif
+
+ifneq ($(tsan),)
+ CFLAGS+=-fsanitize=thread
+ LIBS+=-fsanitize=thread
+endif
+
+.PHONY:all extra clean depend
+.SUFFIXES:.c .o
+
+.c.o:
+ $(CC) -c $(CFLAGS) $(CPPFLAGS) $(INCLUDES) $< -o $@
+
+all:$(PROG)
+
+extra:all $(PROG_EXTRA)
+
+minimap2:main.o libminimap2.a
+ $(CC) $(CFLAGS) main.o -o $@ -L. -lminimap2 $(LIBS)
+
+minimap2-lite:example.o libminimap2.a
+ $(CC) $(CFLAGS) $< -o $@ -L. -lminimap2 $(LIBS)
+
+libminimap2.a:$(OBJS)
+ $(AR) -csru $@ $(OBJS)
+
+sdust:sdust.c kalloc.o kalloc.h kdq.h kvec.h kseq.h ketopt.h sdust.h
+ $(CC) -D_SDUST_MAIN $(CFLAGS) $< kalloc.o -o $@ -lz
+
+ksw2_ll_simde.o:ksw2_ll_sse.c ksw2.h kalloc.h
+ $(CC) -c $(CFLAGS) -msse2 $(CPPFLAGS) $(INCLUDES) $< -o $@
+
+ksw2_extz2_simde.o:ksw2_extz2_sse.c ksw2.h kalloc.h
+ $(CC) -c $(CFLAGS) -msse4.1 $(CPPFLAGS) $(INCLUDES) $< -o $@
+
+ksw2_extd2_simde.o:ksw2_extd2_sse.c ksw2.h kalloc.h
+ $(CC) -c $(CFLAGS) -msse4.1 $(CPPFLAGS) $(INCLUDES) $< -o $@
+
+ksw2_exts2_simde.o:ksw2_exts2_sse.c ksw2.h kalloc.h
+ $(CC) -c $(CFLAGS) -msse4.1 $(CPPFLAGS) $(INCLUDES) $< -o $@
+
+# other non-file targets
+
+clean:
+ rm -fr gmon.out *.o a.out $(PROG) $(PROG_EXTRA) *~ *.a *.dSYM build dist mappy*.so mappy.c python/mappy.c mappy.egg*
+
+depend:
+ (LC_ALL=C; export LC_ALL; makedepend -Y -- $(CFLAGS) $(CPPFLAGS) -- *.c)
+
+# DO NOT DELETE
+
+align.o: minimap.h mmpriv.h bseq.h kseq.h ksw2.h kalloc.h
+bseq.o: bseq.h kvec.h kalloc.h kseq.h
+chain.o: minimap.h mmpriv.h bseq.h kseq.h kalloc.h
+esterr.o: mmpriv.h minimap.h bseq.h kseq.h
+example.o: minimap.h kseq.h
+format.o: kalloc.h mmpriv.h minimap.h bseq.h kseq.h
+hit.o: mmpriv.h minimap.h bseq.h kseq.h kalloc.h khash.h
+index.o: kthread.h bseq.h minimap.h mmpriv.h kseq.h kvec.h kalloc.h khash.h
+index.o: ksort.h
+kalloc.o: kalloc.h
+ksw2_extd2_sse.o: ksw2.h kalloc.h
+ksw2_exts2_sse.o: ksw2.h kalloc.h
+ksw2_extz2_sse.o: ksw2.h kalloc.h
+ksw2_ll_sse.o: ksw2.h kalloc.h
+kthread.o: kthread.h
+main.o: bseq.h minimap.h mmpriv.h kseq.h ketopt.h
+map.o: kthread.h kvec.h kalloc.h sdust.h mmpriv.h minimap.h bseq.h kseq.h
+map.o: khash.h ksort.h
+misc.o: mmpriv.h minimap.h bseq.h kseq.h ksort.h
+options.o: mmpriv.h minimap.h bseq.h kseq.h
+pe.o: mmpriv.h minimap.h bseq.h kseq.h kvec.h kalloc.h ksort.h
+sdust.o: kalloc.h kdq.h kvec.h sdust.h
+self-chain.o: minimap.h kseq.h
+sketch.o: kvec.h kalloc.h mmpriv.h minimap.h bseq.h kseq.h
+splitidx.o: mmpriv.h minimap.h bseq.h kseq.h
diff --git a/NEWS.md b/NEWS.md
index 5c0ab9c..425545a 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,151 @@
+Release 2.21-r1071 (6 July 2021)
+--------------------------------
+
+This release fixed a regression in short-read mapping introduced in v2.19
+(#776). It also fixed invalid comparisons of uninitialized variables, though
+these are harmless (#752). Long-read alignment should be identical to v2.20.
+
+(2.21: 6 July 2021)
+
+
+
+Release 2.20-r1061 (27 May 2021)
+--------------------------------
+
+This release fixed a bug in the Python module and improves the command-line
+compatibiliity with v2.18. In v2.19, if `-r` is specified with an `asm*` preset,
+users would get alignments more fragmented than v2.18. This could be an issue
+for existing pipelines specifying `-r`. This release resolves this issue.
+
+(2.20: 27 May 2021, r1061)
+
+
+
+Release 2.19-r1057 (26 May 2021)
+--------------------------------
+
+This release includes a few important improvements backported from unimap:
+
+ * Improvement: more contiguous alignment through long INDELs. This is enabled
+ by the minigraph chaining algorithm. All `asm*` presets now use the new
+ algorithm. They can find INDELs up to 100kb and may be faster for
+ chromosome-long contigs. The default mode and `map*` presets use this
+ algorithm to replace the long-join heuristic.
+
+ * Improvement: better alignment in highly repetitive regions by rescuing
+ high-occurrence seeds. If the distance between two adjacent seeds is too
+ large, attempt to choose a fraction of high-occurrence seeds in-between.
+ Minimap2 now produces fewer clippings and alignment break points in long
+ satellite regions.
+
+ * Improvement: allow to specify an interval of k-mer occurrences with `-U`.
+ For repeat-rich genomes, the automatic k-mer occurrence threshold determined
+ by `-f` may be too large and makes alignment impractically slow. The new
+ option protects against such cases. Enabled for `asm*` and `map-hifi`.
+
+ * New feature: added the `map-hifi` preset for maping PacBio High-Fidelity
+ (HiFi) reads.
+
+ * Change to the default: apply `--cap-sw-mem=100m` for genomic alignment.
+
+ * Bugfix: minimap2 could not generate an index file with `-xsr` (#734).
+
+This release represents the most signficant algorithmic change since v2.1 in
+2017. With features backported from unimap, minimap2 now has similar power to
+unimap for contig alignment. Unimap will remain an experimental project and is
+no longer recommended over minimap2. Sorry for reverting the recommendation in
+short time.
+
+(2.19: 26 May 2021, r1057)
+
+
+
+Release 2.18-r1015 (9 April 2021)
+---------------------------------
+
+This release fixes multiple rare bugs in minimap2 and adds additional
+functionality to paftools.js.
+
+Changes to minimap2:
+
+ * Bugfix: a rare segfault caused by an off-by-one error (#489)
+
+ * Bugfix: minimap2 segfaulted due to an uninitilized variable (#622 and #625).
+
+ * Bugfix: minimap2 parsed spaces as field separators in BED (#721). This led
+ to issues when the BED name column contains spaces.
+
+ * Bugfix: minimap2 `--split-prefix` did not work with long reference names
+ (#394).
+
+ * Bugfix: option `--junc-bonus` didn't work (#513)
+
+ * Bugfix: minimap2 didn't return 1 on I/O errors (#532)
+
+ * Bugfix: the `de:f` tag (sequence divergence) could be negative if there were
+ ambiguous bases
+
+ * Bugfix: fixed two undefined behaviors caused by calling memcpy() on
+ zero-length blocks (#443)
+
+ * Bugfix: there were duplicated SAM @SQ lines if option `--split-prefix` is in
+ use (#400 and #527)
+
+ * Bugfix: option -K had to be smaller than 2 billion (#491). This was caused
+ by a 32-bit integer overflow.
+
+ * Improvement: optionally compile against SIMDe (#597). Minimap2 should work
+ with IBM POWER CPUs, though this has not been tested. To compile with SIMDe,
+ please use `make -f Makefile.simde`.
+
+ * Improvement: more informative error message for I/O errors (#454) and for
+ FASTQ parsing errors (#510)
+
+ * Improvement: abort given malformatted RG line (#541)
+
+ * Improvement: better formula to estimate the `dv:f` tag (approximate sequence
+ divergence). See DOI:10.1101/2021.01.15.426881.
+
+ * New feature: added the `--mask-len` option to fine control the removal of
+ redundant hits (#659). The default behavior is unchanged.
+
+Changes to mappy:
+
+ * Bugfix: mappy caused segmentation fault if the reference index is not
+ present (#413).
+
+ * Bugfix: fixed a memory leak via 238b6bb3
+
+ * Change: always require Cython to compile the mappy module (#723). Older
+ mappy packages at PyPI bundled the C source code generated by Cython such
+ that end users did not need to install Cython to compile mappy. However, as
+ Python 3.9 is breaking backward compatibility, older mappy does not work
+ with Python 3.9 anymore. We have to add this Cython dependency as a
+ workaround.
+
+Changes to paftools.js:
+
+ * Bugfix: the "part10-" line from asmgene was wrong (#581)
+
+ * Improvement: compatibility with GTF files from GenBank (#422)
+
+ * New feature: asmgene also checks missing multi-copy genes
+
+ * New feature: added the misjoin command to evaluate large-scale misjoins and
+ megabase-long inversions.
+
+Although given the many bug fixes and minor improvements, the core algorithm
+stays the same. This version of minimap2 produces nearly identical alignments
+to v2.17 except very rare corner cases.
+
+Now unimap is recommended over minimap2 for aligning long contigs against a
+reference genome. It often takes less wall-clock time and is much more
+sensitive to long insertions and deletions.
+
+(2.18: 9 April 2021, r1015)
+
+
+
Release 2.17-r941 (4 May 2019)
------------------------------
diff --git a/README.md b/README.md
index addeb2c..0bff2e4 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
[](https://github.com/lh3/minimap2/releases)
[](https://anaconda.org/bioconda/minimap2)
[](https://pypi.python.org/pypi/mappy)
-[](https://travis-ci.org/lh3/minimap2)
+[](https://github.com/lh3/minimap2/actions)
## Getting Started
```sh
git clone https://github.com/lh3/minimap2
@@ -12,19 +12,22 @@ cd minimap2 && make
./minimap2 -x map-ont -d MT-human-ont.mmi test/MT-human.fa
./minimap2 -a MT-human-ont.mmi test/MT-orang.fa > test.sam
# use presets (no test data)
-./minimap2 -ax map-pb ref.fa pacbio.fq.gz > aln.sam # PacBio genomic reads
+./minimap2 -ax map-pb ref.fa pacbio.fq.gz > aln.sam # PacBio CLR genomic reads
./minimap2 -ax map-ont ref.fa ont.fq.gz > aln.sam # Oxford Nanopore genomic reads
-./minimap2 -ax asm20 ref.fa pacbio-ccs.fq.gz > aln.sam # PacBio CCS genomic reads
+./minimap2 -ax map-hifi ref.fa pacbio-ccs.fq.gz > aln.sam # PacBio HiFi/CCS genomic reads (v2.19 or later)
+./minimap2 -ax asm20 ref.fa pacbio-ccs.fq.gz > aln.sam # PacBio HiFi/CCS genomic reads (v2.18 or earlier)
./minimap2 -ax sr ref.fa read1.fa read2.fa > aln.sam # short genomic paired-end reads
./minimap2 -ax splice ref.fa rna-reads.fa > aln.sam # spliced long reads (strand unknown)
./minimap2 -ax splice -uf -k14 ref.fa reads.fa > aln.sam # noisy Nanopore Direct RNA-seq
./minimap2 -ax splice:hq -uf ref.fa query.fa > aln.sam # Final PacBio Iso-seq or traditional cDNA
+./minimap2 -ax splice --junc-bed anno.bed12 ref.fa query.fa > aln.sam # prioritize on annotated junctions
./minimap2 -cx asm5 asm1.fa asm2.fa > aln.paf # intra-species asm-to-asm alignment
./minimap2 -x ava-pb reads.fa reads.fa > overlaps.paf # PacBio read overlap
./minimap2 -x ava-ont reads.fa reads.fa > overlaps.paf # Nanopore read overlap
# man page for detailed command line options
man ./minimap2.1
```
+
## Table of Contents
- [Getting Started](#started)
@@ -71,8 +74,8 @@ Detailed evaluations are available from the [minimap2 paper][doi] or the
Minimap2 is optimized for x86-64 CPUs. You can acquire precompiled binaries from
the [release page][release] with:
```sh
-curl -L https://github.com/lh3/minimap2/releases/download/v2.17/minimap2-2.17_x64-linux.tar.bz2 | tar -jxvf -
-./minimap2-2.17_x64-linux/minimap2
+curl -L https://github.com/lh3/minimap2/releases/download/v2.21/minimap2-2.21_x64-linux.tar.bz2 | tar -jxvf -
+./minimap2-2.21_x64-linux/minimap2
```
If you want to compile from the source, you need to have a C compiler, GNU make
and zlib development files installed. Then type `make` in the source code
@@ -80,13 +83,20 @@ directory to compile. If you see compilation errors, try `make sse2only=1`
to disable SSE4 code, which will make minimap2 slightly slower.
Minimap2 also works with ARM CPUs supporting the NEON instruction sets. To
-compile for 32 bit ARM architectures (such as ARMv7), use `make arm_neon=1`. To compile for for 64 bit ARM architectures (such as ARMv8), use `make arm_neon=1 aarch64=1`.
+compile for 32 bit ARM architectures (such as ARMv7), use `make arm_neon=1`. To
+compile for for 64 bit ARM architectures (such as ARMv8), use `make arm_neon=1
+aarch64=1`.
+
+Minimap2 can use [SIMD Everywhere (SIMDe)][simde] library for porting
+implementation to the different SIMD instruction sets. To compile using SIMDe,
+use `make -f Makefile.simde`. To compile for ARM CPUs, use `Makefile.simde`
+with the ARM related command lines given above.
### General usage
Without any options, minimap2 takes a reference database and a query sequence
file as input and produce approximate mapping, without base-level alignment
-(i.e. no CIGAR), in the [PAF format][paf]:
+(i.e. coordinates are only approximate and no CIGAR in output), in the [PAF format][paf]:
```sh
minimap2 ref.fa query.fq > approx-mapping.paf
```
@@ -127,13 +137,13 @@ parameters at the same time. The default setting is the same as `map-ont`.
#### Map long noisy genomic reads
```sh
-minimap2 -ax map-pb ref.fa pacbio-reads.fq > aln.sam # for PacBio subreads
+minimap2 -ax map-pb ref.fa pacbio-reads.fq > aln.sam # for PacBio CLR reads
minimap2 -ax map-ont ref.fa ont-reads.fq > aln.sam # for Oxford Nanopore reads
```
The difference between `map-pb` and `map-ont` is that `map-pb` uses
homopolymer-compressed (HPC) minimizers as seeds, while `map-ont` uses ordinary
minimizers as seeds. Emperical evaluation suggests HPC minimizers improve
-performance and sensitivity when aligning PacBio reads, but hurt when aligning
+performance and sensitivity when aligning PacBio CLR reads, but hurt when aligning
Nanopore reads.
#### Map long mRNA/cDNA reads
@@ -178,10 +188,23 @@ This is because SIRV does not honor the evolutionarily conservative splicing
signal. If you are studying SIRV, you may apply `--splice-flank=no` to let
minimap2 only model GT..AG, ignoring the additional base.
+Since v2.17, minimap2 can optionally take annotated genes as input and
+prioritize on annotated splice junctions. To use this feature, you can
+```sh
+paftools.js gff2bed anno.gff > anno.bed
+minimap2 -ax splice --junc-bed anno.bed ref.fa query.fa > aln.sam
+```
+Here, `anno.gff` is the gene annotation in the GTF or GFF3 format (`gff2bed`
+automatically tests the format). The output of `gff2bed` is in the 12-column
+BED format, or the BED12 format. With the `--junc-bed` option, minimap2 adds a
+bonus score (tuned by `--junc-bonus`) if an aligned junction matches a junction
+in the annotation. Option `--junc-bed` also takes 5-column BED, including the
+strand field. In this case, each line indicates an oriented junction.
+
#### Find overlaps between long reads
```sh
-minimap2 -x ava-pb reads.fq reads.fq > ovlp.paf # PacBio read overlap
+minimap2 -x ava-pb reads.fq reads.fq > ovlp.paf # PacBio CLR read overlap
minimap2 -x ava-ont reads.fq reads.fq > ovlp.paf # Oxford Nanopore read overlap
```
Similarly, `ava-pb` uses HPC minimizers while `ava-ont` uses ordinary
@@ -228,7 +251,7 @@ To avoid this issue, you can add option `-L` at the minimap2 command line.
This option moves a long CIGAR to the `CG` tag and leaves a fully clipped CIGAR
at the SAM CIGAR column. Current tools that don't read CIGAR (e.g. merging and
sorting) still work with such BAM records; tools that read CIGAR will
-effectively ignore these records. It has been decided that future tools will
+effectively ignore these records. It has been decided that future tools
will seamlessly recognize long-cigar records generated by option `-L`.
**TL;DR**: if you work with ultra-long reads and use tools that only process
@@ -249,7 +272,7 @@ CGATCGATAAATAGAGTAG---GAATAGCA
CGATCG---AATAGAGTAGGTCGAATtGCA
```
is represented as `:6-ata:10+gtc:4*at:3`, where `:[0-9]+` represents an
-identical block, `-ata` represents a deltion, `+gtc` an insertion and `*at`
+identical block, `-ata` represents a deletion, `+gtc` an insertion and `*at`
indicates reference base `a` is substituted with a query base `t`. It is
similar to the `MD` SAM tag but is standalone and easier to parse.
@@ -376,3 +399,5 @@ mappy` or [from BioConda][mappyconda] via `conda install -c bioconda mappy`.
[manpage]: https://lh3.github.io/minimap2/minimap2.html
[manpage-cs]: https://lh3.github.io/minimap2/minimap2.html#10
[doi]: https://doi.org/10.1093/bioinformatics/bty191
+[smide]: https://github.com/nemequ/simde
+[unimap]: https://github.com/lh3/unimap
diff --git a/align.c b/align.c
index 3d67eeb..1ba2ba3 100644
--- a/align.c
+++ b/align.c
@@ -38,8 +38,8 @@ static inline void update_max_zdrop(int32_t score, int i, int j, int32_t *max, i
int z = *max - score - diff * e;
if (z > *max_zdrop) {
*max_zdrop = z;
- pos[0][0] = *max_i, pos[0][1] = i + 1;
- pos[1][0] = *max_j, pos[1][1] = j + 1;
+ pos[0][0] = *max_i, pos[0][1] = i;
+ pos[1][0] = *max_j, pos[1][1] = j;
}
} else *max = score, *max_i = i, *max_j = j;
}
@@ -53,16 +53,16 @@ static int mm_test_zdrop(void *km, const mm_mapopt_t *opt, const uint8_t *qseq,
// find the score and the region where score drops most along diagonal
for (k = 0, score = 0; k < n_cigar; ++k) {
uint32_t l, op = cigar[k]&0xf, len = cigar[k]>>4;
- if (op == 0) {
+ if (op == MM_CIGAR_MATCH) {
for (l = 0; l < len; ++l) {
score += mat[tseq[i + l] * 5 + qseq[j + l]];
update_max_zdrop(score, i+l, j+l, &max, &max_i, &max_j, opt->e, &max_zdrop, pos);
}
i += len, j += len;
- } else if (op == 1 || op == 2 || op == 3) {
+ } else if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL || op == MM_CIGAR_N_SKIP) {
score -= opt->q + opt->e * len;
- if (op == 1) j += len; // insertion
- else i += len; // deletion
+ if (op == MM_CIGAR_INS) j += len;
+ else i += len;
update_max_zdrop(score, i, j, &max, &max_i, &max_j, opt->e, &max_zdrop, pos);
}
}
@@ -98,12 +98,12 @@ static void mm_fix_cigar(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq,
for (k = 0; k < p->n_cigar; ++k) { // indel left alignment
uint32_t op = p->cigar[k]&0xf, len = p->cigar[k]>>4;
if (len == 0) to_shrink = 1;
- if (op == 0) {
+ if (op == MM_CIGAR_MATCH) {
toff += len, qoff += len;
- } else if (op == 1 || op == 2) { // insertion or deletion
+ } else if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL) {
if (k > 0 && k < p->n_cigar - 1 && (p->cigar[k-1]&0xf) == 0 && (p->cigar[k+1]&0xf) == 0) {
int l, prev_len = p->cigar[k-1] >> 4;
- if (op == 1) {
+ if (op == MM_CIGAR_INS) {
for (l = 0; l < prev_len; ++l)
if (qseq[qoff - 1 - l] != qseq[qoff + len - 1 - l])
break;
@@ -116,9 +116,9 @@ static void mm_fix_cigar(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq,
p->cigar[k-1] -= l<<4, p->cigar[k+1] += l<<4, qoff -= l, toff -= l;
if (l == prev_len) to_shrink = 1;
}
- if (op == 1) qoff += len;
+ if (op == MM_CIGAR_INS) qoff += len;
else toff += len;
- } else if (op == 3) {
+ } else if (op == MM_CIGAR_N_SKIP) {
toff += len;
}
}
@@ -128,13 +128,13 @@ static void mm_fix_cigar(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq,
uint32_t l, s[3] = {0,0,0};
for (l = k; l < p->n_cigar; ++l) { // count number of adjacent I and D
uint32_t op = p->cigar[l]&0xf;
- if (op == 1 || op == 2 || p->cigar[l]>>4 == 0)
+ if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL || p->cigar[l]>>4 == 0)
s[op] += p->cigar[l] >> 4;
else break;
}
if (s[1] > 0 && s[2] > 0 && l - k > 2) { // turn to a single I and a single D
- p->cigar[k] = s[1]<<4|1;
- p->cigar[k+1] = s[2]<<4|2;
+ p->cigar[k] = s[1]<<4|MM_CIGAR_INS;
+ p->cigar[k+1] = s[2]<<4|MM_CIGAR_DEL;
for (k += 2; k < l; ++k)
p->cigar[k] &= 0xf;
to_shrink = 1;
@@ -154,9 +154,9 @@ static void mm_fix_cigar(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq,
else p->cigar[k+1] += p->cigar[k]>>4<<4; // add length to the next CIGAR operator
p->n_cigar = l;
}
- if ((p->cigar[0]&0xf) == 1 || (p->cigar[0]&0xf) == 2) { // get rid of leading I or D
+ if ((p->cigar[0]&0xf) == MM_CIGAR_INS || (p->cigar[0]&0xf) == MM_CIGAR_DEL) { // get rid of leading I or D
int32_t l = p->cigar[0] >> 4;
- if ((p->cigar[0]&0xf) == 1) {
+ if ((p->cigar[0]&0xf) == MM_CIGAR_INS) {
if (r->rev) r->qe -= l;
else r->qs += l;
*qshift = l;
@@ -174,7 +174,7 @@ static void mm_update_cigar_eqx(mm_reg1_t *r, const uint8_t *qseq, const uint8_t
if (r->p == 0) return;
for (k = 0; k < r->p->n_cigar; ++k) {
uint32_t op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4;
- if (op == 0) {
+ if (op == MM_CIGAR_MATCH) {
while (len > 0) {
for (l = 0; l < len && qseq[qoff + l] == tseq[toff + l]; ++l) {} // run of "="; TODO: N<=>N is converted to "="
if (l > 0) { ++n_EQX; len -= l; toff += l; qoff += l; }
@@ -183,11 +183,11 @@ static void mm_update_cigar_eqx(mm_reg1_t *r, const uint8_t *qseq, const uint8_t
if (l > 0) { ++n_EQX; len -= l; toff += l; qoff += l; }
}
++n_M;
- } else if (op == 1) { // insertion
+ } else if (op == MM_CIGAR_INS) {
qoff += len;
- } else if (op == 2) { // deletion
+ } else if (op == MM_CIGAR_DEL) {
toff += len;
- } else if (op == 3) { // intron
+ } else if (op == MM_CIGAR_N_SKIP) {
toff += len;
}
}
@@ -195,7 +195,7 @@ static void mm_update_cigar_eqx(mm_reg1_t *r, const uint8_t *qseq, const uint8_t
if (n_EQX == n_M) {
for (k = 0; k < r->p->n_cigar; ++k) {
uint32_t op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4;
- if (op == 0) r->p->cigar[k] = len << 4 | 7;
+ if (op == MM_CIGAR_MATCH) r->p->cigar[k] = len << 4 | MM_CIGAR_EQ_MATCH;
}
return;
}
@@ -209,25 +209,25 @@ static void mm_update_cigar_eqx(mm_reg1_t *r, const uint8_t *qseq, const uint8_t
toff = qoff = m = 0;
for (k = 0; k < r->p->n_cigar; ++k) {
uint32_t op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4;
- if (op == 0) { // match/mismatch
+ if (op == MM_CIGAR_MATCH) {
while (len > 0) {
// match
for (l = 0; l < len && qseq[qoff + l] == tseq[toff + l]; ++l) {}
- if (l > 0) p->cigar[m++] = l << 4 | 7;
+ if (l > 0) p->cigar[m++] = l << 4 | MM_CIGAR_EQ_MATCH;
len -= l;
toff += l, qoff += l;
// mismatch
for (l = 0; l < len && qseq[qoff + l] != tseq[toff + l]; ++l) {}
- if (l > 0) p->cigar[m++] = l << 4 | 8;
+ if (l > 0) p->cigar[m++] = l << 4 | MM_CIGAR_X_MISMATCH;
len -= l;
toff += l, qoff += l;
}
continue;
- } else if (op == 1) { // insertion
+ } else if (op == MM_CIGAR_INS) {
qoff += len;
- } else if (op == 2) { // deletion
+ } else if (op == MM_CIGAR_DEL) {
toff += len;
- } else if (op == 3) { // intron
+ } else if (op == MM_CIGAR_N_SKIP) {
toff += len;
}
p->cigar[m++] = r->p->cigar[k];
@@ -248,7 +248,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts
r->blen = r->mlen = 0;
for (k = 0; k < p->n_cigar; ++k) {
uint32_t op = p->cigar[k]&0xf, len = p->cigar[k]>>4;
- if (op == 0) { // match/mismatch
+ if (op == MM_CIGAR_MATCH) {
int n_ambi = 0, n_diff = 0;
for (l = 0; l < len; ++l) {
int cq = qseq[qoff + l], ct = tseq[toff + l];
@@ -260,7 +260,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts
}
r->blen += len - n_ambi, r->mlen += len - (n_ambi + n_diff), p->n_ambi += n_ambi;
toff += len, qoff += len;
- } else if (op == 1) { // insertion
+ } else if (op == MM_CIGAR_INS) {
int n_ambi = 0;
for (l = 0; l < len; ++l)
if (qseq[qoff + l] > 3) ++n_ambi;
@@ -268,7 +268,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts
s -= q + e * len;
if (s < 0) s = 0;
qoff += len;
- } else if (op == 2) { // deletion
+ } else if (op == MM_CIGAR_DEL) {
int n_ambi = 0;
for (l = 0; l < len; ++l)
if (tseq[toff + l] > 3) ++n_ambi;
@@ -276,7 +276,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts
s -= q + e * len;
if (s < 0) s = 0;
toff += len;
- } else if (op == 3) { // intron
+ } else if (op == MM_CIGAR_N_SKIP) {
toff += len;
}
}
@@ -333,7 +333,7 @@ static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint
int i;
fprintf(stderr, "score=%d, cigar=", ez->score);
for (i = 0; i < ez->n_cigar; ++i)
- fprintf(stderr, "%d%c", ez->cigar[i]>>4, "MIDN"[ez->cigar[i]&0xf]);
+ fprintf(stderr, "%d%c", ez->cigar[i]>>4, MM_CIGAR_STR[ez->cigar[i]&0xf]);
fprintf(stderr, "\n");
}
}
@@ -572,7 +572,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
int is_sr = !!(opt->flag & MM_F_SR), is_splice = !!(opt->flag & MM_F_SPLICE);
int32_t rid = a[r->as].x<<1>>33, rev = a[r->as].x>>63, as1, cnt1;
uint8_t *tseq, *qseq, *junc;
- int32_t i, l, bw, dropped = 0, extra_flag = 0, rs0, re0, qs0, qe0;
+ int32_t i, l, bw, bw_long, dropped = 0, extra_flag = 0, rs0, re0, qs0, qe0;
int32_t rs, re, qs, qe;
int32_t rs1, qs1, re1, qe1;
int8_t mat[25];
@@ -583,6 +583,8 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
if (r->cnt == 0) return;
ksw_gen_simple_mat(5, mat, opt->a, opt->b, opt->sc_ambi);
bw = (int)(opt->bw * 1.5 + 1.);
+ bw_long = (int)(opt->bw_long * 1.5 + 1.);
+ if (bw_long < bw) bw_long = bw;
if (is_sr && !(mi->flag & MM_I_HPC)) {
mm_max_stretch(r, a, &as1, &cnt1);
@@ -724,7 +726,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
} else mm_adjust_minier(mi, qseq0, &a[as1 + i], &re, &qe);
re1 = re, qe1 = qe;
if (i == cnt1 - 1 || (a[as1+i].y&MM_SEED_LONG_JOIN) || (qe - qs >= opt->min_ksw_len && re - rs >= opt->min_ksw_len)) {
- int j, bw1 = bw, zdrop_code;
+ int j, bw1 = bw_long, zdrop_code;
if (a[as1+i].y & MM_SEED_LONG_JOIN)
bw1 = qe - qs > re - rs? qe - qs : re - rs;
// perform alignment
@@ -743,7 +745,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
if (qseq[j] >= 4 || tseq[j] >= 4) ez->score += opt->e2;
else ez->score += qseq[j] == tseq[j]? opt->a : -opt->b;
}
- ez->cigar = ksw_push_cigar(km, &ez->n_cigar, &ez->m_cigar, ez->cigar, 0, qe - qs);
+ ez->cigar = ksw_push_cigar(km, &ez->n_cigar, &ez->m_cigar, ez->cigar, MM_CIGAR_MATCH, qe - qs);
} else { // perform normal gapped alignment
mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, junc, mat, bw1, -1, opt->zdrop, extra_flag|KSW_EZ_APPROX_MAX, ez); // first pass: with approximate Z-drop
}
@@ -754,6 +756,13 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
if (ez->n_cigar > 0)
mm_append_cigar(r, ez->n_cigar, ez->cigar);
if (ez->zdropped) { // truncated by Z-drop; TODO: sometimes Z-drop kicks in because the next seed placement is wrong. This can be fixed in principle.
+ if (!r->p) {
+ assert(ez->n_cigar == 0);
+ uint32_t capacity = sizeof(mm_extra_t)/4;
+ kroundup32(capacity);
+ r->p = (mm_extra_t*)calloc(capacity, 4);
+ r->p->capacity = capacity;
+ }
for (j = i - 1; j >= 0; --j)
if ((int32_t)a[as1 + j].x <= rs + ez->max_t)
break;
diff --git a/chain.c b/chain.c
deleted file mode 100644
index a2f7ac5..0000000
--- a/chain.c
+++ /dev/null
@@ -1,164 +0,0 @@
-#include
-#include
-#include
-#include "minimap.h"
-#include "mmpriv.h"
-#include "kalloc.h"
-
-static const char LogTable256[256] = {
-#define LT(n) n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n
- -1, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
- LT(4), LT(5), LT(5), LT(6), LT(6), LT(6), LT(6),
- LT(7), LT(7), LT(7), LT(7), LT(7), LT(7), LT(7), LT(7)
-};
-
-static inline int ilog2_32(uint32_t v)
-{
- uint32_t t, tt;
- if ((tt = v>>16)) return (t = tt>>8) ? 24 + LogTable256[t] : 16 + LogTable256[tt];
- return (t = v>>8) ? 8 + LogTable256[t] : LogTable256[v];
-}
-
-mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float gap_scale, int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km)
-{ // TODO: make sure this works when n has more than 32 bits
- int32_t k, *f, *p, *t, *v, n_u, n_v;
- int64_t i, j, st = 0;
- uint64_t *u, *u2, sum_qspan = 0;
- float avg_qspan;
- mm128_t *b, *w;
-
- if (_u) *_u = 0, *n_u_ = 0;
- if (n == 0 || a == 0) {
- kfree(km, a);
- return 0;
- }
- f = (int32_t*)kmalloc(km, n * 4);
- p = (int32_t*)kmalloc(km, n * 4);
- t = (int32_t*)kmalloc(km, n * 4);
- v = (int32_t*)kmalloc(km, n * 4);
- memset(t, 0, n * 4);
-
- for (i = 0; i < n; ++i) sum_qspan += a[i].y>>32&0xff;
- avg_qspan = (float)sum_qspan / n;
-
- // fill the score and backtrack arrays
- for (i = 0; i < n; ++i) {
- uint64_t ri = a[i].x;
- int64_t max_j = -1;
- int32_t qi = (int32_t)a[i].y, q_span = a[i].y>>32&0xff; // NB: only 8 bits of span is used!!!
- int32_t max_f = q_span, n_skip = 0, min_d;
- int32_t sidi = (a[i].y & MM_SEED_SEG_MASK) >> MM_SEED_SEG_SHIFT;
- while (st < i && ri > a[st].x + max_dist_x) ++st;
- if (i - st > max_iter) st = i - max_iter;
- for (j = i - 1; j >= st; --j) {
- int64_t dr = ri - a[j].x;
- int32_t dq = qi - (int32_t)a[j].y, dd, sc, log_dd, gap_cost;
- int32_t sidj = (a[j].y & MM_SEED_SEG_MASK) >> MM_SEED_SEG_SHIFT;
- if ((sidi == sidj && dr == 0) || dq <= 0) continue; // don't skip if an anchor is used by multiple segments; see below
- if ((sidi == sidj && dq > max_dist_y) || dq > max_dist_x) continue;
- dd = dr > dq? dr - dq : dq - dr;
- if (sidi == sidj && dd > bw) continue;
- if (n_segs > 1 && !is_cdna && sidi == sidj && dr > max_dist_y) continue;
- min_d = dq < dr? dq : dr;
- sc = min_d > q_span? q_span : dq < dr? dq : dr;
- log_dd = dd? ilog2_32(dd) : 0;
- gap_cost = 0;
- if (is_cdna || sidi != sidj) {
- int c_log, c_lin;
- c_lin = (int)(dd * .01 * avg_qspan);
- c_log = log_dd;
- if (sidi != sidj && dr == 0) ++sc; // possibly due to overlapping paired ends; give a minor bonus
- else if (dr > dq || sidi != sidj) gap_cost = c_lin < c_log? c_lin : c_log;
- else gap_cost = c_lin + (c_log>>1);
- } else gap_cost = (int)(dd * .01 * avg_qspan) + (log_dd>>1);
- sc -= (int)((double)gap_cost * gap_scale + .499);
- sc += f[j];
- if (sc > max_f) {
- max_f = sc, max_j = j;
- if (n_skip > 0) --n_skip;
- } else if (t[j] == i) {
- if (++n_skip > max_skip)
- break;
- }
- if (p[j] >= 0) t[p[j]] = i;
- }
- f[i] = max_f, p[i] = max_j;
- v[i] = max_j >= 0 && v[max_j] > max_f? v[max_j] : max_f; // v[] keeps the peak score up to i; f[] is the score ending at i, not always the peak
- }
-
- // find the ending positions of chains
- memset(t, 0, n * 4);
- for (i = 0; i < n; ++i)
- if (p[i] >= 0) t[p[i]] = 1;
- for (i = n_u = 0; i < n; ++i)
- if (t[i] == 0 && v[i] >= min_sc)
- ++n_u;
- if (n_u == 0) {
- kfree(km, a); kfree(km, f); kfree(km, p); kfree(km, t); kfree(km, v);
- return 0;
- }
- u = (uint64_t*)kmalloc(km, n_u * 8);
- for (i = n_u = 0; i < n; ++i) {
- if (t[i] == 0 && v[i] >= min_sc) {
- j = i;
- while (j >= 0 && f[j] < v[j]) j = p[j]; // find the peak that maximizes f[]
- if (j < 0) j = i; // TODO: this should really be assert(j>=0)
- u[n_u++] = (uint64_t)f[j] << 32 | j;
- }
- }
- radix_sort_64(u, u + n_u);
- for (i = 0; i < n_u>>1; ++i) { // reverse, s.t. the highest scoring chain is the first
- uint64_t t = u[i];
- u[i] = u[n_u - i - 1], u[n_u - i - 1] = t;
- }
-
- // backtrack
- memset(t, 0, n * 4);
- for (i = n_v = k = 0; i < n_u; ++i) { // starting from the highest score
- int32_t n_v0 = n_v, k0 = k;
- j = (int32_t)u[i];
- do {
- v[n_v++] = j;
- t[j] = 1;
- j = p[j];
- } while (j >= 0 && t[j] == 0);
- if (j < 0) {
- if (n_v - n_v0 >= min_cnt) u[k++] = u[i]>>32<<32 | (n_v - n_v0);
- } else if ((int32_t)(u[i]>>32) - f[j] >= min_sc) {
- if (n_v - n_v0 >= min_cnt) u[k++] = ((u[i]>>32) - f[j]) << 32 | (n_v - n_v0);
- }
- if (k0 == k) n_v = n_v0; // no new chain added, reset
- }
- *n_u_ = n_u = k, *_u = u; // NB: note that u[] may not be sorted by score here
-
- // free temporary arrays
- kfree(km, f); kfree(km, p); kfree(km, t);
-
- // write the result to b[]
- b = (mm128_t*)kmalloc(km, n_v * sizeof(mm128_t));
- for (i = 0, k = 0; i < n_u; ++i) {
- int32_t k0 = k, ni = (int32_t)u[i];
- for (j = 0; j < ni; ++j)
- b[k] = a[v[k0 + (ni - j - 1)]], ++k;
- }
- kfree(km, v);
-
- // sort u[] and a[] by a[].x, such that adjacent chains may be joined (required by mm_join_long)
- w = (mm128_t*)kmalloc(km, n_u * sizeof(mm128_t));
- for (i = k = 0; i < n_u; ++i) {
- w[i].x = b[k].x, w[i].y = (uint64_t)k<<32|i;
- k += (int32_t)u[i];
- }
- radix_sort_128x(w, w + n_u);
- u2 = (uint64_t*)kmalloc(km, n_u * 8);
- for (i = k = 0; i < n_u; ++i) {
- int32_t j = (int32_t)w[i].y, n = (int32_t)u[j];
- u2[i] = u[j];
- memcpy(&a[k], &b[w[i].y>>32], n * sizeof(mm128_t));
- k += n;
- }
- if (n_u) memcpy(u, u2, n_u * 8);
- if (k) memcpy(b, a, k * sizeof(mm128_t)); // write _a_ to _b_ and deallocate _a_ because _a_ is oversized, sometimes a lot
- kfree(km, a); kfree(km, w); kfree(km, u2);
- return b;
-}
diff --git a/code_of_conduct.md b/code_of_conduct.md
new file mode 100644
index 0000000..b7175c1
--- /dev/null
+++ b/code_of_conduct.md
@@ -0,0 +1,30 @@
+## Contributor Code of Conduct
+
+As contributors and maintainers of this project, we pledge to respect all
+people who contribute through reporting issues, posting feature requests,
+updating documentation, submitting pull requests or patches, and other
+activities.
+
+We are committed to making participation in this project a harassment-free
+experience for everyone, regardless of level of experience, gender, gender
+identity and expression, sexual orientation, disability, personal appearance,
+body size, race, age, or religion.
+
+Examples of unacceptable behavior by participants include the use of sexual
+language or imagery, derogatory comments or personal attacks, trolling, public
+or private harassment, insults, or other unprofessional conduct.
+
+Project maintainers have the right and responsibility to remove, edit, or
+reject comments, commits, code, wiki edits, issues, and other contributions
+that are not aligned to this Code of Conduct. Project maintainers or
+contributors who do not follow the Code of Conduct may be removed from the
+project team.
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported by opening an issue or contacting the maintainer via email.
+
+This Code of Conduct is adapted from the [Contributor Covenant][cc], [version
+1.0.0][v1].
+
+[cc]: http://contributor-covenant.org/
+[v1]: http://contributor-covenant.org/version/1/0/0/
diff --git a/cookbook.md b/cookbook.md
index 6cb4d5f..8ff2bd0 100644
--- a/cookbook.md
+++ b/cookbook.md
@@ -31,8 +31,8 @@ To acquire the data used in this cookbook and to install minimap2 and paftools,
please follow the command lines below:
```sh
# install minimap2 executables
-curl -L https://github.com/lh3/minimap2/releases/download/v2.17/minimap2-2.17_x64-linux.tar.bz2 | tar jxf -
-cp minimap2-2.17_x64-linux/{minimap2,k8,paftools.js} . # copy executables
+curl -L https://github.com/lh3/minimap2/releases/download/v2.21/minimap2-2.21_x64-linux.tar.bz2 | tar jxf -
+cp minimap2-2.21_x64-linux/{minimap2,k8,paftools.js} . # copy executables
export PATH="$PATH:"`pwd` # put the current directory on PATH
# download example datasets
curl -L https://github.com/lh3/minimap2/releases/download/v2.10/cookbook-data.tgz | tar zxf -
diff --git a/esterr.c b/esterr.c
index 733c762..7a509da 100644
--- a/esterr.c
+++ b/esterr.c
@@ -59,6 +59,6 @@ void mm_est_err(const mm_idx_t *mi, int qlen, int n_regs, mm_reg1_t *regs, const
n_tot = en - st + 1;
if (r->qs > avg_k && r->rs > avg_k) ++n_tot;
if (qlen - r->qs > avg_k && l_ref - r->re > avg_k) ++n_tot;
- r->div = logf((float)n_tot / n_match) / avg_k;
+ r->div = n_match >= n_tot? 0.0f : (float)(1.0 - pow((double)n_match / n_tot, 1.0 / avg_k));
}
}
diff --git a/example.c b/example.c
index ca1fdbe..b495051 100644
--- a/example.c
+++ b/example.c
@@ -47,7 +47,7 @@ int main(int argc, char *argv[])
printf("%s\t%d\t%d\t%d\t%c\t", ks->name.s, ks->seq.l, r->qs, r->qe, "+-"[r->rev]);
printf("%s\t%d\t%d\t%d\t%d\t%d\t%d\tcg:Z:", mi->seq[r->rid].name, mi->seq[r->rid].len, r->rs, r->re, r->mlen, r->blen, r->mapq);
for (i = 0; i < r->p->n_cigar; ++i) // IMPORTANT: this gives the CIGAR in the aligned regions. NO soft/hard clippings!
- printf("%d%c", r->p->cigar[i]>>4, "MIDNSH"[r->p->cigar[i]&0xf]);
+ printf("%d%c", r->p->cigar[i]>>4, MM_CIGAR_STR[r->p->cigar[i]&0xf]);
putchar('\n');
free(r->p);
}
diff --git a/format.c b/format.c
index 1773c95..b7a7be9 100644
--- a/format.c
+++ b/format.c
@@ -144,8 +144,8 @@ static void write_cs_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq
if (write_tag) mm_sprintf_lite(s, "\tcs:Z:");
for (i = q_off = t_off = 0; i < (int)r->p->n_cigar; ++i) {
int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4;
- assert((op >= 0 && op <= 3) || op == 7 || op == 8);
- if (op == 0 || op == 7 || op == 8) { // match
+ assert((op >= MM_CIGAR_MATCH && op <= MM_CIGAR_N_SKIP) || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH);
+ if (op == MM_CIGAR_MATCH || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH) {
int l_tmp = 0;
for (j = 0; j < len; ++j) {
if (qseq[q_off + j] != tseq[t_off + j]) {
@@ -166,12 +166,12 @@ static void write_cs_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq
} else mm_sprintf_lite(s, ":%d", l_tmp);
}
q_off += len, t_off += len;
- } else if (op == 1) { // insertion to ref
+ } else if (op == MM_CIGAR_INS) {
for (j = 0, tmp[len] = 0; j < len; ++j)
tmp[j] = "acgtn"[qseq[q_off + j]];
mm_sprintf_lite(s, "+%s", tmp);
q_off += len;
- } else if (op == 2) { // deletion from ref
+ } else if (op == MM_CIGAR_DEL) {
for (j = 0, tmp[len] = 0; j < len; ++j)
tmp[j] = "acgtn"[tseq[t_off + j]];
mm_sprintf_lite(s, "-%s", tmp);
@@ -192,8 +192,8 @@ static void write_MD_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq
if (write_tag) mm_sprintf_lite(s, "\tMD:Z:");
for (i = q_off = t_off = 0; i < (int)r->p->n_cigar; ++i) {
int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4;
- assert((op >= 0 && op <= 3) || op == 7 || op == 8);
- if (op == 0 || op == 7 || op == 8) { // match
+ assert((op >= MM_CIGAR_MATCH && op <= MM_CIGAR_N_SKIP) || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH);
+ if (op == MM_CIGAR_MATCH || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH) {
for (j = 0; j < len; ++j) {
if (qseq[q_off + j] != tseq[t_off + j]) {
mm_sprintf_lite(s, "%d%c", l_MD, "ACGTN"[tseq[t_off + j]]);
@@ -201,15 +201,15 @@ static void write_MD_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq
} else ++l_MD;
}
q_off += len, t_off += len;
- } else if (op == 1) { // insertion to ref
+ } else if (op == MM_CIGAR_INS) {
q_off += len;
- } else if (op == 2) { // deletion from ref
+ } else if (op == MM_CIGAR_DEL) {
for (j = 0, tmp[len] = 0; j < len; ++j)
tmp[j] = "ACGTN"[tseq[t_off + j]];
mm_sprintf_lite(s, "%d^%s", l_MD, tmp);
l_MD = 0;
t_off += len;
- } else if (op == 3) { // reference skip
+ } else if (op == MM_CIGAR_N_SKIP) {
t_off += len;
}
}
@@ -277,7 +277,7 @@ double mm_event_identity(const mm_reg1_t *r)
if (r->p == 0) return -1.0f;
for (i = 0; i < r->p->n_cigar; ++i) {
int32_t op = r->p->cigar[i] & 0xf, len = r->p->cigar[i] >> 4;
- if (op == 1 || op == 2)
+ if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL)
++n_gapo, n_gap += len;
}
return (double)r->mlen / (r->blen + r->p->n_ambi - n_gap + n_gapo);
@@ -335,7 +335,7 @@ void mm_write_paf3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const
uint32_t k;
mm_sprintf_lite(s, "\tcg:Z:");
for (k = 0; k < r->p->n_cigar; ++k)
- mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MIDNSHP=XB"[r->p->cigar[k]&0xf]);
+ mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, MM_CIGAR_STR[r->p->cigar[k]&0xf]);
}
if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_MD)))
write_cs_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD, 1, opt_flag&MM_F_QSTRAND);
@@ -392,7 +392,7 @@ static void write_sam_cigar(kstring_t *s, int sam_flag, int in_tag, int qlen, co
assert(clip_len[0] < qlen && clip_len[1] < qlen);
if (clip_len[0]) mm_sprintf_lite(s, "%d%c", clip_len[0], clip_char);
for (k = 0; k < r->p->n_cigar; ++k)
- mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MIDNSHP=XB"[r->p->cigar[k]&0xf]);
+ mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, MM_CIGAR_STR[r->p->cigar[k]&0xf]);
if (clip_len[1]) mm_sprintf_lite(s, "%d%c", clip_len[1], clip_char);
}
}
@@ -402,7 +402,7 @@ void mm_write_sam3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int se
{
const int max_bam_cigar_op = 65535;
int flag, n_regs = n_regss[seg_idx], cigar_in_tag = 0;
- int this_rid = -1, this_pos = -1, this_rev = 0;
+ int this_rid = -1, this_pos = -1;
const mm_reg1_t *regs = regss[seg_idx], *r_prev = NULL, *r_next;
const mm_reg1_t *r = n_regs > 0 && reg_idx < n_regs && reg_idx >= 0? ®s[reg_idx] : NULL;
@@ -451,7 +451,7 @@ void mm_write_sam3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int se
mm_sprintf_lite(s, "\t%s\t%d\t0\t*", mi->seq[this_rid].name, this_pos+1);
} else mm_sprintf_lite(s, "\t*\t0\t0\t*");
} else {
- this_rid = r->rid, this_pos = r->rs, this_rev = r->rev;
+ this_rid = r->rid, this_pos = r->rs;
mm_sprintf_lite(s, "\t%s\t%d\t%d\t", mi->seq[r->rid].name, r->rs+1, r->mapq);
if ((opt_flag & MM_F_LONG_CIGAR) && r->p && r->p->n_cigar > max_bam_cigar_op - 2) {
int n_cigar = r->p->n_cigar;
diff --git a/hit.c b/hit.c
index cd30ada..c2e8180 100644
--- a/hit.c
+++ b/hit.c
@@ -122,7 +122,7 @@ void mm_split_reg(mm_reg1_t *r, mm_reg1_t *r2, int n, int qlen, mm128_t *a, int
r->split |= 1, r2->split |= 2;
}
-void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r, int sub_diff, int hard_mask_level, float alt_diff_frac) // and compute mm_reg1_t::subsc
+void mm_set_parent(void *km, float mask_level, int mask_len, int n, mm_reg1_t *r, int sub_diff, int hard_mask_level, float alt_diff_frac) // and compute mm_reg1_t::subsc
{
int i, j, k, *w;
uint64_t *cov;
@@ -162,7 +162,7 @@ skip_uncov:
min = ej - sj < ei - si? ej - sj : ei - si;
max = ej - sj > ei - si? ej - sj : ei - si;
ol = si < sj? (ei < sj? 0 : ei < ej? ei - sj : ej - sj) : (ej < si? 0 : ej < ei? ej - si : ei - si); // overlap length; TODO: this can be simplified
- if ((float)ol / min - (float)uncov_len / max > mask_level) {
+ if ((float)ol / min - (float)uncov_len / max > mask_level && uncov_len <= mask_len) { // then this is a secondary hit
int cnt_sub = 0, sci = ri->score;
ri->parent = rp->parent;
if (!rp->is_alt && ri->is_alt) sci = mm_alt_score(sci, alt_diff_frac);
@@ -312,64 +312,6 @@ int mm_squeeze_a(void *km, int n_regs, mm_reg1_t *regs, mm128_t *a)
return as;
}
-void mm_join_long(void *km, const mm_mapopt_t *opt, int qlen, int *n_regs_, mm_reg1_t *regs, mm128_t *a)
-{
- int i, n_aux, n_regs = *n_regs_, n_drop = 0;
- uint64_t *aux;
-
- if (n_regs < 2) return; // nothing to join
- mm_squeeze_a(km, n_regs, regs, a);
-
- aux = (uint64_t*)kmalloc(km, n_regs * 8);
- for (i = n_aux = 0; i < n_regs; ++i)
- if (regs[i].parent == i || regs[i].parent < 0)
- aux[n_aux++] = (uint64_t)regs[i].as << 32 | i;
- radix_sort_64(aux, aux + n_aux);
-
- for (i = n_aux - 1; i >= 1; --i) {
- mm_reg1_t *r0 = ®s[(int32_t)aux[i-1]], *r1 = ®s[(int32_t)aux[i]];
- mm128_t *a0e, *a1s;
- int max_gap, min_gap, sc_thres, min_flank_len;
-
- // test
- if (r0->as + r0->cnt != r1->as) continue; // not adjacent in a[]
- if (r0->rid != r1->rid || r0->rev != r1->rev) continue; // make sure on the same target and strand
- a0e = &a[r0->as + r0->cnt - 1];
- a1s = &a[r1->as];
- if (a1s->x <= a0e->x || (int32_t)a1s->y <= (int32_t)a0e->y) continue; // keep colinearity
- max_gap = min_gap = (int32_t)a1s->y - (int32_t)a0e->y;
- max_gap = a0e->x + max_gap > a1s->x? max_gap : a1s->x - a0e->x;
- min_gap = a0e->x + min_gap < a1s->x? min_gap : a1s->x - a0e->x;
- if (max_gap > opt->max_join_long || min_gap > opt->max_join_short) continue;
- sc_thres = (int)((float)opt->min_join_flank_sc / opt->max_join_long * max_gap + .499);
- if (r0->score < sc_thres || r1->score < sc_thres) continue; // require good flanking chains
- min_flank_len = (int)(max_gap * opt->min_join_flank_ratio);
- if (r0->re - r0->rs < min_flank_len || r0->qe - r0->qs < min_flank_len) continue; // require enough flanking length
- if (r1->re - r1->rs < min_flank_len || r1->qe - r1->qs < min_flank_len) continue;
-
- // all conditions satisfied; join
- a[r1->as].y |= MM_SEED_LONG_JOIN;
- r0->cnt += r1->cnt, r0->score += r1->score;
- mm_reg_set_coor(r0, qlen, a, opt->flag&MM_F_QSTRAND);
- r1->cnt = 0;
- r1->parent = r0->id;
- ++n_drop;
- }
- kfree(km, aux);
-
- if (n_drop > 0) { // then fix the hits hierarchy
- for (i = 0; i < n_regs; ++i) { // adjust the mm_reg1_t::parent
- mm_reg1_t *r = ®s[i];
- if (r->parent >= 0 && r->id != r->parent) { // fix for secondary hits only
- if (regs[r->parent].parent >= 0 && regs[r->parent].parent != r->parent)
- r->parent = regs[r->parent].parent;
- }
- }
- mm_filter_regs(opt, qlen, n_regs_, regs);
- mm_sync_regs(km, *n_regs_, regs);
- }
-}
-
mm_seg_t *mm_seg_gen(void *km, uint32_t hash, int n_segs, const int *qlens, int n_regs0, const mm_reg1_t *regs0, int *n_regs, mm_reg1_t **regs, const mm128_t *a)
{
int s, i, j, acc_qlen[MM_MAX_SEG+1], qlen_sum = 0;
diff --git a/index.c b/index.c
index 3838382..0d8a2ae 100644
--- a/index.c
+++ b/index.c
@@ -676,7 +676,7 @@ mm_idx_intv_t *mm_idx_read_bed(const mm_idx_t *mi, const char *fn, int read_junc
char *p, *q, *bl, *bs;
int32_t i, id = -1, n_blk = 0;
for (p = q = str.s, i = 0;; ++p) {
- if (*p == 0 || isspace(*p)) {
+ if (*p == 0 || *p == '\t') {
int32_t c = *p;
*p = 0;
if (i == 0) { // chr
diff --git a/kalloc.h b/kalloc.h
index 6d72e4e..93bff5e 100644
--- a/kalloc.h
+++ b/kalloc.h
@@ -34,4 +34,43 @@ void km_stat(const void *_km, km_stat_t *s);
KREALLOC((km), (a), (m)); \
} while (0)
+#ifndef klib_unused
+#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3)
+#define klib_unused __attribute__ ((__unused__))
+#else
+#define klib_unused
+#endif
+#endif /* klib_unused */
+
+#define KALLOC_POOL_INIT2(SCOPE, name, kmptype_t) \
+ typedef struct { \
+ size_t cnt, n, max; \
+ kmptype_t **buf; \
+ void *km; \
+ } kmp_##name##_t; \
+ SCOPE kmp_##name##_t *kmp_init_##name(void *km) { \
+ kmp_##name##_t *mp; \
+ KCALLOC(km, mp, 1); \
+ mp->km = km; \
+ return mp; \
+ } \
+ SCOPE void kmp_destroy_##name(kmp_##name##_t *mp) { \
+ size_t k; \
+ for (k = 0; k < mp->n; ++k) kfree(mp->km, mp->buf[k]); \
+ kfree(mp->km, mp->buf); kfree(mp->km, mp); \
+ } \
+ SCOPE kmptype_t *kmp_alloc_##name(kmp_##name##_t *mp) { \
+ ++mp->cnt; \
+ if (mp->n == 0) return (kmptype_t*)kcalloc(mp->km, 1, sizeof(kmptype_t)); \
+ return mp->buf[--mp->n]; \
+ } \
+ SCOPE void kmp_free_##name(kmp_##name##_t *mp, kmptype_t *p) { \
+ --mp->cnt; \
+ if (mp->n == mp->max) KEXPAND(mp->km, mp->buf, mp->max); \
+ mp->buf[mp->n++] = p; \
+ }
+
+#define KALLOC_POOL_INIT(name, kmptype_t) \
+ KALLOC_POOL_INIT2(static inline klib_unused, name, kmptype_t)
+
#endif
diff --git a/krmq.h b/krmq.h
new file mode 100644
index 0000000..8fa1cce
--- /dev/null
+++ b/krmq.h
@@ -0,0 +1,474 @@
+/* The MIT License
+
+ Copyright (c) 2019 by Attractive Chaos
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/* An example:
+
+#include
+#include
+#include
+#include "krmq.h"
+
+struct my_node {
+ char key;
+ KRMQ_HEAD(struct my_node) head;
+};
+#define my_cmp(p, q) (((q)->key < (p)->key) - ((p)->key < (q)->key))
+KRMQ_INIT(my, struct my_node, head, my_cmp)
+
+int main(void) {
+ const char *str = "MNOLKQOPHIA"; // from wiki, except a duplicate
+ struct my_node *root = 0;
+ int i, l = strlen(str);
+ for (i = 0; i < l; ++i) { // insert in the input order
+ struct my_node *q, *p = malloc(sizeof(*p));
+ p->key = str[i];
+ q = krmq_insert(my, &root, p, 0);
+ if (p != q) free(p); // if already present, free
+ }
+ krmq_itr_t(my) itr;
+ krmq_itr_first(my, root, &itr); // place at first
+ do { // traverse
+ const struct my_node *p = krmq_at(&itr);
+ putchar(p->key);
+ free((void*)p); // free node
+ } while (krmq_itr_next(my, &itr));
+ putchar('\n');
+ return 0;
+}
+*/
+
+#ifndef KRMQ_H
+#define KRMQ_H
+
+#ifdef __STRICT_ANSI__
+#define inline __inline__
+#endif
+
+#define KRMQ_MAX_DEPTH 64
+
+#define krmq_size(head, p) ((p)? (p)->head.size : 0)
+#define krmq_size_child(head, q, i) ((q)->head.p[(i)]? (q)->head.p[(i)]->head.size : 0)
+
+#define KRMQ_HEAD(__type) \
+ struct { \
+ __type *p[2], *s; \
+ signed char balance; /* balance factor */ \
+ unsigned size; /* #elements in subtree */ \
+ }
+
+#define __KRMQ_FIND(suf, __scope, __type, __head, __cmp) \
+ __scope __type *krmq_find_##suf(const __type *root, const __type *x, unsigned *cnt_) { \
+ const __type *p = root; \
+ unsigned cnt = 0; \
+ while (p != 0) { \
+ int cmp; \
+ cmp = __cmp(x, p); \
+ if (cmp >= 0) cnt += krmq_size_child(__head, p, 0) + 1; \
+ if (cmp < 0) p = p->__head.p[0]; \
+ else if (cmp > 0) p = p->__head.p[1]; \
+ else break; \
+ } \
+ if (cnt_) *cnt_ = cnt; \
+ return (__type*)p; \
+ } \
+ __scope __type *krmq_interval_##suf(const __type *root, const __type *x, __type **lower, __type **upper) { \
+ const __type *p = root, *l = 0, *u = 0; \
+ while (p != 0) { \
+ int cmp; \
+ cmp = __cmp(x, p); \
+ if (cmp < 0) u = p, p = p->__head.p[0]; \
+ else if (cmp > 0) l = p, p = p->__head.p[1]; \
+ else { l = u = p; break; } \
+ } \
+ if (lower) *lower = (__type*)l; \
+ if (upper) *upper = (__type*)u; \
+ return (__type*)p; \
+ }
+
+#define __KRMQ_RMQ(suf, __scope, __type, __head, __cmp, __lt2) \
+ __scope __type *krmq_rmq_##suf(const __type *root, const __type *lo, const __type *up) { /* CLOSED interval */ \
+ const __type *p = root, *path[2][KRMQ_MAX_DEPTH], *min; \
+ int plen[2] = {0, 0}, pcmp[2][KRMQ_MAX_DEPTH], i, cmp, lca; \
+ if (root == 0) return 0; \
+ while (p) { \
+ cmp = __cmp(lo, p); \
+ path[0][plen[0]] = p, pcmp[0][plen[0]++] = cmp; \
+ if (cmp < 0) p = p->__head.p[0]; \
+ else if (cmp > 0) p = p->__head.p[1]; \
+ else break; \
+ } \
+ p = root; \
+ while (p) { \
+ cmp = __cmp(up, p); \
+ path[1][plen[1]] = p, pcmp[1][plen[1]++] = cmp; \
+ if (cmp < 0) p = p->__head.p[0]; \
+ else if (cmp > 0) p = p->__head.p[1]; \
+ else break; \
+ } \
+ for (i = 0; i < plen[0] && i < plen[1]; ++i) /* find the LCA */ \
+ if (path[0][i] == path[1][i] && pcmp[0][i] <= 0 && pcmp[1][i] >= 0) \
+ break; \
+ if (i == plen[0] || i == plen[1]) return 0; /* no elements in the closed interval */ \
+ lca = i, min = path[0][lca]; \
+ for (i = lca + 1; i < plen[0]; ++i) { \
+ if (pcmp[0][i] <= 0) { \
+ if (__lt2(path[0][i], min)) min = path[0][i]; \
+ if (path[0][i]->__head.p[1] && __lt2(path[0][i]->__head.p[1]->__head.s, min)) \
+ min = path[0][i]->__head.p[1]->__head.s; \
+ } \
+ } \
+ for (i = lca + 1; i < plen[1]; ++i) { \
+ if (pcmp[1][i] >= 0) { \
+ if (__lt2(path[1][i], min)) min = path[1][i]; \
+ if (path[1][i]->__head.p[0] && __lt2(path[1][i]->__head.p[0]->__head.s, min)) \
+ min = path[1][i]->__head.p[0]->__head.s; \
+ } \
+ } \
+ return (__type*)min; \
+ }
+
+#define __KRMQ_ROTATE(suf, __type, __head, __lt2) \
+ /* */ \
+ static inline void krmq_update_min_##suf(__type *p, const __type *q, const __type *r) { \
+ p->__head.s = !q || __lt2(p, q->__head.s)? p : q->__head.s; \
+ p->__head.s = !r || __lt2(p->__head.s, r->__head.s)? p->__head.s : r->__head.s; \
+ } \
+ /* one rotation: (a,(b,c)q)p => ((a,b)p,c)q */ \
+ static inline __type *krmq_rotate1_##suf(__type *p, int dir) { /* dir=0 to left; dir=1 to right */ \
+ int opp = 1 - dir; /* opposite direction */ \
+ __type *q = p->__head.p[opp], *s = p->__head.s; \
+ unsigned size_p = p->__head.size; \
+ p->__head.size -= q->__head.size - krmq_size_child(__head, q, dir); \
+ q->__head.size = size_p; \
+ krmq_update_min_##suf(p, p->__head.p[dir], q->__head.p[dir]); \
+ q->__head.s = s; \
+ p->__head.p[opp] = q->__head.p[dir]; \
+ q->__head.p[dir] = p; \
+ return q; \
+ } \
+ /* two consecutive rotations: (a,((b,c)r,d)q)p => ((a,b)p,(c,d)q)r */ \
+ static inline __type *krmq_rotate2_##suf(__type *p, int dir) { \
+ int b1, opp = 1 - dir; \
+ __type *q = p->__head.p[opp], *r = q->__head.p[dir], *s = p->__head.s; \
+ unsigned size_x_dir = krmq_size_child(__head, r, dir); \
+ r->__head.size = p->__head.size; \
+ p->__head.size -= q->__head.size - size_x_dir; \
+ q->__head.size -= size_x_dir + 1; \
+ krmq_update_min_##suf(p, p->__head.p[dir], r->__head.p[dir]); \
+ krmq_update_min_##suf(q, q->__head.p[opp], r->__head.p[opp]); \
+ r->__head.s = s; \
+ p->__head.p[opp] = r->__head.p[dir]; \
+ r->__head.p[dir] = p; \
+ q->__head.p[dir] = r->__head.p[opp]; \
+ r->__head.p[opp] = q; \
+ b1 = dir == 0? +1 : -1; \
+ if (r->__head.balance == b1) q->__head.balance = 0, p->__head.balance = -b1; \
+ else if (r->__head.balance == 0) q->__head.balance = p->__head.balance = 0; \
+ else q->__head.balance = b1, p->__head.balance = 0; \
+ r->__head.balance = 0; \
+ return r; \
+ }
+
+#define __KRMQ_INSERT(suf, __scope, __type, __head, __cmp, __lt2) \
+ __scope __type *krmq_insert_##suf(__type **root_, __type *x, unsigned *cnt_) { \
+ unsigned char stack[KRMQ_MAX_DEPTH]; \
+ __type *path[KRMQ_MAX_DEPTH]; \
+ __type *bp, *bq; \
+ __type *p, *q, *r = 0; /* _r_ is potentially the new root */ \
+ int i, which = 0, top, b1, path_len; \
+ unsigned cnt = 0; \
+ bp = *root_, bq = 0; \
+ /* find the insertion location */ \
+ for (p = bp, q = bq, top = path_len = 0; p; q = p, p = p->__head.p[which]) { \
+ int cmp; \
+ cmp = __cmp(x, p); \
+ if (cmp >= 0) cnt += krmq_size_child(__head, p, 0) + 1; \
+ if (cmp == 0) { \
+ if (cnt_) *cnt_ = cnt; \
+ return p; \
+ } \
+ if (p->__head.balance != 0) \
+ bq = q, bp = p, top = 0; \
+ stack[top++] = which = (cmp > 0); \
+ path[path_len++] = p; \
+ } \
+ if (cnt_) *cnt_ = cnt; \
+ x->__head.balance = 0, x->__head.size = 1, x->__head.p[0] = x->__head.p[1] = 0, x->__head.s = x; \
+ if (q == 0) *root_ = x; \
+ else q->__head.p[which] = x; \
+ if (bp == 0) return x; \
+ for (i = 0; i < path_len; ++i) ++path[i]->__head.size; \
+ for (i = path_len - 1; i >= 0; --i) { \
+ krmq_update_min_##suf(path[i], path[i]->__head.p[0], path[i]->__head.p[1]); \
+ if (path[i]->__head.s != x) break; \
+ } \
+ for (p = bp, top = 0; p != x; p = p->__head.p[stack[top]], ++top) /* update balance factors */ \
+ if (stack[top] == 0) --p->__head.balance; \
+ else ++p->__head.balance; \
+ if (bp->__head.balance > -2 && bp->__head.balance < 2) return x; /* no re-balance needed */ \
+ /* re-balance */ \
+ which = (bp->__head.balance < 0); \
+ b1 = which == 0? +1 : -1; \
+ q = bp->__head.p[1 - which]; \
+ if (q->__head.balance == b1) { \
+ r = krmq_rotate1_##suf(bp, which); \
+ q->__head.balance = bp->__head.balance = 0; \
+ } else r = krmq_rotate2_##suf(bp, which); \
+ if (bq == 0) *root_ = r; \
+ else bq->__head.p[bp != bq->__head.p[0]] = r; \
+ return x; \
+ }
+
+#define __KRMQ_ERASE(suf, __scope, __type, __head, __cmp, __lt2) \
+ __scope __type *krmq_erase_##suf(__type **root_, const __type *x, unsigned *cnt_) { \
+ __type *p, *path[KRMQ_MAX_DEPTH], fake; \
+ unsigned char dir[KRMQ_MAX_DEPTH]; \
+ int i, d = 0, cmp; \
+ unsigned cnt = 0; \
+ fake = **root_, fake.__head.p[0] = *root_, fake.__head.p[1] = 0; \
+ if (cnt_) *cnt_ = 0; \
+ if (x) { \
+ for (cmp = -1, p = &fake; cmp; cmp = __cmp(x, p)) { \
+ int which = (cmp > 0); \
+ if (cmp > 0) cnt += krmq_size_child(__head, p, 0) + 1; \
+ dir[d] = which; \
+ path[d++] = p; \
+ p = p->__head.p[which]; \
+ if (p == 0) { \
+ if (cnt_) *cnt_ = 0; \
+ return 0; \
+ } \
+ } \
+ cnt += krmq_size_child(__head, p, 0) + 1; /* because p==x is not counted */ \
+ } else { \
+ for (p = &fake, cnt = 1; p; p = p->__head.p[0]) \
+ dir[d] = 0, path[d++] = p; \
+ p = path[--d]; \
+ } \
+ if (cnt_) *cnt_ = cnt; \
+ for (i = 1; i < d; ++i) --path[i]->__head.size; \
+ if (p->__head.p[1] == 0) { /* ((1,.)2,3)4 => (1,3)4; p=2 */ \
+ path[d-1]->__head.p[dir[d-1]] = p->__head.p[0]; \
+ } else { \
+ __type *q = p->__head.p[1]; \
+ if (q->__head.p[0] == 0) { /* ((1,2)3,4)5 => ((1)2,4)5; p=3,q=2 */ \
+ q->__head.p[0] = p->__head.p[0]; \
+ q->__head.balance = p->__head.balance; \
+ path[d-1]->__head.p[dir[d-1]] = q; \
+ path[d] = q, dir[d++] = 1; \
+ q->__head.size = p->__head.size - 1; \
+ } else { /* ((1,((.,2)3,4)5)6,7)8 => ((1,(2,4)5)3,7)8; p=6 */ \
+ __type *r; \
+ int e = d++; /* backup _d_ */\
+ for (;;) { \
+ dir[d] = 0; \
+ path[d++] = q; \
+ r = q->__head.p[0]; \
+ if (r->__head.p[0] == 0) break; \
+ q = r; \
+ } \
+ r->__head.p[0] = p->__head.p[0]; \
+ q->__head.p[0] = r->__head.p[1]; \
+ r->__head.p[1] = p->__head.p[1]; \
+ r->__head.balance = p->__head.balance; \
+ path[e-1]->__head.p[dir[e-1]] = r; \
+ path[e] = r, dir[e] = 1; \
+ for (i = e + 1; i < d; ++i) --path[i]->__head.size; \
+ r->__head.size = p->__head.size - 1; \
+ } \
+ } \
+ for (i = d - 1; i >= 0; --i) /* not sure why adding condition "path[i]->__head.s==p" doesn't work */ \
+ krmq_update_min_##suf(path[i], path[i]->__head.p[0], path[i]->__head.p[1]); \
+ while (--d > 0) { \
+ __type *q = path[d]; \
+ int which, other, b1 = 1, b2 = 2; \
+ which = dir[d], other = 1 - which; \
+ if (which) b1 = -b1, b2 = -b2; \
+ q->__head.balance += b1; \
+ if (q->__head.balance == b1) break; \
+ else if (q->__head.balance == b2) { \
+ __type *r = q->__head.p[other]; \
+ if (r->__head.balance == -b1) { \
+ path[d-1]->__head.p[dir[d-1]] = krmq_rotate2_##suf(q, which); \
+ } else { \
+ path[d-1]->__head.p[dir[d-1]] = krmq_rotate1_##suf(q, which); \
+ if (r->__head.balance == 0) { \
+ r->__head.balance = -b1; \
+ q->__head.balance = b1; \
+ break; \
+ } else r->__head.balance = q->__head.balance = 0; \
+ } \
+ } \
+ } \
+ *root_ = fake.__head.p[0]; \
+ return p; \
+ }
+
+#define krmq_free(__type, __head, __root, __free) do { \
+ __type *_p, *_q; \
+ for (_p = __root; _p; _p = _q) { \
+ if (_p->__head.p[0] == 0) { \
+ _q = _p->__head.p[1]; \
+ __free(_p); \
+ } else { \
+ _q = _p->__head.p[0]; \
+ _p->__head.p[0] = _q->__head.p[1]; \
+ _q->__head.p[1] = _p; \
+ } \
+ } \
+ } while (0)
+
+#define __KRMQ_ITR(suf, __scope, __type, __head, __cmp) \
+ struct krmq_itr_##suf { \
+ const __type *stack[KRMQ_MAX_DEPTH], **top; \
+ }; \
+ __scope void krmq_itr_first_##suf(const __type *root, struct krmq_itr_##suf *itr) { \
+ const __type *p; \
+ for (itr->top = itr->stack - 1, p = root; p; p = p->__head.p[0]) \
+ *++itr->top = p; \
+ } \
+ __scope int krmq_itr_find_##suf(const __type *root, const __type *x, struct krmq_itr_##suf *itr) { \
+ const __type *p = root; \
+ itr->top = itr->stack - 1; \
+ while (p != 0) { \
+ int cmp; \
+ *++itr->top = p; \
+ cmp = __cmp(x, p); \
+ if (cmp < 0) p = p->__head.p[0]; \
+ else if (cmp > 0) p = p->__head.p[1]; \
+ else break; \
+ } \
+ return p? 1 : 0; \
+ } \
+ __scope int krmq_itr_next_bidir_##suf(struct krmq_itr_##suf *itr, int dir) { \
+ const __type *p; \
+ if (itr->top < itr->stack) return 0; \
+ dir = !!dir; \
+ p = (*itr->top)->__head.p[dir]; \
+ if (p) { /* go down */ \
+ for (; p; p = p->__head.p[!dir]) \
+ *++itr->top = p; \
+ return 1; \
+ } else { /* go up */ \
+ do { \
+ p = *itr->top--; \
+ } while (itr->top >= itr->stack && p == (*itr->top)->__head.p[dir]); \
+ return itr->top < itr->stack? 0 : 1; \
+ } \
+ } \
+
+/**
+ * Insert a node to the tree
+ *
+ * @param suf name suffix used in KRMQ_INIT()
+ * @param proot pointer to the root of the tree (in/out: root may change)
+ * @param x node to insert (in)
+ * @param cnt number of nodes smaller than or equal to _x_; can be NULL (out)
+ *
+ * @return _x_ if not present in the tree, or the node equal to x.
+ */
+#define krmq_insert(suf, proot, x, cnt) krmq_insert_##suf(proot, x, cnt)
+
+/**
+ * Find a node in the tree
+ *
+ * @param suf name suffix used in KRMQ_INIT()
+ * @param root root of the tree
+ * @param x node value to find (in)
+ * @param cnt number of nodes smaller than or equal to _x_; can be NULL (out)
+ *
+ * @return node equal to _x_ if present, or NULL if absent
+ */
+#define krmq_find(suf, root, x, cnt) krmq_find_##suf(root, x, cnt)
+#define krmq_interval(suf, root, x, lower, upper) krmq_interval_##suf(root, x, lower, upper)
+#define krmq_rmq(suf, root, lo, up) krmq_rmq_##suf(root, lo, up)
+
+/**
+ * Delete a node from the tree
+ *
+ * @param suf name suffix used in KRMQ_INIT()
+ * @param proot pointer to the root of the tree (in/out: root may change)
+ * @param x node value to delete; if NULL, delete the first node (in)
+ *
+ * @return node removed from the tree if present, or NULL if absent
+ */
+#define krmq_erase(suf, proot, x, cnt) krmq_erase_##suf(proot, x, cnt)
+#define krmq_erase_first(suf, proot) krmq_erase_##suf(proot, 0, 0)
+
+#define krmq_itr_t(suf) struct krmq_itr_##suf
+
+/**
+ * Place the iterator at the smallest object
+ *
+ * @param suf name suffix used in KRMQ_INIT()
+ * @param root root of the tree
+ * @param itr iterator
+ */
+#define krmq_itr_first(suf, root, itr) krmq_itr_first_##suf(root, itr)
+
+/**
+ * Place the iterator at the object equal to or greater than the query
+ *
+ * @param suf name suffix used in KRMQ_INIT()
+ * @param root root of the tree
+ * @param x query (in)
+ * @param itr iterator (out)
+ *
+ * @return 1 if find; 0 otherwise. krmq_at(itr) is NULL if and only if query is
+ * larger than all objects in the tree
+ */
+#define krmq_itr_find(suf, root, x, itr) krmq_itr_find_##suf(root, x, itr)
+
+/**
+ * Move to the next object in order
+ *
+ * @param itr iterator (modified)
+ *
+ * @return 1 if there is a next object; 0 otherwise
+ */
+#define krmq_itr_next(suf, itr) krmq_itr_next_bidir_##suf(itr, 1)
+#define krmq_itr_prev(suf, itr) krmq_itr_next_bidir_##suf(itr, 0)
+
+/**
+ * Return the pointer at the iterator
+ *
+ * @param itr iterator
+ *
+ * @return pointer if present; NULL otherwise
+ */
+#define krmq_at(itr) ((itr)->top < (itr)->stack? 0 : *(itr)->top)
+
+#define KRMQ_INIT2(suf, __scope, __type, __head, __cmp, __lt2) \
+ __KRMQ_FIND(suf, __scope, __type, __head, __cmp) \
+ __KRMQ_RMQ(suf, __scope, __type, __head, __cmp, __lt2) \
+ __KRMQ_ROTATE(suf, __type, __head, __lt2) \
+ __KRMQ_INSERT(suf, __scope, __type, __head, __cmp, __lt2) \
+ __KRMQ_ERASE(suf, __scope, __type, __head, __cmp, __lt2) \
+ __KRMQ_ITR(suf, __scope, __type, __head, __cmp)
+
+#define KRMQ_INIT(suf, __type, __head, __cmp, __lt2) \
+ KRMQ_INIT2(suf,, __type, __head, __cmp, __lt2)
+
+#endif
diff --git a/ksw2.h b/ksw2.h
index c700136..cbd1ddc 100644
--- a/ksw2.h
+++ b/ksw2.h
@@ -16,6 +16,13 @@
#define KSW_EZ_SPLICE_REV 0x200
#define KSW_EZ_SPLICE_FLANK 0x400
+// The subset of CIGAR operators used by ksw code.
+// Use MM_CIGAR_* from minimap.h if you need the full list.
+#define KSW_CIGAR_MATCH 0
+#define KSW_CIGAR_INS 1
+#define KSW_CIGAR_DEL 2
+#define KSW_CIGAR_N_SKIP 3
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -137,13 +144,13 @@ static inline void ksw_backtrack(void *km, int is_rot, int is_rev, int min_intro
else if (!(tmp >> (state + 2) & 1)) state = 0; // if requesting other states, _state_ stays the same if it is a continuation; otherwise, set to H
if (state == 0) state = tmp & 7; // TODO: probably this line can be merged into the "else if" line right above; not 100% sure
if (force_state >= 0) state = force_state;
- if (state == 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 0, 1), --i, --j; // match
- else if (state == 1 || (state == 3 && min_intron_len <= 0)) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 2, 1), --i; // deletion
- else if (state == 3 && min_intron_len > 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 3, 1), --i; // intron
- else cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 1, 1), --j; // insertion
+ if (state == 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_MATCH, 1), --i, --j;
+ else if (state == 1 || (state == 3 && min_intron_len <= 0)) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_DEL, 1), --i;
+ else if (state == 3 && min_intron_len > 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_N_SKIP, 1), --i;
+ else cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_INS, 1), --j;
}
- if (i >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, min_intron_len > 0 && i >= min_intron_len? 3 : 2, i + 1); // first deletion
- if (j >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 1, j + 1); // first insertion
+ if (i >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, min_intron_len > 0 && i >= min_intron_len? KSW_CIGAR_N_SKIP : KSW_CIGAR_DEL, i + 1); // first deletion
+ if (j >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_INS, j + 1); // first insertion
if (!is_rev)
for (i = 0; i < n_cigar>>1; ++i) // reverse CIGAR
tmp = cigar[i], cigar[i] = cigar[n_cigar-1-i], cigar[n_cigar-1-i] = tmp;
diff --git a/ksw2_extd2_sse.c b/ksw2_extd2_sse.c
index b578274..162e9e2 100644
--- a/ksw2_extd2_sse.c
+++ b/ksw2_extd2_sse.c
@@ -4,15 +4,23 @@
#include "ksw2.h"
#ifdef __SSE2__
+#ifdef USE_SIMDE
+#include
+#else
#include
+#endif
#ifdef KSW_SSE2_ONLY
#undef __SSE4_1__
#endif
#ifdef __SSE4_1__
+#ifdef USE_SIMDE
+#include
+#else
#include
#endif
+#endif
#ifdef KSW_CPU_DISPATCH
#ifdef __SSE4_1__
diff --git a/ksw2_exts2_sse.c b/ksw2_exts2_sse.c
index e7984c6..4157e38 100644
--- a/ksw2_exts2_sse.c
+++ b/ksw2_exts2_sse.c
@@ -4,15 +4,22 @@
#include "ksw2.h"
#ifdef __SSE2__
+#ifdef USE_SIMDE
+#include
+#else
#include
-
+#endif
#ifdef KSW_SSE2_ONLY
#undef __SSE4_1__
#endif
#ifdef __SSE4_1__
+#ifdef USE_SIMDE
+#include
+#else
#include
#endif
+#endif
#ifdef KSW_CPU_DISPATCH
#ifdef __SSE4_1__
diff --git a/ksw2_extz2_sse.c b/ksw2_extz2_sse.c
index 02bb4c2..ad19131 100644
--- a/ksw2_extz2_sse.c
+++ b/ksw2_extz2_sse.c
@@ -3,15 +3,23 @@
#include "ksw2.h"
#ifdef __SSE2__
+#ifdef USE_SIMDE
+#include
+#else
#include
+#endif
#ifdef KSW_SSE2_ONLY
#undef __SSE4_1__
#endif
#ifdef __SSE4_1__
+#ifdef USE_SIMDE
+#include
+#else
#include
#endif
+#endif
#ifdef KSW_CPU_DISPATCH
#ifdef __SSE4_1__
diff --git a/ksw2_ll_sse.c b/ksw2_ll_sse.c
index 469de52..14b9b50 100644
--- a/ksw2_ll_sse.c
+++ b/ksw2_ll_sse.c
@@ -1,9 +1,14 @@
#include
#include
#include
-#include
#include "ksw2.h"
+#ifdef USE_SIMDE
+#include
+#else
+#include
+#endif
+
#ifdef __GNUC__
#define LIKELY(x) __builtin_expect((x),1)
#define UNLIKELY(x) __builtin_expect((x),0)
diff --git a/lchain.c b/lchain.c
new file mode 100644
index 0000000..220e3e9
--- /dev/null
+++ b/lchain.c
@@ -0,0 +1,354 @@
+#include
+#include
+#include
+#include
+#include "mmpriv.h"
+#include "kalloc.h"
+#include "krmq.h"
+
+static inline float mg_log2(float x) // NB: this doesn't work when x<2
+{
+ union { float f; uint32_t i; } z = { x };
+ float log_2 = ((z.i >> 23) & 255) - 128;
+ z.i &= ~(255 << 23);
+ z.i += 127 << 23;
+ log_2 += (-0.34484843f * z.f + 2.02466578f) * z.f - 0.67487759f;
+ return log_2;
+}
+
+uint64_t *mg_chain_backtrack(void *km, int64_t n, const int32_t *f, const int64_t *p, int32_t *v, int32_t *t, int32_t min_cnt, int32_t min_sc, int32_t *n_u_, int32_t *n_v_)
+{
+ mm128_t *z;
+ uint64_t *u;
+ int64_t i, k, n_z, n_v;
+ int32_t n_u;
+
+ *n_u_ = *n_v_ = 0;
+ for (i = 0, n_z = 0; i < n; ++i) // precompute n_z
+ if (f[i] >= min_sc) ++n_z;
+ if (n_z == 0) return 0;
+ KMALLOC(km, z, n_z);
+ for (i = 0, k = 0; i < n; ++i) // populate z[]
+ if (f[i] >= min_sc) z[k].x = f[i], z[k++].y = i;
+ radix_sort_128x(z, z + n_z);
+
+ memset(t, 0, n * 4);
+ for (k = n_z - 1, n_v = n_u = 0; k >= 0; --k) { // precompute n_u
+ int64_t n_v0 = n_v;
+ int32_t sc;
+ for (i = z[k].y; i >= 0 && t[i] == 0; i = p[i])
+ ++n_v, t[i] = 1;
+ sc = i < 0? z[k].x : (int32_t)z[k].x - f[i];
+ if (sc >= min_sc && n_v > n_v0 && n_v - n_v0 >= min_cnt)
+ ++n_u;
+ else n_v = n_v0;
+ }
+ KMALLOC(km, u, n_u);
+ memset(t, 0, n * 4);
+ for (k = n_z - 1, n_v = n_u = 0; k >= 0; --k) { // populate u[]
+ int64_t n_v0 = n_v;
+ int32_t sc;
+ for (i = z[k].y; i >= 0 && t[i] == 0; i = p[i])
+ v[n_v++] = i, t[i] = 1;
+ sc = i < 0? z[k].x : (int32_t)z[k].x - f[i];
+ if (sc >= min_sc && n_v > n_v0 && n_v - n_v0 >= min_cnt)
+ u[n_u++] = (uint64_t)sc << 32 | (n_v - n_v0);
+ else n_v = n_v0;
+ }
+ kfree(km, z);
+ assert(n_v < INT32_MAX);
+ *n_u_ = n_u, *n_v_ = n_v;
+ return u;
+}
+
+static mm128_t *compact_a(void *km, int32_t n_u, uint64_t *u, int32_t n_v, int32_t *v, mm128_t *a)
+{
+ mm128_t *b, *w;
+ uint64_t *u2;
+ int64_t i, j, k;
+
+ // write the result to b[]
+ KMALLOC(km, b, n_v);
+ for (i = 0, k = 0; i < n_u; ++i) {
+ int32_t k0 = k, ni = (int32_t)u[i];
+ for (j = 0; j < ni; ++j)
+ b[k++] = a[v[k0 + (ni - j - 1)]];
+ }
+ kfree(km, v);
+
+ // sort u[] and a[] by the target position, such that adjacent chains may be joined
+ KMALLOC(km, w, n_u);
+ for (i = k = 0; i < n_u; ++i) {
+ w[i].x = b[k].x, w[i].y = (uint64_t)k<<32|i;
+ k += (int32_t)u[i];
+ }
+ radix_sort_128x(w, w + n_u);
+ KMALLOC(km, u2, n_u);
+ for (i = k = 0; i < n_u; ++i) {
+ int32_t j = (int32_t)w[i].y, n = (int32_t)u[j];
+ u2[i] = u[j];
+ memcpy(&a[k], &b[w[i].y>>32], n * sizeof(mm128_t));
+ k += n;
+ }
+ memcpy(u, u2, n_u * 8);
+ memcpy(b, a, k * sizeof(mm128_t)); // write _a_ to _b_ and deallocate _a_ because _a_ is oversized, sometimes a lot
+ kfree(km, a); kfree(km, w); kfree(km, u2);
+ return b;
+}
+
+static inline int32_t comput_sc(const mm128_t *ai, const mm128_t *aj, int32_t max_dist_x, int32_t max_dist_y, int32_t bw, float chn_pen_gap, float chn_pen_skip, int is_cdna, int n_seg)
+{
+ int32_t dq = (int32_t)ai->y - (int32_t)aj->y, dr, dd, dg, q_span, sc;
+ int32_t sidi = (ai->y & MM_SEED_SEG_MASK) >> MM_SEED_SEG_SHIFT;
+ int32_t sidj = (aj->y & MM_SEED_SEG_MASK) >> MM_SEED_SEG_SHIFT;
+ if (dq <= 0 || dq > max_dist_x) return INT32_MIN;
+ dr = (int32_t)(ai->x - aj->x);
+ if (sidi == sidj && (dr == 0 || dq > max_dist_y)) return INT32_MIN;
+ dd = dr > dq? dr - dq : dq - dr;
+ if (sidi == sidj && dd > bw) return INT32_MIN;
+ if (n_seg > 1 && !is_cdna && sidi == sidj && dr > max_dist_y) return INT32_MIN;
+ dg = dr < dq? dr : dq;
+ q_span = aj->y>>32&0xff;
+ sc = q_span < dg? q_span : dg;
+ if (dd || dg > q_span) {
+ float lin_pen, log_pen;
+ lin_pen = chn_pen_gap * (float)dd + chn_pen_skip * (float)dg;
+ log_pen = dd >= 1? mg_log2(dd + 1) : 0.0f; // mg_log2() only works for dd>=2
+ if (is_cdna || sidi != sidj) {
+ if (sidi != sidj && dr == 0) ++sc; // possibly due to overlapping paired ends; give a minor bonus
+ else if (dr > dq || sidi != sidj) sc -= (int)(lin_pen < log_pen? lin_pen : log_pen); // deletion or jump between paired ends
+ else sc -= (int)(lin_pen + .5f * log_pen);
+ } else sc -= (int)(lin_pen + .5f * log_pen);
+ }
+ return sc;
+}
+
+/* Input:
+ * a[].x: tid<<33 | rev<<32 | tpos
+ * a[].y: flags<<40 | q_span<<32 | q_pos
+ * Output:
+ * n_u: #chains
+ * u[]: score<<32 | #anchors (sum of lower 32 bits of u[] is the returned length of a[])
+ * input a[] is deallocated on return
+ */
+mm128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip,
+ int is_cdna, int n_seg, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km)
+{ // TODO: make sure this works when n has more than 32 bits
+ int32_t *f, *t, *v, n_u, n_v, mmax_f = 0;
+ int64_t *p, i, j, max_ii, st = 0, n_iter = 0;
+ uint64_t *u;
+
+ if (_u) *_u = 0, *n_u_ = 0;
+ if (n == 0 || a == 0) {
+ kfree(km, a);
+ return 0;
+ }
+ if (max_dist_x < bw) max_dist_x = bw;
+ if (max_dist_y < bw && !is_cdna) max_dist_y = bw;
+ KMALLOC(km, p, n);
+ KMALLOC(km, f, n);
+ KMALLOC(km, v, n);
+ KCALLOC(km, t, n);
+
+ // fill the score and backtrack arrays
+ for (i = 0, max_ii = -1; i < n; ++i) {
+ int64_t max_j = -1, end_j;
+ int32_t max_f = a[i].y>>32&0xff, n_skip = 0;
+ while (st < i && (a[i].x>>32 != a[st].x>>32 || a[i].x > a[st].x + max_dist_x)) ++st;
+ if (i - st > max_iter) st = i - max_iter;
+ for (j = i - 1; j >= st; --j) {
+ int32_t sc;
+ sc = comput_sc(&a[i], &a[j], max_dist_x, max_dist_y, bw, chn_pen_gap, chn_pen_skip, is_cdna, n_seg);
+ ++n_iter;
+ if (sc == INT32_MIN) continue;
+ sc += f[j];
+ if (sc > max_f) {
+ max_f = sc, max_j = j;
+ if (n_skip > 0) --n_skip;
+ } else if (t[j] == (int32_t)i) {
+ if (++n_skip > max_skip)
+ break;
+ }
+ if (p[j] >= 0) t[p[j]] = i;
+ }
+ end_j = j;
+ if (max_ii < 0 || a[i].x - a[max_ii].x > (int64_t)max_dist_x) {
+ int32_t max = INT32_MIN;
+ max_ii = -1;
+ for (j = i - 1; j >= st; --j)
+ if (max < f[j]) max = f[j], max_ii = j;
+ }
+ if (max_ii >= 0 && max_ii < end_j) {
+ int32_t tmp;
+ tmp = comput_sc(&a[i], &a[max_ii], max_dist_x, max_dist_y, bw, chn_pen_gap, chn_pen_skip, is_cdna, n_seg);
+ if (tmp != INT32_MIN && max_f < tmp + f[max_ii])
+ max_f = tmp + f[max_ii], max_j = max_ii;
+ }
+ f[i] = max_f, p[i] = max_j;
+ v[i] = max_j >= 0 && v[max_j] > max_f? v[max_j] : max_f; // v[] keeps the peak score up to i; f[] is the score ending at i, not always the peak
+ if (max_ii < 0 || (a[i].x - a[max_ii].x <= (int64_t)max_dist_x && f[max_ii] < f[i]))
+ max_ii = i;
+ if (mmax_f < max_f) mmax_f = max_f;
+ }
+
+ u = mg_chain_backtrack(km, n, f, p, v, t, min_cnt, min_sc, &n_u, &n_v);
+ *n_u_ = n_u, *_u = u; // NB: note that u[] may not be sorted by score here
+ kfree(km, p); kfree(km, f); kfree(km, t);
+ if (n_u == 0) {
+ kfree(km, a); kfree(km, v);
+ return 0;
+ }
+ return compact_a(km, n_u, u, n_v, v, a);
+}
+
+typedef struct lc_elem_s {
+ int32_t y;
+ int64_t i;
+ double pri;
+ KRMQ_HEAD(struct lc_elem_s) head;
+} lc_elem_t;
+
+#define lc_elem_cmp(a, b) ((a)->y < (b)->y? -1 : (a)->y > (b)->y? 1 : ((a)->i > (b)->i) - ((a)->i < (b)->i))
+#define lc_elem_lt2(a, b) ((a)->pri < (b)->pri)
+KRMQ_INIT(lc_elem, lc_elem_t, head, lc_elem_cmp, lc_elem_lt2)
+
+KALLOC_POOL_INIT(rmq, lc_elem_t)
+
+static inline int32_t comput_sc_simple(const mm128_t *ai, const mm128_t *aj, float chn_pen_gap, float chn_pen_skip, int32_t *exact, int32_t *width)
+{
+ int32_t dq = (int32_t)ai->y - (int32_t)aj->y, dr, dd, dg, q_span, sc;
+ dr = (int32_t)(ai->x - aj->x);
+ *width = dd = dr > dq? dr - dq : dq - dr;
+ dg = dr < dq? dr : dq;
+ q_span = aj->y>>32&0xff;
+ sc = q_span < dg? q_span : dg;
+ if (exact) *exact = (dd == 0 && dg <= q_span);
+ if (dd || dq > q_span) {
+ float lin_pen, log_pen;
+ lin_pen = chn_pen_gap * (float)dd + chn_pen_skip * (float)dg;
+ log_pen = dd >= 1? mg_log2(dd + 1) : 0.0f; // mg_log2() only works for dd>=2
+ sc -= (int)(lin_pen + .5f * log_pen);
+ }
+ return sc;
+}
+
+mm128_t *mg_lchain_rmq(int max_dist, int max_dist_inner, int bw, int max_chn_skip, int cap_rmq_size, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip,
+ int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km)
+{
+ int32_t *f,*t, *v, n_u, n_v, mmax_f = 0, max_rmq_size = 0;
+ int64_t *p, i, i0, st = 0, st_inner = 0, n_iter = 0;
+ uint64_t *u;
+ lc_elem_t *root = 0, *root_inner = 0;
+ void *mem_mp = 0;
+ kmp_rmq_t *mp;
+
+ if (_u) *_u = 0, *n_u_ = 0;
+ if (n == 0 || a == 0) {
+ kfree(km, a);
+ return 0;
+ }
+ if (max_dist < bw) max_dist = bw;
+ if (max_dist_inner <= 0 || max_dist_inner >= max_dist) max_dist_inner = 0;
+ KMALLOC(km, p, n);
+ KMALLOC(km, f, n);
+ KCALLOC(km, t, n);
+ KMALLOC(km, v, n);
+ mem_mp = km_init2(km, 0x10000);
+ mp = kmp_init_rmq(mem_mp);
+
+ // fill the score and backtrack arrays
+ for (i = i0 = 0; i < n; ++i) {
+ int64_t max_j = -1;
+ int32_t q_span = a[i].y>>32&0xff, max_f = q_span;
+ lc_elem_t s, *q, *r, lo, hi;
+ // add in-range anchors
+ if (i0 < i && a[i0].x != a[i].x) {
+ int64_t j;
+ for (j = i0; j < i; ++j) {
+ q = kmp_alloc_rmq(mp);
+ q->y = (int32_t)a[j].y, q->i = j, q->pri = -(f[j] + 0.5 * chn_pen_gap * ((int32_t)a[j].x + (int32_t)a[j].y));
+ krmq_insert(lc_elem, &root, q, 0);
+ if (max_dist_inner > 0) {
+ r = kmp_alloc_rmq(mp);
+ *r = *q;
+ krmq_insert(lc_elem, &root_inner, r, 0);
+ }
+ }
+ i0 = i;
+ }
+ // get rid of active chains out of range
+ while (st < i && (a[i].x>>32 != a[st].x>>32 || a[i].x > a[st].x + max_dist || krmq_size(head, root) > cap_rmq_size)) {
+ s.y = (int32_t)a[st].y, s.i = st;
+ if ((q = krmq_find(lc_elem, root, &s, 0)) != 0) {
+ q = krmq_erase(lc_elem, &root, q, 0);
+ kmp_free_rmq(mp, q);
+ }
+ ++st;
+ }
+ if (max_dist_inner > 0) { // similar to the block above, but applied to the inner tree
+ while (st_inner < i && (a[i].x>>32 != a[st_inner].x>>32 || a[i].x > a[st_inner].x + max_dist_inner || krmq_size(head, root_inner) > cap_rmq_size)) {
+ s.y = (int32_t)a[st_inner].y, s.i = st_inner;
+ if ((q = krmq_find(lc_elem, root_inner, &s, 0)) != 0) {
+ q = krmq_erase(lc_elem, &root_inner, q, 0);
+ kmp_free_rmq(mp, q);
+ }
+ ++st_inner;
+ }
+ }
+ // RMQ
+ lo.i = INT32_MAX, lo.y = (int32_t)a[i].y - max_dist;
+ hi.i = 0, hi.y = (int32_t)a[i].y;
+ if ((q = krmq_rmq(lc_elem, root, &lo, &hi)) != 0) {
+ int32_t sc, exact, width, n_skip = 0;
+ int64_t j = q->i;
+ assert(q->y >= lo.y && q->y <= hi.y);
+ sc = f[j] + comput_sc_simple(&a[i], &a[j], chn_pen_gap, chn_pen_skip, &exact, &width);
+ if (width <= bw && sc > max_f) max_f = sc, max_j = j;
+ if (!exact && root_inner && (int32_t)a[i].y > 0) {
+ lc_elem_t *lo, *hi;
+ s.y = (int32_t)a[i].y - 1, s.i = n;
+ krmq_interval(lc_elem, root_inner, &s, &lo, &hi);
+ if (lo) {
+ const lc_elem_t *q;
+ int32_t width, n_rmq_iter = 0;
+ krmq_itr_t(lc_elem) itr;
+ krmq_itr_find(lc_elem, root_inner, lo, &itr);
+ while ((q = krmq_at(&itr)) != 0) {
+ if (q->y < (int32_t)a[i].y - max_dist_inner) break;
+ ++n_rmq_iter;
+ j = q->i;
+ sc = f[j] + comput_sc_simple(&a[i], &a[j], chn_pen_gap, chn_pen_skip, 0, &width);
+ if (width <= bw) {
+ if (sc > max_f) {
+ max_f = sc, max_j = j;
+ if (n_skip > 0) --n_skip;
+ } else if (t[j] == (int32_t)i) {
+ if (++n_skip > max_chn_skip)
+ break;
+ }
+ if (p[j] >= 0) t[p[j]] = i;
+ }
+ if (!krmq_itr_prev(lc_elem, &itr)) break;
+ }
+ n_iter += n_rmq_iter;
+ }
+ }
+ }
+ // set max
+ assert(max_j < 0 || (a[max_j].x < a[i].x && (int32_t)a[max_j].y < (int32_t)a[i].y));
+ f[i] = max_f, p[i] = max_j;
+ v[i] = max_j >= 0 && v[max_j] > max_f? v[max_j] : max_f; // v[] keeps the peak score up to i; f[] is the score ending at i, not always the peak
+ if (mmax_f < max_f) mmax_f = max_f;
+ if (max_rmq_size < krmq_size(head, root)) max_rmq_size = krmq_size(head, root);
+ }
+ km_destroy(mem_mp);
+
+ u = mg_chain_backtrack(km, n, f, p, v, t, min_cnt, min_sc, &n_u, &n_v);
+ *n_u_ = n_u, *_u = u; // NB: note that u[] may not be sorted by score here
+ kfree(km, p); kfree(km, f); kfree(km, t);
+ if (n_u == 0) {
+ kfree(km, a); kfree(km, v);
+ return 0;
+ }
+ return compact_a(km, n_u, u, n_v, v, a);
+}
diff --git a/lib/simde b/lib/simde
new file mode 160000
index 0000000..b30129b
--- /dev/null
+++ b/lib/simde
@@ -0,0 +1 @@
+Subproject commit b30129b3b48a6823013da2b309c50a081177b6b8
diff --git a/main.c b/main.c
index daff256..1f7b66f 100644
--- a/main.c
+++ b/main.c
@@ -7,7 +7,7 @@
#include "mmpriv.h"
#include "ketopt.h"
-#define MM_VERSION "2.17-r987-qs-dirty"
+#define MM_VERSION "2.21-r1071-qs-dirty"
#ifdef __linux__
#include
@@ -70,7 +70,9 @@ static ko_longopt_t long_options[] = {
{ "chain-gap-scale",ko_required_argument, 343 },
{ "alt", ko_required_argument, 344 },
{ "alt-drop", ko_required_argument, 345 },
- { "qstrand", ko_no_argument, 346 },
+ { "mask-len", ko_required_argument, 346 },
+ { "rmq", ko_optional_argument, 347 },
+ { "qstrand", ko_no_argument, 348 },
{ "help", ko_no_argument, 'h' },
{ "max-intron-len", ko_required_argument, 'G' },
{ "version", ko_no_argument, 'V' },
@@ -82,17 +84,23 @@ static ko_longopt_t long_options[] = {
{ 0, 0, 0 }
};
-static inline int64_t mm_parse_num(const char *str)
+static inline int64_t mm_parse_num2(const char *str, char **q)
{
double x;
char *p;
x = strtod(str, &p);
- if (*p == 'G' || *p == 'g') x *= 1e9;
- else if (*p == 'M' || *p == 'm') x *= 1e6;
- else if (*p == 'K' || *p == 'k') x *= 1e3;
+ if (*p == 'G' || *p == 'g') x *= 1e9, ++p;
+ else if (*p == 'M' || *p == 'm') x *= 1e6, ++p;
+ else if (*p == 'K' || *p == 'k') x *= 1e3, ++p;
+ if (q) *q = p;
return (int64_t)(x + .499);
}
+static inline int64_t mm_parse_num(const char *str)
+{
+ return mm_parse_num2(str, 0);
+}
+
static inline void yes_or_no(mm_mapopt_t *opt, int flag, int long_idx, const char *arg, int yes_to_set)
{
if (yes_to_set) {
@@ -108,7 +116,7 @@ static inline void yes_or_no(mm_mapopt_t *opt, int flag, int long_idx, const cha
int main(int argc, char *argv[])
{
- const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:";
+ const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:";
ketopt_t o = KETOPT_INIT;
mm_mapopt_t opt;
mm_idxopt_t ipt;
@@ -144,7 +152,6 @@ int main(int argc, char *argv[])
else if (c == 'k') ipt.k = atoi(o.arg);
else if (c == 'H') ipt.flag |= MM_I_HPC;
else if (c == 'd') fnw = o.arg; // the above are indexing related options, except -I
- else if (c == 'r') opt.bw = (int)mm_parse_num(o.arg);
else if (c == 't') n_threads = atoi(o.arg);
else if (c == 'v') mm_verbose = atoi(o.arg);
else if (c == 'g') opt.max_gap = (int)mm_parse_num(o.arg);
@@ -171,6 +178,7 @@ int main(int argc, char *argv[])
else if (c == 'C') opt.noncan = atoi(o.arg);
else if (c == 'I') ipt.batch_size = mm_parse_num(o.arg);
else if (c == 'K') opt.mini_batch_size = mm_parse_num(o.arg);
+ else if (c == 'e') opt.occ_dist = mm_parse_num(o.arg);
else if (c == 'R') rg = o.arg;
else if (c == 'h') fp_help = stdout;
else if (c == '2') opt.flag |= MM_F_2_IO_THREADS;
@@ -203,7 +211,6 @@ int main(int argc, char *argv[])
else if (c == 327) opt.max_clip_ratio = atof(o.arg); // --max-clip-ratio
else if (c == 328) opt.min_mid_occ = atoi(o.arg); // --min-occ-floor
else if (c == 329) opt.flag |= MM_F_OUT_MD; // --MD
- else if (c == 330) opt.min_join_flank_ratio = atof(o.arg); // --lj-min-ratio
else if (c == 331) opt.sc_ambi = atoi(o.arg); // --score-N
else if (c == 332) opt.flag |= MM_F_EQX; // --eqx
else if (c == 333) opt.flag |= MM_F_PAF_NO_HIT; // --paf-no-hit
@@ -218,8 +225,11 @@ int main(int argc, char *argv[])
else if (c == 343) opt.chain_gap_scale = atof(o.arg); // --chain-gap-scale
else if (c == 344) alt_list = o.arg; // --alt
else if (c == 345) opt.alt_drop = atof(o.arg); // --alt-drop
- else if (c == 346) opt.flag |= MM_F_QSTRAND | MM_F_NO_INV; // --qstrand
- else if (c == 314) { // --frag
+ else if (c == 346) opt.mask_len = mm_parse_num(o.arg); // --mask-len
+ else if (c == 348) opt.flag |= MM_F_QSTRAND | MM_F_NO_INV; // --qstrand
+ else if (c == 330) {
+ fprintf(stderr, "[WARNING] \033[1;31m --lj-min-ratio has been deprecated.\033[0m\n");
+ } else if (c == 314) { // --frag
yes_or_no(&opt, MM_F_FRAG_MODE, o.longidx, o.arg, 1);
} else if (c == 315) { // --secondary
yes_or_no(&opt, MM_F_NO_PRINT_2ND, o.longidx, o.arg, 0);
@@ -240,6 +250,8 @@ int main(int argc, char *argv[])
yes_or_no(&opt, MM_F_HEAP_SORT, o.longidx, o.arg, 1);
} else if (c == 326) { // --dual
yes_or_no(&opt, MM_F_NO_DUAL, o.longidx, o.arg, 0);
+ } else if (c == 347) { // --rmq
+ yes_or_no(&opt, MM_F_RMQ, o.longidx, o.arg, 1);
} else if (c == 'S') {
opt.flag |= MM_F_OUT_CS | MM_F_CIGAR | MM_F_OUT_CS_LONG;
if (mm_verbose >= 2)
@@ -247,6 +259,12 @@ int main(int argc, char *argv[])
} else if (c == 'V') {
puts(MM_VERSION);
return 0;
+ } else if (c == 'r') {
+ opt.bw = (int)mm_parse_num2(o.arg, &s);
+ if (*s == ',') opt.bw_long = (int)mm_parse_num2(s + 1, &s);
+ } else if (c == 'U') {
+ opt.min_mid_occ = strtol(o.arg, &s, 10);
+ if (*s == ',') opt.max_mid_occ = strtol(s + 1, &s, 10);
} else if (c == 'f') {
double x;
char *p;
@@ -301,7 +319,7 @@ int main(int argc, char *argv[])
fprintf(fp_help, " -g NUM stop chain enlongation if there are no minimizers in INT-bp [%d]\n", opt.max_gap);
fprintf(fp_help, " -G NUM max intron length (effective with -xsplice; changing -r) [200k]\n");
fprintf(fp_help, " -F NUM max fragment length (effective with -xsr or in the fragment mode) [800]\n");
- fprintf(fp_help, " -r NUM bandwidth used in chaining and DP-based alignment [%d]\n", opt.bw);
+ fprintf(fp_help, " -r NUM[,NUM] chaining/alignment bandwidth and long-join bandwidth [%d,%d]\n", opt.bw, opt.bw_long);
fprintf(fp_help, " -n INT minimal number of minimizers on a chain [%d]\n", opt.min_cnt);
fprintf(fp_help, " -m INT minimal chaining score (matching bases minus log gap penalty) [%d]\n", opt.min_chain_score);
// fprintf(fp_help, " -T INT SDUST threshold; 0 to disable SDUST [%d]\n", opt.sdust_thres); // TODO: this option is never used; might be buggy
@@ -332,7 +350,8 @@ int main(int argc, char *argv[])
fprintf(fp_help, " --version show version number\n");
fprintf(fp_help, " Preset:\n");
fprintf(fp_help, " -x STR preset (always applied before other options; see minimap2.1 for details) []\n");
- fprintf(fp_help, " - map-pb/map-ont - PacBio/Nanopore vs reference mapping\n");
+ fprintf(fp_help, " - map-pb/map-ont - PacBio CLR/Nanopore vs reference mapping\n");
+ fprintf(fp_help, " - map-hifi - PacBio HiFi reads vs reference mapping\n");
fprintf(fp_help, " - ava-pb/ava-ont - PacBio/Nanopore read overlap\n");
fprintf(fp_help, " - asm5/asm10/asm20 - asm-to-ref mapping, for ~0.1/1/5%% sequence divergence\n");
fprintf(fp_help, " - splice/splice:hq - long-read/Pacbio-CCS spliced alignment\n");
@@ -389,6 +408,7 @@ int main(int argc, char *argv[])
if (mm_verbose >= 3) mm_idx_stat(mi);
if (junc_bed) mm_idx_bed_read(mi, junc_bed, 1);
if (alt_list) mm_idx_alt_read(mi, alt_list);
+ if (argc - (o.ind + 1) == 0) continue; // no query files
ret = 0;
if (!(opt.flag & MM_F_FRAG_MODE)) {
for (i = o.ind + 1; i < argc; ++i) {
diff --git a/map.c b/map.c
index 1def5fd..2428627 100644
--- a/map.c
+++ b/map.c
@@ -80,49 +80,7 @@ static void collect_minimizers(void *km, const mm_mapopt_t *opt, const mm_idx_t
#define heap_lt(a, b) ((a).x > (b).x)
KSORT_INIT(heap, mm128_t, heap_lt)
-typedef struct {
- uint32_t n;
- uint32_t q_pos, q_span;
- uint32_t seg_id:31, is_tandem:1;
- const uint64_t *cr;
-} mm_match_t;
-
-static mm_match_t *collect_matches(void *km, int *_n_m, int max_occ, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos)
-{
- int rep_st = 0, rep_en = 0, n_m;
- size_t i;
- mm_match_t *m;
- *n_mini_pos = 0;
- *mini_pos = (uint64_t*)kmalloc(km, mv->n * sizeof(uint64_t));
- m = (mm_match_t*)kmalloc(km, mv->n * sizeof(mm_match_t));
- for (i = 0, n_m = 0, *rep_len = 0, *n_a = 0; i < mv->n; ++i) {
- const uint64_t *cr;
- mm128_t *p = &mv->a[i];
- uint32_t q_pos = (uint32_t)p->y, q_span = p->x & 0xff;
- int t;
- cr = mm_idx_get(mi, p->x>>8, &t);
- if (t >= max_occ) {
- int en = (q_pos >> 1) + 1, st = en - q_span;
- if (st > rep_en) {
- *rep_len += rep_en - rep_st;
- rep_st = st, rep_en = en;
- } else rep_en = en;
- } else {
- mm_match_t *q = &m[n_m++];
- q->q_pos = q_pos, q->q_span = q_span, q->cr = cr, q->n = t, q->seg_id = p->y >> 32;
- q->is_tandem = 0;
- if (i > 0 && p->x>>8 == mv->a[i - 1].x>>8) q->is_tandem = 1;
- if (i < mv->n - 1 && p->x>>8 == mv->a[i + 1].x>>8) q->is_tandem = 1;
- *n_a += q->n;
- (*mini_pos)[(*n_mini_pos)++] = (uint64_t)q_span<<32 | q_pos>>1;
- }
- }
- *rep_len += rep_en - rep_st;
- *_n_m = n_m;
- return m;
-}
-
-static inline int skip_seed(int flag, uint64_t r, const mm_match_t *q, const char *qname, int qlen, const mm_idx_t *mi, int *is_self)
+static inline int skip_seed(int flag, uint64_t r, const mm_seed_t *q, const char *qname, int qlen, const mm_idx_t *mi, int *is_self)
{
*is_self = 0;
if (qname && (flag & (MM_F_NO_DIAG|MM_F_NO_DUAL))) {
@@ -151,10 +109,10 @@ static mm128_t *collect_seed_hits_heap(void *km, const mm_mapopt_t *opt, int max
{
int i, n_m, heap_size = 0;
int64_t j, n_for = 0, n_rev = 0;
- mm_match_t *m;
+ mm_seed_t *m;
mm128_t *a, *heap;
- m = collect_matches(km, &n_m, max_occ, mi, mv, n_a, rep_len, n_mini_pos, mini_pos);
+ m = mm_collect_matches(km, &n_m, qlen, max_occ, opt->max_max_occ, opt->occ_dist, mi, mv, n_a, rep_len, n_mini_pos, mini_pos);
heap = (mm128_t*)kmalloc(km, n_m * sizeof(mm128_t));
a = (mm128_t*)kmalloc(km, *n_a * sizeof(mm128_t));
@@ -168,7 +126,7 @@ static mm128_t *collect_seed_hits_heap(void *km, const mm_mapopt_t *opt, int max
}
ks_heapmake_heap(heap_size, heap);
while (heap_size > 0) {
- mm_match_t *q = &m[heap->y>>32];
+ mm_seed_t *q = &m[heap->y>>32];
mm128_t *p;
uint64_t r = heap->x;
int32_t is_self, rpos = (uint32_t)r >> 1;
@@ -216,12 +174,12 @@ static mm128_t *collect_seed_hits(void *km, const mm_mapopt_t *opt, int max_occ,
int *n_mini_pos, uint64_t **mini_pos)
{
int i, n_m;
- mm_match_t *m;
+ mm_seed_t *m;
mm128_t *a;
- m = collect_matches(km, &n_m, max_occ, mi, mv, n_a, rep_len, n_mini_pos, mini_pos);
+ m = mm_collect_matches(km, &n_m, qlen, max_occ, opt->max_max_occ, opt->occ_dist, mi, mv, n_a, rep_len, n_mini_pos, mini_pos);
a = (mm128_t*)kmalloc(km, *n_a * sizeof(mm128_t));
for (i = 0, *n_a = 0; i < n_m; ++i) {
- mm_match_t *q = &m[i];
+ mm_seed_t *q = &m[i];
const uint64_t *r = q->cr;
uint32_t k;
for (k = 0; k < q->n; ++k) {
@@ -253,11 +211,9 @@ static mm128_t *collect_seed_hits(void *km, const mm_mapopt_t *opt, int max_occ,
static void chain_post(const mm_mapopt_t *opt, int max_chain_gap_ref, const mm_idx_t *mi, void *km, int qlen, int n_segs, const int *qlens, int *n_regs, mm_reg1_t *regs, mm128_t *a)
{
if (!(opt->flag & MM_F_ALL_CHAINS)) { // don't choose primary mapping(s)
- mm_set_parent(km, opt->mask_level, *n_regs, regs, opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop);
+ mm_set_parent(km, opt->mask_level, opt->mask_len, *n_regs, regs, opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop);
if (n_segs <= 1) mm_select_sub(km, opt->pri_ratio, mi->k*2, opt->best_n, n_regs, regs);
else mm_select_sub_multi(km, opt->pri_ratio, 0.2f, 0.7f, max_chain_gap_ref, mi->k*2, opt->best_n, n_segs, qlens, n_regs, regs);
- if (!(opt->flag & (MM_F_SPLICE|MM_F_SR|MM_F_NO_LJOIN))) // long join not working well without primary chains
- mm_join_long(km, opt, qlen, n_regs, regs, a);
}
}
@@ -266,7 +222,7 @@ static mm_reg1_t *align_regs(const mm_mapopt_t *opt, const mm_idx_t *mi, void *k
if (!(opt->flag & MM_F_CIGAR)) return regs;
regs = mm_align_skeleton(km, opt, mi, qlen, seq, n_regs, regs, a); // this calls mm_filter_regs()
if (!(opt->flag & MM_F_ALL_CHAINS)) { // don't choose primary mapping(s)
- mm_set_parent(km, opt->mask_level, *n_regs, regs, opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop);
+ mm_set_parent(km, opt->mask_level, opt->mask_len, *n_regs, regs, opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop);
mm_select_sub(km, opt->pri_ratio, mi->k*2, opt->best_n, n_regs, regs);
mm_set_sam_pri(*n_regs, regs);
}
@@ -317,9 +273,25 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
if (max_chain_gap_ref < opt->max_gap) max_chain_gap_ref = opt->max_gap;
} else max_chain_gap_ref = opt->max_gap;
- a = mm_chain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, opt->chain_gap_scale, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km);
+ if (opt->flag & MM_F_RMQ) {
+ a = mg_lchain_rmq(opt->max_gap, opt->rmq_inner_dist, opt->bw, opt->max_chain_skip, opt->rmq_size_cap, opt->min_cnt, opt->min_chain_score,
+ opt->chain_gap_scale * 0.01 * mi->k, 0.0f, n_a, a, &n_regs0, &u, b->km);
+ } else {
+ a = mg_lchain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score,
+ opt->chain_gap_scale * 0.01 * mi->k, 0.0f, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km);
+ }
- if (opt->max_occ > opt->mid_occ && rep_len > 0) {
+ if (opt->bw_long > opt->bw && (opt->flag & (MM_F_SPLICE|MM_F_SR|MM_F_NO_LJOIN)) == 0 && n_segs == 1 && n_regs0 > 1) { // re-chain/long-join for long sequences
+ int32_t st = (int32_t)a[0].y, en = (int32_t)a[(int32_t)u[0] - 1].y;
+ if (qlen_sum - (en - st) > opt->rmq_rescue_size || en - st > qlen_sum * opt->rmq_rescue_ratio) {
+ int32_t i;
+ for (i = 0, n_a = 0; i < n_regs0; ++i) n_a += (int32_t)u[i];
+ kfree(b->km, u);
+ radix_sort_128x(a, a + n_a);
+ a = mg_lchain_rmq(opt->max_gap, opt->rmq_inner_dist, opt->bw_long, opt->max_chain_skip, opt->rmq_size_cap, opt->min_cnt, opt->min_chain_score,
+ opt->chain_gap_scale * 0.01 * mi->k, 0.0f, n_a, a, &n_regs0, &u, b->km);
+ }
+ } else if (opt->max_occ > opt->mid_occ && rep_len > 0 && !(opt->flag & MM_F_RMQ)) { // re-chain, mostly for short reads
int rechain = 0;
if (n_regs0 > 0) { // test if the best chain has all the segments
int n_chained_segs = 1, max = 0, max_i = -1, max_off = -1, off = 0;
@@ -339,7 +311,8 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
kfree(b->km, mini_pos);
if (opt->flag & MM_F_HEAP_SORT) a = collect_seed_hits_heap(b->km, opt, opt->max_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos);
else a = collect_seed_hits(b->km, opt, opt->max_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos);
- a = mm_chain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, opt->chain_gap_scale, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km);
+ a = mg_lchain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score,
+ opt->chain_gap_scale * 0.01 * mi->k, 0.0f, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km);
}
}
b->frag_gap = max_chain_gap_ref;
@@ -370,7 +343,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
seg = mm_seg_gen(b->km, hash, n_segs, qlens, n_regs0, regs0, n_regs, regs, a); // split fragment chain to separate segment chains
free(regs0);
for (i = 0; i < n_segs; ++i) {
- mm_set_parent(b->km, opt->mask_level, n_regs[i], regs[i], opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop); // update mm_reg1_t::parent
+ mm_set_parent(b->km, opt->mask_level, opt->mask_len, n_regs[i], regs[i], opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop); // update mm_reg1_t::parent
regs[i] = align_regs(opt, mi, b->km, qlens[i], seqs[i], &n_regs[i], regs[i], seg[i].a);
mm_set_mapq(b->km, n_regs[i], regs[i], opt->min_chain_score, opt->a, rep_len, is_sr);
}
@@ -514,7 +487,7 @@ static void merge_hits(step_t *s)
}
}
mm_hit_sort(km, &s->n_reg[k], s->reg[k], opt->alt_drop);
- mm_set_parent(km, opt->mask_level, s->n_reg[k], s->reg[k], opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop);
+ mm_set_parent(km, opt->mask_level, opt->mask_len, s->n_reg[k], s->reg[k], opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop);
if (!(opt->flag & MM_F_ALL_CHAINS)) {
mm_select_sub(km, opt->pri_ratio, s->p->mi->k*2, opt->best_n, &s->n_reg[k], s->reg[k]);
mm_set_sam_pri(s->n_reg[k], s->reg[k]);
diff --git a/minimap.h b/minimap.h
index 150f52c..f75e7de 100644
--- a/minimap.h
+++ b/minimap.h
@@ -36,8 +36,9 @@
#define MM_F_NO_END_FLT 0x10000000
#define MM_F_HARD_MLEVEL 0x20000000
#define MM_F_SAM_HIT_ONLY 0x40000000
-#define MM_F_QSTRAND (0x80000000LL)
-#define MM_F_NO_INV (0x100000000LL)
+#define MM_F_RMQ 0x80000000LL
+#define MM_F_QSTRAND (0x100000000LL)
+#define MM_F_NO_INV (0x200000000LL)
#define MM_I_HPC 0x1
#define MM_I_NO_SEQ 0x2
@@ -47,6 +48,18 @@
#define MM_MAX_SEG 255
+#define MM_CIGAR_MATCH 0
+#define MM_CIGAR_INS 1
+#define MM_CIGAR_DEL 2
+#define MM_CIGAR_N_SKIP 3
+#define MM_CIGAR_SOFTCLIP 4
+#define MM_CIGAR_HARDCLIP 5
+#define MM_CIGAR_PADDING 6
+#define MM_CIGAR_EQ_MATCH 7
+#define MM_CIGAR_X_MISMATCH 8
+
+#define MM_CIGAR_STR "MIDNSHP=XB"
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -115,22 +128,22 @@ typedef struct {
int max_qlen; // max query length
- int bw; // bandwidth
+ int bw, bw_long; // bandwidth
int max_gap, max_gap_ref; // break a chain if there are no minimizers in a max_gap window
int max_frag_len;
int max_chain_skip, max_chain_iter;
int min_cnt; // min number of minimizers on each chain
int min_chain_score; // min chaining score
float chain_gap_scale;
+ int rmq_size_cap, rmq_inner_dist;
+ int rmq_rescue_size;
+ float rmq_rescue_ratio;
float mask_level;
+ int mask_len;
float pri_ratio;
int best_n; // top best_n chains are subjected to DP alignment
- int max_join_long, max_join_short;
- int min_join_flank_sc;
- float min_join_flank_ratio;
-
float alt_drop;
int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties
@@ -147,9 +160,9 @@ typedef struct {
int pe_ori, pe_bonus;
float mid_occ_frac; // only used by mm_mapopt_update(); see below
- int32_t min_mid_occ;
+ int32_t min_mid_occ, max_mid_occ;
int32_t mid_occ; // ignore seeds with occurrences above this threshold
- int32_t max_occ;
+ int32_t max_occ, max_max_occ, occ_dist;
int64_t mini_batch_size; // size of a batch of query bases to process in parallel
int64_t max_sw_mat;
diff --git a/minimap2.1 b/minimap2.1
index 88d895c..a1c0fb8 100644
--- a/minimap2.1
+++ b/minimap2.1
@@ -1,4 +1,4 @@
-.TH minimap2 1 "4 May 2019" "minimap2-2.17 (r941)" "Bioinformatics tools"
+.TH minimap2 1 "6 July 2021" "minimap2-2.21 (r1071)" "Bioinformatics tools"
.SH NAME
.PP
minimap2 - mapping and alignment between collections of DNA sequences
@@ -145,22 +145,32 @@ or
.B -xsr
mode, which sets the threshold for a second round of seeding.
.TP
-.BI --min-occ-floor \ INT
-Force minimap2 to always use k-mers occurring
+.BI -U \ INT1 [, INT2 ]
+Lower and upper bounds of k-mer occurrences [10,1000000]. The final k-mer occurrence threshold is
+.RI max{ INT1 ,\ min{ INT2 ,
+.BR -f }}.
+This option prevents excessively small or large
+.B -f
+estimated from the input reference. It deprecates
+.B --min-occ-floor
+in earlier versions of minimap2.
+.TP
+.BI -e \ INT
+Sample a high-frequency minimizer every
.I INT
-times or less [0]. In effect, the max occurrence threshold is set to
-the
-.RI max{ INT ,
-.BR -f }.
+basepairs [500].
.TP
-.BI -g \ INT
+.BI -g \ NUM
Stop chain enlongation if there are no minimizers within
-.IR INT -bp
-[10000].
+.IR NUM -bp
+[10k].
.TP
-.BI -r \ INT
-Bandwidth used in chaining and DP-based alignment [500]. This option
-approximately controls the maximum gap size.
+.BI -r \ NUM1 [, NUM2 ]
+Bandwidth for chaining and base alignment [500,20k].
+.I NUM1
+is used for initial chaining and alignment extension;
+.I NUM2
+for RMQ-based re-chaining and closing gaps in alignments.
.TP
.BI -n \ INT
Discard chains consisting of
@@ -234,10 +244,21 @@ Mark as secondary a chain that overlaps with a better chain by
.I FLOAT
or more of the shorter chain [0.5]
.TP
+.BR --rmq = no | yes
+Use the minigraph chaining algorithm [no]. The minigraph algorithm is better
+for aligning contigs through long INDELs.
+.TP
.B --hard-mask-level
Honor option
.B -M
-and disable a heurstic to save unmapped subsequences.
+and disable a heurstic to save unmapped subsequences and disables
+.BR --mask-len .
+.TP
+.BI --mask-len \ NUM
+Keep an alignment if dropping it leaves an unaligned region on query longer than
+.IR INT
+[inf]. Effective without
+.BR --hard-mask-level .
.TP
.BI --max-chain-skip \ INT
A heuristics that stops chaining early [25]. Minimap2 uses dynamic programming
@@ -261,10 +282,6 @@ Disable the long gap patching heuristic. When this option is applied, the
maximum alignment gap is mostly controlled by
.BR -r .
.TP
-.BI --lj-min-ratio \ FLOAT
-Fraction of query sequence length required to bridge a long gap [0.5]. A
-smaller value helps to recover longer gaps, at the cost of more false gaps.
-.TP
.B --splice
Enable the splice alignment mode.
.TP
@@ -384,7 +401,7 @@ BED12 file can be converted from GTF/GFF3 with `paftools.js gff2bed anno.gtf'
.BR --junc-bonus \ INT
Score bonus for a splice donor or acceptor found in annotation (effective with
.BR --junc-bed )
-[0].
+[9].
.TP
.BI --end-seed-pen \ INT
Drop a terminal anchor if
@@ -405,7 +422,7 @@ alignment.
.BI --cap-sw-mem \ NUM
Skip alignment if the DP matrix size is above
.IR NUM .
-Set 0 to disable [0].
+Set 0 to disable [100m].
.SS Input/output options
.TP 10
.B -a
@@ -516,60 +533,47 @@ Available
.I STR
are:
.RS
-.TP 8
-.B map-pb
-PacBio/Oxford Nanopore read to reference mapping
-.RB ( -Hk19 )
-.TP
+.TP 10
.B map-ont
-Slightly more sensitive for Oxford Nanopore to reference mapping
-.RB ( -k15 ).
-For PacBio reads, HPC minimizers consistently leads to faster performance and
-more sensitive results in comparison to normal minimizers. For Oxford Nanopore
-data, normal minimizers are better, though not much. The effectiveness of HPC
-is determined by the sequencing error mode.
+Align noisy long reads of ~10% error rate to a reference genome. This is the
+default mode.
+.TP
+.B map-hifi
+Align PacBio high-fidelity (HiFi) reads to a reference genome
+.RB ( -k19
+.B -w19 -U50,500 -g10k -A1 -B4 -O6,26 -E2,1
+.BR -s200 ).
+.TP
+.B map-pb
+Align older PacBio continuous long (CLR) reads to a reference genome
+.RB ( -Hk19 ).
.TP
.B asm5
Long assembly to reference mapping
.RB ( -k19
-.B -w19 -A1 -B19 -O39,81 -E3,1 -s200 -z200 -N50
-.BR --min-occ-floor=100 ).
+.B -w19 -U50,500 --rmq -r100k -g10k -A1 -B19 -O39,81 -E3,1 -s200 -z200
+.BR -N50 ).
Typically, the alignment will not extend to regions with 5% or higher sequence
divergence. Only use this preset if the average divergence is far below 5%.
.TP
.B asm10
Long assembly to reference mapping
.RB ( -k19
-.B -w19 -A1 -B9 -O16,41 -E2,1 -s200 -z200 -N50
-.BR --min-occ-floor=100 ).
+.B -w19 -U50,500 --rmq -r100k -g10k -A1 -B9 -O16,41 -E2,1 -s200 -z200
+.BR -N50 ).
Up to 10% sequence divergence.
.TP
.B asm20
Long assembly to reference mapping
.RB ( -k19
-.B -w10 -A1 -B4 -O6,26 -E2,1 -s200 -z200 -N50
-.BR --min-occ-floor=100 ).
+.B -w10 -U50,500 --rmq -r100k -g10k -A1 -B4 -O6,26 -E2,1 -s200 -z200
+.BR -N50 ).
Up to 20% sequence divergence.
.TP
-.B ava-pb
-PacBio all-vs-all overlap mapping
-.RB ( -Hk19
-.B -Xw5 -m100 -g10000 --max-chain-skip
-.BR 25 ).
-.TP
-.B ava-ont
-Oxford Nanopore all-vs-all overlap mapping
-.RB ( -k15
-.B -Xw5 -m100 -g10000 -r2000 --max-chain-skip
-.BR 25 ).
-Similarly, the major difference from
-.B ava-pb
-is that this preset is not using HPC minimizers.
-.TP
.B splice
Long-read spliced alignment
.RB ( -k15
-.B -w5 --splice -g2000 -G200k -A1 -B2 -O2,32 -E1,0 -C9 -z200 -ub --junc-bonus=9
+.B -w5 --splice -g2k -G200k -A1 -B2 -O2,32 -E1,0 -C9 -z200 -ub --junc-bonus=9 --cap-sw-mem=0
.BR --splice-flank=yes ).
In the splice mode, 1) long deletions are taken as introns and represented as
the
@@ -588,9 +592,21 @@ Long-read splice alignment for PacBio CCS reads
.B sr
Short single-end reads without splicing
.RB ( -k21
-.B -w11 --sr --frag=yes -A2 -B8 -O12,32 -E2,1 -r50 -p.5 -N20 -f1000,5000 -n2 -m20
-.B -s40 -g200 -2K50m --heap-sort=yes
+.B -w11 --sr --frag=yes -A2 -B8 -O12,32 -E2,1 -r100 -p.5 -N20 -f1000,5000 -n2 -m20
+.B -s40 -g100 -2K50m --heap-sort=yes
.BR --secondary=no ).
+.TP
+.B ava-pb
+PacBio CLR all-vs-all overlap mapping
+.RB ( -Hk19
+.B -Xw5 -e0
+.BR -m100 ).
+.TP
+.B ava-ont
+Oxford Nanopore all-vs-all overlap mapping
+.RB ( -k15
+.B -Xw5 -e0 -m100
+.BR -r2k ).
.RE
.SS Miscellaneous options
.TP 10
diff --git a/misc.c b/misc.c
index ddfa9de..f3a29f8 100644
--- a/misc.c
+++ b/misc.c
@@ -159,3 +159,4 @@ KRADIX_SORT_INIT(128x, mm128_t, sort_key_128x, 8)
KRADIX_SORT_INIT(64, uint64_t, sort_key_64, 8)
KSORT_INIT_GENERIC(uint32_t)
+KSORT_INIT_GENERIC(uint64_t)
diff --git a/misc/paftools.js b/misc/paftools.js
index f15cfde..0c5c186 100755
--- a/misc/paftools.js
+++ b/misc/paftools.js
@@ -1,6 +1,6 @@
#!/usr/bin/env k8
-var paftools_version = '2.17-r982-dirty';
+var paftools_version = '2.21-r1071';
/*****************************
***** Library functions *****
@@ -962,12 +962,13 @@ function paf_asmgene(args)
function paf_stat(args)
{
- var c, gap_out_len = null;
- while ((c = getopt(args, "l:")) != null)
+ var c, gap_out_len = null, count_err = false;
+ while ((c = getopt(args, "cl:")) != null)
if (c == 'l') gap_out_len = parseInt(getopt.arg);
+ else if (c == 'c') count_err = true;
if (getopt.ind == args.length) {
- print("Usage: paftools.js stat [-l gapOutLen] |");
+ print("Usage: paftools.js stat [-c] [-l gapOutLen] |");
exit(1);
}
@@ -998,7 +999,7 @@ function paf_stat(args)
if (line.charAt(0) != '@') {
var t = line.split("\t", 12);
var m, rs, cigar = null, is_pri = false, is_sam = false, is_rev = false, tname = null;
- var atlen = null, aqlen, qs, qe, mapq, ori_qlen;
+ var atlen = null, aqlen, qs, qe, mapq, ori_qlen, NM = null;
if (t.length < 2) continue;
if (t[4] == '+' || t[4] == '-' || t[4] == '*') { // PAF
if (t[4] == '*') continue; // unmapped
@@ -1006,6 +1007,8 @@ function paf_stat(args)
++n_2nd;
continue;
}
+ if ((m = /\tNM:i:(\d+)/.exec(line)) != null)
+ NM = parseInt(m[1]);
if ((m = /\tcg:Z:(\S+)/.exec(line)) != null)
cigar = m[1];
if (cigar == null) {
@@ -1027,6 +1030,8 @@ function paf_stat(args)
++n_2nd;
continue;
}
+ if ((m = /\tNM:i:(\d+)/.exec(line)) != null)
+ NM = parseInt(m[1]);
cigar = t[5];
tname = t[2];
rs = parseInt(t[3]) - 1;
@@ -1045,11 +1050,13 @@ function paf_stat(args)
++n_seq, last = t[0];
}
var M = 0, tl = 0, ql = 0, clip = [0, 0], n_cigar = 0, sclip = 0;
+ var n_gapo = 0, n_gap_all = 0, l_match = 0;
while ((m = re.exec(cigar)) != null) {
var l = parseInt(m[1]);
++n_cigar;
if (m[2] == 'M' || m[2] == '=' || m[2] == 'X') {
tl += l, ql += l, M += l;
+ l_match += l;
} else if (m[2] == 'I' || m[2] == 'D') {
var type;
if (l < 50) type = 0;
@@ -1062,6 +1069,7 @@ function paf_stat(args)
else tl += l, ++n_gap[1][type];
if (gap_out_len != null && l >= gap_out_len)
print(t[0], ql, is_rev? '-' : '+', tname, rs + tl, m[2], l);
+ ++n_gapo, n_gap_all += l;
} else if (m[2] == 'N') {
tl += l;
} else if (m[2] == 'S') {
@@ -1079,6 +1087,12 @@ function paf_stat(args)
qs = clip[is_rev? 1 : 0], qe = qs + ql;
ori_qlen = clip[0] + ql + clip[1];
}
+ if (count_err && NM != null) {
+ var n_mm = NM - n_gap_all;
+ if (n_mm < 0) warn("WARNING: NM is smaller than the number of gaps at line " + lineno);
+ if (n_mm < 0) n_mm = 0;
+ print(t[0], ori_qlen, t[11], ori_qlen - (qe - qs), NM, l_match + n_gap_all, n_mm + n_gapo, l_match + n_gapo);
+ }
regs.push([qs, qe]);
last_qlen = ori_qlen;
}
@@ -1091,7 +1105,7 @@ function paf_stat(args)
file.close();
buf.destroy();
- if (gap_out_len == null) {
+ if (gap_out_len == null && !count_err) {
print("Number of mapped sequences: " + n_seq);
print("Number of primary alignments: " + n_pri);
print("Number of secondary alignments: " + n_2nd);
@@ -1405,7 +1419,7 @@ function paf_view(args)
var s_ref = new Bytes(), s_qry = new Bytes(), s_mid = new Bytes(); // these are used to show padded alignment
var re_cs = /([:=\-\+\*])(\d+|[A-Za-z]+)/g;
- var re_cg = /(\d+)([MIDNSH])/g;
+ var re_cg = /(\d+)([MIDNSHP=X])/g;
var buf = new Bytes();
var file = args[getopt.ind] == "-"? new File() : new File(args[getopt.ind]);
@@ -1885,7 +1899,7 @@ function paf_splice2bed(args)
a.length = 0;
}
- var re = /(\d+)([MIDNSH])/g;
+ var re = /(\d+)([MIDNSHP=X])/g;
var c, fmt = "bed", fn_name_conv = null, keep_multi = false;
while ((c = getopt(args, "f:n:m")) != null) {
if (c == 'f') fmt = getopt.arg;
@@ -2355,28 +2369,43 @@ function paf_junceval(args)
file = getopt.ind+1 >= args.length || args[getopt.ind+1] == '-'? new File() : new File(args[getopt.ind+1]);
var last_qname = null;
- var re_cigar = /(\d+)([MIDNSHX=])/g;
+ var re_cigar = /(\d+)([MIDNSHP=X])/g;
while (file.readline(buf) >= 0) {
var m, t = buf.toString().split("\t");
+ var ctg_name = null, cigar = null, pos = null, qname = t[0];
if (t[0].charAt(0) == '@') continue;
- if (chr_only && !/^(chr)?([0-9]+|X|Y)$/.test(t[2])) continue;
- var flag = parseInt(t[1]);
- if (flag&0x100) continue;
- if (first_only && last_qname == t[0]) continue;
- if (t[2] == '*') {
+ if (t[4] == '+' || t[4] == '-' || t[4] == '*') { // PAF
+ ctg_name = t[5], pos = parseInt(t[7]);
+ var type = 'P';
+ for (i = 12; i < t.length; ++i) {
+ if ((m = /^(tp:A|cg:Z):(\S+)/.exec(t[i])) != null) {
+ if (m[1] == 'tp:A') type = m[2];
+ else cigar = m[2];
+ }
+ }
+ if (type == 'S') continue; // secondary
+ } else { // SAM
+ ctg_name = t[2], pos = parseInt(t[3]) - 1, cigar = t[5];
+ var flag = parseInt(t[1]);
+ if (flag&0x100) continue; // secondary
+ }
+
+ if (chr_only && !/^(chr)?([0-9]+|X|Y)$/.test(ctg_name)) continue;
+ if (first_only && last_qname == qname) continue;
+ if (ctg_name == '*') { // unmapped
++n_unmapped;
continue;
} else {
++n_pri;
- if (last_qname != t[0]) {
+ if (last_qname != qname) {
++n_mapped;
- last_qname = t[0];
+ last_qname = qname;
}
}
- var pos = parseInt(t[3]) - 1, intron = [];
- while ((m = re_cigar.exec(t[5])) != null) {
+ var intron = [];
+ while ((m = re_cigar.exec(cigar)) != null) {
var len = parseInt(m[1]), op = m[2];
if (op == 'N') {
intron.push([pos, pos + len]);
@@ -2389,7 +2418,7 @@ function paf_junceval(args)
}
n_splice += intron.length;
- var chr = anno[t[2]];
+ var chr = anno[ctg_name];
if (chr != null) {
for (var i = 0; i < intron.length; ++i) {
var o = Interval.find_ovlp(chr, intron[i][0], intron[i][1]);
@@ -2413,12 +2442,12 @@ function paf_junceval(args)
x += '(' + o[j][0] + "," + o[j][1] + ')';
}
x += ']';
- print(type, t[0], i+1, t[2], intron[i][0], intron[i][1], x);
+ print(type, qname, i+1, ctg_name, intron[i][0], intron[i][1], x);
}
} else {
++n_splice_novel;
if (print_ovlp)
- print('N', t[0], i+1, t[2], intron[i][0], intron[i][1]);
+ print('N', qname, i+1, ctg_name, intron[i][0], intron[i][1]);
}
}
} else {
@@ -2512,6 +2541,395 @@ function paf_ov_eval(args)
print((100 * (1 - n_missing / n_ovlp)).toFixed(2) + "% sensitivity");
}
+function paf_vcfstat(args)
+{
+ var c, ts = { "AG":1, "GA":1, "CT":1, "TC":1 };
+ while ((c = getopt(args, "")) != null) {
+ }
+ var buf = new Bytes();
+ var file = args.length == getopt.ind? new File() : new File(args[getopt.ind]);
+ var x = { sub:0, ts:0, tv:0, ins:0, del:0, ins1:0, del1:0, ins2:0, del2:0, ins50:0, del50:0, ins1k:0, del1k:0, ins7k:0, del7k:0, insinf:0, delinf:0 };
+ while (file.readline(buf) >= 0) {
+ var t = buf.toString().split("\t");
+ if (t[0][0] == '#') continue;
+ var alt = t[4].split(",");
+ var ref = t[3];
+ for (var i = 0; i < alt.length; ++i) {
+ var a = alt[i];
+ if (a[0] == '<' || a[1] == '>') continue;
+ var l = ref.length < a.length? ref.length : a.length;
+ for (var j = 0; j < l; ++j) {
+ if (ref[j] != a[j]) {
+ ++x.sub;
+ if (ts[ref[j] + a[j]]) ++x.ts;
+ else ++x.tv;
+ }
+ }
+ var d = a.length - ref.length;
+ if (d > 0) {
+ ++x.ins;
+ if (d == 1) ++x.ins1;
+ else if (d == 2) ++x.ins2;
+ else if (d < 50) ++x.ins50;
+ else if (d < 1000) ++x.ins1k;
+ else if (d < 7000) ++x.ins7k;
+ else ++x.insinf;
+ } else if (d < 0) {
+ d = -d;
+ ++x.del;
+ if (d == 1) ++x.del1;
+ else if (d == 2) ++x.del2;
+ else if (d < 50) ++x.del50;
+ else if (d < 1000) ++x.del1k;
+ else if (d < 7000) ++x.del7k;
+ else ++x.delinf;
+ }
+ }
+ }
+ file.close();
+ buf.destroy();
+ print("# substitutions: " + x.sub);
+ print("ts/tv: " + (x.ts / x.tv).toFixed(3));
+ print("# insertions: " + x.ins);
+ print("# 1bp insertions: " + x.ins1);
+ print("# 2bp insertions: " + x.ins2);
+ print("# [3,50) insertions: " + x.ins50);
+ print("# [50,1000) insertions: " + x.ins1k);
+ print("# [1000,7000) insertions: " + x.ins7k);
+ print("# >=7000 insertions: " + x.insinf);
+ print("# deletions: " + x.del);
+ print("# 1bp deletions: " + x.del1);
+ print("# 2bp deletions: " + x.del2);
+ print("# [3,50) deletions: " + x.del50);
+ print("# [50,1000) deletions: " + x.del1k);
+ print("# [1000,7000) deletions: " + x.del7k);
+ print("# >=7000 deletions: " + x.delinf);
+}
+
+function paf_parseNum(s) {
+ var m, x = null;
+ if ((m = /^(\d*\.?\d*)([mMgGkK]?)/.exec(s)) != null) {
+ x = parseFloat(m[1]);
+ if (m[2] == 'k' || m[2] == 'K') x *= 1000;
+ else if (m[2] == 'm' || m[2] == 'M') x *= 1000000;
+ else if (m[2] == 'g' || m[2] == 'G') x *= 1000000000;
+ }
+ return Math.floor(x + .499);
+}
+
+function paf_misjoin(args)
+{
+ var c, min_seg_len = 1000000, max_gap = 1000000, fn_cen = null, show_long = false, show_err = false, cen_ratio = 0.5;
+ var n_diff = [0, 0], n_gap = [0, 0], n_inv = [0, 0], n_inv_end = [0, 0];
+ while ((c = getopt(args, "l:g:c:per:")) != null) {
+ if (c == 'l') min_seg_len = paf_parseNum(getopt.arg);
+ else if (c == 'g') max_gap = paf_parseNum(getopt.arg);
+ else if (c == 'c') fn_cen = getopt.arg;
+ else if (c == 'r') cen_ratio = parseFloat(getopt.arg);
+ else if (c == 'p') show_long = true;
+ else if (c == 'e') show_err = true;
+ }
+ if (args.length == getopt.ind) {
+ print("Usage: paftools.js misjoin [options] ");
+ print("Options:");
+ print(" -c FILE BED for centromeres []");
+ print(" -r FLOAT count a centromeric event if overlap ratio > FLOAT [" + cen_ratio + "]");
+ print(" -l NUM min alignment block length [1m]");
+ print(" -g NUM max gap size [1m]");
+ print(" -e output misjoins not involving centromeres");
+ print(" -p output long alignment blocks for debugging");
+ return;
+ }
+ var cen = {};
+ var file, buf = new Bytes();
+ if (fn_cen != null) {
+ file = new File(fn_cen);
+ while (file.readline(buf) >= 0) {
+ var t = buf.toString().split("\t");
+ if (cen[t[0]] == null) cen[t[0]] = [];
+ cen[t[0]].push([parseInt(t[1]), parseInt(t[2])]);
+ }
+ file.close();
+ }
+
+ function test_cen(cen, chr, st, en) {
+ var b = cen[chr], len = 0;
+ if (b == null) return false;
+ for (var j = 0; j < b.length; ++j)
+ if (b[j][0] < en && b[j][1] > st) {
+ var s = b[j][0] > st? b[j][0] : st;
+ var e = b[j][1] < en? b[j][1] : en;
+ len += e - s;
+ }
+ return len < (en - st) * cen_ratio? false : true;
+ }
+
+ function process(a) {
+ var k = 0;
+ for (var i = 0; i < a.length; ++i) {
+ for (var j = 1; j <= 3; ++j) a[i][j] = parseInt(a[i][j]);
+ for (var j = 6; j <= 11; ++j) a[i][j] = parseInt(a[i][j]);
+ if (a[i][10] >= min_seg_len) a[k++] = a[i];
+ }
+ a.length = k;
+ if (a.length == 1) return;
+ a = a.sort(function(x,y){return x[2]-y[2]});
+ if (show_long) for (var i = 0; i < a.length; ++i) print(a[i].join("\t"));
+ for (var i = 1; i < a.length; ++i) {
+ var ov = [false, false];
+ ov[0] = test_cen(cen, a[i-1][5], a[i-1][7], a[i-1][8]);
+ ov[1] = test_cen(cen, a[i][5], a[i][7], a[i][8]);
+ if (a[i-1][5] != a[i][5]) { // different chr
+ if (ov[0] || ov[1]) ++n_diff[1];
+ else if (show_err) {
+ print("J", a[i-1].slice(0, 12).join("\t"));
+ print("J", a[i].slice(0, 12).join("\t"));
+ }
+ ++n_diff[0];
+ } else if (a[i-1][4] == a[i][4]) { // a gap
+ var dq = a[i][2] - a[i-1][3];
+ var dr = a[i][4] == '+'? a[i][7] - a[i-1][8] : a[i-1][7] - a[i][8];
+ var gap = dr > dq? dr - dq : dq - dr;
+ if (gap > max_gap) {
+ if (ov[0] || ov[1]) ++n_gap[1];
+ else if (show_err) {
+ print("G", a[i-1].slice(0, 12).join("\t"));
+ print("G", a[i].slice(0, 12).join("\t"));
+ }
+ ++n_gap[0];
+ }
+ } else if (i + 1 < a.length && a[i+1][4] == a[i-1][4]) { // bracketed inversion
+ if (ov[0] || ov[1]) ++n_inv[1];
+ else if (show_err) {
+ print("M", a[i-1].slice(0, 12).join("\t"));
+ print("M", a[i].slice(0, 12).join("\t"));
+ print("M", a[i+1].slice(0, 12).join("\t"));
+ }
+ ++n_inv[0];
+ ++i;
+ } else { // hanging inversion
+ if (ov[0] || ov[1]) ++n_inv_end[1];
+ ++n_inv_end[0];
+ }
+ }
+ }
+
+ file = args[getopt.ind] == "-"? new File() : new File(args[getopt.ind]);
+ var a = [];
+ while (file.readline(buf) >= 0) {
+ var t = buf.toString().split("\t");
+ if (a.length > 0 && a[0][0] != t[0]) {
+ process(a);
+ a.length = 0;
+ }
+ a.push(t);
+ }
+ if (a.length > 0) process(a);
+ file.close();
+ buf.destroy();
+ print("# inter-chromosomal misjoins: " + n_diff.join(","));
+ print("# intra-chromosomal gaps: " + n_gap.join(","));
+ print("# candidate inversions in the middle: " + n_inv.join(","));
+ print("# candidate inversions at contig ends: " + n_inv_end.join(","));
+}
+
+function _paf_get_alen(t)
+{
+ var svlen = null, alen = null;
+ if ((m = /(^|;)SVLEN=(-?\d+)/.exec(t[7])) != null)
+ svlen = parseInt(m[2]);
+ var s = t[4].split(",");
+ var min_abs_diff = 1<<30, max_abs_diff = 0;
+ if (svlen != null && svlen != 0)
+ alen = svlen, min_abs_diff = max_abs_diff = svlen > 0? svlen : -svlen;
+ var rlen = t[3].length;
+ for (var i = 0; i < s.length; ++i) {
+ if (/^<\S+>$/.test(s[i])) continue;
+ var diff = s[i].length - rlen;
+ var abs_diff = diff > 0? diff : -diff;
+ min_abs_diff = min_abs_diff < abs_diff? min_abs_diff : abs_diff;
+ if (max_abs_diff < abs_diff)
+ max_abs_diff = abs_diff, alen = diff;
+ }
+ return [alen, min_abs_diff, max_abs_diff];
+}
+
+function paf_sveval(args)
+{
+ var c, min_flt = 30, min_size = 50, max_size = 100000, win_size = 500, print_err = false, print_match = false, bed_fn = null;
+ var len_diff_ratio = 0.5;
+ while ((c = getopt(args, "f:i:x:w:er:pd:")) != null) {
+ if (c == 'f') min_flt = paf_parseNum(getopt.arg);
+ else if (c == 'i') min_size = paf_parseNum(getopt.arg);
+ else if (c == 'x') max_size = paf_parseNum(getopt.arg);
+ else if (c == 'w') win_size = paf_parseNum(getopt.arg);
+ else if (c == 'd') len_diff_ratio = parseFloat(getopt.arg);
+ else if (c == 'r') bed_fn = getopt.arg;
+ else if (c == 'e') print_err = true;
+ else if (c == 'p') print_match = true;
+ }
+ if (args.length - getopt.ind < 2) {
+ print("Usage: paftools.js sveval [options] ");
+ print("Options:");
+ print(" -r FILE confident region in BED []");
+ print(" -f INT min length to discard [" + min_flt + "]");
+ print(" -i INT min SV length [" + min_size + "]");
+ print(" -x INT max SV length [" + max_size + "]");
+ print(" -w INT fuzzy windown size [" + win_size + "]");
+ print(" -d FLOAT max allele diff if there is a single allele in the window [" + len_diff_ratio + "]");
+ print(" -e print errors");
+ return;
+ }
+
+ function read_bed(fn) {
+ var buf = new Bytes();
+ var file = new File(fn);
+ var bed = {};
+ while (file.readline(buf) >= 0) {
+ var t = buf.toString().split("\t");
+ if (bed[t[0]] == null) bed[t[0]] = [];
+ bed[t[0]].push([parseInt(t[1]), parseInt(t[2])]);
+ }
+ file.close();
+ buf.destroy();
+ for (var x in bed) {
+ Interval.sort(bed[x]);
+ Interval.merge(bed[x]);
+ Interval.index_end(bed[x]);
+ }
+ return bed;
+ }
+
+ var bed = bed_fn != null? read_bed(bed_fn) : null;
+
+ function read_vcf(fn, bed) {
+ var buf = new Bytes();
+ var file = new File(fn);
+ var v = {};
+ while (file.readline(buf) >= 0) {
+ var m, t = buf.toString().split("\t");
+ if (t[0][0] == '#') continue;
+ if (bed != null && bed[t[0]] == null) continue;
+ if (t[4] == '' || t[4] == '') continue; // no inversion
+ if (/[\[\]]/.test(t[4])) continue; // no break points
+ var st = parseInt(t[1]) - 1, en = st + t[3].length;
+ // parse svlen
+ var b = _paf_get_alen(t), svlen = b[0];
+ var abslen = svlen == null? 0 : svlen > 0? svlen : -svlen;
+ if (abslen < min_flt || abslen > max_size) continue;
+ // update end
+ if ((m = /(^|;)END=(\d+)/.exec(t[7])) != null)
+ en = parseInt(m[2]);
+ else if (svlen != null && svlen < 0)
+ en = st + (-svlen);
+ if (en < st) en = st;
+ if (st == en) --st, ++en;
+ if (bed != null && Interval.find_ovlp(bed[t[0]], st, en).length == 0) continue;
+ // insert
+ if (v[t[0]] == null) v[t[0]] = [];
+ v[t[0]].push([st, en, svlen, abslen]);
+ }
+ file.close();
+ buf.destroy();
+ for (var x in v) {
+ Interval.sort(v[x]);
+ Interval.index_end(v[x]);
+ }
+ return v;
+ }
+
+ function compare_vcf(v0, v1, label) {
+ var m = 0, n = 0;
+ for (var x in v1) {
+ var a1 = v1[x], a0 = v0[x];
+ for (var i = 0; i < a1.length; ++i) {
+ if (a1[i][3] < min_size) continue;
+ ++n;
+ if (a0 == null) continue;
+ var ws = win_size + (a1[i][3]>>1);
+ var st = a1[i][0] > ws? a1[i][0] - ws : 0;
+ b = Interval.find_ovlp(a0, st, a1[i][1] + ws);
+ var n_ins = 0, n_del = 0, sv_del = null, sv_ins = null;
+ for (var j = 0; j < b.length; ++j) {
+ if (b[j][2] < 0) ++n_del, sv_del = -b[j][2];
+ else if (b[j][2] > 0) ++n_ins, sv_ins = b[j][2];
+ if (print_match)
+ print("MA", x, a1[i].slice(0, 3).join("\t"), b[j].slice(0, 3).join("\t"));
+ }
+ var match = false;
+ if (a1[i][2] > 0) { // insertion
+ if (n_ins == 1) {
+ var diff = sv_ins - a1[i][3];
+ if (diff < 0) diff = -diff;
+ if (diff < min_size || diff / a1[i][3] < len_diff_ratio)
+ match = true;
+ } else if (n_ins > 1) match = true; // multiple insertions; ambiguous
+ } else if (a1[i][2] < 0) {
+ if (n_del == 1) { // deletion
+ var diff = sv_del - a1[i][3];
+ if (diff < 0) diff = -diff;
+ if (diff < min_size || diff / a1[i][3] < len_diff_ratio)
+ match = true;
+ } else if (n_del > 1) match = true; // multiple deletions; ambiguous
+ }
+ if (match) ++m;
+ else if (print_err) {
+ if ((a1[i][2] > 0 && n_ins > 0) || (a1[i][2] < 0 && n_del > 0))
+ print("MM", x, a1[i].slice(0, 3).join("\t"));
+ print(label, x, a1[i].slice(0, 3).join("\t"));
+ }
+ }
+ }
+ return [n, m];
+ }
+
+ var v_base = read_vcf(args[getopt.ind+0], bed);
+ var v_call = read_vcf(args[getopt.ind+1], bed);
+ var fn = compare_vcf(v_call, v_base, 'FN');
+ var fp = compare_vcf(v_base, v_call, 'FP');
+ print('SN', fn[0], fn[1], (fn[1] / fn[0]).toFixed(6));
+ print('PC', fp[0], fp[1], (fp[1] / fp[0]).toFixed(6));
+ print('F1', ((fn[1] / fn[0] + fp[1] / fp[0]) / 2).toFixed(6));
+}
+
+function paf_vcfsel(args)
+{
+ var c, min_l = 0, max_l = 1<<30;
+ while ((c = getopt(args, "l:L:")) != null) {
+ if (c == 'l') min_l = parseInt(getopt.arg);
+ else if (c == 'L') max_l = parseInt(getopt.arg);
+ }
+
+ var buf = new Bytes();
+ if (getopt.ind == args.length) {
+ print("Usage: paftools.js vcfsel [options] ");
+ return 1;
+ }
+ var file = args[getopt.ind] == "-"? new File() : new File(args[getopt.ind]);
+ while (file.readline(buf) >= 0) {
+ var m, line = buf.toString();
+ if (line[0] == '#') {
+ print(line);
+ continue;
+ }
+ var t = line.split("\t");
+ var st = parseInt(t[1]), en = st + t[3].length - 1;
+ if ((m = /(^|;)END=(\d+)/.exec(t[7])) != null)
+ en = parseInt(m[2]);
+ if (en < st) {
+ warn("END is smaller than POS: " + en + " < " + st);
+ en = st;
+ }
+ var b = _paf_get_alen(t);
+ var alen = b[0], min_abs_diff = b[1], max_abs_diff = b[2];
+ if (max_abs_diff < min_l || min_abs_diff > max_l)
+ continue;
+ print(line);
+ }
+ file.close();
+ buf.destroy();
+}
+
/*************************
***** main function *****
*************************/
@@ -2529,10 +2947,13 @@ function main(args)
print("");
print(" stat collect basic mapping information in PAF/SAM");
print(" asmstat collect basic assembly information");
- print(" asmgene evaluate gene completeness (EXPERIMENTAL)");
+ print(" asmgene evaluate gene completeness");
+ print(" misjoin evaluate large-scale misjoins");
print(" liftover simplistic liftOver");
print(" call call variants from asm-to-ref alignment with the cs tag");
print(" bedcov compute the number of bases covered");
+ print(" vcfstat VCF statistics");
+ print(" sveval compare two SV callsets in VCF");
print(" version print paftools.js version");
print("");
print(" mapeval evaluate mapping accuracy using mason2/PBSIM-simulated FASTQ");
@@ -2552,6 +2973,7 @@ function main(args)
else if (cmd == 'stat') paf_stat(args);
else if (cmd == 'asmstat') paf_asmstat(args);
else if (cmd == 'asmgene') paf_asmgene(args);
+ else if (cmd == 'misjoin') paf_misjoin(args);
else if (cmd == 'liftover' || cmd == 'liftOver') paf_liftover(args);
else if (cmd == 'vcfpair') paf_vcfpair(args);
else if (cmd == 'call') paf_call(args);
@@ -2561,6 +2983,9 @@ function main(args)
else if (cmd == 'pbsim2fq') paf_pbsim2fq(args);
else if (cmd == 'junceval') paf_junceval(args);
else if (cmd == 'ov-eval') paf_ov_eval(args);
+ else if (cmd == 'vcfstat') paf_vcfstat(args);
+ else if (cmd == 'sveval') paf_sveval(args);
+ else if (cmd == 'vcfsel') paf_vcfsel(args);
else if (cmd == 'version') print(paftools_version);
else throw Error("unrecognized command: " + cmd);
}
diff --git a/mmpriv.h b/mmpriv.h
index 3e03d57..cb9618b 100644
--- a/mmpriv.h
+++ b/mmpriv.h
@@ -36,6 +36,14 @@
extern "C" {
#endif
+typedef struct {
+ uint32_t n;
+ uint32_t q_pos;
+ uint32_t q_span:31, flt:1;
+ uint32_t seg_id:31, is_tandem:1;
+ const uint64_t *cr;
+} mm_seed_t;
+
typedef struct {
int n_u, n_a;
uint64_t *u;
@@ -52,6 +60,8 @@ uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk);
void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p);
+mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int max_max_occ, int dist, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos);
+
int mm_write_sam_hdr(const mm_idx_t *mi, const char *rg, const char *ver, int argc, char *argv[]);
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag);
void mm_write_paf3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag, int rep_len);
@@ -63,20 +73,25 @@ void mm_idxopt_init(mm_idxopt_t *opt);
const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n);
int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f);
int mm_idx_getseq2(const mm_idx_t *mi, int is_rev, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq);
-mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float gap_scale, int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km);
mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a);
-
mm_reg1_t *mm_gen_regs(void *km, uint32_t hash, int qlen, int n_u, uint64_t *u, mm128_t *a, int is_qstrand);
+
+mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float gap_scale,
+ int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km);
+mm128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip,
+ int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km);
+mm128_t *mg_lchain_rmq(int max_dist, int max_dist_inner, int bw, int max_chn_skip, int cap_rmq_size, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip,
+ int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km);
+
void mm_mark_alt(const mm_idx_t *mi, int n, mm_reg1_t *r);
void mm_split_reg(mm_reg1_t *r, mm_reg1_t *r2, int n, int qlen, mm128_t *a, int is_qstrand);
void mm_sync_regs(void *km, int n_regs, mm_reg1_t *regs);
int mm_squeeze_a(void *km, int n_regs, mm_reg1_t *regs, mm128_t *a);
int mm_set_sam_pri(int n, mm_reg1_t *r);
-void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r, int sub_diff, int hard_mask_level, float alt_diff_frac);
+void mm_set_parent(void *km, float mask_level, int mask_len, int n, mm_reg1_t *r, int sub_diff, int hard_mask_level, float alt_diff_frac);
void mm_select_sub(void *km, float pri_ratio, int min_diff, int best_n, int *n_, mm_reg1_t *r);
void mm_select_sub_multi(void *km, float pri_ratio, float pri1, float pri2, int max_gap_ref, int min_diff, int best_n, int n_segs, const int *qlens, int *n_, mm_reg1_t *r);
void mm_filter_regs(const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs);
-void mm_join_long(void *km, const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs, mm128_t *a);
void mm_hit_sort(void *km, int *n_regs, mm_reg1_t *r, float alt_diff_frac);
void mm_set_mapq(void *km, int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len, int is_sr);
diff --git a/options.c b/options.c
index 81f6f4c..a567fb2 100644
--- a/options.c
+++ b/options.c
@@ -1,4 +1,5 @@
#include
+#include
#include "mmpriv.h"
void mm_idxopt_init(mm_idxopt_t *opt)
@@ -15,26 +16,30 @@ void mm_mapopt_init(mm_mapopt_t *opt)
memset(opt, 0, sizeof(mm_mapopt_t));
opt->seed = 11;
opt->mid_occ_frac = 2e-4f;
+ opt->min_mid_occ = 10;
+ opt->max_mid_occ = 1000000;
opt->sdust_thres = 0; // no SDUST masking
opt->min_cnt = 3;
opt->min_chain_score = 40;
- opt->bw = 500;
+ opt->bw = 500, opt->bw_long = 20000;
opt->max_gap = 5000;
opt->max_gap_ref = -1;
opt->max_chain_skip = 25;
opt->max_chain_iter = 5000;
- opt->chain_gap_scale = 1.0f;
+ opt->rmq_inner_dist = 1000;
+ opt->rmq_size_cap = 100000;
+ opt->rmq_rescue_size = 1000;
+ opt->rmq_rescue_ratio = 0.1f;
+ opt->chain_gap_scale = 0.8f;
+ opt->max_max_occ = 4095;
+ opt->occ_dist = 500;
opt->mask_level = 0.5f;
+ opt->mask_len = INT_MAX;
opt->pri_ratio = 0.8f;
opt->best_n = 5;
- opt->max_join_long = 20000;
- opt->max_join_short = 2000;
- opt->min_join_flank_sc = 1000;
- opt->min_join_flank_ratio = 0.5f;
-
opt->alt_drop = 0.15f;
opt->a = 2, opt->b = 4, opt->q = 4, opt->e = 2, opt->q2 = 24, opt->e2 = 1;
@@ -46,6 +51,7 @@ void mm_mapopt_init(mm_mapopt_t *opt)
opt->anchor_ext_len = 20, opt->anchor_ext_shift = 6;
opt->max_clip_ratio = 1.0f;
opt->mini_batch_size = 500000000;
+ opt->max_sw_mat = 100000000;
opt->pe_ori = 0; // FF
opt->pe_bonus = 33;
@@ -55,10 +61,13 @@ void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi)
{
if ((opt->flag & MM_F_SPLICE_FOR) || (opt->flag & MM_F_SPLICE_REV))
opt->flag |= MM_F_SPLICE;
- if (opt->mid_occ <= 0)
+ if (opt->mid_occ <= 0) {
opt->mid_occ = mm_idx_cal_max_occ(mi, opt->mid_occ_frac);
- if (opt->mid_occ < opt->min_mid_occ)
- opt->mid_occ = opt->min_mid_occ;
+ if (opt->mid_occ < opt->min_mid_occ)
+ opt->mid_occ = opt->min_mid_occ;
+ if (opt->max_mid_occ > opt->min_mid_occ && opt->mid_occ > opt->max_mid_occ)
+ opt->mid_occ = opt->max_mid_occ;
+ }
if (mm_verbose >= 3)
fprintf(stderr, "[M::%s::%.3f*%.2f] mid_occ = %d\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), opt->mid_occ);
}
@@ -66,7 +75,7 @@ void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi)
void mm_mapopt_max_intron_len(mm_mapopt_t *opt, int max_intron_len)
{
if ((opt->flag & MM_F_SPLICE) && max_intron_len > 0)
- opt->max_gap_ref = opt->bw = max_intron_len;
+ opt->max_gap_ref = opt->bw = opt->bw_long = max_intron_len;
}
int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
@@ -74,37 +83,44 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
if (preset == 0) {
mm_idxopt_init(io);
mm_mapopt_init(mo);
+ } else if (strcmp(preset, "map-ont") == 0) { // this is the same as the default
} else if (strcmp(preset, "ava-ont") == 0) {
io->flag = 0, io->k = 15, io->w = 5;
mo->flag |= MM_F_ALL_CHAINS | MM_F_NO_DIAG | MM_F_NO_DUAL | MM_F_NO_LJOIN;
- mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_gap = 10000, mo->max_chain_skip = 25;
- mo->bw = 2000;
+ mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_chain_skip = 25;
+ mo->bw = mo->bw_long = 2000;
+ mo->occ_dist = 0;
+ } else if (strcmp(preset, "map10k") == 0 || strcmp(preset, "map-pb") == 0) {
+ io->flag |= MM_I_HPC, io->k = 19;
} else if (strcmp(preset, "ava-pb") == 0) {
io->flag |= MM_I_HPC, io->k = 19, io->w = 5;
mo->flag |= MM_F_ALL_CHAINS | MM_F_NO_DIAG | MM_F_NO_DUAL | MM_F_NO_LJOIN;
- mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_gap = 10000, mo->max_chain_skip = 25;
- } else if (strcmp(preset, "map10k") == 0 || strcmp(preset, "map-pb") == 0) {
- io->flag |= MM_I_HPC, io->k = 19;
- } else if (strcmp(preset, "map-ont") == 0) {
- io->flag = 0, io->k = 15;
- } else if (strcmp(preset, "asm5") == 0) {
+ mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_chain_skip = 25;
+ mo->bw_long = mo->bw;
+ mo->occ_dist = 0;
+ } else if (strcmp(preset, "map-hifi") == 0 || strcmp(preset, "map-ccs") == 0) {
io->flag = 0, io->k = 19, io->w = 19;
- mo->a = 1, mo->b = 19, mo->q = 39, mo->q2 = 81, mo->e = 3, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200;
- mo->min_mid_occ = 100;
+ mo->max_gap = 10000;
+ mo->a = 1, mo->b = 4, mo->q = 6, mo->q2 = 26, mo->e = 2, mo->e2 = 1;
+ mo->occ_dist = 500;
+ mo->min_mid_occ = 50, mo->max_mid_occ = 500;
mo->min_dp_max = 200;
- mo->best_n = 50;
- } else if (strcmp(preset, "asm10") == 0) {
+ } else if (strncmp(preset, "asm", 3) == 0) {
io->flag = 0, io->k = 19, io->w = 19;
- mo->a = 1, mo->b = 9, mo->q = 16, mo->q2 = 41, mo->e = 2, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200;
- mo->min_mid_occ = 100;
- mo->min_dp_max = 200;
- mo->best_n = 50;
- } else if (strcmp(preset, "asm20") == 0) {
- io->flag = 0, io->k = 19, io->w = 10;
- mo->a = 1, mo->b = 4, mo->q = 6, mo->q2 = 26, mo->e = 2, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200;
- mo->min_mid_occ = 100;
+ mo->bw = mo->bw_long = 100000;
+ mo->max_gap = 10000;
+ mo->flag |= MM_F_RMQ;
+ mo->min_mid_occ = 50, mo->max_mid_occ = 500;
mo->min_dp_max = 200;
mo->best_n = 50;
+ if (strcmp(preset, "asm5") == 0) {
+ mo->a = 1, mo->b = 19, mo->q = 39, mo->q2 = 81, mo->e = 3, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200;
+ } else if (strcmp(preset, "asm10") == 0) {
+ mo->a = 1, mo->b = 9, mo->q = 16, mo->q2 = 41, mo->e = 2, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200;
+ } else if (strcmp(preset, "asm20") == 0) {
+ mo->a = 1, mo->b = 4, mo->q = 6, mo->q2 = 26, mo->e = 2, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200;
+ io->w = 10;
+ } else return -1;
} else if (strcmp(preset, "short") == 0 || strcmp(preset, "sr") == 0) {
io->flag = 0, io->k = 21, io->w = 11;
mo->flag |= MM_F_SR | MM_F_FRAG_MODE | MM_F_NO_PRINT_2ND | MM_F_2_IO_THREADS | MM_F_HEAP_SORT;
@@ -114,7 +130,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
mo->end_bonus = 10;
mo->max_frag_len = 800;
mo->max_gap = 100;
- mo->bw = 100;
+ mo->bw = mo->bw_long = 100;
mo->pri_ratio = 0.5f;
mo->min_cnt = 2;
mo->min_chain_score = 25;
@@ -126,7 +142,8 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
} else if (strncmp(preset, "splice", 6) == 0 || strcmp(preset, "cdna") == 0) {
io->flag = 0, io->k = 15, io->w = 5;
mo->flag |= MM_F_SPLICE | MM_F_SPLICE_FOR | MM_F_SPLICE_REV | MM_F_SPLICE_FLANK;
- mo->max_gap = 2000, mo->max_gap_ref = mo->bw = 200000;
+ mo->max_sw_mat = 0;
+ mo->max_gap = 2000, mo->max_gap_ref = mo->bw = mo->bw_long = 200000;
mo->a = 1, mo->b = 2, mo->q = 2, mo->e = 1, mo->q2 = 32, mo->e2 = 0;
mo->noncan = 9;
mo->junc_bonus = 9;
@@ -139,6 +156,16 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
int mm_check_opt(const mm_idxopt_t *io, const mm_mapopt_t *mo)
{
+ if (mo->bw > mo->bw_long) {
+ if (mm_verbose >= 1)
+ fprintf(stderr, "[ERROR]\033[1;31m with '-rNUM1,NUM2', NUM1 (%d) can't be larger than NUM2 (%d)\033[0m\n", mo->bw, mo->bw_long);
+ return -8;
+ }
+ if ((mo->flag & MM_F_RMQ) && (mo->flag & (MM_F_SR|MM_F_SPLICE))) {
+ if (mm_verbose >= 1)
+ fprintf(stderr, "[ERROR]\033[1;31m --rmq doesn't work with --sr or --splice\033[0m\n");
+ return -7;
+ }
if (mo->split_prefix && (mo->flag & (MM_F_OUT_CS|MM_F_OUT_MD))) {
if (mm_verbose >= 1)
fprintf(stderr, "[ERROR]\033[1;31m --cs or --MD doesn't work with --split-prefix\033[0m\n");
diff --git a/python/README.rst b/python/README.rst
index 3d0fae8..3082980 100644
--- a/python/README.rst
+++ b/python/README.rst
@@ -144,7 +144,7 @@ properties:
* **mlen**: length of the matching bases in the alignment, excluding ambiguous
base matches.
-* **NM**: number of mismatches, gaps and ambiguous poistions in the alignment
+* **NM**: number of mismatches, gaps and ambiguous positions in the alignment
* **trans_strand**: transcript strand. +1 if on the forward strand; -1 if on the
reverse strand; 0 if unknown
diff --git a/python/cmappy.pxd b/python/cmappy.pxd
index 72a19d7..abc3ef7 100644
--- a/python/cmappy.pxd
+++ b/python/cmappy.pxd
@@ -13,21 +13,27 @@ cdef extern from "minimap.h":
int64_t flag
int seed
int sdust_thres
+
int max_qlen
- int bw
+
+ int bw, bw_long
int max_gap, max_gap_ref
int max_frag_len
int max_chain_skip, max_chain_iter
int min_cnt
int min_chain_score
float chain_gap_scale
+ int rmq_size_cap, rmq_inner_dist
+ int rmq_rescue_size
+ float rmq_rescue_ratio
+
float mask_level
+ int mask_len
float pri_ratio
int best_n
- int max_join_long, max_join_short
- int min_join_flank_sc
- float min_join_flank_ratio
+
float alt_drop
+
int a, b, q, e, q2, e2
int sc_ambi
int noncan
@@ -38,13 +44,16 @@ cdef extern from "minimap.h":
int min_ksw_len
int anchor_ext_len, anchor_ext_shift
float max_clip_ratio
+
int pe_ori, pe_bonus
+
float mid_occ_frac
int32_t min_mid_occ
int32_t mid_occ
int32_t max_occ
int64_t mini_batch_size
int64_t max_sw_mat
+
const char *split_prefix
int mm_set_opt(char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
diff --git a/python/mappy.pyx b/python/mappy.pyx
index 3273514..838b5ad 100644
--- a/python/mappy.pyx
+++ b/python/mappy.pyx
@@ -3,7 +3,7 @@ from libc.stdlib cimport free
cimport cmappy
import sys
-__version__ = '2.17'
+__version__ = '2.21'
cmappy.mm_reset_timer()
@@ -82,7 +82,7 @@ cdef class Alignment:
@property
def cigar_str(self):
- return "".join(map(lambda x: str(x[0]) + 'MIDNSH'[x[1]], self._cigar))
+ return "".join(map(lambda x: str(x[0]) + 'MIDNSHP=XB'[x[1]], self._cigar))
def __str__(self):
if self._strand > 0: strand = '+'
diff --git a/seed.c b/seed.c
new file mode 100644
index 0000000..baf1b20
--- /dev/null
+++ b/seed.c
@@ -0,0 +1,106 @@
+#include "mmpriv.h"
+#include "kalloc.h"
+#include "ksort.h"
+
+mm_seed_t *mm_seed_collect_all(void *km, const mm_idx_t *mi, const mm128_v *mv, int32_t *n_m_)
+{
+ mm_seed_t *m;
+ size_t i;
+ int32_t k;
+ m = (mm_seed_t*)kmalloc(km, mv->n * sizeof(mm_seed_t));
+ for (i = k = 0; i < mv->n; ++i) {
+ const uint64_t *cr;
+ mm_seed_t *q;
+ mm128_t *p = &mv->a[i];
+ uint32_t q_pos = (uint32_t)p->y, q_span = p->x & 0xff;
+ int t;
+ cr = mm_idx_get(mi, p->x>>8, &t);
+ if (t == 0) continue;
+ q = &m[k++];
+ q->q_pos = q_pos, q->q_span = q_span, q->cr = cr, q->n = t, q->seg_id = p->y >> 32;
+ q->is_tandem = q->flt = 0;
+ if (i > 0 && p->x>>8 == mv->a[i - 1].x>>8) q->is_tandem = 1;
+ if (i < mv->n - 1 && p->x>>8 == mv->a[i + 1].x>>8) q->is_tandem = 1;
+ }
+ *n_m_ = k;
+ return m;
+}
+
+#define MAX_MAX_HIGH_OCC 128
+
+void mm_seed_select(int32_t n, mm_seed_t *a, int len, int max_occ, int max_max_occ, int dist)
+{ // for high-occ minimizers, choose up to max_high_occ in each high-occ streak
+ extern void ks_heapdown_uint64_t(size_t i, size_t n, uint64_t*);
+ extern void ks_heapmake_uint64_t(size_t n, uint64_t*);
+ int32_t i, last0, m;
+ uint64_t b[MAX_MAX_HIGH_OCC]; // this is to avoid a heap allocation
+
+ if (n == 0 || n == 1) return;
+ for (i = m = 0; i < n; ++i)
+ if (a[i].n > max_occ) ++m;
+ if (m == 0) return; // no high-frequency k-mers; do nothing
+ for (i = 0, last0 = -1; i <= n; ++i) {
+ if (i == n || a[i].n <= max_occ) {
+ if (i - last0 > 1) {
+ int32_t ps = last0 < 0? 0 : (uint32_t)a[last0].q_pos>>1;
+ int32_t pe = i == n? len : (uint32_t)a[i].q_pos>>1;
+ int32_t j, k, st = last0 + 1, en = i;
+ int32_t max_high_occ = (int32_t)((double)(pe - ps) / dist + .499);
+ if (max_high_occ > 0) {
+ if (max_high_occ > MAX_MAX_HIGH_OCC)
+ max_high_occ = MAX_MAX_HIGH_OCC;
+ for (j = st, k = 0; j < en && k < max_high_occ; ++j, ++k)
+ b[k] = (uint64_t)a[j].n<<32 | j;
+ ks_heapmake_uint64_t(k, b); // initialize the binomial heap
+ for (; j < en; ++j) { // if there are more, choose top max_high_occ
+ if (a[j].n < (int32_t)(b[0]>>32)) { // then update the heap
+ b[0] = (uint64_t)a[j].n<<32 | j;
+ ks_heapdown_uint64_t(0, k, b);
+ }
+ }
+ for (j = 0; j < k; ++j) a[(uint32_t)b[j]].flt = 1;
+ }
+ for (j = st; j < en; ++j) a[j].flt ^= 1;
+ for (j = st; j < en; ++j)
+ if (a[j].n > max_max_occ)
+ a[j].flt = 1;
+ }
+ last0 = i;
+ }
+ }
+}
+
+mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int max_max_occ, int dist, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos)
+{
+ int rep_st = 0, rep_en = 0, n_m, n_m0;
+ size_t i;
+ mm_seed_t *m;
+ *n_mini_pos = 0;
+ *mini_pos = (uint64_t*)kmalloc(km, mv->n * sizeof(uint64_t));
+ m = mm_seed_collect_all(km, mi, mv, &n_m0);
+ if (dist > 0 && max_max_occ > max_occ) {
+ mm_seed_select(n_m0, m, qlen, max_occ, max_max_occ, dist);
+ } else {
+ for (i = 0; i < n_m0; ++i)
+ if (m[i].n > max_occ)
+ m[i].flt = 1;
+ }
+ for (i = 0, n_m = 0, *rep_len = 0, *n_a = 0; i < n_m0; ++i) {
+ mm_seed_t *q = &m[i];
+ //fprintf(stderr, "X\t%d\t%d\t%d\n", q->q_pos>>1, q->n, q->flt);
+ if (q->flt) {
+ int en = (q->q_pos >> 1) + 1, st = en - q->q_span;
+ if (st > rep_en) {
+ *rep_len += rep_en - rep_st;
+ rep_st = st, rep_en = en;
+ } else rep_en = en;
+ } else {
+ *n_a += q->n;
+ (*mini_pos)[(*n_mini_pos)++] = (uint64_t)q->q_span<<32 | q->q_pos>>1;
+ m[n_m++] = *q;
+ }
+ }
+ *rep_len += rep_en - rep_st;
+ *_n_m = n_m;
+ return m;
+}
diff --git a/setup.py b/setup.py
index 042911c..1a4d227 100644
--- a/setup.py
+++ b/setup.py
@@ -4,16 +4,6 @@ except ImportError:
from distutils.core import setup
from distutils.extension import Extension
-cmdclass = {}
-
-try:
- from Cython.Build import build_ext
-except ImportError: # without Cython
- module_src = 'python/mappy.c'
-else: # with Cython
- module_src = 'python/mappy.pyx'
- cmdclass['build_ext'] = build_ext
-
import sys, platform
sys.path.append('python')
@@ -33,7 +23,7 @@ def readme():
setup(
name = 'mappy',
- version = '2.17',
+ version = '2.21',
url = 'https://github.com/lh3/minimap2',
description = 'Minimap2 python binding',
long_description = readme(),
@@ -42,8 +32,8 @@ setup(
license = 'MIT',
keywords = 'sequence-alignment',
scripts = ['python/minimap2.py'],
- ext_modules = [Extension('mappy',
- sources = [module_src, 'align.c', 'bseq.c', 'chain.c', 'format.c', 'hit.c', 'index.c', 'pe.c', 'options.c',
+ ext_modules = [Extension('mappy',
+ sources = ['python/mappy.pyx', 'align.c', 'bseq.c', 'lchain.c', 'seed.c', 'format.c', 'hit.c', 'index.c', 'pe.c', 'options.c',
'ksw2_extd2_sse.c', 'ksw2_exts2_sse.c', 'ksw2_extz2_sse.c', 'ksw2_ll_sse.c',
'kalloc.c', 'kthread.c', 'map.c', 'misc.c', 'sdust.c', 'sketch.c', 'esterr.c', 'splitidx.c'],
depends = ['minimap.h', 'bseq.h', 'kalloc.h', 'kdq.h', 'khash.h', 'kseq.h', 'ksort.h',
@@ -62,4 +52,4 @@ setup(
'Programming Language :: Python :: 3',
'Intended Audience :: Science/Research',
'Topic :: Scientific/Engineering :: Bio-Informatics'],
- cmdclass = cmdclass)
+ setup_requires=["cython"])