Merge branch 'master' into utec
This commit is contained in:
commit
b046052d82
|
|
@ -0,0 +1,21 @@
|
|||
name: CI
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
pull_request:
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
compiler: [gcc, clang]
|
||||
|
||||
steps:
|
||||
- name: Checkout minimap2
|
||||
uses: actions/checkout@v2
|
||||
|
||||
- name: Compile with ${{ matrix.compiler }}
|
||||
run: make CC=${{ matrix.compiler }}
|
||||
|
|
@ -0,0 +1,3 @@
|
|||
[submodule "lib/simde"]
|
||||
path = lib/simde
|
||||
url = https://github.com/nemequ/simde.git
|
||||
|
|
@ -19,6 +19,6 @@ matrix:
|
|||
before_install: pip install cython
|
||||
script: python setup.py build_ext
|
||||
- language: python
|
||||
python: "3.6"
|
||||
python: "3.9"
|
||||
before_install: pip install cython
|
||||
script: python setup.py build_ext
|
||||
|
|
|
|||
|
|
@ -4,7 +4,6 @@ include ksw2_dispatch.c
|
|||
include main.c
|
||||
include README.md
|
||||
include sse2neon/emmintrin.h
|
||||
include python/mappy.c
|
||||
include python/cmappy.h
|
||||
include python/cmappy.pxd
|
||||
include python/mappy.pyx
|
||||
|
|
|
|||
36
Makefile
36
Makefile
|
|
@ -1,7 +1,9 @@
|
|||
CFLAGS= -g -Wall -O2 -Wc++-compat #-Wextra
|
||||
CPPFLAGS= -DHAVE_KALLOC
|
||||
INCLUDES=
|
||||
OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o chain.o align.o hit.o map.o format.o pe.o esterr.o splitidx.o ksw2_ll_sse.o
|
||||
OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o \
|
||||
lchain.o align.o hit.o seed.o map.o format.o pe.o esterr.o splitidx.o \
|
||||
ksw2_ll_sse.o
|
||||
PROG= minimap2
|
||||
PROG_EXTRA= sdust minimap2-lite
|
||||
LIBS= -lm -lz -lpthread
|
||||
|
|
@ -103,26 +105,28 @@ depend:
|
|||
|
||||
# DO NOT DELETE
|
||||
|
||||
align.o: minimap.h mmpriv.h bseq.h ksw2.h kalloc.h
|
||||
align.o: minimap.h mmpriv.h bseq.h kseq.h ksw2.h kalloc.h
|
||||
bseq.o: bseq.h kvec.h kalloc.h kseq.h
|
||||
chain.o: minimap.h mmpriv.h bseq.h kalloc.h
|
||||
esterr.o: mmpriv.h minimap.h bseq.h
|
||||
esterr.o: mmpriv.h minimap.h bseq.h kseq.h
|
||||
example.o: minimap.h kseq.h
|
||||
format.o: kalloc.h mmpriv.h minimap.h bseq.h
|
||||
hit.o: mmpriv.h minimap.h bseq.h kalloc.h khash.h
|
||||
index.o: kthread.h bseq.h minimap.h mmpriv.h kvec.h kalloc.h khash.h
|
||||
format.o: kalloc.h mmpriv.h minimap.h bseq.h kseq.h
|
||||
hit.o: mmpriv.h minimap.h bseq.h kseq.h kalloc.h khash.h
|
||||
index.o: kthread.h bseq.h minimap.h mmpriv.h kseq.h kvec.h kalloc.h khash.h
|
||||
index.o: ksort.h
|
||||
kalloc.o: kalloc.h
|
||||
ksw2_extd2_sse.o: ksw2.h kalloc.h
|
||||
ksw2_exts2_sse.o: ksw2.h kalloc.h
|
||||
ksw2_extz2_sse.o: ksw2.h kalloc.h
|
||||
ksw2_ll_sse.o: ksw2.h kalloc.h
|
||||
kthread.o: kthread.h
|
||||
main.o: bseq.h minimap.h mmpriv.h ketopt.h
|
||||
map.o: kthread.h kvec.h kalloc.h sdust.h mmpriv.h minimap.h bseq.h khash.h
|
||||
map.o: ksort.h
|
||||
misc.o: mmpriv.h minimap.h bseq.h ksort.h
|
||||
options.o: mmpriv.h minimap.h bseq.h
|
||||
pe.o: mmpriv.h minimap.h bseq.h kvec.h kalloc.h ksort.h
|
||||
sdust.o: kalloc.h kdq.h kvec.h ketopt.h sdust.h
|
||||
sketch.o: kvec.h kalloc.h mmpriv.h minimap.h bseq.h
|
||||
splitidx.o: mmpriv.h minimap.h bseq.h
|
||||
lchain.o: mmpriv.h minimap.h bseq.h kseq.h kalloc.h krmq.h
|
||||
main.o: bseq.h minimap.h mmpriv.h kseq.h ketopt.h
|
||||
map.o: kthread.h kvec.h kalloc.h sdust.h mmpriv.h minimap.h bseq.h kseq.h
|
||||
map.o: khash.h ksort.h
|
||||
misc.o: mmpriv.h minimap.h bseq.h kseq.h ksort.h
|
||||
options.o: mmpriv.h minimap.h bseq.h kseq.h
|
||||
pe.o: mmpriv.h minimap.h bseq.h kseq.h kvec.h kalloc.h ksort.h
|
||||
sdust.o: kalloc.h kdq.h kvec.h sdust.h
|
||||
seed.o: mmpriv.h minimap.h bseq.h kseq.h kalloc.h ksort.h
|
||||
sketch.o: kvec.h kalloc.h mmpriv.h minimap.h bseq.h kseq.h
|
||||
splitidx.o: mmpriv.h minimap.h bseq.h kseq.h
|
||||
|
|
|
|||
|
|
@ -0,0 +1,97 @@
|
|||
CFLAGS= -g -Wall -O2 -Wc++-compat #-Wextra
|
||||
CPPFLAGS= -DHAVE_KALLOC -DUSE_SIMDE -DSIMDE_ENABLE_NATIVE_ALIASES
|
||||
INCLUDES= -Ilib/simde
|
||||
OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o chain.o align.o hit.o map.o format.o pe.o esterr.o splitidx.o \
|
||||
ksw2_extz2_simde.o ksw2_extd2_simde.o ksw2_exts2_simde.o ksw2_ll_simde.o
|
||||
PROG= minimap2
|
||||
PROG_EXTRA= sdust minimap2-lite
|
||||
LIBS= -lm -lz -lpthread
|
||||
|
||||
|
||||
ifneq ($(arm_neon),) # if arm_neon is defined
|
||||
ifeq ($(aarch64),) #if aarch64 is not defined
|
||||
CFLAGS+=-D_FILE_OFFSET_BITS=64 -mfpu=neon -fsigned-char
|
||||
else #if aarch64 is defined
|
||||
CFLAGS+=-D_FILE_OFFSET_BITS=64 -fsigned-char
|
||||
endif
|
||||
endif
|
||||
|
||||
ifneq ($(asan),)
|
||||
CFLAGS+=-fsanitize=address
|
||||
LIBS+=-fsanitize=address
|
||||
endif
|
||||
|
||||
ifneq ($(tsan),)
|
||||
CFLAGS+=-fsanitize=thread
|
||||
LIBS+=-fsanitize=thread
|
||||
endif
|
||||
|
||||
.PHONY:all extra clean depend
|
||||
.SUFFIXES:.c .o
|
||||
|
||||
.c.o:
|
||||
$(CC) -c $(CFLAGS) $(CPPFLAGS) $(INCLUDES) $< -o $@
|
||||
|
||||
all:$(PROG)
|
||||
|
||||
extra:all $(PROG_EXTRA)
|
||||
|
||||
minimap2:main.o libminimap2.a
|
||||
$(CC) $(CFLAGS) main.o -o $@ -L. -lminimap2 $(LIBS)
|
||||
|
||||
minimap2-lite:example.o libminimap2.a
|
||||
$(CC) $(CFLAGS) $< -o $@ -L. -lminimap2 $(LIBS)
|
||||
|
||||
libminimap2.a:$(OBJS)
|
||||
$(AR) -csru $@ $(OBJS)
|
||||
|
||||
sdust:sdust.c kalloc.o kalloc.h kdq.h kvec.h kseq.h ketopt.h sdust.h
|
||||
$(CC) -D_SDUST_MAIN $(CFLAGS) $< kalloc.o -o $@ -lz
|
||||
|
||||
ksw2_ll_simde.o:ksw2_ll_sse.c ksw2.h kalloc.h
|
||||
$(CC) -c $(CFLAGS) -msse2 $(CPPFLAGS) $(INCLUDES) $< -o $@
|
||||
|
||||
ksw2_extz2_simde.o:ksw2_extz2_sse.c ksw2.h kalloc.h
|
||||
$(CC) -c $(CFLAGS) -msse4.1 $(CPPFLAGS) $(INCLUDES) $< -o $@
|
||||
|
||||
ksw2_extd2_simde.o:ksw2_extd2_sse.c ksw2.h kalloc.h
|
||||
$(CC) -c $(CFLAGS) -msse4.1 $(CPPFLAGS) $(INCLUDES) $< -o $@
|
||||
|
||||
ksw2_exts2_simde.o:ksw2_exts2_sse.c ksw2.h kalloc.h
|
||||
$(CC) -c $(CFLAGS) -msse4.1 $(CPPFLAGS) $(INCLUDES) $< -o $@
|
||||
|
||||
# other non-file targets
|
||||
|
||||
clean:
|
||||
rm -fr gmon.out *.o a.out $(PROG) $(PROG_EXTRA) *~ *.a *.dSYM build dist mappy*.so mappy.c python/mappy.c mappy.egg*
|
||||
|
||||
depend:
|
||||
(LC_ALL=C; export LC_ALL; makedepend -Y -- $(CFLAGS) $(CPPFLAGS) -- *.c)
|
||||
|
||||
# DO NOT DELETE
|
||||
|
||||
align.o: minimap.h mmpriv.h bseq.h kseq.h ksw2.h kalloc.h
|
||||
bseq.o: bseq.h kvec.h kalloc.h kseq.h
|
||||
chain.o: minimap.h mmpriv.h bseq.h kseq.h kalloc.h
|
||||
esterr.o: mmpriv.h minimap.h bseq.h kseq.h
|
||||
example.o: minimap.h kseq.h
|
||||
format.o: kalloc.h mmpriv.h minimap.h bseq.h kseq.h
|
||||
hit.o: mmpriv.h minimap.h bseq.h kseq.h kalloc.h khash.h
|
||||
index.o: kthread.h bseq.h minimap.h mmpriv.h kseq.h kvec.h kalloc.h khash.h
|
||||
index.o: ksort.h
|
||||
kalloc.o: kalloc.h
|
||||
ksw2_extd2_sse.o: ksw2.h kalloc.h
|
||||
ksw2_exts2_sse.o: ksw2.h kalloc.h
|
||||
ksw2_extz2_sse.o: ksw2.h kalloc.h
|
||||
ksw2_ll_sse.o: ksw2.h kalloc.h
|
||||
kthread.o: kthread.h
|
||||
main.o: bseq.h minimap.h mmpriv.h kseq.h ketopt.h
|
||||
map.o: kthread.h kvec.h kalloc.h sdust.h mmpriv.h minimap.h bseq.h kseq.h
|
||||
map.o: khash.h ksort.h
|
||||
misc.o: mmpriv.h minimap.h bseq.h kseq.h ksort.h
|
||||
options.o: mmpriv.h minimap.h bseq.h kseq.h
|
||||
pe.o: mmpriv.h minimap.h bseq.h kseq.h kvec.h kalloc.h ksort.h
|
||||
sdust.o: kalloc.h kdq.h kvec.h sdust.h
|
||||
self-chain.o: minimap.h kseq.h
|
||||
sketch.o: kvec.h kalloc.h mmpriv.h minimap.h bseq.h kseq.h
|
||||
splitidx.o: mmpriv.h minimap.h bseq.h kseq.h
|
||||
148
NEWS.md
148
NEWS.md
|
|
@ -1,3 +1,151 @@
|
|||
Release 2.21-r1071 (6 July 2021)
|
||||
--------------------------------
|
||||
|
||||
This release fixed a regression in short-read mapping introduced in v2.19
|
||||
(#776). It also fixed invalid comparisons of uninitialized variables, though
|
||||
these are harmless (#752). Long-read alignment should be identical to v2.20.
|
||||
|
||||
(2.21: 6 July 2021)
|
||||
|
||||
|
||||
|
||||
Release 2.20-r1061 (27 May 2021)
|
||||
--------------------------------
|
||||
|
||||
This release fixed a bug in the Python module and improves the command-line
|
||||
compatibiliity with v2.18. In v2.19, if `-r` is specified with an `asm*` preset,
|
||||
users would get alignments more fragmented than v2.18. This could be an issue
|
||||
for existing pipelines specifying `-r`. This release resolves this issue.
|
||||
|
||||
(2.20: 27 May 2021, r1061)
|
||||
|
||||
|
||||
|
||||
Release 2.19-r1057 (26 May 2021)
|
||||
--------------------------------
|
||||
|
||||
This release includes a few important improvements backported from unimap:
|
||||
|
||||
* Improvement: more contiguous alignment through long INDELs. This is enabled
|
||||
by the minigraph chaining algorithm. All `asm*` presets now use the new
|
||||
algorithm. They can find INDELs up to 100kb and may be faster for
|
||||
chromosome-long contigs. The default mode and `map*` presets use this
|
||||
algorithm to replace the long-join heuristic.
|
||||
|
||||
* Improvement: better alignment in highly repetitive regions by rescuing
|
||||
high-occurrence seeds. If the distance between two adjacent seeds is too
|
||||
large, attempt to choose a fraction of high-occurrence seeds in-between.
|
||||
Minimap2 now produces fewer clippings and alignment break points in long
|
||||
satellite regions.
|
||||
|
||||
* Improvement: allow to specify an interval of k-mer occurrences with `-U`.
|
||||
For repeat-rich genomes, the automatic k-mer occurrence threshold determined
|
||||
by `-f` may be too large and makes alignment impractically slow. The new
|
||||
option protects against such cases. Enabled for `asm*` and `map-hifi`.
|
||||
|
||||
* New feature: added the `map-hifi` preset for maping PacBio High-Fidelity
|
||||
(HiFi) reads.
|
||||
|
||||
* Change to the default: apply `--cap-sw-mem=100m` for genomic alignment.
|
||||
|
||||
* Bugfix: minimap2 could not generate an index file with `-xsr` (#734).
|
||||
|
||||
This release represents the most signficant algorithmic change since v2.1 in
|
||||
2017. With features backported from unimap, minimap2 now has similar power to
|
||||
unimap for contig alignment. Unimap will remain an experimental project and is
|
||||
no longer recommended over minimap2. Sorry for reverting the recommendation in
|
||||
short time.
|
||||
|
||||
(2.19: 26 May 2021, r1057)
|
||||
|
||||
|
||||
|
||||
Release 2.18-r1015 (9 April 2021)
|
||||
---------------------------------
|
||||
|
||||
This release fixes multiple rare bugs in minimap2 and adds additional
|
||||
functionality to paftools.js.
|
||||
|
||||
Changes to minimap2:
|
||||
|
||||
* Bugfix: a rare segfault caused by an off-by-one error (#489)
|
||||
|
||||
* Bugfix: minimap2 segfaulted due to an uninitilized variable (#622 and #625).
|
||||
|
||||
* Bugfix: minimap2 parsed spaces as field separators in BED (#721). This led
|
||||
to issues when the BED name column contains spaces.
|
||||
|
||||
* Bugfix: minimap2 `--split-prefix` did not work with long reference names
|
||||
(#394).
|
||||
|
||||
* Bugfix: option `--junc-bonus` didn't work (#513)
|
||||
|
||||
* Bugfix: minimap2 didn't return 1 on I/O errors (#532)
|
||||
|
||||
* Bugfix: the `de:f` tag (sequence divergence) could be negative if there were
|
||||
ambiguous bases
|
||||
|
||||
* Bugfix: fixed two undefined behaviors caused by calling memcpy() on
|
||||
zero-length blocks (#443)
|
||||
|
||||
* Bugfix: there were duplicated SAM @SQ lines if option `--split-prefix` is in
|
||||
use (#400 and #527)
|
||||
|
||||
* Bugfix: option -K had to be smaller than 2 billion (#491). This was caused
|
||||
by a 32-bit integer overflow.
|
||||
|
||||
* Improvement: optionally compile against SIMDe (#597). Minimap2 should work
|
||||
with IBM POWER CPUs, though this has not been tested. To compile with SIMDe,
|
||||
please use `make -f Makefile.simde`.
|
||||
|
||||
* Improvement: more informative error message for I/O errors (#454) and for
|
||||
FASTQ parsing errors (#510)
|
||||
|
||||
* Improvement: abort given malformatted RG line (#541)
|
||||
|
||||
* Improvement: better formula to estimate the `dv:f` tag (approximate sequence
|
||||
divergence). See DOI:10.1101/2021.01.15.426881.
|
||||
|
||||
* New feature: added the `--mask-len` option to fine control the removal of
|
||||
redundant hits (#659). The default behavior is unchanged.
|
||||
|
||||
Changes to mappy:
|
||||
|
||||
* Bugfix: mappy caused segmentation fault if the reference index is not
|
||||
present (#413).
|
||||
|
||||
* Bugfix: fixed a memory leak via 238b6bb3
|
||||
|
||||
* Change: always require Cython to compile the mappy module (#723). Older
|
||||
mappy packages at PyPI bundled the C source code generated by Cython such
|
||||
that end users did not need to install Cython to compile mappy. However, as
|
||||
Python 3.9 is breaking backward compatibility, older mappy does not work
|
||||
with Python 3.9 anymore. We have to add this Cython dependency as a
|
||||
workaround.
|
||||
|
||||
Changes to paftools.js:
|
||||
|
||||
* Bugfix: the "part10-" line from asmgene was wrong (#581)
|
||||
|
||||
* Improvement: compatibility with GTF files from GenBank (#422)
|
||||
|
||||
* New feature: asmgene also checks missing multi-copy genes
|
||||
|
||||
* New feature: added the misjoin command to evaluate large-scale misjoins and
|
||||
megabase-long inversions.
|
||||
|
||||
Although given the many bug fixes and minor improvements, the core algorithm
|
||||
stays the same. This version of minimap2 produces nearly identical alignments
|
||||
to v2.17 except very rare corner cases.
|
||||
|
||||
Now unimap is recommended over minimap2 for aligning long contigs against a
|
||||
reference genome. It often takes less wall-clock time and is much more
|
||||
sensitive to long insertions and deletions.
|
||||
|
||||
(2.18: 9 April 2021, r1015)
|
||||
|
||||
|
||||
|
||||
Release 2.17-r941 (4 May 2019)
|
||||
------------------------------
|
||||
|
||||
|
|
|
|||
49
README.md
49
README.md
|
|
@ -1,7 +1,7 @@
|
|||
[](https://github.com/lh3/minimap2/releases)
|
||||
[](https://anaconda.org/bioconda/minimap2)
|
||||
[](https://pypi.python.org/pypi/mappy)
|
||||
[](https://travis-ci.org/lh3/minimap2)
|
||||
[](https://github.com/lh3/minimap2/actions)
|
||||
## <a name="started"></a>Getting Started
|
||||
```sh
|
||||
git clone https://github.com/lh3/minimap2
|
||||
|
|
@ -12,19 +12,22 @@ cd minimap2 && make
|
|||
./minimap2 -x map-ont -d MT-human-ont.mmi test/MT-human.fa
|
||||
./minimap2 -a MT-human-ont.mmi test/MT-orang.fa > test.sam
|
||||
# use presets (no test data)
|
||||
./minimap2 -ax map-pb ref.fa pacbio.fq.gz > aln.sam # PacBio genomic reads
|
||||
./minimap2 -ax map-pb ref.fa pacbio.fq.gz > aln.sam # PacBio CLR genomic reads
|
||||
./minimap2 -ax map-ont ref.fa ont.fq.gz > aln.sam # Oxford Nanopore genomic reads
|
||||
./minimap2 -ax asm20 ref.fa pacbio-ccs.fq.gz > aln.sam # PacBio CCS genomic reads
|
||||
./minimap2 -ax map-hifi ref.fa pacbio-ccs.fq.gz > aln.sam # PacBio HiFi/CCS genomic reads (v2.19 or later)
|
||||
./minimap2 -ax asm20 ref.fa pacbio-ccs.fq.gz > aln.sam # PacBio HiFi/CCS genomic reads (v2.18 or earlier)
|
||||
./minimap2 -ax sr ref.fa read1.fa read2.fa > aln.sam # short genomic paired-end reads
|
||||
./minimap2 -ax splice ref.fa rna-reads.fa > aln.sam # spliced long reads (strand unknown)
|
||||
./minimap2 -ax splice -uf -k14 ref.fa reads.fa > aln.sam # noisy Nanopore Direct RNA-seq
|
||||
./minimap2 -ax splice:hq -uf ref.fa query.fa > aln.sam # Final PacBio Iso-seq or traditional cDNA
|
||||
./minimap2 -ax splice --junc-bed anno.bed12 ref.fa query.fa > aln.sam # prioritize on annotated junctions
|
||||
./minimap2 -cx asm5 asm1.fa asm2.fa > aln.paf # intra-species asm-to-asm alignment
|
||||
./minimap2 -x ava-pb reads.fa reads.fa > overlaps.paf # PacBio read overlap
|
||||
./minimap2 -x ava-ont reads.fa reads.fa > overlaps.paf # Nanopore read overlap
|
||||
# man page for detailed command line options
|
||||
man ./minimap2.1
|
||||
```
|
||||
|
||||
## Table of Contents
|
||||
|
||||
- [Getting Started](#started)
|
||||
|
|
@ -71,8 +74,8 @@ Detailed evaluations are available from the [minimap2 paper][doi] or the
|
|||
Minimap2 is optimized for x86-64 CPUs. You can acquire precompiled binaries from
|
||||
the [release page][release] with:
|
||||
```sh
|
||||
curl -L https://github.com/lh3/minimap2/releases/download/v2.17/minimap2-2.17_x64-linux.tar.bz2 | tar -jxvf -
|
||||
./minimap2-2.17_x64-linux/minimap2
|
||||
curl -L https://github.com/lh3/minimap2/releases/download/v2.21/minimap2-2.21_x64-linux.tar.bz2 | tar -jxvf -
|
||||
./minimap2-2.21_x64-linux/minimap2
|
||||
```
|
||||
If you want to compile from the source, you need to have a C compiler, GNU make
|
||||
and zlib development files installed. Then type `make` in the source code
|
||||
|
|
@ -80,13 +83,20 @@ directory to compile. If you see compilation errors, try `make sse2only=1`
|
|||
to disable SSE4 code, which will make minimap2 slightly slower.
|
||||
|
||||
Minimap2 also works with ARM CPUs supporting the NEON instruction sets. To
|
||||
compile for 32 bit ARM architectures (such as ARMv7), use `make arm_neon=1`. To compile for for 64 bit ARM architectures (such as ARMv8), use `make arm_neon=1 aarch64=1`.
|
||||
compile for 32 bit ARM architectures (such as ARMv7), use `make arm_neon=1`. To
|
||||
compile for for 64 bit ARM architectures (such as ARMv8), use `make arm_neon=1
|
||||
aarch64=1`.
|
||||
|
||||
Minimap2 can use [SIMD Everywhere (SIMDe)][simde] library for porting
|
||||
implementation to the different SIMD instruction sets. To compile using SIMDe,
|
||||
use `make -f Makefile.simde`. To compile for ARM CPUs, use `Makefile.simde`
|
||||
with the ARM related command lines given above.
|
||||
|
||||
### <a name="general"></a>General usage
|
||||
|
||||
Without any options, minimap2 takes a reference database and a query sequence
|
||||
file as input and produce approximate mapping, without base-level alignment
|
||||
(i.e. no CIGAR), in the [PAF format][paf]:
|
||||
(i.e. coordinates are only approximate and no CIGAR in output), in the [PAF format][paf]:
|
||||
```sh
|
||||
minimap2 ref.fa query.fq > approx-mapping.paf
|
||||
```
|
||||
|
|
@ -127,13 +137,13 @@ parameters at the same time. The default setting is the same as `map-ont`.
|
|||
#### <a name="map-long-genomic"></a>Map long noisy genomic reads
|
||||
|
||||
```sh
|
||||
minimap2 -ax map-pb ref.fa pacbio-reads.fq > aln.sam # for PacBio subreads
|
||||
minimap2 -ax map-pb ref.fa pacbio-reads.fq > aln.sam # for PacBio CLR reads
|
||||
minimap2 -ax map-ont ref.fa ont-reads.fq > aln.sam # for Oxford Nanopore reads
|
||||
```
|
||||
The difference between `map-pb` and `map-ont` is that `map-pb` uses
|
||||
homopolymer-compressed (HPC) minimizers as seeds, while `map-ont` uses ordinary
|
||||
minimizers as seeds. Emperical evaluation suggests HPC minimizers improve
|
||||
performance and sensitivity when aligning PacBio reads, but hurt when aligning
|
||||
performance and sensitivity when aligning PacBio CLR reads, but hurt when aligning
|
||||
Nanopore reads.
|
||||
|
||||
#### <a name="map-long-splice"></a>Map long mRNA/cDNA reads
|
||||
|
|
@ -178,10 +188,23 @@ This is because SIRV does not honor the evolutionarily conservative splicing
|
|||
signal. If you are studying SIRV, you may apply `--splice-flank=no` to let
|
||||
minimap2 only model GT..AG, ignoring the additional base.
|
||||
|
||||
Since v2.17, minimap2 can optionally take annotated genes as input and
|
||||
prioritize on annotated splice junctions. To use this feature, you can
|
||||
```sh
|
||||
paftools.js gff2bed anno.gff > anno.bed
|
||||
minimap2 -ax splice --junc-bed anno.bed ref.fa query.fa > aln.sam
|
||||
```
|
||||
Here, `anno.gff` is the gene annotation in the GTF or GFF3 format (`gff2bed`
|
||||
automatically tests the format). The output of `gff2bed` is in the 12-column
|
||||
BED format, or the BED12 format. With the `--junc-bed` option, minimap2 adds a
|
||||
bonus score (tuned by `--junc-bonus`) if an aligned junction matches a junction
|
||||
in the annotation. Option `--junc-bed` also takes 5-column BED, including the
|
||||
strand field. In this case, each line indicates an oriented junction.
|
||||
|
||||
#### <a name="long-overlap"></a>Find overlaps between long reads
|
||||
|
||||
```sh
|
||||
minimap2 -x ava-pb reads.fq reads.fq > ovlp.paf # PacBio read overlap
|
||||
minimap2 -x ava-pb reads.fq reads.fq > ovlp.paf # PacBio CLR read overlap
|
||||
minimap2 -x ava-ont reads.fq reads.fq > ovlp.paf # Oxford Nanopore read overlap
|
||||
```
|
||||
Similarly, `ava-pb` uses HPC minimizers while `ava-ont` uses ordinary
|
||||
|
|
@ -228,7 +251,7 @@ To avoid this issue, you can add option `-L` at the minimap2 command line.
|
|||
This option moves a long CIGAR to the `CG` tag and leaves a fully clipped CIGAR
|
||||
at the SAM CIGAR column. Current tools that don't read CIGAR (e.g. merging and
|
||||
sorting) still work with such BAM records; tools that read CIGAR will
|
||||
effectively ignore these records. It has been decided that future tools will
|
||||
effectively ignore these records. It has been decided that future tools
|
||||
will seamlessly recognize long-cigar records generated by option `-L`.
|
||||
|
||||
**TL;DR**: if you work with ultra-long reads and use tools that only process
|
||||
|
|
@ -249,7 +272,7 @@ CGATCGATAAATAGAGTAG---GAATAGCA
|
|||
CGATCG---AATAGAGTAGGTCGAATtGCA
|
||||
```
|
||||
is represented as `:6-ata:10+gtc:4*at:3`, where `:[0-9]+` represents an
|
||||
identical block, `-ata` represents a deltion, `+gtc` an insertion and `*at`
|
||||
identical block, `-ata` represents a deletion, `+gtc` an insertion and `*at`
|
||||
indicates reference base `a` is substituted with a query base `t`. It is
|
||||
similar to the `MD` SAM tag but is standalone and easier to parse.
|
||||
|
||||
|
|
@ -376,3 +399,5 @@ mappy` or [from BioConda][mappyconda] via `conda install -c bioconda mappy`.
|
|||
[manpage]: https://lh3.github.io/minimap2/minimap2.html
|
||||
[manpage-cs]: https://lh3.github.io/minimap2/minimap2.html#10
|
||||
[doi]: https://doi.org/10.1093/bioinformatics/bty191
|
||||
[smide]: https://github.com/nemequ/simde
|
||||
[unimap]: https://github.com/lh3/unimap
|
||||
|
|
|
|||
79
align.c
79
align.c
|
|
@ -38,8 +38,8 @@ static inline void update_max_zdrop(int32_t score, int i, int j, int32_t *max, i
|
|||
int z = *max - score - diff * e;
|
||||
if (z > *max_zdrop) {
|
||||
*max_zdrop = z;
|
||||
pos[0][0] = *max_i, pos[0][1] = i + 1;
|
||||
pos[1][0] = *max_j, pos[1][1] = j + 1;
|
||||
pos[0][0] = *max_i, pos[0][1] = i;
|
||||
pos[1][0] = *max_j, pos[1][1] = j;
|
||||
}
|
||||
} else *max = score, *max_i = i, *max_j = j;
|
||||
}
|
||||
|
|
@ -53,16 +53,16 @@ static int mm_test_zdrop(void *km, const mm_mapopt_t *opt, const uint8_t *qseq,
|
|||
// find the score and the region where score drops most along diagonal
|
||||
for (k = 0, score = 0; k < n_cigar; ++k) {
|
||||
uint32_t l, op = cigar[k]&0xf, len = cigar[k]>>4;
|
||||
if (op == 0) {
|
||||
if (op == MM_CIGAR_MATCH) {
|
||||
for (l = 0; l < len; ++l) {
|
||||
score += mat[tseq[i + l] * 5 + qseq[j + l]];
|
||||
update_max_zdrop(score, i+l, j+l, &max, &max_i, &max_j, opt->e, &max_zdrop, pos);
|
||||
}
|
||||
i += len, j += len;
|
||||
} else if (op == 1 || op == 2 || op == 3) {
|
||||
} else if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL || op == MM_CIGAR_N_SKIP) {
|
||||
score -= opt->q + opt->e * len;
|
||||
if (op == 1) j += len; // insertion
|
||||
else i += len; // deletion
|
||||
if (op == MM_CIGAR_INS) j += len;
|
||||
else i += len;
|
||||
update_max_zdrop(score, i, j, &max, &max_i, &max_j, opt->e, &max_zdrop, pos);
|
||||
}
|
||||
}
|
||||
|
|
@ -98,12 +98,12 @@ static void mm_fix_cigar(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq,
|
|||
for (k = 0; k < p->n_cigar; ++k) { // indel left alignment
|
||||
uint32_t op = p->cigar[k]&0xf, len = p->cigar[k]>>4;
|
||||
if (len == 0) to_shrink = 1;
|
||||
if (op == 0) {
|
||||
if (op == MM_CIGAR_MATCH) {
|
||||
toff += len, qoff += len;
|
||||
} else if (op == 1 || op == 2) { // insertion or deletion
|
||||
} else if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL) {
|
||||
if (k > 0 && k < p->n_cigar - 1 && (p->cigar[k-1]&0xf) == 0 && (p->cigar[k+1]&0xf) == 0) {
|
||||
int l, prev_len = p->cigar[k-1] >> 4;
|
||||
if (op == 1) {
|
||||
if (op == MM_CIGAR_INS) {
|
||||
for (l = 0; l < prev_len; ++l)
|
||||
if (qseq[qoff - 1 - l] != qseq[qoff + len - 1 - l])
|
||||
break;
|
||||
|
|
@ -116,9 +116,9 @@ static void mm_fix_cigar(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq,
|
|||
p->cigar[k-1] -= l<<4, p->cigar[k+1] += l<<4, qoff -= l, toff -= l;
|
||||
if (l == prev_len) to_shrink = 1;
|
||||
}
|
||||
if (op == 1) qoff += len;
|
||||
if (op == MM_CIGAR_INS) qoff += len;
|
||||
else toff += len;
|
||||
} else if (op == 3) {
|
||||
} else if (op == MM_CIGAR_N_SKIP) {
|
||||
toff += len;
|
||||
}
|
||||
}
|
||||
|
|
@ -128,13 +128,13 @@ static void mm_fix_cigar(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq,
|
|||
uint32_t l, s[3] = {0,0,0};
|
||||
for (l = k; l < p->n_cigar; ++l) { // count number of adjacent I and D
|
||||
uint32_t op = p->cigar[l]&0xf;
|
||||
if (op == 1 || op == 2 || p->cigar[l]>>4 == 0)
|
||||
if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL || p->cigar[l]>>4 == 0)
|
||||
s[op] += p->cigar[l] >> 4;
|
||||
else break;
|
||||
}
|
||||
if (s[1] > 0 && s[2] > 0 && l - k > 2) { // turn to a single I and a single D
|
||||
p->cigar[k] = s[1]<<4|1;
|
||||
p->cigar[k+1] = s[2]<<4|2;
|
||||
p->cigar[k] = s[1]<<4|MM_CIGAR_INS;
|
||||
p->cigar[k+1] = s[2]<<4|MM_CIGAR_DEL;
|
||||
for (k += 2; k < l; ++k)
|
||||
p->cigar[k] &= 0xf;
|
||||
to_shrink = 1;
|
||||
|
|
@ -154,9 +154,9 @@ static void mm_fix_cigar(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq,
|
|||
else p->cigar[k+1] += p->cigar[k]>>4<<4; // add length to the next CIGAR operator
|
||||
p->n_cigar = l;
|
||||
}
|
||||
if ((p->cigar[0]&0xf) == 1 || (p->cigar[0]&0xf) == 2) { // get rid of leading I or D
|
||||
if ((p->cigar[0]&0xf) == MM_CIGAR_INS || (p->cigar[0]&0xf) == MM_CIGAR_DEL) { // get rid of leading I or D
|
||||
int32_t l = p->cigar[0] >> 4;
|
||||
if ((p->cigar[0]&0xf) == 1) {
|
||||
if ((p->cigar[0]&0xf) == MM_CIGAR_INS) {
|
||||
if (r->rev) r->qe -= l;
|
||||
else r->qs += l;
|
||||
*qshift = l;
|
||||
|
|
@ -174,7 +174,7 @@ static void mm_update_cigar_eqx(mm_reg1_t *r, const uint8_t *qseq, const uint8_t
|
|||
if (r->p == 0) return;
|
||||
for (k = 0; k < r->p->n_cigar; ++k) {
|
||||
uint32_t op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4;
|
||||
if (op == 0) {
|
||||
if (op == MM_CIGAR_MATCH) {
|
||||
while (len > 0) {
|
||||
for (l = 0; l < len && qseq[qoff + l] == tseq[toff + l]; ++l) {} // run of "="; TODO: N<=>N is converted to "="
|
||||
if (l > 0) { ++n_EQX; len -= l; toff += l; qoff += l; }
|
||||
|
|
@ -183,11 +183,11 @@ static void mm_update_cigar_eqx(mm_reg1_t *r, const uint8_t *qseq, const uint8_t
|
|||
if (l > 0) { ++n_EQX; len -= l; toff += l; qoff += l; }
|
||||
}
|
||||
++n_M;
|
||||
} else if (op == 1) { // insertion
|
||||
} else if (op == MM_CIGAR_INS) {
|
||||
qoff += len;
|
||||
} else if (op == 2) { // deletion
|
||||
} else if (op == MM_CIGAR_DEL) {
|
||||
toff += len;
|
||||
} else if (op == 3) { // intron
|
||||
} else if (op == MM_CIGAR_N_SKIP) {
|
||||
toff += len;
|
||||
}
|
||||
}
|
||||
|
|
@ -195,7 +195,7 @@ static void mm_update_cigar_eqx(mm_reg1_t *r, const uint8_t *qseq, const uint8_t
|
|||
if (n_EQX == n_M) {
|
||||
for (k = 0; k < r->p->n_cigar; ++k) {
|
||||
uint32_t op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4;
|
||||
if (op == 0) r->p->cigar[k] = len << 4 | 7;
|
||||
if (op == MM_CIGAR_MATCH) r->p->cigar[k] = len << 4 | MM_CIGAR_EQ_MATCH;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
|
@ -209,25 +209,25 @@ static void mm_update_cigar_eqx(mm_reg1_t *r, const uint8_t *qseq, const uint8_t
|
|||
toff = qoff = m = 0;
|
||||
for (k = 0; k < r->p->n_cigar; ++k) {
|
||||
uint32_t op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4;
|
||||
if (op == 0) { // match/mismatch
|
||||
if (op == MM_CIGAR_MATCH) {
|
||||
while (len > 0) {
|
||||
// match
|
||||
for (l = 0; l < len && qseq[qoff + l] == tseq[toff + l]; ++l) {}
|
||||
if (l > 0) p->cigar[m++] = l << 4 | 7;
|
||||
if (l > 0) p->cigar[m++] = l << 4 | MM_CIGAR_EQ_MATCH;
|
||||
len -= l;
|
||||
toff += l, qoff += l;
|
||||
// mismatch
|
||||
for (l = 0; l < len && qseq[qoff + l] != tseq[toff + l]; ++l) {}
|
||||
if (l > 0) p->cigar[m++] = l << 4 | 8;
|
||||
if (l > 0) p->cigar[m++] = l << 4 | MM_CIGAR_X_MISMATCH;
|
||||
len -= l;
|
||||
toff += l, qoff += l;
|
||||
}
|
||||
continue;
|
||||
} else if (op == 1) { // insertion
|
||||
} else if (op == MM_CIGAR_INS) {
|
||||
qoff += len;
|
||||
} else if (op == 2) { // deletion
|
||||
} else if (op == MM_CIGAR_DEL) {
|
||||
toff += len;
|
||||
} else if (op == 3) { // intron
|
||||
} else if (op == MM_CIGAR_N_SKIP) {
|
||||
toff += len;
|
||||
}
|
||||
p->cigar[m++] = r->p->cigar[k];
|
||||
|
|
@ -248,7 +248,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts
|
|||
r->blen = r->mlen = 0;
|
||||
for (k = 0; k < p->n_cigar; ++k) {
|
||||
uint32_t op = p->cigar[k]&0xf, len = p->cigar[k]>>4;
|
||||
if (op == 0) { // match/mismatch
|
||||
if (op == MM_CIGAR_MATCH) {
|
||||
int n_ambi = 0, n_diff = 0;
|
||||
for (l = 0; l < len; ++l) {
|
||||
int cq = qseq[qoff + l], ct = tseq[toff + l];
|
||||
|
|
@ -260,7 +260,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts
|
|||
}
|
||||
r->blen += len - n_ambi, r->mlen += len - (n_ambi + n_diff), p->n_ambi += n_ambi;
|
||||
toff += len, qoff += len;
|
||||
} else if (op == 1) { // insertion
|
||||
} else if (op == MM_CIGAR_INS) {
|
||||
int n_ambi = 0;
|
||||
for (l = 0; l < len; ++l)
|
||||
if (qseq[qoff + l] > 3) ++n_ambi;
|
||||
|
|
@ -268,7 +268,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts
|
|||
s -= q + e * len;
|
||||
if (s < 0) s = 0;
|
||||
qoff += len;
|
||||
} else if (op == 2) { // deletion
|
||||
} else if (op == MM_CIGAR_DEL) {
|
||||
int n_ambi = 0;
|
||||
for (l = 0; l < len; ++l)
|
||||
if (tseq[toff + l] > 3) ++n_ambi;
|
||||
|
|
@ -276,7 +276,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts
|
|||
s -= q + e * len;
|
||||
if (s < 0) s = 0;
|
||||
toff += len;
|
||||
} else if (op == 3) { // intron
|
||||
} else if (op == MM_CIGAR_N_SKIP) {
|
||||
toff += len;
|
||||
}
|
||||
}
|
||||
|
|
@ -333,7 +333,7 @@ static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint
|
|||
int i;
|
||||
fprintf(stderr, "score=%d, cigar=", ez->score);
|
||||
for (i = 0; i < ez->n_cigar; ++i)
|
||||
fprintf(stderr, "%d%c", ez->cigar[i]>>4, "MIDN"[ez->cigar[i]&0xf]);
|
||||
fprintf(stderr, "%d%c", ez->cigar[i]>>4, MM_CIGAR_STR[ez->cigar[i]&0xf]);
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
}
|
||||
|
|
@ -572,7 +572,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
|
|||
int is_sr = !!(opt->flag & MM_F_SR), is_splice = !!(opt->flag & MM_F_SPLICE);
|
||||
int32_t rid = a[r->as].x<<1>>33, rev = a[r->as].x>>63, as1, cnt1;
|
||||
uint8_t *tseq, *qseq, *junc;
|
||||
int32_t i, l, bw, dropped = 0, extra_flag = 0, rs0, re0, qs0, qe0;
|
||||
int32_t i, l, bw, bw_long, dropped = 0, extra_flag = 0, rs0, re0, qs0, qe0;
|
||||
int32_t rs, re, qs, qe;
|
||||
int32_t rs1, qs1, re1, qe1;
|
||||
int8_t mat[25];
|
||||
|
|
@ -583,6 +583,8 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
|
|||
if (r->cnt == 0) return;
|
||||
ksw_gen_simple_mat(5, mat, opt->a, opt->b, opt->sc_ambi);
|
||||
bw = (int)(opt->bw * 1.5 + 1.);
|
||||
bw_long = (int)(opt->bw_long * 1.5 + 1.);
|
||||
if (bw_long < bw) bw_long = bw;
|
||||
|
||||
if (is_sr && !(mi->flag & MM_I_HPC)) {
|
||||
mm_max_stretch(r, a, &as1, &cnt1);
|
||||
|
|
@ -724,7 +726,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
|
|||
} else mm_adjust_minier(mi, qseq0, &a[as1 + i], &re, &qe);
|
||||
re1 = re, qe1 = qe;
|
||||
if (i == cnt1 - 1 || (a[as1+i].y&MM_SEED_LONG_JOIN) || (qe - qs >= opt->min_ksw_len && re - rs >= opt->min_ksw_len)) {
|
||||
int j, bw1 = bw, zdrop_code;
|
||||
int j, bw1 = bw_long, zdrop_code;
|
||||
if (a[as1+i].y & MM_SEED_LONG_JOIN)
|
||||
bw1 = qe - qs > re - rs? qe - qs : re - rs;
|
||||
// perform alignment
|
||||
|
|
@ -743,7 +745,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
|
|||
if (qseq[j] >= 4 || tseq[j] >= 4) ez->score += opt->e2;
|
||||
else ez->score += qseq[j] == tseq[j]? opt->a : -opt->b;
|
||||
}
|
||||
ez->cigar = ksw_push_cigar(km, &ez->n_cigar, &ez->m_cigar, ez->cigar, 0, qe - qs);
|
||||
ez->cigar = ksw_push_cigar(km, &ez->n_cigar, &ez->m_cigar, ez->cigar, MM_CIGAR_MATCH, qe - qs);
|
||||
} else { // perform normal gapped alignment
|
||||
mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, junc, mat, bw1, -1, opt->zdrop, extra_flag|KSW_EZ_APPROX_MAX, ez); // first pass: with approximate Z-drop
|
||||
}
|
||||
|
|
@ -754,6 +756,13 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
|
|||
if (ez->n_cigar > 0)
|
||||
mm_append_cigar(r, ez->n_cigar, ez->cigar);
|
||||
if (ez->zdropped) { // truncated by Z-drop; TODO: sometimes Z-drop kicks in because the next seed placement is wrong. This can be fixed in principle.
|
||||
if (!r->p) {
|
||||
assert(ez->n_cigar == 0);
|
||||
uint32_t capacity = sizeof(mm_extra_t)/4;
|
||||
kroundup32(capacity);
|
||||
r->p = (mm_extra_t*)calloc(capacity, 4);
|
||||
r->p->capacity = capacity;
|
||||
}
|
||||
for (j = i - 1; j >= 0; --j)
|
||||
if ((int32_t)a[as1 + j].x <= rs + ez->max_t)
|
||||
break;
|
||||
|
|
|
|||
164
chain.c
164
chain.c
|
|
@ -1,164 +0,0 @@
|
|||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "minimap.h"
|
||||
#include "mmpriv.h"
|
||||
#include "kalloc.h"
|
||||
|
||||
static const char LogTable256[256] = {
|
||||
#define LT(n) n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n
|
||||
-1, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
|
||||
LT(4), LT(5), LT(5), LT(6), LT(6), LT(6), LT(6),
|
||||
LT(7), LT(7), LT(7), LT(7), LT(7), LT(7), LT(7), LT(7)
|
||||
};
|
||||
|
||||
static inline int ilog2_32(uint32_t v)
|
||||
{
|
||||
uint32_t t, tt;
|
||||
if ((tt = v>>16)) return (t = tt>>8) ? 24 + LogTable256[t] : 16 + LogTable256[tt];
|
||||
return (t = v>>8) ? 8 + LogTable256[t] : LogTable256[v];
|
||||
}
|
||||
|
||||
mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float gap_scale, int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km)
|
||||
{ // TODO: make sure this works when n has more than 32 bits
|
||||
int32_t k, *f, *p, *t, *v, n_u, n_v;
|
||||
int64_t i, j, st = 0;
|
||||
uint64_t *u, *u2, sum_qspan = 0;
|
||||
float avg_qspan;
|
||||
mm128_t *b, *w;
|
||||
|
||||
if (_u) *_u = 0, *n_u_ = 0;
|
||||
if (n == 0 || a == 0) {
|
||||
kfree(km, a);
|
||||
return 0;
|
||||
}
|
||||
f = (int32_t*)kmalloc(km, n * 4);
|
||||
p = (int32_t*)kmalloc(km, n * 4);
|
||||
t = (int32_t*)kmalloc(km, n * 4);
|
||||
v = (int32_t*)kmalloc(km, n * 4);
|
||||
memset(t, 0, n * 4);
|
||||
|
||||
for (i = 0; i < n; ++i) sum_qspan += a[i].y>>32&0xff;
|
||||
avg_qspan = (float)sum_qspan / n;
|
||||
|
||||
// fill the score and backtrack arrays
|
||||
for (i = 0; i < n; ++i) {
|
||||
uint64_t ri = a[i].x;
|
||||
int64_t max_j = -1;
|
||||
int32_t qi = (int32_t)a[i].y, q_span = a[i].y>>32&0xff; // NB: only 8 bits of span is used!!!
|
||||
int32_t max_f = q_span, n_skip = 0, min_d;
|
||||
int32_t sidi = (a[i].y & MM_SEED_SEG_MASK) >> MM_SEED_SEG_SHIFT;
|
||||
while (st < i && ri > a[st].x + max_dist_x) ++st;
|
||||
if (i - st > max_iter) st = i - max_iter;
|
||||
for (j = i - 1; j >= st; --j) {
|
||||
int64_t dr = ri - a[j].x;
|
||||
int32_t dq = qi - (int32_t)a[j].y, dd, sc, log_dd, gap_cost;
|
||||
int32_t sidj = (a[j].y & MM_SEED_SEG_MASK) >> MM_SEED_SEG_SHIFT;
|
||||
if ((sidi == sidj && dr == 0) || dq <= 0) continue; // don't skip if an anchor is used by multiple segments; see below
|
||||
if ((sidi == sidj && dq > max_dist_y) || dq > max_dist_x) continue;
|
||||
dd = dr > dq? dr - dq : dq - dr;
|
||||
if (sidi == sidj && dd > bw) continue;
|
||||
if (n_segs > 1 && !is_cdna && sidi == sidj && dr > max_dist_y) continue;
|
||||
min_d = dq < dr? dq : dr;
|
||||
sc = min_d > q_span? q_span : dq < dr? dq : dr;
|
||||
log_dd = dd? ilog2_32(dd) : 0;
|
||||
gap_cost = 0;
|
||||
if (is_cdna || sidi != sidj) {
|
||||
int c_log, c_lin;
|
||||
c_lin = (int)(dd * .01 * avg_qspan);
|
||||
c_log = log_dd;
|
||||
if (sidi != sidj && dr == 0) ++sc; // possibly due to overlapping paired ends; give a minor bonus
|
||||
else if (dr > dq || sidi != sidj) gap_cost = c_lin < c_log? c_lin : c_log;
|
||||
else gap_cost = c_lin + (c_log>>1);
|
||||
} else gap_cost = (int)(dd * .01 * avg_qspan) + (log_dd>>1);
|
||||
sc -= (int)((double)gap_cost * gap_scale + .499);
|
||||
sc += f[j];
|
||||
if (sc > max_f) {
|
||||
max_f = sc, max_j = j;
|
||||
if (n_skip > 0) --n_skip;
|
||||
} else if (t[j] == i) {
|
||||
if (++n_skip > max_skip)
|
||||
break;
|
||||
}
|
||||
if (p[j] >= 0) t[p[j]] = i;
|
||||
}
|
||||
f[i] = max_f, p[i] = max_j;
|
||||
v[i] = max_j >= 0 && v[max_j] > max_f? v[max_j] : max_f; // v[] keeps the peak score up to i; f[] is the score ending at i, not always the peak
|
||||
}
|
||||
|
||||
// find the ending positions of chains
|
||||
memset(t, 0, n * 4);
|
||||
for (i = 0; i < n; ++i)
|
||||
if (p[i] >= 0) t[p[i]] = 1;
|
||||
for (i = n_u = 0; i < n; ++i)
|
||||
if (t[i] == 0 && v[i] >= min_sc)
|
||||
++n_u;
|
||||
if (n_u == 0) {
|
||||
kfree(km, a); kfree(km, f); kfree(km, p); kfree(km, t); kfree(km, v);
|
||||
return 0;
|
||||
}
|
||||
u = (uint64_t*)kmalloc(km, n_u * 8);
|
||||
for (i = n_u = 0; i < n; ++i) {
|
||||
if (t[i] == 0 && v[i] >= min_sc) {
|
||||
j = i;
|
||||
while (j >= 0 && f[j] < v[j]) j = p[j]; // find the peak that maximizes f[]
|
||||
if (j < 0) j = i; // TODO: this should really be assert(j>=0)
|
||||
u[n_u++] = (uint64_t)f[j] << 32 | j;
|
||||
}
|
||||
}
|
||||
radix_sort_64(u, u + n_u);
|
||||
for (i = 0; i < n_u>>1; ++i) { // reverse, s.t. the highest scoring chain is the first
|
||||
uint64_t t = u[i];
|
||||
u[i] = u[n_u - i - 1], u[n_u - i - 1] = t;
|
||||
}
|
||||
|
||||
// backtrack
|
||||
memset(t, 0, n * 4);
|
||||
for (i = n_v = k = 0; i < n_u; ++i) { // starting from the highest score
|
||||
int32_t n_v0 = n_v, k0 = k;
|
||||
j = (int32_t)u[i];
|
||||
do {
|
||||
v[n_v++] = j;
|
||||
t[j] = 1;
|
||||
j = p[j];
|
||||
} while (j >= 0 && t[j] == 0);
|
||||
if (j < 0) {
|
||||
if (n_v - n_v0 >= min_cnt) u[k++] = u[i]>>32<<32 | (n_v - n_v0);
|
||||
} else if ((int32_t)(u[i]>>32) - f[j] >= min_sc) {
|
||||
if (n_v - n_v0 >= min_cnt) u[k++] = ((u[i]>>32) - f[j]) << 32 | (n_v - n_v0);
|
||||
}
|
||||
if (k0 == k) n_v = n_v0; // no new chain added, reset
|
||||
}
|
||||
*n_u_ = n_u = k, *_u = u; // NB: note that u[] may not be sorted by score here
|
||||
|
||||
// free temporary arrays
|
||||
kfree(km, f); kfree(km, p); kfree(km, t);
|
||||
|
||||
// write the result to b[]
|
||||
b = (mm128_t*)kmalloc(km, n_v * sizeof(mm128_t));
|
||||
for (i = 0, k = 0; i < n_u; ++i) {
|
||||
int32_t k0 = k, ni = (int32_t)u[i];
|
||||
for (j = 0; j < ni; ++j)
|
||||
b[k] = a[v[k0 + (ni - j - 1)]], ++k;
|
||||
}
|
||||
kfree(km, v);
|
||||
|
||||
// sort u[] and a[] by a[].x, such that adjacent chains may be joined (required by mm_join_long)
|
||||
w = (mm128_t*)kmalloc(km, n_u * sizeof(mm128_t));
|
||||
for (i = k = 0; i < n_u; ++i) {
|
||||
w[i].x = b[k].x, w[i].y = (uint64_t)k<<32|i;
|
||||
k += (int32_t)u[i];
|
||||
}
|
||||
radix_sort_128x(w, w + n_u);
|
||||
u2 = (uint64_t*)kmalloc(km, n_u * 8);
|
||||
for (i = k = 0; i < n_u; ++i) {
|
||||
int32_t j = (int32_t)w[i].y, n = (int32_t)u[j];
|
||||
u2[i] = u[j];
|
||||
memcpy(&a[k], &b[w[i].y>>32], n * sizeof(mm128_t));
|
||||
k += n;
|
||||
}
|
||||
if (n_u) memcpy(u, u2, n_u * 8);
|
||||
if (k) memcpy(b, a, k * sizeof(mm128_t)); // write _a_ to _b_ and deallocate _a_ because _a_ is oversized, sometimes a lot
|
||||
kfree(km, a); kfree(km, w); kfree(km, u2);
|
||||
return b;
|
||||
}
|
||||
|
|
@ -0,0 +1,30 @@
|
|||
## Contributor Code of Conduct
|
||||
|
||||
As contributors and maintainers of this project, we pledge to respect all
|
||||
people who contribute through reporting issues, posting feature requests,
|
||||
updating documentation, submitting pull requests or patches, and other
|
||||
activities.
|
||||
|
||||
We are committed to making participation in this project a harassment-free
|
||||
experience for everyone, regardless of level of experience, gender, gender
|
||||
identity and expression, sexual orientation, disability, personal appearance,
|
||||
body size, race, age, or religion.
|
||||
|
||||
Examples of unacceptable behavior by participants include the use of sexual
|
||||
language or imagery, derogatory comments or personal attacks, trolling, public
|
||||
or private harassment, insults, or other unprofessional conduct.
|
||||
|
||||
Project maintainers have the right and responsibility to remove, edit, or
|
||||
reject comments, commits, code, wiki edits, issues, and other contributions
|
||||
that are not aligned to this Code of Conduct. Project maintainers or
|
||||
contributors who do not follow the Code of Conduct may be removed from the
|
||||
project team.
|
||||
|
||||
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
||||
reported by opening an issue or contacting the maintainer via email.
|
||||
|
||||
This Code of Conduct is adapted from the [Contributor Covenant][cc], [version
|
||||
1.0.0][v1].
|
||||
|
||||
[cc]: http://contributor-covenant.org/
|
||||
[v1]: http://contributor-covenant.org/version/1/0/0/
|
||||
|
|
@ -31,8 +31,8 @@ To acquire the data used in this cookbook and to install minimap2 and paftools,
|
|||
please follow the command lines below:
|
||||
```sh
|
||||
# install minimap2 executables
|
||||
curl -L https://github.com/lh3/minimap2/releases/download/v2.17/minimap2-2.17_x64-linux.tar.bz2 | tar jxf -
|
||||
cp minimap2-2.17_x64-linux/{minimap2,k8,paftools.js} . # copy executables
|
||||
curl -L https://github.com/lh3/minimap2/releases/download/v2.21/minimap2-2.21_x64-linux.tar.bz2 | tar jxf -
|
||||
cp minimap2-2.21_x64-linux/{minimap2,k8,paftools.js} . # copy executables
|
||||
export PATH="$PATH:"`pwd` # put the current directory on PATH
|
||||
# download example datasets
|
||||
curl -L https://github.com/lh3/minimap2/releases/download/v2.10/cookbook-data.tgz | tar zxf -
|
||||
|
|
|
|||
2
esterr.c
2
esterr.c
|
|
@ -59,6 +59,6 @@ void mm_est_err(const mm_idx_t *mi, int qlen, int n_regs, mm_reg1_t *regs, const
|
|||
n_tot = en - st + 1;
|
||||
if (r->qs > avg_k && r->rs > avg_k) ++n_tot;
|
||||
if (qlen - r->qs > avg_k && l_ref - r->re > avg_k) ++n_tot;
|
||||
r->div = logf((float)n_tot / n_match) / avg_k;
|
||||
r->div = n_match >= n_tot? 0.0f : (float)(1.0 - pow((double)n_match / n_tot, 1.0 / avg_k));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -47,7 +47,7 @@ int main(int argc, char *argv[])
|
|||
printf("%s\t%d\t%d\t%d\t%c\t", ks->name.s, ks->seq.l, r->qs, r->qe, "+-"[r->rev]);
|
||||
printf("%s\t%d\t%d\t%d\t%d\t%d\t%d\tcg:Z:", mi->seq[r->rid].name, mi->seq[r->rid].len, r->rs, r->re, r->mlen, r->blen, r->mapq);
|
||||
for (i = 0; i < r->p->n_cigar; ++i) // IMPORTANT: this gives the CIGAR in the aligned regions. NO soft/hard clippings!
|
||||
printf("%d%c", r->p->cigar[i]>>4, "MIDNSH"[r->p->cigar[i]&0xf]);
|
||||
printf("%d%c", r->p->cigar[i]>>4, MM_CIGAR_STR[r->p->cigar[i]&0xf]);
|
||||
putchar('\n');
|
||||
free(r->p);
|
||||
}
|
||||
|
|
|
|||
28
format.c
28
format.c
|
|
@ -144,8 +144,8 @@ static void write_cs_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq
|
|||
if (write_tag) mm_sprintf_lite(s, "\tcs:Z:");
|
||||
for (i = q_off = t_off = 0; i < (int)r->p->n_cigar; ++i) {
|
||||
int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4;
|
||||
assert((op >= 0 && op <= 3) || op == 7 || op == 8);
|
||||
if (op == 0 || op == 7 || op == 8) { // match
|
||||
assert((op >= MM_CIGAR_MATCH && op <= MM_CIGAR_N_SKIP) || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH);
|
||||
if (op == MM_CIGAR_MATCH || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH) {
|
||||
int l_tmp = 0;
|
||||
for (j = 0; j < len; ++j) {
|
||||
if (qseq[q_off + j] != tseq[t_off + j]) {
|
||||
|
|
@ -166,12 +166,12 @@ static void write_cs_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq
|
|||
} else mm_sprintf_lite(s, ":%d", l_tmp);
|
||||
}
|
||||
q_off += len, t_off += len;
|
||||
} else if (op == 1) { // insertion to ref
|
||||
} else if (op == MM_CIGAR_INS) {
|
||||
for (j = 0, tmp[len] = 0; j < len; ++j)
|
||||
tmp[j] = "acgtn"[qseq[q_off + j]];
|
||||
mm_sprintf_lite(s, "+%s", tmp);
|
||||
q_off += len;
|
||||
} else if (op == 2) { // deletion from ref
|
||||
} else if (op == MM_CIGAR_DEL) {
|
||||
for (j = 0, tmp[len] = 0; j < len; ++j)
|
||||
tmp[j] = "acgtn"[tseq[t_off + j]];
|
||||
mm_sprintf_lite(s, "-%s", tmp);
|
||||
|
|
@ -192,8 +192,8 @@ static void write_MD_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq
|
|||
if (write_tag) mm_sprintf_lite(s, "\tMD:Z:");
|
||||
for (i = q_off = t_off = 0; i < (int)r->p->n_cigar; ++i) {
|
||||
int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4;
|
||||
assert((op >= 0 && op <= 3) || op == 7 || op == 8);
|
||||
if (op == 0 || op == 7 || op == 8) { // match
|
||||
assert((op >= MM_CIGAR_MATCH && op <= MM_CIGAR_N_SKIP) || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH);
|
||||
if (op == MM_CIGAR_MATCH || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH) {
|
||||
for (j = 0; j < len; ++j) {
|
||||
if (qseq[q_off + j] != tseq[t_off + j]) {
|
||||
mm_sprintf_lite(s, "%d%c", l_MD, "ACGTN"[tseq[t_off + j]]);
|
||||
|
|
@ -201,15 +201,15 @@ static void write_MD_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq
|
|||
} else ++l_MD;
|
||||
}
|
||||
q_off += len, t_off += len;
|
||||
} else if (op == 1) { // insertion to ref
|
||||
} else if (op == MM_CIGAR_INS) {
|
||||
q_off += len;
|
||||
} else if (op == 2) { // deletion from ref
|
||||
} else if (op == MM_CIGAR_DEL) {
|
||||
for (j = 0, tmp[len] = 0; j < len; ++j)
|
||||
tmp[j] = "ACGTN"[tseq[t_off + j]];
|
||||
mm_sprintf_lite(s, "%d^%s", l_MD, tmp);
|
||||
l_MD = 0;
|
||||
t_off += len;
|
||||
} else if (op == 3) { // reference skip
|
||||
} else if (op == MM_CIGAR_N_SKIP) {
|
||||
t_off += len;
|
||||
}
|
||||
}
|
||||
|
|
@ -277,7 +277,7 @@ double mm_event_identity(const mm_reg1_t *r)
|
|||
if (r->p == 0) return -1.0f;
|
||||
for (i = 0; i < r->p->n_cigar; ++i) {
|
||||
int32_t op = r->p->cigar[i] & 0xf, len = r->p->cigar[i] >> 4;
|
||||
if (op == 1 || op == 2)
|
||||
if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL)
|
||||
++n_gapo, n_gap += len;
|
||||
}
|
||||
return (double)r->mlen / (r->blen + r->p->n_ambi - n_gap + n_gapo);
|
||||
|
|
@ -335,7 +335,7 @@ void mm_write_paf3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const
|
|||
uint32_t k;
|
||||
mm_sprintf_lite(s, "\tcg:Z:");
|
||||
for (k = 0; k < r->p->n_cigar; ++k)
|
||||
mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MIDNSHP=XB"[r->p->cigar[k]&0xf]);
|
||||
mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, MM_CIGAR_STR[r->p->cigar[k]&0xf]);
|
||||
}
|
||||
if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_MD)))
|
||||
write_cs_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD, 1, opt_flag&MM_F_QSTRAND);
|
||||
|
|
@ -392,7 +392,7 @@ static void write_sam_cigar(kstring_t *s, int sam_flag, int in_tag, int qlen, co
|
|||
assert(clip_len[0] < qlen && clip_len[1] < qlen);
|
||||
if (clip_len[0]) mm_sprintf_lite(s, "%d%c", clip_len[0], clip_char);
|
||||
for (k = 0; k < r->p->n_cigar; ++k)
|
||||
mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MIDNSHP=XB"[r->p->cigar[k]&0xf]);
|
||||
mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, MM_CIGAR_STR[r->p->cigar[k]&0xf]);
|
||||
if (clip_len[1]) mm_sprintf_lite(s, "%d%c", clip_len[1], clip_char);
|
||||
}
|
||||
}
|
||||
|
|
@ -402,7 +402,7 @@ void mm_write_sam3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int se
|
|||
{
|
||||
const int max_bam_cigar_op = 65535;
|
||||
int flag, n_regs = n_regss[seg_idx], cigar_in_tag = 0;
|
||||
int this_rid = -1, this_pos = -1, this_rev = 0;
|
||||
int this_rid = -1, this_pos = -1;
|
||||
const mm_reg1_t *regs = regss[seg_idx], *r_prev = NULL, *r_next;
|
||||
const mm_reg1_t *r = n_regs > 0 && reg_idx < n_regs && reg_idx >= 0? ®s[reg_idx] : NULL;
|
||||
|
||||
|
|
@ -451,7 +451,7 @@ void mm_write_sam3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int se
|
|||
mm_sprintf_lite(s, "\t%s\t%d\t0\t*", mi->seq[this_rid].name, this_pos+1);
|
||||
} else mm_sprintf_lite(s, "\t*\t0\t0\t*");
|
||||
} else {
|
||||
this_rid = r->rid, this_pos = r->rs, this_rev = r->rev;
|
||||
this_rid = r->rid, this_pos = r->rs;
|
||||
mm_sprintf_lite(s, "\t%s\t%d\t%d\t", mi->seq[r->rid].name, r->rs+1, r->mapq);
|
||||
if ((opt_flag & MM_F_LONG_CIGAR) && r->p && r->p->n_cigar > max_bam_cigar_op - 2) {
|
||||
int n_cigar = r->p->n_cigar;
|
||||
|
|
|
|||
62
hit.c
62
hit.c
|
|
@ -122,7 +122,7 @@ void mm_split_reg(mm_reg1_t *r, mm_reg1_t *r2, int n, int qlen, mm128_t *a, int
|
|||
r->split |= 1, r2->split |= 2;
|
||||
}
|
||||
|
||||
void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r, int sub_diff, int hard_mask_level, float alt_diff_frac) // and compute mm_reg1_t::subsc
|
||||
void mm_set_parent(void *km, float mask_level, int mask_len, int n, mm_reg1_t *r, int sub_diff, int hard_mask_level, float alt_diff_frac) // and compute mm_reg1_t::subsc
|
||||
{
|
||||
int i, j, k, *w;
|
||||
uint64_t *cov;
|
||||
|
|
@ -162,7 +162,7 @@ skip_uncov:
|
|||
min = ej - sj < ei - si? ej - sj : ei - si;
|
||||
max = ej - sj > ei - si? ej - sj : ei - si;
|
||||
ol = si < sj? (ei < sj? 0 : ei < ej? ei - sj : ej - sj) : (ej < si? 0 : ej < ei? ej - si : ei - si); // overlap length; TODO: this can be simplified
|
||||
if ((float)ol / min - (float)uncov_len / max > mask_level) {
|
||||
if ((float)ol / min - (float)uncov_len / max > mask_level && uncov_len <= mask_len) { // then this is a secondary hit
|
||||
int cnt_sub = 0, sci = ri->score;
|
||||
ri->parent = rp->parent;
|
||||
if (!rp->is_alt && ri->is_alt) sci = mm_alt_score(sci, alt_diff_frac);
|
||||
|
|
@ -312,64 +312,6 @@ int mm_squeeze_a(void *km, int n_regs, mm_reg1_t *regs, mm128_t *a)
|
|||
return as;
|
||||
}
|
||||
|
||||
void mm_join_long(void *km, const mm_mapopt_t *opt, int qlen, int *n_regs_, mm_reg1_t *regs, mm128_t *a)
|
||||
{
|
||||
int i, n_aux, n_regs = *n_regs_, n_drop = 0;
|
||||
uint64_t *aux;
|
||||
|
||||
if (n_regs < 2) return; // nothing to join
|
||||
mm_squeeze_a(km, n_regs, regs, a);
|
||||
|
||||
aux = (uint64_t*)kmalloc(km, n_regs * 8);
|
||||
for (i = n_aux = 0; i < n_regs; ++i)
|
||||
if (regs[i].parent == i || regs[i].parent < 0)
|
||||
aux[n_aux++] = (uint64_t)regs[i].as << 32 | i;
|
||||
radix_sort_64(aux, aux + n_aux);
|
||||
|
||||
for (i = n_aux - 1; i >= 1; --i) {
|
||||
mm_reg1_t *r0 = ®s[(int32_t)aux[i-1]], *r1 = ®s[(int32_t)aux[i]];
|
||||
mm128_t *a0e, *a1s;
|
||||
int max_gap, min_gap, sc_thres, min_flank_len;
|
||||
|
||||
// test
|
||||
if (r0->as + r0->cnt != r1->as) continue; // not adjacent in a[]
|
||||
if (r0->rid != r1->rid || r0->rev != r1->rev) continue; // make sure on the same target and strand
|
||||
a0e = &a[r0->as + r0->cnt - 1];
|
||||
a1s = &a[r1->as];
|
||||
if (a1s->x <= a0e->x || (int32_t)a1s->y <= (int32_t)a0e->y) continue; // keep colinearity
|
||||
max_gap = min_gap = (int32_t)a1s->y - (int32_t)a0e->y;
|
||||
max_gap = a0e->x + max_gap > a1s->x? max_gap : a1s->x - a0e->x;
|
||||
min_gap = a0e->x + min_gap < a1s->x? min_gap : a1s->x - a0e->x;
|
||||
if (max_gap > opt->max_join_long || min_gap > opt->max_join_short) continue;
|
||||
sc_thres = (int)((float)opt->min_join_flank_sc / opt->max_join_long * max_gap + .499);
|
||||
if (r0->score < sc_thres || r1->score < sc_thres) continue; // require good flanking chains
|
||||
min_flank_len = (int)(max_gap * opt->min_join_flank_ratio);
|
||||
if (r0->re - r0->rs < min_flank_len || r0->qe - r0->qs < min_flank_len) continue; // require enough flanking length
|
||||
if (r1->re - r1->rs < min_flank_len || r1->qe - r1->qs < min_flank_len) continue;
|
||||
|
||||
// all conditions satisfied; join
|
||||
a[r1->as].y |= MM_SEED_LONG_JOIN;
|
||||
r0->cnt += r1->cnt, r0->score += r1->score;
|
||||
mm_reg_set_coor(r0, qlen, a, opt->flag&MM_F_QSTRAND);
|
||||
r1->cnt = 0;
|
||||
r1->parent = r0->id;
|
||||
++n_drop;
|
||||
}
|
||||
kfree(km, aux);
|
||||
|
||||
if (n_drop > 0) { // then fix the hits hierarchy
|
||||
for (i = 0; i < n_regs; ++i) { // adjust the mm_reg1_t::parent
|
||||
mm_reg1_t *r = ®s[i];
|
||||
if (r->parent >= 0 && r->id != r->parent) { // fix for secondary hits only
|
||||
if (regs[r->parent].parent >= 0 && regs[r->parent].parent != r->parent)
|
||||
r->parent = regs[r->parent].parent;
|
||||
}
|
||||
}
|
||||
mm_filter_regs(opt, qlen, n_regs_, regs);
|
||||
mm_sync_regs(km, *n_regs_, regs);
|
||||
}
|
||||
}
|
||||
|
||||
mm_seg_t *mm_seg_gen(void *km, uint32_t hash, int n_segs, const int *qlens, int n_regs0, const mm_reg1_t *regs0, int *n_regs, mm_reg1_t **regs, const mm128_t *a)
|
||||
{
|
||||
int s, i, j, acc_qlen[MM_MAX_SEG+1], qlen_sum = 0;
|
||||
|
|
|
|||
2
index.c
2
index.c
|
|
@ -676,7 +676,7 @@ mm_idx_intv_t *mm_idx_read_bed(const mm_idx_t *mi, const char *fn, int read_junc
|
|||
char *p, *q, *bl, *bs;
|
||||
int32_t i, id = -1, n_blk = 0;
|
||||
for (p = q = str.s, i = 0;; ++p) {
|
||||
if (*p == 0 || isspace(*p)) {
|
||||
if (*p == 0 || *p == '\t') {
|
||||
int32_t c = *p;
|
||||
*p = 0;
|
||||
if (i == 0) { // chr
|
||||
|
|
|
|||
39
kalloc.h
39
kalloc.h
|
|
@ -34,4 +34,43 @@ void km_stat(const void *_km, km_stat_t *s);
|
|||
KREALLOC((km), (a), (m)); \
|
||||
} while (0)
|
||||
|
||||
#ifndef klib_unused
|
||||
#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3)
|
||||
#define klib_unused __attribute__ ((__unused__))
|
||||
#else
|
||||
#define klib_unused
|
||||
#endif
|
||||
#endif /* klib_unused */
|
||||
|
||||
#define KALLOC_POOL_INIT2(SCOPE, name, kmptype_t) \
|
||||
typedef struct { \
|
||||
size_t cnt, n, max; \
|
||||
kmptype_t **buf; \
|
||||
void *km; \
|
||||
} kmp_##name##_t; \
|
||||
SCOPE kmp_##name##_t *kmp_init_##name(void *km) { \
|
||||
kmp_##name##_t *mp; \
|
||||
KCALLOC(km, mp, 1); \
|
||||
mp->km = km; \
|
||||
return mp; \
|
||||
} \
|
||||
SCOPE void kmp_destroy_##name(kmp_##name##_t *mp) { \
|
||||
size_t k; \
|
||||
for (k = 0; k < mp->n; ++k) kfree(mp->km, mp->buf[k]); \
|
||||
kfree(mp->km, mp->buf); kfree(mp->km, mp); \
|
||||
} \
|
||||
SCOPE kmptype_t *kmp_alloc_##name(kmp_##name##_t *mp) { \
|
||||
++mp->cnt; \
|
||||
if (mp->n == 0) return (kmptype_t*)kcalloc(mp->km, 1, sizeof(kmptype_t)); \
|
||||
return mp->buf[--mp->n]; \
|
||||
} \
|
||||
SCOPE void kmp_free_##name(kmp_##name##_t *mp, kmptype_t *p) { \
|
||||
--mp->cnt; \
|
||||
if (mp->n == mp->max) KEXPAND(mp->km, mp->buf, mp->max); \
|
||||
mp->buf[mp->n++] = p; \
|
||||
}
|
||||
|
||||
#define KALLOC_POOL_INIT(name, kmptype_t) \
|
||||
KALLOC_POOL_INIT2(static inline klib_unused, name, kmptype_t)
|
||||
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -0,0 +1,474 @@
|
|||
/* The MIT License
|
||||
|
||||
Copyright (c) 2019 by Attractive Chaos <attractor@live.co.uk>
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
/* An example:
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include "krmq.h"
|
||||
|
||||
struct my_node {
|
||||
char key;
|
||||
KRMQ_HEAD(struct my_node) head;
|
||||
};
|
||||
#define my_cmp(p, q) (((q)->key < (p)->key) - ((p)->key < (q)->key))
|
||||
KRMQ_INIT(my, struct my_node, head, my_cmp)
|
||||
|
||||
int main(void) {
|
||||
const char *str = "MNOLKQOPHIA"; // from wiki, except a duplicate
|
||||
struct my_node *root = 0;
|
||||
int i, l = strlen(str);
|
||||
for (i = 0; i < l; ++i) { // insert in the input order
|
||||
struct my_node *q, *p = malloc(sizeof(*p));
|
||||
p->key = str[i];
|
||||
q = krmq_insert(my, &root, p, 0);
|
||||
if (p != q) free(p); // if already present, free
|
||||
}
|
||||
krmq_itr_t(my) itr;
|
||||
krmq_itr_first(my, root, &itr); // place at first
|
||||
do { // traverse
|
||||
const struct my_node *p = krmq_at(&itr);
|
||||
putchar(p->key);
|
||||
free((void*)p); // free node
|
||||
} while (krmq_itr_next(my, &itr));
|
||||
putchar('\n');
|
||||
return 0;
|
||||
}
|
||||
*/
|
||||
|
||||
#ifndef KRMQ_H
|
||||
#define KRMQ_H
|
||||
|
||||
#ifdef __STRICT_ANSI__
|
||||
#define inline __inline__
|
||||
#endif
|
||||
|
||||
#define KRMQ_MAX_DEPTH 64
|
||||
|
||||
#define krmq_size(head, p) ((p)? (p)->head.size : 0)
|
||||
#define krmq_size_child(head, q, i) ((q)->head.p[(i)]? (q)->head.p[(i)]->head.size : 0)
|
||||
|
||||
#define KRMQ_HEAD(__type) \
|
||||
struct { \
|
||||
__type *p[2], *s; \
|
||||
signed char balance; /* balance factor */ \
|
||||
unsigned size; /* #elements in subtree */ \
|
||||
}
|
||||
|
||||
#define __KRMQ_FIND(suf, __scope, __type, __head, __cmp) \
|
||||
__scope __type *krmq_find_##suf(const __type *root, const __type *x, unsigned *cnt_) { \
|
||||
const __type *p = root; \
|
||||
unsigned cnt = 0; \
|
||||
while (p != 0) { \
|
||||
int cmp; \
|
||||
cmp = __cmp(x, p); \
|
||||
if (cmp >= 0) cnt += krmq_size_child(__head, p, 0) + 1; \
|
||||
if (cmp < 0) p = p->__head.p[0]; \
|
||||
else if (cmp > 0) p = p->__head.p[1]; \
|
||||
else break; \
|
||||
} \
|
||||
if (cnt_) *cnt_ = cnt; \
|
||||
return (__type*)p; \
|
||||
} \
|
||||
__scope __type *krmq_interval_##suf(const __type *root, const __type *x, __type **lower, __type **upper) { \
|
||||
const __type *p = root, *l = 0, *u = 0; \
|
||||
while (p != 0) { \
|
||||
int cmp; \
|
||||
cmp = __cmp(x, p); \
|
||||
if (cmp < 0) u = p, p = p->__head.p[0]; \
|
||||
else if (cmp > 0) l = p, p = p->__head.p[1]; \
|
||||
else { l = u = p; break; } \
|
||||
} \
|
||||
if (lower) *lower = (__type*)l; \
|
||||
if (upper) *upper = (__type*)u; \
|
||||
return (__type*)p; \
|
||||
}
|
||||
|
||||
#define __KRMQ_RMQ(suf, __scope, __type, __head, __cmp, __lt2) \
|
||||
__scope __type *krmq_rmq_##suf(const __type *root, const __type *lo, const __type *up) { /* CLOSED interval */ \
|
||||
const __type *p = root, *path[2][KRMQ_MAX_DEPTH], *min; \
|
||||
int plen[2] = {0, 0}, pcmp[2][KRMQ_MAX_DEPTH], i, cmp, lca; \
|
||||
if (root == 0) return 0; \
|
||||
while (p) { \
|
||||
cmp = __cmp(lo, p); \
|
||||
path[0][plen[0]] = p, pcmp[0][plen[0]++] = cmp; \
|
||||
if (cmp < 0) p = p->__head.p[0]; \
|
||||
else if (cmp > 0) p = p->__head.p[1]; \
|
||||
else break; \
|
||||
} \
|
||||
p = root; \
|
||||
while (p) { \
|
||||
cmp = __cmp(up, p); \
|
||||
path[1][plen[1]] = p, pcmp[1][plen[1]++] = cmp; \
|
||||
if (cmp < 0) p = p->__head.p[0]; \
|
||||
else if (cmp > 0) p = p->__head.p[1]; \
|
||||
else break; \
|
||||
} \
|
||||
for (i = 0; i < plen[0] && i < plen[1]; ++i) /* find the LCA */ \
|
||||
if (path[0][i] == path[1][i] && pcmp[0][i] <= 0 && pcmp[1][i] >= 0) \
|
||||
break; \
|
||||
if (i == plen[0] || i == plen[1]) return 0; /* no elements in the closed interval */ \
|
||||
lca = i, min = path[0][lca]; \
|
||||
for (i = lca + 1; i < plen[0]; ++i) { \
|
||||
if (pcmp[0][i] <= 0) { \
|
||||
if (__lt2(path[0][i], min)) min = path[0][i]; \
|
||||
if (path[0][i]->__head.p[1] && __lt2(path[0][i]->__head.p[1]->__head.s, min)) \
|
||||
min = path[0][i]->__head.p[1]->__head.s; \
|
||||
} \
|
||||
} \
|
||||
for (i = lca + 1; i < plen[1]; ++i) { \
|
||||
if (pcmp[1][i] >= 0) { \
|
||||
if (__lt2(path[1][i], min)) min = path[1][i]; \
|
||||
if (path[1][i]->__head.p[0] && __lt2(path[1][i]->__head.p[0]->__head.s, min)) \
|
||||
min = path[1][i]->__head.p[0]->__head.s; \
|
||||
} \
|
||||
} \
|
||||
return (__type*)min; \
|
||||
}
|
||||
|
||||
#define __KRMQ_ROTATE(suf, __type, __head, __lt2) \
|
||||
/* */ \
|
||||
static inline void krmq_update_min_##suf(__type *p, const __type *q, const __type *r) { \
|
||||
p->__head.s = !q || __lt2(p, q->__head.s)? p : q->__head.s; \
|
||||
p->__head.s = !r || __lt2(p->__head.s, r->__head.s)? p->__head.s : r->__head.s; \
|
||||
} \
|
||||
/* one rotation: (a,(b,c)q)p => ((a,b)p,c)q */ \
|
||||
static inline __type *krmq_rotate1_##suf(__type *p, int dir) { /* dir=0 to left; dir=1 to right */ \
|
||||
int opp = 1 - dir; /* opposite direction */ \
|
||||
__type *q = p->__head.p[opp], *s = p->__head.s; \
|
||||
unsigned size_p = p->__head.size; \
|
||||
p->__head.size -= q->__head.size - krmq_size_child(__head, q, dir); \
|
||||
q->__head.size = size_p; \
|
||||
krmq_update_min_##suf(p, p->__head.p[dir], q->__head.p[dir]); \
|
||||
q->__head.s = s; \
|
||||
p->__head.p[opp] = q->__head.p[dir]; \
|
||||
q->__head.p[dir] = p; \
|
||||
return q; \
|
||||
} \
|
||||
/* two consecutive rotations: (a,((b,c)r,d)q)p => ((a,b)p,(c,d)q)r */ \
|
||||
static inline __type *krmq_rotate2_##suf(__type *p, int dir) { \
|
||||
int b1, opp = 1 - dir; \
|
||||
__type *q = p->__head.p[opp], *r = q->__head.p[dir], *s = p->__head.s; \
|
||||
unsigned size_x_dir = krmq_size_child(__head, r, dir); \
|
||||
r->__head.size = p->__head.size; \
|
||||
p->__head.size -= q->__head.size - size_x_dir; \
|
||||
q->__head.size -= size_x_dir + 1; \
|
||||
krmq_update_min_##suf(p, p->__head.p[dir], r->__head.p[dir]); \
|
||||
krmq_update_min_##suf(q, q->__head.p[opp], r->__head.p[opp]); \
|
||||
r->__head.s = s; \
|
||||
p->__head.p[opp] = r->__head.p[dir]; \
|
||||
r->__head.p[dir] = p; \
|
||||
q->__head.p[dir] = r->__head.p[opp]; \
|
||||
r->__head.p[opp] = q; \
|
||||
b1 = dir == 0? +1 : -1; \
|
||||
if (r->__head.balance == b1) q->__head.balance = 0, p->__head.balance = -b1; \
|
||||
else if (r->__head.balance == 0) q->__head.balance = p->__head.balance = 0; \
|
||||
else q->__head.balance = b1, p->__head.balance = 0; \
|
||||
r->__head.balance = 0; \
|
||||
return r; \
|
||||
}
|
||||
|
||||
#define __KRMQ_INSERT(suf, __scope, __type, __head, __cmp, __lt2) \
|
||||
__scope __type *krmq_insert_##suf(__type **root_, __type *x, unsigned *cnt_) { \
|
||||
unsigned char stack[KRMQ_MAX_DEPTH]; \
|
||||
__type *path[KRMQ_MAX_DEPTH]; \
|
||||
__type *bp, *bq; \
|
||||
__type *p, *q, *r = 0; /* _r_ is potentially the new root */ \
|
||||
int i, which = 0, top, b1, path_len; \
|
||||
unsigned cnt = 0; \
|
||||
bp = *root_, bq = 0; \
|
||||
/* find the insertion location */ \
|
||||
for (p = bp, q = bq, top = path_len = 0; p; q = p, p = p->__head.p[which]) { \
|
||||
int cmp; \
|
||||
cmp = __cmp(x, p); \
|
||||
if (cmp >= 0) cnt += krmq_size_child(__head, p, 0) + 1; \
|
||||
if (cmp == 0) { \
|
||||
if (cnt_) *cnt_ = cnt; \
|
||||
return p; \
|
||||
} \
|
||||
if (p->__head.balance != 0) \
|
||||
bq = q, bp = p, top = 0; \
|
||||
stack[top++] = which = (cmp > 0); \
|
||||
path[path_len++] = p; \
|
||||
} \
|
||||
if (cnt_) *cnt_ = cnt; \
|
||||
x->__head.balance = 0, x->__head.size = 1, x->__head.p[0] = x->__head.p[1] = 0, x->__head.s = x; \
|
||||
if (q == 0) *root_ = x; \
|
||||
else q->__head.p[which] = x; \
|
||||
if (bp == 0) return x; \
|
||||
for (i = 0; i < path_len; ++i) ++path[i]->__head.size; \
|
||||
for (i = path_len - 1; i >= 0; --i) { \
|
||||
krmq_update_min_##suf(path[i], path[i]->__head.p[0], path[i]->__head.p[1]); \
|
||||
if (path[i]->__head.s != x) break; \
|
||||
} \
|
||||
for (p = bp, top = 0; p != x; p = p->__head.p[stack[top]], ++top) /* update balance factors */ \
|
||||
if (stack[top] == 0) --p->__head.balance; \
|
||||
else ++p->__head.balance; \
|
||||
if (bp->__head.balance > -2 && bp->__head.balance < 2) return x; /* no re-balance needed */ \
|
||||
/* re-balance */ \
|
||||
which = (bp->__head.balance < 0); \
|
||||
b1 = which == 0? +1 : -1; \
|
||||
q = bp->__head.p[1 - which]; \
|
||||
if (q->__head.balance == b1) { \
|
||||
r = krmq_rotate1_##suf(bp, which); \
|
||||
q->__head.balance = bp->__head.balance = 0; \
|
||||
} else r = krmq_rotate2_##suf(bp, which); \
|
||||
if (bq == 0) *root_ = r; \
|
||||
else bq->__head.p[bp != bq->__head.p[0]] = r; \
|
||||
return x; \
|
||||
}
|
||||
|
||||
#define __KRMQ_ERASE(suf, __scope, __type, __head, __cmp, __lt2) \
|
||||
__scope __type *krmq_erase_##suf(__type **root_, const __type *x, unsigned *cnt_) { \
|
||||
__type *p, *path[KRMQ_MAX_DEPTH], fake; \
|
||||
unsigned char dir[KRMQ_MAX_DEPTH]; \
|
||||
int i, d = 0, cmp; \
|
||||
unsigned cnt = 0; \
|
||||
fake = **root_, fake.__head.p[0] = *root_, fake.__head.p[1] = 0; \
|
||||
if (cnt_) *cnt_ = 0; \
|
||||
if (x) { \
|
||||
for (cmp = -1, p = &fake; cmp; cmp = __cmp(x, p)) { \
|
||||
int which = (cmp > 0); \
|
||||
if (cmp > 0) cnt += krmq_size_child(__head, p, 0) + 1; \
|
||||
dir[d] = which; \
|
||||
path[d++] = p; \
|
||||
p = p->__head.p[which]; \
|
||||
if (p == 0) { \
|
||||
if (cnt_) *cnt_ = 0; \
|
||||
return 0; \
|
||||
} \
|
||||
} \
|
||||
cnt += krmq_size_child(__head, p, 0) + 1; /* because p==x is not counted */ \
|
||||
} else { \
|
||||
for (p = &fake, cnt = 1; p; p = p->__head.p[0]) \
|
||||
dir[d] = 0, path[d++] = p; \
|
||||
p = path[--d]; \
|
||||
} \
|
||||
if (cnt_) *cnt_ = cnt; \
|
||||
for (i = 1; i < d; ++i) --path[i]->__head.size; \
|
||||
if (p->__head.p[1] == 0) { /* ((1,.)2,3)4 => (1,3)4; p=2 */ \
|
||||
path[d-1]->__head.p[dir[d-1]] = p->__head.p[0]; \
|
||||
} else { \
|
||||
__type *q = p->__head.p[1]; \
|
||||
if (q->__head.p[0] == 0) { /* ((1,2)3,4)5 => ((1)2,4)5; p=3,q=2 */ \
|
||||
q->__head.p[0] = p->__head.p[0]; \
|
||||
q->__head.balance = p->__head.balance; \
|
||||
path[d-1]->__head.p[dir[d-1]] = q; \
|
||||
path[d] = q, dir[d++] = 1; \
|
||||
q->__head.size = p->__head.size - 1; \
|
||||
} else { /* ((1,((.,2)3,4)5)6,7)8 => ((1,(2,4)5)3,7)8; p=6 */ \
|
||||
__type *r; \
|
||||
int e = d++; /* backup _d_ */\
|
||||
for (;;) { \
|
||||
dir[d] = 0; \
|
||||
path[d++] = q; \
|
||||
r = q->__head.p[0]; \
|
||||
if (r->__head.p[0] == 0) break; \
|
||||
q = r; \
|
||||
} \
|
||||
r->__head.p[0] = p->__head.p[0]; \
|
||||
q->__head.p[0] = r->__head.p[1]; \
|
||||
r->__head.p[1] = p->__head.p[1]; \
|
||||
r->__head.balance = p->__head.balance; \
|
||||
path[e-1]->__head.p[dir[e-1]] = r; \
|
||||
path[e] = r, dir[e] = 1; \
|
||||
for (i = e + 1; i < d; ++i) --path[i]->__head.size; \
|
||||
r->__head.size = p->__head.size - 1; \
|
||||
} \
|
||||
} \
|
||||
for (i = d - 1; i >= 0; --i) /* not sure why adding condition "path[i]->__head.s==p" doesn't work */ \
|
||||
krmq_update_min_##suf(path[i], path[i]->__head.p[0], path[i]->__head.p[1]); \
|
||||
while (--d > 0) { \
|
||||
__type *q = path[d]; \
|
||||
int which, other, b1 = 1, b2 = 2; \
|
||||
which = dir[d], other = 1 - which; \
|
||||
if (which) b1 = -b1, b2 = -b2; \
|
||||
q->__head.balance += b1; \
|
||||
if (q->__head.balance == b1) break; \
|
||||
else if (q->__head.balance == b2) { \
|
||||
__type *r = q->__head.p[other]; \
|
||||
if (r->__head.balance == -b1) { \
|
||||
path[d-1]->__head.p[dir[d-1]] = krmq_rotate2_##suf(q, which); \
|
||||
} else { \
|
||||
path[d-1]->__head.p[dir[d-1]] = krmq_rotate1_##suf(q, which); \
|
||||
if (r->__head.balance == 0) { \
|
||||
r->__head.balance = -b1; \
|
||||
q->__head.balance = b1; \
|
||||
break; \
|
||||
} else r->__head.balance = q->__head.balance = 0; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
*root_ = fake.__head.p[0]; \
|
||||
return p; \
|
||||
}
|
||||
|
||||
#define krmq_free(__type, __head, __root, __free) do { \
|
||||
__type *_p, *_q; \
|
||||
for (_p = __root; _p; _p = _q) { \
|
||||
if (_p->__head.p[0] == 0) { \
|
||||
_q = _p->__head.p[1]; \
|
||||
__free(_p); \
|
||||
} else { \
|
||||
_q = _p->__head.p[0]; \
|
||||
_p->__head.p[0] = _q->__head.p[1]; \
|
||||
_q->__head.p[1] = _p; \
|
||||
} \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define __KRMQ_ITR(suf, __scope, __type, __head, __cmp) \
|
||||
struct krmq_itr_##suf { \
|
||||
const __type *stack[KRMQ_MAX_DEPTH], **top; \
|
||||
}; \
|
||||
__scope void krmq_itr_first_##suf(const __type *root, struct krmq_itr_##suf *itr) { \
|
||||
const __type *p; \
|
||||
for (itr->top = itr->stack - 1, p = root; p; p = p->__head.p[0]) \
|
||||
*++itr->top = p; \
|
||||
} \
|
||||
__scope int krmq_itr_find_##suf(const __type *root, const __type *x, struct krmq_itr_##suf *itr) { \
|
||||
const __type *p = root; \
|
||||
itr->top = itr->stack - 1; \
|
||||
while (p != 0) { \
|
||||
int cmp; \
|
||||
*++itr->top = p; \
|
||||
cmp = __cmp(x, p); \
|
||||
if (cmp < 0) p = p->__head.p[0]; \
|
||||
else if (cmp > 0) p = p->__head.p[1]; \
|
||||
else break; \
|
||||
} \
|
||||
return p? 1 : 0; \
|
||||
} \
|
||||
__scope int krmq_itr_next_bidir_##suf(struct krmq_itr_##suf *itr, int dir) { \
|
||||
const __type *p; \
|
||||
if (itr->top < itr->stack) return 0; \
|
||||
dir = !!dir; \
|
||||
p = (*itr->top)->__head.p[dir]; \
|
||||
if (p) { /* go down */ \
|
||||
for (; p; p = p->__head.p[!dir]) \
|
||||
*++itr->top = p; \
|
||||
return 1; \
|
||||
} else { /* go up */ \
|
||||
do { \
|
||||
p = *itr->top--; \
|
||||
} while (itr->top >= itr->stack && p == (*itr->top)->__head.p[dir]); \
|
||||
return itr->top < itr->stack? 0 : 1; \
|
||||
} \
|
||||
} \
|
||||
|
||||
/**
|
||||
* Insert a node to the tree
|
||||
*
|
||||
* @param suf name suffix used in KRMQ_INIT()
|
||||
* @param proot pointer to the root of the tree (in/out: root may change)
|
||||
* @param x node to insert (in)
|
||||
* @param cnt number of nodes smaller than or equal to _x_; can be NULL (out)
|
||||
*
|
||||
* @return _x_ if not present in the tree, or the node equal to x.
|
||||
*/
|
||||
#define krmq_insert(suf, proot, x, cnt) krmq_insert_##suf(proot, x, cnt)
|
||||
|
||||
/**
|
||||
* Find a node in the tree
|
||||
*
|
||||
* @param suf name suffix used in KRMQ_INIT()
|
||||
* @param root root of the tree
|
||||
* @param x node value to find (in)
|
||||
* @param cnt number of nodes smaller than or equal to _x_; can be NULL (out)
|
||||
*
|
||||
* @return node equal to _x_ if present, or NULL if absent
|
||||
*/
|
||||
#define krmq_find(suf, root, x, cnt) krmq_find_##suf(root, x, cnt)
|
||||
#define krmq_interval(suf, root, x, lower, upper) krmq_interval_##suf(root, x, lower, upper)
|
||||
#define krmq_rmq(suf, root, lo, up) krmq_rmq_##suf(root, lo, up)
|
||||
|
||||
/**
|
||||
* Delete a node from the tree
|
||||
*
|
||||
* @param suf name suffix used in KRMQ_INIT()
|
||||
* @param proot pointer to the root of the tree (in/out: root may change)
|
||||
* @param x node value to delete; if NULL, delete the first node (in)
|
||||
*
|
||||
* @return node removed from the tree if present, or NULL if absent
|
||||
*/
|
||||
#define krmq_erase(suf, proot, x, cnt) krmq_erase_##suf(proot, x, cnt)
|
||||
#define krmq_erase_first(suf, proot) krmq_erase_##suf(proot, 0, 0)
|
||||
|
||||
#define krmq_itr_t(suf) struct krmq_itr_##suf
|
||||
|
||||
/**
|
||||
* Place the iterator at the smallest object
|
||||
*
|
||||
* @param suf name suffix used in KRMQ_INIT()
|
||||
* @param root root of the tree
|
||||
* @param itr iterator
|
||||
*/
|
||||
#define krmq_itr_first(suf, root, itr) krmq_itr_first_##suf(root, itr)
|
||||
|
||||
/**
|
||||
* Place the iterator at the object equal to or greater than the query
|
||||
*
|
||||
* @param suf name suffix used in KRMQ_INIT()
|
||||
* @param root root of the tree
|
||||
* @param x query (in)
|
||||
* @param itr iterator (out)
|
||||
*
|
||||
* @return 1 if find; 0 otherwise. krmq_at(itr) is NULL if and only if query is
|
||||
* larger than all objects in the tree
|
||||
*/
|
||||
#define krmq_itr_find(suf, root, x, itr) krmq_itr_find_##suf(root, x, itr)
|
||||
|
||||
/**
|
||||
* Move to the next object in order
|
||||
*
|
||||
* @param itr iterator (modified)
|
||||
*
|
||||
* @return 1 if there is a next object; 0 otherwise
|
||||
*/
|
||||
#define krmq_itr_next(suf, itr) krmq_itr_next_bidir_##suf(itr, 1)
|
||||
#define krmq_itr_prev(suf, itr) krmq_itr_next_bidir_##suf(itr, 0)
|
||||
|
||||
/**
|
||||
* Return the pointer at the iterator
|
||||
*
|
||||
* @param itr iterator
|
||||
*
|
||||
* @return pointer if present; NULL otherwise
|
||||
*/
|
||||
#define krmq_at(itr) ((itr)->top < (itr)->stack? 0 : *(itr)->top)
|
||||
|
||||
#define KRMQ_INIT2(suf, __scope, __type, __head, __cmp, __lt2) \
|
||||
__KRMQ_FIND(suf, __scope, __type, __head, __cmp) \
|
||||
__KRMQ_RMQ(suf, __scope, __type, __head, __cmp, __lt2) \
|
||||
__KRMQ_ROTATE(suf, __type, __head, __lt2) \
|
||||
__KRMQ_INSERT(suf, __scope, __type, __head, __cmp, __lt2) \
|
||||
__KRMQ_ERASE(suf, __scope, __type, __head, __cmp, __lt2) \
|
||||
__KRMQ_ITR(suf, __scope, __type, __head, __cmp)
|
||||
|
||||
#define KRMQ_INIT(suf, __type, __head, __cmp, __lt2) \
|
||||
KRMQ_INIT2(suf,, __type, __head, __cmp, __lt2)
|
||||
|
||||
#endif
|
||||
19
ksw2.h
19
ksw2.h
|
|
@ -16,6 +16,13 @@
|
|||
#define KSW_EZ_SPLICE_REV 0x200
|
||||
#define KSW_EZ_SPLICE_FLANK 0x400
|
||||
|
||||
// The subset of CIGAR operators used by ksw code.
|
||||
// Use MM_CIGAR_* from minimap.h if you need the full list.
|
||||
#define KSW_CIGAR_MATCH 0
|
||||
#define KSW_CIGAR_INS 1
|
||||
#define KSW_CIGAR_DEL 2
|
||||
#define KSW_CIGAR_N_SKIP 3
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
|
@ -137,13 +144,13 @@ static inline void ksw_backtrack(void *km, int is_rot, int is_rev, int min_intro
|
|||
else if (!(tmp >> (state + 2) & 1)) state = 0; // if requesting other states, _state_ stays the same if it is a continuation; otherwise, set to H
|
||||
if (state == 0) state = tmp & 7; // TODO: probably this line can be merged into the "else if" line right above; not 100% sure
|
||||
if (force_state >= 0) state = force_state;
|
||||
if (state == 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 0, 1), --i, --j; // match
|
||||
else if (state == 1 || (state == 3 && min_intron_len <= 0)) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 2, 1), --i; // deletion
|
||||
else if (state == 3 && min_intron_len > 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 3, 1), --i; // intron
|
||||
else cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 1, 1), --j; // insertion
|
||||
if (state == 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_MATCH, 1), --i, --j;
|
||||
else if (state == 1 || (state == 3 && min_intron_len <= 0)) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_DEL, 1), --i;
|
||||
else if (state == 3 && min_intron_len > 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_N_SKIP, 1), --i;
|
||||
else cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_INS, 1), --j;
|
||||
}
|
||||
if (i >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, min_intron_len > 0 && i >= min_intron_len? 3 : 2, i + 1); // first deletion
|
||||
if (j >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 1, j + 1); // first insertion
|
||||
if (i >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, min_intron_len > 0 && i >= min_intron_len? KSW_CIGAR_N_SKIP : KSW_CIGAR_DEL, i + 1); // first deletion
|
||||
if (j >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_INS, j + 1); // first insertion
|
||||
if (!is_rev)
|
||||
for (i = 0; i < n_cigar>>1; ++i) // reverse CIGAR
|
||||
tmp = cigar[i], cigar[i] = cigar[n_cigar-1-i], cigar[n_cigar-1-i] = tmp;
|
||||
|
|
|
|||
|
|
@ -4,15 +4,23 @@
|
|||
#include "ksw2.h"
|
||||
|
||||
#ifdef __SSE2__
|
||||
#ifdef USE_SIMDE
|
||||
#include <simde/x86/sse2.h>
|
||||
#else
|
||||
#include <emmintrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef KSW_SSE2_ONLY
|
||||
#undef __SSE4_1__
|
||||
#endif
|
||||
|
||||
#ifdef __SSE4_1__
|
||||
#ifdef USE_SIMDE
|
||||
#include <simde/x86/sse4.1.h>
|
||||
#else
|
||||
#include <smmintrin.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef KSW_CPU_DISPATCH
|
||||
#ifdef __SSE4_1__
|
||||
|
|
|
|||
|
|
@ -4,15 +4,22 @@
|
|||
#include "ksw2.h"
|
||||
|
||||
#ifdef __SSE2__
|
||||
#ifdef USE_SIMDE
|
||||
#include <simde/x86/sse2.h>
|
||||
#else
|
||||
#include <emmintrin.h>
|
||||
|
||||
#endif
|
||||
#ifdef KSW_SSE2_ONLY
|
||||
#undef __SSE4_1__
|
||||
#endif
|
||||
|
||||
#ifdef __SSE4_1__
|
||||
#ifdef USE_SIMDE
|
||||
#include <simde/x86/sse4.1.h>
|
||||
#else
|
||||
#include <smmintrin.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef KSW_CPU_DISPATCH
|
||||
#ifdef __SSE4_1__
|
||||
|
|
|
|||
|
|
@ -3,15 +3,23 @@
|
|||
#include "ksw2.h"
|
||||
|
||||
#ifdef __SSE2__
|
||||
#ifdef USE_SIMDE
|
||||
#include <simde/x86/sse2.h>
|
||||
#else
|
||||
#include <emmintrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef KSW_SSE2_ONLY
|
||||
#undef __SSE4_1__
|
||||
#endif
|
||||
|
||||
#ifdef __SSE4_1__
|
||||
#ifdef USE_SIMDE
|
||||
#include <simde/x86/sse4.1.h>
|
||||
#else
|
||||
#include <smmintrin.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef KSW_CPU_DISPATCH
|
||||
#ifdef __SSE4_1__
|
||||
|
|
|
|||
|
|
@ -1,9 +1,14 @@
|
|||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <emmintrin.h>
|
||||
#include "ksw2.h"
|
||||
|
||||
#ifdef USE_SIMDE
|
||||
#include <simde/x86/sse2.h>
|
||||
#else
|
||||
#include <emmintrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef __GNUC__
|
||||
#define LIKELY(x) __builtin_expect((x),1)
|
||||
#define UNLIKELY(x) __builtin_expect((x),0)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,354 @@
|
|||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <assert.h>
|
||||
#include "mmpriv.h"
|
||||
#include "kalloc.h"
|
||||
#include "krmq.h"
|
||||
|
||||
static inline float mg_log2(float x) // NB: this doesn't work when x<2
|
||||
{
|
||||
union { float f; uint32_t i; } z = { x };
|
||||
float log_2 = ((z.i >> 23) & 255) - 128;
|
||||
z.i &= ~(255 << 23);
|
||||
z.i += 127 << 23;
|
||||
log_2 += (-0.34484843f * z.f + 2.02466578f) * z.f - 0.67487759f;
|
||||
return log_2;
|
||||
}
|
||||
|
||||
uint64_t *mg_chain_backtrack(void *km, int64_t n, const int32_t *f, const int64_t *p, int32_t *v, int32_t *t, int32_t min_cnt, int32_t min_sc, int32_t *n_u_, int32_t *n_v_)
|
||||
{
|
||||
mm128_t *z;
|
||||
uint64_t *u;
|
||||
int64_t i, k, n_z, n_v;
|
||||
int32_t n_u;
|
||||
|
||||
*n_u_ = *n_v_ = 0;
|
||||
for (i = 0, n_z = 0; i < n; ++i) // precompute n_z
|
||||
if (f[i] >= min_sc) ++n_z;
|
||||
if (n_z == 0) return 0;
|
||||
KMALLOC(km, z, n_z);
|
||||
for (i = 0, k = 0; i < n; ++i) // populate z[]
|
||||
if (f[i] >= min_sc) z[k].x = f[i], z[k++].y = i;
|
||||
radix_sort_128x(z, z + n_z);
|
||||
|
||||
memset(t, 0, n * 4);
|
||||
for (k = n_z - 1, n_v = n_u = 0; k >= 0; --k) { // precompute n_u
|
||||
int64_t n_v0 = n_v;
|
||||
int32_t sc;
|
||||
for (i = z[k].y; i >= 0 && t[i] == 0; i = p[i])
|
||||
++n_v, t[i] = 1;
|
||||
sc = i < 0? z[k].x : (int32_t)z[k].x - f[i];
|
||||
if (sc >= min_sc && n_v > n_v0 && n_v - n_v0 >= min_cnt)
|
||||
++n_u;
|
||||
else n_v = n_v0;
|
||||
}
|
||||
KMALLOC(km, u, n_u);
|
||||
memset(t, 0, n * 4);
|
||||
for (k = n_z - 1, n_v = n_u = 0; k >= 0; --k) { // populate u[]
|
||||
int64_t n_v0 = n_v;
|
||||
int32_t sc;
|
||||
for (i = z[k].y; i >= 0 && t[i] == 0; i = p[i])
|
||||
v[n_v++] = i, t[i] = 1;
|
||||
sc = i < 0? z[k].x : (int32_t)z[k].x - f[i];
|
||||
if (sc >= min_sc && n_v > n_v0 && n_v - n_v0 >= min_cnt)
|
||||
u[n_u++] = (uint64_t)sc << 32 | (n_v - n_v0);
|
||||
else n_v = n_v0;
|
||||
}
|
||||
kfree(km, z);
|
||||
assert(n_v < INT32_MAX);
|
||||
*n_u_ = n_u, *n_v_ = n_v;
|
||||
return u;
|
||||
}
|
||||
|
||||
static mm128_t *compact_a(void *km, int32_t n_u, uint64_t *u, int32_t n_v, int32_t *v, mm128_t *a)
|
||||
{
|
||||
mm128_t *b, *w;
|
||||
uint64_t *u2;
|
||||
int64_t i, j, k;
|
||||
|
||||
// write the result to b[]
|
||||
KMALLOC(km, b, n_v);
|
||||
for (i = 0, k = 0; i < n_u; ++i) {
|
||||
int32_t k0 = k, ni = (int32_t)u[i];
|
||||
for (j = 0; j < ni; ++j)
|
||||
b[k++] = a[v[k0 + (ni - j - 1)]];
|
||||
}
|
||||
kfree(km, v);
|
||||
|
||||
// sort u[] and a[] by the target position, such that adjacent chains may be joined
|
||||
KMALLOC(km, w, n_u);
|
||||
for (i = k = 0; i < n_u; ++i) {
|
||||
w[i].x = b[k].x, w[i].y = (uint64_t)k<<32|i;
|
||||
k += (int32_t)u[i];
|
||||
}
|
||||
radix_sort_128x(w, w + n_u);
|
||||
KMALLOC(km, u2, n_u);
|
||||
for (i = k = 0; i < n_u; ++i) {
|
||||
int32_t j = (int32_t)w[i].y, n = (int32_t)u[j];
|
||||
u2[i] = u[j];
|
||||
memcpy(&a[k], &b[w[i].y>>32], n * sizeof(mm128_t));
|
||||
k += n;
|
||||
}
|
||||
memcpy(u, u2, n_u * 8);
|
||||
memcpy(b, a, k * sizeof(mm128_t)); // write _a_ to _b_ and deallocate _a_ because _a_ is oversized, sometimes a lot
|
||||
kfree(km, a); kfree(km, w); kfree(km, u2);
|
||||
return b;
|
||||
}
|
||||
|
||||
static inline int32_t comput_sc(const mm128_t *ai, const mm128_t *aj, int32_t max_dist_x, int32_t max_dist_y, int32_t bw, float chn_pen_gap, float chn_pen_skip, int is_cdna, int n_seg)
|
||||
{
|
||||
int32_t dq = (int32_t)ai->y - (int32_t)aj->y, dr, dd, dg, q_span, sc;
|
||||
int32_t sidi = (ai->y & MM_SEED_SEG_MASK) >> MM_SEED_SEG_SHIFT;
|
||||
int32_t sidj = (aj->y & MM_SEED_SEG_MASK) >> MM_SEED_SEG_SHIFT;
|
||||
if (dq <= 0 || dq > max_dist_x) return INT32_MIN;
|
||||
dr = (int32_t)(ai->x - aj->x);
|
||||
if (sidi == sidj && (dr == 0 || dq > max_dist_y)) return INT32_MIN;
|
||||
dd = dr > dq? dr - dq : dq - dr;
|
||||
if (sidi == sidj && dd > bw) return INT32_MIN;
|
||||
if (n_seg > 1 && !is_cdna && sidi == sidj && dr > max_dist_y) return INT32_MIN;
|
||||
dg = dr < dq? dr : dq;
|
||||
q_span = aj->y>>32&0xff;
|
||||
sc = q_span < dg? q_span : dg;
|
||||
if (dd || dg > q_span) {
|
||||
float lin_pen, log_pen;
|
||||
lin_pen = chn_pen_gap * (float)dd + chn_pen_skip * (float)dg;
|
||||
log_pen = dd >= 1? mg_log2(dd + 1) : 0.0f; // mg_log2() only works for dd>=2
|
||||
if (is_cdna || sidi != sidj) {
|
||||
if (sidi != sidj && dr == 0) ++sc; // possibly due to overlapping paired ends; give a minor bonus
|
||||
else if (dr > dq || sidi != sidj) sc -= (int)(lin_pen < log_pen? lin_pen : log_pen); // deletion or jump between paired ends
|
||||
else sc -= (int)(lin_pen + .5f * log_pen);
|
||||
} else sc -= (int)(lin_pen + .5f * log_pen);
|
||||
}
|
||||
return sc;
|
||||
}
|
||||
|
||||
/* Input:
|
||||
* a[].x: tid<<33 | rev<<32 | tpos
|
||||
* a[].y: flags<<40 | q_span<<32 | q_pos
|
||||
* Output:
|
||||
* n_u: #chains
|
||||
* u[]: score<<32 | #anchors (sum of lower 32 bits of u[] is the returned length of a[])
|
||||
* input a[] is deallocated on return
|
||||
*/
|
||||
mm128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip,
|
||||
int is_cdna, int n_seg, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km)
|
||||
{ // TODO: make sure this works when n has more than 32 bits
|
||||
int32_t *f, *t, *v, n_u, n_v, mmax_f = 0;
|
||||
int64_t *p, i, j, max_ii, st = 0, n_iter = 0;
|
||||
uint64_t *u;
|
||||
|
||||
if (_u) *_u = 0, *n_u_ = 0;
|
||||
if (n == 0 || a == 0) {
|
||||
kfree(km, a);
|
||||
return 0;
|
||||
}
|
||||
if (max_dist_x < bw) max_dist_x = bw;
|
||||
if (max_dist_y < bw && !is_cdna) max_dist_y = bw;
|
||||
KMALLOC(km, p, n);
|
||||
KMALLOC(km, f, n);
|
||||
KMALLOC(km, v, n);
|
||||
KCALLOC(km, t, n);
|
||||
|
||||
// fill the score and backtrack arrays
|
||||
for (i = 0, max_ii = -1; i < n; ++i) {
|
||||
int64_t max_j = -1, end_j;
|
||||
int32_t max_f = a[i].y>>32&0xff, n_skip = 0;
|
||||
while (st < i && (a[i].x>>32 != a[st].x>>32 || a[i].x > a[st].x + max_dist_x)) ++st;
|
||||
if (i - st > max_iter) st = i - max_iter;
|
||||
for (j = i - 1; j >= st; --j) {
|
||||
int32_t sc;
|
||||
sc = comput_sc(&a[i], &a[j], max_dist_x, max_dist_y, bw, chn_pen_gap, chn_pen_skip, is_cdna, n_seg);
|
||||
++n_iter;
|
||||
if (sc == INT32_MIN) continue;
|
||||
sc += f[j];
|
||||
if (sc > max_f) {
|
||||
max_f = sc, max_j = j;
|
||||
if (n_skip > 0) --n_skip;
|
||||
} else if (t[j] == (int32_t)i) {
|
||||
if (++n_skip > max_skip)
|
||||
break;
|
||||
}
|
||||
if (p[j] >= 0) t[p[j]] = i;
|
||||
}
|
||||
end_j = j;
|
||||
if (max_ii < 0 || a[i].x - a[max_ii].x > (int64_t)max_dist_x) {
|
||||
int32_t max = INT32_MIN;
|
||||
max_ii = -1;
|
||||
for (j = i - 1; j >= st; --j)
|
||||
if (max < f[j]) max = f[j], max_ii = j;
|
||||
}
|
||||
if (max_ii >= 0 && max_ii < end_j) {
|
||||
int32_t tmp;
|
||||
tmp = comput_sc(&a[i], &a[max_ii], max_dist_x, max_dist_y, bw, chn_pen_gap, chn_pen_skip, is_cdna, n_seg);
|
||||
if (tmp != INT32_MIN && max_f < tmp + f[max_ii])
|
||||
max_f = tmp + f[max_ii], max_j = max_ii;
|
||||
}
|
||||
f[i] = max_f, p[i] = max_j;
|
||||
v[i] = max_j >= 0 && v[max_j] > max_f? v[max_j] : max_f; // v[] keeps the peak score up to i; f[] is the score ending at i, not always the peak
|
||||
if (max_ii < 0 || (a[i].x - a[max_ii].x <= (int64_t)max_dist_x && f[max_ii] < f[i]))
|
||||
max_ii = i;
|
||||
if (mmax_f < max_f) mmax_f = max_f;
|
||||
}
|
||||
|
||||
u = mg_chain_backtrack(km, n, f, p, v, t, min_cnt, min_sc, &n_u, &n_v);
|
||||
*n_u_ = n_u, *_u = u; // NB: note that u[] may not be sorted by score here
|
||||
kfree(km, p); kfree(km, f); kfree(km, t);
|
||||
if (n_u == 0) {
|
||||
kfree(km, a); kfree(km, v);
|
||||
return 0;
|
||||
}
|
||||
return compact_a(km, n_u, u, n_v, v, a);
|
||||
}
|
||||
|
||||
typedef struct lc_elem_s {
|
||||
int32_t y;
|
||||
int64_t i;
|
||||
double pri;
|
||||
KRMQ_HEAD(struct lc_elem_s) head;
|
||||
} lc_elem_t;
|
||||
|
||||
#define lc_elem_cmp(a, b) ((a)->y < (b)->y? -1 : (a)->y > (b)->y? 1 : ((a)->i > (b)->i) - ((a)->i < (b)->i))
|
||||
#define lc_elem_lt2(a, b) ((a)->pri < (b)->pri)
|
||||
KRMQ_INIT(lc_elem, lc_elem_t, head, lc_elem_cmp, lc_elem_lt2)
|
||||
|
||||
KALLOC_POOL_INIT(rmq, lc_elem_t)
|
||||
|
||||
static inline int32_t comput_sc_simple(const mm128_t *ai, const mm128_t *aj, float chn_pen_gap, float chn_pen_skip, int32_t *exact, int32_t *width)
|
||||
{
|
||||
int32_t dq = (int32_t)ai->y - (int32_t)aj->y, dr, dd, dg, q_span, sc;
|
||||
dr = (int32_t)(ai->x - aj->x);
|
||||
*width = dd = dr > dq? dr - dq : dq - dr;
|
||||
dg = dr < dq? dr : dq;
|
||||
q_span = aj->y>>32&0xff;
|
||||
sc = q_span < dg? q_span : dg;
|
||||
if (exact) *exact = (dd == 0 && dg <= q_span);
|
||||
if (dd || dq > q_span) {
|
||||
float lin_pen, log_pen;
|
||||
lin_pen = chn_pen_gap * (float)dd + chn_pen_skip * (float)dg;
|
||||
log_pen = dd >= 1? mg_log2(dd + 1) : 0.0f; // mg_log2() only works for dd>=2
|
||||
sc -= (int)(lin_pen + .5f * log_pen);
|
||||
}
|
||||
return sc;
|
||||
}
|
||||
|
||||
mm128_t *mg_lchain_rmq(int max_dist, int max_dist_inner, int bw, int max_chn_skip, int cap_rmq_size, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip,
|
||||
int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km)
|
||||
{
|
||||
int32_t *f,*t, *v, n_u, n_v, mmax_f = 0, max_rmq_size = 0;
|
||||
int64_t *p, i, i0, st = 0, st_inner = 0, n_iter = 0;
|
||||
uint64_t *u;
|
||||
lc_elem_t *root = 0, *root_inner = 0;
|
||||
void *mem_mp = 0;
|
||||
kmp_rmq_t *mp;
|
||||
|
||||
if (_u) *_u = 0, *n_u_ = 0;
|
||||
if (n == 0 || a == 0) {
|
||||
kfree(km, a);
|
||||
return 0;
|
||||
}
|
||||
if (max_dist < bw) max_dist = bw;
|
||||
if (max_dist_inner <= 0 || max_dist_inner >= max_dist) max_dist_inner = 0;
|
||||
KMALLOC(km, p, n);
|
||||
KMALLOC(km, f, n);
|
||||
KCALLOC(km, t, n);
|
||||
KMALLOC(km, v, n);
|
||||
mem_mp = km_init2(km, 0x10000);
|
||||
mp = kmp_init_rmq(mem_mp);
|
||||
|
||||
// fill the score and backtrack arrays
|
||||
for (i = i0 = 0; i < n; ++i) {
|
||||
int64_t max_j = -1;
|
||||
int32_t q_span = a[i].y>>32&0xff, max_f = q_span;
|
||||
lc_elem_t s, *q, *r, lo, hi;
|
||||
// add in-range anchors
|
||||
if (i0 < i && a[i0].x != a[i].x) {
|
||||
int64_t j;
|
||||
for (j = i0; j < i; ++j) {
|
||||
q = kmp_alloc_rmq(mp);
|
||||
q->y = (int32_t)a[j].y, q->i = j, q->pri = -(f[j] + 0.5 * chn_pen_gap * ((int32_t)a[j].x + (int32_t)a[j].y));
|
||||
krmq_insert(lc_elem, &root, q, 0);
|
||||
if (max_dist_inner > 0) {
|
||||
r = kmp_alloc_rmq(mp);
|
||||
*r = *q;
|
||||
krmq_insert(lc_elem, &root_inner, r, 0);
|
||||
}
|
||||
}
|
||||
i0 = i;
|
||||
}
|
||||
// get rid of active chains out of range
|
||||
while (st < i && (a[i].x>>32 != a[st].x>>32 || a[i].x > a[st].x + max_dist || krmq_size(head, root) > cap_rmq_size)) {
|
||||
s.y = (int32_t)a[st].y, s.i = st;
|
||||
if ((q = krmq_find(lc_elem, root, &s, 0)) != 0) {
|
||||
q = krmq_erase(lc_elem, &root, q, 0);
|
||||
kmp_free_rmq(mp, q);
|
||||
}
|
||||
++st;
|
||||
}
|
||||
if (max_dist_inner > 0) { // similar to the block above, but applied to the inner tree
|
||||
while (st_inner < i && (a[i].x>>32 != a[st_inner].x>>32 || a[i].x > a[st_inner].x + max_dist_inner || krmq_size(head, root_inner) > cap_rmq_size)) {
|
||||
s.y = (int32_t)a[st_inner].y, s.i = st_inner;
|
||||
if ((q = krmq_find(lc_elem, root_inner, &s, 0)) != 0) {
|
||||
q = krmq_erase(lc_elem, &root_inner, q, 0);
|
||||
kmp_free_rmq(mp, q);
|
||||
}
|
||||
++st_inner;
|
||||
}
|
||||
}
|
||||
// RMQ
|
||||
lo.i = INT32_MAX, lo.y = (int32_t)a[i].y - max_dist;
|
||||
hi.i = 0, hi.y = (int32_t)a[i].y;
|
||||
if ((q = krmq_rmq(lc_elem, root, &lo, &hi)) != 0) {
|
||||
int32_t sc, exact, width, n_skip = 0;
|
||||
int64_t j = q->i;
|
||||
assert(q->y >= lo.y && q->y <= hi.y);
|
||||
sc = f[j] + comput_sc_simple(&a[i], &a[j], chn_pen_gap, chn_pen_skip, &exact, &width);
|
||||
if (width <= bw && sc > max_f) max_f = sc, max_j = j;
|
||||
if (!exact && root_inner && (int32_t)a[i].y > 0) {
|
||||
lc_elem_t *lo, *hi;
|
||||
s.y = (int32_t)a[i].y - 1, s.i = n;
|
||||
krmq_interval(lc_elem, root_inner, &s, &lo, &hi);
|
||||
if (lo) {
|
||||
const lc_elem_t *q;
|
||||
int32_t width, n_rmq_iter = 0;
|
||||
krmq_itr_t(lc_elem) itr;
|
||||
krmq_itr_find(lc_elem, root_inner, lo, &itr);
|
||||
while ((q = krmq_at(&itr)) != 0) {
|
||||
if (q->y < (int32_t)a[i].y - max_dist_inner) break;
|
||||
++n_rmq_iter;
|
||||
j = q->i;
|
||||
sc = f[j] + comput_sc_simple(&a[i], &a[j], chn_pen_gap, chn_pen_skip, 0, &width);
|
||||
if (width <= bw) {
|
||||
if (sc > max_f) {
|
||||
max_f = sc, max_j = j;
|
||||
if (n_skip > 0) --n_skip;
|
||||
} else if (t[j] == (int32_t)i) {
|
||||
if (++n_skip > max_chn_skip)
|
||||
break;
|
||||
}
|
||||
if (p[j] >= 0) t[p[j]] = i;
|
||||
}
|
||||
if (!krmq_itr_prev(lc_elem, &itr)) break;
|
||||
}
|
||||
n_iter += n_rmq_iter;
|
||||
}
|
||||
}
|
||||
}
|
||||
// set max
|
||||
assert(max_j < 0 || (a[max_j].x < a[i].x && (int32_t)a[max_j].y < (int32_t)a[i].y));
|
||||
f[i] = max_f, p[i] = max_j;
|
||||
v[i] = max_j >= 0 && v[max_j] > max_f? v[max_j] : max_f; // v[] keeps the peak score up to i; f[] is the score ending at i, not always the peak
|
||||
if (mmax_f < max_f) mmax_f = max_f;
|
||||
if (max_rmq_size < krmq_size(head, root)) max_rmq_size = krmq_size(head, root);
|
||||
}
|
||||
km_destroy(mem_mp);
|
||||
|
||||
u = mg_chain_backtrack(km, n, f, p, v, t, min_cnt, min_sc, &n_u, &n_v);
|
||||
*n_u_ = n_u, *_u = u; // NB: note that u[] may not be sorted by score here
|
||||
kfree(km, p); kfree(km, f); kfree(km, t);
|
||||
if (n_u == 0) {
|
||||
kfree(km, a); kfree(km, v);
|
||||
return 0;
|
||||
}
|
||||
return compact_a(km, n_u, u, n_v, v, a);
|
||||
}
|
||||
|
|
@ -0,0 +1 @@
|
|||
Subproject commit b30129b3b48a6823013da2b309c50a081177b6b8
|
||||
46
main.c
46
main.c
|
|
@ -7,7 +7,7 @@
|
|||
#include "mmpriv.h"
|
||||
#include "ketopt.h"
|
||||
|
||||
#define MM_VERSION "2.17-r987-qs-dirty"
|
||||
#define MM_VERSION "2.21-r1071-qs-dirty"
|
||||
|
||||
#ifdef __linux__
|
||||
#include <sys/resource.h>
|
||||
|
|
@ -70,7 +70,9 @@ static ko_longopt_t long_options[] = {
|
|||
{ "chain-gap-scale",ko_required_argument, 343 },
|
||||
{ "alt", ko_required_argument, 344 },
|
||||
{ "alt-drop", ko_required_argument, 345 },
|
||||
{ "qstrand", ko_no_argument, 346 },
|
||||
{ "mask-len", ko_required_argument, 346 },
|
||||
{ "rmq", ko_optional_argument, 347 },
|
||||
{ "qstrand", ko_no_argument, 348 },
|
||||
{ "help", ko_no_argument, 'h' },
|
||||
{ "max-intron-len", ko_required_argument, 'G' },
|
||||
{ "version", ko_no_argument, 'V' },
|
||||
|
|
@ -82,17 +84,23 @@ static ko_longopt_t long_options[] = {
|
|||
{ 0, 0, 0 }
|
||||
};
|
||||
|
||||
static inline int64_t mm_parse_num(const char *str)
|
||||
static inline int64_t mm_parse_num2(const char *str, char **q)
|
||||
{
|
||||
double x;
|
||||
char *p;
|
||||
x = strtod(str, &p);
|
||||
if (*p == 'G' || *p == 'g') x *= 1e9;
|
||||
else if (*p == 'M' || *p == 'm') x *= 1e6;
|
||||
else if (*p == 'K' || *p == 'k') x *= 1e3;
|
||||
if (*p == 'G' || *p == 'g') x *= 1e9, ++p;
|
||||
else if (*p == 'M' || *p == 'm') x *= 1e6, ++p;
|
||||
else if (*p == 'K' || *p == 'k') x *= 1e3, ++p;
|
||||
if (q) *q = p;
|
||||
return (int64_t)(x + .499);
|
||||
}
|
||||
|
||||
static inline int64_t mm_parse_num(const char *str)
|
||||
{
|
||||
return mm_parse_num2(str, 0);
|
||||
}
|
||||
|
||||
static inline void yes_or_no(mm_mapopt_t *opt, int flag, int long_idx, const char *arg, int yes_to_set)
|
||||
{
|
||||
if (yes_to_set) {
|
||||
|
|
@ -108,7 +116,7 @@ static inline void yes_or_no(mm_mapopt_t *opt, int flag, int long_idx, const cha
|
|||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:";
|
||||
const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:";
|
||||
ketopt_t o = KETOPT_INIT;
|
||||
mm_mapopt_t opt;
|
||||
mm_idxopt_t ipt;
|
||||
|
|
@ -144,7 +152,6 @@ int main(int argc, char *argv[])
|
|||
else if (c == 'k') ipt.k = atoi(o.arg);
|
||||
else if (c == 'H') ipt.flag |= MM_I_HPC;
|
||||
else if (c == 'd') fnw = o.arg; // the above are indexing related options, except -I
|
||||
else if (c == 'r') opt.bw = (int)mm_parse_num(o.arg);
|
||||
else if (c == 't') n_threads = atoi(o.arg);
|
||||
else if (c == 'v') mm_verbose = atoi(o.arg);
|
||||
else if (c == 'g') opt.max_gap = (int)mm_parse_num(o.arg);
|
||||
|
|
@ -171,6 +178,7 @@ int main(int argc, char *argv[])
|
|||
else if (c == 'C') opt.noncan = atoi(o.arg);
|
||||
else if (c == 'I') ipt.batch_size = mm_parse_num(o.arg);
|
||||
else if (c == 'K') opt.mini_batch_size = mm_parse_num(o.arg);
|
||||
else if (c == 'e') opt.occ_dist = mm_parse_num(o.arg);
|
||||
else if (c == 'R') rg = o.arg;
|
||||
else if (c == 'h') fp_help = stdout;
|
||||
else if (c == '2') opt.flag |= MM_F_2_IO_THREADS;
|
||||
|
|
@ -203,7 +211,6 @@ int main(int argc, char *argv[])
|
|||
else if (c == 327) opt.max_clip_ratio = atof(o.arg); // --max-clip-ratio
|
||||
else if (c == 328) opt.min_mid_occ = atoi(o.arg); // --min-occ-floor
|
||||
else if (c == 329) opt.flag |= MM_F_OUT_MD; // --MD
|
||||
else if (c == 330) opt.min_join_flank_ratio = atof(o.arg); // --lj-min-ratio
|
||||
else if (c == 331) opt.sc_ambi = atoi(o.arg); // --score-N
|
||||
else if (c == 332) opt.flag |= MM_F_EQX; // --eqx
|
||||
else if (c == 333) opt.flag |= MM_F_PAF_NO_HIT; // --paf-no-hit
|
||||
|
|
@ -218,8 +225,11 @@ int main(int argc, char *argv[])
|
|||
else if (c == 343) opt.chain_gap_scale = atof(o.arg); // --chain-gap-scale
|
||||
else if (c == 344) alt_list = o.arg; // --alt
|
||||
else if (c == 345) opt.alt_drop = atof(o.arg); // --alt-drop
|
||||
else if (c == 346) opt.flag |= MM_F_QSTRAND | MM_F_NO_INV; // --qstrand
|
||||
else if (c == 314) { // --frag
|
||||
else if (c == 346) opt.mask_len = mm_parse_num(o.arg); // --mask-len
|
||||
else if (c == 348) opt.flag |= MM_F_QSTRAND | MM_F_NO_INV; // --qstrand
|
||||
else if (c == 330) {
|
||||
fprintf(stderr, "[WARNING] \033[1;31m --lj-min-ratio has been deprecated.\033[0m\n");
|
||||
} else if (c == 314) { // --frag
|
||||
yes_or_no(&opt, MM_F_FRAG_MODE, o.longidx, o.arg, 1);
|
||||
} else if (c == 315) { // --secondary
|
||||
yes_or_no(&opt, MM_F_NO_PRINT_2ND, o.longidx, o.arg, 0);
|
||||
|
|
@ -240,6 +250,8 @@ int main(int argc, char *argv[])
|
|||
yes_or_no(&opt, MM_F_HEAP_SORT, o.longidx, o.arg, 1);
|
||||
} else if (c == 326) { // --dual
|
||||
yes_or_no(&opt, MM_F_NO_DUAL, o.longidx, o.arg, 0);
|
||||
} else if (c == 347) { // --rmq
|
||||
yes_or_no(&opt, MM_F_RMQ, o.longidx, o.arg, 1);
|
||||
} else if (c == 'S') {
|
||||
opt.flag |= MM_F_OUT_CS | MM_F_CIGAR | MM_F_OUT_CS_LONG;
|
||||
if (mm_verbose >= 2)
|
||||
|
|
@ -247,6 +259,12 @@ int main(int argc, char *argv[])
|
|||
} else if (c == 'V') {
|
||||
puts(MM_VERSION);
|
||||
return 0;
|
||||
} else if (c == 'r') {
|
||||
opt.bw = (int)mm_parse_num2(o.arg, &s);
|
||||
if (*s == ',') opt.bw_long = (int)mm_parse_num2(s + 1, &s);
|
||||
} else if (c == 'U') {
|
||||
opt.min_mid_occ = strtol(o.arg, &s, 10);
|
||||
if (*s == ',') opt.max_mid_occ = strtol(s + 1, &s, 10);
|
||||
} else if (c == 'f') {
|
||||
double x;
|
||||
char *p;
|
||||
|
|
@ -301,7 +319,7 @@ int main(int argc, char *argv[])
|
|||
fprintf(fp_help, " -g NUM stop chain enlongation if there are no minimizers in INT-bp [%d]\n", opt.max_gap);
|
||||
fprintf(fp_help, " -G NUM max intron length (effective with -xsplice; changing -r) [200k]\n");
|
||||
fprintf(fp_help, " -F NUM max fragment length (effective with -xsr or in the fragment mode) [800]\n");
|
||||
fprintf(fp_help, " -r NUM bandwidth used in chaining and DP-based alignment [%d]\n", opt.bw);
|
||||
fprintf(fp_help, " -r NUM[,NUM] chaining/alignment bandwidth and long-join bandwidth [%d,%d]\n", opt.bw, opt.bw_long);
|
||||
fprintf(fp_help, " -n INT minimal number of minimizers on a chain [%d]\n", opt.min_cnt);
|
||||
fprintf(fp_help, " -m INT minimal chaining score (matching bases minus log gap penalty) [%d]\n", opt.min_chain_score);
|
||||
// fprintf(fp_help, " -T INT SDUST threshold; 0 to disable SDUST [%d]\n", opt.sdust_thres); // TODO: this option is never used; might be buggy
|
||||
|
|
@ -332,7 +350,8 @@ int main(int argc, char *argv[])
|
|||
fprintf(fp_help, " --version show version number\n");
|
||||
fprintf(fp_help, " Preset:\n");
|
||||
fprintf(fp_help, " -x STR preset (always applied before other options; see minimap2.1 for details) []\n");
|
||||
fprintf(fp_help, " - map-pb/map-ont - PacBio/Nanopore vs reference mapping\n");
|
||||
fprintf(fp_help, " - map-pb/map-ont - PacBio CLR/Nanopore vs reference mapping\n");
|
||||
fprintf(fp_help, " - map-hifi - PacBio HiFi reads vs reference mapping\n");
|
||||
fprintf(fp_help, " - ava-pb/ava-ont - PacBio/Nanopore read overlap\n");
|
||||
fprintf(fp_help, " - asm5/asm10/asm20 - asm-to-ref mapping, for ~0.1/1/5%% sequence divergence\n");
|
||||
fprintf(fp_help, " - splice/splice:hq - long-read/Pacbio-CCS spliced alignment\n");
|
||||
|
|
@ -389,6 +408,7 @@ int main(int argc, char *argv[])
|
|||
if (mm_verbose >= 3) mm_idx_stat(mi);
|
||||
if (junc_bed) mm_idx_bed_read(mi, junc_bed, 1);
|
||||
if (alt_list) mm_idx_alt_read(mi, alt_list);
|
||||
if (argc - (o.ind + 1) == 0) continue; // no query files
|
||||
ret = 0;
|
||||
if (!(opt.flag & MM_F_FRAG_MODE)) {
|
||||
for (i = o.ind + 1; i < argc; ++i) {
|
||||
|
|
|
|||
89
map.c
89
map.c
|
|
@ -80,49 +80,7 @@ static void collect_minimizers(void *km, const mm_mapopt_t *opt, const mm_idx_t
|
|||
#define heap_lt(a, b) ((a).x > (b).x)
|
||||
KSORT_INIT(heap, mm128_t, heap_lt)
|
||||
|
||||
typedef struct {
|
||||
uint32_t n;
|
||||
uint32_t q_pos, q_span;
|
||||
uint32_t seg_id:31, is_tandem:1;
|
||||
const uint64_t *cr;
|
||||
} mm_match_t;
|
||||
|
||||
static mm_match_t *collect_matches(void *km, int *_n_m, int max_occ, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos)
|
||||
{
|
||||
int rep_st = 0, rep_en = 0, n_m;
|
||||
size_t i;
|
||||
mm_match_t *m;
|
||||
*n_mini_pos = 0;
|
||||
*mini_pos = (uint64_t*)kmalloc(km, mv->n * sizeof(uint64_t));
|
||||
m = (mm_match_t*)kmalloc(km, mv->n * sizeof(mm_match_t));
|
||||
for (i = 0, n_m = 0, *rep_len = 0, *n_a = 0; i < mv->n; ++i) {
|
||||
const uint64_t *cr;
|
||||
mm128_t *p = &mv->a[i];
|
||||
uint32_t q_pos = (uint32_t)p->y, q_span = p->x & 0xff;
|
||||
int t;
|
||||
cr = mm_idx_get(mi, p->x>>8, &t);
|
||||
if (t >= max_occ) {
|
||||
int en = (q_pos >> 1) + 1, st = en - q_span;
|
||||
if (st > rep_en) {
|
||||
*rep_len += rep_en - rep_st;
|
||||
rep_st = st, rep_en = en;
|
||||
} else rep_en = en;
|
||||
} else {
|
||||
mm_match_t *q = &m[n_m++];
|
||||
q->q_pos = q_pos, q->q_span = q_span, q->cr = cr, q->n = t, q->seg_id = p->y >> 32;
|
||||
q->is_tandem = 0;
|
||||
if (i > 0 && p->x>>8 == mv->a[i - 1].x>>8) q->is_tandem = 1;
|
||||
if (i < mv->n - 1 && p->x>>8 == mv->a[i + 1].x>>8) q->is_tandem = 1;
|
||||
*n_a += q->n;
|
||||
(*mini_pos)[(*n_mini_pos)++] = (uint64_t)q_span<<32 | q_pos>>1;
|
||||
}
|
||||
}
|
||||
*rep_len += rep_en - rep_st;
|
||||
*_n_m = n_m;
|
||||
return m;
|
||||
}
|
||||
|
||||
static inline int skip_seed(int flag, uint64_t r, const mm_match_t *q, const char *qname, int qlen, const mm_idx_t *mi, int *is_self)
|
||||
static inline int skip_seed(int flag, uint64_t r, const mm_seed_t *q, const char *qname, int qlen, const mm_idx_t *mi, int *is_self)
|
||||
{
|
||||
*is_self = 0;
|
||||
if (qname && (flag & (MM_F_NO_DIAG|MM_F_NO_DUAL))) {
|
||||
|
|
@ -151,10 +109,10 @@ static mm128_t *collect_seed_hits_heap(void *km, const mm_mapopt_t *opt, int max
|
|||
{
|
||||
int i, n_m, heap_size = 0;
|
||||
int64_t j, n_for = 0, n_rev = 0;
|
||||
mm_match_t *m;
|
||||
mm_seed_t *m;
|
||||
mm128_t *a, *heap;
|
||||
|
||||
m = collect_matches(km, &n_m, max_occ, mi, mv, n_a, rep_len, n_mini_pos, mini_pos);
|
||||
m = mm_collect_matches(km, &n_m, qlen, max_occ, opt->max_max_occ, opt->occ_dist, mi, mv, n_a, rep_len, n_mini_pos, mini_pos);
|
||||
|
||||
heap = (mm128_t*)kmalloc(km, n_m * sizeof(mm128_t));
|
||||
a = (mm128_t*)kmalloc(km, *n_a * sizeof(mm128_t));
|
||||
|
|
@ -168,7 +126,7 @@ static mm128_t *collect_seed_hits_heap(void *km, const mm_mapopt_t *opt, int max
|
|||
}
|
||||
ks_heapmake_heap(heap_size, heap);
|
||||
while (heap_size > 0) {
|
||||
mm_match_t *q = &m[heap->y>>32];
|
||||
mm_seed_t *q = &m[heap->y>>32];
|
||||
mm128_t *p;
|
||||
uint64_t r = heap->x;
|
||||
int32_t is_self, rpos = (uint32_t)r >> 1;
|
||||
|
|
@ -216,12 +174,12 @@ static mm128_t *collect_seed_hits(void *km, const mm_mapopt_t *opt, int max_occ,
|
|||
int *n_mini_pos, uint64_t **mini_pos)
|
||||
{
|
||||
int i, n_m;
|
||||
mm_match_t *m;
|
||||
mm_seed_t *m;
|
||||
mm128_t *a;
|
||||
m = collect_matches(km, &n_m, max_occ, mi, mv, n_a, rep_len, n_mini_pos, mini_pos);
|
||||
m = mm_collect_matches(km, &n_m, qlen, max_occ, opt->max_max_occ, opt->occ_dist, mi, mv, n_a, rep_len, n_mini_pos, mini_pos);
|
||||
a = (mm128_t*)kmalloc(km, *n_a * sizeof(mm128_t));
|
||||
for (i = 0, *n_a = 0; i < n_m; ++i) {
|
||||
mm_match_t *q = &m[i];
|
||||
mm_seed_t *q = &m[i];
|
||||
const uint64_t *r = q->cr;
|
||||
uint32_t k;
|
||||
for (k = 0; k < q->n; ++k) {
|
||||
|
|
@ -253,11 +211,9 @@ static mm128_t *collect_seed_hits(void *km, const mm_mapopt_t *opt, int max_occ,
|
|||
static void chain_post(const mm_mapopt_t *opt, int max_chain_gap_ref, const mm_idx_t *mi, void *km, int qlen, int n_segs, const int *qlens, int *n_regs, mm_reg1_t *regs, mm128_t *a)
|
||||
{
|
||||
if (!(opt->flag & MM_F_ALL_CHAINS)) { // don't choose primary mapping(s)
|
||||
mm_set_parent(km, opt->mask_level, *n_regs, regs, opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop);
|
||||
mm_set_parent(km, opt->mask_level, opt->mask_len, *n_regs, regs, opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop);
|
||||
if (n_segs <= 1) mm_select_sub(km, opt->pri_ratio, mi->k*2, opt->best_n, n_regs, regs);
|
||||
else mm_select_sub_multi(km, opt->pri_ratio, 0.2f, 0.7f, max_chain_gap_ref, mi->k*2, opt->best_n, n_segs, qlens, n_regs, regs);
|
||||
if (!(opt->flag & (MM_F_SPLICE|MM_F_SR|MM_F_NO_LJOIN))) // long join not working well without primary chains
|
||||
mm_join_long(km, opt, qlen, n_regs, regs, a);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -266,7 +222,7 @@ static mm_reg1_t *align_regs(const mm_mapopt_t *opt, const mm_idx_t *mi, void *k
|
|||
if (!(opt->flag & MM_F_CIGAR)) return regs;
|
||||
regs = mm_align_skeleton(km, opt, mi, qlen, seq, n_regs, regs, a); // this calls mm_filter_regs()
|
||||
if (!(opt->flag & MM_F_ALL_CHAINS)) { // don't choose primary mapping(s)
|
||||
mm_set_parent(km, opt->mask_level, *n_regs, regs, opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop);
|
||||
mm_set_parent(km, opt->mask_level, opt->mask_len, *n_regs, regs, opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop);
|
||||
mm_select_sub(km, opt->pri_ratio, mi->k*2, opt->best_n, n_regs, regs);
|
||||
mm_set_sam_pri(*n_regs, regs);
|
||||
}
|
||||
|
|
@ -317,9 +273,25 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
|
|||
if (max_chain_gap_ref < opt->max_gap) max_chain_gap_ref = opt->max_gap;
|
||||
} else max_chain_gap_ref = opt->max_gap;
|
||||
|
||||
a = mm_chain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, opt->chain_gap_scale, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km);
|
||||
if (opt->flag & MM_F_RMQ) {
|
||||
a = mg_lchain_rmq(opt->max_gap, opt->rmq_inner_dist, opt->bw, opt->max_chain_skip, opt->rmq_size_cap, opt->min_cnt, opt->min_chain_score,
|
||||
opt->chain_gap_scale * 0.01 * mi->k, 0.0f, n_a, a, &n_regs0, &u, b->km);
|
||||
} else {
|
||||
a = mg_lchain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score,
|
||||
opt->chain_gap_scale * 0.01 * mi->k, 0.0f, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km);
|
||||
}
|
||||
|
||||
if (opt->max_occ > opt->mid_occ && rep_len > 0) {
|
||||
if (opt->bw_long > opt->bw && (opt->flag & (MM_F_SPLICE|MM_F_SR|MM_F_NO_LJOIN)) == 0 && n_segs == 1 && n_regs0 > 1) { // re-chain/long-join for long sequences
|
||||
int32_t st = (int32_t)a[0].y, en = (int32_t)a[(int32_t)u[0] - 1].y;
|
||||
if (qlen_sum - (en - st) > opt->rmq_rescue_size || en - st > qlen_sum * opt->rmq_rescue_ratio) {
|
||||
int32_t i;
|
||||
for (i = 0, n_a = 0; i < n_regs0; ++i) n_a += (int32_t)u[i];
|
||||
kfree(b->km, u);
|
||||
radix_sort_128x(a, a + n_a);
|
||||
a = mg_lchain_rmq(opt->max_gap, opt->rmq_inner_dist, opt->bw_long, opt->max_chain_skip, opt->rmq_size_cap, opt->min_cnt, opt->min_chain_score,
|
||||
opt->chain_gap_scale * 0.01 * mi->k, 0.0f, n_a, a, &n_regs0, &u, b->km);
|
||||
}
|
||||
} else if (opt->max_occ > opt->mid_occ && rep_len > 0 && !(opt->flag & MM_F_RMQ)) { // re-chain, mostly for short reads
|
||||
int rechain = 0;
|
||||
if (n_regs0 > 0) { // test if the best chain has all the segments
|
||||
int n_chained_segs = 1, max = 0, max_i = -1, max_off = -1, off = 0;
|
||||
|
|
@ -339,7 +311,8 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
|
|||
kfree(b->km, mini_pos);
|
||||
if (opt->flag & MM_F_HEAP_SORT) a = collect_seed_hits_heap(b->km, opt, opt->max_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos);
|
||||
else a = collect_seed_hits(b->km, opt, opt->max_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos);
|
||||
a = mm_chain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, opt->chain_gap_scale, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km);
|
||||
a = mg_lchain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score,
|
||||
opt->chain_gap_scale * 0.01 * mi->k, 0.0f, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km);
|
||||
}
|
||||
}
|
||||
b->frag_gap = max_chain_gap_ref;
|
||||
|
|
@ -370,7 +343,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
|
|||
seg = mm_seg_gen(b->km, hash, n_segs, qlens, n_regs0, regs0, n_regs, regs, a); // split fragment chain to separate segment chains
|
||||
free(regs0);
|
||||
for (i = 0; i < n_segs; ++i) {
|
||||
mm_set_parent(b->km, opt->mask_level, n_regs[i], regs[i], opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop); // update mm_reg1_t::parent
|
||||
mm_set_parent(b->km, opt->mask_level, opt->mask_len, n_regs[i], regs[i], opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop); // update mm_reg1_t::parent
|
||||
regs[i] = align_regs(opt, mi, b->km, qlens[i], seqs[i], &n_regs[i], regs[i], seg[i].a);
|
||||
mm_set_mapq(b->km, n_regs[i], regs[i], opt->min_chain_score, opt->a, rep_len, is_sr);
|
||||
}
|
||||
|
|
@ -514,7 +487,7 @@ static void merge_hits(step_t *s)
|
|||
}
|
||||
}
|
||||
mm_hit_sort(km, &s->n_reg[k], s->reg[k], opt->alt_drop);
|
||||
mm_set_parent(km, opt->mask_level, s->n_reg[k], s->reg[k], opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop);
|
||||
mm_set_parent(km, opt->mask_level, opt->mask_len, s->n_reg[k], s->reg[k], opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop);
|
||||
if (!(opt->flag & MM_F_ALL_CHAINS)) {
|
||||
mm_select_sub(km, opt->pri_ratio, s->p->mi->k*2, opt->best_n, &s->n_reg[k], s->reg[k]);
|
||||
mm_set_sam_pri(s->n_reg[k], s->reg[k]);
|
||||
|
|
|
|||
31
minimap.h
31
minimap.h
|
|
@ -36,8 +36,9 @@
|
|||
#define MM_F_NO_END_FLT 0x10000000
|
||||
#define MM_F_HARD_MLEVEL 0x20000000
|
||||
#define MM_F_SAM_HIT_ONLY 0x40000000
|
||||
#define MM_F_QSTRAND (0x80000000LL)
|
||||
#define MM_F_NO_INV (0x100000000LL)
|
||||
#define MM_F_RMQ 0x80000000LL
|
||||
#define MM_F_QSTRAND (0x100000000LL)
|
||||
#define MM_F_NO_INV (0x200000000LL)
|
||||
|
||||
#define MM_I_HPC 0x1
|
||||
#define MM_I_NO_SEQ 0x2
|
||||
|
|
@ -47,6 +48,18 @@
|
|||
|
||||
#define MM_MAX_SEG 255
|
||||
|
||||
#define MM_CIGAR_MATCH 0
|
||||
#define MM_CIGAR_INS 1
|
||||
#define MM_CIGAR_DEL 2
|
||||
#define MM_CIGAR_N_SKIP 3
|
||||
#define MM_CIGAR_SOFTCLIP 4
|
||||
#define MM_CIGAR_HARDCLIP 5
|
||||
#define MM_CIGAR_PADDING 6
|
||||
#define MM_CIGAR_EQ_MATCH 7
|
||||
#define MM_CIGAR_X_MISMATCH 8
|
||||
|
||||
#define MM_CIGAR_STR "MIDNSHP=XB"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
|
@ -115,22 +128,22 @@ typedef struct {
|
|||
|
||||
int max_qlen; // max query length
|
||||
|
||||
int bw; // bandwidth
|
||||
int bw, bw_long; // bandwidth
|
||||
int max_gap, max_gap_ref; // break a chain if there are no minimizers in a max_gap window
|
||||
int max_frag_len;
|
||||
int max_chain_skip, max_chain_iter;
|
||||
int min_cnt; // min number of minimizers on each chain
|
||||
int min_chain_score; // min chaining score
|
||||
float chain_gap_scale;
|
||||
int rmq_size_cap, rmq_inner_dist;
|
||||
int rmq_rescue_size;
|
||||
float rmq_rescue_ratio;
|
||||
|
||||
float mask_level;
|
||||
int mask_len;
|
||||
float pri_ratio;
|
||||
int best_n; // top best_n chains are subjected to DP alignment
|
||||
|
||||
int max_join_long, max_join_short;
|
||||
int min_join_flank_sc;
|
||||
float min_join_flank_ratio;
|
||||
|
||||
float alt_drop;
|
||||
|
||||
int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties
|
||||
|
|
@ -147,9 +160,9 @@ typedef struct {
|
|||
int pe_ori, pe_bonus;
|
||||
|
||||
float mid_occ_frac; // only used by mm_mapopt_update(); see below
|
||||
int32_t min_mid_occ;
|
||||
int32_t min_mid_occ, max_mid_occ;
|
||||
int32_t mid_occ; // ignore seeds with occurrences above this threshold
|
||||
int32_t max_occ;
|
||||
int32_t max_occ, max_max_occ, occ_dist;
|
||||
int64_t mini_batch_size; // size of a batch of query bases to process in parallel
|
||||
int64_t max_sw_mat;
|
||||
|
||||
|
|
|
|||
126
minimap2.1
126
minimap2.1
|
|
@ -1,4 +1,4 @@
|
|||
.TH minimap2 1 "4 May 2019" "minimap2-2.17 (r941)" "Bioinformatics tools"
|
||||
.TH minimap2 1 "6 July 2021" "minimap2-2.21 (r1071)" "Bioinformatics tools"
|
||||
.SH NAME
|
||||
.PP
|
||||
minimap2 - mapping and alignment between collections of DNA sequences
|
||||
|
|
@ -145,22 +145,32 @@ or
|
|||
.B -xsr
|
||||
mode, which sets the threshold for a second round of seeding.
|
||||
.TP
|
||||
.BI --min-occ-floor \ INT
|
||||
Force minimap2 to always use k-mers occurring
|
||||
.BI -U \ INT1 [, INT2 ]
|
||||
Lower and upper bounds of k-mer occurrences [10,1000000]. The final k-mer occurrence threshold is
|
||||
.RI max{ INT1 ,\ min{ INT2 ,
|
||||
.BR -f }}.
|
||||
This option prevents excessively small or large
|
||||
.B -f
|
||||
estimated from the input reference. It deprecates
|
||||
.B --min-occ-floor
|
||||
in earlier versions of minimap2.
|
||||
.TP
|
||||
.BI -e \ INT
|
||||
Sample a high-frequency minimizer every
|
||||
.I INT
|
||||
times or less [0]. In effect, the max occurrence threshold is set to
|
||||
the
|
||||
.RI max{ INT ,
|
||||
.BR -f }.
|
||||
basepairs [500].
|
||||
.TP
|
||||
.BI -g \ INT
|
||||
.BI -g \ NUM
|
||||
Stop chain enlongation if there are no minimizers within
|
||||
.IR INT -bp
|
||||
[10000].
|
||||
.IR NUM -bp
|
||||
[10k].
|
||||
.TP
|
||||
.BI -r \ INT
|
||||
Bandwidth used in chaining and DP-based alignment [500]. This option
|
||||
approximately controls the maximum gap size.
|
||||
.BI -r \ NUM1 [, NUM2 ]
|
||||
Bandwidth for chaining and base alignment [500,20k].
|
||||
.I NUM1
|
||||
is used for initial chaining and alignment extension;
|
||||
.I NUM2
|
||||
for RMQ-based re-chaining and closing gaps in alignments.
|
||||
.TP
|
||||
.BI -n \ INT
|
||||
Discard chains consisting of
|
||||
|
|
@ -234,10 +244,21 @@ Mark as secondary a chain that overlaps with a better chain by
|
|||
.I FLOAT
|
||||
or more of the shorter chain [0.5]
|
||||
.TP
|
||||
.BR --rmq = no | yes
|
||||
Use the minigraph chaining algorithm [no]. The minigraph algorithm is better
|
||||
for aligning contigs through long INDELs.
|
||||
.TP
|
||||
.B --hard-mask-level
|
||||
Honor option
|
||||
.B -M
|
||||
and disable a heurstic to save unmapped subsequences.
|
||||
and disable a heurstic to save unmapped subsequences and disables
|
||||
.BR --mask-len .
|
||||
.TP
|
||||
.BI --mask-len \ NUM
|
||||
Keep an alignment if dropping it leaves an unaligned region on query longer than
|
||||
.IR INT
|
||||
[inf]. Effective without
|
||||
.BR --hard-mask-level .
|
||||
.TP
|
||||
.BI --max-chain-skip \ INT
|
||||
A heuristics that stops chaining early [25]. Minimap2 uses dynamic programming
|
||||
|
|
@ -261,10 +282,6 @@ Disable the long gap patching heuristic. When this option is applied, the
|
|||
maximum alignment gap is mostly controlled by
|
||||
.BR -r .
|
||||
.TP
|
||||
.BI --lj-min-ratio \ FLOAT
|
||||
Fraction of query sequence length required to bridge a long gap [0.5]. A
|
||||
smaller value helps to recover longer gaps, at the cost of more false gaps.
|
||||
.TP
|
||||
.B --splice
|
||||
Enable the splice alignment mode.
|
||||
.TP
|
||||
|
|
@ -384,7 +401,7 @@ BED12 file can be converted from GTF/GFF3 with `paftools.js gff2bed anno.gtf'
|
|||
.BR --junc-bonus \ INT
|
||||
Score bonus for a splice donor or acceptor found in annotation (effective with
|
||||
.BR --junc-bed )
|
||||
[0].
|
||||
[9].
|
||||
.TP
|
||||
.BI --end-seed-pen \ INT
|
||||
Drop a terminal anchor if
|
||||
|
|
@ -405,7 +422,7 @@ alignment.
|
|||
.BI --cap-sw-mem \ NUM
|
||||
Skip alignment if the DP matrix size is above
|
||||
.IR NUM .
|
||||
Set 0 to disable [0].
|
||||
Set 0 to disable [100m].
|
||||
.SS Input/output options
|
||||
.TP 10
|
||||
.B -a
|
||||
|
|
@ -516,60 +533,47 @@ Available
|
|||
.I STR
|
||||
are:
|
||||
.RS
|
||||
.TP 8
|
||||
.B map-pb
|
||||
PacBio/Oxford Nanopore read to reference mapping
|
||||
.RB ( -Hk19 )
|
||||
.TP
|
||||
.TP 10
|
||||
.B map-ont
|
||||
Slightly more sensitive for Oxford Nanopore to reference mapping
|
||||
.RB ( -k15 ).
|
||||
For PacBio reads, HPC minimizers consistently leads to faster performance and
|
||||
more sensitive results in comparison to normal minimizers. For Oxford Nanopore
|
||||
data, normal minimizers are better, though not much. The effectiveness of HPC
|
||||
is determined by the sequencing error mode.
|
||||
Align noisy long reads of ~10% error rate to a reference genome. This is the
|
||||
default mode.
|
||||
.TP
|
||||
.B map-hifi
|
||||
Align PacBio high-fidelity (HiFi) reads to a reference genome
|
||||
.RB ( -k19
|
||||
.B -w19 -U50,500 -g10k -A1 -B4 -O6,26 -E2,1
|
||||
.BR -s200 ).
|
||||
.TP
|
||||
.B map-pb
|
||||
Align older PacBio continuous long (CLR) reads to a reference genome
|
||||
.RB ( -Hk19 ).
|
||||
.TP
|
||||
.B asm5
|
||||
Long assembly to reference mapping
|
||||
.RB ( -k19
|
||||
.B -w19 -A1 -B19 -O39,81 -E3,1 -s200 -z200 -N50
|
||||
.BR --min-occ-floor=100 ).
|
||||
.B -w19 -U50,500 --rmq -r100k -g10k -A1 -B19 -O39,81 -E3,1 -s200 -z200
|
||||
.BR -N50 ).
|
||||
Typically, the alignment will not extend to regions with 5% or higher sequence
|
||||
divergence. Only use this preset if the average divergence is far below 5%.
|
||||
.TP
|
||||
.B asm10
|
||||
Long assembly to reference mapping
|
||||
.RB ( -k19
|
||||
.B -w19 -A1 -B9 -O16,41 -E2,1 -s200 -z200 -N50
|
||||
.BR --min-occ-floor=100 ).
|
||||
.B -w19 -U50,500 --rmq -r100k -g10k -A1 -B9 -O16,41 -E2,1 -s200 -z200
|
||||
.BR -N50 ).
|
||||
Up to 10% sequence divergence.
|
||||
.TP
|
||||
.B asm20
|
||||
Long assembly to reference mapping
|
||||
.RB ( -k19
|
||||
.B -w10 -A1 -B4 -O6,26 -E2,1 -s200 -z200 -N50
|
||||
.BR --min-occ-floor=100 ).
|
||||
.B -w10 -U50,500 --rmq -r100k -g10k -A1 -B4 -O6,26 -E2,1 -s200 -z200
|
||||
.BR -N50 ).
|
||||
Up to 20% sequence divergence.
|
||||
.TP
|
||||
.B ava-pb
|
||||
PacBio all-vs-all overlap mapping
|
||||
.RB ( -Hk19
|
||||
.B -Xw5 -m100 -g10000 --max-chain-skip
|
||||
.BR 25 ).
|
||||
.TP
|
||||
.B ava-ont
|
||||
Oxford Nanopore all-vs-all overlap mapping
|
||||
.RB ( -k15
|
||||
.B -Xw5 -m100 -g10000 -r2000 --max-chain-skip
|
||||
.BR 25 ).
|
||||
Similarly, the major difference from
|
||||
.B ava-pb
|
||||
is that this preset is not using HPC minimizers.
|
||||
.TP
|
||||
.B splice
|
||||
Long-read spliced alignment
|
||||
.RB ( -k15
|
||||
.B -w5 --splice -g2000 -G200k -A1 -B2 -O2,32 -E1,0 -C9 -z200 -ub --junc-bonus=9
|
||||
.B -w5 --splice -g2k -G200k -A1 -B2 -O2,32 -E1,0 -C9 -z200 -ub --junc-bonus=9 --cap-sw-mem=0
|
||||
.BR --splice-flank=yes ).
|
||||
In the splice mode, 1) long deletions are taken as introns and represented as
|
||||
the
|
||||
|
|
@ -588,9 +592,21 @@ Long-read splice alignment for PacBio CCS reads
|
|||
.B sr
|
||||
Short single-end reads without splicing
|
||||
.RB ( -k21
|
||||
.B -w11 --sr --frag=yes -A2 -B8 -O12,32 -E2,1 -r50 -p.5 -N20 -f1000,5000 -n2 -m20
|
||||
.B -s40 -g200 -2K50m --heap-sort=yes
|
||||
.B -w11 --sr --frag=yes -A2 -B8 -O12,32 -E2,1 -r100 -p.5 -N20 -f1000,5000 -n2 -m20
|
||||
.B -s40 -g100 -2K50m --heap-sort=yes
|
||||
.BR --secondary=no ).
|
||||
.TP
|
||||
.B ava-pb
|
||||
PacBio CLR all-vs-all overlap mapping
|
||||
.RB ( -Hk19
|
||||
.B -Xw5 -e0
|
||||
.BR -m100 ).
|
||||
.TP
|
||||
.B ava-ont
|
||||
Oxford Nanopore all-vs-all overlap mapping
|
||||
.RB ( -k15
|
||||
.B -Xw5 -e0 -m100
|
||||
.BR -r2k ).
|
||||
.RE
|
||||
.SS Miscellaneous options
|
||||
.TP 10
|
||||
|
|
|
|||
1
misc.c
1
misc.c
|
|
@ -159,3 +159,4 @@ KRADIX_SORT_INIT(128x, mm128_t, sort_key_128x, 8)
|
|||
KRADIX_SORT_INIT(64, uint64_t, sort_key_64, 8)
|
||||
|
||||
KSORT_INIT_GENERIC(uint32_t)
|
||||
KSORT_INIT_GENERIC(uint64_t)
|
||||
|
|
|
|||
469
misc/paftools.js
469
misc/paftools.js
|
|
@ -1,6 +1,6 @@
|
|||
#!/usr/bin/env k8
|
||||
|
||||
var paftools_version = '2.17-r982-dirty';
|
||||
var paftools_version = '2.21-r1071';
|
||||
|
||||
/*****************************
|
||||
***** Library functions *****
|
||||
|
|
@ -962,12 +962,13 @@ function paf_asmgene(args)
|
|||
|
||||
function paf_stat(args)
|
||||
{
|
||||
var c, gap_out_len = null;
|
||||
while ((c = getopt(args, "l:")) != null)
|
||||
var c, gap_out_len = null, count_err = false;
|
||||
while ((c = getopt(args, "cl:")) != null)
|
||||
if (c == 'l') gap_out_len = parseInt(getopt.arg);
|
||||
else if (c == 'c') count_err = true;
|
||||
|
||||
if (getopt.ind == args.length) {
|
||||
print("Usage: paftools.js stat [-l gapOutLen] <in.sam>|<in.paf>");
|
||||
print("Usage: paftools.js stat [-c] [-l gapOutLen] <in.sam>|<in.paf>");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
|
|
@ -998,7 +999,7 @@ function paf_stat(args)
|
|||
if (line.charAt(0) != '@') {
|
||||
var t = line.split("\t", 12);
|
||||
var m, rs, cigar = null, is_pri = false, is_sam = false, is_rev = false, tname = null;
|
||||
var atlen = null, aqlen, qs, qe, mapq, ori_qlen;
|
||||
var atlen = null, aqlen, qs, qe, mapq, ori_qlen, NM = null;
|
||||
if (t.length < 2) continue;
|
||||
if (t[4] == '+' || t[4] == '-' || t[4] == '*') { // PAF
|
||||
if (t[4] == '*') continue; // unmapped
|
||||
|
|
@ -1006,6 +1007,8 @@ function paf_stat(args)
|
|||
++n_2nd;
|
||||
continue;
|
||||
}
|
||||
if ((m = /\tNM:i:(\d+)/.exec(line)) != null)
|
||||
NM = parseInt(m[1]);
|
||||
if ((m = /\tcg:Z:(\S+)/.exec(line)) != null)
|
||||
cigar = m[1];
|
||||
if (cigar == null) {
|
||||
|
|
@ -1027,6 +1030,8 @@ function paf_stat(args)
|
|||
++n_2nd;
|
||||
continue;
|
||||
}
|
||||
if ((m = /\tNM:i:(\d+)/.exec(line)) != null)
|
||||
NM = parseInt(m[1]);
|
||||
cigar = t[5];
|
||||
tname = t[2];
|
||||
rs = parseInt(t[3]) - 1;
|
||||
|
|
@ -1045,11 +1050,13 @@ function paf_stat(args)
|
|||
++n_seq, last = t[0];
|
||||
}
|
||||
var M = 0, tl = 0, ql = 0, clip = [0, 0], n_cigar = 0, sclip = 0;
|
||||
var n_gapo = 0, n_gap_all = 0, l_match = 0;
|
||||
while ((m = re.exec(cigar)) != null) {
|
||||
var l = parseInt(m[1]);
|
||||
++n_cigar;
|
||||
if (m[2] == 'M' || m[2] == '=' || m[2] == 'X') {
|
||||
tl += l, ql += l, M += l;
|
||||
l_match += l;
|
||||
} else if (m[2] == 'I' || m[2] == 'D') {
|
||||
var type;
|
||||
if (l < 50) type = 0;
|
||||
|
|
@ -1062,6 +1069,7 @@ function paf_stat(args)
|
|||
else tl += l, ++n_gap[1][type];
|
||||
if (gap_out_len != null && l >= gap_out_len)
|
||||
print(t[0], ql, is_rev? '-' : '+', tname, rs + tl, m[2], l);
|
||||
++n_gapo, n_gap_all += l;
|
||||
} else if (m[2] == 'N') {
|
||||
tl += l;
|
||||
} else if (m[2] == 'S') {
|
||||
|
|
@ -1079,6 +1087,12 @@ function paf_stat(args)
|
|||
qs = clip[is_rev? 1 : 0], qe = qs + ql;
|
||||
ori_qlen = clip[0] + ql + clip[1];
|
||||
}
|
||||
if (count_err && NM != null) {
|
||||
var n_mm = NM - n_gap_all;
|
||||
if (n_mm < 0) warn("WARNING: NM is smaller than the number of gaps at line " + lineno);
|
||||
if (n_mm < 0) n_mm = 0;
|
||||
print(t[0], ori_qlen, t[11], ori_qlen - (qe - qs), NM, l_match + n_gap_all, n_mm + n_gapo, l_match + n_gapo);
|
||||
}
|
||||
regs.push([qs, qe]);
|
||||
last_qlen = ori_qlen;
|
||||
}
|
||||
|
|
@ -1091,7 +1105,7 @@ function paf_stat(args)
|
|||
file.close();
|
||||
buf.destroy();
|
||||
|
||||
if (gap_out_len == null) {
|
||||
if (gap_out_len == null && !count_err) {
|
||||
print("Number of mapped sequences: " + n_seq);
|
||||
print("Number of primary alignments: " + n_pri);
|
||||
print("Number of secondary alignments: " + n_2nd);
|
||||
|
|
@ -1405,7 +1419,7 @@ function paf_view(args)
|
|||
|
||||
var s_ref = new Bytes(), s_qry = new Bytes(), s_mid = new Bytes(); // these are used to show padded alignment
|
||||
var re_cs = /([:=\-\+\*])(\d+|[A-Za-z]+)/g;
|
||||
var re_cg = /(\d+)([MIDNSH])/g;
|
||||
var re_cg = /(\d+)([MIDNSHP=X])/g;
|
||||
|
||||
var buf = new Bytes();
|
||||
var file = args[getopt.ind] == "-"? new File() : new File(args[getopt.ind]);
|
||||
|
|
@ -1885,7 +1899,7 @@ function paf_splice2bed(args)
|
|||
a.length = 0;
|
||||
}
|
||||
|
||||
var re = /(\d+)([MIDNSH])/g;
|
||||
var re = /(\d+)([MIDNSHP=X])/g;
|
||||
var c, fmt = "bed", fn_name_conv = null, keep_multi = false;
|
||||
while ((c = getopt(args, "f:n:m")) != null) {
|
||||
if (c == 'f') fmt = getopt.arg;
|
||||
|
|
@ -2355,28 +2369,43 @@ function paf_junceval(args)
|
|||
|
||||
file = getopt.ind+1 >= args.length || args[getopt.ind+1] == '-'? new File() : new File(args[getopt.ind+1]);
|
||||
var last_qname = null;
|
||||
var re_cigar = /(\d+)([MIDNSHX=])/g;
|
||||
var re_cigar = /(\d+)([MIDNSHP=X])/g;
|
||||
while (file.readline(buf) >= 0) {
|
||||
var m, t = buf.toString().split("\t");
|
||||
var ctg_name = null, cigar = null, pos = null, qname = t[0];
|
||||
|
||||
if (t[0].charAt(0) == '@') continue;
|
||||
if (chr_only && !/^(chr)?([0-9]+|X|Y)$/.test(t[2])) continue;
|
||||
var flag = parseInt(t[1]);
|
||||
if (flag&0x100) continue;
|
||||
if (first_only && last_qname == t[0]) continue;
|
||||
if (t[2] == '*') {
|
||||
if (t[4] == '+' || t[4] == '-' || t[4] == '*') { // PAF
|
||||
ctg_name = t[5], pos = parseInt(t[7]);
|
||||
var type = 'P';
|
||||
for (i = 12; i < t.length; ++i) {
|
||||
if ((m = /^(tp:A|cg:Z):(\S+)/.exec(t[i])) != null) {
|
||||
if (m[1] == 'tp:A') type = m[2];
|
||||
else cigar = m[2];
|
||||
}
|
||||
}
|
||||
if (type == 'S') continue; // secondary
|
||||
} else { // SAM
|
||||
ctg_name = t[2], pos = parseInt(t[3]) - 1, cigar = t[5];
|
||||
var flag = parseInt(t[1]);
|
||||
if (flag&0x100) continue; // secondary
|
||||
}
|
||||
|
||||
if (chr_only && !/^(chr)?([0-9]+|X|Y)$/.test(ctg_name)) continue;
|
||||
if (first_only && last_qname == qname) continue;
|
||||
if (ctg_name == '*') { // unmapped
|
||||
++n_unmapped;
|
||||
continue;
|
||||
} else {
|
||||
++n_pri;
|
||||
if (last_qname != t[0]) {
|
||||
if (last_qname != qname) {
|
||||
++n_mapped;
|
||||
last_qname = t[0];
|
||||
last_qname = qname;
|
||||
}
|
||||
}
|
||||
|
||||
var pos = parseInt(t[3]) - 1, intron = [];
|
||||
while ((m = re_cigar.exec(t[5])) != null) {
|
||||
var intron = [];
|
||||
while ((m = re_cigar.exec(cigar)) != null) {
|
||||
var len = parseInt(m[1]), op = m[2];
|
||||
if (op == 'N') {
|
||||
intron.push([pos, pos + len]);
|
||||
|
|
@ -2389,7 +2418,7 @@ function paf_junceval(args)
|
|||
}
|
||||
n_splice += intron.length;
|
||||
|
||||
var chr = anno[t[2]];
|
||||
var chr = anno[ctg_name];
|
||||
if (chr != null) {
|
||||
for (var i = 0; i < intron.length; ++i) {
|
||||
var o = Interval.find_ovlp(chr, intron[i][0], intron[i][1]);
|
||||
|
|
@ -2413,12 +2442,12 @@ function paf_junceval(args)
|
|||
x += '(' + o[j][0] + "," + o[j][1] + ')';
|
||||
}
|
||||
x += ']';
|
||||
print(type, t[0], i+1, t[2], intron[i][0], intron[i][1], x);
|
||||
print(type, qname, i+1, ctg_name, intron[i][0], intron[i][1], x);
|
||||
}
|
||||
} else {
|
||||
++n_splice_novel;
|
||||
if (print_ovlp)
|
||||
print('N', t[0], i+1, t[2], intron[i][0], intron[i][1]);
|
||||
print('N', qname, i+1, ctg_name, intron[i][0], intron[i][1]);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
|
@ -2512,6 +2541,395 @@ function paf_ov_eval(args)
|
|||
print((100 * (1 - n_missing / n_ovlp)).toFixed(2) + "% sensitivity");
|
||||
}
|
||||
|
||||
function paf_vcfstat(args)
|
||||
{
|
||||
var c, ts = { "AG":1, "GA":1, "CT":1, "TC":1 };
|
||||
while ((c = getopt(args, "")) != null) {
|
||||
}
|
||||
var buf = new Bytes();
|
||||
var file = args.length == getopt.ind? new File() : new File(args[getopt.ind]);
|
||||
var x = { sub:0, ts:0, tv:0, ins:0, del:0, ins1:0, del1:0, ins2:0, del2:0, ins50:0, del50:0, ins1k:0, del1k:0, ins7k:0, del7k:0, insinf:0, delinf:0 };
|
||||
while (file.readline(buf) >= 0) {
|
||||
var t = buf.toString().split("\t");
|
||||
if (t[0][0] == '#') continue;
|
||||
var alt = t[4].split(",");
|
||||
var ref = t[3];
|
||||
for (var i = 0; i < alt.length; ++i) {
|
||||
var a = alt[i];
|
||||
if (a[0] == '<' || a[1] == '>') continue;
|
||||
var l = ref.length < a.length? ref.length : a.length;
|
||||
for (var j = 0; j < l; ++j) {
|
||||
if (ref[j] != a[j]) {
|
||||
++x.sub;
|
||||
if (ts[ref[j] + a[j]]) ++x.ts;
|
||||
else ++x.tv;
|
||||
}
|
||||
}
|
||||
var d = a.length - ref.length;
|
||||
if (d > 0) {
|
||||
++x.ins;
|
||||
if (d == 1) ++x.ins1;
|
||||
else if (d == 2) ++x.ins2;
|
||||
else if (d < 50) ++x.ins50;
|
||||
else if (d < 1000) ++x.ins1k;
|
||||
else if (d < 7000) ++x.ins7k;
|
||||
else ++x.insinf;
|
||||
} else if (d < 0) {
|
||||
d = -d;
|
||||
++x.del;
|
||||
if (d == 1) ++x.del1;
|
||||
else if (d == 2) ++x.del2;
|
||||
else if (d < 50) ++x.del50;
|
||||
else if (d < 1000) ++x.del1k;
|
||||
else if (d < 7000) ++x.del7k;
|
||||
else ++x.delinf;
|
||||
}
|
||||
}
|
||||
}
|
||||
file.close();
|
||||
buf.destroy();
|
||||
print("# substitutions: " + x.sub);
|
||||
print("ts/tv: " + (x.ts / x.tv).toFixed(3));
|
||||
print("# insertions: " + x.ins);
|
||||
print("# 1bp insertions: " + x.ins1);
|
||||
print("# 2bp insertions: " + x.ins2);
|
||||
print("# [3,50) insertions: " + x.ins50);
|
||||
print("# [50,1000) insertions: " + x.ins1k);
|
||||
print("# [1000,7000) insertions: " + x.ins7k);
|
||||
print("# >=7000 insertions: " + x.insinf);
|
||||
print("# deletions: " + x.del);
|
||||
print("# 1bp deletions: " + x.del1);
|
||||
print("# 2bp deletions: " + x.del2);
|
||||
print("# [3,50) deletions: " + x.del50);
|
||||
print("# [50,1000) deletions: " + x.del1k);
|
||||
print("# [1000,7000) deletions: " + x.del7k);
|
||||
print("# >=7000 deletions: " + x.delinf);
|
||||
}
|
||||
|
||||
function paf_parseNum(s) {
|
||||
var m, x = null;
|
||||
if ((m = /^(\d*\.?\d*)([mMgGkK]?)/.exec(s)) != null) {
|
||||
x = parseFloat(m[1]);
|
||||
if (m[2] == 'k' || m[2] == 'K') x *= 1000;
|
||||
else if (m[2] == 'm' || m[2] == 'M') x *= 1000000;
|
||||
else if (m[2] == 'g' || m[2] == 'G') x *= 1000000000;
|
||||
}
|
||||
return Math.floor(x + .499);
|
||||
}
|
||||
|
||||
function paf_misjoin(args)
|
||||
{
|
||||
var c, min_seg_len = 1000000, max_gap = 1000000, fn_cen = null, show_long = false, show_err = false, cen_ratio = 0.5;
|
||||
var n_diff = [0, 0], n_gap = [0, 0], n_inv = [0, 0], n_inv_end = [0, 0];
|
||||
while ((c = getopt(args, "l:g:c:per:")) != null) {
|
||||
if (c == 'l') min_seg_len = paf_parseNum(getopt.arg);
|
||||
else if (c == 'g') max_gap = paf_parseNum(getopt.arg);
|
||||
else if (c == 'c') fn_cen = getopt.arg;
|
||||
else if (c == 'r') cen_ratio = parseFloat(getopt.arg);
|
||||
else if (c == 'p') show_long = true;
|
||||
else if (c == 'e') show_err = true;
|
||||
}
|
||||
if (args.length == getopt.ind) {
|
||||
print("Usage: paftools.js misjoin [options] <in.paf>");
|
||||
print("Options:");
|
||||
print(" -c FILE BED for centromeres []");
|
||||
print(" -r FLOAT count a centromeric event if overlap ratio > FLOAT [" + cen_ratio + "]");
|
||||
print(" -l NUM min alignment block length [1m]");
|
||||
print(" -g NUM max gap size [1m]");
|
||||
print(" -e output misjoins not involving centromeres");
|
||||
print(" -p output long alignment blocks for debugging");
|
||||
return;
|
||||
}
|
||||
var cen = {};
|
||||
var file, buf = new Bytes();
|
||||
if (fn_cen != null) {
|
||||
file = new File(fn_cen);
|
||||
while (file.readline(buf) >= 0) {
|
||||
var t = buf.toString().split("\t");
|
||||
if (cen[t[0]] == null) cen[t[0]] = [];
|
||||
cen[t[0]].push([parseInt(t[1]), parseInt(t[2])]);
|
||||
}
|
||||
file.close();
|
||||
}
|
||||
|
||||
function test_cen(cen, chr, st, en) {
|
||||
var b = cen[chr], len = 0;
|
||||
if (b == null) return false;
|
||||
for (var j = 0; j < b.length; ++j)
|
||||
if (b[j][0] < en && b[j][1] > st) {
|
||||
var s = b[j][0] > st? b[j][0] : st;
|
||||
var e = b[j][1] < en? b[j][1] : en;
|
||||
len += e - s;
|
||||
}
|
||||
return len < (en - st) * cen_ratio? false : true;
|
||||
}
|
||||
|
||||
function process(a) {
|
||||
var k = 0;
|
||||
for (var i = 0; i < a.length; ++i) {
|
||||
for (var j = 1; j <= 3; ++j) a[i][j] = parseInt(a[i][j]);
|
||||
for (var j = 6; j <= 11; ++j) a[i][j] = parseInt(a[i][j]);
|
||||
if (a[i][10] >= min_seg_len) a[k++] = a[i];
|
||||
}
|
||||
a.length = k;
|
||||
if (a.length == 1) return;
|
||||
a = a.sort(function(x,y){return x[2]-y[2]});
|
||||
if (show_long) for (var i = 0; i < a.length; ++i) print(a[i].join("\t"));
|
||||
for (var i = 1; i < a.length; ++i) {
|
||||
var ov = [false, false];
|
||||
ov[0] = test_cen(cen, a[i-1][5], a[i-1][7], a[i-1][8]);
|
||||
ov[1] = test_cen(cen, a[i][5], a[i][7], a[i][8]);
|
||||
if (a[i-1][5] != a[i][5]) { // different chr
|
||||
if (ov[0] || ov[1]) ++n_diff[1];
|
||||
else if (show_err) {
|
||||
print("J", a[i-1].slice(0, 12).join("\t"));
|
||||
print("J", a[i].slice(0, 12).join("\t"));
|
||||
}
|
||||
++n_diff[0];
|
||||
} else if (a[i-1][4] == a[i][4]) { // a gap
|
||||
var dq = a[i][2] - a[i-1][3];
|
||||
var dr = a[i][4] == '+'? a[i][7] - a[i-1][8] : a[i-1][7] - a[i][8];
|
||||
var gap = dr > dq? dr - dq : dq - dr;
|
||||
if (gap > max_gap) {
|
||||
if (ov[0] || ov[1]) ++n_gap[1];
|
||||
else if (show_err) {
|
||||
print("G", a[i-1].slice(0, 12).join("\t"));
|
||||
print("G", a[i].slice(0, 12).join("\t"));
|
||||
}
|
||||
++n_gap[0];
|
||||
}
|
||||
} else if (i + 1 < a.length && a[i+1][4] == a[i-1][4]) { // bracketed inversion
|
||||
if (ov[0] || ov[1]) ++n_inv[1];
|
||||
else if (show_err) {
|
||||
print("M", a[i-1].slice(0, 12).join("\t"));
|
||||
print("M", a[i].slice(0, 12).join("\t"));
|
||||
print("M", a[i+1].slice(0, 12).join("\t"));
|
||||
}
|
||||
++n_inv[0];
|
||||
++i;
|
||||
} else { // hanging inversion
|
||||
if (ov[0] || ov[1]) ++n_inv_end[1];
|
||||
++n_inv_end[0];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
file = args[getopt.ind] == "-"? new File() : new File(args[getopt.ind]);
|
||||
var a = [];
|
||||
while (file.readline(buf) >= 0) {
|
||||
var t = buf.toString().split("\t");
|
||||
if (a.length > 0 && a[0][0] != t[0]) {
|
||||
process(a);
|
||||
a.length = 0;
|
||||
}
|
||||
a.push(t);
|
||||
}
|
||||
if (a.length > 0) process(a);
|
||||
file.close();
|
||||
buf.destroy();
|
||||
print("# inter-chromosomal misjoins: " + n_diff.join(","));
|
||||
print("# intra-chromosomal gaps: " + n_gap.join(","));
|
||||
print("# candidate inversions in the middle: " + n_inv.join(","));
|
||||
print("# candidate inversions at contig ends: " + n_inv_end.join(","));
|
||||
}
|
||||
|
||||
function _paf_get_alen(t)
|
||||
{
|
||||
var svlen = null, alen = null;
|
||||
if ((m = /(^|;)SVLEN=(-?\d+)/.exec(t[7])) != null)
|
||||
svlen = parseInt(m[2]);
|
||||
var s = t[4].split(",");
|
||||
var min_abs_diff = 1<<30, max_abs_diff = 0;
|
||||
if (svlen != null && svlen != 0)
|
||||
alen = svlen, min_abs_diff = max_abs_diff = svlen > 0? svlen : -svlen;
|
||||
var rlen = t[3].length;
|
||||
for (var i = 0; i < s.length; ++i) {
|
||||
if (/^<\S+>$/.test(s[i])) continue;
|
||||
var diff = s[i].length - rlen;
|
||||
var abs_diff = diff > 0? diff : -diff;
|
||||
min_abs_diff = min_abs_diff < abs_diff? min_abs_diff : abs_diff;
|
||||
if (max_abs_diff < abs_diff)
|
||||
max_abs_diff = abs_diff, alen = diff;
|
||||
}
|
||||
return [alen, min_abs_diff, max_abs_diff];
|
||||
}
|
||||
|
||||
function paf_sveval(args)
|
||||
{
|
||||
var c, min_flt = 30, min_size = 50, max_size = 100000, win_size = 500, print_err = false, print_match = false, bed_fn = null;
|
||||
var len_diff_ratio = 0.5;
|
||||
while ((c = getopt(args, "f:i:x:w:er:pd:")) != null) {
|
||||
if (c == 'f') min_flt = paf_parseNum(getopt.arg);
|
||||
else if (c == 'i') min_size = paf_parseNum(getopt.arg);
|
||||
else if (c == 'x') max_size = paf_parseNum(getopt.arg);
|
||||
else if (c == 'w') win_size = paf_parseNum(getopt.arg);
|
||||
else if (c == 'd') len_diff_ratio = parseFloat(getopt.arg);
|
||||
else if (c == 'r') bed_fn = getopt.arg;
|
||||
else if (c == 'e') print_err = true;
|
||||
else if (c == 'p') print_match = true;
|
||||
}
|
||||
if (args.length - getopt.ind < 2) {
|
||||
print("Usage: paftools.js sveval [options] <base.vcf> <call.vcf>");
|
||||
print("Options:");
|
||||
print(" -r FILE confident region in BED []");
|
||||
print(" -f INT min length to discard [" + min_flt + "]");
|
||||
print(" -i INT min SV length [" + min_size + "]");
|
||||
print(" -x INT max SV length [" + max_size + "]");
|
||||
print(" -w INT fuzzy windown size [" + win_size + "]");
|
||||
print(" -d FLOAT max allele diff if there is a single allele in the window [" + len_diff_ratio + "]");
|
||||
print(" -e print errors");
|
||||
return;
|
||||
}
|
||||
|
||||
function read_bed(fn) {
|
||||
var buf = new Bytes();
|
||||
var file = new File(fn);
|
||||
var bed = {};
|
||||
while (file.readline(buf) >= 0) {
|
||||
var t = buf.toString().split("\t");
|
||||
if (bed[t[0]] == null) bed[t[0]] = [];
|
||||
bed[t[0]].push([parseInt(t[1]), parseInt(t[2])]);
|
||||
}
|
||||
file.close();
|
||||
buf.destroy();
|
||||
for (var x in bed) {
|
||||
Interval.sort(bed[x]);
|
||||
Interval.merge(bed[x]);
|
||||
Interval.index_end(bed[x]);
|
||||
}
|
||||
return bed;
|
||||
}
|
||||
|
||||
var bed = bed_fn != null? read_bed(bed_fn) : null;
|
||||
|
||||
function read_vcf(fn, bed) {
|
||||
var buf = new Bytes();
|
||||
var file = new File(fn);
|
||||
var v = {};
|
||||
while (file.readline(buf) >= 0) {
|
||||
var m, t = buf.toString().split("\t");
|
||||
if (t[0][0] == '#') continue;
|
||||
if (bed != null && bed[t[0]] == null) continue;
|
||||
if (t[4] == '<INV>' || t[4] == '<INVDUP>') continue; // no inversion
|
||||
if (/[\[\]]/.test(t[4])) continue; // no break points
|
||||
var st = parseInt(t[1]) - 1, en = st + t[3].length;
|
||||
// parse svlen
|
||||
var b = _paf_get_alen(t), svlen = b[0];
|
||||
var abslen = svlen == null? 0 : svlen > 0? svlen : -svlen;
|
||||
if (abslen < min_flt || abslen > max_size) continue;
|
||||
// update end
|
||||
if ((m = /(^|;)END=(\d+)/.exec(t[7])) != null)
|
||||
en = parseInt(m[2]);
|
||||
else if (svlen != null && svlen < 0)
|
||||
en = st + (-svlen);
|
||||
if (en < st) en = st;
|
||||
if (st == en) --st, ++en;
|
||||
if (bed != null && Interval.find_ovlp(bed[t[0]], st, en).length == 0) continue;
|
||||
// insert
|
||||
if (v[t[0]] == null) v[t[0]] = [];
|
||||
v[t[0]].push([st, en, svlen, abslen]);
|
||||
}
|
||||
file.close();
|
||||
buf.destroy();
|
||||
for (var x in v) {
|
||||
Interval.sort(v[x]);
|
||||
Interval.index_end(v[x]);
|
||||
}
|
||||
return v;
|
||||
}
|
||||
|
||||
function compare_vcf(v0, v1, label) {
|
||||
var m = 0, n = 0;
|
||||
for (var x in v1) {
|
||||
var a1 = v1[x], a0 = v0[x];
|
||||
for (var i = 0; i < a1.length; ++i) {
|
||||
if (a1[i][3] < min_size) continue;
|
||||
++n;
|
||||
if (a0 == null) continue;
|
||||
var ws = win_size + (a1[i][3]>>1);
|
||||
var st = a1[i][0] > ws? a1[i][0] - ws : 0;
|
||||
b = Interval.find_ovlp(a0, st, a1[i][1] + ws);
|
||||
var n_ins = 0, n_del = 0, sv_del = null, sv_ins = null;
|
||||
for (var j = 0; j < b.length; ++j) {
|
||||
if (b[j][2] < 0) ++n_del, sv_del = -b[j][2];
|
||||
else if (b[j][2] > 0) ++n_ins, sv_ins = b[j][2];
|
||||
if (print_match)
|
||||
print("MA", x, a1[i].slice(0, 3).join("\t"), b[j].slice(0, 3).join("\t"));
|
||||
}
|
||||
var match = false;
|
||||
if (a1[i][2] > 0) { // insertion
|
||||
if (n_ins == 1) {
|
||||
var diff = sv_ins - a1[i][3];
|
||||
if (diff < 0) diff = -diff;
|
||||
if (diff < min_size || diff / a1[i][3] < len_diff_ratio)
|
||||
match = true;
|
||||
} else if (n_ins > 1) match = true; // multiple insertions; ambiguous
|
||||
} else if (a1[i][2] < 0) {
|
||||
if (n_del == 1) { // deletion
|
||||
var diff = sv_del - a1[i][3];
|
||||
if (diff < 0) diff = -diff;
|
||||
if (diff < min_size || diff / a1[i][3] < len_diff_ratio)
|
||||
match = true;
|
||||
} else if (n_del > 1) match = true; // multiple deletions; ambiguous
|
||||
}
|
||||
if (match) ++m;
|
||||
else if (print_err) {
|
||||
if ((a1[i][2] > 0 && n_ins > 0) || (a1[i][2] < 0 && n_del > 0))
|
||||
print("MM", x, a1[i].slice(0, 3).join("\t"));
|
||||
print(label, x, a1[i].slice(0, 3).join("\t"));
|
||||
}
|
||||
}
|
||||
}
|
||||
return [n, m];
|
||||
}
|
||||
|
||||
var v_base = read_vcf(args[getopt.ind+0], bed);
|
||||
var v_call = read_vcf(args[getopt.ind+1], bed);
|
||||
var fn = compare_vcf(v_call, v_base, 'FN');
|
||||
var fp = compare_vcf(v_base, v_call, 'FP');
|
||||
print('SN', fn[0], fn[1], (fn[1] / fn[0]).toFixed(6));
|
||||
print('PC', fp[0], fp[1], (fp[1] / fp[0]).toFixed(6));
|
||||
print('F1', ((fn[1] / fn[0] + fp[1] / fp[0]) / 2).toFixed(6));
|
||||
}
|
||||
|
||||
function paf_vcfsel(args)
|
||||
{
|
||||
var c, min_l = 0, max_l = 1<<30;
|
||||
while ((c = getopt(args, "l:L:")) != null) {
|
||||
if (c == 'l') min_l = parseInt(getopt.arg);
|
||||
else if (c == 'L') max_l = parseInt(getopt.arg);
|
||||
}
|
||||
|
||||
var buf = new Bytes();
|
||||
if (getopt.ind == args.length) {
|
||||
print("Usage: paftools.js vcfsel [options] <in.vcf>");
|
||||
return 1;
|
||||
}
|
||||
var file = args[getopt.ind] == "-"? new File() : new File(args[getopt.ind]);
|
||||
while (file.readline(buf) >= 0) {
|
||||
var m, line = buf.toString();
|
||||
if (line[0] == '#') {
|
||||
print(line);
|
||||
continue;
|
||||
}
|
||||
var t = line.split("\t");
|
||||
var st = parseInt(t[1]), en = st + t[3].length - 1;
|
||||
if ((m = /(^|;)END=(\d+)/.exec(t[7])) != null)
|
||||
en = parseInt(m[2]);
|
||||
if (en < st) {
|
||||
warn("END is smaller than POS: " + en + " < " + st);
|
||||
en = st;
|
||||
}
|
||||
var b = _paf_get_alen(t);
|
||||
var alen = b[0], min_abs_diff = b[1], max_abs_diff = b[2];
|
||||
if (max_abs_diff < min_l || min_abs_diff > max_l)
|
||||
continue;
|
||||
print(line);
|
||||
}
|
||||
file.close();
|
||||
buf.destroy();
|
||||
}
|
||||
|
||||
/*************************
|
||||
***** main function *****
|
||||
*************************/
|
||||
|
|
@ -2529,10 +2947,13 @@ function main(args)
|
|||
print("");
|
||||
print(" stat collect basic mapping information in PAF/SAM");
|
||||
print(" asmstat collect basic assembly information");
|
||||
print(" asmgene evaluate gene completeness (EXPERIMENTAL)");
|
||||
print(" asmgene evaluate gene completeness");
|
||||
print(" misjoin evaluate large-scale misjoins");
|
||||
print(" liftover simplistic liftOver");
|
||||
print(" call call variants from asm-to-ref alignment with the cs tag");
|
||||
print(" bedcov compute the number of bases covered");
|
||||
print(" vcfstat VCF statistics");
|
||||
print(" sveval compare two SV callsets in VCF");
|
||||
print(" version print paftools.js version");
|
||||
print("");
|
||||
print(" mapeval evaluate mapping accuracy using mason2/PBSIM-simulated FASTQ");
|
||||
|
|
@ -2552,6 +2973,7 @@ function main(args)
|
|||
else if (cmd == 'stat') paf_stat(args);
|
||||
else if (cmd == 'asmstat') paf_asmstat(args);
|
||||
else if (cmd == 'asmgene') paf_asmgene(args);
|
||||
else if (cmd == 'misjoin') paf_misjoin(args);
|
||||
else if (cmd == 'liftover' || cmd == 'liftOver') paf_liftover(args);
|
||||
else if (cmd == 'vcfpair') paf_vcfpair(args);
|
||||
else if (cmd == 'call') paf_call(args);
|
||||
|
|
@ -2561,6 +2983,9 @@ function main(args)
|
|||
else if (cmd == 'pbsim2fq') paf_pbsim2fq(args);
|
||||
else if (cmd == 'junceval') paf_junceval(args);
|
||||
else if (cmd == 'ov-eval') paf_ov_eval(args);
|
||||
else if (cmd == 'vcfstat') paf_vcfstat(args);
|
||||
else if (cmd == 'sveval') paf_sveval(args);
|
||||
else if (cmd == 'vcfsel') paf_vcfsel(args);
|
||||
else if (cmd == 'version') print(paftools_version);
|
||||
else throw Error("unrecognized command: " + cmd);
|
||||
}
|
||||
|
|
|
|||
23
mmpriv.h
23
mmpriv.h
|
|
@ -36,6 +36,14 @@
|
|||
extern "C" {
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
uint32_t n;
|
||||
uint32_t q_pos;
|
||||
uint32_t q_span:31, flt:1;
|
||||
uint32_t seg_id:31, is_tandem:1;
|
||||
const uint64_t *cr;
|
||||
} mm_seed_t;
|
||||
|
||||
typedef struct {
|
||||
int n_u, n_a;
|
||||
uint64_t *u;
|
||||
|
|
@ -52,6 +60,8 @@ uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk);
|
|||
|
||||
void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p);
|
||||
|
||||
mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int max_max_occ, int dist, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos);
|
||||
|
||||
int mm_write_sam_hdr(const mm_idx_t *mi, const char *rg, const char *ver, int argc, char *argv[]);
|
||||
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag);
|
||||
void mm_write_paf3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag, int rep_len);
|
||||
|
|
@ -63,20 +73,25 @@ void mm_idxopt_init(mm_idxopt_t *opt);
|
|||
const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n);
|
||||
int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f);
|
||||
int mm_idx_getseq2(const mm_idx_t *mi, int is_rev, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq);
|
||||
mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float gap_scale, int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km);
|
||||
mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a);
|
||||
|
||||
mm_reg1_t *mm_gen_regs(void *km, uint32_t hash, int qlen, int n_u, uint64_t *u, mm128_t *a, int is_qstrand);
|
||||
|
||||
mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float gap_scale,
|
||||
int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km);
|
||||
mm128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip,
|
||||
int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km);
|
||||
mm128_t *mg_lchain_rmq(int max_dist, int max_dist_inner, int bw, int max_chn_skip, int cap_rmq_size, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip,
|
||||
int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km);
|
||||
|
||||
void mm_mark_alt(const mm_idx_t *mi, int n, mm_reg1_t *r);
|
||||
void mm_split_reg(mm_reg1_t *r, mm_reg1_t *r2, int n, int qlen, mm128_t *a, int is_qstrand);
|
||||
void mm_sync_regs(void *km, int n_regs, mm_reg1_t *regs);
|
||||
int mm_squeeze_a(void *km, int n_regs, mm_reg1_t *regs, mm128_t *a);
|
||||
int mm_set_sam_pri(int n, mm_reg1_t *r);
|
||||
void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r, int sub_diff, int hard_mask_level, float alt_diff_frac);
|
||||
void mm_set_parent(void *km, float mask_level, int mask_len, int n, mm_reg1_t *r, int sub_diff, int hard_mask_level, float alt_diff_frac);
|
||||
void mm_select_sub(void *km, float pri_ratio, int min_diff, int best_n, int *n_, mm_reg1_t *r);
|
||||
void mm_select_sub_multi(void *km, float pri_ratio, float pri1, float pri2, int max_gap_ref, int min_diff, int best_n, int n_segs, const int *qlens, int *n_, mm_reg1_t *r);
|
||||
void mm_filter_regs(const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs);
|
||||
void mm_join_long(void *km, const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs, mm128_t *a);
|
||||
void mm_hit_sort(void *km, int *n_regs, mm_reg1_t *r, float alt_diff_frac);
|
||||
void mm_set_mapq(void *km, int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len, int is_sr);
|
||||
|
||||
|
|
|
|||
93
options.c
93
options.c
|
|
@ -1,4 +1,5 @@
|
|||
#include <stdio.h>
|
||||
#include <limits.h>
|
||||
#include "mmpriv.h"
|
||||
|
||||
void mm_idxopt_init(mm_idxopt_t *opt)
|
||||
|
|
@ -15,26 +16,30 @@ void mm_mapopt_init(mm_mapopt_t *opt)
|
|||
memset(opt, 0, sizeof(mm_mapopt_t));
|
||||
opt->seed = 11;
|
||||
opt->mid_occ_frac = 2e-4f;
|
||||
opt->min_mid_occ = 10;
|
||||
opt->max_mid_occ = 1000000;
|
||||
opt->sdust_thres = 0; // no SDUST masking
|
||||
|
||||
opt->min_cnt = 3;
|
||||
opt->min_chain_score = 40;
|
||||
opt->bw = 500;
|
||||
opt->bw = 500, opt->bw_long = 20000;
|
||||
opt->max_gap = 5000;
|
||||
opt->max_gap_ref = -1;
|
||||
opt->max_chain_skip = 25;
|
||||
opt->max_chain_iter = 5000;
|
||||
opt->chain_gap_scale = 1.0f;
|
||||
opt->rmq_inner_dist = 1000;
|
||||
opt->rmq_size_cap = 100000;
|
||||
opt->rmq_rescue_size = 1000;
|
||||
opt->rmq_rescue_ratio = 0.1f;
|
||||
opt->chain_gap_scale = 0.8f;
|
||||
opt->max_max_occ = 4095;
|
||||
opt->occ_dist = 500;
|
||||
|
||||
opt->mask_level = 0.5f;
|
||||
opt->mask_len = INT_MAX;
|
||||
opt->pri_ratio = 0.8f;
|
||||
opt->best_n = 5;
|
||||
|
||||
opt->max_join_long = 20000;
|
||||
opt->max_join_short = 2000;
|
||||
opt->min_join_flank_sc = 1000;
|
||||
opt->min_join_flank_ratio = 0.5f;
|
||||
|
||||
opt->alt_drop = 0.15f;
|
||||
|
||||
opt->a = 2, opt->b = 4, opt->q = 4, opt->e = 2, opt->q2 = 24, opt->e2 = 1;
|
||||
|
|
@ -46,6 +51,7 @@ void mm_mapopt_init(mm_mapopt_t *opt)
|
|||
opt->anchor_ext_len = 20, opt->anchor_ext_shift = 6;
|
||||
opt->max_clip_ratio = 1.0f;
|
||||
opt->mini_batch_size = 500000000;
|
||||
opt->max_sw_mat = 100000000;
|
||||
|
||||
opt->pe_ori = 0; // FF
|
||||
opt->pe_bonus = 33;
|
||||
|
|
@ -55,10 +61,13 @@ void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi)
|
|||
{
|
||||
if ((opt->flag & MM_F_SPLICE_FOR) || (opt->flag & MM_F_SPLICE_REV))
|
||||
opt->flag |= MM_F_SPLICE;
|
||||
if (opt->mid_occ <= 0)
|
||||
if (opt->mid_occ <= 0) {
|
||||
opt->mid_occ = mm_idx_cal_max_occ(mi, opt->mid_occ_frac);
|
||||
if (opt->mid_occ < opt->min_mid_occ)
|
||||
opt->mid_occ = opt->min_mid_occ;
|
||||
if (opt->mid_occ < opt->min_mid_occ)
|
||||
opt->mid_occ = opt->min_mid_occ;
|
||||
if (opt->max_mid_occ > opt->min_mid_occ && opt->mid_occ > opt->max_mid_occ)
|
||||
opt->mid_occ = opt->max_mid_occ;
|
||||
}
|
||||
if (mm_verbose >= 3)
|
||||
fprintf(stderr, "[M::%s::%.3f*%.2f] mid_occ = %d\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), opt->mid_occ);
|
||||
}
|
||||
|
|
@ -66,7 +75,7 @@ void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi)
|
|||
void mm_mapopt_max_intron_len(mm_mapopt_t *opt, int max_intron_len)
|
||||
{
|
||||
if ((opt->flag & MM_F_SPLICE) && max_intron_len > 0)
|
||||
opt->max_gap_ref = opt->bw = max_intron_len;
|
||||
opt->max_gap_ref = opt->bw = opt->bw_long = max_intron_len;
|
||||
}
|
||||
|
||||
int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
|
||||
|
|
@ -74,37 +83,44 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
|
|||
if (preset == 0) {
|
||||
mm_idxopt_init(io);
|
||||
mm_mapopt_init(mo);
|
||||
} else if (strcmp(preset, "map-ont") == 0) { // this is the same as the default
|
||||
} else if (strcmp(preset, "ava-ont") == 0) {
|
||||
io->flag = 0, io->k = 15, io->w = 5;
|
||||
mo->flag |= MM_F_ALL_CHAINS | MM_F_NO_DIAG | MM_F_NO_DUAL | MM_F_NO_LJOIN;
|
||||
mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_gap = 10000, mo->max_chain_skip = 25;
|
||||
mo->bw = 2000;
|
||||
mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_chain_skip = 25;
|
||||
mo->bw = mo->bw_long = 2000;
|
||||
mo->occ_dist = 0;
|
||||
} else if (strcmp(preset, "map10k") == 0 || strcmp(preset, "map-pb") == 0) {
|
||||
io->flag |= MM_I_HPC, io->k = 19;
|
||||
} else if (strcmp(preset, "ava-pb") == 0) {
|
||||
io->flag |= MM_I_HPC, io->k = 19, io->w = 5;
|
||||
mo->flag |= MM_F_ALL_CHAINS | MM_F_NO_DIAG | MM_F_NO_DUAL | MM_F_NO_LJOIN;
|
||||
mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_gap = 10000, mo->max_chain_skip = 25;
|
||||
} else if (strcmp(preset, "map10k") == 0 || strcmp(preset, "map-pb") == 0) {
|
||||
io->flag |= MM_I_HPC, io->k = 19;
|
||||
} else if (strcmp(preset, "map-ont") == 0) {
|
||||
io->flag = 0, io->k = 15;
|
||||
} else if (strcmp(preset, "asm5") == 0) {
|
||||
mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_chain_skip = 25;
|
||||
mo->bw_long = mo->bw;
|
||||
mo->occ_dist = 0;
|
||||
} else if (strcmp(preset, "map-hifi") == 0 || strcmp(preset, "map-ccs") == 0) {
|
||||
io->flag = 0, io->k = 19, io->w = 19;
|
||||
mo->a = 1, mo->b = 19, mo->q = 39, mo->q2 = 81, mo->e = 3, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200;
|
||||
mo->min_mid_occ = 100;
|
||||
mo->max_gap = 10000;
|
||||
mo->a = 1, mo->b = 4, mo->q = 6, mo->q2 = 26, mo->e = 2, mo->e2 = 1;
|
||||
mo->occ_dist = 500;
|
||||
mo->min_mid_occ = 50, mo->max_mid_occ = 500;
|
||||
mo->min_dp_max = 200;
|
||||
mo->best_n = 50;
|
||||
} else if (strcmp(preset, "asm10") == 0) {
|
||||
} else if (strncmp(preset, "asm", 3) == 0) {
|
||||
io->flag = 0, io->k = 19, io->w = 19;
|
||||
mo->a = 1, mo->b = 9, mo->q = 16, mo->q2 = 41, mo->e = 2, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200;
|
||||
mo->min_mid_occ = 100;
|
||||
mo->min_dp_max = 200;
|
||||
mo->best_n = 50;
|
||||
} else if (strcmp(preset, "asm20") == 0) {
|
||||
io->flag = 0, io->k = 19, io->w = 10;
|
||||
mo->a = 1, mo->b = 4, mo->q = 6, mo->q2 = 26, mo->e = 2, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200;
|
||||
mo->min_mid_occ = 100;
|
||||
mo->bw = mo->bw_long = 100000;
|
||||
mo->max_gap = 10000;
|
||||
mo->flag |= MM_F_RMQ;
|
||||
mo->min_mid_occ = 50, mo->max_mid_occ = 500;
|
||||
mo->min_dp_max = 200;
|
||||
mo->best_n = 50;
|
||||
if (strcmp(preset, "asm5") == 0) {
|
||||
mo->a = 1, mo->b = 19, mo->q = 39, mo->q2 = 81, mo->e = 3, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200;
|
||||
} else if (strcmp(preset, "asm10") == 0) {
|
||||
mo->a = 1, mo->b = 9, mo->q = 16, mo->q2 = 41, mo->e = 2, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200;
|
||||
} else if (strcmp(preset, "asm20") == 0) {
|
||||
mo->a = 1, mo->b = 4, mo->q = 6, mo->q2 = 26, mo->e = 2, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200;
|
||||
io->w = 10;
|
||||
} else return -1;
|
||||
} else if (strcmp(preset, "short") == 0 || strcmp(preset, "sr") == 0) {
|
||||
io->flag = 0, io->k = 21, io->w = 11;
|
||||
mo->flag |= MM_F_SR | MM_F_FRAG_MODE | MM_F_NO_PRINT_2ND | MM_F_2_IO_THREADS | MM_F_HEAP_SORT;
|
||||
|
|
@ -114,7 +130,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
|
|||
mo->end_bonus = 10;
|
||||
mo->max_frag_len = 800;
|
||||
mo->max_gap = 100;
|
||||
mo->bw = 100;
|
||||
mo->bw = mo->bw_long = 100;
|
||||
mo->pri_ratio = 0.5f;
|
||||
mo->min_cnt = 2;
|
||||
mo->min_chain_score = 25;
|
||||
|
|
@ -126,7 +142,8 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
|
|||
} else if (strncmp(preset, "splice", 6) == 0 || strcmp(preset, "cdna") == 0) {
|
||||
io->flag = 0, io->k = 15, io->w = 5;
|
||||
mo->flag |= MM_F_SPLICE | MM_F_SPLICE_FOR | MM_F_SPLICE_REV | MM_F_SPLICE_FLANK;
|
||||
mo->max_gap = 2000, mo->max_gap_ref = mo->bw = 200000;
|
||||
mo->max_sw_mat = 0;
|
||||
mo->max_gap = 2000, mo->max_gap_ref = mo->bw = mo->bw_long = 200000;
|
||||
mo->a = 1, mo->b = 2, mo->q = 2, mo->e = 1, mo->q2 = 32, mo->e2 = 0;
|
||||
mo->noncan = 9;
|
||||
mo->junc_bonus = 9;
|
||||
|
|
@ -139,6 +156,16 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
|
|||
|
||||
int mm_check_opt(const mm_idxopt_t *io, const mm_mapopt_t *mo)
|
||||
{
|
||||
if (mo->bw > mo->bw_long) {
|
||||
if (mm_verbose >= 1)
|
||||
fprintf(stderr, "[ERROR]\033[1;31m with '-rNUM1,NUM2', NUM1 (%d) can't be larger than NUM2 (%d)\033[0m\n", mo->bw, mo->bw_long);
|
||||
return -8;
|
||||
}
|
||||
if ((mo->flag & MM_F_RMQ) && (mo->flag & (MM_F_SR|MM_F_SPLICE))) {
|
||||
if (mm_verbose >= 1)
|
||||
fprintf(stderr, "[ERROR]\033[1;31m --rmq doesn't work with --sr or --splice\033[0m\n");
|
||||
return -7;
|
||||
}
|
||||
if (mo->split_prefix && (mo->flag & (MM_F_OUT_CS|MM_F_OUT_MD))) {
|
||||
if (mm_verbose >= 1)
|
||||
fprintf(stderr, "[ERROR]\033[1;31m --cs or --MD doesn't work with --split-prefix\033[0m\n");
|
||||
|
|
|
|||
|
|
@ -144,7 +144,7 @@ properties:
|
|||
* **mlen**: length of the matching bases in the alignment, excluding ambiguous
|
||||
base matches.
|
||||
|
||||
* **NM**: number of mismatches, gaps and ambiguous poistions in the alignment
|
||||
* **NM**: number of mismatches, gaps and ambiguous positions in the alignment
|
||||
|
||||
* **trans_strand**: transcript strand. +1 if on the forward strand; -1 if on the
|
||||
reverse strand; 0 if unknown
|
||||
|
|
|
|||
|
|
@ -13,21 +13,27 @@ cdef extern from "minimap.h":
|
|||
int64_t flag
|
||||
int seed
|
||||
int sdust_thres
|
||||
|
||||
int max_qlen
|
||||
int bw
|
||||
|
||||
int bw, bw_long
|
||||
int max_gap, max_gap_ref
|
||||
int max_frag_len
|
||||
int max_chain_skip, max_chain_iter
|
||||
int min_cnt
|
||||
int min_chain_score
|
||||
float chain_gap_scale
|
||||
int rmq_size_cap, rmq_inner_dist
|
||||
int rmq_rescue_size
|
||||
float rmq_rescue_ratio
|
||||
|
||||
float mask_level
|
||||
int mask_len
|
||||
float pri_ratio
|
||||
int best_n
|
||||
int max_join_long, max_join_short
|
||||
int min_join_flank_sc
|
||||
float min_join_flank_ratio
|
||||
|
||||
float alt_drop
|
||||
|
||||
int a, b, q, e, q2, e2
|
||||
int sc_ambi
|
||||
int noncan
|
||||
|
|
@ -38,13 +44,16 @@ cdef extern from "minimap.h":
|
|||
int min_ksw_len
|
||||
int anchor_ext_len, anchor_ext_shift
|
||||
float max_clip_ratio
|
||||
|
||||
int pe_ori, pe_bonus
|
||||
|
||||
float mid_occ_frac
|
||||
int32_t min_mid_occ
|
||||
int32_t mid_occ
|
||||
int32_t max_occ
|
||||
int64_t mini_batch_size
|
||||
int64_t max_sw_mat
|
||||
|
||||
const char *split_prefix
|
||||
|
||||
int mm_set_opt(char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ from libc.stdlib cimport free
|
|||
cimport cmappy
|
||||
import sys
|
||||
|
||||
__version__ = '2.17'
|
||||
__version__ = '2.21'
|
||||
|
||||
cmappy.mm_reset_timer()
|
||||
|
||||
|
|
@ -82,7 +82,7 @@ cdef class Alignment:
|
|||
|
||||
@property
|
||||
def cigar_str(self):
|
||||
return "".join(map(lambda x: str(x[0]) + 'MIDNSH'[x[1]], self._cigar))
|
||||
return "".join(map(lambda x: str(x[0]) + 'MIDNSHP=XB'[x[1]], self._cigar))
|
||||
|
||||
def __str__(self):
|
||||
if self._strand > 0: strand = '+'
|
||||
|
|
|
|||
|
|
@ -0,0 +1,106 @@
|
|||
#include "mmpriv.h"
|
||||
#include "kalloc.h"
|
||||
#include "ksort.h"
|
||||
|
||||
mm_seed_t *mm_seed_collect_all(void *km, const mm_idx_t *mi, const mm128_v *mv, int32_t *n_m_)
|
||||
{
|
||||
mm_seed_t *m;
|
||||
size_t i;
|
||||
int32_t k;
|
||||
m = (mm_seed_t*)kmalloc(km, mv->n * sizeof(mm_seed_t));
|
||||
for (i = k = 0; i < mv->n; ++i) {
|
||||
const uint64_t *cr;
|
||||
mm_seed_t *q;
|
||||
mm128_t *p = &mv->a[i];
|
||||
uint32_t q_pos = (uint32_t)p->y, q_span = p->x & 0xff;
|
||||
int t;
|
||||
cr = mm_idx_get(mi, p->x>>8, &t);
|
||||
if (t == 0) continue;
|
||||
q = &m[k++];
|
||||
q->q_pos = q_pos, q->q_span = q_span, q->cr = cr, q->n = t, q->seg_id = p->y >> 32;
|
||||
q->is_tandem = q->flt = 0;
|
||||
if (i > 0 && p->x>>8 == mv->a[i - 1].x>>8) q->is_tandem = 1;
|
||||
if (i < mv->n - 1 && p->x>>8 == mv->a[i + 1].x>>8) q->is_tandem = 1;
|
||||
}
|
||||
*n_m_ = k;
|
||||
return m;
|
||||
}
|
||||
|
||||
#define MAX_MAX_HIGH_OCC 128
|
||||
|
||||
void mm_seed_select(int32_t n, mm_seed_t *a, int len, int max_occ, int max_max_occ, int dist)
|
||||
{ // for high-occ minimizers, choose up to max_high_occ in each high-occ streak
|
||||
extern void ks_heapdown_uint64_t(size_t i, size_t n, uint64_t*);
|
||||
extern void ks_heapmake_uint64_t(size_t n, uint64_t*);
|
||||
int32_t i, last0, m;
|
||||
uint64_t b[MAX_MAX_HIGH_OCC]; // this is to avoid a heap allocation
|
||||
|
||||
if (n == 0 || n == 1) return;
|
||||
for (i = m = 0; i < n; ++i)
|
||||
if (a[i].n > max_occ) ++m;
|
||||
if (m == 0) return; // no high-frequency k-mers; do nothing
|
||||
for (i = 0, last0 = -1; i <= n; ++i) {
|
||||
if (i == n || a[i].n <= max_occ) {
|
||||
if (i - last0 > 1) {
|
||||
int32_t ps = last0 < 0? 0 : (uint32_t)a[last0].q_pos>>1;
|
||||
int32_t pe = i == n? len : (uint32_t)a[i].q_pos>>1;
|
||||
int32_t j, k, st = last0 + 1, en = i;
|
||||
int32_t max_high_occ = (int32_t)((double)(pe - ps) / dist + .499);
|
||||
if (max_high_occ > 0) {
|
||||
if (max_high_occ > MAX_MAX_HIGH_OCC)
|
||||
max_high_occ = MAX_MAX_HIGH_OCC;
|
||||
for (j = st, k = 0; j < en && k < max_high_occ; ++j, ++k)
|
||||
b[k] = (uint64_t)a[j].n<<32 | j;
|
||||
ks_heapmake_uint64_t(k, b); // initialize the binomial heap
|
||||
for (; j < en; ++j) { // if there are more, choose top max_high_occ
|
||||
if (a[j].n < (int32_t)(b[0]>>32)) { // then update the heap
|
||||
b[0] = (uint64_t)a[j].n<<32 | j;
|
||||
ks_heapdown_uint64_t(0, k, b);
|
||||
}
|
||||
}
|
||||
for (j = 0; j < k; ++j) a[(uint32_t)b[j]].flt = 1;
|
||||
}
|
||||
for (j = st; j < en; ++j) a[j].flt ^= 1;
|
||||
for (j = st; j < en; ++j)
|
||||
if (a[j].n > max_max_occ)
|
||||
a[j].flt = 1;
|
||||
}
|
||||
last0 = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int max_max_occ, int dist, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos)
|
||||
{
|
||||
int rep_st = 0, rep_en = 0, n_m, n_m0;
|
||||
size_t i;
|
||||
mm_seed_t *m;
|
||||
*n_mini_pos = 0;
|
||||
*mini_pos = (uint64_t*)kmalloc(km, mv->n * sizeof(uint64_t));
|
||||
m = mm_seed_collect_all(km, mi, mv, &n_m0);
|
||||
if (dist > 0 && max_max_occ > max_occ) {
|
||||
mm_seed_select(n_m0, m, qlen, max_occ, max_max_occ, dist);
|
||||
} else {
|
||||
for (i = 0; i < n_m0; ++i)
|
||||
if (m[i].n > max_occ)
|
||||
m[i].flt = 1;
|
||||
}
|
||||
for (i = 0, n_m = 0, *rep_len = 0, *n_a = 0; i < n_m0; ++i) {
|
||||
mm_seed_t *q = &m[i];
|
||||
//fprintf(stderr, "X\t%d\t%d\t%d\n", q->q_pos>>1, q->n, q->flt);
|
||||
if (q->flt) {
|
||||
int en = (q->q_pos >> 1) + 1, st = en - q->q_span;
|
||||
if (st > rep_en) {
|
||||
*rep_len += rep_en - rep_st;
|
||||
rep_st = st, rep_en = en;
|
||||
} else rep_en = en;
|
||||
} else {
|
||||
*n_a += q->n;
|
||||
(*mini_pos)[(*n_mini_pos)++] = (uint64_t)q->q_span<<32 | q->q_pos>>1;
|
||||
m[n_m++] = *q;
|
||||
}
|
||||
}
|
||||
*rep_len += rep_en - rep_st;
|
||||
*_n_m = n_m;
|
||||
return m;
|
||||
}
|
||||
18
setup.py
18
setup.py
|
|
@ -4,16 +4,6 @@ except ImportError:
|
|||
from distutils.core import setup
|
||||
from distutils.extension import Extension
|
||||
|
||||
cmdclass = {}
|
||||
|
||||
try:
|
||||
from Cython.Build import build_ext
|
||||
except ImportError: # without Cython
|
||||
module_src = 'python/mappy.c'
|
||||
else: # with Cython
|
||||
module_src = 'python/mappy.pyx'
|
||||
cmdclass['build_ext'] = build_ext
|
||||
|
||||
import sys, platform
|
||||
|
||||
sys.path.append('python')
|
||||
|
|
@ -33,7 +23,7 @@ def readme():
|
|||
|
||||
setup(
|
||||
name = 'mappy',
|
||||
version = '2.17',
|
||||
version = '2.21',
|
||||
url = 'https://github.com/lh3/minimap2',
|
||||
description = 'Minimap2 python binding',
|
||||
long_description = readme(),
|
||||
|
|
@ -42,8 +32,8 @@ setup(
|
|||
license = 'MIT',
|
||||
keywords = 'sequence-alignment',
|
||||
scripts = ['python/minimap2.py'],
|
||||
ext_modules = [Extension('mappy',
|
||||
sources = [module_src, 'align.c', 'bseq.c', 'chain.c', 'format.c', 'hit.c', 'index.c', 'pe.c', 'options.c',
|
||||
ext_modules = [Extension('mappy',
|
||||
sources = ['python/mappy.pyx', 'align.c', 'bseq.c', 'lchain.c', 'seed.c', 'format.c', 'hit.c', 'index.c', 'pe.c', 'options.c',
|
||||
'ksw2_extd2_sse.c', 'ksw2_exts2_sse.c', 'ksw2_extz2_sse.c', 'ksw2_ll_sse.c',
|
||||
'kalloc.c', 'kthread.c', 'map.c', 'misc.c', 'sdust.c', 'sketch.c', 'esterr.c', 'splitidx.c'],
|
||||
depends = ['minimap.h', 'bseq.h', 'kalloc.h', 'kdq.h', 'khash.h', 'kseq.h', 'ksort.h',
|
||||
|
|
@ -62,4 +52,4 @@ setup(
|
|||
'Programming Language :: Python :: 3',
|
||||
'Intended Audience :: Science/Research',
|
||||
'Topic :: Scientific/Engineering :: Bio-Informatics'],
|
||||
cmdclass = cmdclass)
|
||||
setup_requires=["cython"])
|
||||
|
|
|
|||
Loading…
Reference in New Issue