Merge branch 'master' into utec

This commit is contained in:
Heng Li 2021-07-16 13:32:47 -04:00
commit b046052d82
39 changed files with 2143 additions and 531 deletions

21
.github/workflows/ci.yaml vendored 100644
View File

@ -0,0 +1,21 @@
name: CI
on:
push:
branches:
- master
pull_request:
jobs:
build:
runs-on: ubuntu-latest
strategy:
matrix:
compiler: [gcc, clang]
steps:
- name: Checkout minimap2
uses: actions/checkout@v2
- name: Compile with ${{ matrix.compiler }}
run: make CC=${{ matrix.compiler }}

3
.gitmodules vendored 100644
View File

@ -0,0 +1,3 @@
[submodule "lib/simde"]
path = lib/simde
url = https://github.com/nemequ/simde.git

View File

@ -19,6 +19,6 @@ matrix:
before_install: pip install cython
script: python setup.py build_ext
- language: python
python: "3.6"
python: "3.9"
before_install: pip install cython
script: python setup.py build_ext

View File

@ -4,7 +4,6 @@ include ksw2_dispatch.c
include main.c
include README.md
include sse2neon/emmintrin.h
include python/mappy.c
include python/cmappy.h
include python/cmappy.pxd
include python/mappy.pyx

View File

@ -1,7 +1,9 @@
CFLAGS= -g -Wall -O2 -Wc++-compat #-Wextra
CPPFLAGS= -DHAVE_KALLOC
INCLUDES=
OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o chain.o align.o hit.o map.o format.o pe.o esterr.o splitidx.o ksw2_ll_sse.o
OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o \
lchain.o align.o hit.o seed.o map.o format.o pe.o esterr.o splitidx.o \
ksw2_ll_sse.o
PROG= minimap2
PROG_EXTRA= sdust minimap2-lite
LIBS= -lm -lz -lpthread
@ -103,26 +105,28 @@ depend:
# DO NOT DELETE
align.o: minimap.h mmpriv.h bseq.h ksw2.h kalloc.h
align.o: minimap.h mmpriv.h bseq.h kseq.h ksw2.h kalloc.h
bseq.o: bseq.h kvec.h kalloc.h kseq.h
chain.o: minimap.h mmpriv.h bseq.h kalloc.h
esterr.o: mmpriv.h minimap.h bseq.h
esterr.o: mmpriv.h minimap.h bseq.h kseq.h
example.o: minimap.h kseq.h
format.o: kalloc.h mmpriv.h minimap.h bseq.h
hit.o: mmpriv.h minimap.h bseq.h kalloc.h khash.h
index.o: kthread.h bseq.h minimap.h mmpriv.h kvec.h kalloc.h khash.h
format.o: kalloc.h mmpriv.h minimap.h bseq.h kseq.h
hit.o: mmpriv.h minimap.h bseq.h kseq.h kalloc.h khash.h
index.o: kthread.h bseq.h minimap.h mmpriv.h kseq.h kvec.h kalloc.h khash.h
index.o: ksort.h
kalloc.o: kalloc.h
ksw2_extd2_sse.o: ksw2.h kalloc.h
ksw2_exts2_sse.o: ksw2.h kalloc.h
ksw2_extz2_sse.o: ksw2.h kalloc.h
ksw2_ll_sse.o: ksw2.h kalloc.h
kthread.o: kthread.h
main.o: bseq.h minimap.h mmpriv.h ketopt.h
map.o: kthread.h kvec.h kalloc.h sdust.h mmpriv.h minimap.h bseq.h khash.h
map.o: ksort.h
misc.o: mmpriv.h minimap.h bseq.h ksort.h
options.o: mmpriv.h minimap.h bseq.h
pe.o: mmpriv.h minimap.h bseq.h kvec.h kalloc.h ksort.h
sdust.o: kalloc.h kdq.h kvec.h ketopt.h sdust.h
sketch.o: kvec.h kalloc.h mmpriv.h minimap.h bseq.h
splitidx.o: mmpriv.h minimap.h bseq.h
lchain.o: mmpriv.h minimap.h bseq.h kseq.h kalloc.h krmq.h
main.o: bseq.h minimap.h mmpriv.h kseq.h ketopt.h
map.o: kthread.h kvec.h kalloc.h sdust.h mmpriv.h minimap.h bseq.h kseq.h
map.o: khash.h ksort.h
misc.o: mmpriv.h minimap.h bseq.h kseq.h ksort.h
options.o: mmpriv.h minimap.h bseq.h kseq.h
pe.o: mmpriv.h minimap.h bseq.h kseq.h kvec.h kalloc.h ksort.h
sdust.o: kalloc.h kdq.h kvec.h sdust.h
seed.o: mmpriv.h minimap.h bseq.h kseq.h kalloc.h ksort.h
sketch.o: kvec.h kalloc.h mmpriv.h minimap.h bseq.h kseq.h
splitidx.o: mmpriv.h minimap.h bseq.h kseq.h

97
Makefile.simde 100644
View File

@ -0,0 +1,97 @@
CFLAGS= -g -Wall -O2 -Wc++-compat #-Wextra
CPPFLAGS= -DHAVE_KALLOC -DUSE_SIMDE -DSIMDE_ENABLE_NATIVE_ALIASES
INCLUDES= -Ilib/simde
OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o chain.o align.o hit.o map.o format.o pe.o esterr.o splitidx.o \
ksw2_extz2_simde.o ksw2_extd2_simde.o ksw2_exts2_simde.o ksw2_ll_simde.o
PROG= minimap2
PROG_EXTRA= sdust minimap2-lite
LIBS= -lm -lz -lpthread
ifneq ($(arm_neon),) # if arm_neon is defined
ifeq ($(aarch64),) #if aarch64 is not defined
CFLAGS+=-D_FILE_OFFSET_BITS=64 -mfpu=neon -fsigned-char
else #if aarch64 is defined
CFLAGS+=-D_FILE_OFFSET_BITS=64 -fsigned-char
endif
endif
ifneq ($(asan),)
CFLAGS+=-fsanitize=address
LIBS+=-fsanitize=address
endif
ifneq ($(tsan),)
CFLAGS+=-fsanitize=thread
LIBS+=-fsanitize=thread
endif
.PHONY:all extra clean depend
.SUFFIXES:.c .o
.c.o:
$(CC) -c $(CFLAGS) $(CPPFLAGS) $(INCLUDES) $< -o $@
all:$(PROG)
extra:all $(PROG_EXTRA)
minimap2:main.o libminimap2.a
$(CC) $(CFLAGS) main.o -o $@ -L. -lminimap2 $(LIBS)
minimap2-lite:example.o libminimap2.a
$(CC) $(CFLAGS) $< -o $@ -L. -lminimap2 $(LIBS)
libminimap2.a:$(OBJS)
$(AR) -csru $@ $(OBJS)
sdust:sdust.c kalloc.o kalloc.h kdq.h kvec.h kseq.h ketopt.h sdust.h
$(CC) -D_SDUST_MAIN $(CFLAGS) $< kalloc.o -o $@ -lz
ksw2_ll_simde.o:ksw2_ll_sse.c ksw2.h kalloc.h
$(CC) -c $(CFLAGS) -msse2 $(CPPFLAGS) $(INCLUDES) $< -o $@
ksw2_extz2_simde.o:ksw2_extz2_sse.c ksw2.h kalloc.h
$(CC) -c $(CFLAGS) -msse4.1 $(CPPFLAGS) $(INCLUDES) $< -o $@
ksw2_extd2_simde.o:ksw2_extd2_sse.c ksw2.h kalloc.h
$(CC) -c $(CFLAGS) -msse4.1 $(CPPFLAGS) $(INCLUDES) $< -o $@
ksw2_exts2_simde.o:ksw2_exts2_sse.c ksw2.h kalloc.h
$(CC) -c $(CFLAGS) -msse4.1 $(CPPFLAGS) $(INCLUDES) $< -o $@
# other non-file targets
clean:
rm -fr gmon.out *.o a.out $(PROG) $(PROG_EXTRA) *~ *.a *.dSYM build dist mappy*.so mappy.c python/mappy.c mappy.egg*
depend:
(LC_ALL=C; export LC_ALL; makedepend -Y -- $(CFLAGS) $(CPPFLAGS) -- *.c)
# DO NOT DELETE
align.o: minimap.h mmpriv.h bseq.h kseq.h ksw2.h kalloc.h
bseq.o: bseq.h kvec.h kalloc.h kseq.h
chain.o: minimap.h mmpriv.h bseq.h kseq.h kalloc.h
esterr.o: mmpriv.h minimap.h bseq.h kseq.h
example.o: minimap.h kseq.h
format.o: kalloc.h mmpriv.h minimap.h bseq.h kseq.h
hit.o: mmpriv.h minimap.h bseq.h kseq.h kalloc.h khash.h
index.o: kthread.h bseq.h minimap.h mmpriv.h kseq.h kvec.h kalloc.h khash.h
index.o: ksort.h
kalloc.o: kalloc.h
ksw2_extd2_sse.o: ksw2.h kalloc.h
ksw2_exts2_sse.o: ksw2.h kalloc.h
ksw2_extz2_sse.o: ksw2.h kalloc.h
ksw2_ll_sse.o: ksw2.h kalloc.h
kthread.o: kthread.h
main.o: bseq.h minimap.h mmpriv.h kseq.h ketopt.h
map.o: kthread.h kvec.h kalloc.h sdust.h mmpriv.h minimap.h bseq.h kseq.h
map.o: khash.h ksort.h
misc.o: mmpriv.h minimap.h bseq.h kseq.h ksort.h
options.o: mmpriv.h minimap.h bseq.h kseq.h
pe.o: mmpriv.h minimap.h bseq.h kseq.h kvec.h kalloc.h ksort.h
sdust.o: kalloc.h kdq.h kvec.h sdust.h
self-chain.o: minimap.h kseq.h
sketch.o: kvec.h kalloc.h mmpriv.h minimap.h bseq.h kseq.h
splitidx.o: mmpriv.h minimap.h bseq.h kseq.h

148
NEWS.md
View File

@ -1,3 +1,151 @@
Release 2.21-r1071 (6 July 2021)
--------------------------------
This release fixed a regression in short-read mapping introduced in v2.19
(#776). It also fixed invalid comparisons of uninitialized variables, though
these are harmless (#752). Long-read alignment should be identical to v2.20.
(2.21: 6 July 2021)
Release 2.20-r1061 (27 May 2021)
--------------------------------
This release fixed a bug in the Python module and improves the command-line
compatibiliity with v2.18. In v2.19, if `-r` is specified with an `asm*` preset,
users would get alignments more fragmented than v2.18. This could be an issue
for existing pipelines specifying `-r`. This release resolves this issue.
(2.20: 27 May 2021, r1061)
Release 2.19-r1057 (26 May 2021)
--------------------------------
This release includes a few important improvements backported from unimap:
* Improvement: more contiguous alignment through long INDELs. This is enabled
by the minigraph chaining algorithm. All `asm*` presets now use the new
algorithm. They can find INDELs up to 100kb and may be faster for
chromosome-long contigs. The default mode and `map*` presets use this
algorithm to replace the long-join heuristic.
* Improvement: better alignment in highly repetitive regions by rescuing
high-occurrence seeds. If the distance between two adjacent seeds is too
large, attempt to choose a fraction of high-occurrence seeds in-between.
Minimap2 now produces fewer clippings and alignment break points in long
satellite regions.
* Improvement: allow to specify an interval of k-mer occurrences with `-U`.
For repeat-rich genomes, the automatic k-mer occurrence threshold determined
by `-f` may be too large and makes alignment impractically slow. The new
option protects against such cases. Enabled for `asm*` and `map-hifi`.
* New feature: added the `map-hifi` preset for maping PacBio High-Fidelity
(HiFi) reads.
* Change to the default: apply `--cap-sw-mem=100m` for genomic alignment.
* Bugfix: minimap2 could not generate an index file with `-xsr` (#734).
This release represents the most signficant algorithmic change since v2.1 in
2017. With features backported from unimap, minimap2 now has similar power to
unimap for contig alignment. Unimap will remain an experimental project and is
no longer recommended over minimap2. Sorry for reverting the recommendation in
short time.
(2.19: 26 May 2021, r1057)
Release 2.18-r1015 (9 April 2021)
---------------------------------
This release fixes multiple rare bugs in minimap2 and adds additional
functionality to paftools.js.
Changes to minimap2:
* Bugfix: a rare segfault caused by an off-by-one error (#489)
* Bugfix: minimap2 segfaulted due to an uninitilized variable (#622 and #625).
* Bugfix: minimap2 parsed spaces as field separators in BED (#721). This led
to issues when the BED name column contains spaces.
* Bugfix: minimap2 `--split-prefix` did not work with long reference names
(#394).
* Bugfix: option `--junc-bonus` didn't work (#513)
* Bugfix: minimap2 didn't return 1 on I/O errors (#532)
* Bugfix: the `de:f` tag (sequence divergence) could be negative if there were
ambiguous bases
* Bugfix: fixed two undefined behaviors caused by calling memcpy() on
zero-length blocks (#443)
* Bugfix: there were duplicated SAM @SQ lines if option `--split-prefix` is in
use (#400 and #527)
* Bugfix: option -K had to be smaller than 2 billion (#491). This was caused
by a 32-bit integer overflow.
* Improvement: optionally compile against SIMDe (#597). Minimap2 should work
with IBM POWER CPUs, though this has not been tested. To compile with SIMDe,
please use `make -f Makefile.simde`.
* Improvement: more informative error message for I/O errors (#454) and for
FASTQ parsing errors (#510)
* Improvement: abort given malformatted RG line (#541)
* Improvement: better formula to estimate the `dv:f` tag (approximate sequence
divergence). See DOI:10.1101/2021.01.15.426881.
* New feature: added the `--mask-len` option to fine control the removal of
redundant hits (#659). The default behavior is unchanged.
Changes to mappy:
* Bugfix: mappy caused segmentation fault if the reference index is not
present (#413).
* Bugfix: fixed a memory leak via 238b6bb3
* Change: always require Cython to compile the mappy module (#723). Older
mappy packages at PyPI bundled the C source code generated by Cython such
that end users did not need to install Cython to compile mappy. However, as
Python 3.9 is breaking backward compatibility, older mappy does not work
with Python 3.9 anymore. We have to add this Cython dependency as a
workaround.
Changes to paftools.js:
* Bugfix: the "part10-" line from asmgene was wrong (#581)
* Improvement: compatibility with GTF files from GenBank (#422)
* New feature: asmgene also checks missing multi-copy genes
* New feature: added the misjoin command to evaluate large-scale misjoins and
megabase-long inversions.
Although given the many bug fixes and minor improvements, the core algorithm
stays the same. This version of minimap2 produces nearly identical alignments
to v2.17 except very rare corner cases.
Now unimap is recommended over minimap2 for aligning long contigs against a
reference genome. It often takes less wall-clock time and is much more
sensitive to long insertions and deletions.
(2.18: 9 April 2021, r1015)
Release 2.17-r941 (4 May 2019)
------------------------------

View File

@ -1,7 +1,7 @@
[![GitHub Downloads](https://img.shields.io/github/downloads/lh3/minimap2/total.svg?style=social&logo=github&label=Download)](https://github.com/lh3/minimap2/releases)
[![BioConda Install](https://img.shields.io/conda/dn/bioconda/minimap2.svg?style=flag&label=BioConda%20install)](https://anaconda.org/bioconda/minimap2)
[![PyPI](https://img.shields.io/pypi/v/mappy.svg?style=flat)](https://pypi.python.org/pypi/mappy)
[![Build Status](https://travis-ci.org/lh3/minimap2.svg?branch=master)](https://travis-ci.org/lh3/minimap2)
[![Build Status](https://github.com/lh3/minimap2/actions/workflows/ci.yaml/badge.svg)](https://github.com/lh3/minimap2/actions)
## <a name="started"></a>Getting Started
```sh
git clone https://github.com/lh3/minimap2
@ -12,19 +12,22 @@ cd minimap2 && make
./minimap2 -x map-ont -d MT-human-ont.mmi test/MT-human.fa
./minimap2 -a MT-human-ont.mmi test/MT-orang.fa > test.sam
# use presets (no test data)
./minimap2 -ax map-pb ref.fa pacbio.fq.gz > aln.sam # PacBio genomic reads
./minimap2 -ax map-pb ref.fa pacbio.fq.gz > aln.sam # PacBio CLR genomic reads
./minimap2 -ax map-ont ref.fa ont.fq.gz > aln.sam # Oxford Nanopore genomic reads
./minimap2 -ax asm20 ref.fa pacbio-ccs.fq.gz > aln.sam # PacBio CCS genomic reads
./minimap2 -ax map-hifi ref.fa pacbio-ccs.fq.gz > aln.sam # PacBio HiFi/CCS genomic reads (v2.19 or later)
./minimap2 -ax asm20 ref.fa pacbio-ccs.fq.gz > aln.sam # PacBio HiFi/CCS genomic reads (v2.18 or earlier)
./minimap2 -ax sr ref.fa read1.fa read2.fa > aln.sam # short genomic paired-end reads
./minimap2 -ax splice ref.fa rna-reads.fa > aln.sam # spliced long reads (strand unknown)
./minimap2 -ax splice -uf -k14 ref.fa reads.fa > aln.sam # noisy Nanopore Direct RNA-seq
./minimap2 -ax splice:hq -uf ref.fa query.fa > aln.sam # Final PacBio Iso-seq or traditional cDNA
./minimap2 -ax splice --junc-bed anno.bed12 ref.fa query.fa > aln.sam # prioritize on annotated junctions
./minimap2 -cx asm5 asm1.fa asm2.fa > aln.paf # intra-species asm-to-asm alignment
./minimap2 -x ava-pb reads.fa reads.fa > overlaps.paf # PacBio read overlap
./minimap2 -x ava-ont reads.fa reads.fa > overlaps.paf # Nanopore read overlap
# man page for detailed command line options
man ./minimap2.1
```
## Table of Contents
- [Getting Started](#started)
@ -71,8 +74,8 @@ Detailed evaluations are available from the [minimap2 paper][doi] or the
Minimap2 is optimized for x86-64 CPUs. You can acquire precompiled binaries from
the [release page][release] with:
```sh
curl -L https://github.com/lh3/minimap2/releases/download/v2.17/minimap2-2.17_x64-linux.tar.bz2 | tar -jxvf -
./minimap2-2.17_x64-linux/minimap2
curl -L https://github.com/lh3/minimap2/releases/download/v2.21/minimap2-2.21_x64-linux.tar.bz2 | tar -jxvf -
./minimap2-2.21_x64-linux/minimap2
```
If you want to compile from the source, you need to have a C compiler, GNU make
and zlib development files installed. Then type `make` in the source code
@ -80,13 +83,20 @@ directory to compile. If you see compilation errors, try `make sse2only=1`
to disable SSE4 code, which will make minimap2 slightly slower.
Minimap2 also works with ARM CPUs supporting the NEON instruction sets. To
compile for 32 bit ARM architectures (such as ARMv7), use `make arm_neon=1`. To compile for for 64 bit ARM architectures (such as ARMv8), use `make arm_neon=1 aarch64=1`.
compile for 32 bit ARM architectures (such as ARMv7), use `make arm_neon=1`. To
compile for for 64 bit ARM architectures (such as ARMv8), use `make arm_neon=1
aarch64=1`.
Minimap2 can use [SIMD Everywhere (SIMDe)][simde] library for porting
implementation to the different SIMD instruction sets. To compile using SIMDe,
use `make -f Makefile.simde`. To compile for ARM CPUs, use `Makefile.simde`
with the ARM related command lines given above.
### <a name="general"></a>General usage
Without any options, minimap2 takes a reference database and a query sequence
file as input and produce approximate mapping, without base-level alignment
(i.e. no CIGAR), in the [PAF format][paf]:
(i.e. coordinates are only approximate and no CIGAR in output), in the [PAF format][paf]:
```sh
minimap2 ref.fa query.fq > approx-mapping.paf
```
@ -127,13 +137,13 @@ parameters at the same time. The default setting is the same as `map-ont`.
#### <a name="map-long-genomic"></a>Map long noisy genomic reads
```sh
minimap2 -ax map-pb ref.fa pacbio-reads.fq > aln.sam # for PacBio subreads
minimap2 -ax map-pb ref.fa pacbio-reads.fq > aln.sam # for PacBio CLR reads
minimap2 -ax map-ont ref.fa ont-reads.fq > aln.sam # for Oxford Nanopore reads
```
The difference between `map-pb` and `map-ont` is that `map-pb` uses
homopolymer-compressed (HPC) minimizers as seeds, while `map-ont` uses ordinary
minimizers as seeds. Emperical evaluation suggests HPC minimizers improve
performance and sensitivity when aligning PacBio reads, but hurt when aligning
performance and sensitivity when aligning PacBio CLR reads, but hurt when aligning
Nanopore reads.
#### <a name="map-long-splice"></a>Map long mRNA/cDNA reads
@ -178,10 +188,23 @@ This is because SIRV does not honor the evolutionarily conservative splicing
signal. If you are studying SIRV, you may apply `--splice-flank=no` to let
minimap2 only model GT..AG, ignoring the additional base.
Since v2.17, minimap2 can optionally take annotated genes as input and
prioritize on annotated splice junctions. To use this feature, you can
```sh
paftools.js gff2bed anno.gff > anno.bed
minimap2 -ax splice --junc-bed anno.bed ref.fa query.fa > aln.sam
```
Here, `anno.gff` is the gene annotation in the GTF or GFF3 format (`gff2bed`
automatically tests the format). The output of `gff2bed` is in the 12-column
BED format, or the BED12 format. With the `--junc-bed` option, minimap2 adds a
bonus score (tuned by `--junc-bonus`) if an aligned junction matches a junction
in the annotation. Option `--junc-bed` also takes 5-column BED, including the
strand field. In this case, each line indicates an oriented junction.
#### <a name="long-overlap"></a>Find overlaps between long reads
```sh
minimap2 -x ava-pb reads.fq reads.fq > ovlp.paf # PacBio read overlap
minimap2 -x ava-pb reads.fq reads.fq > ovlp.paf # PacBio CLR read overlap
minimap2 -x ava-ont reads.fq reads.fq > ovlp.paf # Oxford Nanopore read overlap
```
Similarly, `ava-pb` uses HPC minimizers while `ava-ont` uses ordinary
@ -228,7 +251,7 @@ To avoid this issue, you can add option `-L` at the minimap2 command line.
This option moves a long CIGAR to the `CG` tag and leaves a fully clipped CIGAR
at the SAM CIGAR column. Current tools that don't read CIGAR (e.g. merging and
sorting) still work with such BAM records; tools that read CIGAR will
effectively ignore these records. It has been decided that future tools will
effectively ignore these records. It has been decided that future tools
will seamlessly recognize long-cigar records generated by option `-L`.
**TL;DR**: if you work with ultra-long reads and use tools that only process
@ -249,7 +272,7 @@ CGATCGATAAATAGAGTAG---GAATAGCA
CGATCG---AATAGAGTAGGTCGAATtGCA
```
is represented as `:6-ata:10+gtc:4*at:3`, where `:[0-9]+` represents an
identical block, `-ata` represents a deltion, `+gtc` an insertion and `*at`
identical block, `-ata` represents a deletion, `+gtc` an insertion and `*at`
indicates reference base `a` is substituted with a query base `t`. It is
similar to the `MD` SAM tag but is standalone and easier to parse.
@ -376,3 +399,5 @@ mappy` or [from BioConda][mappyconda] via `conda install -c bioconda mappy`.
[manpage]: https://lh3.github.io/minimap2/minimap2.html
[manpage-cs]: https://lh3.github.io/minimap2/minimap2.html#10
[doi]: https://doi.org/10.1093/bioinformatics/bty191
[smide]: https://github.com/nemequ/simde
[unimap]: https://github.com/lh3/unimap

79
align.c
View File

@ -38,8 +38,8 @@ static inline void update_max_zdrop(int32_t score, int i, int j, int32_t *max, i
int z = *max - score - diff * e;
if (z > *max_zdrop) {
*max_zdrop = z;
pos[0][0] = *max_i, pos[0][1] = i + 1;
pos[1][0] = *max_j, pos[1][1] = j + 1;
pos[0][0] = *max_i, pos[0][1] = i;
pos[1][0] = *max_j, pos[1][1] = j;
}
} else *max = score, *max_i = i, *max_j = j;
}
@ -53,16 +53,16 @@ static int mm_test_zdrop(void *km, const mm_mapopt_t *opt, const uint8_t *qseq,
// find the score and the region where score drops most along diagonal
for (k = 0, score = 0; k < n_cigar; ++k) {
uint32_t l, op = cigar[k]&0xf, len = cigar[k]>>4;
if (op == 0) {
if (op == MM_CIGAR_MATCH) {
for (l = 0; l < len; ++l) {
score += mat[tseq[i + l] * 5 + qseq[j + l]];
update_max_zdrop(score, i+l, j+l, &max, &max_i, &max_j, opt->e, &max_zdrop, pos);
}
i += len, j += len;
} else if (op == 1 || op == 2 || op == 3) {
} else if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL || op == MM_CIGAR_N_SKIP) {
score -= opt->q + opt->e * len;
if (op == 1) j += len; // insertion
else i += len; // deletion
if (op == MM_CIGAR_INS) j += len;
else i += len;
update_max_zdrop(score, i, j, &max, &max_i, &max_j, opt->e, &max_zdrop, pos);
}
}
@ -98,12 +98,12 @@ static void mm_fix_cigar(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq,
for (k = 0; k < p->n_cigar; ++k) { // indel left alignment
uint32_t op = p->cigar[k]&0xf, len = p->cigar[k]>>4;
if (len == 0) to_shrink = 1;
if (op == 0) {
if (op == MM_CIGAR_MATCH) {
toff += len, qoff += len;
} else if (op == 1 || op == 2) { // insertion or deletion
} else if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL) {
if (k > 0 && k < p->n_cigar - 1 && (p->cigar[k-1]&0xf) == 0 && (p->cigar[k+1]&0xf) == 0) {
int l, prev_len = p->cigar[k-1] >> 4;
if (op == 1) {
if (op == MM_CIGAR_INS) {
for (l = 0; l < prev_len; ++l)
if (qseq[qoff - 1 - l] != qseq[qoff + len - 1 - l])
break;
@ -116,9 +116,9 @@ static void mm_fix_cigar(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq,
p->cigar[k-1] -= l<<4, p->cigar[k+1] += l<<4, qoff -= l, toff -= l;
if (l == prev_len) to_shrink = 1;
}
if (op == 1) qoff += len;
if (op == MM_CIGAR_INS) qoff += len;
else toff += len;
} else if (op == 3) {
} else if (op == MM_CIGAR_N_SKIP) {
toff += len;
}
}
@ -128,13 +128,13 @@ static void mm_fix_cigar(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq,
uint32_t l, s[3] = {0,0,0};
for (l = k; l < p->n_cigar; ++l) { // count number of adjacent I and D
uint32_t op = p->cigar[l]&0xf;
if (op == 1 || op == 2 || p->cigar[l]>>4 == 0)
if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL || p->cigar[l]>>4 == 0)
s[op] += p->cigar[l] >> 4;
else break;
}
if (s[1] > 0 && s[2] > 0 && l - k > 2) { // turn to a single I and a single D
p->cigar[k] = s[1]<<4|1;
p->cigar[k+1] = s[2]<<4|2;
p->cigar[k] = s[1]<<4|MM_CIGAR_INS;
p->cigar[k+1] = s[2]<<4|MM_CIGAR_DEL;
for (k += 2; k < l; ++k)
p->cigar[k] &= 0xf;
to_shrink = 1;
@ -154,9 +154,9 @@ static void mm_fix_cigar(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq,
else p->cigar[k+1] += p->cigar[k]>>4<<4; // add length to the next CIGAR operator
p->n_cigar = l;
}
if ((p->cigar[0]&0xf) == 1 || (p->cigar[0]&0xf) == 2) { // get rid of leading I or D
if ((p->cigar[0]&0xf) == MM_CIGAR_INS || (p->cigar[0]&0xf) == MM_CIGAR_DEL) { // get rid of leading I or D
int32_t l = p->cigar[0] >> 4;
if ((p->cigar[0]&0xf) == 1) {
if ((p->cigar[0]&0xf) == MM_CIGAR_INS) {
if (r->rev) r->qe -= l;
else r->qs += l;
*qshift = l;
@ -174,7 +174,7 @@ static void mm_update_cigar_eqx(mm_reg1_t *r, const uint8_t *qseq, const uint8_t
if (r->p == 0) return;
for (k = 0; k < r->p->n_cigar; ++k) {
uint32_t op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4;
if (op == 0) {
if (op == MM_CIGAR_MATCH) {
while (len > 0) {
for (l = 0; l < len && qseq[qoff + l] == tseq[toff + l]; ++l) {} // run of "="; TODO: N<=>N is converted to "="
if (l > 0) { ++n_EQX; len -= l; toff += l; qoff += l; }
@ -183,11 +183,11 @@ static void mm_update_cigar_eqx(mm_reg1_t *r, const uint8_t *qseq, const uint8_t
if (l > 0) { ++n_EQX; len -= l; toff += l; qoff += l; }
}
++n_M;
} else if (op == 1) { // insertion
} else if (op == MM_CIGAR_INS) {
qoff += len;
} else if (op == 2) { // deletion
} else if (op == MM_CIGAR_DEL) {
toff += len;
} else if (op == 3) { // intron
} else if (op == MM_CIGAR_N_SKIP) {
toff += len;
}
}
@ -195,7 +195,7 @@ static void mm_update_cigar_eqx(mm_reg1_t *r, const uint8_t *qseq, const uint8_t
if (n_EQX == n_M) {
for (k = 0; k < r->p->n_cigar; ++k) {
uint32_t op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4;
if (op == 0) r->p->cigar[k] = len << 4 | 7;
if (op == MM_CIGAR_MATCH) r->p->cigar[k] = len << 4 | MM_CIGAR_EQ_MATCH;
}
return;
}
@ -209,25 +209,25 @@ static void mm_update_cigar_eqx(mm_reg1_t *r, const uint8_t *qseq, const uint8_t
toff = qoff = m = 0;
for (k = 0; k < r->p->n_cigar; ++k) {
uint32_t op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4;
if (op == 0) { // match/mismatch
if (op == MM_CIGAR_MATCH) {
while (len > 0) {
// match
for (l = 0; l < len && qseq[qoff + l] == tseq[toff + l]; ++l) {}
if (l > 0) p->cigar[m++] = l << 4 | 7;
if (l > 0) p->cigar[m++] = l << 4 | MM_CIGAR_EQ_MATCH;
len -= l;
toff += l, qoff += l;
// mismatch
for (l = 0; l < len && qseq[qoff + l] != tseq[toff + l]; ++l) {}
if (l > 0) p->cigar[m++] = l << 4 | 8;
if (l > 0) p->cigar[m++] = l << 4 | MM_CIGAR_X_MISMATCH;
len -= l;
toff += l, qoff += l;
}
continue;
} else if (op == 1) { // insertion
} else if (op == MM_CIGAR_INS) {
qoff += len;
} else if (op == 2) { // deletion
} else if (op == MM_CIGAR_DEL) {
toff += len;
} else if (op == 3) { // intron
} else if (op == MM_CIGAR_N_SKIP) {
toff += len;
}
p->cigar[m++] = r->p->cigar[k];
@ -248,7 +248,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts
r->blen = r->mlen = 0;
for (k = 0; k < p->n_cigar; ++k) {
uint32_t op = p->cigar[k]&0xf, len = p->cigar[k]>>4;
if (op == 0) { // match/mismatch
if (op == MM_CIGAR_MATCH) {
int n_ambi = 0, n_diff = 0;
for (l = 0; l < len; ++l) {
int cq = qseq[qoff + l], ct = tseq[toff + l];
@ -260,7 +260,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts
}
r->blen += len - n_ambi, r->mlen += len - (n_ambi + n_diff), p->n_ambi += n_ambi;
toff += len, qoff += len;
} else if (op == 1) { // insertion
} else if (op == MM_CIGAR_INS) {
int n_ambi = 0;
for (l = 0; l < len; ++l)
if (qseq[qoff + l] > 3) ++n_ambi;
@ -268,7 +268,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts
s -= q + e * len;
if (s < 0) s = 0;
qoff += len;
} else if (op == 2) { // deletion
} else if (op == MM_CIGAR_DEL) {
int n_ambi = 0;
for (l = 0; l < len; ++l)
if (tseq[toff + l] > 3) ++n_ambi;
@ -276,7 +276,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts
s -= q + e * len;
if (s < 0) s = 0;
toff += len;
} else if (op == 3) { // intron
} else if (op == MM_CIGAR_N_SKIP) {
toff += len;
}
}
@ -333,7 +333,7 @@ static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint
int i;
fprintf(stderr, "score=%d, cigar=", ez->score);
for (i = 0; i < ez->n_cigar; ++i)
fprintf(stderr, "%d%c", ez->cigar[i]>>4, "MIDN"[ez->cigar[i]&0xf]);
fprintf(stderr, "%d%c", ez->cigar[i]>>4, MM_CIGAR_STR[ez->cigar[i]&0xf]);
fprintf(stderr, "\n");
}
}
@ -572,7 +572,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
int is_sr = !!(opt->flag & MM_F_SR), is_splice = !!(opt->flag & MM_F_SPLICE);
int32_t rid = a[r->as].x<<1>>33, rev = a[r->as].x>>63, as1, cnt1;
uint8_t *tseq, *qseq, *junc;
int32_t i, l, bw, dropped = 0, extra_flag = 0, rs0, re0, qs0, qe0;
int32_t i, l, bw, bw_long, dropped = 0, extra_flag = 0, rs0, re0, qs0, qe0;
int32_t rs, re, qs, qe;
int32_t rs1, qs1, re1, qe1;
int8_t mat[25];
@ -583,6 +583,8 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
if (r->cnt == 0) return;
ksw_gen_simple_mat(5, mat, opt->a, opt->b, opt->sc_ambi);
bw = (int)(opt->bw * 1.5 + 1.);
bw_long = (int)(opt->bw_long * 1.5 + 1.);
if (bw_long < bw) bw_long = bw;
if (is_sr && !(mi->flag & MM_I_HPC)) {
mm_max_stretch(r, a, &as1, &cnt1);
@ -724,7 +726,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
} else mm_adjust_minier(mi, qseq0, &a[as1 + i], &re, &qe);
re1 = re, qe1 = qe;
if (i == cnt1 - 1 || (a[as1+i].y&MM_SEED_LONG_JOIN) || (qe - qs >= opt->min_ksw_len && re - rs >= opt->min_ksw_len)) {
int j, bw1 = bw, zdrop_code;
int j, bw1 = bw_long, zdrop_code;
if (a[as1+i].y & MM_SEED_LONG_JOIN)
bw1 = qe - qs > re - rs? qe - qs : re - rs;
// perform alignment
@ -743,7 +745,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
if (qseq[j] >= 4 || tseq[j] >= 4) ez->score += opt->e2;
else ez->score += qseq[j] == tseq[j]? opt->a : -opt->b;
}
ez->cigar = ksw_push_cigar(km, &ez->n_cigar, &ez->m_cigar, ez->cigar, 0, qe - qs);
ez->cigar = ksw_push_cigar(km, &ez->n_cigar, &ez->m_cigar, ez->cigar, MM_CIGAR_MATCH, qe - qs);
} else { // perform normal gapped alignment
mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, junc, mat, bw1, -1, opt->zdrop, extra_flag|KSW_EZ_APPROX_MAX, ez); // first pass: with approximate Z-drop
}
@ -754,6 +756,13 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
if (ez->n_cigar > 0)
mm_append_cigar(r, ez->n_cigar, ez->cigar);
if (ez->zdropped) { // truncated by Z-drop; TODO: sometimes Z-drop kicks in because the next seed placement is wrong. This can be fixed in principle.
if (!r->p) {
assert(ez->n_cigar == 0);
uint32_t capacity = sizeof(mm_extra_t)/4;
kroundup32(capacity);
r->p = (mm_extra_t*)calloc(capacity, 4);
r->p->capacity = capacity;
}
for (j = i - 1; j >= 0; --j)
if ((int32_t)a[as1 + j].x <= rs + ez->max_t)
break;

164
chain.c
View File

@ -1,164 +0,0 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "minimap.h"
#include "mmpriv.h"
#include "kalloc.h"
static const char LogTable256[256] = {
#define LT(n) n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n
-1, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
LT(4), LT(5), LT(5), LT(6), LT(6), LT(6), LT(6),
LT(7), LT(7), LT(7), LT(7), LT(7), LT(7), LT(7), LT(7)
};
static inline int ilog2_32(uint32_t v)
{
uint32_t t, tt;
if ((tt = v>>16)) return (t = tt>>8) ? 24 + LogTable256[t] : 16 + LogTable256[tt];
return (t = v>>8) ? 8 + LogTable256[t] : LogTable256[v];
}
mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float gap_scale, int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km)
{ // TODO: make sure this works when n has more than 32 bits
int32_t k, *f, *p, *t, *v, n_u, n_v;
int64_t i, j, st = 0;
uint64_t *u, *u2, sum_qspan = 0;
float avg_qspan;
mm128_t *b, *w;
if (_u) *_u = 0, *n_u_ = 0;
if (n == 0 || a == 0) {
kfree(km, a);
return 0;
}
f = (int32_t*)kmalloc(km, n * 4);
p = (int32_t*)kmalloc(km, n * 4);
t = (int32_t*)kmalloc(km, n * 4);
v = (int32_t*)kmalloc(km, n * 4);
memset(t, 0, n * 4);
for (i = 0; i < n; ++i) sum_qspan += a[i].y>>32&0xff;
avg_qspan = (float)sum_qspan / n;
// fill the score and backtrack arrays
for (i = 0; i < n; ++i) {
uint64_t ri = a[i].x;
int64_t max_j = -1;
int32_t qi = (int32_t)a[i].y, q_span = a[i].y>>32&0xff; // NB: only 8 bits of span is used!!!
int32_t max_f = q_span, n_skip = 0, min_d;
int32_t sidi = (a[i].y & MM_SEED_SEG_MASK) >> MM_SEED_SEG_SHIFT;
while (st < i && ri > a[st].x + max_dist_x) ++st;
if (i - st > max_iter) st = i - max_iter;
for (j = i - 1; j >= st; --j) {
int64_t dr = ri - a[j].x;
int32_t dq = qi - (int32_t)a[j].y, dd, sc, log_dd, gap_cost;
int32_t sidj = (a[j].y & MM_SEED_SEG_MASK) >> MM_SEED_SEG_SHIFT;
if ((sidi == sidj && dr == 0) || dq <= 0) continue; // don't skip if an anchor is used by multiple segments; see below
if ((sidi == sidj && dq > max_dist_y) || dq > max_dist_x) continue;
dd = dr > dq? dr - dq : dq - dr;
if (sidi == sidj && dd > bw) continue;
if (n_segs > 1 && !is_cdna && sidi == sidj && dr > max_dist_y) continue;
min_d = dq < dr? dq : dr;
sc = min_d > q_span? q_span : dq < dr? dq : dr;
log_dd = dd? ilog2_32(dd) : 0;
gap_cost = 0;
if (is_cdna || sidi != sidj) {
int c_log, c_lin;
c_lin = (int)(dd * .01 * avg_qspan);
c_log = log_dd;
if (sidi != sidj && dr == 0) ++sc; // possibly due to overlapping paired ends; give a minor bonus
else if (dr > dq || sidi != sidj) gap_cost = c_lin < c_log? c_lin : c_log;
else gap_cost = c_lin + (c_log>>1);
} else gap_cost = (int)(dd * .01 * avg_qspan) + (log_dd>>1);
sc -= (int)((double)gap_cost * gap_scale + .499);
sc += f[j];
if (sc > max_f) {
max_f = sc, max_j = j;
if (n_skip > 0) --n_skip;
} else if (t[j] == i) {
if (++n_skip > max_skip)
break;
}
if (p[j] >= 0) t[p[j]] = i;
}
f[i] = max_f, p[i] = max_j;
v[i] = max_j >= 0 && v[max_j] > max_f? v[max_j] : max_f; // v[] keeps the peak score up to i; f[] is the score ending at i, not always the peak
}
// find the ending positions of chains
memset(t, 0, n * 4);
for (i = 0; i < n; ++i)
if (p[i] >= 0) t[p[i]] = 1;
for (i = n_u = 0; i < n; ++i)
if (t[i] == 0 && v[i] >= min_sc)
++n_u;
if (n_u == 0) {
kfree(km, a); kfree(km, f); kfree(km, p); kfree(km, t); kfree(km, v);
return 0;
}
u = (uint64_t*)kmalloc(km, n_u * 8);
for (i = n_u = 0; i < n; ++i) {
if (t[i] == 0 && v[i] >= min_sc) {
j = i;
while (j >= 0 && f[j] < v[j]) j = p[j]; // find the peak that maximizes f[]
if (j < 0) j = i; // TODO: this should really be assert(j>=0)
u[n_u++] = (uint64_t)f[j] << 32 | j;
}
}
radix_sort_64(u, u + n_u);
for (i = 0; i < n_u>>1; ++i) { // reverse, s.t. the highest scoring chain is the first
uint64_t t = u[i];
u[i] = u[n_u - i - 1], u[n_u - i - 1] = t;
}
// backtrack
memset(t, 0, n * 4);
for (i = n_v = k = 0; i < n_u; ++i) { // starting from the highest score
int32_t n_v0 = n_v, k0 = k;
j = (int32_t)u[i];
do {
v[n_v++] = j;
t[j] = 1;
j = p[j];
} while (j >= 0 && t[j] == 0);
if (j < 0) {
if (n_v - n_v0 >= min_cnt) u[k++] = u[i]>>32<<32 | (n_v - n_v0);
} else if ((int32_t)(u[i]>>32) - f[j] >= min_sc) {
if (n_v - n_v0 >= min_cnt) u[k++] = ((u[i]>>32) - f[j]) << 32 | (n_v - n_v0);
}
if (k0 == k) n_v = n_v0; // no new chain added, reset
}
*n_u_ = n_u = k, *_u = u; // NB: note that u[] may not be sorted by score here
// free temporary arrays
kfree(km, f); kfree(km, p); kfree(km, t);
// write the result to b[]
b = (mm128_t*)kmalloc(km, n_v * sizeof(mm128_t));
for (i = 0, k = 0; i < n_u; ++i) {
int32_t k0 = k, ni = (int32_t)u[i];
for (j = 0; j < ni; ++j)
b[k] = a[v[k0 + (ni - j - 1)]], ++k;
}
kfree(km, v);
// sort u[] and a[] by a[].x, such that adjacent chains may be joined (required by mm_join_long)
w = (mm128_t*)kmalloc(km, n_u * sizeof(mm128_t));
for (i = k = 0; i < n_u; ++i) {
w[i].x = b[k].x, w[i].y = (uint64_t)k<<32|i;
k += (int32_t)u[i];
}
radix_sort_128x(w, w + n_u);
u2 = (uint64_t*)kmalloc(km, n_u * 8);
for (i = k = 0; i < n_u; ++i) {
int32_t j = (int32_t)w[i].y, n = (int32_t)u[j];
u2[i] = u[j];
memcpy(&a[k], &b[w[i].y>>32], n * sizeof(mm128_t));
k += n;
}
if (n_u) memcpy(u, u2, n_u * 8);
if (k) memcpy(b, a, k * sizeof(mm128_t)); // write _a_ to _b_ and deallocate _a_ because _a_ is oversized, sometimes a lot
kfree(km, a); kfree(km, w); kfree(km, u2);
return b;
}

30
code_of_conduct.md 100644
View File

@ -0,0 +1,30 @@
## Contributor Code of Conduct
As contributors and maintainers of this project, we pledge to respect all
people who contribute through reporting issues, posting feature requests,
updating documentation, submitting pull requests or patches, and other
activities.
We are committed to making participation in this project a harassment-free
experience for everyone, regardless of level of experience, gender, gender
identity and expression, sexual orientation, disability, personal appearance,
body size, race, age, or religion.
Examples of unacceptable behavior by participants include the use of sexual
language or imagery, derogatory comments or personal attacks, trolling, public
or private harassment, insults, or other unprofessional conduct.
Project maintainers have the right and responsibility to remove, edit, or
reject comments, commits, code, wiki edits, issues, and other contributions
that are not aligned to this Code of Conduct. Project maintainers or
contributors who do not follow the Code of Conduct may be removed from the
project team.
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported by opening an issue or contacting the maintainer via email.
This Code of Conduct is adapted from the [Contributor Covenant][cc], [version
1.0.0][v1].
[cc]: http://contributor-covenant.org/
[v1]: http://contributor-covenant.org/version/1/0/0/

View File

@ -31,8 +31,8 @@ To acquire the data used in this cookbook and to install minimap2 and paftools,
please follow the command lines below:
```sh
# install minimap2 executables
curl -L https://github.com/lh3/minimap2/releases/download/v2.17/minimap2-2.17_x64-linux.tar.bz2 | tar jxf -
cp minimap2-2.17_x64-linux/{minimap2,k8,paftools.js} . # copy executables
curl -L https://github.com/lh3/minimap2/releases/download/v2.21/minimap2-2.21_x64-linux.tar.bz2 | tar jxf -
cp minimap2-2.21_x64-linux/{minimap2,k8,paftools.js} . # copy executables
export PATH="$PATH:"`pwd` # put the current directory on PATH
# download example datasets
curl -L https://github.com/lh3/minimap2/releases/download/v2.10/cookbook-data.tgz | tar zxf -

View File

@ -59,6 +59,6 @@ void mm_est_err(const mm_idx_t *mi, int qlen, int n_regs, mm_reg1_t *regs, const
n_tot = en - st + 1;
if (r->qs > avg_k && r->rs > avg_k) ++n_tot;
if (qlen - r->qs > avg_k && l_ref - r->re > avg_k) ++n_tot;
r->div = logf((float)n_tot / n_match) / avg_k;
r->div = n_match >= n_tot? 0.0f : (float)(1.0 - pow((double)n_match / n_tot, 1.0 / avg_k));
}
}

View File

@ -47,7 +47,7 @@ int main(int argc, char *argv[])
printf("%s\t%d\t%d\t%d\t%c\t", ks->name.s, ks->seq.l, r->qs, r->qe, "+-"[r->rev]);
printf("%s\t%d\t%d\t%d\t%d\t%d\t%d\tcg:Z:", mi->seq[r->rid].name, mi->seq[r->rid].len, r->rs, r->re, r->mlen, r->blen, r->mapq);
for (i = 0; i < r->p->n_cigar; ++i) // IMPORTANT: this gives the CIGAR in the aligned regions. NO soft/hard clippings!
printf("%d%c", r->p->cigar[i]>>4, "MIDNSH"[r->p->cigar[i]&0xf]);
printf("%d%c", r->p->cigar[i]>>4, MM_CIGAR_STR[r->p->cigar[i]&0xf]);
putchar('\n');
free(r->p);
}

View File

@ -144,8 +144,8 @@ static void write_cs_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq
if (write_tag) mm_sprintf_lite(s, "\tcs:Z:");
for (i = q_off = t_off = 0; i < (int)r->p->n_cigar; ++i) {
int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4;
assert((op >= 0 && op <= 3) || op == 7 || op == 8);
if (op == 0 || op == 7 || op == 8) { // match
assert((op >= MM_CIGAR_MATCH && op <= MM_CIGAR_N_SKIP) || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH);
if (op == MM_CIGAR_MATCH || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH) {
int l_tmp = 0;
for (j = 0; j < len; ++j) {
if (qseq[q_off + j] != tseq[t_off + j]) {
@ -166,12 +166,12 @@ static void write_cs_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq
} else mm_sprintf_lite(s, ":%d", l_tmp);
}
q_off += len, t_off += len;
} else if (op == 1) { // insertion to ref
} else if (op == MM_CIGAR_INS) {
for (j = 0, tmp[len] = 0; j < len; ++j)
tmp[j] = "acgtn"[qseq[q_off + j]];
mm_sprintf_lite(s, "+%s", tmp);
q_off += len;
} else if (op == 2) { // deletion from ref
} else if (op == MM_CIGAR_DEL) {
for (j = 0, tmp[len] = 0; j < len; ++j)
tmp[j] = "acgtn"[tseq[t_off + j]];
mm_sprintf_lite(s, "-%s", tmp);
@ -192,8 +192,8 @@ static void write_MD_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq
if (write_tag) mm_sprintf_lite(s, "\tMD:Z:");
for (i = q_off = t_off = 0; i < (int)r->p->n_cigar; ++i) {
int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4;
assert((op >= 0 && op <= 3) || op == 7 || op == 8);
if (op == 0 || op == 7 || op == 8) { // match
assert((op >= MM_CIGAR_MATCH && op <= MM_CIGAR_N_SKIP) || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH);
if (op == MM_CIGAR_MATCH || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH) {
for (j = 0; j < len; ++j) {
if (qseq[q_off + j] != tseq[t_off + j]) {
mm_sprintf_lite(s, "%d%c", l_MD, "ACGTN"[tseq[t_off + j]]);
@ -201,15 +201,15 @@ static void write_MD_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq
} else ++l_MD;
}
q_off += len, t_off += len;
} else if (op == 1) { // insertion to ref
} else if (op == MM_CIGAR_INS) {
q_off += len;
} else if (op == 2) { // deletion from ref
} else if (op == MM_CIGAR_DEL) {
for (j = 0, tmp[len] = 0; j < len; ++j)
tmp[j] = "ACGTN"[tseq[t_off + j]];
mm_sprintf_lite(s, "%d^%s", l_MD, tmp);
l_MD = 0;
t_off += len;
} else if (op == 3) { // reference skip
} else if (op == MM_CIGAR_N_SKIP) {
t_off += len;
}
}
@ -277,7 +277,7 @@ double mm_event_identity(const mm_reg1_t *r)
if (r->p == 0) return -1.0f;
for (i = 0; i < r->p->n_cigar; ++i) {
int32_t op = r->p->cigar[i] & 0xf, len = r->p->cigar[i] >> 4;
if (op == 1 || op == 2)
if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL)
++n_gapo, n_gap += len;
}
return (double)r->mlen / (r->blen + r->p->n_ambi - n_gap + n_gapo);
@ -335,7 +335,7 @@ void mm_write_paf3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const
uint32_t k;
mm_sprintf_lite(s, "\tcg:Z:");
for (k = 0; k < r->p->n_cigar; ++k)
mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MIDNSHP=XB"[r->p->cigar[k]&0xf]);
mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, MM_CIGAR_STR[r->p->cigar[k]&0xf]);
}
if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_MD)))
write_cs_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD, 1, opt_flag&MM_F_QSTRAND);
@ -392,7 +392,7 @@ static void write_sam_cigar(kstring_t *s, int sam_flag, int in_tag, int qlen, co
assert(clip_len[0] < qlen && clip_len[1] < qlen);
if (clip_len[0]) mm_sprintf_lite(s, "%d%c", clip_len[0], clip_char);
for (k = 0; k < r->p->n_cigar; ++k)
mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MIDNSHP=XB"[r->p->cigar[k]&0xf]);
mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, MM_CIGAR_STR[r->p->cigar[k]&0xf]);
if (clip_len[1]) mm_sprintf_lite(s, "%d%c", clip_len[1], clip_char);
}
}
@ -402,7 +402,7 @@ void mm_write_sam3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int se
{
const int max_bam_cigar_op = 65535;
int flag, n_regs = n_regss[seg_idx], cigar_in_tag = 0;
int this_rid = -1, this_pos = -1, this_rev = 0;
int this_rid = -1, this_pos = -1;
const mm_reg1_t *regs = regss[seg_idx], *r_prev = NULL, *r_next;
const mm_reg1_t *r = n_regs > 0 && reg_idx < n_regs && reg_idx >= 0? &regs[reg_idx] : NULL;
@ -451,7 +451,7 @@ void mm_write_sam3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int se
mm_sprintf_lite(s, "\t%s\t%d\t0\t*", mi->seq[this_rid].name, this_pos+1);
} else mm_sprintf_lite(s, "\t*\t0\t0\t*");
} else {
this_rid = r->rid, this_pos = r->rs, this_rev = r->rev;
this_rid = r->rid, this_pos = r->rs;
mm_sprintf_lite(s, "\t%s\t%d\t%d\t", mi->seq[r->rid].name, r->rs+1, r->mapq);
if ((opt_flag & MM_F_LONG_CIGAR) && r->p && r->p->n_cigar > max_bam_cigar_op - 2) {
int n_cigar = r->p->n_cigar;

62
hit.c
View File

@ -122,7 +122,7 @@ void mm_split_reg(mm_reg1_t *r, mm_reg1_t *r2, int n, int qlen, mm128_t *a, int
r->split |= 1, r2->split |= 2;
}
void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r, int sub_diff, int hard_mask_level, float alt_diff_frac) // and compute mm_reg1_t::subsc
void mm_set_parent(void *km, float mask_level, int mask_len, int n, mm_reg1_t *r, int sub_diff, int hard_mask_level, float alt_diff_frac) // and compute mm_reg1_t::subsc
{
int i, j, k, *w;
uint64_t *cov;
@ -162,7 +162,7 @@ skip_uncov:
min = ej - sj < ei - si? ej - sj : ei - si;
max = ej - sj > ei - si? ej - sj : ei - si;
ol = si < sj? (ei < sj? 0 : ei < ej? ei - sj : ej - sj) : (ej < si? 0 : ej < ei? ej - si : ei - si); // overlap length; TODO: this can be simplified
if ((float)ol / min - (float)uncov_len / max > mask_level) {
if ((float)ol / min - (float)uncov_len / max > mask_level && uncov_len <= mask_len) { // then this is a secondary hit
int cnt_sub = 0, sci = ri->score;
ri->parent = rp->parent;
if (!rp->is_alt && ri->is_alt) sci = mm_alt_score(sci, alt_diff_frac);
@ -312,64 +312,6 @@ int mm_squeeze_a(void *km, int n_regs, mm_reg1_t *regs, mm128_t *a)
return as;
}
void mm_join_long(void *km, const mm_mapopt_t *opt, int qlen, int *n_regs_, mm_reg1_t *regs, mm128_t *a)
{
int i, n_aux, n_regs = *n_regs_, n_drop = 0;
uint64_t *aux;
if (n_regs < 2) return; // nothing to join
mm_squeeze_a(km, n_regs, regs, a);
aux = (uint64_t*)kmalloc(km, n_regs * 8);
for (i = n_aux = 0; i < n_regs; ++i)
if (regs[i].parent == i || regs[i].parent < 0)
aux[n_aux++] = (uint64_t)regs[i].as << 32 | i;
radix_sort_64(aux, aux + n_aux);
for (i = n_aux - 1; i >= 1; --i) {
mm_reg1_t *r0 = &regs[(int32_t)aux[i-1]], *r1 = &regs[(int32_t)aux[i]];
mm128_t *a0e, *a1s;
int max_gap, min_gap, sc_thres, min_flank_len;
// test
if (r0->as + r0->cnt != r1->as) continue; // not adjacent in a[]
if (r0->rid != r1->rid || r0->rev != r1->rev) continue; // make sure on the same target and strand
a0e = &a[r0->as + r0->cnt - 1];
a1s = &a[r1->as];
if (a1s->x <= a0e->x || (int32_t)a1s->y <= (int32_t)a0e->y) continue; // keep colinearity
max_gap = min_gap = (int32_t)a1s->y - (int32_t)a0e->y;
max_gap = a0e->x + max_gap > a1s->x? max_gap : a1s->x - a0e->x;
min_gap = a0e->x + min_gap < a1s->x? min_gap : a1s->x - a0e->x;
if (max_gap > opt->max_join_long || min_gap > opt->max_join_short) continue;
sc_thres = (int)((float)opt->min_join_flank_sc / opt->max_join_long * max_gap + .499);
if (r0->score < sc_thres || r1->score < sc_thres) continue; // require good flanking chains
min_flank_len = (int)(max_gap * opt->min_join_flank_ratio);
if (r0->re - r0->rs < min_flank_len || r0->qe - r0->qs < min_flank_len) continue; // require enough flanking length
if (r1->re - r1->rs < min_flank_len || r1->qe - r1->qs < min_flank_len) continue;
// all conditions satisfied; join
a[r1->as].y |= MM_SEED_LONG_JOIN;
r0->cnt += r1->cnt, r0->score += r1->score;
mm_reg_set_coor(r0, qlen, a, opt->flag&MM_F_QSTRAND);
r1->cnt = 0;
r1->parent = r0->id;
++n_drop;
}
kfree(km, aux);
if (n_drop > 0) { // then fix the hits hierarchy
for (i = 0; i < n_regs; ++i) { // adjust the mm_reg1_t::parent
mm_reg1_t *r = &regs[i];
if (r->parent >= 0 && r->id != r->parent) { // fix for secondary hits only
if (regs[r->parent].parent >= 0 && regs[r->parent].parent != r->parent)
r->parent = regs[r->parent].parent;
}
}
mm_filter_regs(opt, qlen, n_regs_, regs);
mm_sync_regs(km, *n_regs_, regs);
}
}
mm_seg_t *mm_seg_gen(void *km, uint32_t hash, int n_segs, const int *qlens, int n_regs0, const mm_reg1_t *regs0, int *n_regs, mm_reg1_t **regs, const mm128_t *a)
{
int s, i, j, acc_qlen[MM_MAX_SEG+1], qlen_sum = 0;

View File

@ -676,7 +676,7 @@ mm_idx_intv_t *mm_idx_read_bed(const mm_idx_t *mi, const char *fn, int read_junc
char *p, *q, *bl, *bs;
int32_t i, id = -1, n_blk = 0;
for (p = q = str.s, i = 0;; ++p) {
if (*p == 0 || isspace(*p)) {
if (*p == 0 || *p == '\t') {
int32_t c = *p;
*p = 0;
if (i == 0) { // chr

View File

@ -34,4 +34,43 @@ void km_stat(const void *_km, km_stat_t *s);
KREALLOC((km), (a), (m)); \
} while (0)
#ifndef klib_unused
#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3)
#define klib_unused __attribute__ ((__unused__))
#else
#define klib_unused
#endif
#endif /* klib_unused */
#define KALLOC_POOL_INIT2(SCOPE, name, kmptype_t) \
typedef struct { \
size_t cnt, n, max; \
kmptype_t **buf; \
void *km; \
} kmp_##name##_t; \
SCOPE kmp_##name##_t *kmp_init_##name(void *km) { \
kmp_##name##_t *mp; \
KCALLOC(km, mp, 1); \
mp->km = km; \
return mp; \
} \
SCOPE void kmp_destroy_##name(kmp_##name##_t *mp) { \
size_t k; \
for (k = 0; k < mp->n; ++k) kfree(mp->km, mp->buf[k]); \
kfree(mp->km, mp->buf); kfree(mp->km, mp); \
} \
SCOPE kmptype_t *kmp_alloc_##name(kmp_##name##_t *mp) { \
++mp->cnt; \
if (mp->n == 0) return (kmptype_t*)kcalloc(mp->km, 1, sizeof(kmptype_t)); \
return mp->buf[--mp->n]; \
} \
SCOPE void kmp_free_##name(kmp_##name##_t *mp, kmptype_t *p) { \
--mp->cnt; \
if (mp->n == mp->max) KEXPAND(mp->km, mp->buf, mp->max); \
mp->buf[mp->n++] = p; \
}
#define KALLOC_POOL_INIT(name, kmptype_t) \
KALLOC_POOL_INIT2(static inline klib_unused, name, kmptype_t)
#endif

474
krmq.h 100644
View File

@ -0,0 +1,474 @@
/* The MIT License
Copyright (c) 2019 by Attractive Chaos <attractor@live.co.uk>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
/* An example:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "krmq.h"
struct my_node {
char key;
KRMQ_HEAD(struct my_node) head;
};
#define my_cmp(p, q) (((q)->key < (p)->key) - ((p)->key < (q)->key))
KRMQ_INIT(my, struct my_node, head, my_cmp)
int main(void) {
const char *str = "MNOLKQOPHIA"; // from wiki, except a duplicate
struct my_node *root = 0;
int i, l = strlen(str);
for (i = 0; i < l; ++i) { // insert in the input order
struct my_node *q, *p = malloc(sizeof(*p));
p->key = str[i];
q = krmq_insert(my, &root, p, 0);
if (p != q) free(p); // if already present, free
}
krmq_itr_t(my) itr;
krmq_itr_first(my, root, &itr); // place at first
do { // traverse
const struct my_node *p = krmq_at(&itr);
putchar(p->key);
free((void*)p); // free node
} while (krmq_itr_next(my, &itr));
putchar('\n');
return 0;
}
*/
#ifndef KRMQ_H
#define KRMQ_H
#ifdef __STRICT_ANSI__
#define inline __inline__
#endif
#define KRMQ_MAX_DEPTH 64
#define krmq_size(head, p) ((p)? (p)->head.size : 0)
#define krmq_size_child(head, q, i) ((q)->head.p[(i)]? (q)->head.p[(i)]->head.size : 0)
#define KRMQ_HEAD(__type) \
struct { \
__type *p[2], *s; \
signed char balance; /* balance factor */ \
unsigned size; /* #elements in subtree */ \
}
#define __KRMQ_FIND(suf, __scope, __type, __head, __cmp) \
__scope __type *krmq_find_##suf(const __type *root, const __type *x, unsigned *cnt_) { \
const __type *p = root; \
unsigned cnt = 0; \
while (p != 0) { \
int cmp; \
cmp = __cmp(x, p); \
if (cmp >= 0) cnt += krmq_size_child(__head, p, 0) + 1; \
if (cmp < 0) p = p->__head.p[0]; \
else if (cmp > 0) p = p->__head.p[1]; \
else break; \
} \
if (cnt_) *cnt_ = cnt; \
return (__type*)p; \
} \
__scope __type *krmq_interval_##suf(const __type *root, const __type *x, __type **lower, __type **upper) { \
const __type *p = root, *l = 0, *u = 0; \
while (p != 0) { \
int cmp; \
cmp = __cmp(x, p); \
if (cmp < 0) u = p, p = p->__head.p[0]; \
else if (cmp > 0) l = p, p = p->__head.p[1]; \
else { l = u = p; break; } \
} \
if (lower) *lower = (__type*)l; \
if (upper) *upper = (__type*)u; \
return (__type*)p; \
}
#define __KRMQ_RMQ(suf, __scope, __type, __head, __cmp, __lt2) \
__scope __type *krmq_rmq_##suf(const __type *root, const __type *lo, const __type *up) { /* CLOSED interval */ \
const __type *p = root, *path[2][KRMQ_MAX_DEPTH], *min; \
int plen[2] = {0, 0}, pcmp[2][KRMQ_MAX_DEPTH], i, cmp, lca; \
if (root == 0) return 0; \
while (p) { \
cmp = __cmp(lo, p); \
path[0][plen[0]] = p, pcmp[0][plen[0]++] = cmp; \
if (cmp < 0) p = p->__head.p[0]; \
else if (cmp > 0) p = p->__head.p[1]; \
else break; \
} \
p = root; \
while (p) { \
cmp = __cmp(up, p); \
path[1][plen[1]] = p, pcmp[1][plen[1]++] = cmp; \
if (cmp < 0) p = p->__head.p[0]; \
else if (cmp > 0) p = p->__head.p[1]; \
else break; \
} \
for (i = 0; i < plen[0] && i < plen[1]; ++i) /* find the LCA */ \
if (path[0][i] == path[1][i] && pcmp[0][i] <= 0 && pcmp[1][i] >= 0) \
break; \
if (i == plen[0] || i == plen[1]) return 0; /* no elements in the closed interval */ \
lca = i, min = path[0][lca]; \
for (i = lca + 1; i < plen[0]; ++i) { \
if (pcmp[0][i] <= 0) { \
if (__lt2(path[0][i], min)) min = path[0][i]; \
if (path[0][i]->__head.p[1] && __lt2(path[0][i]->__head.p[1]->__head.s, min)) \
min = path[0][i]->__head.p[1]->__head.s; \
} \
} \
for (i = lca + 1; i < plen[1]; ++i) { \
if (pcmp[1][i] >= 0) { \
if (__lt2(path[1][i], min)) min = path[1][i]; \
if (path[1][i]->__head.p[0] && __lt2(path[1][i]->__head.p[0]->__head.s, min)) \
min = path[1][i]->__head.p[0]->__head.s; \
} \
} \
return (__type*)min; \
}
#define __KRMQ_ROTATE(suf, __type, __head, __lt2) \
/* */ \
static inline void krmq_update_min_##suf(__type *p, const __type *q, const __type *r) { \
p->__head.s = !q || __lt2(p, q->__head.s)? p : q->__head.s; \
p->__head.s = !r || __lt2(p->__head.s, r->__head.s)? p->__head.s : r->__head.s; \
} \
/* one rotation: (a,(b,c)q)p => ((a,b)p,c)q */ \
static inline __type *krmq_rotate1_##suf(__type *p, int dir) { /* dir=0 to left; dir=1 to right */ \
int opp = 1 - dir; /* opposite direction */ \
__type *q = p->__head.p[opp], *s = p->__head.s; \
unsigned size_p = p->__head.size; \
p->__head.size -= q->__head.size - krmq_size_child(__head, q, dir); \
q->__head.size = size_p; \
krmq_update_min_##suf(p, p->__head.p[dir], q->__head.p[dir]); \
q->__head.s = s; \
p->__head.p[opp] = q->__head.p[dir]; \
q->__head.p[dir] = p; \
return q; \
} \
/* two consecutive rotations: (a,((b,c)r,d)q)p => ((a,b)p,(c,d)q)r */ \
static inline __type *krmq_rotate2_##suf(__type *p, int dir) { \
int b1, opp = 1 - dir; \
__type *q = p->__head.p[opp], *r = q->__head.p[dir], *s = p->__head.s; \
unsigned size_x_dir = krmq_size_child(__head, r, dir); \
r->__head.size = p->__head.size; \
p->__head.size -= q->__head.size - size_x_dir; \
q->__head.size -= size_x_dir + 1; \
krmq_update_min_##suf(p, p->__head.p[dir], r->__head.p[dir]); \
krmq_update_min_##suf(q, q->__head.p[opp], r->__head.p[opp]); \
r->__head.s = s; \
p->__head.p[opp] = r->__head.p[dir]; \
r->__head.p[dir] = p; \
q->__head.p[dir] = r->__head.p[opp]; \
r->__head.p[opp] = q; \
b1 = dir == 0? +1 : -1; \
if (r->__head.balance == b1) q->__head.balance = 0, p->__head.balance = -b1; \
else if (r->__head.balance == 0) q->__head.balance = p->__head.balance = 0; \
else q->__head.balance = b1, p->__head.balance = 0; \
r->__head.balance = 0; \
return r; \
}
#define __KRMQ_INSERT(suf, __scope, __type, __head, __cmp, __lt2) \
__scope __type *krmq_insert_##suf(__type **root_, __type *x, unsigned *cnt_) { \
unsigned char stack[KRMQ_MAX_DEPTH]; \
__type *path[KRMQ_MAX_DEPTH]; \
__type *bp, *bq; \
__type *p, *q, *r = 0; /* _r_ is potentially the new root */ \
int i, which = 0, top, b1, path_len; \
unsigned cnt = 0; \
bp = *root_, bq = 0; \
/* find the insertion location */ \
for (p = bp, q = bq, top = path_len = 0; p; q = p, p = p->__head.p[which]) { \
int cmp; \
cmp = __cmp(x, p); \
if (cmp >= 0) cnt += krmq_size_child(__head, p, 0) + 1; \
if (cmp == 0) { \
if (cnt_) *cnt_ = cnt; \
return p; \
} \
if (p->__head.balance != 0) \
bq = q, bp = p, top = 0; \
stack[top++] = which = (cmp > 0); \
path[path_len++] = p; \
} \
if (cnt_) *cnt_ = cnt; \
x->__head.balance = 0, x->__head.size = 1, x->__head.p[0] = x->__head.p[1] = 0, x->__head.s = x; \
if (q == 0) *root_ = x; \
else q->__head.p[which] = x; \
if (bp == 0) return x; \
for (i = 0; i < path_len; ++i) ++path[i]->__head.size; \
for (i = path_len - 1; i >= 0; --i) { \
krmq_update_min_##suf(path[i], path[i]->__head.p[0], path[i]->__head.p[1]); \
if (path[i]->__head.s != x) break; \
} \
for (p = bp, top = 0; p != x; p = p->__head.p[stack[top]], ++top) /* update balance factors */ \
if (stack[top] == 0) --p->__head.balance; \
else ++p->__head.balance; \
if (bp->__head.balance > -2 && bp->__head.balance < 2) return x; /* no re-balance needed */ \
/* re-balance */ \
which = (bp->__head.balance < 0); \
b1 = which == 0? +1 : -1; \
q = bp->__head.p[1 - which]; \
if (q->__head.balance == b1) { \
r = krmq_rotate1_##suf(bp, which); \
q->__head.balance = bp->__head.balance = 0; \
} else r = krmq_rotate2_##suf(bp, which); \
if (bq == 0) *root_ = r; \
else bq->__head.p[bp != bq->__head.p[0]] = r; \
return x; \
}
#define __KRMQ_ERASE(suf, __scope, __type, __head, __cmp, __lt2) \
__scope __type *krmq_erase_##suf(__type **root_, const __type *x, unsigned *cnt_) { \
__type *p, *path[KRMQ_MAX_DEPTH], fake; \
unsigned char dir[KRMQ_MAX_DEPTH]; \
int i, d = 0, cmp; \
unsigned cnt = 0; \
fake = **root_, fake.__head.p[0] = *root_, fake.__head.p[1] = 0; \
if (cnt_) *cnt_ = 0; \
if (x) { \
for (cmp = -1, p = &fake; cmp; cmp = __cmp(x, p)) { \
int which = (cmp > 0); \
if (cmp > 0) cnt += krmq_size_child(__head, p, 0) + 1; \
dir[d] = which; \
path[d++] = p; \
p = p->__head.p[which]; \
if (p == 0) { \
if (cnt_) *cnt_ = 0; \
return 0; \
} \
} \
cnt += krmq_size_child(__head, p, 0) + 1; /* because p==x is not counted */ \
} else { \
for (p = &fake, cnt = 1; p; p = p->__head.p[0]) \
dir[d] = 0, path[d++] = p; \
p = path[--d]; \
} \
if (cnt_) *cnt_ = cnt; \
for (i = 1; i < d; ++i) --path[i]->__head.size; \
if (p->__head.p[1] == 0) { /* ((1,.)2,3)4 => (1,3)4; p=2 */ \
path[d-1]->__head.p[dir[d-1]] = p->__head.p[0]; \
} else { \
__type *q = p->__head.p[1]; \
if (q->__head.p[0] == 0) { /* ((1,2)3,4)5 => ((1)2,4)5; p=3,q=2 */ \
q->__head.p[0] = p->__head.p[0]; \
q->__head.balance = p->__head.balance; \
path[d-1]->__head.p[dir[d-1]] = q; \
path[d] = q, dir[d++] = 1; \
q->__head.size = p->__head.size - 1; \
} else { /* ((1,((.,2)3,4)5)6,7)8 => ((1,(2,4)5)3,7)8; p=6 */ \
__type *r; \
int e = d++; /* backup _d_ */\
for (;;) { \
dir[d] = 0; \
path[d++] = q; \
r = q->__head.p[0]; \
if (r->__head.p[0] == 0) break; \
q = r; \
} \
r->__head.p[0] = p->__head.p[0]; \
q->__head.p[0] = r->__head.p[1]; \
r->__head.p[1] = p->__head.p[1]; \
r->__head.balance = p->__head.balance; \
path[e-1]->__head.p[dir[e-1]] = r; \
path[e] = r, dir[e] = 1; \
for (i = e + 1; i < d; ++i) --path[i]->__head.size; \
r->__head.size = p->__head.size - 1; \
} \
} \
for (i = d - 1; i >= 0; --i) /* not sure why adding condition "path[i]->__head.s==p" doesn't work */ \
krmq_update_min_##suf(path[i], path[i]->__head.p[0], path[i]->__head.p[1]); \
while (--d > 0) { \
__type *q = path[d]; \
int which, other, b1 = 1, b2 = 2; \
which = dir[d], other = 1 - which; \
if (which) b1 = -b1, b2 = -b2; \
q->__head.balance += b1; \
if (q->__head.balance == b1) break; \
else if (q->__head.balance == b2) { \
__type *r = q->__head.p[other]; \
if (r->__head.balance == -b1) { \
path[d-1]->__head.p[dir[d-1]] = krmq_rotate2_##suf(q, which); \
} else { \
path[d-1]->__head.p[dir[d-1]] = krmq_rotate1_##suf(q, which); \
if (r->__head.balance == 0) { \
r->__head.balance = -b1; \
q->__head.balance = b1; \
break; \
} else r->__head.balance = q->__head.balance = 0; \
} \
} \
} \
*root_ = fake.__head.p[0]; \
return p; \
}
#define krmq_free(__type, __head, __root, __free) do { \
__type *_p, *_q; \
for (_p = __root; _p; _p = _q) { \
if (_p->__head.p[0] == 0) { \
_q = _p->__head.p[1]; \
__free(_p); \
} else { \
_q = _p->__head.p[0]; \
_p->__head.p[0] = _q->__head.p[1]; \
_q->__head.p[1] = _p; \
} \
} \
} while (0)
#define __KRMQ_ITR(suf, __scope, __type, __head, __cmp) \
struct krmq_itr_##suf { \
const __type *stack[KRMQ_MAX_DEPTH], **top; \
}; \
__scope void krmq_itr_first_##suf(const __type *root, struct krmq_itr_##suf *itr) { \
const __type *p; \
for (itr->top = itr->stack - 1, p = root; p; p = p->__head.p[0]) \
*++itr->top = p; \
} \
__scope int krmq_itr_find_##suf(const __type *root, const __type *x, struct krmq_itr_##suf *itr) { \
const __type *p = root; \
itr->top = itr->stack - 1; \
while (p != 0) { \
int cmp; \
*++itr->top = p; \
cmp = __cmp(x, p); \
if (cmp < 0) p = p->__head.p[0]; \
else if (cmp > 0) p = p->__head.p[1]; \
else break; \
} \
return p? 1 : 0; \
} \
__scope int krmq_itr_next_bidir_##suf(struct krmq_itr_##suf *itr, int dir) { \
const __type *p; \
if (itr->top < itr->stack) return 0; \
dir = !!dir; \
p = (*itr->top)->__head.p[dir]; \
if (p) { /* go down */ \
for (; p; p = p->__head.p[!dir]) \
*++itr->top = p; \
return 1; \
} else { /* go up */ \
do { \
p = *itr->top--; \
} while (itr->top >= itr->stack && p == (*itr->top)->__head.p[dir]); \
return itr->top < itr->stack? 0 : 1; \
} \
} \
/**
* Insert a node to the tree
*
* @param suf name suffix used in KRMQ_INIT()
* @param proot pointer to the root of the tree (in/out: root may change)
* @param x node to insert (in)
* @param cnt number of nodes smaller than or equal to _x_; can be NULL (out)
*
* @return _x_ if not present in the tree, or the node equal to x.
*/
#define krmq_insert(suf, proot, x, cnt) krmq_insert_##suf(proot, x, cnt)
/**
* Find a node in the tree
*
* @param suf name suffix used in KRMQ_INIT()
* @param root root of the tree
* @param x node value to find (in)
* @param cnt number of nodes smaller than or equal to _x_; can be NULL (out)
*
* @return node equal to _x_ if present, or NULL if absent
*/
#define krmq_find(suf, root, x, cnt) krmq_find_##suf(root, x, cnt)
#define krmq_interval(suf, root, x, lower, upper) krmq_interval_##suf(root, x, lower, upper)
#define krmq_rmq(suf, root, lo, up) krmq_rmq_##suf(root, lo, up)
/**
* Delete a node from the tree
*
* @param suf name suffix used in KRMQ_INIT()
* @param proot pointer to the root of the tree (in/out: root may change)
* @param x node value to delete; if NULL, delete the first node (in)
*
* @return node removed from the tree if present, or NULL if absent
*/
#define krmq_erase(suf, proot, x, cnt) krmq_erase_##suf(proot, x, cnt)
#define krmq_erase_first(suf, proot) krmq_erase_##suf(proot, 0, 0)
#define krmq_itr_t(suf) struct krmq_itr_##suf
/**
* Place the iterator at the smallest object
*
* @param suf name suffix used in KRMQ_INIT()
* @param root root of the tree
* @param itr iterator
*/
#define krmq_itr_first(suf, root, itr) krmq_itr_first_##suf(root, itr)
/**
* Place the iterator at the object equal to or greater than the query
*
* @param suf name suffix used in KRMQ_INIT()
* @param root root of the tree
* @param x query (in)
* @param itr iterator (out)
*
* @return 1 if find; 0 otherwise. krmq_at(itr) is NULL if and only if query is
* larger than all objects in the tree
*/
#define krmq_itr_find(suf, root, x, itr) krmq_itr_find_##suf(root, x, itr)
/**
* Move to the next object in order
*
* @param itr iterator (modified)
*
* @return 1 if there is a next object; 0 otherwise
*/
#define krmq_itr_next(suf, itr) krmq_itr_next_bidir_##suf(itr, 1)
#define krmq_itr_prev(suf, itr) krmq_itr_next_bidir_##suf(itr, 0)
/**
* Return the pointer at the iterator
*
* @param itr iterator
*
* @return pointer if present; NULL otherwise
*/
#define krmq_at(itr) ((itr)->top < (itr)->stack? 0 : *(itr)->top)
#define KRMQ_INIT2(suf, __scope, __type, __head, __cmp, __lt2) \
__KRMQ_FIND(suf, __scope, __type, __head, __cmp) \
__KRMQ_RMQ(suf, __scope, __type, __head, __cmp, __lt2) \
__KRMQ_ROTATE(suf, __type, __head, __lt2) \
__KRMQ_INSERT(suf, __scope, __type, __head, __cmp, __lt2) \
__KRMQ_ERASE(suf, __scope, __type, __head, __cmp, __lt2) \
__KRMQ_ITR(suf, __scope, __type, __head, __cmp)
#define KRMQ_INIT(suf, __type, __head, __cmp, __lt2) \
KRMQ_INIT2(suf,, __type, __head, __cmp, __lt2)
#endif

19
ksw2.h
View File

@ -16,6 +16,13 @@
#define KSW_EZ_SPLICE_REV 0x200
#define KSW_EZ_SPLICE_FLANK 0x400
// The subset of CIGAR operators used by ksw code.
// Use MM_CIGAR_* from minimap.h if you need the full list.
#define KSW_CIGAR_MATCH 0
#define KSW_CIGAR_INS 1
#define KSW_CIGAR_DEL 2
#define KSW_CIGAR_N_SKIP 3
#ifdef __cplusplus
extern "C" {
#endif
@ -137,13 +144,13 @@ static inline void ksw_backtrack(void *km, int is_rot, int is_rev, int min_intro
else if (!(tmp >> (state + 2) & 1)) state = 0; // if requesting other states, _state_ stays the same if it is a continuation; otherwise, set to H
if (state == 0) state = tmp & 7; // TODO: probably this line can be merged into the "else if" line right above; not 100% sure
if (force_state >= 0) state = force_state;
if (state == 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 0, 1), --i, --j; // match
else if (state == 1 || (state == 3 && min_intron_len <= 0)) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 2, 1), --i; // deletion
else if (state == 3 && min_intron_len > 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 3, 1), --i; // intron
else cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 1, 1), --j; // insertion
if (state == 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_MATCH, 1), --i, --j;
else if (state == 1 || (state == 3 && min_intron_len <= 0)) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_DEL, 1), --i;
else if (state == 3 && min_intron_len > 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_N_SKIP, 1), --i;
else cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_INS, 1), --j;
}
if (i >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, min_intron_len > 0 && i >= min_intron_len? 3 : 2, i + 1); // first deletion
if (j >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 1, j + 1); // first insertion
if (i >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, min_intron_len > 0 && i >= min_intron_len? KSW_CIGAR_N_SKIP : KSW_CIGAR_DEL, i + 1); // first deletion
if (j >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_INS, j + 1); // first insertion
if (!is_rev)
for (i = 0; i < n_cigar>>1; ++i) // reverse CIGAR
tmp = cigar[i], cigar[i] = cigar[n_cigar-1-i], cigar[n_cigar-1-i] = tmp;

View File

@ -4,15 +4,23 @@
#include "ksw2.h"
#ifdef __SSE2__
#ifdef USE_SIMDE
#include <simde/x86/sse2.h>
#else
#include <emmintrin.h>
#endif
#ifdef KSW_SSE2_ONLY
#undef __SSE4_1__
#endif
#ifdef __SSE4_1__
#ifdef USE_SIMDE
#include <simde/x86/sse4.1.h>
#else
#include <smmintrin.h>
#endif
#endif
#ifdef KSW_CPU_DISPATCH
#ifdef __SSE4_1__

View File

@ -4,15 +4,22 @@
#include "ksw2.h"
#ifdef __SSE2__
#ifdef USE_SIMDE
#include <simde/x86/sse2.h>
#else
#include <emmintrin.h>
#endif
#ifdef KSW_SSE2_ONLY
#undef __SSE4_1__
#endif
#ifdef __SSE4_1__
#ifdef USE_SIMDE
#include <simde/x86/sse4.1.h>
#else
#include <smmintrin.h>
#endif
#endif
#ifdef KSW_CPU_DISPATCH
#ifdef __SSE4_1__

View File

@ -3,15 +3,23 @@
#include "ksw2.h"
#ifdef __SSE2__
#ifdef USE_SIMDE
#include <simde/x86/sse2.h>
#else
#include <emmintrin.h>
#endif
#ifdef KSW_SSE2_ONLY
#undef __SSE4_1__
#endif
#ifdef __SSE4_1__
#ifdef USE_SIMDE
#include <simde/x86/sse4.1.h>
#else
#include <smmintrin.h>
#endif
#endif
#ifdef KSW_CPU_DISPATCH
#ifdef __SSE4_1__

View File

@ -1,9 +1,14 @@
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <emmintrin.h>
#include "ksw2.h"
#ifdef USE_SIMDE
#include <simde/x86/sse2.h>
#else
#include <emmintrin.h>
#endif
#ifdef __GNUC__
#define LIKELY(x) __builtin_expect((x),1)
#define UNLIKELY(x) __builtin_expect((x),0)

354
lchain.c 100644
View File

@ -0,0 +1,354 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <assert.h>
#include "mmpriv.h"
#include "kalloc.h"
#include "krmq.h"
static inline float mg_log2(float x) // NB: this doesn't work when x<2
{
union { float f; uint32_t i; } z = { x };
float log_2 = ((z.i >> 23) & 255) - 128;
z.i &= ~(255 << 23);
z.i += 127 << 23;
log_2 += (-0.34484843f * z.f + 2.02466578f) * z.f - 0.67487759f;
return log_2;
}
uint64_t *mg_chain_backtrack(void *km, int64_t n, const int32_t *f, const int64_t *p, int32_t *v, int32_t *t, int32_t min_cnt, int32_t min_sc, int32_t *n_u_, int32_t *n_v_)
{
mm128_t *z;
uint64_t *u;
int64_t i, k, n_z, n_v;
int32_t n_u;
*n_u_ = *n_v_ = 0;
for (i = 0, n_z = 0; i < n; ++i) // precompute n_z
if (f[i] >= min_sc) ++n_z;
if (n_z == 0) return 0;
KMALLOC(km, z, n_z);
for (i = 0, k = 0; i < n; ++i) // populate z[]
if (f[i] >= min_sc) z[k].x = f[i], z[k++].y = i;
radix_sort_128x(z, z + n_z);
memset(t, 0, n * 4);
for (k = n_z - 1, n_v = n_u = 0; k >= 0; --k) { // precompute n_u
int64_t n_v0 = n_v;
int32_t sc;
for (i = z[k].y; i >= 0 && t[i] == 0; i = p[i])
++n_v, t[i] = 1;
sc = i < 0? z[k].x : (int32_t)z[k].x - f[i];
if (sc >= min_sc && n_v > n_v0 && n_v - n_v0 >= min_cnt)
++n_u;
else n_v = n_v0;
}
KMALLOC(km, u, n_u);
memset(t, 0, n * 4);
for (k = n_z - 1, n_v = n_u = 0; k >= 0; --k) { // populate u[]
int64_t n_v0 = n_v;
int32_t sc;
for (i = z[k].y; i >= 0 && t[i] == 0; i = p[i])
v[n_v++] = i, t[i] = 1;
sc = i < 0? z[k].x : (int32_t)z[k].x - f[i];
if (sc >= min_sc && n_v > n_v0 && n_v - n_v0 >= min_cnt)
u[n_u++] = (uint64_t)sc << 32 | (n_v - n_v0);
else n_v = n_v0;
}
kfree(km, z);
assert(n_v < INT32_MAX);
*n_u_ = n_u, *n_v_ = n_v;
return u;
}
static mm128_t *compact_a(void *km, int32_t n_u, uint64_t *u, int32_t n_v, int32_t *v, mm128_t *a)
{
mm128_t *b, *w;
uint64_t *u2;
int64_t i, j, k;
// write the result to b[]
KMALLOC(km, b, n_v);
for (i = 0, k = 0; i < n_u; ++i) {
int32_t k0 = k, ni = (int32_t)u[i];
for (j = 0; j < ni; ++j)
b[k++] = a[v[k0 + (ni - j - 1)]];
}
kfree(km, v);
// sort u[] and a[] by the target position, such that adjacent chains may be joined
KMALLOC(km, w, n_u);
for (i = k = 0; i < n_u; ++i) {
w[i].x = b[k].x, w[i].y = (uint64_t)k<<32|i;
k += (int32_t)u[i];
}
radix_sort_128x(w, w + n_u);
KMALLOC(km, u2, n_u);
for (i = k = 0; i < n_u; ++i) {
int32_t j = (int32_t)w[i].y, n = (int32_t)u[j];
u2[i] = u[j];
memcpy(&a[k], &b[w[i].y>>32], n * sizeof(mm128_t));
k += n;
}
memcpy(u, u2, n_u * 8);
memcpy(b, a, k * sizeof(mm128_t)); // write _a_ to _b_ and deallocate _a_ because _a_ is oversized, sometimes a lot
kfree(km, a); kfree(km, w); kfree(km, u2);
return b;
}
static inline int32_t comput_sc(const mm128_t *ai, const mm128_t *aj, int32_t max_dist_x, int32_t max_dist_y, int32_t bw, float chn_pen_gap, float chn_pen_skip, int is_cdna, int n_seg)
{
int32_t dq = (int32_t)ai->y - (int32_t)aj->y, dr, dd, dg, q_span, sc;
int32_t sidi = (ai->y & MM_SEED_SEG_MASK) >> MM_SEED_SEG_SHIFT;
int32_t sidj = (aj->y & MM_SEED_SEG_MASK) >> MM_SEED_SEG_SHIFT;
if (dq <= 0 || dq > max_dist_x) return INT32_MIN;
dr = (int32_t)(ai->x - aj->x);
if (sidi == sidj && (dr == 0 || dq > max_dist_y)) return INT32_MIN;
dd = dr > dq? dr - dq : dq - dr;
if (sidi == sidj && dd > bw) return INT32_MIN;
if (n_seg > 1 && !is_cdna && sidi == sidj && dr > max_dist_y) return INT32_MIN;
dg = dr < dq? dr : dq;
q_span = aj->y>>32&0xff;
sc = q_span < dg? q_span : dg;
if (dd || dg > q_span) {
float lin_pen, log_pen;
lin_pen = chn_pen_gap * (float)dd + chn_pen_skip * (float)dg;
log_pen = dd >= 1? mg_log2(dd + 1) : 0.0f; // mg_log2() only works for dd>=2
if (is_cdna || sidi != sidj) {
if (sidi != sidj && dr == 0) ++sc; // possibly due to overlapping paired ends; give a minor bonus
else if (dr > dq || sidi != sidj) sc -= (int)(lin_pen < log_pen? lin_pen : log_pen); // deletion or jump between paired ends
else sc -= (int)(lin_pen + .5f * log_pen);
} else sc -= (int)(lin_pen + .5f * log_pen);
}
return sc;
}
/* Input:
* a[].x: tid<<33 | rev<<32 | tpos
* a[].y: flags<<40 | q_span<<32 | q_pos
* Output:
* n_u: #chains
* u[]: score<<32 | #anchors (sum of lower 32 bits of u[] is the returned length of a[])
* input a[] is deallocated on return
*/
mm128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip,
int is_cdna, int n_seg, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km)
{ // TODO: make sure this works when n has more than 32 bits
int32_t *f, *t, *v, n_u, n_v, mmax_f = 0;
int64_t *p, i, j, max_ii, st = 0, n_iter = 0;
uint64_t *u;
if (_u) *_u = 0, *n_u_ = 0;
if (n == 0 || a == 0) {
kfree(km, a);
return 0;
}
if (max_dist_x < bw) max_dist_x = bw;
if (max_dist_y < bw && !is_cdna) max_dist_y = bw;
KMALLOC(km, p, n);
KMALLOC(km, f, n);
KMALLOC(km, v, n);
KCALLOC(km, t, n);
// fill the score and backtrack arrays
for (i = 0, max_ii = -1; i < n; ++i) {
int64_t max_j = -1, end_j;
int32_t max_f = a[i].y>>32&0xff, n_skip = 0;
while (st < i && (a[i].x>>32 != a[st].x>>32 || a[i].x > a[st].x + max_dist_x)) ++st;
if (i - st > max_iter) st = i - max_iter;
for (j = i - 1; j >= st; --j) {
int32_t sc;
sc = comput_sc(&a[i], &a[j], max_dist_x, max_dist_y, bw, chn_pen_gap, chn_pen_skip, is_cdna, n_seg);
++n_iter;
if (sc == INT32_MIN) continue;
sc += f[j];
if (sc > max_f) {
max_f = sc, max_j = j;
if (n_skip > 0) --n_skip;
} else if (t[j] == (int32_t)i) {
if (++n_skip > max_skip)
break;
}
if (p[j] >= 0) t[p[j]] = i;
}
end_j = j;
if (max_ii < 0 || a[i].x - a[max_ii].x > (int64_t)max_dist_x) {
int32_t max = INT32_MIN;
max_ii = -1;
for (j = i - 1; j >= st; --j)
if (max < f[j]) max = f[j], max_ii = j;
}
if (max_ii >= 0 && max_ii < end_j) {
int32_t tmp;
tmp = comput_sc(&a[i], &a[max_ii], max_dist_x, max_dist_y, bw, chn_pen_gap, chn_pen_skip, is_cdna, n_seg);
if (tmp != INT32_MIN && max_f < tmp + f[max_ii])
max_f = tmp + f[max_ii], max_j = max_ii;
}
f[i] = max_f, p[i] = max_j;
v[i] = max_j >= 0 && v[max_j] > max_f? v[max_j] : max_f; // v[] keeps the peak score up to i; f[] is the score ending at i, not always the peak
if (max_ii < 0 || (a[i].x - a[max_ii].x <= (int64_t)max_dist_x && f[max_ii] < f[i]))
max_ii = i;
if (mmax_f < max_f) mmax_f = max_f;
}
u = mg_chain_backtrack(km, n, f, p, v, t, min_cnt, min_sc, &n_u, &n_v);
*n_u_ = n_u, *_u = u; // NB: note that u[] may not be sorted by score here
kfree(km, p); kfree(km, f); kfree(km, t);
if (n_u == 0) {
kfree(km, a); kfree(km, v);
return 0;
}
return compact_a(km, n_u, u, n_v, v, a);
}
typedef struct lc_elem_s {
int32_t y;
int64_t i;
double pri;
KRMQ_HEAD(struct lc_elem_s) head;
} lc_elem_t;
#define lc_elem_cmp(a, b) ((a)->y < (b)->y? -1 : (a)->y > (b)->y? 1 : ((a)->i > (b)->i) - ((a)->i < (b)->i))
#define lc_elem_lt2(a, b) ((a)->pri < (b)->pri)
KRMQ_INIT(lc_elem, lc_elem_t, head, lc_elem_cmp, lc_elem_lt2)
KALLOC_POOL_INIT(rmq, lc_elem_t)
static inline int32_t comput_sc_simple(const mm128_t *ai, const mm128_t *aj, float chn_pen_gap, float chn_pen_skip, int32_t *exact, int32_t *width)
{
int32_t dq = (int32_t)ai->y - (int32_t)aj->y, dr, dd, dg, q_span, sc;
dr = (int32_t)(ai->x - aj->x);
*width = dd = dr > dq? dr - dq : dq - dr;
dg = dr < dq? dr : dq;
q_span = aj->y>>32&0xff;
sc = q_span < dg? q_span : dg;
if (exact) *exact = (dd == 0 && dg <= q_span);
if (dd || dq > q_span) {
float lin_pen, log_pen;
lin_pen = chn_pen_gap * (float)dd + chn_pen_skip * (float)dg;
log_pen = dd >= 1? mg_log2(dd + 1) : 0.0f; // mg_log2() only works for dd>=2
sc -= (int)(lin_pen + .5f * log_pen);
}
return sc;
}
mm128_t *mg_lchain_rmq(int max_dist, int max_dist_inner, int bw, int max_chn_skip, int cap_rmq_size, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip,
int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km)
{
int32_t *f,*t, *v, n_u, n_v, mmax_f = 0, max_rmq_size = 0;
int64_t *p, i, i0, st = 0, st_inner = 0, n_iter = 0;
uint64_t *u;
lc_elem_t *root = 0, *root_inner = 0;
void *mem_mp = 0;
kmp_rmq_t *mp;
if (_u) *_u = 0, *n_u_ = 0;
if (n == 0 || a == 0) {
kfree(km, a);
return 0;
}
if (max_dist < bw) max_dist = bw;
if (max_dist_inner <= 0 || max_dist_inner >= max_dist) max_dist_inner = 0;
KMALLOC(km, p, n);
KMALLOC(km, f, n);
KCALLOC(km, t, n);
KMALLOC(km, v, n);
mem_mp = km_init2(km, 0x10000);
mp = kmp_init_rmq(mem_mp);
// fill the score and backtrack arrays
for (i = i0 = 0; i < n; ++i) {
int64_t max_j = -1;
int32_t q_span = a[i].y>>32&0xff, max_f = q_span;
lc_elem_t s, *q, *r, lo, hi;
// add in-range anchors
if (i0 < i && a[i0].x != a[i].x) {
int64_t j;
for (j = i0; j < i; ++j) {
q = kmp_alloc_rmq(mp);
q->y = (int32_t)a[j].y, q->i = j, q->pri = -(f[j] + 0.5 * chn_pen_gap * ((int32_t)a[j].x + (int32_t)a[j].y));
krmq_insert(lc_elem, &root, q, 0);
if (max_dist_inner > 0) {
r = kmp_alloc_rmq(mp);
*r = *q;
krmq_insert(lc_elem, &root_inner, r, 0);
}
}
i0 = i;
}
// get rid of active chains out of range
while (st < i && (a[i].x>>32 != a[st].x>>32 || a[i].x > a[st].x + max_dist || krmq_size(head, root) > cap_rmq_size)) {
s.y = (int32_t)a[st].y, s.i = st;
if ((q = krmq_find(lc_elem, root, &s, 0)) != 0) {
q = krmq_erase(lc_elem, &root, q, 0);
kmp_free_rmq(mp, q);
}
++st;
}
if (max_dist_inner > 0) { // similar to the block above, but applied to the inner tree
while (st_inner < i && (a[i].x>>32 != a[st_inner].x>>32 || a[i].x > a[st_inner].x + max_dist_inner || krmq_size(head, root_inner) > cap_rmq_size)) {
s.y = (int32_t)a[st_inner].y, s.i = st_inner;
if ((q = krmq_find(lc_elem, root_inner, &s, 0)) != 0) {
q = krmq_erase(lc_elem, &root_inner, q, 0);
kmp_free_rmq(mp, q);
}
++st_inner;
}
}
// RMQ
lo.i = INT32_MAX, lo.y = (int32_t)a[i].y - max_dist;
hi.i = 0, hi.y = (int32_t)a[i].y;
if ((q = krmq_rmq(lc_elem, root, &lo, &hi)) != 0) {
int32_t sc, exact, width, n_skip = 0;
int64_t j = q->i;
assert(q->y >= lo.y && q->y <= hi.y);
sc = f[j] + comput_sc_simple(&a[i], &a[j], chn_pen_gap, chn_pen_skip, &exact, &width);
if (width <= bw && sc > max_f) max_f = sc, max_j = j;
if (!exact && root_inner && (int32_t)a[i].y > 0) {
lc_elem_t *lo, *hi;
s.y = (int32_t)a[i].y - 1, s.i = n;
krmq_interval(lc_elem, root_inner, &s, &lo, &hi);
if (lo) {
const lc_elem_t *q;
int32_t width, n_rmq_iter = 0;
krmq_itr_t(lc_elem) itr;
krmq_itr_find(lc_elem, root_inner, lo, &itr);
while ((q = krmq_at(&itr)) != 0) {
if (q->y < (int32_t)a[i].y - max_dist_inner) break;
++n_rmq_iter;
j = q->i;
sc = f[j] + comput_sc_simple(&a[i], &a[j], chn_pen_gap, chn_pen_skip, 0, &width);
if (width <= bw) {
if (sc > max_f) {
max_f = sc, max_j = j;
if (n_skip > 0) --n_skip;
} else if (t[j] == (int32_t)i) {
if (++n_skip > max_chn_skip)
break;
}
if (p[j] >= 0) t[p[j]] = i;
}
if (!krmq_itr_prev(lc_elem, &itr)) break;
}
n_iter += n_rmq_iter;
}
}
}
// set max
assert(max_j < 0 || (a[max_j].x < a[i].x && (int32_t)a[max_j].y < (int32_t)a[i].y));
f[i] = max_f, p[i] = max_j;
v[i] = max_j >= 0 && v[max_j] > max_f? v[max_j] : max_f; // v[] keeps the peak score up to i; f[] is the score ending at i, not always the peak
if (mmax_f < max_f) mmax_f = max_f;
if (max_rmq_size < krmq_size(head, root)) max_rmq_size = krmq_size(head, root);
}
km_destroy(mem_mp);
u = mg_chain_backtrack(km, n, f, p, v, t, min_cnt, min_sc, &n_u, &n_v);
*n_u_ = n_u, *_u = u; // NB: note that u[] may not be sorted by score here
kfree(km, p); kfree(km, f); kfree(km, t);
if (n_u == 0) {
kfree(km, a); kfree(km, v);
return 0;
}
return compact_a(km, n_u, u, n_v, v, a);
}

1
lib/simde 160000

@ -0,0 +1 @@
Subproject commit b30129b3b48a6823013da2b309c50a081177b6b8

46
main.c
View File

@ -7,7 +7,7 @@
#include "mmpriv.h"
#include "ketopt.h"
#define MM_VERSION "2.17-r987-qs-dirty"
#define MM_VERSION "2.21-r1071-qs-dirty"
#ifdef __linux__
#include <sys/resource.h>
@ -70,7 +70,9 @@ static ko_longopt_t long_options[] = {
{ "chain-gap-scale",ko_required_argument, 343 },
{ "alt", ko_required_argument, 344 },
{ "alt-drop", ko_required_argument, 345 },
{ "qstrand", ko_no_argument, 346 },
{ "mask-len", ko_required_argument, 346 },
{ "rmq", ko_optional_argument, 347 },
{ "qstrand", ko_no_argument, 348 },
{ "help", ko_no_argument, 'h' },
{ "max-intron-len", ko_required_argument, 'G' },
{ "version", ko_no_argument, 'V' },
@ -82,17 +84,23 @@ static ko_longopt_t long_options[] = {
{ 0, 0, 0 }
};
static inline int64_t mm_parse_num(const char *str)
static inline int64_t mm_parse_num2(const char *str, char **q)
{
double x;
char *p;
x = strtod(str, &p);
if (*p == 'G' || *p == 'g') x *= 1e9;
else if (*p == 'M' || *p == 'm') x *= 1e6;
else if (*p == 'K' || *p == 'k') x *= 1e3;
if (*p == 'G' || *p == 'g') x *= 1e9, ++p;
else if (*p == 'M' || *p == 'm') x *= 1e6, ++p;
else if (*p == 'K' || *p == 'k') x *= 1e3, ++p;
if (q) *q = p;
return (int64_t)(x + .499);
}
static inline int64_t mm_parse_num(const char *str)
{
return mm_parse_num2(str, 0);
}
static inline void yes_or_no(mm_mapopt_t *opt, int flag, int long_idx, const char *arg, int yes_to_set)
{
if (yes_to_set) {
@ -108,7 +116,7 @@ static inline void yes_or_no(mm_mapopt_t *opt, int flag, int long_idx, const cha
int main(int argc, char *argv[])
{
const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:";
const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:";
ketopt_t o = KETOPT_INIT;
mm_mapopt_t opt;
mm_idxopt_t ipt;
@ -144,7 +152,6 @@ int main(int argc, char *argv[])
else if (c == 'k') ipt.k = atoi(o.arg);
else if (c == 'H') ipt.flag |= MM_I_HPC;
else if (c == 'd') fnw = o.arg; // the above are indexing related options, except -I
else if (c == 'r') opt.bw = (int)mm_parse_num(o.arg);
else if (c == 't') n_threads = atoi(o.arg);
else if (c == 'v') mm_verbose = atoi(o.arg);
else if (c == 'g') opt.max_gap = (int)mm_parse_num(o.arg);
@ -171,6 +178,7 @@ int main(int argc, char *argv[])
else if (c == 'C') opt.noncan = atoi(o.arg);
else if (c == 'I') ipt.batch_size = mm_parse_num(o.arg);
else if (c == 'K') opt.mini_batch_size = mm_parse_num(o.arg);
else if (c == 'e') opt.occ_dist = mm_parse_num(o.arg);
else if (c == 'R') rg = o.arg;
else if (c == 'h') fp_help = stdout;
else if (c == '2') opt.flag |= MM_F_2_IO_THREADS;
@ -203,7 +211,6 @@ int main(int argc, char *argv[])
else if (c == 327) opt.max_clip_ratio = atof(o.arg); // --max-clip-ratio
else if (c == 328) opt.min_mid_occ = atoi(o.arg); // --min-occ-floor
else if (c == 329) opt.flag |= MM_F_OUT_MD; // --MD
else if (c == 330) opt.min_join_flank_ratio = atof(o.arg); // --lj-min-ratio
else if (c == 331) opt.sc_ambi = atoi(o.arg); // --score-N
else if (c == 332) opt.flag |= MM_F_EQX; // --eqx
else if (c == 333) opt.flag |= MM_F_PAF_NO_HIT; // --paf-no-hit
@ -218,8 +225,11 @@ int main(int argc, char *argv[])
else if (c == 343) opt.chain_gap_scale = atof(o.arg); // --chain-gap-scale
else if (c == 344) alt_list = o.arg; // --alt
else if (c == 345) opt.alt_drop = atof(o.arg); // --alt-drop
else if (c == 346) opt.flag |= MM_F_QSTRAND | MM_F_NO_INV; // --qstrand
else if (c == 314) { // --frag
else if (c == 346) opt.mask_len = mm_parse_num(o.arg); // --mask-len
else if (c == 348) opt.flag |= MM_F_QSTRAND | MM_F_NO_INV; // --qstrand
else if (c == 330) {
fprintf(stderr, "[WARNING] \033[1;31m --lj-min-ratio has been deprecated.\033[0m\n");
} else if (c == 314) { // --frag
yes_or_no(&opt, MM_F_FRAG_MODE, o.longidx, o.arg, 1);
} else if (c == 315) { // --secondary
yes_or_no(&opt, MM_F_NO_PRINT_2ND, o.longidx, o.arg, 0);
@ -240,6 +250,8 @@ int main(int argc, char *argv[])
yes_or_no(&opt, MM_F_HEAP_SORT, o.longidx, o.arg, 1);
} else if (c == 326) { // --dual
yes_or_no(&opt, MM_F_NO_DUAL, o.longidx, o.arg, 0);
} else if (c == 347) { // --rmq
yes_or_no(&opt, MM_F_RMQ, o.longidx, o.arg, 1);
} else if (c == 'S') {
opt.flag |= MM_F_OUT_CS | MM_F_CIGAR | MM_F_OUT_CS_LONG;
if (mm_verbose >= 2)
@ -247,6 +259,12 @@ int main(int argc, char *argv[])
} else if (c == 'V') {
puts(MM_VERSION);
return 0;
} else if (c == 'r') {
opt.bw = (int)mm_parse_num2(o.arg, &s);
if (*s == ',') opt.bw_long = (int)mm_parse_num2(s + 1, &s);
} else if (c == 'U') {
opt.min_mid_occ = strtol(o.arg, &s, 10);
if (*s == ',') opt.max_mid_occ = strtol(s + 1, &s, 10);
} else if (c == 'f') {
double x;
char *p;
@ -301,7 +319,7 @@ int main(int argc, char *argv[])
fprintf(fp_help, " -g NUM stop chain enlongation if there are no minimizers in INT-bp [%d]\n", opt.max_gap);
fprintf(fp_help, " -G NUM max intron length (effective with -xsplice; changing -r) [200k]\n");
fprintf(fp_help, " -F NUM max fragment length (effective with -xsr or in the fragment mode) [800]\n");
fprintf(fp_help, " -r NUM bandwidth used in chaining and DP-based alignment [%d]\n", opt.bw);
fprintf(fp_help, " -r NUM[,NUM] chaining/alignment bandwidth and long-join bandwidth [%d,%d]\n", opt.bw, opt.bw_long);
fprintf(fp_help, " -n INT minimal number of minimizers on a chain [%d]\n", opt.min_cnt);
fprintf(fp_help, " -m INT minimal chaining score (matching bases minus log gap penalty) [%d]\n", opt.min_chain_score);
// fprintf(fp_help, " -T INT SDUST threshold; 0 to disable SDUST [%d]\n", opt.sdust_thres); // TODO: this option is never used; might be buggy
@ -332,7 +350,8 @@ int main(int argc, char *argv[])
fprintf(fp_help, " --version show version number\n");
fprintf(fp_help, " Preset:\n");
fprintf(fp_help, " -x STR preset (always applied before other options; see minimap2.1 for details) []\n");
fprintf(fp_help, " - map-pb/map-ont - PacBio/Nanopore vs reference mapping\n");
fprintf(fp_help, " - map-pb/map-ont - PacBio CLR/Nanopore vs reference mapping\n");
fprintf(fp_help, " - map-hifi - PacBio HiFi reads vs reference mapping\n");
fprintf(fp_help, " - ava-pb/ava-ont - PacBio/Nanopore read overlap\n");
fprintf(fp_help, " - asm5/asm10/asm20 - asm-to-ref mapping, for ~0.1/1/5%% sequence divergence\n");
fprintf(fp_help, " - splice/splice:hq - long-read/Pacbio-CCS spliced alignment\n");
@ -389,6 +408,7 @@ int main(int argc, char *argv[])
if (mm_verbose >= 3) mm_idx_stat(mi);
if (junc_bed) mm_idx_bed_read(mi, junc_bed, 1);
if (alt_list) mm_idx_alt_read(mi, alt_list);
if (argc - (o.ind + 1) == 0) continue; // no query files
ret = 0;
if (!(opt.flag & MM_F_FRAG_MODE)) {
for (i = o.ind + 1; i < argc; ++i) {

89
map.c
View File

@ -80,49 +80,7 @@ static void collect_minimizers(void *km, const mm_mapopt_t *opt, const mm_idx_t
#define heap_lt(a, b) ((a).x > (b).x)
KSORT_INIT(heap, mm128_t, heap_lt)
typedef struct {
uint32_t n;
uint32_t q_pos, q_span;
uint32_t seg_id:31, is_tandem:1;
const uint64_t *cr;
} mm_match_t;
static mm_match_t *collect_matches(void *km, int *_n_m, int max_occ, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos)
{
int rep_st = 0, rep_en = 0, n_m;
size_t i;
mm_match_t *m;
*n_mini_pos = 0;
*mini_pos = (uint64_t*)kmalloc(km, mv->n * sizeof(uint64_t));
m = (mm_match_t*)kmalloc(km, mv->n * sizeof(mm_match_t));
for (i = 0, n_m = 0, *rep_len = 0, *n_a = 0; i < mv->n; ++i) {
const uint64_t *cr;
mm128_t *p = &mv->a[i];
uint32_t q_pos = (uint32_t)p->y, q_span = p->x & 0xff;
int t;
cr = mm_idx_get(mi, p->x>>8, &t);
if (t >= max_occ) {
int en = (q_pos >> 1) + 1, st = en - q_span;
if (st > rep_en) {
*rep_len += rep_en - rep_st;
rep_st = st, rep_en = en;
} else rep_en = en;
} else {
mm_match_t *q = &m[n_m++];
q->q_pos = q_pos, q->q_span = q_span, q->cr = cr, q->n = t, q->seg_id = p->y >> 32;
q->is_tandem = 0;
if (i > 0 && p->x>>8 == mv->a[i - 1].x>>8) q->is_tandem = 1;
if (i < mv->n - 1 && p->x>>8 == mv->a[i + 1].x>>8) q->is_tandem = 1;
*n_a += q->n;
(*mini_pos)[(*n_mini_pos)++] = (uint64_t)q_span<<32 | q_pos>>1;
}
}
*rep_len += rep_en - rep_st;
*_n_m = n_m;
return m;
}
static inline int skip_seed(int flag, uint64_t r, const mm_match_t *q, const char *qname, int qlen, const mm_idx_t *mi, int *is_self)
static inline int skip_seed(int flag, uint64_t r, const mm_seed_t *q, const char *qname, int qlen, const mm_idx_t *mi, int *is_self)
{
*is_self = 0;
if (qname && (flag & (MM_F_NO_DIAG|MM_F_NO_DUAL))) {
@ -151,10 +109,10 @@ static mm128_t *collect_seed_hits_heap(void *km, const mm_mapopt_t *opt, int max
{
int i, n_m, heap_size = 0;
int64_t j, n_for = 0, n_rev = 0;
mm_match_t *m;
mm_seed_t *m;
mm128_t *a, *heap;
m = collect_matches(km, &n_m, max_occ, mi, mv, n_a, rep_len, n_mini_pos, mini_pos);
m = mm_collect_matches(km, &n_m, qlen, max_occ, opt->max_max_occ, opt->occ_dist, mi, mv, n_a, rep_len, n_mini_pos, mini_pos);
heap = (mm128_t*)kmalloc(km, n_m * sizeof(mm128_t));
a = (mm128_t*)kmalloc(km, *n_a * sizeof(mm128_t));
@ -168,7 +126,7 @@ static mm128_t *collect_seed_hits_heap(void *km, const mm_mapopt_t *opt, int max
}
ks_heapmake_heap(heap_size, heap);
while (heap_size > 0) {
mm_match_t *q = &m[heap->y>>32];
mm_seed_t *q = &m[heap->y>>32];
mm128_t *p;
uint64_t r = heap->x;
int32_t is_self, rpos = (uint32_t)r >> 1;
@ -216,12 +174,12 @@ static mm128_t *collect_seed_hits(void *km, const mm_mapopt_t *opt, int max_occ,
int *n_mini_pos, uint64_t **mini_pos)
{
int i, n_m;
mm_match_t *m;
mm_seed_t *m;
mm128_t *a;
m = collect_matches(km, &n_m, max_occ, mi, mv, n_a, rep_len, n_mini_pos, mini_pos);
m = mm_collect_matches(km, &n_m, qlen, max_occ, opt->max_max_occ, opt->occ_dist, mi, mv, n_a, rep_len, n_mini_pos, mini_pos);
a = (mm128_t*)kmalloc(km, *n_a * sizeof(mm128_t));
for (i = 0, *n_a = 0; i < n_m; ++i) {
mm_match_t *q = &m[i];
mm_seed_t *q = &m[i];
const uint64_t *r = q->cr;
uint32_t k;
for (k = 0; k < q->n; ++k) {
@ -253,11 +211,9 @@ static mm128_t *collect_seed_hits(void *km, const mm_mapopt_t *opt, int max_occ,
static void chain_post(const mm_mapopt_t *opt, int max_chain_gap_ref, const mm_idx_t *mi, void *km, int qlen, int n_segs, const int *qlens, int *n_regs, mm_reg1_t *regs, mm128_t *a)
{
if (!(opt->flag & MM_F_ALL_CHAINS)) { // don't choose primary mapping(s)
mm_set_parent(km, opt->mask_level, *n_regs, regs, opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop);
mm_set_parent(km, opt->mask_level, opt->mask_len, *n_regs, regs, opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop);
if (n_segs <= 1) mm_select_sub(km, opt->pri_ratio, mi->k*2, opt->best_n, n_regs, regs);
else mm_select_sub_multi(km, opt->pri_ratio, 0.2f, 0.7f, max_chain_gap_ref, mi->k*2, opt->best_n, n_segs, qlens, n_regs, regs);
if (!(opt->flag & (MM_F_SPLICE|MM_F_SR|MM_F_NO_LJOIN))) // long join not working well without primary chains
mm_join_long(km, opt, qlen, n_regs, regs, a);
}
}
@ -266,7 +222,7 @@ static mm_reg1_t *align_regs(const mm_mapopt_t *opt, const mm_idx_t *mi, void *k
if (!(opt->flag & MM_F_CIGAR)) return regs;
regs = mm_align_skeleton(km, opt, mi, qlen, seq, n_regs, regs, a); // this calls mm_filter_regs()
if (!(opt->flag & MM_F_ALL_CHAINS)) { // don't choose primary mapping(s)
mm_set_parent(km, opt->mask_level, *n_regs, regs, opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop);
mm_set_parent(km, opt->mask_level, opt->mask_len, *n_regs, regs, opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop);
mm_select_sub(km, opt->pri_ratio, mi->k*2, opt->best_n, n_regs, regs);
mm_set_sam_pri(*n_regs, regs);
}
@ -317,9 +273,25 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
if (max_chain_gap_ref < opt->max_gap) max_chain_gap_ref = opt->max_gap;
} else max_chain_gap_ref = opt->max_gap;
a = mm_chain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, opt->chain_gap_scale, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km);
if (opt->flag & MM_F_RMQ) {
a = mg_lchain_rmq(opt->max_gap, opt->rmq_inner_dist, opt->bw, opt->max_chain_skip, opt->rmq_size_cap, opt->min_cnt, opt->min_chain_score,
opt->chain_gap_scale * 0.01 * mi->k, 0.0f, n_a, a, &n_regs0, &u, b->km);
} else {
a = mg_lchain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score,
opt->chain_gap_scale * 0.01 * mi->k, 0.0f, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km);
}
if (opt->max_occ > opt->mid_occ && rep_len > 0) {
if (opt->bw_long > opt->bw && (opt->flag & (MM_F_SPLICE|MM_F_SR|MM_F_NO_LJOIN)) == 0 && n_segs == 1 && n_regs0 > 1) { // re-chain/long-join for long sequences
int32_t st = (int32_t)a[0].y, en = (int32_t)a[(int32_t)u[0] - 1].y;
if (qlen_sum - (en - st) > opt->rmq_rescue_size || en - st > qlen_sum * opt->rmq_rescue_ratio) {
int32_t i;
for (i = 0, n_a = 0; i < n_regs0; ++i) n_a += (int32_t)u[i];
kfree(b->km, u);
radix_sort_128x(a, a + n_a);
a = mg_lchain_rmq(opt->max_gap, opt->rmq_inner_dist, opt->bw_long, opt->max_chain_skip, opt->rmq_size_cap, opt->min_cnt, opt->min_chain_score,
opt->chain_gap_scale * 0.01 * mi->k, 0.0f, n_a, a, &n_regs0, &u, b->km);
}
} else if (opt->max_occ > opt->mid_occ && rep_len > 0 && !(opt->flag & MM_F_RMQ)) { // re-chain, mostly for short reads
int rechain = 0;
if (n_regs0 > 0) { // test if the best chain has all the segments
int n_chained_segs = 1, max = 0, max_i = -1, max_off = -1, off = 0;
@ -339,7 +311,8 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
kfree(b->km, mini_pos);
if (opt->flag & MM_F_HEAP_SORT) a = collect_seed_hits_heap(b->km, opt, opt->max_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos);
else a = collect_seed_hits(b->km, opt, opt->max_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos);
a = mm_chain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, opt->chain_gap_scale, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km);
a = mg_lchain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score,
opt->chain_gap_scale * 0.01 * mi->k, 0.0f, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km);
}
}
b->frag_gap = max_chain_gap_ref;
@ -370,7 +343,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
seg = mm_seg_gen(b->km, hash, n_segs, qlens, n_regs0, regs0, n_regs, regs, a); // split fragment chain to separate segment chains
free(regs0);
for (i = 0; i < n_segs; ++i) {
mm_set_parent(b->km, opt->mask_level, n_regs[i], regs[i], opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop); // update mm_reg1_t::parent
mm_set_parent(b->km, opt->mask_level, opt->mask_len, n_regs[i], regs[i], opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop); // update mm_reg1_t::parent
regs[i] = align_regs(opt, mi, b->km, qlens[i], seqs[i], &n_regs[i], regs[i], seg[i].a);
mm_set_mapq(b->km, n_regs[i], regs[i], opt->min_chain_score, opt->a, rep_len, is_sr);
}
@ -514,7 +487,7 @@ static void merge_hits(step_t *s)
}
}
mm_hit_sort(km, &s->n_reg[k], s->reg[k], opt->alt_drop);
mm_set_parent(km, opt->mask_level, s->n_reg[k], s->reg[k], opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop);
mm_set_parent(km, opt->mask_level, opt->mask_len, s->n_reg[k], s->reg[k], opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop);
if (!(opt->flag & MM_F_ALL_CHAINS)) {
mm_select_sub(km, opt->pri_ratio, s->p->mi->k*2, opt->best_n, &s->n_reg[k], s->reg[k]);
mm_set_sam_pri(s->n_reg[k], s->reg[k]);

View File

@ -36,8 +36,9 @@
#define MM_F_NO_END_FLT 0x10000000
#define MM_F_HARD_MLEVEL 0x20000000
#define MM_F_SAM_HIT_ONLY 0x40000000
#define MM_F_QSTRAND (0x80000000LL)
#define MM_F_NO_INV (0x100000000LL)
#define MM_F_RMQ 0x80000000LL
#define MM_F_QSTRAND (0x100000000LL)
#define MM_F_NO_INV (0x200000000LL)
#define MM_I_HPC 0x1
#define MM_I_NO_SEQ 0x2
@ -47,6 +48,18 @@
#define MM_MAX_SEG 255
#define MM_CIGAR_MATCH 0
#define MM_CIGAR_INS 1
#define MM_CIGAR_DEL 2
#define MM_CIGAR_N_SKIP 3
#define MM_CIGAR_SOFTCLIP 4
#define MM_CIGAR_HARDCLIP 5
#define MM_CIGAR_PADDING 6
#define MM_CIGAR_EQ_MATCH 7
#define MM_CIGAR_X_MISMATCH 8
#define MM_CIGAR_STR "MIDNSHP=XB"
#ifdef __cplusplus
extern "C" {
#endif
@ -115,22 +128,22 @@ typedef struct {
int max_qlen; // max query length
int bw; // bandwidth
int bw, bw_long; // bandwidth
int max_gap, max_gap_ref; // break a chain if there are no minimizers in a max_gap window
int max_frag_len;
int max_chain_skip, max_chain_iter;
int min_cnt; // min number of minimizers on each chain
int min_chain_score; // min chaining score
float chain_gap_scale;
int rmq_size_cap, rmq_inner_dist;
int rmq_rescue_size;
float rmq_rescue_ratio;
float mask_level;
int mask_len;
float pri_ratio;
int best_n; // top best_n chains are subjected to DP alignment
int max_join_long, max_join_short;
int min_join_flank_sc;
float min_join_flank_ratio;
float alt_drop;
int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties
@ -147,9 +160,9 @@ typedef struct {
int pe_ori, pe_bonus;
float mid_occ_frac; // only used by mm_mapopt_update(); see below
int32_t min_mid_occ;
int32_t min_mid_occ, max_mid_occ;
int32_t mid_occ; // ignore seeds with occurrences above this threshold
int32_t max_occ;
int32_t max_occ, max_max_occ, occ_dist;
int64_t mini_batch_size; // size of a batch of query bases to process in parallel
int64_t max_sw_mat;

View File

@ -1,4 +1,4 @@
.TH minimap2 1 "4 May 2019" "minimap2-2.17 (r941)" "Bioinformatics tools"
.TH minimap2 1 "6 July 2021" "minimap2-2.21 (r1071)" "Bioinformatics tools"
.SH NAME
.PP
minimap2 - mapping and alignment between collections of DNA sequences
@ -145,22 +145,32 @@ or
.B -xsr
mode, which sets the threshold for a second round of seeding.
.TP
.BI --min-occ-floor \ INT
Force minimap2 to always use k-mers occurring
.BI -U \ INT1 [, INT2 ]
Lower and upper bounds of k-mer occurrences [10,1000000]. The final k-mer occurrence threshold is
.RI max{ INT1 ,\ min{ INT2 ,
.BR -f }}.
This option prevents excessively small or large
.B -f
estimated from the input reference. It deprecates
.B --min-occ-floor
in earlier versions of minimap2.
.TP
.BI -e \ INT
Sample a high-frequency minimizer every
.I INT
times or less [0]. In effect, the max occurrence threshold is set to
the
.RI max{ INT ,
.BR -f }.
basepairs [500].
.TP
.BI -g \ INT
.BI -g \ NUM
Stop chain enlongation if there are no minimizers within
.IR INT -bp
[10000].
.IR NUM -bp
[10k].
.TP
.BI -r \ INT
Bandwidth used in chaining and DP-based alignment [500]. This option
approximately controls the maximum gap size.
.BI -r \ NUM1 [, NUM2 ]
Bandwidth for chaining and base alignment [500,20k].
.I NUM1
is used for initial chaining and alignment extension;
.I NUM2
for RMQ-based re-chaining and closing gaps in alignments.
.TP
.BI -n \ INT
Discard chains consisting of
@ -234,10 +244,21 @@ Mark as secondary a chain that overlaps with a better chain by
.I FLOAT
or more of the shorter chain [0.5]
.TP
.BR --rmq = no | yes
Use the minigraph chaining algorithm [no]. The minigraph algorithm is better
for aligning contigs through long INDELs.
.TP
.B --hard-mask-level
Honor option
.B -M
and disable a heurstic to save unmapped subsequences.
and disable a heurstic to save unmapped subsequences and disables
.BR --mask-len .
.TP
.BI --mask-len \ NUM
Keep an alignment if dropping it leaves an unaligned region on query longer than
.IR INT
[inf]. Effective without
.BR --hard-mask-level .
.TP
.BI --max-chain-skip \ INT
A heuristics that stops chaining early [25]. Minimap2 uses dynamic programming
@ -261,10 +282,6 @@ Disable the long gap patching heuristic. When this option is applied, the
maximum alignment gap is mostly controlled by
.BR -r .
.TP
.BI --lj-min-ratio \ FLOAT
Fraction of query sequence length required to bridge a long gap [0.5]. A
smaller value helps to recover longer gaps, at the cost of more false gaps.
.TP
.B --splice
Enable the splice alignment mode.
.TP
@ -384,7 +401,7 @@ BED12 file can be converted from GTF/GFF3 with `paftools.js gff2bed anno.gtf'
.BR --junc-bonus \ INT
Score bonus for a splice donor or acceptor found in annotation (effective with
.BR --junc-bed )
[0].
[9].
.TP
.BI --end-seed-pen \ INT
Drop a terminal anchor if
@ -405,7 +422,7 @@ alignment.
.BI --cap-sw-mem \ NUM
Skip alignment if the DP matrix size is above
.IR NUM .
Set 0 to disable [0].
Set 0 to disable [100m].
.SS Input/output options
.TP 10
.B -a
@ -516,60 +533,47 @@ Available
.I STR
are:
.RS
.TP 8
.B map-pb
PacBio/Oxford Nanopore read to reference mapping
.RB ( -Hk19 )
.TP
.TP 10
.B map-ont
Slightly more sensitive for Oxford Nanopore to reference mapping
.RB ( -k15 ).
For PacBio reads, HPC minimizers consistently leads to faster performance and
more sensitive results in comparison to normal minimizers. For Oxford Nanopore
data, normal minimizers are better, though not much. The effectiveness of HPC
is determined by the sequencing error mode.
Align noisy long reads of ~10% error rate to a reference genome. This is the
default mode.
.TP
.B map-hifi
Align PacBio high-fidelity (HiFi) reads to a reference genome
.RB ( -k19
.B -w19 -U50,500 -g10k -A1 -B4 -O6,26 -E2,1
.BR -s200 ).
.TP
.B map-pb
Align older PacBio continuous long (CLR) reads to a reference genome
.RB ( -Hk19 ).
.TP
.B asm5
Long assembly to reference mapping
.RB ( -k19
.B -w19 -A1 -B19 -O39,81 -E3,1 -s200 -z200 -N50
.BR --min-occ-floor=100 ).
.B -w19 -U50,500 --rmq -r100k -g10k -A1 -B19 -O39,81 -E3,1 -s200 -z200
.BR -N50 ).
Typically, the alignment will not extend to regions with 5% or higher sequence
divergence. Only use this preset if the average divergence is far below 5%.
.TP
.B asm10
Long assembly to reference mapping
.RB ( -k19
.B -w19 -A1 -B9 -O16,41 -E2,1 -s200 -z200 -N50
.BR --min-occ-floor=100 ).
.B -w19 -U50,500 --rmq -r100k -g10k -A1 -B9 -O16,41 -E2,1 -s200 -z200
.BR -N50 ).
Up to 10% sequence divergence.
.TP
.B asm20
Long assembly to reference mapping
.RB ( -k19
.B -w10 -A1 -B4 -O6,26 -E2,1 -s200 -z200 -N50
.BR --min-occ-floor=100 ).
.B -w10 -U50,500 --rmq -r100k -g10k -A1 -B4 -O6,26 -E2,1 -s200 -z200
.BR -N50 ).
Up to 20% sequence divergence.
.TP
.B ava-pb
PacBio all-vs-all overlap mapping
.RB ( -Hk19
.B -Xw5 -m100 -g10000 --max-chain-skip
.BR 25 ).
.TP
.B ava-ont
Oxford Nanopore all-vs-all overlap mapping
.RB ( -k15
.B -Xw5 -m100 -g10000 -r2000 --max-chain-skip
.BR 25 ).
Similarly, the major difference from
.B ava-pb
is that this preset is not using HPC minimizers.
.TP
.B splice
Long-read spliced alignment
.RB ( -k15
.B -w5 --splice -g2000 -G200k -A1 -B2 -O2,32 -E1,0 -C9 -z200 -ub --junc-bonus=9
.B -w5 --splice -g2k -G200k -A1 -B2 -O2,32 -E1,0 -C9 -z200 -ub --junc-bonus=9 --cap-sw-mem=0
.BR --splice-flank=yes ).
In the splice mode, 1) long deletions are taken as introns and represented as
the
@ -588,9 +592,21 @@ Long-read splice alignment for PacBio CCS reads
.B sr
Short single-end reads without splicing
.RB ( -k21
.B -w11 --sr --frag=yes -A2 -B8 -O12,32 -E2,1 -r50 -p.5 -N20 -f1000,5000 -n2 -m20
.B -s40 -g200 -2K50m --heap-sort=yes
.B -w11 --sr --frag=yes -A2 -B8 -O12,32 -E2,1 -r100 -p.5 -N20 -f1000,5000 -n2 -m20
.B -s40 -g100 -2K50m --heap-sort=yes
.BR --secondary=no ).
.TP
.B ava-pb
PacBio CLR all-vs-all overlap mapping
.RB ( -Hk19
.B -Xw5 -e0
.BR -m100 ).
.TP
.B ava-ont
Oxford Nanopore all-vs-all overlap mapping
.RB ( -k15
.B -Xw5 -e0 -m100
.BR -r2k ).
.RE
.SS Miscellaneous options
.TP 10

1
misc.c
View File

@ -159,3 +159,4 @@ KRADIX_SORT_INIT(128x, mm128_t, sort_key_128x, 8)
KRADIX_SORT_INIT(64, uint64_t, sort_key_64, 8)
KSORT_INIT_GENERIC(uint32_t)
KSORT_INIT_GENERIC(uint64_t)

View File

@ -1,6 +1,6 @@
#!/usr/bin/env k8
var paftools_version = '2.17-r982-dirty';
var paftools_version = '2.21-r1071';
/*****************************
***** Library functions *****
@ -962,12 +962,13 @@ function paf_asmgene(args)
function paf_stat(args)
{
var c, gap_out_len = null;
while ((c = getopt(args, "l:")) != null)
var c, gap_out_len = null, count_err = false;
while ((c = getopt(args, "cl:")) != null)
if (c == 'l') gap_out_len = parseInt(getopt.arg);
else if (c == 'c') count_err = true;
if (getopt.ind == args.length) {
print("Usage: paftools.js stat [-l gapOutLen] <in.sam>|<in.paf>");
print("Usage: paftools.js stat [-c] [-l gapOutLen] <in.sam>|<in.paf>");
exit(1);
}
@ -998,7 +999,7 @@ function paf_stat(args)
if (line.charAt(0) != '@') {
var t = line.split("\t", 12);
var m, rs, cigar = null, is_pri = false, is_sam = false, is_rev = false, tname = null;
var atlen = null, aqlen, qs, qe, mapq, ori_qlen;
var atlen = null, aqlen, qs, qe, mapq, ori_qlen, NM = null;
if (t.length < 2) continue;
if (t[4] == '+' || t[4] == '-' || t[4] == '*') { // PAF
if (t[4] == '*') continue; // unmapped
@ -1006,6 +1007,8 @@ function paf_stat(args)
++n_2nd;
continue;
}
if ((m = /\tNM:i:(\d+)/.exec(line)) != null)
NM = parseInt(m[1]);
if ((m = /\tcg:Z:(\S+)/.exec(line)) != null)
cigar = m[1];
if (cigar == null) {
@ -1027,6 +1030,8 @@ function paf_stat(args)
++n_2nd;
continue;
}
if ((m = /\tNM:i:(\d+)/.exec(line)) != null)
NM = parseInt(m[1]);
cigar = t[5];
tname = t[2];
rs = parseInt(t[3]) - 1;
@ -1045,11 +1050,13 @@ function paf_stat(args)
++n_seq, last = t[0];
}
var M = 0, tl = 0, ql = 0, clip = [0, 0], n_cigar = 0, sclip = 0;
var n_gapo = 0, n_gap_all = 0, l_match = 0;
while ((m = re.exec(cigar)) != null) {
var l = parseInt(m[1]);
++n_cigar;
if (m[2] == 'M' || m[2] == '=' || m[2] == 'X') {
tl += l, ql += l, M += l;
l_match += l;
} else if (m[2] == 'I' || m[2] == 'D') {
var type;
if (l < 50) type = 0;
@ -1062,6 +1069,7 @@ function paf_stat(args)
else tl += l, ++n_gap[1][type];
if (gap_out_len != null && l >= gap_out_len)
print(t[0], ql, is_rev? '-' : '+', tname, rs + tl, m[2], l);
++n_gapo, n_gap_all += l;
} else if (m[2] == 'N') {
tl += l;
} else if (m[2] == 'S') {
@ -1079,6 +1087,12 @@ function paf_stat(args)
qs = clip[is_rev? 1 : 0], qe = qs + ql;
ori_qlen = clip[0] + ql + clip[1];
}
if (count_err && NM != null) {
var n_mm = NM - n_gap_all;
if (n_mm < 0) warn("WARNING: NM is smaller than the number of gaps at line " + lineno);
if (n_mm < 0) n_mm = 0;
print(t[0], ori_qlen, t[11], ori_qlen - (qe - qs), NM, l_match + n_gap_all, n_mm + n_gapo, l_match + n_gapo);
}
regs.push([qs, qe]);
last_qlen = ori_qlen;
}
@ -1091,7 +1105,7 @@ function paf_stat(args)
file.close();
buf.destroy();
if (gap_out_len == null) {
if (gap_out_len == null && !count_err) {
print("Number of mapped sequences: " + n_seq);
print("Number of primary alignments: " + n_pri);
print("Number of secondary alignments: " + n_2nd);
@ -1405,7 +1419,7 @@ function paf_view(args)
var s_ref = new Bytes(), s_qry = new Bytes(), s_mid = new Bytes(); // these are used to show padded alignment
var re_cs = /([:=\-\+\*])(\d+|[A-Za-z]+)/g;
var re_cg = /(\d+)([MIDNSH])/g;
var re_cg = /(\d+)([MIDNSHP=X])/g;
var buf = new Bytes();
var file = args[getopt.ind] == "-"? new File() : new File(args[getopt.ind]);
@ -1885,7 +1899,7 @@ function paf_splice2bed(args)
a.length = 0;
}
var re = /(\d+)([MIDNSH])/g;
var re = /(\d+)([MIDNSHP=X])/g;
var c, fmt = "bed", fn_name_conv = null, keep_multi = false;
while ((c = getopt(args, "f:n:m")) != null) {
if (c == 'f') fmt = getopt.arg;
@ -2355,28 +2369,43 @@ function paf_junceval(args)
file = getopt.ind+1 >= args.length || args[getopt.ind+1] == '-'? new File() : new File(args[getopt.ind+1]);
var last_qname = null;
var re_cigar = /(\d+)([MIDNSHX=])/g;
var re_cigar = /(\d+)([MIDNSHP=X])/g;
while (file.readline(buf) >= 0) {
var m, t = buf.toString().split("\t");
var ctg_name = null, cigar = null, pos = null, qname = t[0];
if (t[0].charAt(0) == '@') continue;
if (chr_only && !/^(chr)?([0-9]+|X|Y)$/.test(t[2])) continue;
var flag = parseInt(t[1]);
if (flag&0x100) continue;
if (first_only && last_qname == t[0]) continue;
if (t[2] == '*') {
if (t[4] == '+' || t[4] == '-' || t[4] == '*') { // PAF
ctg_name = t[5], pos = parseInt(t[7]);
var type = 'P';
for (i = 12; i < t.length; ++i) {
if ((m = /^(tp:A|cg:Z):(\S+)/.exec(t[i])) != null) {
if (m[1] == 'tp:A') type = m[2];
else cigar = m[2];
}
}
if (type == 'S') continue; // secondary
} else { // SAM
ctg_name = t[2], pos = parseInt(t[3]) - 1, cigar = t[5];
var flag = parseInt(t[1]);
if (flag&0x100) continue; // secondary
}
if (chr_only && !/^(chr)?([0-9]+|X|Y)$/.test(ctg_name)) continue;
if (first_only && last_qname == qname) continue;
if (ctg_name == '*') { // unmapped
++n_unmapped;
continue;
} else {
++n_pri;
if (last_qname != t[0]) {
if (last_qname != qname) {
++n_mapped;
last_qname = t[0];
last_qname = qname;
}
}
var pos = parseInt(t[3]) - 1, intron = [];
while ((m = re_cigar.exec(t[5])) != null) {
var intron = [];
while ((m = re_cigar.exec(cigar)) != null) {
var len = parseInt(m[1]), op = m[2];
if (op == 'N') {
intron.push([pos, pos + len]);
@ -2389,7 +2418,7 @@ function paf_junceval(args)
}
n_splice += intron.length;
var chr = anno[t[2]];
var chr = anno[ctg_name];
if (chr != null) {
for (var i = 0; i < intron.length; ++i) {
var o = Interval.find_ovlp(chr, intron[i][0], intron[i][1]);
@ -2413,12 +2442,12 @@ function paf_junceval(args)
x += '(' + o[j][0] + "," + o[j][1] + ')';
}
x += ']';
print(type, t[0], i+1, t[2], intron[i][0], intron[i][1], x);
print(type, qname, i+1, ctg_name, intron[i][0], intron[i][1], x);
}
} else {
++n_splice_novel;
if (print_ovlp)
print('N', t[0], i+1, t[2], intron[i][0], intron[i][1]);
print('N', qname, i+1, ctg_name, intron[i][0], intron[i][1]);
}
}
} else {
@ -2512,6 +2541,395 @@ function paf_ov_eval(args)
print((100 * (1 - n_missing / n_ovlp)).toFixed(2) + "% sensitivity");
}
function paf_vcfstat(args)
{
var c, ts = { "AG":1, "GA":1, "CT":1, "TC":1 };
while ((c = getopt(args, "")) != null) {
}
var buf = new Bytes();
var file = args.length == getopt.ind? new File() : new File(args[getopt.ind]);
var x = { sub:0, ts:0, tv:0, ins:0, del:0, ins1:0, del1:0, ins2:0, del2:0, ins50:0, del50:0, ins1k:0, del1k:0, ins7k:0, del7k:0, insinf:0, delinf:0 };
while (file.readline(buf) >= 0) {
var t = buf.toString().split("\t");
if (t[0][0] == '#') continue;
var alt = t[4].split(",");
var ref = t[3];
for (var i = 0; i < alt.length; ++i) {
var a = alt[i];
if (a[0] == '<' || a[1] == '>') continue;
var l = ref.length < a.length? ref.length : a.length;
for (var j = 0; j < l; ++j) {
if (ref[j] != a[j]) {
++x.sub;
if (ts[ref[j] + a[j]]) ++x.ts;
else ++x.tv;
}
}
var d = a.length - ref.length;
if (d > 0) {
++x.ins;
if (d == 1) ++x.ins1;
else if (d == 2) ++x.ins2;
else if (d < 50) ++x.ins50;
else if (d < 1000) ++x.ins1k;
else if (d < 7000) ++x.ins7k;
else ++x.insinf;
} else if (d < 0) {
d = -d;
++x.del;
if (d == 1) ++x.del1;
else if (d == 2) ++x.del2;
else if (d < 50) ++x.del50;
else if (d < 1000) ++x.del1k;
else if (d < 7000) ++x.del7k;
else ++x.delinf;
}
}
}
file.close();
buf.destroy();
print("# substitutions: " + x.sub);
print("ts/tv: " + (x.ts / x.tv).toFixed(3));
print("# insertions: " + x.ins);
print("# 1bp insertions: " + x.ins1);
print("# 2bp insertions: " + x.ins2);
print("# [3,50) insertions: " + x.ins50);
print("# [50,1000) insertions: " + x.ins1k);
print("# [1000,7000) insertions: " + x.ins7k);
print("# >=7000 insertions: " + x.insinf);
print("# deletions: " + x.del);
print("# 1bp deletions: " + x.del1);
print("# 2bp deletions: " + x.del2);
print("# [3,50) deletions: " + x.del50);
print("# [50,1000) deletions: " + x.del1k);
print("# [1000,7000) deletions: " + x.del7k);
print("# >=7000 deletions: " + x.delinf);
}
function paf_parseNum(s) {
var m, x = null;
if ((m = /^(\d*\.?\d*)([mMgGkK]?)/.exec(s)) != null) {
x = parseFloat(m[1]);
if (m[2] == 'k' || m[2] == 'K') x *= 1000;
else if (m[2] == 'm' || m[2] == 'M') x *= 1000000;
else if (m[2] == 'g' || m[2] == 'G') x *= 1000000000;
}
return Math.floor(x + .499);
}
function paf_misjoin(args)
{
var c, min_seg_len = 1000000, max_gap = 1000000, fn_cen = null, show_long = false, show_err = false, cen_ratio = 0.5;
var n_diff = [0, 0], n_gap = [0, 0], n_inv = [0, 0], n_inv_end = [0, 0];
while ((c = getopt(args, "l:g:c:per:")) != null) {
if (c == 'l') min_seg_len = paf_parseNum(getopt.arg);
else if (c == 'g') max_gap = paf_parseNum(getopt.arg);
else if (c == 'c') fn_cen = getopt.arg;
else if (c == 'r') cen_ratio = parseFloat(getopt.arg);
else if (c == 'p') show_long = true;
else if (c == 'e') show_err = true;
}
if (args.length == getopt.ind) {
print("Usage: paftools.js misjoin [options] <in.paf>");
print("Options:");
print(" -c FILE BED for centromeres []");
print(" -r FLOAT count a centromeric event if overlap ratio > FLOAT [" + cen_ratio + "]");
print(" -l NUM min alignment block length [1m]");
print(" -g NUM max gap size [1m]");
print(" -e output misjoins not involving centromeres");
print(" -p output long alignment blocks for debugging");
return;
}
var cen = {};
var file, buf = new Bytes();
if (fn_cen != null) {
file = new File(fn_cen);
while (file.readline(buf) >= 0) {
var t = buf.toString().split("\t");
if (cen[t[0]] == null) cen[t[0]] = [];
cen[t[0]].push([parseInt(t[1]), parseInt(t[2])]);
}
file.close();
}
function test_cen(cen, chr, st, en) {
var b = cen[chr], len = 0;
if (b == null) return false;
for (var j = 0; j < b.length; ++j)
if (b[j][0] < en && b[j][1] > st) {
var s = b[j][0] > st? b[j][0] : st;
var e = b[j][1] < en? b[j][1] : en;
len += e - s;
}
return len < (en - st) * cen_ratio? false : true;
}
function process(a) {
var k = 0;
for (var i = 0; i < a.length; ++i) {
for (var j = 1; j <= 3; ++j) a[i][j] = parseInt(a[i][j]);
for (var j = 6; j <= 11; ++j) a[i][j] = parseInt(a[i][j]);
if (a[i][10] >= min_seg_len) a[k++] = a[i];
}
a.length = k;
if (a.length == 1) return;
a = a.sort(function(x,y){return x[2]-y[2]});
if (show_long) for (var i = 0; i < a.length; ++i) print(a[i].join("\t"));
for (var i = 1; i < a.length; ++i) {
var ov = [false, false];
ov[0] = test_cen(cen, a[i-1][5], a[i-1][7], a[i-1][8]);
ov[1] = test_cen(cen, a[i][5], a[i][7], a[i][8]);
if (a[i-1][5] != a[i][5]) { // different chr
if (ov[0] || ov[1]) ++n_diff[1];
else if (show_err) {
print("J", a[i-1].slice(0, 12).join("\t"));
print("J", a[i].slice(0, 12).join("\t"));
}
++n_diff[0];
} else if (a[i-1][4] == a[i][4]) { // a gap
var dq = a[i][2] - a[i-1][3];
var dr = a[i][4] == '+'? a[i][7] - a[i-1][8] : a[i-1][7] - a[i][8];
var gap = dr > dq? dr - dq : dq - dr;
if (gap > max_gap) {
if (ov[0] || ov[1]) ++n_gap[1];
else if (show_err) {
print("G", a[i-1].slice(0, 12).join("\t"));
print("G", a[i].slice(0, 12).join("\t"));
}
++n_gap[0];
}
} else if (i + 1 < a.length && a[i+1][4] == a[i-1][4]) { // bracketed inversion
if (ov[0] || ov[1]) ++n_inv[1];
else if (show_err) {
print("M", a[i-1].slice(0, 12).join("\t"));
print("M", a[i].slice(0, 12).join("\t"));
print("M", a[i+1].slice(0, 12).join("\t"));
}
++n_inv[0];
++i;
} else { // hanging inversion
if (ov[0] || ov[1]) ++n_inv_end[1];
++n_inv_end[0];
}
}
}
file = args[getopt.ind] == "-"? new File() : new File(args[getopt.ind]);
var a = [];
while (file.readline(buf) >= 0) {
var t = buf.toString().split("\t");
if (a.length > 0 && a[0][0] != t[0]) {
process(a);
a.length = 0;
}
a.push(t);
}
if (a.length > 0) process(a);
file.close();
buf.destroy();
print("# inter-chromosomal misjoins: " + n_diff.join(","));
print("# intra-chromosomal gaps: " + n_gap.join(","));
print("# candidate inversions in the middle: " + n_inv.join(","));
print("# candidate inversions at contig ends: " + n_inv_end.join(","));
}
function _paf_get_alen(t)
{
var svlen = null, alen = null;
if ((m = /(^|;)SVLEN=(-?\d+)/.exec(t[7])) != null)
svlen = parseInt(m[2]);
var s = t[4].split(",");
var min_abs_diff = 1<<30, max_abs_diff = 0;
if (svlen != null && svlen != 0)
alen = svlen, min_abs_diff = max_abs_diff = svlen > 0? svlen : -svlen;
var rlen = t[3].length;
for (var i = 0; i < s.length; ++i) {
if (/^<\S+>$/.test(s[i])) continue;
var diff = s[i].length - rlen;
var abs_diff = diff > 0? diff : -diff;
min_abs_diff = min_abs_diff < abs_diff? min_abs_diff : abs_diff;
if (max_abs_diff < abs_diff)
max_abs_diff = abs_diff, alen = diff;
}
return [alen, min_abs_diff, max_abs_diff];
}
function paf_sveval(args)
{
var c, min_flt = 30, min_size = 50, max_size = 100000, win_size = 500, print_err = false, print_match = false, bed_fn = null;
var len_diff_ratio = 0.5;
while ((c = getopt(args, "f:i:x:w:er:pd:")) != null) {
if (c == 'f') min_flt = paf_parseNum(getopt.arg);
else if (c == 'i') min_size = paf_parseNum(getopt.arg);
else if (c == 'x') max_size = paf_parseNum(getopt.arg);
else if (c == 'w') win_size = paf_parseNum(getopt.arg);
else if (c == 'd') len_diff_ratio = parseFloat(getopt.arg);
else if (c == 'r') bed_fn = getopt.arg;
else if (c == 'e') print_err = true;
else if (c == 'p') print_match = true;
}
if (args.length - getopt.ind < 2) {
print("Usage: paftools.js sveval [options] <base.vcf> <call.vcf>");
print("Options:");
print(" -r FILE confident region in BED []");
print(" -f INT min length to discard [" + min_flt + "]");
print(" -i INT min SV length [" + min_size + "]");
print(" -x INT max SV length [" + max_size + "]");
print(" -w INT fuzzy windown size [" + win_size + "]");
print(" -d FLOAT max allele diff if there is a single allele in the window [" + len_diff_ratio + "]");
print(" -e print errors");
return;
}
function read_bed(fn) {
var buf = new Bytes();
var file = new File(fn);
var bed = {};
while (file.readline(buf) >= 0) {
var t = buf.toString().split("\t");
if (bed[t[0]] == null) bed[t[0]] = [];
bed[t[0]].push([parseInt(t[1]), parseInt(t[2])]);
}
file.close();
buf.destroy();
for (var x in bed) {
Interval.sort(bed[x]);
Interval.merge(bed[x]);
Interval.index_end(bed[x]);
}
return bed;
}
var bed = bed_fn != null? read_bed(bed_fn) : null;
function read_vcf(fn, bed) {
var buf = new Bytes();
var file = new File(fn);
var v = {};
while (file.readline(buf) >= 0) {
var m, t = buf.toString().split("\t");
if (t[0][0] == '#') continue;
if (bed != null && bed[t[0]] == null) continue;
if (t[4] == '<INV>' || t[4] == '<INVDUP>') continue; // no inversion
if (/[\[\]]/.test(t[4])) continue; // no break points
var st = parseInt(t[1]) - 1, en = st + t[3].length;
// parse svlen
var b = _paf_get_alen(t), svlen = b[0];
var abslen = svlen == null? 0 : svlen > 0? svlen : -svlen;
if (abslen < min_flt || abslen > max_size) continue;
// update end
if ((m = /(^|;)END=(\d+)/.exec(t[7])) != null)
en = parseInt(m[2]);
else if (svlen != null && svlen < 0)
en = st + (-svlen);
if (en < st) en = st;
if (st == en) --st, ++en;
if (bed != null && Interval.find_ovlp(bed[t[0]], st, en).length == 0) continue;
// insert
if (v[t[0]] == null) v[t[0]] = [];
v[t[0]].push([st, en, svlen, abslen]);
}
file.close();
buf.destroy();
for (var x in v) {
Interval.sort(v[x]);
Interval.index_end(v[x]);
}
return v;
}
function compare_vcf(v0, v1, label) {
var m = 0, n = 0;
for (var x in v1) {
var a1 = v1[x], a0 = v0[x];
for (var i = 0; i < a1.length; ++i) {
if (a1[i][3] < min_size) continue;
++n;
if (a0 == null) continue;
var ws = win_size + (a1[i][3]>>1);
var st = a1[i][0] > ws? a1[i][0] - ws : 0;
b = Interval.find_ovlp(a0, st, a1[i][1] + ws);
var n_ins = 0, n_del = 0, sv_del = null, sv_ins = null;
for (var j = 0; j < b.length; ++j) {
if (b[j][2] < 0) ++n_del, sv_del = -b[j][2];
else if (b[j][2] > 0) ++n_ins, sv_ins = b[j][2];
if (print_match)
print("MA", x, a1[i].slice(0, 3).join("\t"), b[j].slice(0, 3).join("\t"));
}
var match = false;
if (a1[i][2] > 0) { // insertion
if (n_ins == 1) {
var diff = sv_ins - a1[i][3];
if (diff < 0) diff = -diff;
if (diff < min_size || diff / a1[i][3] < len_diff_ratio)
match = true;
} else if (n_ins > 1) match = true; // multiple insertions; ambiguous
} else if (a1[i][2] < 0) {
if (n_del == 1) { // deletion
var diff = sv_del - a1[i][3];
if (diff < 0) diff = -diff;
if (diff < min_size || diff / a1[i][3] < len_diff_ratio)
match = true;
} else if (n_del > 1) match = true; // multiple deletions; ambiguous
}
if (match) ++m;
else if (print_err) {
if ((a1[i][2] > 0 && n_ins > 0) || (a1[i][2] < 0 && n_del > 0))
print("MM", x, a1[i].slice(0, 3).join("\t"));
print(label, x, a1[i].slice(0, 3).join("\t"));
}
}
}
return [n, m];
}
var v_base = read_vcf(args[getopt.ind+0], bed);
var v_call = read_vcf(args[getopt.ind+1], bed);
var fn = compare_vcf(v_call, v_base, 'FN');
var fp = compare_vcf(v_base, v_call, 'FP');
print('SN', fn[0], fn[1], (fn[1] / fn[0]).toFixed(6));
print('PC', fp[0], fp[1], (fp[1] / fp[0]).toFixed(6));
print('F1', ((fn[1] / fn[0] + fp[1] / fp[0]) / 2).toFixed(6));
}
function paf_vcfsel(args)
{
var c, min_l = 0, max_l = 1<<30;
while ((c = getopt(args, "l:L:")) != null) {
if (c == 'l') min_l = parseInt(getopt.arg);
else if (c == 'L') max_l = parseInt(getopt.arg);
}
var buf = new Bytes();
if (getopt.ind == args.length) {
print("Usage: paftools.js vcfsel [options] <in.vcf>");
return 1;
}
var file = args[getopt.ind] == "-"? new File() : new File(args[getopt.ind]);
while (file.readline(buf) >= 0) {
var m, line = buf.toString();
if (line[0] == '#') {
print(line);
continue;
}
var t = line.split("\t");
var st = parseInt(t[1]), en = st + t[3].length - 1;
if ((m = /(^|;)END=(\d+)/.exec(t[7])) != null)
en = parseInt(m[2]);
if (en < st) {
warn("END is smaller than POS: " + en + " < " + st);
en = st;
}
var b = _paf_get_alen(t);
var alen = b[0], min_abs_diff = b[1], max_abs_diff = b[2];
if (max_abs_diff < min_l || min_abs_diff > max_l)
continue;
print(line);
}
file.close();
buf.destroy();
}
/*************************
***** main function *****
*************************/
@ -2529,10 +2947,13 @@ function main(args)
print("");
print(" stat collect basic mapping information in PAF/SAM");
print(" asmstat collect basic assembly information");
print(" asmgene evaluate gene completeness (EXPERIMENTAL)");
print(" asmgene evaluate gene completeness");
print(" misjoin evaluate large-scale misjoins");
print(" liftover simplistic liftOver");
print(" call call variants from asm-to-ref alignment with the cs tag");
print(" bedcov compute the number of bases covered");
print(" vcfstat VCF statistics");
print(" sveval compare two SV callsets in VCF");
print(" version print paftools.js version");
print("");
print(" mapeval evaluate mapping accuracy using mason2/PBSIM-simulated FASTQ");
@ -2552,6 +2973,7 @@ function main(args)
else if (cmd == 'stat') paf_stat(args);
else if (cmd == 'asmstat') paf_asmstat(args);
else if (cmd == 'asmgene') paf_asmgene(args);
else if (cmd == 'misjoin') paf_misjoin(args);
else if (cmd == 'liftover' || cmd == 'liftOver') paf_liftover(args);
else if (cmd == 'vcfpair') paf_vcfpair(args);
else if (cmd == 'call') paf_call(args);
@ -2561,6 +2983,9 @@ function main(args)
else if (cmd == 'pbsim2fq') paf_pbsim2fq(args);
else if (cmd == 'junceval') paf_junceval(args);
else if (cmd == 'ov-eval') paf_ov_eval(args);
else if (cmd == 'vcfstat') paf_vcfstat(args);
else if (cmd == 'sveval') paf_sveval(args);
else if (cmd == 'vcfsel') paf_vcfsel(args);
else if (cmd == 'version') print(paftools_version);
else throw Error("unrecognized command: " + cmd);
}

View File

@ -36,6 +36,14 @@
extern "C" {
#endif
typedef struct {
uint32_t n;
uint32_t q_pos;
uint32_t q_span:31, flt:1;
uint32_t seg_id:31, is_tandem:1;
const uint64_t *cr;
} mm_seed_t;
typedef struct {
int n_u, n_a;
uint64_t *u;
@ -52,6 +60,8 @@ uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk);
void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p);
mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int max_max_occ, int dist, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos);
int mm_write_sam_hdr(const mm_idx_t *mi, const char *rg, const char *ver, int argc, char *argv[]);
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag);
void mm_write_paf3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag, int rep_len);
@ -63,20 +73,25 @@ void mm_idxopt_init(mm_idxopt_t *opt);
const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n);
int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f);
int mm_idx_getseq2(const mm_idx_t *mi, int is_rev, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq);
mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float gap_scale, int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km);
mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a);
mm_reg1_t *mm_gen_regs(void *km, uint32_t hash, int qlen, int n_u, uint64_t *u, mm128_t *a, int is_qstrand);
mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float gap_scale,
int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km);
mm128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip,
int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km);
mm128_t *mg_lchain_rmq(int max_dist, int max_dist_inner, int bw, int max_chn_skip, int cap_rmq_size, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip,
int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km);
void mm_mark_alt(const mm_idx_t *mi, int n, mm_reg1_t *r);
void mm_split_reg(mm_reg1_t *r, mm_reg1_t *r2, int n, int qlen, mm128_t *a, int is_qstrand);
void mm_sync_regs(void *km, int n_regs, mm_reg1_t *regs);
int mm_squeeze_a(void *km, int n_regs, mm_reg1_t *regs, mm128_t *a);
int mm_set_sam_pri(int n, mm_reg1_t *r);
void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r, int sub_diff, int hard_mask_level, float alt_diff_frac);
void mm_set_parent(void *km, float mask_level, int mask_len, int n, mm_reg1_t *r, int sub_diff, int hard_mask_level, float alt_diff_frac);
void mm_select_sub(void *km, float pri_ratio, int min_diff, int best_n, int *n_, mm_reg1_t *r);
void mm_select_sub_multi(void *km, float pri_ratio, float pri1, float pri2, int max_gap_ref, int min_diff, int best_n, int n_segs, const int *qlens, int *n_, mm_reg1_t *r);
void mm_filter_regs(const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs);
void mm_join_long(void *km, const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs, mm128_t *a);
void mm_hit_sort(void *km, int *n_regs, mm_reg1_t *r, float alt_diff_frac);
void mm_set_mapq(void *km, int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len, int is_sr);

View File

@ -1,4 +1,5 @@
#include <stdio.h>
#include <limits.h>
#include "mmpriv.h"
void mm_idxopt_init(mm_idxopt_t *opt)
@ -15,26 +16,30 @@ void mm_mapopt_init(mm_mapopt_t *opt)
memset(opt, 0, sizeof(mm_mapopt_t));
opt->seed = 11;
opt->mid_occ_frac = 2e-4f;
opt->min_mid_occ = 10;
opt->max_mid_occ = 1000000;
opt->sdust_thres = 0; // no SDUST masking
opt->min_cnt = 3;
opt->min_chain_score = 40;
opt->bw = 500;
opt->bw = 500, opt->bw_long = 20000;
opt->max_gap = 5000;
opt->max_gap_ref = -1;
opt->max_chain_skip = 25;
opt->max_chain_iter = 5000;
opt->chain_gap_scale = 1.0f;
opt->rmq_inner_dist = 1000;
opt->rmq_size_cap = 100000;
opt->rmq_rescue_size = 1000;
opt->rmq_rescue_ratio = 0.1f;
opt->chain_gap_scale = 0.8f;
opt->max_max_occ = 4095;
opt->occ_dist = 500;
opt->mask_level = 0.5f;
opt->mask_len = INT_MAX;
opt->pri_ratio = 0.8f;
opt->best_n = 5;
opt->max_join_long = 20000;
opt->max_join_short = 2000;
opt->min_join_flank_sc = 1000;
opt->min_join_flank_ratio = 0.5f;
opt->alt_drop = 0.15f;
opt->a = 2, opt->b = 4, opt->q = 4, opt->e = 2, opt->q2 = 24, opt->e2 = 1;
@ -46,6 +51,7 @@ void mm_mapopt_init(mm_mapopt_t *opt)
opt->anchor_ext_len = 20, opt->anchor_ext_shift = 6;
opt->max_clip_ratio = 1.0f;
opt->mini_batch_size = 500000000;
opt->max_sw_mat = 100000000;
opt->pe_ori = 0; // FF
opt->pe_bonus = 33;
@ -55,10 +61,13 @@ void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi)
{
if ((opt->flag & MM_F_SPLICE_FOR) || (opt->flag & MM_F_SPLICE_REV))
opt->flag |= MM_F_SPLICE;
if (opt->mid_occ <= 0)
if (opt->mid_occ <= 0) {
opt->mid_occ = mm_idx_cal_max_occ(mi, opt->mid_occ_frac);
if (opt->mid_occ < opt->min_mid_occ)
opt->mid_occ = opt->min_mid_occ;
if (opt->mid_occ < opt->min_mid_occ)
opt->mid_occ = opt->min_mid_occ;
if (opt->max_mid_occ > opt->min_mid_occ && opt->mid_occ > opt->max_mid_occ)
opt->mid_occ = opt->max_mid_occ;
}
if (mm_verbose >= 3)
fprintf(stderr, "[M::%s::%.3f*%.2f] mid_occ = %d\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), opt->mid_occ);
}
@ -66,7 +75,7 @@ void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi)
void mm_mapopt_max_intron_len(mm_mapopt_t *opt, int max_intron_len)
{
if ((opt->flag & MM_F_SPLICE) && max_intron_len > 0)
opt->max_gap_ref = opt->bw = max_intron_len;
opt->max_gap_ref = opt->bw = opt->bw_long = max_intron_len;
}
int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
@ -74,37 +83,44 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
if (preset == 0) {
mm_idxopt_init(io);
mm_mapopt_init(mo);
} else if (strcmp(preset, "map-ont") == 0) { // this is the same as the default
} else if (strcmp(preset, "ava-ont") == 0) {
io->flag = 0, io->k = 15, io->w = 5;
mo->flag |= MM_F_ALL_CHAINS | MM_F_NO_DIAG | MM_F_NO_DUAL | MM_F_NO_LJOIN;
mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_gap = 10000, mo->max_chain_skip = 25;
mo->bw = 2000;
mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_chain_skip = 25;
mo->bw = mo->bw_long = 2000;
mo->occ_dist = 0;
} else if (strcmp(preset, "map10k") == 0 || strcmp(preset, "map-pb") == 0) {
io->flag |= MM_I_HPC, io->k = 19;
} else if (strcmp(preset, "ava-pb") == 0) {
io->flag |= MM_I_HPC, io->k = 19, io->w = 5;
mo->flag |= MM_F_ALL_CHAINS | MM_F_NO_DIAG | MM_F_NO_DUAL | MM_F_NO_LJOIN;
mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_gap = 10000, mo->max_chain_skip = 25;
} else if (strcmp(preset, "map10k") == 0 || strcmp(preset, "map-pb") == 0) {
io->flag |= MM_I_HPC, io->k = 19;
} else if (strcmp(preset, "map-ont") == 0) {
io->flag = 0, io->k = 15;
} else if (strcmp(preset, "asm5") == 0) {
mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_chain_skip = 25;
mo->bw_long = mo->bw;
mo->occ_dist = 0;
} else if (strcmp(preset, "map-hifi") == 0 || strcmp(preset, "map-ccs") == 0) {
io->flag = 0, io->k = 19, io->w = 19;
mo->a = 1, mo->b = 19, mo->q = 39, mo->q2 = 81, mo->e = 3, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200;
mo->min_mid_occ = 100;
mo->max_gap = 10000;
mo->a = 1, mo->b = 4, mo->q = 6, mo->q2 = 26, mo->e = 2, mo->e2 = 1;
mo->occ_dist = 500;
mo->min_mid_occ = 50, mo->max_mid_occ = 500;
mo->min_dp_max = 200;
mo->best_n = 50;
} else if (strcmp(preset, "asm10") == 0) {
} else if (strncmp(preset, "asm", 3) == 0) {
io->flag = 0, io->k = 19, io->w = 19;
mo->a = 1, mo->b = 9, mo->q = 16, mo->q2 = 41, mo->e = 2, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200;
mo->min_mid_occ = 100;
mo->min_dp_max = 200;
mo->best_n = 50;
} else if (strcmp(preset, "asm20") == 0) {
io->flag = 0, io->k = 19, io->w = 10;
mo->a = 1, mo->b = 4, mo->q = 6, mo->q2 = 26, mo->e = 2, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200;
mo->min_mid_occ = 100;
mo->bw = mo->bw_long = 100000;
mo->max_gap = 10000;
mo->flag |= MM_F_RMQ;
mo->min_mid_occ = 50, mo->max_mid_occ = 500;
mo->min_dp_max = 200;
mo->best_n = 50;
if (strcmp(preset, "asm5") == 0) {
mo->a = 1, mo->b = 19, mo->q = 39, mo->q2 = 81, mo->e = 3, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200;
} else if (strcmp(preset, "asm10") == 0) {
mo->a = 1, mo->b = 9, mo->q = 16, mo->q2 = 41, mo->e = 2, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200;
} else if (strcmp(preset, "asm20") == 0) {
mo->a = 1, mo->b = 4, mo->q = 6, mo->q2 = 26, mo->e = 2, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200;
io->w = 10;
} else return -1;
} else if (strcmp(preset, "short") == 0 || strcmp(preset, "sr") == 0) {
io->flag = 0, io->k = 21, io->w = 11;
mo->flag |= MM_F_SR | MM_F_FRAG_MODE | MM_F_NO_PRINT_2ND | MM_F_2_IO_THREADS | MM_F_HEAP_SORT;
@ -114,7 +130,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
mo->end_bonus = 10;
mo->max_frag_len = 800;
mo->max_gap = 100;
mo->bw = 100;
mo->bw = mo->bw_long = 100;
mo->pri_ratio = 0.5f;
mo->min_cnt = 2;
mo->min_chain_score = 25;
@ -126,7 +142,8 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
} else if (strncmp(preset, "splice", 6) == 0 || strcmp(preset, "cdna") == 0) {
io->flag = 0, io->k = 15, io->w = 5;
mo->flag |= MM_F_SPLICE | MM_F_SPLICE_FOR | MM_F_SPLICE_REV | MM_F_SPLICE_FLANK;
mo->max_gap = 2000, mo->max_gap_ref = mo->bw = 200000;
mo->max_sw_mat = 0;
mo->max_gap = 2000, mo->max_gap_ref = mo->bw = mo->bw_long = 200000;
mo->a = 1, mo->b = 2, mo->q = 2, mo->e = 1, mo->q2 = 32, mo->e2 = 0;
mo->noncan = 9;
mo->junc_bonus = 9;
@ -139,6 +156,16 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
int mm_check_opt(const mm_idxopt_t *io, const mm_mapopt_t *mo)
{
if (mo->bw > mo->bw_long) {
if (mm_verbose >= 1)
fprintf(stderr, "[ERROR]\033[1;31m with '-rNUM1,NUM2', NUM1 (%d) can't be larger than NUM2 (%d)\033[0m\n", mo->bw, mo->bw_long);
return -8;
}
if ((mo->flag & MM_F_RMQ) && (mo->flag & (MM_F_SR|MM_F_SPLICE))) {
if (mm_verbose >= 1)
fprintf(stderr, "[ERROR]\033[1;31m --rmq doesn't work with --sr or --splice\033[0m\n");
return -7;
}
if (mo->split_prefix && (mo->flag & (MM_F_OUT_CS|MM_F_OUT_MD))) {
if (mm_verbose >= 1)
fprintf(stderr, "[ERROR]\033[1;31m --cs or --MD doesn't work with --split-prefix\033[0m\n");

View File

@ -144,7 +144,7 @@ properties:
* **mlen**: length of the matching bases in the alignment, excluding ambiguous
base matches.
* **NM**: number of mismatches, gaps and ambiguous poistions in the alignment
* **NM**: number of mismatches, gaps and ambiguous positions in the alignment
* **trans_strand**: transcript strand. +1 if on the forward strand; -1 if on the
reverse strand; 0 if unknown

View File

@ -13,21 +13,27 @@ cdef extern from "minimap.h":
int64_t flag
int seed
int sdust_thres
int max_qlen
int bw
int bw, bw_long
int max_gap, max_gap_ref
int max_frag_len
int max_chain_skip, max_chain_iter
int min_cnt
int min_chain_score
float chain_gap_scale
int rmq_size_cap, rmq_inner_dist
int rmq_rescue_size
float rmq_rescue_ratio
float mask_level
int mask_len
float pri_ratio
int best_n
int max_join_long, max_join_short
int min_join_flank_sc
float min_join_flank_ratio
float alt_drop
int a, b, q, e, q2, e2
int sc_ambi
int noncan
@ -38,13 +44,16 @@ cdef extern from "minimap.h":
int min_ksw_len
int anchor_ext_len, anchor_ext_shift
float max_clip_ratio
int pe_ori, pe_bonus
float mid_occ_frac
int32_t min_mid_occ
int32_t mid_occ
int32_t max_occ
int64_t mini_batch_size
int64_t max_sw_mat
const char *split_prefix
int mm_set_opt(char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)

View File

@ -3,7 +3,7 @@ from libc.stdlib cimport free
cimport cmappy
import sys
__version__ = '2.17'
__version__ = '2.21'
cmappy.mm_reset_timer()
@ -82,7 +82,7 @@ cdef class Alignment:
@property
def cigar_str(self):
return "".join(map(lambda x: str(x[0]) + 'MIDNSH'[x[1]], self._cigar))
return "".join(map(lambda x: str(x[0]) + 'MIDNSHP=XB'[x[1]], self._cigar))
def __str__(self):
if self._strand > 0: strand = '+'

106
seed.c 100644
View File

@ -0,0 +1,106 @@
#include "mmpriv.h"
#include "kalloc.h"
#include "ksort.h"
mm_seed_t *mm_seed_collect_all(void *km, const mm_idx_t *mi, const mm128_v *mv, int32_t *n_m_)
{
mm_seed_t *m;
size_t i;
int32_t k;
m = (mm_seed_t*)kmalloc(km, mv->n * sizeof(mm_seed_t));
for (i = k = 0; i < mv->n; ++i) {
const uint64_t *cr;
mm_seed_t *q;
mm128_t *p = &mv->a[i];
uint32_t q_pos = (uint32_t)p->y, q_span = p->x & 0xff;
int t;
cr = mm_idx_get(mi, p->x>>8, &t);
if (t == 0) continue;
q = &m[k++];
q->q_pos = q_pos, q->q_span = q_span, q->cr = cr, q->n = t, q->seg_id = p->y >> 32;
q->is_tandem = q->flt = 0;
if (i > 0 && p->x>>8 == mv->a[i - 1].x>>8) q->is_tandem = 1;
if (i < mv->n - 1 && p->x>>8 == mv->a[i + 1].x>>8) q->is_tandem = 1;
}
*n_m_ = k;
return m;
}
#define MAX_MAX_HIGH_OCC 128
void mm_seed_select(int32_t n, mm_seed_t *a, int len, int max_occ, int max_max_occ, int dist)
{ // for high-occ minimizers, choose up to max_high_occ in each high-occ streak
extern void ks_heapdown_uint64_t(size_t i, size_t n, uint64_t*);
extern void ks_heapmake_uint64_t(size_t n, uint64_t*);
int32_t i, last0, m;
uint64_t b[MAX_MAX_HIGH_OCC]; // this is to avoid a heap allocation
if (n == 0 || n == 1) return;
for (i = m = 0; i < n; ++i)
if (a[i].n > max_occ) ++m;
if (m == 0) return; // no high-frequency k-mers; do nothing
for (i = 0, last0 = -1; i <= n; ++i) {
if (i == n || a[i].n <= max_occ) {
if (i - last0 > 1) {
int32_t ps = last0 < 0? 0 : (uint32_t)a[last0].q_pos>>1;
int32_t pe = i == n? len : (uint32_t)a[i].q_pos>>1;
int32_t j, k, st = last0 + 1, en = i;
int32_t max_high_occ = (int32_t)((double)(pe - ps) / dist + .499);
if (max_high_occ > 0) {
if (max_high_occ > MAX_MAX_HIGH_OCC)
max_high_occ = MAX_MAX_HIGH_OCC;
for (j = st, k = 0; j < en && k < max_high_occ; ++j, ++k)
b[k] = (uint64_t)a[j].n<<32 | j;
ks_heapmake_uint64_t(k, b); // initialize the binomial heap
for (; j < en; ++j) { // if there are more, choose top max_high_occ
if (a[j].n < (int32_t)(b[0]>>32)) { // then update the heap
b[0] = (uint64_t)a[j].n<<32 | j;
ks_heapdown_uint64_t(0, k, b);
}
}
for (j = 0; j < k; ++j) a[(uint32_t)b[j]].flt = 1;
}
for (j = st; j < en; ++j) a[j].flt ^= 1;
for (j = st; j < en; ++j)
if (a[j].n > max_max_occ)
a[j].flt = 1;
}
last0 = i;
}
}
}
mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int max_max_occ, int dist, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos)
{
int rep_st = 0, rep_en = 0, n_m, n_m0;
size_t i;
mm_seed_t *m;
*n_mini_pos = 0;
*mini_pos = (uint64_t*)kmalloc(km, mv->n * sizeof(uint64_t));
m = mm_seed_collect_all(km, mi, mv, &n_m0);
if (dist > 0 && max_max_occ > max_occ) {
mm_seed_select(n_m0, m, qlen, max_occ, max_max_occ, dist);
} else {
for (i = 0; i < n_m0; ++i)
if (m[i].n > max_occ)
m[i].flt = 1;
}
for (i = 0, n_m = 0, *rep_len = 0, *n_a = 0; i < n_m0; ++i) {
mm_seed_t *q = &m[i];
//fprintf(stderr, "X\t%d\t%d\t%d\n", q->q_pos>>1, q->n, q->flt);
if (q->flt) {
int en = (q->q_pos >> 1) + 1, st = en - q->q_span;
if (st > rep_en) {
*rep_len += rep_en - rep_st;
rep_st = st, rep_en = en;
} else rep_en = en;
} else {
*n_a += q->n;
(*mini_pos)[(*n_mini_pos)++] = (uint64_t)q->q_span<<32 | q->q_pos>>1;
m[n_m++] = *q;
}
}
*rep_len += rep_en - rep_st;
*_n_m = n_m;
return m;
}

View File

@ -4,16 +4,6 @@ except ImportError:
from distutils.core import setup
from distutils.extension import Extension
cmdclass = {}
try:
from Cython.Build import build_ext
except ImportError: # without Cython
module_src = 'python/mappy.c'
else: # with Cython
module_src = 'python/mappy.pyx'
cmdclass['build_ext'] = build_ext
import sys, platform
sys.path.append('python')
@ -33,7 +23,7 @@ def readme():
setup(
name = 'mappy',
version = '2.17',
version = '2.21',
url = 'https://github.com/lh3/minimap2',
description = 'Minimap2 python binding',
long_description = readme(),
@ -42,8 +32,8 @@ setup(
license = 'MIT',
keywords = 'sequence-alignment',
scripts = ['python/minimap2.py'],
ext_modules = [Extension('mappy',
sources = [module_src, 'align.c', 'bseq.c', 'chain.c', 'format.c', 'hit.c', 'index.c', 'pe.c', 'options.c',
ext_modules = [Extension('mappy',
sources = ['python/mappy.pyx', 'align.c', 'bseq.c', 'lchain.c', 'seed.c', 'format.c', 'hit.c', 'index.c', 'pe.c', 'options.c',
'ksw2_extd2_sse.c', 'ksw2_exts2_sse.c', 'ksw2_extz2_sse.c', 'ksw2_ll_sse.c',
'kalloc.c', 'kthread.c', 'map.c', 'misc.c', 'sdust.c', 'sketch.c', 'esterr.c', 'splitidx.c'],
depends = ['minimap.h', 'bseq.h', 'kalloc.h', 'kdq.h', 'khash.h', 'kseq.h', 'ksort.h',
@ -62,4 +52,4 @@ setup(
'Programming Language :: Python :: 3',
'Intended Audience :: Science/Research',
'Topic :: Scientific/Engineering :: Bio-Informatics'],
cmdclass = cmdclass)
setup_requires=["cython"])