diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml new file mode 100644 index 0000000..628acea --- /dev/null +++ b/.github/workflows/ci.yaml @@ -0,0 +1,21 @@ +name: CI + +on: + push: + branches: + - master + pull_request: + +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + compiler: [gcc, clang] + + steps: + - name: Checkout minimap2 + uses: actions/checkout@v2 + + - name: Compile with ${{ matrix.compiler }} + run: make CC=${{ matrix.compiler }} diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..a80f848 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "lib/simde"] + path = lib/simde + url = https://github.com/nemequ/simde.git diff --git a/.travis.yml b/.travis.yml index f0764f1..b43ca9c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -19,6 +19,6 @@ matrix: before_install: pip install cython script: python setup.py build_ext - language: python - python: "3.6" + python: "3.9" before_install: pip install cython script: python setup.py build_ext diff --git a/MANIFEST.in b/MANIFEST.in index 599b046..3df1d19 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -4,7 +4,6 @@ include ksw2_dispatch.c include main.c include README.md include sse2neon/emmintrin.h -include python/mappy.c include python/cmappy.h include python/cmappy.pxd include python/mappy.pyx diff --git a/Makefile b/Makefile index 18622f5..4118616 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,9 @@ CFLAGS= -g -Wall -O2 -Wc++-compat #-Wextra CPPFLAGS= -DHAVE_KALLOC INCLUDES= -OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o chain.o align.o hit.o map.o format.o pe.o esterr.o splitidx.o ksw2_ll_sse.o +OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o \ + lchain.o align.o hit.o seed.o map.o format.o pe.o esterr.o splitidx.o \ + ksw2_ll_sse.o PROG= minimap2 PROG_EXTRA= sdust minimap2-lite LIBS= -lm -lz -lpthread @@ -103,26 +105,28 @@ depend: # DO NOT DELETE -align.o: minimap.h mmpriv.h bseq.h ksw2.h kalloc.h +align.o: minimap.h mmpriv.h bseq.h kseq.h ksw2.h kalloc.h bseq.o: bseq.h kvec.h kalloc.h kseq.h -chain.o: minimap.h mmpriv.h bseq.h kalloc.h -esterr.o: mmpriv.h minimap.h bseq.h +esterr.o: mmpriv.h minimap.h bseq.h kseq.h example.o: minimap.h kseq.h -format.o: kalloc.h mmpriv.h minimap.h bseq.h -hit.o: mmpriv.h minimap.h bseq.h kalloc.h khash.h -index.o: kthread.h bseq.h minimap.h mmpriv.h kvec.h kalloc.h khash.h +format.o: kalloc.h mmpriv.h minimap.h bseq.h kseq.h +hit.o: mmpriv.h minimap.h bseq.h kseq.h kalloc.h khash.h +index.o: kthread.h bseq.h minimap.h mmpriv.h kseq.h kvec.h kalloc.h khash.h +index.o: ksort.h kalloc.o: kalloc.h ksw2_extd2_sse.o: ksw2.h kalloc.h ksw2_exts2_sse.o: ksw2.h kalloc.h ksw2_extz2_sse.o: ksw2.h kalloc.h ksw2_ll_sse.o: ksw2.h kalloc.h kthread.o: kthread.h -main.o: bseq.h minimap.h mmpriv.h ketopt.h -map.o: kthread.h kvec.h kalloc.h sdust.h mmpriv.h minimap.h bseq.h khash.h -map.o: ksort.h -misc.o: mmpriv.h minimap.h bseq.h ksort.h -options.o: mmpriv.h minimap.h bseq.h -pe.o: mmpriv.h minimap.h bseq.h kvec.h kalloc.h ksort.h -sdust.o: kalloc.h kdq.h kvec.h ketopt.h sdust.h -sketch.o: kvec.h kalloc.h mmpriv.h minimap.h bseq.h -splitidx.o: mmpriv.h minimap.h bseq.h +lchain.o: mmpriv.h minimap.h bseq.h kseq.h kalloc.h krmq.h +main.o: bseq.h minimap.h mmpriv.h kseq.h ketopt.h +map.o: kthread.h kvec.h kalloc.h sdust.h mmpriv.h minimap.h bseq.h kseq.h +map.o: khash.h ksort.h +misc.o: mmpriv.h minimap.h bseq.h kseq.h ksort.h +options.o: mmpriv.h minimap.h bseq.h kseq.h +pe.o: mmpriv.h minimap.h bseq.h kseq.h kvec.h kalloc.h ksort.h +sdust.o: kalloc.h kdq.h kvec.h sdust.h +seed.o: mmpriv.h minimap.h bseq.h kseq.h kalloc.h ksort.h +sketch.o: kvec.h kalloc.h mmpriv.h minimap.h bseq.h kseq.h +splitidx.o: mmpriv.h minimap.h bseq.h kseq.h diff --git a/Makefile.simde b/Makefile.simde new file mode 100644 index 0000000..381a668 --- /dev/null +++ b/Makefile.simde @@ -0,0 +1,97 @@ +CFLAGS= -g -Wall -O2 -Wc++-compat #-Wextra +CPPFLAGS= -DHAVE_KALLOC -DUSE_SIMDE -DSIMDE_ENABLE_NATIVE_ALIASES +INCLUDES= -Ilib/simde +OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o chain.o align.o hit.o map.o format.o pe.o esterr.o splitidx.o \ + ksw2_extz2_simde.o ksw2_extd2_simde.o ksw2_exts2_simde.o ksw2_ll_simde.o +PROG= minimap2 +PROG_EXTRA= sdust minimap2-lite +LIBS= -lm -lz -lpthread + + +ifneq ($(arm_neon),) # if arm_neon is defined +ifeq ($(aarch64),) #if aarch64 is not defined + CFLAGS+=-D_FILE_OFFSET_BITS=64 -mfpu=neon -fsigned-char +else #if aarch64 is defined + CFLAGS+=-D_FILE_OFFSET_BITS=64 -fsigned-char +endif +endif + +ifneq ($(asan),) + CFLAGS+=-fsanitize=address + LIBS+=-fsanitize=address +endif + +ifneq ($(tsan),) + CFLAGS+=-fsanitize=thread + LIBS+=-fsanitize=thread +endif + +.PHONY:all extra clean depend +.SUFFIXES:.c .o + +.c.o: + $(CC) -c $(CFLAGS) $(CPPFLAGS) $(INCLUDES) $< -o $@ + +all:$(PROG) + +extra:all $(PROG_EXTRA) + +minimap2:main.o libminimap2.a + $(CC) $(CFLAGS) main.o -o $@ -L. -lminimap2 $(LIBS) + +minimap2-lite:example.o libminimap2.a + $(CC) $(CFLAGS) $< -o $@ -L. -lminimap2 $(LIBS) + +libminimap2.a:$(OBJS) + $(AR) -csru $@ $(OBJS) + +sdust:sdust.c kalloc.o kalloc.h kdq.h kvec.h kseq.h ketopt.h sdust.h + $(CC) -D_SDUST_MAIN $(CFLAGS) $< kalloc.o -o $@ -lz + +ksw2_ll_simde.o:ksw2_ll_sse.c ksw2.h kalloc.h + $(CC) -c $(CFLAGS) -msse2 $(CPPFLAGS) $(INCLUDES) $< -o $@ + +ksw2_extz2_simde.o:ksw2_extz2_sse.c ksw2.h kalloc.h + $(CC) -c $(CFLAGS) -msse4.1 $(CPPFLAGS) $(INCLUDES) $< -o $@ + +ksw2_extd2_simde.o:ksw2_extd2_sse.c ksw2.h kalloc.h + $(CC) -c $(CFLAGS) -msse4.1 $(CPPFLAGS) $(INCLUDES) $< -o $@ + +ksw2_exts2_simde.o:ksw2_exts2_sse.c ksw2.h kalloc.h + $(CC) -c $(CFLAGS) -msse4.1 $(CPPFLAGS) $(INCLUDES) $< -o $@ + +# other non-file targets + +clean: + rm -fr gmon.out *.o a.out $(PROG) $(PROG_EXTRA) *~ *.a *.dSYM build dist mappy*.so mappy.c python/mappy.c mappy.egg* + +depend: + (LC_ALL=C; export LC_ALL; makedepend -Y -- $(CFLAGS) $(CPPFLAGS) -- *.c) + +# DO NOT DELETE + +align.o: minimap.h mmpriv.h bseq.h kseq.h ksw2.h kalloc.h +bseq.o: bseq.h kvec.h kalloc.h kseq.h +chain.o: minimap.h mmpriv.h bseq.h kseq.h kalloc.h +esterr.o: mmpriv.h minimap.h bseq.h kseq.h +example.o: minimap.h kseq.h +format.o: kalloc.h mmpriv.h minimap.h bseq.h kseq.h +hit.o: mmpriv.h minimap.h bseq.h kseq.h kalloc.h khash.h +index.o: kthread.h bseq.h minimap.h mmpriv.h kseq.h kvec.h kalloc.h khash.h +index.o: ksort.h +kalloc.o: kalloc.h +ksw2_extd2_sse.o: ksw2.h kalloc.h +ksw2_exts2_sse.o: ksw2.h kalloc.h +ksw2_extz2_sse.o: ksw2.h kalloc.h +ksw2_ll_sse.o: ksw2.h kalloc.h +kthread.o: kthread.h +main.o: bseq.h minimap.h mmpriv.h kseq.h ketopt.h +map.o: kthread.h kvec.h kalloc.h sdust.h mmpriv.h minimap.h bseq.h kseq.h +map.o: khash.h ksort.h +misc.o: mmpriv.h minimap.h bseq.h kseq.h ksort.h +options.o: mmpriv.h minimap.h bseq.h kseq.h +pe.o: mmpriv.h minimap.h bseq.h kseq.h kvec.h kalloc.h ksort.h +sdust.o: kalloc.h kdq.h kvec.h sdust.h +self-chain.o: minimap.h kseq.h +sketch.o: kvec.h kalloc.h mmpriv.h minimap.h bseq.h kseq.h +splitidx.o: mmpriv.h minimap.h bseq.h kseq.h diff --git a/NEWS.md b/NEWS.md index 5c0ab9c..425545a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,151 @@ +Release 2.21-r1071 (6 July 2021) +-------------------------------- + +This release fixed a regression in short-read mapping introduced in v2.19 +(#776). It also fixed invalid comparisons of uninitialized variables, though +these are harmless (#752). Long-read alignment should be identical to v2.20. + +(2.21: 6 July 2021) + + + +Release 2.20-r1061 (27 May 2021) +-------------------------------- + +This release fixed a bug in the Python module and improves the command-line +compatibiliity with v2.18. In v2.19, if `-r` is specified with an `asm*` preset, +users would get alignments more fragmented than v2.18. This could be an issue +for existing pipelines specifying `-r`. This release resolves this issue. + +(2.20: 27 May 2021, r1061) + + + +Release 2.19-r1057 (26 May 2021) +-------------------------------- + +This release includes a few important improvements backported from unimap: + + * Improvement: more contiguous alignment through long INDELs. This is enabled + by the minigraph chaining algorithm. All `asm*` presets now use the new + algorithm. They can find INDELs up to 100kb and may be faster for + chromosome-long contigs. The default mode and `map*` presets use this + algorithm to replace the long-join heuristic. + + * Improvement: better alignment in highly repetitive regions by rescuing + high-occurrence seeds. If the distance between two adjacent seeds is too + large, attempt to choose a fraction of high-occurrence seeds in-between. + Minimap2 now produces fewer clippings and alignment break points in long + satellite regions. + + * Improvement: allow to specify an interval of k-mer occurrences with `-U`. + For repeat-rich genomes, the automatic k-mer occurrence threshold determined + by `-f` may be too large and makes alignment impractically slow. The new + option protects against such cases. Enabled for `asm*` and `map-hifi`. + + * New feature: added the `map-hifi` preset for maping PacBio High-Fidelity + (HiFi) reads. + + * Change to the default: apply `--cap-sw-mem=100m` for genomic alignment. + + * Bugfix: minimap2 could not generate an index file with `-xsr` (#734). + +This release represents the most signficant algorithmic change since v2.1 in +2017. With features backported from unimap, minimap2 now has similar power to +unimap for contig alignment. Unimap will remain an experimental project and is +no longer recommended over minimap2. Sorry for reverting the recommendation in +short time. + +(2.19: 26 May 2021, r1057) + + + +Release 2.18-r1015 (9 April 2021) +--------------------------------- + +This release fixes multiple rare bugs in minimap2 and adds additional +functionality to paftools.js. + +Changes to minimap2: + + * Bugfix: a rare segfault caused by an off-by-one error (#489) + + * Bugfix: minimap2 segfaulted due to an uninitilized variable (#622 and #625). + + * Bugfix: minimap2 parsed spaces as field separators in BED (#721). This led + to issues when the BED name column contains spaces. + + * Bugfix: minimap2 `--split-prefix` did not work with long reference names + (#394). + + * Bugfix: option `--junc-bonus` didn't work (#513) + + * Bugfix: minimap2 didn't return 1 on I/O errors (#532) + + * Bugfix: the `de:f` tag (sequence divergence) could be negative if there were + ambiguous bases + + * Bugfix: fixed two undefined behaviors caused by calling memcpy() on + zero-length blocks (#443) + + * Bugfix: there were duplicated SAM @SQ lines if option `--split-prefix` is in + use (#400 and #527) + + * Bugfix: option -K had to be smaller than 2 billion (#491). This was caused + by a 32-bit integer overflow. + + * Improvement: optionally compile against SIMDe (#597). Minimap2 should work + with IBM POWER CPUs, though this has not been tested. To compile with SIMDe, + please use `make -f Makefile.simde`. + + * Improvement: more informative error message for I/O errors (#454) and for + FASTQ parsing errors (#510) + + * Improvement: abort given malformatted RG line (#541) + + * Improvement: better formula to estimate the `dv:f` tag (approximate sequence + divergence). See DOI:10.1101/2021.01.15.426881. + + * New feature: added the `--mask-len` option to fine control the removal of + redundant hits (#659). The default behavior is unchanged. + +Changes to mappy: + + * Bugfix: mappy caused segmentation fault if the reference index is not + present (#413). + + * Bugfix: fixed a memory leak via 238b6bb3 + + * Change: always require Cython to compile the mappy module (#723). Older + mappy packages at PyPI bundled the C source code generated by Cython such + that end users did not need to install Cython to compile mappy. However, as + Python 3.9 is breaking backward compatibility, older mappy does not work + with Python 3.9 anymore. We have to add this Cython dependency as a + workaround. + +Changes to paftools.js: + + * Bugfix: the "part10-" line from asmgene was wrong (#581) + + * Improvement: compatibility with GTF files from GenBank (#422) + + * New feature: asmgene also checks missing multi-copy genes + + * New feature: added the misjoin command to evaluate large-scale misjoins and + megabase-long inversions. + +Although given the many bug fixes and minor improvements, the core algorithm +stays the same. This version of minimap2 produces nearly identical alignments +to v2.17 except very rare corner cases. + +Now unimap is recommended over minimap2 for aligning long contigs against a +reference genome. It often takes less wall-clock time and is much more +sensitive to long insertions and deletions. + +(2.18: 9 April 2021, r1015) + + + Release 2.17-r941 (4 May 2019) ------------------------------ diff --git a/README.md b/README.md index addeb2c..0bff2e4 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ [![GitHub Downloads](https://img.shields.io/github/downloads/lh3/minimap2/total.svg?style=social&logo=github&label=Download)](https://github.com/lh3/minimap2/releases) [![BioConda Install](https://img.shields.io/conda/dn/bioconda/minimap2.svg?style=flag&label=BioConda%20install)](https://anaconda.org/bioconda/minimap2) [![PyPI](https://img.shields.io/pypi/v/mappy.svg?style=flat)](https://pypi.python.org/pypi/mappy) -[![Build Status](https://travis-ci.org/lh3/minimap2.svg?branch=master)](https://travis-ci.org/lh3/minimap2) +[![Build Status](https://github.com/lh3/minimap2/actions/workflows/ci.yaml/badge.svg)](https://github.com/lh3/minimap2/actions) ## Getting Started ```sh git clone https://github.com/lh3/minimap2 @@ -12,19 +12,22 @@ cd minimap2 && make ./minimap2 -x map-ont -d MT-human-ont.mmi test/MT-human.fa ./minimap2 -a MT-human-ont.mmi test/MT-orang.fa > test.sam # use presets (no test data) -./minimap2 -ax map-pb ref.fa pacbio.fq.gz > aln.sam # PacBio genomic reads +./minimap2 -ax map-pb ref.fa pacbio.fq.gz > aln.sam # PacBio CLR genomic reads ./minimap2 -ax map-ont ref.fa ont.fq.gz > aln.sam # Oxford Nanopore genomic reads -./minimap2 -ax asm20 ref.fa pacbio-ccs.fq.gz > aln.sam # PacBio CCS genomic reads +./minimap2 -ax map-hifi ref.fa pacbio-ccs.fq.gz > aln.sam # PacBio HiFi/CCS genomic reads (v2.19 or later) +./minimap2 -ax asm20 ref.fa pacbio-ccs.fq.gz > aln.sam # PacBio HiFi/CCS genomic reads (v2.18 or earlier) ./minimap2 -ax sr ref.fa read1.fa read2.fa > aln.sam # short genomic paired-end reads ./minimap2 -ax splice ref.fa rna-reads.fa > aln.sam # spliced long reads (strand unknown) ./minimap2 -ax splice -uf -k14 ref.fa reads.fa > aln.sam # noisy Nanopore Direct RNA-seq ./minimap2 -ax splice:hq -uf ref.fa query.fa > aln.sam # Final PacBio Iso-seq or traditional cDNA +./minimap2 -ax splice --junc-bed anno.bed12 ref.fa query.fa > aln.sam # prioritize on annotated junctions ./minimap2 -cx asm5 asm1.fa asm2.fa > aln.paf # intra-species asm-to-asm alignment ./minimap2 -x ava-pb reads.fa reads.fa > overlaps.paf # PacBio read overlap ./minimap2 -x ava-ont reads.fa reads.fa > overlaps.paf # Nanopore read overlap # man page for detailed command line options man ./minimap2.1 ``` + ## Table of Contents - [Getting Started](#started) @@ -71,8 +74,8 @@ Detailed evaluations are available from the [minimap2 paper][doi] or the Minimap2 is optimized for x86-64 CPUs. You can acquire precompiled binaries from the [release page][release] with: ```sh -curl -L https://github.com/lh3/minimap2/releases/download/v2.17/minimap2-2.17_x64-linux.tar.bz2 | tar -jxvf - -./minimap2-2.17_x64-linux/minimap2 +curl -L https://github.com/lh3/minimap2/releases/download/v2.21/minimap2-2.21_x64-linux.tar.bz2 | tar -jxvf - +./minimap2-2.21_x64-linux/minimap2 ``` If you want to compile from the source, you need to have a C compiler, GNU make and zlib development files installed. Then type `make` in the source code @@ -80,13 +83,20 @@ directory to compile. If you see compilation errors, try `make sse2only=1` to disable SSE4 code, which will make minimap2 slightly slower. Minimap2 also works with ARM CPUs supporting the NEON instruction sets. To -compile for 32 bit ARM architectures (such as ARMv7), use `make arm_neon=1`. To compile for for 64 bit ARM architectures (such as ARMv8), use `make arm_neon=1 aarch64=1`. +compile for 32 bit ARM architectures (such as ARMv7), use `make arm_neon=1`. To +compile for for 64 bit ARM architectures (such as ARMv8), use `make arm_neon=1 +aarch64=1`. + +Minimap2 can use [SIMD Everywhere (SIMDe)][simde] library for porting +implementation to the different SIMD instruction sets. To compile using SIMDe, +use `make -f Makefile.simde`. To compile for ARM CPUs, use `Makefile.simde` +with the ARM related command lines given above. ### General usage Without any options, minimap2 takes a reference database and a query sequence file as input and produce approximate mapping, without base-level alignment -(i.e. no CIGAR), in the [PAF format][paf]: +(i.e. coordinates are only approximate and no CIGAR in output), in the [PAF format][paf]: ```sh minimap2 ref.fa query.fq > approx-mapping.paf ``` @@ -127,13 +137,13 @@ parameters at the same time. The default setting is the same as `map-ont`. #### Map long noisy genomic reads ```sh -minimap2 -ax map-pb ref.fa pacbio-reads.fq > aln.sam # for PacBio subreads +minimap2 -ax map-pb ref.fa pacbio-reads.fq > aln.sam # for PacBio CLR reads minimap2 -ax map-ont ref.fa ont-reads.fq > aln.sam # for Oxford Nanopore reads ``` The difference between `map-pb` and `map-ont` is that `map-pb` uses homopolymer-compressed (HPC) minimizers as seeds, while `map-ont` uses ordinary minimizers as seeds. Emperical evaluation suggests HPC minimizers improve -performance and sensitivity when aligning PacBio reads, but hurt when aligning +performance and sensitivity when aligning PacBio CLR reads, but hurt when aligning Nanopore reads. #### Map long mRNA/cDNA reads @@ -178,10 +188,23 @@ This is because SIRV does not honor the evolutionarily conservative splicing signal. If you are studying SIRV, you may apply `--splice-flank=no` to let minimap2 only model GT..AG, ignoring the additional base. +Since v2.17, minimap2 can optionally take annotated genes as input and +prioritize on annotated splice junctions. To use this feature, you can +```sh +paftools.js gff2bed anno.gff > anno.bed +minimap2 -ax splice --junc-bed anno.bed ref.fa query.fa > aln.sam +``` +Here, `anno.gff` is the gene annotation in the GTF or GFF3 format (`gff2bed` +automatically tests the format). The output of `gff2bed` is in the 12-column +BED format, or the BED12 format. With the `--junc-bed` option, minimap2 adds a +bonus score (tuned by `--junc-bonus`) if an aligned junction matches a junction +in the annotation. Option `--junc-bed` also takes 5-column BED, including the +strand field. In this case, each line indicates an oriented junction. + #### Find overlaps between long reads ```sh -minimap2 -x ava-pb reads.fq reads.fq > ovlp.paf # PacBio read overlap +minimap2 -x ava-pb reads.fq reads.fq > ovlp.paf # PacBio CLR read overlap minimap2 -x ava-ont reads.fq reads.fq > ovlp.paf # Oxford Nanopore read overlap ``` Similarly, `ava-pb` uses HPC minimizers while `ava-ont` uses ordinary @@ -228,7 +251,7 @@ To avoid this issue, you can add option `-L` at the minimap2 command line. This option moves a long CIGAR to the `CG` tag and leaves a fully clipped CIGAR at the SAM CIGAR column. Current tools that don't read CIGAR (e.g. merging and sorting) still work with such BAM records; tools that read CIGAR will -effectively ignore these records. It has been decided that future tools will +effectively ignore these records. It has been decided that future tools will seamlessly recognize long-cigar records generated by option `-L`. **TL;DR**: if you work with ultra-long reads and use tools that only process @@ -249,7 +272,7 @@ CGATCGATAAATAGAGTAG---GAATAGCA CGATCG---AATAGAGTAGGTCGAATtGCA ``` is represented as `:6-ata:10+gtc:4*at:3`, where `:[0-9]+` represents an -identical block, `-ata` represents a deltion, `+gtc` an insertion and `*at` +identical block, `-ata` represents a deletion, `+gtc` an insertion and `*at` indicates reference base `a` is substituted with a query base `t`. It is similar to the `MD` SAM tag but is standalone and easier to parse. @@ -376,3 +399,5 @@ mappy` or [from BioConda][mappyconda] via `conda install -c bioconda mappy`. [manpage]: https://lh3.github.io/minimap2/minimap2.html [manpage-cs]: https://lh3.github.io/minimap2/minimap2.html#10 [doi]: https://doi.org/10.1093/bioinformatics/bty191 +[smide]: https://github.com/nemequ/simde +[unimap]: https://github.com/lh3/unimap diff --git a/align.c b/align.c index 3d67eeb..1ba2ba3 100644 --- a/align.c +++ b/align.c @@ -38,8 +38,8 @@ static inline void update_max_zdrop(int32_t score, int i, int j, int32_t *max, i int z = *max - score - diff * e; if (z > *max_zdrop) { *max_zdrop = z; - pos[0][0] = *max_i, pos[0][1] = i + 1; - pos[1][0] = *max_j, pos[1][1] = j + 1; + pos[0][0] = *max_i, pos[0][1] = i; + pos[1][0] = *max_j, pos[1][1] = j; } } else *max = score, *max_i = i, *max_j = j; } @@ -53,16 +53,16 @@ static int mm_test_zdrop(void *km, const mm_mapopt_t *opt, const uint8_t *qseq, // find the score and the region where score drops most along diagonal for (k = 0, score = 0; k < n_cigar; ++k) { uint32_t l, op = cigar[k]&0xf, len = cigar[k]>>4; - if (op == 0) { + if (op == MM_CIGAR_MATCH) { for (l = 0; l < len; ++l) { score += mat[tseq[i + l] * 5 + qseq[j + l]]; update_max_zdrop(score, i+l, j+l, &max, &max_i, &max_j, opt->e, &max_zdrop, pos); } i += len, j += len; - } else if (op == 1 || op == 2 || op == 3) { + } else if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL || op == MM_CIGAR_N_SKIP) { score -= opt->q + opt->e * len; - if (op == 1) j += len; // insertion - else i += len; // deletion + if (op == MM_CIGAR_INS) j += len; + else i += len; update_max_zdrop(score, i, j, &max, &max_i, &max_j, opt->e, &max_zdrop, pos); } } @@ -98,12 +98,12 @@ static void mm_fix_cigar(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq, for (k = 0; k < p->n_cigar; ++k) { // indel left alignment uint32_t op = p->cigar[k]&0xf, len = p->cigar[k]>>4; if (len == 0) to_shrink = 1; - if (op == 0) { + if (op == MM_CIGAR_MATCH) { toff += len, qoff += len; - } else if (op == 1 || op == 2) { // insertion or deletion + } else if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL) { if (k > 0 && k < p->n_cigar - 1 && (p->cigar[k-1]&0xf) == 0 && (p->cigar[k+1]&0xf) == 0) { int l, prev_len = p->cigar[k-1] >> 4; - if (op == 1) { + if (op == MM_CIGAR_INS) { for (l = 0; l < prev_len; ++l) if (qseq[qoff - 1 - l] != qseq[qoff + len - 1 - l]) break; @@ -116,9 +116,9 @@ static void mm_fix_cigar(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq, p->cigar[k-1] -= l<<4, p->cigar[k+1] += l<<4, qoff -= l, toff -= l; if (l == prev_len) to_shrink = 1; } - if (op == 1) qoff += len; + if (op == MM_CIGAR_INS) qoff += len; else toff += len; - } else if (op == 3) { + } else if (op == MM_CIGAR_N_SKIP) { toff += len; } } @@ -128,13 +128,13 @@ static void mm_fix_cigar(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq, uint32_t l, s[3] = {0,0,0}; for (l = k; l < p->n_cigar; ++l) { // count number of adjacent I and D uint32_t op = p->cigar[l]&0xf; - if (op == 1 || op == 2 || p->cigar[l]>>4 == 0) + if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL || p->cigar[l]>>4 == 0) s[op] += p->cigar[l] >> 4; else break; } if (s[1] > 0 && s[2] > 0 && l - k > 2) { // turn to a single I and a single D - p->cigar[k] = s[1]<<4|1; - p->cigar[k+1] = s[2]<<4|2; + p->cigar[k] = s[1]<<4|MM_CIGAR_INS; + p->cigar[k+1] = s[2]<<4|MM_CIGAR_DEL; for (k += 2; k < l; ++k) p->cigar[k] &= 0xf; to_shrink = 1; @@ -154,9 +154,9 @@ static void mm_fix_cigar(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq, else p->cigar[k+1] += p->cigar[k]>>4<<4; // add length to the next CIGAR operator p->n_cigar = l; } - if ((p->cigar[0]&0xf) == 1 || (p->cigar[0]&0xf) == 2) { // get rid of leading I or D + if ((p->cigar[0]&0xf) == MM_CIGAR_INS || (p->cigar[0]&0xf) == MM_CIGAR_DEL) { // get rid of leading I or D int32_t l = p->cigar[0] >> 4; - if ((p->cigar[0]&0xf) == 1) { + if ((p->cigar[0]&0xf) == MM_CIGAR_INS) { if (r->rev) r->qe -= l; else r->qs += l; *qshift = l; @@ -174,7 +174,7 @@ static void mm_update_cigar_eqx(mm_reg1_t *r, const uint8_t *qseq, const uint8_t if (r->p == 0) return; for (k = 0; k < r->p->n_cigar; ++k) { uint32_t op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4; - if (op == 0) { + if (op == MM_CIGAR_MATCH) { while (len > 0) { for (l = 0; l < len && qseq[qoff + l] == tseq[toff + l]; ++l) {} // run of "="; TODO: N<=>N is converted to "=" if (l > 0) { ++n_EQX; len -= l; toff += l; qoff += l; } @@ -183,11 +183,11 @@ static void mm_update_cigar_eqx(mm_reg1_t *r, const uint8_t *qseq, const uint8_t if (l > 0) { ++n_EQX; len -= l; toff += l; qoff += l; } } ++n_M; - } else if (op == 1) { // insertion + } else if (op == MM_CIGAR_INS) { qoff += len; - } else if (op == 2) { // deletion + } else if (op == MM_CIGAR_DEL) { toff += len; - } else if (op == 3) { // intron + } else if (op == MM_CIGAR_N_SKIP) { toff += len; } } @@ -195,7 +195,7 @@ static void mm_update_cigar_eqx(mm_reg1_t *r, const uint8_t *qseq, const uint8_t if (n_EQX == n_M) { for (k = 0; k < r->p->n_cigar; ++k) { uint32_t op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4; - if (op == 0) r->p->cigar[k] = len << 4 | 7; + if (op == MM_CIGAR_MATCH) r->p->cigar[k] = len << 4 | MM_CIGAR_EQ_MATCH; } return; } @@ -209,25 +209,25 @@ static void mm_update_cigar_eqx(mm_reg1_t *r, const uint8_t *qseq, const uint8_t toff = qoff = m = 0; for (k = 0; k < r->p->n_cigar; ++k) { uint32_t op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4; - if (op == 0) { // match/mismatch + if (op == MM_CIGAR_MATCH) { while (len > 0) { // match for (l = 0; l < len && qseq[qoff + l] == tseq[toff + l]; ++l) {} - if (l > 0) p->cigar[m++] = l << 4 | 7; + if (l > 0) p->cigar[m++] = l << 4 | MM_CIGAR_EQ_MATCH; len -= l; toff += l, qoff += l; // mismatch for (l = 0; l < len && qseq[qoff + l] != tseq[toff + l]; ++l) {} - if (l > 0) p->cigar[m++] = l << 4 | 8; + if (l > 0) p->cigar[m++] = l << 4 | MM_CIGAR_X_MISMATCH; len -= l; toff += l, qoff += l; } continue; - } else if (op == 1) { // insertion + } else if (op == MM_CIGAR_INS) { qoff += len; - } else if (op == 2) { // deletion + } else if (op == MM_CIGAR_DEL) { toff += len; - } else if (op == 3) { // intron + } else if (op == MM_CIGAR_N_SKIP) { toff += len; } p->cigar[m++] = r->p->cigar[k]; @@ -248,7 +248,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts r->blen = r->mlen = 0; for (k = 0; k < p->n_cigar; ++k) { uint32_t op = p->cigar[k]&0xf, len = p->cigar[k]>>4; - if (op == 0) { // match/mismatch + if (op == MM_CIGAR_MATCH) { int n_ambi = 0, n_diff = 0; for (l = 0; l < len; ++l) { int cq = qseq[qoff + l], ct = tseq[toff + l]; @@ -260,7 +260,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts } r->blen += len - n_ambi, r->mlen += len - (n_ambi + n_diff), p->n_ambi += n_ambi; toff += len, qoff += len; - } else if (op == 1) { // insertion + } else if (op == MM_CIGAR_INS) { int n_ambi = 0; for (l = 0; l < len; ++l) if (qseq[qoff + l] > 3) ++n_ambi; @@ -268,7 +268,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts s -= q + e * len; if (s < 0) s = 0; qoff += len; - } else if (op == 2) { // deletion + } else if (op == MM_CIGAR_DEL) { int n_ambi = 0; for (l = 0; l < len; ++l) if (tseq[toff + l] > 3) ++n_ambi; @@ -276,7 +276,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts s -= q + e * len; if (s < 0) s = 0; toff += len; - } else if (op == 3) { // intron + } else if (op == MM_CIGAR_N_SKIP) { toff += len; } } @@ -333,7 +333,7 @@ static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint int i; fprintf(stderr, "score=%d, cigar=", ez->score); for (i = 0; i < ez->n_cigar; ++i) - fprintf(stderr, "%d%c", ez->cigar[i]>>4, "MIDN"[ez->cigar[i]&0xf]); + fprintf(stderr, "%d%c", ez->cigar[i]>>4, MM_CIGAR_STR[ez->cigar[i]&0xf]); fprintf(stderr, "\n"); } } @@ -572,7 +572,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int int is_sr = !!(opt->flag & MM_F_SR), is_splice = !!(opt->flag & MM_F_SPLICE); int32_t rid = a[r->as].x<<1>>33, rev = a[r->as].x>>63, as1, cnt1; uint8_t *tseq, *qseq, *junc; - int32_t i, l, bw, dropped = 0, extra_flag = 0, rs0, re0, qs0, qe0; + int32_t i, l, bw, bw_long, dropped = 0, extra_flag = 0, rs0, re0, qs0, qe0; int32_t rs, re, qs, qe; int32_t rs1, qs1, re1, qe1; int8_t mat[25]; @@ -583,6 +583,8 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int if (r->cnt == 0) return; ksw_gen_simple_mat(5, mat, opt->a, opt->b, opt->sc_ambi); bw = (int)(opt->bw * 1.5 + 1.); + bw_long = (int)(opt->bw_long * 1.5 + 1.); + if (bw_long < bw) bw_long = bw; if (is_sr && !(mi->flag & MM_I_HPC)) { mm_max_stretch(r, a, &as1, &cnt1); @@ -724,7 +726,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int } else mm_adjust_minier(mi, qseq0, &a[as1 + i], &re, &qe); re1 = re, qe1 = qe; if (i == cnt1 - 1 || (a[as1+i].y&MM_SEED_LONG_JOIN) || (qe - qs >= opt->min_ksw_len && re - rs >= opt->min_ksw_len)) { - int j, bw1 = bw, zdrop_code; + int j, bw1 = bw_long, zdrop_code; if (a[as1+i].y & MM_SEED_LONG_JOIN) bw1 = qe - qs > re - rs? qe - qs : re - rs; // perform alignment @@ -743,7 +745,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int if (qseq[j] >= 4 || tseq[j] >= 4) ez->score += opt->e2; else ez->score += qseq[j] == tseq[j]? opt->a : -opt->b; } - ez->cigar = ksw_push_cigar(km, &ez->n_cigar, &ez->m_cigar, ez->cigar, 0, qe - qs); + ez->cigar = ksw_push_cigar(km, &ez->n_cigar, &ez->m_cigar, ez->cigar, MM_CIGAR_MATCH, qe - qs); } else { // perform normal gapped alignment mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, junc, mat, bw1, -1, opt->zdrop, extra_flag|KSW_EZ_APPROX_MAX, ez); // first pass: with approximate Z-drop } @@ -754,6 +756,13 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int if (ez->n_cigar > 0) mm_append_cigar(r, ez->n_cigar, ez->cigar); if (ez->zdropped) { // truncated by Z-drop; TODO: sometimes Z-drop kicks in because the next seed placement is wrong. This can be fixed in principle. + if (!r->p) { + assert(ez->n_cigar == 0); + uint32_t capacity = sizeof(mm_extra_t)/4; + kroundup32(capacity); + r->p = (mm_extra_t*)calloc(capacity, 4); + r->p->capacity = capacity; + } for (j = i - 1; j >= 0; --j) if ((int32_t)a[as1 + j].x <= rs + ez->max_t) break; diff --git a/chain.c b/chain.c deleted file mode 100644 index a2f7ac5..0000000 --- a/chain.c +++ /dev/null @@ -1,164 +0,0 @@ -#include -#include -#include -#include "minimap.h" -#include "mmpriv.h" -#include "kalloc.h" - -static const char LogTable256[256] = { -#define LT(n) n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n - -1, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, - LT(4), LT(5), LT(5), LT(6), LT(6), LT(6), LT(6), - LT(7), LT(7), LT(7), LT(7), LT(7), LT(7), LT(7), LT(7) -}; - -static inline int ilog2_32(uint32_t v) -{ - uint32_t t, tt; - if ((tt = v>>16)) return (t = tt>>8) ? 24 + LogTable256[t] : 16 + LogTable256[tt]; - return (t = v>>8) ? 8 + LogTable256[t] : LogTable256[v]; -} - -mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float gap_scale, int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km) -{ // TODO: make sure this works when n has more than 32 bits - int32_t k, *f, *p, *t, *v, n_u, n_v; - int64_t i, j, st = 0; - uint64_t *u, *u2, sum_qspan = 0; - float avg_qspan; - mm128_t *b, *w; - - if (_u) *_u = 0, *n_u_ = 0; - if (n == 0 || a == 0) { - kfree(km, a); - return 0; - } - f = (int32_t*)kmalloc(km, n * 4); - p = (int32_t*)kmalloc(km, n * 4); - t = (int32_t*)kmalloc(km, n * 4); - v = (int32_t*)kmalloc(km, n * 4); - memset(t, 0, n * 4); - - for (i = 0; i < n; ++i) sum_qspan += a[i].y>>32&0xff; - avg_qspan = (float)sum_qspan / n; - - // fill the score and backtrack arrays - for (i = 0; i < n; ++i) { - uint64_t ri = a[i].x; - int64_t max_j = -1; - int32_t qi = (int32_t)a[i].y, q_span = a[i].y>>32&0xff; // NB: only 8 bits of span is used!!! - int32_t max_f = q_span, n_skip = 0, min_d; - int32_t sidi = (a[i].y & MM_SEED_SEG_MASK) >> MM_SEED_SEG_SHIFT; - while (st < i && ri > a[st].x + max_dist_x) ++st; - if (i - st > max_iter) st = i - max_iter; - for (j = i - 1; j >= st; --j) { - int64_t dr = ri - a[j].x; - int32_t dq = qi - (int32_t)a[j].y, dd, sc, log_dd, gap_cost; - int32_t sidj = (a[j].y & MM_SEED_SEG_MASK) >> MM_SEED_SEG_SHIFT; - if ((sidi == sidj && dr == 0) || dq <= 0) continue; // don't skip if an anchor is used by multiple segments; see below - if ((sidi == sidj && dq > max_dist_y) || dq > max_dist_x) continue; - dd = dr > dq? dr - dq : dq - dr; - if (sidi == sidj && dd > bw) continue; - if (n_segs > 1 && !is_cdna && sidi == sidj && dr > max_dist_y) continue; - min_d = dq < dr? dq : dr; - sc = min_d > q_span? q_span : dq < dr? dq : dr; - log_dd = dd? ilog2_32(dd) : 0; - gap_cost = 0; - if (is_cdna || sidi != sidj) { - int c_log, c_lin; - c_lin = (int)(dd * .01 * avg_qspan); - c_log = log_dd; - if (sidi != sidj && dr == 0) ++sc; // possibly due to overlapping paired ends; give a minor bonus - else if (dr > dq || sidi != sidj) gap_cost = c_lin < c_log? c_lin : c_log; - else gap_cost = c_lin + (c_log>>1); - } else gap_cost = (int)(dd * .01 * avg_qspan) + (log_dd>>1); - sc -= (int)((double)gap_cost * gap_scale + .499); - sc += f[j]; - if (sc > max_f) { - max_f = sc, max_j = j; - if (n_skip > 0) --n_skip; - } else if (t[j] == i) { - if (++n_skip > max_skip) - break; - } - if (p[j] >= 0) t[p[j]] = i; - } - f[i] = max_f, p[i] = max_j; - v[i] = max_j >= 0 && v[max_j] > max_f? v[max_j] : max_f; // v[] keeps the peak score up to i; f[] is the score ending at i, not always the peak - } - - // find the ending positions of chains - memset(t, 0, n * 4); - for (i = 0; i < n; ++i) - if (p[i] >= 0) t[p[i]] = 1; - for (i = n_u = 0; i < n; ++i) - if (t[i] == 0 && v[i] >= min_sc) - ++n_u; - if (n_u == 0) { - kfree(km, a); kfree(km, f); kfree(km, p); kfree(km, t); kfree(km, v); - return 0; - } - u = (uint64_t*)kmalloc(km, n_u * 8); - for (i = n_u = 0; i < n; ++i) { - if (t[i] == 0 && v[i] >= min_sc) { - j = i; - while (j >= 0 && f[j] < v[j]) j = p[j]; // find the peak that maximizes f[] - if (j < 0) j = i; // TODO: this should really be assert(j>=0) - u[n_u++] = (uint64_t)f[j] << 32 | j; - } - } - radix_sort_64(u, u + n_u); - for (i = 0; i < n_u>>1; ++i) { // reverse, s.t. the highest scoring chain is the first - uint64_t t = u[i]; - u[i] = u[n_u - i - 1], u[n_u - i - 1] = t; - } - - // backtrack - memset(t, 0, n * 4); - for (i = n_v = k = 0; i < n_u; ++i) { // starting from the highest score - int32_t n_v0 = n_v, k0 = k; - j = (int32_t)u[i]; - do { - v[n_v++] = j; - t[j] = 1; - j = p[j]; - } while (j >= 0 && t[j] == 0); - if (j < 0) { - if (n_v - n_v0 >= min_cnt) u[k++] = u[i]>>32<<32 | (n_v - n_v0); - } else if ((int32_t)(u[i]>>32) - f[j] >= min_sc) { - if (n_v - n_v0 >= min_cnt) u[k++] = ((u[i]>>32) - f[j]) << 32 | (n_v - n_v0); - } - if (k0 == k) n_v = n_v0; // no new chain added, reset - } - *n_u_ = n_u = k, *_u = u; // NB: note that u[] may not be sorted by score here - - // free temporary arrays - kfree(km, f); kfree(km, p); kfree(km, t); - - // write the result to b[] - b = (mm128_t*)kmalloc(km, n_v * sizeof(mm128_t)); - for (i = 0, k = 0; i < n_u; ++i) { - int32_t k0 = k, ni = (int32_t)u[i]; - for (j = 0; j < ni; ++j) - b[k] = a[v[k0 + (ni - j - 1)]], ++k; - } - kfree(km, v); - - // sort u[] and a[] by a[].x, such that adjacent chains may be joined (required by mm_join_long) - w = (mm128_t*)kmalloc(km, n_u * sizeof(mm128_t)); - for (i = k = 0; i < n_u; ++i) { - w[i].x = b[k].x, w[i].y = (uint64_t)k<<32|i; - k += (int32_t)u[i]; - } - radix_sort_128x(w, w + n_u); - u2 = (uint64_t*)kmalloc(km, n_u * 8); - for (i = k = 0; i < n_u; ++i) { - int32_t j = (int32_t)w[i].y, n = (int32_t)u[j]; - u2[i] = u[j]; - memcpy(&a[k], &b[w[i].y>>32], n * sizeof(mm128_t)); - k += n; - } - if (n_u) memcpy(u, u2, n_u * 8); - if (k) memcpy(b, a, k * sizeof(mm128_t)); // write _a_ to _b_ and deallocate _a_ because _a_ is oversized, sometimes a lot - kfree(km, a); kfree(km, w); kfree(km, u2); - return b; -} diff --git a/code_of_conduct.md b/code_of_conduct.md new file mode 100644 index 0000000..b7175c1 --- /dev/null +++ b/code_of_conduct.md @@ -0,0 +1,30 @@ +## Contributor Code of Conduct + +As contributors and maintainers of this project, we pledge to respect all +people who contribute through reporting issues, posting feature requests, +updating documentation, submitting pull requests or patches, and other +activities. + +We are committed to making participation in this project a harassment-free +experience for everyone, regardless of level of experience, gender, gender +identity and expression, sexual orientation, disability, personal appearance, +body size, race, age, or religion. + +Examples of unacceptable behavior by participants include the use of sexual +language or imagery, derogatory comments or personal attacks, trolling, public +or private harassment, insults, or other unprofessional conduct. + +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct. Project maintainers or +contributors who do not follow the Code of Conduct may be removed from the +project team. + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by opening an issue or contacting the maintainer via email. + +This Code of Conduct is adapted from the [Contributor Covenant][cc], [version +1.0.0][v1]. + +[cc]: http://contributor-covenant.org/ +[v1]: http://contributor-covenant.org/version/1/0/0/ diff --git a/cookbook.md b/cookbook.md index 6cb4d5f..8ff2bd0 100644 --- a/cookbook.md +++ b/cookbook.md @@ -31,8 +31,8 @@ To acquire the data used in this cookbook and to install minimap2 and paftools, please follow the command lines below: ```sh # install minimap2 executables -curl -L https://github.com/lh3/minimap2/releases/download/v2.17/minimap2-2.17_x64-linux.tar.bz2 | tar jxf - -cp minimap2-2.17_x64-linux/{minimap2,k8,paftools.js} . # copy executables +curl -L https://github.com/lh3/minimap2/releases/download/v2.21/minimap2-2.21_x64-linux.tar.bz2 | tar jxf - +cp minimap2-2.21_x64-linux/{minimap2,k8,paftools.js} . # copy executables export PATH="$PATH:"`pwd` # put the current directory on PATH # download example datasets curl -L https://github.com/lh3/minimap2/releases/download/v2.10/cookbook-data.tgz | tar zxf - diff --git a/esterr.c b/esterr.c index 733c762..7a509da 100644 --- a/esterr.c +++ b/esterr.c @@ -59,6 +59,6 @@ void mm_est_err(const mm_idx_t *mi, int qlen, int n_regs, mm_reg1_t *regs, const n_tot = en - st + 1; if (r->qs > avg_k && r->rs > avg_k) ++n_tot; if (qlen - r->qs > avg_k && l_ref - r->re > avg_k) ++n_tot; - r->div = logf((float)n_tot / n_match) / avg_k; + r->div = n_match >= n_tot? 0.0f : (float)(1.0 - pow((double)n_match / n_tot, 1.0 / avg_k)); } } diff --git a/example.c b/example.c index ca1fdbe..b495051 100644 --- a/example.c +++ b/example.c @@ -47,7 +47,7 @@ int main(int argc, char *argv[]) printf("%s\t%d\t%d\t%d\t%c\t", ks->name.s, ks->seq.l, r->qs, r->qe, "+-"[r->rev]); printf("%s\t%d\t%d\t%d\t%d\t%d\t%d\tcg:Z:", mi->seq[r->rid].name, mi->seq[r->rid].len, r->rs, r->re, r->mlen, r->blen, r->mapq); for (i = 0; i < r->p->n_cigar; ++i) // IMPORTANT: this gives the CIGAR in the aligned regions. NO soft/hard clippings! - printf("%d%c", r->p->cigar[i]>>4, "MIDNSH"[r->p->cigar[i]&0xf]); + printf("%d%c", r->p->cigar[i]>>4, MM_CIGAR_STR[r->p->cigar[i]&0xf]); putchar('\n'); free(r->p); } diff --git a/format.c b/format.c index 1773c95..b7a7be9 100644 --- a/format.c +++ b/format.c @@ -144,8 +144,8 @@ static void write_cs_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq if (write_tag) mm_sprintf_lite(s, "\tcs:Z:"); for (i = q_off = t_off = 0; i < (int)r->p->n_cigar; ++i) { int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4; - assert((op >= 0 && op <= 3) || op == 7 || op == 8); - if (op == 0 || op == 7 || op == 8) { // match + assert((op >= MM_CIGAR_MATCH && op <= MM_CIGAR_N_SKIP) || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH); + if (op == MM_CIGAR_MATCH || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH) { int l_tmp = 0; for (j = 0; j < len; ++j) { if (qseq[q_off + j] != tseq[t_off + j]) { @@ -166,12 +166,12 @@ static void write_cs_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq } else mm_sprintf_lite(s, ":%d", l_tmp); } q_off += len, t_off += len; - } else if (op == 1) { // insertion to ref + } else if (op == MM_CIGAR_INS) { for (j = 0, tmp[len] = 0; j < len; ++j) tmp[j] = "acgtn"[qseq[q_off + j]]; mm_sprintf_lite(s, "+%s", tmp); q_off += len; - } else if (op == 2) { // deletion from ref + } else if (op == MM_CIGAR_DEL) { for (j = 0, tmp[len] = 0; j < len; ++j) tmp[j] = "acgtn"[tseq[t_off + j]]; mm_sprintf_lite(s, "-%s", tmp); @@ -192,8 +192,8 @@ static void write_MD_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq if (write_tag) mm_sprintf_lite(s, "\tMD:Z:"); for (i = q_off = t_off = 0; i < (int)r->p->n_cigar; ++i) { int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4; - assert((op >= 0 && op <= 3) || op == 7 || op == 8); - if (op == 0 || op == 7 || op == 8) { // match + assert((op >= MM_CIGAR_MATCH && op <= MM_CIGAR_N_SKIP) || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH); + if (op == MM_CIGAR_MATCH || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH) { for (j = 0; j < len; ++j) { if (qseq[q_off + j] != tseq[t_off + j]) { mm_sprintf_lite(s, "%d%c", l_MD, "ACGTN"[tseq[t_off + j]]); @@ -201,15 +201,15 @@ static void write_MD_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq } else ++l_MD; } q_off += len, t_off += len; - } else if (op == 1) { // insertion to ref + } else if (op == MM_CIGAR_INS) { q_off += len; - } else if (op == 2) { // deletion from ref + } else if (op == MM_CIGAR_DEL) { for (j = 0, tmp[len] = 0; j < len; ++j) tmp[j] = "ACGTN"[tseq[t_off + j]]; mm_sprintf_lite(s, "%d^%s", l_MD, tmp); l_MD = 0; t_off += len; - } else if (op == 3) { // reference skip + } else if (op == MM_CIGAR_N_SKIP) { t_off += len; } } @@ -277,7 +277,7 @@ double mm_event_identity(const mm_reg1_t *r) if (r->p == 0) return -1.0f; for (i = 0; i < r->p->n_cigar; ++i) { int32_t op = r->p->cigar[i] & 0xf, len = r->p->cigar[i] >> 4; - if (op == 1 || op == 2) + if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL) ++n_gapo, n_gap += len; } return (double)r->mlen / (r->blen + r->p->n_ambi - n_gap + n_gapo); @@ -335,7 +335,7 @@ void mm_write_paf3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const uint32_t k; mm_sprintf_lite(s, "\tcg:Z:"); for (k = 0; k < r->p->n_cigar; ++k) - mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MIDNSHP=XB"[r->p->cigar[k]&0xf]); + mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, MM_CIGAR_STR[r->p->cigar[k]&0xf]); } if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_MD))) write_cs_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD, 1, opt_flag&MM_F_QSTRAND); @@ -392,7 +392,7 @@ static void write_sam_cigar(kstring_t *s, int sam_flag, int in_tag, int qlen, co assert(clip_len[0] < qlen && clip_len[1] < qlen); if (clip_len[0]) mm_sprintf_lite(s, "%d%c", clip_len[0], clip_char); for (k = 0; k < r->p->n_cigar; ++k) - mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MIDNSHP=XB"[r->p->cigar[k]&0xf]); + mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, MM_CIGAR_STR[r->p->cigar[k]&0xf]); if (clip_len[1]) mm_sprintf_lite(s, "%d%c", clip_len[1], clip_char); } } @@ -402,7 +402,7 @@ void mm_write_sam3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int se { const int max_bam_cigar_op = 65535; int flag, n_regs = n_regss[seg_idx], cigar_in_tag = 0; - int this_rid = -1, this_pos = -1, this_rev = 0; + int this_rid = -1, this_pos = -1; const mm_reg1_t *regs = regss[seg_idx], *r_prev = NULL, *r_next; const mm_reg1_t *r = n_regs > 0 && reg_idx < n_regs && reg_idx >= 0? ®s[reg_idx] : NULL; @@ -451,7 +451,7 @@ void mm_write_sam3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int se mm_sprintf_lite(s, "\t%s\t%d\t0\t*", mi->seq[this_rid].name, this_pos+1); } else mm_sprintf_lite(s, "\t*\t0\t0\t*"); } else { - this_rid = r->rid, this_pos = r->rs, this_rev = r->rev; + this_rid = r->rid, this_pos = r->rs; mm_sprintf_lite(s, "\t%s\t%d\t%d\t", mi->seq[r->rid].name, r->rs+1, r->mapq); if ((opt_flag & MM_F_LONG_CIGAR) && r->p && r->p->n_cigar > max_bam_cigar_op - 2) { int n_cigar = r->p->n_cigar; diff --git a/hit.c b/hit.c index cd30ada..c2e8180 100644 --- a/hit.c +++ b/hit.c @@ -122,7 +122,7 @@ void mm_split_reg(mm_reg1_t *r, mm_reg1_t *r2, int n, int qlen, mm128_t *a, int r->split |= 1, r2->split |= 2; } -void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r, int sub_diff, int hard_mask_level, float alt_diff_frac) // and compute mm_reg1_t::subsc +void mm_set_parent(void *km, float mask_level, int mask_len, int n, mm_reg1_t *r, int sub_diff, int hard_mask_level, float alt_diff_frac) // and compute mm_reg1_t::subsc { int i, j, k, *w; uint64_t *cov; @@ -162,7 +162,7 @@ skip_uncov: min = ej - sj < ei - si? ej - sj : ei - si; max = ej - sj > ei - si? ej - sj : ei - si; ol = si < sj? (ei < sj? 0 : ei < ej? ei - sj : ej - sj) : (ej < si? 0 : ej < ei? ej - si : ei - si); // overlap length; TODO: this can be simplified - if ((float)ol / min - (float)uncov_len / max > mask_level) { + if ((float)ol / min - (float)uncov_len / max > mask_level && uncov_len <= mask_len) { // then this is a secondary hit int cnt_sub = 0, sci = ri->score; ri->parent = rp->parent; if (!rp->is_alt && ri->is_alt) sci = mm_alt_score(sci, alt_diff_frac); @@ -312,64 +312,6 @@ int mm_squeeze_a(void *km, int n_regs, mm_reg1_t *regs, mm128_t *a) return as; } -void mm_join_long(void *km, const mm_mapopt_t *opt, int qlen, int *n_regs_, mm_reg1_t *regs, mm128_t *a) -{ - int i, n_aux, n_regs = *n_regs_, n_drop = 0; - uint64_t *aux; - - if (n_regs < 2) return; // nothing to join - mm_squeeze_a(km, n_regs, regs, a); - - aux = (uint64_t*)kmalloc(km, n_regs * 8); - for (i = n_aux = 0; i < n_regs; ++i) - if (regs[i].parent == i || regs[i].parent < 0) - aux[n_aux++] = (uint64_t)regs[i].as << 32 | i; - radix_sort_64(aux, aux + n_aux); - - for (i = n_aux - 1; i >= 1; --i) { - mm_reg1_t *r0 = ®s[(int32_t)aux[i-1]], *r1 = ®s[(int32_t)aux[i]]; - mm128_t *a0e, *a1s; - int max_gap, min_gap, sc_thres, min_flank_len; - - // test - if (r0->as + r0->cnt != r1->as) continue; // not adjacent in a[] - if (r0->rid != r1->rid || r0->rev != r1->rev) continue; // make sure on the same target and strand - a0e = &a[r0->as + r0->cnt - 1]; - a1s = &a[r1->as]; - if (a1s->x <= a0e->x || (int32_t)a1s->y <= (int32_t)a0e->y) continue; // keep colinearity - max_gap = min_gap = (int32_t)a1s->y - (int32_t)a0e->y; - max_gap = a0e->x + max_gap > a1s->x? max_gap : a1s->x - a0e->x; - min_gap = a0e->x + min_gap < a1s->x? min_gap : a1s->x - a0e->x; - if (max_gap > opt->max_join_long || min_gap > opt->max_join_short) continue; - sc_thres = (int)((float)opt->min_join_flank_sc / opt->max_join_long * max_gap + .499); - if (r0->score < sc_thres || r1->score < sc_thres) continue; // require good flanking chains - min_flank_len = (int)(max_gap * opt->min_join_flank_ratio); - if (r0->re - r0->rs < min_flank_len || r0->qe - r0->qs < min_flank_len) continue; // require enough flanking length - if (r1->re - r1->rs < min_flank_len || r1->qe - r1->qs < min_flank_len) continue; - - // all conditions satisfied; join - a[r1->as].y |= MM_SEED_LONG_JOIN; - r0->cnt += r1->cnt, r0->score += r1->score; - mm_reg_set_coor(r0, qlen, a, opt->flag&MM_F_QSTRAND); - r1->cnt = 0; - r1->parent = r0->id; - ++n_drop; - } - kfree(km, aux); - - if (n_drop > 0) { // then fix the hits hierarchy - for (i = 0; i < n_regs; ++i) { // adjust the mm_reg1_t::parent - mm_reg1_t *r = ®s[i]; - if (r->parent >= 0 && r->id != r->parent) { // fix for secondary hits only - if (regs[r->parent].parent >= 0 && regs[r->parent].parent != r->parent) - r->parent = regs[r->parent].parent; - } - } - mm_filter_regs(opt, qlen, n_regs_, regs); - mm_sync_regs(km, *n_regs_, regs); - } -} - mm_seg_t *mm_seg_gen(void *km, uint32_t hash, int n_segs, const int *qlens, int n_regs0, const mm_reg1_t *regs0, int *n_regs, mm_reg1_t **regs, const mm128_t *a) { int s, i, j, acc_qlen[MM_MAX_SEG+1], qlen_sum = 0; diff --git a/index.c b/index.c index 3838382..0d8a2ae 100644 --- a/index.c +++ b/index.c @@ -676,7 +676,7 @@ mm_idx_intv_t *mm_idx_read_bed(const mm_idx_t *mi, const char *fn, int read_junc char *p, *q, *bl, *bs; int32_t i, id = -1, n_blk = 0; for (p = q = str.s, i = 0;; ++p) { - if (*p == 0 || isspace(*p)) { + if (*p == 0 || *p == '\t') { int32_t c = *p; *p = 0; if (i == 0) { // chr diff --git a/kalloc.h b/kalloc.h index 6d72e4e..93bff5e 100644 --- a/kalloc.h +++ b/kalloc.h @@ -34,4 +34,43 @@ void km_stat(const void *_km, km_stat_t *s); KREALLOC((km), (a), (m)); \ } while (0) +#ifndef klib_unused +#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3) +#define klib_unused __attribute__ ((__unused__)) +#else +#define klib_unused +#endif +#endif /* klib_unused */ + +#define KALLOC_POOL_INIT2(SCOPE, name, kmptype_t) \ + typedef struct { \ + size_t cnt, n, max; \ + kmptype_t **buf; \ + void *km; \ + } kmp_##name##_t; \ + SCOPE kmp_##name##_t *kmp_init_##name(void *km) { \ + kmp_##name##_t *mp; \ + KCALLOC(km, mp, 1); \ + mp->km = km; \ + return mp; \ + } \ + SCOPE void kmp_destroy_##name(kmp_##name##_t *mp) { \ + size_t k; \ + for (k = 0; k < mp->n; ++k) kfree(mp->km, mp->buf[k]); \ + kfree(mp->km, mp->buf); kfree(mp->km, mp); \ + } \ + SCOPE kmptype_t *kmp_alloc_##name(kmp_##name##_t *mp) { \ + ++mp->cnt; \ + if (mp->n == 0) return (kmptype_t*)kcalloc(mp->km, 1, sizeof(kmptype_t)); \ + return mp->buf[--mp->n]; \ + } \ + SCOPE void kmp_free_##name(kmp_##name##_t *mp, kmptype_t *p) { \ + --mp->cnt; \ + if (mp->n == mp->max) KEXPAND(mp->km, mp->buf, mp->max); \ + mp->buf[mp->n++] = p; \ + } + +#define KALLOC_POOL_INIT(name, kmptype_t) \ + KALLOC_POOL_INIT2(static inline klib_unused, name, kmptype_t) + #endif diff --git a/krmq.h b/krmq.h new file mode 100644 index 0000000..8fa1cce --- /dev/null +++ b/krmq.h @@ -0,0 +1,474 @@ +/* The MIT License + + Copyright (c) 2019 by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* An example: + +#include +#include +#include +#include "krmq.h" + +struct my_node { + char key; + KRMQ_HEAD(struct my_node) head; +}; +#define my_cmp(p, q) (((q)->key < (p)->key) - ((p)->key < (q)->key)) +KRMQ_INIT(my, struct my_node, head, my_cmp) + +int main(void) { + const char *str = "MNOLKQOPHIA"; // from wiki, except a duplicate + struct my_node *root = 0; + int i, l = strlen(str); + for (i = 0; i < l; ++i) { // insert in the input order + struct my_node *q, *p = malloc(sizeof(*p)); + p->key = str[i]; + q = krmq_insert(my, &root, p, 0); + if (p != q) free(p); // if already present, free + } + krmq_itr_t(my) itr; + krmq_itr_first(my, root, &itr); // place at first + do { // traverse + const struct my_node *p = krmq_at(&itr); + putchar(p->key); + free((void*)p); // free node + } while (krmq_itr_next(my, &itr)); + putchar('\n'); + return 0; +} +*/ + +#ifndef KRMQ_H +#define KRMQ_H + +#ifdef __STRICT_ANSI__ +#define inline __inline__ +#endif + +#define KRMQ_MAX_DEPTH 64 + +#define krmq_size(head, p) ((p)? (p)->head.size : 0) +#define krmq_size_child(head, q, i) ((q)->head.p[(i)]? (q)->head.p[(i)]->head.size : 0) + +#define KRMQ_HEAD(__type) \ + struct { \ + __type *p[2], *s; \ + signed char balance; /* balance factor */ \ + unsigned size; /* #elements in subtree */ \ + } + +#define __KRMQ_FIND(suf, __scope, __type, __head, __cmp) \ + __scope __type *krmq_find_##suf(const __type *root, const __type *x, unsigned *cnt_) { \ + const __type *p = root; \ + unsigned cnt = 0; \ + while (p != 0) { \ + int cmp; \ + cmp = __cmp(x, p); \ + if (cmp >= 0) cnt += krmq_size_child(__head, p, 0) + 1; \ + if (cmp < 0) p = p->__head.p[0]; \ + else if (cmp > 0) p = p->__head.p[1]; \ + else break; \ + } \ + if (cnt_) *cnt_ = cnt; \ + return (__type*)p; \ + } \ + __scope __type *krmq_interval_##suf(const __type *root, const __type *x, __type **lower, __type **upper) { \ + const __type *p = root, *l = 0, *u = 0; \ + while (p != 0) { \ + int cmp; \ + cmp = __cmp(x, p); \ + if (cmp < 0) u = p, p = p->__head.p[0]; \ + else if (cmp > 0) l = p, p = p->__head.p[1]; \ + else { l = u = p; break; } \ + } \ + if (lower) *lower = (__type*)l; \ + if (upper) *upper = (__type*)u; \ + return (__type*)p; \ + } + +#define __KRMQ_RMQ(suf, __scope, __type, __head, __cmp, __lt2) \ + __scope __type *krmq_rmq_##suf(const __type *root, const __type *lo, const __type *up) { /* CLOSED interval */ \ + const __type *p = root, *path[2][KRMQ_MAX_DEPTH], *min; \ + int plen[2] = {0, 0}, pcmp[2][KRMQ_MAX_DEPTH], i, cmp, lca; \ + if (root == 0) return 0; \ + while (p) { \ + cmp = __cmp(lo, p); \ + path[0][plen[0]] = p, pcmp[0][plen[0]++] = cmp; \ + if (cmp < 0) p = p->__head.p[0]; \ + else if (cmp > 0) p = p->__head.p[1]; \ + else break; \ + } \ + p = root; \ + while (p) { \ + cmp = __cmp(up, p); \ + path[1][plen[1]] = p, pcmp[1][plen[1]++] = cmp; \ + if (cmp < 0) p = p->__head.p[0]; \ + else if (cmp > 0) p = p->__head.p[1]; \ + else break; \ + } \ + for (i = 0; i < plen[0] && i < plen[1]; ++i) /* find the LCA */ \ + if (path[0][i] == path[1][i] && pcmp[0][i] <= 0 && pcmp[1][i] >= 0) \ + break; \ + if (i == plen[0] || i == plen[1]) return 0; /* no elements in the closed interval */ \ + lca = i, min = path[0][lca]; \ + for (i = lca + 1; i < plen[0]; ++i) { \ + if (pcmp[0][i] <= 0) { \ + if (__lt2(path[0][i], min)) min = path[0][i]; \ + if (path[0][i]->__head.p[1] && __lt2(path[0][i]->__head.p[1]->__head.s, min)) \ + min = path[0][i]->__head.p[1]->__head.s; \ + } \ + } \ + for (i = lca + 1; i < plen[1]; ++i) { \ + if (pcmp[1][i] >= 0) { \ + if (__lt2(path[1][i], min)) min = path[1][i]; \ + if (path[1][i]->__head.p[0] && __lt2(path[1][i]->__head.p[0]->__head.s, min)) \ + min = path[1][i]->__head.p[0]->__head.s; \ + } \ + } \ + return (__type*)min; \ + } + +#define __KRMQ_ROTATE(suf, __type, __head, __lt2) \ + /* */ \ + static inline void krmq_update_min_##suf(__type *p, const __type *q, const __type *r) { \ + p->__head.s = !q || __lt2(p, q->__head.s)? p : q->__head.s; \ + p->__head.s = !r || __lt2(p->__head.s, r->__head.s)? p->__head.s : r->__head.s; \ + } \ + /* one rotation: (a,(b,c)q)p => ((a,b)p,c)q */ \ + static inline __type *krmq_rotate1_##suf(__type *p, int dir) { /* dir=0 to left; dir=1 to right */ \ + int opp = 1 - dir; /* opposite direction */ \ + __type *q = p->__head.p[opp], *s = p->__head.s; \ + unsigned size_p = p->__head.size; \ + p->__head.size -= q->__head.size - krmq_size_child(__head, q, dir); \ + q->__head.size = size_p; \ + krmq_update_min_##suf(p, p->__head.p[dir], q->__head.p[dir]); \ + q->__head.s = s; \ + p->__head.p[opp] = q->__head.p[dir]; \ + q->__head.p[dir] = p; \ + return q; \ + } \ + /* two consecutive rotations: (a,((b,c)r,d)q)p => ((a,b)p,(c,d)q)r */ \ + static inline __type *krmq_rotate2_##suf(__type *p, int dir) { \ + int b1, opp = 1 - dir; \ + __type *q = p->__head.p[opp], *r = q->__head.p[dir], *s = p->__head.s; \ + unsigned size_x_dir = krmq_size_child(__head, r, dir); \ + r->__head.size = p->__head.size; \ + p->__head.size -= q->__head.size - size_x_dir; \ + q->__head.size -= size_x_dir + 1; \ + krmq_update_min_##suf(p, p->__head.p[dir], r->__head.p[dir]); \ + krmq_update_min_##suf(q, q->__head.p[opp], r->__head.p[opp]); \ + r->__head.s = s; \ + p->__head.p[opp] = r->__head.p[dir]; \ + r->__head.p[dir] = p; \ + q->__head.p[dir] = r->__head.p[opp]; \ + r->__head.p[opp] = q; \ + b1 = dir == 0? +1 : -1; \ + if (r->__head.balance == b1) q->__head.balance = 0, p->__head.balance = -b1; \ + else if (r->__head.balance == 0) q->__head.balance = p->__head.balance = 0; \ + else q->__head.balance = b1, p->__head.balance = 0; \ + r->__head.balance = 0; \ + return r; \ + } + +#define __KRMQ_INSERT(suf, __scope, __type, __head, __cmp, __lt2) \ + __scope __type *krmq_insert_##suf(__type **root_, __type *x, unsigned *cnt_) { \ + unsigned char stack[KRMQ_MAX_DEPTH]; \ + __type *path[KRMQ_MAX_DEPTH]; \ + __type *bp, *bq; \ + __type *p, *q, *r = 0; /* _r_ is potentially the new root */ \ + int i, which = 0, top, b1, path_len; \ + unsigned cnt = 0; \ + bp = *root_, bq = 0; \ + /* find the insertion location */ \ + for (p = bp, q = bq, top = path_len = 0; p; q = p, p = p->__head.p[which]) { \ + int cmp; \ + cmp = __cmp(x, p); \ + if (cmp >= 0) cnt += krmq_size_child(__head, p, 0) + 1; \ + if (cmp == 0) { \ + if (cnt_) *cnt_ = cnt; \ + return p; \ + } \ + if (p->__head.balance != 0) \ + bq = q, bp = p, top = 0; \ + stack[top++] = which = (cmp > 0); \ + path[path_len++] = p; \ + } \ + if (cnt_) *cnt_ = cnt; \ + x->__head.balance = 0, x->__head.size = 1, x->__head.p[0] = x->__head.p[1] = 0, x->__head.s = x; \ + if (q == 0) *root_ = x; \ + else q->__head.p[which] = x; \ + if (bp == 0) return x; \ + for (i = 0; i < path_len; ++i) ++path[i]->__head.size; \ + for (i = path_len - 1; i >= 0; --i) { \ + krmq_update_min_##suf(path[i], path[i]->__head.p[0], path[i]->__head.p[1]); \ + if (path[i]->__head.s != x) break; \ + } \ + for (p = bp, top = 0; p != x; p = p->__head.p[stack[top]], ++top) /* update balance factors */ \ + if (stack[top] == 0) --p->__head.balance; \ + else ++p->__head.balance; \ + if (bp->__head.balance > -2 && bp->__head.balance < 2) return x; /* no re-balance needed */ \ + /* re-balance */ \ + which = (bp->__head.balance < 0); \ + b1 = which == 0? +1 : -1; \ + q = bp->__head.p[1 - which]; \ + if (q->__head.balance == b1) { \ + r = krmq_rotate1_##suf(bp, which); \ + q->__head.balance = bp->__head.balance = 0; \ + } else r = krmq_rotate2_##suf(bp, which); \ + if (bq == 0) *root_ = r; \ + else bq->__head.p[bp != bq->__head.p[0]] = r; \ + return x; \ + } + +#define __KRMQ_ERASE(suf, __scope, __type, __head, __cmp, __lt2) \ + __scope __type *krmq_erase_##suf(__type **root_, const __type *x, unsigned *cnt_) { \ + __type *p, *path[KRMQ_MAX_DEPTH], fake; \ + unsigned char dir[KRMQ_MAX_DEPTH]; \ + int i, d = 0, cmp; \ + unsigned cnt = 0; \ + fake = **root_, fake.__head.p[0] = *root_, fake.__head.p[1] = 0; \ + if (cnt_) *cnt_ = 0; \ + if (x) { \ + for (cmp = -1, p = &fake; cmp; cmp = __cmp(x, p)) { \ + int which = (cmp > 0); \ + if (cmp > 0) cnt += krmq_size_child(__head, p, 0) + 1; \ + dir[d] = which; \ + path[d++] = p; \ + p = p->__head.p[which]; \ + if (p == 0) { \ + if (cnt_) *cnt_ = 0; \ + return 0; \ + } \ + } \ + cnt += krmq_size_child(__head, p, 0) + 1; /* because p==x is not counted */ \ + } else { \ + for (p = &fake, cnt = 1; p; p = p->__head.p[0]) \ + dir[d] = 0, path[d++] = p; \ + p = path[--d]; \ + } \ + if (cnt_) *cnt_ = cnt; \ + for (i = 1; i < d; ++i) --path[i]->__head.size; \ + if (p->__head.p[1] == 0) { /* ((1,.)2,3)4 => (1,3)4; p=2 */ \ + path[d-1]->__head.p[dir[d-1]] = p->__head.p[0]; \ + } else { \ + __type *q = p->__head.p[1]; \ + if (q->__head.p[0] == 0) { /* ((1,2)3,4)5 => ((1)2,4)5; p=3,q=2 */ \ + q->__head.p[0] = p->__head.p[0]; \ + q->__head.balance = p->__head.balance; \ + path[d-1]->__head.p[dir[d-1]] = q; \ + path[d] = q, dir[d++] = 1; \ + q->__head.size = p->__head.size - 1; \ + } else { /* ((1,((.,2)3,4)5)6,7)8 => ((1,(2,4)5)3,7)8; p=6 */ \ + __type *r; \ + int e = d++; /* backup _d_ */\ + for (;;) { \ + dir[d] = 0; \ + path[d++] = q; \ + r = q->__head.p[0]; \ + if (r->__head.p[0] == 0) break; \ + q = r; \ + } \ + r->__head.p[0] = p->__head.p[0]; \ + q->__head.p[0] = r->__head.p[1]; \ + r->__head.p[1] = p->__head.p[1]; \ + r->__head.balance = p->__head.balance; \ + path[e-1]->__head.p[dir[e-1]] = r; \ + path[e] = r, dir[e] = 1; \ + for (i = e + 1; i < d; ++i) --path[i]->__head.size; \ + r->__head.size = p->__head.size - 1; \ + } \ + } \ + for (i = d - 1; i >= 0; --i) /* not sure why adding condition "path[i]->__head.s==p" doesn't work */ \ + krmq_update_min_##suf(path[i], path[i]->__head.p[0], path[i]->__head.p[1]); \ + while (--d > 0) { \ + __type *q = path[d]; \ + int which, other, b1 = 1, b2 = 2; \ + which = dir[d], other = 1 - which; \ + if (which) b1 = -b1, b2 = -b2; \ + q->__head.balance += b1; \ + if (q->__head.balance == b1) break; \ + else if (q->__head.balance == b2) { \ + __type *r = q->__head.p[other]; \ + if (r->__head.balance == -b1) { \ + path[d-1]->__head.p[dir[d-1]] = krmq_rotate2_##suf(q, which); \ + } else { \ + path[d-1]->__head.p[dir[d-1]] = krmq_rotate1_##suf(q, which); \ + if (r->__head.balance == 0) { \ + r->__head.balance = -b1; \ + q->__head.balance = b1; \ + break; \ + } else r->__head.balance = q->__head.balance = 0; \ + } \ + } \ + } \ + *root_ = fake.__head.p[0]; \ + return p; \ + } + +#define krmq_free(__type, __head, __root, __free) do { \ + __type *_p, *_q; \ + for (_p = __root; _p; _p = _q) { \ + if (_p->__head.p[0] == 0) { \ + _q = _p->__head.p[1]; \ + __free(_p); \ + } else { \ + _q = _p->__head.p[0]; \ + _p->__head.p[0] = _q->__head.p[1]; \ + _q->__head.p[1] = _p; \ + } \ + } \ + } while (0) + +#define __KRMQ_ITR(suf, __scope, __type, __head, __cmp) \ + struct krmq_itr_##suf { \ + const __type *stack[KRMQ_MAX_DEPTH], **top; \ + }; \ + __scope void krmq_itr_first_##suf(const __type *root, struct krmq_itr_##suf *itr) { \ + const __type *p; \ + for (itr->top = itr->stack - 1, p = root; p; p = p->__head.p[0]) \ + *++itr->top = p; \ + } \ + __scope int krmq_itr_find_##suf(const __type *root, const __type *x, struct krmq_itr_##suf *itr) { \ + const __type *p = root; \ + itr->top = itr->stack - 1; \ + while (p != 0) { \ + int cmp; \ + *++itr->top = p; \ + cmp = __cmp(x, p); \ + if (cmp < 0) p = p->__head.p[0]; \ + else if (cmp > 0) p = p->__head.p[1]; \ + else break; \ + } \ + return p? 1 : 0; \ + } \ + __scope int krmq_itr_next_bidir_##suf(struct krmq_itr_##suf *itr, int dir) { \ + const __type *p; \ + if (itr->top < itr->stack) return 0; \ + dir = !!dir; \ + p = (*itr->top)->__head.p[dir]; \ + if (p) { /* go down */ \ + for (; p; p = p->__head.p[!dir]) \ + *++itr->top = p; \ + return 1; \ + } else { /* go up */ \ + do { \ + p = *itr->top--; \ + } while (itr->top >= itr->stack && p == (*itr->top)->__head.p[dir]); \ + return itr->top < itr->stack? 0 : 1; \ + } \ + } \ + +/** + * Insert a node to the tree + * + * @param suf name suffix used in KRMQ_INIT() + * @param proot pointer to the root of the tree (in/out: root may change) + * @param x node to insert (in) + * @param cnt number of nodes smaller than or equal to _x_; can be NULL (out) + * + * @return _x_ if not present in the tree, or the node equal to x. + */ +#define krmq_insert(suf, proot, x, cnt) krmq_insert_##suf(proot, x, cnt) + +/** + * Find a node in the tree + * + * @param suf name suffix used in KRMQ_INIT() + * @param root root of the tree + * @param x node value to find (in) + * @param cnt number of nodes smaller than or equal to _x_; can be NULL (out) + * + * @return node equal to _x_ if present, or NULL if absent + */ +#define krmq_find(suf, root, x, cnt) krmq_find_##suf(root, x, cnt) +#define krmq_interval(suf, root, x, lower, upper) krmq_interval_##suf(root, x, lower, upper) +#define krmq_rmq(suf, root, lo, up) krmq_rmq_##suf(root, lo, up) + +/** + * Delete a node from the tree + * + * @param suf name suffix used in KRMQ_INIT() + * @param proot pointer to the root of the tree (in/out: root may change) + * @param x node value to delete; if NULL, delete the first node (in) + * + * @return node removed from the tree if present, or NULL if absent + */ +#define krmq_erase(suf, proot, x, cnt) krmq_erase_##suf(proot, x, cnt) +#define krmq_erase_first(suf, proot) krmq_erase_##suf(proot, 0, 0) + +#define krmq_itr_t(suf) struct krmq_itr_##suf + +/** + * Place the iterator at the smallest object + * + * @param suf name suffix used in KRMQ_INIT() + * @param root root of the tree + * @param itr iterator + */ +#define krmq_itr_first(suf, root, itr) krmq_itr_first_##suf(root, itr) + +/** + * Place the iterator at the object equal to or greater than the query + * + * @param suf name suffix used in KRMQ_INIT() + * @param root root of the tree + * @param x query (in) + * @param itr iterator (out) + * + * @return 1 if find; 0 otherwise. krmq_at(itr) is NULL if and only if query is + * larger than all objects in the tree + */ +#define krmq_itr_find(suf, root, x, itr) krmq_itr_find_##suf(root, x, itr) + +/** + * Move to the next object in order + * + * @param itr iterator (modified) + * + * @return 1 if there is a next object; 0 otherwise + */ +#define krmq_itr_next(suf, itr) krmq_itr_next_bidir_##suf(itr, 1) +#define krmq_itr_prev(suf, itr) krmq_itr_next_bidir_##suf(itr, 0) + +/** + * Return the pointer at the iterator + * + * @param itr iterator + * + * @return pointer if present; NULL otherwise + */ +#define krmq_at(itr) ((itr)->top < (itr)->stack? 0 : *(itr)->top) + +#define KRMQ_INIT2(suf, __scope, __type, __head, __cmp, __lt2) \ + __KRMQ_FIND(suf, __scope, __type, __head, __cmp) \ + __KRMQ_RMQ(suf, __scope, __type, __head, __cmp, __lt2) \ + __KRMQ_ROTATE(suf, __type, __head, __lt2) \ + __KRMQ_INSERT(suf, __scope, __type, __head, __cmp, __lt2) \ + __KRMQ_ERASE(suf, __scope, __type, __head, __cmp, __lt2) \ + __KRMQ_ITR(suf, __scope, __type, __head, __cmp) + +#define KRMQ_INIT(suf, __type, __head, __cmp, __lt2) \ + KRMQ_INIT2(suf,, __type, __head, __cmp, __lt2) + +#endif diff --git a/ksw2.h b/ksw2.h index c700136..cbd1ddc 100644 --- a/ksw2.h +++ b/ksw2.h @@ -16,6 +16,13 @@ #define KSW_EZ_SPLICE_REV 0x200 #define KSW_EZ_SPLICE_FLANK 0x400 +// The subset of CIGAR operators used by ksw code. +// Use MM_CIGAR_* from minimap.h if you need the full list. +#define KSW_CIGAR_MATCH 0 +#define KSW_CIGAR_INS 1 +#define KSW_CIGAR_DEL 2 +#define KSW_CIGAR_N_SKIP 3 + #ifdef __cplusplus extern "C" { #endif @@ -137,13 +144,13 @@ static inline void ksw_backtrack(void *km, int is_rot, int is_rev, int min_intro else if (!(tmp >> (state + 2) & 1)) state = 0; // if requesting other states, _state_ stays the same if it is a continuation; otherwise, set to H if (state == 0) state = tmp & 7; // TODO: probably this line can be merged into the "else if" line right above; not 100% sure if (force_state >= 0) state = force_state; - if (state == 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 0, 1), --i, --j; // match - else if (state == 1 || (state == 3 && min_intron_len <= 0)) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 2, 1), --i; // deletion - else if (state == 3 && min_intron_len > 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 3, 1), --i; // intron - else cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 1, 1), --j; // insertion + if (state == 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_MATCH, 1), --i, --j; + else if (state == 1 || (state == 3 && min_intron_len <= 0)) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_DEL, 1), --i; + else if (state == 3 && min_intron_len > 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_N_SKIP, 1), --i; + else cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_INS, 1), --j; } - if (i >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, min_intron_len > 0 && i >= min_intron_len? 3 : 2, i + 1); // first deletion - if (j >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 1, j + 1); // first insertion + if (i >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, min_intron_len > 0 && i >= min_intron_len? KSW_CIGAR_N_SKIP : KSW_CIGAR_DEL, i + 1); // first deletion + if (j >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_INS, j + 1); // first insertion if (!is_rev) for (i = 0; i < n_cigar>>1; ++i) // reverse CIGAR tmp = cigar[i], cigar[i] = cigar[n_cigar-1-i], cigar[n_cigar-1-i] = tmp; diff --git a/ksw2_extd2_sse.c b/ksw2_extd2_sse.c index b578274..162e9e2 100644 --- a/ksw2_extd2_sse.c +++ b/ksw2_extd2_sse.c @@ -4,15 +4,23 @@ #include "ksw2.h" #ifdef __SSE2__ +#ifdef USE_SIMDE +#include +#else #include +#endif #ifdef KSW_SSE2_ONLY #undef __SSE4_1__ #endif #ifdef __SSE4_1__ +#ifdef USE_SIMDE +#include +#else #include #endif +#endif #ifdef KSW_CPU_DISPATCH #ifdef __SSE4_1__ diff --git a/ksw2_exts2_sse.c b/ksw2_exts2_sse.c index e7984c6..4157e38 100644 --- a/ksw2_exts2_sse.c +++ b/ksw2_exts2_sse.c @@ -4,15 +4,22 @@ #include "ksw2.h" #ifdef __SSE2__ +#ifdef USE_SIMDE +#include +#else #include - +#endif #ifdef KSW_SSE2_ONLY #undef __SSE4_1__ #endif #ifdef __SSE4_1__ +#ifdef USE_SIMDE +#include +#else #include #endif +#endif #ifdef KSW_CPU_DISPATCH #ifdef __SSE4_1__ diff --git a/ksw2_extz2_sse.c b/ksw2_extz2_sse.c index 02bb4c2..ad19131 100644 --- a/ksw2_extz2_sse.c +++ b/ksw2_extz2_sse.c @@ -3,15 +3,23 @@ #include "ksw2.h" #ifdef __SSE2__ +#ifdef USE_SIMDE +#include +#else #include +#endif #ifdef KSW_SSE2_ONLY #undef __SSE4_1__ #endif #ifdef __SSE4_1__ +#ifdef USE_SIMDE +#include +#else #include #endif +#endif #ifdef KSW_CPU_DISPATCH #ifdef __SSE4_1__ diff --git a/ksw2_ll_sse.c b/ksw2_ll_sse.c index 469de52..14b9b50 100644 --- a/ksw2_ll_sse.c +++ b/ksw2_ll_sse.c @@ -1,9 +1,14 @@ #include #include #include -#include #include "ksw2.h" +#ifdef USE_SIMDE +#include +#else +#include +#endif + #ifdef __GNUC__ #define LIKELY(x) __builtin_expect((x),1) #define UNLIKELY(x) __builtin_expect((x),0) diff --git a/lchain.c b/lchain.c new file mode 100644 index 0000000..220e3e9 --- /dev/null +++ b/lchain.c @@ -0,0 +1,354 @@ +#include +#include +#include +#include +#include "mmpriv.h" +#include "kalloc.h" +#include "krmq.h" + +static inline float mg_log2(float x) // NB: this doesn't work when x<2 +{ + union { float f; uint32_t i; } z = { x }; + float log_2 = ((z.i >> 23) & 255) - 128; + z.i &= ~(255 << 23); + z.i += 127 << 23; + log_2 += (-0.34484843f * z.f + 2.02466578f) * z.f - 0.67487759f; + return log_2; +} + +uint64_t *mg_chain_backtrack(void *km, int64_t n, const int32_t *f, const int64_t *p, int32_t *v, int32_t *t, int32_t min_cnt, int32_t min_sc, int32_t *n_u_, int32_t *n_v_) +{ + mm128_t *z; + uint64_t *u; + int64_t i, k, n_z, n_v; + int32_t n_u; + + *n_u_ = *n_v_ = 0; + for (i = 0, n_z = 0; i < n; ++i) // precompute n_z + if (f[i] >= min_sc) ++n_z; + if (n_z == 0) return 0; + KMALLOC(km, z, n_z); + for (i = 0, k = 0; i < n; ++i) // populate z[] + if (f[i] >= min_sc) z[k].x = f[i], z[k++].y = i; + radix_sort_128x(z, z + n_z); + + memset(t, 0, n * 4); + for (k = n_z - 1, n_v = n_u = 0; k >= 0; --k) { // precompute n_u + int64_t n_v0 = n_v; + int32_t sc; + for (i = z[k].y; i >= 0 && t[i] == 0; i = p[i]) + ++n_v, t[i] = 1; + sc = i < 0? z[k].x : (int32_t)z[k].x - f[i]; + if (sc >= min_sc && n_v > n_v0 && n_v - n_v0 >= min_cnt) + ++n_u; + else n_v = n_v0; + } + KMALLOC(km, u, n_u); + memset(t, 0, n * 4); + for (k = n_z - 1, n_v = n_u = 0; k >= 0; --k) { // populate u[] + int64_t n_v0 = n_v; + int32_t sc; + for (i = z[k].y; i >= 0 && t[i] == 0; i = p[i]) + v[n_v++] = i, t[i] = 1; + sc = i < 0? z[k].x : (int32_t)z[k].x - f[i]; + if (sc >= min_sc && n_v > n_v0 && n_v - n_v0 >= min_cnt) + u[n_u++] = (uint64_t)sc << 32 | (n_v - n_v0); + else n_v = n_v0; + } + kfree(km, z); + assert(n_v < INT32_MAX); + *n_u_ = n_u, *n_v_ = n_v; + return u; +} + +static mm128_t *compact_a(void *km, int32_t n_u, uint64_t *u, int32_t n_v, int32_t *v, mm128_t *a) +{ + mm128_t *b, *w; + uint64_t *u2; + int64_t i, j, k; + + // write the result to b[] + KMALLOC(km, b, n_v); + for (i = 0, k = 0; i < n_u; ++i) { + int32_t k0 = k, ni = (int32_t)u[i]; + for (j = 0; j < ni; ++j) + b[k++] = a[v[k0 + (ni - j - 1)]]; + } + kfree(km, v); + + // sort u[] and a[] by the target position, such that adjacent chains may be joined + KMALLOC(km, w, n_u); + for (i = k = 0; i < n_u; ++i) { + w[i].x = b[k].x, w[i].y = (uint64_t)k<<32|i; + k += (int32_t)u[i]; + } + radix_sort_128x(w, w + n_u); + KMALLOC(km, u2, n_u); + for (i = k = 0; i < n_u; ++i) { + int32_t j = (int32_t)w[i].y, n = (int32_t)u[j]; + u2[i] = u[j]; + memcpy(&a[k], &b[w[i].y>>32], n * sizeof(mm128_t)); + k += n; + } + memcpy(u, u2, n_u * 8); + memcpy(b, a, k * sizeof(mm128_t)); // write _a_ to _b_ and deallocate _a_ because _a_ is oversized, sometimes a lot + kfree(km, a); kfree(km, w); kfree(km, u2); + return b; +} + +static inline int32_t comput_sc(const mm128_t *ai, const mm128_t *aj, int32_t max_dist_x, int32_t max_dist_y, int32_t bw, float chn_pen_gap, float chn_pen_skip, int is_cdna, int n_seg) +{ + int32_t dq = (int32_t)ai->y - (int32_t)aj->y, dr, dd, dg, q_span, sc; + int32_t sidi = (ai->y & MM_SEED_SEG_MASK) >> MM_SEED_SEG_SHIFT; + int32_t sidj = (aj->y & MM_SEED_SEG_MASK) >> MM_SEED_SEG_SHIFT; + if (dq <= 0 || dq > max_dist_x) return INT32_MIN; + dr = (int32_t)(ai->x - aj->x); + if (sidi == sidj && (dr == 0 || dq > max_dist_y)) return INT32_MIN; + dd = dr > dq? dr - dq : dq - dr; + if (sidi == sidj && dd > bw) return INT32_MIN; + if (n_seg > 1 && !is_cdna && sidi == sidj && dr > max_dist_y) return INT32_MIN; + dg = dr < dq? dr : dq; + q_span = aj->y>>32&0xff; + sc = q_span < dg? q_span : dg; + if (dd || dg > q_span) { + float lin_pen, log_pen; + lin_pen = chn_pen_gap * (float)dd + chn_pen_skip * (float)dg; + log_pen = dd >= 1? mg_log2(dd + 1) : 0.0f; // mg_log2() only works for dd>=2 + if (is_cdna || sidi != sidj) { + if (sidi != sidj && dr == 0) ++sc; // possibly due to overlapping paired ends; give a minor bonus + else if (dr > dq || sidi != sidj) sc -= (int)(lin_pen < log_pen? lin_pen : log_pen); // deletion or jump between paired ends + else sc -= (int)(lin_pen + .5f * log_pen); + } else sc -= (int)(lin_pen + .5f * log_pen); + } + return sc; +} + +/* Input: + * a[].x: tid<<33 | rev<<32 | tpos + * a[].y: flags<<40 | q_span<<32 | q_pos + * Output: + * n_u: #chains + * u[]: score<<32 | #anchors (sum of lower 32 bits of u[] is the returned length of a[]) + * input a[] is deallocated on return + */ +mm128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip, + int is_cdna, int n_seg, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km) +{ // TODO: make sure this works when n has more than 32 bits + int32_t *f, *t, *v, n_u, n_v, mmax_f = 0; + int64_t *p, i, j, max_ii, st = 0, n_iter = 0; + uint64_t *u; + + if (_u) *_u = 0, *n_u_ = 0; + if (n == 0 || a == 0) { + kfree(km, a); + return 0; + } + if (max_dist_x < bw) max_dist_x = bw; + if (max_dist_y < bw && !is_cdna) max_dist_y = bw; + KMALLOC(km, p, n); + KMALLOC(km, f, n); + KMALLOC(km, v, n); + KCALLOC(km, t, n); + + // fill the score and backtrack arrays + for (i = 0, max_ii = -1; i < n; ++i) { + int64_t max_j = -1, end_j; + int32_t max_f = a[i].y>>32&0xff, n_skip = 0; + while (st < i && (a[i].x>>32 != a[st].x>>32 || a[i].x > a[st].x + max_dist_x)) ++st; + if (i - st > max_iter) st = i - max_iter; + for (j = i - 1; j >= st; --j) { + int32_t sc; + sc = comput_sc(&a[i], &a[j], max_dist_x, max_dist_y, bw, chn_pen_gap, chn_pen_skip, is_cdna, n_seg); + ++n_iter; + if (sc == INT32_MIN) continue; + sc += f[j]; + if (sc > max_f) { + max_f = sc, max_j = j; + if (n_skip > 0) --n_skip; + } else if (t[j] == (int32_t)i) { + if (++n_skip > max_skip) + break; + } + if (p[j] >= 0) t[p[j]] = i; + } + end_j = j; + if (max_ii < 0 || a[i].x - a[max_ii].x > (int64_t)max_dist_x) { + int32_t max = INT32_MIN; + max_ii = -1; + for (j = i - 1; j >= st; --j) + if (max < f[j]) max = f[j], max_ii = j; + } + if (max_ii >= 0 && max_ii < end_j) { + int32_t tmp; + tmp = comput_sc(&a[i], &a[max_ii], max_dist_x, max_dist_y, bw, chn_pen_gap, chn_pen_skip, is_cdna, n_seg); + if (tmp != INT32_MIN && max_f < tmp + f[max_ii]) + max_f = tmp + f[max_ii], max_j = max_ii; + } + f[i] = max_f, p[i] = max_j; + v[i] = max_j >= 0 && v[max_j] > max_f? v[max_j] : max_f; // v[] keeps the peak score up to i; f[] is the score ending at i, not always the peak + if (max_ii < 0 || (a[i].x - a[max_ii].x <= (int64_t)max_dist_x && f[max_ii] < f[i])) + max_ii = i; + if (mmax_f < max_f) mmax_f = max_f; + } + + u = mg_chain_backtrack(km, n, f, p, v, t, min_cnt, min_sc, &n_u, &n_v); + *n_u_ = n_u, *_u = u; // NB: note that u[] may not be sorted by score here + kfree(km, p); kfree(km, f); kfree(km, t); + if (n_u == 0) { + kfree(km, a); kfree(km, v); + return 0; + } + return compact_a(km, n_u, u, n_v, v, a); +} + +typedef struct lc_elem_s { + int32_t y; + int64_t i; + double pri; + KRMQ_HEAD(struct lc_elem_s) head; +} lc_elem_t; + +#define lc_elem_cmp(a, b) ((a)->y < (b)->y? -1 : (a)->y > (b)->y? 1 : ((a)->i > (b)->i) - ((a)->i < (b)->i)) +#define lc_elem_lt2(a, b) ((a)->pri < (b)->pri) +KRMQ_INIT(lc_elem, lc_elem_t, head, lc_elem_cmp, lc_elem_lt2) + +KALLOC_POOL_INIT(rmq, lc_elem_t) + +static inline int32_t comput_sc_simple(const mm128_t *ai, const mm128_t *aj, float chn_pen_gap, float chn_pen_skip, int32_t *exact, int32_t *width) +{ + int32_t dq = (int32_t)ai->y - (int32_t)aj->y, dr, dd, dg, q_span, sc; + dr = (int32_t)(ai->x - aj->x); + *width = dd = dr > dq? dr - dq : dq - dr; + dg = dr < dq? dr : dq; + q_span = aj->y>>32&0xff; + sc = q_span < dg? q_span : dg; + if (exact) *exact = (dd == 0 && dg <= q_span); + if (dd || dq > q_span) { + float lin_pen, log_pen; + lin_pen = chn_pen_gap * (float)dd + chn_pen_skip * (float)dg; + log_pen = dd >= 1? mg_log2(dd + 1) : 0.0f; // mg_log2() only works for dd>=2 + sc -= (int)(lin_pen + .5f * log_pen); + } + return sc; +} + +mm128_t *mg_lchain_rmq(int max_dist, int max_dist_inner, int bw, int max_chn_skip, int cap_rmq_size, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip, + int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km) +{ + int32_t *f,*t, *v, n_u, n_v, mmax_f = 0, max_rmq_size = 0; + int64_t *p, i, i0, st = 0, st_inner = 0, n_iter = 0; + uint64_t *u; + lc_elem_t *root = 0, *root_inner = 0; + void *mem_mp = 0; + kmp_rmq_t *mp; + + if (_u) *_u = 0, *n_u_ = 0; + if (n == 0 || a == 0) { + kfree(km, a); + return 0; + } + if (max_dist < bw) max_dist = bw; + if (max_dist_inner <= 0 || max_dist_inner >= max_dist) max_dist_inner = 0; + KMALLOC(km, p, n); + KMALLOC(km, f, n); + KCALLOC(km, t, n); + KMALLOC(km, v, n); + mem_mp = km_init2(km, 0x10000); + mp = kmp_init_rmq(mem_mp); + + // fill the score and backtrack arrays + for (i = i0 = 0; i < n; ++i) { + int64_t max_j = -1; + int32_t q_span = a[i].y>>32&0xff, max_f = q_span; + lc_elem_t s, *q, *r, lo, hi; + // add in-range anchors + if (i0 < i && a[i0].x != a[i].x) { + int64_t j; + for (j = i0; j < i; ++j) { + q = kmp_alloc_rmq(mp); + q->y = (int32_t)a[j].y, q->i = j, q->pri = -(f[j] + 0.5 * chn_pen_gap * ((int32_t)a[j].x + (int32_t)a[j].y)); + krmq_insert(lc_elem, &root, q, 0); + if (max_dist_inner > 0) { + r = kmp_alloc_rmq(mp); + *r = *q; + krmq_insert(lc_elem, &root_inner, r, 0); + } + } + i0 = i; + } + // get rid of active chains out of range + while (st < i && (a[i].x>>32 != a[st].x>>32 || a[i].x > a[st].x + max_dist || krmq_size(head, root) > cap_rmq_size)) { + s.y = (int32_t)a[st].y, s.i = st; + if ((q = krmq_find(lc_elem, root, &s, 0)) != 0) { + q = krmq_erase(lc_elem, &root, q, 0); + kmp_free_rmq(mp, q); + } + ++st; + } + if (max_dist_inner > 0) { // similar to the block above, but applied to the inner tree + while (st_inner < i && (a[i].x>>32 != a[st_inner].x>>32 || a[i].x > a[st_inner].x + max_dist_inner || krmq_size(head, root_inner) > cap_rmq_size)) { + s.y = (int32_t)a[st_inner].y, s.i = st_inner; + if ((q = krmq_find(lc_elem, root_inner, &s, 0)) != 0) { + q = krmq_erase(lc_elem, &root_inner, q, 0); + kmp_free_rmq(mp, q); + } + ++st_inner; + } + } + // RMQ + lo.i = INT32_MAX, lo.y = (int32_t)a[i].y - max_dist; + hi.i = 0, hi.y = (int32_t)a[i].y; + if ((q = krmq_rmq(lc_elem, root, &lo, &hi)) != 0) { + int32_t sc, exact, width, n_skip = 0; + int64_t j = q->i; + assert(q->y >= lo.y && q->y <= hi.y); + sc = f[j] + comput_sc_simple(&a[i], &a[j], chn_pen_gap, chn_pen_skip, &exact, &width); + if (width <= bw && sc > max_f) max_f = sc, max_j = j; + if (!exact && root_inner && (int32_t)a[i].y > 0) { + lc_elem_t *lo, *hi; + s.y = (int32_t)a[i].y - 1, s.i = n; + krmq_interval(lc_elem, root_inner, &s, &lo, &hi); + if (lo) { + const lc_elem_t *q; + int32_t width, n_rmq_iter = 0; + krmq_itr_t(lc_elem) itr; + krmq_itr_find(lc_elem, root_inner, lo, &itr); + while ((q = krmq_at(&itr)) != 0) { + if (q->y < (int32_t)a[i].y - max_dist_inner) break; + ++n_rmq_iter; + j = q->i; + sc = f[j] + comput_sc_simple(&a[i], &a[j], chn_pen_gap, chn_pen_skip, 0, &width); + if (width <= bw) { + if (sc > max_f) { + max_f = sc, max_j = j; + if (n_skip > 0) --n_skip; + } else if (t[j] == (int32_t)i) { + if (++n_skip > max_chn_skip) + break; + } + if (p[j] >= 0) t[p[j]] = i; + } + if (!krmq_itr_prev(lc_elem, &itr)) break; + } + n_iter += n_rmq_iter; + } + } + } + // set max + assert(max_j < 0 || (a[max_j].x < a[i].x && (int32_t)a[max_j].y < (int32_t)a[i].y)); + f[i] = max_f, p[i] = max_j; + v[i] = max_j >= 0 && v[max_j] > max_f? v[max_j] : max_f; // v[] keeps the peak score up to i; f[] is the score ending at i, not always the peak + if (mmax_f < max_f) mmax_f = max_f; + if (max_rmq_size < krmq_size(head, root)) max_rmq_size = krmq_size(head, root); + } + km_destroy(mem_mp); + + u = mg_chain_backtrack(km, n, f, p, v, t, min_cnt, min_sc, &n_u, &n_v); + *n_u_ = n_u, *_u = u; // NB: note that u[] may not be sorted by score here + kfree(km, p); kfree(km, f); kfree(km, t); + if (n_u == 0) { + kfree(km, a); kfree(km, v); + return 0; + } + return compact_a(km, n_u, u, n_v, v, a); +} diff --git a/lib/simde b/lib/simde new file mode 160000 index 0000000..b30129b --- /dev/null +++ b/lib/simde @@ -0,0 +1 @@ +Subproject commit b30129b3b48a6823013da2b309c50a081177b6b8 diff --git a/main.c b/main.c index daff256..1f7b66f 100644 --- a/main.c +++ b/main.c @@ -7,7 +7,7 @@ #include "mmpriv.h" #include "ketopt.h" -#define MM_VERSION "2.17-r987-qs-dirty" +#define MM_VERSION "2.21-r1071-qs-dirty" #ifdef __linux__ #include @@ -70,7 +70,9 @@ static ko_longopt_t long_options[] = { { "chain-gap-scale",ko_required_argument, 343 }, { "alt", ko_required_argument, 344 }, { "alt-drop", ko_required_argument, 345 }, - { "qstrand", ko_no_argument, 346 }, + { "mask-len", ko_required_argument, 346 }, + { "rmq", ko_optional_argument, 347 }, + { "qstrand", ko_no_argument, 348 }, { "help", ko_no_argument, 'h' }, { "max-intron-len", ko_required_argument, 'G' }, { "version", ko_no_argument, 'V' }, @@ -82,17 +84,23 @@ static ko_longopt_t long_options[] = { { 0, 0, 0 } }; -static inline int64_t mm_parse_num(const char *str) +static inline int64_t mm_parse_num2(const char *str, char **q) { double x; char *p; x = strtod(str, &p); - if (*p == 'G' || *p == 'g') x *= 1e9; - else if (*p == 'M' || *p == 'm') x *= 1e6; - else if (*p == 'K' || *p == 'k') x *= 1e3; + if (*p == 'G' || *p == 'g') x *= 1e9, ++p; + else if (*p == 'M' || *p == 'm') x *= 1e6, ++p; + else if (*p == 'K' || *p == 'k') x *= 1e3, ++p; + if (q) *q = p; return (int64_t)(x + .499); } +static inline int64_t mm_parse_num(const char *str) +{ + return mm_parse_num2(str, 0); +} + static inline void yes_or_no(mm_mapopt_t *opt, int flag, int long_idx, const char *arg, int yes_to_set) { if (yes_to_set) { @@ -108,7 +116,7 @@ static inline void yes_or_no(mm_mapopt_t *opt, int flag, int long_idx, const cha int main(int argc, char *argv[]) { - const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:"; + const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:"; ketopt_t o = KETOPT_INIT; mm_mapopt_t opt; mm_idxopt_t ipt; @@ -144,7 +152,6 @@ int main(int argc, char *argv[]) else if (c == 'k') ipt.k = atoi(o.arg); else if (c == 'H') ipt.flag |= MM_I_HPC; else if (c == 'd') fnw = o.arg; // the above are indexing related options, except -I - else if (c == 'r') opt.bw = (int)mm_parse_num(o.arg); else if (c == 't') n_threads = atoi(o.arg); else if (c == 'v') mm_verbose = atoi(o.arg); else if (c == 'g') opt.max_gap = (int)mm_parse_num(o.arg); @@ -171,6 +178,7 @@ int main(int argc, char *argv[]) else if (c == 'C') opt.noncan = atoi(o.arg); else if (c == 'I') ipt.batch_size = mm_parse_num(o.arg); else if (c == 'K') opt.mini_batch_size = mm_parse_num(o.arg); + else if (c == 'e') opt.occ_dist = mm_parse_num(o.arg); else if (c == 'R') rg = o.arg; else if (c == 'h') fp_help = stdout; else if (c == '2') opt.flag |= MM_F_2_IO_THREADS; @@ -203,7 +211,6 @@ int main(int argc, char *argv[]) else if (c == 327) opt.max_clip_ratio = atof(o.arg); // --max-clip-ratio else if (c == 328) opt.min_mid_occ = atoi(o.arg); // --min-occ-floor else if (c == 329) opt.flag |= MM_F_OUT_MD; // --MD - else if (c == 330) opt.min_join_flank_ratio = atof(o.arg); // --lj-min-ratio else if (c == 331) opt.sc_ambi = atoi(o.arg); // --score-N else if (c == 332) opt.flag |= MM_F_EQX; // --eqx else if (c == 333) opt.flag |= MM_F_PAF_NO_HIT; // --paf-no-hit @@ -218,8 +225,11 @@ int main(int argc, char *argv[]) else if (c == 343) opt.chain_gap_scale = atof(o.arg); // --chain-gap-scale else if (c == 344) alt_list = o.arg; // --alt else if (c == 345) opt.alt_drop = atof(o.arg); // --alt-drop - else if (c == 346) opt.flag |= MM_F_QSTRAND | MM_F_NO_INV; // --qstrand - else if (c == 314) { // --frag + else if (c == 346) opt.mask_len = mm_parse_num(o.arg); // --mask-len + else if (c == 348) opt.flag |= MM_F_QSTRAND | MM_F_NO_INV; // --qstrand + else if (c == 330) { + fprintf(stderr, "[WARNING] \033[1;31m --lj-min-ratio has been deprecated.\033[0m\n"); + } else if (c == 314) { // --frag yes_or_no(&opt, MM_F_FRAG_MODE, o.longidx, o.arg, 1); } else if (c == 315) { // --secondary yes_or_no(&opt, MM_F_NO_PRINT_2ND, o.longidx, o.arg, 0); @@ -240,6 +250,8 @@ int main(int argc, char *argv[]) yes_or_no(&opt, MM_F_HEAP_SORT, o.longidx, o.arg, 1); } else if (c == 326) { // --dual yes_or_no(&opt, MM_F_NO_DUAL, o.longidx, o.arg, 0); + } else if (c == 347) { // --rmq + yes_or_no(&opt, MM_F_RMQ, o.longidx, o.arg, 1); } else if (c == 'S') { opt.flag |= MM_F_OUT_CS | MM_F_CIGAR | MM_F_OUT_CS_LONG; if (mm_verbose >= 2) @@ -247,6 +259,12 @@ int main(int argc, char *argv[]) } else if (c == 'V') { puts(MM_VERSION); return 0; + } else if (c == 'r') { + opt.bw = (int)mm_parse_num2(o.arg, &s); + if (*s == ',') opt.bw_long = (int)mm_parse_num2(s + 1, &s); + } else if (c == 'U') { + opt.min_mid_occ = strtol(o.arg, &s, 10); + if (*s == ',') opt.max_mid_occ = strtol(s + 1, &s, 10); } else if (c == 'f') { double x; char *p; @@ -301,7 +319,7 @@ int main(int argc, char *argv[]) fprintf(fp_help, " -g NUM stop chain enlongation if there are no minimizers in INT-bp [%d]\n", opt.max_gap); fprintf(fp_help, " -G NUM max intron length (effective with -xsplice; changing -r) [200k]\n"); fprintf(fp_help, " -F NUM max fragment length (effective with -xsr or in the fragment mode) [800]\n"); - fprintf(fp_help, " -r NUM bandwidth used in chaining and DP-based alignment [%d]\n", opt.bw); + fprintf(fp_help, " -r NUM[,NUM] chaining/alignment bandwidth and long-join bandwidth [%d,%d]\n", opt.bw, opt.bw_long); fprintf(fp_help, " -n INT minimal number of minimizers on a chain [%d]\n", opt.min_cnt); fprintf(fp_help, " -m INT minimal chaining score (matching bases minus log gap penalty) [%d]\n", opt.min_chain_score); // fprintf(fp_help, " -T INT SDUST threshold; 0 to disable SDUST [%d]\n", opt.sdust_thres); // TODO: this option is never used; might be buggy @@ -332,7 +350,8 @@ int main(int argc, char *argv[]) fprintf(fp_help, " --version show version number\n"); fprintf(fp_help, " Preset:\n"); fprintf(fp_help, " -x STR preset (always applied before other options; see minimap2.1 for details) []\n"); - fprintf(fp_help, " - map-pb/map-ont - PacBio/Nanopore vs reference mapping\n"); + fprintf(fp_help, " - map-pb/map-ont - PacBio CLR/Nanopore vs reference mapping\n"); + fprintf(fp_help, " - map-hifi - PacBio HiFi reads vs reference mapping\n"); fprintf(fp_help, " - ava-pb/ava-ont - PacBio/Nanopore read overlap\n"); fprintf(fp_help, " - asm5/asm10/asm20 - asm-to-ref mapping, for ~0.1/1/5%% sequence divergence\n"); fprintf(fp_help, " - splice/splice:hq - long-read/Pacbio-CCS spliced alignment\n"); @@ -389,6 +408,7 @@ int main(int argc, char *argv[]) if (mm_verbose >= 3) mm_idx_stat(mi); if (junc_bed) mm_idx_bed_read(mi, junc_bed, 1); if (alt_list) mm_idx_alt_read(mi, alt_list); + if (argc - (o.ind + 1) == 0) continue; // no query files ret = 0; if (!(opt.flag & MM_F_FRAG_MODE)) { for (i = o.ind + 1; i < argc; ++i) { diff --git a/map.c b/map.c index 1def5fd..2428627 100644 --- a/map.c +++ b/map.c @@ -80,49 +80,7 @@ static void collect_minimizers(void *km, const mm_mapopt_t *opt, const mm_idx_t #define heap_lt(a, b) ((a).x > (b).x) KSORT_INIT(heap, mm128_t, heap_lt) -typedef struct { - uint32_t n; - uint32_t q_pos, q_span; - uint32_t seg_id:31, is_tandem:1; - const uint64_t *cr; -} mm_match_t; - -static mm_match_t *collect_matches(void *km, int *_n_m, int max_occ, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos) -{ - int rep_st = 0, rep_en = 0, n_m; - size_t i; - mm_match_t *m; - *n_mini_pos = 0; - *mini_pos = (uint64_t*)kmalloc(km, mv->n * sizeof(uint64_t)); - m = (mm_match_t*)kmalloc(km, mv->n * sizeof(mm_match_t)); - for (i = 0, n_m = 0, *rep_len = 0, *n_a = 0; i < mv->n; ++i) { - const uint64_t *cr; - mm128_t *p = &mv->a[i]; - uint32_t q_pos = (uint32_t)p->y, q_span = p->x & 0xff; - int t; - cr = mm_idx_get(mi, p->x>>8, &t); - if (t >= max_occ) { - int en = (q_pos >> 1) + 1, st = en - q_span; - if (st > rep_en) { - *rep_len += rep_en - rep_st; - rep_st = st, rep_en = en; - } else rep_en = en; - } else { - mm_match_t *q = &m[n_m++]; - q->q_pos = q_pos, q->q_span = q_span, q->cr = cr, q->n = t, q->seg_id = p->y >> 32; - q->is_tandem = 0; - if (i > 0 && p->x>>8 == mv->a[i - 1].x>>8) q->is_tandem = 1; - if (i < mv->n - 1 && p->x>>8 == mv->a[i + 1].x>>8) q->is_tandem = 1; - *n_a += q->n; - (*mini_pos)[(*n_mini_pos)++] = (uint64_t)q_span<<32 | q_pos>>1; - } - } - *rep_len += rep_en - rep_st; - *_n_m = n_m; - return m; -} - -static inline int skip_seed(int flag, uint64_t r, const mm_match_t *q, const char *qname, int qlen, const mm_idx_t *mi, int *is_self) +static inline int skip_seed(int flag, uint64_t r, const mm_seed_t *q, const char *qname, int qlen, const mm_idx_t *mi, int *is_self) { *is_self = 0; if (qname && (flag & (MM_F_NO_DIAG|MM_F_NO_DUAL))) { @@ -151,10 +109,10 @@ static mm128_t *collect_seed_hits_heap(void *km, const mm_mapopt_t *opt, int max { int i, n_m, heap_size = 0; int64_t j, n_for = 0, n_rev = 0; - mm_match_t *m; + mm_seed_t *m; mm128_t *a, *heap; - m = collect_matches(km, &n_m, max_occ, mi, mv, n_a, rep_len, n_mini_pos, mini_pos); + m = mm_collect_matches(km, &n_m, qlen, max_occ, opt->max_max_occ, opt->occ_dist, mi, mv, n_a, rep_len, n_mini_pos, mini_pos); heap = (mm128_t*)kmalloc(km, n_m * sizeof(mm128_t)); a = (mm128_t*)kmalloc(km, *n_a * sizeof(mm128_t)); @@ -168,7 +126,7 @@ static mm128_t *collect_seed_hits_heap(void *km, const mm_mapopt_t *opt, int max } ks_heapmake_heap(heap_size, heap); while (heap_size > 0) { - mm_match_t *q = &m[heap->y>>32]; + mm_seed_t *q = &m[heap->y>>32]; mm128_t *p; uint64_t r = heap->x; int32_t is_self, rpos = (uint32_t)r >> 1; @@ -216,12 +174,12 @@ static mm128_t *collect_seed_hits(void *km, const mm_mapopt_t *opt, int max_occ, int *n_mini_pos, uint64_t **mini_pos) { int i, n_m; - mm_match_t *m; + mm_seed_t *m; mm128_t *a; - m = collect_matches(km, &n_m, max_occ, mi, mv, n_a, rep_len, n_mini_pos, mini_pos); + m = mm_collect_matches(km, &n_m, qlen, max_occ, opt->max_max_occ, opt->occ_dist, mi, mv, n_a, rep_len, n_mini_pos, mini_pos); a = (mm128_t*)kmalloc(km, *n_a * sizeof(mm128_t)); for (i = 0, *n_a = 0; i < n_m; ++i) { - mm_match_t *q = &m[i]; + mm_seed_t *q = &m[i]; const uint64_t *r = q->cr; uint32_t k; for (k = 0; k < q->n; ++k) { @@ -253,11 +211,9 @@ static mm128_t *collect_seed_hits(void *km, const mm_mapopt_t *opt, int max_occ, static void chain_post(const mm_mapopt_t *opt, int max_chain_gap_ref, const mm_idx_t *mi, void *km, int qlen, int n_segs, const int *qlens, int *n_regs, mm_reg1_t *regs, mm128_t *a) { if (!(opt->flag & MM_F_ALL_CHAINS)) { // don't choose primary mapping(s) - mm_set_parent(km, opt->mask_level, *n_regs, regs, opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop); + mm_set_parent(km, opt->mask_level, opt->mask_len, *n_regs, regs, opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop); if (n_segs <= 1) mm_select_sub(km, opt->pri_ratio, mi->k*2, opt->best_n, n_regs, regs); else mm_select_sub_multi(km, opt->pri_ratio, 0.2f, 0.7f, max_chain_gap_ref, mi->k*2, opt->best_n, n_segs, qlens, n_regs, regs); - if (!(opt->flag & (MM_F_SPLICE|MM_F_SR|MM_F_NO_LJOIN))) // long join not working well without primary chains - mm_join_long(km, opt, qlen, n_regs, regs, a); } } @@ -266,7 +222,7 @@ static mm_reg1_t *align_regs(const mm_mapopt_t *opt, const mm_idx_t *mi, void *k if (!(opt->flag & MM_F_CIGAR)) return regs; regs = mm_align_skeleton(km, opt, mi, qlen, seq, n_regs, regs, a); // this calls mm_filter_regs() if (!(opt->flag & MM_F_ALL_CHAINS)) { // don't choose primary mapping(s) - mm_set_parent(km, opt->mask_level, *n_regs, regs, opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop); + mm_set_parent(km, opt->mask_level, opt->mask_len, *n_regs, regs, opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop); mm_select_sub(km, opt->pri_ratio, mi->k*2, opt->best_n, n_regs, regs); mm_set_sam_pri(*n_regs, regs); } @@ -317,9 +273,25 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char ** if (max_chain_gap_ref < opt->max_gap) max_chain_gap_ref = opt->max_gap; } else max_chain_gap_ref = opt->max_gap; - a = mm_chain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, opt->chain_gap_scale, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km); + if (opt->flag & MM_F_RMQ) { + a = mg_lchain_rmq(opt->max_gap, opt->rmq_inner_dist, opt->bw, opt->max_chain_skip, opt->rmq_size_cap, opt->min_cnt, opt->min_chain_score, + opt->chain_gap_scale * 0.01 * mi->k, 0.0f, n_a, a, &n_regs0, &u, b->km); + } else { + a = mg_lchain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, + opt->chain_gap_scale * 0.01 * mi->k, 0.0f, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km); + } - if (opt->max_occ > opt->mid_occ && rep_len > 0) { + if (opt->bw_long > opt->bw && (opt->flag & (MM_F_SPLICE|MM_F_SR|MM_F_NO_LJOIN)) == 0 && n_segs == 1 && n_regs0 > 1) { // re-chain/long-join for long sequences + int32_t st = (int32_t)a[0].y, en = (int32_t)a[(int32_t)u[0] - 1].y; + if (qlen_sum - (en - st) > opt->rmq_rescue_size || en - st > qlen_sum * opt->rmq_rescue_ratio) { + int32_t i; + for (i = 0, n_a = 0; i < n_regs0; ++i) n_a += (int32_t)u[i]; + kfree(b->km, u); + radix_sort_128x(a, a + n_a); + a = mg_lchain_rmq(opt->max_gap, opt->rmq_inner_dist, opt->bw_long, opt->max_chain_skip, opt->rmq_size_cap, opt->min_cnt, opt->min_chain_score, + opt->chain_gap_scale * 0.01 * mi->k, 0.0f, n_a, a, &n_regs0, &u, b->km); + } + } else if (opt->max_occ > opt->mid_occ && rep_len > 0 && !(opt->flag & MM_F_RMQ)) { // re-chain, mostly for short reads int rechain = 0; if (n_regs0 > 0) { // test if the best chain has all the segments int n_chained_segs = 1, max = 0, max_i = -1, max_off = -1, off = 0; @@ -339,7 +311,8 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char ** kfree(b->km, mini_pos); if (opt->flag & MM_F_HEAP_SORT) a = collect_seed_hits_heap(b->km, opt, opt->max_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos); else a = collect_seed_hits(b->km, opt, opt->max_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos); - a = mm_chain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, opt->chain_gap_scale, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km); + a = mg_lchain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, + opt->chain_gap_scale * 0.01 * mi->k, 0.0f, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km); } } b->frag_gap = max_chain_gap_ref; @@ -370,7 +343,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char ** seg = mm_seg_gen(b->km, hash, n_segs, qlens, n_regs0, regs0, n_regs, regs, a); // split fragment chain to separate segment chains free(regs0); for (i = 0; i < n_segs; ++i) { - mm_set_parent(b->km, opt->mask_level, n_regs[i], regs[i], opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop); // update mm_reg1_t::parent + mm_set_parent(b->km, opt->mask_level, opt->mask_len, n_regs[i], regs[i], opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop); // update mm_reg1_t::parent regs[i] = align_regs(opt, mi, b->km, qlens[i], seqs[i], &n_regs[i], regs[i], seg[i].a); mm_set_mapq(b->km, n_regs[i], regs[i], opt->min_chain_score, opt->a, rep_len, is_sr); } @@ -514,7 +487,7 @@ static void merge_hits(step_t *s) } } mm_hit_sort(km, &s->n_reg[k], s->reg[k], opt->alt_drop); - mm_set_parent(km, opt->mask_level, s->n_reg[k], s->reg[k], opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop); + mm_set_parent(km, opt->mask_level, opt->mask_len, s->n_reg[k], s->reg[k], opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop); if (!(opt->flag & MM_F_ALL_CHAINS)) { mm_select_sub(km, opt->pri_ratio, s->p->mi->k*2, opt->best_n, &s->n_reg[k], s->reg[k]); mm_set_sam_pri(s->n_reg[k], s->reg[k]); diff --git a/minimap.h b/minimap.h index 150f52c..f75e7de 100644 --- a/minimap.h +++ b/minimap.h @@ -36,8 +36,9 @@ #define MM_F_NO_END_FLT 0x10000000 #define MM_F_HARD_MLEVEL 0x20000000 #define MM_F_SAM_HIT_ONLY 0x40000000 -#define MM_F_QSTRAND (0x80000000LL) -#define MM_F_NO_INV (0x100000000LL) +#define MM_F_RMQ 0x80000000LL +#define MM_F_QSTRAND (0x100000000LL) +#define MM_F_NO_INV (0x200000000LL) #define MM_I_HPC 0x1 #define MM_I_NO_SEQ 0x2 @@ -47,6 +48,18 @@ #define MM_MAX_SEG 255 +#define MM_CIGAR_MATCH 0 +#define MM_CIGAR_INS 1 +#define MM_CIGAR_DEL 2 +#define MM_CIGAR_N_SKIP 3 +#define MM_CIGAR_SOFTCLIP 4 +#define MM_CIGAR_HARDCLIP 5 +#define MM_CIGAR_PADDING 6 +#define MM_CIGAR_EQ_MATCH 7 +#define MM_CIGAR_X_MISMATCH 8 + +#define MM_CIGAR_STR "MIDNSHP=XB" + #ifdef __cplusplus extern "C" { #endif @@ -115,22 +128,22 @@ typedef struct { int max_qlen; // max query length - int bw; // bandwidth + int bw, bw_long; // bandwidth int max_gap, max_gap_ref; // break a chain if there are no minimizers in a max_gap window int max_frag_len; int max_chain_skip, max_chain_iter; int min_cnt; // min number of minimizers on each chain int min_chain_score; // min chaining score float chain_gap_scale; + int rmq_size_cap, rmq_inner_dist; + int rmq_rescue_size; + float rmq_rescue_ratio; float mask_level; + int mask_len; float pri_ratio; int best_n; // top best_n chains are subjected to DP alignment - int max_join_long, max_join_short; - int min_join_flank_sc; - float min_join_flank_ratio; - float alt_drop; int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties @@ -147,9 +160,9 @@ typedef struct { int pe_ori, pe_bonus; float mid_occ_frac; // only used by mm_mapopt_update(); see below - int32_t min_mid_occ; + int32_t min_mid_occ, max_mid_occ; int32_t mid_occ; // ignore seeds with occurrences above this threshold - int32_t max_occ; + int32_t max_occ, max_max_occ, occ_dist; int64_t mini_batch_size; // size of a batch of query bases to process in parallel int64_t max_sw_mat; diff --git a/minimap2.1 b/minimap2.1 index 88d895c..a1c0fb8 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -1,4 +1,4 @@ -.TH minimap2 1 "4 May 2019" "minimap2-2.17 (r941)" "Bioinformatics tools" +.TH minimap2 1 "6 July 2021" "minimap2-2.21 (r1071)" "Bioinformatics tools" .SH NAME .PP minimap2 - mapping and alignment between collections of DNA sequences @@ -145,22 +145,32 @@ or .B -xsr mode, which sets the threshold for a second round of seeding. .TP -.BI --min-occ-floor \ INT -Force minimap2 to always use k-mers occurring +.BI -U \ INT1 [, INT2 ] +Lower and upper bounds of k-mer occurrences [10,1000000]. The final k-mer occurrence threshold is +.RI max{ INT1 ,\ min{ INT2 , +.BR -f }}. +This option prevents excessively small or large +.B -f +estimated from the input reference. It deprecates +.B --min-occ-floor +in earlier versions of minimap2. +.TP +.BI -e \ INT +Sample a high-frequency minimizer every .I INT -times or less [0]. In effect, the max occurrence threshold is set to -the -.RI max{ INT , -.BR -f }. +basepairs [500]. .TP -.BI -g \ INT +.BI -g \ NUM Stop chain enlongation if there are no minimizers within -.IR INT -bp -[10000]. +.IR NUM -bp +[10k]. .TP -.BI -r \ INT -Bandwidth used in chaining and DP-based alignment [500]. This option -approximately controls the maximum gap size. +.BI -r \ NUM1 [, NUM2 ] +Bandwidth for chaining and base alignment [500,20k]. +.I NUM1 +is used for initial chaining and alignment extension; +.I NUM2 +for RMQ-based re-chaining and closing gaps in alignments. .TP .BI -n \ INT Discard chains consisting of @@ -234,10 +244,21 @@ Mark as secondary a chain that overlaps with a better chain by .I FLOAT or more of the shorter chain [0.5] .TP +.BR --rmq = no | yes +Use the minigraph chaining algorithm [no]. The minigraph algorithm is better +for aligning contigs through long INDELs. +.TP .B --hard-mask-level Honor option .B -M -and disable a heurstic to save unmapped subsequences. +and disable a heurstic to save unmapped subsequences and disables +.BR --mask-len . +.TP +.BI --mask-len \ NUM +Keep an alignment if dropping it leaves an unaligned region on query longer than +.IR INT +[inf]. Effective without +.BR --hard-mask-level . .TP .BI --max-chain-skip \ INT A heuristics that stops chaining early [25]. Minimap2 uses dynamic programming @@ -261,10 +282,6 @@ Disable the long gap patching heuristic. When this option is applied, the maximum alignment gap is mostly controlled by .BR -r . .TP -.BI --lj-min-ratio \ FLOAT -Fraction of query sequence length required to bridge a long gap [0.5]. A -smaller value helps to recover longer gaps, at the cost of more false gaps. -.TP .B --splice Enable the splice alignment mode. .TP @@ -384,7 +401,7 @@ BED12 file can be converted from GTF/GFF3 with `paftools.js gff2bed anno.gtf' .BR --junc-bonus \ INT Score bonus for a splice donor or acceptor found in annotation (effective with .BR --junc-bed ) -[0]. +[9]. .TP .BI --end-seed-pen \ INT Drop a terminal anchor if @@ -405,7 +422,7 @@ alignment. .BI --cap-sw-mem \ NUM Skip alignment if the DP matrix size is above .IR NUM . -Set 0 to disable [0]. +Set 0 to disable [100m]. .SS Input/output options .TP 10 .B -a @@ -516,60 +533,47 @@ Available .I STR are: .RS -.TP 8 -.B map-pb -PacBio/Oxford Nanopore read to reference mapping -.RB ( -Hk19 ) -.TP +.TP 10 .B map-ont -Slightly more sensitive for Oxford Nanopore to reference mapping -.RB ( -k15 ). -For PacBio reads, HPC minimizers consistently leads to faster performance and -more sensitive results in comparison to normal minimizers. For Oxford Nanopore -data, normal minimizers are better, though not much. The effectiveness of HPC -is determined by the sequencing error mode. +Align noisy long reads of ~10% error rate to a reference genome. This is the +default mode. +.TP +.B map-hifi +Align PacBio high-fidelity (HiFi) reads to a reference genome +.RB ( -k19 +.B -w19 -U50,500 -g10k -A1 -B4 -O6,26 -E2,1 +.BR -s200 ). +.TP +.B map-pb +Align older PacBio continuous long (CLR) reads to a reference genome +.RB ( -Hk19 ). .TP .B asm5 Long assembly to reference mapping .RB ( -k19 -.B -w19 -A1 -B19 -O39,81 -E3,1 -s200 -z200 -N50 -.BR --min-occ-floor=100 ). +.B -w19 -U50,500 --rmq -r100k -g10k -A1 -B19 -O39,81 -E3,1 -s200 -z200 +.BR -N50 ). Typically, the alignment will not extend to regions with 5% or higher sequence divergence. Only use this preset if the average divergence is far below 5%. .TP .B asm10 Long assembly to reference mapping .RB ( -k19 -.B -w19 -A1 -B9 -O16,41 -E2,1 -s200 -z200 -N50 -.BR --min-occ-floor=100 ). +.B -w19 -U50,500 --rmq -r100k -g10k -A1 -B9 -O16,41 -E2,1 -s200 -z200 +.BR -N50 ). Up to 10% sequence divergence. .TP .B asm20 Long assembly to reference mapping .RB ( -k19 -.B -w10 -A1 -B4 -O6,26 -E2,1 -s200 -z200 -N50 -.BR --min-occ-floor=100 ). +.B -w10 -U50,500 --rmq -r100k -g10k -A1 -B4 -O6,26 -E2,1 -s200 -z200 +.BR -N50 ). Up to 20% sequence divergence. .TP -.B ava-pb -PacBio all-vs-all overlap mapping -.RB ( -Hk19 -.B -Xw5 -m100 -g10000 --max-chain-skip -.BR 25 ). -.TP -.B ava-ont -Oxford Nanopore all-vs-all overlap mapping -.RB ( -k15 -.B -Xw5 -m100 -g10000 -r2000 --max-chain-skip -.BR 25 ). -Similarly, the major difference from -.B ava-pb -is that this preset is not using HPC minimizers. -.TP .B splice Long-read spliced alignment .RB ( -k15 -.B -w5 --splice -g2000 -G200k -A1 -B2 -O2,32 -E1,0 -C9 -z200 -ub --junc-bonus=9 +.B -w5 --splice -g2k -G200k -A1 -B2 -O2,32 -E1,0 -C9 -z200 -ub --junc-bonus=9 --cap-sw-mem=0 .BR --splice-flank=yes ). In the splice mode, 1) long deletions are taken as introns and represented as the @@ -588,9 +592,21 @@ Long-read splice alignment for PacBio CCS reads .B sr Short single-end reads without splicing .RB ( -k21 -.B -w11 --sr --frag=yes -A2 -B8 -O12,32 -E2,1 -r50 -p.5 -N20 -f1000,5000 -n2 -m20 -.B -s40 -g200 -2K50m --heap-sort=yes +.B -w11 --sr --frag=yes -A2 -B8 -O12,32 -E2,1 -r100 -p.5 -N20 -f1000,5000 -n2 -m20 +.B -s40 -g100 -2K50m --heap-sort=yes .BR --secondary=no ). +.TP +.B ava-pb +PacBio CLR all-vs-all overlap mapping +.RB ( -Hk19 +.B -Xw5 -e0 +.BR -m100 ). +.TP +.B ava-ont +Oxford Nanopore all-vs-all overlap mapping +.RB ( -k15 +.B -Xw5 -e0 -m100 +.BR -r2k ). .RE .SS Miscellaneous options .TP 10 diff --git a/misc.c b/misc.c index ddfa9de..f3a29f8 100644 --- a/misc.c +++ b/misc.c @@ -159,3 +159,4 @@ KRADIX_SORT_INIT(128x, mm128_t, sort_key_128x, 8) KRADIX_SORT_INIT(64, uint64_t, sort_key_64, 8) KSORT_INIT_GENERIC(uint32_t) +KSORT_INIT_GENERIC(uint64_t) diff --git a/misc/paftools.js b/misc/paftools.js index f15cfde..0c5c186 100755 --- a/misc/paftools.js +++ b/misc/paftools.js @@ -1,6 +1,6 @@ #!/usr/bin/env k8 -var paftools_version = '2.17-r982-dirty'; +var paftools_version = '2.21-r1071'; /***************************** ***** Library functions ***** @@ -962,12 +962,13 @@ function paf_asmgene(args) function paf_stat(args) { - var c, gap_out_len = null; - while ((c = getopt(args, "l:")) != null) + var c, gap_out_len = null, count_err = false; + while ((c = getopt(args, "cl:")) != null) if (c == 'l') gap_out_len = parseInt(getopt.arg); + else if (c == 'c') count_err = true; if (getopt.ind == args.length) { - print("Usage: paftools.js stat [-l gapOutLen] |"); + print("Usage: paftools.js stat [-c] [-l gapOutLen] |"); exit(1); } @@ -998,7 +999,7 @@ function paf_stat(args) if (line.charAt(0) != '@') { var t = line.split("\t", 12); var m, rs, cigar = null, is_pri = false, is_sam = false, is_rev = false, tname = null; - var atlen = null, aqlen, qs, qe, mapq, ori_qlen; + var atlen = null, aqlen, qs, qe, mapq, ori_qlen, NM = null; if (t.length < 2) continue; if (t[4] == '+' || t[4] == '-' || t[4] == '*') { // PAF if (t[4] == '*') continue; // unmapped @@ -1006,6 +1007,8 @@ function paf_stat(args) ++n_2nd; continue; } + if ((m = /\tNM:i:(\d+)/.exec(line)) != null) + NM = parseInt(m[1]); if ((m = /\tcg:Z:(\S+)/.exec(line)) != null) cigar = m[1]; if (cigar == null) { @@ -1027,6 +1030,8 @@ function paf_stat(args) ++n_2nd; continue; } + if ((m = /\tNM:i:(\d+)/.exec(line)) != null) + NM = parseInt(m[1]); cigar = t[5]; tname = t[2]; rs = parseInt(t[3]) - 1; @@ -1045,11 +1050,13 @@ function paf_stat(args) ++n_seq, last = t[0]; } var M = 0, tl = 0, ql = 0, clip = [0, 0], n_cigar = 0, sclip = 0; + var n_gapo = 0, n_gap_all = 0, l_match = 0; while ((m = re.exec(cigar)) != null) { var l = parseInt(m[1]); ++n_cigar; if (m[2] == 'M' || m[2] == '=' || m[2] == 'X') { tl += l, ql += l, M += l; + l_match += l; } else if (m[2] == 'I' || m[2] == 'D') { var type; if (l < 50) type = 0; @@ -1062,6 +1069,7 @@ function paf_stat(args) else tl += l, ++n_gap[1][type]; if (gap_out_len != null && l >= gap_out_len) print(t[0], ql, is_rev? '-' : '+', tname, rs + tl, m[2], l); + ++n_gapo, n_gap_all += l; } else if (m[2] == 'N') { tl += l; } else if (m[2] == 'S') { @@ -1079,6 +1087,12 @@ function paf_stat(args) qs = clip[is_rev? 1 : 0], qe = qs + ql; ori_qlen = clip[0] + ql + clip[1]; } + if (count_err && NM != null) { + var n_mm = NM - n_gap_all; + if (n_mm < 0) warn("WARNING: NM is smaller than the number of gaps at line " + lineno); + if (n_mm < 0) n_mm = 0; + print(t[0], ori_qlen, t[11], ori_qlen - (qe - qs), NM, l_match + n_gap_all, n_mm + n_gapo, l_match + n_gapo); + } regs.push([qs, qe]); last_qlen = ori_qlen; } @@ -1091,7 +1105,7 @@ function paf_stat(args) file.close(); buf.destroy(); - if (gap_out_len == null) { + if (gap_out_len == null && !count_err) { print("Number of mapped sequences: " + n_seq); print("Number of primary alignments: " + n_pri); print("Number of secondary alignments: " + n_2nd); @@ -1405,7 +1419,7 @@ function paf_view(args) var s_ref = new Bytes(), s_qry = new Bytes(), s_mid = new Bytes(); // these are used to show padded alignment var re_cs = /([:=\-\+\*])(\d+|[A-Za-z]+)/g; - var re_cg = /(\d+)([MIDNSH])/g; + var re_cg = /(\d+)([MIDNSHP=X])/g; var buf = new Bytes(); var file = args[getopt.ind] == "-"? new File() : new File(args[getopt.ind]); @@ -1885,7 +1899,7 @@ function paf_splice2bed(args) a.length = 0; } - var re = /(\d+)([MIDNSH])/g; + var re = /(\d+)([MIDNSHP=X])/g; var c, fmt = "bed", fn_name_conv = null, keep_multi = false; while ((c = getopt(args, "f:n:m")) != null) { if (c == 'f') fmt = getopt.arg; @@ -2355,28 +2369,43 @@ function paf_junceval(args) file = getopt.ind+1 >= args.length || args[getopt.ind+1] == '-'? new File() : new File(args[getopt.ind+1]); var last_qname = null; - var re_cigar = /(\d+)([MIDNSHX=])/g; + var re_cigar = /(\d+)([MIDNSHP=X])/g; while (file.readline(buf) >= 0) { var m, t = buf.toString().split("\t"); + var ctg_name = null, cigar = null, pos = null, qname = t[0]; if (t[0].charAt(0) == '@') continue; - if (chr_only && !/^(chr)?([0-9]+|X|Y)$/.test(t[2])) continue; - var flag = parseInt(t[1]); - if (flag&0x100) continue; - if (first_only && last_qname == t[0]) continue; - if (t[2] == '*') { + if (t[4] == '+' || t[4] == '-' || t[4] == '*') { // PAF + ctg_name = t[5], pos = parseInt(t[7]); + var type = 'P'; + for (i = 12; i < t.length; ++i) { + if ((m = /^(tp:A|cg:Z):(\S+)/.exec(t[i])) != null) { + if (m[1] == 'tp:A') type = m[2]; + else cigar = m[2]; + } + } + if (type == 'S') continue; // secondary + } else { // SAM + ctg_name = t[2], pos = parseInt(t[3]) - 1, cigar = t[5]; + var flag = parseInt(t[1]); + if (flag&0x100) continue; // secondary + } + + if (chr_only && !/^(chr)?([0-9]+|X|Y)$/.test(ctg_name)) continue; + if (first_only && last_qname == qname) continue; + if (ctg_name == '*') { // unmapped ++n_unmapped; continue; } else { ++n_pri; - if (last_qname != t[0]) { + if (last_qname != qname) { ++n_mapped; - last_qname = t[0]; + last_qname = qname; } } - var pos = parseInt(t[3]) - 1, intron = []; - while ((m = re_cigar.exec(t[5])) != null) { + var intron = []; + while ((m = re_cigar.exec(cigar)) != null) { var len = parseInt(m[1]), op = m[2]; if (op == 'N') { intron.push([pos, pos + len]); @@ -2389,7 +2418,7 @@ function paf_junceval(args) } n_splice += intron.length; - var chr = anno[t[2]]; + var chr = anno[ctg_name]; if (chr != null) { for (var i = 0; i < intron.length; ++i) { var o = Interval.find_ovlp(chr, intron[i][0], intron[i][1]); @@ -2413,12 +2442,12 @@ function paf_junceval(args) x += '(' + o[j][0] + "," + o[j][1] + ')'; } x += ']'; - print(type, t[0], i+1, t[2], intron[i][0], intron[i][1], x); + print(type, qname, i+1, ctg_name, intron[i][0], intron[i][1], x); } } else { ++n_splice_novel; if (print_ovlp) - print('N', t[0], i+1, t[2], intron[i][0], intron[i][1]); + print('N', qname, i+1, ctg_name, intron[i][0], intron[i][1]); } } } else { @@ -2512,6 +2541,395 @@ function paf_ov_eval(args) print((100 * (1 - n_missing / n_ovlp)).toFixed(2) + "% sensitivity"); } +function paf_vcfstat(args) +{ + var c, ts = { "AG":1, "GA":1, "CT":1, "TC":1 }; + while ((c = getopt(args, "")) != null) { + } + var buf = new Bytes(); + var file = args.length == getopt.ind? new File() : new File(args[getopt.ind]); + var x = { sub:0, ts:0, tv:0, ins:0, del:0, ins1:0, del1:0, ins2:0, del2:0, ins50:0, del50:0, ins1k:0, del1k:0, ins7k:0, del7k:0, insinf:0, delinf:0 }; + while (file.readline(buf) >= 0) { + var t = buf.toString().split("\t"); + if (t[0][0] == '#') continue; + var alt = t[4].split(","); + var ref = t[3]; + for (var i = 0; i < alt.length; ++i) { + var a = alt[i]; + if (a[0] == '<' || a[1] == '>') continue; + var l = ref.length < a.length? ref.length : a.length; + for (var j = 0; j < l; ++j) { + if (ref[j] != a[j]) { + ++x.sub; + if (ts[ref[j] + a[j]]) ++x.ts; + else ++x.tv; + } + } + var d = a.length - ref.length; + if (d > 0) { + ++x.ins; + if (d == 1) ++x.ins1; + else if (d == 2) ++x.ins2; + else if (d < 50) ++x.ins50; + else if (d < 1000) ++x.ins1k; + else if (d < 7000) ++x.ins7k; + else ++x.insinf; + } else if (d < 0) { + d = -d; + ++x.del; + if (d == 1) ++x.del1; + else if (d == 2) ++x.del2; + else if (d < 50) ++x.del50; + else if (d < 1000) ++x.del1k; + else if (d < 7000) ++x.del7k; + else ++x.delinf; + } + } + } + file.close(); + buf.destroy(); + print("# substitutions: " + x.sub); + print("ts/tv: " + (x.ts / x.tv).toFixed(3)); + print("# insertions: " + x.ins); + print("# 1bp insertions: " + x.ins1); + print("# 2bp insertions: " + x.ins2); + print("# [3,50) insertions: " + x.ins50); + print("# [50,1000) insertions: " + x.ins1k); + print("# [1000,7000) insertions: " + x.ins7k); + print("# >=7000 insertions: " + x.insinf); + print("# deletions: " + x.del); + print("# 1bp deletions: " + x.del1); + print("# 2bp deletions: " + x.del2); + print("# [3,50) deletions: " + x.del50); + print("# [50,1000) deletions: " + x.del1k); + print("# [1000,7000) deletions: " + x.del7k); + print("# >=7000 deletions: " + x.delinf); +} + +function paf_parseNum(s) { + var m, x = null; + if ((m = /^(\d*\.?\d*)([mMgGkK]?)/.exec(s)) != null) { + x = parseFloat(m[1]); + if (m[2] == 'k' || m[2] == 'K') x *= 1000; + else if (m[2] == 'm' || m[2] == 'M') x *= 1000000; + else if (m[2] == 'g' || m[2] == 'G') x *= 1000000000; + } + return Math.floor(x + .499); +} + +function paf_misjoin(args) +{ + var c, min_seg_len = 1000000, max_gap = 1000000, fn_cen = null, show_long = false, show_err = false, cen_ratio = 0.5; + var n_diff = [0, 0], n_gap = [0, 0], n_inv = [0, 0], n_inv_end = [0, 0]; + while ((c = getopt(args, "l:g:c:per:")) != null) { + if (c == 'l') min_seg_len = paf_parseNum(getopt.arg); + else if (c == 'g') max_gap = paf_parseNum(getopt.arg); + else if (c == 'c') fn_cen = getopt.arg; + else if (c == 'r') cen_ratio = parseFloat(getopt.arg); + else if (c == 'p') show_long = true; + else if (c == 'e') show_err = true; + } + if (args.length == getopt.ind) { + print("Usage: paftools.js misjoin [options] "); + print("Options:"); + print(" -c FILE BED for centromeres []"); + print(" -r FLOAT count a centromeric event if overlap ratio > FLOAT [" + cen_ratio + "]"); + print(" -l NUM min alignment block length [1m]"); + print(" -g NUM max gap size [1m]"); + print(" -e output misjoins not involving centromeres"); + print(" -p output long alignment blocks for debugging"); + return; + } + var cen = {}; + var file, buf = new Bytes(); + if (fn_cen != null) { + file = new File(fn_cen); + while (file.readline(buf) >= 0) { + var t = buf.toString().split("\t"); + if (cen[t[0]] == null) cen[t[0]] = []; + cen[t[0]].push([parseInt(t[1]), parseInt(t[2])]); + } + file.close(); + } + + function test_cen(cen, chr, st, en) { + var b = cen[chr], len = 0; + if (b == null) return false; + for (var j = 0; j < b.length; ++j) + if (b[j][0] < en && b[j][1] > st) { + var s = b[j][0] > st? b[j][0] : st; + var e = b[j][1] < en? b[j][1] : en; + len += e - s; + } + return len < (en - st) * cen_ratio? false : true; + } + + function process(a) { + var k = 0; + for (var i = 0; i < a.length; ++i) { + for (var j = 1; j <= 3; ++j) a[i][j] = parseInt(a[i][j]); + for (var j = 6; j <= 11; ++j) a[i][j] = parseInt(a[i][j]); + if (a[i][10] >= min_seg_len) a[k++] = a[i]; + } + a.length = k; + if (a.length == 1) return; + a = a.sort(function(x,y){return x[2]-y[2]}); + if (show_long) for (var i = 0; i < a.length; ++i) print(a[i].join("\t")); + for (var i = 1; i < a.length; ++i) { + var ov = [false, false]; + ov[0] = test_cen(cen, a[i-1][5], a[i-1][7], a[i-1][8]); + ov[1] = test_cen(cen, a[i][5], a[i][7], a[i][8]); + if (a[i-1][5] != a[i][5]) { // different chr + if (ov[0] || ov[1]) ++n_diff[1]; + else if (show_err) { + print("J", a[i-1].slice(0, 12).join("\t")); + print("J", a[i].slice(0, 12).join("\t")); + } + ++n_diff[0]; + } else if (a[i-1][4] == a[i][4]) { // a gap + var dq = a[i][2] - a[i-1][3]; + var dr = a[i][4] == '+'? a[i][7] - a[i-1][8] : a[i-1][7] - a[i][8]; + var gap = dr > dq? dr - dq : dq - dr; + if (gap > max_gap) { + if (ov[0] || ov[1]) ++n_gap[1]; + else if (show_err) { + print("G", a[i-1].slice(0, 12).join("\t")); + print("G", a[i].slice(0, 12).join("\t")); + } + ++n_gap[0]; + } + } else if (i + 1 < a.length && a[i+1][4] == a[i-1][4]) { // bracketed inversion + if (ov[0] || ov[1]) ++n_inv[1]; + else if (show_err) { + print("M", a[i-1].slice(0, 12).join("\t")); + print("M", a[i].slice(0, 12).join("\t")); + print("M", a[i+1].slice(0, 12).join("\t")); + } + ++n_inv[0]; + ++i; + } else { // hanging inversion + if (ov[0] || ov[1]) ++n_inv_end[1]; + ++n_inv_end[0]; + } + } + } + + file = args[getopt.ind] == "-"? new File() : new File(args[getopt.ind]); + var a = []; + while (file.readline(buf) >= 0) { + var t = buf.toString().split("\t"); + if (a.length > 0 && a[0][0] != t[0]) { + process(a); + a.length = 0; + } + a.push(t); + } + if (a.length > 0) process(a); + file.close(); + buf.destroy(); + print("# inter-chromosomal misjoins: " + n_diff.join(",")); + print("# intra-chromosomal gaps: " + n_gap.join(",")); + print("# candidate inversions in the middle: " + n_inv.join(",")); + print("# candidate inversions at contig ends: " + n_inv_end.join(",")); +} + +function _paf_get_alen(t) +{ + var svlen = null, alen = null; + if ((m = /(^|;)SVLEN=(-?\d+)/.exec(t[7])) != null) + svlen = parseInt(m[2]); + var s = t[4].split(","); + var min_abs_diff = 1<<30, max_abs_diff = 0; + if (svlen != null && svlen != 0) + alen = svlen, min_abs_diff = max_abs_diff = svlen > 0? svlen : -svlen; + var rlen = t[3].length; + for (var i = 0; i < s.length; ++i) { + if (/^<\S+>$/.test(s[i])) continue; + var diff = s[i].length - rlen; + var abs_diff = diff > 0? diff : -diff; + min_abs_diff = min_abs_diff < abs_diff? min_abs_diff : abs_diff; + if (max_abs_diff < abs_diff) + max_abs_diff = abs_diff, alen = diff; + } + return [alen, min_abs_diff, max_abs_diff]; +} + +function paf_sveval(args) +{ + var c, min_flt = 30, min_size = 50, max_size = 100000, win_size = 500, print_err = false, print_match = false, bed_fn = null; + var len_diff_ratio = 0.5; + while ((c = getopt(args, "f:i:x:w:er:pd:")) != null) { + if (c == 'f') min_flt = paf_parseNum(getopt.arg); + else if (c == 'i') min_size = paf_parseNum(getopt.arg); + else if (c == 'x') max_size = paf_parseNum(getopt.arg); + else if (c == 'w') win_size = paf_parseNum(getopt.arg); + else if (c == 'd') len_diff_ratio = parseFloat(getopt.arg); + else if (c == 'r') bed_fn = getopt.arg; + else if (c == 'e') print_err = true; + else if (c == 'p') print_match = true; + } + if (args.length - getopt.ind < 2) { + print("Usage: paftools.js sveval [options] "); + print("Options:"); + print(" -r FILE confident region in BED []"); + print(" -f INT min length to discard [" + min_flt + "]"); + print(" -i INT min SV length [" + min_size + "]"); + print(" -x INT max SV length [" + max_size + "]"); + print(" -w INT fuzzy windown size [" + win_size + "]"); + print(" -d FLOAT max allele diff if there is a single allele in the window [" + len_diff_ratio + "]"); + print(" -e print errors"); + return; + } + + function read_bed(fn) { + var buf = new Bytes(); + var file = new File(fn); + var bed = {}; + while (file.readline(buf) >= 0) { + var t = buf.toString().split("\t"); + if (bed[t[0]] == null) bed[t[0]] = []; + bed[t[0]].push([parseInt(t[1]), parseInt(t[2])]); + } + file.close(); + buf.destroy(); + for (var x in bed) { + Interval.sort(bed[x]); + Interval.merge(bed[x]); + Interval.index_end(bed[x]); + } + return bed; + } + + var bed = bed_fn != null? read_bed(bed_fn) : null; + + function read_vcf(fn, bed) { + var buf = new Bytes(); + var file = new File(fn); + var v = {}; + while (file.readline(buf) >= 0) { + var m, t = buf.toString().split("\t"); + if (t[0][0] == '#') continue; + if (bed != null && bed[t[0]] == null) continue; + if (t[4] == '' || t[4] == '') continue; // no inversion + if (/[\[\]]/.test(t[4])) continue; // no break points + var st = parseInt(t[1]) - 1, en = st + t[3].length; + // parse svlen + var b = _paf_get_alen(t), svlen = b[0]; + var abslen = svlen == null? 0 : svlen > 0? svlen : -svlen; + if (abslen < min_flt || abslen > max_size) continue; + // update end + if ((m = /(^|;)END=(\d+)/.exec(t[7])) != null) + en = parseInt(m[2]); + else if (svlen != null && svlen < 0) + en = st + (-svlen); + if (en < st) en = st; + if (st == en) --st, ++en; + if (bed != null && Interval.find_ovlp(bed[t[0]], st, en).length == 0) continue; + // insert + if (v[t[0]] == null) v[t[0]] = []; + v[t[0]].push([st, en, svlen, abslen]); + } + file.close(); + buf.destroy(); + for (var x in v) { + Interval.sort(v[x]); + Interval.index_end(v[x]); + } + return v; + } + + function compare_vcf(v0, v1, label) { + var m = 0, n = 0; + for (var x in v1) { + var a1 = v1[x], a0 = v0[x]; + for (var i = 0; i < a1.length; ++i) { + if (a1[i][3] < min_size) continue; + ++n; + if (a0 == null) continue; + var ws = win_size + (a1[i][3]>>1); + var st = a1[i][0] > ws? a1[i][0] - ws : 0; + b = Interval.find_ovlp(a0, st, a1[i][1] + ws); + var n_ins = 0, n_del = 0, sv_del = null, sv_ins = null; + for (var j = 0; j < b.length; ++j) { + if (b[j][2] < 0) ++n_del, sv_del = -b[j][2]; + else if (b[j][2] > 0) ++n_ins, sv_ins = b[j][2]; + if (print_match) + print("MA", x, a1[i].slice(0, 3).join("\t"), b[j].slice(0, 3).join("\t")); + } + var match = false; + if (a1[i][2] > 0) { // insertion + if (n_ins == 1) { + var diff = sv_ins - a1[i][3]; + if (diff < 0) diff = -diff; + if (diff < min_size || diff / a1[i][3] < len_diff_ratio) + match = true; + } else if (n_ins > 1) match = true; // multiple insertions; ambiguous + } else if (a1[i][2] < 0) { + if (n_del == 1) { // deletion + var diff = sv_del - a1[i][3]; + if (diff < 0) diff = -diff; + if (diff < min_size || diff / a1[i][3] < len_diff_ratio) + match = true; + } else if (n_del > 1) match = true; // multiple deletions; ambiguous + } + if (match) ++m; + else if (print_err) { + if ((a1[i][2] > 0 && n_ins > 0) || (a1[i][2] < 0 && n_del > 0)) + print("MM", x, a1[i].slice(0, 3).join("\t")); + print(label, x, a1[i].slice(0, 3).join("\t")); + } + } + } + return [n, m]; + } + + var v_base = read_vcf(args[getopt.ind+0], bed); + var v_call = read_vcf(args[getopt.ind+1], bed); + var fn = compare_vcf(v_call, v_base, 'FN'); + var fp = compare_vcf(v_base, v_call, 'FP'); + print('SN', fn[0], fn[1], (fn[1] / fn[0]).toFixed(6)); + print('PC', fp[0], fp[1], (fp[1] / fp[0]).toFixed(6)); + print('F1', ((fn[1] / fn[0] + fp[1] / fp[0]) / 2).toFixed(6)); +} + +function paf_vcfsel(args) +{ + var c, min_l = 0, max_l = 1<<30; + while ((c = getopt(args, "l:L:")) != null) { + if (c == 'l') min_l = parseInt(getopt.arg); + else if (c == 'L') max_l = parseInt(getopt.arg); + } + + var buf = new Bytes(); + if (getopt.ind == args.length) { + print("Usage: paftools.js vcfsel [options] "); + return 1; + } + var file = args[getopt.ind] == "-"? new File() : new File(args[getopt.ind]); + while (file.readline(buf) >= 0) { + var m, line = buf.toString(); + if (line[0] == '#') { + print(line); + continue; + } + var t = line.split("\t"); + var st = parseInt(t[1]), en = st + t[3].length - 1; + if ((m = /(^|;)END=(\d+)/.exec(t[7])) != null) + en = parseInt(m[2]); + if (en < st) { + warn("END is smaller than POS: " + en + " < " + st); + en = st; + } + var b = _paf_get_alen(t); + var alen = b[0], min_abs_diff = b[1], max_abs_diff = b[2]; + if (max_abs_diff < min_l || min_abs_diff > max_l) + continue; + print(line); + } + file.close(); + buf.destroy(); +} + /************************* ***** main function ***** *************************/ @@ -2529,10 +2947,13 @@ function main(args) print(""); print(" stat collect basic mapping information in PAF/SAM"); print(" asmstat collect basic assembly information"); - print(" asmgene evaluate gene completeness (EXPERIMENTAL)"); + print(" asmgene evaluate gene completeness"); + print(" misjoin evaluate large-scale misjoins"); print(" liftover simplistic liftOver"); print(" call call variants from asm-to-ref alignment with the cs tag"); print(" bedcov compute the number of bases covered"); + print(" vcfstat VCF statistics"); + print(" sveval compare two SV callsets in VCF"); print(" version print paftools.js version"); print(""); print(" mapeval evaluate mapping accuracy using mason2/PBSIM-simulated FASTQ"); @@ -2552,6 +2973,7 @@ function main(args) else if (cmd == 'stat') paf_stat(args); else if (cmd == 'asmstat') paf_asmstat(args); else if (cmd == 'asmgene') paf_asmgene(args); + else if (cmd == 'misjoin') paf_misjoin(args); else if (cmd == 'liftover' || cmd == 'liftOver') paf_liftover(args); else if (cmd == 'vcfpair') paf_vcfpair(args); else if (cmd == 'call') paf_call(args); @@ -2561,6 +2983,9 @@ function main(args) else if (cmd == 'pbsim2fq') paf_pbsim2fq(args); else if (cmd == 'junceval') paf_junceval(args); else if (cmd == 'ov-eval') paf_ov_eval(args); + else if (cmd == 'vcfstat') paf_vcfstat(args); + else if (cmd == 'sveval') paf_sveval(args); + else if (cmd == 'vcfsel') paf_vcfsel(args); else if (cmd == 'version') print(paftools_version); else throw Error("unrecognized command: " + cmd); } diff --git a/mmpriv.h b/mmpriv.h index 3e03d57..cb9618b 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -36,6 +36,14 @@ extern "C" { #endif +typedef struct { + uint32_t n; + uint32_t q_pos; + uint32_t q_span:31, flt:1; + uint32_t seg_id:31, is_tandem:1; + const uint64_t *cr; +} mm_seed_t; + typedef struct { int n_u, n_a; uint64_t *u; @@ -52,6 +60,8 @@ uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk); void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p); +mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int max_max_occ, int dist, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos); + int mm_write_sam_hdr(const mm_idx_t *mi, const char *rg, const char *ver, int argc, char *argv[]); void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag); void mm_write_paf3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag, int rep_len); @@ -63,20 +73,25 @@ void mm_idxopt_init(mm_idxopt_t *opt); const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n); int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f); int mm_idx_getseq2(const mm_idx_t *mi, int is_rev, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq); -mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float gap_scale, int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km); mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a); - mm_reg1_t *mm_gen_regs(void *km, uint32_t hash, int qlen, int n_u, uint64_t *u, mm128_t *a, int is_qstrand); + +mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float gap_scale, + int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km); +mm128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip, + int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km); +mm128_t *mg_lchain_rmq(int max_dist, int max_dist_inner, int bw, int max_chn_skip, int cap_rmq_size, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip, + int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km); + void mm_mark_alt(const mm_idx_t *mi, int n, mm_reg1_t *r); void mm_split_reg(mm_reg1_t *r, mm_reg1_t *r2, int n, int qlen, mm128_t *a, int is_qstrand); void mm_sync_regs(void *km, int n_regs, mm_reg1_t *regs); int mm_squeeze_a(void *km, int n_regs, mm_reg1_t *regs, mm128_t *a); int mm_set_sam_pri(int n, mm_reg1_t *r); -void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r, int sub_diff, int hard_mask_level, float alt_diff_frac); +void mm_set_parent(void *km, float mask_level, int mask_len, int n, mm_reg1_t *r, int sub_diff, int hard_mask_level, float alt_diff_frac); void mm_select_sub(void *km, float pri_ratio, int min_diff, int best_n, int *n_, mm_reg1_t *r); void mm_select_sub_multi(void *km, float pri_ratio, float pri1, float pri2, int max_gap_ref, int min_diff, int best_n, int n_segs, const int *qlens, int *n_, mm_reg1_t *r); void mm_filter_regs(const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs); -void mm_join_long(void *km, const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs, mm128_t *a); void mm_hit_sort(void *km, int *n_regs, mm_reg1_t *r, float alt_diff_frac); void mm_set_mapq(void *km, int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len, int is_sr); diff --git a/options.c b/options.c index 81f6f4c..a567fb2 100644 --- a/options.c +++ b/options.c @@ -1,4 +1,5 @@ #include +#include #include "mmpriv.h" void mm_idxopt_init(mm_idxopt_t *opt) @@ -15,26 +16,30 @@ void mm_mapopt_init(mm_mapopt_t *opt) memset(opt, 0, sizeof(mm_mapopt_t)); opt->seed = 11; opt->mid_occ_frac = 2e-4f; + opt->min_mid_occ = 10; + opt->max_mid_occ = 1000000; opt->sdust_thres = 0; // no SDUST masking opt->min_cnt = 3; opt->min_chain_score = 40; - opt->bw = 500; + opt->bw = 500, opt->bw_long = 20000; opt->max_gap = 5000; opt->max_gap_ref = -1; opt->max_chain_skip = 25; opt->max_chain_iter = 5000; - opt->chain_gap_scale = 1.0f; + opt->rmq_inner_dist = 1000; + opt->rmq_size_cap = 100000; + opt->rmq_rescue_size = 1000; + opt->rmq_rescue_ratio = 0.1f; + opt->chain_gap_scale = 0.8f; + opt->max_max_occ = 4095; + opt->occ_dist = 500; opt->mask_level = 0.5f; + opt->mask_len = INT_MAX; opt->pri_ratio = 0.8f; opt->best_n = 5; - opt->max_join_long = 20000; - opt->max_join_short = 2000; - opt->min_join_flank_sc = 1000; - opt->min_join_flank_ratio = 0.5f; - opt->alt_drop = 0.15f; opt->a = 2, opt->b = 4, opt->q = 4, opt->e = 2, opt->q2 = 24, opt->e2 = 1; @@ -46,6 +51,7 @@ void mm_mapopt_init(mm_mapopt_t *opt) opt->anchor_ext_len = 20, opt->anchor_ext_shift = 6; opt->max_clip_ratio = 1.0f; opt->mini_batch_size = 500000000; + opt->max_sw_mat = 100000000; opt->pe_ori = 0; // FF opt->pe_bonus = 33; @@ -55,10 +61,13 @@ void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi) { if ((opt->flag & MM_F_SPLICE_FOR) || (opt->flag & MM_F_SPLICE_REV)) opt->flag |= MM_F_SPLICE; - if (opt->mid_occ <= 0) + if (opt->mid_occ <= 0) { opt->mid_occ = mm_idx_cal_max_occ(mi, opt->mid_occ_frac); - if (opt->mid_occ < opt->min_mid_occ) - opt->mid_occ = opt->min_mid_occ; + if (opt->mid_occ < opt->min_mid_occ) + opt->mid_occ = opt->min_mid_occ; + if (opt->max_mid_occ > opt->min_mid_occ && opt->mid_occ > opt->max_mid_occ) + opt->mid_occ = opt->max_mid_occ; + } if (mm_verbose >= 3) fprintf(stderr, "[M::%s::%.3f*%.2f] mid_occ = %d\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), opt->mid_occ); } @@ -66,7 +75,7 @@ void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi) void mm_mapopt_max_intron_len(mm_mapopt_t *opt, int max_intron_len) { if ((opt->flag & MM_F_SPLICE) && max_intron_len > 0) - opt->max_gap_ref = opt->bw = max_intron_len; + opt->max_gap_ref = opt->bw = opt->bw_long = max_intron_len; } int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) @@ -74,37 +83,44 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) if (preset == 0) { mm_idxopt_init(io); mm_mapopt_init(mo); + } else if (strcmp(preset, "map-ont") == 0) { // this is the same as the default } else if (strcmp(preset, "ava-ont") == 0) { io->flag = 0, io->k = 15, io->w = 5; mo->flag |= MM_F_ALL_CHAINS | MM_F_NO_DIAG | MM_F_NO_DUAL | MM_F_NO_LJOIN; - mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_gap = 10000, mo->max_chain_skip = 25; - mo->bw = 2000; + mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_chain_skip = 25; + mo->bw = mo->bw_long = 2000; + mo->occ_dist = 0; + } else if (strcmp(preset, "map10k") == 0 || strcmp(preset, "map-pb") == 0) { + io->flag |= MM_I_HPC, io->k = 19; } else if (strcmp(preset, "ava-pb") == 0) { io->flag |= MM_I_HPC, io->k = 19, io->w = 5; mo->flag |= MM_F_ALL_CHAINS | MM_F_NO_DIAG | MM_F_NO_DUAL | MM_F_NO_LJOIN; - mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_gap = 10000, mo->max_chain_skip = 25; - } else if (strcmp(preset, "map10k") == 0 || strcmp(preset, "map-pb") == 0) { - io->flag |= MM_I_HPC, io->k = 19; - } else if (strcmp(preset, "map-ont") == 0) { - io->flag = 0, io->k = 15; - } else if (strcmp(preset, "asm5") == 0) { + mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_chain_skip = 25; + mo->bw_long = mo->bw; + mo->occ_dist = 0; + } else if (strcmp(preset, "map-hifi") == 0 || strcmp(preset, "map-ccs") == 0) { io->flag = 0, io->k = 19, io->w = 19; - mo->a = 1, mo->b = 19, mo->q = 39, mo->q2 = 81, mo->e = 3, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200; - mo->min_mid_occ = 100; + mo->max_gap = 10000; + mo->a = 1, mo->b = 4, mo->q = 6, mo->q2 = 26, mo->e = 2, mo->e2 = 1; + mo->occ_dist = 500; + mo->min_mid_occ = 50, mo->max_mid_occ = 500; mo->min_dp_max = 200; - mo->best_n = 50; - } else if (strcmp(preset, "asm10") == 0) { + } else if (strncmp(preset, "asm", 3) == 0) { io->flag = 0, io->k = 19, io->w = 19; - mo->a = 1, mo->b = 9, mo->q = 16, mo->q2 = 41, mo->e = 2, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200; - mo->min_mid_occ = 100; - mo->min_dp_max = 200; - mo->best_n = 50; - } else if (strcmp(preset, "asm20") == 0) { - io->flag = 0, io->k = 19, io->w = 10; - mo->a = 1, mo->b = 4, mo->q = 6, mo->q2 = 26, mo->e = 2, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200; - mo->min_mid_occ = 100; + mo->bw = mo->bw_long = 100000; + mo->max_gap = 10000; + mo->flag |= MM_F_RMQ; + mo->min_mid_occ = 50, mo->max_mid_occ = 500; mo->min_dp_max = 200; mo->best_n = 50; + if (strcmp(preset, "asm5") == 0) { + mo->a = 1, mo->b = 19, mo->q = 39, mo->q2 = 81, mo->e = 3, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200; + } else if (strcmp(preset, "asm10") == 0) { + mo->a = 1, mo->b = 9, mo->q = 16, mo->q2 = 41, mo->e = 2, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200; + } else if (strcmp(preset, "asm20") == 0) { + mo->a = 1, mo->b = 4, mo->q = 6, mo->q2 = 26, mo->e = 2, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200; + io->w = 10; + } else return -1; } else if (strcmp(preset, "short") == 0 || strcmp(preset, "sr") == 0) { io->flag = 0, io->k = 21, io->w = 11; mo->flag |= MM_F_SR | MM_F_FRAG_MODE | MM_F_NO_PRINT_2ND | MM_F_2_IO_THREADS | MM_F_HEAP_SORT; @@ -114,7 +130,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) mo->end_bonus = 10; mo->max_frag_len = 800; mo->max_gap = 100; - mo->bw = 100; + mo->bw = mo->bw_long = 100; mo->pri_ratio = 0.5f; mo->min_cnt = 2; mo->min_chain_score = 25; @@ -126,7 +142,8 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) } else if (strncmp(preset, "splice", 6) == 0 || strcmp(preset, "cdna") == 0) { io->flag = 0, io->k = 15, io->w = 5; mo->flag |= MM_F_SPLICE | MM_F_SPLICE_FOR | MM_F_SPLICE_REV | MM_F_SPLICE_FLANK; - mo->max_gap = 2000, mo->max_gap_ref = mo->bw = 200000; + mo->max_sw_mat = 0; + mo->max_gap = 2000, mo->max_gap_ref = mo->bw = mo->bw_long = 200000; mo->a = 1, mo->b = 2, mo->q = 2, mo->e = 1, mo->q2 = 32, mo->e2 = 0; mo->noncan = 9; mo->junc_bonus = 9; @@ -139,6 +156,16 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) int mm_check_opt(const mm_idxopt_t *io, const mm_mapopt_t *mo) { + if (mo->bw > mo->bw_long) { + if (mm_verbose >= 1) + fprintf(stderr, "[ERROR]\033[1;31m with '-rNUM1,NUM2', NUM1 (%d) can't be larger than NUM2 (%d)\033[0m\n", mo->bw, mo->bw_long); + return -8; + } + if ((mo->flag & MM_F_RMQ) && (mo->flag & (MM_F_SR|MM_F_SPLICE))) { + if (mm_verbose >= 1) + fprintf(stderr, "[ERROR]\033[1;31m --rmq doesn't work with --sr or --splice\033[0m\n"); + return -7; + } if (mo->split_prefix && (mo->flag & (MM_F_OUT_CS|MM_F_OUT_MD))) { if (mm_verbose >= 1) fprintf(stderr, "[ERROR]\033[1;31m --cs or --MD doesn't work with --split-prefix\033[0m\n"); diff --git a/python/README.rst b/python/README.rst index 3d0fae8..3082980 100644 --- a/python/README.rst +++ b/python/README.rst @@ -144,7 +144,7 @@ properties: * **mlen**: length of the matching bases in the alignment, excluding ambiguous base matches. -* **NM**: number of mismatches, gaps and ambiguous poistions in the alignment +* **NM**: number of mismatches, gaps and ambiguous positions in the alignment * **trans_strand**: transcript strand. +1 if on the forward strand; -1 if on the reverse strand; 0 if unknown diff --git a/python/cmappy.pxd b/python/cmappy.pxd index 72a19d7..abc3ef7 100644 --- a/python/cmappy.pxd +++ b/python/cmappy.pxd @@ -13,21 +13,27 @@ cdef extern from "minimap.h": int64_t flag int seed int sdust_thres + int max_qlen - int bw + + int bw, bw_long int max_gap, max_gap_ref int max_frag_len int max_chain_skip, max_chain_iter int min_cnt int min_chain_score float chain_gap_scale + int rmq_size_cap, rmq_inner_dist + int rmq_rescue_size + float rmq_rescue_ratio + float mask_level + int mask_len float pri_ratio int best_n - int max_join_long, max_join_short - int min_join_flank_sc - float min_join_flank_ratio + float alt_drop + int a, b, q, e, q2, e2 int sc_ambi int noncan @@ -38,13 +44,16 @@ cdef extern from "minimap.h": int min_ksw_len int anchor_ext_len, anchor_ext_shift float max_clip_ratio + int pe_ori, pe_bonus + float mid_occ_frac int32_t min_mid_occ int32_t mid_occ int32_t max_occ int64_t mini_batch_size int64_t max_sw_mat + const char *split_prefix int mm_set_opt(char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) diff --git a/python/mappy.pyx b/python/mappy.pyx index 3273514..838b5ad 100644 --- a/python/mappy.pyx +++ b/python/mappy.pyx @@ -3,7 +3,7 @@ from libc.stdlib cimport free cimport cmappy import sys -__version__ = '2.17' +__version__ = '2.21' cmappy.mm_reset_timer() @@ -82,7 +82,7 @@ cdef class Alignment: @property def cigar_str(self): - return "".join(map(lambda x: str(x[0]) + 'MIDNSH'[x[1]], self._cigar)) + return "".join(map(lambda x: str(x[0]) + 'MIDNSHP=XB'[x[1]], self._cigar)) def __str__(self): if self._strand > 0: strand = '+' diff --git a/seed.c b/seed.c new file mode 100644 index 0000000..baf1b20 --- /dev/null +++ b/seed.c @@ -0,0 +1,106 @@ +#include "mmpriv.h" +#include "kalloc.h" +#include "ksort.h" + +mm_seed_t *mm_seed_collect_all(void *km, const mm_idx_t *mi, const mm128_v *mv, int32_t *n_m_) +{ + mm_seed_t *m; + size_t i; + int32_t k; + m = (mm_seed_t*)kmalloc(km, mv->n * sizeof(mm_seed_t)); + for (i = k = 0; i < mv->n; ++i) { + const uint64_t *cr; + mm_seed_t *q; + mm128_t *p = &mv->a[i]; + uint32_t q_pos = (uint32_t)p->y, q_span = p->x & 0xff; + int t; + cr = mm_idx_get(mi, p->x>>8, &t); + if (t == 0) continue; + q = &m[k++]; + q->q_pos = q_pos, q->q_span = q_span, q->cr = cr, q->n = t, q->seg_id = p->y >> 32; + q->is_tandem = q->flt = 0; + if (i > 0 && p->x>>8 == mv->a[i - 1].x>>8) q->is_tandem = 1; + if (i < mv->n - 1 && p->x>>8 == mv->a[i + 1].x>>8) q->is_tandem = 1; + } + *n_m_ = k; + return m; +} + +#define MAX_MAX_HIGH_OCC 128 + +void mm_seed_select(int32_t n, mm_seed_t *a, int len, int max_occ, int max_max_occ, int dist) +{ // for high-occ minimizers, choose up to max_high_occ in each high-occ streak + extern void ks_heapdown_uint64_t(size_t i, size_t n, uint64_t*); + extern void ks_heapmake_uint64_t(size_t n, uint64_t*); + int32_t i, last0, m; + uint64_t b[MAX_MAX_HIGH_OCC]; // this is to avoid a heap allocation + + if (n == 0 || n == 1) return; + for (i = m = 0; i < n; ++i) + if (a[i].n > max_occ) ++m; + if (m == 0) return; // no high-frequency k-mers; do nothing + for (i = 0, last0 = -1; i <= n; ++i) { + if (i == n || a[i].n <= max_occ) { + if (i - last0 > 1) { + int32_t ps = last0 < 0? 0 : (uint32_t)a[last0].q_pos>>1; + int32_t pe = i == n? len : (uint32_t)a[i].q_pos>>1; + int32_t j, k, st = last0 + 1, en = i; + int32_t max_high_occ = (int32_t)((double)(pe - ps) / dist + .499); + if (max_high_occ > 0) { + if (max_high_occ > MAX_MAX_HIGH_OCC) + max_high_occ = MAX_MAX_HIGH_OCC; + for (j = st, k = 0; j < en && k < max_high_occ; ++j, ++k) + b[k] = (uint64_t)a[j].n<<32 | j; + ks_heapmake_uint64_t(k, b); // initialize the binomial heap + for (; j < en; ++j) { // if there are more, choose top max_high_occ + if (a[j].n < (int32_t)(b[0]>>32)) { // then update the heap + b[0] = (uint64_t)a[j].n<<32 | j; + ks_heapdown_uint64_t(0, k, b); + } + } + for (j = 0; j < k; ++j) a[(uint32_t)b[j]].flt = 1; + } + for (j = st; j < en; ++j) a[j].flt ^= 1; + for (j = st; j < en; ++j) + if (a[j].n > max_max_occ) + a[j].flt = 1; + } + last0 = i; + } + } +} + +mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int max_max_occ, int dist, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos) +{ + int rep_st = 0, rep_en = 0, n_m, n_m0; + size_t i; + mm_seed_t *m; + *n_mini_pos = 0; + *mini_pos = (uint64_t*)kmalloc(km, mv->n * sizeof(uint64_t)); + m = mm_seed_collect_all(km, mi, mv, &n_m0); + if (dist > 0 && max_max_occ > max_occ) { + mm_seed_select(n_m0, m, qlen, max_occ, max_max_occ, dist); + } else { + for (i = 0; i < n_m0; ++i) + if (m[i].n > max_occ) + m[i].flt = 1; + } + for (i = 0, n_m = 0, *rep_len = 0, *n_a = 0; i < n_m0; ++i) { + mm_seed_t *q = &m[i]; + //fprintf(stderr, "X\t%d\t%d\t%d\n", q->q_pos>>1, q->n, q->flt); + if (q->flt) { + int en = (q->q_pos >> 1) + 1, st = en - q->q_span; + if (st > rep_en) { + *rep_len += rep_en - rep_st; + rep_st = st, rep_en = en; + } else rep_en = en; + } else { + *n_a += q->n; + (*mini_pos)[(*n_mini_pos)++] = (uint64_t)q->q_span<<32 | q->q_pos>>1; + m[n_m++] = *q; + } + } + *rep_len += rep_en - rep_st; + *_n_m = n_m; + return m; +} diff --git a/setup.py b/setup.py index 042911c..1a4d227 100644 --- a/setup.py +++ b/setup.py @@ -4,16 +4,6 @@ except ImportError: from distutils.core import setup from distutils.extension import Extension -cmdclass = {} - -try: - from Cython.Build import build_ext -except ImportError: # without Cython - module_src = 'python/mappy.c' -else: # with Cython - module_src = 'python/mappy.pyx' - cmdclass['build_ext'] = build_ext - import sys, platform sys.path.append('python') @@ -33,7 +23,7 @@ def readme(): setup( name = 'mappy', - version = '2.17', + version = '2.21', url = 'https://github.com/lh3/minimap2', description = 'Minimap2 python binding', long_description = readme(), @@ -42,8 +32,8 @@ setup( license = 'MIT', keywords = 'sequence-alignment', scripts = ['python/minimap2.py'], - ext_modules = [Extension('mappy', - sources = [module_src, 'align.c', 'bseq.c', 'chain.c', 'format.c', 'hit.c', 'index.c', 'pe.c', 'options.c', + ext_modules = [Extension('mappy', + sources = ['python/mappy.pyx', 'align.c', 'bseq.c', 'lchain.c', 'seed.c', 'format.c', 'hit.c', 'index.c', 'pe.c', 'options.c', 'ksw2_extd2_sse.c', 'ksw2_exts2_sse.c', 'ksw2_extz2_sse.c', 'ksw2_ll_sse.c', 'kalloc.c', 'kthread.c', 'map.c', 'misc.c', 'sdust.c', 'sketch.c', 'esterr.c', 'splitidx.c'], depends = ['minimap.h', 'bseq.h', 'kalloc.h', 'kdq.h', 'khash.h', 'kseq.h', 'ksort.h', @@ -62,4 +52,4 @@ setup( 'Programming Language :: Python :: 3', 'Intended Audience :: Science/Research', 'Topic :: Scientific/Engineering :: Bio-Informatics'], - cmdclass = cmdclass) + setup_requires=["cython"])