From 9e6fdd376b5e6abb4ca7f4540f79eec7b8271a78 Mon Sep 17 00:00:00 2001 From: mbrcic Date: Mon, 20 Apr 2020 18:28:06 +0200 Subject: [PATCH 01/77] Changed sse2neon with SIMDe. Added building non-SIMD version. --- .gitmodules | 3 +++ Makefile | 31 ++++++++++++++++++++++++------- ksw2_extd2_sse.c | 6 ++++-- ksw2_exts2_sse.c | 7 ++++--- ksw2_extz2_sse.c | 6 ++++-- ksw2_ll_sse.c | 3 ++- lib/simde | 1 + 7 files changed, 42 insertions(+), 15 deletions(-) create mode 100644 .gitmodules create mode 160000 lib/simde diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..a80f848 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "lib/simde"] + path = lib/simde + url = https://github.com/nemequ/simde.git diff --git a/Makefile b/Makefile index 18622f5..61c4097 100644 --- a/Makefile +++ b/Makefile @@ -1,11 +1,13 @@ CFLAGS= -g -Wall -O2 -Wc++-compat #-Wextra CPPFLAGS= -DHAVE_KALLOC -INCLUDES= -OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o chain.o align.o hit.o map.o format.o pe.o esterr.o splitidx.o ksw2_ll_sse.o +INCLUDES= -Ilib/simde/simde +OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o chain.o align.o hit.o map.o format.o pe.o esterr.o splitidx.o PROG= minimap2 PROG_EXTRA= sdust minimap2-lite LIBS= -lm -lz -lpthread +ifeq ($(no_simd),) # if no_simd is not defined + OBJS+=ksw2_ll_sse.o ifeq ($(arm_neon),) # if arm_neon is not defined ifeq ($(sse2only),) # if sse2only is not defined OBJS+=ksw2_extz2_sse41.o ksw2_extd2_sse41.o ksw2_exts2_sse41.o ksw2_extz2_sse2.o ksw2_extd2_sse2.o ksw2_exts2_sse2.o ksw2_dispatch.o @@ -14,13 +16,15 @@ else # if sse2only is defined endif else # if arm_neon is defined OBJS+=ksw2_extz2_neon.o ksw2_extd2_neon.o ksw2_exts2_neon.o - INCLUDES+=-Isse2neon ifeq ($(aarch64),) #if aarch64 is not defined CFLAGS+=-D_FILE_OFFSET_BITS=64 -mfpu=neon -fsigned-char else #if aarch64 is defined CFLAGS+=-D_FILE_OFFSET_BITS=64 -fsigned-char endif endif +else + OBJS+=ksw2_ll_nosimd.o ksw2_extz2_nosimd.o ksw2_extd2_nosimd.o ksw2_exts2_nosimd.o +endif ifneq ($(asan),) CFLAGS+=-fsanitize=address @@ -83,15 +87,28 @@ ksw2_dispatch.o:ksw2_dispatch.c ksw2.h $(CC) -c $(CFLAGS) -msse4.1 $(CPPFLAGS) -DKSW_CPU_DISPATCH $(INCLUDES) $< -o $@ # NEON-specific targets on ARM - ksw2_extz2_neon.o:ksw2_extz2_sse.c ksw2.h kalloc.h - $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_SSE2_ONLY -D__SSE2__ $(INCLUDES) $< -o $@ + $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_SSE2_ONLY -D__SSE2__ -DSIMDE_ENABLE_NATIVE_ALIASES $(INCLUDES) $< -o $@ ksw2_extd2_neon.o:ksw2_extd2_sse.c ksw2.h kalloc.h - $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_SSE2_ONLY -D__SSE2__ $(INCLUDES) $< -o $@ + $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_SSE2_ONLY -D__SSE2__ -DSIMDE_ENABLE_NATIVE_ALIASES $(INCLUDES) $< -o $@ ksw2_exts2_neon.o:ksw2_exts2_sse.c ksw2.h kalloc.h - $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_SSE2_ONLY -D__SSE2__ $(INCLUDES) $< -o $@ + $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_SSE2_ONLY -D__SSE2__ -DSIMDE_ENABLE_NATIVE_ALIASES $(INCLUDES) $< -o $@ + +# no-SIMD version + +ksw2_ll_nosimd.o:ksw2_ll_sse.c ksw2.h kalloc.h + $(CC) -c $(CFLAGS) $(CPPFLAGS) -DSIMDE_ENABLE_NATIVE_ALIASES -DSIMDE_NO_NATIVE $(INCLUDES) $< -o $@ + +ksw2_extz2_nosimd.o:ksw2_extz2_sse.c ksw2.h kalloc.h + $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_SSE2_ONLY -D__SSE2__ -DSIMDE_ENABLE_NATIVE_ALIASES -DSIMDE_NO_NATIVE $(INCLUDES) $< -o $@ + +ksw2_extd2_nosimd.o:ksw2_extd2_sse.c ksw2.h kalloc.h + $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_SSE2_ONLY -D__SSE2__ -DSIMDE_ENABLE_NATIVE_ALIASES -DSIMDE_NO_NATIVE $(INCLUDES) $< -o $@ + +ksw2_exts2_nosimd.o:ksw2_exts2_sse.c ksw2.h kalloc.h + $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_SSE2_ONLY -D__SSE2__ -DSIMDE_ENABLE_NATIVE_ALIASES -DSIMDE_NO_NATIVE $(INCLUDES) $< -o $@ # other non-file targets diff --git a/ksw2_extd2_sse.c b/ksw2_extd2_sse.c index b578274..10c42fd 100644 --- a/ksw2_extd2_sse.c +++ b/ksw2_extd2_sse.c @@ -4,14 +4,16 @@ #include "ksw2.h" #ifdef __SSE2__ -#include +//#include +#include #ifdef KSW_SSE2_ONLY #undef __SSE4_1__ #endif #ifdef __SSE4_1__ -#include +//#include +#include #endif #ifdef KSW_CPU_DISPATCH diff --git a/ksw2_exts2_sse.c b/ksw2_exts2_sse.c index e7984c6..93efeb4 100644 --- a/ksw2_exts2_sse.c +++ b/ksw2_exts2_sse.c @@ -4,14 +4,15 @@ #include "ksw2.h" #ifdef __SSE2__ -#include - +//#include +#include #ifdef KSW_SSE2_ONLY #undef __SSE4_1__ #endif #ifdef __SSE4_1__ -#include +//#include +#include #endif #ifdef KSW_CPU_DISPATCH diff --git a/ksw2_extz2_sse.c b/ksw2_extz2_sse.c index 02bb4c2..c13fcb9 100644 --- a/ksw2_extz2_sse.c +++ b/ksw2_extz2_sse.c @@ -3,14 +3,16 @@ #include "ksw2.h" #ifdef __SSE2__ -#include +//#include +#include #ifdef KSW_SSE2_ONLY #undef __SSE4_1__ #endif #ifdef __SSE4_1__ -#include +//#include +#include #endif #ifdef KSW_CPU_DISPATCH diff --git a/ksw2_ll_sse.c b/ksw2_ll_sse.c index 469de52..0359f00 100644 --- a/ksw2_ll_sse.c +++ b/ksw2_ll_sse.c @@ -1,7 +1,8 @@ #include #include #include -#include +//#include +#include #include "ksw2.h" #ifdef __GNUC__ diff --git a/lib/simde b/lib/simde new file mode 160000 index 0000000..b30129b --- /dev/null +++ b/lib/simde @@ -0,0 +1 @@ +Subproject commit b30129b3b48a6823013da2b309c50a081177b6b8 From c172df7d2d9315329965226b6c09355b11285a0c Mon Sep 17 00:00:00 2001 From: mbrcic Date: Mon, 20 Apr 2020 21:14:15 +0200 Subject: [PATCH 02/77] fix for Neon --- Makefile | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 61c4097..6aa1760 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -CFLAGS= -g -Wall -O2 -Wc++-compat #-Wextra +CFLAGS= -g -Wall -O2 -Wc++-compat #-Wextra CPPFLAGS= -DHAVE_KALLOC INCLUDES= -Ilib/simde/simde OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o chain.o align.o hit.o map.o format.o pe.o esterr.o splitidx.o @@ -7,15 +7,15 @@ PROG_EXTRA= sdust minimap2-lite LIBS= -lm -lz -lpthread ifeq ($(no_simd),) # if no_simd is not defined - OBJS+=ksw2_ll_sse.o ifeq ($(arm_neon),) # if arm_neon is not defined + OBJS+=ksw2_ll_sse.o ifeq ($(sse2only),) # if sse2only is not defined OBJS+=ksw2_extz2_sse41.o ksw2_extd2_sse41.o ksw2_exts2_sse41.o ksw2_extz2_sse2.o ksw2_extd2_sse2.o ksw2_exts2_sse2.o ksw2_dispatch.o else # if sse2only is defined OBJS+=ksw2_extz2_sse.o ksw2_extd2_sse.o ksw2_exts2_sse.o endif else # if arm_neon is defined - OBJS+=ksw2_extz2_neon.o ksw2_extd2_neon.o ksw2_exts2_neon.o + OBJS+=ksw2_ll_sse.o ksw2_extz2_neon.o ksw2_extd2_neon.o ksw2_exts2_neon.o ifeq ($(aarch64),) #if aarch64 is not defined CFLAGS+=-D_FILE_OFFSET_BITS=64 -mfpu=neon -fsigned-char else #if aarch64 is defined @@ -60,10 +60,8 @@ sdust:sdust.c kalloc.o kalloc.h kdq.h kvec.h kseq.h ketopt.h sdust.h # SSE-specific targets on x86/x86_64 -ifeq ($(arm_neon),) # if arm_neon is defined, compile this target with the default setting (i.e. no -msse2) ksw2_ll_sse.o:ksw2_ll_sse.c ksw2.h kalloc.h $(CC) -c $(CFLAGS) -msse2 $(CPPFLAGS) $(INCLUDES) $< -o $@ -endif ksw2_extz2_sse41.o:ksw2_extz2_sse.c ksw2.h kalloc.h $(CC) -c $(CFLAGS) -msse4.1 $(CPPFLAGS) -DKSW_CPU_DISPATCH $(INCLUDES) $< -o $@ @@ -87,6 +85,9 @@ ksw2_dispatch.o:ksw2_dispatch.c ksw2.h $(CC) -c $(CFLAGS) -msse4.1 $(CPPFLAGS) -DKSW_CPU_DISPATCH $(INCLUDES) $< -o $@ # NEON-specific targets on ARM +ksw2_ll_neon.o:ksw2_ll_sse.c ksw2.h kalloc.h + $(CC) -c $(CFLAGS) -DSIMDE_ENABLE_NATIVE_ALIASES $(CPPFLAGS) $(INCLUDES) $< -o $@ + ksw2_extz2_neon.o:ksw2_extz2_sse.c ksw2.h kalloc.h $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_SSE2_ONLY -D__SSE2__ -DSIMDE_ENABLE_NATIVE_ALIASES $(INCLUDES) $< -o $@ From f47e8a525ec8740c47fda26d0fff11b9d52ef2ff Mon Sep 17 00:00:00 2001 From: mbrcic Date: Tue, 21 Apr 2020 01:08:32 +0200 Subject: [PATCH 03/77] SIMDe made optional. Include paths changes for SIMDe. --- Makefile | 31 +++++++++++++++++++++---------- ksw2_extd2_sse.c | 14 ++++++++++---- ksw2_exts2_sse.c | 14 ++++++++++---- ksw2_extz2_sse.c | 14 ++++++++++---- ksw2_ll_sse.c | 8 ++++++-- 5 files changed, 57 insertions(+), 24 deletions(-) diff --git a/Makefile b/Makefile index 6aa1760..06f7427 100644 --- a/Makefile +++ b/Makefile @@ -1,11 +1,12 @@ -CFLAGS= -g -Wall -O2 -Wc++-compat #-Wextra +CFLAGS= -g -Wall -O2 -Wc++-compat #-Wextra CPPFLAGS= -DHAVE_KALLOC -INCLUDES= -Ilib/simde/simde +INCLUDES= OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o chain.o align.o hit.o map.o format.o pe.o esterr.o splitidx.o PROG= minimap2 PROG_EXTRA= sdust minimap2-lite LIBS= -lm -lz -lpthread + ifeq ($(no_simd),) # if no_simd is not defined ifeq ($(arm_neon),) # if arm_neon is not defined OBJS+=ksw2_ll_sse.o @@ -16,6 +17,9 @@ else # if sse2only is defined endif else # if arm_neon is defined OBJS+=ksw2_ll_sse.o ksw2_extz2_neon.o ksw2_extd2_neon.o ksw2_exts2_neon.o +ifeq ($(simde),) # arm_neon without SIMDe -> use sse2neon + INCLUDES+=-Isse2neon +endif ifeq ($(aarch64),) #if aarch64 is not defined CFLAGS+=-D_FILE_OFFSET_BITS=64 -mfpu=neon -fsigned-char else #if aarch64 is defined @@ -24,6 +28,12 @@ endif endif else OBJS+=ksw2_ll_nosimd.o ksw2_extz2_nosimd.o ksw2_extd2_nosimd.o ksw2_exts2_nosimd.o + simde=1 # no_simd can be used only with SIMDe +endif + +ifneq ($(simde),) # if simde is defined + CFLAGS+=-DSIMDE_ENABLE_NATIVE_ALIASES -DUSE_SIMDE + INCLUDES+=-Ilib/simde endif ifneq ($(asan),) @@ -85,31 +95,32 @@ ksw2_dispatch.o:ksw2_dispatch.c ksw2.h $(CC) -c $(CFLAGS) -msse4.1 $(CPPFLAGS) -DKSW_CPU_DISPATCH $(INCLUDES) $< -o $@ # NEON-specific targets on ARM + ksw2_ll_neon.o:ksw2_ll_sse.c ksw2.h kalloc.h - $(CC) -c $(CFLAGS) -DSIMDE_ENABLE_NATIVE_ALIASES $(CPPFLAGS) $(INCLUDES) $< -o $@ + $(CC) -c $(CFLAGS) $(CPPFLAGS) $(INCLUDES) $< -o $@ ksw2_extz2_neon.o:ksw2_extz2_sse.c ksw2.h kalloc.h - $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_SSE2_ONLY -D__SSE2__ -DSIMDE_ENABLE_NATIVE_ALIASES $(INCLUDES) $< -o $@ + $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_SSE2_ONLY -D__SSE2__ $(INCLUDES) $< -o $@ ksw2_extd2_neon.o:ksw2_extd2_sse.c ksw2.h kalloc.h - $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_SSE2_ONLY -D__SSE2__ -DSIMDE_ENABLE_NATIVE_ALIASES $(INCLUDES) $< -o $@ + $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_SSE2_ONLY -D__SSE2__ $(INCLUDES) $< -o $@ ksw2_exts2_neon.o:ksw2_exts2_sse.c ksw2.h kalloc.h - $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_SSE2_ONLY -D__SSE2__ -DSIMDE_ENABLE_NATIVE_ALIASES $(INCLUDES) $< -o $@ + $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_SSE2_ONLY -D__SSE2__ $(INCLUDES) $< -o $@ # no-SIMD version ksw2_ll_nosimd.o:ksw2_ll_sse.c ksw2.h kalloc.h - $(CC) -c $(CFLAGS) $(CPPFLAGS) -DSIMDE_ENABLE_NATIVE_ALIASES -DSIMDE_NO_NATIVE $(INCLUDES) $< -o $@ + $(CC) -c $(CFLAGS) $(CPPFLAGS) -DSIMDE_NO_NATIVE $(INCLUDES) $< -o $@ ksw2_extz2_nosimd.o:ksw2_extz2_sse.c ksw2.h kalloc.h - $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_SSE2_ONLY -D__SSE2__ -DSIMDE_ENABLE_NATIVE_ALIASES -DSIMDE_NO_NATIVE $(INCLUDES) $< -o $@ + $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_SSE2_ONLY -D__SSE2__ -DSIMDE_NO_NATIVE $(INCLUDES) $< -o $@ ksw2_extd2_nosimd.o:ksw2_extd2_sse.c ksw2.h kalloc.h - $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_SSE2_ONLY -D__SSE2__ -DSIMDE_ENABLE_NATIVE_ALIASES -DSIMDE_NO_NATIVE $(INCLUDES) $< -o $@ + $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_SSE2_ONLY -D__SSE2__ -DSIMDE_NO_NATIVE $(INCLUDES) $< -o $@ ksw2_exts2_nosimd.o:ksw2_exts2_sse.c ksw2.h kalloc.h - $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_SSE2_ONLY -D__SSE2__ -DSIMDE_ENABLE_NATIVE_ALIASES -DSIMDE_NO_NATIVE $(INCLUDES) $< -o $@ + $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_SSE2_ONLY -D__SSE2__ -DSIMDE_NO_NATIVE $(INCLUDES) $< -o $@ # other non-file targets diff --git a/ksw2_extd2_sse.c b/ksw2_extd2_sse.c index 10c42fd..e8bee8e 100644 --- a/ksw2_extd2_sse.c +++ b/ksw2_extd2_sse.c @@ -4,16 +4,22 @@ #include "ksw2.h" #ifdef __SSE2__ -//#include -#include +#ifndef USE_SIMDE +#include +#else +#include +#endif #ifdef KSW_SSE2_ONLY #undef __SSE4_1__ #endif #ifdef __SSE4_1__ -//#include -#include +#ifndef USE_SIMDE +#include +#else +#include +#endif #endif #ifdef KSW_CPU_DISPATCH diff --git a/ksw2_exts2_sse.c b/ksw2_exts2_sse.c index 93efeb4..1eb9502 100644 --- a/ksw2_exts2_sse.c +++ b/ksw2_exts2_sse.c @@ -4,15 +4,21 @@ #include "ksw2.h" #ifdef __SSE2__ -//#include -#include +#ifndef USE_SIMDE +#include +#else +#include +#endif #ifdef KSW_SSE2_ONLY #undef __SSE4_1__ #endif #ifdef __SSE4_1__ -//#include -#include +#ifndef USE_SIMDE +#include +#else +#include +#endif #endif #ifdef KSW_CPU_DISPATCH diff --git a/ksw2_extz2_sse.c b/ksw2_extz2_sse.c index c13fcb9..bcdf32f 100644 --- a/ksw2_extz2_sse.c +++ b/ksw2_extz2_sse.c @@ -3,16 +3,22 @@ #include "ksw2.h" #ifdef __SSE2__ -//#include -#include +#ifndef USE_SIMDE +#include +#else +#include +#endif #ifdef KSW_SSE2_ONLY #undef __SSE4_1__ #endif #ifdef __SSE4_1__ -//#include -#include +#ifndef USE_SIMDE +#include +#else +#include +#endif #endif #ifdef KSW_CPU_DISPATCH diff --git a/ksw2_ll_sse.c b/ksw2_ll_sse.c index 0359f00..8cef46c 100644 --- a/ksw2_ll_sse.c +++ b/ksw2_ll_sse.c @@ -1,10 +1,14 @@ #include #include #include -//#include -#include #include "ksw2.h" +#ifndef USE_SIMDE +#include +#else +#include +#endif + #ifdef __GNUC__ #define LIKELY(x) __builtin_expect((x),1) #define UNLIKELY(x) __builtin_expect((x),0) From 3e16e4e39d24b8d9db89233df0ac093932a07386 Mon Sep 17 00:00:00 2001 From: mbrcic Date: Tue, 21 Apr 2020 01:36:19 +0200 Subject: [PATCH 04/77] Added documentation entry for added functionality, simde and no_simd. --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index addeb2c..a62d189 100644 --- a/README.md +++ b/README.md @@ -82,6 +82,10 @@ to disable SSE4 code, which will make minimap2 slightly slower. Minimap2 also works with ARM CPUs supporting the NEON instruction sets. To compile for 32 bit ARM architectures (such as ARMv7), use `make arm_neon=1`. To compile for for 64 bit ARM architectures (such as ARMv8), use `make arm_neon=1 aarch64=1`. +Minimap2 can use [SIMD Everywhere (SIMDe)](https://github.com/nemequ/simde) library for porting implementation +to the different SIMD instruction sets. To compile using SIMDe, use `make simde=1`. To compile for ARM CPUs, add `simde=1` to the commands given above. +SIMDe also enables non-SIMD implementation using `make no_simd=1`. + ### General usage Without any options, minimap2 takes a reference database and a query sequence From 2b3403f09430e3e486b25b88e7617b6140dbe5e0 Mon Sep 17 00:00:00 2001 From: mbrcic Date: Tue, 21 Apr 2020 02:08:21 +0200 Subject: [PATCH 05/77] fix for Neon after test. --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 06f7427..7d2afc9 100644 --- a/Makefile +++ b/Makefile @@ -16,7 +16,7 @@ else # if sse2only is defined OBJS+=ksw2_extz2_sse.o ksw2_extd2_sse.o ksw2_exts2_sse.o endif else # if arm_neon is defined - OBJS+=ksw2_ll_sse.o ksw2_extz2_neon.o ksw2_extd2_neon.o ksw2_exts2_neon.o + OBJS+=ksw2_ll_neon.o ksw2_extz2_neon.o ksw2_extd2_neon.o ksw2_exts2_neon.o ifeq ($(simde),) # arm_neon without SIMDe -> use sse2neon INCLUDES+=-Isse2neon endif From 66db9da7d8a4500aa210512914bedc0d7c7d817b Mon Sep 17 00:00:00 2001 From: mbrcic Date: Wed, 22 Apr 2020 19:50:59 +0200 Subject: [PATCH 06/77] changed preprocessor conditionals for SIMDe --- ksw2_extd2_sse.c | 12 ++++++------ ksw2_exts2_sse.c | 12 ++++++------ ksw2_extz2_sse.c | 12 ++++++------ ksw2_ll_sse.c | 6 +++--- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/ksw2_extd2_sse.c b/ksw2_extd2_sse.c index e8bee8e..162e9e2 100644 --- a/ksw2_extd2_sse.c +++ b/ksw2_extd2_sse.c @@ -4,10 +4,10 @@ #include "ksw2.h" #ifdef __SSE2__ -#ifndef USE_SIMDE -#include -#else +#ifdef USE_SIMDE #include +#else +#include #endif #ifdef KSW_SSE2_ONLY @@ -15,10 +15,10 @@ #endif #ifdef __SSE4_1__ -#ifndef USE_SIMDE -#include -#else +#ifdef USE_SIMDE #include +#else +#include #endif #endif diff --git a/ksw2_exts2_sse.c b/ksw2_exts2_sse.c index 1eb9502..4157e38 100644 --- a/ksw2_exts2_sse.c +++ b/ksw2_exts2_sse.c @@ -4,20 +4,20 @@ #include "ksw2.h" #ifdef __SSE2__ -#ifndef USE_SIMDE -#include -#else +#ifdef USE_SIMDE #include +#else +#include #endif #ifdef KSW_SSE2_ONLY #undef __SSE4_1__ #endif #ifdef __SSE4_1__ -#ifndef USE_SIMDE -#include -#else +#ifdef USE_SIMDE #include +#else +#include #endif #endif diff --git a/ksw2_extz2_sse.c b/ksw2_extz2_sse.c index bcdf32f..ad19131 100644 --- a/ksw2_extz2_sse.c +++ b/ksw2_extz2_sse.c @@ -3,10 +3,10 @@ #include "ksw2.h" #ifdef __SSE2__ -#ifndef USE_SIMDE -#include -#else +#ifdef USE_SIMDE #include +#else +#include #endif #ifdef KSW_SSE2_ONLY @@ -14,10 +14,10 @@ #endif #ifdef __SSE4_1__ -#ifndef USE_SIMDE -#include -#else +#ifdef USE_SIMDE #include +#else +#include #endif #endif diff --git a/ksw2_ll_sse.c b/ksw2_ll_sse.c index 8cef46c..14b9b50 100644 --- a/ksw2_ll_sse.c +++ b/ksw2_ll_sse.c @@ -3,10 +3,10 @@ #include #include "ksw2.h" -#ifndef USE_SIMDE -#include -#else +#ifdef USE_SIMDE #include +#else +#include #endif #ifdef __GNUC__ From ccb0f7b05df3a17011f0d0f4388ddb301198871b Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 25 Apr 2020 22:43:29 -0400 Subject: [PATCH 07/77] added a new Makefile for simde --- Makefile | 39 +++----------------- Makefile.simde | 97 ++++++++++++++++++++++++++++++++++++++++++++++++++ README.md | 12 ++++--- 3 files changed, 110 insertions(+), 38 deletions(-) create mode 100644 Makefile.simde diff --git a/Makefile b/Makefile index 7d2afc9..18622f5 100644 --- a/Makefile +++ b/Makefile @@ -1,40 +1,26 @@ CFLAGS= -g -Wall -O2 -Wc++-compat #-Wextra CPPFLAGS= -DHAVE_KALLOC INCLUDES= -OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o chain.o align.o hit.o map.o format.o pe.o esterr.o splitidx.o +OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o chain.o align.o hit.o map.o format.o pe.o esterr.o splitidx.o ksw2_ll_sse.o PROG= minimap2 PROG_EXTRA= sdust minimap2-lite LIBS= -lm -lz -lpthread - -ifeq ($(no_simd),) # if no_simd is not defined ifeq ($(arm_neon),) # if arm_neon is not defined - OBJS+=ksw2_ll_sse.o ifeq ($(sse2only),) # if sse2only is not defined OBJS+=ksw2_extz2_sse41.o ksw2_extd2_sse41.o ksw2_exts2_sse41.o ksw2_extz2_sse2.o ksw2_extd2_sse2.o ksw2_exts2_sse2.o ksw2_dispatch.o else # if sse2only is defined OBJS+=ksw2_extz2_sse.o ksw2_extd2_sse.o ksw2_exts2_sse.o endif else # if arm_neon is defined - OBJS+=ksw2_ll_neon.o ksw2_extz2_neon.o ksw2_extd2_neon.o ksw2_exts2_neon.o -ifeq ($(simde),) # arm_neon without SIMDe -> use sse2neon - INCLUDES+=-Isse2neon -endif + OBJS+=ksw2_extz2_neon.o ksw2_extd2_neon.o ksw2_exts2_neon.o + INCLUDES+=-Isse2neon ifeq ($(aarch64),) #if aarch64 is not defined CFLAGS+=-D_FILE_OFFSET_BITS=64 -mfpu=neon -fsigned-char else #if aarch64 is defined CFLAGS+=-D_FILE_OFFSET_BITS=64 -fsigned-char endif endif -else - OBJS+=ksw2_ll_nosimd.o ksw2_extz2_nosimd.o ksw2_extd2_nosimd.o ksw2_exts2_nosimd.o - simde=1 # no_simd can be used only with SIMDe -endif - -ifneq ($(simde),) # if simde is defined - CFLAGS+=-DSIMDE_ENABLE_NATIVE_ALIASES -DUSE_SIMDE - INCLUDES+=-Ilib/simde -endif ifneq ($(asan),) CFLAGS+=-fsanitize=address @@ -70,8 +56,10 @@ sdust:sdust.c kalloc.o kalloc.h kdq.h kvec.h kseq.h ketopt.h sdust.h # SSE-specific targets on x86/x86_64 +ifeq ($(arm_neon),) # if arm_neon is defined, compile this target with the default setting (i.e. no -msse2) ksw2_ll_sse.o:ksw2_ll_sse.c ksw2.h kalloc.h $(CC) -c $(CFLAGS) -msse2 $(CPPFLAGS) $(INCLUDES) $< -o $@ +endif ksw2_extz2_sse41.o:ksw2_extz2_sse.c ksw2.h kalloc.h $(CC) -c $(CFLAGS) -msse4.1 $(CPPFLAGS) -DKSW_CPU_DISPATCH $(INCLUDES) $< -o $@ @@ -96,9 +84,6 @@ ksw2_dispatch.o:ksw2_dispatch.c ksw2.h # NEON-specific targets on ARM -ksw2_ll_neon.o:ksw2_ll_sse.c ksw2.h kalloc.h - $(CC) -c $(CFLAGS) $(CPPFLAGS) $(INCLUDES) $< -o $@ - ksw2_extz2_neon.o:ksw2_extz2_sse.c ksw2.h kalloc.h $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_SSE2_ONLY -D__SSE2__ $(INCLUDES) $< -o $@ @@ -108,20 +93,6 @@ ksw2_extd2_neon.o:ksw2_extd2_sse.c ksw2.h kalloc.h ksw2_exts2_neon.o:ksw2_exts2_sse.c ksw2.h kalloc.h $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_SSE2_ONLY -D__SSE2__ $(INCLUDES) $< -o $@ -# no-SIMD version - -ksw2_ll_nosimd.o:ksw2_ll_sse.c ksw2.h kalloc.h - $(CC) -c $(CFLAGS) $(CPPFLAGS) -DSIMDE_NO_NATIVE $(INCLUDES) $< -o $@ - -ksw2_extz2_nosimd.o:ksw2_extz2_sse.c ksw2.h kalloc.h - $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_SSE2_ONLY -D__SSE2__ -DSIMDE_NO_NATIVE $(INCLUDES) $< -o $@ - -ksw2_extd2_nosimd.o:ksw2_extd2_sse.c ksw2.h kalloc.h - $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_SSE2_ONLY -D__SSE2__ -DSIMDE_NO_NATIVE $(INCLUDES) $< -o $@ - -ksw2_exts2_nosimd.o:ksw2_exts2_sse.c ksw2.h kalloc.h - $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_SSE2_ONLY -D__SSE2__ -DSIMDE_NO_NATIVE $(INCLUDES) $< -o $@ - # other non-file targets clean: diff --git a/Makefile.simde b/Makefile.simde new file mode 100644 index 0000000..381a668 --- /dev/null +++ b/Makefile.simde @@ -0,0 +1,97 @@ +CFLAGS= -g -Wall -O2 -Wc++-compat #-Wextra +CPPFLAGS= -DHAVE_KALLOC -DUSE_SIMDE -DSIMDE_ENABLE_NATIVE_ALIASES +INCLUDES= -Ilib/simde +OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o chain.o align.o hit.o map.o format.o pe.o esterr.o splitidx.o \ + ksw2_extz2_simde.o ksw2_extd2_simde.o ksw2_exts2_simde.o ksw2_ll_simde.o +PROG= minimap2 +PROG_EXTRA= sdust minimap2-lite +LIBS= -lm -lz -lpthread + + +ifneq ($(arm_neon),) # if arm_neon is defined +ifeq ($(aarch64),) #if aarch64 is not defined + CFLAGS+=-D_FILE_OFFSET_BITS=64 -mfpu=neon -fsigned-char +else #if aarch64 is defined + CFLAGS+=-D_FILE_OFFSET_BITS=64 -fsigned-char +endif +endif + +ifneq ($(asan),) + CFLAGS+=-fsanitize=address + LIBS+=-fsanitize=address +endif + +ifneq ($(tsan),) + CFLAGS+=-fsanitize=thread + LIBS+=-fsanitize=thread +endif + +.PHONY:all extra clean depend +.SUFFIXES:.c .o + +.c.o: + $(CC) -c $(CFLAGS) $(CPPFLAGS) $(INCLUDES) $< -o $@ + +all:$(PROG) + +extra:all $(PROG_EXTRA) + +minimap2:main.o libminimap2.a + $(CC) $(CFLAGS) main.o -o $@ -L. -lminimap2 $(LIBS) + +minimap2-lite:example.o libminimap2.a + $(CC) $(CFLAGS) $< -o $@ -L. -lminimap2 $(LIBS) + +libminimap2.a:$(OBJS) + $(AR) -csru $@ $(OBJS) + +sdust:sdust.c kalloc.o kalloc.h kdq.h kvec.h kseq.h ketopt.h sdust.h + $(CC) -D_SDUST_MAIN $(CFLAGS) $< kalloc.o -o $@ -lz + +ksw2_ll_simde.o:ksw2_ll_sse.c ksw2.h kalloc.h + $(CC) -c $(CFLAGS) -msse2 $(CPPFLAGS) $(INCLUDES) $< -o $@ + +ksw2_extz2_simde.o:ksw2_extz2_sse.c ksw2.h kalloc.h + $(CC) -c $(CFLAGS) -msse4.1 $(CPPFLAGS) $(INCLUDES) $< -o $@ + +ksw2_extd2_simde.o:ksw2_extd2_sse.c ksw2.h kalloc.h + $(CC) -c $(CFLAGS) -msse4.1 $(CPPFLAGS) $(INCLUDES) $< -o $@ + +ksw2_exts2_simde.o:ksw2_exts2_sse.c ksw2.h kalloc.h + $(CC) -c $(CFLAGS) -msse4.1 $(CPPFLAGS) $(INCLUDES) $< -o $@ + +# other non-file targets + +clean: + rm -fr gmon.out *.o a.out $(PROG) $(PROG_EXTRA) *~ *.a *.dSYM build dist mappy*.so mappy.c python/mappy.c mappy.egg* + +depend: + (LC_ALL=C; export LC_ALL; makedepend -Y -- $(CFLAGS) $(CPPFLAGS) -- *.c) + +# DO NOT DELETE + +align.o: minimap.h mmpriv.h bseq.h kseq.h ksw2.h kalloc.h +bseq.o: bseq.h kvec.h kalloc.h kseq.h +chain.o: minimap.h mmpriv.h bseq.h kseq.h kalloc.h +esterr.o: mmpriv.h minimap.h bseq.h kseq.h +example.o: minimap.h kseq.h +format.o: kalloc.h mmpriv.h minimap.h bseq.h kseq.h +hit.o: mmpriv.h minimap.h bseq.h kseq.h kalloc.h khash.h +index.o: kthread.h bseq.h minimap.h mmpriv.h kseq.h kvec.h kalloc.h khash.h +index.o: ksort.h +kalloc.o: kalloc.h +ksw2_extd2_sse.o: ksw2.h kalloc.h +ksw2_exts2_sse.o: ksw2.h kalloc.h +ksw2_extz2_sse.o: ksw2.h kalloc.h +ksw2_ll_sse.o: ksw2.h kalloc.h +kthread.o: kthread.h +main.o: bseq.h minimap.h mmpriv.h kseq.h ketopt.h +map.o: kthread.h kvec.h kalloc.h sdust.h mmpriv.h minimap.h bseq.h kseq.h +map.o: khash.h ksort.h +misc.o: mmpriv.h minimap.h bseq.h kseq.h ksort.h +options.o: mmpriv.h minimap.h bseq.h kseq.h +pe.o: mmpriv.h minimap.h bseq.h kseq.h kvec.h kalloc.h ksort.h +sdust.o: kalloc.h kdq.h kvec.h sdust.h +self-chain.o: minimap.h kseq.h +sketch.o: kvec.h kalloc.h mmpriv.h minimap.h bseq.h kseq.h +splitidx.o: mmpriv.h minimap.h bseq.h kseq.h diff --git a/README.md b/README.md index a62d189..4905417 100644 --- a/README.md +++ b/README.md @@ -80,11 +80,14 @@ directory to compile. If you see compilation errors, try `make sse2only=1` to disable SSE4 code, which will make minimap2 slightly slower. Minimap2 also works with ARM CPUs supporting the NEON instruction sets. To -compile for 32 bit ARM architectures (such as ARMv7), use `make arm_neon=1`. To compile for for 64 bit ARM architectures (such as ARMv8), use `make arm_neon=1 aarch64=1`. +compile for 32 bit ARM architectures (such as ARMv7), use `make arm_neon=1`. To +compile for for 64 bit ARM architectures (such as ARMv8), use `make arm_neon=1 +aarch64=1`. -Minimap2 can use [SIMD Everywhere (SIMDe)](https://github.com/nemequ/simde) library for porting implementation -to the different SIMD instruction sets. To compile using SIMDe, use `make simde=1`. To compile for ARM CPUs, add `simde=1` to the commands given above. -SIMDe also enables non-SIMD implementation using `make no_simd=1`. +Minimap2 can use [SIMD Everywhere (SIMDe)][simde] library for porting +implementation to the different SIMD instruction sets. To compile using SIMDe, +use `make -f Makefile.simde`. To compile for ARM CPUs, use `Makefile.simde` +with the ARM related command lines given above. ### General usage @@ -380,3 +383,4 @@ mappy` or [from BioConda][mappyconda] via `conda install -c bioconda mappy`. [manpage]: https://lh3.github.io/minimap2/minimap2.html [manpage-cs]: https://lh3.github.io/minimap2/minimap2.html#10 [doi]: https://doi.org/10.1093/bioinformatics/bty191 +[smide]: https://github.com/nemequ/simde From c9874e2dc50e32bbff4ded01cf5ec0e9be0a53dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Armin=20To=CC=88pfer?= Date: Fri, 12 Jun 2020 12:52:08 +0200 Subject: [PATCH 08/77] Initialize r->p if ez->zdropped --- align.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/align.c b/align.c index a7c4c3c..7bfb254 100644 --- a/align.c +++ b/align.c @@ -739,6 +739,13 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int if (ez->n_cigar > 0) mm_append_cigar(r, ez->n_cigar, ez->cigar); if (ez->zdropped) { // truncated by Z-drop; TODO: sometimes Z-drop kicks in because the next seed placement is wrong. This can be fixed in principle. + if (!r->p) { + assert(ez->n_cigar == 0); + uint32_t capacity = sizeof(mm_extra_t)/4; + kroundup32(capacity); + r->p = (mm_extra_t*)calloc(capacity, 4); + r->p->capacity = capacity; + } for (j = i - 1; j >= 0; --j) if ((int32_t)a[as1 + j].x <= rs + ez->max_t) break; From f995f55610fc67f98c5a56f4e4b55378bd5b5b00 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 21 Aug 2020 11:12:50 -0400 Subject: [PATCH 09/77] added --mask-len for #659 --- hit.c | 4 ++-- main.c | 2 ++ map.c | 8 ++++---- minimap.h | 1 + minimap2.1 | 9 ++++++++- mmpriv.h | 2 +- options.c | 2 ++ python/cmappy.pxd | 1 + 8 files changed, 21 insertions(+), 8 deletions(-) diff --git a/hit.c b/hit.c index 506464b..88bbc03 100644 --- a/hit.c +++ b/hit.c @@ -122,7 +122,7 @@ void mm_split_reg(mm_reg1_t *r, mm_reg1_t *r2, int n, int qlen, mm128_t *a) r->split |= 1, r2->split |= 2; } -void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r, int sub_diff, int hard_mask_level, float alt_diff_frac) // and compute mm_reg1_t::subsc +void mm_set_parent(void *km, float mask_level, int mask_len, int n, mm_reg1_t *r, int sub_diff, int hard_mask_level, float alt_diff_frac) // and compute mm_reg1_t::subsc { int i, j, k, *w; uint64_t *cov; @@ -162,7 +162,7 @@ skip_uncov: min = ej - sj < ei - si? ej - sj : ei - si; max = ej - sj > ei - si? ej - sj : ei - si; ol = si < sj? (ei < sj? 0 : ei < ej? ei - sj : ej - sj) : (ej < si? 0 : ej < ei? ej - si : ei - si); // overlap length; TODO: this can be simplified - if ((float)ol / min - (float)uncov_len / max > mask_level) { + if ((float)ol / min - (float)uncov_len / max > mask_level && uncov_len <= mask_len) { // then this is a secondary hit int cnt_sub = 0, sci = ri->score; ri->parent = rp->parent; if (!rp->is_alt && ri->is_alt) sci = mm_alt_score(sci, alt_diff_frac); diff --git a/main.c b/main.c index 6780f41..9b41ff6 100644 --- a/main.c +++ b/main.c @@ -70,6 +70,7 @@ static ko_longopt_t long_options[] = { { "chain-gap-scale",ko_required_argument, 343 }, { "alt", ko_required_argument, 344 }, { "alt-drop", ko_required_argument, 345 }, + { "mask-len", ko_required_argument, 346 }, { "help", ko_no_argument, 'h' }, { "max-intron-len", ko_required_argument, 'G' }, { "version", ko_no_argument, 'V' }, @@ -217,6 +218,7 @@ int main(int argc, char *argv[]) else if (c == 343) opt.chain_gap_scale = atof(o.arg); // --chain-gap-scale else if (c == 344) alt_list = o.arg; // --alt else if (c == 345) opt.alt_drop = atof(o.arg); // --alt-drop + else if (c == 346) opt.mask_len = mm_parse_num(o.arg); // --mask-len else if (c == 314) { // --frag yes_or_no(&opt, MM_F_FRAG_MODE, o.longidx, o.arg, 1); } else if (c == 315) { // --secondary diff --git a/map.c b/map.c index 01d323c..0812783 100644 --- a/map.c +++ b/map.c @@ -249,7 +249,7 @@ static mm128_t *collect_seed_hits(void *km, const mm_mapopt_t *opt, int max_occ, static void chain_post(const mm_mapopt_t *opt, int max_chain_gap_ref, const mm_idx_t *mi, void *km, int qlen, int n_segs, const int *qlens, int *n_regs, mm_reg1_t *regs, mm128_t *a) { if (!(opt->flag & MM_F_ALL_CHAINS)) { // don't choose primary mapping(s) - mm_set_parent(km, opt->mask_level, *n_regs, regs, opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop); + mm_set_parent(km, opt->mask_level, opt->mask_len, *n_regs, regs, opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop); if (n_segs <= 1) mm_select_sub(km, opt->pri_ratio, mi->k*2, opt->best_n, n_regs, regs); else mm_select_sub_multi(km, opt->pri_ratio, 0.2f, 0.7f, max_chain_gap_ref, mi->k*2, opt->best_n, n_segs, qlens, n_regs, regs); if (!(opt->flag & (MM_F_SPLICE|MM_F_SR|MM_F_NO_LJOIN))) // long join not working well without primary chains @@ -262,7 +262,7 @@ static mm_reg1_t *align_regs(const mm_mapopt_t *opt, const mm_idx_t *mi, void *k if (!(opt->flag & MM_F_CIGAR)) return regs; regs = mm_align_skeleton(km, opt, mi, qlen, seq, n_regs, regs, a); // this calls mm_filter_regs() if (!(opt->flag & MM_F_ALL_CHAINS)) { // don't choose primary mapping(s) - mm_set_parent(km, opt->mask_level, *n_regs, regs, opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop); + mm_set_parent(km, opt->mask_level, opt->mask_len, *n_regs, regs, opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop); mm_select_sub(km, opt->pri_ratio, mi->k*2, opt->best_n, n_regs, regs); mm_set_sam_pri(*n_regs, regs); } @@ -365,7 +365,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char ** seg = mm_seg_gen(b->km, hash, n_segs, qlens, n_regs0, regs0, n_regs, regs, a); // split fragment chain to separate segment chains free(regs0); for (i = 0; i < n_segs; ++i) { - mm_set_parent(b->km, opt->mask_level, n_regs[i], regs[i], opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop); // update mm_reg1_t::parent + mm_set_parent(b->km, opt->mask_level, opt->mask_len, n_regs[i], regs[i], opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop); // update mm_reg1_t::parent regs[i] = align_regs(opt, mi, b->km, qlens[i], seqs[i], &n_regs[i], regs[i], seg[i].a); mm_set_mapq(b->km, n_regs[i], regs[i], opt->min_chain_score, opt->a, rep_len, is_sr); } @@ -509,7 +509,7 @@ static void merge_hits(step_t *s) } } mm_hit_sort(km, &s->n_reg[k], s->reg[k], opt->alt_drop); - mm_set_parent(km, opt->mask_level, s->n_reg[k], s->reg[k], opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop); + mm_set_parent(km, opt->mask_level, opt->mask_len, s->n_reg[k], s->reg[k], opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop); if (!(opt->flag & MM_F_ALL_CHAINS)) { mm_select_sub(km, opt->pri_ratio, s->p->mi->k*2, opt->best_n, &s->n_reg[k], s->reg[k]); mm_set_sam_pri(s->n_reg[k], s->reg[k]); diff --git a/minimap.h b/minimap.h index 251879b..a00869e 100644 --- a/minimap.h +++ b/minimap.h @@ -122,6 +122,7 @@ typedef struct { float chain_gap_scale; float mask_level; + int mask_len; float pri_ratio; int best_n; // top best_n chains are subjected to DP alignment diff --git a/minimap2.1 b/minimap2.1 index 88d895c..aeb1492 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -237,7 +237,14 @@ or more of the shorter chain [0.5] .B --hard-mask-level Honor option .B -M -and disable a heurstic to save unmapped subsequences. +and disable a heurstic to save unmapped subsequences and disables +.BR --mask-len . +.TP +.BI --mask-len \ NUM +Keep an alignment if dropping it leaves an unaligned region on query longer than +.IR INT +[inf]. Effective without +.BR --hard-mask-level . .TP .BI --max-chain-skip \ INT A heuristics that stops chaining early [25]. Minimap2 uses dynamic programming diff --git a/mmpriv.h b/mmpriv.h index ffba6eb..1eebb5c 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -71,7 +71,7 @@ void mm_split_reg(mm_reg1_t *r, mm_reg1_t *r2, int n, int qlen, mm128_t *a); void mm_sync_regs(void *km, int n_regs, mm_reg1_t *regs); int mm_squeeze_a(void *km, int n_regs, mm_reg1_t *regs, mm128_t *a); int mm_set_sam_pri(int n, mm_reg1_t *r); -void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r, int sub_diff, int hard_mask_level, float alt_diff_frac); +void mm_set_parent(void *km, float mask_level, int mask_len, int n, mm_reg1_t *r, int sub_diff, int hard_mask_level, float alt_diff_frac); void mm_select_sub(void *km, float pri_ratio, int min_diff, int best_n, int *n_, mm_reg1_t *r); void mm_select_sub_multi(void *km, float pri_ratio, float pri1, float pri2, int max_gap_ref, int min_diff, int best_n, int n_segs, const int *qlens, int *n_, mm_reg1_t *r); void mm_filter_regs(const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs); diff --git a/options.c b/options.c index 8276b24..d4eafaa 100644 --- a/options.c +++ b/options.c @@ -1,4 +1,5 @@ #include +#include #include "mmpriv.h" void mm_idxopt_init(mm_idxopt_t *opt) @@ -27,6 +28,7 @@ void mm_mapopt_init(mm_mapopt_t *opt) opt->chain_gap_scale = 1.0f; opt->mask_level = 0.5f; + opt->mask_len = INT_MAX; opt->pri_ratio = 0.8f; opt->best_n = 5; diff --git a/python/cmappy.pxd b/python/cmappy.pxd index 72a19d7..459b246 100644 --- a/python/cmappy.pxd +++ b/python/cmappy.pxd @@ -22,6 +22,7 @@ cdef extern from "minimap.h": int min_chain_score float chain_gap_scale float mask_level + int mask_len float pri_ratio int best_n int max_join_long, max_join_short From a3253d1a6b6820a4f45530fde01aefa0828eef43 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 14 Jan 2021 12:24:38 -0500 Subject: [PATCH 10/77] added a command for simple VCF statistics --- misc/paftools.js | 67 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/misc/paftools.js b/misc/paftools.js index f15cfde..6caf897 100755 --- a/misc/paftools.js +++ b/misc/paftools.js @@ -2512,6 +2512,71 @@ function paf_ov_eval(args) print((100 * (1 - n_missing / n_ovlp)).toFixed(2) + "% sensitivity"); } +function paf_vcfstat(args) +{ + var c, ts = { "AG":1, "GA":1, "CT":1, "TC":1 }; + while ((c = getopt(args, "")) != null) { + } + var buf = new Bytes(); + var file = args.length == getopt.ind? new File() : new File(args[getopt.ind]); + var x = { sub:0, ts:0, tv:0, ins:0, del:0, ins1:0, del1:0, ins2:0, del2:0, ins50:0, del50:0, ins1k:0, del1k:0, ins7k:0, del7k:0, insinf:0, delinf:0 }; + while (file.readline(buf) >= 0) { + var t = buf.toString().split("\t"); + if (t[0][0] == '#') continue; + var alt = t[4].split(","); + var ref = t[3]; + for (var i = 0; i < alt.length; ++i) { + var a = alt[i]; + if (a[0] == '<' || a[1] == '>') continue; + var l = ref.length < a.length? ref.length : a.length; + for (var j = 0; j < l; ++j) { + if (ref[j] != a[j]) { + ++x.sub; + if (ts[ref[j] + a[j]]) ++x.ts; + else ++x.tv; + } + } + var d = a.length - ref.length; + if (d > 0) { + ++x.ins; + if (d == 1) ++x.ins1; + else if (d == 2) ++x.ins2; + else if (d < 50) ++x.ins50; + else if (d < 1000) ++x.ins1k; + else if (d < 7000) ++x.ins7k; + else ++x.insinf; + } else if (d < 0) { + d = -d; + ++x.del; + if (d == 1) ++x.del1; + else if (d == 2) ++x.del2; + else if (d < 50) ++x.del50; + else if (d < 1000) ++x.del1k; + else if (d < 7000) ++x.del7k; + else ++x.delinf; + } + } + } + file.close(); + buf.destroy(); + print("# substitutions: " + x.sub); + print("ts/tv: " + (x.ts / x.tv).toFixed(3)); + print("# insertions: " + x.ins); + print("# 1bp insertions: " + x.ins1); + print("# 2bp insertions: " + x.ins2); + print("# [3,50) insertions: " + x.ins50); + print("# [50,1000) insertions: " + x.ins1k); + print("# [1000,7000) insertions: " + x.ins7k); + print("# >=7000 insertions: " + x.insinf); + print("# deletions: " + x.del); + print("# 1bp deletions: " + x.del1); + print("# 2bp deletions: " + x.del2); + print("# [3,50) deletions: " + x.del50); + print("# [50,1000) deletions: " + x.del1k); + print("# [1000,7000) deletions: " + x.del7k); + print("# >=7000 deletions: " + x.delinf); +} + /************************* ***** main function ***** *************************/ @@ -2533,6 +2598,7 @@ function main(args) print(" liftover simplistic liftOver"); print(" call call variants from asm-to-ref alignment with the cs tag"); print(" bedcov compute the number of bases covered"); + print(" vcfstat VCF statistics"); print(" version print paftools.js version"); print(""); print(" mapeval evaluate mapping accuracy using mason2/PBSIM-simulated FASTQ"); @@ -2561,6 +2627,7 @@ function main(args) else if (cmd == 'pbsim2fq') paf_pbsim2fq(args); else if (cmd == 'junceval') paf_junceval(args); else if (cmd == 'ov-eval') paf_ov_eval(args); + else if (cmd == 'vcfstat') paf_vcfstat(args); else if (cmd == 'version') print(paftools_version); else throw Error("unrecognized command: " + cmd); } From 10259934693ab975ca0cf2b9214a1e1540281c2b Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 29 Jan 2021 14:08:21 -0500 Subject: [PATCH 11/77] print number of errors on each read --- misc/paftools.js | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/misc/paftools.js b/misc/paftools.js index 6caf897..71fcba9 100755 --- a/misc/paftools.js +++ b/misc/paftools.js @@ -962,12 +962,13 @@ function paf_asmgene(args) function paf_stat(args) { - var c, gap_out_len = null; - while ((c = getopt(args, "l:")) != null) + var c, gap_out_len = null, count_err = false; + while ((c = getopt(args, "cl:")) != null) if (c == 'l') gap_out_len = parseInt(getopt.arg); + else if (c == 'c') count_err = true; if (getopt.ind == args.length) { - print("Usage: paftools.js stat [-l gapOutLen] |"); + print("Usage: paftools.js stat [-c] [-l gapOutLen] |"); exit(1); } @@ -998,7 +999,7 @@ function paf_stat(args) if (line.charAt(0) != '@') { var t = line.split("\t", 12); var m, rs, cigar = null, is_pri = false, is_sam = false, is_rev = false, tname = null; - var atlen = null, aqlen, qs, qe, mapq, ori_qlen; + var atlen = null, aqlen, qs, qe, mapq, ori_qlen, NM = null; if (t.length < 2) continue; if (t[4] == '+' || t[4] == '-' || t[4] == '*') { // PAF if (t[4] == '*') continue; // unmapped @@ -1006,6 +1007,8 @@ function paf_stat(args) ++n_2nd; continue; } + if ((m = /\tNM:i:(\d+)/.exec(line)) != null) + NM = parseInt(m[1]); if ((m = /\tcg:Z:(\S+)/.exec(line)) != null) cigar = m[1]; if (cigar == null) { @@ -1027,6 +1030,8 @@ function paf_stat(args) ++n_2nd; continue; } + if ((m = /\tNM:i:(\d+)/.exec(line)) != null) + NM = parseInt(m[1]); cigar = t[5]; tname = t[2]; rs = parseInt(t[3]) - 1; @@ -1045,11 +1050,13 @@ function paf_stat(args) ++n_seq, last = t[0]; } var M = 0, tl = 0, ql = 0, clip = [0, 0], n_cigar = 0, sclip = 0; + var n_gapo = 0, n_gap_all = 0, l_match = 0; while ((m = re.exec(cigar)) != null) { var l = parseInt(m[1]); ++n_cigar; if (m[2] == 'M' || m[2] == '=' || m[2] == 'X') { tl += l, ql += l, M += l; + l_match += l; } else if (m[2] == 'I' || m[2] == 'D') { var type; if (l < 50) type = 0; @@ -1062,6 +1069,7 @@ function paf_stat(args) else tl += l, ++n_gap[1][type]; if (gap_out_len != null && l >= gap_out_len) print(t[0], ql, is_rev? '-' : '+', tname, rs + tl, m[2], l); + ++n_gapo, n_gap_all += l; } else if (m[2] == 'N') { tl += l; } else if (m[2] == 'S') { @@ -1079,6 +1087,12 @@ function paf_stat(args) qs = clip[is_rev? 1 : 0], qe = qs + ql; ori_qlen = clip[0] + ql + clip[1]; } + if (count_err && NM != null) { + var n_mm = NM - n_gap_all; + if (n_mm < 0) warn("WARNING: NM is smaller than the number of gaps at line " + lineno); + if (n_mm < 0) n_mm = 0; + print(t[0], ori_qlen, t[11], ori_qlen - (qe - qs), NM, l_match + n_gap_all, n_mm + n_gapo, l_match + n_gapo); + } regs.push([qs, qe]); last_qlen = ori_qlen; } @@ -1091,7 +1105,7 @@ function paf_stat(args) file.close(); buf.destroy(); - if (gap_out_len == null) { + if (gap_out_len == null && !count_err) { print("Number of mapped sequences: " + n_seq); print("Number of primary alignments: " + n_pri); print("Number of secondary alignments: " + n_2nd); From 75c893351117754c41eca044116bfcb7211f285b Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 7 Feb 2021 12:34:13 -0500 Subject: [PATCH 12/77] evaluate large-scale misjoins --- misc/paftools.js | 111 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 110 insertions(+), 1 deletion(-) diff --git a/misc/paftools.js b/misc/paftools.js index 71fcba9..6f67a42 100755 --- a/misc/paftools.js +++ b/misc/paftools.js @@ -2591,6 +2591,113 @@ function paf_vcfstat(args) print("# >=7000 deletions: " + x.delinf); } +function paf_parseNum(s) { + var m, x = null; + if ((m = /^(\d*\.?\d*)([mMgGkK]?)/.exec(s)) != null) { + x = parseFloat(m[1]); + if (m[2] == 'k' || m[2] == 'K') x *= 1000; + else if (m[2] == 'm' || m[2] == 'M') x *= 1000000; + else if (m[2] == 'g' || m[2] == 'G') x *= 1000000000; + } + return Math.floor(x + .499); +} + +function paf_misjoin(args) +{ + var c, min_seg_len = 1000000, max_gap = 1000000, fn_cen = null, show_long = false; + var n_diff = [0, 0], n_gap = [0, 0], n_inv = [0, 0], n_inv_end = [0, 0]; + while ((c = getopt(args, "l:g:c:p")) != null) { + if (c == 'l') min_seg_len = paf_parseNum(getopt.arg); + else if (c == 'g') max_gap = paf_parseNum(getopt.arg); + else if (c == 'c') fn_cen = getopt.arg; + else if (c == 'p') show_long = true; + } + if (args.length == getopt.ind) { + print("Usage: paftools.js misjoin [options] "); + print("Options:"); + print(" -c FILE BED for centromeres []"); + print(" -l NUM min alignment block length [1m]"); + print(" -g NUM max gap size [1m]"); + print(" -p output long alignment blocks for debugging"); + return; + } + var cen = {}; + var file, buf = new Bytes(); + if (fn_cen != null) { + file = new File(fn_cen); + while (file.readline(buf) >= 0) { + var t = buf.toString().split("\t"); + if (cen[t[0]] == null) cen[t[0]] = []; + cen[t[0]].push([parseInt(t[1]), parseInt(t[2])]); + } + file.close(); + } + + function test_cen(cen, chr, st, en) { + var b = cen[chr]; + if (b == null) return false; + for (var j = 0; j < b.length; ++j) + if (b[j][0] < en && b[j][1] > st) + return true; + return false; + } + + function process(a) { + var k = 0; + for (var i = 0; i < a.length; ++i) { + for (var j = 1; j <= 3; ++j) a[i][j] = parseInt(a[i][j]); + for (var j = 6; j <= 11; ++j) a[i][j] = parseInt(a[i][j]); + if (a[i][10] >= min_seg_len) a[k++] = a[i]; + } + a.length = k; + if (a.length == 1) return; + a = a.sort(function(x,y){return x[2]-y[2]}); + if (show_long) for (var i = 0; i < a.length; ++i) print(a[i].join("\t")); + for (var i = 1; i < a.length; ++i) { + var ov = [false, false]; + ov[0] = test_cen(cen, a[i-1][5], a[i-1][7], a[i-1][8]); + ov[1] = test_cen(cen, a[i][5], a[i][7], a[i][8]); + if (a[i-1][5] != a[i][5]) { // different chr + if (ov[0] || ov[1]) ++n_diff[1]; + ++n_diff[0]; + } else if (a[i-1][4] == a[i][4]) { // a gap + var dq = a[i][2] - a[i-1][3]; + var dr = a[i][4] == '+'? a[i][7] - a[i-1][8] : a[i-1][7] - a[i][8]; + var gap = dr > dq? dr - dq : dq - dr; + if (gap > max_gap) { + if (ov[0] || ov[1]) ++n_gap[1]; + ++n_gap[0]; + } + } else if (i + 1 < a.length && a[i+1][4] == a[i-1][4]) { // bracketed inversion + if (ov[0] || ov[1]) ++n_inv[1]; + ++n_inv[0]; + ++i; + } else { // hanging inversion + if (ov[0] || ov[1]) ++n_inv_end[1]; + ++n_inv_end[0]; + } + } + } + + file = args[getopt.ind] == "-"? new File() : new File(args[getopt.ind]); + var a = []; + while (file.readline(buf) >= 0) { + var t = buf.toString().split("\t"); + if (a.length > 0 && a[0][0] != t[0]) { + process(a); + a.length = 0; + } + a.push(t); + } + if (a.length > 0) process(a); + file.close(); + buf.destroy(); + print("# inter-chromosomal misjoins: " + n_diff.join(",")); + print("# intra-chromosomal gaps: " + n_gap.join(",")); + print("# candidate inversions in the middle: " + n_inv.join(",")); + print("# candidate inversions at contig ends: " + n_inv_end.join(",")); +} + /************************* ***** main function ***** *************************/ @@ -2608,7 +2715,8 @@ function main(args) print(""); print(" stat collect basic mapping information in PAF/SAM"); print(" asmstat collect basic assembly information"); - print(" asmgene evaluate gene completeness (EXPERIMENTAL)"); + print(" asmgene evaluate gene completeness"); + print(" misjoin evaluate large-scale misjoins"); print(" liftover simplistic liftOver"); print(" call call variants from asm-to-ref alignment with the cs tag"); print(" bedcov compute the number of bases covered"); @@ -2632,6 +2740,7 @@ function main(args) else if (cmd == 'stat') paf_stat(args); else if (cmd == 'asmstat') paf_asmstat(args); else if (cmd == 'asmgene') paf_asmgene(args); + else if (cmd == 'misjoin') paf_misjoin(args); else if (cmd == 'liftover' || cmd == 'liftOver') paf_liftover(args); else if (cmd == 'vcfpair') paf_vcfpair(args); else if (cmd == 'call') paf_call(args); From 194b457e791396aa9e27d2254f058839f09b17aa Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 7 Feb 2021 12:58:03 -0500 Subject: [PATCH 13/77] option to print errors only --- misc/paftools.js | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/misc/paftools.js b/misc/paftools.js index 6f67a42..800ec00 100755 --- a/misc/paftools.js +++ b/misc/paftools.js @@ -2604,13 +2604,14 @@ function paf_parseNum(s) { function paf_misjoin(args) { - var c, min_seg_len = 1000000, max_gap = 1000000, fn_cen = null, show_long = false; + var c, min_seg_len = 1000000, max_gap = 1000000, fn_cen = null, show_long = false, show_err = false; var n_diff = [0, 0], n_gap = [0, 0], n_inv = [0, 0], n_inv_end = [0, 0]; - while ((c = getopt(args, "l:g:c:p")) != null) { + while ((c = getopt(args, "l:g:c:pe")) != null) { if (c == 'l') min_seg_len = paf_parseNum(getopt.arg); else if (c == 'g') max_gap = paf_parseNum(getopt.arg); else if (c == 'c') fn_cen = getopt.arg; else if (c == 'p') show_long = true; + else if (c == 'e') show_err = true; } if (args.length == getopt.ind) { print("Usage: paftools.js misjoin [options] "); @@ -2618,6 +2619,7 @@ function paf_misjoin(args) print(" -c FILE BED for centromeres []"); print(" -l NUM min alignment block length [1m]"); print(" -g NUM max gap size [1m]"); + print(" -e output misjoins not involving centromeres"); print(" -p output long alignment blocks for debugging"); return; } @@ -2659,6 +2661,10 @@ function paf_misjoin(args) ov[1] = test_cen(cen, a[i][5], a[i][7], a[i][8]); if (a[i-1][5] != a[i][5]) { // different chr if (ov[0] || ov[1]) ++n_diff[1]; + else if (show_err) { + print("J", a[i-1].slice(0, 12).join("\t")); + print("J", a[i].slice(0, 12).join("\t")); + } ++n_diff[0]; } else if (a[i-1][4] == a[i][4]) { // a gap var dq = a[i][2] - a[i-1][3]; @@ -2670,6 +2676,11 @@ function paf_misjoin(args) } } else if (i + 1 < a.length && a[i+1][4] == a[i-1][4]) { // bracketed inversion if (ov[0] || ov[1]) ++n_inv[1]; + else if (show_err) { + print("M", a[i-1].slice(0, 12).join("\t")); + print("M", a[i].slice(0, 12).join("\t")); + print("M", a[i+1].slice(0, 12).join("\t")); + } ++n_inv[0]; ++i; } else { // hanging inversion From 4dfd495cc2816f67556bc6318654e572636ee40a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 15 Feb 2021 14:49:36 -0500 Subject: [PATCH 14/77] added sveval --- misc/paftools.js | 111 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) diff --git a/misc/paftools.js b/misc/paftools.js index 800ec00..6c86792 100755 --- a/misc/paftools.js +++ b/misc/paftools.js @@ -2709,6 +2709,115 @@ function paf_misjoin(args) print("# candidate inversions at contig ends: " + n_inv_end.join(",")); } +function paf_sveval(args) +{ + var c, min_flt = 30, min_size = 50, max_size = 10000, win_size = 500, print_err = false; + while ((c = getopt(args, "f:i:x:w:e")) != null) { + if (c == 'f') min_flt = paf_parseNum(getopt.arg); + else if (c == 'i') min_size = paf_parseNum(getopt.arg); + else if (c == 'x') max_size = paf_parseNum(getopt.arg); + else if (c == 'w') win_size = paf_parseNum(getopt.arg); + else if (c == 'e') print_err = true; + } + if (args.length - getopt.ind < 3) { + print("Usage: paftools.js sveval [options] "); + print("Options:"); + print(" -f INT min length to discard [" + min_flt + "]"); + print(" -i INT min SV length [" + min_size + "]"); + print(" -x INT max SV length [" + max_size + "]"); + print(" -w INT fuzzy windown size [" + win_size + "]"); + print(" -e print errors"); + return; + } + + function read_bed(fn) { + var buf = new Bytes(); + var file = new File(fn); + var bed = {}; + while (file.readline(buf) >= 0) { + var t = buf.toString().split("\t"); + if (bed[t[0]] == null) bed[t[0]] = []; + bed[t[0]].push([parseInt(t[1]), parseInt(t[2])]); + } + file.close(); + buf.destroy(); + for (var x in bed) { + Interval.sort(bed[x]); + Interval.merge(bed[x]); + Interval.index_end(bed[x]); + } + return bed; + } + + var bed = read_bed(args[getopt.ind]); + + function read_vcf(fn, bed) { + var buf = new Bytes(); + var file = new File(fn); + var v = {}; + while (file.readline(buf) >= 0) { + var m, t = buf.toString().split("\t"); + if (t[0][0] == '#') continue; + if (bed[t[0]] == null) continue; + if (t[4] == '' || t[4] == '') continue; // no inversion + var st = parseInt(t[1]) - 1, en = st + t[3].length; + if ((m = /((;END)|(^END))=(\d+)/.exec(t[7])) != null) + en = parseInt(m[4]); + if (Interval.find_ovlp(bed[t[0]], st, en).length == 0) continue; + // determine svlen + var s = t[4].split(","), max_del = 0, max_ins = 0; + for (var i = 0; i < s.length; ++i) { + var l = s[i].length - t[3].length; + if (l > 0) + max_ins = max_ins > l? max_ins : l; + else if (l < 0) + max_del = max_del > -l? max_del : -l; + } + if (max_ins < min_flt && max_del < min_flt) continue; + var svlen = max_ins > max_del? max_ins : -max_del; + if ((m = /((;SVLEN)|(^SVLEN))=(\d+)/.exec(t[7])) != null) + svlen = parseInt(m[4]); + var abslen = svlen > 0? svlen : -svlen; + if (abslen < min_flt || abslen > max_size) continue; + // insert + if (v[t[0]] == null) v[t[0]] = []; + v[t[0]].push([st, en, svlen, abslen]); + } + file.close(); + buf.destroy(); + for (var x in v) { + Interval.sort(v[x]); + Interval.index_end(v[x]); + } + return v; + } + + function compare_vcf(v0, v1, label) { + var m = 0, n = 0; + for (var x in v1) { + var a1 = v1[x], a0 = v0[x]; + for (var i = 0; i < a1.length; ++i) { + if (a1[i][3] < min_size) continue; + ++n; + if (a0 == null) continue; + var st = a1[i][0] > win_size? a1[i][0] - win_size : 0; + b = Interval.find_ovlp(a0, st, a1[i][1] + win_size); + if (b.length > 0) ++m; + else if (print_err) print(label, x, a1[i].slice(0, 3).join("\t")); + } + } + return [n, m]; + } + + var v_base = read_vcf(args[getopt.ind+1], bed); + var v_call = read_vcf(args[getopt.ind+2], bed); + var fn = compare_vcf(v_call, v_base, 'FN'); + var fp = compare_vcf(v_base, v_call, 'FP'); + print('SN', fn[0], fn[1], (fn[1] / fn[0]).toFixed(6)); + print('PC', fp[0], fp[1], (fp[1] / fp[0]).toFixed(6)); + print('F1', ((fn[1] / fn[0] + fp[1] / fp[0]) / 2).toFixed(6)); +} + /************************* ***** main function ***** *************************/ @@ -2732,6 +2841,7 @@ function main(args) print(" call call variants from asm-to-ref alignment with the cs tag"); print(" bedcov compute the number of bases covered"); print(" vcfstat VCF statistics"); + print(" sveval compare two SV callsets in VCF"); print(" version print paftools.js version"); print(""); print(" mapeval evaluate mapping accuracy using mason2/PBSIM-simulated FASTQ"); @@ -2762,6 +2872,7 @@ function main(args) else if (cmd == 'junceval') paf_junceval(args); else if (cmd == 'ov-eval') paf_ov_eval(args); else if (cmd == 'vcfstat') paf_vcfstat(args); + else if (cmd == 'sveval') paf_sveval(args); else if (cmd == 'version') print(paftools_version); else throw Error("unrecognized command: " + cmd); } From ecbc399fa2d14a8fbf177e53178b69ce8c869195 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 6 Mar 2021 19:44:13 -0500 Subject: [PATCH 15/77] improved sveval --- misc/paftools.js | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/misc/paftools.js b/misc/paftools.js index 6c86792..915d0e8 100755 --- a/misc/paftools.js +++ b/misc/paftools.js @@ -2711,17 +2711,19 @@ function paf_misjoin(args) function paf_sveval(args) { - var c, min_flt = 30, min_size = 50, max_size = 10000, win_size = 500, print_err = false; - while ((c = getopt(args, "f:i:x:w:e")) != null) { + var c, min_flt = 30, min_size = 50, max_size = 10000, win_size = 500, print_err = false, bed_fn = null; + while ((c = getopt(args, "f:i:x:w:er:")) != null) { if (c == 'f') min_flt = paf_parseNum(getopt.arg); else if (c == 'i') min_size = paf_parseNum(getopt.arg); else if (c == 'x') max_size = paf_parseNum(getopt.arg); else if (c == 'w') win_size = paf_parseNum(getopt.arg); + else if (c == 'r') bed_fn = getopt.arg; else if (c == 'e') print_err = true; } - if (args.length - getopt.ind < 3) { - print("Usage: paftools.js sveval [options] "); + if (args.length - getopt.ind < 2) { + print("Usage: paftools.js sveval [options] "); print("Options:"); + print(" -r FILE confident region in BED []"); print(" -f INT min length to discard [" + min_flt + "]"); print(" -i INT min SV length [" + min_size + "]"); print(" -x INT max SV length [" + max_size + "]"); @@ -2749,7 +2751,7 @@ function paf_sveval(args) return bed; } - var bed = read_bed(args[getopt.ind]); + var bed = bed_fn != null? read_bed(bed_fn) : null; function read_vcf(fn, bed) { var buf = new Bytes(); @@ -2758,12 +2760,13 @@ function paf_sveval(args) while (file.readline(buf) >= 0) { var m, t = buf.toString().split("\t"); if (t[0][0] == '#') continue; - if (bed[t[0]] == null) continue; + if (bed != null && bed[t[0]] == null) continue; if (t[4] == '' || t[4] == '') continue; // no inversion + if (/[\[\]]/.test(t[4])) continue; // no break points var st = parseInt(t[1]) - 1, en = st + t[3].length; if ((m = /((;END)|(^END))=(\d+)/.exec(t[7])) != null) en = parseInt(m[4]); - if (Interval.find_ovlp(bed[t[0]], st, en).length == 0) continue; + if (bed != null && Interval.find_ovlp(bed[t[0]], st, en).length == 0) continue; // determine svlen var s = t[4].split(","), max_del = 0, max_ins = 0; for (var i = 0; i < s.length; ++i) { @@ -2809,8 +2812,8 @@ function paf_sveval(args) return [n, m]; } - var v_base = read_vcf(args[getopt.ind+1], bed); - var v_call = read_vcf(args[getopt.ind+2], bed); + var v_base = read_vcf(args[getopt.ind+0], bed); + var v_call = read_vcf(args[getopt.ind+1], bed); var fn = compare_vcf(v_call, v_base, 'FN'); var fp = compare_vcf(v_base, v_call, 'FP'); print('SN', fn[0], fn[1], (fn[1] / fn[0]).toFixed(6)); From 1c4b7e8a48ca6289a6b949f163528a299b0a28b6 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 6 Mar 2021 19:44:24 -0500 Subject: [PATCH 16/77] explained --junc-bed in README --- README.md | 14 ++++++++++++++ minimap2.1 | 2 +- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 4905417..65395e6 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ cd minimap2 && make ./minimap2 -ax splice ref.fa rna-reads.fa > aln.sam # spliced long reads (strand unknown) ./minimap2 -ax splice -uf -k14 ref.fa reads.fa > aln.sam # noisy Nanopore Direct RNA-seq ./minimap2 -ax splice:hq -uf ref.fa query.fa > aln.sam # Final PacBio Iso-seq or traditional cDNA +./minimap2 -ax splice --junc-bed anno.bed12 ref.fa query.fa > aln.sam # prioritize on annotated junctions ./minimap2 -cx asm5 asm1.fa asm2.fa > aln.paf # intra-species asm-to-asm alignment ./minimap2 -x ava-pb reads.fa reads.fa > overlaps.paf # PacBio read overlap ./minimap2 -x ava-ont reads.fa reads.fa > overlaps.paf # Nanopore read overlap @@ -185,6 +186,19 @@ This is because SIRV does not honor the evolutionarily conservative splicing signal. If you are studying SIRV, you may apply `--splice-flank=no` to let minimap2 only model GT..AG, ignoring the additional base. +Since v2.17, minimap2 can optionally take annotated genes as input and +prioritize on annotated splice junctions. To use this feature, you can +```sh +paftools.js gff2bed anno.gff > anno.bed +minimap2 -ax splice --junc-bed anno.bed ref.fa query.fa > aln.sam +``` +Here, `anno.gff` is the gene annotation in the GTF or GFF3 format (`gff2bed` +automatically tests the format). The output of `gff2bed` is in the 12-column +BED format, or the BED12 format. With the `--junc-bed` option, minimap2 adds a +bonus score (tuned by `--junc-bonus`) if an aligned junction matches a junction +in the annotation. Option `--junc-bed` also takes 5-column BED, including the +strand field. In this case, each line indicates an oriented junction. + #### Find overlaps between long reads ```sh diff --git a/minimap2.1 b/minimap2.1 index aeb1492..9e55543 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -391,7 +391,7 @@ BED12 file can be converted from GTF/GFF3 with `paftools.js gff2bed anno.gtf' .BR --junc-bonus \ INT Score bonus for a splice donor or acceptor found in annotation (effective with .BR --junc-bed ) -[0]. +[9]. .TP .BI --end-seed-pen \ INT Drop a terminal anchor if From 1b0bb7b0ba76cd18cdc153cb146ddd8a60bc6dd8 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 11 Mar 2021 19:11:44 -0500 Subject: [PATCH 17/77] require overlap ratio when considering centromere --- misc/paftools.js | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/misc/paftools.js b/misc/paftools.js index 915d0e8..111e96d 100755 --- a/misc/paftools.js +++ b/misc/paftools.js @@ -2604,12 +2604,13 @@ function paf_parseNum(s) { function paf_misjoin(args) { - var c, min_seg_len = 1000000, max_gap = 1000000, fn_cen = null, show_long = false, show_err = false; + var c, min_seg_len = 1000000, max_gap = 1000000, fn_cen = null, show_long = false, show_err = false, cen_ratio = 0.5; var n_diff = [0, 0], n_gap = [0, 0], n_inv = [0, 0], n_inv_end = [0, 0]; - while ((c = getopt(args, "l:g:c:pe")) != null) { + while ((c = getopt(args, "l:g:c:per:")) != null) { if (c == 'l') min_seg_len = paf_parseNum(getopt.arg); else if (c == 'g') max_gap = paf_parseNum(getopt.arg); else if (c == 'c') fn_cen = getopt.arg; + else if (c == 'r') cen_ratio = parseFloat(getopt.arg); else if (c == 'p') show_long = true; else if (c == 'e') show_err = true; } @@ -2617,6 +2618,7 @@ function paf_misjoin(args) print("Usage: paftools.js misjoin [options] "); print("Options:"); print(" -c FILE BED for centromeres []"); + print(" -r FLOAT count a centromeric event if overlap ratio > FLOAT [" + cen_ratio + "]"); print(" -l NUM min alignment block length [1m]"); print(" -g NUM max gap size [1m]"); print(" -e output misjoins not involving centromeres"); @@ -2636,12 +2638,15 @@ function paf_misjoin(args) } function test_cen(cen, chr, st, en) { - var b = cen[chr]; + var b = cen[chr], len = 0; if (b == null) return false; for (var j = 0; j < b.length; ++j) - if (b[j][0] < en && b[j][1] > st) - return true; - return false; + if (b[j][0] < en && b[j][1] > st) { + var s = b[j][0] > st? b[j][0] : st; + var e = b[j][1] < en? b[j][1] : en; + len += e - s; + } + return len < (en - st) * cen_ratio? false : true; } function process(a) { @@ -2672,6 +2677,10 @@ function paf_misjoin(args) var gap = dr > dq? dr - dq : dq - dr; if (gap > max_gap) { if (ov[0] || ov[1]) ++n_gap[1]; + else if (show_err) { + print("G", a[i-1].slice(0, 12).join("\t")); + print("G", a[i].slice(0, 12).join("\t")); + } ++n_gap[0]; } } else if (i + 1 < a.length && a[i+1][4] == a[i-1][4]) { // bracketed inversion From 67dd906a80988dddacc8c551623fdc75b0c12dd2 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 23 Mar 2021 09:12:49 -0400 Subject: [PATCH 18/77] bump travis python version to 3.9 --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index f0764f1..b43ca9c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -19,6 +19,6 @@ matrix: before_install: pip install cython script: python setup.py build_ext - language: python - python: "3.6" + python: "3.9" before_install: pip install cython script: python setup.py build_ext From 2b47846cd6610bf471639db42ac67f5eb4df1dfe Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 5 Apr 2021 11:41:00 -0400 Subject: [PATCH 19/77] r1008: don't parse space in BED --- index.c | 2 +- main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/index.c b/index.c index fd93dd3..b24a505 100644 --- a/index.c +++ b/index.c @@ -654,7 +654,7 @@ mm_idx_intv_t *mm_idx_read_bed(const mm_idx_t *mi, const char *fn, int read_junc char *p, *q, *bl, *bs; int32_t i, id = -1, n_blk = 0; for (p = q = str.s, i = 0;; ++p) { - if (*p == 0 || isspace(*p)) { + if (*p == 0 || *p == '\t') { int32_t c = *p; *p = 0; if (i == 0) { // chr diff --git a/main.c b/main.c index 9b41ff6..aee299a 100644 --- a/main.c +++ b/main.c @@ -7,7 +7,7 @@ #include "mmpriv.h" #include "ketopt.h" -#define MM_VERSION "2.17-r974-dirty" +#define MM_VERSION "2.17-r1008-dirty" #ifdef __linux__ #include From 0b41dd26a23937485c0b6b5a3b32c359f06f8bcd Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 5 Apr 2021 11:43:13 -0400 Subject: [PATCH 20/77] r1009: fixed a compiler warning --- format.c | 4 ++-- main.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/format.c b/format.c index 890335c..11f24b5 100644 --- a/format.c +++ b/format.c @@ -392,7 +392,7 @@ void mm_write_sam3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int se { const int max_bam_cigar_op = 65535; int flag, n_regs = n_regss[seg_idx], cigar_in_tag = 0; - int this_rid = -1, this_pos = -1, this_rev = 0; + int this_rid = -1, this_pos = -1; const mm_reg1_t *regs = regss[seg_idx], *r_prev = NULL, *r_next; const mm_reg1_t *r = n_regs > 0 && reg_idx < n_regs && reg_idx >= 0? ®s[reg_idx] : NULL; @@ -441,7 +441,7 @@ void mm_write_sam3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int se mm_sprintf_lite(s, "\t%s\t%d\t0\t*", mi->seq[this_rid].name, this_pos+1); } else mm_sprintf_lite(s, "\t*\t0\t0\t*"); } else { - this_rid = r->rid, this_pos = r->rs, this_rev = r->rev; + this_rid = r->rid, this_pos = r->rs; mm_sprintf_lite(s, "\t%s\t%d\t%d\t", mi->seq[r->rid].name, r->rs+1, r->mapq); if ((opt_flag & MM_F_LONG_CIGAR) && r->p && r->p->n_cigar > max_bam_cigar_op - 2) { int n_cigar = r->p->n_cigar; diff --git a/main.c b/main.c index aee299a..2af0799 100644 --- a/main.c +++ b/main.c @@ -7,7 +7,7 @@ #include "mmpriv.h" #include "ketopt.h" -#define MM_VERSION "2.17-r1008-dirty" +#define MM_VERSION "2.17-r1009-dirty" #ifdef __linux__ #include From 7db2e8d21af919b6366ebd53360e73c589cda534 Mon Sep 17 00:00:00 2001 From: Marcus Stoiber Date: Fri, 2 Apr 2021 06:31:27 -0700 Subject: [PATCH 21/77] Convert python install from build_ext to setuptools setup_requires. --- setup.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/setup.py b/setup.py index 042911c..4fab496 100644 --- a/setup.py +++ b/setup.py @@ -4,16 +4,6 @@ except ImportError: from distutils.core import setup from distutils.extension import Extension -cmdclass = {} - -try: - from Cython.Build import build_ext -except ImportError: # without Cython - module_src = 'python/mappy.c' -else: # with Cython - module_src = 'python/mappy.pyx' - cmdclass['build_ext'] = build_ext - import sys, platform sys.path.append('python') @@ -42,8 +32,8 @@ setup( license = 'MIT', keywords = 'sequence-alignment', scripts = ['python/minimap2.py'], - ext_modules = [Extension('mappy', - sources = [module_src, 'align.c', 'bseq.c', 'chain.c', 'format.c', 'hit.c', 'index.c', 'pe.c', 'options.c', + ext_modules = [Extension('mappy', + sources = ['python/mappy.pyx', 'align.c', 'bseq.c', 'chain.c', 'format.c', 'hit.c', 'index.c', 'pe.c', 'options.c', 'ksw2_extd2_sse.c', 'ksw2_exts2_sse.c', 'ksw2_extz2_sse.c', 'ksw2_ll_sse.c', 'kalloc.c', 'kthread.c', 'map.c', 'misc.c', 'sdust.c', 'sketch.c', 'esterr.c', 'splitidx.c'], depends = ['minimap.h', 'bseq.h', 'kalloc.h', 'kdq.h', 'khash.h', 'kseq.h', 'ksort.h', @@ -62,4 +52,4 @@ setup( 'Programming Language :: Python :: 3', 'Intended Audience :: Science/Research', 'Topic :: Scientific/Engineering :: Bio-Informatics'], - cmdclass = cmdclass) + setup_requires=["cython"]) From d3dde2fdd4a6bd09be00b2577d4b35ab913ab011 Mon Sep 17 00:00:00 2001 From: Marcus Stoiber Date: Mon, 5 Apr 2021 08:54:08 -0700 Subject: [PATCH 22/77] Convert from spaces to tabs. --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 4fab496..e4959ec 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ setup( license = 'MIT', keywords = 'sequence-alignment', scripts = ['python/minimap2.py'], - ext_modules = [Extension('mappy', + ext_modules = [Extension('mappy', sources = ['python/mappy.pyx', 'align.c', 'bseq.c', 'chain.c', 'format.c', 'hit.c', 'index.c', 'pe.c', 'options.c', 'ksw2_extd2_sse.c', 'ksw2_exts2_sse.c', 'ksw2_extz2_sse.c', 'ksw2_ll_sse.c', 'kalloc.c', 'kthread.c', 'map.c', 'misc.c', 'sdust.c', 'sketch.c', 'esterr.c', 'splitidx.c'], @@ -52,4 +52,4 @@ setup( 'Programming Language :: Python :: 3', 'Intended Audience :: Science/Research', 'Topic :: Scientific/Engineering :: Bio-Informatics'], - setup_requires=["cython"]) + setup_requires=["cython"]) From 827ca4b4612bbf2a9d786566f3dbc98f249648b1 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 7 Apr 2021 23:31:31 -0400 Subject: [PATCH 23/77] r1012: fixed an off-by-one bug; resolves #489 --- align.c | 4 ++-- main.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/align.c b/align.c index 7bfb254..6028abb 100644 --- a/align.c +++ b/align.c @@ -38,8 +38,8 @@ static inline void update_max_zdrop(int32_t score, int i, int j, int32_t *max, i int z = *max - score - diff * e; if (z > *max_zdrop) { *max_zdrop = z; - pos[0][0] = *max_i, pos[0][1] = i + 1; - pos[1][0] = *max_j, pos[1][1] = j + 1; + pos[0][0] = *max_i, pos[0][1] = i; + pos[1][0] = *max_j, pos[1][1] = j; } } else *max = score, *max_i = i, *max_j = j; } diff --git a/main.c b/main.c index 2af0799..c727d70 100644 --- a/main.c +++ b/main.c @@ -7,7 +7,7 @@ #include "mmpriv.h" #include "ketopt.h" -#define MM_VERSION "2.17-r1009-dirty" +#define MM_VERSION "2.17-r1012-dirty" #ifdef __linux__ #include From 507d39af15c311f8efcf762a6bc59b5ea1e0d599 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 8 Apr 2021 13:57:53 -0400 Subject: [PATCH 24/77] r1013: changed to a more accurate similarity est Based on DOI:10.1101/2021.01.15.426881. One minimap2 reviewer suggested the right formula to me but I thought the difference would be insignificant. I was wrong. --- esterr.c | 2 +- main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/esterr.c b/esterr.c index 733c762..7a509da 100644 --- a/esterr.c +++ b/esterr.c @@ -59,6 +59,6 @@ void mm_est_err(const mm_idx_t *mi, int qlen, int n_regs, mm_reg1_t *regs, const n_tot = en - st + 1; if (r->qs > avg_k && r->rs > avg_k) ++n_tot; if (qlen - r->qs > avg_k && l_ref - r->re > avg_k) ++n_tot; - r->div = logf((float)n_tot / n_match) / avg_k; + r->div = n_match >= n_tot? 0.0f : (float)(1.0 - pow((double)n_match / n_tot, 1.0 / avg_k)); } } diff --git a/main.c b/main.c index c727d70..7c9255c 100644 --- a/main.c +++ b/main.c @@ -7,7 +7,7 @@ #include "mmpriv.h" #include "ketopt.h" -#define MM_VERSION "2.17-r1012-dirty" +#define MM_VERSION "2.17-r1013-dirty" #ifdef __linux__ #include From 77abafaaf3110e3ec92579cd58bccdb5f6559ba7 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 9 Apr 2021 13:18:56 -0400 Subject: [PATCH 25/77] prepare for release --- NEWS.md | 86 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ README.md | 5 ++++ 2 files changed, 91 insertions(+) diff --git a/NEWS.md b/NEWS.md index 5c0ab9c..fbfeb60 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,89 @@ +Release 2.18-r1015 (9 April 2021) +--------------------------------- + +This release fixes multiple rare bugs in minimap2 and adds additional +functionality to paftools.js. + +Changes to minimap2: + + * Bugfix: a rare segfault caused by an off-by-one error (#489) + + * Bugfix: minimap2 segfaulted due to an uninitilized variable (#622 and #625). + + * Bugfix: minimap2 parsed spaces as field separators in BED (#721). This led + to issues when the BED name column contains spaces. + + * Bugfix: minimap2 `--split-prefix` did not work with long reference names + (#394). + + * Bugfix: option `--junc-bonus` didn't work (#513) + + * Bugfix: minimap2 didn't return 1 on I/O errors (#532) + + * Bugfix: the `de:f` tag (sequence divergence) could be negative if there were + ambiguous bases + + * Bugfix: fixed two undefined behaviors caused by calling memcpy() on + zero-length blocks (#443) + + * Bugfix: there were duplicated SAM @SQ lines if option `--split-prefix` is in + use (#400 and #527) + + * Bugfix: option -K had to be smaller than 2 billion (#491). This was caused + by a 32-bit integer overflow. + + * Improvement: optionally compile against SIMDe (#597). Minimap2 should work + with IBM POWER CPUs, though this has not been tested. To compile with SIMDe, + please use `make -f Makefile.simde`. + + * Improvement: more informative error message for I/O errors (#454) and for + FASTQ parsing errors (#510) + + * Improvement: abort given malformatted RG line (#541) + + * Improvement: better formula to estimate the `dv:f` tag (approximate sequence + divergence). See DOI:10.1101/2021.01.15.426881. + + * New feature: added the `--mask-len` option to fine control the removal of + redundant hits (#659). The default behavior is unchanged. + +Changes to mappy: + + * Bugfix: mappy caused segmentation fault if the reference index is not + present (#413). + + * Bugfix: fixed a memory leak via 238b6bb3 + + * Change: always require Cython to compile the mappy module (#723). Older + mappy packages at PyPI bundled the C source code generated by Cython such + that end users did not need to install Cython to compile mappy. However, as + Python 3.9 is breaking backward compatibility, older mappy does not work + with Python 3.9 anymore. We have to add this Cython dependency as a + workaround. + +Changes to paftools.js: + + * Bugfix: the "part10-" line from asmgene was wrong (#581) + + * Improvement: compatibility with GTF files from GenBank (#422) + + * New feature: asmgene also checks missing multi-copy genes + + * New feature: added the misjoin command to evaluate large-scale misjoins and + megabase-long inversions. + +Although given the many bug fixes and minor improvements, the core algorithm +stays the same. This version of minimap2 produces nearly identical alignments +to v2.17 except corner cases. + +Now unimap is recommended over minimap2 for aligning long contigs against a +reference genome. It often takes less wall-clock time and is much more +sensitive to long insertions and deletions. + +(2.18: 9 April 2021, r1015) + + + Release 2.17-r941 (4 May 2019) ------------------------------ diff --git a/README.md b/README.md index 65395e6..ae6eac5 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,10 @@ cd minimap2 && make # man page for detailed command line options man ./minimap2.1 ``` +[Unimap][unimap] is recommended for aligning long contigs against a reference +genome. It often takes less wall-clock time and is much more sensitive to long +insertions and deletions. + ## Table of Contents - [Getting Started](#started) @@ -398,3 +402,4 @@ mappy` or [from BioConda][mappyconda] via `conda install -c bioconda mappy`. [manpage-cs]: https://lh3.github.io/minimap2/minimap2.html#10 [doi]: https://doi.org/10.1093/bioinformatics/bty191 [smide]: https://github.com/nemequ/simde +[unimap]: https://github.com/lh3/unimap From b6ff332de1a2f1d0f0f6e9eb9824cad185f4d02b Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 9 Apr 2021 13:33:34 -0400 Subject: [PATCH 26/77] Release minimap2-2.18 (r1015) --- NEWS.md | 2 +- README.md | 4 ++-- cookbook.md | 4 ++-- main.c | 2 +- minimap2.1 | 2 +- misc/paftools.js | 2 +- python/mappy.pyx | 2 +- setup.py | 2 +- 8 files changed, 10 insertions(+), 10 deletions(-) diff --git a/NEWS.md b/NEWS.md index fbfeb60..b823cb5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -74,7 +74,7 @@ Changes to paftools.js: Although given the many bug fixes and minor improvements, the core algorithm stays the same. This version of minimap2 produces nearly identical alignments -to v2.17 except corner cases. +to v2.17 except very rare corner cases. Now unimap is recommended over minimap2 for aligning long contigs against a reference genome. It often takes less wall-clock time and is much more diff --git a/README.md b/README.md index ae6eac5..9b4dfe9 100644 --- a/README.md +++ b/README.md @@ -76,8 +76,8 @@ Detailed evaluations are available from the [minimap2 paper][doi] or the Minimap2 is optimized for x86-64 CPUs. You can acquire precompiled binaries from the [release page][release] with: ```sh -curl -L https://github.com/lh3/minimap2/releases/download/v2.17/minimap2-2.17_x64-linux.tar.bz2 | tar -jxvf - -./minimap2-2.17_x64-linux/minimap2 +curl -L https://github.com/lh3/minimap2/releases/download/v2.18/minimap2-2.18_x64-linux.tar.bz2 | tar -jxvf - +./minimap2-2.18_x64-linux/minimap2 ``` If you want to compile from the source, you need to have a C compiler, GNU make and zlib development files installed. Then type `make` in the source code diff --git a/cookbook.md b/cookbook.md index 6cb4d5f..f963ccc 100644 --- a/cookbook.md +++ b/cookbook.md @@ -31,8 +31,8 @@ To acquire the data used in this cookbook and to install minimap2 and paftools, please follow the command lines below: ```sh # install minimap2 executables -curl -L https://github.com/lh3/minimap2/releases/download/v2.17/minimap2-2.17_x64-linux.tar.bz2 | tar jxf - -cp minimap2-2.17_x64-linux/{minimap2,k8,paftools.js} . # copy executables +curl -L https://github.com/lh3/minimap2/releases/download/v2.18/minimap2-2.18_x64-linux.tar.bz2 | tar jxf - +cp minimap2-2.18_x64-linux/{minimap2,k8,paftools.js} . # copy executables export PATH="$PATH:"`pwd` # put the current directory on PATH # download example datasets curl -L https://github.com/lh3/minimap2/releases/download/v2.10/cookbook-data.tgz | tar zxf - diff --git a/main.c b/main.c index 7c9255c..395c0d9 100644 --- a/main.c +++ b/main.c @@ -7,7 +7,7 @@ #include "mmpriv.h" #include "ketopt.h" -#define MM_VERSION "2.17-r1013-dirty" +#define MM_VERSION "2.18-r1015" #ifdef __linux__ #include diff --git a/minimap2.1 b/minimap2.1 index 9e55543..f132780 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -1,4 +1,4 @@ -.TH minimap2 1 "4 May 2019" "minimap2-2.17 (r941)" "Bioinformatics tools" +.TH minimap2 1 "9 April 2021" "minimap2-2.18 (r1015)" "Bioinformatics tools" .SH NAME .PP minimap2 - mapping and alignment between collections of DNA sequences diff --git a/misc/paftools.js b/misc/paftools.js index 111e96d..13e9b6d 100755 --- a/misc/paftools.js +++ b/misc/paftools.js @@ -1,6 +1,6 @@ #!/usr/bin/env k8 -var paftools_version = '2.17-r982-dirty'; +var paftools_version = '2.18-r1015'; /***************************** ***** Library functions ***** diff --git a/python/mappy.pyx b/python/mappy.pyx index 3273514..fbbc8ab 100644 --- a/python/mappy.pyx +++ b/python/mappy.pyx @@ -3,7 +3,7 @@ from libc.stdlib cimport free cimport cmappy import sys -__version__ = '2.17' +__version__ = '2.18' cmappy.mm_reset_timer() diff --git a/setup.py b/setup.py index e4959ec..960775a 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ def readme(): setup( name = 'mappy', - version = '2.17', + version = '2.18', url = 'https://github.com/lh3/minimap2', description = 'Minimap2 python binding', long_description = readme(), From 9729fa99adab003fdd9a3dda983bfe3555dc988d Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 9 Apr 2021 13:48:16 -0400 Subject: [PATCH 27/77] removed mappy.c --- MANIFEST.in | 1 - 1 file changed, 1 deletion(-) diff --git a/MANIFEST.in b/MANIFEST.in index 599b046..3df1d19 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -4,7 +4,6 @@ include ksw2_dispatch.c include main.c include README.md include sse2neon/emmintrin.h -include python/mappy.c include python/cmappy.h include python/cmappy.pxd include python/mappy.pyx From a9037dc16ccee7992975cf0b4be57ad36f7d9f70 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 11 Apr 2021 01:08:17 -0400 Subject: [PATCH 28/77] r1018: scripts for SV evaluation --- misc/paftools.js | 85 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 68 insertions(+), 17 deletions(-) diff --git a/misc/paftools.js b/misc/paftools.js index 13e9b6d..ff8f0d9 100755 --- a/misc/paftools.js +++ b/misc/paftools.js @@ -1,6 +1,6 @@ #!/usr/bin/env k8 -var paftools_version = '2.18-r1015'; +var paftools_version = '2.18-r1018-dirty'; /***************************** ***** Library functions ***** @@ -2718,9 +2718,30 @@ function paf_misjoin(args) print("# candidate inversions at contig ends: " + n_inv_end.join(",")); } +function _paf_get_alen(t) +{ + var svlen = null, alen = null; + if ((m = /(^|;)SVLEN=(-?\d+)/.exec(t[7])) != null) + svlen = parseInt(m[2]); + var s = t[4].split(","); + var min_abs_diff = 1<<30, max_abs_diff = 0; + if (svlen != null && svlen != 0) + alen = svlen, min_abs_diff = max_abs_diff = svlen > 0? svlen : -svlen; + var rlen = t[3].length; + for (var i = 0; i < s.length; ++i) { + if (/^<\S+>$/.test(s[i])) continue; + var diff = s[i].length - rlen; + var abs_diff = diff > 0? diff : -diff; + min_abs_diff = min_abs_diff < abs_diff? min_abs_diff : abs_diff; + if (max_abs_diff < abs_diff) + max_abs_diff = abs_diff, alen = diff; + } + return [alen, min_abs_diff, max_abs_diff]; +} + function paf_sveval(args) { - var c, min_flt = 30, min_size = 50, max_size = 10000, win_size = 500, print_err = false, bed_fn = null; + var c, min_flt = 30, min_size = 50, max_size = 100000, win_size = 500, print_err = false, bed_fn = null; while ((c = getopt(args, "f:i:x:w:er:")) != null) { if (c == 'f') min_flt = paf_parseNum(getopt.arg); else if (c == 'i') min_size = paf_parseNum(getopt.arg); @@ -2773,22 +2794,13 @@ function paf_sveval(args) if (t[4] == '' || t[4] == '') continue; // no inversion if (/[\[\]]/.test(t[4])) continue; // no break points var st = parseInt(t[1]) - 1, en = st + t[3].length; - if ((m = /((;END)|(^END))=(\d+)/.exec(t[7])) != null) - en = parseInt(m[4]); + if ((m = /(^|;)END=(\d+)/.exec(t[7])) != null) + en = parseInt(m[2]); + if (en < st) en = st; + if (st == en) --st, ++en; if (bed != null && Interval.find_ovlp(bed[t[0]], st, en).length == 0) continue; - // determine svlen - var s = t[4].split(","), max_del = 0, max_ins = 0; - for (var i = 0; i < s.length; ++i) { - var l = s[i].length - t[3].length; - if (l > 0) - max_ins = max_ins > l? max_ins : l; - else if (l < 0) - max_del = max_del > -l? max_del : -l; - } - if (max_ins < min_flt && max_del < min_flt) continue; - var svlen = max_ins > max_del? max_ins : -max_del; - if ((m = /((;SVLEN)|(^SVLEN))=(\d+)/.exec(t[7])) != null) - svlen = parseInt(m[4]); + // parse svlen + var b = _paf_get_alen(t), svlen = b[0]; var abslen = svlen > 0? svlen : -svlen; if (abslen < min_flt || abslen > max_size) continue; // insert @@ -2830,6 +2842,44 @@ function paf_sveval(args) print('F1', ((fn[1] / fn[0] + fp[1] / fp[0]) / 2).toFixed(6)); } +function paf_vcfsel(args) +{ + var c, min_l = 0, max_l = 1<<30; + while ((c = getopt(args, "l:L:")) != null) { + if (c == 'l') min_l = parseInt(getopt.arg); + else if (c == 'L') max_l = parseInt(getopt.arg); + } + + var buf = new Bytes(); + if (getopt.ind == args.length) { + print("Usage: paftools.js vcfsel [options] "); + return 1; + } + var file = args[getopt.ind] == "-"? new File() : new File(args[getopt.ind]); + while (file.readline(buf) >= 0) { + var m, line = buf.toString(); + if (line[0] == '#') { + print(line); + continue; + } + var t = line.split("\t"); + var st = parseInt(t[1]), en = st + t[3].length - 1; + if ((m = /(^|;)END=(\d+)/.exec(t[7])) != null) + en = parseInt(m[2]); + if (en < st) { + warn("END is smaller than POS: " + en + " < " + st); + en = st; + } + var b = _paf_get_alen(t); + var alen = b[0], min_abs_diff = b[1], max_abs_diff = b[2]; + if (max_abs_diff < min_l || min_abs_diff > max_l) + continue; + print(line); + } + file.close(); + buf.destroy(); +} + /************************* ***** main function ***** *************************/ @@ -2885,6 +2935,7 @@ function main(args) else if (cmd == 'ov-eval') paf_ov_eval(args); else if (cmd == 'vcfstat') paf_vcfstat(args); else if (cmd == 'sveval') paf_sveval(args); + else if (cmd == 'vcfsel') paf_vcfsel(args); else if (cmd == 'version') print(paftools_version); else throw Error("unrecognized command: " + cmd); } From b51e8599451475e5a54f58604d18ee758fe66ec2 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 11 Apr 2021 11:47:14 -0400 Subject: [PATCH 29/77] make sure ins/del match --- misc/paftools.js | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/misc/paftools.js b/misc/paftools.js index ff8f0d9..03917ca 100755 --- a/misc/paftools.js +++ b/misc/paftools.js @@ -2826,7 +2826,11 @@ function paf_sveval(args) if (a0 == null) continue; var st = a1[i][0] > win_size? a1[i][0] - win_size : 0; b = Interval.find_ovlp(a0, st, a1[i][1] + win_size); - if (b.length > 0) ++m; + var match = false; + for (var j = 0; j < b.length; ++j) + if (b[j][2] * a1[i][2] > 0) + match = true; + if (match) ++m; else if (print_err) print(label, x, a1[i].slice(0, 3).join("\t")); } } From 9ab95be1bb36d2c1925f3f31b92ea5fbf2bf9d25 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 11 Apr 2021 12:59:21 -0400 Subject: [PATCH 30/77] update END when it is not there --- misc/paftools.js | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/misc/paftools.js b/misc/paftools.js index 03917ca..ce8c886 100755 --- a/misc/paftools.js +++ b/misc/paftools.js @@ -2741,14 +2741,15 @@ function _paf_get_alen(t) function paf_sveval(args) { - var c, min_flt = 30, min_size = 50, max_size = 100000, win_size = 500, print_err = false, bed_fn = null; - while ((c = getopt(args, "f:i:x:w:er:")) != null) { + var c, min_flt = 30, min_size = 50, max_size = 100000, win_size = 500, print_err = false, print_match = false, bed_fn = null; + while ((c = getopt(args, "f:i:x:w:er:p")) != null) { if (c == 'f') min_flt = paf_parseNum(getopt.arg); else if (c == 'i') min_size = paf_parseNum(getopt.arg); else if (c == 'x') max_size = paf_parseNum(getopt.arg); else if (c == 'w') win_size = paf_parseNum(getopt.arg); else if (c == 'r') bed_fn = getopt.arg; else if (c == 'e') print_err = true; + else if (c == 'p') print_match = true; } if (args.length - getopt.ind < 2) { print("Usage: paftools.js sveval [options] "); @@ -2794,15 +2795,18 @@ function paf_sveval(args) if (t[4] == '' || t[4] == '') continue; // no inversion if (/[\[\]]/.test(t[4])) continue; // no break points var st = parseInt(t[1]) - 1, en = st + t[3].length; + // parse svlen + var b = _paf_get_alen(t), svlen = b[0]; + var abslen = svlen == null? 0 : svlen > 0? svlen : -svlen; + if (abslen < min_flt || abslen > max_size) continue; + // update end if ((m = /(^|;)END=(\d+)/.exec(t[7])) != null) en = parseInt(m[2]); + else if (svlen != null && svlen < 0) + en = st + (-svlen); if (en < st) en = st; if (st == en) --st, ++en; if (bed != null && Interval.find_ovlp(bed[t[0]], st, en).length == 0) continue; - // parse svlen - var b = _paf_get_alen(t), svlen = b[0]; - var abslen = svlen > 0? svlen : -svlen; - if (abslen < min_flt || abslen > max_size) continue; // insert if (v[t[0]] == null) v[t[0]] = []; v[t[0]].push([st, en, svlen, abslen]); @@ -2827,9 +2831,12 @@ function paf_sveval(args) var st = a1[i][0] > win_size? a1[i][0] - win_size : 0; b = Interval.find_ovlp(a0, st, a1[i][1] + win_size); var match = false; - for (var j = 0; j < b.length; ++j) + for (var j = 0; j < b.length; ++j) { if (b[j][2] * a1[i][2] > 0) match = true; + if (print_match) + print("MA", x, a1[i].slice(0, 3).join("\t"), b[j].slice(0, 3).join("\t")); + } if (match) ++m; else if (print_err) print(label, x, a1[i].slice(0, 3).join("\t")); } From 86b716448c75bd160c5ff32c4ecfe1bc5fa112e0 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 11 Apr 2021 16:08:29 -0400 Subject: [PATCH 31/77] larger window size for longer INDELs --- misc/paftools.js | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/misc/paftools.js b/misc/paftools.js index ce8c886..dbb8aeb 100755 --- a/misc/paftools.js +++ b/misc/paftools.js @@ -2828,8 +2828,9 @@ function paf_sveval(args) if (a1[i][3] < min_size) continue; ++n; if (a0 == null) continue; - var st = a1[i][0] > win_size? a1[i][0] - win_size : 0; - b = Interval.find_ovlp(a0, st, a1[i][1] + win_size); + var ws = win_size + (a1[i][3]>>1); + var st = a1[i][0] > ws? a1[i][0] - ws : 0; + b = Interval.find_ovlp(a0, st, a1[i][1] + ws); var match = false; for (var j = 0; j < b.length; ++j) { if (b[j][2] * a1[i][2] > 0) From cd9ccfa069a2cdbdaa97296e1927468982939c4f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 11 Apr 2021 22:07:00 -0400 Subject: [PATCH 32/77] r1022: check INDEL lengths in simple cases --- misc/paftools.js | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/misc/paftools.js b/misc/paftools.js index dbb8aeb..9124fb7 100755 --- a/misc/paftools.js +++ b/misc/paftools.js @@ -1,6 +1,6 @@ #!/usr/bin/env k8 -var paftools_version = '2.18-r1018-dirty'; +var paftools_version = '2.18-r1022-dirty'; /***************************** ***** Library functions ***** @@ -2742,11 +2742,13 @@ function _paf_get_alen(t) function paf_sveval(args) { var c, min_flt = 30, min_size = 50, max_size = 100000, win_size = 500, print_err = false, print_match = false, bed_fn = null; - while ((c = getopt(args, "f:i:x:w:er:p")) != null) { + var len_diff_ratio = 0.5; + while ((c = getopt(args, "f:i:x:w:er:pd:")) != null) { if (c == 'f') min_flt = paf_parseNum(getopt.arg); else if (c == 'i') min_size = paf_parseNum(getopt.arg); else if (c == 'x') max_size = paf_parseNum(getopt.arg); else if (c == 'w') win_size = paf_parseNum(getopt.arg); + else if (c == 'd') len_diff_ratio = parseFloat(getopt.arg); else if (c == 'r') bed_fn = getopt.arg; else if (c == 'e') print_err = true; else if (c == 'p') print_match = true; @@ -2759,6 +2761,7 @@ function paf_sveval(args) print(" -i INT min SV length [" + min_size + "]"); print(" -x INT max SV length [" + max_size + "]"); print(" -w INT fuzzy windown size [" + win_size + "]"); + print(" -d FLOAT max allele diff if there is a single allele in the window [" + len_diff_ratio + "]"); print(" -e print errors"); return; } @@ -2831,15 +2834,35 @@ function paf_sveval(args) var ws = win_size + (a1[i][3]>>1); var st = a1[i][0] > ws? a1[i][0] - ws : 0; b = Interval.find_ovlp(a0, st, a1[i][1] + ws); - var match = false; + var n_ins = 0, n_del = 0, sv_del = null, sv_ins = null; for (var j = 0; j < b.length; ++j) { - if (b[j][2] * a1[i][2] > 0) - match = true; + if (b[j][2] < 0) ++n_del, sv_del = -b[j][2]; + else if (b[j][2] > 0) ++n_ins, sv_ins = b[j][2]; if (print_match) print("MA", x, a1[i].slice(0, 3).join("\t"), b[j].slice(0, 3).join("\t")); } + var match = false; + if (a1[i][2] > 0) { // insertion + if (n_ins == 1) { + var diff = sv_ins - a1[i][3]; + if (diff < 0) diff = -diff; + if (diff < min_size || diff / a1[i][3] < len_diff_ratio) + match = true; + } else if (n_ins > 1) match = true; // multiple insertions; ambiguous + } else if (a1[i][2] < 0) { + if (n_del == 1) { // deletion + var diff = sv_del - a1[i][3]; + if (diff < 0) diff = -diff; + if (diff < min_size || diff / a1[i][3] < len_diff_ratio) + match = true; + } else if (n_del > 1) match = true; // multiple deletions; ambiguous + } if (match) ++m; - else if (print_err) print(label, x, a1[i].slice(0, 3).join("\t")); + else if (print_err) { + if ((a1[i][2] > 0 && n_ins > 0) || (a1[i][2] < 0 && n_del > 0)) + print("MM", x, a1[i].slice(0, 3).join("\t")); + print(label, x, a1[i].slice(0, 3).join("\t")); + } } } return [n, m]; From ba52c7902479b140f5450c88b305535214e24fa2 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 22 Apr 2021 17:30:30 -0400 Subject: [PATCH 33/77] added code of conduct --- code_of_conduct.md | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 code_of_conduct.md diff --git a/code_of_conduct.md b/code_of_conduct.md new file mode 100644 index 0000000..b7175c1 --- /dev/null +++ b/code_of_conduct.md @@ -0,0 +1,30 @@ +## Contributor Code of Conduct + +As contributors and maintainers of this project, we pledge to respect all +people who contribute through reporting issues, posting feature requests, +updating documentation, submitting pull requests or patches, and other +activities. + +We are committed to making participation in this project a harassment-free +experience for everyone, regardless of level of experience, gender, gender +identity and expression, sexual orientation, disability, personal appearance, +body size, race, age, or religion. + +Examples of unacceptable behavior by participants include the use of sexual +language or imagery, derogatory comments or personal attacks, trolling, public +or private harassment, insults, or other unprofessional conduct. + +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct. Project maintainers or +contributors who do not follow the Code of Conduct may be removed from the +project team. + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by opening an issue or contacting the maintainer via email. + +This Code of Conduct is adapted from the [Contributor Covenant][cc], [version +1.0.0][v1]. + +[cc]: http://contributor-covenant.org/ +[v1]: http://contributor-covenant.org/version/1/0/0/ From cdbd96be0c5d6d139546f51ad2230ce3125eac7f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 30 Apr 2021 11:24:53 -0400 Subject: [PATCH 34/77] a bit refactoring for future changes --- map.c | 25 +++++++++---------------- mmpriv.h | 8 ++++++++ 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/map.c b/map.c index 0812783..89f7818 100644 --- a/map.c +++ b/map.c @@ -80,21 +80,14 @@ static void collect_minimizers(void *km, const mm_mapopt_t *opt, const mm_idx_t #define heap_lt(a, b) ((a).x > (b).x) KSORT_INIT(heap, mm128_t, heap_lt) -typedef struct { - uint32_t n; - uint32_t q_pos, q_span; - uint32_t seg_id:31, is_tandem:1; - const uint64_t *cr; -} mm_match_t; - -static mm_match_t *collect_matches(void *km, int *_n_m, int max_occ, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos) +static mm_seed_t *collect_matches(void *km, int *_n_m, int max_occ, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos) { int rep_st = 0, rep_en = 0, n_m; size_t i; - mm_match_t *m; + mm_seed_t *m; *n_mini_pos = 0; *mini_pos = (uint64_t*)kmalloc(km, mv->n * sizeof(uint64_t)); - m = (mm_match_t*)kmalloc(km, mv->n * sizeof(mm_match_t)); + m = (mm_seed_t*)kmalloc(km, mv->n * sizeof(mm_seed_t)); for (i = 0, n_m = 0, *rep_len = 0, *n_a = 0; i < mv->n; ++i) { const uint64_t *cr; mm128_t *p = &mv->a[i]; @@ -108,7 +101,7 @@ static mm_match_t *collect_matches(void *km, int *_n_m, int max_occ, const mm_id rep_st = st, rep_en = en; } else rep_en = en; } else { - mm_match_t *q = &m[n_m++]; + mm_seed_t *q = &m[n_m++]; q->q_pos = q_pos, q->q_span = q_span, q->cr = cr, q->n = t, q->seg_id = p->y >> 32; q->is_tandem = 0; if (i > 0 && p->x>>8 == mv->a[i - 1].x>>8) q->is_tandem = 1; @@ -122,7 +115,7 @@ static mm_match_t *collect_matches(void *km, int *_n_m, int max_occ, const mm_id return m; } -static inline int skip_seed(int flag, uint64_t r, const mm_match_t *q, const char *qname, int qlen, const mm_idx_t *mi, int *is_self) +static inline int skip_seed(int flag, uint64_t r, const mm_seed_t *q, const char *qname, int qlen, const mm_idx_t *mi, int *is_self) { *is_self = 0; if (qname && (flag & (MM_F_NO_DIAG|MM_F_NO_DUAL))) { @@ -151,7 +144,7 @@ static mm128_t *collect_seed_hits_heap(void *km, const mm_mapopt_t *opt, int max { int i, n_m, heap_size = 0; int64_t j, n_for = 0, n_rev = 0; - mm_match_t *m; + mm_seed_t *m; mm128_t *a, *heap; m = collect_matches(km, &n_m, max_occ, mi, mv, n_a, rep_len, n_mini_pos, mini_pos); @@ -168,7 +161,7 @@ static mm128_t *collect_seed_hits_heap(void *km, const mm_mapopt_t *opt, int max } ks_heapmake_heap(heap_size, heap); while (heap_size > 0) { - mm_match_t *q = &m[heap->y>>32]; + mm_seed_t *q = &m[heap->y>>32]; mm128_t *p; uint64_t r = heap->x; int32_t is_self, rpos = (uint32_t)r >> 1; @@ -216,12 +209,12 @@ static mm128_t *collect_seed_hits(void *km, const mm_mapopt_t *opt, int max_occ, int *n_mini_pos, uint64_t **mini_pos) { int i, n_m; - mm_match_t *m; + mm_seed_t *m; mm128_t *a; m = collect_matches(km, &n_m, max_occ, mi, mv, n_a, rep_len, n_mini_pos, mini_pos); a = (mm128_t*)kmalloc(km, *n_a * sizeof(mm128_t)); for (i = 0, *n_a = 0; i < n_m; ++i) { - mm_match_t *q = &m[i]; + mm_seed_t *q = &m[i]; const uint64_t *r = q->cr; uint32_t k; for (k = 0; k < q->n; ++k) { diff --git a/mmpriv.h b/mmpriv.h index 1eebb5c..7583001 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -36,6 +36,14 @@ extern "C" { #endif +typedef struct { + uint32_t n; + uint32_t q_pos; + uint32_t q_span:31, flt:1; + uint32_t seg_id:31, is_tandem:1; + const uint64_t *cr; +} mm_seed_t; + typedef struct { int n_u, n_a; uint64_t *u; From feb92d32ea16692c104fe351a579c2c1d4ada27d Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 30 Apr 2021 17:33:16 -0400 Subject: [PATCH 35/77] r1025: seed rescuring --- Makefile | 37 ++++++++++++--------- main.c | 5 +-- map.c | 39 ++-------------------- minimap.h | 2 +- misc.c | 1 + mmpriv.h | 2 ++ options.c | 2 ++ seed.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 130 insertions(+), 56 deletions(-) create mode 100644 seed.c diff --git a/Makefile b/Makefile index 18622f5..8bb0890 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,9 @@ CFLAGS= -g -Wall -O2 -Wc++-compat #-Wextra CPPFLAGS= -DHAVE_KALLOC INCLUDES= -OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o chain.o align.o hit.o map.o format.o pe.o esterr.o splitidx.o ksw2_ll_sse.o +OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o \ + chain.o align.o hit.o seed.o map.o format.o pe.o esterr.o splitidx.o \ + ksw2_ll_sse.o PROG= minimap2 PROG_EXTRA= sdust minimap2-lite LIBS= -lm -lz -lpthread @@ -103,26 +105,29 @@ depend: # DO NOT DELETE -align.o: minimap.h mmpriv.h bseq.h ksw2.h kalloc.h +align.o: minimap.h mmpriv.h bseq.h kseq.h ksw2.h kalloc.h bseq.o: bseq.h kvec.h kalloc.h kseq.h -chain.o: minimap.h mmpriv.h bseq.h kalloc.h -esterr.o: mmpriv.h minimap.h bseq.h +chain.o: minimap.h mmpriv.h bseq.h kseq.h kalloc.h +esterr.o: mmpriv.h minimap.h bseq.h kseq.h example.o: minimap.h kseq.h -format.o: kalloc.h mmpriv.h minimap.h bseq.h -hit.o: mmpriv.h minimap.h bseq.h kalloc.h khash.h -index.o: kthread.h bseq.h minimap.h mmpriv.h kvec.h kalloc.h khash.h +format.o: kalloc.h mmpriv.h minimap.h bseq.h kseq.h +hit.o: mmpriv.h minimap.h bseq.h kseq.h kalloc.h khash.h +index.o: kthread.h bseq.h minimap.h mmpriv.h kseq.h kvec.h kalloc.h khash.h +index.o: ksort.h kalloc.o: kalloc.h ksw2_extd2_sse.o: ksw2.h kalloc.h ksw2_exts2_sse.o: ksw2.h kalloc.h ksw2_extz2_sse.o: ksw2.h kalloc.h ksw2_ll_sse.o: ksw2.h kalloc.h kthread.o: kthread.h -main.o: bseq.h minimap.h mmpriv.h ketopt.h -map.o: kthread.h kvec.h kalloc.h sdust.h mmpriv.h minimap.h bseq.h khash.h -map.o: ksort.h -misc.o: mmpriv.h minimap.h bseq.h ksort.h -options.o: mmpriv.h minimap.h bseq.h -pe.o: mmpriv.h minimap.h bseq.h kvec.h kalloc.h ksort.h -sdust.o: kalloc.h kdq.h kvec.h ketopt.h sdust.h -sketch.o: kvec.h kalloc.h mmpriv.h minimap.h bseq.h -splitidx.o: mmpriv.h minimap.h bseq.h +main.o: bseq.h minimap.h mmpriv.h kseq.h ketopt.h +map.o: kthread.h kvec.h kalloc.h sdust.h mmpriv.h minimap.h bseq.h kseq.h +map.o: khash.h ksort.h +misc.o: mmpriv.h minimap.h bseq.h kseq.h ksort.h +options.o: mmpriv.h minimap.h bseq.h kseq.h +pe.o: mmpriv.h minimap.h bseq.h kseq.h kvec.h kalloc.h ksort.h +sdust.o: kalloc.h kdq.h kvec.h sdust.h +seed.o: mmpriv.h minimap.h bseq.h kseq.h kalloc.h +self-chain.o: minimap.h kseq.h +sketch.o: kvec.h kalloc.h mmpriv.h minimap.h bseq.h kseq.h +splitidx.o: mmpriv.h minimap.h bseq.h kseq.h diff --git a/main.c b/main.c index 395c0d9..0e95bbc 100644 --- a/main.c +++ b/main.c @@ -7,7 +7,7 @@ #include "mmpriv.h" #include "ketopt.h" -#define MM_VERSION "2.18-r1015" +#define MM_VERSION "2.18-r1025-dirty" #ifdef __linux__ #include @@ -108,7 +108,7 @@ static inline void yes_or_no(mm_mapopt_t *opt, int flag, int long_idx, const cha int main(int argc, char *argv[]) { - const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:"; + const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:"; ketopt_t o = KETOPT_INIT; mm_mapopt_t opt; mm_idxopt_t ipt; @@ -171,6 +171,7 @@ int main(int argc, char *argv[]) else if (c == 'C') opt.noncan = atoi(o.arg); else if (c == 'I') ipt.batch_size = mm_parse_num(o.arg); else if (c == 'K') opt.mini_batch_size = mm_parse_num(o.arg); + else if (c == 'e') opt.occ_dist = mm_parse_num(o.arg); else if (c == 'R') rg = o.arg; else if (c == 'h') fp_help = stdout; else if (c == '2') opt.flag |= MM_F_2_IO_THREADS; diff --git a/map.c b/map.c index 89f7818..b4b07d1 100644 --- a/map.c +++ b/map.c @@ -80,41 +80,6 @@ static void collect_minimizers(void *km, const mm_mapopt_t *opt, const mm_idx_t #define heap_lt(a, b) ((a).x > (b).x) KSORT_INIT(heap, mm128_t, heap_lt) -static mm_seed_t *collect_matches(void *km, int *_n_m, int max_occ, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos) -{ - int rep_st = 0, rep_en = 0, n_m; - size_t i; - mm_seed_t *m; - *n_mini_pos = 0; - *mini_pos = (uint64_t*)kmalloc(km, mv->n * sizeof(uint64_t)); - m = (mm_seed_t*)kmalloc(km, mv->n * sizeof(mm_seed_t)); - for (i = 0, n_m = 0, *rep_len = 0, *n_a = 0; i < mv->n; ++i) { - const uint64_t *cr; - mm128_t *p = &mv->a[i]; - uint32_t q_pos = (uint32_t)p->y, q_span = p->x & 0xff; - int t; - cr = mm_idx_get(mi, p->x>>8, &t); - if (t >= max_occ) { - int en = (q_pos >> 1) + 1, st = en - q_span; - if (st > rep_en) { - *rep_len += rep_en - rep_st; - rep_st = st, rep_en = en; - } else rep_en = en; - } else { - mm_seed_t *q = &m[n_m++]; - q->q_pos = q_pos, q->q_span = q_span, q->cr = cr, q->n = t, q->seg_id = p->y >> 32; - q->is_tandem = 0; - if (i > 0 && p->x>>8 == mv->a[i - 1].x>>8) q->is_tandem = 1; - if (i < mv->n - 1 && p->x>>8 == mv->a[i + 1].x>>8) q->is_tandem = 1; - *n_a += q->n; - (*mini_pos)[(*n_mini_pos)++] = (uint64_t)q_span<<32 | q_pos>>1; - } - } - *rep_len += rep_en - rep_st; - *_n_m = n_m; - return m; -} - static inline int skip_seed(int flag, uint64_t r, const mm_seed_t *q, const char *qname, int qlen, const mm_idx_t *mi, int *is_self) { *is_self = 0; @@ -147,7 +112,7 @@ static mm128_t *collect_seed_hits_heap(void *km, const mm_mapopt_t *opt, int max mm_seed_t *m; mm128_t *a, *heap; - m = collect_matches(km, &n_m, max_occ, mi, mv, n_a, rep_len, n_mini_pos, mini_pos); + m = mm_collect_matches(km, &n_m, qlen, max_occ, opt->max_max_occ, opt->occ_dist, mi, mv, n_a, rep_len, n_mini_pos, mini_pos); heap = (mm128_t*)kmalloc(km, n_m * sizeof(mm128_t)); a = (mm128_t*)kmalloc(km, *n_a * sizeof(mm128_t)); @@ -211,7 +176,7 @@ static mm128_t *collect_seed_hits(void *km, const mm_mapopt_t *opt, int max_occ, int i, n_m; mm_seed_t *m; mm128_t *a; - m = collect_matches(km, &n_m, max_occ, mi, mv, n_a, rep_len, n_mini_pos, mini_pos); + m = mm_collect_matches(km, &n_m, qlen, max_occ, opt->max_max_occ, opt->occ_dist, mi, mv, n_a, rep_len, n_mini_pos, mini_pos); a = (mm128_t*)kmalloc(km, *n_a * sizeof(mm128_t)); for (i = 0, *n_a = 0; i < n_m; ++i) { mm_seed_t *q = &m[i]; diff --git a/minimap.h b/minimap.h index a00869e..9b7b4ae 100644 --- a/minimap.h +++ b/minimap.h @@ -148,7 +148,7 @@ typedef struct { float mid_occ_frac; // only used by mm_mapopt_update(); see below int32_t min_mid_occ; int32_t mid_occ; // ignore seeds with occurrences above this threshold - int32_t max_occ; + int32_t max_occ, max_max_occ, occ_dist; int64_t mini_batch_size; // size of a batch of query bases to process in parallel int64_t max_sw_mat; diff --git a/misc.c b/misc.c index ddfa9de..f3a29f8 100644 --- a/misc.c +++ b/misc.c @@ -159,3 +159,4 @@ KRADIX_SORT_INIT(128x, mm128_t, sort_key_128x, 8) KRADIX_SORT_INIT(64, uint64_t, sort_key_64, 8) KSORT_INIT_GENERIC(uint32_t) +KSORT_INIT_GENERIC(uint64_t) diff --git a/mmpriv.h b/mmpriv.h index 7583001..d2d8641 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -60,6 +60,8 @@ uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk); void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p); +mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int max_max_occ, int dist, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos); + int mm_write_sam_hdr(const mm_idx_t *mi, const char *rg, const char *ver, int argc, char *argv[]); void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag); void mm_write_paf3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag, int rep_len); diff --git a/options.c b/options.c index d4eafaa..7b87f1c 100644 --- a/options.c +++ b/options.c @@ -26,6 +26,8 @@ void mm_mapopt_init(mm_mapopt_t *opt) opt->max_chain_skip = 25; opt->max_chain_iter = 5000; opt->chain_gap_scale = 1.0f; + opt->max_max_occ = 5000; + opt->occ_dist = 0; opt->mask_level = 0.5f; opt->mask_len = INT_MAX; diff --git a/seed.c b/seed.c new file mode 100644 index 0000000..27f7ffe --- /dev/null +++ b/seed.c @@ -0,0 +1,98 @@ +#include "mmpriv.h" +#include "kalloc.h" + +mm_seed_t *mm_seed_collect_all(void *km, const mm_idx_t *mi, const mm128_v *mv) +{ + mm_seed_t *m; + size_t i; + m = (mm_seed_t*)kmalloc(km, mv->n * sizeof(mm_seed_t)); + for (i = 0; i < mv->n; ++i) { + const uint64_t *cr; + mm_seed_t *q = &m[i]; + mm128_t *p = &mv->a[i]; + uint32_t q_pos = (uint32_t)p->y, q_span = p->x & 0xff; + int t; + cr = mm_idx_get(mi, p->x>>8, &t); + q->q_pos = q_pos, q->q_span = q_span, q->cr = cr, q->n = t, q->seg_id = p->y >> 32; + q->is_tandem = q->flt = 0; + if (i > 0 && p->x>>8 == mv->a[i - 1].x>>8) q->is_tandem = 1; + if (i < mv->n - 1 && p->x>>8 == mv->a[i + 1].x>>8) q->is_tandem = 1; + } + return m; +} + +#define MAX_MAX_HIGH_OCC 128 + +void mm_seed_select(int32_t n, mm_seed_t *a, int len, int max_occ, int max_max_occ, int dist) +{ // for high-occ minimizers, choose up to max_high_occ in each high-occ streak + extern void ks_heapdown_uint64_t(size_t i, size_t n, uint64_t*); + extern void ks_heapmake_uint64_t(size_t n, uint64_t*); + int32_t i, last0, m; + uint64_t b[MAX_MAX_HIGH_OCC]; // this is to avoid a heap allocation + + if (n == 0 || n == 1) return; + for (i = m = 0; i < n; ++i) + if (a[i].n > max_occ) ++m; + if (m == 0) return; // no high-frequency k-mers; do nothing + for (i = 0, last0 = -1; i <= n; ++i) { + if (i == n || a[i].n <= max_occ) { + if (i - last0 > 1) { + int32_t ps = last0 < 0? 0 : (uint32_t)a[last0].q_pos; + int32_t pe = i == n? len : (uint32_t)a[i].q_pos; + int32_t j, k, st = last0 + 1, en = i; + int32_t max_high_occ = (int32_t)((double)(pe - ps) / dist + .499); + if (max_high_occ > MAX_MAX_HIGH_OCC) + max_high_occ = MAX_MAX_HIGH_OCC; + for (j = st, k = 0; j < en && k < max_high_occ; ++j, ++k) + b[k] = (uint64_t)a[j].n<<32 | j; + ks_heapmake_uint64_t(k, b); // initialize the binomial heap + for (; j < en; ++j) { // if there are more, choose top max_high_occ + if (a[j].n < (uint32_t)b[0]) { // then update the heap + b[0] = (b[0]&0xFFFFFFFF00000000ULL) | j; + ks_heapdown_uint64_t(0, k, b); + } + } + for (j = 0; j < k; ++j) a[(uint32_t)b[j]].flt = 1; + for (j = st; j < en; ++j) a[j].flt ^= 1; + for (j = st; j < en; ++j) + if (a[j].n > max_max_occ) + a[j].flt = 1; + } + last0 = i; + } + } +} + +mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int max_max_occ, int dist, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos) +{ + int rep_st = 0, rep_en = 0, n_m; + size_t i; + mm_seed_t *m; + *n_mini_pos = 0; + *mini_pos = (uint64_t*)kmalloc(km, mv->n * sizeof(uint64_t)); + m = mm_seed_collect_all(km, mi, mv); + if (dist > 0 && max_max_occ > max_occ) { + mm_seed_select(mv->n, m, qlen, max_occ, max_max_occ, dist); + } else { + for (i = 0; i < mv->n; ++i) + if (m[i].n > max_occ) + m[i].flt = 1; + } + for (i = 0, n_m = 0, *rep_len = 0, *n_a = 0; i < mv->n; ++i) { + mm_seed_t *q = &m[i]; + if (q->flt) { + int en = (q->q_pos >> 1) + 1, st = en - q->q_span; + if (st > rep_en) { + *rep_len += rep_en - rep_st; + rep_st = st, rep_en = en; + } else rep_en = en; + } else { + *n_a += q->n; + (*mini_pos)[(*n_mini_pos)++] = (uint64_t)q->q_span<<32 | q->q_pos>>1; + m[n_m++] = *q; + } + } + *rep_len += rep_en - rep_st; + *_n_m = n_m; + return m; +} From 7edeec67b02665843c23cf7fc93bed24f6ab18d3 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 1 May 2021 15:07:56 -0400 Subject: [PATCH 36/77] r1026: fixed bugs in seed sampling add hifi --- main.c | 2 +- options.c | 8 +++++++- seed.c | 31 +++++++++++++++++++------------ 3 files changed, 27 insertions(+), 14 deletions(-) diff --git a/main.c b/main.c index 0e95bbc..1dae0ac 100644 --- a/main.c +++ b/main.c @@ -7,7 +7,7 @@ #include "mmpriv.h" #include "ketopt.h" -#define MM_VERSION "2.18-r1025-dirty" +#define MM_VERSION "2.18-r1026-dirty" #ifdef __linux__ #include diff --git a/options.c b/options.c index 7b87f1c..45212dd 100644 --- a/options.c +++ b/options.c @@ -26,7 +26,7 @@ void mm_mapopt_init(mm_mapopt_t *opt) opt->max_chain_skip = 25; opt->max_chain_iter = 5000; opt->chain_gap_scale = 1.0f; - opt->max_max_occ = 5000; + opt->max_max_occ = 4095; opt->occ_dist = 0; opt->mask_level = 0.5f; @@ -91,6 +91,12 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) io->flag |= MM_I_HPC, io->k = 19; } else if (strcmp(preset, "map-ont") == 0) { io->flag = 0, io->k = 15; + } else if (strcmp(preset, "hifi") == 0 || strcmp(preset, "ccs") == 0) { + io->flag = 0, io->k = 19, io->w = 19; + mo->a = 1, mo->b = 4, mo->q = 6, mo->q2 = 26, mo->e = 2, mo->e2 = 1; + mo->occ_dist = 100; + mo->min_mid_occ = 100; + mo->min_dp_max = 200; } else if (strcmp(preset, "asm5") == 0) { io->flag = 0, io->k = 19, io->w = 19; mo->a = 1, mo->b = 19, mo->q = 39, mo->q2 = 81, mo->e = 3, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200; diff --git a/seed.c b/seed.c index 27f7ffe..bd250d9 100644 --- a/seed.c +++ b/seed.c @@ -1,23 +1,28 @@ #include "mmpriv.h" #include "kalloc.h" +#include "ksort.h" -mm_seed_t *mm_seed_collect_all(void *km, const mm_idx_t *mi, const mm128_v *mv) +mm_seed_t *mm_seed_collect_all(void *km, const mm_idx_t *mi, const mm128_v *mv, int32_t *n_m_) { mm_seed_t *m; size_t i; + int32_t k; m = (mm_seed_t*)kmalloc(km, mv->n * sizeof(mm_seed_t)); - for (i = 0; i < mv->n; ++i) { + for (i = k = 0; i < mv->n; ++i) { const uint64_t *cr; - mm_seed_t *q = &m[i]; + mm_seed_t *q; mm128_t *p = &mv->a[i]; uint32_t q_pos = (uint32_t)p->y, q_span = p->x & 0xff; int t; cr = mm_idx_get(mi, p->x>>8, &t); + if (t == 0) continue; + q = &m[k++]; q->q_pos = q_pos, q->q_span = q_span, q->cr = cr, q->n = t, q->seg_id = p->y >> 32; q->is_tandem = q->flt = 0; if (i > 0 && p->x>>8 == mv->a[i - 1].x>>8) q->is_tandem = 1; if (i < mv->n - 1 && p->x>>8 == mv->a[i + 1].x>>8) q->is_tandem = 1; } + *n_m_ = k; return m; } @@ -37,18 +42,19 @@ void mm_seed_select(int32_t n, mm_seed_t *a, int len, int max_occ, int max_max_o for (i = 0, last0 = -1; i <= n; ++i) { if (i == n || a[i].n <= max_occ) { if (i - last0 > 1) { - int32_t ps = last0 < 0? 0 : (uint32_t)a[last0].q_pos; - int32_t pe = i == n? len : (uint32_t)a[i].q_pos; + int32_t ps = last0 < 0? 0 : (uint32_t)a[last0].q_pos>>1; + int32_t pe = i == n? len : (uint32_t)a[i].q_pos>>1; int32_t j, k, st = last0 + 1, en = i; int32_t max_high_occ = (int32_t)((double)(pe - ps) / dist + .499); + //fprintf(stderr, "Y\t%d\t%d\n", ps, pe); if (max_high_occ > MAX_MAX_HIGH_OCC) max_high_occ = MAX_MAX_HIGH_OCC; for (j = st, k = 0; j < en && k < max_high_occ; ++j, ++k) b[k] = (uint64_t)a[j].n<<32 | j; ks_heapmake_uint64_t(k, b); // initialize the binomial heap for (; j < en; ++j) { // if there are more, choose top max_high_occ - if (a[j].n < (uint32_t)b[0]) { // then update the heap - b[0] = (b[0]&0xFFFFFFFF00000000ULL) | j; + if (a[j].n < (int32_t)(b[0]>>32)) { // then update the heap + b[0] = (uint64_t)a[j].n<<32 | j; ks_heapdown_uint64_t(0, k, b); } } @@ -65,21 +71,22 @@ void mm_seed_select(int32_t n, mm_seed_t *a, int len, int max_occ, int max_max_o mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int max_max_occ, int dist, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos) { - int rep_st = 0, rep_en = 0, n_m; + int rep_st = 0, rep_en = 0, n_m, n_m0; size_t i; mm_seed_t *m; *n_mini_pos = 0; *mini_pos = (uint64_t*)kmalloc(km, mv->n * sizeof(uint64_t)); - m = mm_seed_collect_all(km, mi, mv); + m = mm_seed_collect_all(km, mi, mv, &n_m0); if (dist > 0 && max_max_occ > max_occ) { - mm_seed_select(mv->n, m, qlen, max_occ, max_max_occ, dist); + mm_seed_select(n_m0, m, qlen, max_occ, max_max_occ, dist); } else { - for (i = 0; i < mv->n; ++i) + for (i = 0; i < n_m0; ++i) if (m[i].n > max_occ) m[i].flt = 1; } - for (i = 0, n_m = 0, *rep_len = 0, *n_a = 0; i < mv->n; ++i) { + for (i = 0, n_m = 0, *rep_len = 0, *n_a = 0; i < n_m0; ++i) { mm_seed_t *q = &m[i]; + //fprintf(stderr, "X\t%d\t%d\t%d\n", q->q_pos>>1, q->n, q->flt); if (q->flt) { int en = (q->q_pos >> 1) + 1, st = en - q->q_span; if (st > rep_en) { From e8823a37098dcc98c28db2108ae785aacdd5aaad Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 1 May 2021 15:22:51 -0400 Subject: [PATCH 37/77] r1027: renamed hifi to map-hifi; changed default --- main.c | 2 +- minimap2.1 | 8 +++++++- options.c | 5 +++-- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/main.c b/main.c index 1dae0ac..160801f 100644 --- a/main.c +++ b/main.c @@ -7,7 +7,7 @@ #include "mmpriv.h" #include "ketopt.h" -#define MM_VERSION "2.18-r1026-dirty" +#define MM_VERSION "2.18-r1027-dirty" #ifdef __linux__ #include diff --git a/minimap2.1 b/minimap2.1 index f132780..018d958 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -523,7 +523,7 @@ Available .I STR are: .RS -.TP 8 +.TP 9 .B map-pb PacBio/Oxford Nanopore read to reference mapping .RB ( -Hk19 ) @@ -536,6 +536,12 @@ more sensitive results in comparison to normal minimizers. For Oxford Nanopore data, normal minimizers are better, though not much. The effectiveness of HPC is determined by the sequencing error mode. .TP +.B map-hifi +PacBio HiFi reads to reference mapping +.RB ( -k19 +.B -w10 -A1 -B4 -O6,26 -E2,1 -s200 -e100 -g10k +.BR --min-occ-floor=100 ). +.TP .B asm5 Long assembly to reference mapping .RB ( -k19 diff --git a/options.c b/options.c index 45212dd..a597962 100644 --- a/options.c +++ b/options.c @@ -91,10 +91,11 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) io->flag |= MM_I_HPC, io->k = 19; } else if (strcmp(preset, "map-ont") == 0) { io->flag = 0, io->k = 15; - } else if (strcmp(preset, "hifi") == 0 || strcmp(preset, "ccs") == 0) { + } else if (strcmp(preset, "map-hifi") == 0 || strcmp(preset, "map-ccs") == 0) { io->flag = 0, io->k = 19, io->w = 19; mo->a = 1, mo->b = 4, mo->q = 6, mo->q2 = 26, mo->e = 2, mo->e2 = 1; - mo->occ_dist = 100; + mo->max_gap = 10000; + mo->occ_dist = 500; mo->min_mid_occ = 100; mo->min_dp_max = 200; } else if (strcmp(preset, "asm5") == 0) { From 0f5608c4a43d733af3c458220eb862ddd7a072b5 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 1 May 2021 15:41:39 -0400 Subject: [PATCH 38/77] r1028: backport minigraph -U --- main.c | 7 +++++-- minimap.h | 2 +- minimap2.1 | 2 +- options.c | 5 ++++- 4 files changed, 11 insertions(+), 5 deletions(-) diff --git a/main.c b/main.c index 160801f..5234e86 100644 --- a/main.c +++ b/main.c @@ -7,7 +7,7 @@ #include "mmpriv.h" #include "ketopt.h" -#define MM_VERSION "2.18-r1027-dirty" +#define MM_VERSION "2.18-r1028-dirty" #ifdef __linux__ #include @@ -108,7 +108,7 @@ static inline void yes_or_no(mm_mapopt_t *opt, int flag, int long_idx, const cha int main(int argc, char *argv[]) { - const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:"; + const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:"; ketopt_t o = KETOPT_INIT; mm_mapopt_t opt; mm_idxopt_t ipt; @@ -248,6 +248,9 @@ int main(int argc, char *argv[]) } else if (c == 'V') { puts(MM_VERSION); return 0; + } else if (c == 'U') { + opt.min_mid_occ = strtol(o.arg, &s, 10); + if (*s == ',') opt.max_mid_occ = strtol(s + 1, &s, 10); } else if (c == 'f') { double x; char *p; diff --git a/minimap.h b/minimap.h index 9b7b4ae..543a0ed 100644 --- a/minimap.h +++ b/minimap.h @@ -146,7 +146,7 @@ typedef struct { int pe_ori, pe_bonus; float mid_occ_frac; // only used by mm_mapopt_update(); see below - int32_t min_mid_occ; + int32_t min_mid_occ, max_mid_occ; int32_t mid_occ; // ignore seeds with occurrences above this threshold int32_t max_occ, max_max_occ, occ_dist; int64_t mini_batch_size; // size of a batch of query bases to process in parallel diff --git a/minimap2.1 b/minimap2.1 index 018d958..66ea41a 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -540,7 +540,7 @@ is determined by the sequencing error mode. PacBio HiFi reads to reference mapping .RB ( -k19 .B -w10 -A1 -B4 -O6,26 -E2,1 -s200 -e100 -g10k -.BR --min-occ-floor=100 ). +.BR -U100,500 ). .TP .B asm5 Long assembly to reference mapping diff --git a/options.c b/options.c index a597962..2044b62 100644 --- a/options.c +++ b/options.c @@ -16,6 +16,7 @@ void mm_mapopt_init(mm_mapopt_t *opt) memset(opt, 0, sizeof(mm_mapopt_t)); opt->seed = 11; opt->mid_occ_frac = 2e-4f; + opt->max_mid_occ = 1000000000; opt->sdust_thres = 0; // no SDUST masking opt->min_cnt = 3; @@ -63,6 +64,8 @@ void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi) opt->mid_occ = mm_idx_cal_max_occ(mi, opt->mid_occ_frac); if (opt->mid_occ < opt->min_mid_occ) opt->mid_occ = opt->min_mid_occ; + if (opt->max_mid_occ > opt->min_mid_occ && opt->mid_occ > opt->max_mid_occ) + opt->mid_occ = opt->max_mid_occ; if (mm_verbose >= 3) fprintf(stderr, "[M::%s::%.3f*%.2f] mid_occ = %d\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), opt->mid_occ); } @@ -96,7 +99,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) mo->a = 1, mo->b = 4, mo->q = 6, mo->q2 = 26, mo->e = 2, mo->e2 = 1; mo->max_gap = 10000; mo->occ_dist = 500; - mo->min_mid_occ = 100; + mo->min_mid_occ = 100, mo->max_mid_occ = 500; mo->min_dp_max = 200; } else if (strcmp(preset, "asm5") == 0) { io->flag = 0, io->k = 19, io->w = 19; From 817cb81cb00134760e49392b4ec9998ed1898c6b Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 1 May 2021 15:47:25 -0400 Subject: [PATCH 39/77] Updated README for the HiFi preset --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9b4dfe9..b375278 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ cd minimap2 && make # use presets (no test data) ./minimap2 -ax map-pb ref.fa pacbio.fq.gz > aln.sam # PacBio genomic reads ./minimap2 -ax map-ont ref.fa ont.fq.gz > aln.sam # Oxford Nanopore genomic reads -./minimap2 -ax asm20 ref.fa pacbio-ccs.fq.gz > aln.sam # PacBio CCS genomic reads +./minimap2 -ax map-hifi ref.fa pacbio-ccs.fq.gz > aln.sam # PacBio HiFi/CCS genomic reads ./minimap2 -ax sr ref.fa read1.fa read2.fa > aln.sam # short genomic paired-end reads ./minimap2 -ax splice ref.fa rna-reads.fa > aln.sam # spliced long reads (strand unknown) ./minimap2 -ax splice -uf -k14 ref.fa reads.fa > aln.sam # noisy Nanopore Direct RNA-seq From f7dc5799c567f406936033ea9fa7f11f6843385f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 1 May 2021 15:56:33 -0400 Subject: [PATCH 40/77] clarify CLR when necessary --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index b375278..0d40147 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ cd minimap2 && make ./minimap2 -x map-ont -d MT-human-ont.mmi test/MT-human.fa ./minimap2 -a MT-human-ont.mmi test/MT-orang.fa > test.sam # use presets (no test data) -./minimap2 -ax map-pb ref.fa pacbio.fq.gz > aln.sam # PacBio genomic reads +./minimap2 -ax map-pb ref.fa pacbio.fq.gz > aln.sam # PacBio CLR genomic reads ./minimap2 -ax map-ont ref.fa ont.fq.gz > aln.sam # Oxford Nanopore genomic reads ./minimap2 -ax map-hifi ref.fa pacbio-ccs.fq.gz > aln.sam # PacBio HiFi/CCS genomic reads ./minimap2 -ax sr ref.fa read1.fa read2.fa > aln.sam # short genomic paired-end reads @@ -139,13 +139,13 @@ parameters at the same time. The default setting is the same as `map-ont`. #### Map long noisy genomic reads ```sh -minimap2 -ax map-pb ref.fa pacbio-reads.fq > aln.sam # for PacBio subreads +minimap2 -ax map-pb ref.fa pacbio-reads.fq > aln.sam # for PacBio CLR reads minimap2 -ax map-ont ref.fa ont-reads.fq > aln.sam # for Oxford Nanopore reads ``` The difference between `map-pb` and `map-ont` is that `map-pb` uses homopolymer-compressed (HPC) minimizers as seeds, while `map-ont` uses ordinary minimizers as seeds. Emperical evaluation suggests HPC minimizers improve -performance and sensitivity when aligning PacBio reads, but hurt when aligning +performance and sensitivity when aligning PacBio CLR reads, but hurt when aligning Nanopore reads. #### Map long mRNA/cDNA reads @@ -206,7 +206,7 @@ strand field. In this case, each line indicates an oriented junction. #### Find overlaps between long reads ```sh -minimap2 -x ava-pb reads.fq reads.fq > ovlp.paf # PacBio read overlap +minimap2 -x ava-pb reads.fq reads.fq > ovlp.paf # PacBio CLR read overlap minimap2 -x ava-ont reads.fq reads.fq > ovlp.paf # Oxford Nanopore read overlap ``` Similarly, `ava-pb` uses HPC minimizers while `ava-ont` uses ordinary From e81927e7a1e622c92a3c6d95791924df1536bed8 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 2 May 2021 18:25:49 -0400 Subject: [PATCH 41/77] prepare to backport unimap/minigraph chaining --- Makefile | 5 +- kalloc.h | 39 +++++ krmq.h | 474 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ lchain.c | 347 ++++++++++++++++++++++++++++++++++++++++ map.c | 8 +- mmpriv.h | 8 +- 6 files changed, 876 insertions(+), 5 deletions(-) create mode 100644 krmq.h create mode 100644 lchain.c diff --git a/Makefile b/Makefile index 8bb0890..d540940 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ CFLAGS= -g -Wall -O2 -Wc++-compat #-Wextra CPPFLAGS= -DHAVE_KALLOC INCLUDES= OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o \ - chain.o align.o hit.o seed.o map.o format.o pe.o esterr.o splitidx.o \ + chain.o lchain.o align.o hit.o seed.o map.o format.o pe.o esterr.o splitidx.o \ ksw2_ll_sse.o PROG= minimap2 PROG_EXTRA= sdust minimap2-lite @@ -120,6 +120,7 @@ ksw2_exts2_sse.o: ksw2.h kalloc.h ksw2_extz2_sse.o: ksw2.h kalloc.h ksw2_ll_sse.o: ksw2.h kalloc.h kthread.o: kthread.h +lchain.o: mmpriv.h minimap.h bseq.h kseq.h kalloc.h krmq.h main.o: bseq.h minimap.h mmpriv.h kseq.h ketopt.h map.o: kthread.h kvec.h kalloc.h sdust.h mmpriv.h minimap.h bseq.h kseq.h map.o: khash.h ksort.h @@ -127,7 +128,7 @@ misc.o: mmpriv.h minimap.h bseq.h kseq.h ksort.h options.o: mmpriv.h minimap.h bseq.h kseq.h pe.o: mmpriv.h minimap.h bseq.h kseq.h kvec.h kalloc.h ksort.h sdust.o: kalloc.h kdq.h kvec.h sdust.h -seed.o: mmpriv.h minimap.h bseq.h kseq.h kalloc.h +seed.o: mmpriv.h minimap.h bseq.h kseq.h kalloc.h ksort.h self-chain.o: minimap.h kseq.h sketch.o: kvec.h kalloc.h mmpriv.h minimap.h bseq.h kseq.h splitidx.o: mmpriv.h minimap.h bseq.h kseq.h diff --git a/kalloc.h b/kalloc.h index 6d72e4e..93bff5e 100644 --- a/kalloc.h +++ b/kalloc.h @@ -34,4 +34,43 @@ void km_stat(const void *_km, km_stat_t *s); KREALLOC((km), (a), (m)); \ } while (0) +#ifndef klib_unused +#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3) +#define klib_unused __attribute__ ((__unused__)) +#else +#define klib_unused +#endif +#endif /* klib_unused */ + +#define KALLOC_POOL_INIT2(SCOPE, name, kmptype_t) \ + typedef struct { \ + size_t cnt, n, max; \ + kmptype_t **buf; \ + void *km; \ + } kmp_##name##_t; \ + SCOPE kmp_##name##_t *kmp_init_##name(void *km) { \ + kmp_##name##_t *mp; \ + KCALLOC(km, mp, 1); \ + mp->km = km; \ + return mp; \ + } \ + SCOPE void kmp_destroy_##name(kmp_##name##_t *mp) { \ + size_t k; \ + for (k = 0; k < mp->n; ++k) kfree(mp->km, mp->buf[k]); \ + kfree(mp->km, mp->buf); kfree(mp->km, mp); \ + } \ + SCOPE kmptype_t *kmp_alloc_##name(kmp_##name##_t *mp) { \ + ++mp->cnt; \ + if (mp->n == 0) return (kmptype_t*)kcalloc(mp->km, 1, sizeof(kmptype_t)); \ + return mp->buf[--mp->n]; \ + } \ + SCOPE void kmp_free_##name(kmp_##name##_t *mp, kmptype_t *p) { \ + --mp->cnt; \ + if (mp->n == mp->max) KEXPAND(mp->km, mp->buf, mp->max); \ + mp->buf[mp->n++] = p; \ + } + +#define KALLOC_POOL_INIT(name, kmptype_t) \ + KALLOC_POOL_INIT2(static inline klib_unused, name, kmptype_t) + #endif diff --git a/krmq.h b/krmq.h new file mode 100644 index 0000000..4f4ea75 --- /dev/null +++ b/krmq.h @@ -0,0 +1,474 @@ +/* The MIT License + + Copyright (c) 2019 by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* An example: + +#include +#include +#include +#include "krmq.h" + +struct my_node { + char key; + KRMQ_HEAD(struct my_node) head; +}; +#define my_cmp(p, q) (((q)->key < (p)->key) - ((p)->key < (q)->key)) +KRMQ_INIT(my, struct my_node, head, my_cmp) + +int main(void) { + const char *str = "MNOLKQOPHIA"; // from wiki, except a duplicate + struct my_node *root = 0; + int i, l = strlen(str); + for (i = 0; i < l; ++i) { // insert in the input order + struct my_node *q, *p = malloc(sizeof(*p)); + p->key = str[i]; + q = krmq_insert(my, &root, p, 0); + if (p != q) free(p); // if already present, free + } + krmq_itr_t(my) itr; + krmq_itr_first(my, root, &itr); // place at first + do { // traverse + const struct my_node *p = krmq_at(&itr); + putchar(p->key); + free((void*)p); // free node + } while (krmq_itr_next(my, &itr)); + putchar('\n'); + return 0; +} +*/ + +#ifndef KRMQ_H +#define KRMQ_H + +#ifdef __STRICT_ANSI__ +#define inline __inline__ +#endif + +#define KRMQ_MAX_DEPTH 64 + +#define krmq_size(head, p) ((p)? (p)->head.size : 0) +#define krmq_size_child(head, q, i) ((q)->head.p[(i)]? (q)->head.p[(i)]->head.size : 0) + +#define KRMQ_HEAD(__type) \ + struct { \ + __type *p[2], *s; \ + signed char balance; /* balance factor */ \ + unsigned size; /* #elements in subtree */ \ + } + +#define __KRMQ_FIND(suf, __scope, __type, __head, __cmp) \ + __scope __type *krmq_find_##suf(const __type *root, const __type *x, unsigned *cnt_) { \ + const __type *p = root; \ + unsigned cnt = 0; \ + while (p != 0) { \ + int cmp; \ + cmp = __cmp(x, p); \ + if (cmp >= 0) cnt += krmq_size_child(__head, p, 0) + 1; \ + if (cmp < 0) p = p->__head.p[0]; \ + else if (cmp > 0) p = p->__head.p[1]; \ + else break; \ + } \ + if (cnt_) *cnt_ = cnt; \ + return (__type*)p; \ + } \ + __scope __type *krmq_interval_##suf(const __type *root, const __type *x, __type **lower, __type **upper) { \ + const __type *p = root, *l = 0, *u = 0; \ + while (p != 0) { \ + int cmp; \ + cmp = __cmp(x, p); \ + if (cmp < 0) u = p, p = p->__head.p[0]; \ + else if (cmp > 0) l = p, p = p->__head.p[1]; \ + else { l = u = p; break; } \ + } \ + if (lower) *lower = (__type*)l; \ + if (upper) *upper = (__type*)u; \ + return (__type*)p; \ + } + +#define __KRMQ_RMQ(suf, __scope, __type, __head, __cmp, __lt2) \ + __scope __type *krmq_rmq_##suf(const __type *root, const __type *lo, const __type *up) { /* CLOSED interval */ \ + const __type *p = root, *path[2][KRMQ_MAX_DEPTH], *min; \ + int plen[2] = {0, 0}, pcmp[2][KRMQ_MAX_DEPTH], i, cmp, lca; \ + if (root == 0) return 0; \ + while (p) { \ + cmp = __cmp(lo, p); \ + path[0][plen[0]] = p, pcmp[0][plen[0]++] = cmp; \ + if (cmp < 0) p = p->__head.p[0]; \ + else if (cmp > 0) p = p->__head.p[1]; \ + else break; \ + } \ + p = root; \ + while (p) { \ + cmp = __cmp(up, p); \ + path[1][plen[1]] = p, pcmp[1][plen[1]++] = cmp; \ + if (cmp < 0) p = p->__head.p[0]; \ + else if (cmp > 0) p = p->__head.p[1]; \ + else break; \ + } \ + for (i = 0; i < plen[0] && i < plen[1]; ++i) /* find the LCA */ \ + if (path[0][i] == path[1][i] && pcmp[0][i] <= 0 && pcmp[1][i] >= 0) \ + break; \ + if (i == plen[0] || i == plen[1]) return 0; /* no elements in the closed interval */ \ + lca = i, min = path[0][lca]; \ + for (i = lca + 1; i < plen[0]; ++i) { \ + if (pcmp[0][i] <= 0) { \ + if (__lt2(path[0][i], min)) min = path[0][i]; \ + if (path[0][i]->__head.p[1] && __lt2(path[0][i]->__head.p[1]->__head.s, min)) \ + min = path[0][i]->__head.p[1]->__head.s; \ + } \ + } \ + for (i = lca + 1; i < plen[1]; ++i) { \ + if (pcmp[1][i] >= 0) { \ + if (__lt2(path[1][i], min)) min = path[1][i]; \ + if (path[1][i]->__head.p[0] && __lt2(path[1][i]->__head.p[0]->__head.s, min)) \ + min = path[1][i]->__head.p[0]->__head.s; \ + } \ + } \ + return (__type*)min; \ + } + +#define __KRMQ_ROTATE(suf, __type, __head, __lt2) \ + /* */ \ + static inline void krmq_update_min_##suf(__type *p, const __type *q, const __type *r) { \ + p->__head.s = !q || __lt2(p, q->__head.s)? p : q->__head.s; \ + p->__head.s = !r || __lt2(p->__head.s, r->__head.s)? p->__head.s : r->__head.s; \ + } \ + /* one rotation: (a,(b,c)q)p => ((a,b)p,c)q */ \ + static inline __type *krmq_rotate1_##suf(__type *p, int dir) { /* dir=0 to left; dir=1 to right */ \ + int opp = 1 - dir; /* opposite direction */ \ + __type *q = p->__head.p[opp], *s = p->__head.s; \ + unsigned size_p = p->__head.size; \ + p->__head.size -= q->__head.size - krmq_size_child(__head, q, dir); \ + q->__head.size = size_p; \ + krmq_update_min_##suf(p, p->__head.p[dir], q->__head.p[dir]); \ + q->__head.s = s; \ + p->__head.p[opp] = q->__head.p[dir]; \ + q->__head.p[dir] = p; \ + return q; \ + } \ + /* two consecutive rotations: (a,((b,c)r,d)q)p => ((a,b)p,(c,d)q)r */ \ + static inline __type *krmq_rotate2_##suf(__type *p, int dir) { \ + int b1, opp = 1 - dir; \ + __type *q = p->__head.p[opp], *r = q->__head.p[dir], *s = p->__head.s; \ + unsigned size_x_dir = krmq_size_child(__head, r, dir); \ + r->__head.size = p->__head.size; \ + p->__head.size -= q->__head.size - size_x_dir; \ + q->__head.size -= size_x_dir + 1; \ + krmq_update_min_##suf(p, p->__head.p[dir], r->__head.p[dir]); \ + krmq_update_min_##suf(q, q->__head.p[opp], r->__head.p[opp]); \ + r->__head.s = s; \ + p->__head.p[opp] = r->__head.p[dir]; \ + r->__head.p[dir] = p; \ + q->__head.p[dir] = r->__head.p[opp]; \ + r->__head.p[opp] = q; \ + b1 = dir == 0? +1 : -1; \ + if (r->__head.balance == b1) q->__head.balance = 0, p->__head.balance = -b1; \ + else if (r->__head.balance == 0) q->__head.balance = p->__head.balance = 0; \ + else q->__head.balance = b1, p->__head.balance = 0; \ + r->__head.balance = 0; \ + return r; \ + } + +#define __KRMQ_INSERT(suf, __scope, __type, __head, __cmp, __lt2) \ + __scope __type *krmq_insert_##suf(__type **root_, __type *x, unsigned *cnt_) { \ + unsigned char stack[KRMQ_MAX_DEPTH]; \ + __type *path[KRMQ_MAX_DEPTH]; \ + __type *bp, *bq; \ + __type *p, *q, *r = 0; /* _r_ is potentially the new root */ \ + int i, which = 0, top, b1, path_len; \ + unsigned cnt = 0; \ + bp = *root_, bq = 0; \ + /* find the insertion location */ \ + for (p = bp, q = bq, top = path_len = 0; p; q = p, p = p->__head.p[which]) { \ + int cmp; \ + cmp = __cmp(x, p); \ + if (cmp >= 0) cnt += krmq_size_child(__head, p, 0) + 1; \ + if (cmp == 0) { \ + if (cnt_) *cnt_ = cnt; \ + return p; \ + } \ + if (p->__head.balance != 0) \ + bq = q, bp = p, top = 0; \ + stack[top++] = which = (cmp > 0); \ + path[path_len++] = p; \ + } \ + if (cnt_) *cnt_ = cnt; \ + x->__head.balance = 0, x->__head.size = 1, x->__head.p[0] = x->__head.p[1] = 0, x->__head.s = x; \ + if (q == 0) *root_ = x; \ + else q->__head.p[which] = x; \ + if (bp == 0) return x; \ + for (i = 0; i < path_len; ++i) ++path[i]->__head.size; \ + for (i = path_len - 1; i >= 0; --i) { \ + krmq_update_min_##suf(path[i], path[i]->__head.p[0], path[i]->__head.p[1]); \ + if (path[i]->__head.s != x) break; \ + } \ + for (p = bp, top = 0; p != x; p = p->__head.p[stack[top]], ++top) /* update balance factors */ \ + if (stack[top] == 0) --p->__head.balance; \ + else ++p->__head.balance; \ + if (bp->__head.balance > -2 && bp->__head.balance < 2) return x; /* no re-balance needed */ \ + /* re-balance */ \ + which = (bp->__head.balance < 0); \ + b1 = which == 0? +1 : -1; \ + q = bp->__head.p[1 - which]; \ + if (q->__head.balance == b1) { \ + r = krmq_rotate1_##suf(bp, which); \ + q->__head.balance = bp->__head.balance = 0; \ + } else r = krmq_rotate2_##suf(bp, which); \ + if (bq == 0) *root_ = r; \ + else bq->__head.p[bp != bq->__head.p[0]] = r; \ + return x; \ + } + +#define __KRMQ_ERASE(suf, __scope, __type, __head, __cmp, __lt2) \ + __scope __type *krmq_erase_##suf(__type **root_, const __type *x, unsigned *cnt_) { \ + __type *p, *path[KRMQ_MAX_DEPTH], fake; \ + unsigned char dir[KRMQ_MAX_DEPTH]; \ + int i, d = 0, cmp; \ + unsigned cnt = 0; \ + fake.__head.p[0] = *root_, fake.__head.p[1] = 0; \ + if (cnt_) *cnt_ = 0; \ + if (x) { \ + for (cmp = -1, p = &fake; cmp; cmp = __cmp(x, p)) { \ + int which = (cmp > 0); \ + if (cmp > 0) cnt += krmq_size_child(__head, p, 0) + 1; \ + dir[d] = which; \ + path[d++] = p; \ + p = p->__head.p[which]; \ + if (p == 0) { \ + if (cnt_) *cnt_ = 0; \ + return 0; \ + } \ + } \ + cnt += krmq_size_child(__head, p, 0) + 1; /* because p==x is not counted */ \ + } else { \ + for (p = &fake, cnt = 1; p; p = p->__head.p[0]) \ + dir[d] = 0, path[d++] = p; \ + p = path[--d]; \ + } \ + if (cnt_) *cnt_ = cnt; \ + for (i = 1; i < d; ++i) --path[i]->__head.size; \ + if (p->__head.p[1] == 0) { /* ((1,.)2,3)4 => (1,3)4; p=2 */ \ + path[d-1]->__head.p[dir[d-1]] = p->__head.p[0]; \ + } else { \ + __type *q = p->__head.p[1]; \ + if (q->__head.p[0] == 0) { /* ((1,2)3,4)5 => ((1)2,4)5; p=3,q=2 */ \ + q->__head.p[0] = p->__head.p[0]; \ + q->__head.balance = p->__head.balance; \ + path[d-1]->__head.p[dir[d-1]] = q; \ + path[d] = q, dir[d++] = 1; \ + q->__head.size = p->__head.size - 1; \ + } else { /* ((1,((.,2)3,4)5)6,7)8 => ((1,(2,4)5)3,7)8; p=6 */ \ + __type *r; \ + int e = d++; /* backup _d_ */\ + for (;;) { \ + dir[d] = 0; \ + path[d++] = q; \ + r = q->__head.p[0]; \ + if (r->__head.p[0] == 0) break; \ + q = r; \ + } \ + r->__head.p[0] = p->__head.p[0]; \ + q->__head.p[0] = r->__head.p[1]; \ + r->__head.p[1] = p->__head.p[1]; \ + r->__head.balance = p->__head.balance; \ + path[e-1]->__head.p[dir[e-1]] = r; \ + path[e] = r, dir[e] = 1; \ + for (i = e + 1; i < d; ++i) --path[i]->__head.size; \ + r->__head.size = p->__head.size - 1; \ + } \ + } \ + for (i = d - 1; i >= 0; --i) /* not sure why adding condition "path[i]->__head.s==p" doesn't work */ \ + krmq_update_min_##suf(path[i], path[i]->__head.p[0], path[i]->__head.p[1]); \ + while (--d > 0) { \ + __type *q = path[d]; \ + int which, other, b1 = 1, b2 = 2; \ + which = dir[d], other = 1 - which; \ + if (which) b1 = -b1, b2 = -b2; \ + q->__head.balance += b1; \ + if (q->__head.balance == b1) break; \ + else if (q->__head.balance == b2) { \ + __type *r = q->__head.p[other]; \ + if (r->__head.balance == -b1) { \ + path[d-1]->__head.p[dir[d-1]] = krmq_rotate2_##suf(q, which); \ + } else { \ + path[d-1]->__head.p[dir[d-1]] = krmq_rotate1_##suf(q, which); \ + if (r->__head.balance == 0) { \ + r->__head.balance = -b1; \ + q->__head.balance = b1; \ + break; \ + } else r->__head.balance = q->__head.balance = 0; \ + } \ + } \ + } \ + *root_ = fake.__head.p[0]; \ + return p; \ + } + +#define krmq_free(__type, __head, __root, __free) do { \ + __type *_p, *_q; \ + for (_p = __root; _p; _p = _q) { \ + if (_p->__head.p[0] == 0) { \ + _q = _p->__head.p[1]; \ + __free(_p); \ + } else { \ + _q = _p->__head.p[0]; \ + _p->__head.p[0] = _q->__head.p[1]; \ + _q->__head.p[1] = _p; \ + } \ + } \ + } while (0) + +#define __KRMQ_ITR(suf, __scope, __type, __head, __cmp) \ + struct krmq_itr_##suf { \ + const __type *stack[KRMQ_MAX_DEPTH], **top; \ + }; \ + __scope void krmq_itr_first_##suf(const __type *root, struct krmq_itr_##suf *itr) { \ + const __type *p; \ + for (itr->top = itr->stack - 1, p = root; p; p = p->__head.p[0]) \ + *++itr->top = p; \ + } \ + __scope int krmq_itr_find_##suf(const __type *root, const __type *x, struct krmq_itr_##suf *itr) { \ + const __type *p = root; \ + itr->top = itr->stack - 1; \ + while (p != 0) { \ + int cmp; \ + *++itr->top = p; \ + cmp = __cmp(x, p); \ + if (cmp < 0) p = p->__head.p[0]; \ + else if (cmp > 0) p = p->__head.p[1]; \ + else break; \ + } \ + return p? 1 : 0; \ + } \ + __scope int krmq_itr_next_bidir_##suf(struct krmq_itr_##suf *itr, int dir) { \ + const __type *p; \ + if (itr->top < itr->stack) return 0; \ + dir = !!dir; \ + p = (*itr->top)->__head.p[dir]; \ + if (p) { /* go down */ \ + for (; p; p = p->__head.p[!dir]) \ + *++itr->top = p; \ + return 1; \ + } else { /* go up */ \ + do { \ + p = *itr->top--; \ + } while (itr->top >= itr->stack && p == (*itr->top)->__head.p[dir]); \ + return itr->top < itr->stack? 0 : 1; \ + } \ + } \ + +/** + * Insert a node to the tree + * + * @param suf name suffix used in KRMQ_INIT() + * @param proot pointer to the root of the tree (in/out: root may change) + * @param x node to insert (in) + * @param cnt number of nodes smaller than or equal to _x_; can be NULL (out) + * + * @return _x_ if not present in the tree, or the node equal to x. + */ +#define krmq_insert(suf, proot, x, cnt) krmq_insert_##suf(proot, x, cnt) + +/** + * Find a node in the tree + * + * @param suf name suffix used in KRMQ_INIT() + * @param root root of the tree + * @param x node value to find (in) + * @param cnt number of nodes smaller than or equal to _x_; can be NULL (out) + * + * @return node equal to _x_ if present, or NULL if absent + */ +#define krmq_find(suf, root, x, cnt) krmq_find_##suf(root, x, cnt) +#define krmq_interval(suf, root, x, lower, upper) krmq_interval_##suf(root, x, lower, upper) +#define krmq_rmq(suf, root, lo, up) krmq_rmq_##suf(root, lo, up) + +/** + * Delete a node from the tree + * + * @param suf name suffix used in KRMQ_INIT() + * @param proot pointer to the root of the tree (in/out: root may change) + * @param x node value to delete; if NULL, delete the first node (in) + * + * @return node removed from the tree if present, or NULL if absent + */ +#define krmq_erase(suf, proot, x, cnt) krmq_erase_##suf(proot, x, cnt) +#define krmq_erase_first(suf, proot) krmq_erase_##suf(proot, 0, 0) + +#define krmq_itr_t(suf) struct krmq_itr_##suf + +/** + * Place the iterator at the smallest object + * + * @param suf name suffix used in KRMQ_INIT() + * @param root root of the tree + * @param itr iterator + */ +#define krmq_itr_first(suf, root, itr) krmq_itr_first_##suf(root, itr) + +/** + * Place the iterator at the object equal to or greater than the query + * + * @param suf name suffix used in KRMQ_INIT() + * @param root root of the tree + * @param x query (in) + * @param itr iterator (out) + * + * @return 1 if find; 0 otherwise. krmq_at(itr) is NULL if and only if query is + * larger than all objects in the tree + */ +#define krmq_itr_find(suf, root, x, itr) krmq_itr_find_##suf(root, x, itr) + +/** + * Move to the next object in order + * + * @param itr iterator (modified) + * + * @return 1 if there is a next object; 0 otherwise + */ +#define krmq_itr_next(suf, itr) krmq_itr_next_bidir_##suf(itr, 1) +#define krmq_itr_prev(suf, itr) krmq_itr_next_bidir_##suf(itr, 0) + +/** + * Return the pointer at the iterator + * + * @param itr iterator + * + * @return pointer if present; NULL otherwise + */ +#define krmq_at(itr) ((itr)->top < (itr)->stack? 0 : *(itr)->top) + +#define KRMQ_INIT2(suf, __scope, __type, __head, __cmp, __lt2) \ + __KRMQ_FIND(suf, __scope, __type, __head, __cmp) \ + __KRMQ_RMQ(suf, __scope, __type, __head, __cmp, __lt2) \ + __KRMQ_ROTATE(suf, __type, __head, __lt2) \ + __KRMQ_INSERT(suf, __scope, __type, __head, __cmp, __lt2) \ + __KRMQ_ERASE(suf, __scope, __type, __head, __cmp, __lt2) \ + __KRMQ_ITR(suf, __scope, __type, __head, __cmp) + +#define KRMQ_INIT(suf, __type, __head, __cmp, __lt2) \ + KRMQ_INIT2(suf,, __type, __head, __cmp, __lt2) + +#endif diff --git a/lchain.c b/lchain.c new file mode 100644 index 0000000..fb84b94 --- /dev/null +++ b/lchain.c @@ -0,0 +1,347 @@ +#include +#include +#include +#include +#include "mmpriv.h" +#include "kalloc.h" +#include "krmq.h" + +static inline float mg_log2(float x) // NB: this doesn't work when x<2 +{ + union { float f; uint32_t i; } z = { x }; + float log_2 = ((z.i >> 23) & 255) - 128; + z.i &= ~(255 << 23); + z.i += 127 << 23; + log_2 += (-0.34484843f * z.f + 2.02466578f) * z.f - 0.67487759f; + return log_2; +} + +uint64_t *mg_chain_backtrack(void *km, int64_t n, const int32_t *f, const int64_t *p, int32_t *v, int32_t *t, int32_t min_cnt, int32_t min_sc, int32_t *n_u_, int32_t *n_v_) +{ + mm128_t *z; + uint64_t *u; + int64_t i, k, n_z, n_v; + int32_t n_u; + + *n_u_ = *n_v_ = 0; + for (i = 0, n_z = 0; i < n; ++i) // precompute n_z + if (f[i] >= min_sc) ++n_z; + if (n_z == 0) return 0; + KMALLOC(km, z, n_z); + for (i = 0, k = 0; i < n; ++i) // populate z[] + if (f[i] >= min_sc) z[k].x = f[i], z[k++].y = i; + radix_sort_128x(z, z + n_z); + + memset(t, 0, n * 4); + for (k = n_z - 1, n_v = n_u = 0; k >= 0; --k) { // precompute n_u + int64_t n_v0 = n_v; + int32_t sc; + for (i = z[k].y; i >= 0 && t[i] == 0; i = p[i]) + ++n_v, t[i] = 1; + sc = i < 0? z[k].x : (int32_t)z[k].x - f[i]; + if (sc >= min_sc && n_v > n_v0 && n_v - n_v0 >= min_cnt) + ++n_u; + else n_v = n_v0; + } + KMALLOC(km, u, n_u); + memset(t, 0, n * 4); + for (k = n_z - 1, n_v = n_u = 0; k >= 0; --k) { // populate u[] + int64_t n_v0 = n_v; + int32_t sc; + for (i = z[k].y; i >= 0 && t[i] == 0; i = p[i]) + v[n_v++] = i, t[i] = 1; + sc = i < 0? z[k].x : (int32_t)z[k].x - f[i]; + if (sc >= min_sc && n_v > n_v0 && n_v - n_v0 >= min_cnt) + u[n_u++] = (uint64_t)sc << 32 | (n_v - n_v0); + else n_v = n_v0; + } + kfree(km, z); + assert(n_v < INT32_MAX); + *n_u_ = n_u, *n_v_ = n_v; + return u; +} + +static mm128_t *compact_a(void *km, int32_t n_u, uint64_t *u, int32_t n_v, int32_t *v, mm128_t *a) +{ + mm128_t *b, *w; + uint64_t *u2; + int64_t i, j, k; + + // write the result to b[] + KMALLOC(km, b, n_v); + for (i = 0, k = 0; i < n_u; ++i) { + int32_t k0 = k, ni = (int32_t)u[i]; + for (j = 0; j < ni; ++j) + b[k++] = a[v[k0 + (ni - j - 1)]]; + } + kfree(km, v); + + // sort u[] and a[] by the target position, such that adjacent chains may be joined + KMALLOC(km, w, n_u); + for (i = k = 0; i < n_u; ++i) { + w[i].x = b[k].x, w[i].y = (uint64_t)k<<32|i; + k += (int32_t)u[i]; + } + radix_sort_128x(w, w + n_u); + KMALLOC(km, u2, n_u); + for (i = k = 0; i < n_u; ++i) { + int32_t j = (int32_t)w[i].y, n = (int32_t)u[j]; + u2[i] = u[j]; + memcpy(&a[k], &b[w[i].y>>32], n * sizeof(mm128_t)); + k += n; + } + memcpy(u, u2, n_u * 8); + memcpy(b, a, k * sizeof(mm128_t)); // write _a_ to _b_ and deallocate _a_ because _a_ is oversized, sometimes a lot + kfree(km, a); kfree(km, w); kfree(km, u2); + return b; +} + +static inline int32_t comput_sc(const mm128_t *ai, const mm128_t *aj, int32_t max_dist_x, int32_t max_dist_y, int32_t bw, float chn_pen_gap, float chn_pen_skip, int is_cdna, int n_seg) +{ + int32_t dq = (int32_t)ai->y - (int32_t)aj->y, dr, dd, dg, q_span, sc; + int32_t sidi = (ai->y & MM_SEED_SEG_MASK) >> MM_SEED_SEG_SHIFT; + int32_t sidj = (aj->y & MM_SEED_SEG_MASK) >> MM_SEED_SEG_SHIFT; + if (dq <= 0 || dq > max_dist_x) return INT32_MIN; + dr = (int32_t)(ai->x - aj->x); + if (sidi == sidj && (dr == 0 || dq > max_dist_y)) return INT32_MIN; + dd = dr > dq? dr - dq : dq - dr; + if (sidi == sidj && dd > bw) return INT32_MIN; + if (n_seg > 1 && !is_cdna && sidi == sidj && dr > max_dist_y) return INT32_MIN; + dg = dr < dq? dr : dq; + q_span = aj->y>>32&0xff; + sc = q_span < dg? q_span : dg; + if (dd || dg > q_span) { + float lin_pen, log_pen; + lin_pen = chn_pen_gap * (float)dd + chn_pen_skip * (float)dg; + log_pen = dd >= 1? mg_log2(dd + 1) : 0.0f; // mg_log2() only works for dd>=2 + if (is_cdna) { + if (dr > dq) sc -= (int)(lin_pen < log_pen? lin_pen : log_pen); // deletion or jump between paired ends + else sc -= (int)(lin_pen + .5f * log_pen); + } else sc -= (int)(lin_pen + .5f * log_pen); + } + return sc; +} + +/* Input: + * a[].x: tid<<33 | rev<<32 | tpos + * a[].y: flags<<40 | q_span<<32 | q_pos + * Output: + * n_u: #chains + * u[]: score<<32 | #anchors (sum of lower 32 bits of u[] is the returned length of a[]) + * input a[] is deallocated on return + */ +mm128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip, + int is_cdna, int n_seg, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km) +{ // TODO: make sure this works when n has more than 32 bits + int32_t *f, *t, *v, n_u, n_v, mmax_f = 0; + int64_t *p, i, j, max_ii, st = 0, n_iter = 0; + uint64_t *u; + + if (_u) *_u = 0, *n_u_ = 0; + if (n == 0 || a == 0) return 0; + if (max_dist_x < bw) max_dist_x = bw; + if (max_dist_y < bw && !is_cdna) max_dist_y = bw; + KMALLOC(km, p, n); + KMALLOC(km, f, n); + KMALLOC(km, v, n); + KCALLOC(km, t, n); + + // fill the score and backtrack arrays + for (i = 0, max_ii = -1; i < n; ++i) { + int64_t max_j = -1, end_j; + int32_t max_f = a[i].y>>32&0xff, n_skip = 0; + while (st < i && (a[i].x>>32 != a[st].x>>32 || a[i].x > a[st].x + max_dist_x)) ++st; + if (i - st > max_iter) st = i - max_iter; + for (j = i - 1; j >= st; --j) { + int32_t sc; + sc = comput_sc(&a[i], &a[j], max_dist_x, max_dist_y, bw, chn_pen_gap, chn_pen_skip, is_cdna, n_seg); + ++n_iter; + if (sc == INT32_MIN) continue; + sc += f[j]; + if (sc > max_f) { + max_f = sc, max_j = j; + if (n_skip > 0) --n_skip; + } else if (t[j] == (int32_t)i) { + if (++n_skip > max_skip) + break; + } + if (p[j] >= 0) t[p[j]] = i; + } + end_j = j; + if (max_ii < 0 || a[i].x - a[max_ii].x > (int64_t)max_dist_x) { + int32_t max = INT32_MIN; + max_ii = -1; + for (j = i - 1; j >= st; --j) + if (max < f[j]) max = f[j], max_ii = j; + } + if (max_ii >= 0 && max_ii < end_j) { + int32_t tmp; + tmp = comput_sc(&a[i], &a[max_ii], max_dist_x, max_dist_y, bw, chn_pen_gap, chn_pen_skip, is_cdna, n_seg); + if (tmp != INT32_MIN && max_f < tmp + f[max_ii]) + max_f = tmp + f[max_ii], max_j = max_ii; + } + f[i] = max_f, p[i] = max_j; + v[i] = max_j >= 0 && v[max_j] > max_f? v[max_j] : max_f; // v[] keeps the peak score up to i; f[] is the score ending at i, not always the peak + if (max_ii < 0 || (a[i].x - a[max_ii].x <= (int64_t)max_dist_x && f[max_ii] < f[i])) + max_ii = i; + if (mmax_f < max_f) mmax_f = max_f; + } + + u = mg_chain_backtrack(km, n, f, p, v, t, min_cnt, min_sc, &n_u, &n_v); + *n_u_ = n_u, *_u = u; // NB: note that u[] may not be sorted by score here + kfree(km, p); kfree(km, f); kfree(km, t); + if (n_u == 0) { + kfree(km, a); kfree(km, v); + return 0; + } + return compact_a(km, n_u, u, n_v, v, a); +} + +typedef struct lc_elem_s { + int32_t y; + int64_t i; + double pri; + KRMQ_HEAD(struct lc_elem_s) head; +} lc_elem_t; + +#define lc_elem_cmp(a, b) ((a)->y < (b)->y? -1 : (a)->y > (b)->y? 1 : ((a)->i > (b)->i) - ((a)->i < (b)->i)) +#define lc_elem_lt2(a, b) ((a)->pri < (b)->pri) +KRMQ_INIT(lc_elem, lc_elem_t, head, lc_elem_cmp, lc_elem_lt2) + +KALLOC_POOL_INIT(rmq, lc_elem_t) + +static inline int32_t comput_sc_simple(const mm128_t *ai, const mm128_t *aj, float chn_pen_gap, float chn_pen_skip, int32_t *exact, int32_t *width) +{ + int32_t dq = (int32_t)ai->y - (int32_t)aj->y, dr, dd, dg, q_span, sc; + dr = (int32_t)(ai->x - aj->x); + *width = dd = dr > dq? dr - dq : dq - dr; + dg = dr < dq? dr : dq; + q_span = aj->y>>32&0xff; + sc = q_span < dg? q_span : dg; + if (exact) *exact = (dd == 0 && dg <= q_span); + if (dd || dq > q_span) { + float lin_pen, log_pen; + lin_pen = chn_pen_gap * (float)dd + chn_pen_skip * (float)dg; + log_pen = dd >= 1? mg_log2(dd + 1) : 0.0f; // mg_log2() only works for dd>=2 + sc -= (int)(lin_pen + .5f * log_pen); + } + return sc; +} + +mm128_t *mg_lchain_rmq(int max_dist, int max_dist_inner, int bw, int max_chn_skip, int cap_rmq_size, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip, + int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km) +{ + int32_t *f,*t, *v, n_u, n_v, mmax_f = 0, max_rmq_size = 0; + int64_t *p, i, i0, st = 0, st_inner = 0, n_iter = 0; + uint64_t *u; + lc_elem_t *root = 0, *root_inner = 0; + void *mem_mp = 0; + kmp_rmq_t *mp; + + if (_u) *_u = 0, *n_u_ = 0; + if (n == 0 || a == 0) return 0; + if (max_dist < bw) max_dist = bw; + if (max_dist_inner <= 0 || max_dist_inner >= max_dist) max_dist_inner = 0; + KMALLOC(km, p, n); + KMALLOC(km, f, n); + KCALLOC(km, t, n); + KMALLOC(km, v, n); + mem_mp = km_init2(km, 0x10000); + mp = kmp_init_rmq(mem_mp); + + // fill the score and backtrack arrays + for (i = i0 = 0; i < n; ++i) { + int64_t max_j = -1; + int32_t q_span = a[i].y>>32&0xff, max_f = q_span; + lc_elem_t s, *q, *r, lo, hi; + // add in-range anchors + if (i0 < i && a[i0].x != a[i].x) { + int64_t j; + for (j = i0; j < i; ++j) { + q = kmp_alloc_rmq(mp); + q->y = (int32_t)a[j].y, q->i = j, q->pri = -(f[j] + 0.5 * chn_pen_gap * ((int32_t)a[j].x + (int32_t)a[j].y)); + krmq_insert(lc_elem, &root, q, 0); + if (max_dist_inner > 0) { + r = kmp_alloc_rmq(mp); + *r = *q; + krmq_insert(lc_elem, &root_inner, r, 0); + } + } + i0 = i; + } + // get rid of active chains out of range + while (st < i && (a[i].x>>32 != a[st].x>>32 || a[i].x > a[st].x + max_dist || krmq_size(head, root) > cap_rmq_size)) { + s.y = (int32_t)a[st].y, s.i = st; + if ((q = krmq_find(lc_elem, root, &s, 0)) != 0) { + q = krmq_erase(lc_elem, &root, q, 0); + kmp_free_rmq(mp, q); + } + ++st; + } + if (max_dist_inner > 0) { // similar to the block above, but applied to the inner tree + while (st_inner < i && (a[i].x>>32 != a[st_inner].x>>32 || a[i].x > a[st_inner].x + max_dist_inner || krmq_size(head, root_inner) > cap_rmq_size)) { + s.y = (int32_t)a[st_inner].y, s.i = st_inner; + if ((q = krmq_find(lc_elem, root_inner, &s, 0)) != 0) { + q = krmq_erase(lc_elem, &root_inner, q, 0); + kmp_free_rmq(mp, q); + } + ++st_inner; + } + } + // RMQ + lo.i = INT32_MAX, lo.y = (int32_t)a[i].y - max_dist; + hi.i = 0, hi.y = (int32_t)a[i].y; + if ((q = krmq_rmq(lc_elem, root, &lo, &hi)) != 0) { + int32_t sc, exact, width, n_skip = 0; + int64_t j = q->i; + assert(q->y >= lo.y && q->y <= hi.y); + sc = f[j] + comput_sc_simple(&a[i], &a[j], chn_pen_gap, chn_pen_skip, &exact, &width); + if (width <= bw && sc > max_f) max_f = sc, max_j = j; + if (!exact && root_inner && (int32_t)a[i].y > 0) { + lc_elem_t *lo, *hi; + s.y = (int32_t)a[i].y - 1, s.i = n; + krmq_interval(lc_elem, root_inner, &s, &lo, &hi); + if (lo) { + const lc_elem_t *q; + int32_t width, n_rmq_iter = 0; + krmq_itr_t(lc_elem) itr; + krmq_itr_find(lc_elem, root_inner, lo, &itr); + while ((q = krmq_at(&itr)) != 0) { + if (q->y < (int32_t)a[i].y - max_dist_inner) break; + ++n_rmq_iter; + j = q->i; + sc = f[j] + comput_sc_simple(&a[i], &a[j], chn_pen_gap, chn_pen_skip, 0, &width); + if (width <= bw) { + if (sc > max_f) { + max_f = sc, max_j = j; + if (n_skip > 0) --n_skip; + } else if (t[j] == (int32_t)i) { + if (++n_skip > max_chn_skip) + break; + } + if (p[j] >= 0) t[p[j]] = i; + } + if (!krmq_itr_prev(lc_elem, &itr)) break; + } + n_iter += n_rmq_iter; + } + } + } + // set max + assert(max_j < 0 || (a[max_j].x < a[i].x && (int32_t)a[max_j].y < (int32_t)a[i].y)); + f[i] = max_f, p[i] = max_j; + v[i] = max_j >= 0 && v[max_j] > max_f? v[max_j] : max_f; // v[] keeps the peak score up to i; f[] is the score ending at i, not always the peak + if (mmax_f < max_f) mmax_f = max_f; + if (max_rmq_size < krmq_size(head, root)) max_rmq_size = krmq_size(head, root); + } + km_destroy(mem_mp); + + u = mg_chain_backtrack(km, n, f, p, v, t, min_cnt, min_sc, &n_u, &n_v); + *n_u_ = n_u, *_u = u; // NB: note that u[] may not be sorted by score here + kfree(km, p); kfree(km, f); kfree(km, t); + if (n_u == 0) { + kfree(km, a); kfree(km, v); + return 0; + } + return compact_a(km, n_u, u, n_v, v, a); +} diff --git a/map.c b/map.c index b4b07d1..cf730d9 100644 --- a/map.c +++ b/map.c @@ -271,7 +271,9 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char ** if (max_chain_gap_ref < opt->max_gap) max_chain_gap_ref = opt->max_gap; } else max_chain_gap_ref = opt->max_gap; - a = mm_chain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, opt->chain_gap_scale, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km); +// a = mm_chain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, opt->chain_gap_scale, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km); + a = mg_lchain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, + opt->chain_gap_scale * 0.2f, 0.0f, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km); if (opt->max_occ > opt->mid_occ && rep_len > 0) { int rechain = 0; @@ -293,7 +295,9 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char ** kfree(b->km, mini_pos); if (opt->flag & MM_F_HEAP_SORT) a = collect_seed_hits_heap(b->km, opt, opt->max_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos); else a = collect_seed_hits(b->km, opt, opt->max_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos); - a = mm_chain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, opt->chain_gap_scale, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km); +// a = mm_chain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, opt->chain_gap_scale, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km); + a = mg_lchain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, + opt->chain_gap_scale * 0.2f, 0.0f, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km); } } b->frag_gap = max_chain_gap_ref; diff --git a/mmpriv.h b/mmpriv.h index d2d8641..45e7c0f 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -72,9 +72,15 @@ void mm_write_sam3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int se void mm_idxopt_init(mm_idxopt_t *opt); const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n); int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f); -mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float gap_scale, int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km); mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a); +mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float gap_scale, + int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km); +mm128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip, + int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km); +mm128_t *mg_lchain_rmq(int max_dist, int max_dist_inner, int bw, int max_chn_skip, int cap_rmq_size, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip, + int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km); + mm_reg1_t *mm_gen_regs(void *km, uint32_t hash, int qlen, int n_u, uint64_t *u, mm128_t *a); void mm_mark_alt(const mm_idx_t *mi, int n, mm_reg1_t *r); void mm_split_reg(mm_reg1_t *r, mm_reg1_t *r2, int n, int qlen, mm128_t *a); From b7f4d8a0f411b1b0e13fbed6fdf884ab9807af2d Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 2 May 2021 18:55:37 -0400 Subject: [PATCH 42/77] removed the old minimap2 chaining --- Makefile | 4 +- chain.c | 164 ------------------------------------------------------- map.c | 2 - 3 files changed, 1 insertion(+), 169 deletions(-) delete mode 100644 chain.c diff --git a/Makefile b/Makefile index d540940..4118616 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ CFLAGS= -g -Wall -O2 -Wc++-compat #-Wextra CPPFLAGS= -DHAVE_KALLOC INCLUDES= OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o \ - chain.o lchain.o align.o hit.o seed.o map.o format.o pe.o esterr.o splitidx.o \ + lchain.o align.o hit.o seed.o map.o format.o pe.o esterr.o splitidx.o \ ksw2_ll_sse.o PROG= minimap2 PROG_EXTRA= sdust minimap2-lite @@ -107,7 +107,6 @@ depend: align.o: minimap.h mmpriv.h bseq.h kseq.h ksw2.h kalloc.h bseq.o: bseq.h kvec.h kalloc.h kseq.h -chain.o: minimap.h mmpriv.h bseq.h kseq.h kalloc.h esterr.o: mmpriv.h minimap.h bseq.h kseq.h example.o: minimap.h kseq.h format.o: kalloc.h mmpriv.h minimap.h bseq.h kseq.h @@ -129,6 +128,5 @@ options.o: mmpriv.h minimap.h bseq.h kseq.h pe.o: mmpriv.h minimap.h bseq.h kseq.h kvec.h kalloc.h ksort.h sdust.o: kalloc.h kdq.h kvec.h sdust.h seed.o: mmpriv.h minimap.h bseq.h kseq.h kalloc.h ksort.h -self-chain.o: minimap.h kseq.h sketch.o: kvec.h kalloc.h mmpriv.h minimap.h bseq.h kseq.h splitidx.o: mmpriv.h minimap.h bseq.h kseq.h diff --git a/chain.c b/chain.c deleted file mode 100644 index a2f7ac5..0000000 --- a/chain.c +++ /dev/null @@ -1,164 +0,0 @@ -#include -#include -#include -#include "minimap.h" -#include "mmpriv.h" -#include "kalloc.h" - -static const char LogTable256[256] = { -#define LT(n) n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n - -1, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, - LT(4), LT(5), LT(5), LT(6), LT(6), LT(6), LT(6), - LT(7), LT(7), LT(7), LT(7), LT(7), LT(7), LT(7), LT(7) -}; - -static inline int ilog2_32(uint32_t v) -{ - uint32_t t, tt; - if ((tt = v>>16)) return (t = tt>>8) ? 24 + LogTable256[t] : 16 + LogTable256[tt]; - return (t = v>>8) ? 8 + LogTable256[t] : LogTable256[v]; -} - -mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float gap_scale, int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km) -{ // TODO: make sure this works when n has more than 32 bits - int32_t k, *f, *p, *t, *v, n_u, n_v; - int64_t i, j, st = 0; - uint64_t *u, *u2, sum_qspan = 0; - float avg_qspan; - mm128_t *b, *w; - - if (_u) *_u = 0, *n_u_ = 0; - if (n == 0 || a == 0) { - kfree(km, a); - return 0; - } - f = (int32_t*)kmalloc(km, n * 4); - p = (int32_t*)kmalloc(km, n * 4); - t = (int32_t*)kmalloc(km, n * 4); - v = (int32_t*)kmalloc(km, n * 4); - memset(t, 0, n * 4); - - for (i = 0; i < n; ++i) sum_qspan += a[i].y>>32&0xff; - avg_qspan = (float)sum_qspan / n; - - // fill the score and backtrack arrays - for (i = 0; i < n; ++i) { - uint64_t ri = a[i].x; - int64_t max_j = -1; - int32_t qi = (int32_t)a[i].y, q_span = a[i].y>>32&0xff; // NB: only 8 bits of span is used!!! - int32_t max_f = q_span, n_skip = 0, min_d; - int32_t sidi = (a[i].y & MM_SEED_SEG_MASK) >> MM_SEED_SEG_SHIFT; - while (st < i && ri > a[st].x + max_dist_x) ++st; - if (i - st > max_iter) st = i - max_iter; - for (j = i - 1; j >= st; --j) { - int64_t dr = ri - a[j].x; - int32_t dq = qi - (int32_t)a[j].y, dd, sc, log_dd, gap_cost; - int32_t sidj = (a[j].y & MM_SEED_SEG_MASK) >> MM_SEED_SEG_SHIFT; - if ((sidi == sidj && dr == 0) || dq <= 0) continue; // don't skip if an anchor is used by multiple segments; see below - if ((sidi == sidj && dq > max_dist_y) || dq > max_dist_x) continue; - dd = dr > dq? dr - dq : dq - dr; - if (sidi == sidj && dd > bw) continue; - if (n_segs > 1 && !is_cdna && sidi == sidj && dr > max_dist_y) continue; - min_d = dq < dr? dq : dr; - sc = min_d > q_span? q_span : dq < dr? dq : dr; - log_dd = dd? ilog2_32(dd) : 0; - gap_cost = 0; - if (is_cdna || sidi != sidj) { - int c_log, c_lin; - c_lin = (int)(dd * .01 * avg_qspan); - c_log = log_dd; - if (sidi != sidj && dr == 0) ++sc; // possibly due to overlapping paired ends; give a minor bonus - else if (dr > dq || sidi != sidj) gap_cost = c_lin < c_log? c_lin : c_log; - else gap_cost = c_lin + (c_log>>1); - } else gap_cost = (int)(dd * .01 * avg_qspan) + (log_dd>>1); - sc -= (int)((double)gap_cost * gap_scale + .499); - sc += f[j]; - if (sc > max_f) { - max_f = sc, max_j = j; - if (n_skip > 0) --n_skip; - } else if (t[j] == i) { - if (++n_skip > max_skip) - break; - } - if (p[j] >= 0) t[p[j]] = i; - } - f[i] = max_f, p[i] = max_j; - v[i] = max_j >= 0 && v[max_j] > max_f? v[max_j] : max_f; // v[] keeps the peak score up to i; f[] is the score ending at i, not always the peak - } - - // find the ending positions of chains - memset(t, 0, n * 4); - for (i = 0; i < n; ++i) - if (p[i] >= 0) t[p[i]] = 1; - for (i = n_u = 0; i < n; ++i) - if (t[i] == 0 && v[i] >= min_sc) - ++n_u; - if (n_u == 0) { - kfree(km, a); kfree(km, f); kfree(km, p); kfree(km, t); kfree(km, v); - return 0; - } - u = (uint64_t*)kmalloc(km, n_u * 8); - for (i = n_u = 0; i < n; ++i) { - if (t[i] == 0 && v[i] >= min_sc) { - j = i; - while (j >= 0 && f[j] < v[j]) j = p[j]; // find the peak that maximizes f[] - if (j < 0) j = i; // TODO: this should really be assert(j>=0) - u[n_u++] = (uint64_t)f[j] << 32 | j; - } - } - radix_sort_64(u, u + n_u); - for (i = 0; i < n_u>>1; ++i) { // reverse, s.t. the highest scoring chain is the first - uint64_t t = u[i]; - u[i] = u[n_u - i - 1], u[n_u - i - 1] = t; - } - - // backtrack - memset(t, 0, n * 4); - for (i = n_v = k = 0; i < n_u; ++i) { // starting from the highest score - int32_t n_v0 = n_v, k0 = k; - j = (int32_t)u[i]; - do { - v[n_v++] = j; - t[j] = 1; - j = p[j]; - } while (j >= 0 && t[j] == 0); - if (j < 0) { - if (n_v - n_v0 >= min_cnt) u[k++] = u[i]>>32<<32 | (n_v - n_v0); - } else if ((int32_t)(u[i]>>32) - f[j] >= min_sc) { - if (n_v - n_v0 >= min_cnt) u[k++] = ((u[i]>>32) - f[j]) << 32 | (n_v - n_v0); - } - if (k0 == k) n_v = n_v0; // no new chain added, reset - } - *n_u_ = n_u = k, *_u = u; // NB: note that u[] may not be sorted by score here - - // free temporary arrays - kfree(km, f); kfree(km, p); kfree(km, t); - - // write the result to b[] - b = (mm128_t*)kmalloc(km, n_v * sizeof(mm128_t)); - for (i = 0, k = 0; i < n_u; ++i) { - int32_t k0 = k, ni = (int32_t)u[i]; - for (j = 0; j < ni; ++j) - b[k] = a[v[k0 + (ni - j - 1)]], ++k; - } - kfree(km, v); - - // sort u[] and a[] by a[].x, such that adjacent chains may be joined (required by mm_join_long) - w = (mm128_t*)kmalloc(km, n_u * sizeof(mm128_t)); - for (i = k = 0; i < n_u; ++i) { - w[i].x = b[k].x, w[i].y = (uint64_t)k<<32|i; - k += (int32_t)u[i]; - } - radix_sort_128x(w, w + n_u); - u2 = (uint64_t*)kmalloc(km, n_u * 8); - for (i = k = 0; i < n_u; ++i) { - int32_t j = (int32_t)w[i].y, n = (int32_t)u[j]; - u2[i] = u[j]; - memcpy(&a[k], &b[w[i].y>>32], n * sizeof(mm128_t)); - k += n; - } - if (n_u) memcpy(u, u2, n_u * 8); - if (k) memcpy(b, a, k * sizeof(mm128_t)); // write _a_ to _b_ and deallocate _a_ because _a_ is oversized, sometimes a lot - kfree(km, a); kfree(km, w); kfree(km, u2); - return b; -} diff --git a/map.c b/map.c index cf730d9..48d1f09 100644 --- a/map.c +++ b/map.c @@ -271,7 +271,6 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char ** if (max_chain_gap_ref < opt->max_gap) max_chain_gap_ref = opt->max_gap; } else max_chain_gap_ref = opt->max_gap; -// a = mm_chain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, opt->chain_gap_scale, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km); a = mg_lchain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, opt->chain_gap_scale * 0.2f, 0.0f, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km); @@ -295,7 +294,6 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char ** kfree(b->km, mini_pos); if (opt->flag & MM_F_HEAP_SORT) a = collect_seed_hits_heap(b->km, opt, opt->max_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos); else a = collect_seed_hits(b->km, opt, opt->max_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos); -// a = mm_chain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, opt->chain_gap_scale, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km); a = mg_lchain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, opt->chain_gap_scale * 0.2f, 0.0f, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km); } From bbb4f97e52955e9574b516e211b91c02052432e1 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 3 May 2021 09:27:04 -0400 Subject: [PATCH 43/77] support RMQ --- main.c | 3 +++ map.c | 11 ++++++++--- minimap.h | 2 ++ options.c | 47 ++++++++++++++++++++++++++--------------------- 4 files changed, 39 insertions(+), 24 deletions(-) diff --git a/main.c b/main.c index 5234e86..246e0f6 100644 --- a/main.c +++ b/main.c @@ -71,6 +71,7 @@ static ko_longopt_t long_options[] = { { "alt", ko_required_argument, 344 }, { "alt-drop", ko_required_argument, 345 }, { "mask-len", ko_required_argument, 346 }, + { "rmq", ko_optional_argument, 347 }, { "help", ko_no_argument, 'h' }, { "max-intron-len", ko_required_argument, 'G' }, { "version", ko_no_argument, 'V' }, @@ -241,6 +242,8 @@ int main(int argc, char *argv[]) yes_or_no(&opt, MM_F_HEAP_SORT, o.longidx, o.arg, 1); } else if (c == 326) { // --dual yes_or_no(&opt, MM_F_NO_DUAL, o.longidx, o.arg, 0); + } else if (c == 347) { // --rmq + yes_or_no(&opt, MM_F_RMQ, o.longidx, o.arg, 1); } else if (c == 'S') { opt.flag |= MM_F_OUT_CS | MM_F_CIGAR | MM_F_OUT_CS_LONG; if (mm_verbose >= 2) diff --git a/map.c b/map.c index 48d1f09..fd648c4 100644 --- a/map.c +++ b/map.c @@ -271,10 +271,15 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char ** if (max_chain_gap_ref < opt->max_gap) max_chain_gap_ref = opt->max_gap; } else max_chain_gap_ref = opt->max_gap; - a = mg_lchain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, - opt->chain_gap_scale * 0.2f, 0.0f, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km); + if (opt->flag & MM_F_RMQ) { + a = mg_lchain_rmq(opt->max_gap, opt->rmq_inner_dist, opt->bw, opt->max_chain_skip, opt->rmq_size_cap, opt->min_cnt, opt->min_chain_score, + opt->chain_gap_scale * 0.2f, 0.0f, n_a, a, &n_regs0, &u, b->km); + } else { + a = mg_lchain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, + opt->chain_gap_scale * 0.2f, 0.0f, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km); + } - if (opt->max_occ > opt->mid_occ && rep_len > 0) { + if (opt->max_occ > opt->mid_occ && rep_len > 0 && !(opt->flag & MM_F_RMQ)) { int rechain = 0; if (n_regs0 > 0) { // test if the best chain has all the segments int n_chained_segs = 1, max = 0, max_i = -1, max_off = -1, off = 0; diff --git a/minimap.h b/minimap.h index 543a0ed..281ca3d 100644 --- a/minimap.h +++ b/minimap.h @@ -36,6 +36,7 @@ #define MM_F_NO_END_FLT 0x10000000 #define MM_F_HARD_MLEVEL 0x20000000 #define MM_F_SAM_HIT_ONLY 0x40000000 +#define MM_F_RMQ 0x80000000LL #define MM_I_HPC 0x1 #define MM_I_NO_SEQ 0x2 @@ -120,6 +121,7 @@ typedef struct { int min_cnt; // min number of minimizers on each chain int min_chain_score; // min chaining score float chain_gap_scale; + int rmq_size_cap, rmq_inner_dist; float mask_level; int mask_len; diff --git a/options.c b/options.c index 2044b62..4bba3ec 100644 --- a/options.c +++ b/options.c @@ -22,13 +22,15 @@ void mm_mapopt_init(mm_mapopt_t *opt) opt->min_cnt = 3; opt->min_chain_score = 40; opt->bw = 500; - opt->max_gap = 5000; + opt->max_gap = 10000; opt->max_gap_ref = -1; opt->max_chain_skip = 25; opt->max_chain_iter = 5000; + opt->rmq_inner_dist = 1000; + opt->rmq_size_cap = 100000; opt->chain_gap_scale = 1.0f; opt->max_max_occ = 4095; - opt->occ_dist = 0; + opt->occ_dist = 500; opt->mask_level = 0.5f; opt->mask_len = INT_MAX; @@ -51,6 +53,7 @@ void mm_mapopt_init(mm_mapopt_t *opt) opt->anchor_ext_len = 20, opt->anchor_ext_shift = 6; opt->max_clip_ratio = 1.0f; opt->mini_batch_size = 500000000; + opt->max_sw_mat = 100000000; opt->pe_ori = 0; // FF opt->pe_bonus = 33; @@ -81,19 +84,18 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) if (preset == 0) { mm_idxopt_init(io); mm_mapopt_init(mo); + } else if (strcmp(preset, "map-ont") == 0) { // this is the same as the default } else if (strcmp(preset, "ava-ont") == 0) { io->flag = 0, io->k = 15, io->w = 5; mo->flag |= MM_F_ALL_CHAINS | MM_F_NO_DIAG | MM_F_NO_DUAL | MM_F_NO_LJOIN; mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_gap = 10000, mo->max_chain_skip = 25; mo->bw = 2000; + } else if (strcmp(preset, "map10k") == 0 || strcmp(preset, "map-pb") == 0) { + io->flag |= MM_I_HPC, io->k = 19; } else if (strcmp(preset, "ava-pb") == 0) { io->flag |= MM_I_HPC, io->k = 19, io->w = 5; mo->flag |= MM_F_ALL_CHAINS | MM_F_NO_DIAG | MM_F_NO_DUAL | MM_F_NO_LJOIN; mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_gap = 10000, mo->max_chain_skip = 25; - } else if (strcmp(preset, "map10k") == 0 || strcmp(preset, "map-pb") == 0) { - io->flag |= MM_I_HPC, io->k = 19; - } else if (strcmp(preset, "map-ont") == 0) { - io->flag = 0, io->k = 15; } else if (strcmp(preset, "map-hifi") == 0 || strcmp(preset, "map-ccs") == 0) { io->flag = 0, io->k = 19, io->w = 19; mo->a = 1, mo->b = 4, mo->q = 6, mo->q2 = 26, mo->e = 2, mo->e2 = 1; @@ -101,24 +103,21 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) mo->occ_dist = 500; mo->min_mid_occ = 100, mo->max_mid_occ = 500; mo->min_dp_max = 200; - } else if (strcmp(preset, "asm5") == 0) { + } else if (strncmp(preset, "asm", 3) == 0) { io->flag = 0, io->k = 19, io->w = 19; - mo->a = 1, mo->b = 19, mo->q = 39, mo->q2 = 81, mo->e = 3, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200; - mo->min_mid_occ = 100; - mo->min_dp_max = 200; - mo->best_n = 50; - } else if (strcmp(preset, "asm10") == 0) { - io->flag = 0, io->k = 19, io->w = 19; - mo->a = 1, mo->b = 9, mo->q = 16, mo->q2 = 41, mo->e = 2, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200; - mo->min_mid_occ = 100; - mo->min_dp_max = 200; - mo->best_n = 50; - } else if (strcmp(preset, "asm20") == 0) { - io->flag = 0, io->k = 19, io->w = 10; - mo->a = 1, mo->b = 4, mo->q = 6, mo->q2 = 26, mo->e = 2, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200; - mo->min_mid_occ = 100; + mo->bw = 100000; + mo->flag |= MM_F_RMQ | MM_F_NO_LJOIN; + mo->min_mid_occ = 100, mo->max_mid_occ = 500; mo->min_dp_max = 200; mo->best_n = 50; + if (strcmp(preset, "asm5") == 0) { + mo->a = 1, mo->b = 19, mo->q = 39, mo->q2 = 81, mo->e = 3, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200; + } else if (strcmp(preset, "asm10") == 0) { + mo->a = 1, mo->b = 9, mo->q = 16, mo->q2 = 41, mo->e = 2, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200; + } else if (strcmp(preset, "asm20") == 0) { + mo->a = 1, mo->b = 4, mo->q = 6, mo->q2 = 26, mo->e = 2, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200; + io->w = 10; + } else return -1; } else if (strcmp(preset, "short") == 0 || strcmp(preset, "sr") == 0) { io->flag = 0, io->k = 21, io->w = 11; mo->flag |= MM_F_SR | MM_F_FRAG_MODE | MM_F_NO_PRINT_2ND | MM_F_2_IO_THREADS | MM_F_HEAP_SORT; @@ -140,6 +139,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) } else if (strncmp(preset, "splice", 6) == 0 || strcmp(preset, "cdna") == 0) { io->flag = 0, io->k = 15, io->w = 5; mo->flag |= MM_F_SPLICE | MM_F_SPLICE_FOR | MM_F_SPLICE_REV | MM_F_SPLICE_FLANK; + mo->max_sw_mat = 0; mo->max_gap = 2000, mo->max_gap_ref = mo->bw = 200000; mo->a = 1, mo->b = 2, mo->q = 2, mo->e = 1, mo->q2 = 32, mo->e2 = 0; mo->noncan = 9; @@ -153,6 +153,11 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) int mm_check_opt(const mm_idxopt_t *io, const mm_mapopt_t *mo) { + if ((mo->flag & MM_F_RMQ) && (mo->flag & (MM_F_SR|MM_F_SPLICE))) { + if (mm_verbose >= 1) + fprintf(stderr, "[ERROR]\033[1;31m --rmq doesn't work with --sr or --splice\033[0m\n"); + return -7; + } if (mo->split_prefix && (mo->flag & (MM_F_OUT_CS|MM_F_OUT_MD))) { if (mm_verbose >= 1) fprintf(stderr, "[ERROR]\033[1;31m --cs or --MD doesn't work with --split-prefix\033[0m\n"); From 6c96078ed02de77d5575a183f097a5518a6bb12a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 3 May 2021 22:51:34 -0400 Subject: [PATCH 44/77] r1034: changed multiple defaults; updated manpage --- main.c | 5 +-- minimap2.1 | 98 ++++++++++++++++++++++++++++-------------------------- options.c | 25 ++++++++------ 3 files changed, 68 insertions(+), 60 deletions(-) diff --git a/main.c b/main.c index 246e0f6..57e4173 100644 --- a/main.c +++ b/main.c @@ -7,7 +7,7 @@ #include "mmpriv.h" #include "ketopt.h" -#define MM_VERSION "2.18-r1028-dirty" +#define MM_VERSION "2.18-r1034-dirty" #ifdef __linux__ #include @@ -339,7 +339,8 @@ int main(int argc, char *argv[]) fprintf(fp_help, " --version show version number\n"); fprintf(fp_help, " Preset:\n"); fprintf(fp_help, " -x STR preset (always applied before other options; see minimap2.1 for details) []\n"); - fprintf(fp_help, " - map-pb/map-ont - PacBio/Nanopore vs reference mapping\n"); + fprintf(fp_help, " - map-pb/map-ont - PacBio CLR/Nanopore vs reference mapping\n"); + fprintf(fp_help, " - map-hifi - PacBio HiFi reads vs reference mapping\n"); fprintf(fp_help, " - ava-pb/ava-ont - PacBio/Nanopore read overlap\n"); fprintf(fp_help, " - asm5/asm10/asm20 - asm-to-ref mapping, for ~0.1/1/5%% sequence divergence\n"); fprintf(fp_help, " - splice/splice:hq - long-read/Pacbio-CCS spliced alignment\n"); diff --git a/minimap2.1 b/minimap2.1 index 66ea41a..c7f5a07 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -145,18 +145,25 @@ or .B -xsr mode, which sets the threshold for a second round of seeding. .TP -.BI --min-occ-floor \ INT -Force minimap2 to always use k-mers occurring -.I INT -times or less [0]. In effect, the max occurrence threshold is set to -the -.RI max{ INT , -.BR -f }. +.BI -U \ INT1 [, INT2 ] +Lower and upper bounds of k-mer occurrences [10,1000000]. The final k-mer occurrence threshold is +.RI max{ INT1 ,\ min{ INT2 , +.BR -f }}. +This option prevents excessively small or large +.B -f +estimated from the input reference. It deprecates +.B --min-occ-floor +in earlier versions of minimap2. .TP -.BI -g \ INT +.BI -e \ INT +Sample a high-frequency minimizer every +.I INT +basepairs [500]. +.TP +.BI -g \ NUM Stop chain enlongation if there are no minimizers within -.IR INT -bp -[10000]. +.IR NUM -bp +[10k]. .TP .BI -r \ INT Bandwidth used in chaining and DP-based alignment [500]. This option @@ -234,6 +241,10 @@ Mark as secondary a chain that overlaps with a better chain by .I FLOAT or more of the shorter chain [0.5] .TP +.BR --rmq = no | yes +Use the minigraph chaining algorithm [no]. The minigraph algorithm is better +for aligning contigs through long INDELs. +.TP .B --hard-mask-level Honor option .B -M @@ -412,7 +423,7 @@ alignment. .BI --cap-sw-mem \ NUM Skip alignment if the DP matrix size is above .IR NUM . -Set 0 to disable [0]. +Set 0 to disable [100m]. .SS Input/output options .TP 10 .B -a @@ -523,66 +534,47 @@ Available .I STR are: .RS -.TP 9 -.B map-pb -PacBio/Oxford Nanopore read to reference mapping -.RB ( -Hk19 ) -.TP +.TP 10 .B map-ont -Slightly more sensitive for Oxford Nanopore to reference mapping -.RB ( -k15 ). -For PacBio reads, HPC minimizers consistently leads to faster performance and -more sensitive results in comparison to normal minimizers. For Oxford Nanopore -data, normal minimizers are better, though not much. The effectiveness of HPC -is determined by the sequencing error mode. +Align noisy long reads of ~10% error rate to a reference genome. This is the +default mode. .TP .B map-hifi -PacBio HiFi reads to reference mapping +Align PacBio high-fidelity (HiFi) reads to a reference genome .RB ( -k19 -.B -w10 -A1 -B4 -O6,26 -E2,1 -s200 -e100 -g10k -.BR -U100,500 ). +.B -w19 -U50,500 -A1 -B4 -O6,26 -E2,1 +.BR -s200 ). +.TP +.B map-pb +Align older PacBio continuous long (CLR) reads to a reference genome +.RB ( -Hk19 ). .TP .B asm5 Long assembly to reference mapping .RB ( -k19 -.B -w19 -A1 -B19 -O39,81 -E3,1 -s200 -z200 -N50 -.BR --min-occ-floor=100 ). +.B -w19 -U50,500 --rmq -r100k --no-long-join -A1 -B19 -O39,81 -E3,1 -s200 -z200 +.BR -N50 ). Typically, the alignment will not extend to regions with 5% or higher sequence divergence. Only use this preset if the average divergence is far below 5%. .TP .B asm10 Long assembly to reference mapping .RB ( -k19 -.B -w19 -A1 -B9 -O16,41 -E2,1 -s200 -z200 -N50 -.BR --min-occ-floor=100 ). +.B -w19 -U50,500 --rmq -r100k --no-long-join -A1 -B9 -O16,41 -E2,1 -s200 -z200 +.BR -N50 ). Up to 10% sequence divergence. .TP .B asm20 Long assembly to reference mapping .RB ( -k19 -.B -w10 -A1 -B4 -O6,26 -E2,1 -s200 -z200 -N50 -.BR --min-occ-floor=100 ). +.B -w10 -U50,500 --rmq -r100k --no-long-join -A1 -B4 -O6,26 -E2,1 -s200 -z200 +.BR -N50 ). Up to 20% sequence divergence. .TP -.B ava-pb -PacBio all-vs-all overlap mapping -.RB ( -Hk19 -.B -Xw5 -m100 -g10000 --max-chain-skip -.BR 25 ). -.TP -.B ava-ont -Oxford Nanopore all-vs-all overlap mapping -.RB ( -k15 -.B -Xw5 -m100 -g10000 -r2000 --max-chain-skip -.BR 25 ). -Similarly, the major difference from -.B ava-pb -is that this preset is not using HPC minimizers. -.TP .B splice Long-read spliced alignment .RB ( -k15 -.B -w5 --splice -g2000 -G200k -A1 -B2 -O2,32 -E1,0 -C9 -z200 -ub --junc-bonus=9 +.B -w5 --splice -g2000 -G200k -A1 -B2 -O2,32 -E1,0 -C9 -z200 -ub --junc-bonus=9 --cap-sw-mem=0 .BR --splice-flank=yes ). In the splice mode, 1) long deletions are taken as introns and represented as the @@ -604,6 +596,18 @@ Short single-end reads without splicing .B -w11 --sr --frag=yes -A2 -B8 -O12,32 -E2,1 -r50 -p.5 -N20 -f1000,5000 -n2 -m20 .B -s40 -g200 -2K50m --heap-sort=yes .BR --secondary=no ). +.TP +.B ava-pb +PacBio CLR all-vs-all overlap mapping +.RB ( -Hk19 +.B -Xw5 -e0 +.BR -m100 ). +.TP +.B ava-ont +Oxford Nanopore all-vs-all overlap mapping +.RB ( -k15 +.B -Xw5 -e0 -m100 +.BR -r2k ). .RE .SS Miscellaneous options .TP 10 diff --git a/options.c b/options.c index 4bba3ec..a64a071 100644 --- a/options.c +++ b/options.c @@ -16,7 +16,8 @@ void mm_mapopt_init(mm_mapopt_t *opt) memset(opt, 0, sizeof(mm_mapopt_t)); opt->seed = 11; opt->mid_occ_frac = 2e-4f; - opt->max_mid_occ = 1000000000; + opt->min_mid_occ = 10; + opt->max_mid_occ = 1000000; opt->sdust_thres = 0; // no SDUST masking opt->min_cnt = 3; @@ -63,12 +64,13 @@ void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi) { if ((opt->flag & MM_F_SPLICE_FOR) || (opt->flag & MM_F_SPLICE_REV)) opt->flag |= MM_F_SPLICE; - if (opt->mid_occ <= 0) + if (opt->mid_occ <= 0) { opt->mid_occ = mm_idx_cal_max_occ(mi, opt->mid_occ_frac); - if (opt->mid_occ < opt->min_mid_occ) - opt->mid_occ = opt->min_mid_occ; - if (opt->max_mid_occ > opt->min_mid_occ && opt->mid_occ > opt->max_mid_occ) - opt->mid_occ = opt->max_mid_occ; + if (opt->mid_occ < opt->min_mid_occ) + opt->mid_occ = opt->min_mid_occ; + if (opt->max_mid_occ > opt->min_mid_occ && opt->mid_occ > opt->max_mid_occ) + opt->mid_occ = opt->max_mid_occ; + } if (mm_verbose >= 3) fprintf(stderr, "[M::%s::%.3f*%.2f] mid_occ = %d\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), opt->mid_occ); } @@ -88,26 +90,27 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) } else if (strcmp(preset, "ava-ont") == 0) { io->flag = 0, io->k = 15, io->w = 5; mo->flag |= MM_F_ALL_CHAINS | MM_F_NO_DIAG | MM_F_NO_DUAL | MM_F_NO_LJOIN; - mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_gap = 10000, mo->max_chain_skip = 25; + mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_chain_skip = 25; mo->bw = 2000; + mo->occ_dist = 0; } else if (strcmp(preset, "map10k") == 0 || strcmp(preset, "map-pb") == 0) { io->flag |= MM_I_HPC, io->k = 19; } else if (strcmp(preset, "ava-pb") == 0) { io->flag |= MM_I_HPC, io->k = 19, io->w = 5; mo->flag |= MM_F_ALL_CHAINS | MM_F_NO_DIAG | MM_F_NO_DUAL | MM_F_NO_LJOIN; - mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_gap = 10000, mo->max_chain_skip = 25; + mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_chain_skip = 25; + mo->occ_dist = 0; } else if (strcmp(preset, "map-hifi") == 0 || strcmp(preset, "map-ccs") == 0) { io->flag = 0, io->k = 19, io->w = 19; mo->a = 1, mo->b = 4, mo->q = 6, mo->q2 = 26, mo->e = 2, mo->e2 = 1; - mo->max_gap = 10000; mo->occ_dist = 500; - mo->min_mid_occ = 100, mo->max_mid_occ = 500; + mo->min_mid_occ = 50, mo->max_mid_occ = 500; mo->min_dp_max = 200; } else if (strncmp(preset, "asm", 3) == 0) { io->flag = 0, io->k = 19, io->w = 19; mo->bw = 100000; mo->flag |= MM_F_RMQ | MM_F_NO_LJOIN; - mo->min_mid_occ = 100, mo->max_mid_occ = 500; + mo->min_mid_occ = 50, mo->max_mid_occ = 500; mo->min_dp_max = 200; mo->best_n = 50; if (strcmp(preset, "asm5") == 0) { From ac146fe7bca8fbe81ab79bcc531f9198f56cab54 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 10 May 2021 12:36:48 -0400 Subject: [PATCH 45/77] r1035: failed to index under the --frag=yes mode Resolves #734 --- main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/main.c b/main.c index 5234e86..c4144ed 100644 --- a/main.c +++ b/main.c @@ -7,7 +7,7 @@ #include "mmpriv.h" #include "ketopt.h" -#define MM_VERSION "2.18-r1028-dirty" +#define MM_VERSION "2.18-r1035-dirty" #ifdef __linux__ #include @@ -393,6 +393,7 @@ int main(int argc, char *argv[]) if (mm_verbose >= 3) mm_idx_stat(mi); if (junc_bed) mm_idx_bed_read(mi, junc_bed, 1); if (alt_list) mm_idx_alt_read(mi, alt_list); + if (argc - (o.ind + 1) == 0) continue; // no query files ret = 0; if (!(opt.flag & MM_F_FRAG_MODE)) { for (i = o.ind + 1; i < argc; ++i) { From 76df351fa8e0049fef6007feb31fbe822e8c5ec3 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 10 May 2021 12:55:58 -0400 Subject: [PATCH 46/77] added github action --- .github/workflows/ci.yaml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 .github/workflows/ci.yaml diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml new file mode 100644 index 0000000..628acea --- /dev/null +++ b/.github/workflows/ci.yaml @@ -0,0 +1,21 @@ +name: CI + +on: + push: + branches: + - master + pull_request: + +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + compiler: [gcc, clang] + + steps: + - name: Checkout minimap2 + uses: actions/checkout@v2 + + - name: Compile with ${{ matrix.compiler }} + run: make CC=${{ matrix.compiler }} From 42a11e1d588be2ce4492528b0fa68e54dc2b6b87 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 10 May 2021 12:58:25 -0400 Subject: [PATCH 47/77] Changed Travis to Github Actions in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0d40147..dafb655 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ [![GitHub Downloads](https://img.shields.io/github/downloads/lh3/minimap2/total.svg?style=social&logo=github&label=Download)](https://github.com/lh3/minimap2/releases) [![BioConda Install](https://img.shields.io/conda/dn/bioconda/minimap2.svg?style=flag&label=BioConda%20install)](https://anaconda.org/bioconda/minimap2) [![PyPI](https://img.shields.io/pypi/v/mappy.svg?style=flat)](https://pypi.python.org/pypi/mappy) -[![Build Status](https://travis-ci.org/lh3/minimap2.svg?branch=master)](https://travis-ci.org/lh3/minimap2) +[![Build Status](https://github.com/lh3/minimap2/actions/workflows/ci.yaml/badge.svg)](https://github.com/lh3/minimap2/actions) ## Getting Started ```sh git clone https://github.com/lh3/minimap2 From 4422c0c6f93ab005874604b7c51575780b0d49eb Mon Sep 17 00:00:00 2001 From: Don Kirkby Date: Fri, 14 May 2021 12:16:12 -0700 Subject: [PATCH 48/77] Typo fix in Python README. Thanks for sharing minimap2. Here's a little fix. --- python/README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/README.rst b/python/README.rst index 3d0fae8..3082980 100644 --- a/python/README.rst +++ b/python/README.rst @@ -144,7 +144,7 @@ properties: * **mlen**: length of the matching bases in the alignment, excluding ambiguous base matches. -* **NM**: number of mismatches, gaps and ambiguous poistions in the alignment +* **NM**: number of mismatches, gaps and ambiguous positions in the alignment * **trans_strand**: transcript strand. +1 if on the forward strand; -1 if on the reverse strand; 0 if unknown From 10502e2a78dcc7be87b0a3ecb0e70900b386e30a Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Thu, 13 May 2021 17:55:47 +0200 Subject: [PATCH 49/77] Fixed typo in Readme.md deltion -> deletion --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index dafb655..9865b93 100644 --- a/README.md +++ b/README.md @@ -274,7 +274,7 @@ CGATCGATAAATAGAGTAG---GAATAGCA CGATCG---AATAGAGTAGGTCGAATtGCA ``` is represented as `:6-ata:10+gtc:4*at:3`, where `:[0-9]+` represents an -identical block, `-ata` represents a deltion, `+gtc` an insertion and `*at` +identical block, `-ata` represents a deletion, `+gtc` an insertion and `*at` indicates reference base `a` is substituted with a query base `t`. It is similar to the `MD` SAM tag but is standalone and easier to parse. From 2a3793bbd264f241bc79af1793dcf6c839fd9edb Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 23 May 2021 13:08:15 -0400 Subject: [PATCH 50/77] clarify that map-hifi is for HEAD only Resolves #747 --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index dafb655..3e5e0bc 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,8 @@ cd minimap2 && make # use presets (no test data) ./minimap2 -ax map-pb ref.fa pacbio.fq.gz > aln.sam # PacBio CLR genomic reads ./minimap2 -ax map-ont ref.fa ont.fq.gz > aln.sam # Oxford Nanopore genomic reads -./minimap2 -ax map-hifi ref.fa pacbio-ccs.fq.gz > aln.sam # PacBio HiFi/CCS genomic reads +./minimap2 -ax asm20 ref.fa pacbio-ccs.fq.gz > aln.sam # PacBio HiFi/CCS genomic reads (v2.18) +./minimap2 -ax map-hifi ref.fa pacbio-ccs.fq.gz > aln.sam # PacBio HiFi/CCS genomic reads (GitHub HEAD) ./minimap2 -ax sr ref.fa read1.fa read2.fa > aln.sam # short genomic paired-end reads ./minimap2 -ax splice ref.fa rna-reads.fa > aln.sam # spliced long reads (strand unknown) ./minimap2 -ax splice -uf -k14 ref.fa reads.fa > aln.sam # noisy Nanopore Direct RNA-seq From cd66777bfbcfe044a0d0b50051244f30404b8185 Mon Sep 17 00:00:00 2001 From: Ariel Erijman Date: Wed, 19 May 2021 13:58:14 -0400 Subject: [PATCH 51/77] small typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index bd40bce..10514a8 100644 --- a/README.md +++ b/README.md @@ -254,7 +254,7 @@ To avoid this issue, you can add option `-L` at the minimap2 command line. This option moves a long CIGAR to the `CG` tag and leaves a fully clipped CIGAR at the SAM CIGAR column. Current tools that don't read CIGAR (e.g. merging and sorting) still work with such BAM records; tools that read CIGAR will -effectively ignore these records. It has been decided that future tools will +effectively ignore these records. It has been decided that future tools will seamlessly recognize long-cigar records generated by option `-L`. **TL;DR**: if you work with ultra-long reads and use tools that only process From 9e7247cff938b09fa7ae4ba78810f85497df2bb9 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 23 May 2021 21:00:15 -0400 Subject: [PATCH 52/77] sync python with recent changes --- python/cmappy.pxd | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/python/cmappy.pxd b/python/cmappy.pxd index 459b246..c864f55 100644 --- a/python/cmappy.pxd +++ b/python/cmappy.pxd @@ -13,7 +13,9 @@ cdef extern from "minimap.h": int64_t flag int seed int sdust_thres + int max_qlen + int bw int max_gap, max_gap_ref int max_frag_len @@ -21,14 +23,19 @@ cdef extern from "minimap.h": int min_cnt int min_chain_score float chain_gap_scale + int rmq_size_cap, rmq_inner_dist + float mask_level int mask_len float pri_ratio int best_n + int max_join_long, max_join_short int min_join_flank_sc float min_join_flank_ratio + float alt_drop + int a, b, q, e, q2, e2 int sc_ambi int noncan @@ -39,13 +46,16 @@ cdef extern from "minimap.h": int min_ksw_len int anchor_ext_len, anchor_ext_shift float max_clip_ratio + int pe_ori, pe_bonus + float mid_occ_frac int32_t min_mid_occ int32_t mid_occ int32_t max_occ int64_t mini_batch_size int64_t max_sw_mat + const char *split_prefix int mm_set_opt(char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) From ec3bc6efd778376177367ed93ed2f8ab6cdcc04d Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 24 May 2021 13:18:17 -0400 Subject: [PATCH 53/77] r1047: make gap penalty proportional to k-mer closer to the older minimap2 --- main.c | 2 +- map.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/main.c b/main.c index e5a50b6..390adf1 100644 --- a/main.c +++ b/main.c @@ -7,7 +7,7 @@ #include "mmpriv.h" #include "ketopt.h" -#define MM_VERSION "2.18-r1035-dirty" +#define MM_VERSION "2.18-r1047-dirty" #ifdef __linux__ #include diff --git a/map.c b/map.c index fd648c4..b974155 100644 --- a/map.c +++ b/map.c @@ -273,10 +273,10 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char ** if (opt->flag & MM_F_RMQ) { a = mg_lchain_rmq(opt->max_gap, opt->rmq_inner_dist, opt->bw, opt->max_chain_skip, opt->rmq_size_cap, opt->min_cnt, opt->min_chain_score, - opt->chain_gap_scale * 0.2f, 0.0f, n_a, a, &n_regs0, &u, b->km); + opt->chain_gap_scale * 0.01 * mi->k, 0.0f, n_a, a, &n_regs0, &u, b->km); } else { a = mg_lchain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, - opt->chain_gap_scale * 0.2f, 0.0f, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km); + opt->chain_gap_scale * 0.01 * mi->k, 0.0f, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km); } if (opt->max_occ > opt->mid_occ && rep_len > 0 && !(opt->flag & MM_F_RMQ)) { @@ -300,7 +300,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char ** if (opt->flag & MM_F_HEAP_SORT) a = collect_seed_hits_heap(b->km, opt, opt->max_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos); else a = collect_seed_hits(b->km, opt, opt->max_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos); a = mg_lchain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, - opt->chain_gap_scale * 0.2f, 0.0f, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km); + opt->chain_gap_scale * 0.01 * mi->k, 0.0f, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km); } } b->frag_gap = max_chain_gap_ref; From 4f91558160be3b6595b18e013984a50ed3ffae67 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 24 May 2021 16:09:09 -0400 Subject: [PATCH 54/77] r1048: rescue long gaps --- align.c | 6 ++++-- main.c | 2 +- map.c | 10 ++++++++++ minimap.h | 2 +- options.c | 13 +++++++------ python/cmappy.pxd | 2 +- 6 files changed, 24 insertions(+), 11 deletions(-) diff --git a/align.c b/align.c index 6028abb..f2f154f 100644 --- a/align.c +++ b/align.c @@ -567,7 +567,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int int is_sr = !!(opt->flag & MM_F_SR), is_splice = !!(opt->flag & MM_F_SPLICE); int32_t rid = a[r->as].x<<1>>33, rev = a[r->as].x>>63, as1, cnt1; uint8_t *tseq, *qseq, *junc; - int32_t i, l, bw, dropped = 0, extra_flag = 0, rs0, re0, qs0, qe0; + int32_t i, l, bw, bw_long, dropped = 0, extra_flag = 0, rs0, re0, qs0, qe0; int32_t rs, re, qs, qe; int32_t rs1, qs1, re1, qe1; int8_t mat[25]; @@ -578,6 +578,8 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int if (r->cnt == 0) return; ksw_gen_simple_mat(5, mat, opt->a, opt->b, opt->sc_ambi); bw = (int)(opt->bw * 1.5 + 1.); + bw_long = (int)(opt->bw_long * 1.5 + 1.); + if (bw_long < bw) bw_long = bw; if (is_sr && !(mi->flag & MM_I_HPC)) { mm_max_stretch(r, a, &as1, &cnt1); @@ -714,7 +716,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int } else mm_adjust_minier(mi, qseq0, &a[as1 + i], &re, &qe); re1 = re, qe1 = qe; if (i == cnt1 - 1 || (a[as1+i].y&MM_SEED_LONG_JOIN) || (qe - qs >= opt->min_ksw_len && re - rs >= opt->min_ksw_len)) { - int j, bw1 = bw, zdrop_code; + int j, bw1 = bw_long, zdrop_code; if (a[as1+i].y & MM_SEED_LONG_JOIN) bw1 = qe - qs > re - rs? qe - qs : re - rs; // perform alignment diff --git a/main.c b/main.c index 390adf1..27470e7 100644 --- a/main.c +++ b/main.c @@ -7,7 +7,7 @@ #include "mmpriv.h" #include "ketopt.h" -#define MM_VERSION "2.18-r1047-dirty" +#define MM_VERSION "2.18-r1048-dirty" #ifdef __linux__ #include diff --git a/map.c b/map.c index b974155..42b0eaa 100644 --- a/map.c +++ b/map.c @@ -302,6 +302,16 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char ** a = mg_lchain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, opt->chain_gap_scale * 0.01 * mi->k, 0.0f, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km); } + } else if (opt->bw_long > opt->bw && !(opt->flag & MM_F_RMQ) && n_segs == 1 && n_regs0 > 1) { + int32_t st = (int32_t)a[0].y, en = (int32_t)a[(int32_t)u[0] - 1].y; + if (qlen_sum - (en - st) > 1000 || en - st > qlen_sum * .1) { + int32_t i; + for (i = 0, n_a = 0; i < n_regs0; ++i) n_a += (int32_t)u[i]; + kfree(b->km, u); + radix_sort_128x(a, a + n_a); + a = mg_lchain_rmq(opt->max_gap, opt->rmq_inner_dist, opt->bw_long, opt->max_chain_skip, opt->rmq_size_cap, opt->min_cnt, opt->min_chain_score, + opt->chain_gap_scale * 0.01 * mi->k, 0.0f, n_a, a, &n_regs0, &u, b->km); + } } b->frag_gap = max_chain_gap_ref; b->rep_len = rep_len; diff --git a/minimap.h b/minimap.h index 281ca3d..10a4b60 100644 --- a/minimap.h +++ b/minimap.h @@ -114,7 +114,7 @@ typedef struct { int max_qlen; // max query length - int bw; // bandwidth + int bw, bw_long; // bandwidth int max_gap, max_gap_ref; // break a chain if there are no minimizers in a max_gap window int max_frag_len; int max_chain_skip, max_chain_iter; diff --git a/options.c b/options.c index a64a071..6a3e780 100644 --- a/options.c +++ b/options.c @@ -22,14 +22,14 @@ void mm_mapopt_init(mm_mapopt_t *opt) opt->min_cnt = 3; opt->min_chain_score = 40; - opt->bw = 500; + opt->bw = 500, opt->bw_long = 20000; opt->max_gap = 10000; opt->max_gap_ref = -1; opt->max_chain_skip = 25; opt->max_chain_iter = 5000; opt->rmq_inner_dist = 1000; opt->rmq_size_cap = 100000; - opt->chain_gap_scale = 1.0f; + opt->chain_gap_scale = 0.8f; opt->max_max_occ = 4095; opt->occ_dist = 500; @@ -91,7 +91,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) io->flag = 0, io->k = 15, io->w = 5; mo->flag |= MM_F_ALL_CHAINS | MM_F_NO_DIAG | MM_F_NO_DUAL | MM_F_NO_LJOIN; mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_chain_skip = 25; - mo->bw = 2000; + mo->bw = mo->bw_long = 2000; mo->occ_dist = 0; } else if (strcmp(preset, "map10k") == 0 || strcmp(preset, "map-pb") == 0) { io->flag |= MM_I_HPC, io->k = 19; @@ -99,6 +99,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) io->flag |= MM_I_HPC, io->k = 19, io->w = 5; mo->flag |= MM_F_ALL_CHAINS | MM_F_NO_DIAG | MM_F_NO_DUAL | MM_F_NO_LJOIN; mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_chain_skip = 25; + mo->bw_long = mo->bw; mo->occ_dist = 0; } else if (strcmp(preset, "map-hifi") == 0 || strcmp(preset, "map-ccs") == 0) { io->flag = 0, io->k = 19, io->w = 19; @@ -108,7 +109,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) mo->min_dp_max = 200; } else if (strncmp(preset, "asm", 3) == 0) { io->flag = 0, io->k = 19, io->w = 19; - mo->bw = 100000; + mo->bw = mo->bw_long = 100000; mo->flag |= MM_F_RMQ | MM_F_NO_LJOIN; mo->min_mid_occ = 50, mo->max_mid_occ = 500; mo->min_dp_max = 200; @@ -130,7 +131,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) mo->end_bonus = 10; mo->max_frag_len = 800; mo->max_gap = 100; - mo->bw = 100; + mo->bw = mo->bw_long = 100; mo->pri_ratio = 0.5f; mo->min_cnt = 2; mo->min_chain_score = 25; @@ -143,7 +144,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) io->flag = 0, io->k = 15, io->w = 5; mo->flag |= MM_F_SPLICE | MM_F_SPLICE_FOR | MM_F_SPLICE_REV | MM_F_SPLICE_FLANK; mo->max_sw_mat = 0; - mo->max_gap = 2000, mo->max_gap_ref = mo->bw = 200000; + mo->max_gap = 2000, mo->max_gap_ref = mo->bw = mo->bw_long = 200000; mo->a = 1, mo->b = 2, mo->q = 2, mo->e = 1, mo->q2 = 32, mo->e2 = 0; mo->noncan = 9; mo->junc_bonus = 9; diff --git a/python/cmappy.pxd b/python/cmappy.pxd index c864f55..0c8c57f 100644 --- a/python/cmappy.pxd +++ b/python/cmappy.pxd @@ -16,7 +16,7 @@ cdef extern from "minimap.h": int max_qlen - int bw + int bw, bw_long int max_gap, max_gap_ref int max_frag_len int max_chain_skip, max_chain_iter From 379728726a084d0921a5f6396df36ae530e66ac5 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 24 May 2021 16:21:40 -0400 Subject: [PATCH 55/77] r1049: removed the long-join heuristics --- hit.c | 58 ----------------------------------------------- main.c | 7 +++--- map.c | 4 +--- minimap.h | 4 ---- mmpriv.h | 1 - options.c | 5 ---- python/cmappy.pxd | 4 ---- 7 files changed, 5 insertions(+), 78 deletions(-) diff --git a/hit.c b/hit.c index 88bbc03..9a12afb 100644 --- a/hit.c +++ b/hit.c @@ -312,64 +312,6 @@ int mm_squeeze_a(void *km, int n_regs, mm_reg1_t *regs, mm128_t *a) return as; } -void mm_join_long(void *km, const mm_mapopt_t *opt, int qlen, int *n_regs_, mm_reg1_t *regs, mm128_t *a) -{ - int i, n_aux, n_regs = *n_regs_, n_drop = 0; - uint64_t *aux; - - if (n_regs < 2) return; // nothing to join - mm_squeeze_a(km, n_regs, regs, a); - - aux = (uint64_t*)kmalloc(km, n_regs * 8); - for (i = n_aux = 0; i < n_regs; ++i) - if (regs[i].parent == i || regs[i].parent < 0) - aux[n_aux++] = (uint64_t)regs[i].as << 32 | i; - radix_sort_64(aux, aux + n_aux); - - for (i = n_aux - 1; i >= 1; --i) { - mm_reg1_t *r0 = ®s[(int32_t)aux[i-1]], *r1 = ®s[(int32_t)aux[i]]; - mm128_t *a0e, *a1s; - int max_gap, min_gap, sc_thres, min_flank_len; - - // test - if (r0->as + r0->cnt != r1->as) continue; // not adjacent in a[] - if (r0->rid != r1->rid || r0->rev != r1->rev) continue; // make sure on the same target and strand - a0e = &a[r0->as + r0->cnt - 1]; - a1s = &a[r1->as]; - if (a1s->x <= a0e->x || (int32_t)a1s->y <= (int32_t)a0e->y) continue; // keep colinearity - max_gap = min_gap = (int32_t)a1s->y - (int32_t)a0e->y; - max_gap = a0e->x + max_gap > a1s->x? max_gap : a1s->x - a0e->x; - min_gap = a0e->x + min_gap < a1s->x? min_gap : a1s->x - a0e->x; - if (max_gap > opt->max_join_long || min_gap > opt->max_join_short) continue; - sc_thres = (int)((float)opt->min_join_flank_sc / opt->max_join_long * max_gap + .499); - if (r0->score < sc_thres || r1->score < sc_thres) continue; // require good flanking chains - min_flank_len = (int)(max_gap * opt->min_join_flank_ratio); - if (r0->re - r0->rs < min_flank_len || r0->qe - r0->qs < min_flank_len) continue; // require enough flanking length - if (r1->re - r1->rs < min_flank_len || r1->qe - r1->qs < min_flank_len) continue; - - // all conditions satisfied; join - a[r1->as].y |= MM_SEED_LONG_JOIN; - r0->cnt += r1->cnt, r0->score += r1->score; - mm_reg_set_coor(r0, qlen, a); - r1->cnt = 0; - r1->parent = r0->id; - ++n_drop; - } - kfree(km, aux); - - if (n_drop > 0) { // then fix the hits hierarchy - for (i = 0; i < n_regs; ++i) { // adjust the mm_reg1_t::parent - mm_reg1_t *r = ®s[i]; - if (r->parent >= 0 && r->id != r->parent) { // fix for secondary hits only - if (regs[r->parent].parent >= 0 && regs[r->parent].parent != r->parent) - r->parent = regs[r->parent].parent; - } - } - mm_filter_regs(opt, qlen, n_regs_, regs); - mm_sync_regs(km, *n_regs_, regs); - } -} - mm_seg_t *mm_seg_gen(void *km, uint32_t hash, int n_segs, const int *qlens, int n_regs0, const mm_reg1_t *regs0, int *n_regs, mm_reg1_t **regs, const mm128_t *a) { int s, i, j, acc_qlen[MM_MAX_SEG+1], qlen_sum = 0; diff --git a/main.c b/main.c index 27470e7..d202adc 100644 --- a/main.c +++ b/main.c @@ -7,7 +7,7 @@ #include "mmpriv.h" #include "ketopt.h" -#define MM_VERSION "2.18-r1048-dirty" +#define MM_VERSION "2.18-r1049-dirty" #ifdef __linux__ #include @@ -205,7 +205,6 @@ int main(int argc, char *argv[]) else if (c == 327) opt.max_clip_ratio = atof(o.arg); // --max-clip-ratio else if (c == 328) opt.min_mid_occ = atoi(o.arg); // --min-occ-floor else if (c == 329) opt.flag |= MM_F_OUT_MD; // --MD - else if (c == 330) opt.min_join_flank_ratio = atof(o.arg); // --lj-min-ratio else if (c == 331) opt.sc_ambi = atoi(o.arg); // --score-N else if (c == 332) opt.flag |= MM_F_EQX; // --eqx else if (c == 333) opt.flag |= MM_F_PAF_NO_HIT; // --paf-no-hit @@ -221,7 +220,9 @@ int main(int argc, char *argv[]) else if (c == 344) alt_list = o.arg; // --alt else if (c == 345) opt.alt_drop = atof(o.arg); // --alt-drop else if (c == 346) opt.mask_len = mm_parse_num(o.arg); // --mask-len - else if (c == 314) { // --frag + else if (c == 330) { + fprintf(stderr, "[WARNING] \033[1;31m --lj-min-ratio has been deprecated.\033[0m\n"); + } else if (c == 314) { // --frag yes_or_no(&opt, MM_F_FRAG_MODE, o.longidx, o.arg, 1); } else if (c == 315) { // --secondary yes_or_no(&opt, MM_F_NO_PRINT_2ND, o.longidx, o.arg, 0); diff --git a/map.c b/map.c index 42b0eaa..efcf933 100644 --- a/map.c +++ b/map.c @@ -210,8 +210,6 @@ static void chain_post(const mm_mapopt_t *opt, int max_chain_gap_ref, const mm_i mm_set_parent(km, opt->mask_level, opt->mask_len, *n_regs, regs, opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop); if (n_segs <= 1) mm_select_sub(km, opt->pri_ratio, mi->k*2, opt->best_n, n_regs, regs); else mm_select_sub_multi(km, opt->pri_ratio, 0.2f, 0.7f, max_chain_gap_ref, mi->k*2, opt->best_n, n_segs, qlens, n_regs, regs); - if (!(opt->flag & (MM_F_SPLICE|MM_F_SR|MM_F_NO_LJOIN))) // long join not working well without primary chains - mm_join_long(km, opt, qlen, n_regs, regs, a); } } @@ -302,7 +300,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char ** a = mg_lchain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, opt->chain_gap_scale * 0.01 * mi->k, 0.0f, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km); } - } else if (opt->bw_long > opt->bw && !(opt->flag & MM_F_RMQ) && n_segs == 1 && n_regs0 > 1) { + } else if (opt->bw_long > opt->bw && !(opt->flag & (MM_F_RMQ|MM_F_NO_LJOIN)) && n_segs == 1 && n_regs0 > 1) { int32_t st = (int32_t)a[0].y, en = (int32_t)a[(int32_t)u[0] - 1].y; if (qlen_sum - (en - st) > 1000 || en - st > qlen_sum * .1) { int32_t i; diff --git a/minimap.h b/minimap.h index 10a4b60..3deca6b 100644 --- a/minimap.h +++ b/minimap.h @@ -128,10 +128,6 @@ typedef struct { float pri_ratio; int best_n; // top best_n chains are subjected to DP alignment - int max_join_long, max_join_short; - int min_join_flank_sc; - float min_join_flank_ratio; - float alt_drop; int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties diff --git a/mmpriv.h b/mmpriv.h index 45e7c0f..16b5751 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -91,7 +91,6 @@ void mm_set_parent(void *km, float mask_level, int mask_len, int n, mm_reg1_t *r void mm_select_sub(void *km, float pri_ratio, int min_diff, int best_n, int *n_, mm_reg1_t *r); void mm_select_sub_multi(void *km, float pri_ratio, float pri1, float pri2, int max_gap_ref, int min_diff, int best_n, int n_segs, const int *qlens, int *n_, mm_reg1_t *r); void mm_filter_regs(const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs); -void mm_join_long(void *km, const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs, mm128_t *a); void mm_hit_sort(void *km, int *n_regs, mm_reg1_t *r, float alt_diff_frac); void mm_set_mapq(void *km, int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len, int is_sr); diff --git a/options.c b/options.c index 6a3e780..07f3f88 100644 --- a/options.c +++ b/options.c @@ -38,11 +38,6 @@ void mm_mapopt_init(mm_mapopt_t *opt) opt->pri_ratio = 0.8f; opt->best_n = 5; - opt->max_join_long = 20000; - opt->max_join_short = 2000; - opt->min_join_flank_sc = 1000; - opt->min_join_flank_ratio = 0.5f; - opt->alt_drop = 0.15f; opt->a = 2, opt->b = 4, opt->q = 4, opt->e = 2, opt->q2 = 24, opt->e2 = 1; diff --git a/python/cmappy.pxd b/python/cmappy.pxd index 0c8c57f..bfa706d 100644 --- a/python/cmappy.pxd +++ b/python/cmappy.pxd @@ -30,10 +30,6 @@ cdef extern from "minimap.h": float pri_ratio int best_n - int max_join_long, max_join_short - int min_join_flank_sc - float min_join_flank_ratio - float alt_drop int a, b, q, e, q2, e2 From 9626b3e716a43673674fa40358b874c7726b6311 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 24 May 2021 16:29:21 -0400 Subject: [PATCH 56/77] r1050: -r accepts two bandwidths --- main.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/main.c b/main.c index d202adc..7912dda 100644 --- a/main.c +++ b/main.c @@ -7,7 +7,7 @@ #include "mmpriv.h" #include "ketopt.h" -#define MM_VERSION "2.18-r1049-dirty" +#define MM_VERSION "2.18-r1050-dirty" #ifdef __linux__ #include @@ -83,17 +83,23 @@ static ko_longopt_t long_options[] = { { 0, 0, 0 } }; -static inline int64_t mm_parse_num(const char *str) +static inline int64_t mm_parse_num2(const char *str, char **q) { double x; char *p; x = strtod(str, &p); - if (*p == 'G' || *p == 'g') x *= 1e9; - else if (*p == 'M' || *p == 'm') x *= 1e6; - else if (*p == 'K' || *p == 'k') x *= 1e3; + if (*p == 'G' || *p == 'g') x *= 1e9, ++p; + else if (*p == 'M' || *p == 'm') x *= 1e6, ++p; + else if (*p == 'K' || *p == 'k') x *= 1e3, ++p; + if (q) *q = p; return (int64_t)(x + .499); } +static inline int64_t mm_parse_num(const char *str) +{ + return mm_parse_num2(str, 0); +} + static inline void yes_or_no(mm_mapopt_t *opt, int flag, int long_idx, const char *arg, int yes_to_set) { if (yes_to_set) { @@ -145,7 +151,6 @@ int main(int argc, char *argv[]) else if (c == 'k') ipt.k = atoi(o.arg); else if (c == 'H') ipt.flag |= MM_I_HPC; else if (c == 'd') fnw = o.arg; // the above are indexing related options, except -I - else if (c == 'r') opt.bw = (int)mm_parse_num(o.arg); else if (c == 't') n_threads = atoi(o.arg); else if (c == 'v') mm_verbose = atoi(o.arg); else if (c == 'g') opt.max_gap = (int)mm_parse_num(o.arg); @@ -252,6 +257,9 @@ int main(int argc, char *argv[]) } else if (c == 'V') { puts(MM_VERSION); return 0; + } else if (c == 'r') { + opt.bw = (int)mm_parse_num2(o.arg, &s); + if (*s == ',') opt.bw_long = (int)mm_parse_num2(s + 1, &s); } else if (c == 'U') { opt.min_mid_occ = strtol(o.arg, &s, 10); if (*s == ',') opt.max_mid_occ = strtol(s + 1, &s, 10); From 34a41197d7ec60bb4853a58a9d76e8f1643608e4 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 24 May 2021 16:38:45 -0400 Subject: [PATCH 57/77] r1051: added two internal parameters rmq_rescue_size and rmq_rescue_ratio --- main.c | 2 +- map.c | 4 ++-- minimap.h | 2 ++ options.c | 2 ++ python/cmappy.pxd | 2 ++ 5 files changed, 9 insertions(+), 3 deletions(-) diff --git a/main.c b/main.c index 7912dda..37da4e3 100644 --- a/main.c +++ b/main.c @@ -7,7 +7,7 @@ #include "mmpriv.h" #include "ketopt.h" -#define MM_VERSION "2.18-r1050-dirty" +#define MM_VERSION "2.18-r1051-dirty" #ifdef __linux__ #include diff --git a/map.c b/map.c index efcf933..513fe1f 100644 --- a/map.c +++ b/map.c @@ -300,9 +300,9 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char ** a = mg_lchain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, opt->chain_gap_scale * 0.01 * mi->k, 0.0f, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km); } - } else if (opt->bw_long > opt->bw && !(opt->flag & (MM_F_RMQ|MM_F_NO_LJOIN)) && n_segs == 1 && n_regs0 > 1) { + } else if (opt->bw_long > opt->bw && (opt->flag & (MM_F_RMQ|MM_F_NO_LJOIN)) == 0 && n_segs == 1 && n_regs0 > 1) { int32_t st = (int32_t)a[0].y, en = (int32_t)a[(int32_t)u[0] - 1].y; - if (qlen_sum - (en - st) > 1000 || en - st > qlen_sum * .1) { + if (qlen_sum - (en - st) > opt->rmq_rescue_size || en - st > qlen_sum * opt->rmq_rescue_ratio) { int32_t i; for (i = 0, n_a = 0; i < n_regs0; ++i) n_a += (int32_t)u[i]; kfree(b->km, u); diff --git a/minimap.h b/minimap.h index 3deca6b..7386632 100644 --- a/minimap.h +++ b/minimap.h @@ -122,6 +122,8 @@ typedef struct { int min_chain_score; // min chaining score float chain_gap_scale; int rmq_size_cap, rmq_inner_dist; + int rmq_rescue_size; + float rmq_rescue_ratio; float mask_level; int mask_len; diff --git a/options.c b/options.c index 07f3f88..b5a2289 100644 --- a/options.c +++ b/options.c @@ -29,6 +29,8 @@ void mm_mapopt_init(mm_mapopt_t *opt) opt->max_chain_iter = 5000; opt->rmq_inner_dist = 1000; opt->rmq_size_cap = 100000; + opt->rmq_rescue_size = 1000; + opt->rmq_rescue_ratio = 0.1f; opt->chain_gap_scale = 0.8f; opt->max_max_occ = 4095; opt->occ_dist = 500; diff --git a/python/cmappy.pxd b/python/cmappy.pxd index bfa706d..abc3ef7 100644 --- a/python/cmappy.pxd +++ b/python/cmappy.pxd @@ -24,6 +24,8 @@ cdef extern from "minimap.h": int min_chain_score float chain_gap_scale int rmq_size_cap, rmq_inner_dist + int rmq_rescue_size + float rmq_rescue_ratio float mask_level int mask_len From 41d7ccb191a3c1734fb3ad715a97224ecbd37d84 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 24 May 2021 16:46:16 -0400 Subject: [PATCH 58/77] r1052: default -g to 5k --- main.c | 2 +- minimap2.1 | 18 +++++++----------- options.c | 4 +++- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/main.c b/main.c index 37da4e3..209fb32 100644 --- a/main.c +++ b/main.c @@ -7,7 +7,7 @@ #include "mmpriv.h" #include "ketopt.h" -#define MM_VERSION "2.18-r1051-dirty" +#define MM_VERSION "2.18-r1052-dirty" #ifdef __linux__ #include diff --git a/minimap2.1 b/minimap2.1 index c7f5a07..ba154a6 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -165,8 +165,8 @@ Stop chain enlongation if there are no minimizers within .IR NUM -bp [10k]. .TP -.BI -r \ INT -Bandwidth used in chaining and DP-based alignment [500]. This option +.BI -r \ NUM +Bandwidth used in chaining and DP-based alignment [500,20k]. This option approximately controls the maximum gap size. .TP .BI -n \ INT @@ -279,10 +279,6 @@ Disable the long gap patching heuristic. When this option is applied, the maximum alignment gap is mostly controlled by .BR -r . .TP -.BI --lj-min-ratio \ FLOAT -Fraction of query sequence length required to bridge a long gap [0.5]. A -smaller value helps to recover longer gaps, at the cost of more false gaps. -.TP .B --splice Enable the splice alignment mode. .TP @@ -542,7 +538,7 @@ default mode. .B map-hifi Align PacBio high-fidelity (HiFi) reads to a reference genome .RB ( -k19 -.B -w19 -U50,500 -A1 -B4 -O6,26 -E2,1 +.B -w19 -U50,500 -g10k -A1 -B4 -O6,26 -E2,1 .BR -s200 ). .TP .B map-pb @@ -552,7 +548,7 @@ Align older PacBio continuous long (CLR) reads to a reference genome .B asm5 Long assembly to reference mapping .RB ( -k19 -.B -w19 -U50,500 --rmq -r100k --no-long-join -A1 -B19 -O39,81 -E3,1 -s200 -z200 +.B -w19 -U50,500 --rmq -r100k -g10k -A1 -B19 -O39,81 -E3,1 -s200 -z200 .BR -N50 ). Typically, the alignment will not extend to regions with 5% or higher sequence divergence. Only use this preset if the average divergence is far below 5%. @@ -560,21 +556,21 @@ divergence. Only use this preset if the average divergence is far below 5%. .B asm10 Long assembly to reference mapping .RB ( -k19 -.B -w19 -U50,500 --rmq -r100k --no-long-join -A1 -B9 -O16,41 -E2,1 -s200 -z200 +.B -w19 -U50,500 --rmq -r100k -g10k -A1 -B9 -O16,41 -E2,1 -s200 -z200 .BR -N50 ). Up to 10% sequence divergence. .TP .B asm20 Long assembly to reference mapping .RB ( -k19 -.B -w10 -U50,500 --rmq -r100k --no-long-join -A1 -B4 -O6,26 -E2,1 -s200 -z200 +.B -w10 -U50,500 --rmq -r100k -g10k -A1 -B4 -O6,26 -E2,1 -s200 -z200 .BR -N50 ). Up to 20% sequence divergence. .TP .B splice Long-read spliced alignment .RB ( -k15 -.B -w5 --splice -g2000 -G200k -A1 -B2 -O2,32 -E1,0 -C9 -z200 -ub --junc-bonus=9 --cap-sw-mem=0 +.B -w5 --splice -g2k -G200k -A1 -B2 -O2,32 -E1,0 -C9 -z200 -ub --junc-bonus=9 --cap-sw-mem=0 .BR --splice-flank=yes ). In the splice mode, 1) long deletions are taken as introns and represented as the diff --git a/options.c b/options.c index b5a2289..8ff99d5 100644 --- a/options.c +++ b/options.c @@ -23,7 +23,7 @@ void mm_mapopt_init(mm_mapopt_t *opt) opt->min_cnt = 3; opt->min_chain_score = 40; opt->bw = 500, opt->bw_long = 20000; - opt->max_gap = 10000; + opt->max_gap = 5000; opt->max_gap_ref = -1; opt->max_chain_skip = 25; opt->max_chain_iter = 5000; @@ -100,6 +100,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) mo->occ_dist = 0; } else if (strcmp(preset, "map-hifi") == 0 || strcmp(preset, "map-ccs") == 0) { io->flag = 0, io->k = 19, io->w = 19; + mo->max_gap = 10000; mo->a = 1, mo->b = 4, mo->q = 6, mo->q2 = 26, mo->e = 2, mo->e2 = 1; mo->occ_dist = 500; mo->min_mid_occ = 50, mo->max_mid_occ = 500; @@ -107,6 +108,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) } else if (strncmp(preset, "asm", 3) == 0) { io->flag = 0, io->k = 19, io->w = 19; mo->bw = mo->bw_long = 100000; + mo->max_gap = 10000; mo->flag |= MM_F_RMQ | MM_F_NO_LJOIN; mo->min_mid_occ = 50, mo->max_mid_occ = 500; mo->min_dp_max = 200; From f31705bb4a9f037c132920681a8a9b736384c88f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 26 May 2021 11:54:46 -0400 Subject: [PATCH 59/77] r1053: made junceval work with PAF --- misc/paftools.js | 41 ++++++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/misc/paftools.js b/misc/paftools.js index 9124fb7..dada919 100755 --- a/misc/paftools.js +++ b/misc/paftools.js @@ -1,6 +1,6 @@ #!/usr/bin/env k8 -var paftools_version = '2.18-r1022-dirty'; +var paftools_version = '2.18-r1053-dirty'; /***************************** ***** Library functions ***** @@ -2372,25 +2372,40 @@ function paf_junceval(args) var re_cigar = /(\d+)([MIDNSHX=])/g; while (file.readline(buf) >= 0) { var m, t = buf.toString().split("\t"); + var ctg_name = null, cigar = null, pos = null, qname = t[0]; if (t[0].charAt(0) == '@') continue; - if (chr_only && !/^(chr)?([0-9]+|X|Y)$/.test(t[2])) continue; - var flag = parseInt(t[1]); - if (flag&0x100) continue; - if (first_only && last_qname == t[0]) continue; - if (t[2] == '*') { + if (t[4] == '+' || t[4] == '-' || t[4] == '*') { // PAF + ctg_name = t[5], pos = parseInt(t[7]); + var type = 'P'; + for (i = 12; i < t.length; ++i) { + if ((m = /^(tp:A|cg:Z):(\S+)/.exec(t[i])) != null) { + if (m[1] == 'tp:A') type = m[2]; + else cigar = m[2]; + } + } + if (type == 'S') continue; // secondary + } else { // SAM + ctg_name = t[2], pos = parseInt(t[3]) - 1, cigar = t[5]; + var flag = parseInt(t[1]); + if (flag&0x100) continue; // secondary + } + + if (chr_only && !/^(chr)?([0-9]+|X|Y)$/.test(ctg_name)) continue; + if (first_only && last_qname == qname) continue; + if (ctg_name == '*') { // unmapped ++n_unmapped; continue; } else { ++n_pri; - if (last_qname != t[0]) { + if (last_qname != qname) { ++n_mapped; - last_qname = t[0]; + last_qname = qname; } } - var pos = parseInt(t[3]) - 1, intron = []; - while ((m = re_cigar.exec(t[5])) != null) { + var intron = []; + while ((m = re_cigar.exec(cigar)) != null) { var len = parseInt(m[1]), op = m[2]; if (op == 'N') { intron.push([pos, pos + len]); @@ -2403,7 +2418,7 @@ function paf_junceval(args) } n_splice += intron.length; - var chr = anno[t[2]]; + var chr = anno[ctg_name]; if (chr != null) { for (var i = 0; i < intron.length; ++i) { var o = Interval.find_ovlp(chr, intron[i][0], intron[i][1]); @@ -2427,12 +2442,12 @@ function paf_junceval(args) x += '(' + o[j][0] + "," + o[j][1] + ')'; } x += ']'; - print(type, t[0], i+1, t[2], intron[i][0], intron[i][1], x); + print(type, qname, i+1, ctg_name, intron[i][0], intron[i][1], x); } } else { ++n_splice_novel; if (print_ovlp) - print('N', t[0], i+1, t[2], intron[i][0], intron[i][1]); + print('N', qname, i+1, ctg_name, intron[i][0], intron[i][1]); } } } else { From e74dfd1aa9a580239f38ceab8535950a95f6ef9e Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 26 May 2021 12:32:04 -0400 Subject: [PATCH 60/77] r1054: fixed a memory leak --- lchain.c | 10 ++++++++-- main.c | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/lchain.c b/lchain.c index fb84b94..ac60ddb 100644 --- a/lchain.c +++ b/lchain.c @@ -138,7 +138,10 @@ mm128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int uint64_t *u; if (_u) *_u = 0, *n_u_ = 0; - if (n == 0 || a == 0) return 0; + if (n == 0 || a == 0) { + kfree(km, a); + return 0; + } if (max_dist_x < bw) max_dist_x = bw; if (max_dist_y < bw && !is_cdna) max_dist_y = bw; KMALLOC(km, p, n); @@ -239,7 +242,10 @@ mm128_t *mg_lchain_rmq(int max_dist, int max_dist_inner, int bw, int max_chn_ski kmp_rmq_t *mp; if (_u) *_u = 0, *n_u_ = 0; - if (n == 0 || a == 0) return 0; + if (n == 0 || a == 0) { + kfree(km, a); + return 0; + } if (max_dist < bw) max_dist = bw; if (max_dist_inner <= 0 || max_dist_inner >= max_dist) max_dist_inner = 0; KMALLOC(km, p, n); diff --git a/main.c b/main.c index 209fb32..79a5131 100644 --- a/main.c +++ b/main.c @@ -7,7 +7,7 @@ #include "mmpriv.h" #include "ketopt.h" -#define MM_VERSION "2.18-r1052-dirty" +#define MM_VERSION "2.18-r1054-dirty" #ifdef __linux__ #include From ef09ccf1044492eebaa190a784a5ad1800dcfc0b Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 26 May 2021 21:01:03 -0400 Subject: [PATCH 61/77] Release minimap2-2.19 (r1055) --- NEWS.md | 39 +++++++++++++++++++++++++++++++++++++++ README.md | 11 ++++------- cookbook.md | 4 ++-- main.c | 2 +- minimap2.1 | 2 +- misc/paftools.js | 2 +- python/mappy.pyx | 2 +- setup.py | 2 +- 8 files changed, 50 insertions(+), 14 deletions(-) diff --git a/NEWS.md b/NEWS.md index b823cb5..b02b80b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,42 @@ +Release 2.19-r1055 (26 May 2021) +-------------------------------- + +This release includes a few important improvements backported from unimap: + + * Improvement: more contiguous alignment through long INDELs. This is enabled + by the minigraph chaining algorithm. All `asm*` presets now use the new + algorithm. They can find INDELs up to 100kb and may be faster for + chromosome-long contigs. The default mode and `map*` presets use this + algorithm to replace the long-join heuristic. + + * Improvement: better alignment in highly repetitive regions by rescuing + high-occurrence seeds. If the distance between two adjacent seeds is too + large, attempt to choose a fraction of high-occurrence seeds in-between. + Minimap2 now produces fewer clippings and alignment break points in long + satellite regions. + + * Improvement: allow to specify an interval of k-mer occurrences with `-U`. + For repeat-rich genomes, the automatic k-mer occurrence threshold determined + by `-f` may be too large and makes alignment impractically slow. The new + option protects against such cases. Enabled for `asm*` and `map-hifi`. + + * New feature: added the `map-hifi` preset for maping PacBio High-Fidelity + (HiFi) reads. + + * Change to the default: apply `--cap-sw-mem=100m` for genomic alignment. + + * Bugfix: minimap2 could not generate an index file with `-xsr` (#734). + +This release represents the most signficant algorithmic change since v2.1 in +2017. With features backported from unimap, minimap2 now has similar power to +unimap for contig alignment. Unimap will remain an experimental project and is +no longer recommended over minimap2. Sorry for reverting the recommendation in +short time. + +(2.20: 26 May 2021, r1055) + + + Release 2.18-r1015 (9 April 2021) --------------------------------- diff --git a/README.md b/README.md index 10514a8..0141a35 100644 --- a/README.md +++ b/README.md @@ -14,8 +14,8 @@ cd minimap2 && make # use presets (no test data) ./minimap2 -ax map-pb ref.fa pacbio.fq.gz > aln.sam # PacBio CLR genomic reads ./minimap2 -ax map-ont ref.fa ont.fq.gz > aln.sam # Oxford Nanopore genomic reads -./minimap2 -ax asm20 ref.fa pacbio-ccs.fq.gz > aln.sam # PacBio HiFi/CCS genomic reads (v2.18) -./minimap2 -ax map-hifi ref.fa pacbio-ccs.fq.gz > aln.sam # PacBio HiFi/CCS genomic reads (GitHub HEAD) +./minimap2 -ax map-hifi ref.fa pacbio-ccs.fq.gz > aln.sam # PacBio HiFi/CCS genomic reads (v2.19 or lateer) +./minimap2 -ax asm20 ref.fa pacbio-ccs.fq.gz > aln.sam # PacBio HiFi/CCS genomic reads (v2.18 or earlier) ./minimap2 -ax sr ref.fa read1.fa read2.fa > aln.sam # short genomic paired-end reads ./minimap2 -ax splice ref.fa rna-reads.fa > aln.sam # spliced long reads (strand unknown) ./minimap2 -ax splice -uf -k14 ref.fa reads.fa > aln.sam # noisy Nanopore Direct RNA-seq @@ -27,9 +27,6 @@ cd minimap2 && make # man page for detailed command line options man ./minimap2.1 ``` -[Unimap][unimap] is recommended for aligning long contigs against a reference -genome. It often takes less wall-clock time and is much more sensitive to long -insertions and deletions. ## Table of Contents @@ -77,8 +74,8 @@ Detailed evaluations are available from the [minimap2 paper][doi] or the Minimap2 is optimized for x86-64 CPUs. You can acquire precompiled binaries from the [release page][release] with: ```sh -curl -L https://github.com/lh3/minimap2/releases/download/v2.18/minimap2-2.18_x64-linux.tar.bz2 | tar -jxvf - -./minimap2-2.18_x64-linux/minimap2 +curl -L https://github.com/lh3/minimap2/releases/download/v2.19/minimap2-2.19_x64-linux.tar.bz2 | tar -jxvf - +./minimap2-2.19_x64-linux/minimap2 ``` If you want to compile from the source, you need to have a C compiler, GNU make and zlib development files installed. Then type `make` in the source code diff --git a/cookbook.md b/cookbook.md index f963ccc..84e5be8 100644 --- a/cookbook.md +++ b/cookbook.md @@ -31,8 +31,8 @@ To acquire the data used in this cookbook and to install minimap2 and paftools, please follow the command lines below: ```sh # install minimap2 executables -curl -L https://github.com/lh3/minimap2/releases/download/v2.18/minimap2-2.18_x64-linux.tar.bz2 | tar jxf - -cp minimap2-2.18_x64-linux/{minimap2,k8,paftools.js} . # copy executables +curl -L https://github.com/lh3/minimap2/releases/download/v2.19/minimap2-2.19_x64-linux.tar.bz2 | tar jxf - +cp minimap2-2.19_x64-linux/{minimap2,k8,paftools.js} . # copy executables export PATH="$PATH:"`pwd` # put the current directory on PATH # download example datasets curl -L https://github.com/lh3/minimap2/releases/download/v2.10/cookbook-data.tgz | tar zxf - diff --git a/main.c b/main.c index 79a5131..fd86826 100644 --- a/main.c +++ b/main.c @@ -7,7 +7,7 @@ #include "mmpriv.h" #include "ketopt.h" -#define MM_VERSION "2.18-r1054-dirty" +#define MM_VERSION "2.19-r1055" #ifdef __linux__ #include diff --git a/minimap2.1 b/minimap2.1 index ba154a6..c93d67a 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -1,4 +1,4 @@ -.TH minimap2 1 "9 April 2021" "minimap2-2.18 (r1015)" "Bioinformatics tools" +.TH minimap2 1 "26 May 2021" "minimap2-2.19 (r1055)" "Bioinformatics tools" .SH NAME .PP minimap2 - mapping and alignment between collections of DNA sequences diff --git a/misc/paftools.js b/misc/paftools.js index dada919..e258dbc 100755 --- a/misc/paftools.js +++ b/misc/paftools.js @@ -1,6 +1,6 @@ #!/usr/bin/env k8 -var paftools_version = '2.18-r1053-dirty'; +var paftools_version = '2.19-r1055'; /***************************** ***** Library functions ***** diff --git a/python/mappy.pyx b/python/mappy.pyx index fbbc8ab..bd53bac 100644 --- a/python/mappy.pyx +++ b/python/mappy.pyx @@ -3,7 +3,7 @@ from libc.stdlib cimport free cimport cmappy import sys -__version__ = '2.18' +__version__ = '2.19' cmappy.mm_reset_timer() diff --git a/setup.py b/setup.py index 960775a..7551fe9 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ def readme(): setup( name = 'mappy', - version = '2.18', + version = '2.19', url = 'https://github.com/lh3/minimap2', description = 'Minimap2 python binding', long_description = readme(), From cec23131e461465b3481429aee36766dd95adbf3 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 26 May 2021 21:20:48 -0400 Subject: [PATCH 62/77] fixed a python compilation error Caused by chain.c -> lchain.c --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 7551fe9..7eed99b 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ setup( keywords = 'sequence-alignment', scripts = ['python/minimap2.py'], ext_modules = [Extension('mappy', - sources = ['python/mappy.pyx', 'align.c', 'bseq.c', 'chain.c', 'format.c', 'hit.c', 'index.c', 'pe.c', 'options.c', + sources = ['python/mappy.pyx', 'align.c', 'bseq.c', 'lchain.c', 'format.c', 'hit.c', 'index.c', 'pe.c', 'options.c', 'ksw2_extd2_sse.c', 'ksw2_exts2_sse.c', 'ksw2_extz2_sse.c', 'ksw2_ll_sse.c', 'kalloc.c', 'kthread.c', 'map.c', 'misc.c', 'sdust.c', 'sketch.c', 'esterr.c', 'splitidx.c'], depends = ['minimap.h', 'bseq.h', 'kalloc.h', 'kdq.h', 'khash.h', 'kseq.h', 'ksort.h', From 9febf532c13c6428c58d422b51ec51f1c5162ef9 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 26 May 2021 21:23:42 -0400 Subject: [PATCH 63/77] Release minimap2-2.19 (r1057) --- NEWS.md | 4 ++-- main.c | 2 +- minimap2.1 | 2 +- misc/paftools.js | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/NEWS.md b/NEWS.md index b02b80b..48dd14d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,4 @@ -Release 2.19-r1055 (26 May 2021) +Release 2.19-r1057 (26 May 2021) -------------------------------- This release includes a few important improvements backported from unimap: @@ -33,7 +33,7 @@ unimap for contig alignment. Unimap will remain an experimental project and is no longer recommended over minimap2. Sorry for reverting the recommendation in short time. -(2.20: 26 May 2021, r1055) +(2.20: 26 May 2021, r1057) diff --git a/main.c b/main.c index fd86826..f452a69 100644 --- a/main.c +++ b/main.c @@ -7,7 +7,7 @@ #include "mmpriv.h" #include "ketopt.h" -#define MM_VERSION "2.19-r1055" +#define MM_VERSION "2.19-r1057" #ifdef __linux__ #include diff --git a/minimap2.1 b/minimap2.1 index c93d67a..94e3ce3 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -1,4 +1,4 @@ -.TH minimap2 1 "26 May 2021" "minimap2-2.19 (r1055)" "Bioinformatics tools" +.TH minimap2 1 "26 May 2021" "minimap2-2.19 (r1057)" "Bioinformatics tools" .SH NAME .PP minimap2 - mapping and alignment between collections of DNA sequences diff --git a/misc/paftools.js b/misc/paftools.js index e258dbc..313e490 100755 --- a/misc/paftools.js +++ b/misc/paftools.js @@ -1,6 +1,6 @@ #!/usr/bin/env k8 -var paftools_version = '2.19-r1055'; +var paftools_version = '2.19-r1057'; /***************************** ***** Library functions ***** From 1776c0c64509df24d273ab53a6ee92e0156a76ae Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 26 May 2021 21:36:09 -0400 Subject: [PATCH 64/77] missing seed.c in setup.py Ok, I am not going to re-release again... --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 7eed99b..8bc1f00 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ setup( keywords = 'sequence-alignment', scripts = ['python/minimap2.py'], ext_modules = [Extension('mappy', - sources = ['python/mappy.pyx', 'align.c', 'bseq.c', 'lchain.c', 'format.c', 'hit.c', 'index.c', 'pe.c', 'options.c', + sources = ['python/mappy.pyx', 'align.c', 'bseq.c', 'lchain.c', 'seed.c', 'format.c', 'hit.c', 'index.c', 'pe.c', 'options.c', 'ksw2_extd2_sse.c', 'ksw2_exts2_sse.c', 'ksw2_extz2_sse.c', 'ksw2_ll_sse.c', 'kalloc.c', 'kthread.c', 'map.c', 'misc.c', 'sdust.c', 'sketch.c', 'esterr.c', 'splitidx.c'], depends = ['minimap.h', 'bseq.h', 'kalloc.h', 'kdq.h', 'khash.h', 'kseq.h', 'ksort.h', From 4f8d1bc360e7185784f99cfeec40d7f7adec244d Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 26 May 2021 23:01:49 -0400 Subject: [PATCH 65/77] r1059: with --rmq, use the larger bandwidth --- main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/main.c b/main.c index f452a69..eca3a99 100644 --- a/main.c +++ b/main.c @@ -7,7 +7,7 @@ #include "mmpriv.h" #include "ketopt.h" -#define MM_VERSION "2.19-r1057" +#define MM_VERSION "2.19-r1059-dirty" #ifdef __linux__ #include @@ -296,6 +296,8 @@ int main(int argc, char *argv[]) } if (!fnw && !(opt.flag&MM_F_CIGAR)) ipt.flag |= MM_I_NO_SEQ; + if (opt.bw < opt.bw_long && (opt.flag&MM_F_RMQ)) + opt.bw = opt.bw_long; if (mm_check_opt(&ipt, &opt) < 0) return 1; if (opt.best_n == 0) { From ca19463268ccaa9d9644a006cb41a20a7951c1cd Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 27 May 2021 10:55:13 -0400 Subject: [PATCH 66/77] r1060: safer ways to use -rNUM1,NUM2 --- main.c | 6 ++---- map.c | 22 +++++++++++----------- minimap2.1 | 9 ++++++--- options.c | 5 +++++ 4 files changed, 24 insertions(+), 18 deletions(-) diff --git a/main.c b/main.c index eca3a99..acd573e 100644 --- a/main.c +++ b/main.c @@ -7,7 +7,7 @@ #include "mmpriv.h" #include "ketopt.h" -#define MM_VERSION "2.19-r1059-dirty" +#define MM_VERSION "2.19-r1060-dirty" #ifdef __linux__ #include @@ -296,8 +296,6 @@ int main(int argc, char *argv[]) } if (!fnw && !(opt.flag&MM_F_CIGAR)) ipt.flag |= MM_I_NO_SEQ; - if (opt.bw < opt.bw_long && (opt.flag&MM_F_RMQ)) - opt.bw = opt.bw_long; if (mm_check_opt(&ipt, &opt) < 0) return 1; if (opt.best_n == 0) { @@ -319,7 +317,7 @@ int main(int argc, char *argv[]) fprintf(fp_help, " -g NUM stop chain enlongation if there are no minimizers in INT-bp [%d]\n", opt.max_gap); fprintf(fp_help, " -G NUM max intron length (effective with -xsplice; changing -r) [200k]\n"); fprintf(fp_help, " -F NUM max fragment length (effective with -xsr or in the fragment mode) [800]\n"); - fprintf(fp_help, " -r NUM bandwidth used in chaining and DP-based alignment [%d]\n", opt.bw); + fprintf(fp_help, " -r NUM[,NUM] chaining/alignment bandwidth and long-join bandwidth [%d,%d]\n", opt.bw, opt.bw_long); fprintf(fp_help, " -n INT minimal number of minimizers on a chain [%d]\n", opt.min_cnt); fprintf(fp_help, " -m INT minimal chaining score (matching bases minus log gap penalty) [%d]\n", opt.min_chain_score); // fprintf(fp_help, " -T INT SDUST threshold; 0 to disable SDUST [%d]\n", opt.sdust_thres); // TODO: this option is never used; might be buggy diff --git a/map.c b/map.c index 513fe1f..a41b7a9 100644 --- a/map.c +++ b/map.c @@ -277,7 +277,17 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char ** opt->chain_gap_scale * 0.01 * mi->k, 0.0f, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km); } - if (opt->max_occ > opt->mid_occ && rep_len > 0 && !(opt->flag & MM_F_RMQ)) { + if (opt->bw_long > opt->bw && (opt->flag & MM_F_NO_LJOIN) == 0 && n_segs == 1 && n_regs0 > 1) { // re-chain/long-join for long sequences + int32_t st = (int32_t)a[0].y, en = (int32_t)a[(int32_t)u[0] - 1].y; + if (qlen_sum - (en - st) > opt->rmq_rescue_size || en - st > qlen_sum * opt->rmq_rescue_ratio) { + int32_t i; + for (i = 0, n_a = 0; i < n_regs0; ++i) n_a += (int32_t)u[i]; + kfree(b->km, u); + radix_sort_128x(a, a + n_a); + a = mg_lchain_rmq(opt->max_gap, opt->rmq_inner_dist, opt->bw_long, opt->max_chain_skip, opt->rmq_size_cap, opt->min_cnt, opt->min_chain_score, + opt->chain_gap_scale * 0.01 * mi->k, 0.0f, n_a, a, &n_regs0, &u, b->km); + } + } else if (opt->max_occ > opt->mid_occ && rep_len > 0 && !(opt->flag & MM_F_RMQ)) { // re-chain, mostly for short reads int rechain = 0; if (n_regs0 > 0) { // test if the best chain has all the segments int n_chained_segs = 1, max = 0, max_i = -1, max_off = -1, off = 0; @@ -300,16 +310,6 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char ** a = mg_lchain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, opt->chain_gap_scale * 0.01 * mi->k, 0.0f, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km); } - } else if (opt->bw_long > opt->bw && (opt->flag & (MM_F_RMQ|MM_F_NO_LJOIN)) == 0 && n_segs == 1 && n_regs0 > 1) { - int32_t st = (int32_t)a[0].y, en = (int32_t)a[(int32_t)u[0] - 1].y; - if (qlen_sum - (en - st) > opt->rmq_rescue_size || en - st > qlen_sum * opt->rmq_rescue_ratio) { - int32_t i; - for (i = 0, n_a = 0; i < n_regs0; ++i) n_a += (int32_t)u[i]; - kfree(b->km, u); - radix_sort_128x(a, a + n_a); - a = mg_lchain_rmq(opt->max_gap, opt->rmq_inner_dist, opt->bw_long, opt->max_chain_skip, opt->rmq_size_cap, opt->min_cnt, opt->min_chain_score, - opt->chain_gap_scale * 0.01 * mi->k, 0.0f, n_a, a, &n_regs0, &u, b->km); - } } b->frag_gap = max_chain_gap_ref; b->rep_len = rep_len; diff --git a/minimap2.1 b/minimap2.1 index 94e3ce3..251ed99 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -165,9 +165,12 @@ Stop chain enlongation if there are no minimizers within .IR NUM -bp [10k]. .TP -.BI -r \ NUM -Bandwidth used in chaining and DP-based alignment [500,20k]. This option -approximately controls the maximum gap size. +.BI -r \ NUM1 [, NUM2 ] +Bandwidth for chaining and base alignment [500,20k]. +.I NUM1 +is used for initial chaining and alignment extension; +.I NUM2 +for RMQ-based re-chaining and closing gaps in alignments. .TP .BI -n \ INT Discard chains consisting of diff --git a/options.c b/options.c index 8ff99d5..08a4d93 100644 --- a/options.c +++ b/options.c @@ -156,6 +156,11 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) int mm_check_opt(const mm_idxopt_t *io, const mm_mapopt_t *mo) { + if (mo->bw > mo->bw_long) { + if (mm_verbose >= 1) + fprintf(stderr, "[ERROR]\033[1;31m with '-rNUM1,NUM2', NUM1 (%d) can't be larger than NUM2 (%d)\033[0m\n", mo->bw, mo->bw_long); + return -8; + } if ((mo->flag & MM_F_RMQ) && (mo->flag & (MM_F_SR|MM_F_SPLICE))) { if (mm_verbose >= 1) fprintf(stderr, "[ERROR]\033[1;31m --rmq doesn't work with --sr or --splice\033[0m\n"); From 15471bd6297bd6fe17a9ea235ce9a7bdd6effa6c Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 27 May 2021 15:26:04 -0400 Subject: [PATCH 67/77] Release minimap2-2.20 (r1061) --- NEWS.md | 14 +++++++++++++- README.md | 4 ++-- cookbook.md | 4 ++-- main.c | 2 +- map.c | 2 +- minimap2.1 | 2 +- misc/paftools.js | 2 +- options.c | 4 ++-- python/mappy.pyx | 2 +- setup.py | 2 +- 10 files changed, 25 insertions(+), 13 deletions(-) diff --git a/NEWS.md b/NEWS.md index 48dd14d..b770498 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,15 @@ +Release 2.20-r1061 (27 May 2021) +-------------------------------- + +This release fixed a bug in the Python module and improves the command-line +compatibiliity with v2.18. In v2.19, if `-r` is specified with an `asm*` preset, +users would get alignments more fragmented than v2.18. This could be an issue +for existing pipelines specifying `-r`. This release resolves this issue. + +(2.20: 27 May 2021, r1061) + + + Release 2.19-r1057 (26 May 2021) -------------------------------- @@ -33,7 +45,7 @@ unimap for contig alignment. Unimap will remain an experimental project and is no longer recommended over minimap2. Sorry for reverting the recommendation in short time. -(2.20: 26 May 2021, r1057) +(2.19: 26 May 2021, r1057) diff --git a/README.md b/README.md index 0141a35..6e10935 100644 --- a/README.md +++ b/README.md @@ -74,8 +74,8 @@ Detailed evaluations are available from the [minimap2 paper][doi] or the Minimap2 is optimized for x86-64 CPUs. You can acquire precompiled binaries from the [release page][release] with: ```sh -curl -L https://github.com/lh3/minimap2/releases/download/v2.19/minimap2-2.19_x64-linux.tar.bz2 | tar -jxvf - -./minimap2-2.19_x64-linux/minimap2 +curl -L https://github.com/lh3/minimap2/releases/download/v2.20/minimap2-2.20_x64-linux.tar.bz2 | tar -jxvf - +./minimap2-2.20_x64-linux/minimap2 ``` If you want to compile from the source, you need to have a C compiler, GNU make and zlib development files installed. Then type `make` in the source code diff --git a/cookbook.md b/cookbook.md index 84e5be8..d72e646 100644 --- a/cookbook.md +++ b/cookbook.md @@ -31,8 +31,8 @@ To acquire the data used in this cookbook and to install minimap2 and paftools, please follow the command lines below: ```sh # install minimap2 executables -curl -L https://github.com/lh3/minimap2/releases/download/v2.19/minimap2-2.19_x64-linux.tar.bz2 | tar jxf - -cp minimap2-2.19_x64-linux/{minimap2,k8,paftools.js} . # copy executables +curl -L https://github.com/lh3/minimap2/releases/download/v2.20/minimap2-2.20_x64-linux.tar.bz2 | tar jxf - +cp minimap2-2.20_x64-linux/{minimap2,k8,paftools.js} . # copy executables export PATH="$PATH:"`pwd` # put the current directory on PATH # download example datasets curl -L https://github.com/lh3/minimap2/releases/download/v2.10/cookbook-data.tgz | tar zxf - diff --git a/main.c b/main.c index acd573e..32eb211 100644 --- a/main.c +++ b/main.c @@ -7,7 +7,7 @@ #include "mmpriv.h" #include "ketopt.h" -#define MM_VERSION "2.19-r1060-dirty" +#define MM_VERSION "2.20-r1061" #ifdef __linux__ #include diff --git a/map.c b/map.c index a41b7a9..a079d9e 100644 --- a/map.c +++ b/map.c @@ -277,7 +277,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char ** opt->chain_gap_scale * 0.01 * mi->k, 0.0f, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km); } - if (opt->bw_long > opt->bw && (opt->flag & MM_F_NO_LJOIN) == 0 && n_segs == 1 && n_regs0 > 1) { // re-chain/long-join for long sequences + if (opt->bw_long > opt->bw && (opt->flag & (MM_F_SPLICE|MM_F_SR|MM_F_NO_LJOIN)) == 0 && n_segs == 1 && n_regs0 > 1) { // re-chain/long-join for long sequences int32_t st = (int32_t)a[0].y, en = (int32_t)a[(int32_t)u[0] - 1].y; if (qlen_sum - (en - st) > opt->rmq_rescue_size || en - st > qlen_sum * opt->rmq_rescue_ratio) { int32_t i; diff --git a/minimap2.1 b/minimap2.1 index 251ed99..11bc4f5 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -1,4 +1,4 @@ -.TH minimap2 1 "26 May 2021" "minimap2-2.19 (r1057)" "Bioinformatics tools" +.TH minimap2 1 "27 May 2021" "minimap2-2.20 (r1061)" "Bioinformatics tools" .SH NAME .PP minimap2 - mapping and alignment between collections of DNA sequences diff --git a/misc/paftools.js b/misc/paftools.js index 313e490..f048b07 100755 --- a/misc/paftools.js +++ b/misc/paftools.js @@ -1,6 +1,6 @@ #!/usr/bin/env k8 -var paftools_version = '2.19-r1057'; +var paftools_version = '2.20-r1061'; /***************************** ***** Library functions ***** diff --git a/options.c b/options.c index 08a4d93..dfd274d 100644 --- a/options.c +++ b/options.c @@ -75,7 +75,7 @@ void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi) void mm_mapopt_max_intron_len(mm_mapopt_t *opt, int max_intron_len) { if ((opt->flag & MM_F_SPLICE) && max_intron_len > 0) - opt->max_gap_ref = opt->bw = max_intron_len; + opt->max_gap_ref = opt->bw = opt->bw_long = max_intron_len; } int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) @@ -109,7 +109,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) io->flag = 0, io->k = 19, io->w = 19; mo->bw = mo->bw_long = 100000; mo->max_gap = 10000; - mo->flag |= MM_F_RMQ | MM_F_NO_LJOIN; + mo->flag |= MM_F_RMQ; mo->min_mid_occ = 50, mo->max_mid_occ = 500; mo->min_dp_max = 200; mo->best_n = 50; diff --git a/python/mappy.pyx b/python/mappy.pyx index bd53bac..d0bf7f8 100644 --- a/python/mappy.pyx +++ b/python/mappy.pyx @@ -3,7 +3,7 @@ from libc.stdlib cimport free cimport cmappy import sys -__version__ = '2.19' +__version__ = '2.20' cmappy.mm_reset_timer() diff --git a/setup.py b/setup.py index 8bc1f00..c0b576a 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ def readme(): setup( name = 'mappy', - version = '2.19', + version = '2.20', url = 'https://github.com/lh3/minimap2', description = 'Minimap2 python binding', long_description = readme(), From ce7a59f4127ceb26c1d42b5274f2d1c094caeaa0 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 27 May 2021 15:43:04 -0400 Subject: [PATCH 68/77] Fixed a typo in README I just hate my butterfly keyboard! --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6e10935..1255c91 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ cd minimap2 && make # use presets (no test data) ./minimap2 -ax map-pb ref.fa pacbio.fq.gz > aln.sam # PacBio CLR genomic reads ./minimap2 -ax map-ont ref.fa ont.fq.gz > aln.sam # Oxford Nanopore genomic reads -./minimap2 -ax map-hifi ref.fa pacbio-ccs.fq.gz > aln.sam # PacBio HiFi/CCS genomic reads (v2.19 or lateer) +./minimap2 -ax map-hifi ref.fa pacbio-ccs.fq.gz > aln.sam # PacBio HiFi/CCS genomic reads (v2.19 or later) ./minimap2 -ax asm20 ref.fa pacbio-ccs.fq.gz > aln.sam # PacBio HiFi/CCS genomic reads (v2.18 or earlier) ./minimap2 -ax sr ref.fa read1.fa read2.fa > aln.sam # short genomic paired-end reads ./minimap2 -ax splice ref.fa rna-reads.fa > aln.sam # spliced long reads (strand unknown) From c83589b9ea61a7a2ebc32659b1147e8f0b7a0575 Mon Sep 17 00:00:00 2001 From: Torsten Houwaart Date: Mon, 7 Jun 2021 15:31:58 +0200 Subject: [PATCH 69/77] Update README.md Clarification on approximate mapping --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1255c91..d016a46 100644 --- a/README.md +++ b/README.md @@ -96,7 +96,7 @@ with the ARM related command lines given above. Without any options, minimap2 takes a reference database and a query sequence file as input and produce approximate mapping, without base-level alignment -(i.e. no CIGAR), in the [PAF format][paf]: +(i.e. coordinates are only approximate and no CIGAR in output), in the [PAF format][paf]: ```sh minimap2 ref.fa query.fq > approx-mapping.paf ``` From 3f7147864bcc48604fe0065e68e31dc063c8572d Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 16 Jun 2021 09:27:30 -0400 Subject: [PATCH 70/77] r1063: fixed an uninitialized access (#752) This one is harmless. --- main.c | 2 +- seed.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/main.c b/main.c index 32eb211..030e023 100644 --- a/main.c +++ b/main.c @@ -7,7 +7,7 @@ #include "mmpriv.h" #include "ketopt.h" -#define MM_VERSION "2.20-r1061" +#define MM_VERSION "2.20-r1063-dirty" #ifdef __linux__ #include diff --git a/seed.c b/seed.c index bd250d9..d5a1c48 100644 --- a/seed.c +++ b/seed.c @@ -46,7 +46,7 @@ void mm_seed_select(int32_t n, mm_seed_t *a, int len, int max_occ, int max_max_o int32_t pe = i == n? len : (uint32_t)a[i].q_pos>>1; int32_t j, k, st = last0 + 1, en = i; int32_t max_high_occ = (int32_t)((double)(pe - ps) / dist + .499); - //fprintf(stderr, "Y\t%d\t%d\n", ps, pe); + if (max_high_occ == 0) goto next_intv; if (max_high_occ > MAX_MAX_HIGH_OCC) max_high_occ = MAX_MAX_HIGH_OCC; for (j = st, k = 0; j < en && k < max_high_occ; ++j, ++k) @@ -64,7 +64,7 @@ void mm_seed_select(int32_t n, mm_seed_t *a, int len, int max_occ, int max_max_o if (a[j].n > max_max_occ) a[j].flt = 1; } - last0 = i; +next_intv: last0 = i; } } } From fe6a0bb337e68312538b4715f0850bcfc7131ccb Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 16 Jun 2021 10:16:36 -0400 Subject: [PATCH 71/77] r1064: fixed another uninitialized condition This one should also be harmless. It affects a min value, but that value is not actually used. --- krmq.h | 2 +- main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/krmq.h b/krmq.h index 4f4ea75..8fa1cce 100644 --- a/krmq.h +++ b/krmq.h @@ -247,7 +247,7 @@ int main(void) { unsigned char dir[KRMQ_MAX_DEPTH]; \ int i, d = 0, cmp; \ unsigned cnt = 0; \ - fake.__head.p[0] = *root_, fake.__head.p[1] = 0; \ + fake = **root_, fake.__head.p[0] = *root_, fake.__head.p[1] = 0; \ if (cnt_) *cnt_ = 0; \ if (x) { \ for (cmp = -1, p = &fake; cmp; cmp = __cmp(x, p)) { \ diff --git a/main.c b/main.c index 030e023..e58198f 100644 --- a/main.c +++ b/main.c @@ -7,7 +7,7 @@ #include "mmpriv.h" #include "ketopt.h" -#define MM_VERSION "2.20-r1063-dirty" +#define MM_VERSION "2.20-r1064-dirty" #ifdef __linux__ #include From e6cce019e4f2266a1f9c9f0b91f2a9fd432965c6 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 30 Jun 2021 19:20:06 -0400 Subject: [PATCH 72/77] r1068: fixed a bug caused by 3f71478 Resolves #752 (again) --- main.c | 2 +- seed.c | 25 +++++++++++++------------ 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/main.c b/main.c index e58198f..233d7dd 100644 --- a/main.c +++ b/main.c @@ -7,7 +7,7 @@ #include "mmpriv.h" #include "ketopt.h" -#define MM_VERSION "2.20-r1064-dirty" +#define MM_VERSION "2.20-r1068-dirty" #ifdef __linux__ #include diff --git a/seed.c b/seed.c index d5a1c48..baf1b20 100644 --- a/seed.c +++ b/seed.c @@ -46,25 +46,26 @@ void mm_seed_select(int32_t n, mm_seed_t *a, int len, int max_occ, int max_max_o int32_t pe = i == n? len : (uint32_t)a[i].q_pos>>1; int32_t j, k, st = last0 + 1, en = i; int32_t max_high_occ = (int32_t)((double)(pe - ps) / dist + .499); - if (max_high_occ == 0) goto next_intv; - if (max_high_occ > MAX_MAX_HIGH_OCC) - max_high_occ = MAX_MAX_HIGH_OCC; - for (j = st, k = 0; j < en && k < max_high_occ; ++j, ++k) - b[k] = (uint64_t)a[j].n<<32 | j; - ks_heapmake_uint64_t(k, b); // initialize the binomial heap - for (; j < en; ++j) { // if there are more, choose top max_high_occ - if (a[j].n < (int32_t)(b[0]>>32)) { // then update the heap - b[0] = (uint64_t)a[j].n<<32 | j; - ks_heapdown_uint64_t(0, k, b); + if (max_high_occ > 0) { + if (max_high_occ > MAX_MAX_HIGH_OCC) + max_high_occ = MAX_MAX_HIGH_OCC; + for (j = st, k = 0; j < en && k < max_high_occ; ++j, ++k) + b[k] = (uint64_t)a[j].n<<32 | j; + ks_heapmake_uint64_t(k, b); // initialize the binomial heap + for (; j < en; ++j) { // if there are more, choose top max_high_occ + if (a[j].n < (int32_t)(b[0]>>32)) { // then update the heap + b[0] = (uint64_t)a[j].n<<32 | j; + ks_heapdown_uint64_t(0, k, b); + } } + for (j = 0; j < k; ++j) a[(uint32_t)b[j]].flt = 1; } - for (j = 0; j < k; ++j) a[(uint32_t)b[j]].flt = 1; for (j = st; j < en; ++j) a[j].flt ^= 1; for (j = st; j < en; ++j) if (a[j].n > max_max_occ) a[j].flt = 1; } -next_intv: last0 = i; + last0 = i; } } } From 459ce04c8463331ae89ab3780fb02193a4289458 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 2 Jul 2021 11:45:21 -0400 Subject: [PATCH 73/77] r1069: fixed a regression in comparison to v2.18 for PE short reads. An interesting omission. Resolves #776 --- lchain.c | 5 +++-- main.c | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/lchain.c b/lchain.c index ac60ddb..220e3e9 100644 --- a/lchain.c +++ b/lchain.c @@ -114,8 +114,9 @@ static inline int32_t comput_sc(const mm128_t *ai, const mm128_t *aj, int32_t ma float lin_pen, log_pen; lin_pen = chn_pen_gap * (float)dd + chn_pen_skip * (float)dg; log_pen = dd >= 1? mg_log2(dd + 1) : 0.0f; // mg_log2() only works for dd>=2 - if (is_cdna) { - if (dr > dq) sc -= (int)(lin_pen < log_pen? lin_pen : log_pen); // deletion or jump between paired ends + if (is_cdna || sidi != sidj) { + if (sidi != sidj && dr == 0) ++sc; // possibly due to overlapping paired ends; give a minor bonus + else if (dr > dq || sidi != sidj) sc -= (int)(lin_pen < log_pen? lin_pen : log_pen); // deletion or jump between paired ends else sc -= (int)(lin_pen + .5f * log_pen); } else sc -= (int)(lin_pen + .5f * log_pen); } diff --git a/main.c b/main.c index 233d7dd..7973aa0 100644 --- a/main.c +++ b/main.c @@ -7,7 +7,7 @@ #include "mmpriv.h" #include "ketopt.h" -#define MM_VERSION "2.20-r1068-dirty" +#define MM_VERSION "2.20-r1069-dirty" #ifdef __linux__ #include From 177eef259d1d3f3aa453dd6bb7222e1cc3a0ce52 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Fri, 9 Apr 2021 19:34:53 +0100 Subject: [PATCH 74/77] Use the full MIDNSHP=X string whenever printing CIGAR strings Define MM_CIGAR_STR to the full string of CIGAR operators (including the 'B' operator as well) and use it throughout the C code. It would be possible to use it from the Cython code too, but it's easier to keep that as a Cython string literal to avoid adding extra runtime code to handle locale conversion. --- align.c | 2 +- example.c | 2 +- format.c | 4 ++-- minimap.h | 2 ++ misc/paftools.js | 6 +++--- python/mappy.pyx | 2 +- 6 files changed, 10 insertions(+), 8 deletions(-) diff --git a/align.c b/align.c index f2f154f..3b5ff9a 100644 --- a/align.c +++ b/align.c @@ -333,7 +333,7 @@ static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint int i; fprintf(stderr, "score=%d, cigar=", ez->score); for (i = 0; i < ez->n_cigar; ++i) - fprintf(stderr, "%d%c", ez->cigar[i]>>4, "MIDN"[ez->cigar[i]&0xf]); + fprintf(stderr, "%d%c", ez->cigar[i]>>4, MM_CIGAR_STR[ez->cigar[i]&0xf]); fprintf(stderr, "\n"); } } diff --git a/example.c b/example.c index ca1fdbe..b495051 100644 --- a/example.c +++ b/example.c @@ -47,7 +47,7 @@ int main(int argc, char *argv[]) printf("%s\t%d\t%d\t%d\t%c\t", ks->name.s, ks->seq.l, r->qs, r->qe, "+-"[r->rev]); printf("%s\t%d\t%d\t%d\t%d\t%d\t%d\tcg:Z:", mi->seq[r->rid].name, mi->seq[r->rid].len, r->rs, r->re, r->mlen, r->blen, r->mapq); for (i = 0; i < r->p->n_cigar; ++i) // IMPORTANT: this gives the CIGAR in the aligned regions. NO soft/hard clippings! - printf("%d%c", r->p->cigar[i]>>4, "MIDNSH"[r->p->cigar[i]&0xf]); + printf("%d%c", r->p->cigar[i]>>4, MM_CIGAR_STR[r->p->cigar[i]&0xf]); putchar('\n'); free(r->p); } diff --git a/format.c b/format.c index 11f24b5..a33c3b6 100644 --- a/format.c +++ b/format.c @@ -325,7 +325,7 @@ void mm_write_paf3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const uint32_t k; mm_sprintf_lite(s, "\tcg:Z:"); for (k = 0; k < r->p->n_cigar; ++k) - mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MIDNSHP=XB"[r->p->cigar[k]&0xf]); + mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, MM_CIGAR_STR[r->p->cigar[k]&0xf]); } if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_MD))) write_cs_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD, 1); @@ -382,7 +382,7 @@ static void write_sam_cigar(kstring_t *s, int sam_flag, int in_tag, int qlen, co assert(clip_len[0] < qlen && clip_len[1] < qlen); if (clip_len[0]) mm_sprintf_lite(s, "%d%c", clip_len[0], clip_char); for (k = 0; k < r->p->n_cigar; ++k) - mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MIDNSHP=XB"[r->p->cigar[k]&0xf]); + mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, MM_CIGAR_STR[r->p->cigar[k]&0xf]); if (clip_len[1]) mm_sprintf_lite(s, "%d%c", clip_len[1], clip_char); } } diff --git a/minimap.h b/minimap.h index 7386632..d095f46 100644 --- a/minimap.h +++ b/minimap.h @@ -46,6 +46,8 @@ #define MM_MAX_SEG 255 +#define MM_CIGAR_STR "MIDNSHP=XB" + #ifdef __cplusplus extern "C" { #endif diff --git a/misc/paftools.js b/misc/paftools.js index f048b07..999aa78 100755 --- a/misc/paftools.js +++ b/misc/paftools.js @@ -1419,7 +1419,7 @@ function paf_view(args) var s_ref = new Bytes(), s_qry = new Bytes(), s_mid = new Bytes(); // these are used to show padded alignment var re_cs = /([:=\-\+\*])(\d+|[A-Za-z]+)/g; - var re_cg = /(\d+)([MIDNSH])/g; + var re_cg = /(\d+)([MIDNSHP=X])/g; var buf = new Bytes(); var file = args[getopt.ind] == "-"? new File() : new File(args[getopt.ind]); @@ -1899,7 +1899,7 @@ function paf_splice2bed(args) a.length = 0; } - var re = /(\d+)([MIDNSH])/g; + var re = /(\d+)([MIDNSHP=X])/g; var c, fmt = "bed", fn_name_conv = null, keep_multi = false; while ((c = getopt(args, "f:n:m")) != null) { if (c == 'f') fmt = getopt.arg; @@ -2369,7 +2369,7 @@ function paf_junceval(args) file = getopt.ind+1 >= args.length || args[getopt.ind+1] == '-'? new File() : new File(args[getopt.ind+1]); var last_qname = null; - var re_cigar = /(\d+)([MIDNSHX=])/g; + var re_cigar = /(\d+)([MIDNSHP=X])/g; while (file.readline(buf) >= 0) { var m, t = buf.toString().split("\t"); var ctg_name = null, cigar = null, pos = null, qname = t[0]; diff --git a/python/mappy.pyx b/python/mappy.pyx index d0bf7f8..920bc0d 100644 --- a/python/mappy.pyx +++ b/python/mappy.pyx @@ -82,7 +82,7 @@ cdef class Alignment: @property def cigar_str(self): - return "".join(map(lambda x: str(x[0]) + 'MIDNSH'[x[1]], self._cigar)) + return "".join(map(lambda x: str(x[0]) + 'MIDNSHP=XB'[x[1]], self._cigar)) def __str__(self): if self._strand > 0: strand = '+' From 260a68d23264af55d78b39d55601e99b20d20c76 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Sat, 10 Apr 2021 12:49:39 +0100 Subject: [PATCH 75/77] Use #defines for CIGAR operators in C code Give the CIGAR constants names to clarify the code. So that ksw2.h remains self-contained, define KSW_* versions of the CIGAR operators it needs for use within ksw2.h. Other code should in general use the full set of MM_CIGAR_* constants in minimap.h. --- align.c | 60 +++++++++++++++++++++++++++---------------------------- format.c | 20 +++++++++---------- ksw2.h | 19 ++++++++++++------ minimap.h | 10 ++++++++++ 4 files changed, 63 insertions(+), 46 deletions(-) diff --git a/align.c b/align.c index 3b5ff9a..9d00bb4 100644 --- a/align.c +++ b/align.c @@ -53,16 +53,16 @@ static int mm_test_zdrop(void *km, const mm_mapopt_t *opt, const uint8_t *qseq, // find the score and the region where score drops most along diagonal for (k = 0, score = 0; k < n_cigar; ++k) { uint32_t l, op = cigar[k]&0xf, len = cigar[k]>>4; - if (op == 0) { + if (op == MM_CIGAR_MATCH) { for (l = 0; l < len; ++l) { score += mat[tseq[i + l] * 5 + qseq[j + l]]; update_max_zdrop(score, i+l, j+l, &max, &max_i, &max_j, opt->e, &max_zdrop, pos); } i += len, j += len; - } else if (op == 1 || op == 2 || op == 3) { + } else if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL || op == MM_CIGAR_N_SKIP) { score -= opt->q + opt->e * len; - if (op == 1) j += len; // insertion - else i += len; // deletion + if (op == MM_CIGAR_INS) j += len; + else i += len; update_max_zdrop(score, i, j, &max, &max_i, &max_j, opt->e, &max_zdrop, pos); } } @@ -98,12 +98,12 @@ static void mm_fix_cigar(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq, for (k = 0; k < p->n_cigar; ++k) { // indel left alignment uint32_t op = p->cigar[k]&0xf, len = p->cigar[k]>>4; if (len == 0) to_shrink = 1; - if (op == 0) { + if (op == MM_CIGAR_MATCH) { toff += len, qoff += len; - } else if (op == 1 || op == 2) { // insertion or deletion + } else if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL) { if (k > 0 && k < p->n_cigar - 1 && (p->cigar[k-1]&0xf) == 0 && (p->cigar[k+1]&0xf) == 0) { int l, prev_len = p->cigar[k-1] >> 4; - if (op == 1) { + if (op == MM_CIGAR_INS) { for (l = 0; l < prev_len; ++l) if (qseq[qoff - 1 - l] != qseq[qoff + len - 1 - l]) break; @@ -116,9 +116,9 @@ static void mm_fix_cigar(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq, p->cigar[k-1] -= l<<4, p->cigar[k+1] += l<<4, qoff -= l, toff -= l; if (l == prev_len) to_shrink = 1; } - if (op == 1) qoff += len; + if (op == MM_CIGAR_INS) qoff += len; else toff += len; - } else if (op == 3) { + } else if (op == MM_CIGAR_N_SKIP) { toff += len; } } @@ -128,13 +128,13 @@ static void mm_fix_cigar(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq, uint32_t l, s[3] = {0,0,0}; for (l = k; l < p->n_cigar; ++l) { // count number of adjacent I and D uint32_t op = p->cigar[l]&0xf; - if (op == 1 || op == 2 || p->cigar[l]>>4 == 0) + if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL || p->cigar[l]>>4 == 0) s[op] += p->cigar[l] >> 4; else break; } if (s[1] > 0 && s[2] > 0 && l - k > 2) { // turn to a single I and a single D - p->cigar[k] = s[1]<<4|1; - p->cigar[k+1] = s[2]<<4|2; + p->cigar[k] = s[1]<<4|MM_CIGAR_INS; + p->cigar[k+1] = s[2]<<4|MM_CIGAR_DEL; for (k += 2; k < l; ++k) p->cigar[k] &= 0xf; to_shrink = 1; @@ -154,9 +154,9 @@ static void mm_fix_cigar(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq, else p->cigar[k+1] += p->cigar[k]>>4<<4; // add length to the next CIGAR operator p->n_cigar = l; } - if ((p->cigar[0]&0xf) == 1 || (p->cigar[0]&0xf) == 2) { // get rid of leading I or D + if ((p->cigar[0]&0xf) == MM_CIGAR_INS || (p->cigar[0]&0xf) == MM_CIGAR_DEL) { // get rid of leading I or D int32_t l = p->cigar[0] >> 4; - if ((p->cigar[0]&0xf) == 1) { + if ((p->cigar[0]&0xf) == MM_CIGAR_INS) { if (r->rev) r->qe -= l; else r->qs += l; *qshift = l; @@ -174,7 +174,7 @@ static void mm_update_cigar_eqx(mm_reg1_t *r, const uint8_t *qseq, const uint8_t if (r->p == 0) return; for (k = 0; k < r->p->n_cigar; ++k) { uint32_t op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4; - if (op == 0) { + if (op == MM_CIGAR_MATCH) { while (len > 0) { for (l = 0; l < len && qseq[qoff + l] == tseq[toff + l]; ++l) {} // run of "="; TODO: N<=>N is converted to "=" if (l > 0) { ++n_EQX; len -= l; toff += l; qoff += l; } @@ -183,11 +183,11 @@ static void mm_update_cigar_eqx(mm_reg1_t *r, const uint8_t *qseq, const uint8_t if (l > 0) { ++n_EQX; len -= l; toff += l; qoff += l; } } ++n_M; - } else if (op == 1) { // insertion + } else if (op == MM_CIGAR_INS) { qoff += len; - } else if (op == 2) { // deletion + } else if (op == MM_CIGAR_DEL) { toff += len; - } else if (op == 3) { // intron + } else if (op == MM_CIGAR_N_SKIP) { toff += len; } } @@ -195,7 +195,7 @@ static void mm_update_cigar_eqx(mm_reg1_t *r, const uint8_t *qseq, const uint8_t if (n_EQX == n_M) { for (k = 0; k < r->p->n_cigar; ++k) { uint32_t op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4; - if (op == 0) r->p->cigar[k] = len << 4 | 7; + if (op == MM_CIGAR_MATCH) r->p->cigar[k] = len << 4 | MM_CIGAR_EQ_MATCH; } return; } @@ -209,25 +209,25 @@ static void mm_update_cigar_eqx(mm_reg1_t *r, const uint8_t *qseq, const uint8_t toff = qoff = m = 0; for (k = 0; k < r->p->n_cigar; ++k) { uint32_t op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4; - if (op == 0) { // match/mismatch + if (op == MM_CIGAR_MATCH) { while (len > 0) { // match for (l = 0; l < len && qseq[qoff + l] == tseq[toff + l]; ++l) {} - if (l > 0) p->cigar[m++] = l << 4 | 7; + if (l > 0) p->cigar[m++] = l << 4 | MM_CIGAR_EQ_MATCH; len -= l; toff += l, qoff += l; // mismatch for (l = 0; l < len && qseq[qoff + l] != tseq[toff + l]; ++l) {} - if (l > 0) p->cigar[m++] = l << 4 | 8; + if (l > 0) p->cigar[m++] = l << 4 | MM_CIGAR_X_MISMATCH; len -= l; toff += l, qoff += l; } continue; - } else if (op == 1) { // insertion + } else if (op == MM_CIGAR_INS) { qoff += len; - } else if (op == 2) { // deletion + } else if (op == MM_CIGAR_DEL) { toff += len; - } else if (op == 3) { // intron + } else if (op == MM_CIGAR_N_SKIP) { toff += len; } p->cigar[m++] = r->p->cigar[k]; @@ -248,7 +248,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts r->blen = r->mlen = 0; for (k = 0; k < p->n_cigar; ++k) { uint32_t op = p->cigar[k]&0xf, len = p->cigar[k]>>4; - if (op == 0) { // match/mismatch + if (op == MM_CIGAR_MATCH) { int n_ambi = 0, n_diff = 0; for (l = 0; l < len; ++l) { int cq = qseq[qoff + l], ct = tseq[toff + l]; @@ -260,7 +260,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts } r->blen += len - n_ambi, r->mlen += len - (n_ambi + n_diff), p->n_ambi += n_ambi; toff += len, qoff += len; - } else if (op == 1) { // insertion + } else if (op == MM_CIGAR_INS) { int n_ambi = 0; for (l = 0; l < len; ++l) if (qseq[qoff + l] > 3) ++n_ambi; @@ -268,7 +268,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts s -= q + e * len; if (s < 0) s = 0; qoff += len; - } else if (op == 2) { // deletion + } else if (op == MM_CIGAR_DEL) { int n_ambi = 0; for (l = 0; l < len; ++l) if (tseq[toff + l] > 3) ++n_ambi; @@ -276,7 +276,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts s -= q + e * len; if (s < 0) s = 0; toff += len; - } else if (op == 3) { // intron + } else if (op == MM_CIGAR_N_SKIP) { toff += len; } } @@ -730,7 +730,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int if (qseq[j] >= 4 || tseq[j] >= 4) ez->score += opt->e2; else ez->score += qseq[j] == tseq[j]? opt->a : -opt->b; } - ez->cigar = ksw_push_cigar(km, &ez->n_cigar, &ez->m_cigar, ez->cigar, 0, qe - qs); + ez->cigar = ksw_push_cigar(km, &ez->n_cigar, &ez->m_cigar, ez->cigar, MM_CIGAR_MATCH, qe - qs); } else { // perform normal gapped alignment mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, junc, mat, bw1, -1, opt->zdrop, extra_flag|KSW_EZ_APPROX_MAX, ez); // first pass: with approximate Z-drop } diff --git a/format.c b/format.c index a33c3b6..f94290f 100644 --- a/format.c +++ b/format.c @@ -144,8 +144,8 @@ static void write_cs_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq if (write_tag) mm_sprintf_lite(s, "\tcs:Z:"); for (i = q_off = t_off = 0; i < (int)r->p->n_cigar; ++i) { int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4; - assert((op >= 0 && op <= 3) || op == 7 || op == 8); - if (op == 0 || op == 7 || op == 8) { // match + assert((op >= MM_CIGAR_MATCH && op <= MM_CIGAR_N_SKIP) || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH); + if (op == MM_CIGAR_MATCH || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH) { int l_tmp = 0; for (j = 0; j < len; ++j) { if (qseq[q_off + j] != tseq[t_off + j]) { @@ -166,12 +166,12 @@ static void write_cs_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq } else mm_sprintf_lite(s, ":%d", l_tmp); } q_off += len, t_off += len; - } else if (op == 1) { // insertion to ref + } else if (op == MM_CIGAR_INS) { for (j = 0, tmp[len] = 0; j < len; ++j) tmp[j] = "acgtn"[qseq[q_off + j]]; mm_sprintf_lite(s, "+%s", tmp); q_off += len; - } else if (op == 2) { // deletion from ref + } else if (op == MM_CIGAR_DEL) { for (j = 0, tmp[len] = 0; j < len; ++j) tmp[j] = "acgtn"[tseq[t_off + j]]; mm_sprintf_lite(s, "-%s", tmp); @@ -192,8 +192,8 @@ static void write_MD_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq if (write_tag) mm_sprintf_lite(s, "\tMD:Z:"); for (i = q_off = t_off = 0; i < (int)r->p->n_cigar; ++i) { int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4; - assert((op >= 0 && op <= 3) || op == 7 || op == 8); - if (op == 0 || op == 7 || op == 8) { // match + assert((op >= MM_CIGAR_MATCH && op <= MM_CIGAR_N_SKIP) || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH); + if (op == MM_CIGAR_MATCH || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH) { for (j = 0; j < len; ++j) { if (qseq[q_off + j] != tseq[t_off + j]) { mm_sprintf_lite(s, "%d%c", l_MD, "ACGTN"[tseq[t_off + j]]); @@ -201,15 +201,15 @@ static void write_MD_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq } else ++l_MD; } q_off += len, t_off += len; - } else if (op == 1) { // insertion to ref + } else if (op == MM_CIGAR_INS) { q_off += len; - } else if (op == 2) { // deletion from ref + } else if (op == MM_CIGAR_DEL) { for (j = 0, tmp[len] = 0; j < len; ++j) tmp[j] = "ACGTN"[tseq[t_off + j]]; mm_sprintf_lite(s, "%d^%s", l_MD, tmp); l_MD = 0; t_off += len; - } else if (op == 3) { // reference skip + } else if (op == MM_CIGAR_N_SKIP) { t_off += len; } } @@ -271,7 +271,7 @@ double mm_event_identity(const mm_reg1_t *r) if (r->p == 0) return -1.0f; for (i = 0; i < r->p->n_cigar; ++i) { int32_t op = r->p->cigar[i] & 0xf, len = r->p->cigar[i] >> 4; - if (op == 1 || op == 2) + if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL) ++n_gapo, n_gap += len; } return (double)r->mlen / (r->blen + r->p->n_ambi - n_gap + n_gapo); diff --git a/ksw2.h b/ksw2.h index c700136..cbd1ddc 100644 --- a/ksw2.h +++ b/ksw2.h @@ -16,6 +16,13 @@ #define KSW_EZ_SPLICE_REV 0x200 #define KSW_EZ_SPLICE_FLANK 0x400 +// The subset of CIGAR operators used by ksw code. +// Use MM_CIGAR_* from minimap.h if you need the full list. +#define KSW_CIGAR_MATCH 0 +#define KSW_CIGAR_INS 1 +#define KSW_CIGAR_DEL 2 +#define KSW_CIGAR_N_SKIP 3 + #ifdef __cplusplus extern "C" { #endif @@ -137,13 +144,13 @@ static inline void ksw_backtrack(void *km, int is_rot, int is_rev, int min_intro else if (!(tmp >> (state + 2) & 1)) state = 0; // if requesting other states, _state_ stays the same if it is a continuation; otherwise, set to H if (state == 0) state = tmp & 7; // TODO: probably this line can be merged into the "else if" line right above; not 100% sure if (force_state >= 0) state = force_state; - if (state == 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 0, 1), --i, --j; // match - else if (state == 1 || (state == 3 && min_intron_len <= 0)) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 2, 1), --i; // deletion - else if (state == 3 && min_intron_len > 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 3, 1), --i; // intron - else cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 1, 1), --j; // insertion + if (state == 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_MATCH, 1), --i, --j; + else if (state == 1 || (state == 3 && min_intron_len <= 0)) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_DEL, 1), --i; + else if (state == 3 && min_intron_len > 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_N_SKIP, 1), --i; + else cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_INS, 1), --j; } - if (i >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, min_intron_len > 0 && i >= min_intron_len? 3 : 2, i + 1); // first deletion - if (j >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 1, j + 1); // first insertion + if (i >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, min_intron_len > 0 && i >= min_intron_len? KSW_CIGAR_N_SKIP : KSW_CIGAR_DEL, i + 1); // first deletion + if (j >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_INS, j + 1); // first insertion if (!is_rev) for (i = 0; i < n_cigar>>1; ++i) // reverse CIGAR tmp = cigar[i], cigar[i] = cigar[n_cigar-1-i], cigar[n_cigar-1-i] = tmp; diff --git a/minimap.h b/minimap.h index d095f46..fecd775 100644 --- a/minimap.h +++ b/minimap.h @@ -46,6 +46,16 @@ #define MM_MAX_SEG 255 +#define MM_CIGAR_MATCH 0 +#define MM_CIGAR_INS 1 +#define MM_CIGAR_DEL 2 +#define MM_CIGAR_N_SKIP 3 +#define MM_CIGAR_SOFTCLIP 4 +#define MM_CIGAR_HARDCLIP 5 +#define MM_CIGAR_PADDING 6 +#define MM_CIGAR_EQ_MATCH 7 +#define MM_CIGAR_X_MISMATCH 8 + #define MM_CIGAR_STR "MIDNSHP=XB" #ifdef __cplusplus From 52dbd439bcd6bb08b428aed1c0f9b778af04d78e Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 2 Jul 2021 20:26:13 -0400 Subject: [PATCH 76/77] r1080: re-versioning --- main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.c b/main.c index 7973aa0..cc586cb 100644 --- a/main.c +++ b/main.c @@ -7,7 +7,7 @@ #include "mmpriv.h" #include "ketopt.h" -#define MM_VERSION "2.20-r1069-dirty" +#define MM_VERSION "2.20-r1080-dirty" #ifdef __linux__ #include From 581f2d7123f651d04c9e103b1c7ea8f7051e909a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 6 Jul 2021 13:18:55 -0400 Subject: [PATCH 77/77] Release minimap2-2.21 (r1071) --- NEWS.md | 11 +++++++++++ README.md | 4 ++-- cookbook.md | 4 ++-- main.c | 2 +- minimap2.1 | 6 +++--- misc/paftools.js | 2 +- python/mappy.pyx | 2 +- setup.py | 2 +- 8 files changed, 22 insertions(+), 11 deletions(-) diff --git a/NEWS.md b/NEWS.md index b770498..425545a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,14 @@ +Release 2.21-r1071 (6 July 2021) +-------------------------------- + +This release fixed a regression in short-read mapping introduced in v2.19 +(#776). It also fixed invalid comparisons of uninitialized variables, though +these are harmless (#752). Long-read alignment should be identical to v2.20. + +(2.21: 6 July 2021) + + + Release 2.20-r1061 (27 May 2021) -------------------------------- diff --git a/README.md b/README.md index d016a46..0bff2e4 100644 --- a/README.md +++ b/README.md @@ -74,8 +74,8 @@ Detailed evaluations are available from the [minimap2 paper][doi] or the Minimap2 is optimized for x86-64 CPUs. You can acquire precompiled binaries from the [release page][release] with: ```sh -curl -L https://github.com/lh3/minimap2/releases/download/v2.20/minimap2-2.20_x64-linux.tar.bz2 | tar -jxvf - -./minimap2-2.20_x64-linux/minimap2 +curl -L https://github.com/lh3/minimap2/releases/download/v2.21/minimap2-2.21_x64-linux.tar.bz2 | tar -jxvf - +./minimap2-2.21_x64-linux/minimap2 ``` If you want to compile from the source, you need to have a C compiler, GNU make and zlib development files installed. Then type `make` in the source code diff --git a/cookbook.md b/cookbook.md index d72e646..8ff2bd0 100644 --- a/cookbook.md +++ b/cookbook.md @@ -31,8 +31,8 @@ To acquire the data used in this cookbook and to install minimap2 and paftools, please follow the command lines below: ```sh # install minimap2 executables -curl -L https://github.com/lh3/minimap2/releases/download/v2.20/minimap2-2.20_x64-linux.tar.bz2 | tar jxf - -cp minimap2-2.20_x64-linux/{minimap2,k8,paftools.js} . # copy executables +curl -L https://github.com/lh3/minimap2/releases/download/v2.21/minimap2-2.21_x64-linux.tar.bz2 | tar jxf - +cp minimap2-2.21_x64-linux/{minimap2,k8,paftools.js} . # copy executables export PATH="$PATH:"`pwd` # put the current directory on PATH # download example datasets curl -L https://github.com/lh3/minimap2/releases/download/v2.10/cookbook-data.tgz | tar zxf - diff --git a/main.c b/main.c index cc586cb..f4167d8 100644 --- a/main.c +++ b/main.c @@ -7,7 +7,7 @@ #include "mmpriv.h" #include "ketopt.h" -#define MM_VERSION "2.20-r1080-dirty" +#define MM_VERSION "2.21-r1071" #ifdef __linux__ #include diff --git a/minimap2.1 b/minimap2.1 index 11bc4f5..a1c0fb8 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -1,4 +1,4 @@ -.TH minimap2 1 "27 May 2021" "minimap2-2.20 (r1061)" "Bioinformatics tools" +.TH minimap2 1 "6 July 2021" "minimap2-2.21 (r1071)" "Bioinformatics tools" .SH NAME .PP minimap2 - mapping and alignment between collections of DNA sequences @@ -592,8 +592,8 @@ Long-read splice alignment for PacBio CCS reads .B sr Short single-end reads without splicing .RB ( -k21 -.B -w11 --sr --frag=yes -A2 -B8 -O12,32 -E2,1 -r50 -p.5 -N20 -f1000,5000 -n2 -m20 -.B -s40 -g200 -2K50m --heap-sort=yes +.B -w11 --sr --frag=yes -A2 -B8 -O12,32 -E2,1 -r100 -p.5 -N20 -f1000,5000 -n2 -m20 +.B -s40 -g100 -2K50m --heap-sort=yes .BR --secondary=no ). .TP .B ava-pb diff --git a/misc/paftools.js b/misc/paftools.js index 999aa78..0c5c186 100755 --- a/misc/paftools.js +++ b/misc/paftools.js @@ -1,6 +1,6 @@ #!/usr/bin/env k8 -var paftools_version = '2.20-r1061'; +var paftools_version = '2.21-r1071'; /***************************** ***** Library functions ***** diff --git a/python/mappy.pyx b/python/mappy.pyx index 920bc0d..838b5ad 100644 --- a/python/mappy.pyx +++ b/python/mappy.pyx @@ -3,7 +3,7 @@ from libc.stdlib cimport free cimport cmappy import sys -__version__ = '2.20' +__version__ = '2.21' cmappy.mm_reset_timer() diff --git a/setup.py b/setup.py index c0b576a..1a4d227 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ def readme(): setup( name = 'mappy', - version = '2.20', + version = '2.21', url = 'https://github.com/lh3/minimap2', description = 'Minimap2 python binding', long_description = readme(),