r1034: changed multiple defaults; updated manpage

This commit is contained in:
Heng Li 2021-05-03 22:51:34 -04:00
parent bbb4f97e52
commit 6c96078ed0
3 changed files with 68 additions and 60 deletions

5
main.c
View File

@ -7,7 +7,7 @@
#include "mmpriv.h"
#include "ketopt.h"
#define MM_VERSION "2.18-r1028-dirty"
#define MM_VERSION "2.18-r1034-dirty"
#ifdef __linux__
#include <sys/resource.h>
@ -339,7 +339,8 @@ int main(int argc, char *argv[])
fprintf(fp_help, " --version show version number\n");
fprintf(fp_help, " Preset:\n");
fprintf(fp_help, " -x STR preset (always applied before other options; see minimap2.1 for details) []\n");
fprintf(fp_help, " - map-pb/map-ont - PacBio/Nanopore vs reference mapping\n");
fprintf(fp_help, " - map-pb/map-ont - PacBio CLR/Nanopore vs reference mapping\n");
fprintf(fp_help, " - map-hifi - PacBio HiFi reads vs reference mapping\n");
fprintf(fp_help, " - ava-pb/ava-ont - PacBio/Nanopore read overlap\n");
fprintf(fp_help, " - asm5/asm10/asm20 - asm-to-ref mapping, for ~0.1/1/5%% sequence divergence\n");
fprintf(fp_help, " - splice/splice:hq - long-read/Pacbio-CCS spliced alignment\n");

View File

@ -145,18 +145,25 @@ or
.B -xsr
mode, which sets the threshold for a second round of seeding.
.TP
.BI --min-occ-floor \ INT
Force minimap2 to always use k-mers occurring
.I INT
times or less [0]. In effect, the max occurrence threshold is set to
the
.RI max{ INT ,
.BR -f }.
.BI -U \ INT1 [, INT2 ]
Lower and upper bounds of k-mer occurrences [10,1000000]. The final k-mer occurrence threshold is
.RI max{ INT1 ,\ min{ INT2 ,
.BR -f }}.
This option prevents excessively small or large
.B -f
estimated from the input reference. It deprecates
.B --min-occ-floor
in earlier versions of minimap2.
.TP
.BI -g \ INT
.BI -e \ INT
Sample a high-frequency minimizer every
.I INT
basepairs [500].
.TP
.BI -g \ NUM
Stop chain elongation if there are no minimizers within
.IR INT -bp
[10000].
.IR NUM -bp
[10k].
.TP
.BI -r \ INT
Bandwidth used in chaining and DP-based alignment [500]. This option
@ -234,6 +241,10 @@ Mark as secondary a chain that overlaps with a better chain by
.I FLOAT
or more of the shorter chain [0.5]
.TP
.BR --rmq = no | yes
Use the minigraph chaining algorithm [no]. The minigraph algorithm is better
for aligning contigs through long INDELs.
.TP
.B --hard-mask-level
Honor option
.B -M
@ -412,7 +423,7 @@ alignment.
.BI --cap-sw-mem \ NUM
Skip alignment if the DP matrix size is above
.IR NUM .
Set 0 to disable [0].
Set 0 to disable [100m].
.SS Input/output options
.TP 10
.B -a
@ -523,66 +534,47 @@ Available
.I STR
are:
.RS
.TP 9
.B map-pb
PacBio/Oxford Nanopore read to reference mapping
.RB ( -Hk19 )
.TP
.TP 10
.B map-ont
Slightly more sensitive for Oxford Nanopore to reference mapping
.RB ( -k15 ).
For PacBio reads, HPC minimizers consistently lead to faster performance and
more sensitive results in comparison to normal minimizers. For Oxford Nanopore
data, normal minimizers are better, though not much. The effectiveness of HPC
is determined by the sequencing error mode.
Align noisy long reads of ~10% error rate to a reference genome. This is the
default mode.
.TP
.B map-hifi
PacBio HiFi reads to reference mapping
Align PacBio high-fidelity (HiFi) reads to a reference genome
.RB ( -k19
.B -w10 -A1 -B4 -O6,26 -E2,1 -s200 -e100 -g10k
.BR -U100,500 ).
.B -w19 -U50,500 -A1 -B4 -O6,26 -E2,1
.BR -s200 ).
.TP
.B map-pb
Align older PacBio continuous long (CLR) reads to a reference genome
.RB ( -Hk19 ).
.TP
.B asm5
Long assembly to reference mapping
.RB ( -k19
.B -w19 -A1 -B19 -O39,81 -E3,1 -s200 -z200 -N50
.BR --min-occ-floor=100 ).
.B -w19 -U50,500 --rmq -r100k --no-long-join -A1 -B19 -O39,81 -E3,1 -s200 -z200
.BR -N50 ).
Typically, the alignment will not extend to regions with 5% or higher sequence
divergence. Only use this preset if the average divergence is far below 5%.
.TP
.B asm10
Long assembly to reference mapping
.RB ( -k19
.B -w19 -A1 -B9 -O16,41 -E2,1 -s200 -z200 -N50
.BR --min-occ-floor=100 ).
.B -w19 -U50,500 --rmq -r100k --no-long-join -A1 -B9 -O16,41 -E2,1 -s200 -z200
.BR -N50 ).
Up to 10% sequence divergence.
.TP
.B asm20
Long assembly to reference mapping
.RB ( -k19
.B -w10 -A1 -B4 -O6,26 -E2,1 -s200 -z200 -N50
.BR --min-occ-floor=100 ).
.B -w10 -U50,500 --rmq -r100k --no-long-join -A1 -B4 -O6,26 -E2,1 -s200 -z200
.BR -N50 ).
Up to 20% sequence divergence.
.TP
.B ava-pb
PacBio all-vs-all overlap mapping
.RB ( -Hk19
.B -Xw5 -m100 -g10000 --max-chain-skip
.BR 25 ).
.TP
.B ava-ont
Oxford Nanopore all-vs-all overlap mapping
.RB ( -k15
.B -Xw5 -m100 -g10000 -r2000 --max-chain-skip
.BR 25 ).
Similarly, the major difference from
.B ava-pb
is that this preset is not using HPC minimizers.
.TP
.B splice
Long-read spliced alignment
.RB ( -k15
.B -w5 --splice -g2000 -G200k -A1 -B2 -O2,32 -E1,0 -C9 -z200 -ub --junc-bonus=9
.B -w5 --splice -g2000 -G200k -A1 -B2 -O2,32 -E1,0 -C9 -z200 -ub --junc-bonus=9 --cap-sw-mem=0
.BR --splice-flank=yes ).
In the splice mode, 1) long deletions are taken as introns and represented as
the
@ -604,6 +596,18 @@ Short single-end reads without splicing
.B -w11 --sr --frag=yes -A2 -B8 -O12,32 -E2,1 -r50 -p.5 -N20 -f1000,5000 -n2 -m20
.B -s40 -g200 -2K50m --heap-sort=yes
.BR --secondary=no ).
.TP
.B ava-pb
PacBio CLR all-vs-all overlap mapping
.RB ( -Hk19
.B -Xw5 -e0
.BR -m100 ).
.TP
.B ava-ont
Oxford Nanopore all-vs-all overlap mapping
.RB ( -k15
.B -Xw5 -e0 -m100
.BR -r2k ).
.RE
.SS Miscellaneous options
.TP 10

View File

@ -16,7 +16,8 @@ void mm_mapopt_init(mm_mapopt_t *opt)
memset(opt, 0, sizeof(mm_mapopt_t));
opt->seed = 11;
opt->mid_occ_frac = 2e-4f;
opt->max_mid_occ = 1000000000;
opt->min_mid_occ = 10;
opt->max_mid_occ = 1000000;
opt->sdust_thres = 0; // no SDUST masking
opt->min_cnt = 3;
@ -63,12 +64,13 @@ void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi)
{
if ((opt->flag & MM_F_SPLICE_FOR) || (opt->flag & MM_F_SPLICE_REV))
opt->flag |= MM_F_SPLICE;
if (opt->mid_occ <= 0)
if (opt->mid_occ <= 0) {
opt->mid_occ = mm_idx_cal_max_occ(mi, opt->mid_occ_frac);
if (opt->mid_occ < opt->min_mid_occ)
opt->mid_occ = opt->min_mid_occ;
if (opt->max_mid_occ > opt->min_mid_occ && opt->mid_occ > opt->max_mid_occ)
opt->mid_occ = opt->max_mid_occ;
if (opt->mid_occ < opt->min_mid_occ)
opt->mid_occ = opt->min_mid_occ;
if (opt->max_mid_occ > opt->min_mid_occ && opt->mid_occ > opt->max_mid_occ)
opt->mid_occ = opt->max_mid_occ;
}
if (mm_verbose >= 3)
fprintf(stderr, "[M::%s::%.3f*%.2f] mid_occ = %d\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), opt->mid_occ);
}
@ -88,26 +90,27 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
} else if (strcmp(preset, "ava-ont") == 0) {
io->flag = 0, io->k = 15, io->w = 5;
mo->flag |= MM_F_ALL_CHAINS | MM_F_NO_DIAG | MM_F_NO_DUAL | MM_F_NO_LJOIN;
mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_gap = 10000, mo->max_chain_skip = 25;
mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_chain_skip = 25;
mo->bw = 2000;
mo->occ_dist = 0;
} else if (strcmp(preset, "map10k") == 0 || strcmp(preset, "map-pb") == 0) {
io->flag |= MM_I_HPC, io->k = 19;
} else if (strcmp(preset, "ava-pb") == 0) {
io->flag |= MM_I_HPC, io->k = 19, io->w = 5;
mo->flag |= MM_F_ALL_CHAINS | MM_F_NO_DIAG | MM_F_NO_DUAL | MM_F_NO_LJOIN;
mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_gap = 10000, mo->max_chain_skip = 25;
mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_chain_skip = 25;
mo->occ_dist = 0;
} else if (strcmp(preset, "map-hifi") == 0 || strcmp(preset, "map-ccs") == 0) {
io->flag = 0, io->k = 19, io->w = 19;
mo->a = 1, mo->b = 4, mo->q = 6, mo->q2 = 26, mo->e = 2, mo->e2 = 1;
mo->max_gap = 10000;
mo->occ_dist = 500;
mo->min_mid_occ = 100, mo->max_mid_occ = 500;
mo->min_mid_occ = 50, mo->max_mid_occ = 500;
mo->min_dp_max = 200;
} else if (strncmp(preset, "asm", 3) == 0) {
io->flag = 0, io->k = 19, io->w = 19;
mo->bw = 100000;
mo->flag |= MM_F_RMQ | MM_F_NO_LJOIN;
mo->min_mid_occ = 100, mo->max_mid_occ = 500;
mo->min_mid_occ = 50, mo->max_mid_occ = 500;
mo->min_dp_max = 200;
mo->best_n = 50;
if (strcmp(preset, "asm5") == 0) {