diff --git a/main.c b/main.c index 246e0f6..57e4173 100644 --- a/main.c +++ b/main.c @@ -7,7 +7,7 @@ #include "mmpriv.h" #include "ketopt.h" -#define MM_VERSION "2.18-r1028-dirty" +#define MM_VERSION "2.18-r1034-dirty" #ifdef __linux__ #include <sys/resource.h> @@ -339,7 +339,8 @@ int main(int argc, char *argv[]) fprintf(fp_help, " --version show version number\n"); fprintf(fp_help, " Preset:\n"); fprintf(fp_help, " -x STR preset (always applied before other options; see minimap2.1 for details) []\n"); - fprintf(fp_help, " - map-pb/map-ont - PacBio/Nanopore vs reference mapping\n"); + fprintf(fp_help, " - map-pb/map-ont - PacBio CLR/Nanopore vs reference mapping\n"); + fprintf(fp_help, " - map-hifi - PacBio HiFi reads vs reference mapping\n"); fprintf(fp_help, " - ava-pb/ava-ont - PacBio/Nanopore read overlap\n"); fprintf(fp_help, " - asm5/asm10/asm20 - asm-to-ref mapping, for ~0.1/1/5%% sequence divergence\n"); fprintf(fp_help, " - splice/splice:hq - long-read/Pacbio-CCS spliced alignment\n"); diff --git a/minimap2.1 b/minimap2.1 index 66ea41a..c7f5a07 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -145,18 +145,25 @@ or .B -xsr mode, which sets the threshold for a second round of seeding. .TP -.BI --min-occ-floor \ INT -Force minimap2 to always use k-mers occurring -.I INT -times or less [0]. In effect, the max occurrence threshold is set to -the -.RI max{ INT , -.BR -f }. +.BI -U \ INT1 [, INT2 ] +Lower and upper bounds of k-mer occurrences [10,1000000]. The final k-mer occurrence threshold is +.RI max{ INT1 ,\ min{ INT2 , +.BR -f }}. +This option prevents excessively small or large +.B -f +estimated from the input reference. It deprecates +.B --min-occ-floor +in earlier versions of minimap2. .TP -.BI -g \ INT +.BI -e \ INT +Sample a high-frequency minimizer every +.I INT +basepairs [500]. +.TP +.BI -g \ NUM Stop chain enlongation if there are no minimizers within -.IR INT -bp -[10000]. +.IR NUM -bp +[10k]. .TP .BI -r \ INT Bandwidth used in chaining and DP-based alignment [500]. 
This option @@ -234,6 +241,10 @@ Mark as secondary a chain that overlaps with a better chain by .I FLOAT or more of the shorter chain [0.5] .TP +.BR --rmq = no | yes +Use the minigraph chaining algorithm [no]. The minigraph algorithm is better +for aligning contigs through long INDELs. +.TP .B --hard-mask-level Honor option .B -M @@ -412,7 +423,7 @@ alignment. .BI --cap-sw-mem \ NUM Skip alignment if the DP matrix size is above .IR NUM . -Set 0 to disable [0]. +Set 0 to disable [100m]. .SS Input/output options .TP 10 .B -a @@ -523,66 +534,47 @@ Available .I STR are: .RS -.TP 9 -.B map-pb -PacBio/Oxford Nanopore read to reference mapping -.RB ( -Hk19 ) -.TP +.TP 10 .B map-ont -Slightly more sensitive for Oxford Nanopore to reference mapping -.RB ( -k15 ). -For PacBio reads, HPC minimizers consistently leads to faster performance and -more sensitive results in comparison to normal minimizers. For Oxford Nanopore -data, normal minimizers are better, though not much. The effectiveness of HPC -is determined by the sequencing error mode. +Align noisy long reads of ~10% error rate to a reference genome. This is the +default mode. .TP .B map-hifi -PacBio HiFi reads to reference mapping +Align PacBio high-fidelity (HiFi) reads to a reference genome .RB ( -k19 -.B -w10 -A1 -B4 -O6,26 -E2,1 -s200 -e100 -g10k -.BR -U100,500 ). +.B -w19 -U50,500 -A1 -B4 -O6,26 -E2,1 +.BR -s200 ). +.TP +.B map-pb +Align older PacBio continuous long (CLR) reads to a reference genome +.RB ( -Hk19 ). .TP .B asm5 Long assembly to reference mapping .RB ( -k19 -.B -w19 -A1 -B19 -O39,81 -E3,1 -s200 -z200 -N50 -.BR --min-occ-floor=100 ). +.B -w19 -U50,500 --rmq -r100k --no-long-join -A1 -B19 -O39,81 -E3,1 -s200 -z200 +.BR -N50 ). Typically, the alignment will not extend to regions with 5% or higher sequence divergence. Only use this preset if the average divergence is far below 5%. 
.TP .B asm10 Long assembly to reference mapping .RB ( -k19 -.B -w19 -A1 -B9 -O16,41 -E2,1 -s200 -z200 -N50 -.BR --min-occ-floor=100 ). +.B -w19 -U50,500 --rmq -r100k --no-long-join -A1 -B9 -O16,41 -E2,1 -s200 -z200 +.BR -N50 ). Up to 10% sequence divergence. .TP .B asm20 Long assembly to reference mapping .RB ( -k19 -.B -w10 -A1 -B4 -O6,26 -E2,1 -s200 -z200 -N50 -.BR --min-occ-floor=100 ). +.B -w10 -U50,500 --rmq -r100k --no-long-join -A1 -B4 -O6,26 -E2,1 -s200 -z200 +.BR -N50 ). Up to 20% sequence divergence. .TP -.B ava-pb -PacBio all-vs-all overlap mapping -.RB ( -Hk19 -.B -Xw5 -m100 -g10000 --max-chain-skip -.BR 25 ). -.TP -.B ava-ont -Oxford Nanopore all-vs-all overlap mapping -.RB ( -k15 -.B -Xw5 -m100 -g10000 -r2000 --max-chain-skip -.BR 25 ). -Similarly, the major difference from -.B ava-pb -is that this preset is not using HPC minimizers. -.TP .B splice Long-read spliced alignment .RB ( -k15 -.B -w5 --splice -g2000 -G200k -A1 -B2 -O2,32 -E1,0 -C9 -z200 -ub --junc-bonus=9 +.B -w5 --splice -g2000 -G200k -A1 -B2 -O2,32 -E1,0 -C9 -z200 -ub --junc-bonus=9 --cap-sw-mem=0 .BR --splice-flank=yes ). In the splice mode, 1) long deletions are taken as introns and represented as the @@ -604,6 +596,18 @@ Short single-end reads without splicing .B -w11 --sr --frag=yes -A2 -B8 -O12,32 -E2,1 -r50 -p.5 -N20 -f1000,5000 -n2 -m20 .B -s40 -g200 -2K50m --heap-sort=yes .BR --secondary=no ). +.TP +.B ava-pb +PacBio CLR all-vs-all overlap mapping +.RB ( -Hk19 +.B -Xw5 -e0 +.BR -m100 ). +.TP +.B ava-ont +Oxford Nanopore all-vs-all overlap mapping +.RB ( -k15 +.B -Xw5 -e0 -m100 +.BR -r2k ). 
.RE .SS Miscellaneous options .TP 10 diff --git a/options.c b/options.c index 4bba3ec..a64a071 100644 --- a/options.c +++ b/options.c @@ -16,7 +16,8 @@ void mm_mapopt_init(mm_mapopt_t *opt) memset(opt, 0, sizeof(mm_mapopt_t)); opt->seed = 11; opt->mid_occ_frac = 2e-4f; - opt->max_mid_occ = 1000000000; + opt->min_mid_occ = 10; + opt->max_mid_occ = 1000000; opt->sdust_thres = 0; // no SDUST masking opt->min_cnt = 3; @@ -63,12 +64,13 @@ void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi) { if ((opt->flag & MM_F_SPLICE_FOR) || (opt->flag & MM_F_SPLICE_REV)) opt->flag |= MM_F_SPLICE; - if (opt->mid_occ <= 0) + if (opt->mid_occ <= 0) { opt->mid_occ = mm_idx_cal_max_occ(mi, opt->mid_occ_frac); - if (opt->mid_occ < opt->min_mid_occ) - opt->mid_occ = opt->min_mid_occ; - if (opt->max_mid_occ > opt->min_mid_occ && opt->mid_occ > opt->max_mid_occ) - opt->mid_occ = opt->max_mid_occ; + if (opt->mid_occ < opt->min_mid_occ) + opt->mid_occ = opt->min_mid_occ; + if (opt->max_mid_occ > opt->min_mid_occ && opt->mid_occ > opt->max_mid_occ) + opt->mid_occ = opt->max_mid_occ; + } if (mm_verbose >= 3) fprintf(stderr, "[M::%s::%.3f*%.2f] mid_occ = %d\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), opt->mid_occ); } @@ -88,26 +90,27 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) } else if (strcmp(preset, "ava-ont") == 0) { io->flag = 0, io->k = 15, io->w = 5; mo->flag |= MM_F_ALL_CHAINS | MM_F_NO_DIAG | MM_F_NO_DUAL | MM_F_NO_LJOIN; - mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_gap = 10000, mo->max_chain_skip = 25; + mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_chain_skip = 25; mo->bw = 2000; + mo->occ_dist = 0; } else if (strcmp(preset, "map10k") == 0 || strcmp(preset, "map-pb") == 0) { io->flag |= MM_I_HPC, io->k = 19; } else if (strcmp(preset, "ava-pb") == 0) { io->flag |= MM_I_HPC, io->k = 19, io->w = 5; mo->flag |= MM_F_ALL_CHAINS | MM_F_NO_DIAG | MM_F_NO_DUAL | MM_F_NO_LJOIN; - 
mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_gap = 10000, mo->max_chain_skip = 25; + mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_chain_skip = 25; + mo->occ_dist = 0; } else if (strcmp(preset, "map-hifi") == 0 || strcmp(preset, "map-ccs") == 0) { io->flag = 0, io->k = 19, io->w = 19; mo->a = 1, mo->b = 4, mo->q = 6, mo->q2 = 26, mo->e = 2, mo->e2 = 1; - mo->max_gap = 10000; mo->occ_dist = 500; - mo->min_mid_occ = 100, mo->max_mid_occ = 500; + mo->min_mid_occ = 50, mo->max_mid_occ = 500; mo->min_dp_max = 200; } else if (strncmp(preset, "asm", 3) == 0) { io->flag = 0, io->k = 19, io->w = 19; mo->bw = 100000; mo->flag |= MM_F_RMQ | MM_F_NO_LJOIN; - mo->min_mid_occ = 100, mo->max_mid_occ = 500; + mo->min_mid_occ = 50, mo->max_mid_occ = 500; mo->min_dp_max = 200; mo->best_n = 50; if (strcmp(preset, "asm5") == 0) {