r741: added --min-occ-floor to improve #107

This commit is contained in:
Heng Li 2018-03-12 14:32:27 -04:00
parent ad1beaf255
commit bdc615c1d4
5 changed files with 34 additions and 5 deletions

4
main.c
View File

@ -6,7 +6,7 @@
#include "mmpriv.h" #include "mmpriv.h"
#include "getopt.h" #include "getopt.h"
#define MM_VERSION "2.9-r720" #define MM_VERSION "2.9-r741-dirty"
#ifdef __linux__ #ifdef __linux__
#include <sys/resource.h> #include <sys/resource.h>
@ -51,6 +51,7 @@ static struct option long_options[] = {
{ "all-chain", no_argument, 0, 'P' }, { "all-chain", no_argument, 0, 'P' },
{ "dual", required_argument, 0, 0 }, // 26 { "dual", required_argument, 0, 0 }, // 26
{ "max-clip-ratio", required_argument, 0, 0 }, // 27 { "max-clip-ratio", required_argument, 0, 0 }, // 27
{ "min-occ-floor", required_argument, 0, 0 }, // 28
{ "help", no_argument, 0, 'h' }, { "help", no_argument, 0, 'h' },
{ "max-intron-len", required_argument, 0, 'G' }, { "max-intron-len", required_argument, 0, 'G' },
{ "version", no_argument, 0, 'V' }, { "version", no_argument, 0, 'V' },
@ -164,6 +165,7 @@ int main(int argc, char *argv[])
else if (c == 0 && long_idx ==22) opt.flag |= MM_F_FOR_ONLY; // --for-only else if (c == 0 && long_idx ==22) opt.flag |= MM_F_FOR_ONLY; // --for-only
else if (c == 0 && long_idx ==23) opt.flag |= MM_F_REV_ONLY; // --rev-only else if (c == 0 && long_idx ==23) opt.flag |= MM_F_REV_ONLY; // --rev-only
else if (c == 0 && long_idx ==27) opt.max_clip_ratio = atof(optarg); // --max-clip-ratio else if (c == 0 && long_idx ==27) opt.max_clip_ratio = atof(optarg); // --max-clip-ratio
else if (c == 0 && long_idx ==28) opt.min_mid_occ = atoi(optarg); // --min-occ-floor
else if (c == 0 && long_idx == 14) { // --frag else if (c == 0 && long_idx == 14) { // --frag
yes_or_no(&opt, MM_F_FRAG_MODE, long_idx, optarg, 1); yes_or_no(&opt, MM_F_FRAG_MODE, long_idx, optarg, 1);
} else if (c == 0 && long_idx == 15) { // --secondary } else if (c == 0 && long_idx == 15) { // --secondary

View File

@ -126,6 +126,7 @@ typedef struct {
int pe_ori, pe_bonus; int pe_ori, pe_bonus;
float mid_occ_frac; // only used by mm_mapopt_update(); see below float mid_occ_frac; // only used by mm_mapopt_update(); see below
int32_t min_mid_occ;
int32_t mid_occ; // ignore seeds with occurrences above this threshold int32_t mid_occ; // ignore seeds with occurrences above this threshold
int32_t max_occ; int32_t max_occ;
int mini_batch_size; // size of a batch of query bases to process in parallel int mini_batch_size; // size of a batch of query bases to process in parallel

View File

@ -217,6 +217,14 @@ on chains. Set
.I INT .I INT
to a large number to switch off this heuristic. to a large number to switch off this heuristic.
.TP .TP
.BI --min-occ-floor \ INT
Force minimap2 to always use k-mers occurring
.I INT
times or less [0]. In effect, the max occurrence threshold is set to
the
.RI max{ INT ,
.BR -f }.
.TP
.B --no-long-join .B --no-long-join
Disable the long gap patching heuristic. When this option is applied, the Disable the long gap patching heuristic. When this option is applied, the
maximum alignment gap is mostly controlled by maximum alignment gap is mostly controlled by
@ -433,18 +441,25 @@ is determined by the sequencing error mode.
.B asm5 .B asm5
Long assembly to reference mapping Long assembly to reference mapping
.RB ( -k19 .RB ( -k19
.B -w19 -A1 -B19 -O39,81 -E3,1 -s200 .B -w19 -A1 -B19 -O39,81 -E3,1 -s200 -z200
.BR -z200 ). .BR --min-occ-floor=100 ).
Typically, the alignment will not extend to regions with 5% or higher sequence Typically, the alignment will not extend to regions with 5% or higher sequence
divergence. Only use this preset if the average divergence is far below 5%. divergence. Only use this preset if the average divergence is far below 5%.
.TP .TP
.B asm10 .B asm10
Long assembly to reference mapping Long assembly to reference mapping
.RB ( -k19 .RB ( -k19
.B -w19 -A1 -B9 -O16,41 -E2,1 -s200 .B -w19 -A1 -B9 -O16,41 -E2,1 -s200 -z200
.BR -z200 ). .BR --min-occ-floor=100 ).
Up to 10% sequence divergence. Up to 10% sequence divergence.
.TP .TP
.B asm20
Long assembly to reference mapping
.RB ( -k19
.B -w10 -A1 -B4 -O6,26 -E2,1 -s200 -z200
.BR --min-occ-floor=100 ).
Up to 20% sequence divergence.
.TP
.B ava-pb .B ava-pb
PacBio all-vs-all overlap mapping PacBio all-vs-all overlap mapping
.RB ( -Hk19 .RB ( -Hk19

View File

@ -51,6 +51,8 @@ void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi)
opt->flag |= MM_F_SPLICE; opt->flag |= MM_F_SPLICE;
if (opt->mid_occ <= 0) if (opt->mid_occ <= 0)
opt->mid_occ = mm_idx_cal_max_occ(mi, opt->mid_occ_frac); opt->mid_occ = mm_idx_cal_max_occ(mi, opt->mid_occ_frac);
if (opt->mid_occ < opt->min_mid_occ)
opt->mid_occ = opt->min_mid_occ;
if (mm_verbose >= 3) if (mm_verbose >= 3)
fprintf(stderr, "[M::%s::%.3f*%.2f] mid_occ = %d\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), opt->mid_occ); fprintf(stderr, "[M::%s::%.3f*%.2f] mid_occ = %d\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), opt->mid_occ);
} }
@ -81,11 +83,19 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
} else if (strcmp(preset, "asm5") == 0) { } else if (strcmp(preset, "asm5") == 0) {
io->flag = 0, io->k = 19, io->w = 19; io->flag = 0, io->k = 19, io->w = 19;
mo->a = 1, mo->b = 19, mo->q = 39, mo->q2 = 81, mo->e = 3, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200; mo->a = 1, mo->b = 19, mo->q = 39, mo->q2 = 81, mo->e = 3, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200;
mo->min_mid_occ = 100;
mo->min_dp_max = 200; mo->min_dp_max = 200;
mo->best_n = 50; mo->best_n = 50;
} else if (strcmp(preset, "asm10") == 0) { } else if (strcmp(preset, "asm10") == 0) {
io->flag = 0, io->k = 19, io->w = 19; io->flag = 0, io->k = 19, io->w = 19;
mo->a = 1, mo->b = 9, mo->q = 16, mo->q2 = 41, mo->e = 2, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200; mo->a = 1, mo->b = 9, mo->q = 16, mo->q2 = 41, mo->e = 2, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200;
mo->min_mid_occ = 100;
mo->min_dp_max = 200;
mo->best_n = 50;
} else if (strcmp(preset, "asm20") == 0) {
io->flag = 0, io->k = 19, io->w = 10;
mo->a = 1, mo->b = 4, mo->q = 6, mo->q2 = 26, mo->e = 2, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200;
mo->min_mid_occ = 100;
mo->min_dp_max = 200; mo->min_dp_max = 200;
mo->best_n = 50; mo->best_n = 50;
} else if (strcmp(preset, "short") == 0 || strcmp(preset, "sr") == 0) { } else if (strcmp(preset, "short") == 0 || strcmp(preset, "sr") == 0) {

View File

@ -34,6 +34,7 @@ cdef extern from "minimap.h":
float max_clip_ratio float max_clip_ratio
int pe_ori, pe_bonus int pe_ori, pe_bonus
float mid_occ_frac float mid_occ_frac
int32_t min_mid_occ
int32_t mid_occ int32_t mid_occ
int32_t max_occ int32_t max_occ
int mini_batch_size int mini_batch_size