r126: filter by fraction of seed coverage

otherwise we may get too many poor overlap mappings.
This commit is contained in:
Heng Li 2017-06-30 22:15:45 -04:00
parent d73bb28097
commit 426c2975f6
6 changed files with 41 additions and 23 deletions

16
align.c
View File

@ -304,21 +304,9 @@ mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *m
++n_regs;
}
}
*n_regs_ = n_regs;
kfree(km, qseq0[0]); kfree(km, qseq0[1]);
kfree(km, ez.cigar);
// filter
for (r = i = 0; r < n_regs; ++r) {
mm_reg1_t *reg = &regs[r];
int flt = 0;
if (reg->cnt < opt->min_cnt) flt = 1;
else if (reg->p->blen - reg->p->n_ambi - reg->p->n_diff < opt->min_chain_score) flt = 1;
else if (reg->p->dp_max < opt->min_dp_max) flt = 1;
if (flt) free(reg->p);
else if (i < r) regs[i++] = regs[r]; // NB: this also move the regs[r].p pointer
else ++i;
}
*n_regs_ = i;
mm_sync_regs(km, i, regs);
mm_filter_regs(km, opt, n_regs_, regs);
return regs;
}

25
hit.c
View File

@ -147,6 +147,31 @@ void mm_select_sub(void *km, float mask_level, float pri_ratio, int *n_, mm_reg1
}
}
void mm_filter_regs(void *km, const mm_mapopt_t *opt, int *n_regs, mm_reg1_t *regs)
{
int i, k;
for (i = k = 0; i < *n_regs; ++i) {
mm_reg1_t *r = &regs[i];
int flt = 0;
if (r->cnt < opt->min_cnt) flt = 1;
else {
int blen = r->qe - r->qs < r->re - r->rs? r->qe - r->qs : r->re - r->rs;
if (r->score < blen * opt->min_seedcov_ratio) flt = 1;
}
if (r->p) {
if (r->p->blen - r->p->n_ambi - r->p->n_diff < opt->min_chain_score) flt = 1;
else if (r->p->dp_max < opt->min_dp_max) flt = 1;
if (flt) free(r->p);
}
if (!flt) {
if (k < i) regs[k++] = regs[i];
else ++k;
}
}
*n_regs = k;
mm_sync_regs(km, k, regs);
}
int mm_squeeze_a(void *km, int n_regs, mm_reg1_t *regs, mm128_t *a)
{ // squeeze out regions in a[] that are not referenced by regs[]
int i, as = 0;

16
main.c
View File

@ -10,7 +10,7 @@
#include "minimap.h"
#include "mmpriv.h"
#define MM_VERSION "2.0-r125-pre"
#define MM_VERSION "2.0-r126-pre"
void liftrlimit()
{
@ -68,7 +68,7 @@ int main(int argc, char *argv[])
mm_realtime0 = realtime();
mm_mapopt_init(&opt);
while ((c = getopt_long(argc, argv, "bw:k:t:r:f:Vv:g:I:d:ST:s:x:Hcp:M:n:z:A:B:O:E:m:", long_options, &long_idx)) >= 0) {
while ((c = getopt_long(argc, argv, "bw:k:t:r:f:Vv:g:I:d:ST:s:x:Hcp:M:n:z:A:B:O:E:m:D:", long_options, &long_idx)) >= 0) {
if (c == 'w') w = atoi(optarg);
else if (c == 'k') k = atoi(optarg);
else if (c == 'H') is_hpc = 1;
@ -79,6 +79,7 @@ int main(int argc, char *argv[])
else if (c == 'v') mm_verbose = atoi(optarg);
else if (c == 'g') opt.max_gap = atoi(optarg);
else if (c == 'p') opt.pri_ratio = atof(optarg);
else if (c == 'D') opt.min_seedcov_ratio = atof(optarg);
else if (c == 'M') opt.mask_level = atof(optarg);
else if (c == 'c') opt.flag |= MM_F_CIGAR;
else if (c == 'S') opt.flag |= MM_F_AVA | MM_F_NO_SELF;
@ -109,7 +110,7 @@ int main(int argc, char *argv[])
} else if (c == 'x') {
if (strcmp(optarg, "ava10k") == 0) {
opt.flag |= MM_F_AVA | MM_F_NO_SELF;
opt.min_chain_score = 100, opt.pri_ratio = 0.0f;
opt.min_chain_score = 100, opt.pri_ratio = 0.0f, opt.min_seedcov_ratio = 0.05f;
is_hpc = 1, k = 19, w = 5;
} else if (strcmp(optarg, "map10k") == 0) {
is_hpc = 1, k = 19;
@ -140,11 +141,12 @@ int main(int argc, char *argv[])
fprintf(stderr, " -m INT minimal chaining score (matching bases minus log gap penalty) [%d]\n", opt.min_chain_score);
// fprintf(stderr, " -T INT SDUST threshold; 0 to disable SDUST [%d]\n", opt.sdust_thres); // TODO: this option is never used; might be buggy
fprintf(stderr, " -S skip self and dual mappings (for the all-vs-all mode)\n");
fprintf(stderr, " -p FLOAT threshold to output a mapping [%g]\n", opt.pri_ratio);
fprintf(stderr, " -p FLOAT min secondary-to-primary score ratio [%g]\n", opt.pri_ratio);
fprintf(stderr, " -D FLOAT min fraction of seed matches [%g]\n", opt.min_seedcov_ratio);
fprintf(stderr, " -x STR preset (recommended to be applied before other options) []\n");
fprintf(stderr, " ava10k: -Hk19 -Sw5 -p0 -m100 (PacBio/ONT all-vs-all read mapping)\n");
fprintf(stderr, " map10k: -Hk19 (PacBio/ONT vs reference mapping)\n");
fprintf(stderr, " asm1m: -k19 -w19 (intra-species assembly to ref mapping)\n");
fprintf(stderr, " ava10k: -Hk19 -Sw5 -p0 -m100 -D.05 (PacBio/ONT all-vs-all read mapping)\n");
fprintf(stderr, " map10k: -Hk19 (PacBio/ONT vs reference mapping)\n");
fprintf(stderr, " asm1m: -k19 -w19 (intra-species assembly to ref mapping)\n");
fprintf(stderr, " Alignment:\n");
fprintf(stderr, " -A INT matching score [%d]\n", opt.a);
fprintf(stderr, " -B INT mismatch penalty [%d]\n", opt.b);

5
map.c
View File

@ -20,6 +20,7 @@ void mm_mapopt_init(mm_mapopt_t *opt)
opt->bw = 1000;
opt->max_gap = 10000;
opt->max_chain_skip = 15;
opt->min_seedcov_ratio = 0.0f;
opt->pri_ratio = 2.0f;
opt->mask_level = 0.5f;
@ -243,12 +244,12 @@ mm_reg1_t *mm_map_frag(const mm_mapopt_t *opt, const mm_idx_t *mi, mm_tbuf_t *b,
mm_join_long(b->km, opt, qlen, *n_regs, regs, a); // TODO: this can be applied to all-vs-all in principle
}
if (opt->flag & MM_F_CIGAR) {
regs = mm_align_skeleton(b->km, opt, mi, qlen, seq, n_regs, regs, a);
regs = mm_align_skeleton(b->km, opt, mi, qlen, seq, n_regs, regs, a); // this calls mm_filter_regs()
if (!(opt->flag & MM_F_AVA)) {
mm_update_parent(b->km, opt->mask_level, *n_regs, regs);
mm_select_sub(b->km, opt->mask_level, opt->pri_ratio, n_regs, regs);
}
}
} else mm_filter_regs(b->km, opt, n_regs, regs);
mm_set_mapq(*n_regs, regs);
// free

View File

@ -75,6 +75,7 @@ typedef struct {
int max_chain_skip;
int min_cnt;
int min_chain_score;
float min_seedcov_ratio;
float pri_ratio;
float mask_level;

View File

@ -42,6 +42,7 @@ void mm_sync_regs(void *km, int n_regs, mm_reg1_t *regs);
void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r);
void mm_update_parent(void *km, float mask_level, int n, mm_reg1_t *r);
void mm_select_sub(void *km, float mask_level, float pri_ratio, int *n_, mm_reg1_t *r);
void mm_filter_regs(void *km, const mm_mapopt_t *opt, int *n_regs, mm_reg1_t *regs);
void mm_join_long(void *km, const mm_mapopt_t *opt, int qlen, int n_regs, mm_reg1_t *regs, mm128_t *a);
void mm_set_mapq(int n_regs, mm_reg1_t *regs);