r746: tuned heuristic for GRCh38
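Reduced the default of -c (max_occ) from 10000 to 500. To compensate, a seed with more than 500 occurrences is no longer discarded; instead, its positions are sampled at a regular step so that up to 1000 of them are used. In addition, a read with a large fraction covered by such repetitive seeds is given a lower mapping quality.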
This commit is contained in:
parent
8763e0ced7
commit
6db761e269
34
bwamem.c
34
bwamem.c
|
|
@ -59,7 +59,7 @@ mem_opt_t *mem_opt_init()
|
||||||
o->pen_clip5 = o->pen_clip3 = 5;
|
o->pen_clip5 = o->pen_clip3 = 5;
|
||||||
o->min_seed_len = 19;
|
o->min_seed_len = 19;
|
||||||
o->split_width = 10;
|
o->split_width = 10;
|
||||||
o->max_occ = 10000;
|
o->max_occ = 500;
|
||||||
o->max_chain_gap = 10000;
|
o->max_chain_gap = 10000;
|
||||||
o->max_ins = 10000;
|
o->max_ins = 10000;
|
||||||
o->mask_level = 0.50;
|
o->mask_level = 0.50;
|
||||||
|
|
@ -118,7 +118,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co
|
||||||
for (i = 0; i < a->mem1.n; ++i) {
|
for (i = 0; i < a->mem1.n; ++i) {
|
||||||
bwtintv_t *p = &a->mem1.a[i];
|
bwtintv_t *p = &a->mem1.a[i];
|
||||||
int slen = (uint32_t)p->info - (p->info>>32); // seed length
|
int slen = (uint32_t)p->info - (p->info>>32); // seed length
|
||||||
if (slen >= opt->min_seed_len && p->x[2] <= opt->max_occ)
|
if (slen >= opt->min_seed_len)
|
||||||
kv_push(bwtintv_t, a->mem, *p);
|
kv_push(bwtintv_t, a->mem, *p);
|
||||||
}
|
}
|
||||||
} else ++x;
|
} else ++x;
|
||||||
|
|
@ -149,7 +149,8 @@ typedef struct {
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
int n, m, first, rid;
|
int n, m, first, rid;
|
||||||
int w, kept;
|
uint32_t w:30, kept:2;
|
||||||
|
float frac_rep;
|
||||||
int64_t pos;
|
int64_t pos;
|
||||||
mem_seed_t *seeds;
|
mem_seed_t *seeds;
|
||||||
} mem_chain_t;
|
} mem_chain_t;
|
||||||
|
|
@ -202,7 +203,8 @@ int mem_chain_weight(const mem_chain_t *c)
|
||||||
else if (s->rbeg + s->len > end) w += s->rbeg + s->len - end;
|
else if (s->rbeg + s->len > end) w += s->rbeg + s->len - end;
|
||||||
end = end > s->qbeg + s->len? end : s->qbeg + s->len;
|
end = end > s->qbeg + s->len? end : s->qbeg + s->len;
|
||||||
}
|
}
|
||||||
return w < tmp? w : tmp;
|
w = w < tmp? w : tmp;
|
||||||
|
return w < 1<<30? w : (1<<30)-1;
|
||||||
}
|
}
|
||||||
|
|
||||||
void mem_print_chain(const bntseq_t *bns, mem_chain_v *chn)
|
void mem_print_chain(const bntseq_t *bns, mem_chain_v *chn)
|
||||||
|
|
@ -224,7 +226,7 @@ void mem_print_chain(const bntseq_t *bns, mem_chain_v *chn)
|
||||||
|
|
||||||
mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, int len, const uint8_t *seq)
|
mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, int len, const uint8_t *seq)
|
||||||
{
|
{
|
||||||
int i;
|
int i, b, e, l_rep;
|
||||||
int64_t l_pac = bns->l_pac;
|
int64_t l_pac = bns->l_pac;
|
||||||
mem_chain_v chain;
|
mem_chain_v chain;
|
||||||
kbtree_t(chn) *tree;
|
kbtree_t(chn) *tree;
|
||||||
|
|
@ -236,12 +238,21 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bn
|
||||||
|
|
||||||
aux = smem_aux_init();
|
aux = smem_aux_init();
|
||||||
mem_collect_intv(opt, bwt, len, seq, aux);
|
mem_collect_intv(opt, bwt, len, seq, aux);
|
||||||
|
for (i = 0, b = e = l_rep = 0; i < aux->mem.n; ++i) { // compute frac_rep
|
||||||
|
bwtintv_t *p = &aux->mem.a[i];
|
||||||
|
int sb = (p->info>>32), se = (uint32_t)p->info;
|
||||||
|
if (p->x[2] <= opt->max_occ) continue;
|
||||||
|
if (sb > e) l_rep += e - b, b = sb, e = se;
|
||||||
|
else e = e > se? e : se;
|
||||||
|
}
|
||||||
|
l_rep += e - b;
|
||||||
for (i = 0; i < aux->mem.n; ++i) {
|
for (i = 0; i < aux->mem.n; ++i) {
|
||||||
bwtintv_t *p = &aux->mem.a[i];
|
bwtintv_t *p = &aux->mem.a[i];
|
||||||
int slen = (uint32_t)p->info - (p->info>>32); // seed length
|
int step, slen = (uint32_t)p->info - (p->info>>32); // seed length
|
||||||
int64_t k;
|
int64_t k;
|
||||||
if (slen < opt->min_seed_len || p->x[2] > opt->max_occ) continue; // ignore if too short or too repetitive
|
if (slen < opt->min_seed_len) continue; // ignore if too short or too repetitive
|
||||||
for (k = 0; k < p->x[2]; ++k) {
|
step = p->x[2] > opt->max_occ? p->x[2] / opt->max_occ : 1;
|
||||||
|
for (k = 0; k < p->x[2]; k += step) {
|
||||||
mem_chain_t tmp, *lower, *upper;
|
mem_chain_t tmp, *lower, *upper;
|
||||||
mem_seed_t s;
|
mem_seed_t s;
|
||||||
int rid, to_add = 0;
|
int rid, to_add = 0;
|
||||||
|
|
@ -271,6 +282,9 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bn
|
||||||
__kb_traverse(mem_chain_t, tree, traverse_func);
|
__kb_traverse(mem_chain_t, tree, traverse_func);
|
||||||
#undef traverse_func
|
#undef traverse_func
|
||||||
|
|
||||||
|
for (i = 0; i < chain.n; ++i) chain.a[i].frac_rep = (float)l_rep / len;
|
||||||
|
if (bwa_verbose >= 4) printf("* fraction of repetitive seeds: %.3f\n", (float)l_rep / len);
|
||||||
|
|
||||||
kb_destroy(chn, tree);
|
kb_destroy(chn, tree);
|
||||||
return chain;
|
return chain;
|
||||||
}
|
}
|
||||||
|
|
@ -592,6 +606,7 @@ int mem_chain2aln_short(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t
|
||||||
a.score = x.score;
|
a.score = x.score;
|
||||||
a.csub = x.score2;
|
a.csub = x.score2;
|
||||||
a.rid = c->rid;
|
a.rid = c->rid;
|
||||||
|
a.frac_rep = c->frac_rep;
|
||||||
if (bwa_verbose >= 4) printf("** Attempted alignment via mem_chain2aln_short(): [%d,%d) <=> [%ld,%ld); score=%d; %d/%d\n", a.qb, a.qe, (long)a.rb, (long)a.re, x.score, a.qe-a.qb, qe-qb);
|
if (bwa_verbose >= 4) printf("** Attempted alignment via mem_chain2aln_short(): [%d,%d) <=> [%ld,%ld); score=%d; %d/%d\n", a.qb, a.qe, (long)a.rb, (long)a.re, x.score, a.qe-a.qb, qe-qb);
|
||||||
if (x.tb < MEM_SHORT_EXT>>1 || x.te > re - rb - (MEM_SHORT_EXT>>1)) return 1;
|
if (x.tb < MEM_SHORT_EXT>>1 || x.te > re - rb - (MEM_SHORT_EXT>>1)) return 1;
|
||||||
kv_push(mem_alnreg_t, *av, a);
|
kv_push(mem_alnreg_t, *av, a);
|
||||||
|
|
@ -759,6 +774,8 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac
|
||||||
}
|
}
|
||||||
a->w = aw[0] > aw[1]? aw[0] : aw[1];
|
a->w = aw[0] > aw[1]? aw[0] : aw[1];
|
||||||
a->seedlen0 = s->len;
|
a->seedlen0 = s->len;
|
||||||
|
|
||||||
|
a->frac_rep = c->frac_rep;
|
||||||
}
|
}
|
||||||
free(srt); free(rseq);
|
free(srt); free(rseq);
|
||||||
}
|
}
|
||||||
|
|
@ -930,6 +947,7 @@ int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a)
|
||||||
if (a->sub_n > 0) mapq -= (int)(4.343 * log(a->sub_n+1) + .499);
|
if (a->sub_n > 0) mapq -= (int)(4.343 * log(a->sub_n+1) + .499);
|
||||||
if (mapq > 60) mapq = 60;
|
if (mapq > 60) mapq = 60;
|
||||||
if (mapq < 0) mapq = 0;
|
if (mapq < 0) mapq = 0;
|
||||||
|
mapq = (int)(mapq * (1. - a->frac_rep) + .499);
|
||||||
return mapq;
|
return mapq;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
1
bwamem.h
1
bwamem.h
|
|
@ -65,6 +65,7 @@ typedef struct {
|
||||||
int secondary; // index of the parent hit shadowing the current hit; <0 if primary
|
int secondary; // index of the parent hit shadowing the current hit; <0 if primary
|
||||||
int seedlen0; // length of the starting seed
|
int seedlen0; // length of the starting seed
|
||||||
int n_comp; // number of sub-alignments chained together
|
int n_comp; // number of sub-alignments chained together
|
||||||
|
int frac_rep;
|
||||||
uint64_t hash;
|
uint64_t hash;
|
||||||
} mem_alnreg_t;
|
} mem_alnreg_t;
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue