r746: tuned heuristic for GRCh38

Reduced -c to 500 by default. As a compensation, we choose up to 1000 positions
if a seed has 500 or more occurrences. In addition, a read with big portion
from such seeds will have lower mapping quality.
This commit is contained in:
Heng Li 2014-05-02 16:06:27 -04:00
parent 8763e0ced7
commit 6db761e269
3 changed files with 28 additions and 9 deletions

View File

@ -59,7 +59,7 @@ mem_opt_t *mem_opt_init()
o->pen_clip5 = o->pen_clip3 = 5;
o->min_seed_len = 19;
o->split_width = 10;
o->max_occ = 10000;
o->max_occ = 500;
o->max_chain_gap = 10000;
o->max_ins = 10000;
o->mask_level = 0.50;
@ -118,7 +118,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co
for (i = 0; i < a->mem1.n; ++i) {
bwtintv_t *p = &a->mem1.a[i];
int slen = (uint32_t)p->info - (p->info>>32); // seed length
if (slen >= opt->min_seed_len && p->x[2] <= opt->max_occ)
if (slen >= opt->min_seed_len)
kv_push(bwtintv_t, a->mem, *p);
}
} else ++x;
@ -149,7 +149,8 @@ typedef struct {
typedef struct {
int n, m, first, rid;
int w, kept;
uint32_t w:30, kept:2;
float frac_rep;
int64_t pos;
mem_seed_t *seeds;
} mem_chain_t;
@ -202,7 +203,8 @@ int mem_chain_weight(const mem_chain_t *c)
else if (s->rbeg + s->len > end) w += s->rbeg + s->len - end;
end = end > s->qbeg + s->len? end : s->qbeg + s->len;
}
return w < tmp? w : tmp;
w = w < tmp? w : tmp;
return w < 1<<30? w : (1<<30)-1;
}
void mem_print_chain(const bntseq_t *bns, mem_chain_v *chn)
@ -224,7 +226,7 @@ void mem_print_chain(const bntseq_t *bns, mem_chain_v *chn)
mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, int len, const uint8_t *seq)
{
int i;
int i, b, e, l_rep;
int64_t l_pac = bns->l_pac;
mem_chain_v chain;
kbtree_t(chn) *tree;
@ -236,12 +238,21 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bn
aux = smem_aux_init();
mem_collect_intv(opt, bwt, len, seq, aux);
for (i = 0, b = e = l_rep = 0; i < aux->mem.n; ++i) { // compute frac_rep
bwtintv_t *p = &aux->mem.a[i];
int sb = (p->info>>32), se = (uint32_t)p->info;
if (p->x[2] <= opt->max_occ) continue;
if (sb > e) l_rep += e - b, b = sb, e = se;
else e = e > se? e : se;
}
l_rep += e - b;
for (i = 0; i < aux->mem.n; ++i) {
bwtintv_t *p = &aux->mem.a[i];
int slen = (uint32_t)p->info - (p->info>>32); // seed length
int step, slen = (uint32_t)p->info - (p->info>>32); // seed length
int64_t k;
if (slen < opt->min_seed_len || p->x[2] > opt->max_occ) continue; // ignore if too short or too repetitive
for (k = 0; k < p->x[2]; ++k) {
if (slen < opt->min_seed_len) continue; // ignore if too short or too repetitive
step = p->x[2] > opt->max_occ? p->x[2] / opt->max_occ : 1;
for (k = 0; k < p->x[2]; k += step) {
mem_chain_t tmp, *lower, *upper;
mem_seed_t s;
int rid, to_add = 0;
@ -271,6 +282,9 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bn
__kb_traverse(mem_chain_t, tree, traverse_func);
#undef traverse_func
for (i = 0; i < chain.n; ++i) chain.a[i].frac_rep = (float)l_rep / len;
if (bwa_verbose >= 4) printf("* fraction of repetitive seeds: %.3f\n", (float)l_rep / len);
kb_destroy(chn, tree);
return chain;
}
@ -592,6 +606,7 @@ int mem_chain2aln_short(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t
a.score = x.score;
a.csub = x.score2;
a.rid = c->rid;
a.frac_rep = c->frac_rep;
if (bwa_verbose >= 4) printf("** Attempted alignment via mem_chain2aln_short(): [%d,%d) <=> [%ld,%ld); score=%d; %d/%d\n", a.qb, a.qe, (long)a.rb, (long)a.re, x.score, a.qe-a.qb, qe-qb);
if (x.tb < MEM_SHORT_EXT>>1 || x.te > re - rb - (MEM_SHORT_EXT>>1)) return 1;
kv_push(mem_alnreg_t, *av, a);
@ -759,6 +774,8 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac
}
a->w = aw[0] > aw[1]? aw[0] : aw[1];
a->seedlen0 = s->len;
a->frac_rep = c->frac_rep;
}
free(srt); free(rseq);
}
@ -930,6 +947,7 @@ int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a)
if (a->sub_n > 0) mapq -= (int)(4.343 * log(a->sub_n+1) + .499);
if (mapq > 60) mapq = 60;
if (mapq < 0) mapq = 0;
mapq = (int)(mapq * (1. - a->frac_rep) + .499);
return mapq;
}

View File

@ -65,6 +65,7 @@ typedef struct {
int secondary; // index of the parent hit shadowing the current hit; <0 if primary
int seedlen0; // length of the starting seed
int n_comp; // number of sub-alignments chained together
int frac_rep;
uint64_t hash;
} mem_alnreg_t;

2
main.c
View File

@ -4,7 +4,7 @@
#include "utils.h"
#ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "0.7.8-r744-dirty"
#define PACKAGE_VERSION "0.7.8-r746-dirty"
#endif
int bwa_fa2pac(int argc, char *argv[]);