From 6db761e2696c20f8f27653d1fccb74b10b203289 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 2 May 2014 16:06:27 -0400 Subject: [PATCH] r746: tuned heuristic for GRCh38 Reduced -c to 500 by default. As a compensation, we choose up to 1000 positions if a seed has 500 or more occurrences. In addition, a read with big portion from such seeds will have lower mapping quality. --- bwamem.c | 34 ++++++++++++++++++++++++++-------- bwamem.h | 1 + main.c | 2 +- 3 files changed, 28 insertions(+), 9 deletions(-) diff --git a/bwamem.c b/bwamem.c index 5709586..5fb41f8 100644 --- a/bwamem.c +++ b/bwamem.c @@ -59,7 +59,7 @@ mem_opt_t *mem_opt_init() o->pen_clip5 = o->pen_clip3 = 5; o->min_seed_len = 19; o->split_width = 10; - o->max_occ = 10000; + o->max_occ = 500; o->max_chain_gap = 10000; o->max_ins = 10000; o->mask_level = 0.50; @@ -118,7 +118,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co for (i = 0; i < a->mem1.n; ++i) { bwtintv_t *p = &a->mem1.a[i]; int slen = (uint32_t)p->info - (p->info>>32); // seed length - if (slen >= opt->min_seed_len && p->x[2] <= opt->max_occ) + if (slen >= opt->min_seed_len) kv_push(bwtintv_t, a->mem, *p); } } else ++x; @@ -149,7 +149,8 @@ typedef struct { typedef struct { int n, m, first, rid; - int w, kept; + uint32_t w:30, kept:2; + float frac_rep; int64_t pos; mem_seed_t *seeds; } mem_chain_t; @@ -202,7 +203,8 @@ int mem_chain_weight(const mem_chain_t *c) else if (s->rbeg + s->len > end) w += s->rbeg + s->len - end; end = end > s->qbeg + s->len? end : s->qbeg + s->len; } - return w < tmp? w : tmp; + w = w < tmp? w : tmp; + return w < 1<<30? w : (1<<30)-1; } void mem_print_chain(const bntseq_t *bns, mem_chain_v *chn) @@ -224,7 +226,7 @@ void mem_print_chain(const bntseq_t *bns, mem_chain_v *chn) mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, int len, const uint8_t *seq) { - int i; + int i, b, e, l_rep; int64_t l_pac = bns->l_pac; mem_chain_v chain; kbtree_t(chn) *tree; @@ -236,12 +238,21 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bn aux = smem_aux_init(); mem_collect_intv(opt, bwt, len, seq, aux); + for (i = 0, b = e = l_rep = 0; i < aux->mem.n; ++i) { // compute frac_rep + bwtintv_t *p = &aux->mem.a[i]; + int sb = (p->info>>32), se = (uint32_t)p->info; + if (p->x[2] <= opt->max_occ) continue; + if (sb > e) l_rep += e - b, b = sb, e = se; + else e = e > se? e : se; + } + l_rep += e - b; for (i = 0; i < aux->mem.n; ++i) { bwtintv_t *p = &aux->mem.a[i]; - int slen = (uint32_t)p->info - (p->info>>32); // seed length + int step, slen = (uint32_t)p->info - (p->info>>32); // seed length int64_t k; - if (slen < opt->min_seed_len || p->x[2] > opt->max_occ) continue; // ignore if too short or too repetitive - for (k = 0; k < p->x[2]; ++k) { + if (slen < opt->min_seed_len) continue; // ignore if too short or too repetitive + step = p->x[2] > opt->max_occ? p->x[2] / opt->max_occ : 1; + for (k = 0; k < p->x[2]; k += step) { mem_chain_t tmp, *lower, *upper; mem_seed_t s; int rid, to_add = 0; @@ -271,6 +282,9 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bn __kb_traverse(mem_chain_t, tree, traverse_func); #undef traverse_func + for (i = 0; i < chain.n; ++i) chain.a[i].frac_rep = (float)l_rep / len; + if (bwa_verbose >= 4) printf("* fraction of repetitive seeds: %.3f\n", (float)l_rep / len); + kb_destroy(chn, tree); return chain; } @@ -592,6 +606,7 @@ int mem_chain2aln_short(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t a.score = x.score; a.csub = x.score2; a.rid = c->rid; + a.frac_rep = c->frac_rep; if (bwa_verbose >= 4) printf("** Attempted alignment via mem_chain2aln_short(): [%d,%d) <=> [%ld,%ld); score=%d; %d/%d\n", a.qb, a.qe, (long)a.rb, (long)a.re, x.score, a.qe-a.qb, qe-qb); if (x.tb < MEM_SHORT_EXT>>1 || x.te > re - rb - (MEM_SHORT_EXT>>1)) return 1; kv_push(mem_alnreg_t, *av, a); @@ -759,6 +774,8 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac } a->w = aw[0] > aw[1]? aw[0] : aw[1]; a->seedlen0 = s->len; + + a->frac_rep = c->frac_rep; } free(srt); free(rseq); } @@ -930,6 +947,7 @@ int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) if (a->sub_n > 0) mapq -= (int)(4.343 * log(a->sub_n+1) + .499); if (mapq > 60) mapq = 60; if (mapq < 0) mapq = 0; + mapq = (int)(mapq * (1. - a->frac_rep) + .499); return mapq; } diff --git a/bwamem.h b/bwamem.h index a6a6c8d..d4e2d1d 100644 --- a/bwamem.h +++ b/bwamem.h @@ -65,6 +65,7 @@ typedef struct { int secondary; // index of the parent hit shadowing the current hit; <0 if primary int seedlen0; // length of the starting seed int n_comp; // number of sub-alignments chained together + int frac_rep; uint64_t hash; } mem_alnreg_t; diff --git a/main.c b/main.c index 9028a19..bbeda8e 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.8-r744-dirty" +#define PACKAGE_VERSION "0.7.8-r746-dirty" #endif int bwa_fa2pac(int argc, char *argv[]);