From c0a308a8b6e28866926290d9339b65a5d750280d Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 8 Apr 2014 17:33:07 -0400 Subject: [PATCH] dev-466: simplified chain filtering --- bwamem.c | 119 +++++++++++++++++++------------------------------------ main.c | 2 +- 2 files changed, 42 insertions(+), 79 deletions(-) diff --git a/bwamem.c b/bwamem.c index 2e5104a..602f666 100644 --- a/bwamem.c +++ b/bwamem.c @@ -145,7 +145,8 @@ typedef struct { } mem_seed_t; typedef struct { - int n, m; + int n, m, first; + uint32_t w:30, kept:2; int64_t pos; mem_seed_t *seeds; } mem_chain_t; @@ -164,10 +165,8 @@ static int test_and_merge(const mem_opt_t *opt, int64_t l_pac, mem_chain_t *c, c const mem_seed_t *last = &c->seeds[c->n-1]; qend = last->qbeg + last->len; rend = last->rbeg + last->len; - if (p->qbeg >= c->seeds[0].qbeg && p->qbeg + p->len <= qend && p->rbeg >= c->seeds[0].rbeg && p->rbeg + p->len <= rend) { - if (bwa_verbose >= 5) printf("** contained\n"); + if (p->qbeg >= c->seeds[0].qbeg && p->qbeg + p->len <= qend && p->rbeg >= c->seeds[0].rbeg && p->rbeg + p->len <= rend) return 1; // contained seed; do nothing - } if ((last->rbeg < l_pac || c->seeds[0].rbeg < l_pac) && p->rbeg >= l_pac) return 0; // don't chain if on different strand x = p->qbeg - last->qbeg; // always non-negtive y = p->rbeg - last->rbeg; @@ -177,10 +176,8 @@ static int test_and_merge(const mem_opt_t *opt, int64_t l_pac, mem_chain_t *c, c c->seeds = realloc(c->seeds, c->m * sizeof(mem_seed_t)); } c->seeds[c->n++] = *p; - if (bwa_verbose >= 5) printf("** appended\n"); return 1; - } else if (bwa_verbose >= 5) - printf("** new chain: %ld, %ld, %ld, %ld\n", (long)y, (long)abs(x-y), (long)(x - last->len), (long)(y - last->len)); + } return 0; // request to add a new chain } @@ -247,15 +244,6 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int64_t l_pac, int s.rbeg = tmp.pos = bwt_sa(bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference s.qbeg = p->info>>32; s.len = slen; - if (bwa_verbose >= 5) { - bwtint_t pos; - int is_rev, ref_id; - pos = bns_depos(global_bns, s.rbeg, &is_rev); - if (is_rev) pos -= s.len - 1; - bns_cnt_ambi(global_bns, pos, s.len, &ref_id); - printf("* Found SEED: length=%d,query_beg=%d,ref_beg=%ld; %s:%c%ld\n", s.len, s.qbeg, (long)s.rbeg, \ - global_bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - global_bns->anns[ref_id].offset) + 1); - } if (s.rbeg < l_pac && l_pac < s.rbeg + s.len) continue; // bridging forward-reverse boundary; skip if (kb_size(tree)) { kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain @@ -285,84 +273,59 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int64_t l_pac, int * Filtering chains * ********************/ -typedef struct { - int beg, end, w; - void *p, *p2; -} flt_aux_t; +#define chn_beg(ch) ((ch).seeds->qbeg) +#define chn_end(ch) ((ch).seeds[(ch).n-1].qbeg + (ch).seeds[(ch).n-1].len) #define flt_lt(a, b) ((a).w > (b).w) -KSORT_INIT(mem_flt, flt_aux_t, flt_lt) +KSORT_INIT(mem_flt, mem_chain_t, flt_lt) -int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *chains) +int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *a) { - flt_aux_t *a; - int i, j, n; - if (n_chn <= 1) return n_chn; // no need to filter - for (i = j = 0; i < n_chn; ++i) { // filter out chains with small weight - mem_chain_t *c = &chains[i]; - int w; - w = mem_chain_weight(c); - if (w >= opt->min_chain_weight) - chains[j++] = *c; - } - n_chn = j; - if (n_chn == 0) return 0; - a = malloc(sizeof(flt_aux_t) * n_chn); - for (i = 0; i < n_chn; ++i) { - mem_chain_t *c = &chains[i]; - int w; - w = mem_chain_weight(c); - a[i].beg = c->seeds[0].qbeg; - a[i].end = c->seeds[c->n-1].qbeg + c->seeds[c->n-1].len; - a[i].w = w; a[i].p = c; a[i].p2 = 0; + int i, k; + kvec_t(int) chains = {0,0,0}; // this keeps int indices of the non-overlapping chains + if (n_chn == 0) return 0; // no need to filter + // compute the weight of each chain and drop chains with small weight + for (i = k = 0; i < n_chn; ++i) { + mem_chain_t *c = &a[i]; + c->first = -1; c->kept = 0; + c->w = mem_chain_weight(c); + if (c->w < opt->min_chain_weight) free(c->seeds); + else a[k++] = *c; } + n_chn = k; ks_introsort(mem_flt, n_chn, a); - { // reorder chains such that the best chain appears first - mem_chain_t *swap; - swap = malloc(sizeof(mem_chain_t) * n_chn); - for (i = 0; i < n_chn; ++i) { - swap[i] = *((mem_chain_t*)a[i].p); - a[i].p = &chains[i]; // as we will memcpy() below, a[i].p is changed - } - memcpy(chains, swap, sizeof(mem_chain_t) * n_chn); - free(swap); - } - for (i = 1, n = 1; i < n_chn; ++i) { - for (j = 0; j < n; ++j) { - int b_max = a[j].beg > a[i].beg? a[j].beg : a[i].beg; - int e_min = a[j].end < a[i].end? a[j].end : a[i].end; + // pairwise chain comparisons + kv_push(int, chains, 0); + for (i = 1; i < n_chn; ++i) { + for (k = 0; k < chains.n; ++k) { + int j = chains.a[k]; + int b_max = chn_beg(a[j]) > chn_beg(a[i])? chn_beg(a[j]) : chn_beg(a[i]); + int e_min = chn_end(a[j]) < chn_end(a[i])? chn_end(a[j]) : chn_end(a[i]); if (e_min > b_max) { // have overlap - int min_l = a[i].end - a[i].beg < a[j].end - a[j].beg? a[i].end - a[i].beg : a[j].end - a[j].beg; + int li = chn_end(a[i]) - chn_beg(a[i]); + int lj = chn_end(a[j]) - chn_beg(a[j]); + int min_l = li < lj? li : lj; if (e_min - b_max >= min_l * opt->mask_level && min_l < opt->max_chain_gap) { // significant overlap - if (a[j].p2 == 0) a[j].p2 = a[i].p; + if (a[j].first < 0) a[j].first = i; // keep the first shadowed hit s.t. mapq can be more accurate if (a[i].w < a[j].w * opt->chain_drop_ratio && a[j].w - a[i].w >= opt->min_seed_len<<1) break; } } } - if (j == n) a[n++] = a[i]; // if have no significant overlap with better chains, keep it. + if (k == chains.n) kv_push(int, chains, i); } - for (i = 0; i < n; ++i) { // mark chains to be kept - mem_chain_t *c = (mem_chain_t*)a[i].p; - if (c->n > 0) c->n = -c->n; - c = (mem_chain_t*)a[i].p2; - if (c && c->n > 0) c->n = -c->n; + for (i = 0; i < chains.n; ++i) { + mem_chain_t *c = &a[chains.a[i]]; + c->kept = 2; + if (c->first >= 0) a[c->first].kept = 1; } - free(a); - for (i = 0; i < n_chn; ++i) { // free discarded chains - mem_chain_t *c = &chains[i]; - if (c->n >= 0) { - free(c->seeds); - c->n = c->m = 0; - } else c->n = -c->n; + free(chains.a); + for (i = k = 0; i < n_chn; ++i) { // free discarded chains + mem_chain_t *c = &a[i]; + if (c->kept == 0) free(c->seeds); + else a[k++] = a[i]; } - for (i = n = 0; i < n_chn; ++i) { // squeeze out discarded chains - if (chains[i].n > 0) { - if (n != i) chains[n++] = chains[i]; - else ++n; - } - } - return n; + return k; } /****************************** diff --git a/main.c b/main.c index 0ef8307..7a46a28 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.8+dev-r465" +#define PACKAGE_VERSION "0.7.8+dev-r466" #endif int bwa_fa2pac(int argc, char *argv[]);