dev-466: simplified chain filtering
This commit is contained in:
parent
f12dfae772
commit
c0a308a8b6
119
bwamem.c
119
bwamem.c
|
|
@ -145,7 +145,8 @@ typedef struct {
|
||||||
} mem_seed_t;
|
} mem_seed_t;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
int n, m;
|
int n, m, first;
|
||||||
|
uint32_t w:30, kept:2;
|
||||||
int64_t pos;
|
int64_t pos;
|
||||||
mem_seed_t *seeds;
|
mem_seed_t *seeds;
|
||||||
} mem_chain_t;
|
} mem_chain_t;
|
||||||
|
|
@ -164,10 +165,8 @@ static int test_and_merge(const mem_opt_t *opt, int64_t l_pac, mem_chain_t *c, c
|
||||||
const mem_seed_t *last = &c->seeds[c->n-1];
|
const mem_seed_t *last = &c->seeds[c->n-1];
|
||||||
qend = last->qbeg + last->len;
|
qend = last->qbeg + last->len;
|
||||||
rend = last->rbeg + last->len;
|
rend = last->rbeg + last->len;
|
||||||
if (p->qbeg >= c->seeds[0].qbeg && p->qbeg + p->len <= qend && p->rbeg >= c->seeds[0].rbeg && p->rbeg + p->len <= rend) {
|
if (p->qbeg >= c->seeds[0].qbeg && p->qbeg + p->len <= qend && p->rbeg >= c->seeds[0].rbeg && p->rbeg + p->len <= rend)
|
||||||
if (bwa_verbose >= 5) printf("** contained\n");
|
|
||||||
return 1; // contained seed; do nothing
|
return 1; // contained seed; do nothing
|
||||||
}
|
|
||||||
if ((last->rbeg < l_pac || c->seeds[0].rbeg < l_pac) && p->rbeg >= l_pac) return 0; // don't chain if on different strand
|
if ((last->rbeg < l_pac || c->seeds[0].rbeg < l_pac) && p->rbeg >= l_pac) return 0; // don't chain if on different strand
|
||||||
x = p->qbeg - last->qbeg; // always non-negtive
|
x = p->qbeg - last->qbeg; // always non-negtive
|
||||||
y = p->rbeg - last->rbeg;
|
y = p->rbeg - last->rbeg;
|
||||||
|
|
@ -177,10 +176,8 @@ static int test_and_merge(const mem_opt_t *opt, int64_t l_pac, mem_chain_t *c, c
|
||||||
c->seeds = realloc(c->seeds, c->m * sizeof(mem_seed_t));
|
c->seeds = realloc(c->seeds, c->m * sizeof(mem_seed_t));
|
||||||
}
|
}
|
||||||
c->seeds[c->n++] = *p;
|
c->seeds[c->n++] = *p;
|
||||||
if (bwa_verbose >= 5) printf("** appended\n");
|
|
||||||
return 1;
|
return 1;
|
||||||
} else if (bwa_verbose >= 5)
|
}
|
||||||
printf("** new chain: %ld, %ld, %ld, %ld\n", (long)y, (long)abs(x-y), (long)(x - last->len), (long)(y - last->len));
|
|
||||||
return 0; // request to add a new chain
|
return 0; // request to add a new chain
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -247,15 +244,6 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int64_t l_pac, int
|
||||||
s.rbeg = tmp.pos = bwt_sa(bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference
|
s.rbeg = tmp.pos = bwt_sa(bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference
|
||||||
s.qbeg = p->info>>32;
|
s.qbeg = p->info>>32;
|
||||||
s.len = slen;
|
s.len = slen;
|
||||||
if (bwa_verbose >= 5) {
|
|
||||||
bwtint_t pos;
|
|
||||||
int is_rev, ref_id;
|
|
||||||
pos = bns_depos(global_bns, s.rbeg, &is_rev);
|
|
||||||
if (is_rev) pos -= s.len - 1;
|
|
||||||
bns_cnt_ambi(global_bns, pos, s.len, &ref_id);
|
|
||||||
printf("* Found SEED: length=%d,query_beg=%d,ref_beg=%ld; %s:%c%ld\n", s.len, s.qbeg, (long)s.rbeg, \
|
|
||||||
global_bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - global_bns->anns[ref_id].offset) + 1);
|
|
||||||
}
|
|
||||||
if (s.rbeg < l_pac && l_pac < s.rbeg + s.len) continue; // bridging forward-reverse boundary; skip
|
if (s.rbeg < l_pac && l_pac < s.rbeg + s.len) continue; // bridging forward-reverse boundary; skip
|
||||||
if (kb_size(tree)) {
|
if (kb_size(tree)) {
|
||||||
kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain
|
kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain
|
||||||
|
|
@ -285,84 +273,59 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int64_t l_pac, int
|
||||||
* Filtering chains *
|
* Filtering chains *
|
||||||
********************/
|
********************/
|
||||||
|
|
||||||
typedef struct {
|
#define chn_beg(ch) ((ch).seeds->qbeg)
|
||||||
int beg, end, w;
|
#define chn_end(ch) ((ch).seeds[(ch).n-1].qbeg + (ch).seeds[(ch).n-1].len)
|
||||||
void *p, *p2;
|
|
||||||
} flt_aux_t;
|
|
||||||
|
|
||||||
#define flt_lt(a, b) ((a).w > (b).w)
|
#define flt_lt(a, b) ((a).w > (b).w)
|
||||||
KSORT_INIT(mem_flt, flt_aux_t, flt_lt)
|
KSORT_INIT(mem_flt, mem_chain_t, flt_lt)
|
||||||
|
|
||||||
int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *chains)
|
int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *a)
|
||||||
{
|
{
|
||||||
flt_aux_t *a;
|
int i, k;
|
||||||
int i, j, n;
|
kvec_t(int) chains = {0,0,0}; // this keeps int indices of the non-overlapping chains
|
||||||
if (n_chn <= 1) return n_chn; // no need to filter
|
if (n_chn == 0) return 0; // no need to filter
|
||||||
for (i = j = 0; i < n_chn; ++i) { // filter out chains with small weight
|
// compute the weight of each chain and drop chains with small weight
|
||||||
mem_chain_t *c = &chains[i];
|
for (i = k = 0; i < n_chn; ++i) {
|
||||||
int w;
|
mem_chain_t *c = &a[i];
|
||||||
w = mem_chain_weight(c);
|
c->first = -1; c->kept = 0;
|
||||||
if (w >= opt->min_chain_weight)
|
c->w = mem_chain_weight(c);
|
||||||
chains[j++] = *c;
|
if (c->w < opt->min_chain_weight) free(c->seeds);
|
||||||
}
|
else a[k++] = *c;
|
||||||
n_chn = j;
|
|
||||||
if (n_chn == 0) return 0;
|
|
||||||
a = malloc(sizeof(flt_aux_t) * n_chn);
|
|
||||||
for (i = 0; i < n_chn; ++i) {
|
|
||||||
mem_chain_t *c = &chains[i];
|
|
||||||
int w;
|
|
||||||
w = mem_chain_weight(c);
|
|
||||||
a[i].beg = c->seeds[0].qbeg;
|
|
||||||
a[i].end = c->seeds[c->n-1].qbeg + c->seeds[c->n-1].len;
|
|
||||||
a[i].w = w; a[i].p = c; a[i].p2 = 0;
|
|
||||||
}
|
}
|
||||||
|
n_chn = k;
|
||||||
ks_introsort(mem_flt, n_chn, a);
|
ks_introsort(mem_flt, n_chn, a);
|
||||||
{ // reorder chains such that the best chain appears first
|
// pairwise chain comparisons
|
||||||
mem_chain_t *swap;
|
kv_push(int, chains, 0);
|
||||||
swap = malloc(sizeof(mem_chain_t) * n_chn);
|
for (i = 1; i < n_chn; ++i) {
|
||||||
for (i = 0; i < n_chn; ++i) {
|
for (k = 0; k < chains.n; ++k) {
|
||||||
swap[i] = *((mem_chain_t*)a[i].p);
|
int j = chains.a[k];
|
||||||
a[i].p = &chains[i]; // as we will memcpy() below, a[i].p is changed
|
int b_max = chn_beg(a[j]) > chn_beg(a[i])? chn_beg(a[j]) : chn_beg(a[i]);
|
||||||
}
|
int e_min = chn_end(a[j]) < chn_end(a[i])? chn_end(a[j]) : chn_end(a[i]);
|
||||||
memcpy(chains, swap, sizeof(mem_chain_t) * n_chn);
|
|
||||||
free(swap);
|
|
||||||
}
|
|
||||||
for (i = 1, n = 1; i < n_chn; ++i) {
|
|
||||||
for (j = 0; j < n; ++j) {
|
|
||||||
int b_max = a[j].beg > a[i].beg? a[j].beg : a[i].beg;
|
|
||||||
int e_min = a[j].end < a[i].end? a[j].end : a[i].end;
|
|
||||||
if (e_min > b_max) { // have overlap
|
if (e_min > b_max) { // have overlap
|
||||||
int min_l = a[i].end - a[i].beg < a[j].end - a[j].beg? a[i].end - a[i].beg : a[j].end - a[j].beg;
|
int li = chn_end(a[i]) - chn_beg(a[i]);
|
||||||
|
int lj = chn_end(a[j]) - chn_beg(a[j]);
|
||||||
|
int min_l = li < lj? li : lj;
|
||||||
if (e_min - b_max >= min_l * opt->mask_level && min_l < opt->max_chain_gap) { // significant overlap
|
if (e_min - b_max >= min_l * opt->mask_level && min_l < opt->max_chain_gap) { // significant overlap
|
||||||
if (a[j].p2 == 0) a[j].p2 = a[i].p;
|
if (a[j].first < 0) a[j].first = i; // keep the first shadowed hit s.t. mapq can be more accurate
|
||||||
if (a[i].w < a[j].w * opt->chain_drop_ratio && a[j].w - a[i].w >= opt->min_seed_len<<1)
|
if (a[i].w < a[j].w * opt->chain_drop_ratio && a[j].w - a[i].w >= opt->min_seed_len<<1)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (j == n) a[n++] = a[i]; // if have no significant overlap with better chains, keep it.
|
if (k == chains.n) kv_push(int, chains, i);
|
||||||
}
|
}
|
||||||
for (i = 0; i < n; ++i) { // mark chains to be kept
|
for (i = 0; i < chains.n; ++i) {
|
||||||
mem_chain_t *c = (mem_chain_t*)a[i].p;
|
mem_chain_t *c = &a[chains.a[i]];
|
||||||
if (c->n > 0) c->n = -c->n;
|
c->kept = 2;
|
||||||
c = (mem_chain_t*)a[i].p2;
|
if (c->first >= 0) a[c->first].kept = 1;
|
||||||
if (c && c->n > 0) c->n = -c->n;
|
|
||||||
}
|
}
|
||||||
free(a);
|
free(chains.a);
|
||||||
for (i = 0; i < n_chn; ++i) { // free discarded chains
|
for (i = k = 0; i < n_chn; ++i) { // free discarded chains
|
||||||
mem_chain_t *c = &chains[i];
|
mem_chain_t *c = &a[i];
|
||||||
if (c->n >= 0) {
|
if (c->kept == 0) free(c->seeds);
|
||||||
free(c->seeds);
|
else a[k++] = a[i];
|
||||||
c->n = c->m = 0;
|
|
||||||
} else c->n = -c->n;
|
|
||||||
}
|
}
|
||||||
for (i = n = 0; i < n_chn; ++i) { // squeeze out discarded chains
|
return k;
|
||||||
if (chains[i].n > 0) {
|
|
||||||
if (n != i) chains[n++] = chains[i];
|
|
||||||
else ++n;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return n;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/******************************
|
/******************************
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue