diff --git a/bwamem.c b/bwamem.c index ae70af7..33c911d 100644 --- a/bwamem.c +++ b/bwamem.c @@ -27,6 +27,8 @@ mem_opt_t *mem_opt_init() o->min_seed_len = 17; o->max_occ = 10; o->max_chain_gap = 10000; + o->mask_level = 0.50; + o->chain_drop_ratio = 0.33; mem_fill_scmat(o->a, o->b, o->mat); return o; } @@ -218,18 +220,18 @@ void mem_chain_flt(const mem_opt_t *opt, mem_chain_t *chn) for (i = 0; i < chn->n; ++i) { mem_chain1_t *c = &chn->chains[i]; int w = 0; - for (j = 0; j < c->n; ++j) w += c->len; + for (j = 0; j < c->n; ++j) w += c->seeds[j].len; a[i].beg = c->seeds[0].qbeg; a[i].end = c->seeds[c->n-1].qbeg + c->seeds[c->n-1].len; a[i].w = w; a[i].p = c; - a[i].w2 = 0; a[i].p2 = 0; + a[i].p2 = 0; } ks_introsort(mem_flt, chn->n, a); for (i = 1, n = 1; i < chn->n; ++i) { for (j = 0; j < n; ++j) { int b_max = a[j].beg > a[i].beg? a[j].beg : a[i].beg; - int e_min = e[j].end < a[i].end? a[j].end : a[i].end; + int e_min = a[j].end < a[i].end? a[j].end : a[i].end; if (e_min > b_max) { // have overlap int min_l = a[i].end - a[i].beg < a[j].end - a[j].beg? a[i].end - a[i].beg : a[j].end - a[j].beg; if (e_min - b_max >= min_l * opt->mask_level) { // significant overlap @@ -241,7 +243,27 @@ void mem_chain_flt(const mem_opt_t *opt, mem_chain_t *chn) } if (j == n) a[n++] = a[i]; // if have no significant overlap with better chains, keep it. } + for (i = 0; i < n; ++i) { // mark chains to be kept + mem_chain1_t *c = (mem_chain1_t*)a[i].p; + if (c->n > 0) c->n = -c->n; + c = (mem_chain1_t*)a[i].p2; + if (c && c->n > 0) c->n = -c->n; + } free(a); + for (i = 0; i < chn->n; ++i) { // free discarded chains + mem_chain1_t *c = &chn->chains[i]; + if (c->n >= 0) { + free(c->seeds); + c->n = c->m = 0; + } else c->n = -c->n; + } + for (i = n = 0; i < chn->n; ++i) { // squeeze out discarded chains + if (chn->chains[i].n > 0) { + if (n != i) chn->chains[n++] = chn->chains[i]; + else ++n; + } + } + chn->n = n; } /**************************************** diff --git a/bwamem.h b/bwamem.h index 0484edf..adf57dd 100644 --- a/bwamem.h +++ b/bwamem.h @@ -15,6 +15,7 @@ typedef struct { int a, b, q, r, w; int min_seed_len, max_occ, max_chain_gap; int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset + float mask_level, chain_drop_ratio; } mem_opt_t; typedef struct { @@ -47,6 +48,7 @@ mem_opt_t *mem_opt_init(void); void mem_fill_scmat(int a, int b, int8_t mat[25]); mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq); +void mem_chain_flt(const mem_opt_t *opt, mem_chain_t *chn); void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c, mem_aln_t *a); #ifdef __cplusplus diff --git a/fastmap.c b/fastmap.c index 797a22f..f02224a 100644 --- a/fastmap.c +++ b/fastmap.c @@ -51,6 +51,7 @@ int main_mem(int argc, char *argv[]) for (i = 0; i < seq->seq.l; ++i) seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; chain = mem_chain(opt, bwt, seq->seq.l, (uint8_t*)seq->seq.s); + mem_chain_flt(opt, &chain); for (i = 0; i < chain.n; ++i) { mem_chain1_t *p = &chain.chains[i]; mem_aln_t a;