2013-02-01 02:59:48 +08:00
|
|
|
#include <stdlib.h>
|
2013-02-01 04:55:22 +08:00
|
|
|
#include <string.h>
|
|
|
|
|
#include <stdio.h>
|
2013-02-01 02:59:48 +08:00
|
|
|
#include "bwamem.h"
|
2013-02-02 03:20:38 +08:00
|
|
|
#include "kvec.h"
|
2013-02-01 02:59:48 +08:00
|
|
|
|
|
|
|
|
memopt_t *mem_opt_init()
|
|
|
|
|
{
|
|
|
|
|
memopt_t *o;
|
|
|
|
|
o = calloc(1, sizeof(memopt_t));
|
|
|
|
|
o->a = 1; o->b = 9; o->q = 16; o->r = 1; o->w = 100;
|
|
|
|
|
o->min_seed_len = 17;
|
|
|
|
|
o->max_occ = 10;
|
2013-02-01 04:55:22 +08:00
|
|
|
o->max_chain_gap = 10000;
|
2013-02-01 02:59:48 +08:00
|
|
|
return o;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/***************************
|
|
|
|
|
* SMEM iterator interface *
|
|
|
|
|
***************************/
|
|
|
|
|
|
2013-02-02 03:38:44 +08:00
|
|
|
struct __smem_i {
|
|
|
|
|
const bwt_t *bwt;
|
|
|
|
|
const uint8_t *query;
|
|
|
|
|
int start, len;
|
|
|
|
|
bwtintv_v *matches; // matches; to be returned by smem_next()
|
|
|
|
|
bwtintv_v *sub; // sub-matches inside the longest match; temporary
|
|
|
|
|
bwtintv_v *tmpvec[2]; // temporary arrays
|
|
|
|
|
};
|
|
|
|
|
|
2013-02-01 02:59:48 +08:00
|
|
|
smem_i *smem_itr_init(const bwt_t *bwt)
|
|
|
|
|
{
|
|
|
|
|
smem_i *itr;
|
|
|
|
|
itr = calloc(1, sizeof(smem_i));
|
|
|
|
|
itr->bwt = bwt;
|
|
|
|
|
itr->tmpvec[0] = calloc(1, sizeof(bwtintv_v));
|
|
|
|
|
itr->tmpvec[1] = calloc(1, sizeof(bwtintv_v));
|
|
|
|
|
itr->matches = calloc(1, sizeof(bwtintv_v));
|
2013-02-02 03:20:38 +08:00
|
|
|
itr->sub = calloc(1, sizeof(bwtintv_v));
|
2013-02-01 02:59:48 +08:00
|
|
|
return itr;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void smem_itr_destroy(smem_i *itr)
|
|
|
|
|
{
|
2013-02-02 04:26:34 +08:00
|
|
|
free(itr->tmpvec[0]->a); free(itr->tmpvec[0]);
|
|
|
|
|
free(itr->tmpvec[1]->a); free(itr->tmpvec[1]);
|
|
|
|
|
free(itr->matches->a); free(itr->matches);
|
|
|
|
|
free(itr->sub->a); free(itr->sub);
|
2013-02-01 02:59:48 +08:00
|
|
|
free(itr);
|
|
|
|
|
}
|
|
|
|
|
|
2013-02-02 03:20:38 +08:00
|
|
|
void smem_set_query(smem_i *itr, int len, const uint8_t *query)
|
2013-02-01 02:59:48 +08:00
|
|
|
{
|
|
|
|
|
itr->query = query;
|
|
|
|
|
itr->start = 0;
|
|
|
|
|
itr->len = len;
|
|
|
|
|
}
|
|
|
|
|
|
2013-02-02 03:38:44 +08:00
|
|
|
const bwtintv_v *smem_next(smem_i *itr, int split_len)
|
2013-02-01 02:59:48 +08:00
|
|
|
{
|
2013-02-02 03:20:38 +08:00
|
|
|
int i, max, max_i;
|
2013-02-02 03:38:44 +08:00
|
|
|
itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = itr->sub->n = 0;
|
|
|
|
|
if (itr->start >= itr->len || itr->start < 0) return 0;
|
2013-02-01 02:59:48 +08:00
|
|
|
while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases
|
2013-02-02 03:38:44 +08:00
|
|
|
if (itr->start == itr->len) return 0;
|
|
|
|
|
itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, itr->start, 1, itr->matches, itr->tmpvec); // search for SMEM
|
|
|
|
|
if (itr->matches->n == 0) return itr->matches; // well, in theory, we should never come here
|
|
|
|
|
for (i = max = 0, max_i = 0; i < itr->matches->n; ++i) { // look for the longest match
|
2013-02-02 03:20:38 +08:00
|
|
|
bwtintv_t *p = &itr->matches->a[i];
|
|
|
|
|
int len = (uint32_t)p->info - (p->info>>32);
|
|
|
|
|
if (max < len) max = len, max_i = i;
|
|
|
|
|
}
|
2013-02-02 03:38:44 +08:00
|
|
|
if (split_len > 0 && max >= split_len && itr->matches->a[max_i].x[2] == 1) { // if the longest SMEM is unique and long
|
2013-02-02 03:20:38 +08:00
|
|
|
int j;
|
2013-02-02 03:38:44 +08:00
|
|
|
bwtintv_v *a = itr->tmpvec[0]; // reuse tmpvec[0] for merging
|
2013-02-02 03:20:38 +08:00
|
|
|
bwtintv_t *p = &itr->matches->a[max_i];
|
2013-02-02 03:38:44 +08:00
|
|
|
bwt_smem1(itr->bwt, itr->len, itr->query, ((uint32_t)p->info + (p->info>>32))>>1, 2, itr->sub, itr->tmpvec); // starting from the middle of the longest MEM
|
2013-02-02 03:20:38 +08:00
|
|
|
i = j = 0; a->n = 0;
|
|
|
|
|
while (i < itr->matches->n && j < itr->sub->n) { // ordered merge
|
|
|
|
|
if (itr->matches->a[i].info < itr->sub->a[j].info) {
|
|
|
|
|
kv_push(bwtintv_t, *a, itr->matches->a[i]);
|
|
|
|
|
++i;
|
|
|
|
|
} else {
|
|
|
|
|
kv_push(bwtintv_t, *a, itr->sub->a[j]);
|
|
|
|
|
++j;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
for (; i < itr->matches->n; ++i) kv_push(bwtintv_t, *a, itr->matches->a[i]);
|
|
|
|
|
for (; j < itr->sub->n; ++j) kv_push(bwtintv_t, *a, itr->sub->a[j]);
|
|
|
|
|
kv_copy(bwtintv_t, *itr->matches, *a);
|
|
|
|
|
}
|
2013-02-02 03:38:44 +08:00
|
|
|
return itr->matches;
|
2013-02-01 02:59:48 +08:00
|
|
|
}
|
|
|
|
|
|
2013-02-01 04:55:22 +08:00
|
|
|
#include "kbtree.h"
|
|
|
|
|
|
2013-02-01 05:39:24 +08:00
|
|
|
#define chain_cmp(a, b) ((a).pos - (b).pos)
|
|
|
|
|
KBTREE_INIT(chn, memchain1_t, chain_cmp)
|
2013-02-01 04:55:22 +08:00
|
|
|
|
|
|
|
|
static int test_and_merge(const memopt_t *opt, memchain1_t *c, const memseed_t *p)
|
|
|
|
|
{
|
|
|
|
|
int64_t qend, rend, x, y;
|
|
|
|
|
const memseed_t *last = &c->seeds[c->n-1];
|
|
|
|
|
qend = last->qbeg + last->len;
|
|
|
|
|
rend = last->rbeg + last->len;
|
2013-02-01 05:39:24 +08:00
|
|
|
if (p->qbeg >= c->seeds[0].qbeg && p->qbeg + p->len <= qend && p->rbeg >= c->seeds[0].rbeg && p->rbeg + p->len <= rend)
|
2013-02-01 04:55:22 +08:00
|
|
|
return 1; // contained seed; do nothing
|
|
|
|
|
x = p->qbeg - last->qbeg; // always positive
|
|
|
|
|
y = p->rbeg - last->rbeg;
|
2013-02-02 03:38:44 +08:00
|
|
|
if (y > 0 && x - y <= opt->w && y - x <= opt->w && x - last->len < opt->max_chain_gap && y - last->len < opt->max_chain_gap) { // grow the chain
|
2013-02-01 04:55:22 +08:00
|
|
|
if (c->n == c->m) {
|
|
|
|
|
c->m <<= 1;
|
|
|
|
|
c->seeds = realloc(c->seeds, c->m * sizeof(memseed_t));
|
|
|
|
|
}
|
|
|
|
|
c->seeds[c->n++] = *p;
|
|
|
|
|
return 1;
|
|
|
|
|
}
|
2013-02-02 03:38:44 +08:00
|
|
|
return 0; // request to add a new chain
|
2013-02-01 04:55:22 +08:00
|
|
|
}
|
|
|
|
|
|
2013-02-01 05:26:05 +08:00
|
|
|
static void mem_insert_seed(const memopt_t *opt, kbtree_t(chn) *tree, smem_i *itr)
|
2013-02-01 04:55:22 +08:00
|
|
|
{
|
2013-02-02 03:38:44 +08:00
|
|
|
const bwtintv_v *a;
|
|
|
|
|
while ((a = smem_next(itr, opt->min_seed_len<<1)) != 0) { // to find all SMEM and some internal MEM
|
2013-02-01 04:55:22 +08:00
|
|
|
int i;
|
2013-02-02 03:38:44 +08:00
|
|
|
for (i = 0; i < a->n; ++i) { // go through each SMEM/MEM up to itr->start
|
|
|
|
|
bwtintv_t *p = &a->a[i];
|
2013-02-01 04:55:22 +08:00
|
|
|
int slen = (uint32_t)p->info - (p->info>>32); // seed length
|
|
|
|
|
int64_t k;
|
2013-02-02 03:38:44 +08:00
|
|
|
if (slen < opt->min_seed_len || p->x[2] > opt->max_occ) continue; // ignore if too short or too repetitive
|
2013-02-01 04:55:22 +08:00
|
|
|
for (k = 0; k < p->x[2]; ++k) {
|
|
|
|
|
memchain1_t tmp, *lower, *upper;
|
2013-02-01 05:26:05 +08:00
|
|
|
memseed_t s;
|
2013-02-01 04:55:22 +08:00
|
|
|
int to_add = 0;
|
2013-02-02 03:38:44 +08:00
|
|
|
s.rbeg = tmp.pos = bwt_sa(itr->bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference
|
2013-02-01 05:26:05 +08:00
|
|
|
s.qbeg = p->info>>32;
|
|
|
|
|
s.len = slen;
|
2013-02-01 04:55:22 +08:00
|
|
|
if (kb_size(tree)) {
|
2013-02-02 03:38:44 +08:00
|
|
|
kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain
|
2013-02-01 05:26:05 +08:00
|
|
|
if (!lower || !test_and_merge(opt, lower, &s)) to_add = 1;
|
|
|
|
|
} else to_add = 1;
|
2013-02-02 03:38:44 +08:00
|
|
|
if (to_add) { // add the seed as a new chain
|
2013-02-01 04:55:22 +08:00
|
|
|
tmp.n = 1; tmp.m = 4;
|
|
|
|
|
tmp.seeds = calloc(tmp.m, sizeof(memseed_t));
|
2013-02-01 05:26:05 +08:00
|
|
|
tmp.seeds[0] = s;
|
2013-02-01 04:55:22 +08:00
|
|
|
kb_putp(chn, tree, &tmp);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2013-02-01 05:26:05 +08:00
|
|
|
memchain_t mem_chain(const memopt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq)
|
2013-02-01 04:55:22 +08:00
|
|
|
{
|
|
|
|
|
memchain_t chain;
|
|
|
|
|
smem_i *itr;
|
|
|
|
|
kbtree_t(chn) *tree;
|
|
|
|
|
|
|
|
|
|
memset(&chain, 0, sizeof(memchain_t));
|
|
|
|
|
if (len < opt->min_seed_len) return chain; // if the query is shorter than the seed length, no match
|
|
|
|
|
tree = kb_init(chn, KB_DEFAULT_SIZE);
|
|
|
|
|
itr = smem_itr_init(bwt);
|
2013-02-02 03:20:38 +08:00
|
|
|
smem_set_query(itr, len, seq);
|
2013-02-01 05:26:05 +08:00
|
|
|
mem_insert_seed(opt, tree, itr);
|
|
|
|
|
|
|
|
|
|
chain.m = kb_size(tree); chain.n = 0;
|
|
|
|
|
chain.chains = malloc(chain.m * sizeof(memchain1_t));
|
|
|
|
|
|
|
|
|
|
#define traverse_func(p_) (chain.chains[chain.n++] = *(p_))
|
|
|
|
|
__kb_traverse(memchain1_t, tree, traverse_func);
|
|
|
|
|
#undef traverse_func
|
2013-02-01 04:55:22 +08:00
|
|
|
|
|
|
|
|
smem_itr_destroy(itr);
|
|
|
|
|
kb_destroy(chn, tree);
|
|
|
|
|
return chain;
|
|
|
|
|
}
|