basic chaining working

Definitely suboptimal in a lot of corner cases...
This commit is contained in:
Heng Li 2013-01-31 16:26:05 -05:00
parent 6c19c9640c
commit 8977737460
3 changed files with 57 additions and 11 deletions

View File

@ -81,7 +81,7 @@ static int test_and_merge(const memopt_t *opt, memchain1_t *c, const memseed_t *
return 0;
}
void mem_insert_seed(const memopt_t *opt, kbtree_t(chn) *tree, smem_i *itr)
static void mem_insert_seed(const memopt_t *opt, kbtree_t(chn) *tree, smem_i *itr)
{
while (smem_next(itr) > 0) {
int i;
@ -89,21 +89,22 @@ void mem_insert_seed(const memopt_t *opt, kbtree_t(chn) *tree, smem_i *itr)
bwtintv_t *p = &itr->matches->a[i];
int slen = (uint32_t)p->info - (p->info>>32); // seed length
int64_t k;
if (slen >= opt->min_seed_len || p->x[2] > opt->max_occ) continue;
if (slen < opt->min_seed_len || p->x[2] > opt->max_occ) continue;
for (k = 0; k < p->x[2]; ++k) {
memchain1_t tmp, *lower, *upper;
memseed_t c1;
memseed_t s;
int to_add = 0;
c1.rbeg = tmp.pos = bwt_sa(itr->bwt, p->x[0] + k);
c1.qbeg = p->info>>32;
c1.len = slen;
s.rbeg = tmp.pos = bwt_sa(itr->bwt, p->x[0] + k);
s.qbeg = p->info>>32;
s.len = slen;
if (kb_size(tree)) {
kb_intervalp(chn, tree, &tmp, &lower, &upper);
if (!test_and_merge(opt, lower, &c1)) to_add = 1;
} to_add = 1;
if (!lower || !test_and_merge(opt, lower, &s)) to_add = 1;
} else to_add = 1;
if (to_add) {
tmp.n = 1; tmp.m = 4;
tmp.seeds = calloc(tmp.m, sizeof(memseed_t));
tmp.seeds[0] = s;
kb_putp(chn, tree, &tmp);
}
}
@ -111,7 +112,7 @@ void mem_insert_seed(const memopt_t *opt, kbtree_t(chn) *tree, smem_i *itr)
}
}
memchain_t mem_collect_seed(const memopt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq)
memchain_t mem_chain(const memopt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq)
{
memchain_t chain;
smem_i *itr;
@ -122,6 +123,14 @@ memchain_t mem_collect_seed(const memopt_t *opt, const bwt_t *bwt, int len, cons
tree = kb_init(chn, KB_DEFAULT_SIZE);
itr = smem_itr_init(bwt);
smem_set_query(itr, 1, len, seq);
mem_insert_seed(opt, tree, itr);
chain.m = kb_size(tree); chain.n = 0;
chain.chains = malloc(chain.m * sizeof(memchain1_t));
#define traverse_func(p_) (chain.chains[chain.n++] = *(p_))
__kb_traverse(memchain1_t, tree, traverse_func);
#undef traverse_func
smem_itr_destroy(itr);
kb_destroy(chn, tree);

View File

@ -11,7 +11,8 @@ typedef struct {
} smem_i;
typedef struct {
int64_t qbeg, rbeg, len;
int64_t rbeg;
int32_t qbeg, len;
} memseed_t;
typedef struct {
@ -41,6 +42,8 @@ int smem_next(smem_i *itr);
memopt_t *mem_opt_init(void);
memchain_t mem_chain(const memopt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq);
#ifdef __cplusplus
}
#endif

View File

@ -16,7 +16,9 @@ int main_mem(int argc, char *argv[])
memopt_t *opt;
bwt_t *bwt;
bntseq_t *bns;
int c;
int i, j, c;
gzFile *fp;
kseq_t *seq;
opt = mem_opt_init();
while ((c = getopt(argc, argv, "")) >= 0) {
@ -28,6 +30,38 @@ int main_mem(int argc, char *argv[])
free(opt);
return 1;
}
fp = gzopen(argv[optind + 1], "r");
seq = kseq_init(fp);
{ // load the packed sequences, BWT and SA
char *tmp = calloc(strlen(argv[optind]) + 5, 1);
strcat(strcpy(tmp, argv[optind]), ".bwt");
bwt = bwt_restore_bwt(tmp);
strcat(strcpy(tmp, argv[optind]), ".sa");
bwt_restore_sa(tmp, bwt);
free(tmp);
bns = bns_restore(argv[optind]);
}
while (kseq_read(seq) >= 0) {
memchain_t chain;
printf(">%s\n", seq->name.s);
for (i = 0; i < seq->seq.l; ++i)
seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]];
chain = mem_chain(opt, bwt, seq->seq.l, (uint8_t*)seq->seq.s);
for (i = 0; i < chain.n; ++i) {
memchain1_t *p = &chain.chains[i];
printf("%d\t%d", i, p->n);
for (j = 0; j < p->n; ++j) {
bwtint_t pos;
int is_rev, ref_id;
pos = bns_depos(bns, p->seeds[j].rbeg, &is_rev);
if (is_rev) pos -= p->seeds[j].len - 1;
bns_cnt_ambi(bns, pos, p->seeds[j].len, &ref_id);
printf("\t%d,%d,%s:%c%ld", p->seeds[j].len, p->seeds[j].qbeg, bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1);
}
putchar('\n');
}
puts("//");
}
free(opt);
return 0;