better ref extraction

This commit is contained in:
Heng Li 2013-02-04 16:08:00 -05:00
parent 788e9d1e3d
commit 29c8546679
1 changed files with 14 additions and 12 deletions

View File

@ -197,23 +197,26 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen)
} }
mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c) mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c)
{ { // FIXME: in general, we SHOULD check funny seed patterns such as contained seeds
mem_aln_t a; mem_aln_t a;
int i, j, qbeg, qend; int i, j, qbeg;
int64_t rlen, rbeg, rend, rmax[2], tmp; int64_t rlen, rbeg, rmax[2], tmp;
mem_seed_t *s; const mem_seed_t *s;
uint8_t *rseq = 0; uint8_t *rseq = 0;
memset(&a, 0, sizeof(mem_aln_t)); memset(&a, 0, sizeof(mem_aln_t));
// get the start and end of the seeded region // get the start and end of the seeded region
rbeg = c->seeds[0].rbeg; qbeg = c->seeds[0].qbeg; rbeg = c->seeds[0].rbeg; qbeg = c->seeds[0].qbeg;
s = &c->seeds[c->n-1];
rend = s->rbeg + s->len; qend = s->qbeg + s->len;
// get the max possible span // get the max possible span
rmax[0] = rbeg - (qbeg + cal_max_gap(opt, qbeg)); rmax[0] = l_pac<<1; rmax[1] = 0;
rmax[1] = rend + ((l_query - qend) + cal_max_gap(opt, l_query - qend)); for (i = 0; i < c->n; ++i) {
if (rmax[0] < 0) rmax[0] = 0; int64_t b, e;
if (rmax[1] > l_pac<<1) rmax[1] = l_pac<<1; const mem_seed_t *t = &c->seeds[i];
b = t->rbeg - (t->qbeg + cal_max_gap(opt, t->qbeg));
e = t->rbeg + t->len + ((l_query - t->qbeg - t->len) + cal_max_gap(opt, l_query - t->qbeg - t->len));
rmax[0] = rmax[0] < b? rmax[0] : b;
rmax[1] = rmax[1] > e? rmax[1] : e;
}
// retrieve the reference sequence // retrieve the reference sequence
rseq = bns_get_seq(l_pac, pac, rmax[0], rmax[1], &rlen); rseq = bns_get_seq(l_pac, pac, rmax[0], rmax[1], &rlen);
@ -239,14 +242,13 @@ mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac,
for (j = 0; j < rmax[1] - rmax[0] - re; ++j) putchar("ACGTN"[(int)rseq[j+re]]); putchar('\n'); for (j = 0; j < rmax[1] - rmax[0] - re; ++j) putchar("ACGTN"[(int)rseq[j+re]]); putchar('\n');
if (c->n > 1) { // generate $qw if (c->n > 1) { // generate $qw
int l = rmax[1] - (s->rbeg + s->len); int l = rmax[1] - (s->rbeg + s->len);
assert(l >= 0 && l < 1000);
qw = malloc(l * 2); qw = malloc(l * 2);
for (i = 0; i < l; ++i) qw[i] = -1; // no constraint by default for (i = 0; i < l; ++i) qw[i] = -1; // no constraint by default
for (i = 1; i < c->n; ++i) { for (i = 1; i < c->n; ++i) {
const mem_seed_t *t = &c->seeds[i]; const mem_seed_t *t = &c->seeds[i];
for (j = 0; j < t->len; ++j) { for (j = 0; j < t->len; ++j) {
int x = t->rbeg + j - (s->rbeg + s->len), y = t->qbeg + j - (s->qbeg + s->len); int x = t->rbeg + j - (s->rbeg + s->len), y = t->qbeg + j - (s->qbeg + s->len);
assert(x < l); if (x < 0) continue; // overlap with the first seed
if (qw[x] == -1) qw[x] = x > y? x - y : y - x; if (qw[x] == -1) qw[x] = x > y? x - y : y - x;
else if (qw[x] >= 0) qw[x] = -2; // in a seed overlap, do not set any constraint else if (qw[x] >= 0) qw[x] = -2; // in a seed overlap, do not set any constraint
} }