bwasw works on a couple of sequences

This commit is contained in:
Heng Li 2011-10-24 13:42:32 -04:00
parent 4c43c5914d
commit 8512b55ce3
2 changed files with 31 additions and 45 deletions

View File

@ -75,9 +75,7 @@ void bsw2_destroy(bwtsw2_t *b)
(par).row = 5; (par).band_width = opt->bw; \
} while (0)
#define __rpac(pac, l, i) (pac[(l-i-1)>>2] >> (~(l-i-1)&3)*2 & 0x3)
void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq, uint8_t *pac, bwtint_t l_pac, int is_rev, uint8_t *_mem)
void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq, uint8_t *pac, bwtint_t l_pac, uint8_t *_mem)
{
int i, matrix[25];
bwtint_t k;
@ -103,19 +101,14 @@ void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq
for (j = score = 0; j < i; ++j) {
bsw2hit_t *q = b->hits + j;
if (q->beg <= p->beg && q->k <= p->k && q->k + q->len >= p->k + p->len) {
if (q->n_seeds < (1<<14) - 2) ++q->n_seeds;
if (q->n_seeds < (1<<13) - 2) ++q->n_seeds;
++score;
}
}
if (score) continue;
if (lt > p->k) lt = p->k;
if (is_rev) {
for (k = p->k - 1, j = 0; k > 0 && j < lt; --k) // FIXME: k=0 not considered!
target[j++] = __rpac(pac, l_pac, k);
} else {
for (k = p->k - 1, j = 0; k > 0 && j < lt; --k) // FIXME: k=0 not considered!
target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3;
}
for (k = p->k - 1, j = 0; k > 0 && j < lt; --k) // FIXME: k=0 not considered!
target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3;
lt = j;
score = aln_extend_core(target, lt, query + lq - p->beg, p->beg, &par, &path, 0, p->G, _mem);
if (score > p->G) { // extensible
@ -128,7 +121,7 @@ void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq
free(query); free(target);
}
void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq, uint8_t *pac, bwtint_t l_pac, int is_rev, uint8_t *_mem)
void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq, uint8_t *pac, bwtint_t l_pac, uint8_t *_mem)
{
int i, matrix[25];
bwtint_t k;
@ -144,13 +137,8 @@ void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq,
int j, score;
path_t path;
if (p->l) continue;
if (is_rev) {
for (k = p->k, j = 0; k < p->k + lt && k < l_pac; ++k)
target[j++] = __rpac(pac, l_pac, k);
} else {
for (k = p->k, j = 0; k < p->k + lt && k < l_pac; ++k)
target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3;
}
for (k = p->k, j = 0; k < p->k + lt && k < l_pac; ++k)
target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3;
lt = j;
score = aln_extend_core(target, lt, query + p->beg, lq - p->beg, &par, &path, 0, 1, _mem);
// if (score < p->G) fprintf(stderr, "[bsw2_extend_hits] %d < %d\n", score, p->G);
@ -222,8 +210,8 @@ void bsw2_debug_hits(const bwtsw2_t *b)
printf("# raw hits: %d\n", b->n);
for (i = 0; i < b->n; ++i) {
bsw2hit_t *p = b->hits + i;
if (p->l == 0)
printf("G=%d, [%d,%d), k=%lu, l=%lu\n", p->G, p->beg, p->end, (long)p->k, (long)p->l);
if (p->G > 0)
printf("G=%d, len=%d, [%d,%d), k=%lu, l=%lu, #seeds=%d, is_rev=%d\n", p->G, p->len, p->beg, p->end, (long)p->k, (long)p->l, p->n_seeds, p->is_rev);
}
}
@ -250,7 +238,7 @@ static void merge_hits(bwtsw2_t *b[2], int l, int is_reverse)
}
/* seq[0] is the forward sequence and seq[1] is the reverse complement. */
static bwtsw2_t *bsw2_aln1_core(const bsw2opt_t *opt, const bntseq_t *bns, uint8_t *pac, const bwt_t *target,
int l, uint8_t *seq[2], int is_rev, bsw2global_t *pool)
int l, uint8_t *seq[2], bsw2global_t *pool)
{
extern void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2]);
bwtsw2_t *b[2], **bb[2], **_b, *p;
@ -278,20 +266,16 @@ static bwtsw2_t *bsw2_aln1_core(const bsw2opt_t *opt, const bntseq_t *bns, uint8
int x = q->beg;
q->beg = l - q->end;
q->end = l - x;
q->k -= q->len;
}
}
}
//bsw2_debug_hits(bb[0][1]);
bsw2_debug_hits(bb[1][1]);
b[0] = bb[0][1]; b[1] = bb[1][1]; // bb[*][1] are "narrow SA hits"
bsw2_chain_filter(opt, l, b);
for (k = 0; k < 2; ++k) {
bsw2_extend_left(opt, bb[k][1], seq[k], l, pac, bns->l_pac, is_rev, pool->aln_mem);
if (k == 1) bsw2_debug_hits(bb[k][1]);
bsw2_extend_left(opt, bb[k][1], seq[k], l, pac, bns->l_pac, pool->aln_mem);
merge_hits(bb[k], l, 0); // bb[k][1] is merged to bb[k][0] here
bsw2_resolve_duphits(0, 0, bb[k][0], 0);
bsw2_extend_rght(opt, bb[k][0], seq[k], l, pac, bns->l_pac, is_rev, pool->aln_mem);
bsw2_extend_rght(opt, bb[k][0], seq[k], l, pac, bns->l_pac, pool->aln_mem);
b[k] = bb[k][0];
free(bb[k]);
}
@ -528,18 +512,13 @@ static void bsw2_aln_core(int tid, bsw2seq_t *_seq, const bsw2opt_t *_opt, const
free(seq[0]); continue;
}
// alignment
b[0] = bsw2_aln1_core(&opt, bns, pac, target, l, seq, 0, pool);
b[0] = bsw2_aln1_core(&opt, bns, pac, target, l, seq, pool);
for (k = 0; k < b[0]->n; ++k)
if (b[0]->hits[k].n_seeds < opt.t_seeds) break;
if (0 && k < b[0]->n) {
b[1] = bsw2_aln1_core(&opt, bns, pac, target, l, rseq, 1, pool);
for (i = 0; i < b[1]->n; ++i) {
bsw2hit_t *p = b[1]->hits + i;
int x = p->beg;
p->beg = l - p->end;
p->end = l - x;
if (p->l == 0) p->k = bns->l_pac - (p->k + p->len);
}
if (k < b[0]->n) {
b[1] = bsw2_aln1_core(&opt, bns, pac, target, l, rseq, pool);
for (i = 0; i < b[1]->n; ++i) // flip the strand flag
b[1]->hits[i].flag ^= 0x10, b[1]->hits[i].is_rev ^= 1;
flag_fr(b);
merge_hits(b, l, 0);
bsw2_resolve_duphits(0, 0, b[0], 0);

View File

@ -35,7 +35,7 @@ typedef struct {
#include "ksort.h"
KSORT_INIT_GENERIC(int)
#define __hitG_lt(a, b) ((a).G > (b).G)
#define __hitG_lt(a, b) ((a).n_seeds > (b).n_seeds || ((a).n_seeds == (b).n_seeds && (a).G > (b).G))
KSORT_INIT(hitG, bsw2hit_t, __hitG_lt)
static const bsw2cell_t g_default_cell = { 0, 0, MINUS_INF, MINUS_INF, MINUS_INF, 0, 0, 0, -1, -1, {-1, -1, -1, -1} };
@ -270,10 +270,10 @@ int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int
{
int i, j, n, ref_id, is_rev;
if (b->n == 0) return 0;
if (bwt && bns) { // convert to chromosomal coordinates if suitable
if (bwt && bns) { // convert to chromosomal coordinates if requested
int old_n = b->n;
bsw2hit_t *old_hits = b->hits;
for (i = n = 0; i < b->n; ++i) { // compute memory needed to be allocated
for (i = n = 0; i < b->n; ++i) { // compute the memory to allocated
bsw2hit_t *p = old_hits + i;
if (p->l - p->k + 1 <= IS) n += p->l - p->k + 1;
else if (p->G > 0) ++n;
@ -290,6 +290,7 @@ int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int
b->hits[j].k = bns_pos2refId(bns, bwt_sa(bwt, k), 1, &ref_id, &is_rev);
b->hits[j].l = 0;
b->hits[j].is_rev = is_rev;
if (is_rev) b->hits[j].k -= p->len;
++j;
}
} else if (p->G > 0) {
@ -298,25 +299,28 @@ int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int
b->hits[j].l = 0;
b->hits[j].flag |= 1;
b->hits[j].is_rev = is_rev;
if (is_rev) b->hits[j].k -= p->len;
++j;
}
}
free(old_hits);
}
for (i = j = 0; i < b->n; ++i) // squeeze out empty elements
if (b->hits[i].G) b->hits[j++] = b->hits[i];
b->n = j;
ks_introsort(hitG, b->n, b->hits);
for (i = 1; i < b->n; ++i) {
bsw2hit_t *p = b->hits + i;
if (p->G == 0) break;
for (j = 0; j < i; ++j) {
bsw2hit_t *q = b->hits + j;
int compatible = 1;
if (q->G == 0) continue;
if (p->is_rev != q->is_rev) continue; // hits from opposite strands are not duplicates
if (p->l == 0 && q->l == 0) {
int qol = (p->end < q->end? p->end : q->end) - (p->beg > q->beg? p->beg : q->beg);
int qol = (p->end < q->end? p->end : q->end) - (p->beg > q->beg? p->beg : q->beg); // length of query overlap
if (qol < 0) qol = 0;
if ((float)qol / (p->end - p->beg) > MASK_LEVEL || (float)qol / (q->end - q->beg) > MASK_LEVEL) {
int64_t tol = (int64_t)(p->k + p->len < q->k + q->len? p->k + p->len : q->k + q->len)
- (int64_t)(p->k > q->k? p->k : q->k);
- (int64_t)(p->k > q->k? p->k : q->k); // length of target overlap
if ((double)tol / p->len > MASK_LEVEL || (double)tol / q->len > MASK_LEVEL)
compatible = 0;
}
@ -594,6 +598,9 @@ bwtsw2_t **bsw2_core(const bntseq_t *bns, const bsw2opt_t *opt, const bwtl_t *ta
mp_free(stack->pool, v);
} // while(top)
getrusage(0, &curr);
for (i = 0; i < 2; ++i)
for (j = 0; j < b_ret[i]->n; ++j)
b_ret[i]->hits[j].n_seeds = 0;
bsw2_resolve_duphits(bns, query, b, opt->is);
bsw2_resolve_duphits(bns, query, b1, opt->is);
//fprintf(stderr, "stats: %.3lf sec; %d elems\n", time_elapse(&curr, &last), n_tot);