From 8512b55ce32122ebfd78328ca0b65dc2dab66127 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 24 Oct 2011 13:42:32 -0400 Subject: [PATCH] bwasw works on a couple of sequences --- bwtsw2_aux.c | 55 ++++++++++++++++----------------------------------- bwtsw2_core.c | 21 +++++++++++++------- 2 files changed, 31 insertions(+), 45 deletions(-) diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index 9dd8684..8f25734 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -75,9 +75,7 @@ void bsw2_destroy(bwtsw2_t *b) (par).row = 5; (par).band_width = opt->bw; \ } while (0) -#define __rpac(pac, l, i) (pac[(l-i-1)>>2] >> (~(l-i-1)&3)*2 & 0x3) - -void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq, uint8_t *pac, bwtint_t l_pac, int is_rev, uint8_t *_mem) +void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq, uint8_t *pac, bwtint_t l_pac, uint8_t *_mem) { int i, matrix[25]; bwtint_t k; @@ -103,19 +101,14 @@ void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq for (j = score = 0; j < i; ++j) { bsw2hit_t *q = b->hits + j; if (q->beg <= p->beg && q->k <= p->k && q->k + q->len >= p->k + p->len) { - if (q->n_seeds < (1<<14) - 2) ++q->n_seeds; + if (q->n_seeds < (1<<13) - 2) ++q->n_seeds; ++score; } } if (score) continue; if (lt > p->k) lt = p->k; - if (is_rev) { - for (k = p->k - 1, j = 0; k > 0 && j < lt; --k) // FIXME: k=0 not considered! - target[j++] = __rpac(pac, l_pac, k); - } else { - for (k = p->k - 1, j = 0; k > 0 && j < lt; --k) // FIXME: k=0 not considered! - target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3; - } + for (k = p->k - 1, j = 0; k > 0 && j < lt; --k) // FIXME: k=0 not considered! + target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3; lt = j; score = aln_extend_core(target, lt, query + lq - p->beg, p->beg, &par, &path, 0, p->G, _mem); if (score > p->G) { // extensible @@ -128,7 +121,7 @@ void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq free(query); free(target); } -void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq, uint8_t *pac, bwtint_t l_pac, int is_rev, uint8_t *_mem) +void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq, uint8_t *pac, bwtint_t l_pac, uint8_t *_mem) { int i, matrix[25]; bwtint_t k; @@ -144,13 +137,8 @@ void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq, int j, score; path_t path; if (p->l) continue; - if (is_rev) { - for (k = p->k, j = 0; k < p->k + lt && k < l_pac; ++k) - target[j++] = __rpac(pac, l_pac, k); - } else { - for (k = p->k, j = 0; k < p->k + lt && k < l_pac; ++k) - target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3; - } + for (k = p->k, j = 0; k < p->k + lt && k < l_pac; ++k) + target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3; lt = j; score = aln_extend_core(target, lt, query + p->beg, lq - p->beg, &par, &path, 0, 1, _mem); // if (score < p->G) fprintf(stderr, "[bsw2_extend_hits] %d < %d\n", score, p->G); @@ -222,8 +210,8 @@ void bsw2_debug_hits(const bwtsw2_t *b) printf("# raw hits: %d\n", b->n); for (i = 0; i < b->n; ++i) { bsw2hit_t *p = b->hits + i; - if (p->l == 0) - printf("G=%d, [%d,%d), k=%lu, l=%lu\n", p->G, p->beg, p->end, (long)p->k, (long)p->l); + if (p->G > 0) + printf("G=%d, len=%d, [%d,%d), k=%lu, l=%lu, #seeds=%d, is_rev=%d\n", p->G, p->len, p->beg, p->end, (long)p->k, (long)p->l, p->n_seeds, p->is_rev); } } @@ -250,7 +238,7 @@ static void merge_hits(bwtsw2_t *b[2], int l, int is_reverse) } /* seq[0] is the forward sequence and seq[1] is the reverse complement. */ static bwtsw2_t *bsw2_aln1_core(const bsw2opt_t *opt, const bntseq_t *bns, uint8_t *pac, const bwt_t *target, - int l, uint8_t *seq[2], int is_rev, bsw2global_t *pool) + int l, uint8_t *seq[2], bsw2global_t *pool) { extern void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2]); bwtsw2_t *b[2], **bb[2], **_b, *p; @@ -278,20 +266,16 @@ static bwtsw2_t *bsw2_aln1_core(const bsw2opt_t *opt, const bntseq_t *bns, uint8 int x = q->beg; q->beg = l - q->end; q->end = l - x; - q->k -= q->len; } } } - //bsw2_debug_hits(bb[0][1]); - bsw2_debug_hits(bb[1][1]); b[0] = bb[0][1]; b[1] = bb[1][1]; // bb[*][1] are "narrow SA hits" bsw2_chain_filter(opt, l, b); for (k = 0; k < 2; ++k) { - bsw2_extend_left(opt, bb[k][1], seq[k], l, pac, bns->l_pac, is_rev, pool->aln_mem); - if (k == 1) bsw2_debug_hits(bb[k][1]); + bsw2_extend_left(opt, bb[k][1], seq[k], l, pac, bns->l_pac, pool->aln_mem); merge_hits(bb[k], l, 0); // bb[k][1] is merged to bb[k][0] here bsw2_resolve_duphits(0, 0, bb[k][0], 0); - bsw2_extend_rght(opt, bb[k][0], seq[k], l, pac, bns->l_pac, is_rev, pool->aln_mem); + bsw2_extend_rght(opt, bb[k][0], seq[k], l, pac, bns->l_pac, pool->aln_mem); b[k] = bb[k][0]; free(bb[k]); } @@ -528,18 +512,13 @@ static void bsw2_aln_core(int tid, bsw2seq_t *_seq, const bsw2opt_t *_opt, const free(seq[0]); continue; } // alignment - b[0] = bsw2_aln1_core(&opt, bns, pac, target, l, seq, 0, pool); + b[0] = bsw2_aln1_core(&opt, bns, pac, target, l, seq, pool); for (k = 0; k < b[0]->n; ++k) if (b[0]->hits[k].n_seeds < opt.t_seeds) break; - if (0 && k < b[0]->n) { - b[1] = bsw2_aln1_core(&opt, bns, pac, target, l, rseq, 1, pool); - for (i = 0; i < b[1]->n; ++i) { - bsw2hit_t *p = b[1]->hits + i; - int x = p->beg; - p->beg = l - p->end; - p->end = l - x; - if (p->l == 0) p->k = bns->l_pac - (p->k + p->len); - } + if (k < b[0]->n) { + b[1] = bsw2_aln1_core(&opt, bns, pac, target, l, rseq, pool); + for (i = 0; i < b[1]->n; ++i) // flip the strand flag + b[1]->hits[i].flag ^= 0x10, b[1]->hits[i].is_rev ^= 1; flag_fr(b); merge_hits(b, l, 0); bsw2_resolve_duphits(0, 0, b[0], 0); diff --git a/bwtsw2_core.c b/bwtsw2_core.c index 244e121..f7b72de 100644 --- a/bwtsw2_core.c +++ b/bwtsw2_core.c @@ -35,7 +35,7 @@ typedef struct { #include "ksort.h" KSORT_INIT_GENERIC(int) -#define __hitG_lt(a, b) ((a).G > (b).G) +#define __hitG_lt(a, b) ((a).n_seeds > (b).n_seeds || ((a).n_seeds == (b).n_seeds && (a).G > (b).G)) KSORT_INIT(hitG, bsw2hit_t, __hitG_lt) static const bsw2cell_t g_default_cell = { 0, 0, MINUS_INF, MINUS_INF, MINUS_INF, 0, 0, 0, -1, -1, {-1, -1, -1, -1} }; @@ -270,10 +270,10 @@ int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int { int i, j, n, ref_id, is_rev; if (b->n == 0) return 0; - if (bwt && bns) { // convert to chromosomal coordinates if suitable + if (bwt && bns) { // convert to chromosomal coordinates if requested int old_n = b->n; bsw2hit_t *old_hits = b->hits; - for (i = n = 0; i < b->n; ++i) { // compute memory needed to be allocated + for (i = n = 0; i < b->n; ++i) { // compute the memory to allocated bsw2hit_t *p = old_hits + i; if (p->l - p->k + 1 <= IS) n += p->l - p->k + 1; else if (p->G > 0) ++n; @@ -290,6 +290,7 @@ int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int b->hits[j].k = bns_pos2refId(bns, bwt_sa(bwt, k), 1, &ref_id, &is_rev); b->hits[j].l = 0; b->hits[j].is_rev = is_rev; + if (is_rev) b->hits[j].k -= p->len; ++j; } } else if (p->G > 0) { @@ -298,25 +299,28 @@ int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int b->hits[j].l = 0; b->hits[j].flag |= 1; b->hits[j].is_rev = is_rev; + if (is_rev) b->hits[j].k -= p->len; ++j; } } free(old_hits); } + for (i = j = 0; i < b->n; ++i) // squeeze out empty elements + if (b->hits[i].G) b->hits[j++] = b->hits[i]; + b->n = j; ks_introsort(hitG, b->n, b->hits); for (i = 1; i < b->n; ++i) { bsw2hit_t *p = b->hits + i; - if (p->G == 0) break; for (j = 0; j < i; ++j) { bsw2hit_t *q = b->hits + j; int compatible = 1; - if (q->G == 0) continue; + if (p->is_rev != q->is_rev) continue; // hits from opposite strands are not duplicates if (p->l == 0 && q->l == 0) { - int qol = (p->end < q->end? p->end : q->end) - (p->beg > q->beg? p->beg : q->beg); + int qol = (p->end < q->end? p->end : q->end) - (p->beg > q->beg? p->beg : q->beg); // length of query overlap if (qol < 0) qol = 0; if ((float)qol / (p->end - p->beg) > MASK_LEVEL || (float)qol / (q->end - q->beg) > MASK_LEVEL) { int64_t tol = (int64_t)(p->k + p->len < q->k + q->len? p->k + p->len : q->k + q->len) - - (int64_t)(p->k > q->k? p->k : q->k); + - (int64_t)(p->k > q->k? p->k : q->k); // length of target overlap if ((double)tol / p->len > MASK_LEVEL || (double)tol / q->len > MASK_LEVEL) compatible = 0; } @@ -594,6 +598,9 @@ bwtsw2_t **bsw2_core(const bntseq_t *bns, const bsw2opt_t *opt, const bwtl_t *ta mp_free(stack->pool, v); } // while(top) getrusage(0, &curr); + for (i = 0; i < 2; ++i) + for (j = 0; j < b_ret[i]->n; ++j) + b_ret[i]->hits[j].n_seeds = 0; bsw2_resolve_duphits(bns, query, b, opt->is); bsw2_resolve_duphits(bns, query, b1, opt->is); //fprintf(stderr, "stats: %.3lf sec; %d elems\n", time_elapse(&curr, &last), n_tot);