diff --git a/bwtsw2.h b/bwtsw2.h index 0a0571f..3c93509 100644 --- a/bwtsw2.h +++ b/bwtsw2.h @@ -15,7 +15,7 @@ typedef struct { typedef struct { bwtint_t k, l; - uint32_t flag:18, n_seeds:14; + uint32_t flag:18, n_seeds:13, is_rev:1; int len, G, G2; int beg, end; } bsw2hit_t; @@ -38,8 +38,8 @@ extern "C" { #endif bsw2opt_t *bsw2_init_opt(); - bwtsw2_t **bsw2_core(const bsw2opt_t *opt, const bwtl_t *target, const bwt_t *query, bsw2global_t *pool); - void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target[2], const char *fn); + bwtsw2_t **bsw2_core(const bntseq_t *bns, const bsw2opt_t *opt, const bwtl_t *target, const bwt_t *query, bsw2global_t *pool); + void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, const char *fn); void bsw2_destroy(bwtsw2_t *b); bsw2global_t *bsw2_global_init(); diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index ced8b54..9836435 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -42,7 +42,7 @@ unsigned char nt_comp_table[256] = { 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N' }; -extern int bsw2_resolve_duphits(const bwt_t *bwt, bwtsw2_t *b, int IS); +extern int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int IS); extern int bsw2_resolve_query_overlaps(bwtsw2_t *b, float mask_level); bsw2opt_t *bsw2_init_opt() @@ -253,25 +253,41 @@ static bwtsw2_t *bsw2_aln1_core(const bsw2opt_t *opt, const bntseq_t *bns, uint8 int l, uint8_t *seq[2], int is_rev, bsw2global_t *pool) { extern void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2]); - bwtsw2_t *b[2], **bb[2]; - int k; + bwtsw2_t *b[2], **bb[2], **_b, *p; + int k, j; + bwtl_t *query; + query = bwtl_seq2bwtl(l, seq[0]); + _b = bsw2_core(bns, opt, query, target, pool); + bwtl_destroy(query); for (k = 0; k < 2; ++k) { - bwtl_t *query = bwtl_seq2bwtl(l, seq[k]); - bb[k] = bsw2_core(opt, query, target, pool); - bwtl_destroy(query); + bb[k] = calloc(2, sizeof(void*)); + bb[k][0] = calloc(1, sizeof(bwtsw2_t)); + bb[k][1] = calloc(1, sizeof(bwtsw2_t)); + } + fprintf(stderr, "here!\n"); + for (k = 0; k < 2; ++k) { // separate _b into bb[2] based on the strand + for (j = 0; j < _b[k]->n; ++j) { + p = bb[k][_b[k]->hits[j].is_rev]; + if (p->n == p->max) { + p->max = p->max? p->max<<1 : 8; + p->hits = realloc(p->hits, p->max * sizeof(bsw2hit_t)); + } + p->hits[p->n++] = _b[k]->hits[j]; + } } b[0] = bb[0][1]; b[1] = bb[1][1]; // bb[*][1] are "narrow SA hits" bsw2_chain_filter(opt, l, b); for (k = 0; k < 2; ++k) { bsw2_extend_left(opt, bb[k][1], seq[k], l, pac, bns->l_pac, is_rev, pool->aln_mem); merge_hits(bb[k], l, 0); // bb[k][1] is merged to bb[k][0] here - bsw2_resolve_duphits(0, bb[k][0], 0); + bsw2_resolve_duphits(0, 0, bb[k][0], 0); bsw2_extend_rght(opt, bb[k][0], seq[k], l, pac, bns->l_pac, is_rev, pool->aln_mem); b[k] = bb[k][0]; free(bb[k]); } merge_hits(b, l, 1); // again, b[1] is merged to b[0] bsw2_resolve_query_overlaps(b[0], opt->mask_level); + bsw2_destroy(_b[0]); bsw2_destroy(_b[1]); free(_b); return b[0]; } @@ -453,7 +469,7 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks /* Core routine to align reads in _seq. It is separated from * process_seqs() to realize multi-threading */ -static void bsw2_aln_core(int tid, bsw2seq_t *_seq, const bsw2opt_t *_opt, const bntseq_t *bns, uint8_t *pac, bwt_t * const target[2]) +static void bsw2_aln_core(int tid, bsw2seq_t *_seq, const bsw2opt_t *_opt, const bntseq_t *bns, uint8_t *pac, bwt_t * const target) { int x; bsw2opt_t opt = *_opt; @@ -502,11 +518,11 @@ static void bsw2_aln_core(int tid, bsw2seq_t *_seq, const bsw2opt_t *_opt, const free(seq[0]); continue; } // alignment - b[0] = bsw2_aln1_core(&opt, bns, pac, target[0], l, seq, 0, pool); + b[0] = bsw2_aln1_core(&opt, bns, pac, target, l, seq, 0, pool); for (k = 0; k < b[0]->n; ++k) if (b[0]->hits[k].n_seeds < opt.t_seeds) break; - if (k < b[0]->n) { - b[1] = bsw2_aln1_core(&opt, bns, pac, target[1], l, rseq, 1, pool); + if (0 && k < b[0]->n) { + b[1] = bsw2_aln1_core(&opt, bns, pac, target, l, rseq, 1, pool); for (i = 0; i < b[1]->n; ++i) { bsw2hit_t *p = b[1]->hits + i; int x = p->beg; @@ -516,7 +532,7 @@ static void bsw2_aln_core(int tid, bsw2seq_t *_seq, const bsw2opt_t *_opt, const } flag_fr(b); merge_hits(b, l, 0); - bsw2_resolve_duphits(0, b[0], 0); + bsw2_resolve_duphits(0, 0, b[0], 0); bsw2_resolve_query_overlaps(b[0], opt.mask_level); } else b[1] = 0; // generate CIGAR and print SAM @@ -536,7 +552,7 @@ typedef struct { const bsw2opt_t *_opt; const bntseq_t *bns; uint8_t *pac; - bwt_t *target[2]; + bwt_t *target; } thread_aux_t; /* another interface to bsw2_aln_core() to facilitate pthread_create() */ @@ -550,7 +566,7 @@ static void *worker(void *data) /* process sequences stored in _seq, generate SAM lines for these * sequences and reset _seq afterwards. */ -static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t *bns, uint8_t *pac, bwt_t * const target[2]) +static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t *bns, uint8_t *pac, bwt_t * const target) { int i; @@ -569,7 +585,7 @@ static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t * for (j = 0; j < opt->n_threads; ++j) { thread_aux_t *p = data + j; p->tid = j; p->_seq = _seq; p->_opt = opt; p->bns = bns; - p->pac = pac; p->target[0] = target[0]; p->target[1] = target[1]; + p->pac = pac; p->target = target; pthread_create(&tid[j], &attr, worker, p); } for (j = 0; j < opt->n_threads; ++j) pthread_join(tid[j], 0); @@ -591,7 +607,7 @@ static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t * _seq->n = 0; } -void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target[2], const char *fn) +void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, const char *fn) { gzFile fp; kseq_t *ks; diff --git a/bwtsw2_core.c b/bwtsw2_core.c index c3431cf..0440e93 100644 --- a/bwtsw2_core.c +++ b/bwtsw2_core.c @@ -266,14 +266,14 @@ static void save_narrow_hits(const bwtl_t *bwtl, bsw2entry_t *u, bwtsw2_t *b1, i } /* after this, "narrow SA hits" will be expanded and the coordinates * will be obtained and stored in b->hits[*].k. */ -int bsw2_resolve_duphits(const bwt_t *bwt, bwtsw2_t *b, int IS) +int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int IS) { - int i, j, n; + int i, j, n, ref_id, is_rev; if (b->n == 0) return 0; - if (bwt) { // convert to chromosomal coordinates if suitable + if (bwt && bns) { // convert to chromosomal coordinates if suitable int old_n = b->n; bsw2hit_t *old_hits = b->hits; - for (i = n = 0; i < b->n; ++i) { + for (i = n = 0; i < b->n; ++i) { // compute memory needed to be allocated bsw2hit_t *p = old_hits + i; if (p->l - p->k + 1 <= IS) n += p->l - p->k + 1; else if (p->G > 0) ++n; @@ -282,19 +282,21 @@ int bsw2_resolve_duphits(const bwt_t *bwt, bwtsw2_t *b, int IS) b->hits = calloc(b->max, sizeof(bsw2hit_t)); for (i = j = 0; i < old_n; ++i) { bsw2hit_t *p = old_hits + i; - if (p->l - p->k + 1 <= IS) { + if (p->l - p->k + 1 <= IS) { // the hit is no so repetitive bwtint_t k; for (k = p->k; k <= p->l; ++k) { b->hits[j] = *p; - b->hits[j].k = bwt_sa(bwt, k); + b->hits[j].k = bns_pos2refId(bns, bwt_sa(bwt, k), 1, &ref_id, &is_rev); b->hits[j].l = 0; + b->hits[j].is_rev = is_rev; ++j; } } else if (p->G > 0) { b->hits[j] = *p; - b->hits[j].k = bwt_sa(bwt, p->k); + b->hits[j].k = bns_pos2refId(bns, bwt_sa(bwt, p->k), 1, &ref_id, &is_rev); b->hits[j].l = 0; b->hits[j].flag |= 1; + b->hits[j].is_rev = is_rev; ++j; } } @@ -434,7 +436,7 @@ static void init_bwtsw2(const bwtl_t *target, const bwt_t *query, bsw2stack_t *s stack_push0(s, u); } /* On return, ret[1] keeps not-so-repetitive hits (narrow SA hits); ret[0] keeps all hits (right?) */ -bwtsw2_t **bsw2_core(const bsw2opt_t *opt, const bwtl_t *target, const bwt_t *query, bsw2global_t *pool) +bwtsw2_t **bsw2_core(const bntseq_t *bns, const bsw2opt_t *opt, const bwtl_t *target, const bwt_t *query, bsw2global_t *pool) { bsw2stack_t *stack = (bsw2stack_t*)pool->stack; bwtsw2_t *b, *b1, **b_ret; @@ -591,8 +593,8 @@ bwtsw2_t **bsw2_core(const bsw2opt_t *opt, const bwtl_t *target, const bwt_t *qu mp_free(stack->pool, v); } // while(top) getrusage(0, &curr); - bsw2_resolve_duphits(query, b, opt->is); - bsw2_resolve_duphits(query, b1, opt->is); + bsw2_resolve_duphits(bns, query, b, opt->is); + bsw2_resolve_duphits(bns, query, b1, opt->is); //fprintf(stderr, "stats: %.3lf sec; %d elems\n", time_elapse(&curr, &last), n_tot); // free free(heap); diff --git a/bwtsw2_main.c b/bwtsw2_main.c index afbad2e..281efb1 100644 --- a/bwtsw2_main.c +++ b/bwtsw2_main.c @@ -10,7 +10,7 @@ int bwa_bwtsw2(int argc, char *argv[]) { bsw2opt_t *opt; - bwt_t *target[2]; + bwt_t *target; char buf[1024]; bntseq_t *bns; int c; @@ -82,16 +82,14 @@ int bwa_bwtsw2(int argc, char *argv[]) opt->t *= opt->a; opt->coef *= opt->a; - strcpy(buf, argv[optind]); target[0] = bwt_restore_bwt(strcat(buf, ".bwt")); - strcpy(buf, argv[optind]); bwt_restore_sa(strcat(buf, ".sa"), target[0]); - strcpy(buf, argv[optind]); target[1] = bwt_restore_bwt(strcat(buf, ".rbwt")); - strcpy(buf, argv[optind]); bwt_restore_sa(strcat(buf, ".rsa"), target[1]); + strcpy(buf, argv[optind]); target = bwt_restore_bwt(strcat(buf, ".bwt")); + strcpy(buf, argv[optind]); bwt_restore_sa(strcat(buf, ".sa"), target); bns = bns_restore(argv[optind]); bsw2_aln(opt, bns, target, argv[optind+1]); bns_destroy(bns); - bwt_destroy(target[0]); bwt_destroy(target[1]); + bwt_destroy(target); free(opt); return 0;