From aabb80773456d29c3551c097f190428110080b81 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 25 Oct 2011 11:22:08 -0400 Subject: [PATCH] concatenate for-rev sequences in the end --- bntseq.c | 70 +++++++++++++++++++++------------------------------ bntseq.h | 7 ++++-- bwase.c | 7 +++--- bwtsw2_core.c | 6 ++--- 4 files changed, 39 insertions(+), 51 deletions(-) diff --git a/bntseq.c b/bntseq.c index 3a90b99..98a5a49 100644 --- a/bntseq.c +++ b/bntseq.c @@ -163,6 +163,9 @@ void bns_destroy(bntseq_t *bns) } } +#define _set_pac(pac, l, c) ((pac)[(l)>>2] |= (c)<<((~(l)&3)<<1)) +#define _get_pac(pac, l) ((pac)[(l)>>2]>>((~(l)&3)<<1)&3) + static uint8_t *add1(const kseq_t *seq, bntseq_t *bns, uint8_t *pac, int64_t *m_pac, int *m_seqs, int *m_holes, bntamb1_t **q) { bntann1_t *p; @@ -178,7 +181,7 @@ static uint8_t *add1(const kseq_t *seq, bntseq_t *bns, uint8_t *pac, int64_t *m_ p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len; p->n_ambs = 0; for (i = lasts = 0; i < seq->seq.l; ++i) { - int c = seq->seq.s[i]; + int c = nst_nt4_table[(int)seq->seq.s[i]]; if (c >= 4) { // N if (lasts == seq->seq.s[i]) { // contiguous N ++(*q)->len; @@ -190,20 +193,20 @@ static uint8_t *add1(const kseq_t *seq, bntseq_t *bns, uint8_t *pac, int64_t *m_ *q = bns->ambs + bns->n_holes; (*q)->len = 1; (*q)->offset = p->offset + i; - (*q)->amb = 'N'; + (*q)->amb = seq->seq.s[i]; ++p->n_ambs; ++bns->n_holes; } } lasts = seq->seq.s[i]; { // fill buffer - if (c >= 4) c = c>>4; + if (c >= 4) c = lrand48()&3; if (bns->l_pac == *m_pac) { // double the pac size *m_pac <<= 1; pac = realloc(pac, *m_pac/4); memset(pac + bns->l_pac/4, 0, (*m_pac - bns->l_pac)/4); } - pac[bns->l_pac>>2] |= c << ((3 - (bns->l_pac&3)) << 1); + _set_pac(pac, bns->l_pac, c); ++bns->l_pac; } } @@ -218,8 +221,8 @@ int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only) char name[1024]; bntseq_t *bns; uint8_t *pac = 0; - int32_t i, m_seqs, m_holes; - int64_t ret = -1, m_pac; + int32_t m_seqs, m_holes; + int64_t ret = -1, m_pac, l; bntamb1_t *q; FILE *fp; @@ -236,19 +239,13 @@ int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only) strcpy(name, prefix); strcat(name, ".pac"); fp = xopen(name, "wb"); // read sequences - while (kseq_read(seq) >= 0) { - for (i = 0; i < seq->seq.l; ++i) { // convert to 2-bit encoding - seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; - if (seq->seq.s[i] > 3) - seq->seq.s[i] |= (lrand48()&3) << 4; - } - pac = add1(seq, bns, pac, &m_pac, &m_seqs, &m_holes, &q); - if (!for_only) { - seq_reverse(seq->seq.l, (uint8_t*)seq->seq.s, 0); // reversed but not complemented - for (i = 0; i < seq->seq.l; ++i) // complement - seq->seq.s[i] = seq->seq.s[i] < 4? 3 - seq->seq.s[i] : ((3 - (seq->seq.s[i]>>4)) << 4 | 4); - pac = add1(seq, bns, pac, &m_pac, &m_seqs, &m_holes, &q); - } + while (kseq_read(seq) >= 0) pac = add1(seq, bns, pac, &m_pac, &m_seqs, &m_holes, &q); + if (!for_only) { // add the reverse complemented sequence + m_pac = (bns->l_pac * 2 + 3) / 4 * 4; + pac = realloc(pac, m_pac/4); + memset(pac + (bns->l_pac+3)/4, 0, (m_pac - (bns->l_pac+3)/4*4) / 4); + for (l = bns->l_pac - 1; l >= 0; --l, ++bns->l_pac) + _set_pac(pac, bns->l_pac, 3-_get_pac(pac, l)); } ret = bns->l_pac; { // finalize .pac file @@ -290,32 +287,21 @@ int bwa_fa2pac(int argc, char *argv[]) return 0; } -int64_t bns_pos2refId(const bntseq_t *bns, int64_t pos, int is_fr, int *ref_id, int *is_rev) -{ - int left, mid, right; - is_fr = is_fr? 1 : 0; - left = 0; mid = 0; right = bns->n_seqs; - while (left < right) { - mid = (left + right) >> 1; - if (pos >= bns->anns[mid].offset<n_seqs - 1) break; - if (pos < bns->anns[mid+1].offset<anns[mid]; - if (pos - (p->offset<<1) < p->len) *is_rev = 0, pos -= p->offset; - else *is_rev = 1, pos = p->len - (pos - (p->offset<<1) - p->len) + p->offset; - } - return pos; -} - int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id) { int left, mid, right, nn; - if (ref_id) bns_pos2refId(bns, pos_f, 0, ref_id, 0); + if (ref_id) { + left = 0; mid = 0; right = bns->n_seqs; + while (left < right) { + mid = (left + right) >> 1; + if (pos_f >= bns->anns[mid].offset) { + if (mid == bns->n_seqs - 1) break; + if (pos_f < bns->anns[mid+1].offset) break; // bracketed + left = mid + 1; + } else right = mid; + } + *ref_id = mid; + } left = 0; right = bns->n_holes; nn = 0; while (left < right) { mid = (left + right) >> 1; diff --git a/bntseq.h b/bntseq.h index 276ef64..0becc01 100644 --- a/bntseq.h +++ b/bntseq.h @@ -71,12 +71,15 @@ extern "C" { bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename); void bns_destroy(bntseq_t *bns); int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only); - int64_t bns_pos2refId(const bntseq_t *bns, int64_t pos, int is_fr, int *ref_id, int *is_rev); int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id); - #ifdef __cplusplus } #endif +static inline int64_t bns_depos(const bntseq_t *bns, int64_t pos, int *is_rev) +{ + return (*is_rev = (pos >= bns->l_pac))? (bns->l_pac<<1) - pos : pos; +} + #endif diff --git a/bwase.c b/bwase.c index 0b6581e..124d6b4 100644 --- a/bwase.c +++ b/bwase.c @@ -111,10 +111,9 @@ int bwa_approx_mapQ(const bwa_seq_t *p, int mm) bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int len, int *strand) { - bwtint_t pos_fr, pos_f; - int is_rev, ref_id; - pos_fr = bwt_sa(bwt, sapos); - pos_f = bns_pos2refId(bns, pos_fr, 1, &ref_id, &is_rev); // pos_f + bwtint_t pos_f; + int is_rev; + pos_f = bns_depos(bns, bwt_sa(bwt, sapos), &is_rev); // pos_f *strand = !is_rev; /* NB: For gapped alignment, pacpos may not be correct, which will be fixed * in refine_gapped_core(). This line also determines the way "x" is diff --git a/bwtsw2_core.c b/bwtsw2_core.c index 846c184..fe87a5f 100644 --- a/bwtsw2_core.c +++ b/bwtsw2_core.c @@ -268,7 +268,7 @@ static void save_narrow_hits(const bwtl_t *bwtl, bsw2entry_t *u, bwtsw2_t *b1, i * will be obtained and stored in b->hits[*].k. */ int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int IS) { - int i, j, n, ref_id, is_rev; + int i, j, n, is_rev; if (b->n == 0) return 0; if (bwt && bns) { // convert to chromosomal coordinates if requested int old_n = b->n; @@ -287,7 +287,7 @@ int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int if (p->G == 0 && p->k == 0 && p->l == 0 && p->len == 0) continue; for (k = p->k; k <= p->l; ++k) { b->hits[j] = *p; - b->hits[j].k = bns_pos2refId(bns, bwt_sa(bwt, k), 1, &ref_id, &is_rev); + b->hits[j].k = bns_depos(bns, bwt_sa(bwt, k), &is_rev); b->hits[j].l = 0; b->hits[j].is_rev = is_rev; if (is_rev) b->hits[j].k -= p->len; @@ -295,7 +295,7 @@ int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int } } else if (p->G > 0) { b->hits[j] = *p; - b->hits[j].k = bns_pos2refId(bns, bwt_sa(bwt, p->k), 1, &ref_id, &is_rev); + b->hits[j].k = bns_depos(bns, bwt_sa(bwt, p->k), &is_rev); b->hits[j].l = 0; b->hits[j].flag |= 1; b->hits[j].is_rev = is_rev;