From 1cb409aaf2d4e47069703b8961a5da41d43bfd23 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 21 Oct 2011 12:03:14 -0400 Subject: [PATCH] use forward-only pac to reduce memory --- bntseq.c | 44 ++++++++++++++++++++++++++------------------ bntseq.h | 6 ++++-- bwase.c | 21 ++++++++++----------- bwtindex.c | 12 ++++++++++-- bwtsw2_aux.c | 4 ++-- 5 files changed, 52 insertions(+), 35 deletions(-) diff --git a/bntseq.c b/bntseq.c index a12b95c..15b369f 100644 --- a/bntseq.c +++ b/bntseq.c @@ -286,36 +286,44 @@ int bwa_fa2pac(int argc, char *argv[]) return 0; } -int bns_coor_pac2real(const bntseq_t *bns, int64_t pac_coor, int len, int32_t *real_seq) +int64_t bns_pos2refId(const bntseq_t *bns, int64_t pos, int is_fr, int *ref_id, int *is_rev) { - int left, mid, right, nn; - if (pac_coor >= bns->l_pac) - err_fatal("bns_coor_pac2real", "bug! Coordinate is longer than sequence (%lld>=%lld).", pac_coor, bns->l_pac); - // binary search for the sequence ID. Note that this is a bit different from the following one... + int left, mid, right; + is_fr = is_fr? 1 : 0; left = 0; mid = 0; right = bns->n_seqs; while (left < right) { mid = (left + right) >> 1; - if (pac_coor >= bns->anns[mid].offset) { + if (pos >= bns->anns[mid].offset<n_seqs - 1) break; - if (pac_coor < bns->anns[mid+1].offset) break; + if (pos < bns->anns[mid+1].offset<anns[mid]; + if (pos - (p->offset<<1) < p->len) *is_rev = 0, pos -= p->offset; + else *is_rev = 1, pos = p->len - (pos - (p->offset<<1) - p->len) + p->offset; + } + return pos; +} + +int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id) +{ + int left, mid, right, nn; + if (ref_id) bns_pos2refId(bns, pos_f, 0, ref_id, 0); left = 0; right = bns->n_holes; nn = 0; while (left < right) { - int64_t mid = (left + right) >> 1; - if (pac_coor >= bns->ambs[mid].offset + bns->ambs[mid].len) left = mid + 1; - else if (pac_coor + len <= bns->ambs[mid].offset) right = mid; + mid = (left + right) >> 1; + if (pos_f >= bns->ambs[mid].offset + bns->ambs[mid].len) left = mid + 1; + else if (pos_f + len <= bns->ambs[mid].offset) right = mid; else { // overlap - if (pac_coor >= bns->ambs[mid].offset) { - nn += bns->ambs[mid].offset + bns->ambs[mid].len < pac_coor + len? - bns->ambs[mid].offset + bns->ambs[mid].len - pac_coor : len; + if (pos_f >= bns->ambs[mid].offset) { + nn += bns->ambs[mid].offset + bns->ambs[mid].len < pos_f + len? + bns->ambs[mid].offset + bns->ambs[mid].len - pos_f : len; } else { - nn += bns->ambs[mid].offset + bns->ambs[mid].len < pac_coor + len? - bns->ambs[mid].len : len - (bns->ambs[mid].offset - pac_coor); + nn += bns->ambs[mid].offset + bns->ambs[mid].len < pos_f + len? + bns->ambs[mid].len : len - (bns->ambs[mid].offset - pos_f); } break; } diff --git a/bntseq.h b/bntseq.h index 189e017..276ef64 100644 --- a/bntseq.h +++ b/bntseq.h @@ -70,8 +70,10 @@ extern "C" { bntseq_t *bns_restore(const char *prefix); bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename); void bns_destroy(bntseq_t *bns); - int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix); - int bns_coor_pac2real(const bntseq_t *bns, int64_t pac_coor, int len, int32_t *real_seq); + int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only); + int64_t bns_pos2refId(const bntseq_t *bns, int64_t pos, int is_fr, int *ref_id, int *is_rev); + int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id); + #ifdef __cplusplus } diff --git a/bwase.c b/bwase.c index 8f8d9a8..fda4752 100644 --- a/bwase.c +++ b/bwase.c @@ -111,17 +111,16 @@ int bwa_approx_mapQ(const bwa_seq_t *p, int mm) bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int len, int *strand) { - bwtint_t pacpos; - int32_t ref_id; - pacpos = bwt_sa(bwt, sapos); - bns_coor_pac2real(bns, pacpos, 0, &ref_id); - *strand = !(ref_id&1); + bwtint_t pos_fr, pos_f; + int is_rev, ref_id; + pos_fr = bwt_sa(bwt, sapos); + pos_f = bns_pos2refId(bns, pos_fr, 1, &ref_id, &is_rev); // pos_f + *strand = !is_rev; /* NB: For gapped alignment, pacpos may not be correct, which will be fixed * in refine_gapped_core(). This line also determines the way "x" is * calculated in refine_gapped_core() when (ext < 0 && is_end == 0). */ - if (ref_id&1) // mapped to the forward strand - pacpos = bns->anns[ref_id].len - (pacpos + len - bns->anns[ref_id].offset) + bns->anns[ref_id-1].offset; - return pacpos; + if (is_rev) pos_f = pos_f < len? 0 : pos_f - len; // mapped to the forward strand + return pos_f; // FIXME: it is possible that pos_f < bns->anns[ref_id].offset } /** @@ -423,7 +422,7 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in } else j = pos_end(p) - p->pos; // j is the length of the reference in the alignment // get seqid - nn = bns_coor_pac2real(bns, p->pos, j, &seqid); + nn = bns_cnt_ambi(bns, p->pos, j, &seqid); if (p->type != BWA_TYPE_NO_MATCH && p->pos + j - bns->anns[seqid].offset > bns->anns[seqid].len) flag |= SAM_FSU; // flag UNMAP as this alignment bridges two adjacent reference sequences @@ -450,7 +449,7 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in long long isize; am = mate->seQ < p->seQ? mate->seQ : p->seQ; // smaller single-end mapping quality // redundant calculation here, but should not matter too much - m_is_N = bns_coor_pac2real(bns, mate->pos, mate->len, &m_seqid); + m_is_N = bns_cnt_ambi(bns, mate->pos, mate->len, &m_seqid); err_printf("\t%s\t", (seqid == m_seqid)? "=" : bns->anns[m_seqid].name); isize = (seqid == m_seqid)? pos_5(mate) - pos_5(p) : 0; if (p->type == BWA_TYPE_NO_MATCH) isize = 0; @@ -493,7 +492,7 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in bwt_multi1_t *q = p->multi + i; int k; j = pos_end_multi(q, p->len) - q->pos; - nn = bns_coor_pac2real(bns, q->pos, j, &seqid); + nn = bns_cnt_ambi(bns, q->pos, j, &seqid); err_printf("%s,%c%d,", bns->anns[seqid].name, q->strand? '-' : '+', (int)(q->pos - bns->anns[seqid].offset + 1)); if (q->cigar) { diff --git a/bwtindex.c b/bwtindex.c index 8672687..8d40245 100644 --- a/bwtindex.c +++ b/bwtindex.c @@ -80,7 +80,7 @@ int bwa_index(int argc, char *argv[]) gzFile fp = xzopen(argv[optind], "r"); t = clock(); fprintf(stderr, "[bwa_index] Pack FASTA... "); - l_pac = bns_fasta2bntseq(fp, prefix); + l_pac = bns_fasta2bntseq(fp, prefix, 0); fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); gzclose(fp); } else { // color indexing @@ -88,7 +88,7 @@ int bwa_index(int argc, char *argv[]) strcat(strcpy(str, prefix), ".nt"); t = clock(); fprintf(stderr, "[bwa_index] Pack nucleotide FASTA... "); - l_pac = bns_fasta2bntseq(fp, str); + l_pac = bns_fasta2bntseq(fp, str, 0); fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); gzclose(fp); { @@ -126,6 +126,14 @@ int bwa_index(int argc, char *argv[]) bwt_destroy(bwt); fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); } + { + gzFile fp = xzopen(argv[optind], "r"); + t = clock(); + fprintf(stderr, "[bwa_index] Pack forward-only FASTA... "); + l_pac = bns_fasta2bntseq(fp, prefix, 1); + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); + gzclose(fp); + } { bwt_t *bwt; strcpy(str, prefix); strcat(str, ".bwt"); diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index 8ba6455..fe8b96b 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -314,7 +314,7 @@ static int fix_cigar(const char *qname, const bntseq_t *bns, bsw2hit_t *p, int n // FIXME: this routine does not work if the query bridge three reference sequences int32_t coor, refl, lq; int x, y, i, seqid; - bns_coor_pac2real(bns, p->k, p->len, &seqid); + bns_cnt_ambi(bns, p->k, p->len, &seqid); coor = p->k - bns->anns[seqid].offset; refl = bns->anns[seqid].len; x = coor; y = 0; @@ -404,7 +404,7 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks int beg, end; if (p->l == 0) { b->n_cigar[i] = fix_cigar(ks->name, bns, p, b->n_cigar[i], b->cigar[i]); - nn = bns_coor_pac2real(bns, p->k, p->len, &seqid); + nn = bns_cnt_ambi(bns, p->k, p->len, &seqid); coor = p->k - bns->anns[seqid].offset; } ksprintf(&str, "%s\t%d", ks->name, p->flag&0x10);