From 67543f19a1415c8e9a55f981085a9971771c3cfc Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 23 Feb 2013 15:55:55 -0500 Subject: [PATCH] code refactoring --- bwa.c | 37 +++++++++++++++++++++++++++++++++++++ bwa.h | 16 ++++++++++++++++ bwamem.c | 1 - fastmap.c | 54 +++++++++++++++--------------------------------------- 4 files changed, 68 insertions(+), 40 deletions(-) diff --git a/bwa.c b/bwa.c index eca721e..e8735b5 100644 --- a/bwa.c +++ b/bwa.c @@ -94,3 +94,40 @@ ret_gen_cigar: return cigar; } +/********************* + * Full index reader * + *********************/ + +bwaidx_t *bwa_idx_load(const char *prefix, int which) +{ + bwaidx_t *idx; + idx = calloc(1, sizeof(bwaidx_t)); + if (which & BWA_IDX_BWT) { + char *tmp; + tmp = calloc(strlen(prefix) + 5, 1); + strcat(strcpy(tmp, prefix), ".bwt"); // FM-index + idx->bwt = bwt_restore_bwt(tmp); + strcat(strcpy(tmp, prefix), ".sa"); // partial suffix array (SA) + bwt_restore_sa(tmp, idx->bwt); + free(tmp); + } + if (which & BWA_IDX_BNS) { + idx->bns = bns_restore(prefix); + if (which & BWA_IDX_PAC) { + idx->pac = calloc(idx->bns->l_pac/4+1, 1); + fread(idx->pac, 1, idx->bns->l_pac/4+1, idx->bns->fp_pac); // concatenated 2-bit encoded sequence + } + fclose(idx->bns->fp_pac); + idx->bns->fp_pac = 0; + } + return idx; +} + +void bwa_idx_destroy(bwaidx_t *idx) +{ + if (idx == 0) return; + if (idx->bwt) bwt_destroy(idx->bwt); + if (idx->bns) bns_destroy(idx->bns); + if (idx->pac) free(idx->pac); + free(idx); +} diff --git a/bwa.h b/bwa.h index 022b784..ad528c9 100644 --- a/bwa.h +++ b/bwa.h @@ -2,6 +2,19 @@ #define BWA_H_ #include +#include "bntseq.h" +#include "bwt.h" + +#define BWA_IDX_BWT 0x1 +#define BWA_IDX_BNS 0x2 +#define BWA_IDX_PAC 0x4 +#define BWA_IDX_ALL 0x7 + +typedef struct { + bwt_t *bwt; // FM-index + bntseq_t *bns; // information on the reference sequences + uint8_t *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base +} bwaidx_t; typedef struct { int l_seq; @@ -16,6 +29,9 @@ extern "C" { uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar); + bwaidx_t *bwa_idx_load(const char *prefix, int which); + void bwa_idx_destroy(bwaidx_t *idx); + #ifdef __cplusplus } #endif diff --git a/bwamem.c b/bwamem.c index 43f9f2f..6b219cf 100644 --- a/bwamem.c +++ b/bwamem.c @@ -112,7 +112,6 @@ void smem_set_query(smem_i *itr, int len, const uint8_t *query) itr->len = len; } - const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width) { int i, max, max_i, ori_start; diff --git a/fastmap.c b/fastmap.c index c97d566..2800821 100644 --- a/fastmap.c +++ b/fastmap.c @@ -2,8 +2,7 @@ #include #include #include -#include "bntseq.h" -#include "bwt.h" +#include "bwa.h" #include "bwamem.h" #include "kvec.h" #include "utils.h" @@ -15,13 +14,11 @@ extern unsigned char nst_nt4_table[256]; int main_mem(int argc, char *argv[]) { mem_opt_t *opt; - bwt_t *bwt; - bntseq_t *bns; int c, n, l; gzFile fp, fp2 = 0; kseq_t *ks, *ks2 = 0; - uint8_t *pac = 0; bseq1_t *seqs; + bwaidx_t *idx; opt = mem_opt_init(); while ((c = getopt(argc, argv, "PHk:c:v:s:r:t:")) >= 0) { @@ -48,19 +45,9 @@ int main_mem(int argc, char *argv[]) return 1; } mem_fill_scmat(opt->a, opt->b, opt->mat); - { // load the packed sequences, BWT and SA - char *tmp = calloc(strlen(argv[optind]) + 5, 1); - strcat(strcpy(tmp, argv[optind]), ".bwt"); - bwt = bwt_restore_bwt(tmp); - strcat(strcpy(tmp, argv[optind]), ".sa"); - bwt_restore_sa(tmp, bwt); - free(tmp); - bns = bns_restore(argv[optind]); - pac = calloc(bns->l_pac/4+1, 1); - fread(pac, 1, bns->l_pac/4+1, bns->fp_pac); - } - for (l = 0; l < bns->n_seqs; ++l) - printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[l].name, bns->anns[l].len); + idx = bwa_idx_load(argv[optind], BWA_IDX_ALL); + for (l = 0; l < idx->bns->n_seqs; ++l) + printf("@SQ\tSN:%s\tLN:%d\n", idx->bns->anns[l].name, idx->bns->anns[l].len); fp = strcmp(argv[optind + 1], "-")? gzopen(argv[optind + 1], "r") : gzdopen(fileno(stdin), "r"); ks = kseq_init(fp); @@ -70,13 +57,12 @@ int main_mem(int argc, char *argv[]) opt->flag |= MEM_F_PE; } while ((seqs = bseq_read(opt->chunk_size, &n, ks, ks2)) != 0) { - mem_process_seqs(opt, bwt, bns, pac, n, seqs); + mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n, seqs); free(seqs); } - free(opt); free(pac); - bns_destroy(bns); - bwt_destroy(bwt); + free(opt); + bwa_idx_destroy(idx); kseq_destroy(ks); gzclose(fp); if (ks2) { @@ -92,10 +78,9 @@ int main_fastmap(int argc, char *argv[]) kseq_t *seq; bwtint_t k; gzFile fp; - bwt_t *bwt; - bntseq_t *bns; smem_i *itr; const bwtintv_v *a; + bwaidx_t *idx; while ((c = getopt(argc, argv, "w:l:ps:")) >= 0) { switch (c) { @@ -112,16 +97,8 @@ int main_fastmap(int argc, char *argv[]) fp = gzopen(argv[optind + 1], "r"); seq = kseq_init(fp); - { // load the packed sequences, BWT and SA - char *tmp = calloc(strlen(argv[optind]) + 5, 1); - strcat(strcpy(tmp, argv[optind]), ".bwt"); - bwt = bwt_restore_bwt(tmp); - strcat(strcpy(tmp, argv[optind]), ".sa"); - bwt_restore_sa(tmp, bwt); - free(tmp); - bns = bns_restore(argv[optind]); - } - itr = smem_itr_init(bwt); + idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS); + itr = smem_itr_init(idx->bwt); while (kseq_read(seq) >= 0) { printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l); if (print_seq) { @@ -141,10 +118,10 @@ int main_fastmap(int argc, char *argv[]) bwtint_t pos; int len, is_rev, ref_id; len = (uint32_t)p->info - (p->info>>32); - pos = bns_depos(bns, bwt_sa(bwt, p->x[0] + k), &is_rev); + pos = bns_depos(idx->bns, bwt_sa(idx->bwt, p->x[0] + k), &is_rev); if (is_rev) pos -= len - 1; - bns_cnt_ambi(bns, pos, len, &ref_id); - printf("\t%s:%c%ld", bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1); + bns_cnt_ambi(idx->bns, pos, len, &ref_id); + printf("\t%s:%c%ld", idx->bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - idx->bns->anns[ref_id].offset) + 1); } } else fputs("\t*", stdout); putchar('\n'); @@ -154,8 +131,7 @@ int main_fastmap(int argc, char *argv[]) } smem_itr_destroy(itr); - bns_destroy(bns); - bwt_destroy(bwt); + bwa_idx_destroy(idx); kseq_destroy(seq); gzclose(fp); return 0;