code refactoring

This commit is contained in:
Heng Li 2013-02-23 15:55:55 -05:00
parent e613195e17
commit 67543f19a1
4 changed files with 68 additions and 40 deletions

37
bwa.c
View File

@ -94,3 +94,40 @@ ret_gen_cigar:
return cigar; return cigar;
} }
/*********************
* Full index reader *
*********************/
bwaidx_t *bwa_idx_load(const char *prefix, int which)
{
bwaidx_t *idx;
idx = calloc(1, sizeof(bwaidx_t));
if (which & BWA_IDX_BWT) {
char *tmp;
tmp = calloc(strlen(prefix) + 5, 1);
strcat(strcpy(tmp, prefix), ".bwt"); // FM-index
idx->bwt = bwt_restore_bwt(tmp);
strcat(strcpy(tmp, prefix), ".sa"); // partial suffix array (SA)
bwt_restore_sa(tmp, idx->bwt);
free(tmp);
}
if (which & BWA_IDX_BNS) {
idx->bns = bns_restore(prefix);
if (which & BWA_IDX_PAC) {
idx->pac = calloc(idx->bns->l_pac/4+1, 1);
fread(idx->pac, 1, idx->bns->l_pac/4+1, idx->bns->fp_pac); // concatenated 2-bit encoded sequence
}
fclose(idx->bns->fp_pac);
idx->bns->fp_pac = 0;
}
return idx;
}
void bwa_idx_destroy(bwaidx_t *idx)
{
if (idx == 0) return;
if (idx->bwt) bwt_destroy(idx->bwt);
if (idx->bns) bns_destroy(idx->bns);
if (idx->pac) free(idx->pac);
free(idx);
}

16
bwa.h
View File

@ -2,6 +2,19 @@
#define BWA_H_ #define BWA_H_
#include <stdint.h> #include <stdint.h>
#include "bntseq.h"
#include "bwt.h"
#define BWA_IDX_BWT 0x1
#define BWA_IDX_BNS 0x2
#define BWA_IDX_PAC 0x4
#define BWA_IDX_ALL 0x7
typedef struct {
bwt_t *bwt; // FM-index
bntseq_t *bns; // information on the reference sequences
uint8_t *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base
} bwaidx_t;
typedef struct { typedef struct {
int l_seq; int l_seq;
@ -16,6 +29,9 @@ extern "C" {
uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar); uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar);
bwaidx_t *bwa_idx_load(const char *prefix, int which);
void bwa_idx_destroy(bwaidx_t *idx);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif

View File

@ -112,7 +112,6 @@ void smem_set_query(smem_i *itr, int len, const uint8_t *query)
itr->len = len; itr->len = len;
} }
const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width) const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width)
{ {
int i, max, max_i, ori_start; int i, max, max_i, ori_start;

View File

@ -2,8 +2,7 @@
#include <stdio.h> #include <stdio.h>
#include <unistd.h> #include <unistd.h>
#include <stdlib.h> #include <stdlib.h>
#include "bntseq.h" #include "bwa.h"
#include "bwt.h"
#include "bwamem.h" #include "bwamem.h"
#include "kvec.h" #include "kvec.h"
#include "utils.h" #include "utils.h"
@ -15,13 +14,11 @@ extern unsigned char nst_nt4_table[256];
int main_mem(int argc, char *argv[]) int main_mem(int argc, char *argv[])
{ {
mem_opt_t *opt; mem_opt_t *opt;
bwt_t *bwt;
bntseq_t *bns;
int c, n, l; int c, n, l;
gzFile fp, fp2 = 0; gzFile fp, fp2 = 0;
kseq_t *ks, *ks2 = 0; kseq_t *ks, *ks2 = 0;
uint8_t *pac = 0;
bseq1_t *seqs; bseq1_t *seqs;
bwaidx_t *idx;
opt = mem_opt_init(); opt = mem_opt_init();
while ((c = getopt(argc, argv, "PHk:c:v:s:r:t:")) >= 0) { while ((c = getopt(argc, argv, "PHk:c:v:s:r:t:")) >= 0) {
@ -48,19 +45,9 @@ int main_mem(int argc, char *argv[])
return 1; return 1;
} }
mem_fill_scmat(opt->a, opt->b, opt->mat); mem_fill_scmat(opt->a, opt->b, opt->mat);
{ // load the packed sequences, BWT and SA idx = bwa_idx_load(argv[optind], BWA_IDX_ALL);
char *tmp = calloc(strlen(argv[optind]) + 5, 1); for (l = 0; l < idx->bns->n_seqs; ++l)
strcat(strcpy(tmp, argv[optind]), ".bwt"); printf("@SQ\tSN:%s\tLN:%d\n", idx->bns->anns[l].name, idx->bns->anns[l].len);
bwt = bwt_restore_bwt(tmp);
strcat(strcpy(tmp, argv[optind]), ".sa");
bwt_restore_sa(tmp, bwt);
free(tmp);
bns = bns_restore(argv[optind]);
pac = calloc(bns->l_pac/4+1, 1);
fread(pac, 1, bns->l_pac/4+1, bns->fp_pac);
}
for (l = 0; l < bns->n_seqs; ++l)
printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[l].name, bns->anns[l].len);
fp = strcmp(argv[optind + 1], "-")? gzopen(argv[optind + 1], "r") : gzdopen(fileno(stdin), "r"); fp = strcmp(argv[optind + 1], "-")? gzopen(argv[optind + 1], "r") : gzdopen(fileno(stdin), "r");
ks = kseq_init(fp); ks = kseq_init(fp);
@ -70,13 +57,12 @@ int main_mem(int argc, char *argv[])
opt->flag |= MEM_F_PE; opt->flag |= MEM_F_PE;
} }
while ((seqs = bseq_read(opt->chunk_size, &n, ks, ks2)) != 0) { while ((seqs = bseq_read(opt->chunk_size, &n, ks, ks2)) != 0) {
mem_process_seqs(opt, bwt, bns, pac, n, seqs); mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n, seqs);
free(seqs); free(seqs);
} }
free(opt); free(pac); free(opt);
bns_destroy(bns); bwa_idx_destroy(idx);
bwt_destroy(bwt);
kseq_destroy(ks); kseq_destroy(ks);
gzclose(fp); gzclose(fp);
if (ks2) { if (ks2) {
@ -92,10 +78,9 @@ int main_fastmap(int argc, char *argv[])
kseq_t *seq; kseq_t *seq;
bwtint_t k; bwtint_t k;
gzFile fp; gzFile fp;
bwt_t *bwt;
bntseq_t *bns;
smem_i *itr; smem_i *itr;
const bwtintv_v *a; const bwtintv_v *a;
bwaidx_t *idx;
while ((c = getopt(argc, argv, "w:l:ps:")) >= 0) { while ((c = getopt(argc, argv, "w:l:ps:")) >= 0) {
switch (c) { switch (c) {
@ -112,16 +97,8 @@ int main_fastmap(int argc, char *argv[])
fp = gzopen(argv[optind + 1], "r"); fp = gzopen(argv[optind + 1], "r");
seq = kseq_init(fp); seq = kseq_init(fp);
{ // load the packed sequences, BWT and SA idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS);
char *tmp = calloc(strlen(argv[optind]) + 5, 1); itr = smem_itr_init(idx->bwt);
strcat(strcpy(tmp, argv[optind]), ".bwt");
bwt = bwt_restore_bwt(tmp);
strcat(strcpy(tmp, argv[optind]), ".sa");
bwt_restore_sa(tmp, bwt);
free(tmp);
bns = bns_restore(argv[optind]);
}
itr = smem_itr_init(bwt);
while (kseq_read(seq) >= 0) { while (kseq_read(seq) >= 0) {
printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l); printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l);
if (print_seq) { if (print_seq) {
@ -141,10 +118,10 @@ int main_fastmap(int argc, char *argv[])
bwtint_t pos; bwtint_t pos;
int len, is_rev, ref_id; int len, is_rev, ref_id;
len = (uint32_t)p->info - (p->info>>32); len = (uint32_t)p->info - (p->info>>32);
pos = bns_depos(bns, bwt_sa(bwt, p->x[0] + k), &is_rev); pos = bns_depos(idx->bns, bwt_sa(idx->bwt, p->x[0] + k), &is_rev);
if (is_rev) pos -= len - 1; if (is_rev) pos -= len - 1;
bns_cnt_ambi(bns, pos, len, &ref_id); bns_cnt_ambi(idx->bns, pos, len, &ref_id);
printf("\t%s:%c%ld", bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1); printf("\t%s:%c%ld", idx->bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - idx->bns->anns[ref_id].offset) + 1);
} }
} else fputs("\t*", stdout); } else fputs("\t*", stdout);
putchar('\n'); putchar('\n');
@ -154,8 +131,7 @@ int main_fastmap(int argc, char *argv[])
} }
smem_itr_destroy(itr); smem_itr_destroy(itr);
bns_destroy(bns); bwa_idx_destroy(idx);
bwt_destroy(bwt);
kseq_destroy(seq); kseq_destroy(seq);
gzclose(fp); gzclose(fp);
return 0; return 0;