code refactoring

This commit is contained in:
Heng Li 2013-02-23 15:55:55 -05:00
parent e613195e17
commit 67543f19a1
4 changed files with 68 additions and 40 deletions

37
bwa.c
View File

@ -94,3 +94,40 @@ ret_gen_cigar:
return cigar;
}
/*********************
* Full index reader *
*********************/
bwaidx_t *bwa_idx_load(const char *prefix, int which)
{
bwaidx_t *idx;
idx = calloc(1, sizeof(bwaidx_t));
if (which & BWA_IDX_BWT) {
char *tmp;
tmp = calloc(strlen(prefix) + 5, 1);
strcat(strcpy(tmp, prefix), ".bwt"); // FM-index
idx->bwt = bwt_restore_bwt(tmp);
strcat(strcpy(tmp, prefix), ".sa"); // partial suffix array (SA)
bwt_restore_sa(tmp, idx->bwt);
free(tmp);
}
if (which & BWA_IDX_BNS) {
idx->bns = bns_restore(prefix);
if (which & BWA_IDX_PAC) {
idx->pac = calloc(idx->bns->l_pac/4+1, 1);
fread(idx->pac, 1, idx->bns->l_pac/4+1, idx->bns->fp_pac); // concatenated 2-bit encoded sequence
}
fclose(idx->bns->fp_pac);
idx->bns->fp_pac = 0;
}
return idx;
}
void bwa_idx_destroy(bwaidx_t *idx)
{
if (idx == 0) return;
if (idx->bwt) bwt_destroy(idx->bwt);
if (idx->bns) bns_destroy(idx->bns);
if (idx->pac) free(idx->pac);
free(idx);
}

16
bwa.h
View File

@ -2,6 +2,19 @@
#define BWA_H_
#include <stdint.h>
#include "bntseq.h"
#include "bwt.h"
#define BWA_IDX_BWT 0x1
#define BWA_IDX_BNS 0x2
#define BWA_IDX_PAC 0x4
#define BWA_IDX_ALL 0x7
typedef struct {
bwt_t *bwt; // FM-index
bntseq_t *bns; // information on the reference sequences
uint8_t *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base
} bwaidx_t;
typedef struct {
int l_seq;
@ -16,6 +29,9 @@ extern "C" {
uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar);
bwaidx_t *bwa_idx_load(const char *prefix, int which);
void bwa_idx_destroy(bwaidx_t *idx);
#ifdef __cplusplus
}
#endif

View File

@ -112,7 +112,6 @@ void smem_set_query(smem_i *itr, int len, const uint8_t *query)
itr->len = len;
}
const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width)
{
int i, max, max_i, ori_start;

View File

@ -2,8 +2,7 @@
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include "bntseq.h"
#include "bwt.h"
#include "bwa.h"
#include "bwamem.h"
#include "kvec.h"
#include "utils.h"
@ -15,13 +14,11 @@ extern unsigned char nst_nt4_table[256];
int main_mem(int argc, char *argv[])
{
mem_opt_t *opt;
bwt_t *bwt;
bntseq_t *bns;
int c, n, l;
gzFile fp, fp2 = 0;
kseq_t *ks, *ks2 = 0;
uint8_t *pac = 0;
bseq1_t *seqs;
bwaidx_t *idx;
opt = mem_opt_init();
while ((c = getopt(argc, argv, "PHk:c:v:s:r:t:")) >= 0) {
@ -48,19 +45,9 @@ int main_mem(int argc, char *argv[])
return 1;
}
mem_fill_scmat(opt->a, opt->b, opt->mat);
{ // load the packed sequences, BWT and SA
char *tmp = calloc(strlen(argv[optind]) + 5, 1);
strcat(strcpy(tmp, argv[optind]), ".bwt");
bwt = bwt_restore_bwt(tmp);
strcat(strcpy(tmp, argv[optind]), ".sa");
bwt_restore_sa(tmp, bwt);
free(tmp);
bns = bns_restore(argv[optind]);
pac = calloc(bns->l_pac/4+1, 1);
fread(pac, 1, bns->l_pac/4+1, bns->fp_pac);
}
for (l = 0; l < bns->n_seqs; ++l)
printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[l].name, bns->anns[l].len);
idx = bwa_idx_load(argv[optind], BWA_IDX_ALL);
for (l = 0; l < idx->bns->n_seqs; ++l)
printf("@SQ\tSN:%s\tLN:%d\n", idx->bns->anns[l].name, idx->bns->anns[l].len);
fp = strcmp(argv[optind + 1], "-")? gzopen(argv[optind + 1], "r") : gzdopen(fileno(stdin), "r");
ks = kseq_init(fp);
@ -70,13 +57,12 @@ int main_mem(int argc, char *argv[])
opt->flag |= MEM_F_PE;
}
while ((seqs = bseq_read(opt->chunk_size, &n, ks, ks2)) != 0) {
mem_process_seqs(opt, bwt, bns, pac, n, seqs);
mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n, seqs);
free(seqs);
}
free(opt); free(pac);
bns_destroy(bns);
bwt_destroy(bwt);
free(opt);
bwa_idx_destroy(idx);
kseq_destroy(ks);
gzclose(fp);
if (ks2) {
@ -92,10 +78,9 @@ int main_fastmap(int argc, char *argv[])
kseq_t *seq;
bwtint_t k;
gzFile fp;
bwt_t *bwt;
bntseq_t *bns;
smem_i *itr;
const bwtintv_v *a;
bwaidx_t *idx;
while ((c = getopt(argc, argv, "w:l:ps:")) >= 0) {
switch (c) {
@ -112,16 +97,8 @@ int main_fastmap(int argc, char *argv[])
fp = gzopen(argv[optind + 1], "r");
seq = kseq_init(fp);
{ // load the packed sequences, BWT and SA
char *tmp = calloc(strlen(argv[optind]) + 5, 1);
strcat(strcpy(tmp, argv[optind]), ".bwt");
bwt = bwt_restore_bwt(tmp);
strcat(strcpy(tmp, argv[optind]), ".sa");
bwt_restore_sa(tmp, bwt);
free(tmp);
bns = bns_restore(argv[optind]);
}
itr = smem_itr_init(bwt);
idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS);
itr = smem_itr_init(idx->bwt);
while (kseq_read(seq) >= 0) {
printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l);
if (print_seq) {
@ -141,10 +118,10 @@ int main_fastmap(int argc, char *argv[])
bwtint_t pos;
int len, is_rev, ref_id;
len = (uint32_t)p->info - (p->info>>32);
pos = bns_depos(bns, bwt_sa(bwt, p->x[0] + k), &is_rev);
pos = bns_depos(idx->bns, bwt_sa(idx->bwt, p->x[0] + k), &is_rev);
if (is_rev) pos -= len - 1;
bns_cnt_ambi(bns, pos, len, &ref_id);
printf("\t%s:%c%ld", bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1);
bns_cnt_ambi(idx->bns, pos, len, &ref_id);
printf("\t%s:%c%ld", idx->bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - idx->bns->anns[ref_id].offset) + 1);
}
} else fputs("\t*", stdout);
putchar('\n');
@ -154,8 +131,7 @@ int main_fastmap(int argc, char *argv[])
}
smem_itr_destroy(itr);
bns_destroy(bns);
bwt_destroy(bwt);
bwa_idx_destroy(idx);
kseq_destroy(seq);
gzclose(fp);
return 0;