From a1abfe9977f5d193064518d5c8ddcd3d1e81dcb0 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 7 Apr 2012 00:23:01 -0400 Subject: [PATCH] API: aln seems working --- Makefile | 25 ++++++----- bwa.c | 126 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ bwa.h | 44 +++++++++++++++++++ bwtaln.c | 2 +- bwtgap.c | 17 ++++---- bwtgap.h | 1 + 6 files changed, 195 insertions(+), 20 deletions(-) create mode 100644 bwa.c create mode 100644 bwa.h diff --git a/Makefile b/Makefile index 1fe8462..b39bd27 100644 --- a/Makefile +++ b/Makefile @@ -2,16 +2,19 @@ CC= gcc CXX= g++ CFLAGS= -g -Wall -O2 CXXFLAGS= $(CFLAGS) +AR= ar DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64 -OBJS= QSufSort.o bwt_gen.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o \ - is.o bntseq.o bwtmisc.o bwtindex.o ksw.o stdaln.o simple_dp.o \ - bwaseqio.o bwase.o bwape.o kstring.o cs2nt.o \ +LOBJS= bwa.o bamlite.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o bntseq.o stdaln.o \ + bwaseqio.o +AOBJS= QSufSort.o bwt_gen.o \ + is.o bwtmisc.o bwtindex.o ksw.o simple_dp.o \ + bwase.o bwape.o kstring.o cs2nt.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ - bwtsw2_chain.o bamlite.o fastmap.o bwtsw2_pair.o + bwtsw2_chain.o fastmap.o bwtsw2_pair.o PROG= bwa INCLUDES= LIBS= -lm -lz -lpthread -SUBDIRS= . bwt_gen +SUBDIRS= . .SUFFIXES:.c .o .cc @@ -22,19 +25,21 @@ SUBDIRS= . bwt_gen all:$(PROG) -bwa:$(OBJS) main.o - $(CC) $(CFLAGS) $(DFLAGS) $(OBJS) main.o -o $@ $(LIBS) +bwa:libbwa.a $(AOBJS) main.o + $(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS) + +libbwa.a:$(LOBJS) + $(AR) -csru $@ $(LOBJS) + +bwa.o:bwa.h QSufSort.o:QSufSort.h bwt.o:bwt.h bwtio.o:bwt.h bwtaln.o:bwt.h bwtaln.h kseq.h -bwt1away.o:bwt.h bwtaln.h -bwt2fmv.o:bwt.h bntseq.o:bntseq.h bwtgap.o:bwtgap.h bwtaln.h bwt.h -fastmap:bwt.h bwtsw2_core.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h bwtsw2_aux.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h diff --git a/bwa.c b/bwa.c new file mode 100644 index 0000000..ab0afea --- /dev/null +++ b/bwa.c @@ -0,0 +1,126 @@ +#include +#include +#include +#include "bwa.h" +#include "bwt.h" +#include "bwtgap.h" +#include "bntseq.h" + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +extern unsigned char nst_nt4_table[256]; + +bwa_opt_t bwa_def_opt = { 11, 4, -1, 1, 6, 32, 2, 0.04 }; + +struct bwa_idx_t { + bwt_t *bwt; + bntseq_t *bns; + uint8_t *pac; +}; + +struct bwa_aux_t { + int max_buf; + gap_stack_t *stack; + gap_opt_t *opt; + int *diff_tab; + uint8_t *buf; +}; + +bwa_idx_t *bwa_idx_load(const char *prefix) +{ + bwa_idx_t *p; + int l; + char *str; + l = strlen(prefix); + p = calloc(1, sizeof(bwa_idx_t)); + str = malloc(l + 10); + strcpy(str, prefix); + p->bns = bns_restore(str); + strcpy(str + l, ".bwt"); + p->bwt = bwt_restore_bwt(str); + str[l] = 0; + strcpy(str + l, ".sa"); + bwt_restore_sa(str, p->bwt); + free(str); + p->pac = calloc(p->bns->l_pac/4+1, 1); + fread(p->pac, 1, p->bns->l_pac/4+1, p->bns->fp_pac); + fclose(p->bns->fp_pac); + p->bns->fp_pac = 0; + return p; +} + +void bwa_idx_destroy(bwa_idx_t *p) +{ + bns_destroy(p->bns); + bwt_destroy(p->bwt); + free(p->pac); + free(p); +} + +bwa_aux_t *bwa_aux_init(const bwa_opt_t *opt, int max_score) +{ + extern gap_opt_t *gap_init_opt(void); + extern int bwa_cal_maxdiff(int l, double err, double thres); + int i; + bwa_aux_t *p; + p = malloc(sizeof(bwa_aux_t)); + p->stack = gap_init_stack2(max_score); + p->opt = gap_init_opt(); + p->opt->s_gapo = opt->s_gapo; + p->opt->s_gape = opt->s_gape; + p->opt->max_diff = opt->max_diff; + p->opt->max_gapo = opt->max_gapo; + p->opt->max_gape = opt->max_gape; + p->opt->seed_len = opt->seed_len; + p->opt->max_seed_diff = opt->max_seed_diff; + p->opt->fnr = opt->fnr; + p->diff_tab = calloc(BWA_MAX_QUERY_LEN, sizeof(int)); + for (i = 1; i < BWA_MAX_QUERY_LEN; ++i) + p->diff_tab[i] = bwa_cal_maxdiff(i, BWA_AVG_ERR, opt->fnr); + return p; +} + +void bwa_aux_destroy(bwa_aux_t *p) +{ + gap_destroy_stack(p->stack); + free(p->diff_tab); free(p->opt); + free(p); +} + +bwa_alnpre_t *bwa_aln_pre(const bwa_idx_t *idx, bwa_aux_t *aux, const char *seq, int *n_aln) +{ + extern int bwt_cal_width(const bwt_t *bwt, int len, const ubyte_t *str, bwt_width_t *width); + extern void seq_reverse(int len, uint8_t *seq, int is_comp); + int i, seq_len, buf_len; + bwt_width_t *w, *seed_w; + uint8_t *s; + gap_opt_t opt2 = *aux->opt; + + seq_len = strlen(seq); + // estimate the buffer length + buf_len = (aux->opt->seed_len + seq_len + 1) * sizeof(bwt_width_t) + seq_len; + if (buf_len > aux->max_buf) { + aux->max_buf = buf_len; + kroundup32(aux->max_buf); + aux->buf = realloc(aux->buf, aux->max_buf); + } + memset(aux->buf, 0, buf_len); + seed_w = (bwt_width_t*)aux->buf; + w = seed_w + aux->opt->seed_len; + s = (uint8_t*)(w + seq_len + 1); + if (opt2.fnr > 0.) opt2.max_diff = aux->diff_tab[seq_len]; + // copy the sequence + for (i = 0; i < seq_len; ++i) + s[i] = nst_nt4_table[(int)seq[i]]; + seq_reverse(seq_len, s, 0); + // mapping + bwt_cal_width(idx->bwt, seq_len, s, w); + if (opt2.seed_len >= seq_len) opt2.seed_len = 0x7fffffff; + if (seq_len > aux->opt->seed_len) + bwt_cal_width(idx->bwt, aux->opt->seed_len, s + (seq_len - aux->opt->seed_len), seed_w); + for (i = 0; i < seq_len; ++i) // complement; I forgot why... + s[i] = s[i] > 3? 4 : 3 - s[i]; + return (bwa_alnpre_t*)bwt_match_gap(idx->bwt, seq_len, s, w, seq_len <= aux->opt->seed_len? 0 : seed_w, &opt2, n_aln, aux->stack); +} diff --git a/bwa.h b/bwa.h new file mode 100644 index 0000000..a69f9d2 --- /dev/null +++ b/bwa.h @@ -0,0 +1,44 @@ +#ifndef BWA_H_ +#define BWA_H_ + +#include + +#define BWA_DEF_MAX_SCORE 2048 +#define BWA_MAX_QUERY_LEN 1024 + +struct bwa_idx_t; +typedef struct bwa_idx_t bwa_idx_t; + +struct bwa_aux_t; +typedef struct bwa_aux_t bwa_aux_t; + +typedef struct { + int s_gapo, s_gape; // the mismatch penalty is fixed at 3 + int max_diff, max_gapo, max_gape; + int seed_len, max_seed_diff; + float fnr; +} bwa_opt_t; + +typedef struct { + uint32_t n_mm:16, n_gapo:8, n_gape:8; + int score; + uint64_t k, l; +} bwa_alnpre_t; + +extern bwa_opt_t bwa_def_opt; + +#ifdef __cplusplus +extern "C" { +#endif + + bwa_idx_t *bwa_idx_load(const char *prefix); + void bwa_idx_destroy(bwa_idx_t *p); + bwa_aux_t *bwa_aux_init(const bwa_opt_t *opt, int max_score); + void bwa_aux_destroy(bwa_aux_t *p); + bwa_alnpre_t *bwa_aln_pre(const bwa_idx_t *idx, bwa_aux_t *aux, const char *seq, int *n_aln); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/bwtaln.c b/bwtaln.c index 014734c..efc7f66 100644 --- a/bwtaln.c +++ b/bwtaln.c @@ -49,7 +49,7 @@ int bwa_cal_maxdiff(int l, double err, double thres) } // width must be filled as zero -static int bwt_cal_width(const bwt_t *bwt, int len, const ubyte_t *str, bwt_width_t *width) +int bwt_cal_width(const bwt_t *bwt, int len, const ubyte_t *str, bwt_width_t *width) { bwtint_t k, l, ok, ol; int i, bid; diff --git a/bwtgap.c b/bwtgap.c index c996f9f..364717c 100644 --- a/bwtgap.c +++ b/bwtgap.c @@ -10,21 +10,20 @@ #define aln_score(m,o,e,p) ((m)*(p)->s_mm + (o)*(p)->s_gapo + (e)*(p)->s_gape) -gap_stack_t *gap_init_stack(int max_mm, int max_gapo, int max_gape, const gap_opt_t *opt) +gap_stack_t *gap_init_stack2(int max_score) { - int i; gap_stack_t *stack; stack = (gap_stack_t*)calloc(1, sizeof(gap_stack_t)); - stack->n_stacks = aln_score(max_mm+1, max_gapo+1, max_gape+1, opt); + stack->n_stacks = max_score; stack->stacks = (gap_stack1_t*)calloc(stack->n_stacks, sizeof(gap_stack1_t)); - for (i = 0; i != stack->n_stacks; ++i) { - gap_stack1_t *p = stack->stacks + i; - p->m_entries = 4; - p->stack = (gap_entry_t*)calloc(p->m_entries, sizeof(gap_entry_t)); - } return stack; } +gap_stack_t *gap_init_stack(int max_mm, int max_gapo, int max_gape, const gap_opt_t *opt) +{ + return gap_init_stack2(aln_score(max_mm+1, max_gapo+1, max_gape+1, opt)); +} + void gap_destroy_stack(gap_stack_t *stack) { int i; @@ -51,7 +50,7 @@ static inline void gap_push(gap_stack_t *stack, int i, bwtint_t k, bwtint_t l, i score = aln_score(n_mm, n_gapo, n_gape, opt); q = stack->stacks + score; if (q->n_entries == q->m_entries) { - q->m_entries <<= 1; + q->m_entries = q->m_entries? q->m_entries<<1 : 4; q->stack = (gap_entry_t*)realloc(q->stack, sizeof(gap_entry_t) * q->m_entries); } p = q->stack + q->n_entries; diff --git a/bwtgap.h b/bwtgap.h index 01ee359..8398762 100644 --- a/bwtgap.h +++ b/bwtgap.h @@ -25,6 +25,7 @@ typedef struct { extern "C" { #endif + gap_stack_t *gap_init_stack2(int max_score); gap_stack_t *gap_init_stack(int max_mm, int max_gapo, int max_gape, const gap_opt_t *opt); void gap_destroy_stack(gap_stack_t *stack); bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_width_t *w,