From 080726cb4746db94def2e9cf823a9e317a0edb23 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 7 Apr 2012 22:50:07 -0400 Subject: [PATCH] preliminary doc --- bwa.c | 68 ++++++++++++++++++++++++++------------------------ bwa.h | 76 +++++++++++++++++++++++++++++++++++++++++--------------- bwtaln.h | 2 +- 3 files changed, 92 insertions(+), 54 deletions(-) diff --git a/bwa.c b/bwa.c index 6f3dbd4..57b9a9a 100644 --- a/bwa.c +++ b/bwa.c @@ -21,7 +21,7 @@ struct bwa_idx_t { uint8_t *pac; }; -struct bwa_aux_t { +struct bwa_buf_t { int max_buf; gap_stack_t *stack; gap_opt_t *opt; @@ -60,13 +60,13 @@ void bwa_idx_destroy(bwa_idx_t *p) free(p); } -bwa_aux_t *bwa_aux_init(const bwa_opt_t *opt, int max_score) +bwa_buf_t *bwa_buf_init(const bwa_opt_t *opt, int max_score) { extern gap_opt_t *gap_init_opt(void); extern int bwa_cal_maxdiff(int l, double err, double thres); int i; - bwa_aux_t *p; - p = malloc(sizeof(bwa_aux_t)); + bwa_buf_t *p; + p = malloc(sizeof(bwa_buf_t)); p->stack = gap_init_stack2(max_score); p->opt = gap_init_opt(); p->opt->s_gapo = opt->s_gapo; @@ -83,34 +83,35 @@ bwa_aux_t *bwa_aux_init(const bwa_opt_t *opt, int max_score) return p; } -void bwa_aux_destroy(bwa_aux_t *p) +void bwa_buf_destroy(bwa_buf_t *p) { gap_destroy_stack(p->stack); free(p->diff_tab); free(p->opt); free(p); } -bwa_alnpre_t *bwa_aln_pre(const bwa_idx_t *idx, bwa_aux_t *aux, const char *seq, int *n_aln) +bwa_sai_t bwa_sai(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq) { extern int bwt_cal_width(const bwt_t *bwt, int len, const ubyte_t *str, bwt_width_t *width); int i, seq_len, buf_len; bwt_width_t *w, *seed_w; uint8_t *s; - gap_opt_t opt2 = *aux->opt; + gap_opt_t opt2 = *buf->opt; + bwa_sai_t sai; seq_len = strlen(seq); // estimate the buffer length - buf_len = (aux->opt->seed_len + seq_len + 1) * sizeof(bwt_width_t) + seq_len; - if (buf_len > aux->max_buf) { - aux->max_buf = buf_len; - kroundup32(aux->max_buf); - aux->buf = realloc(aux->buf, aux->max_buf); + buf_len = (buf->opt->seed_len + seq_len + 1) * sizeof(bwt_width_t) + seq_len; + if (buf_len > buf->max_buf) { + buf->max_buf = buf_len; + kroundup32(buf->max_buf); + buf->buf = realloc(buf->buf, buf->max_buf); } - memset(aux->buf, 0, buf_len); - seed_w = (bwt_width_t*)aux->buf; - w = seed_w + aux->opt->seed_len; + memset(buf->buf, 0, buf_len); + seed_w = (bwt_width_t*)buf->buf; + w = seed_w + buf->opt->seed_len; s = (uint8_t*)(w + seq_len + 1); - if (opt2.fnr > 0.) opt2.max_diff = aux->diff_tab[seq_len]; + if (opt2.fnr > 0.) opt2.max_diff = buf->diff_tab[seq_len]; // copy the sequence for (i = 0; i < seq_len; ++i) s[i] = nst_nt4_table[(int)seq[i]]; @@ -118,11 +119,12 @@ bwa_alnpre_t *bwa_aln_pre(const bwa_idx_t *idx, bwa_aux_t *aux, const char *seq, // mapping bwt_cal_width(idx->bwt, seq_len, s, w); if (opt2.seed_len >= seq_len) opt2.seed_len = 0x7fffffff; - if (seq_len > aux->opt->seed_len) - bwt_cal_width(idx->bwt, aux->opt->seed_len, s + (seq_len - aux->opt->seed_len), seed_w); + if (seq_len > buf->opt->seed_len) + bwt_cal_width(idx->bwt, buf->opt->seed_len, s + (seq_len - buf->opt->seed_len), seed_w); for (i = 0; i < seq_len; ++i) // complement; I forgot why... s[i] = s[i] > 3? 4 : 3 - s[i]; - return (bwa_alnpre_t*)bwt_match_gap(idx->bwt, seq_len, s, w, seq_len <= aux->opt->seed_len? 0 : seed_w, &opt2, n_aln, aux->stack); + sai.sai = (bwa_sai1_t*)bwt_match_gap(idx->bwt, seq_len, s, w, seq_len <= buf->opt->seed_len? 0 : seed_w, &opt2, &sai.n, buf->stack); + return sai; } static void compute_NM(const uint8_t *pac, uint64_t l_pac, uint8_t *seq, int64_t pos, int n_cigar, uint32_t *cigar, int *n_mm, int *n_gaps) @@ -145,36 +147,36 @@ static void compute_NM(const uint8_t *pac, uint64_t l_pac, uint8_t *seq, int64_t } } -bwa_aln_t bwa_sa2aln(const bwa_idx_t *idx, bwa_aux_t *aux, const char *seq, uint64_t sa, int n_gaps) +bwa_aln_t bwa_sa2aln(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, uint64_t sa, int n_gaps) { extern bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int len, int *strand); extern bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const uint8_t *seq, bwtint_t *_pos, int ext, int *n_cigar, int is_end_correct); int strand, seq_len, i, n_gap, n_mm; - uint64_t pos3; + uint64_t pos3, pac_pos; uint8_t *s[2]; bwa_aln_t aln; memset(&aln, 0, sizeof(bwa_aln_t)); seq_len = strlen(seq); - if (seq_len<<1 > aux->max_buf) { - aux->max_buf = seq_len<<1; - kroundup32(aux->max_buf); - aux->buf = realloc(aux->buf, aux->max_buf); + if (seq_len<<1 > buf->max_buf) { + buf->max_buf = seq_len<<1; + kroundup32(buf->max_buf); + buf->buf = realloc(buf->buf, buf->max_buf); } - s[0] = aux->buf; + s[0] = buf->buf; s[1] = s[0] + seq_len; for (i = 0; i < seq_len; ++i) s[0][i] = s[1][i] = nst_nt4_table[(int)seq[i]]; seq_reverse(seq_len, s[1], 1); - aln.pac_pos = bwa_sa2pos(idx->bns, idx->bwt, sa, seq_len, &strand); + pac_pos = bwa_sa2pos(idx->bns, idx->bwt, sa, seq_len, &strand); if (strand) aln.flag |= 16; if (n_gaps) { // only for gapped alignment int n_cigar; bwa_cigar_t *cigar16; - cigar16 = bwa_refine_gapped_core(idx->bns->l_pac, idx->pac, seq_len, s[strand], &aln.pac_pos, strand? n_gaps : -n_gaps, &n_cigar, 1); + cigar16 = bwa_refine_gapped_core(idx->bns->l_pac, idx->pac, seq_len, s[strand], &pac_pos, strand? n_gaps : -n_gaps, &n_cigar, 1); aln.n_cigar = n_cigar; aln.cigar = malloc(n_cigar * 4); - for (i = 0, pos3 = aln.pac_pos; i < n_cigar; ++i) { + for (i = 0, pos3 = pac_pos; i < n_cigar; ++i) { int op = cigar16[i]>>14; int len = cigar16[i]&0x3fff; if (op == 3) op = 4; // the 16-bit CIGAR is different from the 32-bit CIGAR @@ -186,13 +188,13 @@ bwa_aln_t bwa_sa2aln(const bwa_idx_t *idx, bwa_aux_t *aux, const char *seq, uint aln.n_cigar = 1; aln.cigar = malloc(4); aln.cigar[0] = seq_len<<4 | 0; - pos3 = aln.pac_pos + seq_len; + pos3 = pac_pos + seq_len; } - aln.n_n = bns_cnt_ambi(idx->bns, aln.pac_pos, pos3 - aln.pac_pos, &aln.ref_id); - aln.offset = aln.pac_pos - idx->bns->anns[aln.ref_id].offset; + aln.n_n = bns_cnt_ambi(idx->bns, pac_pos, pos3 - pac_pos, &aln.ref_id); + aln.offset = pac_pos - idx->bns->anns[aln.ref_id].offset; if (pos3 - idx->bns->anns[aln.ref_id].offset > idx->bns->anns[aln.ref_id].len) // read mapped beyond the end of a sequence aln.flag |= 4; // read unmapped - compute_NM(idx->pac, idx->bns->l_pac, s[strand], aln.pac_pos, aln.n_cigar, aln.cigar, &n_mm, &n_gap); + compute_NM(idx->pac, idx->bns->l_pac, s[strand], pac_pos, aln.n_cigar, aln.cigar, &n_mm, &n_gap); aln.n_mm = n_mm; aln.n_gap = n_gap; return aln; diff --git a/bwa.h b/bwa.h index 303c983..964b832 100644 --- a/bwa.h +++ b/bwa.h @@ -6,46 +6,82 @@ #define BWA_DEF_MAX_SCORE 2048 #define BWA_MAX_QUERY_LEN 1024 +// BWA index struct bwa_idx_t; typedef struct bwa_idx_t bwa_idx_t; -struct bwa_aux_t; -typedef struct bwa_aux_t bwa_aux_t; +// Buffer for BWA alignment +struct bwa_buf_t; +typedef struct bwa_buf_t bwa_buf_t; +// BWA alignment options typedef struct { - int s_gapo, s_gape; // the mismatch penalty is fixed at 3 - int max_diff, max_gapo, max_gape; - int seed_len, max_seed_diff; - float fnr; + int s_gapo, s_gape; // gap open and extension penalties; the mismatch penalty is fixed at 3 + int max_diff, max_gapo, max_gape; // max differences (-1 to use fnr for length-adjusted max diff), gap opens and gap extensions + int seed_len, max_seed_diff; // seed length and max differences allowed in the seed + float fnr; // parameter for automatic length-adjusted max differences } bwa_opt_t; +// default BWA alignment options +extern bwa_opt_t bwa_def_opt; // = { 11, 4, -1, 1, 6, 32, 2, 0.04 } + +// an interval hit in the SA coordinate; basic unit in .sai files typedef struct { uint32_t n_mm:16, n_gapo:8, n_gape:8; int score; - uint64_t k, l; -} bwa_alnpre_t; + uint64_t k, l; // [k,l] is the SA interval; each interval has l-k+1 hits +} bwa_sai1_t; +// all interval hits in the SA coordinate typedef struct { - uint32_t n_n:8, n_gap:12, n_mm:12; - int32_t ref_id; - uint32_t offset; - uint32_t n_cigar:16, flag:16; - uint64_t pac_pos; - uint32_t *cigar; -} bwa_aln_t; + int n; // number of interval hits + bwa_sai1_t *sai; +} bwa_sai_t; -extern bwa_opt_t bwa_def_opt; +// an alignment +typedef struct { + uint32_t n_n:8, n_gap:12, n_mm:12; // number of ambiguous bases, gaps and mismatches in the alignment + int32_t ref_id; // referece sequence index (the first seq is indexed by 0) + uint32_t offset; // coordinate on the reference; zero-based + uint32_t n_cigar:16, flag:16; // number of CIGAR operations; SAM flag + uint32_t *cigar; // CIGAR in the BAM 28+4 encoding; having n_cigar operations +} bwa_aln_t; #ifdef __cplusplus extern "C" { #endif + // load a BWA index bwa_idx_t *bwa_idx_load(const char *prefix); void bwa_idx_destroy(bwa_idx_t *p); - bwa_aux_t *bwa_aux_init(const bwa_opt_t *opt, int max_score); - void bwa_aux_destroy(bwa_aux_t *p); - bwa_alnpre_t *bwa_aln_pre(const bwa_idx_t *idx, bwa_aux_t *aux, const char *seq, int *n_aln); - bwa_aln_t bwa_sa2aln(const bwa_idx_t *idx, bwa_aux_t *aux, const char *seq, uint64_t sa, int n_gaps); + + // allocate a BWA alignment buffer; if unsure, set opt to &bwa_def_opt and max_score to BWA_DEF_MAX_SCORE + bwa_buf_t *bwa_buf_init(const bwa_opt_t *opt, int max_score); + void bwa_buf_destroy(bwa_buf_t *p); + + /** + * Find all the SA intervals + * + * @param idx BWA index; multiple threads can share the same index + * @param buf BWA alignment buffer; each thread should have its own buffer + * @param seq NULL terminated C string, consisting of A/C/G/T/N only + * + * @return SA intervals seq is matched to + */ + bwa_sai_t bwa_sai(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq); + + /** + * Construct an alignment in the base-pair coordinate + * + * @param idx BWA index + * @param buf BWA alignment buffer + * @param seq NULL terinated C string + * @param sa Suffix array value + * @param n_gaps Number of gaps (typically equal to bwa_sai1_t::n_gapo + bwa_sai1_t::n_gape + * + * @return An alignment + */ + bwa_aln_t bwa_sa2aln(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, uint64_t sa, int n_gaps); #ifdef __cplusplus } diff --git a/bwtaln.h b/bwtaln.h index a3eace2..39eaf4b 100644 --- a/bwtaln.h +++ b/bwtaln.h @@ -35,8 +35,8 @@ typedef struct { typedef struct { uint32_t n_mm:16, n_gapo:8, n_gape:8; - bwtint_t k, l; int score; + bwtint_t k, l; } bwt_aln1_t; typedef uint16_t bwa_cigar_t;