preliminary doc

This commit is contained in:
Heng Li 2012-04-07 22:50:07 -04:00
parent 1cef219667
commit 080726cb47
3 changed files with 92 additions and 54 deletions

68
bwa.c
View File

@ -21,7 +21,7 @@ struct bwa_idx_t {
uint8_t *pac;
};
struct bwa_aux_t {
struct bwa_buf_t {
int max_buf;
gap_stack_t *stack;
gap_opt_t *opt;
@ -60,13 +60,13 @@ void bwa_idx_destroy(bwa_idx_t *p)
free(p);
}
bwa_aux_t *bwa_aux_init(const bwa_opt_t *opt, int max_score)
bwa_buf_t *bwa_buf_init(const bwa_opt_t *opt, int max_score)
{
extern gap_opt_t *gap_init_opt(void);
extern int bwa_cal_maxdiff(int l, double err, double thres);
int i;
bwa_aux_t *p;
p = malloc(sizeof(bwa_aux_t));
bwa_buf_t *p;
p = malloc(sizeof(bwa_buf_t));
p->stack = gap_init_stack2(max_score);
p->opt = gap_init_opt();
p->opt->s_gapo = opt->s_gapo;
@ -83,34 +83,35 @@ bwa_aux_t *bwa_aux_init(const bwa_opt_t *opt, int max_score)
return p;
}
void bwa_aux_destroy(bwa_aux_t *p)
void bwa_buf_destroy(bwa_buf_t *p)
{
gap_destroy_stack(p->stack);
free(p->diff_tab); free(p->opt);
free(p);
}
bwa_alnpre_t *bwa_aln_pre(const bwa_idx_t *idx, bwa_aux_t *aux, const char *seq, int *n_aln)
bwa_sai_t bwa_sai(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq)
{
extern int bwt_cal_width(const bwt_t *bwt, int len, const ubyte_t *str, bwt_width_t *width);
int i, seq_len, buf_len;
bwt_width_t *w, *seed_w;
uint8_t *s;
gap_opt_t opt2 = *aux->opt;
gap_opt_t opt2 = *buf->opt;
bwa_sai_t sai;
seq_len = strlen(seq);
// estimate the buffer length
buf_len = (aux->opt->seed_len + seq_len + 1) * sizeof(bwt_width_t) + seq_len;
if (buf_len > aux->max_buf) {
aux->max_buf = buf_len;
kroundup32(aux->max_buf);
aux->buf = realloc(aux->buf, aux->max_buf);
buf_len = (buf->opt->seed_len + seq_len + 1) * sizeof(bwt_width_t) + seq_len;
if (buf_len > buf->max_buf) {
buf->max_buf = buf_len;
kroundup32(buf->max_buf);
buf->buf = realloc(buf->buf, buf->max_buf);
}
memset(aux->buf, 0, buf_len);
seed_w = (bwt_width_t*)aux->buf;
w = seed_w + aux->opt->seed_len;
memset(buf->buf, 0, buf_len);
seed_w = (bwt_width_t*)buf->buf;
w = seed_w + buf->opt->seed_len;
s = (uint8_t*)(w + seq_len + 1);
if (opt2.fnr > 0.) opt2.max_diff = aux->diff_tab[seq_len];
if (opt2.fnr > 0.) opt2.max_diff = buf->diff_tab[seq_len];
// copy the sequence
for (i = 0; i < seq_len; ++i)
s[i] = nst_nt4_table[(int)seq[i]];
@ -118,11 +119,12 @@ bwa_alnpre_t *bwa_aln_pre(const bwa_idx_t *idx, bwa_aux_t *aux, const char *seq,
// mapping
bwt_cal_width(idx->bwt, seq_len, s, w);
if (opt2.seed_len >= seq_len) opt2.seed_len = 0x7fffffff;
if (seq_len > aux->opt->seed_len)
bwt_cal_width(idx->bwt, aux->opt->seed_len, s + (seq_len - aux->opt->seed_len), seed_w);
if (seq_len > buf->opt->seed_len)
bwt_cal_width(idx->bwt, buf->opt->seed_len, s + (seq_len - buf->opt->seed_len), seed_w);
for (i = 0; i < seq_len; ++i) // complement; I forgot why...
s[i] = s[i] > 3? 4 : 3 - s[i];
return (bwa_alnpre_t*)bwt_match_gap(idx->bwt, seq_len, s, w, seq_len <= aux->opt->seed_len? 0 : seed_w, &opt2, n_aln, aux->stack);
sai.sai = (bwa_sai1_t*)bwt_match_gap(idx->bwt, seq_len, s, w, seq_len <= buf->opt->seed_len? 0 : seed_w, &opt2, &sai.n, buf->stack);
return sai;
}
static void compute_NM(const uint8_t *pac, uint64_t l_pac, uint8_t *seq, int64_t pos, int n_cigar, uint32_t *cigar, int *n_mm, int *n_gaps)
@ -145,36 +147,36 @@ static void compute_NM(const uint8_t *pac, uint64_t l_pac, uint8_t *seq, int64_t
}
}
bwa_aln_t bwa_sa2aln(const bwa_idx_t *idx, bwa_aux_t *aux, const char *seq, uint64_t sa, int n_gaps)
bwa_aln_t bwa_sa2aln(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, uint64_t sa, int n_gaps)
{
extern bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int len, int *strand);
extern bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const uint8_t *seq, bwtint_t *_pos, int ext, int *n_cigar, int is_end_correct);
int strand, seq_len, i, n_gap, n_mm;
uint64_t pos3;
uint64_t pos3, pac_pos;
uint8_t *s[2];
bwa_aln_t aln;
memset(&aln, 0, sizeof(bwa_aln_t));
seq_len = strlen(seq);
if (seq_len<<1 > aux->max_buf) {
aux->max_buf = seq_len<<1;
kroundup32(aux->max_buf);
aux->buf = realloc(aux->buf, aux->max_buf);
if (seq_len<<1 > buf->max_buf) {
buf->max_buf = seq_len<<1;
kroundup32(buf->max_buf);
buf->buf = realloc(buf->buf, buf->max_buf);
}
s[0] = aux->buf;
s[0] = buf->buf;
s[1] = s[0] + seq_len;
for (i = 0; i < seq_len; ++i)
s[0][i] = s[1][i] = nst_nt4_table[(int)seq[i]];
seq_reverse(seq_len, s[1], 1);
aln.pac_pos = bwa_sa2pos(idx->bns, idx->bwt, sa, seq_len, &strand);
pac_pos = bwa_sa2pos(idx->bns, idx->bwt, sa, seq_len, &strand);
if (strand) aln.flag |= 16;
if (n_gaps) { // only for gapped alignment
int n_cigar;
bwa_cigar_t *cigar16;
cigar16 = bwa_refine_gapped_core(idx->bns->l_pac, idx->pac, seq_len, s[strand], &aln.pac_pos, strand? n_gaps : -n_gaps, &n_cigar, 1);
cigar16 = bwa_refine_gapped_core(idx->bns->l_pac, idx->pac, seq_len, s[strand], &pac_pos, strand? n_gaps : -n_gaps, &n_cigar, 1);
aln.n_cigar = n_cigar;
aln.cigar = malloc(n_cigar * 4);
for (i = 0, pos3 = aln.pac_pos; i < n_cigar; ++i) {
for (i = 0, pos3 = pac_pos; i < n_cigar; ++i) {
int op = cigar16[i]>>14;
int len = cigar16[i]&0x3fff;
if (op == 3) op = 4; // the 16-bit CIGAR is different from the 32-bit CIGAR
@ -186,13 +188,13 @@ bwa_aln_t bwa_sa2aln(const bwa_idx_t *idx, bwa_aux_t *aux, const char *seq, uint
aln.n_cigar = 1;
aln.cigar = malloc(4);
aln.cigar[0] = seq_len<<4 | 0;
pos3 = aln.pac_pos + seq_len;
pos3 = pac_pos + seq_len;
}
aln.n_n = bns_cnt_ambi(idx->bns, aln.pac_pos, pos3 - aln.pac_pos, &aln.ref_id);
aln.offset = aln.pac_pos - idx->bns->anns[aln.ref_id].offset;
aln.n_n = bns_cnt_ambi(idx->bns, pac_pos, pos3 - pac_pos, &aln.ref_id);
aln.offset = pac_pos - idx->bns->anns[aln.ref_id].offset;
if (pos3 - idx->bns->anns[aln.ref_id].offset > idx->bns->anns[aln.ref_id].len) // read mapped beyond the end of a sequence
aln.flag |= 4; // read unmapped
compute_NM(idx->pac, idx->bns->l_pac, s[strand], aln.pac_pos, aln.n_cigar, aln.cigar, &n_mm, &n_gap);
compute_NM(idx->pac, idx->bns->l_pac, s[strand], pac_pos, aln.n_cigar, aln.cigar, &n_mm, &n_gap);
aln.n_mm = n_mm;
aln.n_gap = n_gap;
return aln;

76
bwa.h
View File

@ -6,46 +6,82 @@
#define BWA_DEF_MAX_SCORE 2048
#define BWA_MAX_QUERY_LEN 1024
// BWA index
struct bwa_idx_t;
typedef struct bwa_idx_t bwa_idx_t;
struct bwa_aux_t;
typedef struct bwa_aux_t bwa_aux_t;
// Buffer for BWA alignment
struct bwa_buf_t;
typedef struct bwa_buf_t bwa_buf_t;
// BWA alignment options
typedef struct {
int s_gapo, s_gape; // the mismatch penalty is fixed at 3
int max_diff, max_gapo, max_gape;
int seed_len, max_seed_diff;
float fnr;
int s_gapo, s_gape; // gap open and extension penalties; the mismatch penalty is fixed at 3
int max_diff, max_gapo, max_gape; // max differences (-1 to use fnr for length-adjusted max diff), gap opens and gap extensions
int seed_len, max_seed_diff; // seed length and max differences allowed in the seed
float fnr; // parameter for automatic length-adjusted max differences
} bwa_opt_t;
// default BWA alignment options
extern bwa_opt_t bwa_def_opt; // = { 11, 4, -1, 1, 6, 32, 2, 0.04 }
// an interval hit in the SA coordinate; basic unit in .sai files
typedef struct {
uint32_t n_mm:16, n_gapo:8, n_gape:8;
int score;
uint64_t k, l;
} bwa_alnpre_t;
uint64_t k, l; // [k,l] is the SA interval; each interval has l-k+1 hits
} bwa_sai1_t;
// all interval hits in the SA coordinate
typedef struct {
uint32_t n_n:8, n_gap:12, n_mm:12;
int32_t ref_id;
uint32_t offset;
uint32_t n_cigar:16, flag:16;
uint64_t pac_pos;
uint32_t *cigar;
} bwa_aln_t;
int n; // number of interval hits
bwa_sai1_t *sai;
} bwa_sai_t;
extern bwa_opt_t bwa_def_opt;
// an alignment
typedef struct {
uint32_t n_n:8, n_gap:12, n_mm:12; // number of ambiguous bases, gaps and mismatches in the alignment
int32_t ref_id; // referece sequence index (the first seq is indexed by 0)
uint32_t offset; // coordinate on the reference; zero-based
uint32_t n_cigar:16, flag:16; // number of CIGAR operations; SAM flag
uint32_t *cigar; // CIGAR in the BAM 28+4 encoding; having n_cigar operations
} bwa_aln_t;
#ifdef __cplusplus
extern "C" {
#endif
// load a BWA index
bwa_idx_t *bwa_idx_load(const char *prefix);
void bwa_idx_destroy(bwa_idx_t *p);
bwa_aux_t *bwa_aux_init(const bwa_opt_t *opt, int max_score);
void bwa_aux_destroy(bwa_aux_t *p);
bwa_alnpre_t *bwa_aln_pre(const bwa_idx_t *idx, bwa_aux_t *aux, const char *seq, int *n_aln);
bwa_aln_t bwa_sa2aln(const bwa_idx_t *idx, bwa_aux_t *aux, const char *seq, uint64_t sa, int n_gaps);
// allocate a BWA alignment buffer; if unsure, set opt to &bwa_def_opt and max_score to BWA_DEF_MAX_SCORE
bwa_buf_t *bwa_buf_init(const bwa_opt_t *opt, int max_score);
void bwa_buf_destroy(bwa_buf_t *p);
/**
* Find all the SA intervals
*
* @param idx BWA index; multiple threads can share the same index
* @param buf BWA alignment buffer; each thread should have its own buffer
* @param seq NULL terminated C string, consisting of A/C/G/T/N only
*
* @return SA intervals seq is matched to
*/
bwa_sai_t bwa_sai(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq);
/**
* Construct an alignment in the base-pair coordinate
*
* @param idx BWA index
* @param buf BWA alignment buffer
* @param seq NULL terinated C string
* @param sa Suffix array value
* @param n_gaps Number of gaps (typically equal to bwa_sai1_t::n_gapo + bwa_sai1_t::n_gape
*
* @return An alignment
*/
bwa_aln_t bwa_sa2aln(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, uint64_t sa, int n_gaps);
#ifdef __cplusplus
}

View File

@ -35,8 +35,8 @@ typedef struct {
typedef struct {
uint32_t n_mm:16, n_gapo:8, n_gape:8;
bwtint_t k, l;
int score;
bwtint_t k, l;
} bwt_aln1_t;
typedef uint16_t bwa_cigar_t;