fast-bwa/bwa.c

263 lines
7.7 KiB
C
Raw Normal View History

2012-04-07 12:23:01 +08:00
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
2012-04-08 12:02:06 +08:00
#include <math.h>
2012-04-07 12:23:01 +08:00
#include "bwa.h"
#include "bwt.h"
#include "bwtgap.h"
#include "bntseq.h"
#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif
extern unsigned char nst_nt4_table[256];
2012-04-08 10:00:03 +08:00
extern void seq_reverse(int len, uint8_t *seq, int is_comp);
2012-04-07 12:23:01 +08:00
bwa_opt_t bwa_def_opt = { 11, 4, -1, 1, 6, 32, 2, 0.04 };
struct bwa_idx_t {
bwt_t *bwt;
bntseq_t *bns;
uint8_t *pac;
};
2012-04-08 10:50:07 +08:00
struct bwa_buf_t {
2012-04-07 12:23:01 +08:00
int max_buf;
gap_stack_t *stack;
gap_opt_t *opt;
int *diff_tab;
uint8_t *buf;
2012-04-08 12:02:06 +08:00
int *logn;
2012-04-07 12:23:01 +08:00
};
bwa_idx_t *bwa_idx_load(const char *prefix)
{
bwa_idx_t *p;
int l;
char *str;
l = strlen(prefix);
p = calloc(1, sizeof(bwa_idx_t));
str = malloc(l + 10);
strcpy(str, prefix);
p->bns = bns_restore(str);
strcpy(str + l, ".bwt");
p->bwt = bwt_restore_bwt(str);
str[l] = 0;
strcpy(str + l, ".sa");
bwt_restore_sa(str, p->bwt);
free(str);
p->pac = calloc(p->bns->l_pac/4+1, 1);
fread(p->pac, 1, p->bns->l_pac/4+1, p->bns->fp_pac);
fclose(p->bns->fp_pac);
p->bns->fp_pac = 0;
return p;
}
void bwa_idx_destroy(bwa_idx_t *p)
{
bns_destroy(p->bns);
bwt_destroy(p->bwt);
free(p->pac);
free(p);
}
2012-04-08 10:50:07 +08:00
bwa_buf_t *bwa_buf_init(const bwa_opt_t *opt, int max_score)
2012-04-07 12:23:01 +08:00
{
extern gap_opt_t *gap_init_opt(void);
extern int bwa_cal_maxdiff(int l, double err, double thres);
int i;
2012-04-08 10:50:07 +08:00
bwa_buf_t *p;
p = malloc(sizeof(bwa_buf_t));
2012-04-07 12:23:01 +08:00
p->stack = gap_init_stack2(max_score);
p->opt = gap_init_opt();
p->opt->s_gapo = opt->s_gapo;
p->opt->s_gape = opt->s_gape;
p->opt->max_diff = opt->max_diff;
p->opt->max_gapo = opt->max_gapo;
p->opt->max_gape = opt->max_gape;
p->opt->seed_len = opt->seed_len;
p->opt->max_seed_diff = opt->max_seed_diff;
p->opt->fnr = opt->fnr;
p->diff_tab = calloc(BWA_MAX_QUERY_LEN, sizeof(int));
for (i = 1; i < BWA_MAX_QUERY_LEN; ++i)
p->diff_tab[i] = bwa_cal_maxdiff(i, BWA_AVG_ERR, opt->fnr);
2012-04-08 12:02:06 +08:00
p->logn = calloc(256, sizeof(int));
for (i = 1; i != 256; ++i)
p->logn[i] = (int)(4.343 * log(i) + 0.499);
2012-04-07 12:23:01 +08:00
return p;
}
2012-04-08 10:50:07 +08:00
void bwa_buf_destroy(bwa_buf_t *p)
2012-04-07 12:23:01 +08:00
{
gap_destroy_stack(p->stack);
2012-04-08 12:02:06 +08:00
free(p->diff_tab); free(p->logn); free(p->opt);
2012-04-07 12:23:01 +08:00
free(p);
}
2012-04-08 10:50:07 +08:00
bwa_sai_t bwa_sai(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq)
2012-04-07 12:23:01 +08:00
{
extern int bwt_cal_width(const bwt_t *bwt, int len, const ubyte_t *str, bwt_width_t *width);
int i, seq_len, buf_len;
bwt_width_t *w, *seed_w;
uint8_t *s;
2012-04-08 10:50:07 +08:00
gap_opt_t opt2 = *buf->opt;
bwa_sai_t sai;
2012-04-07 12:23:01 +08:00
seq_len = strlen(seq);
// estimate the buffer length
2012-04-08 10:50:07 +08:00
buf_len = (buf->opt->seed_len + seq_len + 1) * sizeof(bwt_width_t) + seq_len;
if (buf_len > buf->max_buf) {
buf->max_buf = buf_len;
kroundup32(buf->max_buf);
buf->buf = realloc(buf->buf, buf->max_buf);
2012-04-07 12:23:01 +08:00
}
2012-04-08 10:50:07 +08:00
memset(buf->buf, 0, buf_len);
seed_w = (bwt_width_t*)buf->buf;
w = seed_w + buf->opt->seed_len;
2012-04-07 12:23:01 +08:00
s = (uint8_t*)(w + seq_len + 1);
2012-04-08 10:50:07 +08:00
if (opt2.fnr > 0.) opt2.max_diff = buf->diff_tab[seq_len];
2012-04-07 12:23:01 +08:00
// copy the sequence
for (i = 0; i < seq_len; ++i)
s[i] = nst_nt4_table[(int)seq[i]];
seq_reverse(seq_len, s, 0);
// mapping
bwt_cal_width(idx->bwt, seq_len, s, w);
if (opt2.seed_len >= seq_len) opt2.seed_len = 0x7fffffff;
2012-04-08 10:50:07 +08:00
if (seq_len > buf->opt->seed_len)
bwt_cal_width(idx->bwt, buf->opt->seed_len, s + (seq_len - buf->opt->seed_len), seed_w);
2012-04-07 12:23:01 +08:00
for (i = 0; i < seq_len; ++i) // complement; I forgot why...
s[i] = s[i] > 3? 4 : 3 - s[i];
2012-04-08 10:50:07 +08:00
sai.sai = (bwa_sai1_t*)bwt_match_gap(idx->bwt, seq_len, s, w, seq_len <= buf->opt->seed_len? 0 : seed_w, &opt2, &sai.n, buf->stack);
return sai;
2012-04-07 12:23:01 +08:00
}
2012-04-08 10:00:03 +08:00
static void compute_NM(const uint8_t *pac, uint64_t l_pac, uint8_t *seq, int64_t pos, int n_cigar, uint32_t *cigar, int *n_mm, int *n_gaps)
{
uint64_t x = pos, z;
int k, y = 0;
*n_mm = *n_gaps = 0;
for (k = 0; k < n_cigar; ++k) {
int l = cigar[k]>>4;
int op = cigar[k]&0xf;
if (op == 0) { // match/mismatch
for (z = 0; z < l && x + z < l_pac; ++z) {
int c = pac[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3;
if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) ++(*n_mm);
}
}
if (op == 1 || op == 2) (*n_gaps) += l;
if (op == 0 || op == 2) x += l;
if (op == 0 || op == 1 || op == 4) y += l;
}
}
2012-04-08 10:50:07 +08:00
bwa_aln_t bwa_sa2aln(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, uint64_t sa, int n_gaps)
2012-04-07 13:25:39 +08:00
{
extern bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int len, int *strand);
extern bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const uint8_t *seq, bwtint_t *_pos, int ext, int *n_cigar, int is_end_correct);
2012-04-08 10:00:03 +08:00
int strand, seq_len, i, n_gap, n_mm;
2012-04-08 10:50:07 +08:00
uint64_t pos3, pac_pos;
2012-04-07 13:25:39 +08:00
uint8_t *s[2];
bwa_aln_t aln;
2012-04-08 10:00:03 +08:00
memset(&aln, 0, sizeof(bwa_aln_t));
2012-04-07 13:25:39 +08:00
seq_len = strlen(seq);
2012-04-08 10:50:07 +08:00
if (seq_len<<1 > buf->max_buf) {
buf->max_buf = seq_len<<1;
kroundup32(buf->max_buf);
buf->buf = realloc(buf->buf, buf->max_buf);
2012-04-07 13:25:39 +08:00
}
2012-04-08 10:50:07 +08:00
s[0] = buf->buf;
2012-04-08 10:00:03 +08:00
s[1] = s[0] + seq_len;
for (i = 0; i < seq_len; ++i)
s[0][i] = s[1][i] = nst_nt4_table[(int)seq[i]];
seq_reverse(seq_len, s[1], 1);
2012-04-08 10:50:07 +08:00
pac_pos = bwa_sa2pos(idx->bns, idx->bwt, sa, seq_len, &strand);
2012-04-08 10:00:03 +08:00
if (strand) aln.flag |= 16;
if (n_gaps) { // only for gapped alignment
int n_cigar;
bwa_cigar_t *cigar16;
2012-04-08 10:50:07 +08:00
cigar16 = bwa_refine_gapped_core(idx->bns->l_pac, idx->pac, seq_len, s[strand], &pac_pos, strand? n_gaps : -n_gaps, &n_cigar, 1);
2012-04-08 10:00:03 +08:00
aln.n_cigar = n_cigar;
aln.cigar = malloc(n_cigar * 4);
2012-04-08 10:50:07 +08:00
for (i = 0, pos3 = pac_pos; i < n_cigar; ++i) {
2012-04-08 10:00:03 +08:00
int op = cigar16[i]>>14;
int len = cigar16[i]&0x3fff;
if (op == 3) op = 4; // the 16-bit CIGAR is different from the 32-bit CIGAR
aln.cigar[i] = len<<4 | op;
if (op == 0 || op == 2) pos3 += len;
}
free(cigar16);
} else { // ungapped
aln.n_cigar = 1;
aln.cigar = malloc(4);
aln.cigar[0] = seq_len<<4 | 0;
2012-04-08 10:50:07 +08:00
pos3 = pac_pos + seq_len;
2012-04-08 10:00:03 +08:00
}
2012-04-08 10:50:07 +08:00
aln.n_n = bns_cnt_ambi(idx->bns, pac_pos, pos3 - pac_pos, &aln.ref_id);
aln.offset = pac_pos - idx->bns->anns[aln.ref_id].offset;
2012-04-08 10:00:03 +08:00
if (pos3 - idx->bns->anns[aln.ref_id].offset > idx->bns->anns[aln.ref_id].len) // read mapped beyond the end of a sequence
aln.flag |= 4; // read unmapped
2012-04-08 10:50:07 +08:00
compute_NM(idx->pac, idx->bns->l_pac, s[strand], pac_pos, aln.n_cigar, aln.cigar, &n_mm, &n_gap);
2012-04-08 10:00:03 +08:00
aln.n_mm = n_mm;
aln.n_gap = n_gap;
2012-04-07 13:25:39 +08:00
return aln;
}
2012-04-08 12:02:06 +08:00
2012-04-08 12:12:34 +08:00
bwa_one_t *bwa_se2(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, int gen_cigar)
2012-04-08 12:02:06 +08:00
{
bwa_one_t *one;
int best, cnt, i, seq_len;
seq_len = strlen(seq);
one = calloc(1, sizeof(bwa_one_t));
one->sai = bwa_sai(idx, buf, seq);
if (one->sai.n == 0) return one;
// count number of hits; randomly select one alignment
best = one->sai.sai[0].score;
for (i = cnt = 0; i < one->sai.n; ++i) {
bwa_sai1_t *p = &one->sai.sai[i];
if (p->score > best) break;
if (drand48() * (p->l - p->k + 1 + cnt) > (double)cnt) {
one->which = p;
one->sa = p->k + (bwtint_t)((p->l - p->k + 1) * drand48());
}
cnt += p->l - p->k + 1;
}
one->c1 = cnt;
for (; i < one->sai.n; ++i)
cnt += one->sai.sai[i].l - one->sai.sai[i].k + 1;
one->c2 = cnt - one->c1;
// estimate single-end mapping quality
one->mapQs = -1;
if (one->c1 == 0) one->mapQs = 23; // FIXME: is it possible?
else if (one->c1 > 1) one->mapQs = 0;
else {
int diff = one->which->n_mm + one->which->n_gapo + one->which->n_gape;
if (diff >= buf->diff_tab[seq_len]) one->mapQs = 25;
else if (one->c2 == 0) one->mapQs = 37;
}
if (one->mapQs < 0) {
cnt = (one->c2 >= 255)? 255 : one->c2;
one->mapQs = 23 < buf->logn[cnt]? 0 : 23 - buf->logn[cnt];
}
one->mapQ = one->mapQs;
// compute CIGAR on request
one->one.ref_id = -1;
if (gen_cigar) one->one = bwa_sa2aln(idx, buf, seq, one->sa, one->which->n_gapo + one->which->n_gape);
return one;
}
2012-04-08 12:12:34 +08:00
bwa_one_t *bwa_se(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq)
{
return bwa_se2(idx, buf, seq, 1);
}
2012-04-08 12:02:06 +08:00
void bwa_one_destroy(bwa_one_t *one)
{
free(one->sai.sai);
free(one->one.cigar);
free(one);
}