aln seems working

This commit is contained in:
Heng Li 2011-10-20 16:13:35 -04:00
parent 46123639cf
commit 2d2db5d50f
5 changed files with 56 additions and 68 deletions

View File

@ -178,7 +178,7 @@ static void add1(const kseq_t *seq, bntseq_t *bns, FILE *fp, uint8_t *buf, int *
p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len;
p->n_ambs = 0;
for (i = lasts = 0; i < seq->seq.l; ++i) {
int c = nst_nt4_table[(int)seq->seq.s[i]];
int c = seq->seq.s[i];
if (c >= 4) { // N
if (lasts == seq->seq.s[i]) { // contiguous N
++(*q)->len;
@ -218,7 +218,7 @@ int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix)
char name[1024];
bntseq_t *bns;
unsigned char buf[0x10000];
int32_t l_buf, m_seqs, m_holes;
int32_t i, l_buf, m_seqs, m_holes;
int64_t ret = -1;
bntamb1_t *q;
FILE *fp;
@ -238,6 +238,8 @@ int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix)
memset(buf, 0, 0x10000);
// read sequences
while (kseq_read(seq) >= 0) {
for (i = 0; i < seq->seq.l; ++i) // convert to 2-bit encoding
seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]];
add1(seq, bns, fp, buf, &l_buf, &m_seqs, &m_holes, &q);
seq_reverse(seq->seq.l, (uint8_t*)seq->seq.s, 1);
add1(seq, bns, fp, buf, &l_buf, &m_seqs, &m_holes, &q);

View File

@ -49,22 +49,22 @@ int bwa_cal_maxdiff(int l, double err, double thres)
}
// width must be filled as zero
static int bwt_cal_width(const bwt_t *rbwt, int len, const ubyte_t *str, bwt_width_t *width)
static int bwt_cal_width(const bwt_t *bwt, int len, const ubyte_t *str, bwt_width_t *width)
{
bwtint_t k, l, ok, ol;
int i, bid;
bid = 0;
k = 0; l = rbwt->seq_len;
k = 0; l = bwt->seq_len;
for (i = 0; i < len; ++i) {
ubyte_t c = str[i];
if (c < 4) {
bwt_2occ(rbwt, k - 1, l, c, &ok, &ol);
k = rbwt->L2[c] + ok + 1;
l = rbwt->L2[c] + ol;
bwt_2occ(bwt, k - 1, l, c, &ok, &ol);
k = bwt->L2[c] + ok + 1;
l = bwt->L2[c] + ol;
}
if (k > l || c > 3) { // then restart
k = 0;
l = rbwt->seq_len;
l = bwt->seq_len;
++bid;
}
width[i].w = l - k + 1;
@ -75,12 +75,11 @@ static int bwt_cal_width(const bwt_t *rbwt, int len, const ubyte_t *str, bwt_wid
return bid;
}
void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt[2], int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt)
void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt, int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt)
{
int i, max_l = 0, max_len;
int i, j, max_l = 0, max_len;
gap_stack_t *stack;
bwt_width_t *w[2], *seed_w[2];
const ubyte_t *seq[2];
bwt_width_t *w, *seed_w;
gap_opt_t local_opt = *opt;
// initiate priority stack
@ -90,46 +89,40 @@ void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt[2], int n_seqs, bwa_seq_t *seq
if (local_opt.max_diff < local_opt.max_gapo) local_opt.max_gapo = local_opt.max_diff;
stack = gap_init_stack(local_opt.max_diff, local_opt.max_gapo, local_opt.max_gape, &local_opt);
seed_w[0] = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t));
seed_w[1] = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t));
w[0] = w[1] = 0;
seed_w = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t));
w = 0;
for (i = 0; i != n_seqs; ++i) {
bwa_seq_t *p = seqs + i;
#ifdef HAVE_PTHREAD
if (i % opt->n_threads != tid) continue;
#endif
p->sa = 0; p->type = BWA_TYPE_NO_MATCH; p->c1 = p->c2 = 0; p->n_aln = 0; p->aln = 0;
seq[0] = p->seq; seq[1] = p->rseq;
if (max_l < p->len) {
max_l = p->len;
w[0] = (bwt_width_t*)realloc(w[0], (max_l + 1) * sizeof(bwt_width_t));
w[1] = (bwt_width_t*)realloc(w[1], (max_l + 1) * sizeof(bwt_width_t));
memset(w[0], 0, (max_l + 1) * sizeof(bwt_width_t));
memset(w[1], 0, (max_l + 1) * sizeof(bwt_width_t));
w = (bwt_width_t*)realloc(w, (max_l + 1) * sizeof(bwt_width_t));
memset(w, 0, (max_l + 1) * sizeof(bwt_width_t));
}
bwt_cal_width(bwt[0], p->len, seq[0], w[0]);
bwt_cal_width(bwt[1], p->len, seq[1], w[1]);
bwt_cal_width(bwt, p->len, p->seq, w);
if (opt->fnr > 0.0) local_opt.max_diff = bwa_cal_maxdiff(p->len, BWA_AVG_ERR, opt->fnr);
local_opt.seed_len = opt->seed_len < p->len? opt->seed_len : 0x7fffffff;
if (p->len > opt->seed_len) {
bwt_cal_width(bwt[0], opt->seed_len, seq[0] + (p->len - opt->seed_len), seed_w[0]);
bwt_cal_width(bwt[1], opt->seed_len, seq[1] + (p->len - opt->seed_len), seed_w[1]);
}
if (p->len > opt->seed_len)
bwt_cal_width(bwt, opt->seed_len, p->seq + (p->len - opt->seed_len), seed_w);
// core function
p->aln = bwt_match_gap(bwt, p->len, seq, w, p->len <= opt->seed_len? 0 : seed_w, &local_opt, &p->n_aln, stack);
// store the alignment
for (j = 0; j < p->len; ++j) // we need to complement
p->seq[j] = p->seq[j] > 3? 4 : 3 - p->seq[j];
p->aln = bwt_match_gap(bwt, p->len, p->seq, w, p->len <= opt->seed_len? 0 : seed_w, &local_opt, &p->n_aln, stack);
// clean up the record
free(p->name); free(p->seq); free(p->rseq); free(p->qual);
p->name = 0; p->seq = p->rseq = p->qual = 0;
}
free(seed_w[0]); free(seed_w[1]);
free(w[0]); free(w[1]);
free(seed_w); free(w);
gap_destroy_stack(stack);
}
#ifdef HAVE_PTHREAD
typedef struct {
int tid;
bwt_t *bwt[2];
bwt_t *bwt;
int n_seqs;
bwa_seq_t *seqs;
const gap_opt_t *opt;
@ -163,15 +156,15 @@ void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt)
bwa_seq_t *seqs;
bwa_seqio_t *ks;
clock_t t;
bwt_t *bwt[2];
bwt_t *bwt;
// initialization
ks = bwa_open_reads(opt->mode, fn_fa);
{ // load BWT
extern uint8_t nst_nt4_table[];
char *str = (char*)calloc(strlen(prefix) + 10, 1);
strcpy(str, prefix); strcat(str, ".bwt"); bwt[0] = bwt_restore_bwt(str);
strcpy(str, prefix); strcat(str, ".rbwt"); bwt[1] = bwt_restore_bwt(str);
strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str);
free(str);
}
@ -223,7 +216,7 @@ void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt)
}
// destroy
bwt_destroy(bwt[0]); bwt_destroy(bwt[1]);
bwt_destroy(bwt);
bwa_seq_close(ks);
}

View File

@ -135,7 +135,7 @@ extern "C" {
void bwa_free_read_seq(int n_seqs, bwa_seq_t *seqs);
int bwa_cal_maxdiff(int l, double err, double thres);
void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt[2], int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt);
void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt, int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt);
void bwa_cs2nt_core(bwa_seq_t *p, bwtint_t l_pac, ubyte_t *pac);

View File

@ -42,7 +42,7 @@ static void gap_reset_stack(gap_stack_t *stack)
stack->n_entries = 0;
}
static inline void gap_push(gap_stack_t *stack, int a, int i, bwtint_t k, bwtint_t l, int n_mm, int n_gapo, int n_gape,
static inline void gap_push(gap_stack_t *stack, int i, bwtint_t k, bwtint_t l, int n_mm, int n_gapo, int n_gape,
int state, int is_diff, const gap_opt_t *opt)
{
int score;
@ -55,7 +55,7 @@ static inline void gap_push(gap_stack_t *stack, int a, int i, bwtint_t k, bwtint
q->stack = (gap_entry_t*)realloc(q->stack, sizeof(gap_entry_t) * q->m_entries);
}
p = q->stack + q->n_entries;
p->info = (u_int32_t)score<<21 | a<<20 | i; p->k = k; p->l = l;
p->info = (u_int32_t)score<<21 | i; p->k = k; p->l = l;
p->n_mm = n_mm; p->n_gapo = n_gapo; p->n_gape = n_gape; p->state = state;
p->last_diff_pos = is_diff? i : 0;
++(q->n_entries);
@ -101,8 +101,8 @@ static inline int int_log2(uint32_t v)
return c;
}
bwt_aln1_t *bwt_match_gap(bwt_t *const bwts[2], int len, const ubyte_t *seq[2], bwt_width_t *w[2],
bwt_width_t *seed_w[2], const gap_opt_t *opt, int *_n_aln, gap_stack_t *stack)
bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_width_t *width,
bwt_width_t *seed_width, const gap_opt_t *opt, int *_n_aln, gap_stack_t *stack)
{
int best_score = aln_score(opt->max_diff+1, opt->max_gapo+1, opt->max_gape+1, opt);
int best_diff = opt->max_diff + 1, max_diff = opt->max_diff;
@ -115,7 +115,7 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwts[2], int len, const ubyte_t *seq[2],
// check whether there are too many N
for (j = _j = 0; j < len; ++j)
if (seq[0][j] > 3) ++_j;
if (seq[j] > 3) ++_j;
if (_j > max_diff) {
*_n_aln = n_aln;
return aln;
@ -123,31 +123,24 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwts[2], int len, const ubyte_t *seq[2],
//for (j = 0; j != len; ++j) printf("#0 %d: [%d,%u]\t[%d,%u]\n", j, w[0][j].bid, w[0][j].w, w[1][j].bid, w[1][j].w);
gap_reset_stack(stack); // reset stack
gap_push(stack, 0, len, 0, bwts[0]->seq_len, 0, 0, 0, 0, 0, opt);
gap_push(stack, 1, len, 0, bwts[0]->seq_len, 0, 0, 0, 0, 0, opt);
gap_push(stack, len, 0, bwt->seq_len, 0, 0, 0, 0, 0, opt);
while (stack->n_entries) {
gap_entry_t e;
int a, i, m, m_seed = 0, hit_found, allow_diff, allow_M, tmp;
int i, m, m_seed = 0, hit_found, allow_diff, allow_M, tmp;
bwtint_t k, l, cnt_k[4], cnt_l[4], occ;
const bwt_t *bwt;
const ubyte_t *str;
const bwt_width_t *seed_width = 0;
bwt_width_t *width;
if (max_entries < stack->n_entries) max_entries = stack->n_entries;
if (stack->n_entries > opt->max_entries) break;
gap_pop(stack, &e); // get the best entry
k = e.k; l = e.l; // SA interval
a = e.info>>20&1; i = e.info&0xffff; // strand, length
i = e.info&0xffff; // length
if (!(opt->mode & BWA_MODE_NONSTOP) && e.info>>21 > best_score + opt->s_mm) break; // no need to proceed
m = max_diff - (e.n_mm + e.n_gapo);
if (opt->mode & BWA_MODE_GAPE) m -= e.n_gape;
if (m < 0) continue;
bwt = bwts[1-a]; str = seq[a]; width = w[a];
if (seed_w) { // apply seeding
seed_width = seed_w[a];
if (seed_width) { // apply seeding
m_seed = opt->max_seed_diff - (e.n_mm + e.n_gapo);
if (opt->mode & BWA_MODE_GAPE) m_seed -= e.n_gape;
}
@ -158,7 +151,7 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwts[2], int len, const ubyte_t *seq[2],
hit_found = 0;
if (i == 0) hit_found = 1;
else if (m == 0 && (e.state == STATE_M || (opt->mode&BWA_MODE_GAPE) || e.n_gape == opt->max_gape)) { // no diff allowed
if (bwt_match_exact_alt(bwt, i, str, &k, &l)) hit_found = 1;
if (bwt_match_exact_alt(bwt, i, seq, &k, &l)) hit_found = 1;
else continue; // no hit, skip
}
@ -189,7 +182,7 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwts[2], int len, const ubyte_t *seq[2],
memset(aln + m_aln/2, 0, m_aln/2*sizeof(bwt_aln1_t));
}
p = aln + n_aln;
p->n_mm = e.n_mm; p->n_gapo = e.n_gapo; p->n_gape = e.n_gape; p->a = a;
p->n_mm = e.n_mm; p->n_gapo = e.n_gapo; p->n_gape = e.n_gape;
p->k = k; p->l = l;
p->score = score;
++n_aln;
@ -206,7 +199,7 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwts[2], int len, const ubyte_t *seq[2],
int ii = i - (len - opt->seed_len);
if (width[i-1].bid > m-1) allow_diff = 0;
else if (width[i-1].bid == m-1 && width[i].bid == m-1 && width[i-1].w == width[i].w) allow_M = 0;
if (seed_w && ii > 0) {
if (seed_width && ii > 0) {
if (seed_width[ii-1].bid > m_seed-1) allow_diff = 0;
else if (seed_width[ii-1].bid == m_seed-1 && seed_width[ii].bid == m_seed-1
&& seed_width[ii-1].w == seed_width[ii].w) allow_M = 0;
@ -218,24 +211,24 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwts[2], int len, const ubyte_t *seq[2],
if (e.state == STATE_M) { // gap open
if (e.n_gapo < opt->max_gapo) { // gap open is allowed
// insertion
gap_push(stack, a, i, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, STATE_I, 1, opt);
gap_push(stack, i, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, STATE_I, 1, opt);
// deletion
for (j = 0; j != 4; ++j) {
k = bwt->L2[j] + cnt_k[j] + 1;
l = bwt->L2[j] + cnt_l[j];
if (k <= l) gap_push(stack, a, i + 1, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, STATE_D, 1, opt);
if (k <= l) gap_push(stack, i + 1, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, STATE_D, 1, opt);
}
}
} else if (e.state == STATE_I) { // extention of an insertion
if (e.n_gape < opt->max_gape) // gap extention is allowed
gap_push(stack, a, i, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, STATE_I, 1, opt);
gap_push(stack, i, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, STATE_I, 1, opt);
} else if (e.state == STATE_D) { // extention of a deletion
if (e.n_gape < opt->max_gape) { // gap extention is allowed
if (e.n_gape + e.n_gapo < max_diff || occ < opt->max_del_occ) {
for (j = 0; j != 4; ++j) {
k = bwt->L2[j] + cnt_k[j] + 1;
l = bwt->L2[j] + cnt_l[j];
if (k <= l) gap_push(stack, a, i + 1, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, STATE_D, 1, opt);
if (k <= l) gap_push(stack, i + 1, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, STATE_D, 1, opt);
}
}
}
@ -244,17 +237,17 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwts[2], int len, const ubyte_t *seq[2],
// mismatches
if (allow_diff && allow_M) { // mismatch is allowed
for (j = 1; j <= 4; ++j) {
int c = (str[i] + j) & 3;
int is_mm = (j != 4 || str[i] > 3);
int c = (seq[i] + j) & 3;
int is_mm = (j != 4 || seq[i] > 3);
k = bwt->L2[c] + cnt_k[c] + 1;
l = bwt->L2[c] + cnt_l[c];
if (k <= l) gap_push(stack, a, i, k, l, e.n_mm + is_mm, e.n_gapo, e.n_gape, STATE_M, is_mm, opt);
if (k <= l) gap_push(stack, i, k, l, e.n_mm + is_mm, e.n_gapo, e.n_gape, STATE_M, is_mm, opt);
}
} else if (str[i] < 4) { // try exact match only
int c = str[i] & 3;
} else if (seq[i] < 4) { // try exact match only
int c = seq[i] & 3;
k = bwt->L2[c] + cnt_k[c] + 1;
l = bwt->L2[c] + cnt_l[c];
if (k <= l) gap_push(stack, a, i, k, l, e.n_mm, e.n_gapo, e.n_gape, STATE_M, 0, opt);
if (k <= l) gap_push(stack, i, k, l, e.n_mm, e.n_gapo, e.n_gape, STATE_M, 0, opt);
}
}

View File

@ -5,7 +5,7 @@
#include "bwtaln.h"
typedef struct { // recursion stack
u_int32_t info; // score<<21 | a<<20 | i
u_int32_t info; // score<<21 | i
u_int32_t n_mm:8, n_gapo:8, n_gape:8, state:2, n_seed_mm:6;
bwtint_t k, l; // (k,l) is the SA region of [i,n-1]
int last_diff_pos;
@ -27,8 +27,8 @@ extern "C" {
gap_stack_t *gap_init_stack(int max_mm, int max_gapo, int max_gape, const gap_opt_t *opt);
void gap_destroy_stack(gap_stack_t *stack);
bwt_aln1_t *bwt_match_gap(bwt_t *const bwt[2], int len, const ubyte_t *seq[2], bwt_width_t *w[2],
bwt_width_t *seed_w[2], const gap_opt_t *opt, int *_n_aln, gap_stack_t *stack);
bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_width_t *w,
bwt_width_t *seed_w, const gap_opt_t *opt, int *_n_aln, gap_stack_t *stack);
void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s);
#ifdef __cplusplus