From 2d2db5d50f61e7eeb10103137c6f1f3178951f84 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 20 Oct 2011 16:13:35 -0400 Subject: [PATCH] aln seems working --- bntseq.c | 6 ++++-- bwtaln.c | 59 +++++++++++++++++++++++++------------------------------- bwtaln.h | 2 +- bwtgap.c | 51 +++++++++++++++++++++--------------------------- bwtgap.h | 6 +++--- 5 files changed, 56 insertions(+), 68 deletions(-) diff --git a/bntseq.c b/bntseq.c index 41fb68c..4d532c4 100644 --- a/bntseq.c +++ b/bntseq.c @@ -178,7 +178,7 @@ static void add1(const kseq_t *seq, bntseq_t *bns, FILE *fp, uint8_t *buf, int * p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len; p->n_ambs = 0; for (i = lasts = 0; i < seq->seq.l; ++i) { - int c = nst_nt4_table[(int)seq->seq.s[i]]; + int c = seq->seq.s[i]; if (c >= 4) { // N if (lasts == seq->seq.s[i]) { // contiguous N ++(*q)->len; @@ -218,7 +218,7 @@ int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix) char name[1024]; bntseq_t *bns; unsigned char buf[0x10000]; - int32_t l_buf, m_seqs, m_holes; + int32_t i, l_buf, m_seqs, m_holes; int64_t ret = -1; bntamb1_t *q; FILE *fp; @@ -238,6 +238,8 @@ int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix) memset(buf, 0, 0x10000); // read sequences while (kseq_read(seq) >= 0) { + for (i = 0; i < seq->seq.l; ++i) // convert to 2-bit encoding + seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; add1(seq, bns, fp, buf, &l_buf, &m_seqs, &m_holes, &q); seq_reverse(seq->seq.l, (uint8_t*)seq->seq.s, 1); add1(seq, bns, fp, buf, &l_buf, &m_seqs, &m_holes, &q); diff --git a/bwtaln.c b/bwtaln.c index de6b001..c5002c6 100644 --- a/bwtaln.c +++ b/bwtaln.c @@ -49,22 +49,22 @@ int bwa_cal_maxdiff(int l, double err, double thres) } // width must be filled as zero -static int bwt_cal_width(const bwt_t *rbwt, int len, const ubyte_t *str, bwt_width_t *width) +static int bwt_cal_width(const bwt_t *bwt, int len, const ubyte_t *str, bwt_width_t *width) { bwtint_t k, l, ok, ol; int i, bid; bid = 0; - k = 0; l = rbwt->seq_len; + k = 0; l = bwt->seq_len; for (i = 0; i < len; ++i) { ubyte_t c = str[i]; if (c < 4) { - bwt_2occ(rbwt, k - 1, l, c, &ok, &ol); - k = rbwt->L2[c] + ok + 1; - l = rbwt->L2[c] + ol; + bwt_2occ(bwt, k - 1, l, c, &ok, &ol); + k = bwt->L2[c] + ok + 1; + l = bwt->L2[c] + ol; } if (k > l || c > 3) { // then restart k = 0; - l = rbwt->seq_len; + l = bwt->seq_len; ++bid; } width[i].w = l - k + 1; @@ -75,12 +75,11 @@ static int bwt_cal_width(const bwt_t *rbwt, int len, const ubyte_t *str, bwt_wid return bid; } -void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt[2], int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt) +void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt, int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt) { - int i, max_l = 0, max_len; + int i, j, max_l = 0, max_len; gap_stack_t *stack; - bwt_width_t *w[2], *seed_w[2]; - const ubyte_t *seq[2]; + bwt_width_t *w, *seed_w; gap_opt_t local_opt = *opt; // initiate priority stack @@ -90,46 +89,40 @@ void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt[2], int n_seqs, bwa_seq_t *seq if (local_opt.max_diff < local_opt.max_gapo) local_opt.max_gapo = local_opt.max_diff; stack = gap_init_stack(local_opt.max_diff, local_opt.max_gapo, local_opt.max_gape, &local_opt); - seed_w[0] = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t)); - seed_w[1] = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t)); - w[0] = w[1] = 0; + seed_w = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t)); + w = 0; for (i = 0; i != n_seqs; ++i) { bwa_seq_t *p = seqs + i; #ifdef HAVE_PTHREAD if (i % opt->n_threads != tid) continue; #endif p->sa = 0; p->type = BWA_TYPE_NO_MATCH; p->c1 = p->c2 = 0; p->n_aln = 0; p->aln = 0; - seq[0] = p->seq; seq[1] = p->rseq; if (max_l < p->len) { max_l = p->len; - w[0] = (bwt_width_t*)realloc(w[0], (max_l + 1) * sizeof(bwt_width_t)); - w[1] = (bwt_width_t*)realloc(w[1], (max_l + 1) * sizeof(bwt_width_t)); - memset(w[0], 0, (max_l + 1) * sizeof(bwt_width_t)); - memset(w[1], 0, (max_l + 1) * sizeof(bwt_width_t)); + w = (bwt_width_t*)realloc(w, (max_l + 1) * sizeof(bwt_width_t)); + memset(w, 0, (max_l + 1) * sizeof(bwt_width_t)); } - bwt_cal_width(bwt[0], p->len, seq[0], w[0]); - bwt_cal_width(bwt[1], p->len, seq[1], w[1]); + bwt_cal_width(bwt, p->len, p->seq, w); if (opt->fnr > 0.0) local_opt.max_diff = bwa_cal_maxdiff(p->len, BWA_AVG_ERR, opt->fnr); local_opt.seed_len = opt->seed_len < p->len? opt->seed_len : 0x7fffffff; - if (p->len > opt->seed_len) { - bwt_cal_width(bwt[0], opt->seed_len, seq[0] + (p->len - opt->seed_len), seed_w[0]); - bwt_cal_width(bwt[1], opt->seed_len, seq[1] + (p->len - opt->seed_len), seed_w[1]); - } + if (p->len > opt->seed_len) + bwt_cal_width(bwt, opt->seed_len, p->seq + (p->len - opt->seed_len), seed_w); // core function - p->aln = bwt_match_gap(bwt, p->len, seq, w, p->len <= opt->seed_len? 0 : seed_w, &local_opt, &p->n_aln, stack); - // store the alignment + for (j = 0; j < p->len; ++j) // we need to complement + p->seq[j] = p->seq[j] > 3? 4 : 3 - p->seq[j]; + p->aln = bwt_match_gap(bwt, p->len, p->seq, w, p->len <= opt->seed_len? 0 : seed_w, &local_opt, &p->n_aln, stack); + // clean up the record free(p->name); free(p->seq); free(p->rseq); free(p->qual); p->name = 0; p->seq = p->rseq = p->qual = 0; } - free(seed_w[0]); free(seed_w[1]); - free(w[0]); free(w[1]); + free(seed_w); free(w); gap_destroy_stack(stack); } #ifdef HAVE_PTHREAD typedef struct { int tid; - bwt_t *bwt[2]; + bwt_t *bwt; int n_seqs; bwa_seq_t *seqs; const gap_opt_t *opt; @@ -163,15 +156,15 @@ void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt) bwa_seq_t *seqs; bwa_seqio_t *ks; clock_t t; - bwt_t *bwt[2]; + bwt_t *bwt; // initialization ks = bwa_open_reads(opt->mode, fn_fa); { // load BWT + extern uint8_t nst_nt4_table[]; char *str = (char*)calloc(strlen(prefix) + 10, 1); - strcpy(str, prefix); strcat(str, ".bwt"); bwt[0] = bwt_restore_bwt(str); - strcpy(str, prefix); strcat(str, ".rbwt"); bwt[1] = bwt_restore_bwt(str); + strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str); free(str); } @@ -223,7 +216,7 @@ void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt) } // destroy - bwt_destroy(bwt[0]); bwt_destroy(bwt[1]); + bwt_destroy(bwt); bwa_seq_close(ks); } diff --git a/bwtaln.h b/bwtaln.h index 0dc7363..20191ad 100644 --- a/bwtaln.h +++ b/bwtaln.h @@ -135,7 +135,7 @@ extern "C" { void bwa_free_read_seq(int n_seqs, bwa_seq_t *seqs); int bwa_cal_maxdiff(int l, double err, double thres); - void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt[2], int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt); + void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt, int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt); void bwa_cs2nt_core(bwa_seq_t *p, bwtint_t l_pac, ubyte_t *pac); diff --git a/bwtgap.c b/bwtgap.c index d32c3bb..c996f9f 100644 --- a/bwtgap.c +++ b/bwtgap.c @@ -42,7 +42,7 @@ static void gap_reset_stack(gap_stack_t *stack) stack->n_entries = 0; } -static inline void gap_push(gap_stack_t *stack, int a, int i, bwtint_t k, bwtint_t l, int n_mm, int n_gapo, int n_gape, +static inline void gap_push(gap_stack_t *stack, int i, bwtint_t k, bwtint_t l, int n_mm, int n_gapo, int n_gape, int state, int is_diff, const gap_opt_t *opt) { int score; @@ -55,7 +55,7 @@ static inline void gap_push(gap_stack_t *stack, int a, int i, bwtint_t k, bwtint q->stack = (gap_entry_t*)realloc(q->stack, sizeof(gap_entry_t) * q->m_entries); } p = q->stack + q->n_entries; - p->info = (u_int32_t)score<<21 | a<<20 | i; p->k = k; p->l = l; + p->info = (u_int32_t)score<<21 | i; p->k = k; p->l = l; p->n_mm = n_mm; p->n_gapo = n_gapo; p->n_gape = n_gape; p->state = state; p->last_diff_pos = is_diff? i : 0; ++(q->n_entries); @@ -101,8 +101,8 @@ static inline int int_log2(uint32_t v) return c; } -bwt_aln1_t *bwt_match_gap(bwt_t *const bwts[2], int len, const ubyte_t *seq[2], bwt_width_t *w[2], - bwt_width_t *seed_w[2], const gap_opt_t *opt, int *_n_aln, gap_stack_t *stack) +bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_width_t *width, + bwt_width_t *seed_width, const gap_opt_t *opt, int *_n_aln, gap_stack_t *stack) { int best_score = aln_score(opt->max_diff+1, opt->max_gapo+1, opt->max_gape+1, opt); int best_diff = opt->max_diff + 1, max_diff = opt->max_diff; @@ -115,7 +115,7 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwts[2], int len, const ubyte_t *seq[2], // check whether there are too many N for (j = _j = 0; j < len; ++j) - if (seq[0][j] > 3) ++_j; + if (seq[j] > 3) ++_j; if (_j > max_diff) { *_n_aln = n_aln; return aln; @@ -123,31 +123,24 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwts[2], int len, const ubyte_t *seq[2], //for (j = 0; j != len; ++j) printf("#0 %d: [%d,%u]\t[%d,%u]\n", j, w[0][j].bid, w[0][j].w, w[1][j].bid, w[1][j].w); gap_reset_stack(stack); // reset stack - gap_push(stack, 0, len, 0, bwts[0]->seq_len, 0, 0, 0, 0, 0, opt); - gap_push(stack, 1, len, 0, bwts[0]->seq_len, 0, 0, 0, 0, 0, opt); + gap_push(stack, len, 0, bwt->seq_len, 0, 0, 0, 0, 0, opt); while (stack->n_entries) { gap_entry_t e; - int a, i, m, m_seed = 0, hit_found, allow_diff, allow_M, tmp; + int i, m, m_seed = 0, hit_found, allow_diff, allow_M, tmp; bwtint_t k, l, cnt_k[4], cnt_l[4], occ; - const bwt_t *bwt; - const ubyte_t *str; - const bwt_width_t *seed_width = 0; - bwt_width_t *width; if (max_entries < stack->n_entries) max_entries = stack->n_entries; if (stack->n_entries > opt->max_entries) break; gap_pop(stack, &e); // get the best entry k = e.k; l = e.l; // SA interval - a = e.info>>20&1; i = e.info&0xffff; // strand, length + i = e.info&0xffff; // length if (!(opt->mode & BWA_MODE_NONSTOP) && e.info>>21 > best_score + opt->s_mm) break; // no need to proceed m = max_diff - (e.n_mm + e.n_gapo); if (opt->mode & BWA_MODE_GAPE) m -= e.n_gape; if (m < 0) continue; - bwt = bwts[1-a]; str = seq[a]; width = w[a]; - if (seed_w) { // apply seeding - seed_width = seed_w[a]; + if (seed_width) { // apply seeding m_seed = opt->max_seed_diff - (e.n_mm + e.n_gapo); if (opt->mode & BWA_MODE_GAPE) m_seed -= e.n_gape; } @@ -158,7 +151,7 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwts[2], int len, const ubyte_t *seq[2], hit_found = 0; if (i == 0) hit_found = 1; else if (m == 0 && (e.state == STATE_M || (opt->mode&BWA_MODE_GAPE) || e.n_gape == opt->max_gape)) { // no diff allowed - if (bwt_match_exact_alt(bwt, i, str, &k, &l)) hit_found = 1; + if (bwt_match_exact_alt(bwt, i, seq, &k, &l)) hit_found = 1; else continue; // no hit, skip } @@ -189,7 +182,7 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwts[2], int len, const ubyte_t *seq[2], memset(aln + m_aln/2, 0, m_aln/2*sizeof(bwt_aln1_t)); } p = aln + n_aln; - p->n_mm = e.n_mm; p->n_gapo = e.n_gapo; p->n_gape = e.n_gape; p->a = a; + p->n_mm = e.n_mm; p->n_gapo = e.n_gapo; p->n_gape = e.n_gape; p->k = k; p->l = l; p->score = score; ++n_aln; @@ -206,7 +199,7 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwts[2], int len, const ubyte_t *seq[2], int ii = i - (len - opt->seed_len); if (width[i-1].bid > m-1) allow_diff = 0; else if (width[i-1].bid == m-1 && width[i].bid == m-1 && width[i-1].w == width[i].w) allow_M = 0; - if (seed_w && ii > 0) { + if (seed_width && ii > 0) { if (seed_width[ii-1].bid > m_seed-1) allow_diff = 0; else if (seed_width[ii-1].bid == m_seed-1 && seed_width[ii].bid == m_seed-1 && seed_width[ii-1].w == seed_width[ii].w) allow_M = 0; @@ -218,24 +211,24 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwts[2], int len, const ubyte_t *seq[2], if (e.state == STATE_M) { // gap open if (e.n_gapo < opt->max_gapo) { // gap open is allowed // insertion - gap_push(stack, a, i, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, STATE_I, 1, opt); + gap_push(stack, i, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, STATE_I, 1, opt); // deletion for (j = 0; j != 4; ++j) { k = bwt->L2[j] + cnt_k[j] + 1; l = bwt->L2[j] + cnt_l[j]; - if (k <= l) gap_push(stack, a, i + 1, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, STATE_D, 1, opt); + if (k <= l) gap_push(stack, i + 1, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, STATE_D, 1, opt); } } } else if (e.state == STATE_I) { // extention of an insertion if (e.n_gape < opt->max_gape) // gap extention is allowed - gap_push(stack, a, i, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, STATE_I, 1, opt); + gap_push(stack, i, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, STATE_I, 1, opt); } else if (e.state == STATE_D) { // extention of a deletion if (e.n_gape < opt->max_gape) { // gap extention is allowed if (e.n_gape + e.n_gapo < max_diff || occ < opt->max_del_occ) { for (j = 0; j != 4; ++j) { k = bwt->L2[j] + cnt_k[j] + 1; l = bwt->L2[j] + cnt_l[j]; - if (k <= l) gap_push(stack, a, i + 1, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, STATE_D, 1, opt); + if (k <= l) gap_push(stack, i + 1, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, STATE_D, 1, opt); } } } @@ -244,17 +237,17 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwts[2], int len, const ubyte_t *seq[2], // mismatches if (allow_diff && allow_M) { // mismatch is allowed for (j = 1; j <= 4; ++j) { - int c = (str[i] + j) & 3; - int is_mm = (j != 4 || str[i] > 3); + int c = (seq[i] + j) & 3; + int is_mm = (j != 4 || seq[i] > 3); k = bwt->L2[c] + cnt_k[c] + 1; l = bwt->L2[c] + cnt_l[c]; - if (k <= l) gap_push(stack, a, i, k, l, e.n_mm + is_mm, e.n_gapo, e.n_gape, STATE_M, is_mm, opt); + if (k <= l) gap_push(stack, i, k, l, e.n_mm + is_mm, e.n_gapo, e.n_gape, STATE_M, is_mm, opt); } - } else if (str[i] < 4) { // try exact match only - int c = str[i] & 3; + } else if (seq[i] < 4) { // try exact match only + int c = seq[i] & 3; k = bwt->L2[c] + cnt_k[c] + 1; l = bwt->L2[c] + cnt_l[c]; - if (k <= l) gap_push(stack, a, i, k, l, e.n_mm, e.n_gapo, e.n_gape, STATE_M, 0, opt); + if (k <= l) gap_push(stack, i, k, l, e.n_mm, e.n_gapo, e.n_gape, STATE_M, 0, opt); } } diff --git a/bwtgap.h b/bwtgap.h index fc910bc..01ee359 100644 --- a/bwtgap.h +++ b/bwtgap.h @@ -5,7 +5,7 @@ #include "bwtaln.h" typedef struct { // recursion stack - u_int32_t info; // score<<21 | a<<20 | i + u_int32_t info; // score<<21 | i u_int32_t n_mm:8, n_gapo:8, n_gape:8, state:2, n_seed_mm:6; bwtint_t k, l; // (k,l) is the SA region of [i,n-1] int last_diff_pos; @@ -27,8 +27,8 @@ extern "C" { gap_stack_t *gap_init_stack(int max_mm, int max_gapo, int max_gape, const gap_opt_t *opt); void gap_destroy_stack(gap_stack_t *stack); - bwt_aln1_t *bwt_match_gap(bwt_t *const bwt[2], int len, const ubyte_t *seq[2], bwt_width_t *w[2], - bwt_width_t *seed_w[2], const gap_opt_t *opt, int *_n_aln, gap_stack_t *stack); + bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_width_t *w, + bwt_width_t *seed_w, const gap_opt_t *opt, int *_n_aln, gap_stack_t *stack); void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s); #ifdef __cplusplus