aln seems working
This commit is contained in:
parent
46123639cf
commit
2d2db5d50f
6
bntseq.c
6
bntseq.c
|
|
@ -178,7 +178,7 @@ static void add1(const kseq_t *seq, bntseq_t *bns, FILE *fp, uint8_t *buf, int *
|
|||
p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len;
|
||||
p->n_ambs = 0;
|
||||
for (i = lasts = 0; i < seq->seq.l; ++i) {
|
||||
int c = nst_nt4_table[(int)seq->seq.s[i]];
|
||||
int c = seq->seq.s[i];
|
||||
if (c >= 4) { // N
|
||||
if (lasts == seq->seq.s[i]) { // contiguous N
|
||||
++(*q)->len;
|
||||
|
|
@ -218,7 +218,7 @@ int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix)
|
|||
char name[1024];
|
||||
bntseq_t *bns;
|
||||
unsigned char buf[0x10000];
|
||||
int32_t l_buf, m_seqs, m_holes;
|
||||
int32_t i, l_buf, m_seqs, m_holes;
|
||||
int64_t ret = -1;
|
||||
bntamb1_t *q;
|
||||
FILE *fp;
|
||||
|
|
@ -238,6 +238,8 @@ int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix)
|
|||
memset(buf, 0, 0x10000);
|
||||
// read sequences
|
||||
while (kseq_read(seq) >= 0) {
|
||||
for (i = 0; i < seq->seq.l; ++i) // convert to 2-bit encoding
|
||||
seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]];
|
||||
add1(seq, bns, fp, buf, &l_buf, &m_seqs, &m_holes, &q);
|
||||
seq_reverse(seq->seq.l, (uint8_t*)seq->seq.s, 1);
|
||||
add1(seq, bns, fp, buf, &l_buf, &m_seqs, &m_holes, &q);
|
||||
|
|
|
|||
59
bwtaln.c
59
bwtaln.c
|
|
@ -49,22 +49,22 @@ int bwa_cal_maxdiff(int l, double err, double thres)
|
|||
}
|
||||
|
||||
// width must be filled as zero
|
||||
static int bwt_cal_width(const bwt_t *rbwt, int len, const ubyte_t *str, bwt_width_t *width)
|
||||
static int bwt_cal_width(const bwt_t *bwt, int len, const ubyte_t *str, bwt_width_t *width)
|
||||
{
|
||||
bwtint_t k, l, ok, ol;
|
||||
int i, bid;
|
||||
bid = 0;
|
||||
k = 0; l = rbwt->seq_len;
|
||||
k = 0; l = bwt->seq_len;
|
||||
for (i = 0; i < len; ++i) {
|
||||
ubyte_t c = str[i];
|
||||
if (c < 4) {
|
||||
bwt_2occ(rbwt, k - 1, l, c, &ok, &ol);
|
||||
k = rbwt->L2[c] + ok + 1;
|
||||
l = rbwt->L2[c] + ol;
|
||||
bwt_2occ(bwt, k - 1, l, c, &ok, &ol);
|
||||
k = bwt->L2[c] + ok + 1;
|
||||
l = bwt->L2[c] + ol;
|
||||
}
|
||||
if (k > l || c > 3) { // then restart
|
||||
k = 0;
|
||||
l = rbwt->seq_len;
|
||||
l = bwt->seq_len;
|
||||
++bid;
|
||||
}
|
||||
width[i].w = l - k + 1;
|
||||
|
|
@ -75,12 +75,11 @@ static int bwt_cal_width(const bwt_t *rbwt, int len, const ubyte_t *str, bwt_wid
|
|||
return bid;
|
||||
}
|
||||
|
||||
void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt[2], int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt)
|
||||
void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt, int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt)
|
||||
{
|
||||
int i, max_l = 0, max_len;
|
||||
int i, j, max_l = 0, max_len;
|
||||
gap_stack_t *stack;
|
||||
bwt_width_t *w[2], *seed_w[2];
|
||||
const ubyte_t *seq[2];
|
||||
bwt_width_t *w, *seed_w;
|
||||
gap_opt_t local_opt = *opt;
|
||||
|
||||
// initiate priority stack
|
||||
|
|
@ -90,46 +89,40 @@ void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt[2], int n_seqs, bwa_seq_t *seq
|
|||
if (local_opt.max_diff < local_opt.max_gapo) local_opt.max_gapo = local_opt.max_diff;
|
||||
stack = gap_init_stack(local_opt.max_diff, local_opt.max_gapo, local_opt.max_gape, &local_opt);
|
||||
|
||||
seed_w[0] = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t));
|
||||
seed_w[1] = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t));
|
||||
w[0] = w[1] = 0;
|
||||
seed_w = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t));
|
||||
w = 0;
|
||||
for (i = 0; i != n_seqs; ++i) {
|
||||
bwa_seq_t *p = seqs + i;
|
||||
#ifdef HAVE_PTHREAD
|
||||
if (i % opt->n_threads != tid) continue;
|
||||
#endif
|
||||
p->sa = 0; p->type = BWA_TYPE_NO_MATCH; p->c1 = p->c2 = 0; p->n_aln = 0; p->aln = 0;
|
||||
seq[0] = p->seq; seq[1] = p->rseq;
|
||||
if (max_l < p->len) {
|
||||
max_l = p->len;
|
||||
w[0] = (bwt_width_t*)realloc(w[0], (max_l + 1) * sizeof(bwt_width_t));
|
||||
w[1] = (bwt_width_t*)realloc(w[1], (max_l + 1) * sizeof(bwt_width_t));
|
||||
memset(w[0], 0, (max_l + 1) * sizeof(bwt_width_t));
|
||||
memset(w[1], 0, (max_l + 1) * sizeof(bwt_width_t));
|
||||
w = (bwt_width_t*)realloc(w, (max_l + 1) * sizeof(bwt_width_t));
|
||||
memset(w, 0, (max_l + 1) * sizeof(bwt_width_t));
|
||||
}
|
||||
bwt_cal_width(bwt[0], p->len, seq[0], w[0]);
|
||||
bwt_cal_width(bwt[1], p->len, seq[1], w[1]);
|
||||
bwt_cal_width(bwt, p->len, p->seq, w);
|
||||
if (opt->fnr > 0.0) local_opt.max_diff = bwa_cal_maxdiff(p->len, BWA_AVG_ERR, opt->fnr);
|
||||
local_opt.seed_len = opt->seed_len < p->len? opt->seed_len : 0x7fffffff;
|
||||
if (p->len > opt->seed_len) {
|
||||
bwt_cal_width(bwt[0], opt->seed_len, seq[0] + (p->len - opt->seed_len), seed_w[0]);
|
||||
bwt_cal_width(bwt[1], opt->seed_len, seq[1] + (p->len - opt->seed_len), seed_w[1]);
|
||||
}
|
||||
if (p->len > opt->seed_len)
|
||||
bwt_cal_width(bwt, opt->seed_len, p->seq + (p->len - opt->seed_len), seed_w);
|
||||
// core function
|
||||
p->aln = bwt_match_gap(bwt, p->len, seq, w, p->len <= opt->seed_len? 0 : seed_w, &local_opt, &p->n_aln, stack);
|
||||
// store the alignment
|
||||
for (j = 0; j < p->len; ++j) // we need to complement
|
||||
p->seq[j] = p->seq[j] > 3? 4 : 3 - p->seq[j];
|
||||
p->aln = bwt_match_gap(bwt, p->len, p->seq, w, p->len <= opt->seed_len? 0 : seed_w, &local_opt, &p->n_aln, stack);
|
||||
// clean up the record
|
||||
free(p->name); free(p->seq); free(p->rseq); free(p->qual);
|
||||
p->name = 0; p->seq = p->rseq = p->qual = 0;
|
||||
}
|
||||
free(seed_w[0]); free(seed_w[1]);
|
||||
free(w[0]); free(w[1]);
|
||||
free(seed_w); free(w);
|
||||
gap_destroy_stack(stack);
|
||||
}
|
||||
|
||||
#ifdef HAVE_PTHREAD
|
||||
typedef struct {
|
||||
int tid;
|
||||
bwt_t *bwt[2];
|
||||
bwt_t *bwt;
|
||||
int n_seqs;
|
||||
bwa_seq_t *seqs;
|
||||
const gap_opt_t *opt;
|
||||
|
|
@ -163,15 +156,15 @@ void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt)
|
|||
bwa_seq_t *seqs;
|
||||
bwa_seqio_t *ks;
|
||||
clock_t t;
|
||||
bwt_t *bwt[2];
|
||||
bwt_t *bwt;
|
||||
|
||||
// initialization
|
||||
ks = bwa_open_reads(opt->mode, fn_fa);
|
||||
|
||||
{ // load BWT
|
||||
extern uint8_t nst_nt4_table[];
|
||||
char *str = (char*)calloc(strlen(prefix) + 10, 1);
|
||||
strcpy(str, prefix); strcat(str, ".bwt"); bwt[0] = bwt_restore_bwt(str);
|
||||
strcpy(str, prefix); strcat(str, ".rbwt"); bwt[1] = bwt_restore_bwt(str);
|
||||
strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str);
|
||||
free(str);
|
||||
}
|
||||
|
||||
|
|
@ -223,7 +216,7 @@ void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt)
|
|||
}
|
||||
|
||||
// destroy
|
||||
bwt_destroy(bwt[0]); bwt_destroy(bwt[1]);
|
||||
bwt_destroy(bwt);
|
||||
bwa_seq_close(ks);
|
||||
}
|
||||
|
||||
|
|
|
|||
2
bwtaln.h
2
bwtaln.h
|
|
@ -135,7 +135,7 @@ extern "C" {
|
|||
void bwa_free_read_seq(int n_seqs, bwa_seq_t *seqs);
|
||||
|
||||
int bwa_cal_maxdiff(int l, double err, double thres);
|
||||
void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt[2], int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt);
|
||||
void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt, int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt);
|
||||
|
||||
void bwa_cs2nt_core(bwa_seq_t *p, bwtint_t l_pac, ubyte_t *pac);
|
||||
|
||||
|
|
|
|||
51
bwtgap.c
51
bwtgap.c
|
|
@ -42,7 +42,7 @@ static void gap_reset_stack(gap_stack_t *stack)
|
|||
stack->n_entries = 0;
|
||||
}
|
||||
|
||||
static inline void gap_push(gap_stack_t *stack, int a, int i, bwtint_t k, bwtint_t l, int n_mm, int n_gapo, int n_gape,
|
||||
static inline void gap_push(gap_stack_t *stack, int i, bwtint_t k, bwtint_t l, int n_mm, int n_gapo, int n_gape,
|
||||
int state, int is_diff, const gap_opt_t *opt)
|
||||
{
|
||||
int score;
|
||||
|
|
@ -55,7 +55,7 @@ static inline void gap_push(gap_stack_t *stack, int a, int i, bwtint_t k, bwtint
|
|||
q->stack = (gap_entry_t*)realloc(q->stack, sizeof(gap_entry_t) * q->m_entries);
|
||||
}
|
||||
p = q->stack + q->n_entries;
|
||||
p->info = (u_int32_t)score<<21 | a<<20 | i; p->k = k; p->l = l;
|
||||
p->info = (u_int32_t)score<<21 | i; p->k = k; p->l = l;
|
||||
p->n_mm = n_mm; p->n_gapo = n_gapo; p->n_gape = n_gape; p->state = state;
|
||||
p->last_diff_pos = is_diff? i : 0;
|
||||
++(q->n_entries);
|
||||
|
|
@ -101,8 +101,8 @@ static inline int int_log2(uint32_t v)
|
|||
return c;
|
||||
}
|
||||
|
||||
bwt_aln1_t *bwt_match_gap(bwt_t *const bwts[2], int len, const ubyte_t *seq[2], bwt_width_t *w[2],
|
||||
bwt_width_t *seed_w[2], const gap_opt_t *opt, int *_n_aln, gap_stack_t *stack)
|
||||
bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_width_t *width,
|
||||
bwt_width_t *seed_width, const gap_opt_t *opt, int *_n_aln, gap_stack_t *stack)
|
||||
{
|
||||
int best_score = aln_score(opt->max_diff+1, opt->max_gapo+1, opt->max_gape+1, opt);
|
||||
int best_diff = opt->max_diff + 1, max_diff = opt->max_diff;
|
||||
|
|
@ -115,7 +115,7 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwts[2], int len, const ubyte_t *seq[2],
|
|||
|
||||
// check whether there are too many N
|
||||
for (j = _j = 0; j < len; ++j)
|
||||
if (seq[0][j] > 3) ++_j;
|
||||
if (seq[j] > 3) ++_j;
|
||||
if (_j > max_diff) {
|
||||
*_n_aln = n_aln;
|
||||
return aln;
|
||||
|
|
@ -123,31 +123,24 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwts[2], int len, const ubyte_t *seq[2],
|
|||
|
||||
//for (j = 0; j != len; ++j) printf("#0 %d: [%d,%u]\t[%d,%u]\n", j, w[0][j].bid, w[0][j].w, w[1][j].bid, w[1][j].w);
|
||||
gap_reset_stack(stack); // reset stack
|
||||
gap_push(stack, 0, len, 0, bwts[0]->seq_len, 0, 0, 0, 0, 0, opt);
|
||||
gap_push(stack, 1, len, 0, bwts[0]->seq_len, 0, 0, 0, 0, 0, opt);
|
||||
gap_push(stack, len, 0, bwt->seq_len, 0, 0, 0, 0, 0, opt);
|
||||
|
||||
while (stack->n_entries) {
|
||||
gap_entry_t e;
|
||||
int a, i, m, m_seed = 0, hit_found, allow_diff, allow_M, tmp;
|
||||
int i, m, m_seed = 0, hit_found, allow_diff, allow_M, tmp;
|
||||
bwtint_t k, l, cnt_k[4], cnt_l[4], occ;
|
||||
const bwt_t *bwt;
|
||||
const ubyte_t *str;
|
||||
const bwt_width_t *seed_width = 0;
|
||||
bwt_width_t *width;
|
||||
|
||||
if (max_entries < stack->n_entries) max_entries = stack->n_entries;
|
||||
if (stack->n_entries > opt->max_entries) break;
|
||||
gap_pop(stack, &e); // get the best entry
|
||||
k = e.k; l = e.l; // SA interval
|
||||
a = e.info>>20&1; i = e.info&0xffff; // strand, length
|
||||
i = e.info&0xffff; // length
|
||||
if (!(opt->mode & BWA_MODE_NONSTOP) && e.info>>21 > best_score + opt->s_mm) break; // no need to proceed
|
||||
|
||||
m = max_diff - (e.n_mm + e.n_gapo);
|
||||
if (opt->mode & BWA_MODE_GAPE) m -= e.n_gape;
|
||||
if (m < 0) continue;
|
||||
bwt = bwts[1-a]; str = seq[a]; width = w[a];
|
||||
if (seed_w) { // apply seeding
|
||||
seed_width = seed_w[a];
|
||||
if (seed_width) { // apply seeding
|
||||
m_seed = opt->max_seed_diff - (e.n_mm + e.n_gapo);
|
||||
if (opt->mode & BWA_MODE_GAPE) m_seed -= e.n_gape;
|
||||
}
|
||||
|
|
@ -158,7 +151,7 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwts[2], int len, const ubyte_t *seq[2],
|
|||
hit_found = 0;
|
||||
if (i == 0) hit_found = 1;
|
||||
else if (m == 0 && (e.state == STATE_M || (opt->mode&BWA_MODE_GAPE) || e.n_gape == opt->max_gape)) { // no diff allowed
|
||||
if (bwt_match_exact_alt(bwt, i, str, &k, &l)) hit_found = 1;
|
||||
if (bwt_match_exact_alt(bwt, i, seq, &k, &l)) hit_found = 1;
|
||||
else continue; // no hit, skip
|
||||
}
|
||||
|
||||
|
|
@ -189,7 +182,7 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwts[2], int len, const ubyte_t *seq[2],
|
|||
memset(aln + m_aln/2, 0, m_aln/2*sizeof(bwt_aln1_t));
|
||||
}
|
||||
p = aln + n_aln;
|
||||
p->n_mm = e.n_mm; p->n_gapo = e.n_gapo; p->n_gape = e.n_gape; p->a = a;
|
||||
p->n_mm = e.n_mm; p->n_gapo = e.n_gapo; p->n_gape = e.n_gape;
|
||||
p->k = k; p->l = l;
|
||||
p->score = score;
|
||||
++n_aln;
|
||||
|
|
@ -206,7 +199,7 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwts[2], int len, const ubyte_t *seq[2],
|
|||
int ii = i - (len - opt->seed_len);
|
||||
if (width[i-1].bid > m-1) allow_diff = 0;
|
||||
else if (width[i-1].bid == m-1 && width[i].bid == m-1 && width[i-1].w == width[i].w) allow_M = 0;
|
||||
if (seed_w && ii > 0) {
|
||||
if (seed_width && ii > 0) {
|
||||
if (seed_width[ii-1].bid > m_seed-1) allow_diff = 0;
|
||||
else if (seed_width[ii-1].bid == m_seed-1 && seed_width[ii].bid == m_seed-1
|
||||
&& seed_width[ii-1].w == seed_width[ii].w) allow_M = 0;
|
||||
|
|
@ -218,24 +211,24 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwts[2], int len, const ubyte_t *seq[2],
|
|||
if (e.state == STATE_M) { // gap open
|
||||
if (e.n_gapo < opt->max_gapo) { // gap open is allowed
|
||||
// insertion
|
||||
gap_push(stack, a, i, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, STATE_I, 1, opt);
|
||||
gap_push(stack, i, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, STATE_I, 1, opt);
|
||||
// deletion
|
||||
for (j = 0; j != 4; ++j) {
|
||||
k = bwt->L2[j] + cnt_k[j] + 1;
|
||||
l = bwt->L2[j] + cnt_l[j];
|
||||
if (k <= l) gap_push(stack, a, i + 1, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, STATE_D, 1, opt);
|
||||
if (k <= l) gap_push(stack, i + 1, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, STATE_D, 1, opt);
|
||||
}
|
||||
}
|
||||
} else if (e.state == STATE_I) { // extention of an insertion
|
||||
if (e.n_gape < opt->max_gape) // gap extention is allowed
|
||||
gap_push(stack, a, i, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, STATE_I, 1, opt);
|
||||
gap_push(stack, i, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, STATE_I, 1, opt);
|
||||
} else if (e.state == STATE_D) { // extention of a deletion
|
||||
if (e.n_gape < opt->max_gape) { // gap extention is allowed
|
||||
if (e.n_gape + e.n_gapo < max_diff || occ < opt->max_del_occ) {
|
||||
for (j = 0; j != 4; ++j) {
|
||||
k = bwt->L2[j] + cnt_k[j] + 1;
|
||||
l = bwt->L2[j] + cnt_l[j];
|
||||
if (k <= l) gap_push(stack, a, i + 1, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, STATE_D, 1, opt);
|
||||
if (k <= l) gap_push(stack, i + 1, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, STATE_D, 1, opt);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -244,17 +237,17 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwts[2], int len, const ubyte_t *seq[2],
|
|||
// mismatches
|
||||
if (allow_diff && allow_M) { // mismatch is allowed
|
||||
for (j = 1; j <= 4; ++j) {
|
||||
int c = (str[i] + j) & 3;
|
||||
int is_mm = (j != 4 || str[i] > 3);
|
||||
int c = (seq[i] + j) & 3;
|
||||
int is_mm = (j != 4 || seq[i] > 3);
|
||||
k = bwt->L2[c] + cnt_k[c] + 1;
|
||||
l = bwt->L2[c] + cnt_l[c];
|
||||
if (k <= l) gap_push(stack, a, i, k, l, e.n_mm + is_mm, e.n_gapo, e.n_gape, STATE_M, is_mm, opt);
|
||||
if (k <= l) gap_push(stack, i, k, l, e.n_mm + is_mm, e.n_gapo, e.n_gape, STATE_M, is_mm, opt);
|
||||
}
|
||||
} else if (str[i] < 4) { // try exact match only
|
||||
int c = str[i] & 3;
|
||||
} else if (seq[i] < 4) { // try exact match only
|
||||
int c = seq[i] & 3;
|
||||
k = bwt->L2[c] + cnt_k[c] + 1;
|
||||
l = bwt->L2[c] + cnt_l[c];
|
||||
if (k <= l) gap_push(stack, a, i, k, l, e.n_mm, e.n_gapo, e.n_gape, STATE_M, 0, opt);
|
||||
if (k <= l) gap_push(stack, i, k, l, e.n_mm, e.n_gapo, e.n_gape, STATE_M, 0, opt);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
6
bwtgap.h
6
bwtgap.h
|
|
@ -5,7 +5,7 @@
|
|||
#include "bwtaln.h"
|
||||
|
||||
typedef struct { // recursion stack
|
||||
u_int32_t info; // score<<21 | a<<20 | i
|
||||
u_int32_t info; // score<<21 | i
|
||||
u_int32_t n_mm:8, n_gapo:8, n_gape:8, state:2, n_seed_mm:6;
|
||||
bwtint_t k, l; // (k,l) is the SA region of [i,n-1]
|
||||
int last_diff_pos;
|
||||
|
|
@ -27,8 +27,8 @@ extern "C" {
|
|||
|
||||
gap_stack_t *gap_init_stack(int max_mm, int max_gapo, int max_gape, const gap_opt_t *opt);
|
||||
void gap_destroy_stack(gap_stack_t *stack);
|
||||
bwt_aln1_t *bwt_match_gap(bwt_t *const bwt[2], int len, const ubyte_t *seq[2], bwt_width_t *w[2],
|
||||
bwt_width_t *seed_w[2], const gap_opt_t *opt, int *_n_aln, gap_stack_t *stack);
|
||||
bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_width_t *w,
|
||||
bwt_width_t *seed_w, const gap_opt_t *opt, int *_n_aln, gap_stack_t *stack);
|
||||
void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s);
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
|
|
|||
Loading…
Reference in New Issue