diff --git a/bntseq.c b/bntseq.c index 15b369f..56df921 100644 --- a/bntseq.c +++ b/bntseq.c @@ -190,14 +190,14 @@ static void add1(const kseq_t *seq, bntseq_t *bns, FILE *fp, uint8_t *buf, int * *q = bns->ambs + bns->n_holes; (*q)->len = 1; (*q)->offset = p->offset + i; - (*q)->amb = seq->seq.s[i]; + (*q)->amb = 'N'; ++p->n_ambs; ++bns->n_holes; } } lasts = seq->seq.s[i]; { // fill buffer - if (c >= 4) c = lrand48()&0x3; + if (c >= 4) c = c>>4; if (*l_buf == 0x40000) { fwrite(buf, 1, 0x10000, fp); memset(buf, 0, 0x10000); @@ -238,11 +238,16 @@ int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only) memset(buf, 0, 0x10000); // read sequences while (kseq_read(seq) >= 0) { - for (i = 0; i < seq->seq.l; ++i) // convert to 2-bit encoding + for (i = 0; i < seq->seq.l; ++i) { // convert to 2-bit encoding seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; + if (seq->seq.s[i] > 3) + seq->seq.s[i] |= (lrand48()&3) << 4; + } add1(seq, bns, fp, buf, &l_buf, &m_seqs, &m_holes, &q); if (!for_only) { - seq_reverse(seq->seq.l, (uint8_t*)seq->seq.s, 1); + seq_reverse(seq->seq.l, (uint8_t*)seq->seq.s, 0); // reversed but not complemented + for (i = 0; i < seq->seq.l; ++i) // complement + seq->seq.s[i] = seq->seq.s[i] < 4? 3 - seq->seq.s[i] : ((3 - (seq->seq.s[i]>>4)) << 4 | 4); add1(seq, bns, fp, buf, &l_buf, &m_seqs, &m_holes, &q); } } diff --git a/bwt.c b/bwt.c index e7da788..6f5eb67 100644 --- a/bwt.c +++ b/bwt.c @@ -282,9 +282,9 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem prev = tmpvec[0]? tmpvec[0] : &a[0]; curr = tmpvec[1]? tmpvec[1] : &a[1]; bwt_set_intv(bwt, q[x], ik); - ik.info = x + 1; - for (i = x + 1; i < len; ++i) { // forward search + + for (i = x + 1, curr->n = 0; i < len; ++i) { // forward search if (q[i] > 3) break; c = 3 - q[i]; bwt_extend(bwt, &ik, ok, 0); @@ -298,6 +298,7 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem ret = curr->a[0].info; // this will be the returned value swap = curr; curr = prev; prev = swap; + if (x == 40) printf("[%lld,%lld,%lld]\n", prev->a[0].x[0], prev->a[0].x[1], prev->a[0].x[2]); for (i = x - 1; i >= -1; --i) { // backward search for MEMs if (q[i] > 3) break; c = i < 0? 0 : q[i]; diff --git a/fastmap.c b/fastmap.c index ee8e9b4..cf952b7 100644 --- a/fastmap.c +++ b/fastmap.c @@ -39,6 +39,27 @@ int main_fastmap(int argc, char *argv[]) while (kseq_read(seq) >= 0) { for (i = 0; i < seq->seq.l; ++i) seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; + { + int beg = 98; + bwtintv_t ik, ok[4]; + bwt_set_intv(bwt, seq->seq.s[seq->seq.l - 1], ik); + for (i = seq->seq.l - 2; i >= beg; --i) { + //printf("[%lld,%lld,%lld] @ %d\n", ik.x[0], ik.x[1], ik.x[2], i+1); + bwt_extend(bwt, &ik, ok, 1); + ik = ok[seq->seq.s[i]]; + if (ik.x[2] == 0) break; + } + printf("[%lld,%lld,%lld] @ %d\n", ik.x[0], ik.x[1], ik.x[2], i+1); + printf("======================== %lld, [%lld,%lld,%lld,%lld]\n", bwt->primary, bwt->L2[1], bwt->L2[2]-bwt->L2[1], bwt->L2[3]-bwt->L2[2], bwt->L2[4]-bwt->L2[3]); + bwt_set_intv(bwt, seq->seq.s[beg], ik); + for (i = beg + 1; i < seq->seq.l; ++i) { + //printf("[%lld,%lld,%lld] @ %d\n", ik.x[0], ik.x[1], ik.x[2], i-1); + bwt_extend(bwt, &ik, ok, 0); + ik = ok[3-seq->seq.s[i]]; + if (ik.x[2] == 0) break; + } + printf("[%lld,%lld,%lld] @ %d\n", ik.x[0], ik.x[1], ik.x[2], i-1); + } bwt_smem(bwt, seq->seq.l, (uint8_t*)seq->seq.s, &mem, tvec); printf(">%s\t%ld\n", seq->name.s, mem.n); for (i = 0; i < mem.n; ++i) {