From 0a98a0cfab2f3dc7764dc2b0cd610db604b12ddf Mon Sep 17 00:00:00 2001 From: Gitea Date: Thu, 25 Jan 2024 15:40:57 +0800 Subject: [PATCH] =?UTF-8?q?=E5=AF=B9=E4=BB=A3=E7=A0=81=E8=BF=9B=E8=A1=8C?= =?UTF-8?q?=E4=BA=86=E9=87=8D=E6=9E=84=EF=BC=8C=E5=9F=BA=E6=9C=AC=E5=AE=8C?= =?UTF-8?q?=E6=88=90=E4=BA=86fmt-index=E6=95=B0=E6=8D=AE=E7=BB=93=E6=9E=84?= =?UTF-8?q?=E7=9A=84=E5=BB=BA=E7=AB=8B=E5=92=8C=E5=BA=8F=E5=88=97=E6=9F=A5?= =?UTF-8?q?=E6=89=BE=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 4 + Makefile | 5 +- bwt.cpp | 310 +++++++++++++++++++++++++++++++++++++ bwt.h | 68 +++++++++ common.h | 34 ----- fmt_index.cpp | 412 ++------------------------------------------------ fmt_index.h | 26 ++++ main.cpp | 6 +- sa.cpp | 12 +- sa.h | 14 ++ util.cpp | 56 +++++++ util.h | 31 ++++ 12 files changed, 528 insertions(+), 450 deletions(-) create mode 100644 bwt.cpp create mode 100644 bwt.h delete mode 100644 common.h create mode 100644 fmt_index.h create mode 100644 sa.h create mode 100644 util.cpp create mode 100644 util.h diff --git a/.gitignore b/.gitignore index cd531cf..fcec038 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,7 @@ +# specific for bwa_perf +*.txt +fmtidx + # ---> C # Prerequisites *.d diff --git a/Makefile b/Makefile index 4a4e795..ce06fba 100644 --- a/Makefile +++ b/Makefile @@ -1,10 +1,11 @@ CC= g++ -CFLAGS= -g -Wall -Wno-unused-function -O2 +NOWARN= -Wno-unused-result -Wno-unused-function +CFLAGS= -g -Wall $(NOWARN) -O2 WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS SHOW_PERF= -DSHOW_PERF AR= ar DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) $(SHOW_PERF) -AOBJS= sa.o fmt_index.o +AOBJS= util.o sa.o fmt_index.o bwt.o PROG= fmtidx INCLUDES= LIBS= -lm -lz -lpthread diff --git a/bwt.cpp b/bwt.cpp new file mode 100644 index 0000000..776ba11 --- /dev/null +++ b/bwt.cpp @@ -0,0 +1,310 @@ +#include +#include +#include +#include +#include +#include + +#include "util.h" +#include "bwt.h" + +using namespace std; + +// 计算一个字节构成的T,G,C,A序列,对应的每个碱基的个数(按T,G,C,A顺序存储在32位整数中,每个占8位) +void bwt_gen_cnt_table(bwt_t *bwt) +{ + int i, j; + for (i = 0; i != 256; ++i) + { + uint32_t x = 0; + for (j = 0; j != 4; ++j) + x |= (((i & 3) == j) + ((i >> 2 & 3) == j) + ((i >> 4 & 3) == j) + (i >> 6 == j)) << (j << 3); + bwt->cnt_table[i] = x; + } +} + +// 解析两bit的bwt碱基序列 +bwt_t *restore_bwt_str(const char *fn) +{ + bwt_t *bwt; + bwt = (bwt_t *)calloc(1, sizeof(bwt_t)); + FILE *fp = fopen(fn, "rb"); + + fseek(fp, 0, SEEK_END); + bwt->bwt_size = (ftell(fp) - sizeof(bwtint_t) * 5) >> 2; // 以32位word为单位计算的size + bwt->bwt = (uint32_t *)calloc(bwt->bwt_size, 4); + fseek(fp, 0, SEEK_SET); + fread(&bwt->primary, sizeof(bwtint_t), 1, fp); + fread(bwt->L2 + 1, sizeof(bwtint_t), 4, fp); + fread_fix(fp, bwt->bwt_size << 2, bwt->bwt); + bwt->seq_len = bwt->L2[4]; + + // char *buf = (char *)calloc(bwt->seq_len + 1, 1); + // for (bwtint_t i = 0; i < bwt->seq_len; ++i) + // { + // buf[i] = BASE[bwt->bwt[i >> 4] >> ((15 - (i & 15)) << 1) & 3]; + // cout << buf[i]; + // } + // cout << endl; + + fclose(fp); + bwt_gen_cnt_table(bwt); // 字节所能表示的各种碱基组合中,各个碱基的累积数量 + return bwt; +} + +// 根据原始的字符串bwt创建interval-bwt +void create_interval_occ_bwt(bwt_t *bwt) +{ + bwtint_t i, k, c[4], n_occ; + uint32_t *buf; + + n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; + bwt->bwt_size += n_occ * sizeof(bwtint_t); // the new size + buf = (uint32_t *)calloc(bwt->bwt_size, 4); // will be the new bwt + c[0] = c[1] = c[2] = c[3] = 0; + // 计算occ,生成naive bwt + for (i = k = 0; i < bwt->seq_len; ++i) + { + // cout << i << '\t'; + // cout << c[0] << ' ' << c[1] << ' ' << c[2] << ' ' << c[3] << endl; + if (i % OCC_INTERVAL == 0) + { + memcpy(buf + k, c, sizeof(bwtint_t) * 4); + k += sizeof(bwtint_t); // in fact: sizeof(bwtint_t)=4*(sizeof(bwtint_t)/4) 每个c包含多少个32位 + // cout << "i: " << i << "\tc: " << c[0] << '\t' << c[1] << '\t' << c[2] << '\t' << c[3] << endl; + } + if (i % 16 == 0) + buf[k++] = bwt->bwt[i / 16]; // 16 == sizeof(uint32_t)/2, 2个bit表示一个碱基 + ++c[bwt_B00(bwt, i)]; + } + // the last element + // cout << c[0] << '\t' << c[1] << '\t' << c[2] << '\t' << c[3] << endl; + memcpy(buf + k, c, sizeof(bwtint_t) * 4); + xassert(k + sizeof(bwtint_t) == bwt->bwt_size, "inconsistent bwt_size"); + // update bwt + free(bwt->bwt); + bwt->bwt = buf; +} + +// 对64位整型数据y,计算碱基c的累积个数 +static inline int __occ_aux(uint64_t y, int c) +{ + // reduce nucleotide counting to bits counting + y = ((c & 2) ? y : ~y) >> 1 & ((c & 1) ? y : ~y) & 0x5555555555555555ull; + // count the number of 1s in y + y = (y & 0x3333333333333333ull) + (y >> 2 & 0x3333333333333333ull); + return ((y + (y >> 4)) & 0xf0f0f0f0f0f0f0full) * 0x101010101010101ull >> 56; +} + +// k行(包含)之前,碱基c的累计总数, interval大于等于32才能正确计算 +bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, uint8_t c) +{ + bwtint_t n; + uint32_t *p, *end; + + if (k == bwt->seq_len) + return bwt->L2[c + 1] - bwt->L2[c]; + if (k == (bwtint_t)(-1)) + return 0; + k -= (k >= bwt->primary); // because $ is not in bwt + + // retrieve Occ at k/OCC_INTERVAL + n = ((bwtint_t *)(p = bwt_occ_intv(bwt, k)))[c]; + // cout << "bwt_occ - 1: " << (int)c << '\t' << k << '\t' << n << endl; + p += sizeof(bwtint_t); // jump to the start of the first BWT cell + + // calculate Occ up to the last k/32 + end = p + (((k >> 5) - ((k & ~OCC_INTV_MASK) >> 5)) << 1); + for (; p < end; p += 2) + n += __occ_aux((uint64_t)p[0] << 32 | p[1], c); + + // cout << "bwt_occ - 2: " << (int)c << '\t' << k << '\t' << n << endl; + // calculate Occ + n += __occ_aux(((uint64_t)p[0] << 32 | p[1]) & ~((1ull << ((~k & 31) << 1)) - 1), c); + if (c == 0) + n -= ~k & 31; // corrected for the masked bits + // cout << "bwt_occ - 3: " << (int)c << '\t' << k << '\t' << n << endl; + return n; +} + +// 统计k行(bwt mtx行)之前4种碱基累积数量,这里的k是bwt矩阵里的行,比bwt字符串多1 +void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]) +{ + bwtint_t x; + uint32_t *p, tmp, *end; + if (k == (bwtint_t)(-1)) + { + memset(cnt, 0, 4 * sizeof(bwtint_t)); + return; + } + k -= (k >= bwt->primary); // because $ is not in bwt + p = bwt_occ_intv(bwt, k); + memcpy(cnt, p, 4 * sizeof(bwtint_t)); + p += sizeof(bwtint_t); // sizeof(bwtint_t) = 4*(sizeof(bwtint_t)/sizeof(uint32_t)) + end = p + ((k >> 4) - ((k & ~OCC_INTV_MASK) >> 4)); // this is the end point of the following loop + for (x = 0; p < end; ++p) + x += __occ_aux4(bwt, *p); + tmp = *p & ~((1U << ((~k & 15) << 1)) - 1); + x += __occ_aux4(bwt, tmp) - (~k & 15); // 这里多算了A,要减去 + cnt[0] += x & 0xff; + cnt[1] += x >> 8 & 0xff; + cnt[2] += x >> 16 & 0xff; + cnt[3] += x >> 24; +} + +// an analogy to bwt_occ4() but more efficient, requiring k <= l +void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4]) +{ + bwtint_t _k, _l; + _k = k - (k >= bwt->primary); + _l = l - (l >= bwt->primary); + if (_l >> OCC_INTV_SHIFT != _k >> OCC_INTV_SHIFT || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) + { + bwt_occ4(bwt, k, cntk); + bwt_occ4(bwt, l, cntl); + } + else + { + bwtint_t x, y; + uint32_t *p, tmp, *endk, *endl; + k -= (k >= bwt->primary); // because $ is not in bwt + l -= (l >= bwt->primary); + p = bwt_occ_intv(bwt, k); + memcpy(cntk, p, 4 * sizeof(bwtint_t)); + p += sizeof(bwtint_t); // sizeof(bwtint_t) = 4*(sizeof(bwtint_t)/sizeof(uint32_t)) + // prepare cntk[] + endk = p + ((k >> 4) - ((k & ~OCC_INTV_MASK) >> 4)); + endl = p + ((l >> 4) - ((l & ~OCC_INTV_MASK) >> 4)); + for (x = 0; p < endk; ++p) + x += __occ_aux4(bwt, *p); + y = x; + tmp = *p & ~((1U << ((~k & 15) << 1)) - 1); + x += __occ_aux4(bwt, tmp) - (~k & 15); + // calculate cntl[] and finalize cntk[] + for (; p < endl; ++p) + y += __occ_aux4(bwt, *p); + tmp = *p & ~((1U << ((~l & 15) << 1)) - 1); + y += __occ_aux4(bwt, tmp) - (~l & 15); + memcpy(cntl, cntk, 4 * sizeof(bwtint_t)); + cntk[0] += x & 0xff; + cntk[1] += x >> 8 & 0xff; + cntk[2] += x >> 16 & 0xff; + cntk[3] += x >> 24; + cntl[0] += y & 0xff; + cntl[1] += y >> 8 & 0xff; + cntl[2] += y >> 16 & 0xff; + cntl[3] += y >> 24; + } +} + +void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back) +{ + bwtint_t tk[4], tl[4]; + int i; + bwt_2occ4(bwt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], tk, tl); // tk表示在k行之前所有各个碱基累积出现次数,tl表示在l行之前的累积 + // 这里是反向扩展 + for (i = 0; i != 4; ++i) + { + ok[i].x[!is_back] = bwt->L2[i] + 1 + tk[i]; // 起始行位置,互补链 + ok[i].x[2] = tl[i] - tk[i]; // 间隔 + } + // 因为计算的是互补碱基,所以3对应着0,2对应1,下边是正向扩展 + ok[3].x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= bwt->primary && ik->x[!is_back] + ik->x[2] - 1 >= bwt->primary); + ok[2].x[is_back] = ok[3].x[is_back] + ok[3].x[2]; + ok[1].x[is_back] = ok[2].x[is_back] + ok[2].x[2]; + ok[0].x[is_back] = ok[1].x[is_back] + ok[1].x[2]; +} + +// 利用bwt搜索seed,完整搜索,只需要单向搜索 +void bwt_search(bwt_t *bwt, const string &q) +{ + bwtintv_t ik, ok[4]; + int i, j, c, x = 0; + + bwt_set_intv(bwt, bval(q[x]), ik); + ik.info = x + 1; + cout << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl; + + for (i = x + 1; i < q.size(); ++i) + { + if (bval(q[i]) < 4) + { + c = cbval(q[i]); + bwt_extend(bwt, &ik, ok, 0); + ik = ok[c]; + ik.info = i + 1; + cout << "bwt-1: " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl; + } + } +} + +// 扩展两次 +void bwt_extend2(const bwt_t *bwt, bwtintv_t *ik, bwtintv_t ok[4], int is_back, int c1) +{ + bwtint_t tk[4], tl[4]; + int i; + bwt_2occ4(bwt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], tk, tl); // tk表示在k行之前所有各个碱基累积出现次数,tl表示在l行之前的累积 + // 这里是反向扩展 + for (i = 0; i != 4; ++i) + { + ok[i].x[!is_back] = bwt->L2[i] + 1 + tk[i]; // 起始行位置,互补链 + ok[i].x[2] = tl[i] - tk[i]; // 间隔 + } + // 因为计算的是互补碱基,所以3对应着0,2对应1,下边是正向扩展 + ok[3].x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= bwt->primary && ik->x[!is_back] + ik->x[2] - 1 >= bwt->primary); + ok[2].x[is_back] = ok[3].x[is_back] + ok[3].x[2]; + ok[1].x[is_back] = ok[2].x[is_back] + ok[2].x[2]; + ok[0].x[is_back] = ok[1].x[is_back] + ok[1].x[2]; + + *ik = ok[c1]; + bwt_2occ4(bwt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], tk, tl); + for (i = 0; i != 4; ++i) + { + ok[i].x[!is_back] = bwt->L2[i] + 1 + tk[i]; // 起始行位置,互补链 + ok[i].x[2] = tl[i] - tk[i]; // 间隔 + } + // 因为计算的是互补碱基,所以3对应着0,2对应1,下边是正向扩展 + ok[3].x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= bwt->primary && ik->x[!is_back] + ik->x[2] - 1 >= bwt->primary); + ok[2].x[is_back] = ok[3].x[is_back] + ok[3].x[2]; + ok[1].x[is_back] = ok[2].x[is_back] + ok[2].x[2]; + ok[0].x[is_back] = ok[1].x[is_back] + ok[1].x[2]; +} + +// 每次扩展两步 +void bwt_search2(bwt_t *bwt, const string &q) +{ + bwtintv_t ik, ok[4]; + int i, j, c1, c2, x = 0; + + bwt_set_intv(bwt, bval(q[x]), ik); + ik.info = x + 1; + cout << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl; + + for (i = x + 1; i + 1 < q.size(); i += 2) + { + if (bval(q[i]) < 4 && bval(q[i + 1]) < 4) + { + + c1 = cbval(q[i]); + c2 = cbval(q[i + 1]); + + bwt_extend2(bwt, &ik, ok, 0, c1); + + ik = ok[c2]; + ik.info = i + 1; + cout << "bwt-2: " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl; + } + else + { + break; + } + } + if (i < q.size() && bval(q[i]) < 4) + { // 最后一次扩展 + c1 = cbval(q[i]); + bwt_extend(bwt, &ik, ok, 0); + ik = ok[c1]; + ik.info = i + 1; + cout << "bwt-2: " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl; + } +} \ No newline at end of file diff --git a/bwt.h b/bwt.h new file mode 100644 index 0000000..70ed8b5 --- /dev/null +++ b/bwt.h @@ -0,0 +1,68 @@ +#ifndef BWT_H_ +#define BWT_H_ +#include +#include +#include "util.h" + +using std::string; + +// occ间隔对应的右移位数,5表示间隔32行保存一次 +#define OCC_INTV_SHIFT 5 +#define OCC_INTERVAL (1LL << OCC_INTV_SHIFT) +#define OCC_INTV_MASK (OCC_INTERVAL - 1) + +// 从最原始的bwt碱基串里获取k行对应的碱基,这里的bwt不包括occ check point数据 +#define bwt_B00(b, k) ((b)->bwt[(k) >> 4] >> ((~(k) & 0xf) << 1) & 3) + +/* retrieve a character from the $-removed BWT string. Note that + * bwt_t::bwt is not exactly the BWT string and therefore this macro is + * called bwt_B0 instead of bwt_B */ +// 从构建完成的bwt(包含occ check point)获取k行(不含$,这里的k不输出bwt mtx的行,是bwt字符串的行)的碱基 +#define bwt_B0(b, k) (bwt_bwt(b, k) >> ((~(k) & 0xf) << 1) & 3) + +// 获取碱基c(待查找序列的首)和对应的互补碱基对应的行,以及间隔 +#define bwt_set_intv(bwt, c, ik) ((ik).x[0] = (bwt)->L2[(int)(c)] + 1, (ik).x[2] = (bwt)->L2[(int)(c) + 1] - (bwt)->L2[(int)(c)], (ik).x[1] = (bwt)->L2[3 - (c)] + 1, (ik).info = 0) + +// The following two lines are ONLY correct when OCC_INTERVAL==0x80 +// #define bwt_bwt(b, k) ((b)->bwt[((k) >> 7 << 4) + sizeof(bwtint_t) + (((k) & 0x7f) >> 4)]) +// #define bwt_occ_intv(b, k) ((b)->bwt + ((k) >> 7 << 4)) +///* For general OCC_INTERVAL, the following is correct: +// k行对应的bwt str碱基,对应的check point bwt str数据起始地址 +#define bwt_bwt(b, k) ((b)->bwt[(k) / OCC_INTERVAL * (OCC_INTERVAL / (sizeof(uint32_t) * 8 / 2) + sizeof(bwtint_t) / 4 * 4) + sizeof(bwtint_t) / 4 * 4 + (k) % OCC_INTERVAL / 16]) +// k行(bwt str行(不包含$))对应的check point occ数据起始地址(小于k且是OCC_INTERVAL的整数倍) +#define bwt_occ_intv(b, k) ((b)->bwt + (k) / OCC_INTERVAL * (OCC_INTERVAL / (sizeof(uint32_t) * 8 / 2) + sizeof(bwtint_t) / 4 * 4)) +// 字节b中包含的T G C A(按顺序保存在32位整数里(每个占8bit))的数量 +#define __occ_aux4(bwt, b) \ + ((bwt)->cnt_table[(b) & 0xff] + (bwt)->cnt_table[(b) >> 8 & 0xff] + (bwt)->cnt_table[(b) >> 16 & 0xff] + (bwt)->cnt_table[(b) >> 24]) + +// 原始fm-index (bwt)结构 +struct bwt_t +{ + bwtint_t primary; // S^{-1}(0), or the primary index of BWT + bwtint_t L2[5]; // C(), cumulative count + bwtint_t seq_len; // sequence length + bwtint_t bwt_size; // size of bwt, about seq_len/4 + uint32_t *bwt; // BWT + // occurance array, separated to two parts + uint32_t cnt_table[256]; + // suffix array + int sa_intv; + bwtint_t n_sa; + uint8_t *sa; +}; + +// k行(包含,bwt mtx行)之前,碱基c的累计总数, interval大于等于32才能正确计算 +bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, uint8_t c); +// 统计k行(bwt mtx行,包含k行本身)之前4种碱基累积数量,这里的k是bwt矩阵里的行,比bwt字符串多1 +void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]); +// 解析两bit的bwt碱基序列 +bwt_t *restore_bwt_str(const char *fn); +// 根据原始的字符串bwt创建interval-bwt +void create_interval_occ_bwt(bwt_t *bwt); + +// 利用bwt搜索seed,完整搜索,只需要单向搜索 +void bwt_search(bwt_t *bwt, const string &q); +// 每次扩展两步 +void bwt_search2(bwt_t *bwt, const string &q); + +#endif \ No newline at end of file diff --git a/common.h b/common.h deleted file mode 100644 index feef26c..0000000 --- a/common.h +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef COMMON_H_ -#define COMMON_H_ - -#define OCC_INTV_SHIFT 5 -#define OCC_INTERVAL (1LL << OCC_INTV_SHIFT) -#define OCC_INTV_MASK (OCC_INTERVAL - 1) -#define bwt_B00(b, k) ((b)->bwt[(k) >> 4] >> ((~(k) & 0xf) << 1) & 3) - -/* retrieve a character from the $-removed BWT string. Note that - * bwt_t::bwt is not exactly the BWT string and therefore this macro is - * called bwt_B0 instead of bwt_B */ -#define bwt_B0(b, k) (bwt_bwt(b, k) >> ((~(k) & 0xf) << 1) & 3) - -#define bwt_set_intv(bwt, c, ik) ((ik).x[0] = (bwt)->L2[(int)(c)] + 1, (ik).x[2] = (bwt)->L2[(int)(c) + 1] - (bwt)->L2[(int)(c)], (ik).x[1] = (bwt)->L2[3 - (c)] + 1, (ik).info = 0) - - -#define SA_BYTES_33(n_sa) ((((33 * (n_sa) + 7) / 8) & (~7L)) + 8) -#define SA_BYTES_40(n_sa) ((((40 * (n_sa) + 7) / 8) & (~7L)) + 8) - -#define xassert(cond, msg) \ - if ((cond) == 0) \ - _err_fatal_simple_core(__func__, msg) - -typedef uint64_t bwtint_t; - -double realtime(void); - -void bwt_set_sa_33(uint8_t *sa_arr, bwtint_t k, bwtint_t val); -bwtint_t bwt_get_sa_33(uint8_t *sa_arr, bwtint_t k); - -int main_sa(int argc, char **argv); -int main_fmtidx(int argc, char **argv); - -#endif \ No newline at end of file diff --git a/fmt_index.cpp b/fmt_index.cpp index b6ef064..4f6df06 100644 --- a/fmt_index.cpp +++ b/fmt_index.cpp @@ -8,112 +8,12 @@ #include #include -#include "common.h" +#include "util.h" +#include "fmt_index.h" using namespace std; -///* For general OCC_INTERVAL, the following is correct: -#define bwt_bwt(b, k) ((b)->bwt[(k) / OCC_INTERVAL * (OCC_INTERVAL / (sizeof(uint32_t) * 8 / 2) + sizeof(bwtint_t) / 4 * 4) + sizeof(bwtint_t) / 4 * 4 + (k) % OCC_INTERVAL / 16]) -#define bwt_occ_intv(b, k) ((b)->bwt + (k) / OCC_INTERVAL * (OCC_INTERVAL / (sizeof(uint32_t) * 8 / 2) + sizeof(bwtint_t) / 4 * 4)) -//*/ - -// The following two lines are ONLY correct when OCC_INTERVAL==0x80 -// #define bwt_bwt(b, k) ((b)->bwt[((k) >> 7 << 4) + sizeof(bwtint_t) + (((k) & 0x7f) >> 4)]) -// #define bwt_occ_intv(b, k) ((b)->bwt + ((k) >> 7 << 4)) - -/* retrieve a character from the $-removed BWT string. Note that - * bwt_t::bwt is not exactly the BWT string and therefore this macro is - * called bwt_B0 instead of bwt_B */ -#define bwt_B0(b, k) (bwt_bwt(b, k) >> ((~(k) & 0xf) << 1) & 3) - -#define bwt_set_intv(bwt, c, ik) ((ik).x[0] = (bwt)->L2[(int)(c)] + 1, (ik).x[2] = (bwt)->L2[(int)(c) + 1] - (bwt)->L2[(int)(c)], (ik).x[1] = (bwt)->L2[3 - (c)] + 1, (ik).info = 0) - -#define __occ_aux4(bwt, b) \ - ((bwt)->cnt_table[(b) & 0xff] + (bwt)->cnt_table[(b) >> 8 & 0xff] + (bwt)->cnt_table[(b) >> 16 & 0xff] + (bwt)->cnt_table[(b) >> 24]) - const char BASE[4] = {'A', 'C', 'G', 'T'}; -// base转成2bit值 -inline int bval(char b) -{ - if (b == 'A') - return 0; - if (b == 'C') - return 1; - if (b == 'G') - return 2; - if (b == 'T') - return 3; - return 4; -} - -// 互补碱基值 -inline int cbval(char b) -{ - return 3 - bval(b); -} - -struct bwtintv_t -{ - bwtint_t x[3], info; // x[0]表示正链位置,x[1]表示互补链位置,x[2]表示间隔长度,info 表示read的起始结束位置 -}; - -// 原始fm-index结构 -struct bwt_t -{ - bwtint_t primary; // S^{-1}(0), or the primary index of BWT - bwtint_t L2[5]; // C(), cumulative count - bwtint_t seq_len; // sequence length - bwtint_t bwt_size; // size of bwt, about seq_len/4 - uint32_t *bwt; // BWT - // occurance array, separated to two parts - uint32_t cnt_table[256]; - // suffix array - int sa_intv; - bwtint_t n_sa; - uint8_t *sa; -}; - -// fm-index, twice calc in one memory access -struct FMTIndex { - bwtint_t primary; // S^{-1}(0), or the primary index of BWT - bwtint_t sec_primary; // second primary line - bwtint_t L2[5]; // C(), cumulative count - bwtint_t seq_len; // sequence length - bwtint_t bwt_size; // size of bwt, about seq_len/4 - uint32_t *bwt; // BWT - // occurance array, separated to two parts - uint32_t cnt_table[5][256]; // 4对应原来的cnt_table,0,1,2,3,分别对应该碱基的扩展 - int sec_bcp; // base couple for sec primary line, AA=>0, AC=>1 ... TT=>15 - int first_base; // 序列的第一个碱基2bit的int类型,0,1,2,3 - int last_base; // dollar转换成的base - // suffix array - int sa_intv; - bwtint_t n_sa; - uint8_t *sa; -}; - -void _err_fatal_simple_core(const char *func, const char *msg) -{ - fprintf(stderr, "[%s] %s Abort!\n", func, msg); - abort(); -} - -// 读取最原始的bwt, -static bwtint_t fread_fix(FILE *fp, bwtint_t size, void *a) -{ // Mac/Darwin has a bug when reading data longer than 2GB. This function fixes this issue by reading data in small chunks - const int bufsize = 0x1000000; // 16M block - bwtint_t offset = 0; - while (size) - { - int x = bufsize < size ? bufsize : size; - if ((x = fread((uint8_t *)a + offset, 1, x, fp)) == 0) - break; - size -= x; - offset += x; - } - return offset; -} - // 求反向互补序列 string calc_reverse_seq(string &seq) { @@ -133,69 +33,15 @@ string calc_reverse_seq(string &seq) return rseq; } -static inline int __occ_aux(uint64_t y, int c) +// 打印32位整型数据中包含的pre-bwt:bwt +void print_base_uint32(uint32_t p) { - // reduce nucleotide counting to bits counting - y = ((c & 2) ? y : ~y) >> 1 & ((c & 1) ? y : ~y) & 0x5555555555555555ull; - // count the number of 1s in y - y = (y & 0x3333333333333333ull) + (y >> 2 & 0x3333333333333333ull); - return ((y + (y >> 4)) & 0xf0f0f0f0f0f0f0full) * 0x101010101010101ull >> 56; -} - -// k行(包含)之前,碱基c的累计总数, interval大于等于32才能正确计算 -bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, uint8_t c) -{ - bwtint_t n; - uint32_t *p, *end; - - if (k == bwt->seq_len) - return bwt->L2[c + 1] - bwt->L2[c]; - if (k == (bwtint_t)(-1)) - return 0; - k -= (k >= bwt->primary); // because $ is not in bwt - - // retrieve Occ at k/OCC_INTERVAL - n = ((bwtint_t *)(p = bwt_occ_intv(bwt, k)))[c]; - // cout << "bwt_occ - 1: " << (int)c << '\t' << k << '\t' << n << endl; - p += sizeof(bwtint_t); // jump to the start of the first BWT cell - - // calculate Occ up to the last k/32 - end = p + (((k >> 5) - ((k & ~OCC_INTV_MASK) >> 5)) << 1); - for (; p < end; p += 2) - n += __occ_aux((uint64_t)p[0] << 32 | p[1], c); - - //cout << "bwt_occ - 2: " << (int)c << '\t' << k << '\t' << n << endl; - // calculate Occ - n += __occ_aux(((uint64_t)p[0] << 32 | p[1]) & ~((1ull << ((~k & 31) << 1)) - 1), c); - if (c == 0) - n -= ~k & 31; // corrected for the masked bits - //cout << "bwt_occ - 3: " << (int)c << '\t' << k << '\t' << n << endl; - return n; -} - -// 这里的k是bwt矩阵里的行,比bwt字符串多1 -void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]) -{ - bwtint_t x; - uint32_t *p, tmp, *end; - if (k == (bwtint_t)(-1)) + for (int i = 30; i > 0; i -= 4) { - memset(cnt, 0, 4 * sizeof(bwtint_t)); - return; + int b1 = p >> i & 3; + int b2 = p >> (i - 2) & 3; + cout << BASE[b1] << BASE[b2] << endl; } - k -= (k >= bwt->primary); // because $ is not in bwt - p = bwt_occ_intv(bwt, k); - memcpy(cnt, p, 4 * sizeof(bwtint_t)); - p += sizeof(bwtint_t); // sizeof(bwtint_t) = 4*(sizeof(bwtint_t)/sizeof(uint32_t)) - end = p + ((k >> 4) - ((k & ~OCC_INTV_MASK) >> 4)); // this is the end point of the following loop - for (x = 0; p < end; ++p) - x += __occ_aux4(bwt, *p); - tmp = *p & ~((1U << ((~k & 15) << 1)) - 1); - x += __occ_aux4(bwt, tmp) - (~k & 15); // 这里多算了A,要减去 - cnt[0] += x & 0xff; - cnt[1] += x >> 8 & 0xff; - cnt[2] += x >> 16 & 0xff; - cnt[3] += x >> 24; } // 创建bwt矩阵 @@ -208,7 +54,6 @@ void create_bwt_mtx(string &seq) { sarr[i] = sarr[0].substr(i) + sarr[0].substr(0, i); } - std::sort(sarr, sarr + seq.size() + 1); // bwt matrix @@ -224,7 +69,6 @@ void create_bwt_mtx(string &seq) // cout << sarr[i].back(); // } // cout << endl; -// // cout << "pre bwt string" << endl; // for (int i = 0; i < sarr[0].size(); ++i) // { @@ -233,20 +77,7 @@ void create_bwt_mtx(string &seq) // cout << endl; } -// 计算一个字节构成的A,T,C,G序列,对应的每个碱基的个数,因为最多有4个相同的碱基,所以每次左移3位就行 -void bwt_gen_cnt_table(bwt_t *bwt) -{ - int i, j; - for (i = 0; i != 256; ++i) - { - uint32_t x = 0; - for (j = 0; j != 4; ++j) - x |= (((i & 3) == j) + ((i >> 2 & 3) == j) + ((i >> 4 & 3) == j) + (i >> 6 == j)) << (j << 3); - bwt->cnt_table[i] = x; - } -} - -// fmt-index的count table +// fmt-index的count table,4对应着bwt碱基的累积量,0,1,2,3分别对应着bwt是A,C,G,T,pre-bwt的累积量 void fmt_gen_cnt_table(FMTIndex *fmt) { int i, j, k; @@ -267,79 +98,7 @@ void fmt_gen_cnt_table(FMTIndex *fmt) } } -void print_base_uint32(uint32_t p) -{ - for (int i = 30; i > 0; i -= 4) - { - int b1 = p >> i & 3; - int b2 = p >> (i - 2) & 3; - cout << BASE[b1] << BASE[b2] << endl; - } -} -// 解析两bit碱基序列 -bwt_t *restore_bwt_str(const char *fn) -{ - bwt_t *bwt; - bwt = (bwt_t *)calloc(1, sizeof(bwt_t)); - FILE *fp = fopen(fn, "rb"); - char *buf; - - fseek(fp, 0, SEEK_END); - bwt->bwt_size = (ftell(fp) - sizeof(bwtint_t) * 5) >> 2; // 以32位word为单位计算的size - bwt->bwt = (uint32_t *)calloc(bwt->bwt_size, 4); - fseek(fp, 0, SEEK_SET); - fread(&bwt->primary, sizeof(bwtint_t), 1, fp); - fread(bwt->L2 + 1, sizeof(bwtint_t), 4, fp); - fread_fix(fp, bwt->bwt_size << 2, bwt->bwt); - bwt->seq_len = bwt->L2[4]; - - // buf = (char *)calloc(bwt->seq_len + 1, 1); - // for (bwtint_t i = 0; i < bwt->seq_len; ++i) - // { - // buf[i] = BASE[bwt->bwt[i >> 4] >> ((15 - (i & 15)) << 1) & 3]; - // cout << buf[i]; - // } - // cout << endl; - - fclose(fp); - bwt_gen_cnt_table(bwt); // 字节所能表示的各种碱基组合中,各个碱基的累积数量 - return bwt; -} - -// 根据原始的字符串bwt创建interval-bwt -void create_interval_occ_bwt(bwt_t *bwt) -{ - bwtint_t i, k, c[4], n_occ; - uint32_t *buf; - - n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; - bwt->bwt_size += n_occ * sizeof(bwtint_t); // the new size - buf = (uint32_t *)calloc(bwt->bwt_size, 4); // will be the new bwt - c[0] = c[1] = c[2] = c[3] = 0; - // 计算occ,生成naive bwt - for (i = k = 0; i < bwt->seq_len; ++i) - { - // cout << i << '\t'; - // cout << c[0] << ' ' << c[1] << ' ' << c[2] << ' ' << c[3] << endl; - if (i % OCC_INTERVAL == 0) - { - memcpy(buf + k, c, sizeof(bwtint_t) * 4); - k += sizeof(bwtint_t); // in fact: sizeof(bwtint_t)=4*(sizeof(bwtint_t)/4) 每个c包含多少个32位 - // cout << "i: " << i << "\tc: " << c[0] << '\t' << c[1] << '\t' << c[2] << '\t' << c[3] << endl; - } - if (i % 16 == 0) - buf[k++] = bwt->bwt[i / 16]; // 16 == sizeof(uint32_t)/2, 2个bit表示一个碱基 - ++c[bwt_B00(bwt, i)]; - } - // the last element - // cout << c[0] << '\t' << c[1] << '\t' << c[2] << '\t' << c[3] << endl; - memcpy(buf + k, c, sizeof(bwtint_t) * 4); - xassert(k + sizeof(bwtint_t) == bwt->bwt_size, "inconsistent bwt_size"); - // update bwt - free(bwt->bwt); - bwt->bwt = buf; -} // 根据interval-bwt创建fmt-index FMTIndex *create_fmt_from_bwt(bwt_t *bwt) @@ -523,160 +282,7 @@ FMTIndex *create_fmt_from_bwt(bwt_t *bwt) return fmt; } -// an analogy to bwt_occ4() but more efficient, requiring k <= l -void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4]) -{ - bwtint_t _k, _l; - _k = k - (k >= bwt->primary); - _l = l - (l >= bwt->primary); - if (_l >> OCC_INTV_SHIFT != _k >> OCC_INTV_SHIFT || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) - { - bwt_occ4(bwt, k, cntk); - bwt_occ4(bwt, l, cntl); - } - else - { - bwtint_t x, y; - uint32_t *p, tmp, *endk, *endl; - k -= (k >= bwt->primary); // because $ is not in bwt - l -= (l >= bwt->primary); - p = bwt_occ_intv(bwt, k); - memcpy(cntk, p, 4 * sizeof(bwtint_t)); - p += sizeof(bwtint_t); // sizeof(bwtint_t) = 4*(sizeof(bwtint_t)/sizeof(uint32_t)) - // prepare cntk[] - endk = p + ((k >> 4) - ((k & ~OCC_INTV_MASK) >> 4)); - endl = p + ((l >> 4) - ((l & ~OCC_INTV_MASK) >> 4)); - for (x = 0; p < endk; ++p) - x += __occ_aux4(bwt, *p); - y = x; - tmp = *p & ~((1U << ((~k & 15) << 1)) - 1); - x += __occ_aux4(bwt, tmp) - (~k & 15); - // calculate cntl[] and finalize cntk[] - for (; p < endl; ++p) - y += __occ_aux4(bwt, *p); - tmp = *p & ~((1U << ((~l & 15) << 1)) - 1); - y += __occ_aux4(bwt, tmp) - (~l & 15); - memcpy(cntl, cntk, 4 * sizeof(bwtint_t)); - cntk[0] += x & 0xff; - cntk[1] += x >> 8 & 0xff; - cntk[2] += x >> 16 & 0xff; - cntk[3] += x >> 24; - cntl[0] += y & 0xff; - cntl[1] += y >> 8 & 0xff; - cntl[2] += y >> 16 & 0xff; - cntl[3] += y >> 24; - } -} -void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back) -{ - bwtint_t tk[4], tl[4]; - int i; - bwt_2occ4(bwt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], tk, tl); // tk表示在k行之前所有各个碱基累积出现次数,tl表示在l行之前的累积 - // 这里是反向扩展 - for (i = 0; i != 4; ++i) - { - ok[i].x[!is_back] = bwt->L2[i] + 1 + tk[i]; // 起始行位置,互补链 - ok[i].x[2] = tl[i] - tk[i]; // 间隔 - } - // 因为计算的是互补碱基,所以3对应着0,2对应1,下边是正向扩展 - ok[3].x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= bwt->primary && ik->x[!is_back] + ik->x[2] - 1 >= bwt->primary); - ok[2].x[is_back] = ok[3].x[is_back] + ok[3].x[2]; - ok[1].x[is_back] = ok[2].x[is_back] + ok[2].x[2]; - ok[0].x[is_back] = ok[1].x[is_back] + ok[1].x[2]; -} - -// 利用bwt搜索seed,完整搜索,只需要单向搜索 -void bwt_search(bwt_t *bwt, const string &q) -{ - bwtintv_t ik, ok[4]; - int i, j, c, x = 0; - - bwt_set_intv(bwt, bval(q[x]), ik); - ik.info = x + 1; - cout << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl; - - for (i = x + 1; i < q.size(); ++i) - { - if (bval(q[i]) < 4) - { - c = cbval(q[i]); - bwt_extend(bwt, &ik, ok, 0); - ik = ok[c]; - ik.info = i + 1; - cout << "bwt-1: " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl; - } - } -} - -// 扩展两次 -void bwt_extend2(const bwt_t *bwt, bwtintv_t *ik, bwtintv_t ok[4], int is_back, int c1) -{ - bwtint_t tk[4], tl[4]; - int i; - bwt_2occ4(bwt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], tk, tl); // tk表示在k行之前所有各个碱基累积出现次数,tl表示在l行之前的累积 - // 这里是反向扩展 - for (i = 0; i != 4; ++i) - { - ok[i].x[!is_back] = bwt->L2[i] + 1 + tk[i]; // 起始行位置,互补链 - ok[i].x[2] = tl[i] - tk[i]; // 间隔 - } - // 因为计算的是互补碱基,所以3对应着0,2对应1,下边是正向扩展 - ok[3].x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= bwt->primary && ik->x[!is_back] + ik->x[2] - 1 >= bwt->primary); - ok[2].x[is_back] = ok[3].x[is_back] + ok[3].x[2]; - ok[1].x[is_back] = ok[2].x[is_back] + ok[2].x[2]; - ok[0].x[is_back] = ok[1].x[is_back] + ok[1].x[2]; - - *ik = ok[c1]; - bwt_2occ4(bwt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], tk, tl); - for (i = 0; i != 4; ++i) - { - ok[i].x[!is_back] = bwt->L2[i] + 1 + tk[i]; // 起始行位置,互补链 - ok[i].x[2] = tl[i] - tk[i]; // 间隔 - } - // 因为计算的是互补碱基,所以3对应着0,2对应1,下边是正向扩展 - ok[3].x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= bwt->primary && ik->x[!is_back] + ik->x[2] - 1 >= bwt->primary); - ok[2].x[is_back] = ok[3].x[is_back] + ok[3].x[2]; - ok[1].x[is_back] = ok[2].x[is_back] + ok[2].x[2]; - ok[0].x[is_back] = ok[1].x[is_back] + ok[1].x[2]; -} - -void bwt_search2(bwt_t *bwt, const string &q) -{ - bwtintv_t ik, ok[4]; - int i, j, c1, c2, x = 0; - - bwt_set_intv(bwt, bval(q[x]), ik); - ik.info = x + 1; - cout << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl; - - for (i = x + 1; i + 1 < q.size(); i += 2) - { - if (bval(q[i]) < 4 && bval(q[i+1]) < 4) - { - - c1 = cbval(q[i]); - c2 = cbval(q[i + 1]); - - bwt_extend2(bwt, &ik, ok, 0, c1); - - ik = ok[c2]; - ik.info = i + 1; - cout << "bwt-2: " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl; - } - else - { - break; - } - } - if (i < q.size() && bval(q[i]) < 4) { // 最后一次扩展 - c1 = cbval(q[i]); - bwt_extend(bwt, &ik, ok, 0); - ik = ok[c1]; - ik.info = i + 1; - cout << "bwt-2: " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl; - } -} #define fmt_set_intv(fmt, c, ik) ((ik).x[0] = (fmt)->L2[(int)(c)] + 1, (ik).x[2] = (fmt)->L2[(int)(c) + 1] - (fmt)->L2[(int)(c)], (ik).x[1] = (fmt)->L2[3 - (c)] + 1, (ik).info = 0) #define fmt_occ_intv(b, k) ((b)->bwt + (k) / OCC_INTERVAL * (OCC_INTERVAL / 8 + 20)) diff --git a/fmt_index.h b/fmt_index.h new file mode 100644 index 0000000..6eef697 --- /dev/null +++ b/fmt_index.h @@ -0,0 +1,26 @@ +#ifndef FMT_INDEX_H_ +#define FMT_INDEX_H_ + +#include "bwt.h" + +// fm-index, extend twice in one search step (one memory access) +struct FMTIndex +{ + bwtint_t primary; // S^{-1}(0), or the primary index of BWT + bwtint_t sec_primary; // second primary line + bwtint_t L2[5]; // C(), cumulative count + bwtint_t seq_len; // sequence length + bwtint_t bwt_size; // size of bwt, about seq_len/4 + uint32_t *bwt; // BWT + // occurance array, separated to two parts + uint32_t cnt_table[5][256]; // 4对应原来的cnt_table,0,1,2,3,分别对应该碱基的扩展 + int sec_bcp; // base couple for sec primary line, AA=>0, AC=>1 ... TT=>15 + int first_base; // 序列的第一个碱基2bit的int类型,0,1,2,3 + int last_base; // dollar转换成的base + // suffix array + int sa_intv; + bwtint_t n_sa; + uint8_t *sa; +}; + +#endif \ No newline at end of file diff --git a/main.cpp b/main.cpp index 7fdb743..6d763da 100644 --- a/main.cpp +++ b/main.cpp @@ -4,9 +4,13 @@ #include #include -#include "common.h" using namespace std; +// 各个分部的入口函数 +int main_sa(int argc, char **argv); +int main_fmtidx(int argc, char **argv); + +// 整个程序的入口 int main(int argc, char **argv) { // main_sa(argc, argv); diff --git a/sa.cpp b/sa.cpp index 45aadf7..044086a 100644 --- a/sa.cpp +++ b/sa.cpp @@ -2,20 +2,12 @@ #include #include #include -#include -#include "common.h" +#include "util.h" +#include "sa.h" using namespace std; -double realtime(void) -{ - struct timeval tp; - struct timezone tzp; - gettimeofday(&tp, &tzp); - return tp.tv_sec + tp.tv_usec * 1e-6; -} - inline void bwt_set_sa_33(uint8_t *sa_arr, bwtint_t k, bwtint_t val) { const bwtint_t block_idx = (k >> 3) * 33; // 8个数为一组,共享33个字节 diff --git a/sa.h b/sa.h new file mode 100644 index 0000000..3c972a8 --- /dev/null +++ b/sa.h @@ -0,0 +1,14 @@ +#ifndef SA_H_ +#define SA_H_ + +#include "util.h" + +void bwt_set_sa_33(uint8_t *sa_arr, bwtint_t k, bwtint_t val); +bwtint_t bwt_get_sa_33(uint8_t *sa_arr, bwtint_t k); + +// 用33个bit来表示bwt行信息,所需的总字节数 +#define SA_BYTES_33(n_sa) ((((33 * (n_sa) + 7) / 8) & (~7L)) + 8) +// 用40个bit来表示bwt行信息,所需的总字节数 +#define SA_BYTES_40(n_sa) ((((40 * (n_sa) + 7) / 8) & (~7L)) + 8) + +#endif \ No newline at end of file diff --git a/util.cpp b/util.cpp new file mode 100644 index 0000000..c492066 --- /dev/null +++ b/util.cpp @@ -0,0 +1,56 @@ +#include +#include +#include + +#include "util.h" + +// base转成2bit值 +int bval(char b) +{ + if (b == 'A') + return 0; + if (b == 'C') + return 1; + if (b == 'G') + return 2; + if (b == 'T') + return 3; + return 4; +} + +// 互补碱基值 +int cbval(char b) +{ + return 3 - bval(b); +} + +double realtime(void) +{ + struct timeval tp; + struct timezone tzp; + gettimeofday(&tp, &tzp); + return tp.tv_sec + tp.tv_usec * 1e-6; +} + +// 打印故障信息,并终止程序 +void _err_fatal_simple_core(const char *func, const char *msg) +{ + fprintf(stderr, "[%s] %s Abort!\n", func, msg); + abort(); +} + +// 读取数据 +bwtint_t fread_fix(FILE *fp, bwtint_t size, void *a) +{ /* Mac/Darwin has a bug when reading data longer than 2GB. This function fixes this issue by reading data in small chunks */ + const int bufsize = 0x1000000; // 16M block + bwtint_t offset = 0; + while (size) + { + int x = bufsize < size ? bufsize : size; + if ((x = fread((uint8_t *)a + offset, 1, x, fp)) == 0) + break; + size -= x; + offset += x; + } + return offset; +} \ No newline at end of file diff --git a/util.h b/util.h new file mode 100644 index 0000000..12e98fe --- /dev/null +++ b/util.h @@ -0,0 +1,31 @@ +#ifndef COMMON_H_ +#define COMMON_H_ + +#include +#include + +typedef uint64_t bwtint_t; + +#define xassert(cond, msg) \ + if ((cond) == 0) \ + _err_fatal_simple_core(__func__, msg) + + double realtime(void); + +// 在fm-indexv(或者bwt)查找过程中,记录结果 +struct bwtintv_t +{ + bwtint_t x[3], info; // x[0]表示正链位置,x[1]表示互补链位置,x[2]表示间隔长度,info 表示read的起始结束位置 +}; + +// 读取二进制数据 +bwtint_t fread_fix(FILE *fp, bwtint_t size, void *a); +// 给出问题信息并终止程序 +void _err_fatal_simple_core(const char *func, const char *msg); + +// base转成2bit值 +int bval(char b); +// 互补碱基值 +int cbval(char b); + +#endif \ No newline at end of file