Compare commits

...

2 Commits

12 changed files with 963 additions and 708 deletions

5
.gitignore vendored
View File

@ -1,3 +1,8 @@
# specific for bwa_perf
*.txt
fmtidx
*.fmt
# ---> C
# Prerequisites
*.d

View File

@ -1,10 +1,11 @@
CC= g++
CFLAGS= -g -Wall -Wno-unused-function -O2
NOWARN= -Wno-unused-result -Wno-unused-function
CFLAGS= -g -Wall $(NOWARN) -O2
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
SHOW_PERF= -DSHOW_PERF
AR= ar
DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) $(SHOW_PERF)
AOBJS= sa.o fmt_index.o
AOBJS= util.o sa.o fmt_index.o bwt.o
PROG= fmtidx
INCLUDES=
LIBS= -lm -lz -lpthread

323
bwt.cpp 100644
View File

@ -0,0 +1,323 @@
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <string>
#include <iostream>
#include "util.h"
#include "bwt.h"
using namespace std;
void bwt_dump_bwt(const char *fn, const bwt_t *bwt)
{
FILE *fp;
fp = xopen(fn, "wb");
err_fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp);
err_fwrite(bwt->L2 + 1, sizeof(bwtint_t), 4, fp);
err_fwrite(bwt->bwt, 4, bwt->bwt_size, fp);
err_fflush(fp);
err_fclose(fp);
}
// 计算一个字节构成的T,G,C,A序列对应的每个碱基的个数(按T,G,C,A顺序存储在32位整数中每个占8位)
void bwt_gen_cnt_table(bwt_t *bwt)
{
int i, j;
for (i = 0; i != 256; ++i)
{
uint32_t x = 0;
for (j = 0; j != 4; ++j)
x |= (((i & 3) == j) + ((i >> 2 & 3) == j) + ((i >> 4 & 3) == j) + (i >> 6 == j)) << (j << 3);
bwt->cnt_table[i] = x;
}
}
// 解析两bit的bwt碱基序列这个只有bwt str可以包含也可不包含occ check point
bwt_t *restore_bwt(const char *fn)
{
bwt_t *bwt;
bwt = (bwt_t *)calloc(1, sizeof(bwt_t));
FILE *fp = fopen(fn, "rb");
fseek(fp, 0, SEEK_END);
bwt->bwt_size = (ftell(fp) - sizeof(bwtint_t) * 5) >> 2; // 以32位word为单位计算的size
bwt->bwt = (uint32_t *)calloc(bwt->bwt_size, 4);
fseek(fp, 0, SEEK_SET);
fread(&bwt->primary, sizeof(bwtint_t), 1, fp);
fread(bwt->L2 + 1, sizeof(bwtint_t), 4, fp);
fread_fix(fp, bwt->bwt_size << 2, bwt->bwt);
bwt->seq_len = bwt->L2[4];
// char *buf = (char *)calloc(bwt->seq_len + 1, 1);
// for (bwtint_t i = 0; i < bwt->seq_len; ++i)
// {
// buf[i] = BASE[bwt->bwt[i >> 4] >> ((15 - (i & 15)) << 1) & 3];
// cout << buf[i];
// }
// cout << endl;
fclose(fp);
bwt_gen_cnt_table(bwt); // 字节所能表示的各种碱基组合中,各个碱基的累积数量
return bwt;
}
// 根据原始的字符串bwt创建interval-bwt
void create_interval_occ_bwt(bwt_t *bwt)
{
bwtint_t i, k, c[4], n_occ;
uint32_t *buf;
n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1;
bwt->bwt_size += n_occ * sizeof(bwtint_t); // the new size
buf = (uint32_t *)calloc(bwt->bwt_size, 4); // will be the new bwt
c[0] = c[1] = c[2] = c[3] = 0;
// 计算occ生成naive bwt
for (i = k = 0; i < bwt->seq_len; ++i)
{
// cout << i << '\t';
// cout << c[0] << ' ' << c[1] << ' ' << c[2] << ' ' << c[3] << endl;
if (i % OCC_INTERVAL == 0)
{
memcpy(buf + k, c, sizeof(bwtint_t) * 4);
k += sizeof(bwtint_t); // in fact: sizeof(bwtint_t)=4*(sizeof(bwtint_t)/4) 每个c包含多少个32位
// cout << "i: " << i << "\tc: " << c[0] << '\t' << c[1] << '\t' << c[2] << '\t' << c[3] << endl;
}
if (i % 16 == 0)
buf[k++] = bwt->bwt[i / 16]; // 16 == sizeof(uint32_t)/2, 2个bit表示一个碱基
++c[bwt_B00(bwt, i)];
}
// the last element
// cout << c[0] << '\t' << c[1] << '\t' << c[2] << '\t' << c[3] << endl;
memcpy(buf + k, c, sizeof(bwtint_t) * 4);
xassert(k + sizeof(bwtint_t) == bwt->bwt_size, "inconsistent bwt_size");
// update bwt
free(bwt->bwt);
bwt->bwt = buf;
}
// 对64位整型数据y计算碱基c的累积个数
static inline int __occ_aux(uint64_t y, int c)
{
// reduce nucleotide counting to bits counting
y = ((c & 2) ? y : ~y) >> 1 & ((c & 1) ? y : ~y) & 0x5555555555555555ull;
// count the number of 1s in y
y = (y & 0x3333333333333333ull) + (y >> 2 & 0x3333333333333333ull);
return ((y + (y >> 4)) & 0xf0f0f0f0f0f0f0full) * 0x101010101010101ull >> 56;
}
// k行(包含)之前碱基c的累计总数, interval大于等于32才能正确计算
bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, uint8_t c)
{
bwtint_t n;
uint32_t *p, *end;
if (k == bwt->seq_len)
return bwt->L2[c + 1] - bwt->L2[c];
if (k == (bwtint_t)(-1))
return 0;
k -= (k >= bwt->primary); // because $ is not in bwt
// retrieve Occ at k/OCC_INTERVAL
n = ((bwtint_t *)(p = bwt_occ_intv(bwt, k)))[c];
// cout << "bwt_occ - 1: " << (int)c << '\t' << k << '\t' << n << endl;
p += sizeof(bwtint_t); // jump to the start of the first BWT cell
// calculate Occ up to the last k/32
end = p + (((k >> 5) - ((k & ~OCC_INTV_MASK) >> 5)) << 1);
for (; p < end; p += 2)
n += __occ_aux((uint64_t)p[0] << 32 | p[1], c);
// cout << "bwt_occ - 2: " << (int)c << '\t' << k << '\t' << n << endl;
// calculate Occ
n += __occ_aux(((uint64_t)p[0] << 32 | p[1]) & ~((1ull << ((~k & 31) << 1)) - 1), c);
if (c == 0)
n -= ~k & 31; // corrected for the masked bits
// cout << "bwt_occ - 3: " << (int)c << '\t' << k << '\t' << n << endl;
return n;
}
// 统计k行bwt mtx行之前4种碱基累积数量这里的k是bwt矩阵里的行比bwt字符串多1
void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4])
{
bwtint_t x;
uint32_t *p, tmp, *end;
if (k == (bwtint_t)(-1))
{
memset(cnt, 0, 4 * sizeof(bwtint_t));
return;
}
k -= (k >= bwt->primary); // because $ is not in bwt
p = bwt_occ_intv(bwt, k);
memcpy(cnt, p, 4 * sizeof(bwtint_t));
p += sizeof(bwtint_t); // sizeof(bwtint_t) = 4*(sizeof(bwtint_t)/sizeof(uint32_t))
end = p + ((k >> 4) - ((k & ~OCC_INTV_MASK) >> 4)); // this is the end point of the following loop
for (x = 0; p < end; ++p)
x += __occ_aux4(bwt, *p);
tmp = *p & ~((1U << ((~k & 15) << 1)) - 1);
x += __occ_aux4(bwt, tmp) - (~k & 15); // 这里多算了A要减去
cnt[0] += x & 0xff;
cnt[1] += x >> 8 & 0xff;
cnt[2] += x >> 16 & 0xff;
cnt[3] += x >> 24;
}
// an analogy to bwt_occ4() but more efficient, requiring k <= l
void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4])
{
bwtint_t _k, _l;
_k = k - (k >= bwt->primary);
_l = l - (l >= bwt->primary);
if (_l >> OCC_INTV_SHIFT != _k >> OCC_INTV_SHIFT || k == (bwtint_t)(-1) || l == (bwtint_t)(-1))
{
bwt_occ4(bwt, k, cntk);
bwt_occ4(bwt, l, cntl);
}
else
{
bwtint_t x, y;
uint32_t *p, tmp, *endk, *endl;
k -= (k >= bwt->primary); // because $ is not in bwt
l -= (l >= bwt->primary);
p = bwt_occ_intv(bwt, k);
memcpy(cntk, p, 4 * sizeof(bwtint_t));
p += sizeof(bwtint_t); // sizeof(bwtint_t) = 4*(sizeof(bwtint_t)/sizeof(uint32_t))
// prepare cntk[]
endk = p + ((k >> 4) - ((k & ~OCC_INTV_MASK) >> 4));
endl = p + ((l >> 4) - ((l & ~OCC_INTV_MASK) >> 4));
for (x = 0; p < endk; ++p)
x += __occ_aux4(bwt, *p);
y = x;
tmp = *p & ~((1U << ((~k & 15) << 1)) - 1);
x += __occ_aux4(bwt, tmp) - (~k & 15);
// calculate cntl[] and finalize cntk[]
for (; p < endl; ++p)
y += __occ_aux4(bwt, *p);
tmp = *p & ~((1U << ((~l & 15) << 1)) - 1);
y += __occ_aux4(bwt, tmp) - (~l & 15);
memcpy(cntl, cntk, 4 * sizeof(bwtint_t));
cntk[0] += x & 0xff;
cntk[1] += x >> 8 & 0xff;
cntk[2] += x >> 16 & 0xff;
cntk[3] += x >> 24;
cntl[0] += y & 0xff;
cntl[1] += y >> 8 & 0xff;
cntl[2] += y >> 16 & 0xff;
cntl[3] += y >> 24;
}
}
void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back)
{
bwtint_t tk[4], tl[4];
int i;
bwt_2occ4(bwt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], tk, tl); // tk表示在k行之前所有各个碱基累积出现次数tl表示在l行之前的累积
// cout << "bwt-1-d: " << ik->x[!is_back] - 1 << '\t' << tk[0] << '\t' << tk[1] << '\t' << tk[2] << '\t' << tk[3] << endl;
// cout << "bwt-1-d: " << ik->x[!is_back] - 1 + ik->x[2] << '\t' << tl[0] << '\t' << tl[1] << '\t' << tl[2] << '\t' << tl[3] << endl;
// 这里是反向扩展
for (i = 0; i != 4; ++i)
{
ok[i].x[!is_back] = bwt->L2[i] + 1 + tk[i]; // 起始行位置,互补链
ok[i].x[2] = tl[i] - tk[i]; // 间隔
}
// 因为计算的是互补碱基所以3对应着0,2对应1下边是正向扩展
ok[3].x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= bwt->primary && ik->x[!is_back] + ik->x[2] - 1 >= bwt->primary);
ok[2].x[is_back] = ok[3].x[is_back] + ok[3].x[2];
ok[1].x[is_back] = ok[2].x[is_back] + ok[2].x[2];
ok[0].x[is_back] = ok[1].x[is_back] + ok[1].x[2];
}
// 利用bwt搜索seed完整搜索只需要单向搜索
void bwt_search(bwt_t *bwt, const string &q)
{
bwtintv_t ik, ok[4];
int i, j, c, x = 0;
bwt_set_intv(bwt, bval(q[x]), ik);
ik.info = x + 1;
cout << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl;
for (i = x + 1; i < q.size(); ++i)
{
if (bval(q[i]) < 4)
{
c = cbval(q[i]);
bwt_extend(bwt, &ik, ok, 0);
ik = ok[c];
ik.info = i + 1;
cout << "bwt-1: " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl;
}
}
}
// 扩展两次
void bwt_extend2(const bwt_t *bwt, bwtintv_t *ik, bwtintv_t ok[4], int is_back, int c1)
{
bwtint_t tk[4], tl[4];
int i;
bwt_2occ4(bwt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], tk, tl); // tk表示在k行之前所有各个碱基累积出现次数tl表示在l行之前的累积
// 这里是反向扩展
for (i = 0; i != 4; ++i)
{
ok[i].x[!is_back] = bwt->L2[i] + 1 + tk[i]; // 起始行位置,互补链
ok[i].x[2] = tl[i] - tk[i]; // 间隔
}
// 因为计算的是互补碱基所以3对应着0,2对应1下边是正向扩展
ok[3].x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= bwt->primary && ik->x[!is_back] + ik->x[2] - 1 >= bwt->primary);
ok[2].x[is_back] = ok[3].x[is_back] + ok[3].x[2];
ok[1].x[is_back] = ok[2].x[is_back] + ok[2].x[2];
ok[0].x[is_back] = ok[1].x[is_back] + ok[1].x[2];
*ik = ok[c1];
bwt_2occ4(bwt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], tk, tl);
for (i = 0; i != 4; ++i)
{
ok[i].x[!is_back] = bwt->L2[i] + 1 + tk[i]; // 起始行位置,互补链
ok[i].x[2] = tl[i] - tk[i]; // 间隔
}
// 因为计算的是互补碱基所以3对应着0,2对应1下边是正向扩展
ok[3].x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= bwt->primary && ik->x[!is_back] + ik->x[2] - 1 >= bwt->primary);
ok[2].x[is_back] = ok[3].x[is_back] + ok[3].x[2];
ok[1].x[is_back] = ok[2].x[is_back] + ok[2].x[2];
ok[0].x[is_back] = ok[1].x[is_back] + ok[1].x[2];
}
// 每次扩展两步
void bwt_search2(bwt_t *bwt, const string &q)
{
bwtintv_t ik, ok[4];
int i, j, c1, c2, x = 0;
bwt_set_intv(bwt, bval(q[x]), ik);
ik.info = x + 1;
cout << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl;
for (i = x + 1; i + 1 < q.size(); i += 2)
{
if (bval(q[i]) < 4 && bval(q[i + 1]) < 4)
{
c1 = cbval(q[i]);
c2 = cbval(q[i + 1]);
bwt_extend2(bwt, &ik, ok, 0, c1);
ik = ok[c2];
ik.info = i + 1;
cout << "bwt-2: " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl;
}
else
{
break;
}
}
if (i < q.size() && bval(q[i]) < 4)
{ // 最后一次扩展
c1 = cbval(q[i]);
bwt_extend(bwt, &ik, ok, 0);
ik = ok[c1];
ik.info = i + 1;
cout << "bwt-2: " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl;
}
}

68
bwt.h 100644
View File

@ -0,0 +1,68 @@
#ifndef BWT_H_
#define BWT_H_
#include <stdint.h>
#include <string>
#include "util.h"
using std::string;
// occ间隔对应的右移位数5表示间隔32行保存一次
#define OCC_INTV_SHIFT 5
#define OCC_INTERVAL (1LL << OCC_INTV_SHIFT)
#define OCC_INTV_MASK (OCC_INTERVAL - 1)
// 从最原始的bwt碱基串里获取k行对应的碱基这里的bwt不包括occ check point数据
#define bwt_B00(b, k) ((b)->bwt[(k) >> 4] >> ((~(k) & 0xf) << 1) & 3)
/* retrieve a character from the $-removed BWT string. Note that
* bwt_t::bwt is not exactly the BWT string and therefore this macro is
* called bwt_B0 instead of bwt_B */
// 从构建完成的bwt包含occ check point获取k行不含$这里的k不输出bwt mtx的行是bwt字符串的行的碱基
#define bwt_B0(b, k) (bwt_bwt(b, k) >> ((~(k) & 0xf) << 1) & 3)
// 获取碱基c待查找序列的首个碱基和对应的互补碱基对应的行以及间隔
#define bwt_set_intv(bwt, c, ik) ((ik).x[0] = (bwt)->L2[(int)(c)] + 1, (ik).x[2] = (bwt)->L2[(int)(c) + 1] - (bwt)->L2[(int)(c)], (ik).x[1] = (bwt)->L2[3 - (c)] + 1, (ik).info = 0)
// The following two lines are ONLY correct when OCC_INTERVAL==0x80
// #define bwt_bwt(b, k) ((b)->bwt[((k) >> 7 << 4) + sizeof(bwtint_t) + (((k) & 0x7f) >> 4)])
// #define bwt_occ_intv(b, k) ((b)->bwt + ((k) >> 7 << 4))
///* For general OCC_INTERVAL, the following is correct:
// k行对应的bwt str碱基对应的check point bwt str数据起始地址
#define bwt_bwt(b, k) ((b)->bwt[(k) / OCC_INTERVAL * (OCC_INTERVAL / (sizeof(uint32_t) * 8 / 2) + sizeof(bwtint_t) / 4 * 4) + sizeof(bwtint_t) / 4 * 4 + (k) % OCC_INTERVAL / 16])
// k行bwt str行不包含$对应的check point occ数据起始地址小于k且是OCC_INTERVAL的整数倍
#define bwt_occ_intv(b, k) ((b)->bwt + (k) / OCC_INTERVAL * (OCC_INTERVAL / (sizeof(uint32_t) * 8 / 2) + sizeof(bwtint_t) / 4 * 4))
// 字节b中包含的T G C A按顺序保存在32位整数里每个占8bit的数量
#define __occ_aux4(bwt, b) \
((bwt)->cnt_table[(b) & 0xff] + (bwt)->cnt_table[(b) >> 8 & 0xff] + (bwt)->cnt_table[(b) >> 16 & 0xff] + (bwt)->cnt_table[(b) >> 24])
// 原始fm-index (bwt)结构
struct bwt_t
{
bwtint_t primary; // S^{-1}(0), or the primary index of BWT
bwtint_t L2[5]; // C(), cumulative count
bwtint_t seq_len; // sequence length
bwtint_t bwt_size; // size of bwt, about seq_len/4
uint32_t *bwt; // BWT
// occurance array, separated to two parts
uint32_t cnt_table[256];
// suffix array
int sa_intv;
bwtint_t n_sa;
uint8_t *sa;
};
// k行(包含bwt mtx行)之前碱基c的累计总数, interval大于等于32才能正确计算
bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, uint8_t c);
// 统计k行bwt mtx行包含k行本身之前4种碱基累积数量这里的k是bwt矩阵里的行比bwt字符串多1
void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]);
// 解析两bit的bwt碱基序列
bwt_t *restore_bwt(const char *fn);
// 根据原始的字符串bwt创建interval-bwt
void create_interval_occ_bwt(bwt_t *bwt);
// 利用bwt搜索seed完整搜索只需要单向搜索
void bwt_search(bwt_t *bwt, const string &q);
// 每次扩展两步
void bwt_search2(bwt_t *bwt, const string &q);
#endif

View File

@ -1,34 +0,0 @@
#ifndef COMMON_H_
#define COMMON_H_
#define OCC_INTV_SHIFT 5
#define OCC_INTERVAL (1LL << OCC_INTV_SHIFT)
#define OCC_INTV_MASK (OCC_INTERVAL - 1)
#define bwt_B00(b, k) ((b)->bwt[(k) >> 4] >> ((~(k) & 0xf) << 1) & 3)
/* retrieve a character from the $-removed BWT string. Note that
* bwt_t::bwt is not exactly the BWT string and therefore this macro is
* called bwt_B0 instead of bwt_B */
#define bwt_B0(b, k) (bwt_bwt(b, k) >> ((~(k) & 0xf) << 1) & 3)
#define bwt_set_intv(bwt, c, ik) ((ik).x[0] = (bwt)->L2[(int)(c)] + 1, (ik).x[2] = (bwt)->L2[(int)(c) + 1] - (bwt)->L2[(int)(c)], (ik).x[1] = (bwt)->L2[3 - (c)] + 1, (ik).info = 0)
#define SA_BYTES_33(n_sa) ((((33 * (n_sa) + 7) / 8) & (~7L)) + 8)
#define SA_BYTES_40(n_sa) ((((40 * (n_sa) + 7) / 8) & (~7L)) + 8)
#define xassert(cond, msg) \
if ((cond) == 0) \
_err_fatal_simple_core(__func__, msg)
typedef uint64_t bwtint_t;
double realtime(void);
void bwt_set_sa_33(uint8_t *sa_arr, bwtint_t k, bwtint_t val);
bwtint_t bwt_get_sa_33(uint8_t *sa_arr, bwtint_t k);
int main_sa(int argc, char **argv);
int main_fmtidx(int argc, char **argv);
#endif

File diff suppressed because it is too large Load Diff

34
fmt_index.h 100644
View File

@ -0,0 +1,34 @@
#ifndef FMT_INDEX_H_
#define FMT_INDEX_H_
#include "bwt.h"
// 获取碱基c待查找序列的首个碱基和对应的互补碱基对应的行以及间隔
#define fmt_set_intv(fmt, c, ik) ((ik).x[0] = (fmt)->L2[(int)(c)] + 1, (ik).x[2] = (fmt)->L2[(int)(c) + 1] - (fmt)->L2[(int)(c)], (ik).x[1] = (fmt)->L2[3 - (c)] + 1, (ik).info = 0)
// k行bwt str行不包含$对应的check point occ数据起始地址小于k且是OCC_INTERVAL的整数倍
#define fmt_occ_intv(b, k) ((b)->bwt + (k) / OCC_INTERVAL * (OCC_INTERVAL / 8 + 20))
// 字节val中包含bwt base为b的pre-bwt中T G C A按顺序保存在32位整数里每个占8bit的数量
#define __fmt_occ_e2_aux4(fmt, b, val) \
((fmt)->cnt_table[(b)][(val) & 0xff] + (fmt)->cnt_table[b][(val) >> 8 & 0xff] + (fmt)->cnt_table[b][(val) >> 16 & 0xff] + (fmt)->cnt_table[b][(val) >> 24])
// fm-index, extend twice in one search step (one memory access)
struct FMTIndex
{
bwtint_t primary; // S^{-1}(0), or the primary index of BWT
bwtint_t sec_primary; // second primary line
bwtint_t L2[5]; // C(), cumulative count
bwtint_t seq_len; // sequence length
bwtint_t bwt_size; // size of bwt, about seq_len/4
uint32_t *bwt; // BWT
// occurance array, separated to two parts
uint32_t cnt_table[5][256]; // 4对应原来的cnt_table0,1,2,3,分别对应该碱基的扩展
uint8_t sec_bcp; // base couple for sec primary line, AA=>0, AC=>1 ... TT=>15
uint8_t first_base; // 序列的第一个碱基2bit的int类型0,1,2,3
uint8_t last_base; // dollar转换成的base
// suffix array
int sa_intv;
bwtint_t n_sa;
uint8_t *sa;
};
#endif

View File

@ -4,9 +4,13 @@
#include <vector>
#include <sys/time.h>
#include "common.h"
using namespace std;
// 各个分部的入口函数
int main_sa(int argc, char **argv);
int main_fmtidx(int argc, char **argv);
// 整个程序的入口
int main(int argc, char **argv)
{
// main_sa(argc, argv);

12
sa.cpp
View File

@ -2,20 +2,12 @@
#include <stdint.h>
#include <stdlib.h>
#include <vector>
#include <sys/time.h>
#include "common.h"
#include "util.h"
#include "sa.h"
using namespace std;
double realtime(void)
{
struct timeval tp;
struct timezone tzp;
gettimeofday(&tp, &tzp);
return tp.tv_sec + tp.tv_usec * 1e-6;
}
inline void bwt_set_sa_33(uint8_t *sa_arr, bwtint_t k, bwtint_t val)
{
const bwtint_t block_idx = (k >> 3) * 33; // 8个数为一组共享33个字节

14
sa.h 100644
View File

@ -0,0 +1,14 @@
#ifndef SA_H_
#define SA_H_
#include "util.h"
void bwt_set_sa_33(uint8_t *sa_arr, bwtint_t k, bwtint_t val);
bwtint_t bwt_get_sa_33(uint8_t *sa_arr, bwtint_t k);
// 用33个bit来表示bwt行信息所需的总字节数
#define SA_BYTES_33(n_sa) ((((33 * (n_sa) + 7) / 8) & (~7L)) + 8)
// 用40个bit来表示bwt行信息所需的总字节数
#define SA_BYTES_40(n_sa) ((((40 * (n_sa) + 7) / 8) & (~7L)) + 8)
#endif

154
util.cpp 100644
View File

@ -0,0 +1,154 @@
#define FSYNC_ON_FLUSH
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <string.h>
#include <errno.h>
#ifdef FSYNC_ON_FLUSH
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#endif
#include <stdarg.h>
#include "util.h"
// base转成2bit值
int bval(char b)
{
if (b == 'A')
return 0;
if (b == 'C')
return 1;
if (b == 'G')
return 2;
if (b == 'T')
return 3;
return 4;
}
// 互补碱基值
int cbval(char b)
{
return 3 - bval(b);
}
double realtime(void)
{
struct timeval tp;
struct timezone tzp;
gettimeofday(&tp, &tzp);
return tp.tv_sec + tp.tv_usec * 1e-6;
}
// 打印故障信息,并终止程序
void _err_fatal_simple_core(const char *func, const char *msg)
{
fprintf(stderr, "[%s] %s Abort!\n", func, msg);
abort();
}
// 打印信息并停止运行
void _err_fatal_simple(const char *func, const char *msg)
{
fprintf(stderr, "[%s] %s\n", func, msg);
exit(EXIT_FAILURE);
}
void err_fatal(const char *header, const char *fmt, ...)
{
va_list args;
va_start(args, fmt);
fprintf(stderr, "[%s] ", header);
vfprintf(stderr, fmt, args);
fprintf(stderr, "\n");
va_end(args);
exit(EXIT_FAILURE);
}
void err_fatal_core(const char *header, const char *fmt, ...)
{
va_list args;
va_start(args, fmt);
fprintf(stderr, "[%s] ", header);
vfprintf(stderr, fmt, args);
fprintf(stderr, " Abort!\n");
va_end(args);
abort();
}
// 打开文件流
FILE *err_xopen_core(const char *func, const char *fn, const char *mode)
{
FILE *fp = 0;
if (strcmp(fn, "-") == 0)
return (strstr(mode, "r")) ? stdin : stdout;
if ((fp = fopen(fn, mode)) == 0)
{
err_fatal(func, "fail to open file '%s' : %s", fn, strerror(errno));
}
return fp;
}
// 读取数据
bwtint_t fread_fix(FILE *fp, bwtint_t size, void *a)
{ /* Mac/Darwin has a bug when reading data longer than 2GB. This function fixes this issue by reading data in small chunks */
const int bufsize = 0x1000000; // 16M block
bwtint_t offset = 0;
while (size)
{
int x = bufsize < size ? bufsize : size;
if ((x = fread((uint8_t *)a + offset, 1, x, fp)) == 0)
break;
size -= x;
offset += x;
}
return offset;
}
// 写二进制文件
size_t err_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream)
{
size_t ret = fwrite(ptr, size, nmemb, stream);
if (ret != nmemb)
_err_fatal_simple("fwrite", strerror(errno));
return ret;
}
// 刷新文件流
int err_fflush(FILE *stream)
{
int ret = fflush(stream);
if (ret != 0)
_err_fatal_simple("fflush", strerror(errno));
#ifdef FSYNC_ON_FLUSH
/* Calling fflush() ensures that all the data has made it to the
kernel buffers, but this may not be sufficient for remote filesystems
(e.g. NFS, lustre) as an error may still occur while the kernel
is copying the buffered data to the file server. To be sure of
catching these errors, we need to call fsync() on the file
descriptor, but only if it is a regular file. */
{
struct stat sbuf;
if (0 != fstat(fileno(stream), &sbuf))
_err_fatal_simple("fstat", strerror(errno));
if (S_ISREG(sbuf.st_mode))
{
if (0 != fsync(fileno(stream)))
_err_fatal_simple("fsync", strerror(errno));
}
}
#endif
return ret;
}
// 关闭文件流
int err_fclose(FILE *stream)
{
int ret = fclose(stream);
if (ret != 0)
_err_fatal_simple("fclose", strerror(errno));
return ret;
}

41
util.h 100644
View File

@ -0,0 +1,41 @@
#ifndef COMMON_H_
#define COMMON_H_
#include <stdlib.h>
#include <stdint.h>
typedef uint64_t bwtint_t;
#define xassert(cond, msg) \
if ((cond) == 0) \
_err_fatal_simple_core(__func__, msg)
#define xopen(fn, mode) err_xopen_core(__func__, fn, mode)
double realtime(void);
// 在fm-indexv(或者bwt)查找过程中,记录结果
struct bwtintv_t
{
bwtint_t x[3], info; // x[0]表示正链位置x[1]表示互补链位置x[2]表示间隔长度info 表示read的起始结束位置
};
// 读取二进制数据
bwtint_t fread_fix(FILE *fp, bwtint_t size, void *a);
// 给出问题信息并终止程序
void _err_fatal_simple_core(const char *func, const char *msg);
// base转成2bit值
int bval(char b);
// 互补碱基值
int cbval(char b);
// 打开文件流
FILE *err_xopen_core(const char *func, const char *fn, const char *mode);
// 写二进制文件
size_t err_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream);
// 刷新文件流
int err_fflush(FILE *stream);
// 关闭文件流
int err_fclose(FILE *stream);
#endif