解决了一些bug,目前看来fmt可以正常工作

This commit is contained in:
Gitea 2024-01-27 00:42:47 +08:00
parent 8e1576a2cd
commit c69cb901bb
7 changed files with 453 additions and 276 deletions

1
.gitignore vendored
View File

@ -1,6 +1,7 @@
# specific for bwa_perf # specific for bwa_perf
*.txt *.txt
fmtidx fmtidx
*.fmt
# ---> C # ---> C
# Prerequisites # Prerequisites

17
bwt.cpp
View File

@ -10,6 +10,17 @@
using namespace std; using namespace std;
void bwt_dump_bwt(const char *fn, const bwt_t *bwt)
{
FILE *fp;
fp = xopen(fn, "wb");
err_fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp);
err_fwrite(bwt->L2 + 1, sizeof(bwtint_t), 4, fp);
err_fwrite(bwt->bwt, 4, bwt->bwt_size, fp);
err_fflush(fp);
err_fclose(fp);
}
// 计算一个字节构成的T,G,C,A序列对应的每个碱基的个数(按T,G,C,A顺序存储在32位整数中每个占8位) // 计算一个字节构成的T,G,C,A序列对应的每个碱基的个数(按T,G,C,A顺序存储在32位整数中每个占8位)
void bwt_gen_cnt_table(bwt_t *bwt) void bwt_gen_cnt_table(bwt_t *bwt)
{ {
@ -23,8 +34,8 @@ void bwt_gen_cnt_table(bwt_t *bwt)
} }
} }
// 解析两bit的bwt碱基序列 // 解析两bit的bwt碱基序列这个只有bwt str可以包含也可不包含occ check point
bwt_t *restore_bwt_str(const char *fn) bwt_t *restore_bwt(const char *fn)
{ {
bwt_t *bwt; bwt_t *bwt;
bwt = (bwt_t *)calloc(1, sizeof(bwt_t)); bwt = (bwt_t *)calloc(1, sizeof(bwt_t));
@ -202,6 +213,8 @@ void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_b
bwtint_t tk[4], tl[4]; bwtint_t tk[4], tl[4];
int i; int i;
bwt_2occ4(bwt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], tk, tl); // tk表示在k行之前所有各个碱基累积出现次数tl表示在l行之前的累积 bwt_2occ4(bwt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], tk, tl); // tk表示在k行之前所有各个碱基累积出现次数tl表示在l行之前的累积
// cout << "bwt-1-d: " << ik->x[!is_back] - 1 << '\t' << tk[0] << '\t' << tk[1] << '\t' << tk[2] << '\t' << tk[3] << endl;
// cout << "bwt-1-d: " << ik->x[!is_back] - 1 + ik->x[2] << '\t' << tl[0] << '\t' << tl[1] << '\t' << tl[2] << '\t' << tl[3] << endl;
// 这里是反向扩展 // 这里是反向扩展
for (i = 0; i != 4; ++i) for (i = 0; i != 4; ++i)
{ {

4
bwt.h
View File

@ -20,7 +20,7 @@ using std::string;
// 从构建完成的bwt包含occ check point获取k行不含$这里的k不输出bwt mtx的行是bwt字符串的行的碱基 // 从构建完成的bwt包含occ check point获取k行不含$这里的k不输出bwt mtx的行是bwt字符串的行的碱基
#define bwt_B0(b, k) (bwt_bwt(b, k) >> ((~(k) & 0xf) << 1) & 3) #define bwt_B0(b, k) (bwt_bwt(b, k) >> ((~(k) & 0xf) << 1) & 3)
// 获取碱基c待查找序列的首)和对应的互补碱基对应的行,以及间隔 // 获取碱基c待查找序列的首个碱基)和对应的互补碱基对应的行,以及间隔
#define bwt_set_intv(bwt, c, ik) ((ik).x[0] = (bwt)->L2[(int)(c)] + 1, (ik).x[2] = (bwt)->L2[(int)(c) + 1] - (bwt)->L2[(int)(c)], (ik).x[1] = (bwt)->L2[3 - (c)] + 1, (ik).info = 0) #define bwt_set_intv(bwt, c, ik) ((ik).x[0] = (bwt)->L2[(int)(c)] + 1, (ik).x[2] = (bwt)->L2[(int)(c) + 1] - (bwt)->L2[(int)(c)], (ik).x[1] = (bwt)->L2[3 - (c)] + 1, (ik).info = 0)
// The following two lines are ONLY correct when OCC_INTERVAL==0x80 // The following two lines are ONLY correct when OCC_INTERVAL==0x80
@ -56,7 +56,7 @@ bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, uint8_t c);
// 统计k行bwt mtx行包含k行本身之前4种碱基累积数量这里的k是bwt矩阵里的行比bwt字符串多1 // 统计k行bwt mtx行包含k行本身之前4种碱基累积数量这里的k是bwt矩阵里的行比bwt字符串多1
void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]); void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]);
// 解析两bit的bwt碱基序列 // 解析两bit的bwt碱基序列
bwt_t *restore_bwt_str(const char *fn); bwt_t *restore_bwt(const char *fn);
// 根据原始的字符串bwt创建interval-bwt // 根据原始的字符串bwt创建interval-bwt
void create_interval_occ_bwt(bwt_t *bwt); void create_interval_occ_bwt(bwt_t *bwt);

View File

@ -44,35 +44,44 @@ void print_base_uint32(uint32_t p)
} }
} }
// 随机生成长度为len的序列
string generate_rand_seq(int len)
{
string seq(len, 'A');
for (int i = 0; i < len; ++i)
{
seq[i] = BASE[rand() % 4];
}
return seq;
}
// 创建bwt矩阵 // 创建bwt矩阵
void create_bwt_mtx(string &seq) void create_bwt_mtx(string &seq)
{ {
cout << "seq size: " << seq.size() + 1 << endl; bwtint_t seq_len = seq.size() + 1;
string sarr[seq.size() + 1]; string sarr[seq_len];
sarr[0] = seq + '$'; sarr[0] = seq + '$';
for (int i = 1; i < sarr[0].size(); ++i) for (int i = 1; i < seq_len; ++i)
{ {
sarr[i] = sarr[0].substr(i) + sarr[0].substr(0, i); sarr[i] = sarr[0].substr(i) + sarr[0].substr(0, i);
} }
std::sort(sarr, sarr + seq.size() + 1); std::sort(sarr, sarr + seq_len);
// bwt matrix
for (int i = 0; i < sarr[0].size(); ++i)
{
// cout << i << ' ' << sarr[i] << endl;
cout << sarr[i] << endl;
}
// print bwt matrix
// for (int i = 0; i < seq_len; ++i)
//{
// // cout << i << ' ' << sarr[i] << endl;
// cout << sarr[i] << endl;
//}
// cout << "bwt string" << endl; // cout << "bwt string" << endl;
// for (int i = 0; i < sarr[0].size(); ++i) // for (int i = 0; i < seq_len; ++i)
// { // {
// cout << sarr[i].back(); // cout << sarr[i].back();
// } // }
// cout << endl; // cout << endl;
// cout << "pre bwt string" << endl; // cout << "pre bwt string" << endl;
// for (int i = 0; i < sarr[0].size(); ++i) // for (int i = 0; i < seq_len; ++i)
// { // {
// cout << sarr[i][sarr[0].size() - 2]; // cout << sarr[i][seq_len - 2];
// } // }
// cout << endl; // cout << endl;
} }
@ -98,7 +107,43 @@ void fmt_gen_cnt_table(FMTIndex *fmt)
} }
} }
void dump_fmt(const char *fn, const FMTIndex *fmt)
{
FILE *fp;
fp = xopen(fn, "wb");
err_fwrite(&fmt->primary, sizeof(bwtint_t), 1, fp);
err_fwrite(&fmt->sec_primary, sizeof(bwtint_t), 1, fp);
err_fwrite(&fmt->sec_bcp, sizeof(uint8_t), 1, fp);
err_fwrite(&fmt->first_base, sizeof(uint8_t), 1, fp);
err_fwrite(&fmt->last_base, sizeof(uint8_t), 1, fp);
err_fwrite(fmt->L2 + 1, sizeof(bwtint_t), 4, fp);
err_fwrite(fmt->bwt, 4, fmt->bwt_size, fp);
err_fflush(fp);
err_fclose(fp);
}
FMTIndex *restore_fmt(const char *fn)
{
FMTIndex *fmt;
fmt = (FMTIndex *)calloc(1, sizeof(FMTIndex));
FILE *fp = fopen(fn, "rb");
fseek(fp, 0, SEEK_END);
fmt->bwt_size = (ftell(fp) - sizeof(bwtint_t) * 6 - 3) >> 2; // 以32位word为单位计算的size
fmt->bwt = (uint32_t *)calloc(fmt->bwt_size, 4);
fseek(fp, 0, SEEK_SET);
fread(&fmt->primary, sizeof(bwtint_t), 1, fp);
fread(&fmt->sec_primary, sizeof(bwtint_t), 1, fp);
fread(&fmt->sec_bcp, sizeof(uint8_t), 1, fp);
fread(&fmt->first_base, sizeof(uint8_t), 1, fp);
fread(&fmt->last_base, sizeof(uint8_t), 1, fp);
fread(fmt->L2 + 1, sizeof(bwtint_t), 4, fp);
fread_fix(fp, fmt->bwt_size << 2, fmt->bwt);
fmt->seq_len = fmt->L2[4];
fclose(fp);
fmt_gen_cnt_table(fmt); // 字节所能表示的各种碱基组合中,各个碱基的累积数量
return fmt;
}
// 根据interval-bwt创建fmt-index // 根据interval-bwt创建fmt-index
FMTIndex *create_fmt_from_bwt(bwt_t *bwt) FMTIndex *create_fmt_from_bwt(bwt_t *bwt)
@ -108,13 +153,13 @@ FMTIndex *create_fmt_from_bwt(bwt_t *bwt)
fmt_gen_cnt_table(fmt); fmt_gen_cnt_table(fmt);
bwtint_t i, j, k, m, n, n_occ, cnt[4], cnt2[4]; bwtint_t i, j, k, m, n, n_occ, cnt[4], cnt2[4];
uint32_t c[4], c2[16] /*保存AA..TT*/; uint32_t c[4], c2[16]; /*c用来保存原来的bwt碱基串的累积值c2用来保存pre-bwt和bwt碱基对的累计值AA..TT*/
uint32_t *buf; uint32_t *buf; /* 计算之后变成fmt结构中bwt部分 */
fmt->seq_len = bwt->seq_len; fmt->seq_len = bwt->seq_len; // bwt碱基序列的长度不包含$字符也就是该长度比bwt matrix长度少1
for (i = 0; i < 5; ++i) for (i = 0; i < 5; ++i)
fmt->L2[i] = bwt->L2[i]; fmt->L2[i] = bwt->L2[i]; // 每个碱基的总累积值
fmt->primary = bwt->primary; fmt->primary = bwt->primary; // $在末尾的行在bwt matrix行中的排序位置
n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // check point 个数 n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // check point 个数
fmt->bwt_size = (fmt->seq_len * 2 + 15) >> 4; // 要保存最后两列碱基 fmt->bwt_size = (fmt->seq_len * 2 + 15) >> 4; // 要保存最后两列碱基
@ -129,146 +174,95 @@ FMTIndex *create_fmt_from_bwt(bwt_t *bwt)
bwt_occ4(bwt, before_first_line, cnt); bwt_occ4(bwt, before_first_line, cnt);
for (j = i * 4, k = 0; k < 4; ++j, ++k) for (j = i * 4, k = 0; k < 4; ++j, ++k)
c2[j] = cnt[k]; c2[j] = cnt[k];
// cout << "start: " << BASE[i] << " line: " << before_first_line << " occ: " << cnt[0] << '\t' << cnt[1] << '\t' << cnt[2] << '\t' << cnt[3] << endl;
} }
// cout << "c2: ";
// for (m = 0; m < 16; ++m)
// cout << c2[m] << ' ';
// cout << endl;
// k表示buf存储的偏移量 // k表示buf存储的偏移量
for (i = k = 0; i < bwt->seq_len; ++i) for (i = k = 0; i < bwt->seq_len; ++i)
{ {
// 记录occ // 记录occ
if (i % OCC_INTERVAL == 0) if (i % OCC_INTERVAL == 0)
{ {
memcpy(buf + k, c, sizeof(uint32_t) * 4); // 保存occ memcpy(buf + k, c, sizeof(uint32_t) * 4); // bwt str中各个碱基的occ
k += 4; k += 4;
memcpy(buf + k, c2, sizeof(uint32_t) * 16); // 二次计算的occ memcpy(buf + k, c2, sizeof(uint32_t) * 16); // pre-bwt:bwt碱基对的occ
k += 16; k += 16;
} }
// 每个32位整数保存8个倒数第二列碱基和8个倒数第一列(bwt)碱基 // 每个32位整数保存8个倒数第二列碱基pre-bwt和8个倒数第一列(bwt)碱基
if (i % 16 == 0) // 每个32位整数可以包含16个碱基每次需要处理16个碱基也就是间隔最小可以设置为16 if (i % 16 == 0) // 每个32位整数可以包含16个碱基每次需要处理16个碱基也就是间隔最小可以设置为16
{ {
uint32_t bwt_16_seq = bwt->bwt[i / 16]; uint32_t pre_bwt_16_seq = 0; // 16个pre-bwt碱基串
uint32_t pre_bwt_16_seq = 0; uint32_t *bwt_addr = bwt_occ_intv(bwt, i) + 8; // bwt字符串i对应的基准行因为原始的bwt-cpcheck point包含由4个uint64_t(8个uint32_t)组成的occ信息
uint32_t *bwt_addr = bwt_occ_intv(bwt, i) + 8; // bwt字符串i对应的基准行 int offset = (i % OCC_INTERVAL) / 16; // 每OCC_INTERVAL个碱基共享同一个基准地址每16个碱基共用一个uint32整型因此需要偏移量来获取当前碱基串的首地址
int offset = (i % OCC_INTERVAL) / 16; uint32_t bwt_16_seq = *(bwt_addr + offset); // 待处理的当前16个碱基串的首地址
bwt_16_seq = *(bwt_addr + offset); for (j = 0; j < 16; ++j) // 对于bwt碱基串一个一个碱基分别处理
for (j = 0; j < 16; ++j)
{ {
bwtint_t cur_line = i + j; bwtint_t cur_str_line = i + j; // 当前碱基在bwt str中的行排序
if (cur_line < bwt->seq_len) // 因为bwt序列里除去了$符号所以bwt序列个数比原版bwt少1 if (cur_str_line < bwt->seq_len) // 当前碱基行不应超出bwt str总碱基长度bwt str长度比bwt matrix长度少1因为bwt str不包含$
{ {
uint8_t bwt_base = bwt_B0(bwt, cur_line); // 对应行的bwt的碱基 uint8_t bwt_base = bwt_B0(bwt, cur_str_line); // 对应行的bwt的碱基
// 先求出该碱基对应在第一列的行 // 先求出该碱基对应在第一列的行对应的bwt matrix行
if (cur_line >= bwt->primary) // 因为bwt序列里除去了$符号,所以,超过$所在行之后对应的seq位置应该加一才是真正对应的行 bwtint_t cur_mtx_line = cur_str_line;
cur_line += 1; if (cur_str_line >= bwt->primary) // 因为bwt序列里除去了$符号,所以,超过$所在行之后对应的seq位置应该加一才是真正对应bwt matrix的行
bwtint_t origin_base_line = bwt->L2[bwt_base] + 1 + bwt_occ(bwt, cur_line - 1, bwt_base); // bwt矩阵行 cur_mtx_line += 1;
bwtint_t base_line = origin_base_line; bwt_occ4(bwt, cur_mtx_line, cnt); // 获取原来bwt-checkpoint中的occ值
if (base_line >= bwt->primary) // base_line表示在bwt字符中的位置所以超出$为最尾所在行之后要减掉1 for (m=0; m<4; ++m)
base_line -= 1; // bwt碱基序列行不包含$ c[m] = (uint32_t)cnt[m]; // 碱基m在cur_bwt_mtx_line(包含)之前的累积值直接拷贝原bwt中的occ即可
uint32_t pre_bwt_base = bwt_B0(bwt, base_line); // bwt列碱基对应的前一个碱基
if (origin_base_line == bwt->primary) cnt[bwt_base] -= 1; // 得到cur_bwt_mtx_line(不包含)之前的累积量即bwt_occ4(bwt, cur_bwt_mtx_line-1, cnt)
bwtint_t bwt_base_mtx_line = bwt->L2[bwt_base] + 1 + cnt[bwt_base]; // bwt_base对应的bwt matrix行LF变换
bwt_occ4(bwt, bwt_base_mtx_line, cnt2); // 计算bwt_base_mtx_line之前的occ
for (n = 0; n < 4; ++n)
{
int c2_idx = bwt_base << 2 | n; // bwt base放在前边
c2[c2_idx] = (uint32_t)cnt2[n]; // pre-bwt:bwt 碱基对的累计值
}
bwtint_t bwt_base_str_line = bwt_base_mtx_line; // bwt-str中对应的行排序
if (bwt_base_str_line >= bwt->primary) // base_line表示在bwt str中的位置所以超出$为最尾所在行之后要减掉1
bwt_base_str_line -= 1; // bwt碱基序列行不包含$
uint32_t pre_bwt_base = bwt_B0(bwt, bwt_base_str_line); // bwt列碱基对应的前一个碱基pre-bwt
// 此时bwt_base对应的bwt matrix首行是$排在最尾的行说明bwt_base就是序列的第一个碱基
// 此时计算出来的pre_bwt_base就是primary前一行的bwt base以此来代替$字符,在后续的计算过程中需要考虑
if (bwt_base_mtx_line == bwt->primary)
{ {
// 计算sec_bcp // 计算sec_bcp
fmt->sec_bcp = pre_bwt_base << 2 | bwt_base; // 因为把$当成A处理了 fmt->sec_bcp = pre_bwt_base << 2 | bwt_base; // 因为把$当成A处理了
fmt->sec_primary = cur_line; fmt->sec_primary = cur_mtx_line; // pre-bwt base为$的行排序bwt-matrix行
fmt->first_base = bwt_base; fmt->first_base = bwt_base; // 原始序列第一个碱基
fmt->last_base = pre_bwt_base; fmt->last_base = pre_bwt_base; // 计算后替代$字符的碱基应该是primary行上边一行对应的bwt base
} }
// 暂存 // 暂存 pre-bwt碱基序列
pre_bwt_16_seq = pre_bwt_16_seq | (pre_bwt_base << (15-j)*2); pre_bwt_16_seq = pre_bwt_16_seq | (pre_bwt_base << (15-j)*2); // 序列靠前的碱基排在uint32_t数据中的高位
if (base_line >= bwt->primary)
base_line += 1; // bwt矩阵行
bwtint_t pre_base_line = bwt->L2[pre_bwt_base] + 1 + bwt_occ(bwt, base_line - 1, pre_bwt_base);
// 获取c
bwt_occ4(bwt, cur_line, cnt);
for (m = 0; m < 4; ++m)
{
c[m] = (uint32_t)cnt[m]; // 碱基m在cur_line(包含)之前的累积值
}
// 求出c2
cnt[bwt_base] -= 1; // 得到cur_line(不包含)之前的累积量
// bwtint_t m_first_line = bwt->L2[bwt_base] + cnt[bwt_base]; // 该bwt_base对应的在bwt矩阵中行的前一行
// bwt_occ4(bwt, m_first_line, cnt2);
// for (n = 0; n < 4; ++n) // 只计算bwt_base对应的二级occ其他用之前的值
// {
// int c2_idx = bwt_base << 2 | n;
// c2[c2_idx] = (uint32_t)cnt2[n];
// }
for (m = 0; m < 4; ++m) // 输出调试信息
{ // cout << "mtx line: " << cur_mtx_line << ' ' << c[0] << ' ' << c[1] << ' ' << c[2] << ' ' << c[3] << ' ';
bwtint_t m_first_line = -1;
// if (m == bwt_base || cnt[m] > 0)
if (m == bwt_base)
{
m_first_line = bwt->L2[m] + 1 + cnt[m]; // m是否与bwt_base相同这里需要想清楚情况不一样的
if (m_first_line >= bwt->seq_len)
m_first_line = bwt->seq_len;
// cout << cur_line << '\t' << BASE[m] << '\t' << m_first_line << endl;
bwt_occ4(bwt, m_first_line, cnt2);
for (n = 0; n < 4; ++n)
{
int c2_idx = m << 2 | n;
c2[c2_idx] = (uint32_t)cnt2[n];
}
}
}
cnt[bwt_base] += 1; // cur_line(包含)之前
// cout << cur_line << '\t'
// << base_line << '\t'
// << pre_base_line << '\t'
// << BASE[pre_bwt_base] << '\t'
// << BASE[bwt_base] << '\t'
// << cnt[0] << ' ' << cnt[1] << ' ' << cnt[2] << ' ' << cnt[3] << "\t\t";
// for (m = 0; m < 16; ++m) // for (m = 0; m < 16; ++m)
// cout << c2[m] << ' '; // cout << c2[m] << ' ';
// cout << endl; // cout << endl;
// for (m = 0; m < 16; ++m)
// fprintf(fmt_out, "%-4d", c2[m]);
// fprintf(fmt_out, "\n");
} }
else else
break; break;
} }
//print_base_uint32(pre_bwt_16_seq);
//cout << endl;
//print_base_uint32(bwt_16_seq);
// 保存bwt和pre_bwt // 保存bwt和pre_bwt
uint32_t tmp_seq = 0; uint32_t pre_and_bwt_seq = 0;
tmp_seq = (((pre_bwt_16_seq & (3 << 30)) >> 0) | ((bwt_16_seq & (3 << 30)) >> 2)) for (m = 0; m < 8; ++m)
| (((pre_bwt_16_seq & (3 << 28)) >> 2) | ((bwt_16_seq & (3 << 28)) >> 4)) {
| (((pre_bwt_16_seq & (3 << 26)) >> 4) | ((bwt_16_seq & (3 << 26)) >> 6)) int lshift_bit = 30 - 2 * m;
| (((pre_bwt_16_seq & (3 << 24)) >> 6) | ((bwt_16_seq & (3 << 24)) >> 8)) pre_and_bwt_seq |= (((pre_bwt_16_seq & (3 << lshift_bit)) >> (m * 2)) | ((bwt_16_seq & (3 << lshift_bit)) >> ((m * 2) + 2)));
| (((pre_bwt_16_seq & (3 << 22)) >> 8) | ((bwt_16_seq & (3 << 22)) >> 10)) }
| (((pre_bwt_16_seq & (3 << 20)) >> 10) | ((bwt_16_seq & (3 << 20)) >> 12)) buf[k++] = pre_and_bwt_seq;
| (((pre_bwt_16_seq & (3 << 18)) >> 12) | ((bwt_16_seq & (3 << 18)) >> 14))
| (((pre_bwt_16_seq & (3 << 16)) >> 14) | ((bwt_16_seq & (3 << 16)) >> 16));
buf[k++] = tmp_seq;
//cout << i << endl;
//print_base_uint32(tmp_seq);
if (j > 8) if (j > 8)
{ {
// cout << "j: " << j << endl; pre_and_bwt_seq = 0;
tmp_seq = (((pre_bwt_16_seq & (3 << 14)) << 16) | ((bwt_16_seq & (3 << 14)) << 14)) for (m = 8; m > 0; --m)
| (((pre_bwt_16_seq & (3 << 12)) << 14) | ((bwt_16_seq & (3 << 12)) << 12)) {
| (((pre_bwt_16_seq & (3 << 10)) << 12) | ((bwt_16_seq & (3 << 10)) << 10)) int lshift_bit = 2 * m - 2;
| (((pre_bwt_16_seq & (3 << 8)) << 10) | ((bwt_16_seq & (3 << 8)) << 8)) pre_and_bwt_seq |= (((pre_bwt_16_seq & (3 << lshift_bit)) << (m * 2)) | ((bwt_16_seq & (3 << lshift_bit)) << (m * 2 - 2)));
| (((pre_bwt_16_seq & (3 << 6)) << 8) | ((bwt_16_seq & (3 << 6)) << 6)) }
| (((pre_bwt_16_seq & (3 << 4)) << 6) | ((bwt_16_seq & (3 << 4)) << 4)) buf[k++] = pre_and_bwt_seq;
| (((pre_bwt_16_seq & (3 << 2)) << 4) | ((bwt_16_seq & (3 << 2)) << 2))
| (((pre_bwt_16_seq & (3 << 0)) << 2) | ((bwt_16_seq & (3 << 0)) << 0));
buf[k++] = tmp_seq;
//print_base_uint32(tmp_seq);
} }
} }
} }
// the last element // the last element
// cout << c[0] << '\t' << c[1] << '\t' << c[2] << '\t' << c[3] << endl;
memcpy(buf + k, c, sizeof(uint32_t) * 4); memcpy(buf + k, c, sizeof(uint32_t) * 4);
k += 4; k += 4;
memcpy(buf + k, c2, sizeof(uint32_t) * 16); memcpy(buf + k, c2, sizeof(uint32_t) * 16);
@ -278,103 +272,149 @@ FMTIndex *create_fmt_from_bwt(bwt_t *bwt)
xassert(k == fmt->bwt_size, "inconsistent bwt_size"); xassert(k == fmt->bwt_size, "inconsistent bwt_size");
// update fmt // update fmt
fmt->bwt = buf; fmt->bwt = buf;
return fmt; return fmt;
} }
// 扩展两个个碱基计算bwt base为b的pre-bwt str中各个碱基的occ
void fmt_e2_occ4(const FMTIndex *fmt, bwtint_t k, int b, uint32_t cnt1[4], uint32_t cnt2[4])
#define fmt_set_intv(fmt, c, ik) ((ik).x[0] = (fmt)->L2[(int)(c)] + 1, (ik).x[2] = (fmt)->L2[(int)(c) + 1] - (fmt)->L2[(int)(c)], (ik).x[1] = (fmt)->L2[3 - (c)] + 1, (ik).info = 0)
#define fmt_occ_intv(b, k) ((b)->bwt + (k) / OCC_INTERVAL * (OCC_INTERVAL / 8 + 20))
void fmt_occ4(const FMTIndex *fmt, bwtint_t k, int b, uint32_t cnt1[4], uint32_t cnt2[4])
{ {
bwtint_t x; uint32_t x1, x2;
uint32_t *p, tmp, *end; uint32_t *p, tmp, *end;
bwtint_t bwt_k_line = k, bwt_k_base_line = k >> OCC_INTV_SHIFT << OCC_INTV_SHIFT;
if (k == (bwtint_t)(-1)) if (k == (bwtint_t)(-1))
{ {
p = fmt->bwt + 4 + b * 4;
memset(cnt1, 0, 4 * sizeof(uint32_t)); memset(cnt1, 0, 4 * sizeof(uint32_t));
memset(cnt2, 0, 4 * sizeof(uint32_t)); memcpy(cnt2, p, 4 * sizeof(uint32_t));
return; return;
} }
k -= (k >= fmt->primary); // k由bwt矩阵对应的行转换成bwt字符串对应的行去掉了$,所以大于$的行都减掉1 k -= (k >= fmt->primary); // k由bwt矩阵对应的行转换成bwt字符串对应的行去掉了$,所以大于$的行都减掉1
p = fmt_occ_intv(fmt, k); p = fmt_occ_intv(fmt, k);
cout << "base: " << BASE[b] << endl;
cout << "k: " << k << "; p: " << (uint64_t)p << endl;
// cout << "0: " << (uint64_t)fmt_occ_intv(fmt, 0)
// << " ;31: " << (uint64_t)fmt_occ_intv(fmt, 31)
// << " ;32: " << (uint64_t)fmt_occ_intv(fmt, 32)
// << " ;64: " << (uint64_t)fmt_occ_intv(fmt, 64)
// << " ;96: " << (uint64_t)fmt_occ_intv(fmt, 96) << endl;
memcpy(cnt1, p, 4 * sizeof(uint32_t)); memcpy(cnt1, p, 4 * sizeof(uint32_t));
memcpy(cnt2, p + 4 + b * 4, 4 * sizeof(uint32_t)); memcpy(cnt2, p + 4 + b * 4, 4 * sizeof(uint32_t));
cout << "cnt1: " << cnt1[0] << '\t' << cnt1[1] << '\t' << cnt1[2] << '\t' << cnt1[3] << endl;
cout << "cnt2: " << cnt2[0] << '\t' << cnt2[1] << '\t' << cnt2[2] << '\t' << cnt2[3] << endl;
p += 20; // 该地址是bwt和pre_bwt字符串数据的首地址 p += 20; // 该地址是bwt和pre_bwt字符串数据的首地址
end = p + ((k >> 4) - ((k & ~OCC_INTV_MASK) >> 4)); // this is the end point of the following loop end = p + ((k >> 3) - ((k & ~OCC_INTV_MASK) >> 3)); // this is the end point of the following loop
// for (x = 0; p < end; ++p)
// x += __occ_aux4(bwt, *p); for (x1 = 0, x2 = 0; p < end; ++p)
// tmp = *p & ~((1U << ((~k & 15) << 1)) - 1); {
// x += __occ_aux4(bwt, tmp) - (~k & 15); x1 += __fmt_occ_e2_aux4(fmt, 4, *p);
// cnt[0] += x & 0xff; x2 += __fmt_occ_e2_aux4(fmt, b, *p);
// cnt[1] += x >> 8 & 0xff; }
// cnt[2] += x >> 16 & 0xff; tmp = *p & ~((1U << ((~k & 7) << 2)) - 1);
// cnt[3] += x >> 24; x1 += __fmt_occ_e2_aux4(fmt, 4, tmp) - (~k & 7);
x2 += __fmt_occ_e2_aux4(fmt, b, tmp);
if (b == 0)
x2 -= ~k & 7;
// 如果跨过了second_primary,那么可能需要减掉一次累积值
if (b == fmt->first_base && bwt_k_base_line < fmt->sec_primary && bwt_k_line >= fmt->sec_primary)
{
x2 -= 1 << (fmt->last_base << 3);
}
cnt1[0] += x1 & 0xff;
cnt1[1] += x1 >> 8 & 0xff;
cnt1[2] += x1 >> 16 & 0xff;
cnt1[3] += x1 >> 24;
cnt2[0] += x2 & 0xff;
cnt2[1] += x2 >> 8 & 0xff;
cnt2[2] += x2 >> 16 & 0xff;
cnt2[3] += x2 >> 24;
// cout << "fmt-occ: " << k << '\t' << cnt1[0] << '\t' << cnt1[1] << '\t' << cnt1[2] << '\t' << cnt1[3] << endl;
// cout << "fmt-occ-2: " << k << '\t' << cnt2[0] << '\t' << cnt2[1] << '\t' << cnt2[2] << '\t' << cnt2[3] << endl;
// cout << "bwt_k_base_line: " << bwt_k_base_line << endl;
// cout << "bwt_k_line: " << bwt_k_line << endl;
// cout << "sec_primary: " << fmt->sec_primary << endl;
} }
void fmt_2occ4(const FMTIndex *fmt, bwtint_t k, bwtint_t l, int b, // 对k行和l行同时计算occ如果k和l落在同一个interval区间可以减少一些计算量和访存
uint32_t cntk1[4], uint32_t cntl1[4], uint32_t cntk2[4], uint32_t cntl2[4]) void fmt_e2_2occ4(const FMTIndex *fmt, bwtint_t k, bwtint_t l, int b,
uint32_t cntk1[4], uint32_t cntk2[4], uint32_t cntl1[4], uint32_t cntl2[4])
{ {
// fmt_e2_occ4(fmt, k, b, cntk1, cntk2);
// fmt_e2_occ4(fmt, l, b, cntl1, cntl2);
// return;
bwtint_t _k, _l; bwtint_t _k, _l;
_k = k - (k >= fmt->primary); // 换算成了seq的行 _k = k - (k >= fmt->primary); // 换算成了seq的行
_l = l - (l >= fmt->primary); _l = l - (l >= fmt->primary);
// if (_l >> OCC_INTV_SHIFT != _k >> OCC_INTV_SHIFT || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) if (_l >> OCC_INTV_SHIFT != _k >> OCC_INTV_SHIFT || k == (bwtint_t)(-1) || l == (bwtint_t)(-1))
// { {
fmt_occ4(fmt, k, b, cntk1, cntk2); fmt_e2_occ4(fmt, k, b, cntk1, cntk2);
fmt_occ4(fmt, l, b, cntl1, cntk1); fmt_e2_occ4(fmt, l, b, cntl1, cntl2);
// } }
// else else
// { {
// bwtint_t x, y; uint32_t x1, x2, y1, y2;
// uint32_t *p, tmp, *endk, *endl; uint32_t *p, tmp, *ek, *el;
// k -= (k >= bwt->primary); // because $ is not in bwt bwtint_t bwt_k_line = k, bwt_l_line = l, bwt_base_line = k >> OCC_INTV_SHIFT << OCC_INTV_SHIFT;
// l -= (l >= bwt->primary);
// p = bwt_occ_intv(bwt, k); k -= (k >= fmt->primary); // because $ is not in bwt
// memcpy(cntk, p, 4 * sizeof(bwtint_t)); l -= (l >= fmt->primary);
// p += sizeof(bwtint_t); // sizeof(bwtint_t) = 4*(sizeof(bwtint_t)/sizeof(uint32_t)) p = fmt_occ_intv(fmt, k);
// // prepare cntk[] memcpy(cntk1, p, 4 * sizeof(uint32_t));
// endk = p + ((k >> 4) - ((k & ~OCC_INTV_MASK) >> 4)); memcpy(cntk2, p + 4 + b * 4, 4 * sizeof(uint32_t));
// endl = p + ((l >> 4) - ((l & ~OCC_INTV_MASK) >> 4)); memcpy(cntl1, cntk1, 4 * sizeof(uint32_t));
// for (x = 0; p < endk; ++p) memcpy(cntl2, cntk2, 4 * sizeof(uint32_t));
// x += __occ_aux4(bwt, *p); p += 20;
// y = x; // prepare cntk[]
// tmp = *p & ~((1U << ((~k & 15) << 1)) - 1); ek = p + ((k >> 3) - ((k & ~OCC_INTV_MASK) >> 3));
// x += __occ_aux4(bwt, tmp) - (~k & 15); el = p + ((l >> 3) - ((l & ~OCC_INTV_MASK) >> 3));
// // calculate cntl[] and finalize cntk[] for (x1 = 0, x2 = 0; p < ek; ++p)
// for (; p < endl; ++p) {
// y += __occ_aux4(bwt, *p); x1 += __fmt_occ_e2_aux4(fmt, 4, *p);
// tmp = *p & ~((1U << ((~l & 15) << 1)) - 1); x2 += __fmt_occ_e2_aux4(fmt, b, *p);
// y += __occ_aux4(bwt, tmp) - (~l & 15); }
// memcpy(cntl, cntk, 4 * sizeof(bwtint_t)); y1 = x1;
// cntk[0] += x & 0xff; y2 = x2;
// cntk[1] += x >> 8 & 0xff; tmp = *p & ~((1U << ((~k & 7) << 2)) - 1);
// cntk[2] += x >> 16 & 0xff; x1 += __fmt_occ_e2_aux4(fmt, 4, tmp) - (~k & 7);
// cntk[3] += x >> 24; x2 += __fmt_occ_e2_aux4(fmt, b, tmp);
// cntl[0] += y & 0xff; if (b == 0)
// cntl[1] += y >> 8 & 0xff; x2 -= ~k & 7;
// cntl[2] += y >> 16 & 0xff; for (; p < el; ++p)
// cntl[3] += y >> 24; {
// } y1 += __fmt_occ_e2_aux4(fmt, 4, *p);
y2 += __fmt_occ_e2_aux4(fmt, b, *p);
}
tmp = *p & ~((1U << ((~l & 7) << 2)) - 1);
y1 += __fmt_occ_e2_aux4(fmt, 4, tmp) - (~l & 7);
y2 += __fmt_occ_e2_aux4(fmt, b, tmp);
if (b == 0)
y2 -= ~l & 7;
// 如果跨过了second_primary,那么可能需要减掉一次累积值
if (b == fmt->first_base && bwt_base_line < fmt->sec_primary)
{
if (bwt_k_line >= fmt->sec_primary)
x2 -= 1 << (fmt->last_base << 3);
if (bwt_l_line >= fmt->sec_primary)
y2 -= 1 << (fmt->last_base << 3);
}
cntk1[0] += x1 & 0xff;
cntk1[1] += x1 >> 8 & 0xff;
cntk1[2] += x1 >> 16 & 0xff;
cntk1[3] += x1 >> 24;
cntk2[0] += x2 & 0xff;
cntk2[1] += x2 >> 8 & 0xff;
cntk2[2] += x2 >> 16 & 0xff;
cntk2[3] += x2 >> 24;
cntl1[0] += y1 & 0xff;
cntl1[1] += y1 >> 8 & 0xff;
cntl1[2] += y1 >> 16 & 0xff;
cntl1[3] += y1 >> 24;
cntl2[0] += y2 & 0xff;
cntl2[1] += y2 >> 8 & 0xff;
cntl2[2] += y2 >> 16 & 0xff;
cntl2[3] += y2 >> 24;
// cout << "fmt-occ: " << k << '\t' << cntk1[0] << '\t' << cntk1[1] << '\t' << cntk1[2] << '\t' << cntk1[3] << endl;
// cout << "fmt-occ-2: " << k << '\t' << cntk2[0] << '\t' << cntk2[1] << '\t' << cntk2[2] << '\t' << cntk2[3] << endl;
// cout << "fmt-occ: " << l << '\t' << cntl1[0] << '\t' << cntl1[1] << '\t' << cntl1[2] << '\t' << cntl1[3] << endl;
// cout << "fmt-occ-2: " << l << '\t' << cntl2[0] << '\t' << cntl2[1] << '\t' << cntl2[2] << '\t' << cntl2[3] << endl;
}
} }
// 扩展一个碱基计算bwt str中各个碱基的occ
#define __fmt_occ_e2_aux4(fmt, b, val) \
((fmt)->cnt_table[(b)][(val) & 0xff] + (fmt)->cnt_table[b][(val) >> 8 & 0xff] + (fmt)->cnt_table[b][(val) >> 16 & 0xff] + (fmt)->cnt_table[b][(val) >> 24])
void fmt_e1_occ4(const FMTIndex *fmt, bwtint_t k, uint32_t cnt[4]) void fmt_e1_occ4(const FMTIndex *fmt, bwtint_t k, uint32_t cnt[4])
{ {
uint32_t x; uint32_t x;
@ -402,69 +442,58 @@ void fmt_e1_occ4(const FMTIndex *fmt, bwtint_t k, uint32_t cnt[4])
cnt[3] += x >> 24; cnt[3] += x >> 24;
} }
void fmt_e2_occ4(const FMTIndex *fmt, bwtint_t k, int b, uint32_t cnt1[4], uint32_t cnt2[4]) // 对k行和l行同时计算bwt str的occ如果k和l落在同一个interval区间可以减少一些计算量和访存
void fmt_e1_2occ4(const FMTIndex *fmt, bwtint_t k, bwtint_t l, uint32_t cntk[4], uint32_t cntl[4])
{ {
uint32_t x1, x2; bwtint_t _k, _l;
uint32_t *p, tmp, *end; _k = k - (k >= fmt->primary); // 换算成了seq的行
bwtint_t bwt_k_line = k, bwt_k_base_line = k >> OCC_INTV_SHIFT << OCC_INTV_SHIFT; _l = l - (l >= fmt->primary);
if (k == (bwtint_t)(-1)) if (_l >> OCC_INTV_SHIFT != _k >> OCC_INTV_SHIFT || k == (bwtint_t)(-1) || l == (bwtint_t)(-1))
{ {
p = fmt->bwt + 4 + b * 4; fmt_e1_occ4(fmt, k, cntk);
memset(cnt1, 0, 4 * sizeof(uint32_t)); fmt_e1_occ4(fmt, l, cntl);
memcpy(cnt2, p, 4 * sizeof(uint32_t));
return;
} }
k -= (k >= fmt->primary); // k由bwt矩阵对应的行转换成bwt字符串对应的行去掉了$,所以大于$的行都减掉1 else
{
uint32_t x1, y1;
uint32_t *p, tmp, *endk, *endl;
k -= (k >= fmt->primary); // because $ is not in bwt
l -= (l >= fmt->primary);
p = fmt_occ_intv(fmt, k); p = fmt_occ_intv(fmt, k);
// cout << "base: " << BASE[b] << endl; memcpy(cntk, p, 4 * sizeof(uint32_t));
// cout << "k: " << k << "; c 0 cnt: " << p[0] << '\t' << p[1] << '\t' << p[2] << '\t' << p[3] << endl; memcpy(cntl, p, 4 * sizeof(uint32_t));
memcpy(cnt1, p, 4 * sizeof(uint32_t)); p += 20;
memcpy(cnt2, p + 4 + b * 4, 4 * sizeof(uint32_t)); // prepare cntk[]
// cout << "[start: ] k: " << k << "; k line cnt: " << cnt[0] << '\t' << cnt[1] << '\t' << cnt[2] << '\t' << cnt[3] << endl; endk = p + ((k >> 3) - ((k & ~OCC_INTV_MASK) >> 3));
endl = p + ((l >> 3) - ((l & ~OCC_INTV_MASK) >> 3));
p += 20; // 该地址是bwt和pre_bwt字符串数据的首地址 for (x1 = 0; p < endk; ++p)
end = p + ((k >> 3) - ((k & ~OCC_INTV_MASK) >> 3)); // this is the end point of the following loop
for (x1 = 0, x2 = 0; p < end; ++p)
{ {
x1 += __fmt_occ_e2_aux4(fmt, 4, *p); x1 += __fmt_occ_e2_aux4(fmt, 4, *p);
x2 += __fmt_occ_e2_aux4(fmt, b, *p);
} }
//{ y1 = x1;
// x += fmt->cnt_table[b][*p & 0xff]
// + fmt->cnt_table[b][*p >> 8 & 0xff]
// + fmt->cnt_table[b][*p >> 16 & 0xff]
// + fmt->cnt_table[b][*p >> 24 & 0xff];
// // cout << "p: " << *p << endl;
// // print_base_uint32(*p);
// // cout << (fmt->cnt_table[b][*p & 0xff] >> 24) << ' '
// // << fmt->cnt_table[b][*p >> 24 & 0xff]
// // << endl;
//}
tmp = *p & ~((1U << ((~k & 7) << 2)) - 1); tmp = *p & ~((1U << ((~k & 7) << 2)) - 1);
x1 += __fmt_occ_e2_aux4(fmt, 4, tmp) - (~k & 7); x1 += __fmt_occ_e2_aux4(fmt, 4, tmp) - (~k & 7);
x2 += __fmt_occ_e2_aux4(fmt, b, tmp); for (; p < endl; ++p)
if (b == 0)
x2 -= ~k & 7;
// 如果跨过了second_primary,那么可能需要减掉一次累积值
if (b == fmt->first_base && bwt_k_base_line < fmt->sec_primary && bwt_k_line >= fmt->sec_primary)
{ {
x2 -= 1 << (fmt->last_base << 3); y1 += __fmt_occ_e2_aux4(fmt, 4, *p);
}
tmp = *p & ~((1U << ((~k & 7) << 2)) - 1);
y1 += __fmt_occ_e2_aux4(fmt, 4, tmp) - (~k & 7);
cntk[0] += x1 & 0xff;
cntk[1] += x1 >> 8 & 0xff;
cntk[2] += x1 >> 16 & 0xff;
cntk[3] += x1 >> 24;
cntl[0] += y1 & 0xff;
cntl[1] += y1 >> 8 & 0xff;
cntl[2] += y1 >> 16 & 0xff;
cntl[3] += y1 >> 24;
} }
// x += __occ_aux4(bwt, tmp) - (~k & 15);
// cout << "x: " << x << " b:" << b << endl;
cnt1[0] += x1 & 0xff;
cnt1[1] += x1 >> 8 & 0xff;
cnt1[2] += x1 >> 16 & 0xff;
cnt1[3] += x1 >> 24;
cnt2[0] += x2 & 0xff;
cnt2[1] += x2 >> 8 & 0xff;
cnt2[2] += x2 >> 16 & 0xff;
cnt2[3] += x2 >> 24;
// cout << "[end : ]k: " << k << "; k line cnt: " << cnt[0] << '\t' << cnt[1] << '\t' << cnt[2] << '\t' << cnt[3] << endl;
} }
// 扩展一个碱基
void fmt_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t ok[4], int is_back, int b1) void fmt_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t ok[4], int is_back, int b1)
{ {
uint32_t tk[4], tl[4]; uint32_t tk[4], tl[4];
@ -472,6 +501,7 @@ void fmt_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t ok[4], int is_bac
fmt_e1_occ4(fmt, ik->x[!is_back] - 1, tk); fmt_e1_occ4(fmt, ik->x[!is_back] - 1, tk);
fmt_e1_occ4(fmt, ik->x[!is_back] - 1 + ik->x[2], tl); fmt_e1_occ4(fmt, ik->x[!is_back] - 1 + ik->x[2], tl);
fmt_e1_2occ4(fmt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], tk, tl);
for (i = 0; i != 4; ++i) for (i = 0; i != 4; ++i)
{ {
ok[i].x[!is_back] = fmt->L2[i] + 1 + tk[i]; // 起始行位置,互补链 ok[i].x[!is_back] = fmt->L2[i] + 1 + tk[i]; // 起始行位置,互补链
@ -483,18 +513,21 @@ void fmt_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t ok[4], int is_bac
*ik = ok[b1]; *ik = ok[b1];
} }
// 扩展两个碱基
void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t ok[4], int is_back, int b1, int b2) void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t ok[4], int is_back, int b1, int b2)
{ {
uint32_t tk1[4], tl1[4], tk2[4], tl2[4]; uint32_t tk1[4], tl1[4], tk2[4], tl2[4];
int i; int i;
// fmt_e2_occ4(fmt, ik->x[!is_back] - 1, b1, tk1, tk2);
// fmt_e2_occ4(fmt, ik->x[!is_back] - 1 + ik->x[2], b1, tl1, tl2);
// tk表示在k行之前所有各个碱基累积出现次数tl表示在l行之前的累积
fmt_e2_2occ4(fmt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], b1, tk1, tk2, tl1, tl2);
// fmt_2occ4(fmt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], b1, tk1, tl1, tk2, tl2); // tk表示在k行之前所有各个碱基累积出现次数tl表示在l行之前的累积 // cout << "k: " << tk1[0] << '\t' << tk1[1] << '\t' << tk1[2] << '\t' << tk1[3] << endl;
// cout << "l: " << tl1[0] << '\t' << tl1[1] << '\t' << tl1[2] << '\t' << tl1[3] << endl;
fmt_e2_occ4(fmt, ik->x[!is_back] - 1, b1, tk1, tk2); // cout << "k: " << tk2[0] << '\t' << tk2[1] << '\t' << tk2[2] << '\t' << tk2[3] << endl;
fmt_e2_occ4(fmt, ik->x[!is_back] - 1 + ik->x[2], b1, tl1, tl2); // cout << "l: " << tl2[0] << '\t' << tl2[1] << '\t' << tl2[2] << '\t' << tl2[3] << endl;
// fmt_e2_occ(fmt, -1, 0, tk);
// 这里是反向扩展 // 这里是反向扩展
for (i = 0; i != 4; ++i) for (i = 0; i != 4; ++i)
@ -505,11 +538,17 @@ void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t ok[4], int is_bac
// 因为计算的是互补碱基所以3对应着0,2对应1下边是正向扩展 // 因为计算的是互补碱基所以3对应着0,2对应1下边是正向扩展
ok[3].x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= fmt->primary && ik->x[!is_back] + ik->x[2] - 1 >= fmt->primary); ok[3].x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= fmt->primary && ik->x[!is_back] + ik->x[2] - 1 >= fmt->primary);
for (i = 2; i >= b1; --i) ok[2].x[is_back] = ok[3].x[is_back] + tl1[3] - tk1[3];
ok[i].x[is_back] = ok[i + 1].x[is_back] + tl1[i + 1] - tk1[i + 1]; ok[1].x[is_back] = ok[2].x[is_back] + tl1[2] - tk1[2];
ok[0].x[is_back] = ok[1].x[is_back] + tl1[1] - tk1[1];
cout << "fmt-d: " << BASE[b1] << '\t' << ok[b1].x[is_back] << '\t' << ok[b1].x[2] << endl;
ok[3].x[is_back] = ok[b1].x[is_back] + (ok[b1].x[!is_back] <= fmt->primary && ok[b1].x[!is_back] + ok[b1].x[2] - 1 >= fmt->primary); ok[3].x[is_back] = ok[b1].x[is_back] + (ok[b1].x[!is_back] <= fmt->primary && ok[b1].x[!is_back] + ok[b1].x[2] - 1 >= fmt->primary);
for (i = 2; i >= b2; --i) ok[2].x[is_back] = ok[3].x[is_back] + ok[3].x[2];
ok[i].x[is_back] = ok[i + 1].x[is_back] + ok[i + 1].x[2]; ok[1].x[is_back] = ok[2].x[is_back] + ok[2].x[2];
ok[0].x[is_back] = ok[1].x[is_back] + ok[1].x[2];
*ik = ok[b2]; *ik = ok[b2];
} }
// 利用fmt搜索seed完整搜索只需要单向搜索 // 利用fmt搜索seed完整搜索只需要单向搜索
@ -557,13 +596,18 @@ int main_fmtidx(int argc, char **argv)
//create_bwt_mtx(seq); //create_bwt_mtx(seq);
//cout << seq << endl; //cout << seq << endl;
bwt_t *bwt = restore_bwt_str(argv[1]); // 读取bwt原始字符串带ACGT总的累积量 bwt_t *bwt = restore_bwt(argv[1]); // 读取bwt原始字符串带ACGT总的累积量
create_interval_occ_bwt(bwt); // 根据bwt字符串创建包含interval occ的bwt128碱基+ACGT累积量 // create_interval_occ_bwt(bwt); // 根据bwt字符串创建包含interval occ的bwt128碱基+ACGT累积量
cout << "L2: " << bwt->L2[0] << '\t' << bwt->L2[1] << '\t' << bwt->L2[2] << '\t' cout << "L2: " << bwt->L2[0] << '\t' << bwt->L2[1] << '\t' << bwt->L2[2] << '\t'
<< bwt->L2[3] << '\t' << bwt->L2[4] << endl; << bwt->L2[3] << '\t' << bwt->L2[4] << endl;
string s = "AACCCTAA"; string s = "AACCCTAA";
srand(time(NULL));
s = generate_rand_seq(10);
cout << "seq: " << s << endl;
// s = "TTC";
bwt_search(bwt, s); bwt_search(bwt, s);
bwt_search2(bwt, s); bwt_search2(bwt, s);
@ -573,6 +617,9 @@ int main_fmtidx(int argc, char **argv)
// } // }
// TGGGAT // TGGGAT
FMTIndex *fmt = create_fmt_from_bwt(bwt); FMTIndex *fmt = create_fmt_from_bwt(bwt);
dump_fmt("ref.fmt", fmt);
// FMTIndex *fmt = restore_fmt("tiny.fmt");
fmt_search(fmt, s); fmt_search(fmt, s);
// cout << bwt->bwt_size << endl; // cout << bwt->bwt_size << endl;
// cout << bwt->seq_len << endl; // cout << bwt->seq_len << endl;

View File

@ -3,6 +3,14 @@
#include "bwt.h" #include "bwt.h"
// 获取碱基c待查找序列的首个碱基和对应的互补碱基对应的行以及间隔
#define fmt_set_intv(fmt, c, ik) ((ik).x[0] = (fmt)->L2[(int)(c)] + 1, (ik).x[2] = (fmt)->L2[(int)(c) + 1] - (fmt)->L2[(int)(c)], (ik).x[1] = (fmt)->L2[3 - (c)] + 1, (ik).info = 0)
// k行bwt str行不包含$对应的check point occ数据起始地址小于k且是OCC_INTERVAL的整数倍
#define fmt_occ_intv(b, k) ((b)->bwt + (k) / OCC_INTERVAL * (OCC_INTERVAL / 8 + 20))
// 字节val中包含bwt base为b的pre-bwt中T G C A按顺序保存在32位整数里每个占8bit的数量
#define __fmt_occ_e2_aux4(fmt, b, val) \
((fmt)->cnt_table[(b)][(val) & 0xff] + (fmt)->cnt_table[b][(val) >> 8 & 0xff] + (fmt)->cnt_table[b][(val) >> 16 & 0xff] + (fmt)->cnt_table[b][(val) >> 24])
// fm-index, extend twice in one search step (one memory access) // fm-index, extend twice in one search step (one memory access)
struct FMTIndex struct FMTIndex
{ {
@ -14,9 +22,9 @@ struct FMTIndex
uint32_t *bwt; // BWT uint32_t *bwt; // BWT
// occurance array, separated to two parts // occurance array, separated to two parts
uint32_t cnt_table[5][256]; // 4对应原来的cnt_table0,1,2,3,分别对应该碱基的扩展 uint32_t cnt_table[5][256]; // 4对应原来的cnt_table0,1,2,3,分别对应该碱基的扩展
int sec_bcp; // base couple for sec primary line, AA=>0, AC=>1 ... TT=>15 uint8_t sec_bcp; // base couple for sec primary line, AA=>0, AC=>1 ... TT=>15
int first_base; // 序列的第一个碱基2bit的int类型0,1,2,3 uint8_t first_base; // 序列的第一个碱基2bit的int类型0,1,2,3
int last_base; // dollar转换成的base uint8_t last_base; // dollar转换成的base
// suffix array // suffix array
int sa_intv; int sa_intv;
bwtint_t n_sa; bwtint_t n_sa;

100
util.cpp
View File

@ -1,7 +1,16 @@
#define FSYNC_ON_FLUSH
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <sys/time.h> #include <sys/time.h>
#include <string.h>
#include <errno.h>
#ifdef FSYNC_ON_FLUSH
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#endif
#include <stdarg.h>
#include "util.h" #include "util.h"
// base转成2bit值 // base转成2bit值
@ -39,6 +48,48 @@ void _err_fatal_simple_core(const char *func, const char *msg)
abort(); abort();
} }
// 打印信息并停止运行
void _err_fatal_simple(const char *func, const char *msg)
{
fprintf(stderr, "[%s] %s\n", func, msg);
exit(EXIT_FAILURE);
}
void err_fatal(const char *header, const char *fmt, ...)
{
va_list args;
va_start(args, fmt);
fprintf(stderr, "[%s] ", header);
vfprintf(stderr, fmt, args);
fprintf(stderr, "\n");
va_end(args);
exit(EXIT_FAILURE);
}
void err_fatal_core(const char *header, const char *fmt, ...)
{
va_list args;
va_start(args, fmt);
fprintf(stderr, "[%s] ", header);
vfprintf(stderr, fmt, args);
fprintf(stderr, " Abort!\n");
va_end(args);
abort();
}
// 打开文件流
FILE *err_xopen_core(const char *func, const char *fn, const char *mode)
{
FILE *fp = 0;
if (strcmp(fn, "-") == 0)
return (strstr(mode, "r")) ? stdin : stdout;
if ((fp = fopen(fn, mode)) == 0)
{
err_fatal(func, "fail to open file '%s' : %s", fn, strerror(errno));
}
return fp;
}
// 读取数据 // 读取数据
bwtint_t fread_fix(FILE *fp, bwtint_t size, void *a) bwtint_t fread_fix(FILE *fp, bwtint_t size, void *a)
{ /* Mac/Darwin has a bug when reading data longer than 2GB. This function fixes this issue by reading data in small chunks */ { /* Mac/Darwin has a bug when reading data longer than 2GB. This function fixes this issue by reading data in small chunks */
@ -54,3 +105,50 @@ bwtint_t fread_fix(FILE *fp, bwtint_t size, void *a)
} }
return offset; return offset;
} }
// 写二进制文件
size_t err_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream)
{
size_t ret = fwrite(ptr, size, nmemb, stream);
if (ret != nmemb)
_err_fatal_simple("fwrite", strerror(errno));
return ret;
}
// 刷新文件流
int err_fflush(FILE *stream)
{
int ret = fflush(stream);
if (ret != 0)
_err_fatal_simple("fflush", strerror(errno));
#ifdef FSYNC_ON_FLUSH
/* Calling fflush() ensures that all the data has made it to the
kernel buffers, but this may not be sufficient for remote filesystems
(e.g. NFS, lustre) as an error may still occur while the kernel
is copying the buffered data to the file server. To be sure of
catching these errors, we need to call fsync() on the file
descriptor, but only if it is a regular file. */
{
struct stat sbuf;
if (0 != fstat(fileno(stream), &sbuf))
_err_fatal_simple("fstat", strerror(errno));
if (S_ISREG(sbuf.st_mode))
{
if (0 != fsync(fileno(stream)))
_err_fatal_simple("fsync", strerror(errno));
}
}
#endif
return ret;
}
// 关闭文件流
int err_fclose(FILE *stream)
{
int ret = fclose(stream);
if (ret != 0)
_err_fatal_simple("fclose", strerror(errno));
return ret;
}

12
util.h
View File

@ -10,7 +10,9 @@ typedef uint64_t bwtint_t;
if ((cond) == 0) \ if ((cond) == 0) \
_err_fatal_simple_core(__func__, msg) _err_fatal_simple_core(__func__, msg)
double realtime(void); #define xopen(fn, mode) err_xopen_core(__func__, fn, mode)
double realtime(void);
// 在fm-indexv(或者bwt)查找过程中,记录结果 // 在fm-indexv(或者bwt)查找过程中,记录结果
struct bwtintv_t struct bwtintv_t
@ -27,5 +29,13 @@ void _err_fatal_simple_core(const char *func, const char *msg);
int bval(char b); int bval(char b);
// 互补碱基值 // 互补碱基值
int cbval(char b); int cbval(char b);
// 打开文件流
FILE *err_xopen_core(const char *func, const char *fn, const char *mode);
// 写二进制文件
size_t err_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream);
// 刷新文件流
int err_fflush(FILE *stream);
// 关闭文件流
int err_fclose(FILE *stream);
#endif #endif