bwa_perf/fmt_index.cpp

710 lines
24 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#include <iostream>
#include <stdint.h>
#include <stdlib.h>
#include <vector>
#include <sys/time.h>
#include <string>
#include <stdio.h>
#include <algorithm>
#include <string.h>
#include <immintrin.h>
#include "util.h"
#include "bwt.h"
#include "fmt_index.h"
using namespace std;
double t1 = 0, t2 = 0, t3 = 0, t4 = 0, t5 = 0, t6 = 0, t7 = 0, t8 = 0, t9 = 0, t10 = 0,
t11 = 0, t12 = 0, t13 = 0, t14 = 0;
long f1 = 0, f2 = 0, f3 = 0, f4 = 0, f5 = 0;
const static char BASE[4] = {'A', 'C', 'G', 'T'};
// 求反向互补序列
string calc_reverse_seq(string &seq)
{
string rseq(seq.size(), '0');
for (size_t i = 0; i < seq.size(); ++i)
{
if (seq[i] == 'A')
rseq[i] = 'T';
else if (seq[i] == 'C')
rseq[i] = 'G';
else if (seq[i] == 'G')
rseq[i] = 'C';
else if (seq[i] == 'T')
rseq[i] = 'A';
}
std::reverse(rseq.begin(), rseq.end());
return rseq;
}
// 打印32位整型数据中包含的pre-bwtbwt
void print_base_uint32(uint32_t p)
{
for (int i = 30; i > 0; i -= 4)
{
int b1 = p >> i & 3;
int b2 = p >> (i - 2) & 3;
cout << BASE[b1] << BASE[b2] << endl;
}
}
// 随机生成长度为len的序列
string generate_rand_seq(int len)
{
string seq(len, 'A');
for (int i = 0; i < len; ++i)
{
seq[i] = BASE[rand() % 4];
}
return seq;
}
// 创建bwt矩阵
void create_bwt_mtx(string &seq)
{
bwtint_t seq_len = seq.size() + 1;
string sarr[seq_len];
sarr[0] = seq + '$';
for (bwtint_t i = 1; i < seq_len; ++i)
{
sarr[i] = sarr[0].substr(i) + sarr[0].substr(0, i);
}
std::sort(sarr, sarr + seq_len);
// print bwt matrix
// for (int i = 0; i < seq_len; ++i)
//{
// // cout << i << ' ' << sarr[i] << endl;
// cout << sarr[i] << endl;
//}
// cout << "bwt string" << endl;
// for (int i = 0; i < seq_len; ++i)
// {
// cout << sarr[i].back();
// }
// cout << endl;
// cout << "pre bwt string" << endl;
// for (int i = 0; i < seq_len; ++i)
// {
// cout << sarr[i][seq_len - 2];
// }
// cout << endl;
}
// 生成occ每个字节对应一个pattern
void fmt_gen_cnt_occ(FMTIndex *fmt)
{
// 0-8大于a的occ8-16大于b的occ16-24b的occ
int i, a, b, ti;
uint32_t oa, ooa, ob, oob;
for (i = 0; i != 256; ++i) // 遍历单个字节的各种情况
{
for (a = 0; a < 4; ++a) // ba格式
{
oa = 0;
ooa = 0;
oa += ((i >> 4 & 3) == a) + ((i & 3) == a);
ooa += ((i >> 4 & 3) > a) + ((i & 3) > a);
for (b = 0; b < 4; ++b)
{
oob = ob = 0;
oob += ((i >> 6 & 3) > b && (i >> 4 & 3) == a) + ((i >> 2 & 3) > b && (i & 3) == a);
ob += ((i >> 6 & 3) == b && (i >> 4 & 3) == a) + ((i >> 2 & 3) == b && (i & 3) == a);
ti = a << 2 | b;
fmt->cnt_occ[ti][i]= ob << 24 | oob << 16 | oa << 8 | ooa ;
}
}
}
}
// fmt-index的count table4对应着bwt碱基的累积量0,1,2,3分别对应着bwt是A,C,G,Tpre-bwt的累积量
void fmt_gen_cnt_table(uint32_t cnt_table[4][256])
{
int i, j, k;
uint32_t x = 0;
for (i = 0; i != 256; ++i) // 遍历单个字节的各种情况
{
for (k = 0; k < 4; ++k) // bwt碱基
{
x = 0; // for [A,C,G,T][A,C,G,T]
for (j = 0; j != 4; ++j) // pre-bwt碱基
x |= (((i >> 6 & 3) == j && (i >> 4 & 3) == k) + ((i >> 2 & 3) == j && (i & 3) == k)) << (j << 3);
cnt_table[k][i] = x;
}
}
}
void dump_fmt(const char *fn, const FMTIndex *fmt)
{
FILE *fp;
fp = xopen(fn, "wb");
err_fwrite(&fmt->primary, sizeof(bwtint_t), 1, fp);
err_fwrite(&fmt->sec_primary, sizeof(bwtint_t), 1, fp);
err_fwrite(&fmt->sec_bcp, sizeof(uint8_t), 1, fp);
err_fwrite(&fmt->first_base, sizeof(uint8_t), 1, fp);
err_fwrite(&fmt->last_base, sizeof(uint8_t), 1, fp);
err_fwrite(fmt->L2 + 1, sizeof(bwtint_t), 4, fp);
err_fwrite(fmt->bwt, 4, fmt->bwt_size, fp);
err_fflush(fp);
err_fclose(fp);
}
FMTIndex *restore_fmt(const char *fn)
{
FMTIndex *fmt;
fmt = (FMTIndex *)calloc(1, sizeof(FMTIndex));
FILE *fp = fopen(fn, "rb");
fseek(fp, 0, SEEK_END);
fmt->bwt_size = (ftell(fp) - sizeof(bwtint_t) * 6 - 3) >> 2; // 以32位word为单位计算的size
fmt->bwt = (uint32_t *)calloc(fmt->bwt_size, 4);
fseek(fp, 0, SEEK_SET);
fread(&fmt->primary, sizeof(bwtint_t), 1, fp);
fread(&fmt->sec_primary, sizeof(bwtint_t), 1, fp);
fread(&fmt->sec_bcp, sizeof(uint8_t), 1, fp);
fread(&fmt->first_base, sizeof(uint8_t), 1, fp);
fread(&fmt->last_base, sizeof(uint8_t), 1, fp);
fread(fmt->L2 + 1, sizeof(bwtint_t), 4, fp);
fread_fix(fp, fmt->bwt_size << 2, fmt->bwt);
fmt->seq_len = fmt->L2[4];
fclose(fp);
fmt_gen_cnt_occ(fmt); // 字节所能表示的各种碱基组合中,各个碱基的累积数量
return fmt;
}
// 根据interval-bwt创建fmt-index
FMTIndex *create_fmt_from_bwt(bwt_t *bwt)
{
// FILE *fmt_out = fopen("fmt.txt", "w");
FMTIndex *fmt = (FMTIndex *)calloc(1, sizeof(FMTIndex));
fmt_gen_cnt_occ(fmt);
bwtint_t i, j, k, m, n, n_occ, cnt[4], cnt2[4];
uint32_t c[4], c2[16]; /*c用来保存原来的bwt碱基串的累积值c2用来保存pre-bwt和bwt碱基对的累计值如AA..TT*/
uint32_t *buf; /* 计算之后变成fmt结构中bwt部分 */
#ifdef FMT_MID_INTERVAL
// 加入中间的check point
uint32_t mc[4] = {0};
uint32_t cnt_table[4][256]; // 4对应原来的cnt_table0,1,2,3,分别对应该碱基的扩展
fmt_gen_cnt_table(cnt_table);
#endif
fmt->seq_len = bwt->seq_len; // bwt碱基序列的长度不包含$字符也就是该长度比bwt matrix长度少1
for (i = 0; i < 5; ++i)
fmt->L2[i] = bwt->L2[i]; // 每个碱基的总累积值
fmt->primary = bwt->primary; // $在末尾的行在bwt matrix行中的排序位置
n_occ = (bwt->seq_len + FMT_OCC_INTERVAL - 1) / FMT_OCC_INTERVAL + 1; // check point 个数
fmt->bwt_size = (fmt->seq_len * 2 + 15) >> 4; // 要保存最后两列碱基
fmt->bwt_size += n_occ * 20; // A,C,G,T和AA,AC.....TG,TT共20个
#ifdef FMT_MID_INTERVAL
bwtint_t mn_occ = (bwt->seq_len >> FMT_OCC_INTV_SHIFT) * (FMT_OCC_INTERVAL / FMT_MID_INTERVAL - 1);
bwtint_t last_seq_len = bwt->seq_len % FMT_OCC_INTERVAL;
mn_occ += (last_seq_len + FMT_MID_INTERVAL - 1) / FMT_MID_INTERVAL - 1;
fmt->bwt_size += mn_occ * 4;
//cout << "mn_occ: " << mn_occ << endl;
//exit(0);
#endif
buf = (uint32_t *)calloc(fmt->bwt_size, 4); // 开辟计算fmt用到的缓存
c[0] = c[1] = c[2] = c[3] = 0;
// 首行的c2应该是对应的ACGT对应的行减去1的occ
for (i = 0; i < 4; ++i)
{
bwtint_t before_first_line = fmt->L2[i];
bwt_occ4(bwt, before_first_line, cnt);
for (j = i * 4, k = 0; k < 4; ++j, ++k)
c2[j] = cnt[k];
}
// k表示buf存储的偏移量
for (i = k = 0; i < bwt->seq_len; ++i)
{
// 记录occ
if (i % FMT_OCC_INTERVAL == 0)
{
memcpy(buf + k, c, sizeof(uint32_t) * 4); // bwt str中各个碱基的occ
k += 4;
memcpy(buf + k, c2, sizeof(uint32_t) * 16); // pre-bwt:bwt碱基对的occ
k += 16;
#ifdef FMT_MID_INTERVAL
mc[0] = mc[1] = mc[2] = mc[3] = 0;
#endif
}
// 每个32位整数保存8个倒数第二列碱基pre-bwt和8个倒数第一列(bwt)碱基
if (i % 16 == 0) // 每个32位整数可以包含16个碱基每次需要处理16个碱基也就是间隔最小可以设置为16
{
uint32_t pre_bwt_16_seq = 0; // 16个pre-bwt碱基串
uint32_t *bwt_addr = bwt_occ_intv(bwt, i) + 4; // 这里加4还是加8要看保存occ的是是uint32还是uint64bwt字符串i对应的基准行因为原始的bwt-cpcheck point包含由4个uint32_t(8个uint32_t)组成的occ信息
int offset = (i % OCC_INTERVAL) / 16; // 每OCC_INTERVAL个碱基共享同一个基准地址每16个碱基共用一个uint32整型因此需要偏移量来获取当前碱基串的首地址
uint32_t bwt_16_seq = *(bwt_addr + offset); // 待处理的当前16个碱基串的首地址
for (j = 0; j < 16; ++j) // 对于bwt碱基串一个一个碱基分别处理
{
bwtint_t cur_str_line = i + j; // 当前碱基在bwt str中的行排序
if (cur_str_line < bwt->seq_len) // 当前碱基行不应超出bwt str总碱基长度bwt str长度比bwt matrix长度少1因为bwt str不包含$
{
uint8_t bwt_base = bwt_B0(bwt, cur_str_line); // 对应行的bwt的碱基
// 先求出该碱基对应在第一列的行对应的bwt matrix行
bwtint_t cur_mtx_line = cur_str_line;
if (cur_str_line >= bwt->primary) // 因为bwt序列里除去了$符号,所以,超过$所在行之后对应的seq位置应该加一才是真正对应bwt matrix的行
cur_mtx_line += 1;
bwt_occ4(bwt, cur_mtx_line, cnt); // 获取原来bwt-checkpoint中的occ值
for (m=0; m<4; ++m)
c[m] = (uint32_t)cnt[m]; // 碱基m在cur_bwt_mtx_line(包含)之前的累积值直接拷贝原bwt中的occ即可
cnt[bwt_base] -= 1; // 得到cur_bwt_mtx_line(不包含)之前的累积量即bwt_occ4(bwt, cur_bwt_mtx_line-1, cnt)
bwtint_t bwt_base_mtx_line = bwt->L2[bwt_base] + 1 + cnt[bwt_base]; // bwt_base对应的bwt matrix行LF变换
bwt_occ4(bwt, bwt_base_mtx_line, cnt2); // 计算bwt_base_mtx_line之前的occ
for (n = 0; n < 4; ++n)
{
int c2_idx = bwt_base << 2 | n; // bwt base放在前边
c2[c2_idx] = (uint32_t)cnt2[n]; // pre-bwt:bwt 碱基对的累计值
}
bwtint_t bwt_base_str_line = bwt_base_mtx_line; // bwt-str中对应的行排序
if (bwt_base_str_line >= bwt->primary) // base_line表示在bwt str中的位置所以超出$为最尾所在行之后要减掉1
bwt_base_str_line -= 1; // bwt碱基序列行不包含$
uint32_t pre_bwt_base = bwt_B0(bwt, bwt_base_str_line); // bwt列碱基对应的前一个碱基pre-bwt
// 此时bwt_base对应的bwt matrix首行是$排在最尾的行说明bwt_base就是序列的第一个碱基
// 此时计算出来的pre_bwt_base就是primary前一行的bwt base以此来代替$字符,在后续的计算过程中需要考虑
if (bwt_base_mtx_line == bwt->primary)
{
// 计算sec_bcp
fmt->sec_bcp = pre_bwt_base << 2 | bwt_base; // 因为把$当成A处理了
fmt->sec_primary = cur_mtx_line; // pre-bwt base为$的行排序bwt-matrix行
fmt->first_base = bwt_base; // 原始序列第一个碱基
fmt->last_base = pre_bwt_base; // 计算后替代$字符的碱基应该是primary行上边一行对应的bwt base
}
// 暂存 pre-bwt碱基序列
pre_bwt_16_seq = pre_bwt_16_seq | (pre_bwt_base << (15-j)*2); // 序列靠前的碱基排在uint32_t数据中的高位
// 输出调试信息
// cout << "mtx line: " << cur_mtx_line << ' ' << c[0] << ' ' << c[1] << ' ' << c[2] << ' ' << c[3] << ' ';
// for (m = 0; m < 16; ++m)
// cout << c2[m] << ' ';
// cout << endl;
}
else
break;
}
// 保存bwt和pre_bwt
uint32_t pre_and_bwt_seq = 0;
uint32_t pre_and_bwt_seq_2 = 0;
for (m = 0; m < 8; ++m)
{
int lshift_bit = 30 - 2 * m;
pre_and_bwt_seq |= (((pre_bwt_16_seq & (3 << lshift_bit)) >> (m * 2)) | ((bwt_16_seq & (3 << lshift_bit)) >> ((m * 2) + 2)));
}
buf[k++] = pre_and_bwt_seq;
if (j > 8)
{
#if FMT_MID_INTERVAL == 8
for (m = 0; m < 4; ++m)
{
uint32_t s1 = pre_and_bwt_seq;
mc[m] += cnt_table[m][s1 & 0xff] + cnt_table[m][s1 >> 8 & 0xff] + cnt_table[m][s1 >> 16 & 0xff] + cnt_table[m][s1 >> 24];
buf[k++] = mc[m];
}
#endif
for (m = 8; m > 0; --m)
{
int lshift_bit = 2 * m - 2;
pre_and_bwt_seq_2 |= (((pre_bwt_16_seq & (3 << lshift_bit)) << (m * 2)) | ((bwt_16_seq & (3 << lshift_bit)) << (m * 2 - 2)));
}
buf[k++] = pre_and_bwt_seq_2;
#if FMT_MID_INTERVAL == 8
if ((i + 16) % FMT_OCC_INTERVAL != 0 && j == 16)
for (m = 0; m < 4; ++m)
{
uint32_t s1 = pre_and_bwt_seq_2;
mc[m] += cnt_table[m][s1 & 0xff] + cnt_table[m][s1 >> 8 & 0xff] + cnt_table[m][s1 >> 16 & 0xff] + cnt_table[m][s1 >> 24];
buf[k++] = mc[m];
}
#endif
}
#if FMT_MID_INTERVAL == 16
// 加入中间的check point
for (m = 0; m < 4; ++m)
{
uint32_t s1 = pre_and_bwt_seq;
mc[m] += cnt_table[m][s1 & 0xff] + cnt_table[m][s1 >> 8 & 0xff] + cnt_table[m][s1 >> 16 & 0xff] + cnt_table[m][s1 >> 24];
s1 = pre_and_bwt_seq_2;
mc[m] += cnt_table[m][s1 & 0xff] + cnt_table[m][s1 >> 8 & 0xff] + cnt_table[m][s1 >> 16 & 0xff] + cnt_table[m][s1 >> 24];
}
if ((i + 16) % FMT_OCC_INTERVAL != 0 && j > 8) // 序列最后的部分不用保存
{
for (m = 0; m < 4; ++m)
buf[k++] = mc[m];
}
#endif
}
}
// the last element
memcpy(buf + k, c, sizeof(uint32_t) * 4);
k += 4;
memcpy(buf + k, c2, sizeof(uint32_t) * 16);
k += 16;
// cout << "n occ: " << n_occ << endl;
cout << "size: " << k << '\t' << fmt->bwt_size << endl;
xassert(k == fmt->bwt_size, "inconsistent bwt_size");
// update fmt
fmt->bwt = buf;
return fmt;
}
// 扩展两个个碱基计算bwt base为b的pre-bwt str中各个碱基的occ
void fmt_e2_occ(const FMTIndex *fmt, bwtint_t k, int b1, int b2, bwtint_t cnt[4])
{
uint32_t x = 0;
uint32_t *p, *q, tmp;
bwtint_t bwt_k_line = k, bwt_k_base_line = k >> FMT_OCC_INTV_SHIFT << FMT_OCC_INTV_SHIFT;
int i, ti;
cnt[0] = 0;
cnt[1] = 0;
cnt[2] = 0;
if (k == (bwtint_t)(-1))
{
p = fmt->bwt + 4 + b1 * 4;
for (i = b2 + 1; i < 4; ++i) cnt[2] += p[i];
cnt[3] = p[b2];
return;
}
ti = b1 << 2 | b2;
k -= (k >= fmt->primary); // k由bwt矩阵对应的行转换成bwt字符串对应的行去掉了$,所以大于$的行都减掉1
p = fmt_occ_intv(fmt, k);
//cout << "k-base: " << k << "; occ: " << p[0] << ' ' << p[1] << ' ' << p[2] << ' ' << p[3] << endl;
for (i = b1 + 1; i < 4; ++i) cnt[0] += p[i]; // 大于b1的碱基的occ之和
cnt[1] = p[b1]; // b1的occ
q = p + 4 + b1 * 4;
for (i = b2 + 1; i < 4 ; ++i) cnt[2] += q[i]; // 大于b2的occ之和
cnt[3] = q[b2]; // b2的occ
p += 20;
//if (k == 3965453116 || k == 3965453672 || k == 2668688087 || k == 2668688550) {
// cout << "sec base-occ: " << q[0] << ' ' << q[1] << ' ' << q[2] << ' ' << q[3] << endl;
//}
#ifdef FMT_MID_INTERVAL
// 使用mid interval信息
int mk = k % FMT_OCC_INTERVAL;
int n_mintv = mk >> FMT_MID_INTV_SHIFT;
//if (k == 3137454504)
//{
// for (i = 0; i < n_mintv; ++i)
// {
// q = p + i * 6;
// print_base_uint32(*q);
// print_base_uint32(*(q + 1));
// x = *(q + 2);
// cout << ((x) >> 24 & 0xff) << ' ' << ((x) >> 16 & 0xff) << ' ' << ((x) >> 8 & 0xff) << ' ' << ((x) & 0xff) << ' ' << __fmt_mid_sum(x) << endl;
// x = *(q + 3);
// cout << ((x) >> 24 & 0xff) << ' ' << ((x) >> 16 & 0xff) << ' ' << ((x) >> 8 & 0xff) << ' ' << ((x) & 0xff) << ' ' << __fmt_mid_sum(x) << endl;
// x = *(q + 4);
// cout << ((x) >> 24 & 0xff) << ' ' << ((x) >> 16 & 0xff) << ' ' << ((x) >> 8 & 0xff) << ' ' << ((x) & 0xff) << ' ' << __fmt_mid_sum(x) << endl;
// x = *(q + 5);
// cout << ((x) >> 24 & 0xff) << ' ' << ((x) >> 16 & 0xff) << ' ' << ((x) >> 8 & 0xff) << ' ' << ((x) & 0xff) << ' ' << __fmt_mid_sum(x) << endl;
// }
// x = 0;
//}
if (n_mintv > 0) // 至少超过了第一个mid interval
{
p += n_mintv * (4 + (FMT_MID_INTERVAL >> 3)) - 4; // 对应的mid interval check point的首地址即A C G T的局部累积量
q = p + b1;
for (i = b1 + 1; i < 4; ++i)
x += p[i]; // 大于b1的碱基的occ之和
cnt[0] += __fmt_mid_sum(x);
x = *q;
// if (k == 3137454504)
// {
// cout << ((x) >> 24 & 0xff) << '\t' << ((x) >> 16 & 0xff)
// << '\t' << ((x) >> 8 & 0xff) << '\t' << ((x) & 0xff) << endl;
// cout << __fmt_mid_sum(x) << endl;
// }
cnt[1] += __fmt_mid_sum(x); // b1的occ
for (i = 3; i > b2; --i)
cnt[2] += x >> (i << 3) & 0xff; // 大于b2的occ之和
cnt[3] += x >> (b2 << 3) & 0xff; // b2的occ
x = 0;
p += 4;
}
// cout << "mid-occ: " << cnt[0] << ' ' << cnt[1] << ' ' << cnt[2] << ' ' << cnt[3] << endl;
// cout << "k: " << k << ' ' << cnt[1] << endl;
#if FMT_MID_INTERVAL == 16
if ((mk & FMT_MID_INTV_MASK) >> 3)
{
x += __fmt_occ_e2_aux2(fmt, ti, *p);
// print_base_uint32(*p);
++p;
}
#endif
tmp = *p & ~((1U << ((~k & 7) << 2)) - 1);
x += __fmt_occ_e2_aux2(fmt, ti, tmp);
//if (k == 3965453116 || k == 3965453672 || k == 2668688087 || k == 2668688550)
//{
// print_base_uint32(tmp);
// cout << "多的: " << (~k & 7) << endl;
// cout << (x >> 24 & 0xff) << ' ' << (x >> 16 & 0xff) << ' ' << (x >> 8 & 0xff) << ' ' << (x & 0xff) << endl;
//}
#else // 该地址是bwt和pre_bwt字符串数据的首地址
uint32_t *end = p + ((k >> 3) - ((k & ~FMT_OCC_INTV_MASK) >> 3)); // this is the end point of the following loop
// p = end - (end - p) / 8;
// cout << "k - kbase: " << k - bwt_k_base_line << endl;
for (; p < end; ++p)
{
x += __fmt_occ_e2_aux2(fmt, ti, *p);
// print_base_uint32(*p);
}
tmp = *p & ~((1U << ((~k & 7) << 2)) - 1);
// print_base_uint32(tmp);
x += __fmt_occ_e2_aux2(fmt, ti, tmp);
#endif
if (b1 == 0)
{
x -= (~k & 7) << 8;
if (b2 == 0)
x -= (~k & 7) << 24;
// cout << "here-1" << endl;
}
// 如果跨过了second_primary,那么可能需要减掉一次累积值
if (b1 == fmt->first_base && bwt_k_base_line < fmt->sec_primary && bwt_k_line >= fmt->sec_primary)
{
if (b2 < fmt->last_base)
cnt[2] -= 1 ;
else if (b2 == fmt->last_base)
cnt[3] -= 1;
// cout << "here-2" << endl;
}
// if (k == 3965453116 || k == 3965453672 || k == 2668688087 || k == 2668688550)
// {
// cout << BASE[b1] << BASE[b2] << endl;
// cout << "final-x: " << (x >> 24 & 0xff) << ' ' << (x >> 16 & 0xff) << ' ' << (x >> 8 & 0xff) << ' ' << (x & 0xff) << endl;
// }
cnt[0] += x & 0xff;
cnt[1] += x >> 8 & 0xff;
cnt[2] += x >> 16 & 0xff;
cnt[3] += x >> 24 & 0xff;
// if (k == 3965453116 || k == 3965453672 || k == 2668688087 || k == 2668688550)
// cout << "final-occ: " << cnt[0] << ' ' << cnt[1] << ' ' << cnt[2] << ' ' << cnt[3] << endl;
}
// 扩展两个碱基
void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int is_back, int b1, int b2)
{
bwtint_t tk[4], tl[4], first_pos;
// tk表示在k行之前所有各个碱基累积出现次数tl表示在l行之前的累积
fmt_e2_occ(fmt, ik->x[!is_back] - 1, b1, b2, tk);
fmt_e2_occ(fmt, ik->x[!is_back] - 1 + ik->x[2], b1, b2, tl);
//fmt_e2_occ_2way(fmt, ik->x[!is_back] - 1, b1, b2, tk);
//fmt_e2_occ_2way(fmt, ik->x[!is_back] - 1 + ik->x[2], b1, b2, tl);
// 这里是反向扩展
//cout << BASE[b1] << BASE[b2] << endl;
//cout << "fmt: interval-1: " << tl[0] - tk[0] << ' ' << tl[0] << ' ' << tk[0] << endl;
//cout << "fmt: interval-2: " << tl[2] - tk[2] << ' ' << tl[2] << ' ' << tk[2] << endl;
ok->x[!is_back] = fmt->L2[b2] + 1 + tk[3];
ok->x[2] = tl[3] - tk[3];
// 第一次正向扩展
ok->x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= fmt->primary && ik->x[!is_back] + ik->x[2] - 1 >= fmt->primary) + tl[0] - tk[0];
// 第二次正向扩展
first_pos = fmt->L2[b1] + 1 + tk[1];
ok->x[is_back] = ok->x[is_back] + (first_pos <= fmt->primary && first_pos + tl[1] - tk[1] - 1 >= fmt->primary) + tl[2] - tk[2];
}
// 扩展一个碱基
void fmt_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int is_back, int b1)
{
bwtint_t tk[4], tl[4];
int b2 = 3;
// tk表示在k行之前所有各个碱基累积出现次数tl表示在l行之前的累积
fmt_e2_occ(fmt, ik->x[!is_back] - 1, b1, b2, tk);
fmt_e2_occ(fmt, ik->x[!is_back] - 1 + ik->x[2], b1, b2, tl);
//fmt_e2_occ_2way(fmt, ik->x[!is_back] - 1, b1, b2, tk);
//fmt_e2_occ_2way(fmt, ik->x[!is_back] - 1 + ik->x[2], b1, b2, tl);
// 这里是反向扩展
ok->x[!is_back] = fmt->L2[b1] + 1 + tk[1];
ok->x[2] = tl[1] - tk[1];
// 第一次正向扩展
ok->x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= fmt->primary && ik->x[!is_back] + ik->x[2] - 1 >= fmt->primary) + tl[0] - tk[0];
}
// 利用fmt搜索seed完整搜索只需要单向搜索
bwtintv_t fmt_search(FMTIndex *fmt, const string &q)
{
bwtintv_t ik;
bwtintv_t ok;
int i, c1, c2, x = 0;
int qlen = (int)q.size();
fmt_set_intv(fmt, bval(q[x]), ik);
ik.info = x + 1;
//cout << "fmt : " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl;
for (i = x + 1; i + 1 < qlen; i += 2)
{
if (bval(q[i]) < 4 && bval(q[i + 1]) < 4)
{
c1 = cbval(q[i]);
c2 = cbval(q[i + 1]);
//cout << BASE[c2] << BASE[c1] << endl;
//double tm_t = realtime();
fmt_extend2(fmt, &ik, &ok, 0, c1, c2);
ik = ok;
// t8 += realtime() - tm_t;
ik.info = i + 1;
//cout << "fmt : " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl;
}
else
{
break;
}
}
if (i < qlen && bval(q[i]) < 4)
{ // 最后一次扩展
c1 = cbval(q[i]);
// double tm_t = realtime();
fmt_extend1(fmt, &ik, &ok, 0, c1);
ik = ok;
// t9 += realtime() - tm_t;
ik.info = i + 1;
//cout << "fmt : " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl;
}
// cout << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl;
return ik;
}
uint32_t str2bit(string &str)
{
uint32_t pac = 0;
for (int i = 0; i < 16; ++i)
pac = (pac << 2) | bval(str[i]);
return pac;
}
int main_fmtidx(int argc, char **argv)
{
// string seq("ACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTA");
string seq("ACCCT");
string rseq = calc_reverse_seq(seq);
seq = seq + rseq;
//create_bwt_mtx(seq);
//cout << seq << endl;
// uint32_t cnt_table[4][256];
// fmt_gen_cnt_table(cnt_table);
// string ss1("ATATATCAATTCTCTT");
// string ss2("ATTGCCTCTGAATTAC");
// uint32_t bs1 = str2bit(ss1), bs2 = str2bit(ss2);
// uint32_t s1 = bs1;
// uint32_t x = 0;
// for (int m = 0; m < 4; ++m)
// {
// x = cnt_table[m][s1 & 0xff] + cnt_table[m][s1 >> 8 & 0xff] + cnt_table[m][s1 >> 16 & 0xff] + cnt_table[m][s1 >> 24];
// cout << ((x) >> 24 & 0xff) << ' ' << ((x) >> 16 & 0xff) << ' ' << ((x) >> 8 & 0xff) << ' ' << ((x) & 0xff) << ' ' << __fmt_mid_sum(x) << endl;
// }
// x = 0;
// s1 = bs2;
// for (int m = 0; m < 4; ++m)
// {
// x = cnt_table[m][s1 & 0xff] + cnt_table[m][s1 >> 8 & 0xff] + cnt_table[m][s1 >> 16 & 0xff] + cnt_table[m][s1 >> 24];
// cout << ((x) >> 24 & 0xff) << ' ' << ((x) >> 16 & 0xff) << ' ' << ((x) >> 8 & 0xff) << ' ' << ((x) & 0xff) << ' ' << __fmt_mid_sum(x) << endl;
// }
// return 0;
// string bwt_str = string(argv[1]) + ".bwt.str";
string bwt_idx = string(argv[1]) + ".128.bwt";
// string bwt_idx = string(argv[1]) + ".64.bwt";
// string bwt_idx = string(argv[1]) + ".16.bwt";
// string bwt_idx = string(argv[1]) + ".bwt";
//bwt_t *bwt = restore_bwt(bwt_str.c_str());
//create_interval_occ_bwt(bwt);
//dump_bwt(bwt_idx.c_str(), bwt);
//string fmt_idx = string(argv[1]) + ".fmt";
//string fmt_idx = string(argv[1]) + ".256.8.fmt";
string fmt_idx = string(argv[1]) + ".256.fmt";
//string fmt_idx = string(argv[1]) + ".128.fmt";
// string fmt_idx = string(argv[1]) + ".64.fmt";
// string fmt_idx = string(argv[1]) + ".32.fmt";
t11 = realtime();
bwt_t *bwt = restore_bwt(bwt_idx.c_str());
t11 = realtime() - t11;
cout << "[time read bwt:] " << t11 << "s" << endl;
t10 = realtime();
FMTIndex *fmt = restore_fmt(fmt_idx.c_str());
//FMTIndex *fmt = create_fmt_from_bwt(bwt);
//dump_fmt(fmt_idx.c_str(), fmt);
t10 = realtime() - t10;
cout << "[time gen fmt:] " << t10 << "s" << endl;
vector<string> seed_arr(10000000);
seed_arr[0] = "GCGATACTAAGA";
seed_arr[0] = "GAGAGCTGTTCCCGTGTTTTCCATGGTTT";
srand(time(NULL));
t1 = realtime();
for (int i = 0; i < (int)seed_arr.size(); ++i)
seed_arr[i] = generate_rand_seq(7);
t1 = realtime() - t1;
cout << "[time gen seed:] " << t1 << "s" << endl;
t2 = realtime();
for (int i = 0; i < (int)seed_arr.size(); ++i)
bwt_search(bwt, seed_arr[i]);
t2 = realtime() - t2;
cout << "[time bwt search:] " << t2 << "s" << endl;
t3 = realtime();
for (int i = 0; i < (int)seed_arr.size(); ++i)
fmt_search(fmt, seed_arr[i]);
t3 = realtime() - t3;
cout << "[time fmt search:] " << t3 << "s" << endl;
//return 0;
t4 = realtime();
for (int i = 0; i < (int)seed_arr.size(); ++i)
{
bwtintv_t p1 =
bwt_search2(bwt, seed_arr[i]);
bwtintv_t p2 = fmt_search(fmt, seed_arr[i]);
if (p1.x[0] != p2.x[0] || p1.x[1] != p2.x[1] || p1.x[2] != p2.x[2])
cout << seed_arr[i] << endl
<< p1.x[0] << ' ' << p1.x[1] << ' ' << p1.x[2] << endl
<< p2.x[0] << ' ' << p2.x[1] << ' ' << p2.x[2] << endl;
}
t4 = realtime() - t4;
cout << "[time bwt search 2:] " << t4 << "s" << endl;
//cout << "bwt occ: " << t5 << "s; " << t7 << '\t' << t10 << endl;
//cout << "fmt occ: " << t6 << "s; " << t11 << '\t' << t8 << '\t' << t9 << endl;
// bwt_search(bwt, s);
// bwt_search2(bwt, s);
// for (int i = 0; i < 120; ++i)
// {
// cout << i << '\t' << bwt_B0(bwt, i) << endl;
// }
// TGGGAT
// FMTIndex *fmt = create_fmt_from_bwt(bwt);
// dump_fmt("ref.fmt", fmt);
// FMTIndex *fmt = restore_fmt("tiny.fmt");
//fmt_search(fmt, s);
// cout << bwt->bwt_size << endl;
// cout << bwt->seq_len << endl;
//cout << "sec_: " << fmt->sec_bcp << '\t' << fmt->sec_primary << endl;
//uint8_t b8 = 2 << 4 | 2;
//cout << "AGAG: " << fmt->cnt_table[2][b8] << endl;
//cout << (((b8 >> 6) == 0 && (b8 >> 4 & 3) == 2) + ((b8 >> 2 & 3) == 0 && (b8 & 3) == 2)) << endl;
return 0;
}