bwa_perf/fmt_index.cpp

1208 lines
37 KiB
C++
Raw Permalink Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#include <iostream>
#include <stdint.h>
#include <stdlib.h>
#include <vector>
#include <sys/time.h>
#include <string>
#include <stdio.h>
#include <algorithm>
#include <string.h>
#include <immintrin.h>
#include <sstream>
#include <fstream>
#include "util.h"
#include "bwt.h"
#include "fmt_index.h"
using namespace std;
using std::cout;
using std::ifstream;
double t1 = 0, t2 = 0, t3 = 0, t4 = 0, t5 = 0, t6 = 0, t7 = 0, t8 = 0, t9 = 0, t10 = 0,
t11 = 0, t12 = 0, t13 = 0, t14 = 0;
long f1 = 0, f2 = 0, f3 = 0, f4 = 0, f5 = 0;
const static char BASE[4] = {'A', 'C', 'G', 'T'};
// 求反向互补序列
string calc_reverse_seq(string &seq)
{
string rseq(seq.size(), '0');
for (size_t i = 0; i < seq.size(); ++i)
{
if (seq[i] == 'A')
rseq[i] = 'T';
else if (seq[i] == 'C')
rseq[i] = 'G';
else if (seq[i] == 'G')
rseq[i] = 'C';
else if (seq[i] == 'T')
rseq[i] = 'A';
}
std::reverse(rseq.begin(), rseq.end());
return rseq;
}
// 打印32位整型数据中包含的pre-bwtbwt
void print_base_uint32(uint32_t p)
{
for (int i = 30; i > 0; i -= 4)
{
int b1 = p >> i & 3;
int b2 = p >> (i - 2) & 3;
cout << BASE[b1] << BASE[b2] << endl;
}
}
// 随机生成长度为len的序列
string generate_rand_seq(int len)
{
string seq(len, 'A');
for (int i = 0; i < len; ++i)
{
seq[i] = BASE[rand() % 4];
}
return seq;
}
// 创建bwt矩阵
void create_bwt_mtx(string &seq)
{
bwtint_t seq_len = seq.size() + 1;
string sarr[seq_len];
sarr[0] = seq + '$';
for (bwtint_t i = 1; i < seq_len; ++i)
{
sarr[i] = sarr[0].substr(i) + sarr[0].substr(0, i);
}
std::sort(sarr, sarr + seq_len);
// print bwt matrix
// for (int i = 0; i < seq_len; ++i)
//{
// // cout << i << ' ' << sarr[i] << endl;
// cout << sarr[i] << endl;
//}
// cout << "bwt string" << endl;
// for (int i = 0; i < seq_len; ++i)
// {
// cout << sarr[i].back();
// }
// cout << endl;
// cout << "pre bwt string" << endl;
// for (int i = 0; i < seq_len; ++i)
// {
// cout << sarr[i][seq_len - 2];
// }
// cout << endl;
}
// 生成occ每个字节对应一个pattern
void fmt_gen_cnt_occ(FMTIndex *fmt)
{
// 0-8大于a的occ8-16大于b的occ16-24b的occ
int i, a, b, ti;
uint32_t oa, ooa, ob, oob;
for (i = 0; i != 256; ++i) // 遍历单个字节的各种情况
{
for (a = 0; a < 4; ++a) // ba格式
{
oa = 0;
ooa = 0;
oa += ((i >> 4 & 3) == a) + ((i & 3) == a);
ooa += ((i >> 4 & 3) > a) + ((i & 3) > a);
for (b = 0; b < 4; ++b)
{
oob = ob = 0;
oob += ((i >> 6 & 3) > b && (i >> 4 & 3) == a) + ((i >> 2 & 3) > b && (i & 3) == a);
ob += ((i >> 6 & 3) == b && (i >> 4 & 3) == a) + ((i >> 2 & 3) == b && (i & 3) == a);
ti = a << 2 | b;
fmt->cnt_occ[ti][i]= ob << 24 | oob << 16 | oa << 8 | ooa ;
}
}
}
}
// fmt-index的count table4对应着bwt碱基的累积量0,1,2,3分别对应着bwt是A,C,G,Tpre-bwt的累积量
void fmt_gen_cnt_table(uint32_t cnt_table[4][256])
{
int i, j, k;
uint32_t x = 0;
for (i = 0; i != 256; ++i) // 遍历单个字节的各种情况
{
for (k = 0; k < 4; ++k) // bwt碱基
{
x = 0; // for [A,C,G,T][A,C,G,T]
for (j = 0; j != 4; ++j) // pre-bwt碱基
x |= (((i >> 6 & 3) == j && (i >> 4 & 3) == k) + ((i >> 2 & 3) == j && (i & 3) == k)) << (j << 3);
cnt_table[k][i] = x;
}
}
}
// 将fmt结构数据写入到二进制文件
void dump_fmt(const char *fn, const FMTIndex *fmt)
{
FILE *fp;
fp = xopen(fn, "wb");
err_fwrite(&fmt->primary, sizeof(bwtint_t), 1, fp);
err_fwrite(&fmt->sec_primary, sizeof(bwtint_t), 1, fp);
err_fwrite(&fmt->sec_bcp, sizeof(uint8_t), 1, fp);
err_fwrite(&fmt->first_base, sizeof(uint8_t), 1, fp);
err_fwrite(&fmt->last_base, sizeof(uint8_t), 1, fp);
err_fwrite(fmt->L2 + 1, sizeof(bwtint_t), 4, fp);
err_fwrite(fmt->bwt, 4, fmt->bwt_size, fp);
err_fflush(fp);
err_fclose(fp);
}
// 将kmer hash数据写入到文件
void dump_xmer_idx(const char *fn, const XmerHash *xh)
{
FILE *fp;
fp = xopen(fn, "wb");
err_fwrite(xh->xe10, 1, (1 << (10 << 1)) * sizeof(XmerEntryArr), fp);
err_fwrite(xh->xe11, 1, (1 << (11 << 1)) * sizeof(XmerEntry), fp);
err_fwrite(xh->xe12, 1, (1 << (12 << 1)) * sizeof(XmerEntry), fp);
err_fwrite(xh->xe13, 1, (1 << (13 << 1)) * sizeof(XmerEntry), fp);
err_fwrite(xh->xe14, 1, (1 << (14 << 1)) * sizeof(XmerEntry), fp);
err_fflush(fp);
err_fclose(fp);
}
// 从文件中读取xmer hash信息
XmerHash restore_xmer_idx(const char *fn)
{
FILE *fp = xopen(fn, "rb");
XmerHash xhash;
XmerHash *xh = &xhash;
int len = 1 << (10 << 1);
xh->xe10 = (XmerEntryArr *)malloc(len * sizeof(XmerEntryArr));
fread_fix(fp, len * sizeof(XmerEntryArr), xh->xe10);
len = 1 << (11 << 1);
xh->xe11 = (XmerEntry *)malloc(len * sizeof(XmerEntry));
fread_fix(fp, len * sizeof(XmerEntry), xh->xe11);
len = 1 << (12 << 1);
xh->xe12 = (XmerEntry *)malloc(len * sizeof(XmerEntry));
fread_fix(fp, len * sizeof(XmerEntry), xh->xe12);
len = 1 << (13 << 1);
xh->xe13 = (XmerEntry *)malloc(len * sizeof(XmerEntry));
fread_fix(fp, len * sizeof(XmerEntry), xh->xe13);
len = 1 << (14 << 1);
xh->xe14 = (XmerEntry *)malloc(len * sizeof(XmerEntry));
fread_fix(fp, len * sizeof(XmerEntry), xh->xe14);
err_fclose(fp);
return xhash;
}
// 从文件中读取fmt结构数据
FMTIndex *restore_fmt(const char *fn)
{
FMTIndex *fmt;
fmt = (FMTIndex *)calloc(1, sizeof(FMTIndex));
FILE *fp = xopen(fn, "rb");
fseek(fp, 0, SEEK_END);
fmt->bwt_size = (ftell(fp) - sizeof(bwtint_t) * 6 - 3) >> 2; // 以32位word为单位计算的size
fmt->bwt = (uint32_t *)calloc(fmt->bwt_size, 4);
fseek(fp, 0, SEEK_SET);
err_fread_noeof(&fmt->primary, sizeof(bwtint_t), 1, fp);
err_fread_noeof(&fmt->sec_primary, sizeof(bwtint_t), 1, fp);
err_fread_noeof(&fmt->sec_bcp, sizeof(uint8_t), 1, fp);
err_fread_noeof(&fmt->first_base, sizeof(uint8_t), 1, fp);
err_fread_noeof(&fmt->last_base, sizeof(uint8_t), 1, fp);
err_fread_noeof(fmt->L2 + 1, sizeof(bwtint_t), 4, fp);
fread_fix(fp, fmt->bwt_size << 2, fmt->bwt);
fmt->seq_len = fmt->L2[4];
err_fclose(fp);
fmt_gen_cnt_occ(fmt); // 字节所能表示的各种碱基组合中,各个碱基的累积数量
return fmt;
}
// 读取sa数据
void fmt_restore_sa(const char *fn, FMTIndex *fmt)
{
char skipped[256];
FILE *fp;
bwtint_t primary;
fp = xopen(fn, "rb");
err_fread_noeof(&primary, sizeof(bwtint_t), 1, fp);
xassert(primary == fmt->primary, "SA-BWT inconsistency: primary is not the same.");
err_fread_noeof(skipped, sizeof(bwtint_t), 4, fp); // skip
err_fread_noeof(&fmt->sa_intv, sizeof(bwtint_t), 1, fp);
err_fread_noeof(&primary, sizeof(bwtint_t), 1, fp);
xassert(primary == fmt->seq_len, "SA-BWT inconsistency: seq_len is not the same.");
fmt->n_sa = (fmt->seq_len + fmt->sa_intv) / fmt->sa_intv;
fmt->sa = (uint8_t *)malloc(SA_BYTES(fmt->n_sa));
fread_fix(fp, SA_BYTES(fmt->n_sa), fmt->sa);
err_fclose(fp);
}
// 读取pac ref数据
void fmt_load_ref_pac(const char *fn_prefix, FMTIndex *fmt)
{
string ann_fn = string(fn_prefix) + ".ann";
string pac_fn = string(fn_prefix) + ".pac";
FILE *fp;
fp = xopen(ann_fn.c_str(), "r");
fscanf(fp, "%ld", &fmt->l_pac);
err_fclose(fp);
// load pac
fmt->pac = (uint8_t*)calloc(fmt->l_pac / 4 + 1, 1);
fp = xopen(pac_fn.c_str(), "r");
err_fread_noeof(fmt->pac, 1, fmt->l_pac / 4 + 1, fp); // concatenated 2-bit encoded sequence
err_fclose(fp);
}
// 根据interval-bwt创建fmt-index
FMTIndex *create_fmt_from_bwt(bwt_t *bwt)
{
// FILE *fmt_out = fopen("fmt.txt", "w");
FMTIndex *fmt = (FMTIndex *)calloc(1, sizeof(FMTIndex));
fmt_gen_cnt_occ(fmt);
bwtint_t i, j, k, m, n, n_occ, cnt[4], cnt2[4];
uint32_t c[4], c2[16]; /*c用来保存原来的bwt碱基串的累积值c2用来保存pre-bwt和bwt碱基对的累计值如AA..TT*/
uint32_t *buf; /* 计算之后变成fmt结构中bwt部分 */
#ifdef FMT_MID_INTERVAL
// 加入中间的check point
uint32_t mc[4] = {0};
uint32_t cnt_table[4][256]; // 4对应原来的cnt_table0,1,2,3,分别对应该碱基的扩展
fmt_gen_cnt_table(cnt_table);
#endif
fmt->seq_len = bwt->seq_len; // bwt碱基序列的长度不包含$字符也就是该长度比bwt matrix长度少1
for (i = 0; i < 5; ++i)
fmt->L2[i] = bwt->L2[i]; // 每个碱基的总累积值
fmt->primary = bwt->primary; // $在末尾的行在bwt matrix行中的排序位置
n_occ = (bwt->seq_len + FMT_OCC_INTERVAL - 1) / FMT_OCC_INTERVAL + 1; // check point 个数
fmt->bwt_size = (fmt->seq_len * 2 + 15) >> 4; // 要保存最后两列碱基
fmt->bwt_size += n_occ * 20; // A,C,G,T和AA,AC.....TG,TT共20个
#ifdef FMT_MID_INTERVAL
uint32_t s1;
bwtint_t mn_occ = (bwt->seq_len >> FMT_OCC_INTV_SHIFT) * (FMT_OCC_INTERVAL / FMT_MID_INTERVAL - 1);
bwtint_t last_seq_len = bwt->seq_len % FMT_OCC_INTERVAL;
mn_occ += (last_seq_len + FMT_MID_INTERVAL - 1) / FMT_MID_INTERVAL - 1;
fmt->bwt_size += mn_occ * 4;
cout << (FMT_OCC_INTERVAL / FMT_MID_INTERVAL - 1) << ' ' << last_seq_len << ' ' << (last_seq_len + FMT_MID_INTERVAL - 1) / FMT_MID_INTERVAL - 1 << endl;
cout << "mn_occ: " << mn_occ << ' ' << mn_occ * 4 << endl;
i = 0;
//cout << ((i + 16 & FMT_MID_INTV_MASK) == 0) << endl;
// exit(0);
#endif
buf = (uint32_t *)calloc(fmt->bwt_size, 4); // 开辟计算fmt用到的缓存
c[0] = c[1] = c[2] = c[3] = 0;
// 首行的c2应该是对应的ACGT对应的行减去1的occ
for (i = 0; i < 4; ++i)
{
bwtint_t before_first_line = fmt->L2[i];
bwt_occ4(bwt, before_first_line, cnt);
for (j = i * 4, k = 0; k < 4; ++j, ++k)
c2[j] = cnt[k];
}
// k表示buf存储的偏移量
for (i = k = 0; i < bwt->seq_len; ++i)
{
// 记录occ
if (i % FMT_OCC_INTERVAL == 0)
{
memcpy(buf + k, c, sizeof(uint32_t) * 4); // bwt str中各个碱基的occ
k += 4;
memcpy(buf + k, c2, sizeof(uint32_t) * 16); // pre-bwt:bwt碱基对的occ
k += 16;
#ifdef FMT_MID_INTERVAL
mc[0] = mc[1] = mc[2] = mc[3] = 0;
#endif
}
// 每个32位整数保存8个倒数第二列碱基pre-bwt和8个倒数第一列(bwt)碱基
if (i % 16 == 0) // 每个32位整数可以包含16个碱基每次需要处理16个碱基也就是间隔最小可以设置为16
{
uint32_t pre_bwt_16_seq = 0; // 16个pre-bwt碱基串
uint32_t *bwt_addr = bwt_occ_intv(bwt, i) + 4; // 这里加4还是加8要看保存occ的是是uint32还是uint64bwt字符串i对应的基准行因为原始的bwt-cpcheck point包含由4个uint32_t(8个uint32_t)组成的occ信息
int offset = (i % OCC_INTERVAL) / 16; // 每OCC_INTERVAL个碱基共享同一个基准地址每16个碱基共用一个uint32整型因此需要偏移量来获取当前碱基串的首地址
uint32_t bwt_16_seq = *(bwt_addr + offset); // 待处理的当前16个碱基串的首地址
for (j = 0; j < 16; ++j) // 对于bwt碱基串一个一个碱基分别处理
{
bwtint_t cur_str_line = i + j; // 当前碱基在bwt str中的行排序
if (cur_str_line < bwt->seq_len) // 当前碱基行不应超出bwt str总碱基长度bwt str长度比bwt matrix长度少1因为bwt str不包含$
{
uint8_t bwt_base = bwt_B0(bwt, cur_str_line); // 对应行的bwt的碱基
// 先求出该碱基对应在第一列的行对应的bwt matrix行
bwtint_t cur_mtx_line = cur_str_line;
if (cur_str_line >= bwt->primary) // 因为bwt序列里除去了$符号,所以,超过$所在行之后对应的seq位置应该加一才是真正对应bwt matrix的行
cur_mtx_line += 1;
bwt_occ4(bwt, cur_mtx_line, cnt); // 获取原来bwt-checkpoint中的occ值
for (m=0; m<4; ++m)
c[m] = (uint32_t)cnt[m]; // 碱基m在cur_bwt_mtx_line(包含)之前的累积值直接拷贝原bwt中的occ即可
cnt[bwt_base] -= 1; // 得到cur_bwt_mtx_line(不包含)之前的累积量即bwt_occ4(bwt, cur_bwt_mtx_line-1, cnt)
bwtint_t bwt_base_mtx_line = bwt->L2[bwt_base] + 1 + cnt[bwt_base]; // bwt_base对应的bwt matrix行LF变换
bwt_occ4(bwt, bwt_base_mtx_line, cnt2); // 计算bwt_base_mtx_line之前的occ
for (n = 0; n < 4; ++n)
{
int c2_idx = bwt_base << 2 | n; // bwt base放在前边
c2[c2_idx] = (uint32_t)cnt2[n]; // pre-bwt:bwt 碱基对的累计值
}
bwtint_t bwt_base_str_line = bwt_base_mtx_line; // bwt-str中对应的行排序
if (bwt_base_str_line >= bwt->primary) // base_line表示在bwt str中的位置所以超出$为最尾所在行之后要减掉1
bwt_base_str_line -= 1; // bwt碱基序列行不包含$
uint32_t pre_bwt_base = bwt_B0(bwt, bwt_base_str_line); // bwt列碱基对应的前一个碱基pre-bwt
// 此时bwt_base对应的bwt matrix首行是$排在最尾的行说明bwt_base就是序列的第一个碱基
// 此时计算出来的pre_bwt_base就是primary前一行的bwt base以此来代替$字符,在后续的计算过程中需要考虑
if (bwt_base_mtx_line == bwt->primary)
{
// 计算sec_bcp
fmt->sec_bcp = pre_bwt_base << 2 | bwt_base; // 因为把$当成A处理了
fmt->sec_primary = cur_mtx_line; // pre-bwt base为$的行排序bwt-matrix行
fmt->first_base = bwt_base; // 原始序列第一个碱基
fmt->last_base = pre_bwt_base; // 计算后替代$字符的碱基应该是primary行上边一行对应的bwt base
}
// 暂存 pre-bwt碱基序列
pre_bwt_16_seq = pre_bwt_16_seq | (pre_bwt_base << (15-j)*2); // 序列靠前的碱基排在uint32_t数据中的高位
// 输出调试信息
// cout << "mtx line: " << cur_mtx_line << ' ' << c[0] << ' ' << c[1] << ' ' << c[2] << ' ' << c[3] << ' ';
// for (m = 0; m < 16; ++m)
// cout << c2[m] << ' ';
// cout << endl;
}
else
break;
}
// 保存bwt和pre_bwt
uint32_t pre_and_bwt_seq = 0;
uint32_t pre_and_bwt_seq_2 = 0;
for (m = 0; m < 8; ++m)
{
int lshift_bit = 30 - 2 * m;
pre_and_bwt_seq |= (((pre_bwt_16_seq & (3 << lshift_bit)) >> (m * 2)) | ((bwt_16_seq & (3 << lshift_bit)) >> ((m * 2) + 2)));
}
buf[k++] = pre_and_bwt_seq;
if (j > 8)
{
for (m = 8; m > 0; --m)
{
int lshift_bit = 2 * m - 2;
pre_and_bwt_seq_2 |= (((pre_bwt_16_seq & (3 << lshift_bit)) << (m * 2)) | ((bwt_16_seq & (3 << lshift_bit)) << (m * 2 - 2)));
}
#ifdef FMT_MID_INTERVAL // 计算前边8+8个碱基的mid interval occ
s1 = pre_and_bwt_seq;
for (m = 0; m < 4; ++m)
mc[m] += cnt_table[m][s1 & 0xff] + cnt_table[m][s1 >> 8 & 0xff] + cnt_table[m][s1 >> 16 & 0xff] + cnt_table[m][s1 >> 24];
#endif
#if FMT_MID_INTERVAL == 8 // 如果mid interval是8的话这里要保存一次
for (m = 0; m < 4; ++m)
buf[k++] = mc[m];
#endif
buf[k++] = pre_and_bwt_seq_2;
#ifdef FMT_MID_INTERVAL
s1 = pre_and_bwt_seq_2;
for (m = 0; m < 4; ++m)
mc[m] += cnt_table[m][s1 & 0xff] + cnt_table[m][s1 >> 8 & 0xff] + cnt_table[m][s1 >> 16 & 0xff] + cnt_table[m][s1 >> 24];
if ((i + 16) % FMT_OCC_INTERVAL != 0 && j == 16 && ((i + 16) & FMT_MID_INTV_MASK) == 0)
for (m = 0; m < 4; ++m)
buf[k++] = mc[m];
#endif
}
}
}
// the last element
memcpy(buf + k, c, sizeof(uint32_t) * 4);
k += 4;
memcpy(buf + k, c2, sizeof(uint32_t) * 16);
k += 16;
// cout << "n occ: " << n_occ << endl;
cout << "size: " << k << '\t' << fmt->bwt_size << endl;
// exit(0);
xassert(k == fmt->bwt_size, "inconsistent bwt_size");
// update fmt
fmt->bwt = buf;
return fmt;
}
// 扩展两个个碱基计算bwt base为b的pre-bwt str中各个碱基的occ
inline void fmt_e2_occ(const FMTIndex *fmt, bwtint_t k, int b1, int b2, bwtint_t cnt[4])
{
uint32_t x = 0;
uint32_t *p, *q, tmp;
bwtint_t str_line = k, cp_line = k & (~FMT_OCC_INTV_MASK);
int i, ti = b1 << 2 | b2;
cnt[0] = 0;
cnt[1] = 0;
cnt[2] = 0;
if (k == (bwtint_t)(-1))
{
p = fmt->bwt + 4 + b1 * 4;
for (i = b2 + 1; i < 4; ++i) cnt[2] += p[i];
cnt[3] = p[b2];
return;
}
k -= (k >= fmt->primary); // k由bwt矩阵对应的行转换成bwt字符串对应的行去掉了$,所以大于$的行都减掉1
p = fmt_occ_intv(fmt, k);
for (i = b1 + 1; i < 4; ++i) cnt[0] += p[i]; // 大于b1的碱基的occ之和
cnt[1] = p[b1]; // b1的occ
q = p + 4 + b1 * 4;
for (i = b2 + 1; i < 4 ; ++i) cnt[2] += q[i]; // 大于b2的occ之和
cnt[3] = q[b2]; // b2的occ
p += 20;
#ifdef FMT_MID_INTERVAL // 加入了middle checkpoint
// 使用mid interval信息
int mk = k % FMT_OCC_INTERVAL;
int n_mintv = mk >> FMT_MID_INTV_SHIFT;
if (n_mintv > 0) // 至少超过了第一个mid interval
{
p += n_mintv * (4 + (FMT_MID_INTERVAL >> 3)) - 4; // 对应的mid interval check point的首地址即A C G T的局部累积量
q = p + b1;
for (i = b1 + 1; i < 4; ++i)
x += p[i]; // 大于b1的碱基的occ之和
cnt[0] += __fmt_mid_sum(x);
x = *q;
cnt[1] += __fmt_mid_sum(x); // b1的occ
for (i = 3; i > b2; --i)
cnt[2] += x >> (i << 3) & 0xff; // 大于b2的occ之和
cnt[3] += x >> (b2 << 3) & 0xff; // b2的occ
x = 0;
p += 4;
}
#if FMT_MID_INTERVAL == 16 // middle check point interval等于16时候只需要判断一下是不是要计算两个uint32表示的碱基序列
if ((mk & FMT_MID_INTV_MASK) >> 3)
{
x += __fmt_occ_e2_aux2(fmt, ti, *p);
++p;
}
#elif FMT_MID_INTERVAL > 16 // 该地址是bwt和pre_bwt字符串数据的首地址
uint32_t *end = p + ((k >> 3) - ((k & ~FMT_MID_INTV_MASK) >> 3));
for (; p < end; ++p)
{
x += __fmt_occ_e2_aux2(fmt, ti, *p);
}
#endif
#else // 没有加入middle check point interval
#if FMT_OCC_INTERVAL > 16
uint32_t *end = p + ((k >> 3) - ((k & ~FMT_OCC_INTV_MASK) >> 3));
// p = end - (end - p) / 4;
for (; p < end; ++p)
{
x += __fmt_occ_e2_aux2(fmt, ti, *p);
}
#else // FMT_OCC_INTERVAL等于16的时候只需要判断一次就可以
if ((k & FMT_OCC_INTV_MASK) >> 3)
{
x += __fmt_occ_e2_aux2(fmt, ti, *p);
++p;
}
#endif
#endif
tmp = *p & ~((1U << ((~k & 7) << 2)) - 1);
x += __fmt_occ_e2_aux2(fmt, ti, tmp);
if (b1 == 0)
{
x -= (~k & 7) << 8;
if (b2 == 0)
x -= (~k & 7) << 24;
}
// 如果跨过了second_primary,那么可能需要减掉一次累积值
if (b1 == fmt->first_base && cp_line < fmt->sec_primary && str_line >= fmt->sec_primary)
{
if (b2 < fmt->last_base)
cnt[2] -= 1;
else if (b2 == fmt->last_base)
cnt[3] -= 1;
}
cnt[0] += x & 0xff;
cnt[1] += x >> 8 & 0xff;
cnt[2] += x >> 16 & 0xff;
cnt[3] += x >> 24 & 0xff;
}
// 这里的k是bwt str的行
inline static void fmt_get_previous_base(const FMTIndex *fmt, bwtint_t k, uint8_t *b1, uint8_t *b2)
{
uint32_t *p;
uint8_t base2;
// 第一步找到check point位置
p = fmt_occ_intv(fmt, k); // check point起始位置
p += 20; // bwt碱基起始位置
// 第二步找到mid check point位置
int mk = k & FMT_OCC_INTV_MASK;
int n_mintv = mk >> FMT_MID_INTV_SHIFT;
p += n_mintv * (4 + (FMT_MID_INTERVAL >> 3)); // 跳过mid间隔的bwt碱基位置
// 第三步找到具体的uint32_t
p += (k & FMT_MID_INTV_MASK) >> 3; // 每个uint32_t包含8个碱基和8个倒数第二bwt碱基
// 第四步,获取碱基
base2 = *p >> ((~(k) & 0x7) << 2) & 0xf;
*b2 = base2 >> 2 & 3;
*b1 = base2 & 3;
}
// k, k1, k2都是bwt矩阵对应的行
inline static void fmt_previous_line(const FMTIndex *fmt, bwtint_t k, bwtint_t *k1, bwtint_t *k2)
{
uint8_t b1, b2;
bwtint_t tk[4], kk;
kk = k - (k >= fmt->primary); // k由bwt矩阵对应的行转换成bwt字符串对应的行去掉了$,所以大于$的行都减掉1
fmt_get_previous_base(fmt, kk, &b1, &b2);
fmt_e2_occ(fmt, k, b1, b2, tk);
*k1 = fmt->L2[b1] + tk[1];
*k2 = fmt->L2[b2] + tk[3];
}
bwtint_t fmt_sa(const FMTIndex *fmt, bwtint_t k)
{
bwtint_t sa = 0, mask = fmt->sa_intv - 1;
bwtint_t k1, k2;
while (k & mask)
{
++sa;
fmt_previous_line(fmt, k, &k1, &k2);
__builtin_prefetch(fmt_occ_intv(fmt, k2), 0, 2);
if (!(k1 & mask))
{
k = k1;
break;
}
++sa;
k = k2;
}
sa += bwt_get_sa(fmt->sa, k / fmt->sa_intv);
return sa;
}
// 扩展两个碱基
inline void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int is_back, int b1, int b2)
{
bwtint_t tk[4], tl[4], first_pos;
// tk表示在k行之前所有各个碱基累积出现次数tl表示在l行之前的累积
fmt_e2_occ(fmt, ik->x[!is_back] - 1, b1, b2, tk);
fmt_e2_occ(fmt, ik->x[!is_back] - 1 + ik->x[2], b1, b2, tl);
// 这里是反向扩展
ok->x[!is_back] = fmt->L2[b2] + 1 + tk[3];
ok->x[2] = tl[3] - tk[3];
// 第一次正向扩展
ok->x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= fmt->primary && ik->x[!is_back] + ik->x[2] - 1 >= fmt->primary) + tl[0] - tk[0];
// 第二次正向扩展
first_pos = fmt->L2[b1] + 1 + tk[1];
ok->x[is_back] = ok->x[is_back] + (first_pos <= fmt->primary && first_pos + tl[1] - tk[1] - 1 >= fmt->primary) + tl[2] - tk[2];
}
// 扩展一个碱基
inline void fmt_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int is_back, int b1)
{
bwtint_t tk[4], tl[4];
int b2 = 3; // 如果只扩展一次那么第二个碱基设置成T可以减小一些计算量如计算大于b2的累积数量
// tk表示在k行之前所有各个碱基累积出现次数tl表示在l行之前的累积
fmt_e2_occ(fmt, ik->x[!is_back] - 1, b1, b2, tk);
fmt_e2_occ(fmt, ik->x[!is_back] - 1 + ik->x[2], b1, b2, tl);
// 这里是反向扩展
ok->x[!is_back] = fmt->L2[b1] + 1 + tk[1];
ok->x[2] = tl[1] - tk[1];
// 第一次正向扩展
ok->x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= fmt->primary && ik->x[!is_back] + ik->x[2] - 1 >= fmt->primary) + tl[0] - tk[0];
}
// 序列和参考基因直接对比
static void direct_extend(const FMTIndex *fmt, int len, const char *q, int left_pos, int right_pos, bwtint_t mtx_line, bwtintv_t *mt)
{
#define PAC_BASE(pac, l) ((pac)[(l) >> 2] >> ((~(l) & 3) << 1) & 3)
#define EXTEND_BASE_LOOP(qcond, rcond, qstep, rstep) \
while (k != qcond && r != rcond) \
{ \
const int base = PAC_BASE(fmt->pac, r); \
if (q[k] != "ACGT"[base]) \
break; \
k += qstep; \
r += rstep; \
}
#define EXTEND_BASE_LOOP_COMP(qcond, rcond, qstep, rstep) \
while (k != qcond && r != rcond) \
{ \
const int base = 3 - PAC_BASE(fmt->pac, r); \
if (q[k] != "ACGT"[base]) \
break; \
k += qstep; \
r += rstep; \
}
int k;
int64_t r, rp;
mt->num_match = 1;
rp = fmt_sa(fmt, mtx_line);
r = rp >= (int64_t)fmt->l_pac ? (fmt->l_pac << 1) - 1 - rp : rp;
k = right_pos;
if (rp < (int64_t)fmt->l_pac) // 匹配到了正向链
{
// 向前继续扩展
r += right_pos - left_pos;
EXTEND_BASE_LOOP(len, (int64_t)fmt->l_pac, 1, 1);
mt->rm[0].qe = k;
mt->rm[0].reverse = 0;
// 向后扩展x位置之前的碱基
r -= k - left_pos + 1;
k = left_pos - 1;
EXTEND_BASE_LOOP(-1, (int64_t)-1, -1, -1);
mt->rm[0].qs = k + 1;
mt->rm[0].rs = r + 1;
}
else // 匹配到了互补链
{
r -= right_pos - left_pos;
EXTEND_BASE_LOOP_COMP(len, (int64_t)-1, 1, -1);
mt->rm[0].qe = k;
mt->rm[0].reverse = 1;
// 扩展x之前的碱基
r += k - left_pos + 1;
k = left_pos - 1;
EXTEND_BASE_LOOP_COMP(-1, (int64_t)fmt->l_pac, -1, 1);
mt->rm[0].qs = k + 1;
mt->rm[0].rs = r - 1;
}
mt->info = mt->rm[0].qs;
mt->info = mt->info << 32 | mt->rm[0].qe;
mt->x[2] = 1;
}
// 利用fmt搜索seed完整搜索只需要单向搜索
bwtintv_t fmt_search(FMTIndex *fmt, const string &q)
{
bwtintv_t ik;
bwtintv_t ok;
int i, c1, c2, x = 0;
int qlen = (int)q.size();
fmt_set_intv(fmt, bval(q[x]), ik);
ik.info = x + 1;
for (i = x + 1; i + 1 < qlen; i += 2)
{
if (bval(q[i]) < 4 && bval(q[i + 1]) < 4)
{
c1 = cbval(q[i]);
c2 = cbval(q[i + 1]);
fmt_extend2(fmt, &ik, &ok, 0, c1, c2);
ik = ok;
ik.info = i + 1;
//cout << ik.x[0] << ' ' << ik.x[1] << ' ' << ik.x[2] << endl;
}
else
{
break;
}
}
if (i < qlen && bval(q[i]) < 4)
{ // 最后一次扩展
c1 = cbval(q[i]);
fmt_extend1(fmt, &ik, &ok, 0, c1);
ik = ok;
ik.info = i + 1;
//cout << ik.x[0] << ' ' << ik.x[1] << ' ' << ik.x[2] << endl;
}
return ik;
}
// 将用字符串表示的序列计算成用2bit表示的序列
uint32_t str2bit(string &str)
{
uint32_t pac = 0;
for (int i = 0; i < 16; ++i)
pac = (pac << 2) | bval(str[i]);
return pac;
}
// 生成所有KMER_LEN长度的序列字符串表示
void gen_all_seq(vector<string> &vseq, int kmer_len)
{
uint64_t seq_up_val = (1 << (kmer_len << 1));
vseq.clear();
vseq.resize(seq_up_val);
for (uint64_t i = 0; i < seq_up_val; ++i)
{
for (int j = kmer_len - 1; j >= 0; --j)
{
vseq[i].push_back(BASE[(i >> (j << 1)) & 3]);
}
}
}
// 获取xmer的fmt匹配信息
inline void xmer_getval_at(uint8_t *mem_addr, bwtintv_t *ok, int pos = 0)
{
bwtint_t x0, x1, x2;
int byte_idx = pos * 14;
uint8_t *arr = mem_addr + byte_idx;
x0 = *arr;
x0 = (x0 << 32) | *((uint32_t *)(arr + 1));
arr += 5;
x1 = *arr;
x1 = (x1 << 32) | *((uint32_t *)(arr + 1));
arr += 5;
x2 = *((uint32_t *)arr);
ok->x[0] = x0;
ok->x[1] = x1;
ok->x[2] = x2;
}
// 设置xmer第pos个碱基对应的fmt匹配信息
inline void xmer_setval_at(uint8_t *mem_addr, bwtintv_t ik, int pos = 0)
{
int byte_idx = pos * 14;
uint8_t *arr = mem_addr + byte_idx;
arr[0] = (uint8_t)(ik.x[0] >> 32);
*((uint32_t *)(arr + 1)) = (uint32_t)ik.x[0];
arr += 5;
arr[0] = (uint8_t)(ik.x[1] >> 32);
*((uint32_t *)(arr + 1)) = (uint32_t)ik.x[1];
arr += 5;
*((uint32_t *)arr) = (uint32_t)ik.x[2];
}
// 创建xmer hash索引
void create_xmer_index(FMTIndex *fmt)
{
vector<string> qarr;
gen_all_seq(qarr, 10);
bwtintv_t ik;
int i, j, c1, c2;
int qlen = 10;
bwtint_t tk[4], tl[4];
XmerHash *xh = &fmt->xmer_hash;
cout << "qlen: " << qlen << endl;
xh->xe10 = (XmerEntryArr *)malloc((1 << (10 << 1)) * sizeof(XmerEntryArr));
xh->xe11 = (XmerEntry *)malloc((1 << (11 << 1)) * sizeof(XmerEntry));
xh->xe12 = (XmerEntry *)malloc((1 << (12 << 1)) * sizeof(XmerEntry));
xh->xe13 = (XmerEntry *)malloc((1 << (13 << 1)) * sizeof(XmerEntry));
xh->xe14 = (XmerEntry *)malloc((1 << (14 << 1)) * sizeof(XmerEntry));
// 长度为10的kmer
for (j = 0; j < (int)qarr.size(); ++j)
{
string &q = qarr[j];
uint8_t *mem_addr = xh->xe10[j].intv_arr;
fmt_set_intv(fmt, bval(q[0]), ik);
xmer_setval_at(mem_addr, ik, 0);
// 每次扩展两个碱基
for (i = 1; i + 1 < qlen; i += 2)
{
c1 = cbval(q[i]);
c2 = cbval(q[i + 1]);
fmt_e2_occ(fmt, ik.x[1] - 1, c1, c2, tk);
fmt_e2_occ(fmt, ik.x[1] - 1 + ik.x[2], c1, c2, tl);
// 第一次扩展的结果
ik.x[0] = ik.x[0] + (ik.x[1] <= fmt->primary && ik.x[1] + ik.x[2] - 1 >= fmt->primary) + tl[0] - tk[0];
ik.x[1] = fmt->L2[c1] + 1 + tk[1];
ik.x[2] = tl[1] - tk[1];
xmer_setval_at(mem_addr, ik, i);
// 第二次扩展的结果
ik.x[0] = ik.x[0] + (ik.x[1] <= fmt->primary && ik.x[1] + ik.x[2] - 1 >= fmt->primary) + tl[2] - tk[2];
ik.x[1] = fmt->L2[c2] + 1 + tk[3];
ik.x[2] = tl[3] - tk[3];
xmer_setval_at(mem_addr, ik, i + 1);
}
{ // 最后一次扩展
c1 = cbval(q[i]);
c2 = 3;
fmt_e2_occ(fmt, ik.x[1] - 1, c1, c2, tk);
fmt_e2_occ(fmt, ik.x[1] - 1 + ik.x[2], c1, c2, tl);
// 第一次扩展的结果
ik.x[0] = ik.x[0] + (ik.x[1] <= fmt->primary && ik.x[1] + ik.x[2] - 1 >= fmt->primary) + tl[0] - tk[0];
ik.x[1] = fmt->L2[c1] + 1 + tk[1];
ik.x[2] = tl[1] - tk[1];
xmer_setval_at(mem_addr, ik, i);
}
}
// 长度11的kmer
gen_all_seq(qarr, 11);
for (j = 0; j < (int)qarr.size(); ++j)
{
bwtintv_t p = fmt_search(fmt, qarr[j]);
xmer_setval_at(fmt->xmer_hash.xe11[j].intv_arr, p);
}
// 长度12的kmer
gen_all_seq(qarr, 12);
for (j = 0; j < (int)qarr.size(); ++j)
{
bwtintv_t p = fmt_search(fmt, qarr[j]);
xmer_setval_at(fmt->xmer_hash.xe12[j].intv_arr, p);
}
//string q = "CACATATTGGCG";
//uint32_t qbit = 0;
//for (i = 0; i < 12; ++i)
//{
// qbit |= bval(q[i]) << ((11 - i) << 1);
//}
//cout << "qbit: " << qbit << endl;
//xmer_getval_at(fmt->xmer_hash.xe12[qbit].intv_arr, &ik);
// 长度13的kmer
gen_all_seq(qarr, 13);
for (j = 0; j < (int)qarr.size(); ++j)
{
bwtintv_t p = fmt_search(fmt, qarr[j]);
xmer_setval_at(fmt->xmer_hash.xe13[j].intv_arr, p);
}
// 长度14的kmer
gen_all_seq(qarr, 14);
for (j = 0; j < (int)qarr.size(); ++j)
{
bwtintv_t p = fmt_search(fmt, qarr[j]);
xmer_setval_at(fmt->xmer_hash.xe14[j].intv_arr, p);
}
}
// 利用fmt搜索seed利用xmer加速搜索过程
bwtintv_t fmt_search_use_xmer(FMTIndex *fmt, bwt_t *bwt, const string &q, int cmp_de)
{
bwtintv_t ik;
int i, c1, c2, x = 0;
int qlen = (int)q.size();
int max_xmer_len = 14;
// 先利用kmer进行索引查询
uint32_t qbit = 0;
x = min(max_xmer_len, qlen);
for (i = 0; i < x; ++i)
{
qbit |= bval(q[i]) << ((max_xmer_len - 1 - i) << 1);
}
if (x == 14)
{
xmer_getval_at(fmt->xmer_hash.xe14[qbit].intv_arr, &ik);
}
else if (x == 13)
{
xmer_getval_at(fmt->xmer_hash.xe13[qbit >> 2].intv_arr, &ik);
}
else if (x == 12)
{
// cout << "qbit: " << qbit << endl;
xmer_getval_at(fmt->xmer_hash.xe12[qbit >> 4].intv_arr, &ik);
}
else if (x == 11)
{
xmer_getval_at(fmt->xmer_hash.xe11[qbit >> 6].intv_arr, &ik);
}
else
{
xmer_getval_at(fmt->xmer_hash.xe10[qbit >> 8].intv_arr, &ik, x - 1);
}
#if 1
bwtintv_t ok;
// 每次扩展两个碱基
for (i = x; i + 1 < qlen; i += 2)
{
if (bval(q[i]) < 4 && bval(q[i + 1]) < 4)
{
c1 = cbval(q[i]);
c2 = cbval(q[i + 1]);
fmt_extend2(fmt, &ik, &ok, 0, c1, c2);
ik = ok;
ik.info = i + 1;
}
else
{
break;
}
}
if (i < qlen && bval(q[i]) < 4)
{ // 最后一次扩展
c1 = cbval(q[i]);
fmt_extend1(fmt, &ik, &ok, 0, c1);
ik = ok;
ik.info = i + 1;
}
#else // 测试一下bwt的性能
bwtintv_t ok[4];
for (i = x; i < qlen; ++i) {
if (bval(q[i]) < 4) {
c1 = cbval(q[i]);
bwt_extend(bwt, &ik, ok, 0);
ik = ok[c1];
ik.info = i + 1;
}
}
#endif
if (cmp_de) {
int64_t rp = fmt_sa(fmt, ik.x[0]);
rp = rp >= (int64_t)fmt->l_pac ? (fmt->l_pac << 1) - 1 - rp : rp;
ik.rm[0].rs = (uint32_t)rp;
}
return ik;
}
// 利用fmt搜索seed利用xmer, direct_extend加速搜索过程
bwtintv_t fmt_search_use_xmer_de(FMTIndex *fmt, const string &q)
{
bwtintv_t ik = {0};
int i, c1, c2, x = 0;
int qlen = (int)q.size();
bwtintv_t mt = {0};
int max_xmer_len = 14;
// 先利用kmer进行索引查询
uint32_t qbit = 0;
x = min(max_xmer_len, qlen);
for (i = 0; i < x; ++i)
{
qbit |= bval(q[i]) << ((max_xmer_len - 1 - i) << 1);
}
if (x == 14)
{
xmer_getval_at(fmt->xmer_hash.xe14[qbit].intv_arr, &ik);
}
else if (x == 13)
{
xmer_getval_at(fmt->xmer_hash.xe13[qbit >> 2].intv_arr, &ik);
}
else if (x == 12)
{
// cout << "qbit: " << qbit << endl;
xmer_getval_at(fmt->xmer_hash.xe12[qbit >> 4].intv_arr, &ik);
}
else if (x == 11)
{
xmer_getval_at(fmt->xmer_hash.xe11[qbit >> 6].intv_arr, &ik);
}
else
{
xmer_getval_at(fmt->xmer_hash.xe10[qbit >> 8].intv_arr, &ik, x - 1);
}
bwtintv_t ok = {0};
// 每次扩展两个碱基
for (i = x; i + 1 < qlen; i += 2)
{
if (bval(q[i]) < 4 && bval(q[i + 1]) < 4)
{
c1 = cbval(q[i]);
c2 = cbval(q[i + 1]);
fmt_extend2(fmt, &ik, &ok, 0, c1, c2);
if (ok.x[2] == 1) {
direct_extend(fmt, qlen, q.c_str(), 0, i + 2, ok.x[0], &mt);
return mt;
}
ik = ok;
ik.info = i + 1;
}
else
{
break;
}
}
if (i < qlen && bval(q[i]) < 4)
{ // 最后一次扩展
c1 = cbval(q[i]);
fmt_extend1(fmt, &ik, &ok, 0, c1);
ik = ok;
ik.info = i + 1;
}
return ik;
}
#define GEN_BWT_IDX 0
#define GEN_FMT_IDX 1
#define GEN_XMER_IDX 0
#define CMP_FMT_BWT_TIME 1
#define CMP_FMT_BWT_RESULT 1
// argv[1] 应该是索引的前缀
int main_fmtidx(int argc, char **argv)
{
string prefix = argv[1];
string bwt_str = prefix + ".bwt.str";
string bwt_idx, fmt_idx, xmer_idx, sa_fn, seq_fn;
bwt_t *bwt;
FMTIndex *fmt;
ostringstream oss_bwt, oss_fmt, oss_xmer, oss_sa;
oss_bwt << '.' << OCC_INTERVAL;
oss_fmt << '.' << FMT_OCC_INTERVAL;
oss_xmer << '.' << XMER_LEN;
oss_fmt << '.' << FMT_MID_INTERVAL;
oss_sa << ".33." << SA_INTV;
bwt_idx = prefix + oss_bwt.str() + ".bwt";
fmt_idx = prefix + oss_fmt.str() + ".fmt";
xmer_idx = prefix + ".14.kmer";
sa_fn = prefix + oss_sa.str() + ".sa";
seq_fn = "./seqs.txt";
cout << bwt_idx << endl;
cout << fmt_idx << endl;
cout << xmer_idx << endl;
cout << sa_fn << endl;
// 生成或读取bwt索引文件
double time_read_bwt = realtime();
#if GEN_BWT_IDX
bwt = restore_bwt(bwt_str.c_str());
create_interval_occ_bwt(bwt);
dump_bwt(bwt_idx.c_str(), bwt);
#else
bwt = restore_bwt(bwt_idx.c_str());
#endif
time_read_bwt = realtime() - time_read_bwt;
cout << "[time gen/read bwt:] " << time_read_bwt << "s" << endl;
// 生成或读取fmt索引文件
double time_read_fmt = realtime();
#if GEN_FMT_IDX
fmt = create_fmt_from_bwt(bwt);
dump_fmt(fmt_idx.c_str(), fmt);
#else
fmt = restore_fmt(fmt_idx.c_str());
#endif
time_read_fmt = realtime() - time_read_fmt;
cout << "[time gen/read fmt:] " << time_read_fmt << "s" << endl;
// 生成或读取xmer索引文件
double time_gen_xmer_hash = realtime();
#if GEN_XMER_IDX
create_xmer_index(fmt);
dump_xmer_idx(xmer_idx.c_str(), &fmt->xmer_hash);
#else
fmt->xmer_hash = restore_xmer_idx(xmer_idx.c_str());
#endif
time_gen_xmer_hash = realtime() - time_gen_xmer_hash;
cout << "[time gen xkmer hash:] " << time_gen_xmer_hash << "s" << endl;
// 读取sa
double time_load_sa = realtime();
fmt_restore_sa(sa_fn.c_str(), fmt);
time_load_sa = realtime() - time_load_sa;
cout << "[time load sa:] " << time_load_sa << "s" << endl;
// 读取pac
double time_load_pac = realtime();
fmt_load_ref_pac(prefix.c_str(), fmt);
time_load_pac = realtime() - time_load_pac;
cout << "[time load pac:] " << time_load_pac << "s" << endl;
//cout << "l_pac: " << fmt->l_pac << endl;
//return 0;
// 生成随机序列进行测试
int seq_size = 10000000;
//int seq_size = 1;
int seq_len = 60;
vector<string> seq_arr(seq_size);
//seq_arr[0] = "CACATATTGGCG";
// seq_arr[0] = "GAGAGCTGTTCCCGTGTTTTCCATGGTTT";
// seq_arr[0] = "ACCTTTCGAATTGA";
#if 0 // 随机生成seqs
srand(time(NULL));
double time_gen_seq = realtime();
for (int i = 0; i < (int)seq_arr.size(); ++i)
seq_arr[i] = generate_rand_seq(seq_len);
time_gen_seq = realtime() - time_gen_seq;
cout << "[time gen seq:] " << time_gen_seq << "s" << endl;
#else
ifstream seq_in(seq_fn.c_str(), ios::in);
string read_line;
seq_arr.resize(0);
while (!seq_in.eof())
{
seq_in >> read_line;
// cout << read_line.substr(0, seq_len) << endl;
seq_arr.push_back(read_line.substr(0, seq_len));
}
seq_size = seq_arr.size();
seq_in.close();
#endif
// 对比bwt和fmt搜索的时间
#if CMP_FMT_BWT_TIME
// bwt
double time_bwt_search = realtime();
for (int i = 0; i < (int)seq_arr.size(); ++i) {
// bwtintv_t p =
bwt_search(bwt, seq_arr[i]);
}
time_bwt_search = realtime() - time_bwt_search;
cout << "[time bwt search:] " << time_bwt_search << "s" << endl;
// fmt
double time_fmt_search = realtime();
for (int i = 0; i < (int)seq_arr.size(); ++i)
fmt_search(fmt, seq_arr[i]);
time_fmt_search = realtime() - time_fmt_search;
cout << "[time fmt search:] " << time_fmt_search << "s" << endl;
// kmer
double time_xmer_search = realtime();
for (int i = 0; i < (int)seq_arr.size(); ++i)
fmt_search_use_xmer(fmt, bwt, seq_arr[i], 0);
time_xmer_search = realtime() - time_xmer_search;
cout << "[time xmer search:] " << time_xmer_search << "s" << endl;
// direct-extend
double time_de_search = realtime();
for (int i = 0; i < (int)seq_arr.size(); ++i)
fmt_search_use_xmer_de(fmt, seq_arr[i]);
time_de_search = realtime() - time_de_search;
cout << "[time direct-extend search:] " << time_de_search << "s" << endl;
#endif
// 对比bwt和fmt搜索的结果
#if CMP_FMT_BWT_RESULT
double time_cmp = realtime();
long diff_num = 0;
for (int i = 0; i < (int)seq_arr.size(); ++i)
{
#if 0
bwtintv_t p1 = bwt_search2(bwt, seq_arr[i]);
// bwtintv_t p2 = fmt_search(fmt, seq_arr[i]);
bwtintv_t p2 = fmt_search_use_xmer(fmt, bwt, seq_arr[i], 0);
if (p1.x[0] != p2.x[0] || p1.x[1] != p2.x[1] || p1.x[2] != p2.x[2]) {
cout << "different: " << seq_arr[i] << endl
<< p1.x[0] << ' ' << p1.x[1] << ' ' << p1.x[2] << endl
<< p2.x[0] << ' ' << p2.x[1] << ' ' << p2.x[2] << endl;
diff_num++;
}
#else
bwtintv_t p1 = fmt_search_use_xmer(fmt, bwt, seq_arr[i], 1);
bwtintv_t p2 = fmt_search_use_xmer_de(fmt, seq_arr[i]);
if (p2.num_match == 1) {
if (p1.rm[0].rs != p2.rm[0].rs || p1.x[2] != 1) {
cout << "different: " << seq_arr[i] << '\t' << p1.rm[0].rs << '\t' << p2.rm[0].rs << endl;
diff_num++;
}
}
else if (p1.x[0] != p2.x[0] || p1.x[1] != p2.x[1] || p1.x[2] != p2.x[2]) {
cout << "different: " << seq_arr[i] << endl
<< p1.x[0] << ' ' << p1.x[1] << ' ' << p1.x[2] << endl
<< p2.x[0] << ' ' << p2.x[1] << ' ' << p2.x[2] << endl;
diff_num++;
}
#endif
}
time_cmp = realtime() - time_cmp;
cout << "[time compare bwt & fmt:] " << time_cmp << "s\tdifferent num: " << diff_num << endl;
#endif
return 0;
}