消融实验测试版本

This commit is contained in:
zzh 2024-04-01 21:32:48 +08:00
parent bad3112151
commit 98ae5bb779
5 changed files with 348 additions and 309 deletions

22
bwt.cpp
View File

@ -227,6 +227,28 @@ void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtin
} }
} }
// 设置某一行的排序值sa的索引有效值从1开始0设置为-1, 小端模式)
void bwt_set_sa(uint8_t *sa_arr, bwtint_t k, bwtint_t val)
{
const bwtint_t block_idx = (k >> 3) * 33; // 8个数为一组共享33个字节
const int val_idx_in_block = k & 7;
const bwtint_t start_byte_idx = block_idx + (val_idx_in_block << 2);
bwtint_t *sa_addr = (bwtint_t *)(sa_arr + start_byte_idx);
// *sa_addr &= (1 << val_idx_in_block) - 1; // 如果开辟内存的时候清零了,这一步可以省略,会清除后面的数据,只适合按递增顺序赋值
*sa_addr |= (val & ((1L << 33) - 1)) << val_idx_in_block;
}
// 获取某一行的排序值(小端模式)
bwtint_t bwt_get_sa(uint8_t *sa_arr, bwtint_t k)
{
const bwtint_t block_idx = (k >> 3) * 33; // 8个数为一组共享33个字节
const int val_idx_in_block = k & 7;
const bwtint_t start_byte_idx = block_idx + (val_idx_in_block << 2);
bwtint_t val = *(bwtint_t *)(sa_arr + start_byte_idx);
val = (val >> val_idx_in_block) & 8589934591;
return val;
}
void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back) void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back)
{ {
bwtint_t tk[4], tl[4]; bwtint_t tk[4], tl[4];

7
bwt.h
View File

@ -7,7 +7,7 @@
using std::string; using std::string;
// occ间隔对应的右移位数5表示间隔32行保存一次 // occ间隔对应的右移位数5表示间隔32行保存一次
#define OCC_INTV_SHIFT 6 #define OCC_INTV_SHIFT 7
#define OCC_INTERVAL (1LL << OCC_INTV_SHIFT) #define OCC_INTERVAL (1LL << OCC_INTV_SHIFT)
#define OCC_INTV_MASK (OCC_INTERVAL - 1) #define OCC_INTV_MASK (OCC_INTERVAL - 1)
@ -47,6 +47,9 @@ struct bwt_t
uint8_t *sa; uint8_t *sa;
}; };
void bwt_set_sa(uint8_t *sa_arr, bwtint_t k, bwtint_t val);
bwtint_t bwt_get_sa(uint8_t *sa_arr, bwtint_t k);
// k行(包含bwt mtx行)之前碱基c的累计总数, interval大于等于32才能正确计算 // k行(包含bwt mtx行)之前碱基c的累计总数, interval大于等于32才能正确计算
bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, uint8_t c); bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, uint8_t c);
// 统计k行bwt mtx行包含k行本身之前4种碱基累积数量这里的k是bwt矩阵里的行比bwt字符串多1 // 统计k行bwt mtx行包含k行本身之前4种碱基累积数量这里的k是bwt矩阵里的行比bwt字符串多1
@ -65,4 +68,4 @@ void bwt_extend2(const bwt_t *bwt, bwtintv_t *ik, bwtintv_t ok[4], int is_back,
void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back); void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back);
#endif #endif

View File

@ -9,11 +9,14 @@
#include <string.h> #include <string.h>
#include <immintrin.h> #include <immintrin.h>
#include <sstream> #include <sstream>
#include <fstream>
#include "util.h" #include "util.h"
#include "bwt.h" #include "bwt.h"
#include "fmt_index.h" #include "fmt_index.h"
using namespace std; using namespace std;
using std::cout;
using std::ifstream;
double t1 = 0, t2 = 0, t3 = 0, t4 = 0, t5 = 0, t6 = 0, t7 = 0, t8 = 0, t9 = 0, t10 = 0, double t1 = 0, t2 = 0, t3 = 0, t4 = 0, t5 = 0, t6 = 0, t7 = 0, t8 = 0, t9 = 0, t10 = 0,
t11 = 0, t12 = 0, t13 = 0, t14 = 0; t11 = 0, t12 = 0, t13 = 0, t14 = 0;
@ -155,48 +158,6 @@ void dump_fmt(const char *fn, const FMTIndex *fmt)
} }
// 将kmer hash数据写入到文件 // 将kmer hash数据写入到文件
void dump_kmer_idx(const char *fn, const KmerEntry *kmer_entry)
{
FILE *fp;
fp = xopen(fn, "wb");
err_fwrite(kmer_entry, 1, KMER_ARR_SIZE * sizeof(KmerEntry), fp);
err_fflush(fp);
err_fclose(fp);
}
// 从文件中读取kmer hash信息
KmerEntry *restore_kmer_idx(const char *fn)
{
FILE *fp = xopen(fn, "rb");
uint32_t kmer_bytes = KMER_ARR_SIZE * sizeof(KmerEntry);
KmerEntry *kmer_entry = (KmerEntry *)malloc(kmer_bytes);
fread_fix(fp, kmer_bytes, kmer_entry);
err_fclose(fp);
return kmer_entry;
}
// 将full kmer hash数据写入到文件
void dump_full_entry_idx(const char *fn, const FullEntry *full_entry)
{
FILE *fp;
fp = xopen(fn, "wb");
err_fwrite(full_entry, 1, FULL_ENTRY_ARR_SIZE * sizeof(FullEntry), fp);
err_fflush(fp);
err_fclose(fp);
}
// 从文件中读取full kmer hash信息
FullEntry *restore_full_entry_idx(const char *fn)
{
FILE *fp = xopen(fn, "rb");
uint32_t full_entry_bytes = FULL_ENTRY_ARR_SIZE * sizeof(FullEntry);
FullEntry *full_entry = (FullEntry *)malloc(full_entry_bytes);
fread_fix(fp, full_entry_bytes, full_entry);
err_fclose(fp);
return full_entry;
}
// 将full kmer hash数据写入到文件
void dump_xmer_idx(const char *fn, const XmerHash *xh) void dump_xmer_idx(const char *fn, const XmerHash *xh)
{ {
FILE *fp; FILE *fp;
@ -259,6 +220,41 @@ FMTIndex *restore_fmt(const char *fn)
return fmt; return fmt;
} }
// 读取sa数据
void fmt_restore_sa(const char *fn, FMTIndex *fmt)
{
char skipped[256];
FILE *fp;
bwtint_t primary;
fp = xopen(fn, "rb");
err_fread_noeof(&primary, sizeof(bwtint_t), 1, fp);
xassert(primary == fmt->primary, "SA-BWT inconsistency: primary is not the same.");
err_fread_noeof(skipped, sizeof(bwtint_t), 4, fp); // skip
err_fread_noeof(&fmt->sa_intv, sizeof(bwtint_t), 1, fp);
err_fread_noeof(&primary, sizeof(bwtint_t), 1, fp);
xassert(primary == fmt->seq_len, "SA-BWT inconsistency: seq_len is not the same.");
fmt->n_sa = (fmt->seq_len + fmt->sa_intv) / fmt->sa_intv;
fmt->sa = (uint8_t *)malloc(SA_BYTES(fmt->n_sa));
fread_fix(fp, SA_BYTES(fmt->n_sa), fmt->sa);
err_fclose(fp);
}
// 读取pac ref数据
void fmt_load_ref_pac(const char *fn_prefix, FMTIndex *fmt)
{
string ann_fn = string(fn_prefix) + ".ann";
string pac_fn = string(fn_prefix) + ".pac";
FILE *fp;
fp = xopen(ann_fn.c_str(), "r");
fscanf(fp, "%ld", &fmt->l_pac);
err_fclose(fp);
// load pac
fmt->pac = (uint8_t*)calloc(fmt->l_pac / 4 + 1, 1);
fp = xopen(pac_fn.c_str(), "r");
err_fread_noeof(fmt->pac, 1, fmt->l_pac / 4 + 1, fp); // concatenated 2-bit encoded sequence
err_fclose(fp);
}
// 根据interval-bwt创建fmt-index // 根据interval-bwt创建fmt-index
FMTIndex *create_fmt_from_bwt(bwt_t *bwt) FMTIndex *create_fmt_from_bwt(bwt_t *bwt)
{ {
@ -534,6 +530,59 @@ inline void fmt_e2_occ(const FMTIndex *fmt, bwtint_t k, int b1, int b2, bwtint_t
cnt[3] += x >> 24 & 0xff; cnt[3] += x >> 24 & 0xff;
} }
// 这里的k是bwt str的行
inline static void fmt_get_previous_base(const FMTIndex *fmt, bwtint_t k, uint8_t *b1, uint8_t *b2)
{
uint32_t *p;
uint8_t base2;
// 第一步找到check point位置
p = fmt_occ_intv(fmt, k); // check point起始位置
p += 20; // bwt碱基起始位置
// 第二步找到mid check point位置
int mk = k & FMT_OCC_INTV_MASK;
int n_mintv = mk >> FMT_MID_INTV_SHIFT;
p += n_mintv * (4 + (FMT_MID_INTERVAL >> 3)); // 跳过mid间隔的bwt碱基位置
// 第三步找到具体的uint32_t
p += (k & FMT_MID_INTV_MASK) >> 3; // 每个uint32_t包含8个碱基和8个倒数第二bwt碱基
// 第四步,获取碱基
base2 = *p >> ((~(k) & 0x7) << 2) & 0xf;
*b2 = base2 >> 2 & 3;
*b1 = base2 & 3;
}
// k, k1, k2都是bwt矩阵对应的行
inline static void fmt_previous_line(const FMTIndex *fmt, bwtint_t k, bwtint_t *k1, bwtint_t *k2)
{
uint8_t b1, b2;
bwtint_t tk[4], kk;
kk = k - (k >= fmt->primary); // k由bwt矩阵对应的行转换成bwt字符串对应的行去掉了$,所以大于$的行都减掉1
fmt_get_previous_base(fmt, kk, &b1, &b2);
fmt_e2_occ(fmt, k, b1, b2, tk);
*k1 = fmt->L2[b1] + tk[1];
*k2 = fmt->L2[b2] + tk[3];
}
bwtint_t fmt_sa(const FMTIndex *fmt, bwtint_t k)
{
bwtint_t sa = 0, mask = fmt->sa_intv - 1;
bwtint_t k1, k2;
while (k & mask)
{
++sa;
fmt_previous_line(fmt, k, &k1, &k2);
__builtin_prefetch(fmt_occ_intv(fmt, k2), 0, 2);
if (!(k1 & mask))
{
k = k1;
break;
}
++sa;
k = k2;
}
sa += bwt_get_sa(fmt->sa, k / fmt->sa_intv);
return sa;
}
// 扩展两个碱基 // 扩展两个碱基
inline void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int is_back, int b1, int b2) inline void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int is_back, int b1, int b2)
{ {
@ -566,6 +615,67 @@ inline void fmt_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int i
ok->x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= fmt->primary && ik->x[!is_back] + ik->x[2] - 1 >= fmt->primary) + tl[0] - tk[0]; ok->x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= fmt->primary && ik->x[!is_back] + ik->x[2] - 1 >= fmt->primary) + tl[0] - tk[0];
} }
// 序列和参考基因直接对比
static void direct_extend(const FMTIndex *fmt, int len, const char *q, int left_pos, int right_pos, bwtint_t mtx_line, bwtintv_t *mt)
{
#define PAC_BASE(pac, l) ((pac)[(l) >> 2] >> ((~(l) & 3) << 1) & 3)
#define EXTEND_BASE_LOOP(qcond, rcond, qstep, rstep) \
while (k != qcond && r != rcond) \
{ \
const int base = PAC_BASE(fmt->pac, r); \
if (q[k] != "ACGT"[base]) \
break; \
k += qstep; \
r += rstep; \
}
#define EXTEND_BASE_LOOP_COMP(qcond, rcond, qstep, rstep) \
while (k != qcond && r != rcond) \
{ \
const int base = 3 - PAC_BASE(fmt->pac, r); \
if (q[k] != "ACGT"[base]) \
break; \
k += qstep; \
r += rstep; \
}
int k;
int64_t r, rp;
mt->num_match = 1;
rp = fmt_sa(fmt, mtx_line);
r = rp >= (int64_t)fmt->l_pac ? (fmt->l_pac << 1) - 1 - rp : rp;
k = right_pos;
if (rp < (int64_t)fmt->l_pac) // 匹配到了正向链
{
// 向前继续扩展
r += right_pos - left_pos;
EXTEND_BASE_LOOP(len, (int64_t)fmt->l_pac, 1, 1);
mt->rm[0].qe = k;
mt->rm[0].reverse = 0;
// 向后扩展x位置之前的碱基
r -= k - left_pos + 1;
k = left_pos - 1;
EXTEND_BASE_LOOP(-1, (int64_t)-1, -1, -1);
mt->rm[0].qs = k + 1;
mt->rm[0].rs = r + 1;
}
else // 匹配到了互补链
{
r -= right_pos - left_pos;
EXTEND_BASE_LOOP_COMP(len, (int64_t)-1, 1, -1);
mt->rm[0].qe = k;
mt->rm[0].reverse = 1;
// 扩展x之前的碱基
r += k - left_pos + 1;
k = left_pos - 1;
EXTEND_BASE_LOOP_COMP(-1, (int64_t)fmt->l_pac, -1, 1);
mt->rm[0].qs = k + 1;
mt->rm[0].rs = r - 1;
}
mt->info = mt->rm[0].qs;
mt->info = mt->info << 32 | mt->rm[0].qe;
mt->x[2] = 1;
}
// 利用fmt搜索seed完整搜索只需要单向搜索 // 利用fmt搜索seed完整搜索只需要单向搜索
bwtintv_t fmt_search(FMTIndex *fmt, const string &q) bwtintv_t fmt_search(FMTIndex *fmt, const string &q)
{ {
@ -604,166 +714,6 @@ bwtintv_t fmt_search(FMTIndex *fmt, const string &q)
return ik; return ik;
} }
// 获取kmer的fmt匹配信息
inline void kmer_getval_at(KmerEntry *ke, bwtintv_t *ok, int pos)
{
bwtint_t x0, x1, x2;
int byte_idx = pos * 14;
uint8_t *arr = ke->intv_arr + byte_idx;
x0 = *arr;
x0 = (x0 << 32) | *((uint32_t *)(arr + 1));
arr += 5;
x1 = *arr;
x1 = (x1 << 32) | *((uint32_t *)(arr + 1));
arr += 5;
x2 = *((uint32_t *)arr);
ok->x[0] = x0;
ok->x[1] = x1;
ok->x[2] = x2;
}
// 设置kmer第pos个碱基对应的fmt匹配信息
inline void kmer_setval_at(KmerEntry *ke, bwtintv_t ik, int pos)
{
int byte_idx = pos * 14;
uint8_t *arr = ke->intv_arr + byte_idx;
arr[0] = (uint8_t)(ik.x[0] >> 32);
*((uint32_t *)(arr + 1)) = (uint32_t)ik.x[0];
arr += 5;
arr[0] = (uint8_t)(ik.x[1] >> 32);
*((uint32_t *)(arr + 1)) = (uint32_t)ik.x[1];
arr += 5;
*((uint32_t *)arr) = (uint32_t)ik.x[2];
}
inline void full_entry_set(FullEntry *fe, bwtintv_t ik)
{
uint8_t *arr = fe->intv_arr;
arr[0] = (uint8_t)(ik.x[0] >> 32);
*((uint32_t *)(arr + 1)) = (uint32_t)ik.x[0];
arr += 5;
arr[0] = (uint8_t)(ik.x[1] >> 32);
*((uint32_t *)(arr + 1)) = (uint32_t)ik.x[1];
arr += 5;
*((uint32_t *)arr) = (uint32_t)ik.x[2];
}
inline void full_entry_get(FullEntry *fe, bwtintv_t *ok)
{
bwtint_t x0, x1, x2;
uint8_t *arr = fe->intv_arr;
x0 = *arr;
x0 = (x0 << 32) | *((uint32_t *)(arr + 1));
arr += 5;
x1 = *arr;
x1 = (x1 << 32) | *((uint32_t *)(arr + 1));
arr += 5;
x2 = *((uint32_t *)arr);
ok->x[0] = x0;
ok->x[1] = x1;
ok->x[2] = x2;
}
// 查找并保存kmer中每扩展一个碱基对应的fmt位置信息
void fmt_search_store_kmer(FMTIndex *fmt, const string &q, KmerEntry &ke)
{
bwtintv_t ik;
int i, c1, c2;
int qlen = (int)q.size();
bwtint_t tk[4], tl[4];
fmt_set_intv(fmt, bval(q[0]), ik);
kmer_setval_at(&ke, ik, 0);
// 每次扩展两个碱基
for (i = 1; i + 1 < qlen; i += 2)
{
if (bval(q[i]) < 4 && bval(q[i + 1]) < 4)
{
c1 = cbval(q[i]);
c2 = cbval(q[i + 1]);
fmt_e2_occ(fmt, ik.x[1] - 1, c1, c2, tk);
fmt_e2_occ(fmt, ik.x[1] - 1 + ik.x[2], c1, c2, tl);
// 第一次扩展的结果
ik.x[0] = ik.x[0] + (ik.x[1] <= fmt->primary && ik.x[1] + ik.x[2] - 1 >= fmt->primary) + tl[0] - tk[0];
ik.x[1] = fmt->L2[c1] + 1 + tk[1];
ik.x[2] = tl[1] - tk[1];
kmer_setval_at(&ke, ik, i);
// 第二次扩展的结果
ik.x[0] = ik.x[0] + (ik.x[1] <= fmt->primary && ik.x[1] + ik.x[2] - 1 >= fmt->primary) + tl[2] - tk[2];
ik.x[1] = fmt->L2[c2] + 1 + tk[3];
ik.x[2] = tl[3] - tk[3];
kmer_setval_at(&ke, ik, i + 1);
}
else
{
break;
}
}
if (i < qlen && bval(q[i]) < 4)
{ // 最后一次扩展
c1 = cbval(q[i]);
c2 = 3;
fmt_e2_occ(fmt, ik.x[1] - 1, c1, c2, tk);
fmt_e2_occ(fmt, ik.x[1] - 1 + ik.x[2], c1, c2, tl);
// 第一次扩展的结果
ik.x[0] = ik.x[0] + (ik.x[1] <= fmt->primary && ik.x[1] + ik.x[2] - 1 >= fmt->primary) + tl[0] - tk[0];
ik.x[1] = fmt->L2[c1] + 1 + tk[1];
ik.x[2] = tl[1] - tk[1];
kmer_setval_at(&ke, ik, i);
}
}
// 利用fmt搜索seed利用kmer加速搜索过程
bwtintv_t fmt_search_use_kmer(FMTIndex *fmt, const string &q)
{
bwtintv_t ik;
bwtintv_t ok;
int i, c1, c2, x = 0;
int qlen = (int)q.size();
// 先利用kmer进行索引查询
uint32_t qbit = 0;
x = min(KMER_LEN, qlen);
//double tt = realtime();
for (i = 0; i < x; ++i)
{
qbit |= bval(q[i]) << ((KMER_LEN - 1 - i) << 1);
}
kmer_getval_at(&fmt->kmer_entry[qbit], &ik, x - 1);
//t1 += realtime() - tt;
// tt = realtime();
// 每次扩展两个碱基
for (i = x; i + 1 < qlen; i += 2)
{
if (bval(q[i]) < 4 && bval(q[i + 1]) < 4)
{
c1 = cbval(q[i]);
c2 = cbval(q[i + 1]);
fmt_extend2(fmt, &ik, &ok, 0, c1, c2);
ik = ok;
ik.info = i + 1;
}
else
{
break;
}
}
if (i < qlen && bval(q[i]) < 4)
{ // 最后一次扩展
c1 = cbval(q[i]);
fmt_extend1(fmt, &ik, &ok, 0, c1);
//fmt_extend2(fmt, &ik, &ok, 0, c1, 3);
ik = ok;
ik.info = i + 1;
}
//t2 += realtime() - tt;
//cout << ik.x[2] << endl;
return ik;
}
// 将用字符串表示的序列计算成用2bit表示的序列 // 将用字符串表示的序列计算成用2bit表示的序列
uint32_t str2bit(string &str) uint32_t str2bit(string &str)
{ {
@ -922,7 +872,7 @@ void create_xmer_index(FMTIndex *fmt)
} }
// 利用fmt搜索seed利用xmer加速搜索过程 // 利用fmt搜索seed利用xmer加速搜索过程
bwtintv_t fmt_search_use_xmer(FMTIndex *fmt, bwt_t *bwt, const string &q) bwtintv_t fmt_search_use_xmer(FMTIndex *fmt, bwt_t *bwt, const string &q, int cmp_de)
{ {
bwtintv_t ik; bwtintv_t ik;
int i, c1, c2, x = 0; int i, c1, c2, x = 0;
@ -957,7 +907,7 @@ bwtintv_t fmt_search_use_xmer(FMTIndex *fmt, bwt_t *bwt, const string &q)
{ {
xmer_getval_at(fmt->xmer_hash.xe10[qbit >> 8].intv_arr, &ik, x - 1); xmer_getval_at(fmt->xmer_hash.xe10[qbit >> 8].intv_arr, &ik, x - 1);
} }
#if 0 #if 1
bwtintv_t ok; bwtintv_t ok;
// 每次扩展两个碱基 // 每次扩展两个碱基
for (i = x; i + 1 < qlen; i += 2) for (i = x; i + 1 < qlen; i += 2)
@ -982,7 +932,7 @@ bwtintv_t fmt_search_use_xmer(FMTIndex *fmt, bwt_t *bwt, const string &q)
ik = ok; ik = ok;
ik.info = i + 1; ik.info = i + 1;
} }
#else #else // 测试一下bwt的性能
bwtintv_t ok[4]; bwtintv_t ok[4];
for (i = x; i < qlen; ++i) { for (i = x; i < qlen; ++i) {
if (bval(q[i]) < 4) { if (bval(q[i]) < 4) {
@ -994,90 +944,117 @@ bwtintv_t fmt_search_use_xmer(FMTIndex *fmt, bwt_t *bwt, const string &q)
} }
#endif #endif
if (cmp_de) {
int64_t rp = fmt_sa(fmt, ik.x[0]);
rp = rp >= (int64_t)fmt->l_pac ? (fmt->l_pac << 1) - 1 - rp : rp;
ik.rm[0].rs = (uint32_t)rp;
}
return ik; return ik;
} }
uint16_t *restore_kmer_bit(const char *fn, int kmer_len) // 利用fmt搜索seed利用xmer, direct_extend加速搜索过程
bwtintv_t fmt_search_use_xmer_de(FMTIndex *fmt, const string &q)
{ {
uint16_t *kbit = (uint16_t *)calloc(1L << ((kmer_len << 1) - 4), sizeof(uint16_t)); bwtintv_t ik = {0};
FILE *fp; int i, c1, c2, x = 0;
fp = xopen(fn, "rb"); int qlen = (int)q.size();
fread_fix(fp, (1 << ((kmer_len << 1) - 4)) * sizeof(uint16_t), kbit); bwtintv_t mt = {0};
err_fclose(fp);
return kbit;
}
void create_kmer_bit(FMTIndex *fmt, string &prefix, uint64_t kmer_len) int max_xmer_len = 14;
{ // 先利用kmer进行索引查询
uint16_t *kbit = (uint16_t *)calloc(1L << ((kmer_len << 1) - 4), sizeof(uint16_t)); uint32_t qbit = 0;
ostringstream oss; x = min(max_xmer_len, qlen);
oss << '.' << kmer_len << ".kmer.bit"; for (i = 0; i < x; ++i)
string kmer_bit = prefix + oss.str();
cout << kmer_bit << endl;
uint64_t seq_up_val = (1L << (kmer_len << 1));
cout << (1L << ((kmer_len << 1) - 4)) << '\t' << seq_up_val << endl;
string seq;
seq.resize(kmer_len);
int percent = 1;
double tt = realtime();
for (uint64_t i = 0; i < seq_up_val; ++i)
{ {
if ((i+1) % (seq_up_val / 100) == 0) qbit |= bval(q[i]) << ((max_xmer_len - 1 - i) << 1);
{
tt = realtime() - tt;
cout << percent++ << "\% finished, " << tt << "s elapsed." << endl;
tt = realtime();
}
for (int j = kmer_len - 1; j >= 0; --j)
{
seq[kmer_len - 1 - j] = BASE[(i >> (j << 1)) & 3];
}
uint32_t pos = i >> 4;
uint32_t bit_pos = i & 0xf;
//bwtintv_t p = fmt_search_use_xmer(fmt, seq);
//if (p.x[2] > 0) {
// kbit[pos] |= 1 << bit_pos;
//}
} }
FILE *fp; if (x == 14)
fp = xopen(kmer_bit.c_str(), "wb"); {
err_fwrite(kbit, 2, 1L << ((kmer_len << 1) - 4), fp); xmer_getval_at(fmt->xmer_hash.xe14[qbit].intv_arr, &ik);
err_fflush(fp); }
err_fclose(fp); else if (x == 13)
{
xmer_getval_at(fmt->xmer_hash.xe13[qbit >> 2].intv_arr, &ik);
}
else if (x == 12)
{
// cout << "qbit: " << qbit << endl;
xmer_getval_at(fmt->xmer_hash.xe12[qbit >> 4].intv_arr, &ik);
}
else if (x == 11)
{
xmer_getval_at(fmt->xmer_hash.xe11[qbit >> 6].intv_arr, &ik);
}
else
{
xmer_getval_at(fmt->xmer_hash.xe10[qbit >> 8].intv_arr, &ik, x - 1);
}
bwtintv_t ok = {0};
// 每次扩展两个碱基
for (i = x; i + 1 < qlen; i += 2)
{
if (bval(q[i]) < 4 && bval(q[i + 1]) < 4)
{
c1 = cbval(q[i]);
c2 = cbval(q[i + 1]);
fmt_extend2(fmt, &ik, &ok, 0, c1, c2);
if (ok.x[2] == 1) {
direct_extend(fmt, qlen, q.c_str(), 0, i + 2, ok.x[0], &mt);
return mt;
}
ik = ok;
ik.info = i + 1;
}
else
{
break;
}
}
if (i < qlen && bval(q[i]) < 4)
{ // 最后一次扩展
c1 = cbval(q[i]);
fmt_extend1(fmt, &ik, &ok, 0, c1);
ik = ok;
ik.info = i + 1;
}
return ik;
} }
#define GEN_BWT_IDX 0 #define GEN_BWT_IDX 0
#define GEN_FMT_IDX 0 #define GEN_FMT_IDX 1
#define GEN_XMER_IDX 0 #define GEN_XMER_IDX 0
#define CMP_FMT_BWT_TIME 1 #define CMP_FMT_BWT_TIME 1
#define CMP_FMT_BWT_RESULT 0 #define CMP_FMT_BWT_RESULT 1
// argv[1] 应该是索引的前缀 // argv[1] 应该是索引的前缀
int main_fmtidx(int argc, char **argv) int main_fmtidx(int argc, char **argv)
{ {
//cout << "size: " << sizeof(FullEntry) << '\t' << sizeof(XmerEntryArr) << endl;
string prefix = argv[1]; string prefix = argv[1];
string bwt_str = prefix + ".bwt.str"; string bwt_str = prefix + ".bwt.str";
string bwt_idx, fmt_idx, xmer_idx; string bwt_idx, fmt_idx, xmer_idx, sa_fn, seq_fn;
bwt_t *bwt; bwt_t *bwt;
FMTIndex *fmt; FMTIndex *fmt;
ostringstream oss_bwt, oss_fmt, oss_xmer; ostringstream oss_bwt, oss_fmt, oss_xmer, oss_sa;
oss_bwt << '.' << OCC_INTERVAL; oss_bwt << '.' << OCC_INTERVAL;
oss_fmt << '.' << FMT_OCC_INTERVAL; oss_fmt << '.' << FMT_OCC_INTERVAL;
oss_xmer << '.' << XMER_LEN; oss_xmer << '.' << XMER_LEN;
oss_fmt << '.' << FMT_MID_INTERVAL; oss_fmt << '.' << FMT_MID_INTERVAL;
oss_sa << ".33." << SA_INTV;
bwt_idx = prefix + oss_bwt.str() + ".bwt"; bwt_idx = prefix + oss_bwt.str() + ".bwt";
fmt_idx = prefix + oss_fmt.str() + ".fmt"; fmt_idx = prefix + oss_fmt.str() + ".fmt";
xmer_idx = prefix + ".14.kmer"; xmer_idx = prefix + ".14.kmer";
sa_fn = prefix + oss_sa.str() + ".sa";
seq_fn = "./seqs.txt";
cout << bwt_idx << endl; cout << bwt_idx << endl;
cout << fmt_idx << endl; cout << fmt_idx << endl;
cout << xmer_idx << endl; cout << xmer_idx << endl;
cout << sa_fn << endl;
// 生成或读取bwt索引文件
// 生成或读取bwt索引文件
double time_read_bwt = realtime(); double time_read_bwt = realtime();
#if GEN_BWT_IDX #if GEN_BWT_IDX
bwt = restore_bwt(bwt_str.c_str()); bwt = restore_bwt(bwt_str.c_str());
@ -1100,7 +1077,7 @@ int main_fmtidx(int argc, char **argv)
time_read_fmt = realtime() - time_read_fmt; time_read_fmt = realtime() - time_read_fmt;
cout << "[time gen/read fmt:] " << time_read_fmt << "s" << endl; cout << "[time gen/read fmt:] " << time_read_fmt << "s" << endl;
// 生成xmer索引文件 // 生成或读取xmer索引文件
double time_gen_xmer_hash = realtime(); double time_gen_xmer_hash = realtime();
#if GEN_XMER_IDX #if GEN_XMER_IDX
create_xmer_index(fmt); create_xmer_index(fmt);
@ -1111,80 +1088,119 @@ int main_fmtidx(int argc, char **argv)
time_gen_xmer_hash = realtime() - time_gen_xmer_hash; time_gen_xmer_hash = realtime() - time_gen_xmer_hash;
cout << "[time gen xkmer hash:] " << time_gen_xmer_hash << "s" << endl; cout << "[time gen xkmer hash:] " << time_gen_xmer_hash << "s" << endl;
int kmer_bit_len = 18; // 读取sa
//create_kmer_bit(fmt, prefix, kmer_bit_len); double time_load_sa = realtime();
// return 0; fmt_restore_sa(sa_fn.c_str(), fmt);
time_load_sa = realtime() - time_load_sa;
cout << "[time load sa:] " << time_load_sa << "s" << endl;
// 读取pac
double time_load_pac = realtime();
fmt_load_ref_pac(prefix.c_str(), fmt);
time_load_pac = realtime() - time_load_pac;
cout << "[time load pac:] " << time_load_pac << "s" << endl;
//cout << "l_pac: " << fmt->l_pac << endl;
//return 0;
// 生成随机序列进行测试 // 生成随机序列进行测试
int seq_size = 10000000; int seq_size = 10000000;
//int seq_size = 1; //int seq_size = 1;
int seq_len = kmer_bit_len; int seq_len = 60;
vector<string> seq_arr(seq_size); vector<string> seq_arr(seq_size);
seq_arr[0] = "CACATATTGGCG"; //seq_arr[0] = "CACATATTGGCG";
// seq_arr[0] = "GAGAGCTGTTCCCGTGTTTTCCATGGTTT"; // seq_arr[0] = "GAGAGCTGTTCCCGTGTTTTCCATGGTTT";
// seq_arr[0] = "ACCTTTCGAATTGA"; // seq_arr[0] = "ACCTTTCGAATTGA";
#if 1 #if 0 // 随机生成seqs
srand(time(NULL)); srand(time(NULL));
double time_gen_seq = realtime(); double time_gen_seq = realtime();
for (int i = 0; i < (int)seq_arr.size(); ++i) for (int i = 0; i < (int)seq_arr.size(); ++i)
seq_arr[i] = generate_rand_seq(seq_len); seq_arr[i] = generate_rand_seq(seq_len);
time_gen_seq = realtime() - time_gen_seq; time_gen_seq = realtime() - time_gen_seq;
cout << "[time gen seq:] " << time_gen_seq << "s" << endl; cout << "[time gen seq:] " << time_gen_seq << "s" << endl;
#else
ifstream seq_in(seq_fn.c_str(), ios::in);
string read_line;
seq_arr.resize(0);
while (!seq_in.eof())
{
seq_in >> read_line;
// cout << read_line.substr(0, seq_len) << endl;
seq_arr.push_back(read_line.substr(0, seq_len));
}
seq_size = seq_arr.size();
seq_in.close();
#endif #endif
// 对比bwt和fmt搜索的时间 // 对比bwt和fmt搜索的时间
#if CMP_FMT_BWT_TIME #if CMP_FMT_BWT_TIME
//int nintv_0 = 0; // bwt
//bwtint_t max_gap = 0;
double time_bwt_search = realtime(); double time_bwt_search = realtime();
for (int i = 0; i < (int)seq_arr.size(); ++i) { for (int i = 0; i < (int)seq_arr.size(); ++i) {
bwtintv_t p = bwt_search(bwt, seq_arr[i]); // bwtintv_t p =
//if (max_gap < p.x[2]) bwt_search(bwt, seq_arr[i]);
// max_gap = p.x[2];
//if (p.x[2] == 0)
// ++nintv_0;
} }
//cout << "all: " << seq_arr.size() << " zero: " << nintv_0 << endl;
//cout << "max gap: " << max_gap << endl;
time_bwt_search = realtime() - time_bwt_search; time_bwt_search = realtime() - time_bwt_search;
cout << "[time bwt search:] " << time_bwt_search << "s" << endl; cout << "[time bwt search:] " << time_bwt_search << "s" << endl;
// fmt
double time_fmt_search = realtime(); double time_fmt_search = realtime();
for (int i = 0; i < (int)seq_arr.size(); ++i) for (int i = 0; i < (int)seq_arr.size(); ++i)
fmt_search(fmt, seq_arr[i]); fmt_search(fmt, seq_arr[i]);
time_fmt_search = realtime() - time_fmt_search; time_fmt_search = realtime() - time_fmt_search;
cout << "[time fmt search:] " << time_fmt_search << "s" << endl; cout << "[time fmt search:] " << time_fmt_search << "s" << endl;
// kmer
double time_xmer_search = realtime(); double time_xmer_search = realtime();
for (int i = 0; i < (int)seq_arr.size(); ++i) for (int i = 0; i < (int)seq_arr.size(); ++i)
fmt_search_use_xmer(fmt, bwt, seq_arr[i]); fmt_search_use_xmer(fmt, bwt, seq_arr[i], 0);
time_xmer_search = realtime() - time_xmer_search; time_xmer_search = realtime() - time_xmer_search;
cout << "[time xmer search:] " << time_xmer_search << "s" << endl; cout << "[time xmer search:] " << time_xmer_search << "s" << endl;
#endif
// cout << "t1: " << t1 << "s; t2: " << t2 << "s" << endl;
// cout << "t3: " << t3 << "s; t4: " << t4 << "s" << endl;
// cout << "t5: " << t5 << "s; t4: " << t6 << "s" << endl;
// ostringstream oss_kmer_bit; // direct-extend
// oss_kmer_bit << prefix << "." << kmer_bit_len << ".kmer.bit"; double time_de_search = realtime();
// uint16_t *kbit = restore_kmer_bit(oss_kmer_bit.str().c_str(), kmer_bit_len); for (int i = 0; i < (int)seq_arr.size(); ++i)
fmt_search_use_xmer_de(fmt, seq_arr[i]);
time_de_search = realtime() - time_de_search;
cout << "[time direct-extend search:] " << time_de_search << "s" << endl;
#endif
// 对比bwt和fmt搜索的结果 // 对比bwt和fmt搜索的结果
#if CMP_FMT_BWT_RESULT #if CMP_FMT_BWT_RESULT
double time_cmp = realtime(); double time_cmp = realtime();
long diff_num = 0;
for (int i = 0; i < (int)seq_arr.size(); ++i) for (int i = 0; i < (int)seq_arr.size(); ++i)
{ {
#if 0
bwtintv_t p1 = bwt_search2(bwt, seq_arr[i]); bwtintv_t p1 = bwt_search2(bwt, seq_arr[i]);
// bwtintv_t p2 = fmt_search(fmt, seq_arr[i]); // bwtintv_t p2 = fmt_search(fmt, seq_arr[i]);
bwtintv_t p2 = fmt_search_use_xmer(fmt, seq_arr[i]); bwtintv_t p2 = fmt_search_use_xmer(fmt, bwt, seq_arr[i], 0);
if (p1.x[0] != p2.x[0] || p1.x[1] != p2.x[1] || p1.x[2] != p2.x[2]) if (p1.x[0] != p2.x[0] || p1.x[1] != p2.x[1] || p1.x[2] != p2.x[2]) {
cout << seq_arr[i] << endl cout << "different: " << seq_arr[i] << endl
<< p1.x[0] << ' ' << p1.x[1] << ' ' << p1.x[2] << endl << p1.x[0] << ' ' << p1.x[1] << ' ' << p1.x[2] << endl
<< p2.x[0] << ' ' << p2.x[1] << ' ' << p2.x[2] << endl; << p2.x[0] << ' ' << p2.x[1] << ' ' << p2.x[2] << endl;
diff_num++;
}
#else
bwtintv_t p1 = fmt_search_use_xmer(fmt, bwt, seq_arr[i], 1);
bwtintv_t p2 = fmt_search_use_xmer_de(fmt, seq_arr[i]);
if (p2.num_match == 1) {
if (p1.rm[0].rs != p2.rm[0].rs || p1.x[2] != 1) {
cout << "different: " << seq_arr[i] << '\t' << p1.rm[0].rs << '\t' << p2.rm[0].rs << endl;
diff_num++;
}
}
else if (p1.x[0] != p2.x[0] || p1.x[1] != p2.x[1] || p1.x[2] != p2.x[2]) {
cout << "different: " << seq_arr[i] << endl
<< p1.x[0] << ' ' << p1.x[1] << ' ' << p1.x[2] << endl
<< p2.x[0] << ' ' << p2.x[1] << ' ' << p2.x[2] << endl;
diff_num++;
}
#endif
} }
time_cmp = realtime() - time_cmp; time_cmp = realtime() - time_cmp;
cout << "[time compare bwt & fmt:] " << time_cmp << "s" << endl; cout << "[time compare bwt & fmt:] " << time_cmp << "s\tdifferent num: " << diff_num << endl;
#endif #endif
return 0; return 0;

View File

@ -22,7 +22,8 @@
#elif FMT_MID_INTERVAL == 32 #elif FMT_MID_INTERVAL == 32
#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 48)) #define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 48))
#elif FMT_MID_INTERVAL == 64 #elif FMT_MID_INTERVAL == 64
#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 32)) //#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 32))
#define fmt_occ_intv(b, k) ((b)->bwt + ((k) >> 8 << 6))
#elif FMT_MID_INTERVAL == 128 #elif FMT_MID_INTERVAL == 128
#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 24)) #define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 24))
#else #else
@ -38,26 +39,12 @@
#define __fmt_mid_sum(x) \ #define __fmt_mid_sum(x) \
((x) >> 24 & 0xff) + ((x) >> 16 & 0xff) + ((x) >> 8 & 0xff) + ((x) & 0xff) ((x) >> 24 & 0xff) + ((x) >> 16 & 0xff) + ((x) >> 8 & 0xff) + ((x) & 0xff)
#define KMER_LEN 12
#define KMER_ARR_SIZE (1 << (KMER_LEN << 1))
#define XMER_LEN 14 #define XMER_LEN 14
// 用来保存kmer对应的fmt的位置信息 // sa存储的行间隔
struct KmerEntry #define SA_INTV 4
{ #define SA_BYTES(n_sa) ((((33 * (n_sa) + 7) / 8) & (~7L)) + 8)
// 40+40+32 14个byte这样好处理
uint8_t intv_arr[14 * KMER_LEN]; // 保存kmer中每扩展一个碱基对应的bwtintv_t数据
};
#define FULL_ENTRY_LEN 14
#define FULL_ENTRY_ARR_SIZE (1 << (FULL_ENTRY_LEN << 1))
// kmer全部匹配对应的最后fmt匹配结果
struct FullEntry
{
uint8_t intv_arr[14];
};
// x-mer entry, 1 - 14个碱基组成的hash key // x-mer entry, 1 - 14个碱基组成的hash key
struct XmerEntry struct XmerEntry
@ -93,9 +80,10 @@ struct FMTIndex
uint8_t sec_bcp; // base couple for sec primary line, AA=>0, AC=>1 ... TT=>15 uint8_t sec_bcp; // base couple for sec primary line, AA=>0, AC=>1 ... TT=>15
uint8_t first_base; // 序列的第一个碱基2bit的int类型0,1,2,3 uint8_t first_base; // 序列的第一个碱基2bit的int类型0,1,2,3
uint8_t last_base; // dollar转换成的base uint8_t last_base; // dollar转换成的base
// ref pac相关
bwtint_t l_pac; // 参考序列长度
uint8_t *pac; // 保存2bit编码的参考序列
// 保存kmer对应的fmt位置信息 // 保存kmer对应的fmt位置信息
KmerEntry *kmer_entry;
FullEntry *full_entry;
XmerHash xmer_hash; XmerHash xmer_hash;
// suffix array // suffix array
int sa_intv; int sa_intv;

10
util.h
View File

@ -17,10 +17,20 @@ typedef uint64_t bwtint_t;
double realtime(void); double realtime(void);
typedef struct
{
int qs; // query start
int qe; // query end [)
int reverse; // 反向链
uint32_t rs; // 参考基因的起始点(包含)
} ref_match_t;
// 在fm-indexv(或者bwt)查找过程中,记录结果 // 在fm-indexv(或者bwt)查找过程中,记录结果
struct bwtintv_t struct bwtintv_t
{ {
bwtint_t x[3], info; // x[0]表示正链位置x[1]表示互补链位置x[2]表示间隔长度info 表示read的起始结束位置 bwtint_t x[3], info; // x[0]表示正链位置x[1]表示互补链位置x[2]表示间隔长度info 表示read的起始结束位置
int num_match;
ref_match_t rm[2];
}; };
// 读取单个数据 // 读取单个数据