解决了创建fmt时候的bug,获取bwt str地址时候应该用bwt 的 occ-interval计算,而不是fmt的occ interval

This commit is contained in:
zzh 2024-01-31 15:50:33 +08:00
parent fe34be5d3a
commit a500d76eaa
3 changed files with 6 additions and 3 deletions

View File

@ -1,7 +1,7 @@
CC= g++ CC= g++
NOWARN= -Wno-unused-result -Wno-unused-function NOWARN= -Wno-unused-result -Wno-unused-function
CFLAGS= #-g -Wall $(NOWARN) #-O2 CFLAGS= #-g -Wall $(NOWARN) #-O2
CPPFLAGS= -g -Wall $(NOWARN) -O2 CPPFLAGS= -g -Wall $(NOWARN) -O3
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
SHOW_PERF= -DSHOW_PERF SHOW_PERF= -DSHOW_PERF
AR= ar AR= ar

View File

@ -204,7 +204,7 @@ FMTIndex *create_fmt_from_bwt(bwt_t *bwt)
{ {
uint32_t pre_bwt_16_seq = 0; // 16个pre-bwt碱基串 uint32_t pre_bwt_16_seq = 0; // 16个pre-bwt碱基串
uint32_t *bwt_addr = bwt_occ_intv(bwt, i) + 4; // 这里加4还是加8要看保存occ的是是uint32还是uint64bwt字符串i对应的基准行因为原始的bwt-cpcheck point包含由4个uint32_t(8个uint32_t)组成的occ信息 uint32_t *bwt_addr = bwt_occ_intv(bwt, i) + 4; // 这里加4还是加8要看保存occ的是是uint32还是uint64bwt字符串i对应的基准行因为原始的bwt-cpcheck point包含由4个uint32_t(8个uint32_t)组成的occ信息
int offset = (i % FMT_OCC_INTERVAL) / 16; // 每OCC_INTERVAL个碱基共享同一个基准地址每16个碱基共用一个uint32整型因此需要偏移量来获取当前碱基串的首地址 int offset = (i % OCC_INTERVAL) / 16; // 每OCC_INTERVAL个碱基共享同一个基准地址每16个碱基共用一个uint32整型因此需要偏移量来获取当前碱基串的首地址
uint32_t bwt_16_seq = *(bwt_addr + offset); // 待处理的当前16个碱基串的首地址 uint32_t bwt_16_seq = *(bwt_addr + offset); // 待处理的当前16个碱基串的首地址
for (j = 0; j < 16; ++j) // 对于bwt碱基串一个一个碱基分别处理 for (j = 0; j < 16; ++j) // 对于bwt碱基串一个一个碱基分别处理
{ {
@ -317,7 +317,7 @@ void fmt_e2_occ(const FMTIndex *fmt, bwtint_t k, int b1, int b2, bwtint_t cnt[4]
cnt[3] = q[b2]; cnt[3] = q[b2];
p += 20; // 该地址是bwt和pre_bwt字符串数据的首地址 p += 20; // 该地址是bwt和pre_bwt字符串数据的首地址
end = p + ((k >> 3) - ((k & ~FMT_OCC_INTV_MASK) >> 3)); // this is the end point of the following loop end = p + ((k >> 3) - ((k & ~FMT_OCC_INTV_MASK) >> 3)); // this is the end point of the following loop
// p = end - (end - p) / 4; //p = end - (end - p) / 4;
// cout << "k - kbase: " << k - bwt_k_base_line << endl; // cout << "k - kbase: " << k - bwt_k_base_line << endl;
for (x = 0; p < end; ++p) for (x = 0; p < end; ++p)
{ {
@ -460,6 +460,7 @@ int main_fmtidx(int argc, char **argv)
//string fmt_idx = string(argv[1]) + ".fmt"; //string fmt_idx = string(argv[1]) + ".fmt";
// string fmt_idx = string(argv[1]) + ".256.fmt"; // string fmt_idx = string(argv[1]) + ".256.fmt";
// string fmt_idx = string(argv[1]) + ".128.fmt";
string fmt_idx = string(argv[1]) + ".64.fmt"; string fmt_idx = string(argv[1]) + ".64.fmt";
// string fmt_idx = string(argv[1]) + ".32.fmt"; // string fmt_idx = string(argv[1]) + ".32.fmt";

View File

@ -5,6 +5,8 @@
#define FMT_OCC_INTERVAL (1LL << FMT_OCC_INTV_SHIFT) #define FMT_OCC_INTERVAL (1LL << FMT_OCC_INTV_SHIFT)
#define FMT_OCC_INTV_MASK (FMT_OCC_INTERVAL - 1) #define FMT_OCC_INTV_MASK (FMT_OCC_INTERVAL - 1)
#define FMT_MID_INTV 16
// 获取碱基c待查找序列的首个碱基和对应的互补碱基对应的行以及间隔 // 获取碱基c待查找序列的首个碱基和对应的互补碱基对应的行以及间隔
#define fmt_set_intv(fmt, c, ik) ((ik).x[0] = (fmt)->L2[(int)(c)] + 1, (ik).x[2] = (fmt)->L2[(int)(c) + 1] - (fmt)->L2[(int)(c)], (ik).x[1] = (fmt)->L2[3 - (c)] + 1, (ik).info = 0) #define fmt_set_intv(fmt, c, ik) ((ik).x[0] = (fmt)->L2[(int)(c)] + 1, (ik).x[2] = (fmt)->L2[(int)(c) + 1] - (fmt)->L2[(int)(c)], (ik).x[1] = (fmt)->L2[3 - (c)] + 1, (ik).info = 0)
// k行bwt str行不包含$对应的check point occ数据起始地址小于k且是OCC_INTERVAL的整数倍 // k行bwt str行不包含$对应的check point occ数据起始地址小于k且是OCC_INTERVAL的整数倍