From c69cb901bb6cc4ce80d3c45606828d0213a50e3b Mon Sep 17 00:00:00 2001
From: Gitea <gitea@fake.local>
Date: Sat, 27 Jan 2024 00:42:47 +0800
Subject: [PATCH] =?UTF-8?q?=E8=A7=A3=E5=86=B3=E4=BA=86=E4=B8=80=E4=BA=9Bbu?=
 =?UTF-8?q?g=EF=BC=8C=E7=9B=AE=E5=89=8D=E7=9C=8B=E6=9D=A5fmt=E5=8F=AF?=
 =?UTF-8?q?=E4=BB=A5=E6=AD=A3=E5=B8=B8=E5=B7=A5=E4=BD=9C?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore    |   1 +
 bwt.cpp       |  17 +-
 bwt.h         |   4 +-
 fmt_index.cpp | 581 +++++++++++++++++++++++++++-----------------------
 fmt_index.h   |  14 +-
 util.cpp      | 100 ++++++++-
 util.h        |  12 +-
 7 files changed, 453 insertions(+), 276 deletions(-)

diff --git a/.gitignore b/.gitignore
index fcec038..9999bc1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 # specific for bwa_perf
 *.txt
 fmtidx
+*.fmt
 
 # ---> C
 # Prerequisites
diff --git a/bwt.cpp b/bwt.cpp
index 776ba11..7173131 100644
--- a/bwt.cpp
+++ b/bwt.cpp
@@ -10,6 +10,17 @@
 
 using namespace std;
 
+void bwt_dump_bwt(const char *fn, const bwt_t *bwt) // write the bwt_t index to the binary file fn
+{
+    FILE *fp;
+    fp = xopen(fn, "wb");
+    err_fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp); // header: primary
+    err_fwrite(bwt->L2 + 1, sizeof(bwtint_t), 4, fp); // header: L2[1..4]; L2[0] is not stored
+    err_fwrite(bwt->bwt, 4, bwt->bwt_size, fp); // payload: bwt_size words of 4 bytes each
+    err_fflush(fp);
+    err_fclose(fp);
+}
+
 // 计算一个字节构成的T,G,C,A序列，对应的每个碱基的个数(按T,G,C,A顺序存储在32位整数中，每个占8位)
 void bwt_gen_cnt_table(bwt_t *bwt)
 {
@@ -23,8 +34,8 @@ void bwt_gen_cnt_table(bwt_t *bwt)
     }
 }
 
-// 解析两bit的bwt碱基序列
-bwt_t *restore_bwt_str(const char *fn)
+// 解析两bit的bwt碱基序列，这个只有bwt str，可以包含也可不包含occ check point
+bwt_t *restore_bwt(const char *fn)
 {
     bwt_t *bwt;
     bwt = (bwt_t *)calloc(1, sizeof(bwt_t));
@@ -202,6 +213,8 @@ void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_b
     bwtint_t tk[4], tl[4];
     int i;
     bwt_2occ4(bwt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], tk, tl); // tk表示在k行之前所有各个碱基累积出现次数，tl表示在l行之前的累积
+    // cout << "bwt-1-d: " << ik->x[!is_back] - 1 << '\t' << tk[0] << '\t' << tk[1] << '\t' << tk[2] << '\t' << tk[3] << endl;
+    // cout << "bwt-1-d: " << ik->x[!is_back] - 1 + ik->x[2] << '\t' << tl[0] << '\t' << tl[1] << '\t' << tl[2] << '\t' << tl[3] << endl;
     // 这里是反向扩展
     for (i = 0; i != 4; ++i)
     {
diff --git a/bwt.h b/bwt.h
index 70ed8b5..564b64d 100644
--- a/bwt.h
+++ b/bwt.h
@@ -20,7 +20,7 @@ using std::string;
 // 从构建完成的bwt（包含occ check point）获取k行（不含$，这里的k不输出bwt mtx的行，是bwt字符串的行）的碱基
 #define bwt_B0(b, k) (bwt_bwt(b, k) >> ((~(k) & 0xf) << 1) & 3)
 
-// 获取碱基c（待查找序列的首）和对应的互补碱基对应的行，以及间隔
+// 获取碱基c（待查找序列的首个碱基）和对应的互补碱基对应的行，以及间隔
 #define bwt_set_intv(bwt, c, ik) ((ik).x[0] = (bwt)->L2[(int)(c)] + 1, (ik).x[2] = (bwt)->L2[(int)(c) + 1] - (bwt)->L2[(int)(c)], (ik).x[1] = (bwt)->L2[3 - (c)] + 1, (ik).info = 0)
 
 // The following two lines are ONLY correct when OCC_INTERVAL==0x80
@@ -56,7 +56,7 @@ bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, uint8_t c);
 // 统计k行（bwt mtx行，包含k行本身）之前4种碱基累积数量，这里的k是bwt矩阵里的行，比bwt字符串多1
 void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]);
 // 解析两bit的bwt碱基序列
-bwt_t *restore_bwt_str(const char *fn);
+bwt_t *restore_bwt(const char *fn);
 // 根据原始的字符串bwt创建interval-bwt
 void create_interval_occ_bwt(bwt_t *bwt);
 
diff --git a/fmt_index.cpp b/fmt_index.cpp
index 4f6df06..4bc645e 100644
--- a/fmt_index.cpp
+++ b/fmt_index.cpp
@@ -44,35 +44,44 @@ void print_base_uint32(uint32_t p)
 	}
 }
 
+// Generate a random sequence of length len; every character is BASE[rand() % 4].
+string generate_rand_seq(int len)
+{
+	string seq(len, 'A'); // pre-sized with a placeholder character, overwritten below
+	for (int i = 0; i < len; ++i)
+	{
+		seq[i] = BASE[rand() % 4]; // BASE is defined outside this hunk; seeding rand() is left to the caller
+	}
+	return seq;
+}
 // 创建bwt矩阵
 void create_bwt_mtx(string &seq)
 {
-	cout << "seq size: " << seq.size() + 1 << endl;
-	string sarr[seq.size() + 1];
+	bwtint_t seq_len = seq.size() + 1;
+	string sarr[seq_len];
 	sarr[0] = seq + '$';
-	for (int i = 1; i < sarr[0].size(); ++i)
+	for (int i = 1; i < seq_len; ++i)
 	{
 		sarr[i] = sarr[0].substr(i) + sarr[0].substr(0, i);
 	}
-	std::sort(sarr, sarr + seq.size() + 1);
-
-	// bwt matrix
-	for (int i = 0; i < sarr[0].size(); ++i)
-	{
-		// cout << i << ' ' << sarr[i] << endl;
-		cout << sarr[i] << endl;
-	}
+	std::sort(sarr, sarr + seq_len);
 
+	// print bwt matrix
+	// for (int i = 0; i < seq_len; ++i)
+	//{
+	//	// cout << i << ' ' << sarr[i] << endl;
+	//	cout << sarr[i] << endl;
+	//}
 	// cout << "bwt string" << endl;
-	// for (int i = 0; i < sarr[0].size(); ++i)
+	// for (int i = 0; i < seq_len; ++i)
 	// {
 	// 	cout << sarr[i].back();
 	// }
 	// cout << endl;
 	// cout << "pre bwt string" << endl;
-	// for (int i = 0; i < sarr[0].size(); ++i)
+	// for (int i = 0; i < seq_len; ++i)
 	// {
-	// 	cout << sarr[i][sarr[0].size() - 2];
+	// 	cout << sarr[i][seq_len - 2];
 	// }
 	// cout << endl;
 }
@@ -98,7 +107,43 @@ void fmt_gen_cnt_table(FMTIndex *fmt)
 	}
 }
 
+void dump_fmt(const char *fn, const FMTIndex *fmt) // write the FMTIndex to the binary file fn
+{
+	FILE *fp;
+	fp = xopen(fn, "wb");
+	err_fwrite(&fmt->primary, sizeof(bwtint_t), 1, fp); // header field 1: primary
+	err_fwrite(&fmt->sec_primary, sizeof(bwtint_t), 1, fp); // header field 2: sec_primary
+	err_fwrite(&fmt->sec_bcp, sizeof(uint8_t), 1, fp); // three one-byte fields follow
+	err_fwrite(&fmt->first_base, sizeof(uint8_t), 1, fp);
+	err_fwrite(&fmt->last_base, sizeof(uint8_t), 1, fp);
+	err_fwrite(fmt->L2 + 1, sizeof(bwtint_t), 4, fp); // L2[1..4]; L2[0] is not stored
+	err_fwrite(fmt->bwt, 4, fmt->bwt_size, fp); // payload: bwt_size words of 4 bytes each
+	err_fflush(fp);
+	err_fclose(fp);
+}
 
+FMTIndex *restore_fmt(const char *fn) // load an index from fn, reading the fields in the order dump_fmt() writes them
+{
+	FMTIndex *fmt = (FMTIndex *)calloc(1, sizeof(FMTIndex));
+	FILE *fp = xopen(fn, "rb"); // was a bare fopen(): a missing file handed NULL to the fseek() below
+	fseek(fp, 0, SEEK_END);
+	long payload = ftell(fp) - (long)(sizeof(bwtint_t) * 6 + 3); // file size minus header: 6 bwtint_t fields + 3 one-byte fields
+	xassert(payload >= 0, "fmt file is shorter than its header"); // otherwise the unsigned size below would wrap around
+	fmt->bwt_size = (bwtint_t)payload >> 2; // size counted in 32-bit words
+	fmt->bwt = (uint32_t *)calloc(fmt->bwt_size, 4);
+	fseek(fp, 0, SEEK_SET);
+	fread(&fmt->primary, sizeof(bwtint_t), 1, fp);
+	fread(&fmt->sec_primary, sizeof(bwtint_t), 1, fp);
+	fread(&fmt->sec_bcp, sizeof(uint8_t), 1, fp);
+	fread(&fmt->first_base, sizeof(uint8_t), 1, fp);
+	fread(&fmt->last_base, sizeof(uint8_t), 1, fp);
+	fread(fmt->L2 + 1, sizeof(bwtint_t), 4, fp); // L2[0] stays 0 from calloc
+	fread_fix(fp, fmt->bwt_size << 2, fmt->bwt); // payload, size given in bytes
+	fmt->seq_len = fmt->L2[4];
+	fclose(fp);
+	fmt_gen_cnt_table(fmt); // per-byte base-count lookup tables used by the occ routines
+	return fmt;
+}
 
 // 根据interval-bwt创建fmt-index
 FMTIndex *create_fmt_from_bwt(bwt_t *bwt)
@@ -108,13 +153,13 @@ FMTIndex *create_fmt_from_bwt(bwt_t *bwt)
 	fmt_gen_cnt_table(fmt);
 
 	bwtint_t i, j, k, m, n, n_occ, cnt[4], cnt2[4];
-	uint32_t c[4], c2[16] /*保存AA..TT*/;
-	uint32_t *buf;
+	uint32_t c[4], c2[16]; /*c用来保存原来的bwt碱基串的累积值，c2用来保存pre-bwt和bwt碱基对的累计值，如AA..TT*/
+	uint32_t *buf; /* 计算之后变成fmt结构中bwt部分 */
 
-	fmt->seq_len = bwt->seq_len;
+	fmt->seq_len = bwt->seq_len; // bwt碱基序列的长度，不包含$字符，也就是该长度比bwt matrix长度少1
 	for (i = 0; i < 5; ++i)
-		fmt->L2[i] = bwt->L2[i];
-	fmt->primary = bwt->primary;
+		fmt->L2[i] = bwt->L2[i]; // 每个碱基的总累积值
+	fmt->primary = bwt->primary; // $在末尾的行，在bwt matrix行中的排序位置
 
 	n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // check point 个数
 	fmt->bwt_size = (fmt->seq_len * 2  + 15) >> 4; // 要保存最后两列碱基
@@ -129,146 +174,95 @@ FMTIndex *create_fmt_from_bwt(bwt_t *bwt)
 		bwt_occ4(bwt, before_first_line, cnt);
 		for (j = i * 4, k = 0; k < 4; ++j, ++k)
 			c2[j] = cnt[k];
-		// cout << "start: " << BASE[i] << " line: " << before_first_line << " occ: " << cnt[0] << '\t' << cnt[1] << '\t' << cnt[2] << '\t' << cnt[3] << endl;
 	}
-	// cout << "c2: ";
-	// for (m = 0; m < 16; ++m)
-	// 	cout << c2[m] << ' ';
-	// cout << endl;
-
 	// k表示buf存储的偏移量
 	for (i = k = 0; i < bwt->seq_len; ++i)
 	{
 		// 记录occ
 		if (i % OCC_INTERVAL == 0)
 		{
-			memcpy(buf + k, c, sizeof(uint32_t) * 4); // 保存occ
+			memcpy(buf + k, c, sizeof(uint32_t) * 4); // bwt str中各个碱基的occ
 			k += 4;
-			memcpy(buf + k, c2, sizeof(uint32_t) * 16); // 二次计算的occ
+			memcpy(buf + k, c2, sizeof(uint32_t) * 16); // pre-bwt:bwt碱基对的occ
 			k += 16;
 		}
-		// 每个32位整数保存8个倒数第二列碱基和8个倒数第一列(bwt)碱基
+		// 每个32位整数保存8个倒数第二列碱基（pre-bwt）和8个倒数第一列(bwt)碱基
 		if (i % 16 == 0) // 每个32位整数可以包含16个碱基，每次需要处理16个碱基，也就是间隔最小可以设置为16
 		{
-			uint32_t bwt_16_seq = bwt->bwt[i / 16];
-			uint32_t pre_bwt_16_seq = 0;
-			uint32_t *bwt_addr = bwt_occ_intv(bwt, i) + 8; // bwt字符串i对应的基准行
-			int offset = (i % OCC_INTERVAL) / 16;
-			bwt_16_seq = *(bwt_addr + offset);
-			for (j = 0; j < 16; ++j)
+			uint32_t pre_bwt_16_seq = 0; // 16个pre-bwt碱基串
+			uint32_t *bwt_addr = bwt_occ_intv(bwt, i) + 8; // bwt字符串i对应的基准行，因为原始的bwt-cp（check point）包含由4个uint64_t(8个uint32_t)组成的occ信息
+			int offset = (i % OCC_INTERVAL) / 16;		   // 每OCC_INTERVAL个碱基共享同一个基准地址，每16个碱基共用一个uint32整型，因此需要偏移量来获取当前碱基串的首地址
+			uint32_t bwt_16_seq = *(bwt_addr + offset);    // 待处理的当前16个碱基串的首地址
+			for (j = 0; j < 16; ++j) // 对于bwt碱基串，一个一个碱基分别处理
 			{
-				bwtint_t cur_line = i + j;
-				if (cur_line < bwt->seq_len) // 因为bwt序列里除去了$符号，所以bwt序列个数比原版bwt少1
+				bwtint_t cur_str_line = i + j;  // 当前碱基在bwt str中的行排序
+				if (cur_str_line < bwt->seq_len) // 当前碱基行不应超出bwt str总碱基长度（bwt str长度比bwt matrix长度少1，因为bwt str不包含$）
 				{
-					uint8_t bwt_base = bwt_B0(bwt, cur_line); // 对应行的bwt的碱基
-					// 先求出该碱基对应在第一列的行
-					if (cur_line >= bwt->primary) // 因为bwt序列里除去了$符号，所以，超过$所在行之后，对应的seq位置应该加一，才是真正对应的行
-						cur_line += 1;
-					bwtint_t origin_base_line = bwt->L2[bwt_base] + 1 + bwt_occ(bwt, cur_line - 1, bwt_base); // bwt矩阵行
-					bwtint_t base_line = origin_base_line;
-					if (base_line >= bwt->primary) // base_line表示在bwt字符中的位置，所以超出$为最尾所在行之后，要减掉1
-						base_line -= 1;			   // bwt碱基序列行（不包含$）
-					uint32_t pre_bwt_base = bwt_B0(bwt, base_line); // bwt列碱基对应的前一个碱基
-					if (origin_base_line == bwt->primary)
+					uint8_t bwt_base = bwt_B0(bwt, cur_str_line); // 对应行的bwt的碱基
+					// 先求出该碱基对应在第一列的行（对应的bwt matrix行）
+					bwtint_t cur_mtx_line = cur_str_line;
+					if (cur_str_line >= bwt->primary) // 因为bwt序列里除去了$符号，所以，超过$所在行之后，对应的seq位置应该加一，才是真正对应bwt matrix的行
+						cur_mtx_line += 1;
+					bwt_occ4(bwt, cur_mtx_line, cnt);	  // 获取原来bwt-checkpoint中的occ值	  
+					for (m=0; m<4; ++m)
+						c[m] = (uint32_t)cnt[m]; // 碱基m在cur_bwt_mtx_line(包含)之前的累积值，直接拷贝原bwt中的occ即可
+
+					cnt[bwt_base] -= 1; // 得到cur_bwt_mtx_line(不包含)之前的累积量，即bwt_occ4(bwt, cur_bwt_mtx_line-1, cnt)
+					bwtint_t bwt_base_mtx_line = bwt->L2[bwt_base] + 1 + cnt[bwt_base]; // bwt_base对应的bwt matrix行（LF变换）
+
+					bwt_occ4(bwt, bwt_base_mtx_line, cnt2); // 计算bwt_base_mtx_line之前的occ
+					for (n = 0; n < 4; ++n)
+					{
+						int c2_idx = bwt_base << 2 | n; // bwt base放在前边
+						c2[c2_idx] = (uint32_t)cnt2[n]; // pre-bwt:bwt 碱基对的累计值
+					}
+					bwtint_t bwt_base_str_line = bwt_base_mtx_line; // bwt-str中对应的行排序
+					if (bwt_base_str_line >= bwt->primary)			// base_line表示在bwt str中的位置，所以超出$为最尾所在行之后，要减掉1
+						bwt_base_str_line -= 1;						// bwt碱基序列行（不包含$）
+					uint32_t pre_bwt_base = bwt_B0(bwt, bwt_base_str_line); // bwt列碱基对应的前一个碱基pre-bwt
+					// 此时，bwt_base对应的bwt matrix首行，是$排在最尾的行，说明bwt_base就是序列的第一个碱基，
+					// 此时计算出来的pre_bwt_base就是primary前一行的bwt base，以此来代替$字符，在后续的计算过程中需要考虑
+					if (bwt_base_mtx_line == bwt->primary)
 					{
 						// 计算sec_bcp
 						fmt->sec_bcp = pre_bwt_base << 2 | bwt_base; // 因为把$当成A处理了
-						fmt->sec_primary = cur_line;
-						fmt->first_base = bwt_base;
-						fmt->last_base = pre_bwt_base;
+						fmt->sec_primary = cur_mtx_line;		 // pre-bwt base为$的行排序（bwt-matrix行）
+						fmt->first_base = bwt_base; // 原始序列第一个碱基
+						fmt->last_base = pre_bwt_base; // 计算后替代$字符的碱基（应该是primary行上边一行对应的bwt base）
 					}
-					// 暂存
-					pre_bwt_16_seq = pre_bwt_16_seq | (pre_bwt_base << (15-j)*2);
-					if (base_line >= bwt->primary)
-						base_line += 1; // bwt矩阵行
-					bwtint_t pre_base_line = bwt->L2[pre_bwt_base] + 1 + bwt_occ(bwt, base_line - 1, pre_bwt_base);
-					// 获取c
-					bwt_occ4(bwt, cur_line, cnt);
-					for (m = 0; m < 4; ++m)
-					{
-						c[m] = (uint32_t)cnt[m]; // 碱基m在cur_line(包含)之前的累积值
-					}
-					// 求出c2
-					cnt[bwt_base] -= 1; // 得到cur_line(不包含)之前的累积量
-					// bwtint_t m_first_line = bwt->L2[bwt_base] + cnt[bwt_base]; // 该bwt_base对应的在bwt矩阵中行的前一行
-					// bwt_occ4(bwt, m_first_line, cnt2);
-					// for (n = 0; n < 4; ++n) // 只计算bwt_base对应的二级occ，其他用之前的值
-					// {
-					// 	int c2_idx = bwt_base << 2 | n;
-					// 	c2[c2_idx] = (uint32_t)cnt2[n];
-					// }
+					// 暂存 pre-bwt碱基序列
+					pre_bwt_16_seq = pre_bwt_16_seq | (pre_bwt_base << (15-j)*2); // 序列靠前的碱基排在uint32_t数据中的高位
 
-					for (m = 0; m < 4; ++m)
-					{
-						bwtint_t m_first_line = -1;
-
-					  	// if (m == bwt_base || cnt[m] > 0)
-						if (m == bwt_base)
-						{
-							m_first_line = bwt->L2[m] + 1 + cnt[m]; // m是否与bwt_base相同，这里需要想清楚，情况不一样的
-							if (m_first_line >= bwt->seq_len)
-								m_first_line = bwt->seq_len;
-							// cout << cur_line << '\t' << BASE[m] << '\t' << m_first_line << endl;
-							bwt_occ4(bwt, m_first_line, cnt2);
-							for (n = 0; n < 4; ++n)
-							{
-								int c2_idx = m << 2 | n;
-								c2[c2_idx] = (uint32_t)cnt2[n];
-							}
-						}
-					}
-					cnt[bwt_base] += 1; // cur_line(包含)之前
-					// cout << cur_line << '\t'
-					// 	 << base_line << '\t'
-					// 	 << pre_base_line << '\t'
-					// 	 << BASE[pre_bwt_base] << '\t'
-					// 	 << BASE[bwt_base] << '\t'
-					// 	 << cnt[0] << ' ' << cnt[1] << ' ' << cnt[2] << ' ' << cnt[3] << "\t\t";
+					// 输出调试信息
+					// cout << "mtx line: " << cur_mtx_line << ' ' << c[0] << ' ' << c[1] << ' ' << c[2] << ' ' << c[3] << ' ';
 					// for (m = 0; m < 16; ++m)
 					// 	cout << c2[m] << ' ';
 					// cout << endl;
-					// for (m = 0; m < 16; ++m)
-					// 	fprintf(fmt_out, "%-4d", c2[m]);
-					// fprintf(fmt_out, "\n");
 				}
 				else
 					break;
 			}
-			//print_base_uint32(pre_bwt_16_seq);
-			//cout << endl;
-			//print_base_uint32(bwt_16_seq);
 			// 保存bwt和pre_bwt
-			uint32_t tmp_seq = 0;
-			tmp_seq = (((pre_bwt_16_seq & (3 << 30)) >> 0) | ((bwt_16_seq & (3 << 30)) >> 2)) 
-				    | (((pre_bwt_16_seq & (3 << 28)) >> 2) | ((bwt_16_seq & (3 << 28)) >> 4))
-					| (((pre_bwt_16_seq & (3 << 26)) >> 4) | ((bwt_16_seq & (3 << 26)) >> 6))
-					| (((pre_bwt_16_seq & (3 << 24)) >> 6) | ((bwt_16_seq & (3 << 24)) >> 8))
-					| (((pre_bwt_16_seq & (3 << 22)) >> 8) | ((bwt_16_seq & (3 << 22)) >> 10)) 
-				    | (((pre_bwt_16_seq & (3 << 20)) >> 10) | ((bwt_16_seq & (3 << 20)) >> 12))
-					| (((pre_bwt_16_seq & (3 << 18)) >> 12) | ((bwt_16_seq & (3 << 18)) >> 14))
-					| (((pre_bwt_16_seq & (3 << 16)) >> 14) | ((bwt_16_seq & (3 << 16)) >> 16));
-			buf[k++] = tmp_seq;
-			//cout << i << endl;
-			//print_base_uint32(tmp_seq);
+			uint32_t pre_and_bwt_seq = 0;
+			for (m = 0; m < 8; ++m)
+			{
+				int lshift_bit = 30 - 2 * m;
+				pre_and_bwt_seq |= (((pre_bwt_16_seq & (3 << lshift_bit)) >> (m * 2)) | ((bwt_16_seq & (3 << lshift_bit)) >> ((m * 2) + 2)));
+			}
+			buf[k++] = pre_and_bwt_seq;
 			if (j > 8)
 			{
-				// cout << "j: " << j << endl;
-				tmp_seq = (((pre_bwt_16_seq & (3 << 14)) << 16) | ((bwt_16_seq & (3 << 14)) << 14)) 
-					    | (((pre_bwt_16_seq & (3 << 12)) << 14) | ((bwt_16_seq & (3 << 12)) << 12))
-						| (((pre_bwt_16_seq & (3 << 10)) << 12) | ((bwt_16_seq & (3 << 10)) << 10))
-						| (((pre_bwt_16_seq & (3 << 8)) << 10) | ((bwt_16_seq & (3 << 8)) << 8))
-						| (((pre_bwt_16_seq & (3 << 6)) << 8) | ((bwt_16_seq & (3 << 6)) << 6)) 
-					    | (((pre_bwt_16_seq & (3 << 4)) << 6) | ((bwt_16_seq & (3 << 4)) << 4))
-						| (((pre_bwt_16_seq & (3 << 2)) << 4) | ((bwt_16_seq & (3 << 2)) << 2))
-						| (((pre_bwt_16_seq & (3 << 0)) << 2) | ((bwt_16_seq & (3 << 0)) << 0));
-
-				buf[k++] = tmp_seq;
-				//print_base_uint32(tmp_seq);
+				pre_and_bwt_seq = 0;
+				for (m = 8; m > 0; --m)
+				{
+					int lshift_bit = 2 * m - 2;
+					pre_and_bwt_seq |= (((pre_bwt_16_seq & (3 << lshift_bit)) << (m * 2)) | ((bwt_16_seq & (3 << lshift_bit)) << (m * 2 - 2)));
+				}
+				buf[k++] = pre_and_bwt_seq;
 			}
 		}
 	}
 	// the last element
-	// cout << c[0] << '\t' << c[1] << '\t' << c[2] << '\t' << c[3] << endl;
 	memcpy(buf + k, c, sizeof(uint32_t) * 4);
 	k += 4;
 	memcpy(buf + k, c2, sizeof(uint32_t) * 16);
@@ -278,103 +272,149 @@ FMTIndex *create_fmt_from_bwt(bwt_t *bwt)
 	xassert(k == fmt->bwt_size, "inconsistent bwt_size");
 	// update fmt
 	fmt->bwt = buf; 
-
 	return fmt;
 }
 
-
-
-#define fmt_set_intv(fmt, c, ik) ((ik).x[0] = (fmt)->L2[(int)(c)] + 1, (ik).x[2] = (fmt)->L2[(int)(c) + 1] - (fmt)->L2[(int)(c)], (ik).x[1] = (fmt)->L2[3 - (c)] + 1, (ik).info = 0)
-#define fmt_occ_intv(b, k) ((b)->bwt + (k) / OCC_INTERVAL * (OCC_INTERVAL / 8 + 20))
-
-void fmt_occ4(const FMTIndex *fmt, bwtint_t k, int b, uint32_t cnt1[4], uint32_t cnt2[4])
+// 扩展两个个碱基，计算bwt base为b的pre-bwt str中各个碱基的occ
+void fmt_e2_occ4(const FMTIndex *fmt, bwtint_t k, int b, uint32_t cnt1[4], uint32_t cnt2[4])
 {
-	bwtint_t x;
+	uint32_t x1, x2;
 	uint32_t *p, tmp, *end;
+	bwtint_t bwt_k_line = k, bwt_k_base_line = k >> OCC_INTV_SHIFT << OCC_INTV_SHIFT;
 	if (k == (bwtint_t)(-1))
 	{
+		p = fmt->bwt + 4 + b * 4;
 		memset(cnt1, 0, 4 * sizeof(uint32_t));
-		memset(cnt2, 0, 4 * sizeof(uint32_t));
+		memcpy(cnt2, p, 4 * sizeof(uint32_t));
 		return;
 	}
 	k -= (k >= fmt->primary); // k由bwt矩阵对应的行转换成bwt字符串对应的行（去掉了$，所以大于$的行，都减掉1）
 	p = fmt_occ_intv(fmt, k);
-	cout << "base: " << BASE[b] << endl;
-	cout << "k: " << k << "; p: " << (uint64_t)p << endl;
-	// cout << "0: " << (uint64_t)fmt_occ_intv(fmt, 0)
-	// 	 << " ;31: " << (uint64_t)fmt_occ_intv(fmt, 31)
-	// 	 << " ;32: " << (uint64_t)fmt_occ_intv(fmt, 32)
-	// 	 << " ;64: " << (uint64_t)fmt_occ_intv(fmt, 64)
-	// 	 << " ;96: " << (uint64_t)fmt_occ_intv(fmt, 96) << endl;
-
-
 	memcpy(cnt1, p, 4 * sizeof(uint32_t));
 	memcpy(cnt2, p + 4 + b * 4, 4 * sizeof(uint32_t));
-	cout << "cnt1: " << cnt1[0] << '\t' << cnt1[1] << '\t' << cnt1[2] << '\t' << cnt1[3] << endl;
-	cout << "cnt2: " << cnt2[0] << '\t' << cnt2[1] << '\t' << cnt2[2] << '\t' << cnt2[3] << endl;
+	p += 20;											// 该地址是bwt和pre_bwt字符串数据的首地址
+	end = p + ((k >> 3) - ((k & ~OCC_INTV_MASK) >> 3)); // this is the end point of the following loop
 
-	p += 20; // 该地址是bwt和pre_bwt字符串数据的首地址
-	end = p + ((k >> 4) - ((k & ~OCC_INTV_MASK) >> 4)); // this is the end point of the following loop
-	// for (x = 0; p < end; ++p)
-	// 	x += __occ_aux4(bwt, *p);
-	// tmp = *p & ~((1U << ((~k & 15) << 1)) - 1);
-	// x += __occ_aux4(bwt, tmp) - (~k & 15);
-	// cnt[0] += x & 0xff;
-	// cnt[1] += x >> 8 & 0xff;
-	// cnt[2] += x >> 16 & 0xff;
-	// cnt[3] += x >> 24;
+	for (x1 = 0, x2 = 0; p < end; ++p) // whole words that precede the word holding row k
+	{
+		x1 += __fmt_occ_e2_aux4(fmt, 4, *p); // table 4: counts for the bwt column
+		x2 += __fmt_occ_e2_aux4(fmt, b, *p); // table b: counts of pre-bwt bases paired with bwt base b
+	}
+	tmp = *p & ~((1U << ((~k & 7) << 2)) - 1); // clear the (~k & 7) 4-bit slots that come after row k
+	x1 += __fmt_occ_e2_aux4(fmt, 4, tmp) - (~k & 7); // cleared slots read as base 0; remove them from the lowest lane
+	x2 += __fmt_occ_e2_aux4(fmt, b, tmp);
+	if (b == 0)
+		x2 -= ~k & 7; // same correction, applied only when b == 0
+	// If the range crosses sec_primary, one accumulated count may have to be subtracted.
+	if (b == fmt->first_base && bwt_k_base_line < fmt->sec_primary && bwt_k_line >= fmt->sec_primary) // NOTE(review): both operands are pre-adjustment (matrix) rows while the checkpoint was picked with the adjusted k; confirm the boundary rows
+	{
+		x2 -= 1 << (fmt->last_base << 3); // take one count out of the 8-bit lane of last_base
+	}
+
+	cnt1[0] += x1 & 0xff; // unpack the four 8-bit lanes onto the checkpoint values
+	cnt1[1] += x1 >> 8 & 0xff;
+	cnt1[2] += x1 >> 16 & 0xff;
+	cnt1[3] += x1 >> 24;
+	cnt2[0] += x2 & 0xff;
+	cnt2[1] += x2 >> 8 & 0xff;
+	cnt2[2] += x2 >> 16 & 0xff;
+	cnt2[3] += x2 >> 24;
+
+	// cout << "fmt-occ: " << k << '\t' << cnt1[0] << '\t' << cnt1[1] << '\t' << cnt1[2] << '\t' << cnt1[3] << endl;
+	// cout << "fmt-occ-2: " << k << '\t' << cnt2[0] << '\t' << cnt2[1] << '\t' << cnt2[2] << '\t' << cnt2[3] << endl;
+	// cout << "bwt_k_base_line: " << bwt_k_base_line << endl;
+	// cout << "bwt_k_line: " << bwt_k_line << endl;
+	// cout << "sec_primary: " << fmt->sec_primary << endl;
 }
 
-void fmt_2occ4(const FMTIndex *fmt, bwtint_t k, bwtint_t l, int b,
-			   uint32_t cntk1[4], uint32_t cntl1[4], uint32_t cntk2[4], uint32_t cntl2[4])
+// 对k行和l行同时计算occ，如果k和l落在同一个interval区间，可以减少一些计算量和访存
+void fmt_e2_2occ4(const FMTIndex *fmt, bwtint_t k, bwtint_t l, int b,
+			   uint32_t cntk1[4], uint32_t cntk2[4], uint32_t cntl1[4], uint32_t cntl2[4])
 {
+	// fmt_e2_occ4(fmt, k, b, cntk1, cntk2);
+	// fmt_e2_occ4(fmt, l, b, cntl1, cntl2);
+	// return;
 	bwtint_t _k, _l;
 	_k = k - (k >= fmt->primary); // 换算成了seq的行
 	_l = l - (l >= fmt->primary);
-//	if (_l >> OCC_INTV_SHIFT != _k >> OCC_INTV_SHIFT || k == (bwtint_t)(-1) || l == (bwtint_t)(-1))
-//	{
-		fmt_occ4(fmt, k, b, cntk1, cntk2);
-		fmt_occ4(fmt, l, b, cntl1, cntk1);
-//	}
-//	else
-//	{
-//		bwtint_t x, y;
-//		uint32_t *p, tmp, *endk, *endl;
-//		k -= (k >= bwt->primary); // because $ is not in bwt
-//		l -= (l >= bwt->primary);
-//		p = bwt_occ_intv(bwt, k);
-//		memcpy(cntk, p, 4 * sizeof(bwtint_t));
-//		p += sizeof(bwtint_t); // sizeof(bwtint_t) = 4*(sizeof(bwtint_t)/sizeof(uint32_t))
-//		// prepare cntk[]
-//		endk = p + ((k >> 4) - ((k & ~OCC_INTV_MASK) >> 4));
-//		endl = p + ((l >> 4) - ((l & ~OCC_INTV_MASK) >> 4));
-//		for (x = 0; p < endk; ++p)
-//			x += __occ_aux4(bwt, *p);
-//		y = x;
-//		tmp = *p & ~((1U << ((~k & 15) << 1)) - 1);
-//		x += __occ_aux4(bwt, tmp) - (~k & 15);
-//		// calculate cntl[] and finalize cntk[]
-//		for (; p < endl; ++p)
-//			y += __occ_aux4(bwt, *p);
-//		tmp = *p & ~((1U << ((~l & 15) << 1)) - 1);
-//		y += __occ_aux4(bwt, tmp) - (~l & 15);
-//		memcpy(cntl, cntk, 4 * sizeof(bwtint_t));
-//		cntk[0] += x & 0xff;
-//		cntk[1] += x >> 8 & 0xff;
-//		cntk[2] += x >> 16 & 0xff;
-//		cntk[3] += x >> 24;
-//		cntl[0] += y & 0xff;
-//		cntl[1] += y >> 8 & 0xff;
-//		cntl[2] += y >> 16 & 0xff;
-//		cntl[3] += y >> 24;
-//	}
+	if (_l >> OCC_INTV_SHIFT != _k >> OCC_INTV_SHIFT || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) // different intervals, or a -1 row: fall back to two independent lookups
+	{
+		fmt_e2_occ4(fmt, k, b, cntk1, cntk2);
+		fmt_e2_occ4(fmt, l, b, cntl1, cntl2);
+	}
+	else
+	{
+		uint32_t x1, x2, y1, y2; // packed 8-bit lanes: x* accumulate for row k, y* for row l
+		uint32_t *p, tmp, *ek, *el;
+		bwtint_t bwt_k_line = k, bwt_l_line = l, bwt_base_line = k >> OCC_INTV_SHIFT << OCC_INTV_SHIFT; // values taken before the '$' adjustment below
+
+		k -= (k >= fmt->primary); // because $ is not in bwt
+		l -= (l >= fmt->primary);
+		p = fmt_occ_intv(fmt, k); // one checkpoint serves both rows in this branch
+		memcpy(cntk1, p, 4 * sizeof(uint32_t)); // words 0..3 of the checkpoint
+		memcpy(cntk2, p + 4 + b * 4, 4 * sizeof(uint32_t)); // the 4 words selected by b out of the next 16
+		memcpy(cntl1, cntk1, 4 * sizeof(uint32_t));
+		memcpy(cntl2, cntk2, 4 * sizeof(uint32_t));
+		p += 20; // skip the 20 checkpoint words; p now points at the packed base data
+		// prepare cntk[]
+		ek = p + ((k >> 3) - ((k & ~OCC_INTV_MASK) >> 3)); // word holding row k (8 rows per 32-bit word)
+		el = p + ((l >> 3) - ((l & ~OCC_INTV_MASK) >> 3)); // word holding row l
+		for (x1 = 0, x2 = 0; p < ek; ++p)
+		{
+			x1 += __fmt_occ_e2_aux4(fmt, 4, *p);
+			x2 += __fmt_occ_e2_aux4(fmt, b, *p);
+		}
+		y1 = x1; // the sums for l resume from the words already counted for k
+		y2 = x2;
+		tmp = *p & ~((1U << ((~k & 7) << 2)) - 1); // clear the 4-bit slots after row k
+		x1 += __fmt_occ_e2_aux4(fmt, 4, tmp) - (~k & 7); // cleared slots read as base 0; remove them from the lowest lane
+		x2 += __fmt_occ_e2_aux4(fmt, b, tmp);
+		if (b == 0)
+			x2 -= ~k & 7; // same correction, applied only when b == 0
+		for (; p < el; ++p)
+		{
+			y1 += __fmt_occ_e2_aux4(fmt, 4, *p);
+			y2 += __fmt_occ_e2_aux4(fmt, b, *p);
+		}
+		tmp = *p & ~((1U << ((~l & 7) << 2)) - 1); // clear the 4-bit slots after row l
+		y1 += __fmt_occ_e2_aux4(fmt, 4, tmp) - (~l & 7);
+		y2 += __fmt_occ_e2_aux4(fmt, b, tmp);
+		if (b == 0)
+			y2 -= ~l & 7;
+		// If the range crosses sec_primary, one accumulated count may have to be subtracted.
+		if (b == fmt->first_base && bwt_base_line < fmt->sec_primary) // NOTE(review): compares pre-adjustment (matrix) rows; confirm the boundary rows
+		{
+			if (bwt_k_line >= fmt->sec_primary)
+				x2 -= 1 << (fmt->last_base << 3); // take one count out of the 8-bit lane of last_base
+			if (bwt_l_line >= fmt->sec_primary)
+				y2 -= 1 << (fmt->last_base << 3);
+		}
+		cntk1[0] += x1 & 0xff; // unpack the four 8-bit lanes onto the checkpoint values
+		cntk1[1] += x1 >> 8 & 0xff;
+		cntk1[2] += x1 >> 16 & 0xff;
+		cntk1[3] += x1 >> 24;
+		cntk2[0] += x2 & 0xff;
+		cntk2[1] += x2 >> 8 & 0xff;
+		cntk2[2] += x2 >> 16 & 0xff;
+		cntk2[3] += x2 >> 24;
+
+		cntl1[0] += y1 & 0xff;
+		cntl1[1] += y1 >> 8 & 0xff;
+		cntl1[2] += y1 >> 16 & 0xff;
+		cntl1[3] += y1 >> 24;
+		cntl2[0] += y2 & 0xff;
+		cntl2[1] += y2 >> 8 & 0xff;
+		cntl2[2] += y2 >> 16 & 0xff;
+		cntl2[3] += y2 >> 24;
+
+		// cout << "fmt-occ: " << k << '\t' << cntk1[0] << '\t' << cntk1[1] << '\t' << cntk1[2] << '\t' << cntk1[3] << endl;
+		// cout << "fmt-occ-2: " << k << '\t' << cntk2[0] << '\t' << cntk2[1] << '\t' << cntk2[2] << '\t' << cntk2[3] << endl;
+		// cout << "fmt-occ: " << l << '\t' << cntl1[0] << '\t' << cntl1[1] << '\t' << cntl1[2] << '\t' << cntl1[3] << endl;
+		// cout << "fmt-occ-2: " << l << '\t' << cntl2[0] << '\t' << cntl2[1] << '\t' << cntl2[2] << '\t' << cntl2[3] << endl;
+	}
 }
 
-
-
-#define __fmt_occ_e2_aux4(fmt, b, val) \
-	((fmt)->cnt_table[(b)][(val) & 0xff] + (fmt)->cnt_table[b][(val) >> 8 & 0xff] + (fmt)->cnt_table[b][(val) >> 16 & 0xff] + (fmt)->cnt_table[b][(val) >> 24])
-
+// 扩展一个碱基，计算bwt str中各个碱基的occ
 void fmt_e1_occ4(const FMTIndex *fmt, bwtint_t k, uint32_t cnt[4])
 {
 	uint32_t x;
@@ -402,69 +442,58 @@ void fmt_e1_occ4(const FMTIndex *fmt, bwtint_t k, uint32_t cnt[4])
 	cnt[3] += x >> 24;
 }
 
-void fmt_e2_occ4(const FMTIndex *fmt, bwtint_t k, int b, uint32_t cnt1[4], uint32_t cnt2[4])
+// 对k行和l行同时计算bwt str的occ，如果k和l落在同一个interval区间，可以减少一些计算量和访存
+void fmt_e1_2occ4(const FMTIndex *fmt, bwtint_t k, bwtint_t l, uint32_t cntk[4], uint32_t cntl[4])
 {
-	uint32_t x1, x2;
-	uint32_t *p, tmp, *end;
-	bwtint_t bwt_k_line = k, bwt_k_base_line = k >> OCC_INTV_SHIFT << OCC_INTV_SHIFT;
-	if (k == (bwtint_t)(-1))
+	bwtint_t _k, _l;
+	_k = k - (k >= fmt->primary); // 换算成了seq的行
+	_l = l - (l >= fmt->primary);
+	if (_l >> OCC_INTV_SHIFT != _k >> OCC_INTV_SHIFT || k == (bwtint_t)(-1) || l == (bwtint_t)(-1))
 	{
-		p = fmt->bwt + 4 + b * 4;
-		memset(cnt1, 0, 4 * sizeof(uint32_t));
-		memcpy(cnt2, p, 4 * sizeof(uint32_t));
-		return;
+		fmt_e1_occ4(fmt, k, cntk);
+		fmt_e1_occ4(fmt, l, cntl);
 	}
-	k -= (k >= fmt->primary); // k由bwt矩阵对应的行转换成bwt字符串对应的行（去掉了$，所以大于$的行，都减掉1）
-	p = fmt_occ_intv(fmt, k);
-	// cout << "base: " << BASE[b] << endl;
-	// cout << "k: " << k << "; c 0 cnt: " << p[0] << '\t' << p[1] << '\t' << p[2] << '\t' << p[3] << endl;
-	memcpy(cnt1, p, 4 * sizeof(uint32_t));
-	memcpy(cnt2, p + 4 + b * 4, 4 * sizeof(uint32_t));
-	// cout << "[start: ] k: " << k << "; k line cnt: " << cnt[0] << '\t' << cnt[1] << '\t' << cnt[2] << '\t' << cnt[3] << endl;
-
-	p += 20;											// 该地址是bwt和pre_bwt字符串数据的首地址
-	end = p + ((k >> 3) - ((k & ~OCC_INTV_MASK) >> 3)); // this is the end point of the following loop
-
-	for (x1 = 0, x2 = 0; p < end; ++p)
+	else
 	{
-		x1 += __fmt_occ_e2_aux4(fmt, 4, *p);
-		x2 += __fmt_occ_e2_aux4(fmt, b, *p);
-	}
-	//{
-	//	x += fmt->cnt_table[b][*p & 0xff]
-	//		 + fmt->cnt_table[b][*p >> 8 & 0xff] 
-	//		 + fmt->cnt_table[b][*p >> 16 & 0xff] 
-	//		 + fmt->cnt_table[b][*p >> 24 & 0xff];
-	//	// cout << "p: " << *p << endl;
-	//	// print_base_uint32(*p);
-	//	// cout << (fmt->cnt_table[b][*p & 0xff] >> 24) << ' ' 
-	//	// 	 << fmt->cnt_table[b][*p >> 24 & 0xff]
-	//	// 	 << endl;
-	//}
+		uint32_t x1, y1;
+		uint32_t *p, tmp, *endk, *endl;
 
-	tmp = *p & ~((1U << ((~k & 7) << 2)) - 1);
-	x1 += __fmt_occ_e2_aux4(fmt, 4, tmp) - (~k & 7);
-	x2 += __fmt_occ_e2_aux4(fmt, b, tmp);
-	if (b == 0)
-		x2 -= ~k & 7;
-	// 如果跨过了second_primary,那么可能需要减掉一次累积值
-	if (b == fmt->first_base && bwt_k_base_line < fmt->sec_primary && bwt_k_line >= fmt->sec_primary)
-	{
-		x2 -= 1 << (fmt->last_base << 3);
+		k -= (k >= fmt->primary); // because $ is not in bwt
+		l -= (l >= fmt->primary);
+		p = fmt_occ_intv(fmt, k); // this else-branch is only reached when k and l fall in the same interval
+		memcpy(cntk, p, 4 * sizeof(uint32_t)); // both results start from the shared checkpoint
+		memcpy(cntl, p, 4 * sizeof(uint32_t));
+		p += 20; // skip the 20 checkpoint words; p now points at the packed base data
+		// prepare cntk[]
+		endk = p + ((k >> 3) - ((k & ~OCC_INTV_MASK) >> 3)); // word holding row k (8 rows per 32-bit word)
+		endl = p + ((l >> 3) - ((l & ~OCC_INTV_MASK) >> 3)); // word holding row l
+		for (x1 = 0; p < endk; ++p)
+		{
+			x1 += __fmt_occ_e2_aux4(fmt, 4, *p);
+		}
+		y1 = x1; // the sum for l resumes from the words already counted for k
+		tmp = *p & ~((1U << ((~k & 7) << 2)) - 1); // clear the 4-bit slots after row k
+		x1 += __fmt_occ_e2_aux4(fmt, 4, tmp) - (~k & 7); // cleared slots read as base 0; remove them from the lowest lane
+		for (; p < endl; ++p)
+		{
+			y1 += __fmt_occ_e2_aux4(fmt, 4, *p);
+		}
+		tmp = *p & ~((1U << ((~l & 7) << 2)) - 1); // fixed: the tail word of l must be masked by l (was ~k)
+		y1 += __fmt_occ_e2_aux4(fmt, 4, tmp) - (~l & 7); // fixed: subtract the slots cleared by the l mask (was ~k)
+
+		cntk[0] += x1 & 0xff; // unpack the four 8-bit lanes onto the checkpoint values
+		cntk[1] += x1 >> 8 & 0xff;
+		cntk[2] += x1 >> 16 & 0xff;
+		cntk[3] += x1 >> 24;
+
+		cntl[0] += y1 & 0xff;
+		cntl[1] += y1 >> 8 & 0xff;
+		cntl[2] += y1 >> 16 & 0xff;
+		cntl[3] += y1 >> 24;
 	}
-	// x += __occ_aux4(bwt, tmp) - (~k & 15);
-	// cout << "x: " << x << " b:" << b << endl;
-	cnt1[0] += x1 & 0xff;
-	cnt1[1] += x1 >> 8 & 0xff;
-	cnt1[2] += x1 >> 16 & 0xff;
-	cnt1[3] += x1 >> 24;
-	cnt2[0] += x2 & 0xff;
-	cnt2[1] += x2 >> 8 & 0xff;
-	cnt2[2] += x2 >> 16 & 0xff;
-	cnt2[3] += x2 >> 24;
-	// cout << "[end  : ]k: " << k << "; k line cnt: " << cnt[0] << '\t' << cnt[1] << '\t' << cnt[2] << '\t' << cnt[3] << endl;
 }
 
+// 扩展一个碱基
 void fmt_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t ok[4], int is_back, int b1)
 {
 	uint32_t tk[4], tl[4];
@@ -472,6 +501,7 @@ void fmt_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t ok[4], int is_bac
 
 	fmt_e1_occ4(fmt, ik->x[!is_back] - 1, tk);
 	fmt_e1_occ4(fmt, ik->x[!is_back] - 1 + ik->x[2], tl);
+	fmt_e1_2occ4(fmt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], tk, tl); // NOTE(review): same rows and outputs as the two fmt_e1_occ4() calls just above; one of the two paths looks redundant - confirm and drop it
 	for (i = 0; i != 4; ++i)
 	{
 		ok[i].x[!is_back] = fmt->L2[i] + 1 + tk[i]; // 起始行位置，互补链
@@ -483,19 +513,22 @@ void fmt_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t ok[4], int is_bac
 	*ik = ok[b1];
 }
 
+// 扩展两个碱基
 void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t ok[4], int is_back, int b1, int b2)
 {
 	uint32_t tk1[4], tl1[4], tk2[4], tl2[4];
 	int i;
 
+	// fmt_e2_occ4(fmt, ik->x[!is_back] - 1, b1, tk1, tk2);
+	// fmt_e2_occ4(fmt, ik->x[!is_back] - 1 + ik->x[2], b1, tl1, tl2);
+	// tk表示在k行之前所有各个碱基累积出现次数，tl表示在l行之前的累积
+	fmt_e2_2occ4(fmt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], b1, tk1, tk2, tl1, tl2);
 
-	// fmt_2occ4(fmt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], b1, tk1, tl1, tk2, tl2); // tk表示在k行之前所有各个碱基累积出现次数，tl表示在l行之前的累积
+	// cout << "k: " << tk1[0] << '\t' << tk1[1] << '\t' << tk1[2] << '\t' << tk1[3] << endl;
+	// cout << "l: " << tl1[0] << '\t' << tl1[1] << '\t' << tl1[2] << '\t' << tl1[3] << endl;
+	// cout << "k: " << tk2[0] << '\t' << tk2[1] << '\t' << tk2[2] << '\t' << tk2[3] << endl;
+	// cout << "l: " << tl2[0] << '\t' << tl2[1] << '\t' << tl2[2] << '\t' << tl2[3] << endl;
 
-	fmt_e2_occ4(fmt, ik->x[!is_back] - 1, b1, tk1, tk2);
-	fmt_e2_occ4(fmt, ik->x[!is_back] - 1 + ik->x[2], b1, tl1, tl2);
-
-	// fmt_e2_occ(fmt, -1, 0, tk);
-	
 	// 这里是反向扩展
 	for (i = 0; i != 4; ++i)
 	{
@@ -505,11 +538,17 @@ void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t ok[4], int is_bac
 	// 因为计算的是互补碱基，所以3对应着0,2对应1，下边是正向扩展
 
 	ok[3].x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= fmt->primary && ik->x[!is_back] + ik->x[2] - 1 >= fmt->primary);
-	for (i = 2; i >= b1; --i)
-		ok[i].x[is_back] = ok[i + 1].x[is_back] + tl1[i + 1] - tk1[i + 1];
+	ok[2].x[is_back] = ok[3].x[is_back] + tl1[3] - tk1[3];
+	ok[1].x[is_back] = ok[2].x[is_back] + tl1[2] - tk1[2];
+	ok[0].x[is_back] = ok[1].x[is_back] + tl1[1] - tk1[1];
+
+	cout << "fmt-d: " << BASE[b1] << '\t' << ok[b1].x[is_back] << '\t' << ok[b1].x[2] << endl; // NOTE(review): live trace on every call while the other traces in this function are commented out - confirm it is meant to stay
+
 	ok[3].x[is_back] = ok[b1].x[is_back] + (ok[b1].x[!is_back] <= fmt->primary && ok[b1].x[!is_back] + ok[b1].x[2] - 1 >= fmt->primary);
-	for (i = 2; i >= b2; --i)
-		ok[i].x[is_back] = ok[i + 1].x[is_back] + ok[i + 1].x[2];
+	ok[2].x[is_back] = ok[3].x[is_back] + ok[3].x[2];
+	ok[1].x[is_back] = ok[2].x[is_back] + ok[2].x[2];
+	ok[0].x[is_back] = ok[1].x[is_back] + ok[1].x[2];
+
 	*ik = ok[b2];
 }
 // 利用fmt搜索seed，完整搜索，只需要单向搜索
@@ -557,13 +596,18 @@ int main_fmtidx(int argc, char **argv)
 	//create_bwt_mtx(seq);
 	//cout << seq << endl;
 
-	bwt_t *bwt = restore_bwt_str(argv[1]); // 读取bwt原始字符串（带ACGT总的累积量）
-	create_interval_occ_bwt(bwt); // 根据bwt字符串创建包含interval occ的bwt（128碱基+ACGT累积量）
+	bwt_t *bwt = restore_bwt(argv[1]); // 读取bwt原始字符串（带ACGT总的累积量）
+	// create_interval_occ_bwt(bwt); // 根据bwt字符串创建包含interval occ的bwt（128碱基+ACGT累积量）
 	cout << "L2: " << bwt->L2[0] << '\t' << bwt->L2[1] << '\t' << bwt->L2[2] << '\t'
 		 << bwt->L2[3] << '\t' << bwt->L2[4] << endl;
 
 	string s = "AACCCTAA";
 
+	srand(time(NULL));
+	s = generate_rand_seq(10);
+	cout << "seq: " << s << endl;
+	// s = "TTC";
+
 	bwt_search(bwt, s);
 	bwt_search2(bwt, s);
 
@@ -573,6 +617,9 @@ int main_fmtidx(int argc, char **argv)
 	// }
 	// TGGGAT
 	FMTIndex *fmt = create_fmt_from_bwt(bwt);
+	dump_fmt("ref.fmt", fmt);
+	// FMTIndex *fmt = restore_fmt("tiny.fmt");
+
 	fmt_search(fmt, s);
 	// cout << bwt->bwt_size << endl;
 	// cout << bwt->seq_len << endl;
diff --git a/fmt_index.h b/fmt_index.h
index 6eef697..cd39e27 100644
--- a/fmt_index.h
+++ b/fmt_index.h
@@ -3,6 +3,14 @@
 
 #include "bwt.h"
 
+// Initialize interval ik for base c (the first base of the query): x[0] = L2[c] + 1 (row for c), x[2] = L2[c + 1] - L2[c] (interval size), x[1] = L2[3 - c] + 1 (row for the complementary base), info = 0
+#define fmt_set_intv(fmt, c, ik) ((ik).x[0] = (fmt)->L2[(int)(c)] + 1, (ik).x[2] = (fmt)->L2[(int)(c) + 1] - (fmt)->L2[(int)(c)], (ik).x[1] = (fmt)->L2[3 - (c)] + 1, (ik).info = 0)
+// Start address of the checkpoint occ data for row k (a row of the bwt string, not counting $): the checkpoint at the multiple of OCC_INTERVAL not above k (integer division)
+#define fmt_occ_intv(b, k) ((b)->bwt + (k) / OCC_INTERVAL * (OCC_INTERVAL / 8 + 20))
+// For bwt base b, sum the cnt_table[b] entries of the four bytes of val: the counts of pre-bwt T, G, C, A, packed in that order into one 32-bit integer (8 bits each)
+#define __fmt_occ_e2_aux4(fmt, b, val) \
+    ((fmt)->cnt_table[(b)][(val) & 0xff] + (fmt)->cnt_table[b][(val) >> 8 & 0xff] + (fmt)->cnt_table[b][(val) >> 16 & 0xff] + (fmt)->cnt_table[b][(val) >> 24])
+
 // fm-index, extend twice in one search step (one memory access)
 struct FMTIndex
 {
@@ -14,9 +22,9 @@ struct FMTIndex
     uint32_t *bwt;        // BWT
     // occurance array, separated to two parts
     uint32_t cnt_table[5][256]; // 4对应原来的cnt_table，0,1,2,3,分别对应该碱基的扩展
-    int sec_bcp;                // base couple for sec primary line, AA=>0, AC=>1 ... TT=>15
-    int first_base;             // 序列的第一个碱基2bit的int类型，0,1,2,3
-    int last_base;              // dollar转换成的base
+    uint8_t sec_bcp;                // base couple for sec primary line, AA=>0, AC=>1 ... TT=>15
+    uint8_t first_base;             // 2-bit code (0,1,2,3) of the first base of the sequence
+    uint8_t last_base;              // base that the dollar ($) was converted to
     // suffix array
     int sa_intv;
     bwtint_t n_sa;
diff --git a/util.cpp b/util.cpp
index c492066..3be7894 100644
--- a/util.cpp
+++ b/util.cpp
@@ -1,7 +1,16 @@
+#define FSYNC_ON_FLUSH
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <sys/time.h>
-
+#include <string.h>
+#include <errno.h>
+#ifdef FSYNC_ON_FLUSH
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#endif
+#include <stdarg.h>
 #include "util.h"
 
 // base转成2bit值
@@ -39,6 +48,48 @@ void _err_fatal_simple_core(const char *func, const char *msg)
     abort();
 }
 
+// Print "[func] msg" followed by a newline to stderr, then terminate with exit(EXIT_FAILURE); does not return
+void _err_fatal_simple(const char *func, const char *msg)
+{
+    fprintf(stderr, "[%s] %s\n", func, msg);
+    exit(EXIT_FAILURE);
+}
+
+void err_fatal(const char *header, const char *fmt, ...) // Print "[header] " plus a printf-style message and a newline to stderr, then exit(EXIT_FAILURE)
+{
+    va_list args;
+    va_start(args, fmt);
+    fprintf(stderr, "[%s] ", header);
+    vfprintf(stderr, fmt, args); // fmt and the variadic arguments are formatted as by printf
+    fprintf(stderr, "\n");
+    va_end(args);
+    exit(EXIT_FAILURE); // does not return
+}
+
+void err_fatal_core(const char *header, const char *fmt, ...) // Same as err_fatal, but appends " Abort!" to the message and ends with abort() instead of exit()
+{
+    va_list args;
+    va_start(args, fmt);
+    fprintf(stderr, "[%s] ", header);
+    vfprintf(stderr, fmt, args); // fmt and the variadic arguments are formatted as by printf
+    fprintf(stderr, " Abort!\n");
+    va_end(args);
+    abort(); // does not return
+}
+
+// Open file fn with fopen(fn, mode) and return the stream; on failure report the error under the name func via err_fatal (which exits)
+FILE *err_xopen_core(const char *func, const char *fn, const char *mode)
+{
+    FILE *fp = 0;
+    if (strcmp(fn, "-") == 0) // "-" selects a standard stream: stdin if mode contains "r", otherwise stdout
+        return (strstr(mode, "r")) ? stdin : stdout;
+    if ((fp = fopen(fn, mode)) == 0)
+    {
+        err_fatal(func, "fail to open file '%s' : %s", fn, strerror(errno));
+    }
+    return fp;
+}
+
 // 读取数据
 bwtint_t fread_fix(FILE *fp, bwtint_t size, void *a)
 {                                  /* Mac/Darwin has a bug when reading data longer than 2GB. This function fixes this issue by reading data in small chunks */
@@ -53,4 +104,51 @@ bwtint_t fread_fix(FILE *fp, bwtint_t size, void *a)
         offset += x;
     }
     return offset;
+}
+
+// Write nmemb items of size bytes each from ptr to stream with fwrite(); a short write terminates the program via _err_fatal_simple, otherwise the item count is returned
+size_t err_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream)
+{
+    size_t ret = fwrite(ptr, size, nmemb, stream);
+    if (ret != nmemb) // fewer items written than requested
+        _err_fatal_simple("fwrite", strerror(errno));
+    return ret;
+}
+
+// Flush stream with fflush(); if FSYNC_ON_FLUSH is defined, additionally fsync() it when it is a regular file. Any failure terminates via _err_fatal_simple; returns the fflush() result
+int err_fflush(FILE *stream)
+{
+    int ret = fflush(stream);
+    if (ret != 0)
+        _err_fatal_simple("fflush", strerror(errno));
+
+#ifdef FSYNC_ON_FLUSH
+    /* Calling fflush() ensures that all the data has made it to the
+       kernel buffers, but this may not be sufficient for remote filesystems
+       (e.g. NFS, lustre) as an error may still occur while the kernel
+       is copying the buffered data to the file server.  To be sure of
+       catching these errors, we need to call fsync() on the file
+       descriptor, but only if it is a regular file.  */
+    {
+        struct stat sbuf;
+        if (0 != fstat(fileno(stream), &sbuf))
+            _err_fatal_simple("fstat", strerror(errno));
+
+        if (S_ISREG(sbuf.st_mode))
+        {
+            if (0 != fsync(fileno(stream)))
+                _err_fatal_simple("fsync", strerror(errno));
+        }
+    }
+#endif
+    return ret;
+}
+
+// Close stream with fclose(); a non-zero result terminates the program via _err_fatal_simple, otherwise the result is returned
+int err_fclose(FILE *stream)
+{
+    int ret = fclose(stream);
+    if (ret != 0)
+        _err_fatal_simple("fclose", strerror(errno));
+    return ret;
+}
\ No newline at end of file
diff --git a/util.h b/util.h
index 12e98fe..2ff81da 100644
--- a/util.h
+++ b/util.h
@@ -10,7 +10,9 @@ typedef uint64_t bwtint_t;
     if ((cond) == 0)       \
     _err_fatal_simple_core(__func__, msg)
 
-    double realtime(void);
+#define xopen(fn, mode) err_xopen_core(__func__, fn, mode) // open a file, passing the calling function's name along for error reporting
+
+double realtime(void);
 
 // 在fm-indexv(或者bwt)查找过程中，记录结果
 struct bwtintv_t
@@ -27,5 +29,13 @@ void _err_fatal_simple_core(const char *func, const char *msg);
 int bval(char b);
 // 互补碱基值
 int cbval(char b);
+// Open a file stream (func is the caller name used in the error report)
+FILE *err_xopen_core(const char *func, const char *fn, const char *mode);
+// Write binary data to a file stream
+size_t err_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream);
+// Flush a file stream
+int err_fflush(FILE *stream);
+// Close a file stream
+int err_fclose(FILE *stream);
 
 #endif
\ No newline at end of file