消融实验测试版本
This commit is contained in:
parent
bad3112151
commit
98ae5bb779
22
bwt.cpp
22
bwt.cpp
|
|
@ -227,6 +227,28 @@ void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtin
|
|||
}
|
||||
}
|
||||
|
||||
// 设置某一行的排序值,sa的索引有效值从1开始,(0设置为-1, 小端模式)
|
||||
void bwt_set_sa(uint8_t *sa_arr, bwtint_t k, bwtint_t val)
|
||||
{
|
||||
const bwtint_t block_idx = (k >> 3) * 33; // 8个数为一组,共享33个字节
|
||||
const int val_idx_in_block = k & 7;
|
||||
const bwtint_t start_byte_idx = block_idx + (val_idx_in_block << 2);
|
||||
bwtint_t *sa_addr = (bwtint_t *)(sa_arr + start_byte_idx);
|
||||
// *sa_addr &= (1 << val_idx_in_block) - 1; // 如果开辟内存的时候清零了,这一步可以省略,会清除后面的数据,只适合按递增顺序赋值
|
||||
*sa_addr |= (val & ((1L << 33) - 1)) << val_idx_in_block;
|
||||
}
|
||||
|
||||
// 获取某一行的排序值(小端模式)
|
||||
bwtint_t bwt_get_sa(uint8_t *sa_arr, bwtint_t k)
|
||||
{
|
||||
const bwtint_t block_idx = (k >> 3) * 33; // 8个数为一组,共享33个字节
|
||||
const int val_idx_in_block = k & 7;
|
||||
const bwtint_t start_byte_idx = block_idx + (val_idx_in_block << 2);
|
||||
bwtint_t val = *(bwtint_t *)(sa_arr + start_byte_idx);
|
||||
val = (val >> val_idx_in_block) & 8589934591;
|
||||
return val;
|
||||
}
|
||||
|
||||
void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back)
|
||||
{
|
||||
bwtint_t tk[4], tl[4];
|
||||
|
|
|
|||
5
bwt.h
5
bwt.h
|
|
@ -7,7 +7,7 @@
|
|||
using std::string;
|
||||
|
||||
// occ间隔对应的右移位数,5表示间隔32行保存一次
|
||||
#define OCC_INTV_SHIFT 6
|
||||
#define OCC_INTV_SHIFT 7
|
||||
#define OCC_INTERVAL (1LL << OCC_INTV_SHIFT)
|
||||
#define OCC_INTV_MASK (OCC_INTERVAL - 1)
|
||||
|
||||
|
|
@ -47,6 +47,9 @@ struct bwt_t
|
|||
uint8_t *sa;
|
||||
};
|
||||
|
||||
void bwt_set_sa(uint8_t *sa_arr, bwtint_t k, bwtint_t val);
|
||||
bwtint_t bwt_get_sa(uint8_t *sa_arr, bwtint_t k);
|
||||
|
||||
// k行(包含,bwt mtx行)之前,碱基c的累计总数, interval大于等于32才能正确计算
|
||||
bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, uint8_t c);
|
||||
// 统计k行(bwt mtx行,包含k行本身)之前4种碱基累积数量,这里的k是bwt矩阵里的行,比bwt字符串多1
|
||||
|
|
|
|||
598
fmt_index.cpp
598
fmt_index.cpp
|
|
@ -9,11 +9,14 @@
|
|||
#include <string.h>
|
||||
#include <immintrin.h>
|
||||
#include <sstream>
|
||||
#include <fstream>
|
||||
|
||||
#include "util.h"
|
||||
#include "bwt.h"
|
||||
#include "fmt_index.h"
|
||||
using namespace std;
|
||||
using std::cout;
|
||||
using std::ifstream;
|
||||
|
||||
double t1 = 0, t2 = 0, t3 = 0, t4 = 0, t5 = 0, t6 = 0, t7 = 0, t8 = 0, t9 = 0, t10 = 0,
|
||||
t11 = 0, t12 = 0, t13 = 0, t14 = 0;
|
||||
|
|
@ -155,48 +158,6 @@ void dump_fmt(const char *fn, const FMTIndex *fmt)
|
|||
}
|
||||
|
||||
// 将kmer hash数据写入到文件
|
||||
void dump_kmer_idx(const char *fn, const KmerEntry *kmer_entry)
|
||||
{
|
||||
FILE *fp;
|
||||
fp = xopen(fn, "wb");
|
||||
err_fwrite(kmer_entry, 1, KMER_ARR_SIZE * sizeof(KmerEntry), fp);
|
||||
err_fflush(fp);
|
||||
err_fclose(fp);
|
||||
}
|
||||
|
||||
// 从文件中读取kmer hash信息
|
||||
KmerEntry *restore_kmer_idx(const char *fn)
|
||||
{
|
||||
FILE *fp = xopen(fn, "rb");
|
||||
uint32_t kmer_bytes = KMER_ARR_SIZE * sizeof(KmerEntry);
|
||||
KmerEntry *kmer_entry = (KmerEntry *)malloc(kmer_bytes);
|
||||
fread_fix(fp, kmer_bytes, kmer_entry);
|
||||
err_fclose(fp);
|
||||
return kmer_entry;
|
||||
}
|
||||
|
||||
// 将full kmer hash数据写入到文件
|
||||
void dump_full_entry_idx(const char *fn, const FullEntry *full_entry)
|
||||
{
|
||||
FILE *fp;
|
||||
fp = xopen(fn, "wb");
|
||||
err_fwrite(full_entry, 1, FULL_ENTRY_ARR_SIZE * sizeof(FullEntry), fp);
|
||||
err_fflush(fp);
|
||||
err_fclose(fp);
|
||||
}
|
||||
|
||||
// 从文件中读取full kmer hash信息
|
||||
FullEntry *restore_full_entry_idx(const char *fn)
|
||||
{
|
||||
FILE *fp = xopen(fn, "rb");
|
||||
uint32_t full_entry_bytes = FULL_ENTRY_ARR_SIZE * sizeof(FullEntry);
|
||||
FullEntry *full_entry = (FullEntry *)malloc(full_entry_bytes);
|
||||
fread_fix(fp, full_entry_bytes, full_entry);
|
||||
err_fclose(fp);
|
||||
return full_entry;
|
||||
}
|
||||
|
||||
// 将full kmer hash数据写入到文件
|
||||
void dump_xmer_idx(const char *fn, const XmerHash *xh)
|
||||
{
|
||||
FILE *fp;
|
||||
|
|
@ -259,6 +220,41 @@ FMTIndex *restore_fmt(const char *fn)
|
|||
return fmt;
|
||||
}
|
||||
|
||||
// 读取sa数据
|
||||
void fmt_restore_sa(const char *fn, FMTIndex *fmt)
|
||||
{
|
||||
char skipped[256];
|
||||
FILE *fp;
|
||||
bwtint_t primary;
|
||||
fp = xopen(fn, "rb");
|
||||
err_fread_noeof(&primary, sizeof(bwtint_t), 1, fp);
|
||||
xassert(primary == fmt->primary, "SA-BWT inconsistency: primary is not the same.");
|
||||
err_fread_noeof(skipped, sizeof(bwtint_t), 4, fp); // skip
|
||||
err_fread_noeof(&fmt->sa_intv, sizeof(bwtint_t), 1, fp);
|
||||
err_fread_noeof(&primary, sizeof(bwtint_t), 1, fp);
|
||||
xassert(primary == fmt->seq_len, "SA-BWT inconsistency: seq_len is not the same.");
|
||||
fmt->n_sa = (fmt->seq_len + fmt->sa_intv) / fmt->sa_intv;
|
||||
fmt->sa = (uint8_t *)malloc(SA_BYTES(fmt->n_sa));
|
||||
fread_fix(fp, SA_BYTES(fmt->n_sa), fmt->sa);
|
||||
err_fclose(fp);
|
||||
}
|
||||
|
||||
// 读取pac ref数据
|
||||
void fmt_load_ref_pac(const char *fn_prefix, FMTIndex *fmt)
|
||||
{
|
||||
string ann_fn = string(fn_prefix) + ".ann";
|
||||
string pac_fn = string(fn_prefix) + ".pac";
|
||||
FILE *fp;
|
||||
fp = xopen(ann_fn.c_str(), "r");
|
||||
fscanf(fp, "%ld", &fmt->l_pac);
|
||||
err_fclose(fp);
|
||||
// load pac
|
||||
fmt->pac = (uint8_t*)calloc(fmt->l_pac / 4 + 1, 1);
|
||||
fp = xopen(pac_fn.c_str(), "r");
|
||||
err_fread_noeof(fmt->pac, 1, fmt->l_pac / 4 + 1, fp); // concatenated 2-bit encoded sequence
|
||||
err_fclose(fp);
|
||||
}
|
||||
|
||||
// 根据interval-bwt创建fmt-index
|
||||
FMTIndex *create_fmt_from_bwt(bwt_t *bwt)
|
||||
{
|
||||
|
|
@ -534,6 +530,59 @@ inline void fmt_e2_occ(const FMTIndex *fmt, bwtint_t k, int b1, int b2, bwtint_t
|
|||
cnt[3] += x >> 24 & 0xff;
|
||||
}
|
||||
|
||||
// 这里的k是bwt str的行
|
||||
inline static void fmt_get_previous_base(const FMTIndex *fmt, bwtint_t k, uint8_t *b1, uint8_t *b2)
|
||||
{
|
||||
uint32_t *p;
|
||||
uint8_t base2;
|
||||
// 第一步,找到check point位置
|
||||
p = fmt_occ_intv(fmt, k); // check point起始位置
|
||||
p += 20; // bwt碱基起始位置
|
||||
// 第二步,找到mid check point位置
|
||||
int mk = k & FMT_OCC_INTV_MASK;
|
||||
int n_mintv = mk >> FMT_MID_INTV_SHIFT;
|
||||
p += n_mintv * (4 + (FMT_MID_INTERVAL >> 3)); // 跳过mid间隔的bwt碱基位置
|
||||
// 第三步,找到具体的uint32_t
|
||||
p += (k & FMT_MID_INTV_MASK) >> 3; // 每个uint32_t包含8个碱基(和8个倒数第二bwt碱基)
|
||||
// 第四步,获取碱基
|
||||
base2 = *p >> ((~(k) & 0x7) << 2) & 0xf;
|
||||
*b2 = base2 >> 2 & 3;
|
||||
*b1 = base2 & 3;
|
||||
}
|
||||
|
||||
// k, k1, k2都是bwt矩阵对应的行
|
||||
inline static void fmt_previous_line(const FMTIndex *fmt, bwtint_t k, bwtint_t *k1, bwtint_t *k2)
|
||||
{
|
||||
uint8_t b1, b2;
|
||||
bwtint_t tk[4], kk;
|
||||
kk = k - (k >= fmt->primary); // k由bwt矩阵对应的行转换成bwt字符串对应的行(去掉了$,所以大于$的行,都减掉1)
|
||||
fmt_get_previous_base(fmt, kk, &b1, &b2);
|
||||
fmt_e2_occ(fmt, k, b1, b2, tk);
|
||||
*k1 = fmt->L2[b1] + tk[1];
|
||||
*k2 = fmt->L2[b2] + tk[3];
|
||||
}
|
||||
|
||||
bwtint_t fmt_sa(const FMTIndex *fmt, bwtint_t k)
|
||||
{
|
||||
bwtint_t sa = 0, mask = fmt->sa_intv - 1;
|
||||
bwtint_t k1, k2;
|
||||
while (k & mask)
|
||||
{
|
||||
++sa;
|
||||
fmt_previous_line(fmt, k, &k1, &k2);
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, k2), 0, 2);
|
||||
if (!(k1 & mask))
|
||||
{
|
||||
k = k1;
|
||||
break;
|
||||
}
|
||||
++sa;
|
||||
k = k2;
|
||||
}
|
||||
sa += bwt_get_sa(fmt->sa, k / fmt->sa_intv);
|
||||
return sa;
|
||||
}
|
||||
|
||||
// 扩展两个碱基
|
||||
inline void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int is_back, int b1, int b2)
|
||||
{
|
||||
|
|
@ -566,6 +615,67 @@ inline void fmt_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int i
|
|||
ok->x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= fmt->primary && ik->x[!is_back] + ik->x[2] - 1 >= fmt->primary) + tl[0] - tk[0];
|
||||
}
|
||||
|
||||
// 序列和参考基因直接对比
|
||||
static void direct_extend(const FMTIndex *fmt, int len, const char *q, int left_pos, int right_pos, bwtint_t mtx_line, bwtintv_t *mt)
|
||||
{
|
||||
#define PAC_BASE(pac, l) ((pac)[(l) >> 2] >> ((~(l) & 3) << 1) & 3)
|
||||
#define EXTEND_BASE_LOOP(qcond, rcond, qstep, rstep) \
|
||||
while (k != qcond && r != rcond) \
|
||||
{ \
|
||||
const int base = PAC_BASE(fmt->pac, r); \
|
||||
if (q[k] != "ACGT"[base]) \
|
||||
break; \
|
||||
k += qstep; \
|
||||
r += rstep; \
|
||||
}
|
||||
#define EXTEND_BASE_LOOP_COMP(qcond, rcond, qstep, rstep) \
|
||||
while (k != qcond && r != rcond) \
|
||||
{ \
|
||||
const int base = 3 - PAC_BASE(fmt->pac, r); \
|
||||
if (q[k] != "ACGT"[base]) \
|
||||
break; \
|
||||
k += qstep; \
|
||||
r += rstep; \
|
||||
}
|
||||
|
||||
int k;
|
||||
int64_t r, rp;
|
||||
mt->num_match = 1;
|
||||
rp = fmt_sa(fmt, mtx_line);
|
||||
r = rp >= (int64_t)fmt->l_pac ? (fmt->l_pac << 1) - 1 - rp : rp;
|
||||
k = right_pos;
|
||||
if (rp < (int64_t)fmt->l_pac) // 匹配到了正向链
|
||||
{
|
||||
// 向前继续扩展
|
||||
r += right_pos - left_pos;
|
||||
EXTEND_BASE_LOOP(len, (int64_t)fmt->l_pac, 1, 1);
|
||||
mt->rm[0].qe = k;
|
||||
mt->rm[0].reverse = 0;
|
||||
// 向后扩展,x位置之前的碱基
|
||||
r -= k - left_pos + 1;
|
||||
k = left_pos - 1;
|
||||
EXTEND_BASE_LOOP(-1, (int64_t)-1, -1, -1);
|
||||
mt->rm[0].qs = k + 1;
|
||||
mt->rm[0].rs = r + 1;
|
||||
}
|
||||
else // 匹配到了互补链
|
||||
{
|
||||
r -= right_pos - left_pos;
|
||||
EXTEND_BASE_LOOP_COMP(len, (int64_t)-1, 1, -1);
|
||||
mt->rm[0].qe = k;
|
||||
mt->rm[0].reverse = 1;
|
||||
// 扩展x之前的碱基
|
||||
r += k - left_pos + 1;
|
||||
k = left_pos - 1;
|
||||
EXTEND_BASE_LOOP_COMP(-1, (int64_t)fmt->l_pac, -1, 1);
|
||||
mt->rm[0].qs = k + 1;
|
||||
mt->rm[0].rs = r - 1;
|
||||
}
|
||||
mt->info = mt->rm[0].qs;
|
||||
mt->info = mt->info << 32 | mt->rm[0].qe;
|
||||
mt->x[2] = 1;
|
||||
}
|
||||
|
||||
// 利用fmt搜索seed,完整搜索,只需要单向搜索
|
||||
bwtintv_t fmt_search(FMTIndex *fmt, const string &q)
|
||||
{
|
||||
|
|
@ -604,166 +714,6 @@ bwtintv_t fmt_search(FMTIndex *fmt, const string &q)
|
|||
return ik;
|
||||
}
|
||||
|
||||
// 获取kmer的fmt匹配信息
|
||||
inline void kmer_getval_at(KmerEntry *ke, bwtintv_t *ok, int pos)
|
||||
{
|
||||
bwtint_t x0, x1, x2;
|
||||
int byte_idx = pos * 14;
|
||||
uint8_t *arr = ke->intv_arr + byte_idx;
|
||||
x0 = *arr;
|
||||
x0 = (x0 << 32) | *((uint32_t *)(arr + 1));
|
||||
arr += 5;
|
||||
x1 = *arr;
|
||||
x1 = (x1 << 32) | *((uint32_t *)(arr + 1));
|
||||
arr += 5;
|
||||
x2 = *((uint32_t *)arr);
|
||||
ok->x[0] = x0;
|
||||
ok->x[1] = x1;
|
||||
ok->x[2] = x2;
|
||||
}
|
||||
|
||||
// 设置kmer第pos个碱基对应的fmt匹配信息
|
||||
inline void kmer_setval_at(KmerEntry *ke, bwtintv_t ik, int pos)
|
||||
{
|
||||
int byte_idx = pos * 14;
|
||||
uint8_t *arr = ke->intv_arr + byte_idx;
|
||||
arr[0] = (uint8_t)(ik.x[0] >> 32);
|
||||
*((uint32_t *)(arr + 1)) = (uint32_t)ik.x[0];
|
||||
arr += 5;
|
||||
arr[0] = (uint8_t)(ik.x[1] >> 32);
|
||||
*((uint32_t *)(arr + 1)) = (uint32_t)ik.x[1];
|
||||
arr += 5;
|
||||
*((uint32_t *)arr) = (uint32_t)ik.x[2];
|
||||
}
|
||||
|
||||
inline void full_entry_set(FullEntry *fe, bwtintv_t ik)
|
||||
{
|
||||
uint8_t *arr = fe->intv_arr;
|
||||
arr[0] = (uint8_t)(ik.x[0] >> 32);
|
||||
*((uint32_t *)(arr + 1)) = (uint32_t)ik.x[0];
|
||||
arr += 5;
|
||||
arr[0] = (uint8_t)(ik.x[1] >> 32);
|
||||
*((uint32_t *)(arr + 1)) = (uint32_t)ik.x[1];
|
||||
arr += 5;
|
||||
*((uint32_t *)arr) = (uint32_t)ik.x[2];
|
||||
}
|
||||
|
||||
inline void full_entry_get(FullEntry *fe, bwtintv_t *ok)
|
||||
{
|
||||
bwtint_t x0, x1, x2;
|
||||
uint8_t *arr = fe->intv_arr;
|
||||
x0 = *arr;
|
||||
x0 = (x0 << 32) | *((uint32_t *)(arr + 1));
|
||||
arr += 5;
|
||||
x1 = *arr;
|
||||
x1 = (x1 << 32) | *((uint32_t *)(arr + 1));
|
||||
arr += 5;
|
||||
x2 = *((uint32_t *)arr);
|
||||
ok->x[0] = x0;
|
||||
ok->x[1] = x1;
|
||||
ok->x[2] = x2;
|
||||
}
|
||||
|
||||
// 查找并保存kmer中每扩展一个碱基对应的fmt位置信息
|
||||
void fmt_search_store_kmer(FMTIndex *fmt, const string &q, KmerEntry &ke)
|
||||
{
|
||||
bwtintv_t ik;
|
||||
int i, c1, c2;
|
||||
int qlen = (int)q.size();
|
||||
bwtint_t tk[4], tl[4];
|
||||
|
||||
fmt_set_intv(fmt, bval(q[0]), ik);
|
||||
kmer_setval_at(&ke, ik, 0);
|
||||
|
||||
// 每次扩展两个碱基
|
||||
for (i = 1; i + 1 < qlen; i += 2)
|
||||
{
|
||||
if (bval(q[i]) < 4 && bval(q[i + 1]) < 4)
|
||||
{
|
||||
c1 = cbval(q[i]);
|
||||
c2 = cbval(q[i + 1]);
|
||||
|
||||
fmt_e2_occ(fmt, ik.x[1] - 1, c1, c2, tk);
|
||||
fmt_e2_occ(fmt, ik.x[1] - 1 + ik.x[2], c1, c2, tl);
|
||||
// 第一次扩展的结果
|
||||
ik.x[0] = ik.x[0] + (ik.x[1] <= fmt->primary && ik.x[1] + ik.x[2] - 1 >= fmt->primary) + tl[0] - tk[0];
|
||||
ik.x[1] = fmt->L2[c1] + 1 + tk[1];
|
||||
ik.x[2] = tl[1] - tk[1];
|
||||
kmer_setval_at(&ke, ik, i);
|
||||
|
||||
// 第二次扩展的结果
|
||||
ik.x[0] = ik.x[0] + (ik.x[1] <= fmt->primary && ik.x[1] + ik.x[2] - 1 >= fmt->primary) + tl[2] - tk[2];
|
||||
ik.x[1] = fmt->L2[c2] + 1 + tk[3];
|
||||
ik.x[2] = tl[3] - tk[3];
|
||||
kmer_setval_at(&ke, ik, i + 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (i < qlen && bval(q[i]) < 4)
|
||||
{ // 最后一次扩展
|
||||
c1 = cbval(q[i]);
|
||||
c2 = 3;
|
||||
fmt_e2_occ(fmt, ik.x[1] - 1, c1, c2, tk);
|
||||
fmt_e2_occ(fmt, ik.x[1] - 1 + ik.x[2], c1, c2, tl);
|
||||
// 第一次扩展的结果
|
||||
ik.x[0] = ik.x[0] + (ik.x[1] <= fmt->primary && ik.x[1] + ik.x[2] - 1 >= fmt->primary) + tl[0] - tk[0];
|
||||
ik.x[1] = fmt->L2[c1] + 1 + tk[1];
|
||||
ik.x[2] = tl[1] - tk[1];
|
||||
kmer_setval_at(&ke, ik, i);
|
||||
}
|
||||
}
|
||||
|
||||
// 利用fmt搜索seed,利用kmer加速搜索过程
|
||||
bwtintv_t fmt_search_use_kmer(FMTIndex *fmt, const string &q)
|
||||
{
|
||||
bwtintv_t ik;
|
||||
bwtintv_t ok;
|
||||
int i, c1, c2, x = 0;
|
||||
int qlen = (int)q.size();
|
||||
|
||||
// 先利用kmer进行索引查询
|
||||
uint32_t qbit = 0;
|
||||
x = min(KMER_LEN, qlen);
|
||||
//double tt = realtime();
|
||||
for (i = 0; i < x; ++i)
|
||||
{
|
||||
qbit |= bval(q[i]) << ((KMER_LEN - 1 - i) << 1);
|
||||
}
|
||||
kmer_getval_at(&fmt->kmer_entry[qbit], &ik, x - 1);
|
||||
//t1 += realtime() - tt;
|
||||
// tt = realtime();
|
||||
// 每次扩展两个碱基
|
||||
for (i = x; i + 1 < qlen; i += 2)
|
||||
{
|
||||
if (bval(q[i]) < 4 && bval(q[i + 1]) < 4)
|
||||
{
|
||||
c1 = cbval(q[i]);
|
||||
c2 = cbval(q[i + 1]);
|
||||
fmt_extend2(fmt, &ik, &ok, 0, c1, c2);
|
||||
ik = ok;
|
||||
ik.info = i + 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (i < qlen && bval(q[i]) < 4)
|
||||
{ // 最后一次扩展
|
||||
c1 = cbval(q[i]);
|
||||
fmt_extend1(fmt, &ik, &ok, 0, c1);
|
||||
//fmt_extend2(fmt, &ik, &ok, 0, c1, 3);
|
||||
ik = ok;
|
||||
ik.info = i + 1;
|
||||
}
|
||||
//t2 += realtime() - tt;
|
||||
//cout << ik.x[2] << endl;
|
||||
return ik;
|
||||
}
|
||||
|
||||
// 将用字符串表示的序列计算成用2bit表示的序列
|
||||
uint32_t str2bit(string &str)
|
||||
{
|
||||
|
|
@ -922,7 +872,7 @@ void create_xmer_index(FMTIndex *fmt)
|
|||
}
|
||||
|
||||
// 利用fmt搜索seed,利用xmer加速搜索过程
|
||||
bwtintv_t fmt_search_use_xmer(FMTIndex *fmt, bwt_t *bwt, const string &q)
|
||||
bwtintv_t fmt_search_use_xmer(FMTIndex *fmt, bwt_t *bwt, const string &q, int cmp_de)
|
||||
{
|
||||
bwtintv_t ik;
|
||||
int i, c1, c2, x = 0;
|
||||
|
|
@ -957,7 +907,7 @@ bwtintv_t fmt_search_use_xmer(FMTIndex *fmt, bwt_t *bwt, const string &q)
|
|||
{
|
||||
xmer_getval_at(fmt->xmer_hash.xe10[qbit >> 8].intv_arr, &ik, x - 1);
|
||||
}
|
||||
#if 0
|
||||
#if 1
|
||||
bwtintv_t ok;
|
||||
// 每次扩展两个碱基
|
||||
for (i = x; i + 1 < qlen; i += 2)
|
||||
|
|
@ -982,7 +932,7 @@ bwtintv_t fmt_search_use_xmer(FMTIndex *fmt, bwt_t *bwt, const string &q)
|
|||
ik = ok;
|
||||
ik.info = i + 1;
|
||||
}
|
||||
#else
|
||||
#else // 测试一下bwt的性能
|
||||
bwtintv_t ok[4];
|
||||
for (i = x; i < qlen; ++i) {
|
||||
if (bval(q[i]) < 4) {
|
||||
|
|
@ -994,90 +944,117 @@ bwtintv_t fmt_search_use_xmer(FMTIndex *fmt, bwt_t *bwt, const string &q)
|
|||
}
|
||||
|
||||
#endif
|
||||
if (cmp_de) {
|
||||
int64_t rp = fmt_sa(fmt, ik.x[0]);
|
||||
rp = rp >= (int64_t)fmt->l_pac ? (fmt->l_pac << 1) - 1 - rp : rp;
|
||||
ik.rm[0].rs = (uint32_t)rp;
|
||||
}
|
||||
return ik;
|
||||
}
|
||||
|
||||
// 利用fmt搜索seed,利用xmer, direct_extend加速搜索过程
|
||||
bwtintv_t fmt_search_use_xmer_de(FMTIndex *fmt, const string &q)
|
||||
{
|
||||
bwtintv_t ik = {0};
|
||||
int i, c1, c2, x = 0;
|
||||
int qlen = (int)q.size();
|
||||
bwtintv_t mt = {0};
|
||||
|
||||
int max_xmer_len = 14;
|
||||
// 先利用kmer进行索引查询
|
||||
uint32_t qbit = 0;
|
||||
x = min(max_xmer_len, qlen);
|
||||
for (i = 0; i < x; ++i)
|
||||
{
|
||||
qbit |= bval(q[i]) << ((max_xmer_len - 1 - i) << 1);
|
||||
}
|
||||
if (x == 14)
|
||||
{
|
||||
xmer_getval_at(fmt->xmer_hash.xe14[qbit].intv_arr, &ik);
|
||||
}
|
||||
else if (x == 13)
|
||||
{
|
||||
xmer_getval_at(fmt->xmer_hash.xe13[qbit >> 2].intv_arr, &ik);
|
||||
}
|
||||
else if (x == 12)
|
||||
{
|
||||
// cout << "qbit: " << qbit << endl;
|
||||
xmer_getval_at(fmt->xmer_hash.xe12[qbit >> 4].intv_arr, &ik);
|
||||
}
|
||||
else if (x == 11)
|
||||
{
|
||||
xmer_getval_at(fmt->xmer_hash.xe11[qbit >> 6].intv_arr, &ik);
|
||||
}
|
||||
else
|
||||
{
|
||||
xmer_getval_at(fmt->xmer_hash.xe10[qbit >> 8].intv_arr, &ik, x - 1);
|
||||
}
|
||||
|
||||
bwtintv_t ok = {0};
|
||||
// 每次扩展两个碱基
|
||||
for (i = x; i + 1 < qlen; i += 2)
|
||||
{
|
||||
if (bval(q[i]) < 4 && bval(q[i + 1]) < 4)
|
||||
{
|
||||
c1 = cbval(q[i]);
|
||||
c2 = cbval(q[i + 1]);
|
||||
fmt_extend2(fmt, &ik, &ok, 0, c1, c2);
|
||||
if (ok.x[2] == 1) {
|
||||
direct_extend(fmt, qlen, q.c_str(), 0, i + 2, ok.x[0], &mt);
|
||||
return mt;
|
||||
}
|
||||
ik = ok;
|
||||
ik.info = i + 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (i < qlen && bval(q[i]) < 4)
|
||||
{ // 最后一次扩展
|
||||
c1 = cbval(q[i]);
|
||||
fmt_extend1(fmt, &ik, &ok, 0, c1);
|
||||
ik = ok;
|
||||
ik.info = i + 1;
|
||||
}
|
||||
|
||||
return ik;
|
||||
}
|
||||
|
||||
uint16_t *restore_kmer_bit(const char *fn, int kmer_len)
|
||||
{
|
||||
uint16_t *kbit = (uint16_t *)calloc(1L << ((kmer_len << 1) - 4), sizeof(uint16_t));
|
||||
FILE *fp;
|
||||
fp = xopen(fn, "rb");
|
||||
fread_fix(fp, (1 << ((kmer_len << 1) - 4)) * sizeof(uint16_t), kbit);
|
||||
err_fclose(fp);
|
||||
return kbit;
|
||||
}
|
||||
|
||||
void create_kmer_bit(FMTIndex *fmt, string &prefix, uint64_t kmer_len)
|
||||
{
|
||||
uint16_t *kbit = (uint16_t *)calloc(1L << ((kmer_len << 1) - 4), sizeof(uint16_t));
|
||||
ostringstream oss;
|
||||
oss << '.' << kmer_len << ".kmer.bit";
|
||||
string kmer_bit = prefix + oss.str();
|
||||
cout << kmer_bit << endl;
|
||||
uint64_t seq_up_val = (1L << (kmer_len << 1));
|
||||
cout << (1L << ((kmer_len << 1) - 4)) << '\t' << seq_up_val << endl;
|
||||
string seq;
|
||||
seq.resize(kmer_len);
|
||||
int percent = 1;
|
||||
double tt = realtime();
|
||||
for (uint64_t i = 0; i < seq_up_val; ++i)
|
||||
{
|
||||
if ((i+1) % (seq_up_val / 100) == 0)
|
||||
{
|
||||
tt = realtime() - tt;
|
||||
cout << percent++ << "\% finished, " << tt << "s elapsed." << endl;
|
||||
tt = realtime();
|
||||
}
|
||||
for (int j = kmer_len - 1; j >= 0; --j)
|
||||
{
|
||||
seq[kmer_len - 1 - j] = BASE[(i >> (j << 1)) & 3];
|
||||
}
|
||||
uint32_t pos = i >> 4;
|
||||
uint32_t bit_pos = i & 0xf;
|
||||
//bwtintv_t p = fmt_search_use_xmer(fmt, seq);
|
||||
//if (p.x[2] > 0) {
|
||||
// kbit[pos] |= 1 << bit_pos;
|
||||
//}
|
||||
}
|
||||
FILE *fp;
|
||||
fp = xopen(kmer_bit.c_str(), "wb");
|
||||
err_fwrite(kbit, 2, 1L << ((kmer_len << 1) - 4), fp);
|
||||
err_fflush(fp);
|
||||
err_fclose(fp);
|
||||
}
|
||||
|
||||
#define GEN_BWT_IDX 0
|
||||
#define GEN_FMT_IDX 0
|
||||
#define GEN_FMT_IDX 1
|
||||
#define GEN_XMER_IDX 0
|
||||
#define CMP_FMT_BWT_TIME 1
|
||||
#define CMP_FMT_BWT_RESULT 0
|
||||
#define CMP_FMT_BWT_RESULT 1
|
||||
|
||||
// argv[1] 应该是索引的前缀
|
||||
int main_fmtidx(int argc, char **argv)
|
||||
{
|
||||
//cout << "size: " << sizeof(FullEntry) << '\t' << sizeof(XmerEntryArr) << endl;
|
||||
string prefix = argv[1];
|
||||
string bwt_str = prefix + ".bwt.str";
|
||||
string bwt_idx, fmt_idx, xmer_idx;
|
||||
string bwt_idx, fmt_idx, xmer_idx, sa_fn, seq_fn;
|
||||
bwt_t *bwt;
|
||||
FMTIndex *fmt;
|
||||
ostringstream oss_bwt, oss_fmt, oss_xmer;
|
||||
ostringstream oss_bwt, oss_fmt, oss_xmer, oss_sa;
|
||||
oss_bwt << '.' << OCC_INTERVAL;
|
||||
oss_fmt << '.' << FMT_OCC_INTERVAL;
|
||||
oss_xmer << '.' << XMER_LEN;
|
||||
oss_fmt << '.' << FMT_MID_INTERVAL;
|
||||
oss_sa << ".33." << SA_INTV;
|
||||
|
||||
bwt_idx = prefix + oss_bwt.str() + ".bwt";
|
||||
fmt_idx = prefix + oss_fmt.str() + ".fmt";
|
||||
xmer_idx = prefix + ".14.kmer";
|
||||
sa_fn = prefix + oss_sa.str() + ".sa";
|
||||
seq_fn = "./seqs.txt";
|
||||
|
||||
cout << bwt_idx << endl;
|
||||
cout << fmt_idx << endl;
|
||||
cout << xmer_idx << endl;
|
||||
cout << sa_fn << endl;
|
||||
|
||||
|
||||
// 生成或读取bwt索引文件
|
||||
// 生成或读取bwt索引文件
|
||||
double time_read_bwt = realtime();
|
||||
#if GEN_BWT_IDX
|
||||
bwt = restore_bwt(bwt_str.c_str());
|
||||
|
|
@ -1100,7 +1077,7 @@ int main_fmtidx(int argc, char **argv)
|
|||
time_read_fmt = realtime() - time_read_fmt;
|
||||
cout << "[time gen/read fmt:] " << time_read_fmt << "s" << endl;
|
||||
|
||||
// 生成xmer索引文件
|
||||
// 生成或读取xmer索引文件
|
||||
double time_gen_xmer_hash = realtime();
|
||||
#if GEN_XMER_IDX
|
||||
create_xmer_index(fmt);
|
||||
|
|
@ -1111,80 +1088,119 @@ int main_fmtidx(int argc, char **argv)
|
|||
time_gen_xmer_hash = realtime() - time_gen_xmer_hash;
|
||||
cout << "[time gen xkmer hash:] " << time_gen_xmer_hash << "s" << endl;
|
||||
|
||||
int kmer_bit_len = 18;
|
||||
//create_kmer_bit(fmt, prefix, kmer_bit_len);
|
||||
// return 0;
|
||||
// 读取sa
|
||||
double time_load_sa = realtime();
|
||||
fmt_restore_sa(sa_fn.c_str(), fmt);
|
||||
time_load_sa = realtime() - time_load_sa;
|
||||
cout << "[time load sa:] " << time_load_sa << "s" << endl;
|
||||
|
||||
// 读取pac
|
||||
double time_load_pac = realtime();
|
||||
fmt_load_ref_pac(prefix.c_str(), fmt);
|
||||
time_load_pac = realtime() - time_load_pac;
|
||||
cout << "[time load pac:] " << time_load_pac << "s" << endl;
|
||||
|
||||
//cout << "l_pac: " << fmt->l_pac << endl;
|
||||
//return 0;
|
||||
|
||||
// 生成随机序列进行测试
|
||||
int seq_size = 10000000;
|
||||
//int seq_size = 1;
|
||||
int seq_len = kmer_bit_len;
|
||||
int seq_len = 60;
|
||||
vector<string> seq_arr(seq_size);
|
||||
seq_arr[0] = "CACATATTGGCG";
|
||||
//seq_arr[0] = "CACATATTGGCG";
|
||||
// seq_arr[0] = "GAGAGCTGTTCCCGTGTTTTCCATGGTTT";
|
||||
// seq_arr[0] = "ACCTTTCGAATTGA";
|
||||
#if 1
|
||||
#if 0 // 随机生成seqs
|
||||
srand(time(NULL));
|
||||
double time_gen_seq = realtime();
|
||||
for (int i = 0; i < (int)seq_arr.size(); ++i)
|
||||
seq_arr[i] = generate_rand_seq(seq_len);
|
||||
time_gen_seq = realtime() - time_gen_seq;
|
||||
cout << "[time gen seq:] " << time_gen_seq << "s" << endl;
|
||||
#else
|
||||
ifstream seq_in(seq_fn.c_str(), ios::in);
|
||||
string read_line;
|
||||
seq_arr.resize(0);
|
||||
while (!seq_in.eof())
|
||||
{
|
||||
seq_in >> read_line;
|
||||
// cout << read_line.substr(0, seq_len) << endl;
|
||||
seq_arr.push_back(read_line.substr(0, seq_len));
|
||||
}
|
||||
seq_size = seq_arr.size();
|
||||
seq_in.close();
|
||||
#endif
|
||||
|
||||
// 对比bwt和fmt搜索的时间
|
||||
// 对比bwt和fmt搜索的时间
|
||||
|
||||
#if CMP_FMT_BWT_TIME
|
||||
//int nintv_0 = 0;
|
||||
//bwtint_t max_gap = 0;
|
||||
// bwt
|
||||
double time_bwt_search = realtime();
|
||||
for (int i = 0; i < (int)seq_arr.size(); ++i) {
|
||||
bwtintv_t p = bwt_search(bwt, seq_arr[i]);
|
||||
//if (max_gap < p.x[2])
|
||||
// max_gap = p.x[2];
|
||||
//if (p.x[2] == 0)
|
||||
// ++nintv_0;
|
||||
// bwtintv_t p =
|
||||
bwt_search(bwt, seq_arr[i]);
|
||||
}
|
||||
//cout << "all: " << seq_arr.size() << " zero: " << nintv_0 << endl;
|
||||
//cout << "max gap: " << max_gap << endl;
|
||||
time_bwt_search = realtime() - time_bwt_search;
|
||||
cout << "[time bwt search:] " << time_bwt_search << "s" << endl;
|
||||
|
||||
// fmt
|
||||
double time_fmt_search = realtime();
|
||||
for (int i = 0; i < (int)seq_arr.size(); ++i)
|
||||
fmt_search(fmt, seq_arr[i]);
|
||||
time_fmt_search = realtime() - time_fmt_search;
|
||||
cout << "[time fmt search:] " << time_fmt_search << "s" << endl;
|
||||
|
||||
// kmer
|
||||
double time_xmer_search = realtime();
|
||||
for (int i = 0; i < (int)seq_arr.size(); ++i)
|
||||
fmt_search_use_xmer(fmt, bwt, seq_arr[i]);
|
||||
fmt_search_use_xmer(fmt, bwt, seq_arr[i], 0);
|
||||
time_xmer_search = realtime() - time_xmer_search;
|
||||
cout << "[time xmer search:] " << time_xmer_search << "s" << endl;
|
||||
#endif
|
||||
// cout << "t1: " << t1 << "s; t2: " << t2 << "s" << endl;
|
||||
// cout << "t3: " << t3 << "s; t4: " << t4 << "s" << endl;
|
||||
// cout << "t5: " << t5 << "s; t4: " << t6 << "s" << endl;
|
||||
|
||||
// ostringstream oss_kmer_bit;
|
||||
// oss_kmer_bit << prefix << "." << kmer_bit_len << ".kmer.bit";
|
||||
// uint16_t *kbit = restore_kmer_bit(oss_kmer_bit.str().c_str(), kmer_bit_len);
|
||||
// direct-extend
|
||||
double time_de_search = realtime();
|
||||
for (int i = 0; i < (int)seq_arr.size(); ++i)
|
||||
fmt_search_use_xmer_de(fmt, seq_arr[i]);
|
||||
time_de_search = realtime() - time_de_search;
|
||||
cout << "[time direct-extend search:] " << time_de_search << "s" << endl;
|
||||
#endif
|
||||
|
||||
// 对比bwt和fmt搜索的结果
|
||||
#if CMP_FMT_BWT_RESULT
|
||||
double time_cmp = realtime();
|
||||
double time_cmp = realtime();
|
||||
long diff_num = 0;
|
||||
for (int i = 0; i < (int)seq_arr.size(); ++i)
|
||||
{
|
||||
#if 0
|
||||
bwtintv_t p1 = bwt_search2(bwt, seq_arr[i]);
|
||||
// bwtintv_t p2 = fmt_search(fmt, seq_arr[i]);
|
||||
bwtintv_t p2 = fmt_search_use_xmer(fmt, seq_arr[i]);
|
||||
if (p1.x[0] != p2.x[0] || p1.x[1] != p2.x[1] || p1.x[2] != p2.x[2])
|
||||
cout << seq_arr[i] << endl
|
||||
bwtintv_t p2 = fmt_search_use_xmer(fmt, bwt, seq_arr[i], 0);
|
||||
if (p1.x[0] != p2.x[0] || p1.x[1] != p2.x[1] || p1.x[2] != p2.x[2]) {
|
||||
cout << "different: " << seq_arr[i] << endl
|
||||
<< p1.x[0] << ' ' << p1.x[1] << ' ' << p1.x[2] << endl
|
||||
<< p2.x[0] << ' ' << p2.x[1] << ' ' << p2.x[2] << endl;
|
||||
diff_num++;
|
||||
}
|
||||
#else
|
||||
bwtintv_t p1 = fmt_search_use_xmer(fmt, bwt, seq_arr[i], 1);
|
||||
bwtintv_t p2 = fmt_search_use_xmer_de(fmt, seq_arr[i]);
|
||||
if (p2.num_match == 1) {
|
||||
if (p1.rm[0].rs != p2.rm[0].rs || p1.x[2] != 1) {
|
||||
cout << "different: " << seq_arr[i] << '\t' << p1.rm[0].rs << '\t' << p2.rm[0].rs << endl;
|
||||
diff_num++;
|
||||
}
|
||||
}
|
||||
else if (p1.x[0] != p2.x[0] || p1.x[1] != p2.x[1] || p1.x[2] != p2.x[2]) {
|
||||
cout << "different: " << seq_arr[i] << endl
|
||||
<< p1.x[0] << ' ' << p1.x[1] << ' ' << p1.x[2] << endl
|
||||
<< p2.x[0] << ' ' << p2.x[1] << ' ' << p2.x[2] << endl;
|
||||
diff_num++;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
time_cmp = realtime() - time_cmp;
|
||||
cout << "[time compare bwt & fmt:] " << time_cmp << "s" << endl;
|
||||
cout << "[time compare bwt & fmt:] " << time_cmp << "s\tdifferent num: " << diff_num << endl;
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
|
|
|
|||
28
fmt_index.h
28
fmt_index.h
|
|
@ -22,7 +22,8 @@
|
|||
#elif FMT_MID_INTERVAL == 32
|
||||
#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 48))
|
||||
#elif FMT_MID_INTERVAL == 64
|
||||
#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 32))
|
||||
//#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 32))
|
||||
#define fmt_occ_intv(b, k) ((b)->bwt + ((k) >> 8 << 6))
|
||||
#elif FMT_MID_INTERVAL == 128
|
||||
#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 24))
|
||||
#else
|
||||
|
|
@ -38,26 +39,12 @@
|
|||
#define __fmt_mid_sum(x) \
|
||||
((x) >> 24 & 0xff) + ((x) >> 16 & 0xff) + ((x) >> 8 & 0xff) + ((x) & 0xff)
|
||||
|
||||
#define KMER_LEN 12
|
||||
#define KMER_ARR_SIZE (1 << (KMER_LEN << 1))
|
||||
|
||||
#define XMER_LEN 14
|
||||
|
||||
// 用来保存kmer对应的fmt的位置信息
|
||||
struct KmerEntry
|
||||
{
|
||||
// 40+40+32 14个byte,这样好处理
|
||||
uint8_t intv_arr[14 * KMER_LEN]; // 保存kmer中每扩展一个碱基对应的bwtintv_t数据
|
||||
};
|
||||
|
||||
#define FULL_ENTRY_LEN 14
|
||||
#define FULL_ENTRY_ARR_SIZE (1 << (FULL_ENTRY_LEN << 1))
|
||||
// kmer全部匹配对应的最后fmt匹配结果
|
||||
struct FullEntry
|
||||
{
|
||||
uint8_t intv_arr[14];
|
||||
};
|
||||
|
||||
// sa存储的行间隔
|
||||
#define SA_INTV 4
|
||||
#define SA_BYTES(n_sa) ((((33 * (n_sa) + 7) / 8) & (~7L)) + 8)
|
||||
|
||||
// x-mer entry, 1 - 14个碱基组成的hash key
|
||||
struct XmerEntry
|
||||
|
|
@ -93,9 +80,10 @@ struct FMTIndex
|
|||
uint8_t sec_bcp; // base couple for sec primary line, AA=>0, AC=>1 ... TT=>15
|
||||
uint8_t first_base; // 序列的第一个碱基2bit的int类型,0,1,2,3
|
||||
uint8_t last_base; // dollar转换成的base
|
||||
// ref pac相关
|
||||
bwtint_t l_pac; // 参考序列长度
|
||||
uint8_t *pac; // 保存2bit编码的参考序列
|
||||
// 保存kmer对应的fmt位置信息
|
||||
KmerEntry *kmer_entry;
|
||||
FullEntry *full_entry;
|
||||
XmerHash xmer_hash;
|
||||
// suffix array
|
||||
int sa_intv;
|
||||
|
|
|
|||
10
util.h
10
util.h
|
|
@ -17,10 +17,20 @@ typedef uint64_t bwtint_t;
|
|||
|
||||
double realtime(void);
|
||||
|
||||
typedef struct
|
||||
{
|
||||
int qs; // query start
|
||||
int qe; // query end [)
|
||||
int reverse; // 反向链
|
||||
uint32_t rs; // 参考基因的起始点(包含)
|
||||
} ref_match_t;
|
||||
|
||||
// 在fm-indexv(或者bwt)查找过程中,记录结果
|
||||
struct bwtintv_t
|
||||
{
|
||||
bwtint_t x[3], info; // x[0]表示正链位置,x[1]表示互补链位置,x[2]表示间隔长度,info 表示read的起始结束位置
|
||||
int num_match;
|
||||
ref_match_t rm[2];
|
||||
};
|
||||
|
||||
// 读取单个数据
|
||||
|
|
|
|||
Loading…
Reference in New Issue