fast-bwa/fmt_idx.c

795 lines
31 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

/*
Description: Accelerates the seeding process using the fmt-idx data structure (an FM-index variant that performs two search steps at a time)
Copyright : All right reserved by ICT
Author : Zhang Zhonghai
Date : 2023/12/24
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "fmt_idx.h"
#include "utils.h"
#include "bntseq.h"
#include "kvec.h"
static const char BASE[4] = {'A', 'C', 'G', 'T'};

/*
 * Generate the textual form of every k-mer of length kmer_len.
 *
 * seq_arr  : caller-provided array with room for 4^kmer_len pointers.
 *            Entry i receives a freshly malloc'ed, NUL-terminated string
 *            spelling the k-mer whose 2-bit encoding is i (most significant
 *            base first). The caller owns and must free every string.
 * kmer_len : number of bases per k-mer; values outside [0, 31] are ignored.
 *
 * The process is terminated if an allocation fails.
 */
void gen_all_seq(char **seq_arr, int kmer_len)
{
    if (seq_arr == NULL || kmer_len < 0 || kmer_len > 31)
        return;

    /* 64-bit arithmetic: a plain int shift overflows once kmer_len reaches 16 */
    const uint64_t n_seq = (uint64_t)1 << (kmer_len << 1);
    for (uint64_t i = 0; i < n_seq; ++i)
    {
        /* one extra byte so the result is a real C string */
        char *s = (char *)malloc((size_t)kmer_len + 1);
        if (s == NULL)
        {
            fprintf(stderr, "[%s] out of memory\n", __func__);
            exit(EXIT_FAILURE);
        }
        for (int j = kmer_len - 1; j >= 0; --j)
            s[kmer_len - 1 - j] = BASE[(i >> (j << 1)) & 3];
        s[kmer_len] = '\0';
        seq_arr[i] = s;
    }
}
// Build fmt->cnt_occ: for every (bwt base a, pre-bwt base b) combination and
// every possible byte value, precompute four 8-bit counters packed in a uint32.
// A byte holds two (pre-bwt, bwt) pairs, one per nibble; within a nibble the
// upper two bits are the pre-bwt base and the lower two bits the bwt base.
// Packed layout of each table entry (low byte first):
//   bits  0-7 : number of bwt bases greater than a
//   bits  8-15: number of bwt bases equal to a
//   bits 16-23: number of pairs whose bwt base is a and pre-bwt base is greater than b
//   bits 24-31: number of pairs whose bwt base is a and pre-bwt base equals b
void fmt_gen_cnt_occ(FMTIndex *fmt)
{
    for (int byte = 0; byte < 256; ++byte) // every possible byte value
    {
        const int hi_pre = byte >> 6 & 3, hi_bwt = byte >> 4 & 3;
        const int lo_pre = byte >> 2 & 3, lo_bwt = byte & 3;
        for (int a = 0; a < 4; ++a)
        {
            const int hi_is_a = (hi_bwt == a);
            const int lo_is_a = (lo_bwt == a);
            const uint32_t eq_a = hi_is_a + lo_is_a;
            const uint32_t gt_a = (hi_bwt > a) + (lo_bwt > a);
            for (int b = 0; b < 4; ++b)
            {
                const uint32_t gt_b = (hi_pre > b && hi_is_a) + (lo_pre > b && lo_is_a);
                const uint32_t eq_b = (hi_pre == b && hi_is_a) + (lo_pre == b && lo_is_a);
                const int table = a << 2 | b; // table index: bwt base first, then pre-bwt base
                fmt->cnt_occ[table][byte] = eq_b << 24 | gt_b << 16 | eq_a << 8 | gt_a;
            }
        }
    }
}
// Build the fmt-index count table.
// cnt_table[k][byte] packs four 8-bit counters: counter j (bits 8j .. 8j+7) is
// the number of (pre-bwt, bwt) pairs in `byte` whose bwt base is k and whose
// pre-bwt base is j. A byte carries two pairs, one per nibble, with the
// pre-bwt base in the upper two bits of the nibble.
void fmt_gen_cnt_table(uint32_t cnt_table[4][256])
{
    for (int byte = 0; byte < 256; ++byte) // every possible byte value
    {
        const int pre[2] = {byte >> 6 & 3, byte >> 2 & 3};
        const int bwt[2] = {byte >> 4 & 3, byte & 3};
        for (int base = 0; base < 4; ++base) // bwt base
        {
            uint32_t packed = 0;
            for (int slot = 0; slot < 2; ++slot)
            {
                // bump the 8-bit counter selected by this pair's pre-bwt base
                if (bwt[slot] == base)
                    packed += (uint32_t)1 << (pre[slot] << 3);
            }
            cnt_table[base][byte] = packed;
        }
    }
}
// Serialize an FMTIndex to the binary file `fn`.
// Layout: primary, sec_primary (bwtint_t each), sec_bcp, first_base,
// last_base (one byte each), L2[1..4] (bwtint_t each), followed by
// bwt_size 32-bit words of bwt data.
void dump_fmt(const char *fn, const FMTIndex *fmt)
{
    FILE *out = xopen(fn, "wb");

    /* header */
    err_fwrite(&fmt->primary, sizeof(bwtint_t), 1, out);
    err_fwrite(&fmt->sec_primary, sizeof(bwtint_t), 1, out);
    err_fwrite(&fmt->sec_bcp, sizeof(uint8_t), 1, out);
    err_fwrite(&fmt->first_base, sizeof(uint8_t), 1, out);
    err_fwrite(&fmt->last_base, sizeof(uint8_t), 1, out);
    err_fwrite(&fmt->L2[1], sizeof(bwtint_t), 4, out); /* L2[0] is not stored */

    /* payload: 4-byte words */
    err_fwrite(fmt->bwt, 4, fmt->bwt_size, out);

    err_fflush(out);
    err_fclose(out);
}
// Load an FMTIndex from the binary file `fn` (the layout written by dump_fmt).
// Returns a calloc'ed FMTIndex whose bwt buffer is also heap-allocated; the
// caller owns both. The number of 32-bit bwt words is derived from the file
// size minus the fixed header (6 bwtint_t values + 3 single-byte fields).
// Aborts through xassert on allocation failure or if the file is too short
// to contain the header.
FMTIndex *fmt_restore_fmt(const char *fn)
{
    const long header_bytes = (long)(sizeof(bwtint_t) * 6 + 3);
    FMTIndex *fmt = (FMTIndex *)calloc(1, sizeof(FMTIndex));
    xassert(fmt != NULL, "failed to allocate FMTIndex");

    FILE *fp = xopen(fn, "rb");
    xassert(fseek(fp, 0, SEEK_END) == 0, "failed to seek to the end of the fmt file");
    const long file_bytes = ftell(fp);
    // ftell returns -1 on error; the original unsigned subtraction would turn
    // that (or a truncated file) into an enormous bwt_size.
    xassert(file_bytes >= header_bytes, "fmt file is truncated or its size cannot be determined");

    fmt->bwt_size = (file_bytes - header_bytes) >> 2; // size counted in 32-bit words
    fmt->bwt = (uint32_t *)calloc(fmt->bwt_size, 4);
    xassert(fmt->bwt != NULL || fmt->bwt_size == 0, "failed to allocate the fmt bwt buffer");

    xassert(fseek(fp, 0, SEEK_SET) == 0, "failed to rewind the fmt file");
    err_fread_noeof(&fmt->primary, sizeof(bwtint_t), 1, fp);
    err_fread_noeof(&fmt->sec_primary, sizeof(bwtint_t), 1, fp);
    err_fread_noeof(&fmt->sec_bcp, sizeof(uint8_t), 1, fp);
    err_fread_noeof(&fmt->first_base, sizeof(uint8_t), 1, fp);
    err_fread_noeof(&fmt->last_base, sizeof(uint8_t), 1, fp);
    err_fread_noeof(fmt->L2 + 1, sizeof(bwtint_t), 4, fp); // L2[0] stays 0 from calloc
    fread_fix(fp, fmt->bwt_size << 2, fmt->bwt);
    fmt->seq_len = fmt->L2[4];
    err_fclose(fp);

    fmt_gen_cnt_occ(fmt); // per-byte lookup table of cumulative base counts
    return fmt;
}
// Write the k-mer hash tables (k = 10 .. 14) to the binary file `fn`.
// The table for k-mers of length L holds 1 << (2 * L) entries; the tables are
// written back to back in increasing k, as raw bytes.
void fmt_dump_kmer_idx(const char *fn, const KmerHash *kh)
{
    const size_t bytes10 = (1 << (10 << 1)) * sizeof(KmerEntryArr);
    const size_t bytes11 = (1 << (11 << 1)) * sizeof(KmerEntry);
    const size_t bytes12 = (1 << (12 << 1)) * sizeof(KmerEntry);
    const size_t bytes13 = (1 << (13 << 1)) * sizeof(KmerEntry);
    const size_t bytes14 = (1 << (14 << 1)) * sizeof(KmerEntry);

    FILE *out = xopen(fn, "wb");
    err_fwrite(kh->ke10, 1, bytes10, out);
    err_fwrite(kh->ke11, 1, bytes11, out);
    err_fwrite(kh->ke12, 1, bytes12, out);
    err_fwrite(kh->ke13, 1, bytes13, out);
    err_fwrite(kh->ke14, 1, bytes14, out);
    err_fflush(out);
    err_fclose(out);
}
// Load the k-mer hash tables (k = 10 .. 14) from the binary file `fn`.
// The file layout is the one written by fmt_dump_kmer_idx: the tables follow
// each other in increasing k, and the table for length L has 1 << (2 * L)
// entries. The returned KmerHash owns five malloc'ed tables.
// Aborts through xassert if any table cannot be allocated (the larger tables
// are big enough that allocation failure is a realistic possibility).
KmerHash fmt_restore_kmer_idx(const char *fn)
{
    KmerHash khash;
    size_t n_bytes;
    memset(&khash, 0, sizeof(khash)); // no indeterminate fields in the returned value

    FILE *fp = xopen(fn, "rb");

    n_bytes = ((size_t)1 << (10 << 1)) * sizeof(KmerEntryArr);
    khash.ke10 = (KmerEntryArr *)malloc(n_bytes);
    xassert(khash.ke10 != NULL, "failed to allocate the 10-mer table");
    fread_fix(fp, n_bytes, khash.ke10);

    n_bytes = ((size_t)1 << (11 << 1)) * sizeof(KmerEntry);
    khash.ke11 = (KmerEntry *)malloc(n_bytes);
    xassert(khash.ke11 != NULL, "failed to allocate the 11-mer table");
    fread_fix(fp, n_bytes, khash.ke11);

    n_bytes = ((size_t)1 << (12 << 1)) * sizeof(KmerEntry);
    khash.ke12 = (KmerEntry *)malloc(n_bytes);
    xassert(khash.ke12 != NULL, "failed to allocate the 12-mer table");
    fread_fix(fp, n_bytes, khash.ke12);

    n_bytes = ((size_t)1 << (13 << 1)) * sizeof(KmerEntry);
    khash.ke13 = (KmerEntry *)malloc(n_bytes);
    xassert(khash.ke13 != NULL, "failed to allocate the 13-mer table");
    fread_fix(fp, n_bytes, khash.ke13);

    n_bytes = ((size_t)1 << (14 << 1)) * sizeof(KmerEntry);
    khash.ke14 = (KmerEntry *)malloc(n_bytes);
    xassert(khash.ke14 != NULL, "failed to allocate the 14-mer table");
    fread_fix(fp, n_bytes, khash.ke14);

    err_fclose(fp);
    return khash;
}
// Load the sampled suffix array from the binary file `fn` into `fmt`.
// The file starts with: primary, four skipped bwtint_t values, sa_intv and
// seq_len, followed by SA_BYTES(n_sa) bytes of packed SA samples.
// primary and seq_len must match the values already stored in `fmt`.
// On success fmt->sa_intv, fmt->n_sa and fmt->sa (malloc'ed) are set.
// Aborts through xassert on inconsistency, on a zero sampling interval
// (it is used as a divisor below) or on allocation failure.
void fmt_restore_sa(const char *fn, FMTIndex *fmt)
{
    bwtint_t skipped[4]; // header fields that are read and discarded
    bwtint_t primary, seq_len;
    FILE *fp = xopen(fn, "rb");

    err_fread_noeof(&primary, sizeof(bwtint_t), 1, fp);
    xassert(primary == fmt->primary, "SA-BWT inconsistency: primary is not the same.");
    err_fread_noeof(skipped, sizeof(bwtint_t), 4, fp); // skip
    err_fread_noeof(&fmt->sa_intv, sizeof(bwtint_t), 1, fp);
    xassert(fmt->sa_intv != 0, "SA-BWT inconsistency: sa_intv is zero.");
    err_fread_noeof(&seq_len, sizeof(bwtint_t), 1, fp);
    xassert(seq_len == fmt->seq_len, "SA-BWT inconsistency: seq_len is not the same.");

    fmt->n_sa = (fmt->seq_len + fmt->sa_intv) / fmt->sa_intv;
    fmt->sa = (uint8_t *)malloc(SA_BYTES(fmt->n_sa));
    xassert(fmt->sa != NULL, "failed to allocate the suffix array");
    fread_fix(fp, SA_BYTES(fmt->n_sa), fmt->sa);
    err_fclose(fp);
}
// Build an FMT-index from an interval-sampled BWT.
// Allocates and returns a new FMTIndex; its bwt buffer (buf below) interleaves
// check points (4 single-base counts + 16 base-pair counts), packed
// (pre-bwt, bwt) base pairs and, when FMT_MID_INTERVAL is defined, intermediate
// check points holding local counts. Also fills seq_len, L2, primary,
// sec_bcp, sec_primary, first_base and last_base.
// NOTE(review): neither calloc result is checked before use.
// NOTE(review): `3 << lshift_bit` with lshift_bit == 30 shifts a signed int
// into the sign bit — consider `3U` if this code is ever revised.
FMTIndex *create_fmt_from_bwt(bwt_t *bwt)
{
// FILE *fmt_out = fopen("fmt.txt", "w");
FMTIndex *fmt = (FMTIndex *)calloc(1, sizeof(FMTIndex));
fmt_gen_cnt_occ(fmt);
bwtint_t i, j, k, m, n, n_occ, cnt[4], cnt2[4];
uint32_t c[4], c2[16]; /* c holds the cumulative counts of the original bwt bases; c2 holds the cumulative counts of (pre-bwt, bwt) base pairs, i.e. AA..TT */
uint32_t *buf; /* becomes the bwt part of the fmt structure once filled */
#ifdef FMT_MID_INTERVAL
// add the intermediate (mid) check points
uint32_t mc[4] = {0};
uint32_t cnt_table[4][256]; // first index 0,1,2,3 selects the bwt base being extended; filled by fmt_gen_cnt_table
fmt_gen_cnt_table(cnt_table);
#endif
fmt->seq_len = bwt->seq_len; // length of the bwt base string; it excludes '$', so it is one less than the bwt matrix height
for (i = 0; i < 5; ++i)
fmt->L2[i] = bwt->L2[i]; // total cumulative count of each base
fmt->primary = bwt->primary; // rank, among the bwt matrix rows, of the row that ends with '$'
n_occ = (bwt->seq_len + FMT_OCC_INTERVAL - 1) / FMT_OCC_INTERVAL + 1; // number of check points
fmt->bwt_size = (fmt->seq_len * 2 + 15) >> 4; // the last two columns of bases have to be stored
fmt->bwt_size += n_occ * 20; // A,C,G,T plus AA,AC,...,TG,TT: 20 counters per check point
#ifdef FMT_MID_INTERVAL
uint32_t s1;
bwtint_t mn_occ = (bwt->seq_len >> FMT_OCC_INTV_SHIFT) * (FMT_OCC_INTERVAL / FMT_MID_INTERVAL - 1);
bwtint_t last_seq_len = bwt->seq_len % FMT_OCC_INTERVAL;
mn_occ += (last_seq_len + FMT_MID_INTERVAL - 1) / FMT_MID_INTERVAL - 1;
fmt->bwt_size += mn_occ * 4;
i = 0;
#endif
buf = (uint32_t *)calloc(fmt->bwt_size, 4); // working buffer used to build the fmt data
c[0] = c[1] = c[2] = c[3] = 0;
// c2 for the first row: the occ taken at the row just before the first row of each of A, C, G, T
for (i = 0; i < 4; ++i)
{
bwtint_t before_first_line = fmt->L2[i];
bwt_occ4(bwt, before_first_line, cnt);
for (j = i * 4, k = 0; k < 4; ++j, ++k)
c2[j] = cnt[k];
}
// k is the write offset into buf
for (i = k = 0; i < bwt->seq_len; ++i)
{
// record a check point
if (i % FMT_OCC_INTERVAL == 0)
{
memcpy(buf + k, c, sizeof(uint32_t) * 4); // occ of each base in the bwt string
k += 4;
memcpy(buf + k, c2, sizeof(uint32_t) * 16); // occ of each pre-bwt:bwt base pair
k += 16;
#ifdef FMT_MID_INTERVAL
mc[0] = mc[1] = mc[2] = mc[3] = 0;
#endif
}
// each 32-bit integer stores 8 second-to-last-column (pre-bwt) bases and 8 last-column (bwt) bases
if (i % 16 == 0) // a 32-bit integer of the source bwt holds 16 bases, so 16 bases are handled per step; the smallest usable interval is therefore 16
{
uint32_t pre_bwt_16_seq = 0; // the 16 pre-bwt bases
uint32_t *bwt_addr = bwt_occ_intv(bwt, i) + 4; // whether to add 4 or 8 depends on whether occ is stored as uint32 or uint64; this is the base address for row i of the bwt string, past the occ block (4 uint32_t, or 8 uint32_t) of the original bwt check point
int offset = (i % OCC_INTERVAL) / 16; // every OCC_INTERVAL bases share one base address and every 16 bases share one uint32, so an offset is needed to reach the word holding the current bases
uint32_t bwt_16_seq = *(bwt_addr + offset); // the 16 bwt bases processed in this step
for (j = 0; j < 16; ++j) // handle the bwt bases one at a time
{
bwtint_t cur_str_line = i + j; // row index of the current base within the bwt string
if (cur_str_line < bwt->seq_len) // must not run past the bwt string, which is one shorter than the bwt matrix because it excludes '$'
{
uint8_t bwt_base = bwt_B0(bwt, cur_str_line); // bwt base of this row
// first work out the bwt matrix row that corresponds to this string row
bwtint_t cur_mtx_line = cur_str_line;
if (cur_str_line >= bwt->primary) // '$' was removed from the bwt string, so rows at or beyond the '$' row are shifted by one in the bwt matrix
cur_mtx_line += 1;
bwt_occ4(bwt, cur_mtx_line, cnt); // occ values from the original bwt check points
for (m = 0; m < 4; ++m)
c[m] = (uint32_t)cnt[m]; // cumulative count of base m up to and including cur_mtx_line; copied straight from the original bwt occ
cnt[bwt_base] -= 1; // cumulative count before cur_mtx_line (exclusive), i.e. what bwt_occ4(bwt, cur_mtx_line-1, cnt) would give for bwt_base
bwtint_t bwt_base_mtx_line = bwt->L2[bwt_base] + 1 + cnt[bwt_base]; // bwt matrix row reached from bwt_base (LF mapping)
bwt_occ4(bwt, bwt_base_mtx_line, cnt2); // occ at bwt_base_mtx_line
for (n = 0; n < 4; ++n)
{
int c2_idx = bwt_base << 2 | n; // the bwt base occupies the high bits of the index
c2[c2_idx] = (uint32_t)cnt2[n]; // cumulative count of the pre-bwt:bwt base pair
}
bwtint_t bwt_base_str_line = bwt_base_mtx_line; // corresponding row index within the bwt string
if (bwt_base_str_line >= bwt->primary) // string rows exclude '$', so rows beyond the '$' row are shifted back by one
bwt_base_str_line -= 1; // the bwt string has no '$' row
uint32_t pre_bwt_base = bwt_B0(bwt, bwt_base_str_line); // base preceding the bwt base (pre-bwt)
// When bwt_base maps to the matrix row that ends with '$', bwt_base is the first base of the sequence.
// The pre_bwt_base computed here is then the bwt base of the row just before primary; it stands in for '$' and has to be accounted for in later computations.
if (bwt_base_mtx_line == bwt->primary)
{
// compute sec_bcp
fmt->sec_bcp = pre_bwt_base << 2 | bwt_base; // because '$' is treated as A
fmt->sec_primary = cur_mtx_line; // bwt matrix row whose pre-bwt base is '$'
fmt->first_base = bwt_base; // first base of the original sequence
fmt->last_base = pre_bwt_base; // base substituted for '$': the bwt base of the row just above primary
}
// stash the pre-bwt bases
pre_bwt_16_seq = pre_bwt_16_seq | (pre_bwt_base << (15 - j) * 2); // earlier bases go into the higher bits of the uint32_t
}
else
break;
}
// store the interleaved pre-bwt and bwt bases
uint32_t pre_and_bwt_seq = 0;
uint32_t pre_and_bwt_seq_2 = 0;
// first output word: interleave the upper 8 bases of both 16-base words
for (m = 0; m < 8; ++m)
{
int lshift_bit = 30 - 2 * m;
pre_and_bwt_seq |= (((pre_bwt_16_seq & (3 << lshift_bit)) >> (m * 2)) | ((bwt_16_seq & (3 << lshift_bit)) >> ((m * 2) + 2)));
}
buf[k++] = pre_and_bwt_seq;
// a second output word is only needed when more than 8 bases were processed
if (j > 8)
{
for (m = 8; m > 0; --m)
{
int lshift_bit = 2 * m - 2;
pre_and_bwt_seq_2 |= (((pre_bwt_16_seq & (3 << lshift_bit)) << (m * 2)) | ((bwt_16_seq & (3 << lshift_bit)) << (m * 2 - 2)));
}
#ifdef FMT_MID_INTERVAL // accumulate the mid-interval occ of the first 8+8 bases
s1 = pre_and_bwt_seq;
for (m = 0; m < 4; ++m)
mc[m] += cnt_table[m][s1 & 0xff] + cnt_table[m][s1 >> 8 & 0xff] + cnt_table[m][s1 >> 16 & 0xff] + cnt_table[m][s1 >> 24];
#endif
#if FMT_MID_INTERVAL == 8 // with a mid interval of 8 a mid check point is stored here
for (m = 0; m < 4; ++m)
buf[k++] = mc[m];
#endif
buf[k++] = pre_and_bwt_seq_2;
#ifdef FMT_MID_INTERVAL
s1 = pre_and_bwt_seq_2;
for (m = 0; m < 4; ++m)
mc[m] += cnt_table[m][s1 & 0xff] + cnt_table[m][s1 >> 8 & 0xff] + cnt_table[m][s1 >> 16 & 0xff] + cnt_table[m][s1 >> 24];
// emit a mid check point at a mid-interval boundary, unless a full check point follows or the block was cut short
if ((i + 16) % FMT_OCC_INTERVAL != 0 && j == 16 && ((i + 16) & FMT_MID_INTV_MASK) == 0)
for (m = 0; m < 4; ++m)
buf[k++] = mc[m];
#endif
}
}
}
// the last element
memcpy(buf + k, c, sizeof(uint32_t) * 4);
k += 4;
memcpy(buf + k, c2, sizeof(uint32_t) * 16);
k += 16;
xassert(k == fmt->bwt_size, "inconsistent bwt_size");
// update fmt
fmt->bwt = buf;
return fmt;
}
// Occurrence counts needed to extend by two bases (b1, then b2) in one step.
// k is a bwt matrix row; (bwtint_t)(-1) means "before the first row".
// Outputs:
//   cnt[0] - occurrences of bwt bases greater than b1
//   cnt[1] - occurrences of bwt base b1
//   cnt[2] - occurrences of pairs with bwt base b1 and pre-bwt base greater than b2
//   cnt[3] - occurrences of pairs with bwt base b1 and pre-bwt base b2
// The counts are assembled from the enclosing check point, an optional mid
// check point, and a table-driven scan of the packed words up to row k.
inline void fmt_e2_occ(const FMTIndex *fmt, bwtint_t k, int b1, int b2, bwtint_t cnt[4])
{
uint32_t x = 0;
uint32_t *p, *q, tmp;
bwtint_t str_line = k, cp_line = k & (~FMT_OCC_INTV_MASK); // cp = check point
int i, ti = b1 << 2 | b2;
cnt[0] = 0;
cnt[1] = 0;
cnt[2] = 0;
// special case: only the pair counters of the very first check point are used
if (k == (bwtint_t)(-1))
{
p = fmt->bwt + 4 + b1 * 4;
for (i = b2 + 1; i < 4; ++i)
cnt[2] += p[i];
cnt[3] = p[b2];
return;
}
k -= (k >= fmt->primary); // convert k from a bwt matrix row to a bwt string row: '$' is removed, so rows at or beyond the '$' row drop by one
p = fmt_occ_intv(fmt, k);
// fprintf(stderr, "k: %ld\n", k);
for (i = b1 + 1; i < 4; ++i)
cnt[0] += p[i]; // sum of the occ of the bases greater than b1
cnt[1] = p[b1]; // occ of b1
q = p + 4 + b1 * 4;
for (i = b2 + 1; i < 4; ++i)
cnt[2] += q[i]; // sum of the occ of pre-bwt bases greater than b2
cnt[3] = q[b2]; // occ of b2
p += 20;
// use the mid-interval check points
int mk = k % FMT_OCC_INTERVAL;
int n_mintv = mk >> FMT_MID_INTV_SHIFT;
if (n_mintv > 0) // past at least the first mid interval
{
p += n_mintv * (4 + (FMT_MID_INTERVAL >> 3)) - 4; // start of the matching mid check point, i.e. the local cumulative counts for A C G T
q = p + b1;
for (i = b1 + 1; i < 4; ++i)
x += p[i]; // sum of the occ of the bases greater than b1
cnt[0] += __fmt_mid_sum(x);
x = *q;
cnt[1] += __fmt_mid_sum(x); // occ of b1
for (i = 3; i > b2; --i)
cnt[2] += x >> (i << 3) & 0xff; // sum of the occ of pre-bwt bases greater than b2
cnt[3] += x >> (b2 << 3) & 0xff; // occ of b2
x = 0;
p += 4;
}
// scan the whole packed words (8 base pairs each) between the (mid) check point and k
uint32_t *end = p + ((k >> 3) - ((k & ~FMT_MID_INTV_MASK) >> 3));
for (; p < end; ++p)
{
x += __fmt_occ_e2_aux2(fmt, ti, *p);
}
// last, partial word: zero the nibbles that lie beyond row k before counting
tmp = *p & ~((1U << ((~k & 7) << 2)) - 1);
x += __fmt_occ_e2_aux2(fmt, ti, tmp);
// NOTE(review): presumably removes the zeroed (masked-out) slots, which look like base A / pair AA to the lookup table — confirm
if (b1 == 0)
{
x -= (~k & 7) << 8;
if (b2 == 0)
x -= (~k & 7) << 24;
}
// if the range crosses sec_primary, one occurrence may have to be subtracted
if (b1 == fmt->first_base && cp_line < fmt->sec_primary && str_line >= fmt->sec_primary)
{
if (b2 < fmt->last_base)
cnt[2] -= 1;
else if (b2 == fmt->last_base)
cnt[3] -= 1;
}
// x carries four 8-bit partial counts, one per output slot
cnt[0] += x & 0xff;
cnt[1] += x >> 8 & 0xff;
cnt[2] += x >> 16 & 0xff;
cnt[3] += x >> 24 & 0xff;
}
// Extend the bi-interval *ik by two bases at once (b1 first, then b2).
// *ok1 receives the interval after the first base, *ok2 the interval after
// both bases. is_back selects which of x[0]/x[1] is updated through the
// occurrence counts and which one is adjusted by the "greater than" counts.
inline void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok1, bwtintv_t *ok2, int is_back, int b1, int b2)
{
    const int mapped = !is_back; // coordinate recomputed from L2 + occ
    const int other = is_back;   // coordinate shifted by the counts of greater bases
    bwtint_t occ_lo[4], occ_hi[4];
    bwtintv_t cur = {0};

    // occ_lo: counts accumulated before the interval's first row;
    // occ_hi: counts accumulated up to the interval's last row
    fmt_e2_occ(fmt, ik->x[mapped] - 1, b1, b2, occ_lo);
    fmt_e2_occ(fmt, ik->x[mapped] - 1 + ik->x[2], b1, b2, occ_hi);

    // first extension; the extra 1 applies when the interval contains the primary row
    const int spans_primary1 = ik->x[mapped] <= fmt->primary && ik->x[mapped] + ik->x[2] - 1 >= fmt->primary;
    cur.x[mapped] = fmt->L2[b1] + 1 + occ_lo[1];
    cur.x[other] = ik->x[other] + spans_primary1 + occ_hi[0] - occ_lo[0];
    cur.x[2] = occ_hi[1] - occ_lo[1];
    *ok1 = cur;

    // second extension, starting from the result of the first one
    const int spans_primary2 = cur.x[mapped] <= fmt->primary && cur.x[mapped] + cur.x[2] - 1 >= fmt->primary;
    cur.x[other] = cur.x[other] + spans_primary2 + occ_hi[2] - occ_lo[2];
    cur.x[mapped] = fmt->L2[b2] + 1 + occ_lo[3];
    cur.x[2] = occ_hi[3] - occ_lo[3];
    *ok2 = cur;
}
// Extend the bi-interval *ik by a single base b1 and store the result in *ok.
// Only x[0], x[1] and x[2] of *ok are written.
inline void fmt_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int is_back, int b1)
{
    const int mapped = !is_back;
    const int other = is_back;
    // Only one base is extended, so the second base is fixed to T (3); this
    // spares some work, e.g. summing the counts of bases greater than b2.
    const int filler_b2 = 3;
    bwtint_t occ_lo[4], occ_hi[4];

    // occ_lo: counts accumulated before the interval's first row;
    // occ_hi: counts accumulated up to the interval's last row
    fmt_e2_occ(fmt, ik->x[mapped] - 1, b1, filler_b2, occ_lo);
    fmt_e2_occ(fmt, ik->x[mapped] - 1 + ik->x[2], b1, filler_b2, occ_hi);

    // The three stores are kept in this order: it is observable if ok aliases ik.
    ok->x[mapped] = fmt->L2[b1] + 1 + occ_lo[1];
    ok->x[2] = occ_hi[1] - occ_lo[1];
    ok->x[other] = ik->x[other] + (ik->x[mapped] <= fmt->primary && ik->x[mapped] + ik->x[2] - 1 >= fmt->primary) + occ_hi[0] - occ_lo[0];
}
// Compare the read directly against the packed reference (fmt->pac) once a
// single candidate row remains.
//   len       - read length
//   q         - read bases (compared against 2-bit reference codes)
//   left_pos  - read position where the current match starts
//   right_pos - read position where comparison resumes (end of the current match)
//   mtx_line  - bwt matrix row of the candidate; its reference position comes from fmt_sa
//   mt        - output: num_match = 1, rm[0] = {qs, qe, rs, reverse},
//               info = qs << 32 | qe, x[2] = 1
// The match is first extended to the right of right_pos, then to the left of
// left_pos, on the forward or the complementary strand depending on where
// the SA position falls relative to fmt->l_pac.
// The macros below are not #undef'ed, so they stay defined for the rest of the file.
static void direct_extend(const FMTIndex *fmt, int len, const uint8_t *q, int left_pos, int right_pos, bwtint_t mtx_line, bwtintv_t *mt)
{
// fetch the 2-bit base at position l of a packed sequence (4 bases per byte, first base in the top bits)
#define PAC_BASE(pac, l) ((pac)[(l) >> 2] >> ((~(l) & 3) << 1) & 3)
// advance k (read) and r (reference) while the bases agree and neither bound is reached
#define EXTEND_BASE_LOOP(qcond, rcond, qstep, rstep) \
while (k != qcond && r != rcond) \
{ \
const int base = PAC_BASE(fmt->pac, r); \
if (q[k] != base) break; \
k += qstep; \
r += rstep; \
}
// same as EXTEND_BASE_LOOP, but compares against the complement (3 - base) of the reference base
#define EXTEND_BASE_LOOP_COMP(qcond, rcond, qstep, rstep) \
while (k != qcond && r != rcond) \
{ \
const int base = 3 - PAC_BASE(fmt->pac, r); \
if (q[k] != base) break; \
k += qstep; \
r += rstep; \
}
int k;
int64_t r, rp;
mt->num_match = 1;
rp = fmt_sa(fmt, mtx_line);
// positions at or beyond l_pac are mirrored back into [0, l_pac)
r = rp >= fmt->l_pac ? (fmt->l_pac << 1) - 1 - rp : rp;
k = right_pos;
if (rp < fmt->l_pac) // matched the forward strand
{
// keep extending forward (to the right)
r += right_pos - left_pos;
EXTEND_BASE_LOOP(len, fmt->l_pac, 1, 1);
mt->rm[0].qe = k;
mt->rm[0].reverse = 0;
// extend backward over the bases before position x (left_pos)
r -= k - left_pos + 1;
k = left_pos - 1;
EXTEND_BASE_LOOP(-1, -1, -1, -1);
mt->rm[0].qs = k + 1;
mt->rm[0].rs = r + 1;
}
else // matched the complementary strand
{
r -= right_pos - left_pos;
EXTEND_BASE_LOOP_COMP(len, -1, 1, -1);
mt->rm[0].qe = k;
mt->rm[0].reverse = 1;
// extend over the bases before x (left_pos)
r += k - left_pos + 1;
k = left_pos - 1;
EXTEND_BASE_LOOP_COMP(-1, fmt->l_pac, -1, 1);
mt->rm[0].qs = k + 1;
mt->rm[0].rs = r - 1;
}
// pack the read interval as qs (high 32 bits) | qe (low 32 bits)
mt->info = mt->rm[0].qs;
mt->info = mt->info << 32 | mt->rm[0].qe;
mt->x[2] = 1;
}
// Reverse the order of the intervals stored in p, in place.
static inline void fmt_reverse_intvs(bwtintv_v *p)
{
    if (p->n <= 1)
        return; // nothing to swap

    // walk inwards from both ends, swapping as we go
    bwtintv_t *front = p->a;
    bwtintv_t *back = p->a + (p->n - 1);
    while (front < back)
    {
        bwtintv_t held = *back;
        *back = *front;
        *front = held;
        ++front;
        --back;
    }
}
// Find the seed matches that cover read position x.
//   len          - read length
//   q            - read bases; values > 3 are treated as non-extendable
//   x            - start position of the search
//   min_intv     - minimum interval size (clamped to at least 1)
//   min_seed_len - minimum length for a match to be reported
//   mem          - output vector of matches; info = start << 32 | end
//   tmpvec       - scratch vector holding the forward-extension intervals
// Returns the end position reached by the forward phase (x + 1 when q[x] is
// not a regular base).
// Phases: (1) seed the forward search from the k-mer hash, (2) extend forward
// two bases at a time, (3) when min_intv == 1 and a single candidate remains,
// switch to a direct comparison against the reference, (4) extend every
// collected interval backward and record qualifying matches in mem.
// The helper macros defined in the body are not #undef'ed.
int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, int min_seed_len, bwtintv_v *mem, bwtintv_v *tmpvec)
{
int i, j, ret, kmer_len;
bwtintv_t ik = {0}, ok1 = {0}, ok2 = {0};
bwtintv_t mt = {0};
bwtintv_v *curr;
uint32_t qbit = 0;
mem->n = 0;
// no backward phase is possible when x is the first base or the previous base is ambiguous
int only_forward = x == 0 || q[x - 1] > 3;
if (q[x] > 3) return x + 1;
if (min_intv < 1) min_intv = 1; // the interval size should be at least 1
curr = tmpvec; // use the temporary vector if provided
qbit = build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len);
bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, 0); // interval of the initial base
ik.info = x + 1;
// check change of the interval size and whether the interval size is too small to be extended further
// (contains a bare `break`, so it may only be expanded directly inside a loop)
#define CHECK_INTV_CHANGE(iv, ov, end_pos) \
if (ov.x[2] != iv.x[2]) { kv_push(bwtintv_t, *curr, iv); if (ov.x[2] < min_intv) break; } iv = ov; iv.info = end_pos
// store the current interval and jump to the backward phase
#define PUSH_VAL_AND_SKIP(iv) \
do { kv_push(bwtintv_t, *curr, iv); goto backward_search; } while(0)
// walk through the match information provided by the k-mer hash
if (only_forward) j = kmer_len - 1;
else j = 1;
for (curr->n = 0; j < kmer_len; ++j)
{
bwt_kmer_get(&fmt->kmer_hash, &ok1, qbit, j);
CHECK_INTV_CHANGE(ik, ok1, x + j + 1);
}
if (kmer_len != HASH_KMER_LEN) // hit an N or reached the end of the read
PUSH_VAL_AND_SKIP(ik);
// extend the bases that follow the k-mer
// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1), 0, 2);
// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1 + ik.x[2]), 0, 2);
for (i = (int)ik.info; i + 1 < len; i += 2)
{ // forward search
if (q[i] < 4 && q[i + 1] < 4)
{
fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]);
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2);
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2);
CHECK_INTV_CHANGE(ik, ok1, i + 1);
CHECK_INTV_CHANGE(ik, ok2, i + 2);
#if 1 // when the interval size is 1, compare directly against the reference
if (min_intv == 1 && ok2.x[2] == min_intv) // check whether a single candidate is left
{
direct_extend(fmt, len, q, x, i + 2, ok2.x[0], &mt);
kv_push(bwtintv_t, *mem, mt);
ret = (uint32_t)mt.info;
if (only_forward || mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3) goto fmt_smem_end;
goto backward_search;
}
#endif
} else if (q[i] < 4) // q[i+1] >= 4
{
fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]);
CHECK_INTV_CHANGE(ik, ok1, i + 1);
PUSH_VAL_AND_SKIP(ik);
}
else // q[i] >= 4
{
PUSH_VAL_AND_SKIP(ik);
}
}
for (; i == len - 1; ++i) // reached the last base of the read
{
if (q[i] < 4)
{
fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]);
CHECK_INTV_CHANGE(ik, ok1, i + 1);
}
else
PUSH_VAL_AND_SKIP(ik);
}
if (i == len)
kv_push(bwtintv_t, *curr, ik); // push the last interval if we reach the end
backward_search:
fmt_reverse_intvs(curr); // s.t. smaller intervals (i.e. longer matches) visited first
// NOTE(review): reads curr->a[0] without checking curr->n — assumes the forward phase pushed at least one interval; confirm
if (mt.num_match == 0) ret = curr->a[0].info; // this will be the returned value: the farthest position reached by the forward extension
// iterate over the seeds and extend each one backward
// record (intv) in mem when it is long enough and starts before the previously recorded match
#define CHECK_ADD_MEM(pos, intv, mem) \
if (((int)((intv).info) - (pos) >= min_seed_len) && (mem->n == 0 || (pos) < mem->a[mem->n - 1].info >> 32)) { \
(intv).info |= (uint64_t)(pos) << 32; kv_push(bwtintv_t, *mem, (intv)); \
}
// when the extended interval became too small: try to record the previous one and leave the enclosing loop
#define CHECK_INTV_ADD_MEM(ok, pos, intv, mem) \
if (ok.x[2] < min_intv) { CHECK_ADD_MEM(pos, intv, mem); break; }
int last_kmer_start = 0;
for (j = 0; j < curr->n; ++j)
{
bwtintv_t *p = &curr->a[j]; // seed produced by the forward extension
// __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1), 0, 2);
// __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1 + p->x[2]), 0, 2);
#if 1
// seeds shorter than the hash k-mer: restart the backward search from a k-mer ending at the seed's end
if (!only_forward && p->info - x < HASH_KMER_LEN) {
if (last_kmer_start && kmer_len == HASH_KMER_LEN && p->info == last_kmer_start && p->info - kmer_len > 0 && q[p->info - kmer_len] < 4)
qbit = ((qbit << 2) | (3 - q[p->info - kmer_len])) & ((1L << (kmer_len << 1)) - 1); // build the backward k-mer incrementally from the previous one
else qbit = build_backward_kmer(q, p->info - 1, HASH_KMER_LEN, &kmer_len); // build the backward k-mer from scratch
last_kmer_start = p->info - 1;
i = 1;
do { bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, kmer_len - i++); } while (ik.x[2] < min_intv);
if (i > 2) continue;
p->x[0] = ik.x[1]; p->x[1] = ik.x[0]; p->x[2] = ik.x[2];
i = p->info - (kmer_len - i + 3);
} else {
i = x - 1;
}
#else
i = x - 1;
#endif
for (; i > 0; i -= 2)
{
if (q[i] < 4 && q[i - 1] < 4) // both bases can be extended
{
fmt_extend2(fmt, p, &ok1, &ok2, 1, q[i], q[i - 1]);
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1), 0, 2);
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1 + ok2.x[2]), 0, 2);
CHECK_INTV_ADD_MEM(ok1, i + 1, *p, mem);
ok1.info = p->info;
CHECK_INTV_ADD_MEM(ok2, i, ok1, mem);
ok2.info = p->info; *p = ok2;
}
else if (q[i] < 4) // only one base can be extended
{
fmt_extend1(fmt, p, &ok1, 1, q[i]);
CHECK_INTV_ADD_MEM(ok1, i + 1, *p, mem);
ok1.info = p->info;
CHECK_ADD_MEM(i, ok1, mem);
goto fmt_smem_end;
}
else
{ // cannot extend
CHECK_ADD_MEM(i + 1, *p, mem);
goto fmt_smem_end;
}
}
for (; i == 0; --i)
{ // reached the first base of the read
if (q[i] < 4) {
fmt_extend1(fmt, p, &ok1, 1, q[i]);
CHECK_INTV_ADD_MEM(ok1, i + 1, *p, mem);
ok1.info = p->info; *p = ok1;
} else {
CHECK_ADD_MEM(i + 1, *p, mem);
goto fmt_smem_end;
}
}
// i == -1: the backward extension consumed the whole read prefix
if (i == -1) {
CHECK_ADD_MEM(i + 1, *p, mem);
goto fmt_smem_end;
}
}
fmt_smem_end:
fmt_reverse_intvs(mem); // s.t. sorted by the start coordinate
return ret;
}
// Forward-only seeding: starting at read position x, extend to the right
// until the interval size drops below max_intv while the match is at least
// min_len long.
//   len      - read length
//   q        - read bases; values > 3 are ambiguous and stop the extension
//   mem      - output; zeroed first, then set to the qualifying interval with
//              info = x << 32 | end when one is found
// Returns the read position at which the next search should start: the end
// of the reported seed, the position after an ambiguous base, or len.
// The search is seeded from the k-mer hash, then continues two bases at a
// time. COND_SET_RETURN is not #undef'ed.
int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem)
{
    int i, kmer_len;
    bwtintv_t ik = {0}, ok1 = {0}, ok2 = {0};
    uint64_t qbit;
    memset(mem, 0, sizeof(bwtintv_t));
    if (q[x] > 3) return x + 1;
    qbit = build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len);
    bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, kmer_len - 1); // interval after the initial k-mer
    ik.info = x + kmer_len;
// report iv through ov and return as soon as it is small enough and long enough
#define COND_SET_RETURN(iv, ov, start_pos, end_pos, max_intv, min_len) \
if (iv.x[2] < max_intv && end_pos - start_pos >= min_len) \
{ \
(ov) = (iv); \
(ov).info = (uint64_t)start_pos << 32 | (end_pos + 1); \
return end_pos + 1; \
}
    for (i = (int)ik.info; i + 1 < len; i += 2)
    { // forward search
        if (q[i] < 4 && q[i + 1] < 4)
        {
            fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]);
            __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2);
            __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2);
            COND_SET_RETURN(ok1, *mem, x, i, max_intv, min_len);
            COND_SET_RETURN(ok2, *mem, x, i + 1, max_intv, min_len);
            ik = ok2;
        }
        else if (q[i] < 4) // q[i+1] >= 4
        {
            fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]);
            COND_SET_RETURN(ok1, *mem, x, i, max_intv, min_len);
            return i + 2;
        }
        else // q[i] >= 4
        {
            return i + 1;
        }
    }
    // One base may be left over. It must be a regular base: for an ambiguous
    // base 3 - q[i] would be negative and index L2 / the occ tables out of
    // bounds. Skipping it still returns len (== i + 1), like the loop above.
    if (i == len - 1 && q[i] < 4) {
        fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]);
        COND_SET_RETURN(ok1, *mem, x, i, max_intv, min_len);
    }
    return len;
}
// Fetch the packed base pair stored for row k, where k is a row of the bwt
// string (not of the bwt matrix).
// *b1 receives the low two bits of the pair (the bwt base), *b2 the high two
// bits (the pre-bwt base).
static inline void fmt_get_previous_base(const FMTIndex *fmt, bwtint_t k, uint8_t *b1, uint8_t *b2)
{
    const int in_block = k & FMT_OCC_INTV_MASK;               // offset of k inside its check-point block
    const int mid_idx = in_block >> FMT_MID_INTV_SHIFT;       // number of mid intervals before k
    uint32_t *word = fmt_occ_intv(fmt, k);                    // start of the check point

    word += 20;                                               // skip the 20 check-point counters
    word += mid_idx * (4 + (FMT_MID_INTERVAL >> 3));          // skip the preceding mid intervals (data words + 4 counters each)
    word += (k & FMT_MID_INTV_MASK) >> 3;                     // each uint32_t packs 8 base pairs

    // one nibble per row, the earliest row in the highest nibble
    const uint8_t pair = *word >> ((~(k) & 0x7) << 2) & 0xf;
    *b2 = pair >> 2 & 3;
    *b1 = pair & 3;
}
// From bwt matrix row k, compute the rows reached by stepping back one base
// (*k1) and two bases (*k2). k, *k1 and *k2 are all bwt matrix rows.
static inline void fmt_previous_line(const FMTIndex *fmt, bwtint_t k, bwtint_t *k1, bwtint_t *k2)
{
    bwtint_t occ[4];
    uint8_t first_base, second_base;

    // matrix row -> string row: '$' is not stored, so rows at or past primary shift down by one
    const bwtint_t str_row = (k >= fmt->primary) ? k - 1 : k;

    fmt_get_previous_base(fmt, str_row, &first_base, &second_base);
    fmt_e2_occ(fmt, k, first_base, second_base, occ);
    *k1 = fmt->L2[first_base] + occ[1];
    *k2 = fmt->L2[second_base] + occ[3];
}
// Return the suffix-array value of bwt matrix row k.
// Walks backwards, up to two rows per iteration, until a row whose index has
// no bits set under (sa_intv - 1) is reached, then adds the number of steps
// taken to the sample stored for that row.
bwtint_t fmt_sa(const FMTIndex *fmt, bwtint_t k)
{
    const bwtint_t sample_mask = fmt->sa_intv - 1;
    bwtint_t steps = 0;

    while ((k & sample_mask) != 0)
    {
        bwtint_t one_back, two_back;
        fmt_previous_line(fmt, k, &one_back, &two_back);
        if ((one_back & sample_mask) == 0)
        {
            // the row one step back is sampled: stop there
            steps += 1;
            k = one_back;
            break;
        }
        steps += 2;
        k = two_back;
    }
    steps += bwt_get_sa(fmt->sa, k / fmt->sa_intv);
    return steps;
}