实现了seed过程的所有加速想法,seed部分实现了3倍左右加速比

This commit is contained in:
zzh 2024-02-20 01:12:02 +08:00
parent 4a1c2cd3db
commit fc2e0d9b0b
10 changed files with 304 additions and 146 deletions

4
.vscode/launch.json vendored
View File

@ -18,8 +18,8 @@
"-R",
"'@RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa'",
"~/reference/human_g1k_v37_decoy.fasta",
"~/fastq/d_r1.fq",
"~/fastq/d_r2.fq",
"~/fastq/tiny_n_r1.fq",
"~/fastq/tiny_n_r2.fq",
"-o",
"/dev/null"
],

22
bwa.c
View File

@ -284,7 +284,7 @@ bwt_t *bwa_idx_load_bwt(const char *hint)
strcat(strcpy(tmp, prefix), ".bwt"); // FM-index
bwt = bwt_restore_bwt(tmp);
fprintf(stderr, "zzh-1\n");
strcat(strcpy(tmp, prefix), ".33.4.sa"); // partial suffix array (SA)
strcat(strcpy(tmp, prefix), ".33.2.sa"); // partial suffix array (SA)
bwt_restore_sa(tmp, bwt);
fprintf(stderr, "zzh-after-sa\n");
free(tmp); free(prefix);
@ -293,13 +293,14 @@ bwt_t *bwa_idx_load_bwt(const char *hint)
FMTIndex *bwa_idx_load_fmt(const char *hint)
{
char *fmt_idx_fn, *kmer_idx_fn, *kmer_bit_fn, *sa_fn;
char *fmt_idx_fn, *kmer_idx_fn, *sa_fn;
// char *kmer_bit_fn;
FMTIndex *fmt;
char suffix[32];
int l_hint = strlen(hint);
fmt_idx_fn = malloc(l_hint + 32);
kmer_idx_fn = malloc(l_hint + 32);
kmer_bit_fn = malloc(l_hint + 32);
//kmer_bit_fn = malloc(l_hint + 32);
sa_fn = malloc(l_hint + 32);
sprintf(suffix, ".256.%d.fmt", FMT_MID_INTERVAL);
strcpy(fmt_idx_fn, hint);
@ -319,12 +320,12 @@ FMTIndex *bwa_idx_load_fmt(const char *hint)
fmt->kmer_hash = fmt_restore_kmer_idx(kmer_idx_fn);
// restore kmer bit
fmt->bit_kmer_len = 18;
sprintf(suffix, ".%d.kmer.bit", fmt->bit_kmer_len);
strcpy(kmer_bit_fn, hint);
strcpy(kmer_bit_fn + l_hint, suffix);
fprintf(stderr, "%s\n", kmer_bit_fn);
fmt->kmer_bit = fmt_restore_kmer_bit(kmer_bit_fn, fmt->bit_kmer_len);
// fmt->bit_kmer_len = 18;
// sprintf(suffix, ".%d.kmer.bit", fmt->bit_kmer_len);
// strcpy(kmer_bit_fn, hint);
// strcpy(kmer_bit_fn + l_hint, suffix);
// fprintf(stderr, "%s\n", kmer_bit_fn);
// fmt->kmer_bit = fmt_restore_kmer_bit(kmer_bit_fn, fmt->bit_kmer_len);
strcpy(sa_fn, hint);
sprintf(suffix, ".33.%d.sa", SA_INTV);
@ -369,6 +370,9 @@ bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which)
err_fread_noeof(idx->pac, 1, idx->bns->l_pac/4+1, idx->bns->fp_pac); // concatenated 2-bit encoded sequence
err_fclose(idx->bns->fp_pac);
idx->bns->fp_pac = 0;
// 赋值到fmt中对应的pac
idx->fmt->l_pac = idx->bns->l_pac;
idx->fmt->pac = idx->pac;
}
}
free(prefix);

102
bwamem.c
View File

@ -158,7 +158,9 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
//fprintf(fp1, "seq: %ld\n", dn++);
//fprintf(stderr, "seq: %ld\n", dn++);
//dn ++;
//goto third_seed;
#define USE_FMT 1
// goto third_seed;
while (x < len) {
if (seq[x] < 4) {
@ -166,25 +168,27 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
#ifdef SHOW_PERF
int64_t tmp_time = realtime_msec();
#endif
//x = bwt_smem1(bwt, len, seq, x, start_width, &a->mem1, a->tmpv);
#if USE_FMT
x = fmt_smem(fmt, len, seq, x, start_width, &a->mem1, a->tmpv);
#else
x = bwt_smem1(bwt, len, seq, x, start_width, &a->mem1, a->tmpv);
#endif
#ifdef SHOW_PERF
tmp_time = realtime_msec() - tmp_time;
__sync_fetch_and_add(&time_bwt_smem1a, tmp_time);
__sync_fetch_and_add(&time_seed_1, tmp_time);
#endif
for (i = 0; i < a->mem1.n; ++i) {
bwtintv_t *p = &a->mem1.a[i];
//fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info);
//fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
//fprintf(stderr, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
int slen = (uint32_t)p->info - (p->info >> 32); // seed length
//if (slen < 16) ++n16;
//if (slen < 17) ++n17;
//if (slen < 18) ++n18;
//if (slen < 19) ++n19;
//++nall;
max_seed_len = fmax(max_seed_len, slen);
if (slen >= opt->min_seed_len)
if (slen >= opt->min_seed_len) {
//fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info);
kv_push(bwtintv_t, a->mem, *p);
}
}
} else {
++x;
@ -195,7 +199,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
// second pass: find MEMs inside a long SMEM
//if (max_seed_len == len - start_N_num)
// goto collect_intv_end;
// goto third_seed;
//goto third_seed;
old_n = a->mem.n;
for (k = 0; k < old_n; ++k) {
@ -205,24 +209,24 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
#ifdef SHOW_PERF
int64_t tmp_time = realtime_msec();
#endif
//bwt_smem1(bwt, len, seq, (start + end)>>1, p->x[2]+1, &a->mem1, a->tmpv);
#if USE_FMT
fmt_smem(fmt, len, seq, (start + end) >> 1, p->x[2] + 1, &a->mem1, a->tmpv);
#else
bwt_smem1(bwt, len, seq, (start + end) >> 1, p->x[2] + 1, &a->mem1, a->tmpv);
#endif
#ifdef SHOW_PERF
tmp_time = realtime_msec() - tmp_time;
__sync_fetch_and_add(&time_bwt_smem1a, tmp_time);
__sync_fetch_and_add(&time_seed_2, tmp_time);
#endif
for (i = 0; i < a->mem1.n; ++i) {
//bwtintv_t *p = &a->mem1.a[i];
bwtintv_t *p = &a->mem1.a[i];
//fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
//fprintf(stderr, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
//int slen = (uint32_t)p->info - (p->info >> 32);
//if (slen < 16) ++n16;
//if (slen < 17) ++n17;
//if (slen < 18) ++n18;
//if (slen < 19) ++n19;
//++nall;
if ((uint32_t)a->mem1.a[i].info - (a->mem1.a[i].info >> 32) >= opt->min_seed_len)
int slen = (uint32_t)p->info - (p->info >> 32);
if (slen >= opt->min_seed_len) {
//fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info);
kv_push(bwtintv_t, a->mem, a->mem1.a[i]);
}
}
}
//if (max_seed_len == len - start_N_num)
@ -239,11 +243,15 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
#ifdef SHOW_PERF
int64_t tmp_time = realtime_msec();
#endif
//x = bwt_seed_strategy1(bwt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m);
#if USE_FMT
x = fmt_seed_strategy1(fmt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m);
#else
x = bwt_seed_strategy1(bwt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m);
#endif
#ifdef SHOW_PERF
tmp_time = realtime_msec() - tmp_time;
__sync_fetch_and_add(&time_bwt_smem1a, tmp_time);
__sync_fetch_and_add(&time_seed_3, tmp_time);
#endif
//bwtintv_t *p = &m;
//fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
@ -262,7 +270,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
}
}
//collect_intv_end:
collect_intv_end:
// sort
ks_introsort(mem_intv, a->mem.n, a->mem.a);
}
@ -381,6 +389,43 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const FMTIndex *fm
int step, count, slen = (uint32_t)p->info - (p->info>>32); // seed length
int64_t k;
// if (slen < opt->min_seed_len) continue; // ignore if too short or too repetitive
//if (0) {
if (p->num_match > 0) {
//continue;
for (k = 0; k < p->num_match; ++k) {
mem_chain_t tmp, *lower, *upper;
mem_seed_t s;
int rid, to_add = 0;
s.rbeg = p->rm[k].rs;
if (p->rm[k].reverse) {
s.rbeg = (fmt->l_pac << 1) - 1 - s.rbeg;
}
tmp.pos = s.rbeg;
s.qbeg = p->info >> 32;
s.score = s.len = slen;
rid = bns_intv2rid(bns, s.rbeg, s.rbeg + s.len);
if (rid < 0) continue;
if (kb_size(tree))
{
kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain
if (!lower || !test_and_merge(opt, l_pac, lower, &s, rid))
to_add = 1;
}
else
to_add = 1;
if (to_add)
{ // add the seed as a new chain
tmp.n = 1;
tmp.m = 4;
tmp.seeds = calloc(tmp.m, sizeof(mem_seed_t));
tmp.seeds[0] = s;
tmp.rid = rid;
tmp.is_alt = !!bns->anns[rid].is_alt;
kb_putp(chn, tree, &tmp);
}
}
continue;
}
step = p->x[2] > opt->max_occ? p->x[2] / opt->max_occ : 1;
for (k = count = 0; k < p->x[2] && count < opt->max_occ; k += step, ++count) {
mem_chain_t tmp, *lower, *upper;
@ -389,14 +434,23 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const FMTIndex *fm
#ifdef SHOW_PERF
int64_t tmp_time = realtime_msec();
#endif
s.rbeg = tmp.pos = bwt_sa(bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference
// s.rbeg = tmp.pos = bwt_sa(bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference
s.rbeg = tmp.pos = fmt_sa(fmt, p->x[0] + k);
#ifdef SHOW_PERF
tmp_time = realtime_msec() - tmp_time;
__sync_fetch_and_add(&time_bwt_sa, tmp_time);
__sync_fetch_and_add(&num_sa, 1);
#endif
s.qbeg = p->info >> 32;
s.score= s.len = slen;
#ifdef SHOW_PERF
tmp_time = realtime_msec();
#endif
rid = bns_intv2rid(bns, s.rbeg, s.rbeg + s.len);
#ifdef SHOW_PERF
tmp_time = realtime_msec() - tmp_time;
__sync_fetch_and_add(&time_bns, tmp_time);
#endif
if (rid < 0) continue; // bridging multiple reference sequences or the forward-reverse boundary; TODO: split the seed; don't discard it!!!
if (kb_size(tree)) {
kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain

22
bwt.c
View File

@ -61,6 +61,10 @@ static inline bwtint_t bwt_invPsi(const bwt_t *bwt, bwtint_t k) // compute inver
x = t2;
// bwtint_t cnt[4], cnt1;
// bwt_occ4(bwt, k, cnt);
// cnt1 = bwt_occ(bwt, k, x);
x = bwt->L2[x] + bwt_occ(bwt, k, x); // 获取x字符在k位置后缀索引
return k == bwt->primary? 0 : x;
}
@ -137,14 +141,14 @@ bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k)
}
/* without setting bwt->sa[0] = -1, the following line should be
changed to (sa + bwt->sa[k/bwt->sa_intv]) % (bwt->seq_len + 1) */
#ifdef SHOW_PERF
int64_t tmp_time = realtime_msec();
#endif
//#ifdef SHOW_PERF
// int64_t tmp_time = realtime_msec();
//#endif
sa += bwt_get_sa(bwt->sa, k / bwt->sa_intv);
#ifdef SHOW_PERF
tmp_time = realtime_msec() - tmp_time;
__sync_fetch_and_add(&time_bwt_sa_read, tmp_time);
#endif
//#ifdef SHOW_PERF
// tmp_time = realtime_msec() - tmp_time;
// __sync_fetch_and_add(&time_bwt_sa_read, tmp_time);
//#endif
return sa;
}
@ -354,7 +358,7 @@ static void bwt_reverse_intvs(bwtintv_v *p)
int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
{
int i, j, c, ret;
bwtintv_t ik, ok[4];
bwtintv_t ik = {0}, ok[4] = {0};
bwtintv_v a[2], *prev, *curr, *swap;
mem->n = 0;
@ -447,7 +451,7 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv,
int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem)
{
int i, c;
bwtintv_t ik, ok[4];
bwtintv_t ik = {0}, ok[4] = {0};
memset(mem, 0, sizeof(bwtintv_t));
if (q[x] > 3) return x + 1;

9
bwt.h
View File

@ -59,8 +59,17 @@ typedef struct {
uint8_t *sa;
} bwt_t;
typedef struct {
int qs; // query start
int qe; // query end [)
int reverse; // 反向链
uint32_t rs; // 参考基因的起始点(包含)
} ref_match_t;
typedef struct {
bwtint_t x[3], info;
int num_match;
ref_match_t rm[2];
} bwtintv_t;
typedef struct { size_t n, m; bwtintv_t *a; } bwtintv_v;

View File

@ -48,13 +48,17 @@ int64_t time_ksw_extend2 = 0,
time_ksw_global2 = 0,
time_ksw_align2 = 0,
time_bwt_smem1a = 0,
time_seed_1 = 0,
time_seed_2 = 0,
time_seed_3 = 0,
time_fmt_smem_0 = 0,
time_bwt_extend = 0,
time_bwt_occ4 = 0,
time_bwt_sa = 0,
time_bwt_sa_read = 0;
time_bwt_sa_read = 0,
time_bns = 0;
int64_t dn = 0, n16 = 0, n17 = 0, n18 = 0, n19 = 0, nall = 0;
int64_t dn = 0, n16 = 0, n17 = 0, n18 = 0, n19 = 0, nall = 0, num_sa = 0;
FILE *fp1;
@ -381,7 +385,6 @@ int main_mem(int argc, char *argv[])
}
} else update_a(opt, &opt0);
bwa_fill_scmat(opt->a, opt->b, opt->mat);
fprintf(stderr, "zzh-1\n");
aux.idx = bwa_idx_load_from_shm(argv[optind]);
if (aux.idx == 0) {
if ((aux.idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak
@ -396,7 +399,6 @@ int main_mem(int argc, char *argv[])
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to open file `%s'.\n", __func__, argv[optind + 1]);
return 1;
}
fprintf(stderr, "zzh-2\n");
fp = gzdopen(fd, "r");
aux.ks = kseq_init(fp);
if (optind + 2 < argc) {
@ -432,16 +434,21 @@ int main_mem(int argc, char *argv[])
#ifdef SHOW_PERF
fprintf(stderr, "\n");
fprintf(stderr, "time_bwt_smem1a: %f s\n", time_bwt_smem1a / 1000.0 / opt->n_threads);
fprintf(stderr, "time_seed_1: %f s\n", time_seed_1 / 1000.0 / opt->n_threads);
fprintf(stderr, "time_seed_2: %f s\n", time_seed_2 / 1000.0 / opt->n_threads);
fprintf(stderr, "time_seed_3: %f s\n", time_seed_3 / 1000.0 / opt->n_threads);
fprintf(stderr, "time_fmt_smem_0: %f s\n", time_fmt_smem_0 / 1000.0 / opt->n_threads);
fprintf(stderr, "time_bwt_extend: %f s\n", time_bwt_extend / 1000.0 / opt->n_threads);
fprintf(stderr, "time_bwt_sa: %f s\n", time_bwt_sa / 1000.0 / opt->n_threads);
fprintf(stderr, "time_ksw_extend2: %f s\n", time_ksw_extend2 / 1000.0 / opt->n_threads);
fprintf(stderr, "time_bwt_sa_read: %f s\n", time_bwt_sa_read / 1000.0 / opt->n_threads);
fprintf(stderr, "time_bns: %f s\n", time_bns / 1000.0 / opt->n_threads);
fprintf(stderr, "all seed num: %ld\n", nall);
fprintf(stderr, "all seed num litter than 19: %ld\n", n19);
fprintf(stderr, "all seed num litter than 18: %ld\n", n18);
fprintf(stderr, "all seed num litter than 17: %ld\n", n17);
fprintf(stderr, "all seed num litter than 16: %ld\n", n16);
fprintf(stderr, "get sa num: %ld\n", num_sa);
fprintf(stderr, "\n");
fclose(fp1);

255
fmt_idx.c
View File

@ -357,7 +357,7 @@ inline void fmt_e2_occ(const FMTIndex *fmt, bwtint_t k, int b1, int b2, bwtint_t
{
uint32_t x = 0;
uint32_t *p, *q, tmp;
bwtint_t str_line = k, cp_line = k & (~FMT_OCC_INTV_MASK);
bwtint_t str_line = k, cp_line = k & (~FMT_OCC_INTV_MASK); // cp = check point
int i, ti = b1 << 2 | b2;
cnt[0] = 0;
cnt[1] = 0;
@ -462,7 +462,7 @@ inline void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok1, bwti
#endif
#endif
bwtint_t tk[4], tl[4];
bwtintv_t intv;
bwtintv_t intv = {0};
// tk表示在k行之前所有各个碱基累积出现次数tl表示在l行之前的累积
fmt_e2_occ(fmt, ik->x[!is_back] - 1, b1, b2, tk);
fmt_e2_occ(fmt, ik->x[!is_back] - 1 + ik->x[2], b1, b2, tl);
@ -629,95 +629,18 @@ inline static uint64_t build_backward_kmer(const uint8_t *q, int start_pos, int
return (~qbit) & ((1L << (kmer_len << 1)) - 1);
}
// 当x为0或者q[x-1]为N时只需要前向搜索即可
int fmt_smem_forward(const FMTIndex *fmt, int len, const uint8_t *q, int x, bwtintv_v *mem)
{
#ifdef SHOW_PERF
#if 1
int64_t tmp_time = realtime_msec();
#endif
#endif
int i, j = 1, ret, kmer_len;
const int min_intv = 1;
bwtintv_t ik, ok1, ok2;
uint32_t qbit = build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len);
mem->n = 0;
fmt_kmer_get(fmt, &ik, qbit, kmer_len - j++);
while (ik.x[2] == 0) {
fmt_kmer_get(fmt, &ik, qbit, kmer_len - j++);
}
if (j != 2) { // kmer hash没有找到对应的匹配
ik.info = x + kmer_len - j + 2;
goto fmt_smem_forward_end;
}
ik.info = x + kmer_len;
// 继续向前扩展
for (i = x + kmer_len; i + 1 < len; i += 2) {
if (ik.x[2] < 2)
goto fmt_smem_forward_end;
if (q[i] < 4 && q[i + 1] < 4) // 两个都可以扩展
{
fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]);
if (ok2.x[2] >= min_intv) { // 可以继续扩展
ik = ok2; ik.info = i + 2;
} else if (ok1.x[2] >= min_intv) { // 第二个间隔不够
ik = ok1; ik.info = i + 1;
goto fmt_smem_forward_end;
} else { // 两个间隔都不够
goto fmt_smem_forward_end;
}
}
else if (q[i] < 4) // q[i+1] >= 4只能扩展一个
{
fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]);
if (ok1.x[2] >= min_intv) {
ik = ok1; ik.info = i + 1;
}
goto fmt_smem_forward_end;
}
else { // q[i] >= 4
goto fmt_smem_forward_end;
}
}
if (i == len - 1) // 扩展到了最后一个碱基
{
if (q[i] < 4) {
fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]);
if (ok1.x[2] >= min_intv) {
ik = ok1; ik.info = i + 1;
}
}
}
fmt_smem_forward_end:
ret = ik.info;
ik.info |= (uint64_t)x << 32;
if (fmt->bit_kmer_len <= (uint32_t)ik.info - (ik.info >> 32))
kv_push(bwtintv_t, *mem, ik);
#ifdef SHOW_PERF
#if 1
tmp_time = realtime_msec() - tmp_time;
__sync_fetch_and_add(&time_fmt_smem_0, tmp_time);
#endif
#endif
return ret;
//return len;
}
// 找smemseed
int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
{
int i, j, ret, kmer_len;
bwtintv_t ik, ok1, ok2;
bwtintv_t ik = {0}, ok1 = {0}, ok2 = {0};
bwtintv_t mt = {0};
bwtintv_v a[1], *curr;
uint32_t qbit = 0;
mem->n = 0;
int only_forward = x == 0 || q[x - 1] > 3;
if (q[x] > 3) return x + 1;
//if (x == 0 || q[x-1] > 3) return fmt_smem_forward(fmt, len, q, x, mem); // 只用向前扩展
if (min_intv < 1) min_intv = 1; // the interval size should be at least 1
kv_init(a[0]);
@ -741,6 +664,8 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv
if (kmer_len != HASH_KMER_LEN) // 遇到了N或者到了序列最后
PUSH_VAL_AND_SKIP(ik);
#define PAC_BASE(pac, l) ((pac)[(l) >> 2] >> ((~(l) & 3) << 1) & 3)
// 扩展kmer之后的碱基
for (i = (int)ik.info; i + 1 < len; i += 2)
{ // forward search
@ -749,6 +674,99 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv
fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]);
CHECK_INTV_CHANGE(ik, ok1, i + 1);
CHECK_INTV_CHANGE(ik, ok2, i + 2);
// 在这里进行判断是否只有一个候选了
//if (0)
if (min_intv < 2 && ok2.x[2] == min_intv)
{ // 最多处理两个
mt.num_match = min_intv;
int64_t r, rp;
int k;
int max_qs = 0, min_qe = len;
for (j = 0; j < ok2.x[2]; ++j)
{
rp = fmt_sa(fmt, ok2.x[0] + j);
r = rp >= fmt->l_pac ? (fmt->l_pac << 1) - 1 - rp : rp;
k = i + 2;
if (rp < fmt->l_pac) // 匹配到了正向链
{
// 向前继续扩展
r += i + 2 - x;
while (k < len && r < fmt->l_pac) {
int base = PAC_BASE(fmt->pac, r);
if (q[k] != base)
break;
++k;
++r;
}
mt.rm[j].qe = k;
mt.rm[j].reverse = 0;
// 向后扩展x位置之前的碱基
r -= k - x + 1;
k = x - 1;
while (k > -1 && r > -1) {
int base = PAC_BASE(fmt->pac, r);
if (q[k] != base)
break;
--k;
--r;
}
mt.rm[j].qs = k + 1;
mt.rm[j].rs = r + 1;
}
else // 匹配到了互补链
{
r -= i + 2 - x;
while (k < len && r > -1)
{
const int base = 3 - PAC_BASE(fmt->pac, r);
if (q[k] != base)
break;
++k;
--r;
}
mt.rm[j].qe = k;
mt.rm[j].reverse = 1;
// 扩展x之前的碱基
r += k - x + 1;
k = x - 1;
while (k > -1 && r < fmt->l_pac)
{
const int base = 3 - PAC_BASE(fmt->pac, r);
if (q[k] != base)
break;
--k;
++r;
}
mt.rm[j].qs = k + 1;
mt.rm[j].rs = r - 1;
}
max_qs = max_qs < mt.rm[j].qs ? mt.rm[j].qs : max_qs;
min_qe = min_qe > mt.rm[j].qe ? mt.rm[j].qe : min_qe;
}
if (min_intv > 1)
{
// 修正两个匹配的位置,使比对的碱基个数和位置一致
for (j = 0; j < min_intv; ++j)
{
if (mt.rm[j].reverse)
mt.rm[j].rs -= max_qs - mt.rm[j].qs;
else
mt.rm[j].rs += max_qs - mt.rm[j].qs;
mt.rm[j].qs = max_qs;
mt.rm[j].qe = min_qe;
}
}
mt.info = mt.rm[0].qs;
mt.info = mt.info << 32 | mt.rm[0].qe;
mt.x[2] = min_intv;
kv_push(bwtintv_t, *mem, mt);
ret = (uint32_t)mt.info;
if (only_forward || mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3)
{
goto fmt_smem_end;
}
goto backward_search;
}
} else if (q[i] < 4) // q[i+1] >= 4
{
fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]);
@ -775,7 +793,8 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv
backward_search:
fmt_reverse_intvs(curr); // s.t. smaller intervals (i.e. longer matches) visited first
ret = curr->a[0].info; // this will be the returned value扩展到的最远的位置
if (mt.num_match == 0)
ret = curr->a[0].info; // this will be the returned value扩展到的最远的位置
// 按照种子进行遍历,反向扩展
#define CHECK_ADD_MEM(pos, intv, mem) \
@ -788,18 +807,8 @@ backward_search:
{
bwtintv_t *p = &curr->a[j]; // 前向扩展的种子
uint64_t qbit = 0;
// if (p->info - x < fmt->bit_kmer_len) {
// qbit = build_backward_kmer(q, p->info - 1, fmt->bit_kmer_len, &kmer_len); // 创建反向kmer
// if (!(fmt->kmer_bit[qbit >> 4] & (1 << (qbit & 0xf))))
// continue;
// }
if (!only_forward && p->info - x < HASH_KMER_LEN) {
//qbit = build_backward_kmer(q, p->info - 1, fmt->bit_kmer_len, &kmer_len); // 创建反向kmer
qbit = build_backward_kmer(q, p->info - 1, HASH_KMER_LEN, &kmer_len); // 创建反向kmer
//if (!(fmt->kmer_bit[qbit >> 4] & (1 << (qbit & 0xf))))
// continue;
//qbit >>= (fmt->bit_kmer_len - HASH_KMER_LEN) << 1;
//kmer_len = kmer_len > HASH_KMER_LEN ? HASH_KMER_LEN : kmer_len;
i = 1;
do { fmt_kmer_get(fmt, &ik, qbit, kmer_len - i++); } while (ik.x[2] < min_intv);
if (i > 2) continue;
@ -859,7 +868,7 @@ fmt_smem_end:
int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem)
{
int i, kmer_len;
bwtintv_t ik, ok1, ok2;
bwtintv_t ik = {0}, ok1={0}, ok2={0};
uint64_t qbit;
@ -906,3 +915,63 @@ int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, in
}
return len;
}
// 这里的k是bwt str的行
inline static void fmt_get_previous_base(const FMTIndex *fmt, bwtint_t k, uint8_t *b1, uint8_t *b2)
{
uint32_t *p;
uint8_t base2;
// 第一步找到check point位置
p = fmt_occ_intv(fmt, k); // check point起始位置
p += 20; // bwt碱基起始位置
// 第二步找到mid check point位置
int mk = k & FMT_OCC_INTV_MASK;
int n_mintv = mk >> FMT_MID_INTV_SHIFT;
p += n_mintv * (4 + (FMT_MID_INTERVAL >> 3)); // 跳过mid间隔的bwt碱基位置
// 第三步找到具体的uint32_t
p += (k & FMT_MID_INTV_MASK) >> 3; // 每个uint32_t包含8个碱基和8个倒数第二bwt碱基
// 第四步,获取碱基
base2 = *p >> ((~(k) & 0x7) << 2) & 0xf;
*b2 = base2 >> 2 & 3;
*b1 = base2 & 3;
}
// k, k1, k2都是bwt矩阵对应的行
inline static void fmt_previous_line(const FMTIndex *fmt, bwtint_t k, bwtint_t *k1, bwtint_t *k2)
{
uint8_t b1, b2;
bwtint_t tk[4], kk;
kk = k - (k >= fmt->primary); // k由bwt矩阵对应的行转换成bwt字符串对应的行去掉了$,所以大于$的行都减掉1
fmt_get_previous_base(fmt, kk, &b1, &b2);
fmt_e2_occ(fmt, k, b1, b2, tk);
*k1 = fmt->L2[b1] + tk[1];
*k2 = fmt->L2[b2] + tk[3];
}
bwtint_t fmt_sa(const FMTIndex *fmt, bwtint_t k)
{
bwtint_t sa = 0, mask = fmt->sa_intv - 1;
bwtint_t k1, k2;
while (k & mask)
{
++sa;
fmt_previous_line(fmt, k, &k1, &k2);
if (!(k1 & mask)) {
k = k1;
break;
}
++sa;
k = k2;
}
//#ifdef SHOW_PERF
// int64_t tmp_time = realtime_msec();
//#endif
sa += bwt_get_sa(fmt->sa, k / fmt->sa_intv);
//#ifdef SHOW_PERF
// tmp_time = realtime_msec() - tmp_time;
// __sync_fetch_and_add(&time_bwt_sa_read, tmp_time);
//#endif
return sa;
}

View File

@ -17,7 +17,7 @@ Date : 2023/12/24
#define FMT_OCC_INTERVAL (1 << FMT_OCC_INTV_SHIFT)
#define FMT_OCC_INTV_MASK (FMT_OCC_INTERVAL - 1)
#define FMT_MID_INTV_SHIFT 5
#define FMT_MID_INTV_SHIFT 6
#define FMT_MID_INTERVAL (1 << FMT_MID_INTV_SHIFT)
#define FMT_MID_INTV_MASK (FMT_MID_INTERVAL - 1)
@ -91,8 +91,12 @@ typedef struct
uint8_t sec_bcp; // base couple for sec primary line, AA=>0, AC=>1 ... TT=>15
uint8_t first_base; // 序列的第一个碱基2bit的int类型0,1,2,3
uint8_t last_base; // dollar转换成的base
// ref pac相关
bwtint_t l_pac; // 参考序列长度
uint8_t *pac; // 保存2bit编码的参考序列
// 保存kmer对应的fmt位置信息
KmerHash kmer_hash;
// kmer_bit好像效果一般
uint16_t *kmer_bit; // 用来检测特定长度序列有没有fm-index匹配
int bit_kmer_len;
// suffix array
@ -132,4 +136,7 @@ void fmt_kmer_get(const FMTIndex *fmt, bwtintv_t *ok, uint32_t qbit, int pos);
int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem);
bwtint_t fmt_sa(const FMTIndex *fmt, bwtint_t k);
#endif

4
run.sh
View File

@ -4,10 +4,10 @@ thread=1
#n_r1=~/data/fastq/ZY2105177532213000/sn_r1.fq
#n_r2=~/data/fastq/ZY2105177532213000/sn_r2.fq
#reference=~/data/reference/human_g1k_v37_decoy.fasta
#n_r1=~/fastq/sn_r1.fq
#n_r2=~/fastq/sn_r2.fq
n_r1=~/fastq/sn_r1.fq
n_r2=~/fastq/sn_r2.fq
#n_r1=~/fastq/ssn_r1.fq
#n_r2=~/fastq/ssn_r2.fq
#n_r1=~/fastq/tiny_n_r1.fq
#n_r2=~/fastq/tiny_n_r2.fq
#n_r1=~/fastq/diff_r1.fq

View File

@ -37,13 +37,17 @@ extern int64_t time_ksw_extend2,
time_ksw_global2,
time_ksw_align2,
time_bwt_smem1a,
time_seed_1,
time_seed_2,
time_seed_3,
time_fmt_smem_0,
time_bwt_extend,
time_bwt_occ4,
time_bwt_sa,
time_bwt_sa_read;
time_bwt_sa_read,
time_bns;
extern int64_t dn, n16, n17, n18, n19, nall;
extern int64_t dn, n16, n17, n18, n19, nall, num_sa;
extern FILE *fp1;