实现了seed过程的所有加速想法,seed部分实现了3倍左右加速比
This commit is contained in:
parent
4a1c2cd3db
commit
fc2e0d9b0b
|
|
@ -18,8 +18,8 @@
|
|||
"-R",
|
||||
"'@RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa'",
|
||||
"~/reference/human_g1k_v37_decoy.fasta",
|
||||
"~/fastq/d_r1.fq",
|
||||
"~/fastq/d_r2.fq",
|
||||
"~/fastq/tiny_n_r1.fq",
|
||||
"~/fastq/tiny_n_r2.fq",
|
||||
"-o",
|
||||
"/dev/null"
|
||||
],
|
||||
|
|
|
|||
22
bwa.c
22
bwa.c
|
|
@ -284,7 +284,7 @@ bwt_t *bwa_idx_load_bwt(const char *hint)
|
|||
strcat(strcpy(tmp, prefix), ".bwt"); // FM-index
|
||||
bwt = bwt_restore_bwt(tmp);
|
||||
fprintf(stderr, "zzh-1\n");
|
||||
strcat(strcpy(tmp, prefix), ".33.4.sa"); // partial suffix array (SA)
|
||||
strcat(strcpy(tmp, prefix), ".33.2.sa"); // partial suffix array (SA)
|
||||
bwt_restore_sa(tmp, bwt);
|
||||
fprintf(stderr, "zzh-after-sa\n");
|
||||
free(tmp); free(prefix);
|
||||
|
|
@ -293,13 +293,14 @@ bwt_t *bwa_idx_load_bwt(const char *hint)
|
|||
|
||||
FMTIndex *bwa_idx_load_fmt(const char *hint)
|
||||
{
|
||||
char *fmt_idx_fn, *kmer_idx_fn, *kmer_bit_fn, *sa_fn;
|
||||
char *fmt_idx_fn, *kmer_idx_fn, *sa_fn;
|
||||
// char *kmer_bit_fn;
|
||||
FMTIndex *fmt;
|
||||
char suffix[32];
|
||||
int l_hint = strlen(hint);
|
||||
fmt_idx_fn = malloc(l_hint + 32);
|
||||
kmer_idx_fn = malloc(l_hint + 32);
|
||||
kmer_bit_fn = malloc(l_hint + 32);
|
||||
//kmer_bit_fn = malloc(l_hint + 32);
|
||||
sa_fn = malloc(l_hint + 32);
|
||||
sprintf(suffix, ".256.%d.fmt", FMT_MID_INTERVAL);
|
||||
strcpy(fmt_idx_fn, hint);
|
||||
|
|
@ -319,12 +320,12 @@ FMTIndex *bwa_idx_load_fmt(const char *hint)
|
|||
fmt->kmer_hash = fmt_restore_kmer_idx(kmer_idx_fn);
|
||||
|
||||
// restore kmer bit
|
||||
fmt->bit_kmer_len = 18;
|
||||
sprintf(suffix, ".%d.kmer.bit", fmt->bit_kmer_len);
|
||||
strcpy(kmer_bit_fn, hint);
|
||||
strcpy(kmer_bit_fn + l_hint, suffix);
|
||||
fprintf(stderr, "%s\n", kmer_bit_fn);
|
||||
fmt->kmer_bit = fmt_restore_kmer_bit(kmer_bit_fn, fmt->bit_kmer_len);
|
||||
// fmt->bit_kmer_len = 18;
|
||||
// sprintf(suffix, ".%d.kmer.bit", fmt->bit_kmer_len);
|
||||
// strcpy(kmer_bit_fn, hint);
|
||||
// strcpy(kmer_bit_fn + l_hint, suffix);
|
||||
// fprintf(stderr, "%s\n", kmer_bit_fn);
|
||||
// fmt->kmer_bit = fmt_restore_kmer_bit(kmer_bit_fn, fmt->bit_kmer_len);
|
||||
|
||||
strcpy(sa_fn, hint);
|
||||
sprintf(suffix, ".33.%d.sa", SA_INTV);
|
||||
|
|
@ -369,6 +370,9 @@ bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which)
|
|||
err_fread_noeof(idx->pac, 1, idx->bns->l_pac/4+1, idx->bns->fp_pac); // concatenated 2-bit encoded sequence
|
||||
err_fclose(idx->bns->fp_pac);
|
||||
idx->bns->fp_pac = 0;
|
||||
// 赋值到fmt中对应的pac
|
||||
idx->fmt->l_pac = idx->bns->l_pac;
|
||||
idx->fmt->pac = idx->pac;
|
||||
}
|
||||
}
|
||||
free(prefix);
|
||||
|
|
|
|||
102
bwamem.c
102
bwamem.c
|
|
@ -158,7 +158,9 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
|
|||
//fprintf(fp1, "seq: %ld\n", dn++);
|
||||
//fprintf(stderr, "seq: %ld\n", dn++);
|
||||
//dn ++;
|
||||
//goto third_seed;
|
||||
|
||||
#define USE_FMT 1
|
||||
// goto third_seed;
|
||||
|
||||
while (x < len) {
|
||||
if (seq[x] < 4) {
|
||||
|
|
@ -166,25 +168,27 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
|
|||
#ifdef SHOW_PERF
|
||||
int64_t tmp_time = realtime_msec();
|
||||
#endif
|
||||
//x = bwt_smem1(bwt, len, seq, x, start_width, &a->mem1, a->tmpv);
|
||||
#if USE_FMT
|
||||
x = fmt_smem(fmt, len, seq, x, start_width, &a->mem1, a->tmpv);
|
||||
|
||||
#else
|
||||
x = bwt_smem1(bwt, len, seq, x, start_width, &a->mem1, a->tmpv);
|
||||
#endif
|
||||
#ifdef SHOW_PERF
|
||||
tmp_time = realtime_msec() - tmp_time;
|
||||
__sync_fetch_and_add(&time_bwt_smem1a, tmp_time);
|
||||
__sync_fetch_and_add(&time_seed_1, tmp_time);
|
||||
#endif
|
||||
for (i = 0; i < a->mem1.n; ++i) {
|
||||
bwtintv_t *p = &a->mem1.a[i];
|
||||
//fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info);
|
||||
//fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
|
||||
//fprintf(stderr, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
|
||||
int slen = (uint32_t)p->info - (p->info >> 32); // seed length
|
||||
//if (slen < 16) ++n16;
|
||||
//if (slen < 17) ++n17;
|
||||
//if (slen < 18) ++n18;
|
||||
//if (slen < 19) ++n19;
|
||||
//++nall;
|
||||
max_seed_len = fmax(max_seed_len, slen);
|
||||
if (slen >= opt->min_seed_len)
|
||||
if (slen >= opt->min_seed_len) {
|
||||
//fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info);
|
||||
kv_push(bwtintv_t, a->mem, *p);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
++x;
|
||||
|
|
@ -195,7 +199,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
|
|||
// second pass: find MEMs inside a long SMEM
|
||||
//if (max_seed_len == len - start_N_num)
|
||||
// goto collect_intv_end;
|
||||
// goto third_seed;
|
||||
//goto third_seed;
|
||||
|
||||
old_n = a->mem.n;
|
||||
for (k = 0; k < old_n; ++k) {
|
||||
|
|
@ -205,24 +209,24 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
|
|||
#ifdef SHOW_PERF
|
||||
int64_t tmp_time = realtime_msec();
|
||||
#endif
|
||||
//bwt_smem1(bwt, len, seq, (start + end)>>1, p->x[2]+1, &a->mem1, a->tmpv);
|
||||
#if USE_FMT
|
||||
fmt_smem(fmt, len, seq, (start + end) >> 1, p->x[2] + 1, &a->mem1, a->tmpv);
|
||||
#else
|
||||
bwt_smem1(bwt, len, seq, (start + end) >> 1, p->x[2] + 1, &a->mem1, a->tmpv);
|
||||
#endif
|
||||
#ifdef SHOW_PERF
|
||||
tmp_time = realtime_msec() - tmp_time;
|
||||
__sync_fetch_and_add(&time_bwt_smem1a, tmp_time);
|
||||
__sync_fetch_and_add(&time_seed_2, tmp_time);
|
||||
#endif
|
||||
for (i = 0; i < a->mem1.n; ++i) {
|
||||
//bwtintv_t *p = &a->mem1.a[i];
|
||||
bwtintv_t *p = &a->mem1.a[i];
|
||||
//fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
|
||||
//fprintf(stderr, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
|
||||
//int slen = (uint32_t)p->info - (p->info >> 32);
|
||||
//if (slen < 16) ++n16;
|
||||
//if (slen < 17) ++n17;
|
||||
//if (slen < 18) ++n18;
|
||||
//if (slen < 19) ++n19;
|
||||
//++nall;
|
||||
if ((uint32_t)a->mem1.a[i].info - (a->mem1.a[i].info >> 32) >= opt->min_seed_len)
|
||||
int slen = (uint32_t)p->info - (p->info >> 32);
|
||||
if (slen >= opt->min_seed_len) {
|
||||
//fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info);
|
||||
kv_push(bwtintv_t, a->mem, a->mem1.a[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
//if (max_seed_len == len - start_N_num)
|
||||
|
|
@ -239,11 +243,15 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
|
|||
#ifdef SHOW_PERF
|
||||
int64_t tmp_time = realtime_msec();
|
||||
#endif
|
||||
//x = bwt_seed_strategy1(bwt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m);
|
||||
|
||||
#if USE_FMT
|
||||
x = fmt_seed_strategy1(fmt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m);
|
||||
#else
|
||||
x = bwt_seed_strategy1(bwt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m);
|
||||
#endif
|
||||
#ifdef SHOW_PERF
|
||||
tmp_time = realtime_msec() - tmp_time;
|
||||
__sync_fetch_and_add(&time_bwt_smem1a, tmp_time);
|
||||
__sync_fetch_and_add(&time_seed_3, tmp_time);
|
||||
#endif
|
||||
//bwtintv_t *p = &m;
|
||||
//fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
|
||||
|
|
@ -262,7 +270,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
|
|||
}
|
||||
}
|
||||
|
||||
//collect_intv_end:
|
||||
collect_intv_end:
|
||||
// sort
|
||||
ks_introsort(mem_intv, a->mem.n, a->mem.a);
|
||||
}
|
||||
|
|
@ -381,6 +389,43 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const FMTIndex *fm
|
|||
int step, count, slen = (uint32_t)p->info - (p->info>>32); // seed length
|
||||
int64_t k;
|
||||
// if (slen < opt->min_seed_len) continue; // ignore if too short or too repetitive
|
||||
//if (0) {
|
||||
if (p->num_match > 0) {
|
||||
//continue;
|
||||
for (k = 0; k < p->num_match; ++k) {
|
||||
mem_chain_t tmp, *lower, *upper;
|
||||
mem_seed_t s;
|
||||
int rid, to_add = 0;
|
||||
s.rbeg = p->rm[k].rs;
|
||||
if (p->rm[k].reverse) {
|
||||
s.rbeg = (fmt->l_pac << 1) - 1 - s.rbeg;
|
||||
}
|
||||
tmp.pos = s.rbeg;
|
||||
s.qbeg = p->info >> 32;
|
||||
s.score = s.len = slen;
|
||||
rid = bns_intv2rid(bns, s.rbeg, s.rbeg + s.len);
|
||||
if (rid < 0) continue;
|
||||
if (kb_size(tree))
|
||||
{
|
||||
kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain
|
||||
if (!lower || !test_and_merge(opt, l_pac, lower, &s, rid))
|
||||
to_add = 1;
|
||||
}
|
||||
else
|
||||
to_add = 1;
|
||||
if (to_add)
|
||||
{ // add the seed as a new chain
|
||||
tmp.n = 1;
|
||||
tmp.m = 4;
|
||||
tmp.seeds = calloc(tmp.m, sizeof(mem_seed_t));
|
||||
tmp.seeds[0] = s;
|
||||
tmp.rid = rid;
|
||||
tmp.is_alt = !!bns->anns[rid].is_alt;
|
||||
kb_putp(chn, tree, &tmp);
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
step = p->x[2] > opt->max_occ? p->x[2] / opt->max_occ : 1;
|
||||
for (k = count = 0; k < p->x[2] && count < opt->max_occ; k += step, ++count) {
|
||||
mem_chain_t tmp, *lower, *upper;
|
||||
|
|
@ -389,14 +434,23 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const FMTIndex *fm
|
|||
#ifdef SHOW_PERF
|
||||
int64_t tmp_time = realtime_msec();
|
||||
#endif
|
||||
s.rbeg = tmp.pos = bwt_sa(bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference
|
||||
// s.rbeg = tmp.pos = bwt_sa(bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference
|
||||
s.rbeg = tmp.pos = fmt_sa(fmt, p->x[0] + k);
|
||||
#ifdef SHOW_PERF
|
||||
tmp_time = realtime_msec() - tmp_time;
|
||||
__sync_fetch_and_add(&time_bwt_sa, tmp_time);
|
||||
__sync_fetch_and_add(&num_sa, 1);
|
||||
#endif
|
||||
s.qbeg = p->info >> 32;
|
||||
s.score= s.len = slen;
|
||||
#ifdef SHOW_PERF
|
||||
tmp_time = realtime_msec();
|
||||
#endif
|
||||
rid = bns_intv2rid(bns, s.rbeg, s.rbeg + s.len);
|
||||
#ifdef SHOW_PERF
|
||||
tmp_time = realtime_msec() - tmp_time;
|
||||
__sync_fetch_and_add(&time_bns, tmp_time);
|
||||
#endif
|
||||
if (rid < 0) continue; // bridging multiple reference sequences or the forward-reverse boundary; TODO: split the seed; don't discard it!!!
|
||||
if (kb_size(tree)) {
|
||||
kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain
|
||||
|
|
|
|||
22
bwt.c
22
bwt.c
|
|
@ -61,6 +61,10 @@ static inline bwtint_t bwt_invPsi(const bwt_t *bwt, bwtint_t k) // compute inver
|
|||
|
||||
x = t2;
|
||||
|
||||
// bwtint_t cnt[4], cnt1;
|
||||
// bwt_occ4(bwt, k, cnt);
|
||||
// cnt1 = bwt_occ(bwt, k, x);
|
||||
|
||||
x = bwt->L2[x] + bwt_occ(bwt, k, x); // 获取x字符在k位置(后缀索引),
|
||||
return k == bwt->primary? 0 : x;
|
||||
}
|
||||
|
|
@ -137,14 +141,14 @@ bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k)
|
|||
}
|
||||
/* without setting bwt->sa[0] = -1, the following line should be
|
||||
changed to (sa + bwt->sa[k/bwt->sa_intv]) % (bwt->seq_len + 1) */
|
||||
#ifdef SHOW_PERF
|
||||
int64_t tmp_time = realtime_msec();
|
||||
#endif
|
||||
//#ifdef SHOW_PERF
|
||||
// int64_t tmp_time = realtime_msec();
|
||||
//#endif
|
||||
sa += bwt_get_sa(bwt->sa, k / bwt->sa_intv);
|
||||
#ifdef SHOW_PERF
|
||||
tmp_time = realtime_msec() - tmp_time;
|
||||
__sync_fetch_and_add(&time_bwt_sa_read, tmp_time);
|
||||
#endif
|
||||
//#ifdef SHOW_PERF
|
||||
// tmp_time = realtime_msec() - tmp_time;
|
||||
// __sync_fetch_and_add(&time_bwt_sa_read, tmp_time);
|
||||
//#endif
|
||||
return sa;
|
||||
}
|
||||
|
||||
|
|
@ -354,7 +358,7 @@ static void bwt_reverse_intvs(bwtintv_v *p)
|
|||
int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
|
||||
{
|
||||
int i, j, c, ret;
|
||||
bwtintv_t ik, ok[4];
|
||||
bwtintv_t ik = {0}, ok[4] = {0};
|
||||
bwtintv_v a[2], *prev, *curr, *swap;
|
||||
|
||||
mem->n = 0;
|
||||
|
|
@ -447,7 +451,7 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv,
|
|||
int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem)
|
||||
{
|
||||
int i, c;
|
||||
bwtintv_t ik, ok[4];
|
||||
bwtintv_t ik = {0}, ok[4] = {0};
|
||||
|
||||
memset(mem, 0, sizeof(bwtintv_t));
|
||||
if (q[x] > 3) return x + 1;
|
||||
|
|
|
|||
9
bwt.h
9
bwt.h
|
|
@ -59,8 +59,17 @@ typedef struct {
|
|||
uint8_t *sa;
|
||||
} bwt_t;
|
||||
|
||||
typedef struct {
|
||||
int qs; // query start
|
||||
int qe; // query end [)
|
||||
int reverse; // 反向链
|
||||
uint32_t rs; // 参考基因的起始点(包含)
|
||||
} ref_match_t;
|
||||
|
||||
typedef struct {
|
||||
bwtint_t x[3], info;
|
||||
int num_match;
|
||||
ref_match_t rm[2];
|
||||
} bwtintv_t;
|
||||
|
||||
typedef struct { size_t n, m; bwtintv_t *a; } bwtintv_v;
|
||||
|
|
|
|||
15
fastmap.c
15
fastmap.c
|
|
@ -48,13 +48,17 @@ int64_t time_ksw_extend2 = 0,
|
|||
time_ksw_global2 = 0,
|
||||
time_ksw_align2 = 0,
|
||||
time_bwt_smem1a = 0,
|
||||
time_seed_1 = 0,
|
||||
time_seed_2 = 0,
|
||||
time_seed_3 = 0,
|
||||
time_fmt_smem_0 = 0,
|
||||
time_bwt_extend = 0,
|
||||
time_bwt_occ4 = 0,
|
||||
time_bwt_sa = 0,
|
||||
time_bwt_sa_read = 0;
|
||||
time_bwt_sa_read = 0,
|
||||
time_bns = 0;
|
||||
|
||||
int64_t dn = 0, n16 = 0, n17 = 0, n18 = 0, n19 = 0, nall = 0;
|
||||
int64_t dn = 0, n16 = 0, n17 = 0, n18 = 0, n19 = 0, nall = 0, num_sa = 0;
|
||||
|
||||
FILE *fp1;
|
||||
|
||||
|
|
@ -381,7 +385,6 @@ int main_mem(int argc, char *argv[])
|
|||
}
|
||||
} else update_a(opt, &opt0);
|
||||
bwa_fill_scmat(opt->a, opt->b, opt->mat);
|
||||
fprintf(stderr, "zzh-1\n");
|
||||
aux.idx = bwa_idx_load_from_shm(argv[optind]);
|
||||
if (aux.idx == 0) {
|
||||
if ((aux.idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak
|
||||
|
|
@ -396,7 +399,6 @@ int main_mem(int argc, char *argv[])
|
|||
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to open file `%s'.\n", __func__, argv[optind + 1]);
|
||||
return 1;
|
||||
}
|
||||
fprintf(stderr, "zzh-2\n");
|
||||
fp = gzdopen(fd, "r");
|
||||
aux.ks = kseq_init(fp);
|
||||
if (optind + 2 < argc) {
|
||||
|
|
@ -432,16 +434,21 @@ int main_mem(int argc, char *argv[])
|
|||
#ifdef SHOW_PERF
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "time_bwt_smem1a: %f s\n", time_bwt_smem1a / 1000.0 / opt->n_threads);
|
||||
fprintf(stderr, "time_seed_1: %f s\n", time_seed_1 / 1000.0 / opt->n_threads);
|
||||
fprintf(stderr, "time_seed_2: %f s\n", time_seed_2 / 1000.0 / opt->n_threads);
|
||||
fprintf(stderr, "time_seed_3: %f s\n", time_seed_3 / 1000.0 / opt->n_threads);
|
||||
fprintf(stderr, "time_fmt_smem_0: %f s\n", time_fmt_smem_0 / 1000.0 / opt->n_threads);
|
||||
fprintf(stderr, "time_bwt_extend: %f s\n", time_bwt_extend / 1000.0 / opt->n_threads);
|
||||
fprintf(stderr, "time_bwt_sa: %f s\n", time_bwt_sa / 1000.0 / opt->n_threads);
|
||||
fprintf(stderr, "time_ksw_extend2: %f s\n", time_ksw_extend2 / 1000.0 / opt->n_threads);
|
||||
fprintf(stderr, "time_bwt_sa_read: %f s\n", time_bwt_sa_read / 1000.0 / opt->n_threads);
|
||||
fprintf(stderr, "time_bns: %f s\n", time_bns / 1000.0 / opt->n_threads);
|
||||
fprintf(stderr, "all seed num: %ld\n", nall);
|
||||
fprintf(stderr, "all seed num litter than 19: %ld\n", n19);
|
||||
fprintf(stderr, "all seed num litter than 18: %ld\n", n18);
|
||||
fprintf(stderr, "all seed num litter than 17: %ld\n", n17);
|
||||
fprintf(stderr, "all seed num litter than 16: %ld\n", n16);
|
||||
fprintf(stderr, "get sa num: %ld\n", num_sa);
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
fclose(fp1);
|
||||
|
|
|
|||
255
fmt_idx.c
255
fmt_idx.c
|
|
@ -357,7 +357,7 @@ inline void fmt_e2_occ(const FMTIndex *fmt, bwtint_t k, int b1, int b2, bwtint_t
|
|||
{
|
||||
uint32_t x = 0;
|
||||
uint32_t *p, *q, tmp;
|
||||
bwtint_t str_line = k, cp_line = k & (~FMT_OCC_INTV_MASK);
|
||||
bwtint_t str_line = k, cp_line = k & (~FMT_OCC_INTV_MASK); // cp = check point
|
||||
int i, ti = b1 << 2 | b2;
|
||||
cnt[0] = 0;
|
||||
cnt[1] = 0;
|
||||
|
|
@ -462,7 +462,7 @@ inline void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok1, bwti
|
|||
#endif
|
||||
#endif
|
||||
bwtint_t tk[4], tl[4];
|
||||
bwtintv_t intv;
|
||||
bwtintv_t intv = {0};
|
||||
// tk表示在k行之前所有各个碱基累积出现次数,tl表示在l行之前的累积
|
||||
fmt_e2_occ(fmt, ik->x[!is_back] - 1, b1, b2, tk);
|
||||
fmt_e2_occ(fmt, ik->x[!is_back] - 1 + ik->x[2], b1, b2, tl);
|
||||
|
|
@ -629,95 +629,18 @@ inline static uint64_t build_backward_kmer(const uint8_t *q, int start_pos, int
|
|||
return (~qbit) & ((1L << (kmer_len << 1)) - 1);
|
||||
}
|
||||
|
||||
// 当x为0,或者q[x-1]为N时,只需要前向搜索即可
|
||||
int fmt_smem_forward(const FMTIndex *fmt, int len, const uint8_t *q, int x, bwtintv_v *mem)
|
||||
{
|
||||
#ifdef SHOW_PERF
|
||||
#if 1
|
||||
int64_t tmp_time = realtime_msec();
|
||||
#endif
|
||||
#endif
|
||||
int i, j = 1, ret, kmer_len;
|
||||
const int min_intv = 1;
|
||||
bwtintv_t ik, ok1, ok2;
|
||||
uint32_t qbit = build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len);
|
||||
mem->n = 0;
|
||||
|
||||
fmt_kmer_get(fmt, &ik, qbit, kmer_len - j++);
|
||||
while (ik.x[2] == 0) {
|
||||
fmt_kmer_get(fmt, &ik, qbit, kmer_len - j++);
|
||||
}
|
||||
if (j != 2) { // kmer hash没有找到对应的匹配
|
||||
ik.info = x + kmer_len - j + 2;
|
||||
goto fmt_smem_forward_end;
|
||||
}
|
||||
ik.info = x + kmer_len;
|
||||
|
||||
// 继续向前扩展
|
||||
for (i = x + kmer_len; i + 1 < len; i += 2) {
|
||||
if (ik.x[2] < 2)
|
||||
goto fmt_smem_forward_end;
|
||||
if (q[i] < 4 && q[i + 1] < 4) // 两个都可以扩展
|
||||
{
|
||||
fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]);
|
||||
if (ok2.x[2] >= min_intv) { // 可以继续扩展
|
||||
ik = ok2; ik.info = i + 2;
|
||||
} else if (ok1.x[2] >= min_intv) { // 第二个间隔不够
|
||||
ik = ok1; ik.info = i + 1;
|
||||
goto fmt_smem_forward_end;
|
||||
} else { // 两个间隔都不够
|
||||
goto fmt_smem_forward_end;
|
||||
}
|
||||
}
|
||||
else if (q[i] < 4) // q[i+1] >= 4,只能扩展一个
|
||||
{
|
||||
fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]);
|
||||
if (ok1.x[2] >= min_intv) {
|
||||
ik = ok1; ik.info = i + 1;
|
||||
}
|
||||
goto fmt_smem_forward_end;
|
||||
}
|
||||
else { // q[i] >= 4
|
||||
goto fmt_smem_forward_end;
|
||||
}
|
||||
}
|
||||
if (i == len - 1) // 扩展到了最后一个碱基
|
||||
{
|
||||
if (q[i] < 4) {
|
||||
fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]);
|
||||
if (ok1.x[2] >= min_intv) {
|
||||
ik = ok1; ik.info = i + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fmt_smem_forward_end:
|
||||
ret = ik.info;
|
||||
ik.info |= (uint64_t)x << 32;
|
||||
if (fmt->bit_kmer_len <= (uint32_t)ik.info - (ik.info >> 32))
|
||||
kv_push(bwtintv_t, *mem, ik);
|
||||
#ifdef SHOW_PERF
|
||||
#if 1
|
||||
tmp_time = realtime_msec() - tmp_time;
|
||||
__sync_fetch_and_add(&time_fmt_smem_0, tmp_time);
|
||||
#endif
|
||||
#endif
|
||||
return ret;
|
||||
//return len;
|
||||
}
|
||||
|
||||
// 找smem(seed)
|
||||
int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
|
||||
{
|
||||
int i, j, ret, kmer_len;
|
||||
bwtintv_t ik, ok1, ok2;
|
||||
bwtintv_t ik = {0}, ok1 = {0}, ok2 = {0};
|
||||
bwtintv_t mt = {0};
|
||||
bwtintv_v a[1], *curr;
|
||||
uint32_t qbit = 0;
|
||||
mem->n = 0;
|
||||
int only_forward = x == 0 || q[x - 1] > 3;
|
||||
|
||||
if (q[x] > 3) return x + 1;
|
||||
//if (x == 0 || q[x-1] > 3) return fmt_smem_forward(fmt, len, q, x, mem); // 只用向前扩展
|
||||
|
||||
if (min_intv < 1) min_intv = 1; // the interval size should be at least 1
|
||||
kv_init(a[0]);
|
||||
|
|
@ -741,6 +664,8 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv
|
|||
if (kmer_len != HASH_KMER_LEN) // 遇到了N或者到了序列最后
|
||||
PUSH_VAL_AND_SKIP(ik);
|
||||
|
||||
#define PAC_BASE(pac, l) ((pac)[(l) >> 2] >> ((~(l) & 3) << 1) & 3)
|
||||
|
||||
// 扩展kmer之后的碱基
|
||||
for (i = (int)ik.info; i + 1 < len; i += 2)
|
||||
{ // forward search
|
||||
|
|
@ -749,6 +674,99 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv
|
|||
fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]);
|
||||
CHECK_INTV_CHANGE(ik, ok1, i + 1);
|
||||
CHECK_INTV_CHANGE(ik, ok2, i + 2);
|
||||
// 在这里进行判断是否只有一个候选了
|
||||
//if (0)
|
||||
if (min_intv < 2 && ok2.x[2] == min_intv)
|
||||
{ // 最多处理两个
|
||||
mt.num_match = min_intv;
|
||||
int64_t r, rp;
|
||||
int k;
|
||||
int max_qs = 0, min_qe = len;
|
||||
for (j = 0; j < ok2.x[2]; ++j)
|
||||
{
|
||||
rp = fmt_sa(fmt, ok2.x[0] + j);
|
||||
r = rp >= fmt->l_pac ? (fmt->l_pac << 1) - 1 - rp : rp;
|
||||
k = i + 2;
|
||||
if (rp < fmt->l_pac) // 匹配到了正向链
|
||||
{
|
||||
// 向前继续扩展
|
||||
r += i + 2 - x;
|
||||
while (k < len && r < fmt->l_pac) {
|
||||
int base = PAC_BASE(fmt->pac, r);
|
||||
if (q[k] != base)
|
||||
break;
|
||||
++k;
|
||||
++r;
|
||||
}
|
||||
mt.rm[j].qe = k;
|
||||
mt.rm[j].reverse = 0;
|
||||
// 向后扩展,x位置之前的碱基
|
||||
r -= k - x + 1;
|
||||
k = x - 1;
|
||||
while (k > -1 && r > -1) {
|
||||
int base = PAC_BASE(fmt->pac, r);
|
||||
if (q[k] != base)
|
||||
break;
|
||||
--k;
|
||||
--r;
|
||||
}
|
||||
mt.rm[j].qs = k + 1;
|
||||
mt.rm[j].rs = r + 1;
|
||||
}
|
||||
else // 匹配到了互补链
|
||||
{
|
||||
r -= i + 2 - x;
|
||||
while (k < len && r > -1)
|
||||
{
|
||||
const int base = 3 - PAC_BASE(fmt->pac, r);
|
||||
if (q[k] != base)
|
||||
break;
|
||||
++k;
|
||||
--r;
|
||||
}
|
||||
mt.rm[j].qe = k;
|
||||
mt.rm[j].reverse = 1;
|
||||
// 扩展x之前的碱基
|
||||
r += k - x + 1;
|
||||
k = x - 1;
|
||||
while (k > -1 && r < fmt->l_pac)
|
||||
{
|
||||
const int base = 3 - PAC_BASE(fmt->pac, r);
|
||||
if (q[k] != base)
|
||||
break;
|
||||
--k;
|
||||
++r;
|
||||
}
|
||||
mt.rm[j].qs = k + 1;
|
||||
mt.rm[j].rs = r - 1;
|
||||
}
|
||||
max_qs = max_qs < mt.rm[j].qs ? mt.rm[j].qs : max_qs;
|
||||
min_qe = min_qe > mt.rm[j].qe ? mt.rm[j].qe : min_qe;
|
||||
}
|
||||
if (min_intv > 1)
|
||||
{
|
||||
// 修正两个匹配的位置,使比对的碱基个数和位置一致
|
||||
for (j = 0; j < min_intv; ++j)
|
||||
{
|
||||
if (mt.rm[j].reverse)
|
||||
mt.rm[j].rs -= max_qs - mt.rm[j].qs;
|
||||
else
|
||||
mt.rm[j].rs += max_qs - mt.rm[j].qs;
|
||||
mt.rm[j].qs = max_qs;
|
||||
mt.rm[j].qe = min_qe;
|
||||
}
|
||||
}
|
||||
mt.info = mt.rm[0].qs;
|
||||
mt.info = mt.info << 32 | mt.rm[0].qe;
|
||||
mt.x[2] = min_intv;
|
||||
kv_push(bwtintv_t, *mem, mt);
|
||||
ret = (uint32_t)mt.info;
|
||||
if (only_forward || mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3)
|
||||
{
|
||||
goto fmt_smem_end;
|
||||
}
|
||||
goto backward_search;
|
||||
}
|
||||
} else if (q[i] < 4) // q[i+1] >= 4
|
||||
{
|
||||
fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]);
|
||||
|
|
@ -775,7 +793,8 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv
|
|||
|
||||
backward_search:
|
||||
fmt_reverse_intvs(curr); // s.t. smaller intervals (i.e. longer matches) visited first
|
||||
ret = curr->a[0].info; // this will be the returned value,扩展到的最远的位置
|
||||
if (mt.num_match == 0)
|
||||
ret = curr->a[0].info; // this will be the returned value,扩展到的最远的位置
|
||||
|
||||
// 按照种子进行遍历,反向扩展
|
||||
#define CHECK_ADD_MEM(pos, intv, mem) \
|
||||
|
|
@ -788,18 +807,8 @@ backward_search:
|
|||
{
|
||||
bwtintv_t *p = &curr->a[j]; // 前向扩展的种子
|
||||
uint64_t qbit = 0;
|
||||
// if (p->info - x < fmt->bit_kmer_len) {
|
||||
// qbit = build_backward_kmer(q, p->info - 1, fmt->bit_kmer_len, &kmer_len); // 创建反向kmer
|
||||
// if (!(fmt->kmer_bit[qbit >> 4] & (1 << (qbit & 0xf))))
|
||||
// continue;
|
||||
// }
|
||||
if (!only_forward && p->info - x < HASH_KMER_LEN) {
|
||||
//qbit = build_backward_kmer(q, p->info - 1, fmt->bit_kmer_len, &kmer_len); // 创建反向kmer
|
||||
qbit = build_backward_kmer(q, p->info - 1, HASH_KMER_LEN, &kmer_len); // 创建反向kmer
|
||||
//if (!(fmt->kmer_bit[qbit >> 4] & (1 << (qbit & 0xf))))
|
||||
// continue;
|
||||
//qbit >>= (fmt->bit_kmer_len - HASH_KMER_LEN) << 1;
|
||||
//kmer_len = kmer_len > HASH_KMER_LEN ? HASH_KMER_LEN : kmer_len;
|
||||
i = 1;
|
||||
do { fmt_kmer_get(fmt, &ik, qbit, kmer_len - i++); } while (ik.x[2] < min_intv);
|
||||
if (i > 2) continue;
|
||||
|
|
@ -859,7 +868,7 @@ fmt_smem_end:
|
|||
int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem)
|
||||
{
|
||||
int i, kmer_len;
|
||||
bwtintv_t ik, ok1, ok2;
|
||||
bwtintv_t ik = {0}, ok1={0}, ok2={0};
|
||||
uint64_t qbit;
|
||||
|
||||
|
||||
|
|
@ -905,4 +914,64 @@ int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, in
|
|||
COND_SET_RETURN(ok1, *mem, x, i, max_intv, min_len);
|
||||
}
|
||||
return len;
|
||||
}
|
||||
|
||||
// 这里的k是bwt str的行
|
||||
inline static void fmt_get_previous_base(const FMTIndex *fmt, bwtint_t k, uint8_t *b1, uint8_t *b2)
|
||||
{
|
||||
uint32_t *p;
|
||||
uint8_t base2;
|
||||
|
||||
// 第一步,找到check point位置
|
||||
p = fmt_occ_intv(fmt, k); // check point起始位置
|
||||
p += 20; // bwt碱基起始位置
|
||||
// 第二步,找到mid check point位置
|
||||
int mk = k & FMT_OCC_INTV_MASK;
|
||||
int n_mintv = mk >> FMT_MID_INTV_SHIFT;
|
||||
p += n_mintv * (4 + (FMT_MID_INTERVAL >> 3)); // 跳过mid间隔的bwt碱基位置
|
||||
// 第三步,找到具体的uint32_t
|
||||
p += (k & FMT_MID_INTV_MASK) >> 3; // 每个uint32_t包含8个碱基(和8个倒数第二bwt碱基)
|
||||
// 第四步,获取碱基
|
||||
|
||||
base2 = *p >> ((~(k) & 0x7) << 2) & 0xf;
|
||||
*b2 = base2 >> 2 & 3;
|
||||
*b1 = base2 & 3;
|
||||
}
|
||||
|
||||
// k, k1, k2都是bwt矩阵对应的行
|
||||
inline static void fmt_previous_line(const FMTIndex *fmt, bwtint_t k, bwtint_t *k1, bwtint_t *k2)
|
||||
{
|
||||
uint8_t b1, b2;
|
||||
bwtint_t tk[4], kk;
|
||||
kk = k - (k >= fmt->primary); // k由bwt矩阵对应的行转换成bwt字符串对应的行(去掉了$,所以大于$的行,都减掉1)
|
||||
fmt_get_previous_base(fmt, kk, &b1, &b2);
|
||||
fmt_e2_occ(fmt, k, b1, b2, tk);
|
||||
*k1 = fmt->L2[b1] + tk[1];
|
||||
*k2 = fmt->L2[b2] + tk[3];
|
||||
}
|
||||
|
||||
bwtint_t fmt_sa(const FMTIndex *fmt, bwtint_t k)
|
||||
{
|
||||
bwtint_t sa = 0, mask = fmt->sa_intv - 1;
|
||||
bwtint_t k1, k2;
|
||||
while (k & mask)
|
||||
{
|
||||
++sa;
|
||||
fmt_previous_line(fmt, k, &k1, &k2);
|
||||
if (!(k1 & mask)) {
|
||||
k = k1;
|
||||
break;
|
||||
}
|
||||
++sa;
|
||||
k = k2;
|
||||
}
|
||||
//#ifdef SHOW_PERF
|
||||
// int64_t tmp_time = realtime_msec();
|
||||
//#endif
|
||||
sa += bwt_get_sa(fmt->sa, k / fmt->sa_intv);
|
||||
//#ifdef SHOW_PERF
|
||||
// tmp_time = realtime_msec() - tmp_time;
|
||||
// __sync_fetch_and_add(&time_bwt_sa_read, tmp_time);
|
||||
//#endif
|
||||
return sa;
|
||||
}
|
||||
|
|
@ -17,7 +17,7 @@ Date : 2023/12/24
|
|||
#define FMT_OCC_INTERVAL (1 << FMT_OCC_INTV_SHIFT)
|
||||
#define FMT_OCC_INTV_MASK (FMT_OCC_INTERVAL - 1)
|
||||
|
||||
#define FMT_MID_INTV_SHIFT 5
|
||||
#define FMT_MID_INTV_SHIFT 6
|
||||
#define FMT_MID_INTERVAL (1 << FMT_MID_INTV_SHIFT)
|
||||
#define FMT_MID_INTV_MASK (FMT_MID_INTERVAL - 1)
|
||||
|
||||
|
|
@ -91,8 +91,12 @@ typedef struct
|
|||
uint8_t sec_bcp; // base couple for sec primary line, AA=>0, AC=>1 ... TT=>15
|
||||
uint8_t first_base; // 序列的第一个碱基2bit的int类型,0,1,2,3
|
||||
uint8_t last_base; // dollar转换成的base
|
||||
// ref pac相关
|
||||
bwtint_t l_pac; // 参考序列长度
|
||||
uint8_t *pac; // 保存2bit编码的参考序列
|
||||
// 保存kmer对应的fmt位置信息
|
||||
KmerHash kmer_hash;
|
||||
// kmer_bit好像效果一般
|
||||
uint16_t *kmer_bit; // 用来检测特定长度序列有没有fm-index匹配
|
||||
int bit_kmer_len;
|
||||
// suffix array
|
||||
|
|
@ -132,4 +136,7 @@ void fmt_kmer_get(const FMTIndex *fmt, bwtintv_t *ok, uint32_t qbit, int pos);
|
|||
int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
|
||||
|
||||
int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem);
|
||||
|
||||
bwtint_t fmt_sa(const FMTIndex *fmt, bwtint_t k);
|
||||
|
||||
#endif
|
||||
4
run.sh
4
run.sh
|
|
@ -4,10 +4,10 @@ thread=1
|
|||
#n_r1=~/data/fastq/ZY2105177532213000/sn_r1.fq
|
||||
#n_r2=~/data/fastq/ZY2105177532213000/sn_r2.fq
|
||||
#reference=~/data/reference/human_g1k_v37_decoy.fasta
|
||||
#n_r1=~/fastq/sn_r1.fq
|
||||
#n_r2=~/fastq/sn_r2.fq
|
||||
n_r1=~/fastq/sn_r1.fq
|
||||
n_r2=~/fastq/sn_r2.fq
|
||||
#n_r1=~/fastq/ssn_r1.fq
|
||||
#n_r2=~/fastq/ssn_r2.fq
|
||||
#n_r1=~/fastq/tiny_n_r1.fq
|
||||
#n_r2=~/fastq/tiny_n_r2.fq
|
||||
#n_r1=~/fastq/diff_r1.fq
|
||||
|
|
|
|||
8
utils.h
8
utils.h
|
|
@ -37,13 +37,17 @@ extern int64_t time_ksw_extend2,
|
|||
time_ksw_global2,
|
||||
time_ksw_align2,
|
||||
time_bwt_smem1a,
|
||||
time_seed_1,
|
||||
time_seed_2,
|
||||
time_seed_3,
|
||||
time_fmt_smem_0,
|
||||
time_bwt_extend,
|
||||
time_bwt_occ4,
|
||||
time_bwt_sa,
|
||||
time_bwt_sa_read;
|
||||
time_bwt_sa_read,
|
||||
time_bns;
|
||||
|
||||
extern int64_t dn, n16, n17, n18, n19, nall;
|
||||
extern int64_t dn, n16, n17, n18, n19, nall, num_sa;
|
||||
|
||||
extern FILE *fp1;
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue