From 8c2a90e3e24bb1f25bcccbbb8c61fe51ce82f9a7 Mon Sep 17 00:00:00 2001 From: zzh Date: Mon, 24 Jun 2024 18:11:54 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E4=BA=86=E5=8F=8C=E7=BA=BF?= =?UTF-8?q?=E7=A8=8B=E8=A7=A3=E5=8E=8B=E8=AF=BB=E5=86=99fastq.gz=E7=9A=84b?= =?UTF-8?q?ug=EF=BC=8C=E4=BF=AE=E6=94=B9=E4=BA=86=E4=B8=80=E4=BA=9B?= =?UTF-8?q?=E6=B5=8B=E8=AF=95=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .vscode/settings.json | 9 +- Makefile | 9 +- bwa.c | 2 +- bwamem.c | 351 +++++++++------------------- bwamem.h | 8 - bwamem_pair.c | 8 +- debug.c | 59 +++++ debug.h | 28 +++ debug.sh | 10 +- fastmap.c | 248 +++----------------- fmt_idx.c | 518 +++++++++++++++++++++++++++++++++++++----- fmt_idx.h | 2 + ksw.c | 46 +++- ksw_align_avx2.c | 75 ++++++ ksw_extend2_avx2.c | 172 +++++++------- ksw_extend2_avx2_u8.c | 42 +--- profiling.c | 110 +++++++++ profiling.h | 89 ++++++++ rle.h | 7 +- run.sh | 19 +- utils.h | 42 +--- 21 files changed, 1132 insertions(+), 722 deletions(-) create mode 100644 debug.c create mode 100644 debug.h create mode 100644 ksw_align_avx2.c create mode 100644 profiling.c create mode 100644 profiling.h diff --git a/.vscode/settings.json b/.vscode/settings.json index 0121c4d..691d63c 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -22,6 +22,13 @@ "stdlib.h": "c", "array": "c", "initializer_list": "c", - "utility": "c" + "utility": "c", + "fmt_idx.h": "c", + "profiling.h": "c", + "neon_sse.h": "c", + "scalar_sse.h": "c", + "immintrin.h": "c", + "ksw.h": "c", + "debug.h": "c" } } \ No newline at end of file diff --git a/Makefile b/Makefile index 3fd821a..da11aa1 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS SHOW_PERF= -DSHOW_PERF SHOW_DATA_PERF= #-DSHOW_DATA_PERF -FILTER_FULL_MATCH= #-DFILTER_FULL_MATCH +FILTER_FULL_MATCH= -DFILTER_FULL_MATCH USE_MT_READ= -DUSE_MT_READ AR= ar @@ -14,10 +14,11 @@ DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) $(SHOW_PERF) $(SHOW_DATA_PERF) $(FILTER_F LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o \ QSufSort.o bwt_gen.o rope.o rle.o is.o bwtindex.o yarn.o AOBJS= bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ - bwape.o kopen.o pemerge.o maxk.o \ + bwape.o kopen.o pemerge.o maxk.o ksw_align_avx2.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ - bwtsw2_chain.o fastmap.o bwtsw2_pair.o \ - fmt_idx.o ksw_extend2_avx2.o ksw_extend2_avx2_u8.o + bwtsw2_chain.o fastmap.o bwtsw2_pair.o profiling.o \ + fmt_idx.o ksw_extend2_avx2.o ksw_extend2_avx2_u8.o \ + debug.o PROG= fastbwa INCLUDES= LIBS= -lm -lz -lpthread -ldl diff --git a/bwa.c b/bwa.c index c02f9ea..a595973 100644 --- a/bwa.c +++ b/bwa.c @@ -154,7 +154,7 @@ void bseq_read_pe_mt(int chunk_size, int *n_, void *ks1_, void *ks2_, int copy_c if (size == 0 && kseq_read(ks2) >= 0) { // test if the 2nd file is finished fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__); } - *n_ = n; *seqs_ptr = seqs; + *n_ = n; *seqs_ptr = seqs; *size_ = size; *m_ = n; return; } m = (chunk_size + size / init_n_reads - 1) / (size / init_n_reads); diff --git a/bwamem.c b/bwamem.c index 1844e59..c1609ec 100644 --- a/bwamem.c +++ b/bwamem.c @@ -43,6 +43,8 @@ #include "ksort.h" #include "utils.h" #include "fmt_idx.h" +#include "profiling.h" +#include "debug.h" #ifdef USE_MALLOC_WRAPPERS # include "malloc_wrap.h" @@ -127,14 +129,6 @@ static void mem_collect_intv(const mem_opt_t *opt, const FMTIndex *fmt, int len, int start_N_num = 0, start_flag = 1; a->mem.n = 0; -#ifdef SHOW_PERF - uint64_t tmp_time1, tmp_time2; -#if USE_RDTSC - tmp_time1 = __rdtsc(); -#else - tmp_time1 = realtime_msec(); -#endif -#endif // 1. first pass: find all SMEMs while (x < len) { if (seq[x] < 4) { @@ -154,15 +148,6 @@ static void mem_collect_intv(const mem_opt_t *opt, const FMTIndex *fmt, int len, if (start_flag) ++start_N_num; } } -#ifdef SHOW_PERF -#if USE_RDTSC - tmp_time2 = __rdtsc(); -#else - tmp_time2 = realtime_msec(); -#endif - a->time_seed_1 += tmp_time2 - tmp_time1; - tmp_time1 = tmp_time2; -#endif #ifdef FILTER_FULL_MATCH if (max_seed_len == len - start_N_num) goto collect_intv_end; @@ -183,15 +168,6 @@ static void mem_collect_intv(const mem_opt_t *opt, const FMTIndex *fmt, int len, } } } -#ifdef SHOW_PERF -#if USE_RDTSC - tmp_time2 = __rdtsc(); -#else - tmp_time2 = realtime_msec(); -#endif - a->time_seed_2 += tmp_time2 - tmp_time1; - tmp_time1 = tmp_time2; -#endif // 3. third pass: LAST-like if (opt->max_mem_intv > 0) { @@ -206,14 +182,6 @@ static void mem_collect_intv(const mem_opt_t *opt, const FMTIndex *fmt, int len, } else ++x; } } -#ifdef SHOW_PERF -#if USE_RDTSC - tmp_time2 = __rdtsc(); -#else - tmp_time2 = realtime_msec(); -#endif - a->time_seed_3 += tmp_time2 - tmp_time1; -#endif #ifdef FILTER_FULL_MATCH collect_intv_end: @@ -371,15 +339,8 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t //step = 1; for (k = count = 0; k < p->x[2] && count < opt->max_occ; k += step, ++count) { mem_seed_t s; -#ifdef SHOW_PERF - uint64_t tmp_time1, tmp_time2; -#if USE_RDTSC - tmp_time1 = __rdtsc(); -#else - tmp_time1 = realtime_msec(); -#endif -#endif s.rbeg = fmt_sa(fmt, p->x[0] + k); + //kv_push(uint64_t, v1, s.rbeg); //fprintf(stderr, "%ld\t", s.rbeg); //if (p->x[0] == 0) @@ -397,14 +358,6 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t //} //s.rbeg = fmt_sa(fmt, p->x[0] + k); -#ifdef SHOW_PERF -#if USE_RDTSC - tmp_time2 = __rdtsc(); -#else - tmp_time2 = realtime_msec(); -#endif - aux->time_sa += tmp_time2 - tmp_time1; -#endif CHECK_ADD_CHAIN(tmp, lower, upper); } //fprintf(stderr, "\n"); @@ -424,15 +377,15 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t #if 0 for (i = 0; i < chain.n; ++i) { - fprintf(gfp1, "chain-begin: %d\n", i); + fprintf(gf[0], "chain-begin: %d\n", i); int j; - fprintf(gfp1, "chain-%ld %d\n", chain.a[i].pos, chain.a[i].rid); + fprintf(gf[0], "chain-%ld %d\n", chain.a[i].pos, chain.a[i].rid); for (j = 0; j < chain.a[i].n; ++j) { - fprintf(gfp1, "chain-%d %ld %d\n", chain.a[i].seeds[j].qbeg, chain.a[i].seeds[j].rbeg, chain.a[i].seeds[j].len); + fprintf(gf[0], "chain-%d %ld %d\n", chain.a[i].seeds[j].qbeg, chain.a[i].seeds[j].rbeg, chain.a[i].seeds[j].len); } - fprintf(gfp1, "chain-end\n"); + fprintf(gf[0], "chain-end\n"); } - fprintf(gfp1, "fun-end\n"); + fprintf(gf[0], "fun-end\n"); #endif // ks_introsort(uint64_sort, v1.n, v1.a); // ks_introsort(uint64_sort, v2.n, v2.a); @@ -763,7 +716,7 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen) #define MAX_BAND_TRY 2 -void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av, void *buf) +void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av, void *buf, int tid) { int i, k, rid, max_off[2], aw[2]; // aw: actual bandwidth used in extension int64_t l_pac = bns->l_pac, rmax[2], tmp, max = 0; @@ -867,29 +820,13 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac printf("*** Left ref: "); for (j = 0; j < tmp; ++j) putchar("ACGTN"[(int)rseq[tmp - 1 - j]]); putchar('\n'); printf("*** Left query: "); for (j = 0; j < s->qbeg; ++j) putchar("ACGTN"[(int)query[s->qbeg - 1 - j]]); putchar('\n'); } -#ifdef SHOW_PERF - uint64_t tmp_time1, tmp_time2; -#if USE_RDTSC - tmp_time1 = __rdtsc(); -#else - tmp_time1 = realtime_msec(); -#endif -#endif - + PROF_START(bsw); #ifndef USE_AVX2 a->score = ksw_extend2(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, aw[0], opt->pen_clip5, opt->zdrop, s->len * opt->a, &qle, &tle, >le, &gscore, &max_off[0], aux->sw_buf); #else a->score = ksw_extend2_avx2(s->qbeg, query, tmp, rseq, 1, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->a, opt->b, aw[0], opt->pen_clip5, opt->zdrop, s->len * opt->a, &qle, &tle, >le, &gscore, &max_off[0], aux->sw_buf); #endif - -#ifdef SHOW_PERF -#if USE_RDTSC - tmp_time2 = __rdtsc(); -#else - tmp_time2 = realtime_msec(); -#endif - aux->time_bsw += tmp_time2 - tmp_time1; -#endif + PROF_END(tprof[T_BSW][tid], bsw); if (bwa_verbose >= 4) { printf("*** Left extension: prev_score=%d; score=%d; bandwidth=%d; max_off_diagonal_dist=%d\n", prev, a->score, aw[0], max_off[0]); fflush(stdout); } if (a->score == prev || max_off[0] < (aw[0]>>1) + (aw[0]>>2)) break; } @@ -919,27 +856,13 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac printf("*** Right ref: "); for (j = 0; j < rmax[1] - rmax[0] - re; ++j) putchar("ACGTN"[(int)rseq[re+j]]); putchar('\n'); printf("*** Right query: "); for (j = 0; j < l_query - qe; ++j) putchar("ACGTN"[(int)query[qe+j]]); putchar('\n'); } -#ifdef SHOW_PERF - uint64_t tmp_time1, tmp_time2; -#if USE_RDTSC - tmp_time1 = __rdtsc(); -#else - tmp_time1 = realtime_msec(); -#endif -#endif + PROF_START(bsw); #ifndef USE_AVX2 a->score = ksw_extend2(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, aw[1], opt->pen_clip3, opt->zdrop, sc0, &qle, &tle, >le, &gscore, &max_off[1], aux->sw_buf); #else a->score = ksw_extend2_avx2(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 0, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->a, opt->b, aw[1], opt->pen_clip3, opt->zdrop, sc0, &qle, &tle, >le, &gscore, &max_off[1], aux->sw_buf); #endif -#ifdef SHOW_PERF -#if USE_RDTSC - tmp_time2 = __rdtsc(); -#else - tmp_time2 = realtime_msec(); -#endif - aux->time_bsw += tmp_time2 - tmp_time1; -#endif + PROF_END(tprof[T_BSW][tid], bsw); if (bwa_verbose >= 4) { printf("*** Right extension: prev_score=%d; score=%d; bandwidth=%d; max_off_diagonal_dist=%d\n", prev, a->score, aw[1], max_off[1]); fflush(stdout); } if (a->score == prev || max_off[1] < (aw[1]>>1) + (aw[1]>>2)) break; } @@ -1255,7 +1178,7 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const FMTIndex *fmt, const bn for (i = 0; i < chn.n; ++i) { mem_chain_t *p = &chn.a[i]; if (bwa_verbose >= 4) err_printf("* ---> Processing chain(%d) <---\n", i); - mem_chain2aln(opt, bns, pac, l_seq, (uint8_t*)seq, p, ®s, buf); + mem_chain2aln(opt, bns, pac, l_seq, (uint8_t*)seq, p, ®s, buf, 0); free(chn.a[i].seeds); } free(chn.a); @@ -1363,7 +1286,7 @@ static void worker1(void *data, int i, int tid) static void worker2(void *data, int i, int tid) { - extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2], seq_sam_t ss[2]); + extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2], seq_sam_t ss[2], int tid); extern void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a); mem_worker_t *w = (mem_worker_t*)data; if (!(w->opt->flag&MEM_F_PE)) { @@ -1374,7 +1297,7 @@ static void worker2(void *data, int i, int tid) free(w->regs[i].a); } else { if (bwa_verbose >= 4) printf("=====> Finalizing read pair '%s' <=====\n", w->seqs[i<<1|0].name); - mem_sam_pe(w->opt, w->bns, w->pac, w->pes, (w->n_processed>>1) + i, &w->seqs[i<<1], &w->regs[i<<1], &w->sams[i<<1]); + mem_sam_pe(w->opt, w->bns, w->pac, w->pes, (w->n_processed>>1) + i, &w->seqs[i<<1], &w->regs[i<<1], &w->sams[i<<1], tid); free(w->regs[i<<1|0].a); free(w->regs[i<<1|1].a); } } @@ -1387,7 +1310,6 @@ static smem_aux_t *smem_aux_init() a->tmpv[1] = calloc(1, sizeof(bwtintv_v)); a->sw_buf = calloc(1, sizeof(buf_t)); a->seq_buf = calloc(1, sizeof(buf_t)); - a->time_seed_1 = a->time_seed_2 = a->time_seed_3 = a->time_bsw = a->time_sa = 0; return a; } @@ -1424,8 +1346,10 @@ mem_worker_t *init_mem_worker(const mem_opt_t *opt, const FMTIndex *fmt, const b } return w; } +#include "khash.h" +KHASH_MAP_INIT_INT64(seed, uint64_t); -static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, int len, const uint8_t *seq, smem_v *smem, smem_aux_t *a) +static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, int len, const uint8_t *seq, smem_v *smem, smem_aux_t *a, int tid) { int i, k, x = 0, old_n; int start_width = 1; @@ -1433,29 +1357,22 @@ static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, in int max_seed_len = 0; int start_N_num = 0, start_flag = 1; smem->mem.n = 0; - -#ifdef SHOW_PERF - uint64_t tmp_time1, tmp_time2; -#if USE_RDTSC - tmp_time1 = __rdtsc(); -#else - tmp_time1 = realtime_msec(); -#endif -#endif + //int s1_num = 0; // 1. first pass: find all SMEMs - //fprintf(gfp1, "read\n"); + PROF_START(seed_1); + // fprintf(gf[0], "read\n"); while (x < len) { if (seq[x] < 4) { start_flag = 0; -#ifdef DEBUG_OUTPUT +#ifdef DEBUG_FILE_OUTPUT #ifdef COUNT_SEED_LENGTH int last_x = x; #endif #endif x = fmt_smem(fmt, len, seq, x, start_width, opt->min_seed_len, 0, &a->mem1, a->tmpv[0]); -#ifdef DEBUG_OUTPUT +#ifdef DEBUG_FILE_OUTPUT #ifdef COUNT_SEED_LENGTH - fprintf(gfp1, "%d\n", x - last_x); + fprintf(gf[0], "%d\n", x - last_x); #endif #endif for (i = 0; i < a->mem1.n; ++i) { @@ -1481,25 +1398,17 @@ static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, in gdat[1] += 1; // seed-1 full match num ++ #endif -#ifdef SHOW_PERF -#if USE_RDTSC - tmp_time2 = __rdtsc(); -#else - tmp_time2 = realtime_msec(); -#endif - a->time_seed_1 += tmp_time2 - tmp_time1; - tmp_time1 = tmp_time2; -#endif + PROF_END(tprof[T_SEED_1][tid], seed_1); #ifdef FILTER_FULL_MATCH if (max_seed_len == len - start_N_num) { -#ifdef DEBUG_OUTPUT +#ifdef DEBUG_FILE_OUTPUT if (start_N_num == 0) { #ifdef GET_FULL_MATCH_READ for (i = 0; i < len; ++i) - fprintf(gfp1, "%c", "ACGT"[seq[i]]); - fprintf(gfp1, "\n"); + fprintf(gf[0], "%c", "ACGT"[seq[i]]); + fprintf(gf[0], "\n"); #endif #ifdef SHOW_DATA_PERF gdat[2]++; @@ -1509,40 +1418,49 @@ static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, in #endif } #endif + //goto third_seed; goto collect_intv_end; } #endif - + bwtintv_v mem2 = {0}; // 2. second pass: find MEMs inside a long SMEM + uint32_v qev; + PROF_START(seed_2); old_n = smem->mem.n; + //fprintf(gf[2], "seed-1\n"); for (k = 0; k < old_n; ++k) { bwtintv_t *p = &smem->mem.a[k]; int start = p->info>>32, end = (int32_t)p->info; if (end - start < split_len || p->x[2] > opt->split_width) continue; -// fprintf(gfp1, "start\n"); - fmt_smem(fmt, len, seq, (start + end) >> 1, p->x[2] + 1, opt->min_seed_len, p, &a->mem1, a->tmpv[0]); + //++ s1_num; + //fprintf(gf[2], "qs:%ld, qe:%d, x0:%ld, x1:%ld, x2:%ld\n", p->info >> 32, (int)p->info, p->x[0], p->x[1], p->x[2]); + // fprintf(gf[0], "start\n"); + fmt_smem_2(fmt, len, seq, (start + end) >> 1, p->x[2] + 1, opt->min_seed_len, p, &a->mem1, a->tmpv[0], qev); for (i = 0; i < a->mem1.n; ++i) { bwtintv_t *p = &a->mem1.a[i]; int slen = (uint32_t)p->info - (p->info >> 32); -// fprintf(gfp1, "%d\n", slen); +// fprintf(gf[0], "%d\n", slen); if (slen >= opt->min_seed_len) { kv_push(bwtintv_t, smem->mem, a->mem1.a[i]); + kv_push(bwtintv_t, mem2, a->mem1.a[i]); } } -// fprintf(gfp1, "end\n"); +// fprintf(gf[0], "end\n"); } + //fprintf(gf[2], "seed-2\n"); + PROF_END(tprof[T_SEED_2][tid], seed_2); + //if (s1_num > 1) { + //ks_introsort(mem_intv, mem2.n, mem2.a); + //for (i = 0; i < mem2.n; ++i) { + // fprintf(gf[2], "qs:%ld, qe:%d, x0:%ld, x1:%ld, x2:%ld\n", mem2.a[i].info >> 32, (int)mem2.a[i].info, mem2.a[i].x[0], mem2.a[i].x[1], mem2.a[i].x[2]); + //} + //fprintf(gf[2], "\n"); + //} -#ifdef SHOW_PERF -#if USE_RDTSC - tmp_time2 = __rdtsc(); -#else - tmp_time2 = realtime_msec(); -#endif - a->time_seed_2 += tmp_time2 - tmp_time1; - tmp_time1 = tmp_time2; -#endif - +// third_seed: +#if 1 // 3. third pass: LAST-like + PROF_START(seed_3); if (opt->max_mem_intv > 0) { x = 0; while (x < len) { @@ -1556,14 +1474,7 @@ static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, in } else ++x; } } - -#ifdef SHOW_PERF -#if USE_RDTSC - tmp_time2 = __rdtsc(); -#else - tmp_time2 = realtime_msec(); -#endif - a->time_seed_3 += tmp_time2 - tmp_time1; + PROF_END(tprof[T_SEED_3][tid], seed_3); #endif #ifdef FILTER_FULL_MATCH @@ -1571,18 +1482,21 @@ collect_intv_end: #endif // sort ks_introsort(mem_intv, smem->mem.n, smem->mem.a); + + + khash_t(str) * h; //for (i = 0; i < smem->mem.n; ++i) { // fprintf(stderr, "seed %d: %ld %ld %ld\n", i, smem->mem.a[i].x[0], smem->mem.a[i].x[1], smem->mem.a[i].x[2]); //} //fprintf(stderr, "split\n"); } -void find_smem(const mem_opt_t *opt, const FMTIndex *fmt, int len, const uint8_t *seq, smem_aux_t *aux, smem_v *smemv) +void find_smem(const mem_opt_t *opt, const FMTIndex *fmt, int len, const uint8_t *seq, smem_aux_t *aux, smem_v *smemv, int tid) { int i; if (len < opt->min_seed_len) return; // if the query is shorter than the seed length, no match - mem_collect_intv_batch(opt, fmt, len, seq, smemv, aux); + mem_collect_intv_batch(opt, fmt, len, seq, smemv, aux, tid); smemv->pos_arr.n = 0; for (i=0; imem.n; ++i) { bwtintv_t *p = &smemv->mem.a[i]; @@ -1590,41 +1504,27 @@ void find_smem(const mem_opt_t *opt, const FMTIndex *fmt, int len, const uint8_t int64_t k; if (p->num_match > 0) { uint64_t pos = p->rm[0].rs; - if (p->rm[0].reverse) - pos = (fmt->l_pac << 1) - 1 - pos; - //pos |= 1LL << 63; + if (p->rm[0].reverse) pos = (fmt->l_pac << 1) - 1 - pos; kv_push(uint64_t, smemv->pos_arr, pos); + if (p->num_match > 1) { + uint64_t pos = p->rm[1].rs; + if (p->rm[1].reverse) pos = (fmt->l_pac << 1) - 1 - pos; + kv_push(uint64_t, smemv->pos_arr, pos); + } } else { step = p->x[2] > opt->max_occ ? p->x[2] / opt->max_occ : 1; for (k = count = 0; k < p->x[2] && count < opt->max_occ; k += step, ++count) { -#ifdef SHOW_PERF - uint64_t tmp_time1, tmp_time2; -#if USE_RDTSC - tmp_time1 = __rdtsc(); -#else - tmp_time1 = realtime_msec(); -#endif -#endif - + PROF_START(sal); uint64_t pos = fmt_sa_offset(fmt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference - -#ifdef SHOW_PERF -#if USE_RDTSC - tmp_time2 = __rdtsc(); -#else - tmp_time2 = realtime_msec(); -#endif - aux->time_sa += tmp_time2 - tmp_time1; -#endif - + PROF_END(tprof[T_SAL][tid], sal); kv_push(uint64_t, smemv->pos_arr, pos); } } } } -void generate_chain(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t *bns, int len, const uint8_t *seq, smem_v *smemv, mem_chain_v *chain) +void generate_chain(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t *bns, int len, const uint8_t *seq, smem_v *smemv, mem_chain_v *chain, int tid) { int i, j, b, e, l_rep; int64_t l_pac = bns->l_pac; @@ -1677,7 +1577,14 @@ void generate_chain(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t *b if (p->num_match > 0) { mem_seed_t s; s.rbeg = smemv->pos_arr.a[j++]; + //fprintf(gf[1], "%ld, %ld, %d\n", s.rbeg, p->info>>32, (uint32_t)p->info); CHECK_ADD_CHAIN(tmp, lower, upper); + if (p->num_match > 1) { + mem_seed_t s; + s.rbeg = smemv->pos_arr.a[j++]; + //fprintf(gf[1], "%ld, %ld, %d\n", s.rbeg, p->info >> 32, (uint32_t)p->info); + CHECK_ADD_CHAIN(tmp, lower, upper); + } } else { step = p->x[2] > opt->max_occ ? p->x[2] / opt->max_occ : 1; for (k = count = 0; k < p->x[2] && count < opt->max_occ; k += step, ++count) @@ -1686,6 +1593,7 @@ void generate_chain(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t *b uint64_t sa = smemv->pos_arr.a[j++]; uint64_t sa_idx = sa << 16 >> 16; s.rbeg = (sa >> 48) + bwt_get_sa((uint8_t *)fmt->sa, sa_idx); + // fprintf(gf[1], "%ld, %ld, %d\n", s.rbeg, p->info >> 32, (uint32_t)p->info); CHECK_ADD_CHAIN(tmp, lower, upper); } } @@ -1728,7 +1636,7 @@ static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r) // mem主要流程 void mem_core_process(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *seq_arr, int nseq, smem_aux_t *aux, smem_v *smem_arr, mem_chain_v *chain_arr, mem_alnreg_v *reg_arr, - int calc_isize, int64_t l_pac, uint64_v *isize) + int calc_isize, int64_t l_pac, uint64_v *isize, int tid) { int i, j, l_seq; @@ -1736,57 +1644,39 @@ void mem_core_process(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t mem_alnreg_v *regp; char *seq; -#ifdef SHOW_PERF - uint64_t tmp_time1, tmp_time2; -#if USE_RDTSC - tmp_time1 = __rdtsc(); -#else - tmp_time1 = realtime_msec(); -#endif -#endif - // 1. seeding + PROF_START(seed_all); for (i = 0; i < nseq; ++i) { seq = seq_arr[i].seq; l_seq = seq_arr[i].l_seq; for (j = 0; j < l_seq; ++j) { seq[j] = seq[j] < 4 ? seq[j] : nst_nt4_table[(int)seq[j]]; } - find_smem(opt, fmt, l_seq, (uint8_t *)seq, aux, &smem_arr[i]); + find_smem(opt, fmt, l_seq, (uint8_t *)seq, aux, &smem_arr[i], tid); } - -#ifdef SHOW_PERF -#if USE_RDTSC - tmp_time2 = __rdtsc(); -#else - tmp_time2 = realtime_msec(); -#endif - aux->time_seed_all += tmp_time2 - tmp_time1; - tmp_time1 = tmp_time2; -#endif + PROF_END(tprof[T_SEED_ALL][tid], seed_all); // 2. chain + PROF_START(chain_all); for (i=0; in = mem_chain_flt(opt, chnp->n, chnp->a); + PROF_END(tprof[T_FLT_CHAIN][tid], flt_chain); + PROF_START(flt_chained_seeds); mem_flt_chained_seeds(opt, bns, pac, l_seq, (uint8_t*)seq, chnp->n, chnp->a, aux->seq_buf); + PROF_END(tprof[T_FLT_CHANNED_SEEDS][tid], flt_chained_seeds); if (bwa_verbose >= 4) mem_print_chain(bns, chnp); } - -#ifdef SHOW_PERF -#if USE_RDTSC - tmp_time2 = __rdtsc(); -#else - tmp_time2 = realtime_msec(); -#endif - aux->time_chain += tmp_time2 - tmp_time1; - tmp_time1 = tmp_time2; -#endif + PROF_END(tprof[T_CHAIN_ALL][tid], chain_all); // 3. align + PROF_START(aln_all); for (i=0; in; ++j) { mem_chain_t *p = &chnp->a[j]; if (bwa_verbose >= 4) err_printf("* ---> Processing chain(%d) <---\n", j); - mem_chain2aln(opt, bns, pac, l_seq, (uint8_t*)seq, p, regp, aux); + mem_chain2aln(opt, bns, pac, l_seq, (uint8_t*)seq, p, regp, aux, tid); free(chnp->a[j].seeds); } @@ -1816,20 +1706,14 @@ void mem_core_process(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t p->is_alt = 1; } } - -#ifdef SHOW_PERF -#if USE_RDTSC - tmp_time2 = __rdtsc(); -#else - tmp_time2 = realtime_msec(); -#endif - aux->time_bsw_all += tmp_time2 - tmp_time1; -#endif + PROF_END(tprof[T_ALN_ALL][tid], aln_all); #if 1 // 4. calc insert size + #define MIN_RATIO 0.8 if (calc_isize) { + PROF_START(ins_size); for (i = 0; i < nseq>>1; ++i) { int dir; int64_t is; @@ -1843,6 +1727,7 @@ void mem_core_process(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t dir = mem_infer_dir(l_pac, r[0]->a[0].rb, r[1]->a[0].rb, &is); if (is && is <= opt->max_ins) kv_push(uint64_t, isize[dir], is); } + PROF_END(tprof[T_INS_SIZE][tid], ins_size); } #endif } @@ -1852,12 +1737,12 @@ static void worker_smem_align(void *data, int i, int tid) mem_worker_t *w = (mem_worker_t *)data; int start = i * w->opt->batch_size; int end = MIN(start + w->opt->batch_size, w->n_reads); - mem_core_process(w->opt, w->fmt, w->bns, w->pac, w->seqs + start, end - start, w->aux[tid], w->smem_arr[tid], w->chain_arr[tid], w->regs + start, w->calc_isize, w->bns->l_pac, w->isize_arr[tid]); + mem_core_process(w->opt, w->fmt, w->bns, w->pac, w->seqs + start, end - start, w->aux[tid], w->smem_arr[tid], w->chain_arr[tid], w->regs + start, w->calc_isize, w->bns->l_pac, w->isize_arr[tid], tid); } static void worker_sam(void *data, int i, int tid) { - extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2], seq_sam_t ss[2]); + extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2], seq_sam_t ss[2], int tid); extern void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a); extern void mem_matesw_batch(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t *s, mem_alnreg_v *a, int data_size); mem_worker_t *w = (mem_worker_t*)data; @@ -1879,7 +1764,7 @@ static void worker_sam(void *data, int i, int tid) for (i=start; i= 4) printf("=====> Finalizing read pair '%s' <=====\n", w->seqs[i].name); - mem_sam_pe(w->opt, w->bns, w->pac, w->pes, (w->n_processed+i)>>1, &w->seqs[i], &w->regs[i], &w->sams[i]); + mem_sam_pe(w->opt, w->bns, w->pac, w->pes, (w->n_processed+i)>>1, &w->seqs[i], &w->regs[i], &w->sams[i], tid); free(w->regs[i].a); free(w->regs[i+1].a); } } @@ -1887,7 +1772,8 @@ static void worker_sam(void *data, int i, int tid) void mem_process_seqs(const mem_opt_t *opt, mem_worker_t *w, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0, seq_sam_t *sams) { - extern void kt_for(int n_threads, void (*func)(void*,int,int), void *data, int n); + extern void kt_for(int n_threads, void (*func)(void *, int, int), void *data, int n); + PROF_START(mem_prepare); mem_pestat_t pes[4]; int batch_size = opt->batch_size; // 对于pair-end数据,必须是2的整数倍,因为要保证pair-end数据在同一个线程里 int n_batch = (n + batch_size - 1) / batch_size; @@ -1912,46 +1798,25 @@ void mem_process_seqs(const mem_opt_t *opt, mem_worker_t *w, int64_t n_processed w->isize_arr[i][j].n = 0; } #endif + PROF_END(gprof[G_MEM_PREPARE], mem_prepare); -#ifdef SHOW_PERF - uint64_t tmp_time1, tmp_time2; -#if USE_RDTSC - tmp_time1 = __rdtsc(); -#else - tmp_time1 = realtime_msec(); -#endif -#endif - - //kt_for(opt->n_threads, worker1, w, (opt->flag&MEM_F_PE)? n>>1 : n); // find mapping positions + PROF_START(kernel); + // kt_for(opt->n_threads, worker1, w, (opt->flag&MEM_F_PE)? n>>1 : n); // find mapping positions kt_for(opt->n_threads, worker_smem_align, w, n_batch); // find mapping positions + PROF_END(gprof[G_MEM_KERNEL], kernel); + PROF_START(pestat); if (opt->flag&MEM_F_PE) { // infer insert sizes if not provided if (pes0) memcpy(pes, pes0, 4 * sizeof(mem_pestat_t)); // if pes0 != NULL, set the insert-size distribution as pes0 else mem_pestat(opt, w->bns->l_pac, n, w->isize_arr, pes); // otherwise, infer the insert size distribution from data //else mem_pestat_old(opt, w->bns->l_pac, n, w->regs, pes); } + PROF_END(gprof[G_MEM_PESTAT], pestat); -#ifdef SHOW_PERF -#if USE_RDTSC - tmp_time2 = __rdtsc(); -#else - tmp_time2 = realtime_msec(); -#endif - time_work_kernel += tmp_time2 - tmp_time1; - tmp_time1 = tmp_time2; -#endif - + PROF_START(mem_sam); kt_for(opt->n_threads, worker2, w, (opt->flag&MEM_F_PE)? n>>1 : n); // generate alignment //kt_for(opt->n_threads, worker_sam, w, n_batch); - -#ifdef SHOW_PERF -#if USE_RDTSC - tmp_time2 = __rdtsc(); -#else - tmp_time2 = realtime_msec(); -#endif - time_work_sam += tmp_time2 - tmp_time1; -#endif + PROF_END(gprof[G_MEM_SAM], mem_sam); //free(w.regs); if (bwa_verbose >= 3) diff --git a/bwamem.h b/bwamem.h index 1149658..3f7a2c8 100644 --- a/bwamem.h +++ b/bwamem.h @@ -145,14 +145,6 @@ typedef struct { size_t n, m; mem_chain_t *a; } mem_chain_v; typedef struct { bwtintv_v mem, mem1, *tmpv[2]; buf_t *sw_buf, *seq_buf; - uint64_t time_seed_1; - uint64_t time_seed_2; - uint64_t time_seed_3; - uint64_t time_seed_all; - uint64_t time_chain; - uint64_t time_sa; - uint64_t time_bsw; - uint64_t time_bsw_all; } smem_aux_t; typedef struct { diff --git a/bwamem_pair.c b/bwamem_pair.c index eedd521..7974447 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -256,6 +256,7 @@ int mem_matesw(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co kswr_t aln; mem_alnreg_t b; int tmp, xtra = KSW_XSUBO | KSW_XSTART | (l_ms * opt->a < 250? KSW_XBYTE : 0) | (opt->min_seed_len * opt->a); + //fprintf(stderr, "%d\n", xtra); aln = ksw_align2(l_ms, seq, re - rb, ref, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, xtra, 0); memset(&b, 0, sizeof(mem_alnreg_t)); if (aln.score >= opt->min_seed_len && aln.qb >= 0) { // something goes wrong if aln.qb < 0 @@ -547,7 +548,7 @@ void mem_reorder_primary5(int T, mem_alnreg_v *a); #define raw_mapq(diff, a) ((int)(6.02 * (diff) / (a) + .499)) -int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2], seq_sam_t ss[2]) +int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2], seq_sam_t ss[2], int tid) { extern int mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id); extern int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a); @@ -562,6 +563,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co memset(h, 0, sizeof(mem_aln_t) * 2); memset(g, 0, sizeof(mem_aln_t) * 2); n_aa[0] = n_aa[1] = 0; + PROF_START(matesw); if (!(opt->flag & MEM_F_NO_RESCUE)) { // then perform SW for the best alignment mem_alnreg_v b[2]; kv_init(b[0]); kv_init(b[1]); @@ -574,6 +576,8 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co n += mem_matesw(opt, bns, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]); free(b[0].a); free(b[1].a); } + PROF_END(tprof[T_SAM_MATESW][tid], matesw); + n_pri[0] = mem_mark_primary_se(opt, a[0].n, a[0].a, id<<1|0); n_pri[1] = mem_mark_primary_se(opt, a[1].n, a[1].a, id<<1|1); if (opt->flag & MEM_F_PRIMARY5) { @@ -637,7 +641,9 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co } else XA[0] = XA[1] = 0; // write SAM for (i = 0; i < 2; ++i) { + PROF_START(reg2aln); h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[z[i]]); + PROF_END(tprof[T_SAM_REG2ALN][tid], reg2aln); h[i].mapq = q_se[i]; h[i].flag |= 0x40< +#include "debug.h" + +FILE *gf[4] = {0}, + *gfq[4] = {0}, + *gft[4] = {0}, + *gfi[4] = {0}; + +int open_qti_files() +{ + char fn[1024] = {0}; + int i = 0; + for (; i < 4; ++i) + { + sprintf(fn, "./output/q%d.txt", i); + gfq[i] = fopen(fn, "w"); + sprintf(fn, "./output/t%d.txt", i); + gft[i] = fopen(fn, "w"); + sprintf(fn, "./output/i%d.txt", i); + gfi[i] = fopen(fn, "w"); + } + return 0; +} + +int open_debug_files() +{ + char fn[1024] = {0}; + int i = 0; + for (; i < 4; ++i) + { + sprintf(fn, "./output/fp%d.txt", i); + gf[i] = fopen(fn, "w"); + } + return 0; +} + +int close_files() +{ + int i = 0; + for (; i < 4; ++i) + { + if (gf[i] != 0) + fclose(gf[i]); + if (gfq[i] != 0) + fclose(gfq[i]); + if (gft[i] != 0) + fclose(gft[i]); + if (gfi[i] != 0) + fclose(gfi[i]); + } + return 0; +} \ No newline at end of file diff --git a/debug.h b/debug.h new file mode 100644 index 0000000..9f285ca --- /dev/null +++ b/debug.h @@ -0,0 +1,28 @@ +/********************************************************************************************* + Description: data and files for debugging + Copyright : All right reserved by NCIC.ICT + + Author : Zhang Zhonghai + Date : 2024/04/09 +***********************************************************************************************/ +#include + +////////////////// for debug and test ////////////////////////// + +#define DEBUG_FILE_OUTPUT // 打开gfp1-4文件,并记录debug信息 +// #define COUNT_SEED_LENGTH // 记录seed匹配数量降低到1时的长度,以及最终扩展的长度 +// #define GET_FULL_MATCH_READ // 获取完全匹配的reads +// #define COUNT_CALC_NUM // 统计BSW的剪枝后的计算量和未剪枝前的计算量 +// #define GET_DIFFERENT_EXTENSION_LENGTH // 获取不同长度extension的query,target,和其他用于计算的数据 +// #define GET_KSW_ALIGN_SEQ +// #define DEBUG_SW_EXTEND // 将bsw的分值输入到debug文件里 + +//////////////////////////////////////////////////////////////// + +extern FILE *gf[4], *gfq[4], *gft[4], *gfi[4]; + +int open_qti_files(); + +int open_debug_files(); + +int close_files(); \ No newline at end of file diff --git a/debug.sh b/debug.sh index 77d8bc6..34bda87 100755 --- a/debug.sh +++ b/debug.sh @@ -1,8 +1,8 @@ thread=1 -n_r1=~/data/fastq/dataset/na12878_wgs_150/diff_r1.fq -n_r2=~/data/fastq/dataset/na12878_wgs_150/diff_r2.fq -#n_r1=~/data/fastq/dataset/na12878_wes_144/diff_r1.fq -#n_r2=~/data/fastq/dataset/na12878_wes_144/diff_r2.fq +#n_r1=~/data/fastq/dataset/na12878_wgs_150/diff_r1.fq +#n_r2=~/data/fastq/dataset/na12878_wgs_150/diff_r2.fq +n_r1=~/data/fastq/dataset/na12878_wes_144/diff_r1.fq +n_r2=~/data/fastq/dataset/na12878_wes_144/diff_r2.fq #n_r1=~/data/fastq/ZY2105177532213000/n_r1.fq #n_r2=~/data/fastq/ZY2105177532213000/n_r2.fq #n_r1=~/data/fastq/ZY2105177532213000/sn_r1.fq @@ -20,7 +20,7 @@ n_r2=~/data/fastq/dataset/na12878_wgs_150/diff_r2.fq #n_r2=~/fastq/diff_all_r2.fq #n_r1=~/fastq/d_r1.fq #n_r2=~/fastq/d_r2.fq -reference=~/reference/human_g1k_v37_decoy.fasta +reference=~/data1/fmt_ref/human_g1k_v37_decoy.fasta #out=./ssn.sam out=./out.sam #out=/dev/null diff --git a/fastmap.c b/fastmap.c index 007d71a..0529960 100644 --- a/fastmap.c +++ b/fastmap.c @@ -42,60 +42,10 @@ #include "kseq.h" #include "fmt_idx.h" #include "yarn.h" +#include "profiling.h" +#include "debug.h" KSEQ_DECLARE(gzFile) -// 记录运行时间的变量 -/* -#ifdef SHOW_PERF -int64_t time_ksw_extend2 = 0, - time_ksw_global2 = 0, - time_ksw_align2 = 0, - time_bwt_smem1a = 0, - time_seed_1 = 0, - time_seed_2 = 0, - time_seed_3 = 0, - time_fmt_smem_0 = 0, - time_bwt_extend = 0, - time_bwt_occ4 = 0, - time_bwt_sa = 0, - time_bwt_sa_read = 0, - time_bns = 0, - time_work1 = 0, - time_work2 = 0, - time_flt_chain = 0; - -int64_t dn = 0, n16 = 0, n17 = 0, n18 = 0, n19 = 0, nall = 0, num_sa = 0; -int64_t s1n = 0, s2n = 0, s3n = 0, s1l = 0, s2l = 0, s3l = 0; -int64_t gn[100] = {0}; -int64_t g_num_smem2 = 0; -#endif -*/ - -#ifdef SHOW_PERF -uint64_t time_process_data = 0, time_read = 0, time_write = 0, time_compute = 0, - time_seed_1 = 0, time_seed_2 = 0, time_seed_3 = 0, - time_seed_sa = 0, time_seed_chain = 0, time_seed_all = 0, - time_bsw = 0, time_bsw_all = 0, - time_work_kernel = 0, time_work_sam = 0, - time_load_idx = 0; - -uint64_t proc_freq = 1000; - -#endif - -#ifdef SHOW_DATA_PERF -/* -gdat[0]: read nums -gdat[1]: seed-1 full match nums -*/ -int64_t gdat[100]; -#endif - -#ifdef DEBUG_OUTPUT -FILE * gfp1, *gfp2, *gfp3, *gfp4; -FILE *gfq[4], *gft[4], *gfi[4]; -#endif - extern unsigned char nst_nt4_table[256]; void *kopen(const char *fn, int *_fd); @@ -134,28 +84,12 @@ typedef struct { static inline void *read_data(ktp_aux_t *aux, ktp_data_t *data) { -#ifdef SHOW_PERF - int64_t tmp_time1, tmp_time2; -#if USE_RDTSC - tmp_time1 = __rdtsc(); -#else - tmp_time1 = realtime_msec(); -#endif -#endif - + PROF_START(read); ktp_data_t *ret = aux->data + aux->data_idx; aux->data_idx = !aux->data_idx; int64_t size = 0; bseq_read(aux->actual_chunk_size, &ret->n_seqs, aux->ks, aux->ks2, aux->copy_comment, &size, &ret->m_seqs, &ret->seqs); - -#ifdef SHOW_PERF -#if USE_RDTSC - tmp_time2 = __rdtsc(); -#else - tmp_time2 = realtime_msec(); -#endif - time_read += tmp_time2 - tmp_time1; -#endif + PROF_END(gprof[G_READ], read); #if 0 // 计算内存占用 @@ -179,15 +113,7 @@ static inline void *read_data(ktp_aux_t *aux, ktp_data_t *data) // calculate static inline void *calc_data(ktp_aux_t *aux, ktp_data_t *data) { -#ifdef SHOW_PERF - int64_t tmp_time1, tmp_time2; -#if USE_RDTSC - tmp_time1 = __rdtsc(); -#else - tmp_time1 = realtime_msec(); -#endif -#endif - + PROF_START(compute); const mem_opt_t *opt = aux->opt; if (data->n_sams != data->n_seqs) { if (data->m_sams < data->m_seqs) { @@ -225,15 +151,8 @@ static inline void *calc_data(ktp_aux_t *aux, ktp_data_t *data) free(ss[0]); free(ss[1]); } else mem_process_seqs(opt, aux->w, aux->n_processed, data->n_seqs, data->seqs, aux->pes0, data->sams); aux->n_processed += data->n_seqs; + PROF_END(gprof[G_COMPUTE], compute); -#ifdef SHOW_PERF -#if USE_RDTSC - tmp_time2 = __rdtsc(); -#else - tmp_time2 = realtime_msec(); -#endif - time_compute += tmp_time2 - tmp_time1; -#endif return data; } @@ -241,15 +160,8 @@ static inline void *calc_data(ktp_aux_t *aux, ktp_data_t *data) static inline void *write_data(ktp_aux_t *aux, ktp_data_t *data) { int i; -#ifdef SHOW_PERF - int64_t tmp_time1, tmp_time2; -#if USE_RDTSC - tmp_time1 = __rdtsc(); -#else - tmp_time1 = realtime_msec(); -#endif -#endif - //int64_t ms = 0; + PROF_START(write); + // int64_t ms = 0; int buf_written = 0; for (i = 0; i < data->n_sams; ++i) { @@ -285,15 +197,7 @@ static inline void *write_data(ktp_aux_t *aux, ktp_data_t *data) //} //fprintf(stderr, "sam size: %ld M\n", ms / 1024 / 1024); - -#ifdef SHOW_PERF -#if USE_RDTSC - tmp_time2 = __rdtsc(); -#else - tmp_time2 = realtime_msec(); -#endif - time_write += tmp_time2 - tmp_time1; -#endif + PROF_END(gprof[G_WRITE], write); return 0; } @@ -440,23 +344,9 @@ static void update_a(mem_opt_t *opt, const mem_opt_t *opt0) int main_mem(int argc, char *argv[]) { - -#ifdef DEBUG_OUTPUT - gfp1 = fopen("./fp1.txt", "w"); - gfp2 = fopen("./fp2.txt", "w"); - gfp3 = fopen("./fp3.txt", "w"); - gfp4 = fopen("./fp4.txt", "w"); - char fnbuf[1024]; - int ii; - for (ii = 0; ii < 4; ++ii) - { - sprintf(fnbuf, "./ext_out/q_%d.fa", ii); - gfq[ii] = fopen(fnbuf, "w"); - sprintf(fnbuf, "./ext_out/t_%d.fa", ii); - gft[ii] = fopen(fnbuf, "w"); - sprintf(fnbuf, "./ext_out/i_%d.txt", ii); - gfi[ii] = fopen(fnbuf, "w"); - } +#ifdef DEBUG_FILE_OUTPUT + open_debug_files(); + // open_debug_files #endif #ifdef SHOW_PERF @@ -469,6 +359,8 @@ int main_mem(int argc, char *argv[]) #endif #endif + PROF_START(all); + PROF_START(prepare); mem_opt_t *opt, opt0; int fd, fd2, i, c, ignore_alt = 0, no_mt_io = 0; int fixed_chunk_size = -1; @@ -694,16 +586,9 @@ int main_mem(int argc, char *argv[]) } } else update_a(opt, &opt0); bwa_fill_scmat(opt->a, opt->b, opt->mat); + PROF_END(gprof[G_PREPARE], prepare); -#ifdef SHOW_PERF - int64_t tmp_time1, tmp_time2; -#if USE_RDTSC - tmp_time1 = __rdtsc(); -#else - tmp_time1 = realtime_msec(); -#endif -#endif - + PROF_START(idx); aux.idx = bwa_idx_load_from_shm(argv[optind]); if (aux.idx == 0) { if ((aux.idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak @@ -712,15 +597,7 @@ int main_mem(int argc, char *argv[]) if (ignore_alt) for (i = 0; i < aux.idx->bns->n_seqs; ++i) aux.idx->bns->anns[i].is_alt = 0; - -#ifdef SHOW_PERF -#if USE_RDTSC - tmp_time2 = __rdtsc(); -#else - tmp_time2 = realtime_msec(); -#endif - time_load_idx += tmp_time2 - tmp_time1; -#endif + PROF_END(gprof[G_LOAD_IDX], idx); ko = kopen(argv[optind + 1], &fd); if (ko == 0) { @@ -757,84 +634,24 @@ int main_mem(int argc, char *argv[]) aux.wbuf_size = 16777216; aux.wbuf = malloc(aux.wbuf_size); -#ifdef SHOW_PERF -#if USE_RDTSC - tmp_time1 = __rdtsc(); -#else - tmp_time1 = realtime_msec(); -#endif -#endif - + PROF_START(pipeline); if (no_mt_io == 2) new_pipeline(&aux); else kt_pipeline(no_mt_io? 1 : 2, process, &aux, 3); + PROF_END(gprof[G_PIPELINE], pipeline); -#ifdef SHOW_PERF -#if USE_RDTSC - tmp_time2 = __rdtsc(); -#else - tmp_time2 = realtime_msec(); -#endif - time_process_data += tmp_time2 - tmp_time1; -#endif - - free(hdr_line); - free(opt); - bwa_idx_destroy(aux.idx); - kseq_destroy(aux.ks); +// free(hdr_line); +// free(opt); +// bwa_idx_destroy(aux.idx); +// kseq_destroy(aux.ks); err_gzclose(fp); kclose(ko); if (aux.ks2) { - kseq_destroy(aux.ks2); +// kseq_destroy(aux.ks2); err_gzclose(fp2); kclose(ko2); } + PROF_END(gprof[G_ALL], all); #ifdef SHOW_PERF - uint64_t avg_seed_1 = 0, avg_seed_2 = 0, avg_seed_3 = 0, - avg_sa = 0, avg_chain = 0, avg_seed_all = 0, - avg_bsw = 0, avg_bsw_all = 0; - uint64_t min_seed_all = UINT64_MAX, min_bsw_all = UINT64_MAX; - uint64_t max_seed_all = 0, max_bsw_all = 0; - - for (i = 0; i < opt->n_threads; ++i) { - avg_seed_1 += aux.w->aux[i]->time_seed_1; - avg_seed_2 += aux.w->aux[i]->time_seed_2; - avg_seed_3 += aux.w->aux[i]->time_seed_3; - avg_sa += aux.w->aux[i]->time_sa; - avg_chain += aux.w->aux[i]->time_chain; - avg_seed_all += aux.w->aux[i]->time_seed_all; - - avg_bsw += aux.w->aux[i]->time_bsw; - avg_bsw_all += aux.w->aux[i]->time_bsw_all; - - min_seed_all = MIN(min_seed_all, aux.w->aux[i]->time_seed_all); - min_bsw_all = MIN(min_bsw_all, aux.w->aux[i]->time_bsw_all); - max_seed_all = MAX(max_seed_all, aux.w->aux[i]->time_seed_all); - max_bsw_all = MAX(max_bsw_all, aux.w->aux[i]->time_bsw_all); - } - - fprintf(stderr, "\n"); - fprintf(stderr, "time_load_idx: %0.2lf s\n", time_load_idx * 1.0 / proc_freq); - fprintf(stderr, "time_process_data: %0.2lf s\n", time_process_data * 1.0 / proc_freq); - fprintf(stderr, "time_read: %0.2lf s\n", time_read * 1.0 / proc_freq); - fprintf(stderr, "time_compute: %0.2lf s\n", time_compute * 1.0 / proc_freq); - fprintf(stderr, "time_write: %0.2lf s\n", time_write * 1.0 / proc_freq); - - fprintf(stderr, "time_work_kernel: %0.2lf s\n", time_work_kernel * 1.0 / proc_freq); - fprintf(stderr, "time_work_sam: %0.2lf s\n", time_work_sam * 1.0 / proc_freq); - fprintf(stderr, "time_process_seq: %0.2lf s\n", (time_work_kernel + time_work_sam) * 1.0 / proc_freq); - - fprintf(stderr, "time_seed_all: %0.2lf (%0.2lf, %0.2lf) s\n", avg_seed_all * 1.0 / proc_freq / opt->n_threads, min_seed_all * 1.0 / proc_freq, max_seed_all * 1.0 / proc_freq); - fprintf(stderr, "time_sa: %0.2lf s\n", avg_sa * 1.0 / proc_freq / opt->n_threads); - fprintf(stderr, "time_chain: %0.2lf s\n", avg_chain * 1.0 / proc_freq / opt->n_threads); - - fprintf(stderr, "time_bsw_all: %0.2lf (%0.2lf, %0.2lf) s\n", avg_bsw_all * 1.0 / proc_freq / opt->n_threads, min_bsw_all * 1.0 / proc_freq, max_bsw_all * 1.0 / proc_freq); - fprintf(stderr, "\n"); - fprintf(stderr, "time_seed_1: %0.2lf s\n", avg_seed_1 * 1.0 / proc_freq / opt->n_threads); - fprintf(stderr, "time_seed_2: %0.2lf s\n", avg_seed_2 * 1.0 / proc_freq / opt->n_threads); - fprintf(stderr, "time_seed_3: %0.2lf s\n", avg_seed_3 * 1.0 / proc_freq / opt->n_threads); - fprintf(stderr, "time_seed_chain: %0.2lf s\n", (avg_seed_all + avg_chain) * 1.0 / proc_freq / opt->n_threads); - fprintf(stderr, "time_bsw: %0.2lf s\n", avg_bsw * 1.0 / proc_freq / opt->n_threads); - fprintf(stderr, "\n"); - + display_stats(opt->n_threads); #endif #ifdef SHOW_DATA_PERF @@ -843,17 +660,8 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, "\n"); #endif -#ifdef DEBUG_OUTPUT - fclose(gfp1); - fclose(gfp2); - fclose(gfp3); - fclose(gfp4); - for (ii = 0; ii < 4; ++ii) - { - fclose(gfq[ii]); - fclose(gft[ii]); - fclose(gfi[ii]); - } +#ifdef DEBUG_FILE_OUTPUT + close_files(); #endif return 0; diff --git a/fmt_idx.c b/fmt_idx.c index d27faec..3290853 100644 --- a/fmt_idx.c +++ b/fmt_idx.c @@ -16,22 +16,7 @@ Date : 2023/12/24 #include "bntseq.h" #include "kvec.h" #include "kstring.h" - -//const static char BASE[4] = {'A', 'C', 'G', 'T'}; - -// 生成所有KMER_LEN长度的序列,字符串表示 -//void gen_all_seq(char **seq_arr, int kmer_len) -//{ -// uint32_t seq_up_val = (1 << (kmer_len << 1)); -// for (uint32_t i = 0; i < seq_up_val; ++i) -// { -// seq_arr[i] = (char *)malloc(kmer_len); -// for (int j = kmer_len - 1; j >= 0; --j) -// { -// seq_arr[i][kmer_len - 1 - j] = BASE[(i >> (j << 1)) & 3]; -// } -// } -//} +#include "debug.h" // 生成occ,每个字节对应一个pattern void fmt_gen_cnt_occ(FMTIndex *fmt) @@ -750,7 +735,7 @@ inline void fmt_direct_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok } // 序列和参考基因直接对比 -static void direct_extend(const FMTIndex *fmt, int len, const uint8_t *q, int left_pos, int right_pos, bwtint_t mtx_line, bwtintv_t *mt) +static void direct_extend(const FMTIndex *fmt, int len, const uint8_t *q, int left_pos, int right_pos, bwtint_t mtx_line, bwtint_t ref_pos, bwtintv_t *mt, int mtidx) { #define PAC_BASE(pac, l) ((pac)[(l) >> 2] >> ((~(l) & 3) << 1) & 3) #define EXTEND_BASE_LOOP(qcond, rcond, qstep, rstep) \ @@ -775,7 +760,8 @@ static void direct_extend(const FMTIndex *fmt, int len, const uint8_t *q, int le int k; int64_t r, rp; mt->num_match = 1; - rp = fmt_sa(fmt, mtx_line); + // rp = fmt_sa(fmt, mtx_line); + rp = ref_pos; r = rp >= fmt->l_pac ? (fmt->l_pac << 1) - 1 - rp : rp; k = right_pos; if (rp < fmt->l_pac) // 匹配到了正向链 @@ -783,30 +769,30 @@ static void direct_extend(const FMTIndex *fmt, int len, const uint8_t *q, int le // 向前继续扩展 r += right_pos - left_pos; EXTEND_BASE_LOOP(len, fmt->l_pac, 1, 1); - mt->rm[0].qe = k; - mt->rm[0].reverse = 0; + mt->rm[mtidx].qe = k; + mt->rm[mtidx].reverse = 0; // 向后扩展,x位置之前的碱基 r -= k - left_pos + 1; k = left_pos - 1; EXTEND_BASE_LOOP(-1, -1, -1, -1); - mt->rm[0].qs = k + 1; - mt->rm[0].rs = r + 1; + mt->rm[mtidx].qs = k + 1; + mt->rm[mtidx].rs = r + 1; } else // 匹配到了互补链 { r -= right_pos - left_pos; EXTEND_BASE_LOOP_COMP(len, -1, 1, -1); - mt->rm[0].qe = k; - mt->rm[0].reverse = 1; + mt->rm[mtidx].qe = k; + mt->rm[mtidx].reverse = 1; // 扩展x之前的碱基 r += k - left_pos + 1; k = left_pos - 1; EXTEND_BASE_LOOP_COMP(-1, fmt->l_pac, -1, 1); - mt->rm[0].qs = k + 1; - mt->rm[0].rs = r - 1; + mt->rm[mtidx].qs = k + 1; + mt->rm[mtidx].rs = r - 1; } - mt->info = mt->rm[0].qs; - mt->info = mt->info << 32 | mt->rm[0].qe; + mt->info = mt->rm[mtidx].qs; + mt->info = mt->info << 32 | mt->rm[mtidx].qe; mt->x[2] = 1; } @@ -863,16 +849,15 @@ int fmt_smem_forward(const FMTIndex *fmt, int len, const uint8_t *q, int x, int #if 1 if (min_intv == 1 && ok2.x[2] == min_intv) { -#ifdef DEBUG_OUTPUT +#ifdef DEBUG_FILE_OUTPUT #ifdef COUNT_SEED_LENGTH - fprintf(gfp1, "%d\t", i + 2 - x); + fprintf(gf[0], "%d\t", i + 2 - x); #endif #endif - direct_extend(fmt, len, q, x, i + 2, ok2.x[0], &mt); - //mt.x[0] = ok2.x[0]; -#ifdef DEBUG_OUTPUT + direct_extend(fmt, len, q, x, i + 2, ok2.x[0], fmt_sa(fmt, ok2.x[0]), &mt, 0); +#ifdef DEBUG_FILE_OUTPUT #if 0 - fprintf(gfp1, "mt %ld %ld\n", ok2.x[0], ok2.x[1]); + fprintf(gf[0], "mt %ld %ld\n", ok2.x[0], ok2.x[1]); #endif #endif kv_push(bwtintv_t, *mem, mt); @@ -939,6 +924,7 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv if (x == 0 || q[x - 1] > 3) return fmt_smem_forward(fmt, len, q, x, min_intv, min_seed_len, mem); + // int flag = 0; int i, j, ret, kmer_len; bwtintv_t ik = {0}, ok1 = {0}, ok2 = {0}; bwtintv_t mt = {0}; @@ -947,8 +933,7 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv mem->n = 0; if (q[x] > 3) return x + 1; - if (min_intv < 1) - min_intv = 1; // the interval size should be at least 1 + if (min_intv < 1) min_intv = 1; // the interval size should be at least 1 curr = tmpvec; // use the temporary vector if provided qbit = build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len); @@ -965,6 +950,7 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv } \ iv = ov; \ iv.info = end_pos + #define PUSH_VAL_AND_SKIP(iv) \ do \ { \ @@ -972,6 +958,22 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv goto backward_search; \ } while (0) +#define DIRECT_EXTEND_2_SEED(idx0, idx1, qspos) \ + mt.rm[idx1].qs = MAX(mt.rm[idx0].qs, lm->rm[0].qs); \ + mt.rm[idx1].qe = MIN(mt.rm[idx0].qe, lm->rm[0].qe); \ + mt.rm[idx0].qs = mt.rm[idx1].qs; \ + mt.rm[idx0].qe = mt.rm[idx1].qe; \ + mt.rm[idx1].reverse = lm->rm[0].reverse; \ + left_ext = qspos - mt.rm[0].qs; \ + if (mt.rm[0].reverse) \ + mt.rm[0].rs = (fmt->l_pac << 1) - 1 - (s1 - left_ext); \ + else \ + mt.rm[0].rs = s1 - left_ext; \ + if (mt.rm[1].reverse) \ + mt.rm[1].rs = (fmt->l_pac << 1) - 1 - (s2 - left_ext); \ + else \ + mt.rm[1].rs = s2 - left_ext; + // 处理kmer对应的匹配信息 for (curr->n = 0, j = 1; j < kmer_len; ++j) { @@ -992,37 +994,55 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2); __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2); CHECK_INTV_CHANGE(ik, ok1, i + 1); -#if 0 - if (min_intv == 2 && ok1.x[2] == 2 && ok2.x[2] == 1 && lm && lm->num_match == 1) - { - bwtint_t bwt_mtx_row = ok1.x[0]; - if (bwt_mtx_row == ok2.x[0]) - bwt_mtx_row = ok1.x[0] + 1; - direct_extend(fmt, len, q, x, i + 1, bwt_mtx_row, &mt); - kv_push(bwtintv_t, *mem, mt); - //fprintf(gfp1, "cond-appear: %d %d %ld %ld %ld %ld\n", x, i + 2, ok1.x[0], ok1.x[1], ok2.x[0], ok2.x[1]); - //print_flag = 1; - } - else - { - CHECK_INTV_CHANGE(ik, ok2, i + 2); - } -#else CHECK_INTV_CHANGE(ik, ok2, i + 2); -#endif #if 1 // 在这里进行判断是否只有一个候选了 if (min_intv == 1 && ok2.x[2] == min_intv) { - direct_extend(fmt, len, q, x, i + 2, ok2.x[0], &mt); - //mt.x[0] = ok2.x[0]; - //fprintf(gfp1, "mt %ld %ld\n", ok2.x[0], ok2.x[1]); + direct_extend(fmt, len, q, x, i + 2, ok2.x[0], fmt_sa(fmt, ok2.x[0]), & mt, 0); + //fprintf(gf[0], "mt %ld %ld\n", ok2.x[0], ok2.x[1]); kv_push(bwtintv_t, *mem, mt); // 这里可以判断一下是否大于min_seed_len ret = (uint32_t)mt.info; if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3) goto fmt_smem_end; goto backward_search; - } // if (min_intv == 2 && ok2.x[2] == min_intv && !print_flag) + } +#if 0 + // 判断只有两个候选的情况 + else if (min_intv == 2 && ok2.x[2] == min_intv && lm && lm->num_match && lm->rm[0].qe > (i+1) && i+2-x >= min_seed_len) { + bwtint_t s1 = fmt_sa(fmt, ok2.x[0]); + bwtint_t s2 = fmt_sa(fmt, ok2.x[0]+1); + int left_ext; + bwtint_t lmrp = lm->rm[0].rs; + if (lm->rm[0].reverse) lmrp = (fmt->l_pac << 1) - 1 - lmrp; + + if (lmrp + x == s1 + lm->rm[0].qs) { // s1是lm的子集 + direct_extend(fmt, len, q, x, i + 2, ok2.x[0] + 1, s2, &mt, 1); + DIRECT_EXTEND_2_SEED(1, 0, x); + } + else + { + direct_extend(fmt, len, q, x, i + 2, ok2.x[0], s1, &mt, 0); + DIRECT_EXTEND_2_SEED(0, 1, x); + } + mt.info = mt.rm[0].qs; + mt.info = mt.info << 32 | mt.rm[0].qe; + mt.num_match = 2; + mt.x[2] = 2; + kv_push(bwtintv_t, *mem, mt); + ret = (uint32_t)mt.info; + + //fprintf(gf[0], "[de][mt] qs:%d, qe:%d, rs:%u, %ld, reverse:%d\n", lm->rm[0].qs, lm->rm[0].qe, lm->rm[0].rs, lmrp, lm->rm[0].reverse); + //fprintf(gf[0], "[de][sd] qs:%d, qe:%d, rs:%ld, %ld\n", x, i+2, s1, s2); + //s1 = mt.rm[0].rs; if (mt.rm[0].reverse) s1 = (fmt->l_pac << 1) - 1 - s1; + //s2 = mt.rm[1].rs; if (mt.rm[1].reverse) s2 = (fmt->l_pac << 1) - 1 - s2; + //fprintf(gf[0], "[de][re] qs:%d, qe:%d, rs:%u, %u, %ld, %ld\n", mt.rm[0].qs, mt.rm[0].qe, mt.rm[0].rs, mt.rm[1].rs, s1, s2); + + if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3) + goto fmt_smem_end; + goto backward_search; + } +#endif #endif } else if (q[i] < 4) // q[i+1] >= 4 @@ -1062,7 +1082,11 @@ backward_search: (intv).info |= (uint64_t)(pos) << 32; \ kv_push(bwtintv_t, *mem, (intv)); \ } - + // if (flag) + //{ + // fprintf(gf[0], "[ne][re] qs:%ld, qe:%d, rs:%ld, %ld\n", (intv).info >> 32, (int32_t)(intv).info, fmt_sa(fmt, (intv).x[0]), fmt_sa(fmt, (intv).x[0] + 1)); + // flag = 0; + // } #define CHECK_INTV_ADD_MEM(ok, pos, intv, mem) \ if (ok.x[2] < min_intv) \ { \ @@ -1112,6 +1136,73 @@ backward_search: CHECK_INTV_ADD_MEM(ok2, i, ok1, mem); ok2.info = p->info; *p = ok2; +#if 0 + // 在这里进行判断是否只有一个候选了 + if (min_intv == 1 && ok2.x[2] == min_intv) + { + int qs = i - 1; + int qe = (int)ok2.info; + if (mem->n > 0) + { + int qsl = (int)(mem->a[mem->n - 1].info >> 32); + int qel = (int)(mem->a[mem->n - 1].info); + if (qsl <= qs && qel >= qe) + break; + } + direct_extend(fmt, len, q, qs, qe, ok2.x[0], fmt_sa(fmt, ok2.x[0]), &mt, 0); + // fprintf(gf[0], "mt %ld %ld\n", ok2.x[0], ok2.x[1]); + kv_push(bwtintv_t, *mem, mt); // 这里可以判断一下是否大于min_seed_len + if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3) + goto fmt_smem_end; + break; + } +#endif +#if 0 + if (min_intv == 2 && ok2.x[2] == min_intv && lm && lm->num_match && lm->rm[0].qs < i && lm->rm[0].qe >= (uint32_t)ok2.info && (uint32_t)ok2.info - i + 1 >= min_seed_len) + { + int qs = i - 1; + int qe = (int)ok2.info; + if (mem->n > 0) + { + int qsl = (int)(mem->a[mem->n - 1].info >> 32); + int qel = (int)(mem->a[mem->n - 1].info); + if (qsl <= qs && qel >= qe) + break; + } +#if 1 + int left_ext; + bwtint_t s1 = fmt_sa(fmt, ok2.x[0]); + bwtint_t s2 = fmt_sa(fmt, ok2.x[0] + 1); + + bwtint_t lmrp = lm->rm[0].rs; + if (lm->rm[0].reverse) lmrp = (fmt->l_pac << 1) - 1 - lmrp; + + if (lmrp + qs == s1 + lm->rm[0].qs) + { // s1是lm的子集 + direct_extend(fmt, len, q, qs, qe, ok2.x[0] + 1, s2, &mt, 1); + DIRECT_EXTEND_2_SEED(1, 0, qs); + } + else + { + direct_extend(fmt, len, q, qs, qe, ok2.x[0], s1, &mt, 0); + DIRECT_EXTEND_2_SEED(0, 1, qs); + } + mt.info = mt.rm[0].qs; + mt.info = mt.info << 32 | mt.rm[0].qe; + mt.num_match = 2; + mt.x[2] = 2; + kv_push(bwtintv_t, *mem, mt); + // fprintf(gf[0], "[de][mt] qs:%d, qe:%d, rs:%u, %ld, reverse:%d\n", lm->rm[0].qs, lm->rm[0].qe, lm->rm[0].rs, lmrp, lm->rm[0].reverse); + // fprintf(gf[0], "[de][sd] qs:%d, qe:%d, rs:%ld, %ld\n", qs, qe, s1, s2); + // s1 = mt.rm[0].rs; if (mt.rm[0].reverse) s1 = (fmt->l_pac << 1) - 1 - s1; + // s2 = mt.rm[1].rs; if (mt.rm[1].reverse) s2 = (fmt->l_pac << 1) - 1 - s2; + // fprintf(gf[0], "[de][re] qs:%d, qe:%d, rs:%u, %u, %ld, %ld\n", mt.rm[0].qs, mt.rm[0].qe, mt.rm[0].rs, mt.rm[1].rs, s1, s2); + if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3) + goto fmt_smem_end; + break; +#endif + } +#endif } else if (q[i] < 4) // 只能扩展一个 { @@ -1152,7 +1243,316 @@ backward_search: } fmt_smem_end: - //if (mem->n == 0 && min_intv > 1 && print_flag) fprintf(gfp1, "\n"); + //if (mem->n == 0 && min_intv > 1 && print_flag) fprintf(gf[0], "\n"); + fmt_reverse_intvs(mem); // s.t. sorted by the start coordinate + return ret; +} + +// 找smem(seed)(lm: long_smem) +int fmt_smem_2(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, int min_seed_len, bwtintv_t *lm, bwtintv_v *mem, bwtintv_v *tmpvec, uint32_v qev) +{ + // int flag = 0; + int i, j, ret, kmer_len; + bwtintv_t ik = {0}, ok1 = {0}, ok2 = {0}; + bwtintv_t mt = {0}; + bwtintv_v *curr; + uint32_t qbit = 0; + mem->n = 0; + curr = tmpvec; // use the temporary vector if provided + + qbit = build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len); + bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, 0); // 初始碱基位置 + ik.info = x + 1; + //int print_flag = 0; +// check change of the interval size and whether the interval size is too small to be extended further +#define CHECK_INTV_CHANGE_2(iv, ov, end_pos) \ + if (ov.x[2] != iv.x[2]) \ + { \ + \ + kv_push(bwtintv_t, *curr, iv); \ + if (ov.x[2] < min_intv) \ + break; \ + } \ + iv = ov; \ + iv.info = end_pos + +#define PUSH_VAL_AND_SKIP_2(iv) \ + do \ + { \ + kv_push(bwtintv_t, *curr, iv); \ + goto backward_search; \ + } while (0) + + // 处理kmer对应的匹配信息 + for (curr->n = 0, j = 1; j < kmer_len; ++j) + { + bwt_kmer_get(&fmt->kmer_hash, &ok1, qbit, j); + CHECK_INTV_CHANGE_2(ik, ok1, x + j + 1); + } + if (kmer_len != HASH_KMER_LEN) // 遇到了N或者到了序列最后 + PUSH_VAL_AND_SKIP_2(ik); + + // 扩展kmer之后的碱基 + // __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1), 0, 2); + // __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1 + ik.x[2]), 0, 2); + for (i = (int)ik.info; i + 1 < len; i += 2) + { // forward search + if (q[i] < 4 && q[i + 1] < 4) + { + fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]); + __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2); + __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2); + CHECK_INTV_CHANGE_2(ik, ok1, i + 1); + CHECK_INTV_CHANGE_2(ik, ok2, i + 2); +#if 1 + // 在这里进行判断是否只有一个候选了 + if (min_intv == 1 && ok2.x[2] == min_intv) + { + direct_extend(fmt, len, q, x, i + 2, ok2.x[0], fmt_sa(fmt, ok2.x[0]), & mt, 0); + //fprintf(gf[0], "mt %ld %ld\n", ok2.x[0], ok2.x[1]); + kv_push(bwtintv_t, *mem, mt); // 这里可以判断一下是否大于min_seed_len + ret = (uint32_t)mt.info; + if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3) + goto fmt_smem_end; + goto backward_search; + } +#if 1 + // 判断只有两个候选的情况 + else if (min_intv == 2 && ok2.x[2] == min_intv && lm && lm->num_match && lm->rm[0].qe > (i+1) && i+2-x >= min_seed_len) { + bwtint_t s1 = fmt_sa(fmt, ok2.x[0]); + bwtint_t s2 = fmt_sa(fmt, ok2.x[0]+1); + int left_ext; + bwtint_t lmrp = lm->rm[0].rs; + if (lm->rm[0].reverse) lmrp = (fmt->l_pac << 1) - 1 - lmrp; + + if (lmrp + x == s1 + lm->rm[0].qs) { // s1是lm的子集 + direct_extend(fmt, len, q, x, i + 2, ok2.x[0] + 1, s2, &mt, 1); + DIRECT_EXTEND_2_SEED(1, 0, x); + } + else + { + direct_extend(fmt, len, q, x, i + 2, ok2.x[0], s1, &mt, 0); + DIRECT_EXTEND_2_SEED(0, 1, x); + } + mt.info = mt.rm[0].qs; + mt.info = mt.info << 32 | mt.rm[0].qe; + mt.num_match = 2; + mt.x[2] = 2; + kv_push(bwtintv_t, *mem, mt); + ret = (uint32_t)mt.info; + + //fprintf(gf[0], "[de][mt] qs:%d, qe:%d, rs:%u, %ld, reverse:%d\n", lm->rm[0].qs, lm->rm[0].qe, lm->rm[0].rs, lmrp, lm->rm[0].reverse); + //fprintf(gf[0], "[de][sd] qs:%d, qe:%d, rs:%ld, %ld\n", x, i+2, s1, s2); + //s1 = mt.rm[0].rs; if (mt.rm[0].reverse) s1 = (fmt->l_pac << 1) - 1 - s1; + //s2 = mt.rm[1].rs; if (mt.rm[1].reverse) s2 = (fmt->l_pac << 1) - 1 - s2; + //fprintf(gf[0], "[de][re] qs:%d, qe:%d, rs:%u, %u, %ld, %ld\n", mt.rm[0].qs, mt.rm[0].qe, mt.rm[0].rs, mt.rm[1].rs, s1, s2); + + if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3) + goto fmt_smem_end; + goto backward_search; + } +#endif +#endif + } + else if (q[i] < 4) // q[i+1] >= 4 + { + fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]); + CHECK_INTV_CHANGE_2(ik, ok1, i + 1); + PUSH_VAL_AND_SKIP_2(ik); + } + else // q[i] >= 4 + { + PUSH_VAL_AND_SKIP_2(ik); + } + } + for (; i == len - 1; ++i) // 扩展到了最后一个碱基 + { + if (q[i] < 4) + { + fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]); + CHECK_INTV_CHANGE_2(ik, ok1, i + 1); + } + else + PUSH_VAL_AND_SKIP_2(ik); + } + if (i == len) + kv_push(bwtintv_t, *curr, ik); // push the last interval if we reach the end + +backward_search: + fmt_reverse_intvs(curr); // s.t. smaller intervals (i.e. longer matches) visited first + if (mt.num_match == 0) + ret = curr->a[0].info; // this will be the returned value,扩展到的最远的位置 + else + ret = (uint32_t)mt.info; + // 按照种子进行遍历,反向扩展 +#define CHECK_ADD_MEM(pos, intv, mem) \ + if (((int)((intv).info) - (pos) >= min_seed_len) && (mem->n == 0 || (pos) < mem->a[mem->n - 1].info >> 32)) \ + { \ + (intv).info |= (uint64_t)(pos) << 32; \ + kv_push(bwtintv_t, *mem, (intv)); \ + } + // if (flag) + //{ + // fprintf(gf[0], "[ne][re] qs:%ld, qe:%d, rs:%ld, %ld\n", (intv).info >> 32, (int32_t)(intv).info, fmt_sa(fmt, (intv).x[0]), fmt_sa(fmt, (intv).x[0] + 1)); + // flag = 0; + // } +#define CHECK_INTV_ADD_MEM(ok, pos, intv, mem) \ + if (ok.x[2] < min_intv) \ + { \ + CHECK_ADD_MEM(pos, intv, mem); \ + break; \ + } + + int last_kmer_start = 0; + for (j = 0; j < curr->n; ++j) + { + bwtintv_t *p = &curr->a[j]; // 前向扩展的种子 + // __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1), 0, 2); + // __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1 + p->x[2]), 0, 2); + if (p->info - x < HASH_KMER_LEN) + { + if (last_kmer_start && kmer_len == HASH_KMER_LEN && p->info == last_kmer_start && p->info - kmer_len > 0 && q[p->info - kmer_len] < 4) + qbit = ((qbit << 2) | (3 - q[p->info - kmer_len])) & ((1L << (kmer_len << 1)) - 1); // 创建反向kmer + else + qbit = build_backward_kmer(q, p->info - 1, HASH_KMER_LEN, &kmer_len); // 创建反向kmer + last_kmer_start = p->info - 1; + i = 1; + do + { + bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, kmer_len - i++); + } while (ik.x[2] < min_intv); + if (i > 2) + continue; + p->x[0] = ik.x[1]; + p->x[1] = ik.x[0]; + p->x[2] = ik.x[2]; + i = p->info - (kmer_len - i + 3); + } + else + { + i = x - 1; + } + for (; i > 0; i -= 2) + { + if (q[i] < 4 && q[i - 1] < 4) // 两个都可以扩展 + { + // fmt_extend2(fmt, p, &ok1, &ok2, 1, q[i], q[i - 1]); + fmt_direct2_extend2(fmt, p, &ok1, &ok2, 1, q[i], q[i - 1]); + __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1), 0, 2); + __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1 + ok2.x[2]), 0, 2); + CHECK_INTV_ADD_MEM(ok1, i + 1, *p, mem); + ok1.info = p->info; + CHECK_INTV_ADD_MEM(ok2, i, ok1, mem); + ok2.info = p->info; + *p = ok2; +#if 0 + // 在这里进行判断是否只有一个候选了 + if (min_intv == 1 && ok2.x[2] == min_intv) + { + int qs = i - 1; + int qe = (int)ok2.info; + if (mem->n > 0) + { + int qsl = (int)(mem->a[mem->n - 1].info >> 32); + int qel = (int)(mem->a[mem->n - 1].info); + if (qsl <= qs && qel >= qe) + break; + } + direct_extend(fmt, len, q, qs, qe, ok2.x[0], fmt_sa(fmt, ok2.x[0]), &mt, 0); + // fprintf(gf[0], "mt %ld %ld\n", ok2.x[0], ok2.x[1]); + kv_push(bwtintv_t, *mem, mt); // 这里可以判断一下是否大于min_seed_len + if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3) + goto fmt_smem_end; + break; + } +#endif +#if 0 + if (min_intv == 2 && ok2.x[2] == min_intv && lm && lm->num_match && lm->rm[0].qs < i && lm->rm[0].qe >= (uint32_t)ok2.info && (uint32_t)ok2.info - i + 1 >= min_seed_len) + { + int qs = i - 1; + int qe = (int)ok2.info; + if (mem->n > 0) + { + int qsl = (int)(mem->a[mem->n - 1].info >> 32); + int qel = (int)(mem->a[mem->n - 1].info); + if (qsl <= qs && qel >= qe) + break; + } +#if 1 + int left_ext; + bwtint_t s1 = fmt_sa(fmt, ok2.x[0]); + bwtint_t s2 = fmt_sa(fmt, ok2.x[0] + 1); + + bwtint_t lmrp = lm->rm[0].rs; + if (lm->rm[0].reverse) lmrp = (fmt->l_pac << 1) - 1 - lmrp; + + if (lmrp + qs == s1 + lm->rm[0].qs) + { // s1是lm的子集 + direct_extend(fmt, len, q, qs, qe, ok2.x[0] + 1, s2, &mt, 1); + DIRECT_EXTEND_2_SEED(1, 0, qs); + } + else + { + direct_extend(fmt, len, q, qs, qe, ok2.x[0], s1, &mt, 0); + DIRECT_EXTEND_2_SEED(0, 1, qs); + } + mt.info = mt.rm[0].qs; + mt.info = mt.info << 32 | mt.rm[0].qe; + mt.num_match = 2; + mt.x[2] = 2; + kv_push(bwtintv_t, *mem, mt); + // fprintf(gf[0], "[de][mt] qs:%d, qe:%d, rs:%u, %ld, reverse:%d\n", lm->rm[0].qs, lm->rm[0].qe, lm->rm[0].rs, lmrp, lm->rm[0].reverse); + // fprintf(gf[0], "[de][sd] qs:%d, qe:%d, rs:%ld, %ld\n", qs, qe, s1, s2); + // s1 = mt.rm[0].rs; if (mt.rm[0].reverse) s1 = (fmt->l_pac << 1) - 1 - s1; + // s2 = mt.rm[1].rs; if (mt.rm[1].reverse) s2 = (fmt->l_pac << 1) - 1 - s2; + // fprintf(gf[0], "[de][re] qs:%d, qe:%d, rs:%u, %u, %ld, %ld\n", mt.rm[0].qs, mt.rm[0].qe, mt.rm[0].rs, mt.rm[1].rs, s1, s2); + if (mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3) + goto fmt_smem_end; + break; +#endif + } +#endif + } + else if (q[i] < 4) // 只能扩展一个 + { + // fmt_extend1(fmt, p, &ok1, 1, q[i]); + fmt_direct_extend1(fmt, p, &ok1, 1, q[i]); + CHECK_INTV_ADD_MEM(ok1, i + 1, *p, mem); + ok1.info = p->info; + CHECK_ADD_MEM(i, ok1, mem); + goto fmt_smem_end; + } + else + { // 不能扩展 + CHECK_ADD_MEM(i + 1, *p, mem); + goto fmt_smem_end; + } + } + for (; i == 0; --i) + { // 扩展到了第一个碱基 + if (q[i] < 4) + { + // fmt_extend1(fmt, p, &ok1, 1, q[i]); + fmt_direct_extend1(fmt, p, &ok1, 1, q[i]); + CHECK_INTV_ADD_MEM(ok1, i + 1, *p, mem); + ok1.info = p->info; + *p = ok1; + } + else + { + CHECK_ADD_MEM(i + 1, *p, mem); + goto fmt_smem_end; + } + } + if (i == -1) + { + CHECK_ADD_MEM(i + 1, *p, mem); + goto fmt_smem_end; + } + } + +fmt_smem_end: + //if (mem->n == 0 && min_intv > 1 && print_flag) fprintf(gf[0], "\n"); fmt_reverse_intvs(mem); // s.t. sorted by the start coordinate return ret; } diff --git a/fmt_idx.h b/fmt_idx.h index 3f07851..cb8436a 100644 --- a/fmt_idx.h +++ b/fmt_idx.h @@ -12,6 +12,7 @@ Date : 2023/12/24 #include #include #include "bwt.h" +#include "utils.h" #define FMT_OCC_INTV_SHIFT 8 #define FMT_OCC_INTERVAL (1 << FMT_OCC_INTV_SHIFT) @@ -109,6 +110,7 @@ void kmer_getval_at(uint8_t *mem_addr, bwtintv_t *ok, int pos); void fmt_kmer_get(const FMTIndex *fmt, bwtintv_t *ok, uint32_t qbit, int pos); // 找smem(seed) int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, int min_seed_len, bwtintv_t *lm, bwtintv_v *mem, bwtintv_v *tmpvec); +int fmt_smem_2(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, int min_seed_len, bwtintv_t *lm, bwtintv_v *mem, bwtintv_v *tmpvec, uint32_v qev); int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem); diff --git a/ksw.c b/ksw.c index 21fb7e8..02f4f19 100644 --- a/ksw.c +++ b/ksw.c @@ -38,14 +38,7 @@ #ifdef USE_MALLOC_WRAPPERS # include "malloc_wrap.h" #endif - -#ifdef __GNUC__ -#define LIKELY(x) __builtin_expect((x),1) -#define UNLIKELY(x) __builtin_expect((x),0) -#else -#define LIKELY(x) (x) -#define UNLIKELY(x) (x) -#endif +#include "profiling.h" const kswr_t g_defr = { 0, -1, -1, -1, -1, -1, -1 }; @@ -166,6 +159,7 @@ kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del _mm_store_si128(Hmax + i, zero); } // the core loop + PROF_START(loop); for (i = 0; i < tlen; ++i) { int j, k, imax; __m128i e, h, t, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector @@ -229,6 +223,8 @@ end_loop16: } S = H1; H1 = H0; H0 = S; // swap H0 and H1 } + PROF_END(gprof[G_KSW_LOOP], loop); + PROF_START(end_loop); r.score = gmax + q->shift < 255? gmax : 255; r.te = te; if (r.score != 255) { // get a->qe, the end of query match; find the 2nd best score @@ -249,6 +245,7 @@ end_loop16: } } free(b); + PROF_END(gprof[G_KSW_END_LOOP], end_loop); return r; } @@ -297,6 +294,7 @@ kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_de _mm_store_si128(Hmax + i, zero); } // the core loop + PROF_START(loop); for (i = 0; i < tlen; ++i) { int j, k, imax; __m128i e, t, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector @@ -347,7 +345,9 @@ end_loop8: if (gmax >= endsc) break; } S = H1; H1 = H0; H0 = S; + } + PROF_END(gprof[G_KSW_LOOP], loop); r.score = gmax; r.te = te; { int max = -1, tmp, low, high, qlen = slen * 8; @@ -376,18 +376,46 @@ static inline void revseq(int l, uint8_t *s) t = s[i], s[i] = s[l - 1 - i], s[l - 1 - i] = t; } +static void write_query_target_sequence(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int fnum) +{ +#ifdef DEBUG_FILE_OUTPUT + // 写到三个文件里,query.fa,target.fa,每行一个序列,info.txt,包含前缀得分h0,和长度信息qlen,tlen + FILE *query_f = gfq[fnum], + *target_f = gft[fnum], + *info_f = gfi[fnum]; + const char seq_map[5] = {'A', 'C', 'G', 'T', 'N'}; + int i; + // 处理query + for (i = 0; i < qlen; ++i) + fprintf(query_f, "%c", seq_map[query[i]]); + fprintf(query_f, "\n"); + // 处理target + for (i = 0; i < tlen; ++i) + fprintf(target_f, "%c", seq_map[target[i]]); + fprintf(target_f, "\n"); + // 处理其他信息 + fprintf(info_f, "%-8d%-8d\n", qlen, tlen); +#endif +} + kswr_t ksw_align2(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int xtra, kswq_t **qry) { int size; kswq_t *q; kswr_t r, rr; kswr_t (*func)(kswq_t*, int, const uint8_t*, int, int, int, int, int); - +#ifdef GET_KSW_ALIGN_SEQ + write_query_target_sequence(qlen, query, tlen, target, 0); +#endif q = (qry && *qry)? *qry : ksw_qinit((xtra&KSW_XBYTE)? 1 : 2, qlen, query, m, mat); + //q = (qry && *qry) ? *qry : ksw_qinit(2, qlen, query, m, mat); + //fprintf(stderr, "%d %d\n", tlen, qlen); if (qry && *qry == 0) *qry = q; func = q->size == 2? ksw_i16 : ksw_u8; size = q->size; r = func(q, tlen, target, o_del, e_del, o_ins, e_ins, xtra); + //fprintf(gf[0], "%d %d %d %d %d %d %d\n", r.score, r.qb, r.qe, r.tb, r.te, r.score2, r.te2); + //fprintf(gf[0], "%d %d %d %d %d\n", r.score, r.qb, r.qe, r.tb, r.te); if (qry == 0) free(q); if ((xtra&KSW_XSTART) == 0 || ((xtra&KSW_XSUBO) && r.score < (xtra&0xffff))) return r; revseq(r.qe + 1, query); revseq(r.te + 1, target); // +1 because qe/te points to the exact end, not the position after the end diff --git a/ksw_align_avx2.c b/ksw_align_avx2.c new file mode 100644 index 0000000..67ae098 --- /dev/null +++ b/ksw_align_avx2.c @@ -0,0 +1,75 @@ +#include +#include +#include +#include +#include +#include +#include +#include "utils.h" +#include "ksw.h" + +static const kswr_t g_defr = { 0, -1, -1, -1, -1, -1, -1 }; + +typedef struct _kswq_t { + int qlen, slen; + uint8_t shift, mdiff, max, size; + __m256i *qp, *H0, *H1, *E, *Hmax; +} kswq_t; + +/** + * Initialize the query data structure + * + * @param size Number of bytes used to store a score; valid valures are 1 or 2 + * @param qlen Length of the query sequence + * @param query Query sequence + * @param m Size of the alphabet (ACGTN) + * @param mat Scoring matrix in a one-dimension array + * + * @return Query data structure + */ +kswq_t *ksw_qinit_avx2(int size, int qlen, const uint8_t *query, int m, const int8_t *mat) +{ + kswq_t *q; + int slen, a, tmp, p; + size = size > 1 ? 2 : 1; + p = 16 * (3 - size); // # values per __m256i + slen = (qlen + p - 1) / p; // segmented length + q = (kswq_t *)malloc(sizeof(kswq_t) + 512 + 32 * slen * (m + 4)); // a single block of memory + q->qp = (__m256i *)(((size_t)q + sizeof(kswq_t) + 31) >> 5 << 5); // align memory + q->H0 = q->qp + slen * m; + q->H1 = q->H0 + slen; + q->E = q->H1 + slen; + q->Hmax = q->E + slen; + q->slen = slen; q->qlen = qlen; q->size = size; + // compute shift + tmp = m * m; + for (a = 0, q->shift = 127, q->mdiff = 0; a < tmp; ++a) { // find the minimum and maximum score + if (mat[a] < (int8_t)q->shift) q->shift = mat[a]; + if (mat[a] > (int8_t)q->mdiff) q->mdiff = mat[a]; + } + q->max = q->mdiff; + q->shift = 256 - q->shift; // NB: q->shift is uint8_t + q->mdiff += q->shift; // this is the difference between the min and max scores + // An example: p=8, qlen=19, slen=3 and segmentation: + // {{0,3,6,9,12,15,18,-1},{1,4,7,10,13,16,-1,-1},{2,5,8,11,14,17,-1,-1}} + if (size == 1) { + int8_t *t = (int8_t*)q->qp; + for (a = 0; a < m; ++a) { + int i, k, nlen = slen * p; + const int8_t *ma = mat + a * m; + for (i = 0; i < slen; ++i) + for (k = i; k < nlen; k += slen) // p iterations + *t++ = (k >= qlen? 0 : ma[query[k]]) + q->shift; + } + } else { + int16_t *t = (int16_t*)q->qp; + for (a = 0; a < m; ++a) { + int i, k, nlen = slen * p; + const int8_t *ma = mat + a * m; + for (i = 0; i < slen; ++i) + for (k = i; k < nlen; k += slen) // p iterations + *t++ = (k >= qlen? 0 : ma[query[k]]); + } + } + return q; +} \ No newline at end of file diff --git a/ksw_extend2_avx2.c b/ksw_extend2_avx2.c index 5903b5c..d1b6005 100644 --- a/ksw_extend2_avx2.c +++ b/ksw_extend2_avx2.c @@ -6,30 +6,15 @@ #include #include #include "utils.h" +#include "debug.h" -#ifdef DEBUG_OUTPUT -extern FILE *gfp1, *gfp2, *gfp3; -extern FILE *gfq[4], *gft[4], *gfi[4]; -#endif +#define ELIMINATE_DIFF_1 +// #define ELIMINATE_DIFF_3 #define NO_VAL -1 -#ifdef __GNUC__ -#define LIKELY(x) __builtin_expect((x),1) -#define UNLIKELY(x) __builtin_expect((x),0) -#else -#define LIKELY(x) (x) -#define UNLIKELY(x) (x) -#endif - -//#undef MAX -//#undef MIN -//#define MAX(x, y) ((x) > (y) ? (x) : (y)) -//#define MIN(x, y) ((x) < (y) ? (x) : (y)) #define SIMD_WIDTH 16 -// typedef struct { size_t m; uint8_t *addr; } buf_t; - extern int ksw_extend2_avx2_u8(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int is_left, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int a, int b, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off, buf_t *buf); @@ -92,13 +77,10 @@ static const uint16_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = { __m256i f1j1 = _mm256_loadu_si256((__m256i*) (&fA1[j-1])); \ __m256i h0j1 = _mm256_loadu_si256((__m256i*) (&hA0[j-1])); \ __m256i qs_vec = _mm256_loadu_si256((__m256i*) (&seq[j-1])); \ - __m256i ts_vec = _mm256_loadu_si256((__m256i*) (&ref[i])); + __m256i ts_vec = _mm256_loadu_si256((__m256i*) (&ref[tlen - i])); // 比对ref和seq的序列,计算罚分 #define SIMD_CMP_SEQ \ - ts_vec = _mm256_permute4x64_epi64(ts_vec, permute_mask); \ - ts_vec = _mm256_shufflelo_epi16(ts_vec, permute_mask); \ - ts_vec = _mm256_shufflehi_epi16(ts_vec, permute_mask); \ __m256i match_mask_vec = _mm256_cmpeq_epi16(qs_vec, ts_vec); \ __m256i mis_score_vec = _mm256_andnot_si256(match_mask_vec, mis_sc_vec); \ __m256i score_vec = _mm256_and_si256(match_sc_vec, match_mask_vec); \ @@ -152,7 +134,7 @@ static const uint16_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = { max_vec = _mm256_max_epu16(max_vec, _mm256_permute2x128_si256(max_vec, max_vec, 0x01)); \ int16_t *maxVal = (int16_t*)&max_vec; \ m = MAX(m, maxVal[0]); /*用来解决与BSW结果不一样的第二种情况(上边界)*/ \ - if (maxVal[0] > 0) { \ + if (maxVal[0] > 0 && m >= max) { \ for(j=beg, i=iend; j<=end; j+=SIMD_WIDTH, i-=SIMD_WIDTH) { \ __m256i h2_vec = _mm256_loadu_si256((__m256i*) (&hA2[j])); \ __m256i vcmp = _mm256_cmpeq_epi16(h2_vec, max_vec); \ @@ -174,9 +156,9 @@ static const uint16_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = { tmp = fA1; fA1 = fA2; fA2 = tmp; \ tmp = mA1; mA1 = mA2; mA2 = tmp; -void write_query_target_sequence(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int h0, int fnum) +static void write_query_target_sequence(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int h0, int fnum) { -#ifdef DEBUG_OUTPUT +#ifdef DEBUG_FILE_OUTPUT // 写到三个文件里,query.fa,target.fa,每行一个序列,info.txt,包含前缀得分h0,和长度信息qlen,tlen FILE *query_f = gfq[fnum], *target_f = gft[fnum], @@ -222,8 +204,8 @@ int ksw_extend2_avx2(int qlen, // query length 待匹配段碱基的query长度 { //return ksw_extend2_origin(qlen, query, tlen, target, is_left, m, mat, o_del, e_del, o_ins, e_ins, w, end_bonus, zdrop, h0, _qle, _tle, _gtle, _gscore, _max_off); -#ifdef DEBUG_OUTPUT - //fprintf(gfp1, "%d\n", qlen); +#ifdef DEBUG_FILE_OUTPUT + //fprintf(gf[0], "%d\n", qlen); #ifdef GET_DIFFERENT_EXTENSION_LENGTH if (qlen <= 30) { write_query_target_sequence(qlen, query, tlen, target, h0, 0); @@ -238,6 +220,7 @@ int ksw_extend2_avx2(int qlen, // query length 待匹配段碱基的query长度 #endif if (qlen * a + h0 < 255) return ksw_extend2_avx2_u8(qlen, query, tlen, target, is_left, m, mat, o_del, e_del, o_ins, e_ins, a, b, w, end_bonus, zdrop, h0, _qle, _tle, _gtle, _gscore, _max_off, buf); + int16_t *mA,*hA, *eA, *fA, *mA1, *mA2, *hA0, *hA1, *eA1, *fA1, *hA2, *eA2, *fA2; // hA0保存上上个col的H,其他的保存上个H E F M int16_t *seq, *ref; uint8_t *mem; @@ -267,10 +250,10 @@ int ksw_extend2_avx2(int qlen, // query length 待匹配段碱基的query长度 seq=&qtmem[0]; ref=&qtmem[seq_size]; if (is_left) { for (i=0; i abs(mj - mi) ? max_off : abs(mj - mi); } - else if (zdrop > 0) { + else if (zdrop > 0 && mi > -1) { if (mi - max_i > mj - max_j) { if (max - m - ((mi - max_i) - (mj - max_j)) * e_del > zdrop) break; } else { @@ -512,46 +494,46 @@ int ksw_extend2_avx2(int qlen, // query length 待匹配段碱基的query长度 SWAP_DATA_POINTER; } -#ifdef DEBUG_OUTPUT +#ifdef DEBUG_FILE_OUTPUT #ifdef DEBUG_SW_EXTEND - fprintf(gfp1, "qlen: %d, tlen: %d, h0: %d, w: %d, mi: %d, mj: %d, mie: %d, max_off: %d, score: %d, max: %d\n", qlen, tlen, h0, w, max_i + 1, max_j + 1, max_ie + 1, max_off, gscore, max); - fprintf(gfp2, "qlen: %d, tlen: %d, h0: %d, w: %d, mi: %d, mj: %d, mie: %d, max_off: %d, score: %d, max: %d\n", qlen, tlen, h0, w, max_i + 1, max_j + 1, max_ie + 1, max_off, gscore, max); - fprintf(gfp3, "qlen: %d, tlen: %d, h0: %d, w: %d, mi: %d, mj: %d, mie: %d, max_off: %d, score: %d, max: %d\n", qlen, tlen, h0, w, max_i + 1, max_j + 1, max_ie + 1, max_off, gscore, max); + fprintf(gf[0], "qlen: %d, tlen: %d, h0: %d, w: %d, mi: %d, mj: %d, mie: %d, max_off: %d, score: %d, max: %d\n", qlen, tlen, h0, w, max_i + 1, max_j + 1, max_ie + 1, max_off, gscore, max); + fprintf(gf[1], "qlen: %d, tlen: %d, h0: %d, w: %d, mi: %d, mj: %d, mie: %d, max_off: %d, score: %d, max: %d\n", qlen, tlen, h0, w, max_i + 1, max_j + 1, max_ie + 1, max_off, gscore, max); + fprintf(gf[2], "qlen: %d, tlen: %d, h0: %d, w: %d, mi: %d, mj: %d, mie: %d, max_off: %d, score: %d, max: %d\n", qlen, tlen, h0, w, max_i + 1, max_j + 1, max_ie + 1, max_off, gscore, max); - fprintf(gfp1, "%-4d", -1); - fprintf(gfp2, "%-4d", -1); - fprintf(gfp3, "%-4d", -1); - fprintf(gfp1, "%-4d", -1); - fprintf(gfp2, "%-4d", -1); - fprintf(gfp3, "%-4d", -1); + fprintf(gf[0], "%-4d", -1); + fprintf(gf[1], "%-4d", -1); + fprintf(gf[2], "%-4d", -1); + fprintf(gf[0], "%-4d", -1); + fprintf(gf[1], "%-4d", -1); + fprintf(gf[2], "%-4d", -1); for (djj = 0; djj < qlen; ++djj) { - fprintf(gfp1, "%-4c", "ACGTN"[query[djj]]); - fprintf(gfp2, "%-4c", "ACGTN"[query[djj]]); - fprintf(gfp3, "%-4c", "ACGTN"[query[djj]]); + fprintf(gf[0], "%-4c", "ACGTN"[query[djj]]); + fprintf(gf[1], "%-4c", "ACGTN"[query[djj]]); + fprintf(gf[2], "%-4c", "ACGTN"[query[djj]]); } - fprintf(gfp1, "\n"); - fprintf(gfp2, "\n"); - fprintf(gfp3, "\n"); + fprintf(gf[0], "\n"); + fprintf(gf[1], "\n"); + fprintf(gf[2], "\n"); for (dii = 0; dii <= tlen; ++dii) { if (dii > 0) { - fprintf(gfp1, "%-4c", "ACGTN"[target[dii - 1]]); - fprintf(gfp2, "%-4c", "ACGTN"[target[dii - 1]]); - fprintf(gfp3, "%-4c", "ACGTN"[target[dii - 1]]); + fprintf(gf[0], "%-4c", "ACGTN"[target[dii - 1]]); + fprintf(gf[1], "%-4c", "ACGTN"[target[dii - 1]]); + fprintf(gf[2], "%-4c", "ACGTN"[target[dii - 1]]); } else { - fprintf(gfp1, "%-4d", -1); - fprintf(gfp2, "%-4d", -1); - fprintf(gfp3, "%-4d", -1); + fprintf(gf[0], "%-4d", -1); + fprintf(gf[1], "%-4d", -1); + fprintf(gf[2], "%-4d", -1); } for (djj = 0; djj <= qlen; ++djj) { - fprintf(gfp1, "%-4d", score[dii][djj]); - fprintf(gfp2, "%-4d", ins[dii][djj]); - fprintf(gfp3, "%-4d", del[dii][djj]); + fprintf(gf[0], "%-4d", score[dii][djj]); + fprintf(gf[1], "%-4d", ins[dii][djj]); + fprintf(gf[2], "%-4d", del[dii][djj]); } - fprintf(gfp1, "\n"); - fprintf(gfp2, "\n"); - fprintf(gfp3, "\n"); + fprintf(gf[0], "\n"); + fprintf(gf[1], "\n"); + fprintf(gf[2], "\n"); } #endif #endif @@ -658,7 +640,7 @@ int ksw_extend2_origin(int qlen, // query length 待匹配段碱基的query长 ins[0][0] = del[0][0] = score[0][0] = h0; #endif -#ifdef DEBUG_OUTPUT +#ifdef DEBUG_FILE_OUTPUT #ifdef COUNT_CALC_NUM int bsw_cal_num = 0; int real_cal_num = 0; @@ -669,7 +651,7 @@ int ksw_extend2_origin(int qlen, // query length 待匹配段碱基的query长 if (beg >= end) break; bsw_cal_num += end - beg; } - fprintf(gfp1, "start\n%d\n", bsw_cal_num); + fprintf(gf[0], "start\n%d\n", bsw_cal_num); #endif #endif @@ -692,7 +674,7 @@ int ksw_extend2_origin(int qlen, // query length 待匹配段碱基的query长 //m = h1; // 用来解决和VP-BSW结果不一样的第一种情况(左边界) for (j = beg; LIKELY(j < end); ++j) { -#ifdef DEBUG_OUTPUT +#ifdef DEBUG_FILE_OUTPUT #ifdef COUNT_CALC_NUM real_cal_num++; #endif @@ -770,57 +752,57 @@ int ksw_extend2_origin(int qlen, // query length 待匹配段碱基的query长 // fprintf(stderr, "beg: %d; end: %d\n", beg, end); // } } -#ifdef DEBUG_OUTPUT +#ifdef DEBUG_FILE_OUTPUT #ifdef DEBUG_SW_EXTEND - fprintf(gfp1, "qlen: %d, tlen: %d, h0: %d, w: %d, mi: %d, mj: %d, mie: %d, max_off: %d, score: %d, max: %d\n", qlen, tlen, h0, w, max_i + 1, max_j + 1, max_ie + 1, max_off, gscore, max); - fprintf(gfp2, "qlen: %d, tlen: %d, h0: %d, w: %d, mi: %d, mj: %d, mie: %d, max_off: %d, score: %d, max: %d\n", qlen, tlen, h0, w, max_i + 1, max_j + 1, max_ie + 1, max_off, gscore, max); - fprintf(gfp3, "qlen: %d, tlen: %d, h0: %d, w: %d, mi: %d, mj: %d, mie: %d, max_off: %d, score: %d, max: %d\n", qlen, tlen, h0, w, max_i + 1, max_j + 1, max_ie + 1, max_off, gscore, max); + fprintf(gf[0], "qlen: %d, tlen: %d, h0: %d, w: %d, mi: %d, mj: %d, mie: %d, max_off: %d, score: %d, max: %d\n", qlen, tlen, h0, w, max_i + 1, max_j + 1, max_ie + 1, max_off, gscore, max); + fprintf(gf[1], "qlen: %d, tlen: %d, h0: %d, w: %d, mi: %d, mj: %d, mie: %d, max_off: %d, score: %d, max: %d\n", qlen, tlen, h0, w, max_i + 1, max_j + 1, max_ie + 1, max_off, gscore, max); + fprintf(gf[2], "qlen: %d, tlen: %d, h0: %d, w: %d, mi: %d, mj: %d, mie: %d, max_off: %d, score: %d, max: %d\n", qlen, tlen, h0, w, max_i + 1, max_j + 1, max_ie + 1, max_off, gscore, max); - fprintf(gfp1, "%-4d", -1); - fprintf(gfp2, "%-4d", -1); - fprintf(gfp3, "%-4d", -1); - fprintf(gfp1, "%-4d", -1); - fprintf(gfp2, "%-4d", -1); - fprintf(gfp3, "%-4d", -1); + fprintf(gf[0], "%-4d", -1); + fprintf(gf[1], "%-4d", -1); + fprintf(gf[2], "%-4d", -1); + fprintf(gf[0], "%-4d", -1); + fprintf(gf[1], "%-4d", -1); + fprintf(gf[2], "%-4d", -1); for (djj = 0; djj < qlen; ++djj) { - fprintf(gfp1, "%-4c", "ACGTN"[query[djj]]); - fprintf(gfp2, "%-4c", "ACGTN"[query[djj]]); - fprintf(gfp3, "%-4c", "ACGTN"[query[djj]]); + fprintf(gf[0], "%-4c", "ACGTN"[query[djj]]); + fprintf(gf[1], "%-4c", "ACGTN"[query[djj]]); + fprintf(gf[2], "%-4c", "ACGTN"[query[djj]]); } - fprintf(gfp1, "\n"); - fprintf(gfp2, "\n"); - fprintf(gfp3, "\n"); + fprintf(gf[0], "\n"); + fprintf(gf[1], "\n"); + fprintf(gf[2], "\n"); for (dii = 0; dii <= tlen; ++dii) { if (dii > 0) { - fprintf(gfp1, "%-4c", "ACGTN"[target[dii - 1]]); - fprintf(gfp2, "%-4c", "ACGTN"[target[dii - 1]]); - fprintf(gfp3, "%-4c", "ACGTN"[target[dii - 1]]); + fprintf(gf[0], "%-4c", "ACGTN"[target[dii - 1]]); + fprintf(gf[1], "%-4c", "ACGTN"[target[dii - 1]]); + fprintf(gf[2], "%-4c", "ACGTN"[target[dii - 1]]); } else { - fprintf(gfp1, "%-4d", -1); - fprintf(gfp2, "%-4d", -1); - fprintf(gfp3, "%-4d", -1); + fprintf(gf[0], "%-4d", -1); + fprintf(gf[1], "%-4d", -1); + fprintf(gf[2], "%-4d", -1); } for (djj = 0; djj <= qlen; ++djj) { - fprintf(gfp1, "%-4d", score[dii][djj]); - fprintf(gfp2, "%-4d", ins[dii][djj]); - fprintf(gfp3, "%-4d", del[dii][djj]); + fprintf(gf[0], "%-4d", score[dii][djj]); + fprintf(gf[1], "%-4d", ins[dii][djj]); + fprintf(gf[2], "%-4d", del[dii][djj]); } - fprintf(gfp1, "\n"); - fprintf(gfp2, "\n"); - fprintf(gfp3, "\n"); + fprintf(gf[0], "\n"); + fprintf(gf[1], "\n"); + fprintf(gf[2], "\n"); } #endif #endif -#ifdef DEBUG_OUTPUT +#ifdef DEBUG_FILE_OUTPUT #ifdef COUNT_CALC_NUM - fprintf(gfp1, "%d\nend\n", real_cal_num); + fprintf(gf[0], "%d\nend\n", real_cal_num); #endif #endif diff --git a/ksw_extend2_avx2_u8.c b/ksw_extend2_avx2_u8.c index 19b7dd7..2974254 100644 --- a/ksw_extend2_avx2_u8.c +++ b/ksw_extend2_avx2_u8.c @@ -1,27 +1,15 @@ #include #include #include -#include #include #include #include +#include "utils.h" -#ifdef __GNUC__ -#define LIKELY(x) __builtin_expect((x),1) -#define UNLIKELY(x) __builtin_expect((x),0) -#else -#define LIKELY(x) (x) -#define UNLIKELY(x) (x) -#endif +#define ELIMINATE_DIFF_1 -#undef MAX -#undef MIN -#define MAX(x, y) ((x) > (y) ? (x) : (y)) -#define MIN(x, y) ((x) < (y) ? (x) : (y)) #define SIMD_WIDTH 32 -typedef struct { size_t m; uint8_t *addr; } buf_t; - static const uint8_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = { {0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, {0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, @@ -57,8 +45,7 @@ static const uint8_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = { {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff} }; -//static const uint8_t reverse_mask[SIMD_WIDTH] = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; -static const uint8_t reverse_mask[SIMD_WIDTH] = {7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8}; +//static const uint8_t reverse_mask[SIMD_WIDTH] = {7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8}; #define permute_mask _MM_SHUFFLE(0, 1, 2, 3) //const int permute_mask = _MM_SHUFFLE(0, 1, 2, 3); // 初始化变量 @@ -71,7 +58,6 @@ static const uint8_t reverse_mask[SIMD_WIDTH] = {7,6,5,4,3,2,1,0,15,14,13,12,11, __m256i e_del_vec; \ __m256i e_ins_vec; \ __m256i h_vec_mask[SIMD_WIDTH]; \ - __m256i reverse_mask_vec; \ zero_vec = _mm256_setzero_si256(); \ oe_del_vec = _mm256_set1_epi8(oe_del); \ oe_ins_vec = _mm256_set1_epi8(oe_ins); \ @@ -81,7 +67,6 @@ static const uint8_t reverse_mask[SIMD_WIDTH] = {7,6,5,4,3,2,1,0,15,14,13,12,11, __m256i mis_sc_vec = _mm256_set1_epi8(b); \ __m256i amb_sc_vec = _mm256_set1_epi8(1); \ __m256i amb_vec = _mm256_set1_epi8(4); \ - reverse_mask_vec = _mm256_loadu_si256((__m256i*) (reverse_mask)); \ for (i=0; i 0) { \ + if (maxVal[0] > 0 && m >= max) { \ for(j=beg, i=iend; j<=end; j+=SIMD_WIDTH, i-=SIMD_WIDTH) { \ __m256i h2_vec = _mm256_loadu_si256((__m256i*) (&hA2[j])); \ __m256i vcmp = _mm256_cmpeq_epi8(h2_vec, max_vec); \ @@ -236,10 +219,10 @@ int ksw_extend2_avx2_u8(int qlen, // query length 待匹配段碱基的query长 seq=(uint8_t*)&qtmem[0]; ref=(uint8_t*)&qtmem[seq_size]; if (is_left) { for (i=0; i abs(mj - mi) ? max_off : abs(mj - mi); } - else if (zdrop > 0) { + else if (zdrop > 0 && mi > -1) { if (mi - max_i > mj - max_j) { if (max - m - ((mi - max_i) - (mj - max_j)) * e_del > zdrop) break; } else { diff --git a/profiling.c b/profiling.c new file mode 100644 index 0000000..223502e --- /dev/null +++ b/profiling.c @@ -0,0 +1,110 @@ +/* +Description: profiling related data + +Copyright : All right reserved by ICT + +Author : Zhang Zhonghai +Date : 2024/04/06 +*/ + +#include +#include "utils.h" +#include "profiling.h" + +#ifdef SHOW_PERF +uint64_t tprof[LIM_THREAD_PROF_TYPE][LIM_THREAD] = {0}; +uint64_t proc_freq = 1000; +uint64_t gprof[LIM_GLOBAL_PROF_TYPE] = {0}; +#endif + +#ifdef SHOW_DATA_PERF +/* +tdat[0]: read nums +tdat[1]: seed-1 full match nums +*/ +int64_t tdat[LIM_THREAD_DATA_TYPE][LIM_THREAD] = {0}; +int64_t gdat[LIM_GLOBAL_DATA_TYPE] = {0}; + +#endif + +int find_opt(uint64_t *a, int len, double *max, double *min, double *avg) +{ + int i = 0; + uint64_t umax = 0, umin = UINT64_MAX, uavg = 0; + for (i = 0; i < len; i++) + { + if (a[i] > umax) umax = a[i]; + if (a[i] < umin) umin = a[i]; + uavg += a[i]; + } + *avg = uavg * 1.0 / len / proc_freq; + *max = umax * 1.0 / proc_freq; + *min = umin * 1.0 / proc_freq; + return 1; +} + +int display_stats(int nthreads) +{ +#ifdef SHOW_PERF + double avg, max, min; + + fprintf(stderr, "[steps in main_mem]\n"); + fprintf(stderr, "time_parse_arg: %0.2lf s\n", gprof[G_PREPARE] * 1.0 / proc_freq); + fprintf(stderr, "time_load_idx: %0.2lf s\n", gprof[G_LOAD_IDX] * 1.0 / proc_freq); + fprintf(stderr, "time_pipeline: %0.2lf s\n", gprof[G_PIPELINE] * 1.0 / proc_freq); + fprintf(stderr, "time_all: %0.2lf s\n", gprof[G_ALL] * 1.0 / proc_freq); + + fprintf(stderr, "\n[steps in pipeline]\n"); + fprintf(stderr, "time_read: %0.2lf s\n", gprof[G_READ] * 1.0 / proc_freq); + fprintf(stderr, "time_compute: %0.2lf s\n", gprof[G_COMPUTE] * 1.0 / proc_freq); + fprintf(stderr, "time_write: %0.2lf s\n", gprof[G_WRITE] * 1.0 / proc_freq); + + fprintf(stderr, "\n[steps in mem_process_seqs]\n"); + fprintf(stderr, "time_mem_prepare: %0.2lf s\n", gprof[G_MEM_PREPARE] * 1.0 / proc_freq); + fprintf(stderr, "time_mem_kernel: %0.2lf s\n", gprof[G_MEM_KERNEL] * 1.0 / proc_freq); + fprintf(stderr, "time_mem_pestat: %0.2lf s\n", gprof[G_MEM_PESTAT] * 1.0 / proc_freq); + fprintf(stderr, "time_mem_sam: %0.2lf s\n", gprof[G_MEM_SAM] * 1.0 / proc_freq); + + fprintf(stderr, "\n[steps in kernel]\n"); + find_opt(tprof[T_SEED_ALL], nthreads, &max, &min, &avg); + fprintf(stderr, "time_seed_all: %0.2lf (%0.2lf, %0.2lf) s\n", avg, max, min); + find_opt(tprof[T_CHAIN_ALL], nthreads, &max, &min, &avg); + fprintf(stderr, "time_chain_all: %0.2lf (%0.2lf, %0.2lf) s\n", avg, max, min); + find_opt(tprof[T_ALN_ALL], nthreads, &max, &min, &avg); + fprintf(stderr, "time_aln_all: %0.2lf (%0.2lf, %0.2lf) s\n", avg, max, min); + find_opt(tprof[T_INS_SIZE], nthreads, &max, &min, &avg); + fprintf(stderr, "time_ins_size_all: %0.2lf (%0.2lf, %0.2lf) s\n", avg, max, min); + + fprintf(stderr, "\n[steps in seeding]\n"); + find_opt(tprof[T_SEED_1], nthreads, &max, &min, &avg); + fprintf(stderr, "time_seed_1: %0.2lf s\n", avg); + find_opt(tprof[T_SEED_2], nthreads, &max, &min, &avg); + fprintf(stderr, "time_seed_2: %0.2lf s\n", avg); + find_opt(tprof[T_SEED_3], nthreads, &max, &min, &avg); + fprintf(stderr, "time_seed_3: %0.2lf s\n", avg); + + fprintf(stderr, "\n[steps in chain]\n"); + find_opt(tprof[T_GEN_CHAIN], nthreads, &max, &min, &avg); + fprintf(stderr, "time_gen_chain: %0.2lf s\n", avg); + find_opt(tprof[T_FLT_CHAIN], nthreads, &max, &min, &avg); + fprintf(stderr, "time_fil_chain: %0.2lf s\n", avg); + find_opt(tprof[T_FLT_CHANNED_SEEDS], nthreads, &max, &min, &avg); + fprintf(stderr, "time_flt_chained_seeds: %0.2lf s\n", avg); + find_opt(tprof[T_SAL], nthreads, &max, &min, &avg); + fprintf(stderr, "time_sal: %0.2lf s\n", avg); + find_opt(tprof[T_BSW], nthreads, &max, &min, &avg); + fprintf(stderr, "time_bsw: %0.2lf s\n", avg); + + fprintf(stderr, "\n[steps in gen sam]\n"); + find_opt(tprof[T_SAM_MATESW], nthreads, &max, &min, &avg); + fprintf(stderr, "time_mate_sw: %0.2lf s\n", avg); + find_opt(tprof[T_SAM_REG2ALN], nthreads, &max, &min, &avg); + fprintf(stderr, "time_reg2aln: %0.2lf s\n", avg); + + fprintf(stderr, "time_ksw_loop: %0.2lf s\n", gprof[G_KSW_LOOP] * 1.0 / proc_freq); + fprintf(stderr, "time_ksw_end_loop: %0.2lf s\n", gprof[G_KSW_END_LOOP] * 1.0 / proc_freq); + + fprintf(stderr, "\n"); +#endif + return 1; +} \ No newline at end of file diff --git a/profiling.h b/profiling.h new file mode 100644 index 0000000..4ad868d --- /dev/null +++ b/profiling.h @@ -0,0 +1,89 @@ +/* +Description: profiling related data + +Copyright : All right reserved by ICT + +Author : Zhang Zhonghai +Date : 2024/04/06 +*/ +#ifndef PROFILING_H_ +#define PROFILING_H_ + +#include + +#define USE_RDTSC 1 + +#define LIM_THREAD 128 +#define LIM_THREAD_PROF_TYPE 128 +#define LIM_GLOBAL_PROF_TYPE 128 +#define LIM_THREAD_DATA_TYPE 128 +#define LIM_GLOBAL_DATA_TYPE 128 + +#ifdef SHOW_PERF +extern uint64_t proc_freq; +extern uint64_t tprof[LIM_THREAD_PROF_TYPE][LIM_THREAD]; +extern uint64_t gprof[LIM_GLOBAL_PROF_TYPE]; +#endif + +#ifdef SHOW_DATA_PERF +extern int64_t tdat[LIM_THREAD_DATA_TYPE][LIM_THREAD] = {0}; +extern int64_t gdat[LIM_GLOBAL_DATA_TYPE] = {0}; +#endif + +#ifdef SHOW_PERF +#if USE_RDTSC +#define PROF_START(tmp_time) const uint64_t prof_tmp_##tmp_time = __rdtsc() +#define PROF_END(result, tmp_time) result += __rdtsc() - prof_tmp_##tmp_time +#else +#define PROF_START(tmp_time) uint64_t prof_tmp_##tmp_time = realtime_msec() +#define PROF_END(result, tmp_time) result += realtime_msec() - prof_tmp_##tmp_time +#endif +#else +#define PROF_START(tmp_time) +#define PROF_END(result, tmp_time) +#endif + + + +// GLOBAL +enum +{ + G_ALL = 0, + G_PIPELINE, + G_READ, + G_WRITE, + G_COMPUTE, + G_PREPARE, + G_LOAD_IDX, + G_MEM_PREPARE, + G_MEM_KERNEL, + G_MEM_PESTAT, + G_MEM_SAM, + G_KSW_LOOP, + G_KSW_END_LOOP +}; + +// THREAD +enum +{ + T_SEED_ALL = 0, + T_CHAIN_ALL, + T_ALN_ALL, + T_INS_SIZE, + T_SEED_1, + T_SEED_2, + T_SEED_3, + T_SAL, + T_GEN_CHAIN, + T_FLT_CHAIN, + T_FLT_CHANNED_SEEDS, + T_READ_SA, + T_BSW, + T_BSW_ALL, + T_SAM_MATESW, + T_SAM_REG2ALN +}; + +int display_stats(int); + +#endif \ No newline at end of file diff --git a/rle.h b/rle.h index 4f8946d..40d9a02 100644 --- a/rle.h +++ b/rle.h @@ -2,12 +2,7 @@ #define RLE6_H_ #include - -#ifdef __GNUC__ -#define LIKELY(x) __builtin_expect((x),1) -#else -#define LIKELY(x) (x) -#endif +#include "utils.h" #ifdef __cplusplus extern "C" { diff --git a/run.sh b/run.sh index ab1aeb8..dd64e64 100755 --- a/run.sh +++ b/run.sh @@ -1,8 +1,10 @@ -thread=64 +thread=1 ## d1 -#n_r1=~/data/fastq/dataset/na12878_wes_144/s_1.fq -#n_r2=~/data/fastq/dataset/na12878_wes_144/s_2.fq +#n_r1=~/data/fastq/dataset/na12878_wes_144/1w_1.fq +#n_r2=~/data/fastq/dataset/na12878_wes_144/1w_2.fq +n_r1=~/data/fastq/dataset/na12878_wes_144/ss_1.fq +n_r2=~/data/fastq/dataset/na12878_wes_144/ss_2.fq #n_r1=~/data/fastq/dataset/na12878_wes_144/45m_r1.fq #n_r2=~/data/fastq/dataset/na12878_wes_144/45m_r2.fq #n_r1=~/data/fastq/dataset/na12878_wes_144/45mr1.fq.gz @@ -27,19 +29,20 @@ thread=64 #n_r2=~/data/fastq/dataset/zy_wgs/45mr2.fq.gz #n_r1=~/data/fastq/dataset/zy_wgs/s_1.fq #n_r2=~/data/fastq/dataset/zy_wgs/s_2.fq -n_r1=~/data1/fastq/dataset/zy_wgs/E150010395_L01_690_1.fq -n_r2=~/data1/fastq/dataset/zy_wgs/E150010395_L01_690_2.fq +#n_r1=~/data1/fastq/dataset/zy_wgs/E150010395_L01_690_1.fq +#n_r2=~/data1/fastq/dataset/zy_wgs/E150010395_L01_690_2.fq reference=~/data1/fmt_ref/human_g1k_v37_decoy.fasta #reference=~/reference/bwa/human_g1k_v37_decoy.fasta #reference=~/data/reference/human_g1k_v37_decoy.fasta -out=~/data1/fast-out.sam +#out=~/data1/out-u8-1.sam +#out=~/data1/out-i16.sam +out=~/data1/out.sam +#out=/dev/null time ./fastbwa mem -t $thread -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \ $reference \ $n_r1 \ $n_r2 \ -o $out -2 - - diff --git a/utils.h b/utils.h index b752e71..f64be9e 100644 --- a/utils.h +++ b/utils.h @@ -30,44 +30,22 @@ #include #include #include - -// for debug and test - -//#define DEBUG_OUTPUT // 打开gfp1-4文件,并记录debug信息 -//#define COUNT_SEED_LENGTH // 记录seed匹配数量降低到1时的长度,以及最终扩展的长度 -//#define GET_FULL_MATCH_READ // 获取完全匹配的reads -//#define COUNT_CALC_NUM // 统计BSW的剪枝后的计算量和未剪枝前的计算量 -// #define GET_DIFFERENT_EXTENSION_LENGTH // 获取不同长度extension的query,target,和其他用于计算的数据 -//#define DEBUG_SW_EXTEND // 将bsw的分值输入到debug文件里 - -//////////////////// - -#define USE_RDTSC 1 - -#ifdef SHOW_PERF -extern uint64_t time_process_data, time_read, time_write, time_compute, - time_seed_1, time_seed_2, time_seed_3, - time_seed_sa, time_seed_chain, time_seed_all, - time_bsw, time_bsw_all, - time_work_kernel, time_work_sam, - time_load_idx; -extern uint64_t proc_freq; -#endif - -#ifdef SHOW_DATA_PERF -extern int64_t gdat[100]; -#endif - -#ifdef DEBUG_OUTPUT -extern FILE *gfp1, *gfp2, *gfp3, *gfp4; -extern FILE *gfq[4], *gft[4], *gfi[4]; -#endif +#include +#include "profiling.h" #undef MAX #undef MIN #define MAX(x, y) ((x) > (y) ? (x) : (y)) #define MIN(x, y) ((x) < (y) ? (x) : (y)) +#ifdef __GNUC__ +#define LIKELY(x) __builtin_expect((x), 1) +#define UNLIKELY(x) __builtin_expect((x), 0) +#else +#define LIKELY(x) (x) +#define UNLIKELY(x) (x) +#endif + #ifdef __GNUC__ // Tell GCC to validate printf format string and args #define ATTRIBUTE(list) __attribute__ (list)