From e997699a4763791560ad4b57bf01b5eb7f2ded6e Mon Sep 17 00:00:00 2001 From: zzh Date: Thu, 15 Jan 2026 23:04:02 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=AD=A3=E4=BA=86=E4=B8=80=E4=B8=AAbu?= =?UTF-8?q?g=EF=BC=8C=E6=98=AFxtra=E8=B5=8B=E5=80=BC=E6=90=9E=E9=94=99?= =?UTF-8?q?=E4=BA=86=EF=BC=8C=E7=AC=AC=E4=BA=8C=E9=98=B6=E6=AE=B5=E5=BA=94?= =?UTF-8?q?=E8=AF=A5=E8=B5=8B=E5=80=BC=E7=BB=99xtras2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 + .vscode/launch.json | 6 ++-- .vscode/tasks.json | 2 +- Makefile | 4 +-- paired_sam.c | 76 ++++++++++++++++++++++++++++++++------------- profiling.c | 17 ++++++++-- profiling.h | 10 ++++-- run.sh | 9 ++++-- 8 files changed, 91 insertions(+), 34 deletions(-) diff --git a/.gitignore b/.gitignore index dab5627..e5ce07d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ *.log *.sam +output/ # Prerequisites *.d diff --git a/.vscode/launch.json b/.vscode/launch.json index 7bc316a..a3676a7 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -13,13 +13,13 @@ "args": [ "mem", "-t", - "32", + "1", "-M", "-R", "'@RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa'", "~/data/reference/fmt/human_g1k_v37_decoy.fasta", - "~/data/dataset/D1/n1.fq.gz", - "~/data/dataset/D1/n2.fq.gz", + "~/data/dataset/D2/n1.fq.gz", + "~/data/dataset/D2/n2.fq.gz", "-o", "D1-out.sam", ], diff --git a/.vscode/tasks.json b/.vscode/tasks.json index f76ae19..9e3ef23 100644 --- a/.vscode/tasks.json +++ b/.vscode/tasks.json @@ -6,7 +6,7 @@ { "label": "Build", "type": "shell", - "command": "make clean; make -j 16", + "command": "make clean; make -j 32", "problemMatcher": [], "group": { "kind": "build", diff --git a/Makefile b/Makefile index a3664a3..5351642 100644 --- a/Makefile +++ b/Makefile @@ -1,9 +1,9 @@ CC= gcc -CFLAGS= -g -Wall -Wno-unused-function -mavx2 -mavx512bw -O3 +CFLAGS= -g -Wall -Wno-unused-function -mavx2 -mavx512bw #-O3 WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS SHOW_PERF= -DSHOW_PERF -SHOW_DATA_PERF= #-DSHOW_DATA_PERF +SHOW_DATA_PERF= -DSHOW_DATA_PERF FILTER_FULL_MATCH= #-DFILTER_FULL_MATCH USE_MT_READ= -DUSE_MT_READ diff --git a/paired_sam.c b/paired_sam.c index 9804d23..bff1b8e 100644 --- a/paired_sam.c +++ b/paired_sam.c @@ -6,6 +6,7 @@ #include "ksw_align_avx.h" #include "kvec.h" +#include "debug.h" // 原版生成sam函数 static void worker_sam(void* data, int i, int tid) { @@ -76,7 +77,7 @@ static inline void revseq(int l, uint8_t* s) { // 计算当前seq是否需要做matesw,需要的话保存必要的数据 static void get_matesw_tasks(const mem_opt_t* opt, const bntseq_t* bns, const uint8_t* pac, const mem_pestat_t pes[4], const mem_alnreg_t* a, int a_j, - bseq1_t* seq, mem_alnreg_v* ma, int64_t sid, msw_stats_t* stats, msw_task_v* msw8, msw_task_v* msw16) { + bseq1_t* seq, mem_alnreg_v* ma, int64_t sid, msw_stats_t* stats, msw_task_v* msw8, msw_task_v* msw16, int64_t n_processed, int tid) { int64_t l_pac = bns->l_pac; int l_ms = seq->l_seq; int i, r, skip[4], rid; @@ -112,6 +113,8 @@ static void get_matesw_tasks(const mem_opt_t* opt, const bntseq_t* bns, const ui re = l_pac << 1; rid = get_rid_range(bns, pac, &rb, (rb + re) >> 1, &re); // 计算ref对应的染色体id和区间起始终止位置 if (a->rid == rid && re - rb >= opt->min_seed_len) { // no funny things happening + tdat[TD_MSW_CNT][tid] += 1; + stats->max_ref_len = max_(stats->max_ref_len, re - rb); // fprintf(stderr, "zzh here\n"); int xtra = KSW_XSUBO | KSW_XSTART | (l_ms * opt->a < 250 ? KSW_XBYTE : 0) | (opt->min_seed_len * opt->a); @@ -169,7 +172,7 @@ static void worker_matesw_tasks(void* data, int idx, int tid) { for (i = 0; i < 2; ++i) for (j = 0; j < b[i].n && j < opt->max_matesw; ++j) get_matesw_tasks(opt, w->bns, w->pac, w->pes, &b[i].a[j], aj[i].a[j], &s[!i], &a[!i], i_s + !i, &w->msw->t_msw_stats[tid], - &w->msw->t_msw_tasks_u8[tid], &w->msw->t_msw_tasks_i16[tid]); + &w->msw->t_msw_tasks_u8[tid], &w->msw->t_msw_tasks_i16[tid], w->n_processed, tid); } free(b[0].a); free(b[1].a); @@ -178,7 +181,7 @@ static void worker_matesw_tasks(void* data, int idx, int tid) { } static void msw_avx512_u8(const mem_opt_t* opt, int num_tasks, msw_buf_t* msw_buf, msw_ref_v* refs, msw_seq_v* seqs, int_v* xtras, - msw_kswr_v* kswrs, int tid) { + msw_kswr_v* kswrs, int phase, int tid) { int i = 0, j = 0, k = 0; int refLen[SIMD512_WIDTH8] = {0}; @@ -239,9 +242,14 @@ static void msw_avx512_u8(const mem_opt_t* opt, int num_tasks, msw_buf_t* msw_bu } PROF_END(tprof[T_MSW_PACK_SEQ][tid], msw_pack_seq); + if (phase == 0) + tdat[TD_ALIGN_1_CNT][tid] += 1; + else + tdat[TD_ALIGN_2_CNT][tid] += 1; + // 利用smid指令计算 ksw_align_avx512_u8(opt->a, -1 * opt->b, opt->o_ins, opt->e_ins, opt->o_del, opt->e_del, msw_buf, mySeq1SoA, mySeq2SoA, maxRefLen, maxSeqLen, - &xtras->a[i], refLen, &kswrs->a[i], 0); + &xtras->a[i], refLen, &kswrs->a[i], phase); } } @@ -292,7 +300,7 @@ static void worker_calc_matesw_avx512_u8(void* data, int idx, int tid) { PROF_END(tprof[T_MSW_GET_REF][tid], msw_get_ref); PROF_START(msw_1); - msw_avx512_u8(opt, endIdx - startIdx, msw_buf, refs, seqs, xtras, kswrs, tid); + msw_avx512_u8(opt, endIdx - startIdx, msw_buf, refs, seqs, xtras, kswrs, 0, tid); // 第一阶段 PROF_END(tprof[T_MSW_1][tid], msw_1); // 第二阶段计算 @@ -307,19 +315,25 @@ static void worker_calc_matesw_avx512_u8(void* data, int idx, int tid) { msw_task_t* task = w->msw->p_msw_tasks_u8.a[i]; kswr_avx_t r = kswrs->a[j]; task->aln = r; + // 打印测试 + // kswr_avx_t aln = r; + //fprintf(gf[0], "id-%ld score-%d te-%d qe-%d score2-%d te2-%d tb-%d qb-%d\n", task->seq_id + w->n_processed, aln.score, aln.te, aln.qe, + // aln.score2, aln.te2, aln.tb, aln.qb); + if ((task->xtra & KSW_XSTART) == 0 || ((task->xtra & KSW_XSUBO) && r.score < (task->xtra & 0xffff))) continue; - task->xtra = KSW_XSTOP | task->aln.score; + task->xtra = KSW_XSTOP | r.score; kv_push(int, msw_task_ids, i); // 1. 获取对应的ref refs2->a[k] = refs->a[j]; revseq(r.te + 1, refs2->a[k].ref); + // refs2->a[k].l_ref = r.te + 1; // zzh add, for test // 2. 获取对应的seq seqs2->a[k] = seqs->a[j]; revseq(r.qe + 1, seqs2->a[k].seq); seqs2->a[k].l_seq = r.qe + 1; // 3. 其他数据 - xtras->a[k] = task->xtra; + xtras2->a[k] = task->xtra; kswrs2->a[k] = r; ++k; } @@ -327,10 +341,9 @@ static void worker_calc_matesw_avx512_u8(void* data, int idx, int tid) { for (; k < roundEndIdx; ++k) { refs2->a[k].l_ref = 0; seqs2->a[k].l_seq = 0; - xtras2->a[k] = 0; } PROF_START(msw_2); - msw_avx512_u8(opt, msw_task_ids.n, msw_buf, refs2, seqs2, xtras2, kswrs2, tid); + msw_avx512_u8(opt, msw_task_ids.n, msw_buf, refs2, seqs2, xtras2, kswrs2, 1, tid); // 第二阶段 PROF_END(tprof[T_MSW_2][tid], msw_2); // 结果赋值 @@ -338,6 +351,9 @@ static void worker_calc_matesw_avx512_u8(void* data, int idx, int tid) { i = msw_task_ids.a[k]; msw_task_t* task = w->msw->p_msw_tasks_u8.a[i]; task->aln = kswrs2->a[k]; + // kswr_avx_t aln = task->aln; + //fprintf(gf[1], "id-%ld score-%d te-%d qe-%d score2-%d te2-%d tb-%d qb-%d\n", task->seq_id + w->n_processed, aln.score, aln.te, aln.qe, + // aln.score2, aln.te2, aln.tb, aln.qb); } kv_destroy(msw_task_ids); @@ -350,7 +366,7 @@ static int check_add_align(kswr_avx_t aln, int min_seed_len, int is_rev, int64_t int64_t rb) { int res = 0; if (aln.score >= min_seed_len && aln.qb >= 0) { // something goes wrong if aln.qb < 0 - int i, tmp; + // int i, tmp; mem_alnreg_t b; memset(&b, 0, sizeof(mem_alnreg_t)); b.rid = a->rid; @@ -366,13 +382,13 @@ static int check_add_align(kswr_avx_t aln, int min_seed_len, int is_rev, int64_t // printf("*** %d, [%lld,%lld], %d:%d, (%lld,%lld), (%lld,%lld) == (%lld,%lld)\n", aln.score, rb, re, is_rev, // is_larger, a->rb, a->re, ma->a[0].rb, ma->a[0].re, b.rb, b.re); kv_push(mem_alnreg_t, *ma, b); // make room for a new element - // move b s.t. ma is sorted - for (i = 0; i < ma->n - 1; ++i) // find the insertion point - if (ma->a[i].score < b.score) - break; - tmp = i; - for (i = ma->n - 1; i > tmp; --i) ma->a[i] = ma->a[i - 1]; - ma->a[i] = b; +// // move b s.t. ma is sorted +// for (i = 0; i < ma->n - 1; ++i) // find the insertion point +// if (ma->a[i].score < b.score) +// break; +// tmp = i; +// for (i = ma->n - 1; i > tmp; --i) ma->a[i] = ma->a[i - 1]; +// ma->a[i] = b; res = 1; } return res; @@ -406,16 +422,34 @@ static void workder_gen_sam(void* data, int idx, int tid) { memset(g, 0, sizeof(mem_aln_t) * 2); n_aa[0] = n_aa[1] = 0; + int n[2] = {0}; for (i = 0; i < 2; ++i) { int si = !i; - int n = 0; // 这里应该先给task排序,因为u8和i16是分开排序的,需要合在一起 for (k = 0; k < s[si].msw.n; ++k) { msw_task_t* t = s[si].msw.a[k]; - n += check_add_align(t->aln, opt->min_seed_len, t->is_rev, bns->l_pac, &a[!si].a[t->aj], s[si].l_seq, (uint8_t*)s[si].seq, &a[si], t->rb); + // kswr_avx_t aln = t->aln; fprintf(gf[0], "id-%ld score-%d te-%d qe-%d score2-%d te2-%d tb-%d qb-%d\n", s[si].id + w->n_processed, aln.score, aln.te, aln.qe, aln.score2, aln.te2, aln.tb, aln.qb); + + n[i] += check_add_align(t->aln, opt->min_seed_len, t->is_rev, bns->l_pac, &a[!si].a[t->aj], s[si].l_seq, (uint8_t*)s[si].seq, &a[si], t->rb); } - if (n > 0) { - a[si].n = mem_sort_dedup_patch(opt, 0, 0, 0, a[si].n, a[si].a); + } + // 处理完2个pair read之后再排序,因为上面操作有插入,排序后之前记录的索引就无效了,所以上面不能排序,要两个pair read处理完之后再排序 + for (i = 0; i < 2; ++i) { + int tmp; + if (n[i] > 0) { + for (j = 0; j < n[i]; ++j) { // 在这里排序 + // move b s.t. ma is sorted + int nidx = a[i].n - n[i] + j; + mem_alnreg_t* b = &a[i].a[nidx]; + for (k = 0; k < nidx; ++k) // find the insertion point + if (a[i].a[k].score < b->score) + break; + tmp = k; + for (k = nidx; k > tmp; --k) a[i].a[k] = a[i].a[k - 1]; + a[i].a[k] = *b; + } + + a[i].n = mem_sort_dedup_patch(opt, 0, 0, 0, a[i].n, a[i].a); } } diff --git a/profiling.c b/profiling.c index cf3efae..b74e864 100644 --- a/profiling.c +++ b/profiling.c @@ -21,7 +21,7 @@ uint64_t proc_freq = 1000; uint64_t gprof[LIM_GLOBAL_PROF_TYPE] = {0}; #endif -#ifdef SHOW_DATA_PERF +//#ifdef SHOW_DATA_PERF /* tdat[0]: read nums tdat[1]: seed-1 full match nums @@ -29,7 +29,7 @@ tdat[1]: seed-1 full match nums int64_t tdat[LIM_THREAD_DATA_TYPE][LIM_THREAD] = {0}; int64_t gdat[LIM_GLOBAL_DATA_TYPE] = {0}; -#endif +//#endif int find_opt(uint64_t *a, int len, double *max, double *min, double *avg) { @@ -47,6 +47,15 @@ int find_opt(uint64_t *a, int len, double *max, double *min, double *avg) return 1; } +int64_t get_sum(int64_t* a, int len) { + int i = 0; + int64_t all = 0; + for (i = 0; i < len; i++) { + all += a[i]; + } + return all; +} + int display_stats(int nthreads) { #ifdef SHOW_PERF @@ -146,5 +155,9 @@ int display_stats(int nthreads) fprintf(stderr, "\n"); #endif + fprintf(stderr, "all msw cnt: %ld\n", get_sum(tdat[TD_MSW_CNT], nthreads)); + fprintf(stderr, "align 1 cnt: %ld\n", get_sum(tdat[TD_ALIGN_1_CNT], nthreads)); + fprintf(stderr, "align 2 cnt: %ld\n", get_sum(tdat[TD_ALIGN_2_CNT], nthreads)); + return 1; } \ No newline at end of file diff --git a/profiling.h b/profiling.h index 88e1e1d..280327d 100644 --- a/profiling.h +++ b/profiling.h @@ -25,10 +25,10 @@ extern uint64_t tprof[LIM_THREAD_PROF_TYPE][LIM_THREAD]; extern uint64_t gprof[LIM_GLOBAL_PROF_TYPE]; #endif -#ifdef SHOW_DATA_PERF +//#ifdef SHOW_DATA_PERF extern int64_t tdat[LIM_THREAD_DATA_TYPE][LIM_THREAD]; extern int64_t gdat[LIM_GLOBAL_DATA_TYPE]; -#endif +//#endif #ifdef SHOW_PERF #if USE_RDTSC @@ -47,6 +47,12 @@ extern uint64_t calc_num; extern uint64_t more_num; extern uint64_t not_more_num; +enum { + TD_ALIGN_1_CNT, + TD_ALIGN_2_CNT, + TD_MSW_CNT, +}; + // GLOBAL enum { G_ALL = 0, diff --git a/run.sh b/run.sh index a52b5a8..3ada1c6 100755 --- a/run.sh +++ b/run.sh @@ -1,14 +1,17 @@ -thread=32 +thread=64 make -j 16 n1=~/data/dataset/D2/n1.fq.gz n2=~/data/dataset/D2/n2.fq.gz +#n1=/file-data/zzh/dataset/D1/45mr1.fq.gz +#n2=/file-data/zzh/dataset/D1/45mr2.fq.gz + reference=~/data/reference/fmt/human_g1k_v37_decoy.fasta -#out=/dev/null -out=./D2-out.sam +out=/dev/null +#out=~/data/D2-out.sam prog=./fastalign #prog=/home/zzh/fastbwa