diff --git a/bntseq.c b/bntseq.c index 0322040..188763b 100644 --- a/bntseq.c +++ b/bntseq.c @@ -424,7 +424,7 @@ uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end return seq; } -void bns_get_seq_no_alloc(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len, size_t *m_seq, uint8_t **seqp) +void bns_get_seq_no_alloc(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, size_t *len, size_t *m_seq, uint8_t **seqp) { if (end < beg) end ^= beg, beg ^= end, end ^= beg; // if end is smaller, swap if (end > l_pac<<1) end = l_pac<<1; @@ -477,8 +477,9 @@ uint8_t *bns_fetch_seq(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, in void bns_fetch_seq_no_alloc(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, int64_t mid, int64_t *end, int *rid, size_t *m_seq, uint8_t **seqp) { - int64_t far_beg, far_end, len; - int is_rev; + int64_t far_beg, far_end; + size_t len; + int is_rev; if (*end < *beg) *end ^= *beg, *beg ^= *end, *end ^= *beg; // if end is smaller, swap assert(*beg <= mid && mid < *end); diff --git a/bntseq.h b/bntseq.h index 7ea0286..e0ae040 100644 --- a/bntseq.h +++ b/bntseq.h @@ -79,9 +79,11 @@ extern "C" { uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len); uint8_t *bns_fetch_seq(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, int64_t mid, int64_t *end, int *rid); int bns_intv2rid(const bntseq_t *bns, int64_t rb, int64_t re); - void bns_fetch_seq_no_alloc(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, int64_t mid, int64_t *end, int *rid, size_t *m_seq, uint8_t **seqp); + void bns_get_seq_no_alloc(int64_t l_pac, const uint8_t* pac, int64_t beg, int64_t end, size_t* len, size_t* m_seq, uint8_t** seqp); + void bns_fetch_seq_no_alloc(const bntseq_t* bns, const uint8_t* pac, int64_t* beg, int64_t mid, int64_t* end, int* rid, size_t* m_seq, + uint8_t** seqp); #ifdef __cplusplus -} + } #endif static inline int64_t bns_depos(const bntseq_t *bns, int64_t pos, int *is_rev) diff --git a/bwamem.c b/bwamem.c index b169cb0..eabf10c 100644 --- a/bwamem.c +++ b/bwamem.c @@ -111,7 +111,8 @@ mem_opt_t *mem_opt_init() o->max_chain_extend = 1<<30; o->mapQ_coef_len = 50; o->mapQ_coef_fac = log(o->mapQ_coef_len); bwa_fill_scmat(o->a, o->b, o->mat); - return o; + o->msw_batch_size = 1024; + return o; } /*************************** @@ -1293,6 +1294,16 @@ mem_worker_t *init_mem_worker(const mem_opt_t *opt, const FMTIndex *fmt, const b w->matesw_arr16 = malloc(i * sizeof(matesw_data_v*)); w->msw_buf = calloc(i, sizeof(matesw_buf_t)); w->mem_stats = calloc(i, sizeof(mem_stats_t)); + w->msw_2_arr8 = malloc(i * sizeof(matesw_ptr_v*)); + w->msw_ref = calloc(i, sizeof(byte_vv)); + w->msw_seq = calloc(i, sizeof(msw_seq_v)); + w->msw_kswr = calloc(i, sizeof(msw_kswr_v)); + w->msw_xtra = calloc(i, sizeof(int_v)); + + w->msw_2_ref = calloc(i, sizeof(msw_ref_v)); + w->msw_2_seq = calloc(i, sizeof(msw_seq_v)); + w->msw_2_kswr = calloc(i, sizeof(msw_kswr_v)); + w->msw_2_xtra = calloc(i, sizeof(int_v)); for (i = 0; i < opt->n_threads; ++i) { w->aux[i] = smem_aux_init(); @@ -1301,11 +1312,12 @@ mem_worker_t *init_mem_worker(const mem_opt_t *opt, const FMTIndex *fmt, const b w->isize_arr[i] = calloc(4, sizeof(uint64_v)); w->matesw_arr8[i] = calloc(1, sizeof(matesw_data_v)); w->matesw_arr16[i] = calloc(1, sizeof(matesw_data_v)); + w->msw_2_arr8[i] = calloc(1, sizeof(matesw_data_t*)); for (j = 0; j < opt->batch_size; ++j) { - kv_init(w->smem_arr[i][j].mem); + kv_init(w->smem_arr[i][j].mem); kv_init(w->smem_arr[i][j].pos_arr); kv_init(w->chain_arr[i][j]); - } + } } return w; } diff --git a/bwamem.h b/bwamem.h index fa9c0bd..8ab06d5 100644 --- a/bwamem.h +++ b/bwamem.h @@ -32,6 +32,7 @@ #include "bwa.h" #include "fmt_idx.h" #include "utils.h" +#include "ksw_align_avx.h" #define MEM_MAPQ_COEF 30.0 #define MEM_MAPQ_MAX 60 @@ -84,6 +85,7 @@ typedef struct { int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end int max_XA_hits, max_XA_hits_alt; // if there are max_hits or fewer, output them all int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset + int msw_batch_size; // 每次处理的msw的个数,64的倍数 } mem_opt_t; typedef struct { @@ -166,8 +168,26 @@ typedef struct { int max_msw_ref_len; // mate sw里的最长的ref长度 } mem_stats_t; // 记录一些用到的数据统计,比如最长ref长度,最长seq长度等 +typedef kvec_t(byte_v) byte_vv; + typedef struct { - int calc_isize; + uint8_t* seq; // 只是指向seq的指针 + int l_seq; // seq长度 +} msw_seq_t; + +typedef struct { + uint8_t* ref; + int l_ref; +} msw_ref_t; + +typedef kvec_t(msw_seq_t) msw_seq_v; + +typedef kvec_t(msw_ref_t) msw_ref_v; + +typedef kvec_t(kswr_avx_t) msw_kswr_v; + +typedef struct { + int calc_isize; const mem_opt_t *opt; const bwt_t *bwt; const FMTIndex *fmt; @@ -182,13 +202,27 @@ typedef struct { mem_alnreg_v *regs; uint64_v **isize_arr; matesw_data_v** matesw_arr8; // 收集需要做mate sw的任务的数据 + matesw_ptr_v** msw_2_arr8; // 第二阶段任务,每个线程单独收集 matesw_data_v** matesw_arr16; //matesw_range_v msw_range_arr8; //matesw_range_v msw_range_arr16; matesw_ptr_v msw_tasks8; // 需要做mates sw的任务的指针 matesw_ptr_v msw_tasks16; - matesw_buf_t* msw_buf; // mate sw过程用到的缓存空间 + matesw_ptr_v msw_2_tasks8; // msw第二阶段任务 + matesw_buf_t* msw_buf; // mate sw过程用到的缓存空间 mem_stats_t* mem_stats; // 记录一些开辟空间等用到的统计数据,如seq长度,ref长度等 + + byte_vv* msw_ref; // 获取ref时的缓存空间,避免频繁的内存开辟和释放 + msw_seq_v* msw_seq; // seq的缓存空间,个数由msw_batch_size决定,上面的ref一样 + msw_kswr_v* msw_kswr; // 计算结果 + int_v* msw_xtra; // msw额外参数 + + // msw第二阶段 + msw_ref_v* msw_2_ref; + msw_seq_v* msw_2_seq; + msw_kswr_v* msw_2_kswr; + int_v* msw_2_xtra; + int64_t n_processed; int64_t n; // regs元素个数,可动态扩展 int64_t n_reads; diff --git a/paired_sam.c b/paired_sam.c index 4999230..2f8014a 100644 --- a/paired_sam.c +++ b/paired_sam.c @@ -48,9 +48,9 @@ static int get_rid_range(const bntseq_t* bns, const uint8_t* pac, int64_t* beg, } // 获取ref序列 -static inline uint8_t* get_ref_seq(const bntseq_t* bns, const uint8_t* pac, int64_t beg, int64_t end) { - int64_t len; - return bns_get_seq(bns->l_pac, pac, beg, end, &len); +static inline uint8_t* get_ref_seq(const bntseq_t* bns, const uint8_t* pac, int64_t beg, int64_t end, byte_v *buf) { + bns_get_seq_no_alloc(bns->l_pac, pac, beg, end, &buf->n, &buf->m, &buf->a); + return buf->a; } // 计算当前seq是否需要做matesw,需要的话保存必要的数据 @@ -111,105 +111,232 @@ static void get_matesw_data(const mem_opt_t* opt, const bntseq_t* bns, const uin static void worker_matesw_data(void* data, int idx, int tid) { mem_worker_t* w = (mem_worker_t*)data; const mem_opt_t* opt = w->opt; - int64_t i_s = idx << 1; - bseq1_t* s = &w->seqs[i_s]; - mem_alnreg_v* a = &w->regs[i_s]; - s->msw.n = 0; // 清空之前的matesw数据 - int i, j; - mem_alnreg_v b[2]; - kv_init(b[0]); - kv_init(b[1]); - // fprintf(stderr, "zzh test\n"); - for (i = 0; i < 2; ++i) - for (j = 0; j < a[i].n; ++j) - if (a[i].a[j].score >= a[i].a[0].score - opt->pen_unpaired) - kv_push(mem_alnreg_t, b[i], a[i].a[j]); - for (i = 0; i < 2; ++i) - for (j = 0; j < b[i].n && j < opt->max_matesw; ++j) - get_matesw_data(opt, w->bns, w->pac, w->pes, &b[i].a[j], &s[!i], &a[!i], i_s + !i, &w->mem_stats[tid], w->matesw_arr8[tid], w->matesw_arr16[tid]); - free(b[0].a); - free(b[1].a); + int startIdx = idx * opt->batch_size; + int endIdx = (idx + 1) * opt->batch_size; + if (endIdx > w->n_reads) + endIdx = w->n_reads; + + int id = 0; + for (id = startIdx; id < endIdx; id += 2) { + int64_t i_s = id; + bseq1_t* s = &w->seqs[i_s]; + mem_alnreg_v* a = &w->regs[i_s]; + s->msw.n = 0; // 清空之前的matesw数据 + int i, j; + mem_alnreg_v b[2]; + kv_init(b[0]); + kv_init(b[1]); + // fprintf(stderr, "zzh test\n"); + for (i = 0; i < 2; ++i) + for (j = 0; j < a[i].n; ++j) + if (a[i].a[j].score >= a[i].a[0].score - opt->pen_unpaired) + kv_push(mem_alnreg_t, b[i], a[i].a[j]); + for (i = 0; i < 2; ++i) + for (j = 0; j < b[i].n && j < opt->max_matesw; ++j) + get_matesw_data(opt, w->bns, w->pac, w->pes, &b[i].a[j], &s[!i], &a[!i], i_s + !i, &w->mem_stats[tid], w->matesw_arr8[tid], + w->matesw_arr16[tid]); + free(b[0].a); + free(b[1].a); + } } // 再多线程计算matesw,利用inter-query的simd并行 static void worker_calc_matesw_avx512_u8(void* data, int idx, int tid) { mem_worker_t* w = (mem_worker_t*)data; - int startIdx = idx * SIMD512_WIDTH8; - int endIdx = (idx + 1) * SIMD512_WIDTH8; + const mem_opt_t* opt = w->opt; + int startIdx = idx * w->opt->msw_batch_size; + int endIdx = (idx + 1) * w->opt->msw_batch_size; if (endIdx > w->msw_tasks8.n) endIdx = w->msw_tasks8.n; + int roundEndIdx = ((endIdx + SIMD512_WIDTH8 - 1) / SIMD512_WIDTH8) * SIMD512_WIDTH8; + int i = 0, j = 0, k = 0; - int maxSeqLen = 0, maxRefLen = 0; - uint8_t* refArr[SIMD512_WIDTH8] = {0}; - uint8_t* seqArr[SIMD512_WIDTH8] = {0}; int refLen[SIMD512_WIDTH8] = {0}; - int seqLen[SIMD512_WIDTH8] = {0}; - int seqQuantaLen[SIMD512_WIDTH8] = {0}; - int xtraA[SIMD512_WIDTH8] = {0}; - kswr_avx_t alns[SIMD512_WIDTH8] = {0}; matesw_buf_t* msw_buf = &w->msw_buf[tid]; // 缓冲区 + byte_vv* refs = &w->msw_ref[tid]; + msw_seq_v* seqs = &w->msw_seq[tid]; + int_v* xtras = &w->msw_xtra[tid]; + msw_kswr_v* kswrs = &w->msw_kswr[tid]; + // 获取ref和seq数据 + PROF_START(msw_get_ref); for (i = startIdx, j = 0; i < endIdx; ++i, ++j) { matesw_data_t* task = kv_A(w->msw_tasks8, i); // 1. 获取对应的ref - refArr[j] = get_ref_seq(w->bns, w->pac, task->rb, task->re); - refLen[j] = task->re - task->rb; - maxRefLen = max_(maxRefLen, refLen[j]); + byte_v* ref = &kv_A(*refs, j); + get_ref_seq(w->bns, w->pac, task->rb, task->re, ref); // 2. 获取对应的seq - seqArr[j] = (uint8_t*)w->seqs[task->seq_id].seq; - seqLen[j] = w->seqs[task->seq_id].l_seq; - int quanta = (seqLen[j] + 16 - 1) / 16; // based on SSE-8 bit lane - quanta *= 16; - seqQuantaLen[j] = quanta; - maxSeqLen = max_(maxSeqLen, quanta); + msw_seq_t* seq = &kv_A(*seqs, j); + seq->seq = (uint8_t*)w->seqs[task->seq_id].seq; + seq->l_seq = w->seqs[task->seq_id].l_seq; // 3. 其他数据 - xtraA[j] = task->xtra; + xtras->a[j] = task->xtra; } - // 如果不到64个,补全剩下的 - for (; j < SIMD512_WIDTH8; ++j) { - refArr[j] = refArr[0]; - refLen[j] = refLen[0]; - seqArr[j] = seqArr[0]; - seqLen[j] = seqLen[0]; - xtraA[j] = xtraA[0]; + for (; i < roundEndIdx; ++i, ++j) { + byte_v* ref = &kv_A(*refs, j); + ref->n = 0; + msw_seq_t* seq = &kv_A(*seqs, j); + seq->l_seq = 0; + xtras->a[j] = 0; } - // 将ref和seq赋值给对应的用来计算的缓存 - uint8_t* mySeq1SoA = msw_buf->refArr; - uint8_t* mySeq2SoA = msw_buf->seqArr; + PROF_END(tprof[T_MSW_GET_REF][tid], msw_get_ref); - for (j = 0; j < SIMD512_WIDTH8; ++j) { - uint8_t* seq1 = refArr[j]; - uint8_t* seq2 = seqArr[j]; + // 准备数据并进行计算 + for (i = 0; i < endIdx - startIdx; i += SIMD512_WIDTH8) { + // 将ref和seq赋值给对应的用来计算的缓存 + uint8_t* mySeq1SoA = msw_buf->refArr; + uint8_t* mySeq2SoA = msw_buf->seqArr; + int maxSeqLen = 0, maxRefLen = 0; + + PROF_START(msw_pack_seq); // 处理ref - for (k = 0; k < refLen[j]; ++k) { - mySeq1SoA[k * SIMD512_WIDTH8 + j] = (seq1[k] == AMBIG_ ? AMBR : seq1[k]); + for (j = 0; j < SIMD512_WIDTH8; ++j) { + byte_v* ref = &refs->a[i + j]; + uint8_t* seq1 = ref->a; + refLen[j] = ref->n; + // 处理ref + for (k = 0; k < ref->n; ++k) { + mySeq1SoA[k * SIMD512_WIDTH8 + j] = (seq1[k] == AMBIG_ ? AMBR : seq1[k]); + } + maxRefLen = max_(maxRefLen, ref->n); } - for (; k < maxRefLen; ++k) { - mySeq1SoA[k * SIMD512_WIDTH8 + j] = 0xFF; + + for (j = 0; j < SIMD512_WIDTH8; ++j) { + byte_v* ref = &refs->a[i + j]; + for (k = ref->n; k < maxRefLen; ++k) { + mySeq1SoA[k * SIMD512_WIDTH8 + j] = 0xFF; + } } - // 处理seq - for (k = 0; k < seqLen[j]; ++k) { - mySeq2SoA[k * SIMD512_WIDTH8 + j] = (seq2[k] == AMBIG_ ? AMBR : seq2[k]); + + // 处理query + for (j = 0; j < SIMD512_WIDTH8; ++j) { + msw_seq_t* seq = &kv_A(*seqs, i + j); + uint8_t* seq2 = seq->seq; + int quanta = ((seq->l_seq + 16 - 1) / 16) * 16; // based on SSE-8 bit lane + for (k = 0; k < seq->l_seq; k++) { + mySeq2SoA[k * SIMD512_WIDTH8 + j] = (seq2[k] == AMBIG_ ? AMBQ : seq2[k]); + } + for (; k < quanta; ++k) { + mySeq2SoA[k * SIMD512_WIDTH8 + j] = DUMMY5; // SSE quanta + } + maxSeqLen = max_(maxSeqLen, quanta); } - for (; k < seqQuantaLen[j]; ++k) { - mySeq2SoA[k * SIMD512_WIDTH8 + j] = DUMMY5; - } - for (; k < maxSeqLen; ++k) { - mySeq2SoA[k * SIMD512_WIDTH8 + j] = 0xFF; + for (j = 0; j < SIMD512_WIDTH8; ++j) { + msw_seq_t* seq = &kv_A(*seqs, i + j); + int quanta = ((seq->l_seq + 16 - 1) / 16) * 16; // based on SSE-8 bit lane + for (k = quanta; k < maxSeqLen; k++) { + mySeq2SoA[k * SIMD512_WIDTH8 + j] = 0xFF; + } } + PROF_END(tprof[T_MSW_PACK_SEQ][tid], msw_pack_seq); + + // 利用smid指令计算 + PROF_START(msw_1); + ksw_align_avx512_u8(opt->a, -1 * opt->b, opt->o_ins, opt->e_ins, opt->o_del, opt->e_del, msw_buf, mySeq1SoA, mySeq2SoA, maxRefLen, maxSeqLen, + &xtras->a[i], refLen, &kswrs->a[i], 0); + PROF_END(tprof[T_MSW_1][tid], msw_1); } - const mem_opt_t* opt = w->opt; - // 利用smid指令计算 - ksw_align_avx512_u8(opt->a, -1 * opt->b, opt->o_ins, opt->e_ins, opt->o_del, opt->e_del, msw_buf, mySeq1SoA, mySeq2SoA, maxRefLen, maxSeqLen, - xtraA, refLen, alns, 0); - // 保存结果 - for (i = startIdx, j = 0; i < endIdx; ++i, ++j) { +#if 1 + msw_ref_v* refs2 = &w->msw_2_ref[tid]; + msw_seq_v* seqs2 = &w->msw_2_seq[tid]; + int_v* xtras2 = &w->msw_2_xtra[tid]; + msw_kswr_v* kswrs2 = &w->msw_2_kswr[tid]; + + int_v msw_task_ids; // 用于第二阶段任务统计 + kv_init(msw_task_ids); + // 保存结果,并筛选第二阶段msw计算的任务 + for (i = startIdx, j = 0, k = 0; i < endIdx; ++i, ++j) { matesw_data_t* task = kv_A(w->msw_tasks8, i); - task->aln = alns[j]; - free(refArr[j]); + task->aln = kswrs->a[j]; + if ((task->xtra & KSW_XSTART) == 0 || ((task->xtra & KSW_XSUBO) && task->aln.score < (task->xtra & 0xffff))) + continue; + task->xtra = KSW_XSTOP | task->aln.score; + kv_push(int, msw_task_ids, j); + + // 1. 获取对应的ref + byte_v* ref = &kv_A(*refs, j); + msw_ref_t* ref2 = &kv_A(*refs2, k); + ref2->l_ref = ref->n; + ref2->ref = ref->a; + // 2. 获取对应的seq + msw_seq_t* seq = &kv_A(*seqs, j); + msw_seq_t* seq2 = &kv_A(*seqs2, k); + seq2->seq = seq->seq; + seq2->l_seq = seq->l_seq; + + // 3. 其他数据 + xtras2->a[k] = task->xtra; + ++k; } + roundEndIdx = ((msw_task_ids.n + SIMD512_WIDTH8 - 1) / SIMD512_WIDTH8) * SIMD512_WIDTH8; + for (; k < roundEndIdx; ++k) { + msw_ref_t* ref2 = &kv_A(*refs2, k); + ref2->l_ref = 0; + msw_seq_t* seq2 = &kv_A(*seqs2, k); + seq2->l_seq = 0; + xtras2->a[k] = 0; + } + + // 准备数据并进行计算 + for (i = 0; i < msw_task_ids.n; i += SIMD512_WIDTH8) { + // 将ref和seq赋值给对应的用来计算的缓存 + uint8_t* mySeq1SoA = msw_buf->refArr; + uint8_t* mySeq2SoA = msw_buf->seqArr; + int maxSeqLen = 0, maxRefLen = 0; + + PROF_START(msw_pack_seq); + // 处理ref + for (j = 0; j < SIMD512_WIDTH8; ++j) { + msw_ref_t* ref = &refs2->a[i + j]; + uint8_t* seq1 = ref->ref; + refLen[j] = ref->l_ref; + // 处理ref + for (k = 0; k < ref->l_ref; ++k) { + mySeq1SoA[k * SIMD512_WIDTH8 + j] = (seq1[k] == AMBIG_ ? AMBR : seq1[k]); + } + maxRefLen = max_(maxRefLen, ref->l_ref); + } + + for (j = 0; j < SIMD512_WIDTH8; ++j) { + msw_ref_t* ref = &refs2->a[i + j]; + for (k = ref->l_ref; k < maxRefLen; ++k) { + mySeq1SoA[k * SIMD512_WIDTH8 + j] = 0xFF; + } + } + + // 处理query + for (j = 0; j < SIMD512_WIDTH8; ++j) { + msw_seq_t* seq = &kv_A(*seqs2, i + j); + uint8_t* seq2 = seq->seq; + int quanta = ((seq->l_seq + 16 - 1) / 16) * 16; // based on SSE-8 bit lane + for (k = 0; k < seq->l_seq; k++) { + mySeq2SoA[k * SIMD512_WIDTH8 + j] = (seq2[k] == AMBIG_ ? AMBQ : seq2[k]); + } + for (; k < quanta; ++k) { + mySeq2SoA[k * SIMD512_WIDTH8 + j] = DUMMY5; // SSE quanta + } + maxSeqLen = max_(maxSeqLen, quanta); + } + for (j = 0; j < SIMD512_WIDTH8; ++j) { + msw_seq_t* seq = &kv_A(*seqs2, i + j); + int quanta = ((seq->l_seq + 16 - 1) / 16) * 16; // based on SSE-8 bit lane + for (k = quanta; k < maxSeqLen; k++) { + mySeq2SoA[k * SIMD512_WIDTH8 + j] = 0xFF; + } + } + PROF_END(tprof[T_MSW_PACK_SEQ][tid], msw_pack_seq); + + // 利用smid指令计算 + PROF_START(msw_2); + ksw_align_avx512_u8(opt->a, -1 * opt->b, opt->o_ins, opt->e_ins, opt->o_del, opt->e_del, msw_buf, mySeq1SoA, mySeq2SoA, maxRefLen, maxSeqLen, + &xtras2->a[i], refLen, &kswrs2->a[i], 1); + PROF_END(tprof[T_MSW_2][tid], msw_2); + } + + kv_destroy(msw_task_ids); +#endif } static void worker_calc_matesw16(void* data, int i, int tid) {} @@ -249,8 +376,71 @@ static void update_mem_stats(mem_worker_t* w) { // 开辟缓冲区 static void alloc_update_cache_avx512(mem_worker_t* w) { - int i = 0; + int i = 0, j = 0; for (i = 0; i < w->opt->n_threads; ++i) { + // 检查ref缓存空间 + byte_vv* refs = &w->msw_ref[i]; + if (refs->n < w->opt->msw_batch_size) { // 初始化ref缓存 + for (j = refs->n; j < w->opt->msw_batch_size; ++j) { + byte_v *p = kv_pushp(byte_v, *refs); + kv_init(*p); + } + } + msw_ref_v* refs2 = &w->msw_2_ref[i]; + if (refs2->n < w->opt->msw_batch_size) { // 初始化seq缓存 + for (j = refs2->n; j < w->opt->msw_batch_size; ++j) { + msw_ref_t* p = kv_pushp(msw_ref_t, *refs2); + p->l_ref = 0; + p->ref = 0; + } + } + + // 检查seq缓存空间 + msw_seq_v* seqs = &w->msw_seq[i]; + if (seqs->n < w->opt->msw_batch_size) { // 初始化seq缓存 + for (j = seqs->n; j < w->opt->msw_batch_size; ++j) { + msw_seq_t* p = kv_pushp(msw_seq_t, *seqs); + p->l_seq = 0; + p->seq = 0; + } + } + msw_seq_v* seqs2 = &w->msw_2_seq[i]; + if (seqs2->n < w->opt->msw_batch_size) { // 初始化seq缓存 + for (j = seqs2->n; j < w->opt->msw_batch_size; ++j) { + msw_seq_t* p = kv_pushp(msw_seq_t, *seqs2); + p->l_seq = 0; + p->seq = 0; + } + } + + msw_kswr_v* kswrs = &w->msw_kswr[i]; // 初始化msw结果 + if (kswrs->n < w->opt->msw_batch_size) { + for (j = kswrs->n; j < w->opt->msw_batch_size; ++j) { + kv_pushp(kswr_avx_t, *kswrs); + } + } + + msw_kswr_v* kswrs2 = &w->msw_2_kswr[i]; // 初始化msw结果 + if (kswrs2->n < w->opt->msw_batch_size) { + for (j = kswrs2->n; j < w->opt->msw_batch_size; ++j) { + kv_pushp(kswr_avx_t, *kswrs2); + } + } + + int_v* xtras = &w->msw_xtra[i]; // 初始化xtra + if (xtras->n < w->opt->msw_batch_size) { + for (j = xtras->n; j < w->opt->msw_batch_size; ++j) { + kv_pushp(int, *xtras); + } + } + + int_v* xtras2 = &w->msw_2_xtra[i]; // 初始化xtra + if (xtras2->n < w->opt->msw_batch_size) { + for (j = xtras2->n; j < w->opt->msw_batch_size; ++j) { + kv_pushp(int, *xtras2); + } + } + matesw_buf_t* b = &w->msw_buf[i]; // 更新跟ref len有关的缓冲区 if (b->ref_len < w->mem_stats[i].max_msw_ref_len) { @@ -259,6 +449,11 @@ static void alloc_update_cache_avx512(mem_worker_t* w) { if (b->rowMax) _mm_free(b->rowMax); b->refArr = (uint8_t*)_mm_malloc(b->ref_len * SIMD512_WIDTH8 * sizeof(uint8_t), 64); b->rowMax = (uint8_t*)_mm_malloc(b->ref_len * SIMD512_WIDTH8 * sizeof(uint8_t), 64); + for (j = 0; j < w->opt->msw_batch_size; ++j) { + byte_v* ref = &kv_A(*refs, j); + kv_resize(uint8_t, *ref, b->ref_len); + // ref->n = ref->m; + } } // 更新跟seq len有关的缓冲区 if (b->seq_len < w->mem_stats[i].max_seq_len) { @@ -287,9 +482,10 @@ void gen_paired_sam(mem_worker_t* w) { if (w->opt->flag & MEM_F_NO_RESCUE) { kt_for(w->opt->n_threads, worker_sam, w, w->n_reads >> 1); // generate alignment } else { + int batch_n = (w->n_reads + w->opt->batch_size - 1) / w->opt->batch_size; // 1. 计算哪些read需要matesw PROF_START(get_matesw_data); - kt_for(w->opt->n_threads, worker_matesw_data, w, w->n_reads >> 1); + kt_for(w->opt->n_threads, worker_matesw_data, w, batch_n); PROF_END(gprof[G_get_matesw_data], get_matesw_data); // 更新stats PROF_START(update_stats_cache); @@ -303,10 +499,11 @@ void gen_paired_sam(mem_worker_t* w) { gather_matesw_task(w, w->matesw_arr16, &w->msw_tasks16); PROF_END(gprof[G_gather_matesw_task], gather_matesw_task); PROF_START(calc_matesw); + int msw_batch_n = (w->msw_tasks8.n + w->opt->msw_batch_size - 1) / w->opt->msw_batch_size; if (w->msw_tasks8.n > 0) - kt_for(w->opt->n_threads, worker_calc_matesw_avx512_u8, w, (w->msw_tasks8.n + SIMD512_WIDTH8 - 1) / SIMD512_WIDTH8); + kt_for(w->opt->n_threads, worker_calc_matesw_avx512_u8, w, msw_batch_n); if (w->msw_tasks16.n > 0) - kt_for(w->opt->n_threads, worker_calc_matesw16, w, (w->msw_tasks16.n + SIMD512_WIDTH16 - 1) / SIMD512_WIDTH16); + kt_for(w->opt->n_threads, worker_calc_matesw16, w, msw_batch_n); PROF_END(gprof[G_calc_matesw], calc_matesw); // 3. kt_for(w->opt->n_threads, workder_gen_sam, w, w->n_reads >> 1); diff --git a/profiling.c b/profiling.c index 833d254..cf3efae 100644 --- a/profiling.c +++ b/profiling.c @@ -126,6 +126,20 @@ int display_stats(int nthreads) fprintf(stderr, "calc_matesw: %0.2lf s\n", gprof[G_calc_matesw] * 1.0 / proc_freq); fprintf(stderr, "gen_sam: %0.2lf s\n", gprof[G_gen_sam] * 1.0 / proc_freq); + find_opt(tprof[T_MSW_GET_REF], nthreads, &max, &min, &avg); + fprintf(stderr, "msw_get_ref: %0.2lf s\n", avg); + + find_opt(tprof[T_MSW_PACK_SEQ], nthreads, &max, &min, &avg); + fprintf(stderr, "msw_pack_seq: %0.2lf s\n", avg); + + find_opt(tprof[T_MSW_1], nthreads, &max, &min, &avg); + fprintf(stderr, "msw_1: %0.2lf s\n", avg); + + find_opt(tprof[T_MSW_2], nthreads, &max, &min, &avg); + fprintf(stderr, "msw_2: %0.2lf s\n", avg); + + + fprintf(stderr, "more num: %ld\n", more_num); fprintf(stderr, "no more num: %ld\n", not_more_num); diff --git a/profiling.h b/profiling.h index ffad5e2..88e1e1d 100644 --- a/profiling.h +++ b/profiling.h @@ -102,6 +102,10 @@ enum T_SEEDING, T_EXTENSION, T_SAM, + T_MSW_1, + T_MSW_2, + T_MSW_GET_REF, + T_MSW_PACK_SEQ, }; int display_stats(int); diff --git a/run.sh b/run.sh index 756933b..1446e8a 100755 --- a/run.sh +++ b/run.sh @@ -2,8 +2,8 @@ thread=32 make -j 16 -n1=~/data/dataset/D1/n1.fq.gz -n2=~/data/dataset/D1/n2.fq.gz +n1=~/data/dataset/D2/n1.fq.gz +n2=~/data/dataset/D2/n2.fq.gz reference=~/data/reference/fmt/human_g1k_v37_decoy.fasta