From 884e47d57d8aa645b5db527b275ec69d2b88acfd Mon Sep 17 00:00:00 2001 From: zzh Date: Mon, 19 Jan 2026 01:17:43 +0800 Subject: [PATCH] =?UTF-8?q?=E5=B0=86hyb-index=E5=92=8Cavx512=E7=9A=84mate-?= =?UTF-8?q?sw=E7=BB=93=E5=90=88=E8=B5=B7=E6=9D=A5=E4=BA=86=EF=BC=8C?= =?UTF-8?q?=E9=80=9F=E5=BA=A6=E5=BE=88=E5=BF=AB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .vscode/launch.json | 7 +- .vscode/tasks.json | 2 +- Makefile | 21 +- bntseq.c | 24 ++ bntseq.h | 3 +- bwa.h | 7 +- bwamem.c | 49 ++- bwamem.h | 7 +- bwamem_pair.c | 2 +- fastmap.c | 7 +- ksw_align_avx.h | 74 ++++ ksw_align_avx2.c | 9 + ksw_align_avx512.c | 362 ++++++++++++++++++ mate_sw.c | 91 +++++ mate_sw.h | 99 +++++ paired_sam.c | 892 ++++++++++++++++++++++++++++++++++++++++++++ paired_sam.h | 44 +++ profiling.c | 14 +- profiling.h | 15 +- utils.h | 43 +++ 20 files changed, 1729 insertions(+), 43 deletions(-) create mode 100644 ksw_align_avx.h create mode 100644 ksw_align_avx2.c create mode 100644 ksw_align_avx512.c create mode 100644 mate_sw.c create mode 100644 mate_sw.h create mode 100644 paired_sam.c create mode 100644 paired_sam.h diff --git a/.vscode/launch.json b/.vscode/launch.json index fd57c23..34f8f4b 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -17,7 +17,8 @@ "-M", "-R", "'@RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa'", - "/home/zzh/work/bioinfo/hyb-align/index/human_g1k_v37_decoy.fasta", + // "/home/zzh/work/bioinfo/hyb-align/index/human_g1k_v37_decoy.fasta", + "~/data/reference/hyb/human_g1k_v37_decoy.fasta", //"./b1.fq", "./b2.fq", //"~/data/dataset/real/D1/n1.fq", "~/data/dataset/real/D1/n2.fq", //"~/data1/fastq/dataset/zy_wgs/E150010395_L01_690_1.fq", @@ -25,8 +26,8 @@ //"~/data/dataset/real/D3/n1.fq", //"~/data/dataset/real/D3/n2.fq", // "~/data/dataset/real/D2/n1.fq.gz", "~/data/dataset/real/D2/n2.fq.gz", - //"~/data/dataset/real/D2/n1.fq.gz", "~/data/dataset/real/D2/n2.fq.gz", - "~/data/dataset/real/D3/1w1.fq", "~/data/dataset/real/D3/1w2.fq", + "~/data/dataset/D2/n1.fq.gz", "~/data/dataset/D2/n2.fq.gz", + //"~/data/dataset/real/D3/1w1.fq", "~/data/dataset/real/D3/1w2.fq", "-o", "/dev/null", // "-g", diff --git a/.vscode/tasks.json b/.vscode/tasks.json index 2ea391f..9e3ef23 100644 --- a/.vscode/tasks.json +++ b/.vscode/tasks.json @@ -6,7 +6,7 @@ { "label": "Build", "type": "shell", - "command": "make -j 16", + "command": "make clean; make -j 32", "problemMatcher": [], "group": { "kind": "build", diff --git a/Makefile b/Makefile index 5369d09..e2a86ae 100644 --- a/Makefile +++ b/Makefile @@ -1,11 +1,22 @@ CC= gcc #CC= clang --analyze -CFLAGS= -g -Wall -Wno-unused-function -mavx2 -O3 -WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS AR= ar -DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) -DUSE_AVX2_EXT -DSHOW_PERF -DSHOW_DATA_PERF -DDEBUG_FILE_OUTPUT + +AVX2 = -mavx2 +AVX512BW = -mavx512bw + +OPTIMIZE=-O3 +CFLAGS= -g -Wall -Wno-unused-function $(AVX2) $(AVX512BW) $(OPTIMIZE) + +WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS +#SAM_EXACT= -DSAM_EXACT +SHOW_PERF= -DSHOW_PERF +SHOW_DATA_PERF= -DSHOW_DATA_PERF +DEBUG_FILE_OUTPUT= -DDEBUG_FILE_OUTPUT + +DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) -DUSE_AVX2_EXT $(SHOW_PERF) $(SHOW_DATA_PERF) $(SAM_EXACT) $(DEBUG_FILE_OUTPUT) HYBOBJS= hyb_bwa.o hyb_utils.o hyb_seeding_1.o hyb_seeding_2.o hyb_seeding_3.o hyb_create_idx.o debug.o profiling.o share_mem.o yarn.o \ - ksw_extend2_avx2.o ksw_extend2_avx2_u8.o + ksw_extend2_avx2.o ksw_extend2_avx2_u8.o paired_sam.o ksw_align_avx2.o ksw_align_avx512.o mate_sw.o LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o \ QSufSort.o bwt_gen.o rope.o rle.o is.o bwtindex.o AOBJS= bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ @@ -17,7 +28,7 @@ PROG= hybalign INCLUDES= LIBS= -lm -lz -lpthread SUBDIRS= . -JE_MALLOC=/home/zzh/work/bioinfo/libjemalloc.a +JE_MALLOC=#../libjemalloc.a ifeq ($(shell uname -s),Linux) LIBS += -lrt diff --git a/bntseq.c b/bntseq.c index f932da9..276fbb8 100644 --- a/bntseq.c +++ b/bntseq.c @@ -425,6 +425,30 @@ uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end return seq; } +void bns_get_seq_no_alloc(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, size_t *len, size_t *m_seq, uint8_t **seqp) +{ + if (end < beg) end ^= beg, beg ^= end, end ^= beg; // if end is smaller, swap + if (end > l_pac<<1) end = l_pac<<1; + if (beg < 0) beg = 0; + if (beg >= l_pac || end <= l_pac) { + int64_t k, l = 0; + *len = end - beg; + if (*len > *m_seq) { + *m_seq = *len; + *seqp = realloc(*seqp, end - beg); + } + if (beg >= l_pac) { // reverse strand + int64_t beg_f = (l_pac<<1) - 1 - end; + int64_t end_f = (l_pac<<1) - 1 - beg; + for (k = end_f; k > beg_f; --k) + (*seqp)[l++] = 3 - _get_pac(pac, k); + } else { // forward strand + for (k = beg; k < end; ++k) + (*seqp)[l++] = _get_pac(pac, k); + } + } else *len = 0; // if bridging the forward-reverse boundary, return nothing +} + uint8_t *bns_fetch_seq(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, int64_t mid, int64_t *end, int *rid) { int64_t far_beg, far_end, len; diff --git a/bntseq.h b/bntseq.h index 92e3afc..5aa237a 100644 --- a/bntseq.h +++ b/bntseq.h @@ -79,9 +79,10 @@ extern "C" { uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len); uint8_t *bns_fetch_seq(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, int64_t mid, int64_t *end, int *rid); int bns_intv2rid(const bntseq_t *bns, int64_t rb, int64_t re); + void bns_get_seq_no_alloc(int64_t l_pac, const uint8_t* pac, int64_t beg, int64_t end, size_t* len, size_t* m_seq, uint8_t** seqp); #ifdef __cplusplus -} + } #endif static inline int64_t bns_depos(const bntseq_t *bns, int64_t pos, int *is_rev) diff --git a/bwa.h b/bwa.h index 91c2663..7c0eec1 100644 --- a/bwa.h +++ b/bwa.h @@ -31,8 +31,10 @@ #include "bntseq.h" #include "bwt.h" -#include "kstring.h" #include "hyb_idx.h" +#include "kstring.h" +#include "ksw_align_avx.h" +#include "mate_sw.h" #define BWA_IDX_BWT 0x1 #define BWA_IDX_BNS 0x2 @@ -63,7 +65,8 @@ typedef struct { int l_seq, id; int m_name, m_comment, m_seq, m_qual; char *name, *comment, *seq, *qual; - kstring_t sam; + // kstring_t sam; + msw_seq_task_v msw_task; } bseq1_t; typedef struct { diff --git a/bwamem.c b/bwamem.c index 9f2845c..a64d3cb 100644 --- a/bwamem.c +++ b/bwamem.c @@ -34,14 +34,17 @@ #include #endif -#include "kstring.h" -#include "bwamem.h" #include "bntseq.h" +#include "bwamem.h" +#include "debug.h" +#include "hyb_idx.h" +#include "ksort.h" +#include "kstring.h" #include "ksw.h" #include "kvec.h" -#include "ksort.h" +#include "paired_sam.h" +#include "profiling.h" #include "utils.h" -#include "hyb_idx.h" #ifdef USE_MALLOC_WRAPPERS # include "malloc_wrap.h" @@ -112,6 +115,8 @@ mem_opt_t *mem_opt_init() o->skip_entire_match = 0; o->batch_size = 256; + o->msw_batch_size = 1024; + return o; } @@ -160,13 +165,15 @@ mem_worker_t* init_mem_worker(const mem_opt_t* opt, const bwt_t *bwt, const Hybr w->chain_arr = (mem_chain_v **)malloc(i * sizeof(mem_chain_v *)); w->isize_arr = (uint64_v **)malloc(i * sizeof(uint64_v *)); w->seed_arr = (HybSeedArr **)malloc(i * sizeof(HybSeedArr*)); + w->msw = calloc(1, sizeof(msw_data_t)); + init_msw_data(w->msw, opt->n_threads, opt->msw_batch_size); for (i = 0; i < opt->n_threads; ++i) { w->aux[i] = smem_aux_init(); w->smem_arr[i] = (smem_v*)malloc(opt->batch_size * sizeof(smem_v)); w->chain_arr[i] = (mem_chain_v*)malloc(opt->batch_size * sizeof(mem_chain_v)); - w->isize_arr[i] = (uint64_v *)calloc(4, sizeof(uint64_v)); - w->seed_arr[i] = (HybSeedArr *)malloc(opt->batch_size * sizeof(HybSeedArr)); + w->isize_arr[i] = (uint64_v*)calloc(4, sizeof(uint64_v)); + w->seed_arr[i] = (HybSeedArr*)malloc(opt->batch_size * sizeof(HybSeedArr)); for (j = 0; j < opt->batch_size; ++j) { kv_init(w->smem_arr[i][j].mem); kv_init(w->smem_arr[i][j].pos_arr); @@ -242,6 +249,14 @@ void find_smem(const mem_opt_t* opt, const bwt_t* bwt, int len, const uint8_t* s smemv->pos_arr.n = 0; } +static inline void clear_seeds(HybSeedArr* seeds) { + int i, j; + for (i = 0; i < seeds->n; ++i) { + _destory_clear_vec(seeds->a[i].ref_pos_arr); + } + _destory_clear_vec(*seeds); +} + // hybrid-index-based seeding #define hyb_seed_lt(a, b) ((a).seed_start == (b).seed_start ? (a).seed_end < (b).seed_end : (a).seed_start < (b).seed_start) KSORT_INIT(hyb_seed, HybSeed, hyb_seed_lt) @@ -250,7 +265,8 @@ static void hyb_seeding(const mem_opt_t* opt, const HybridIndex* hyb, ReadSeq* r HybSeedArr* seeds, uint64_t seq_id, int tid) { int i = 0; int split_len = (int)(opt->min_seed_len * opt->split_factor + .499); - seeds->n = 0; + // seeds->n = 0; + clear_seeds(seeds); // fprintf(stderr, "seq-id: %ld\n", seq_id); @@ -701,8 +717,6 @@ int mem_sort_dedup_patch(const mem_opt_t *opt, const bntseq_t *bns, const uint8_ return m; } -typedef kvec_t(int) int_v; - static void mem_mark_primary_se_core(const mem_opt_t *opt, int n, mem_alnreg_t *a, int_v *z) { // similar to the loop in mem_chain_flt() int i, k, tmp; @@ -1137,7 +1151,7 @@ void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq kputsn("\tMD:Z:", 6, str); kputs((char*)(p->cigar + p->n_cigar), str); } if (m && m->n_cigar) { kputsn("\tMC:Z:", 6, str); add_cigar(opt, m, str, which); } - if (m) { kputsn("\tMQ:i:", 6, str); kputw(m->mapq, str);} + // zzh if (m) { kputsn("\tMQ:i:", 6, str); kputw(m->mapq, str);} if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); } if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); } if (bwa_rg_id[0]) { kputsn("\tRG:Z:", 6, str); kputs(bwa_rg_id, str); } @@ -1236,7 +1250,6 @@ void mem_reorder_primary5(int T, mem_alnreg_v *a) // TODO (future plan): group hits into a uint64_t[] array. This will be cleaner and more flexible void mem_reg2sam(const mem_opt_t* opt, const bntseq_t* bns, const uint8_t* pac, bseq1_t* s, mem_alnreg_v* a, int extra_flag, const mem_aln_t* m, seq_sam_t* ss) { - extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, mem_alnreg_v *a, int l_query, const char *query); // kstring_t str; kvec_t(mem_aln_t) aa; int k, l; @@ -1378,7 +1391,7 @@ static inline int mem_infer_dir(int64_t l_pac, int64_t b1, int64_t b2, int64_t* // mem主要流程 void mem_core_process(const mem_opt_t* opt, const bwt_t* bwt, const HybridIndex* hyb, const bntseq_t* bns, const uint8_t* pac, bseq1_t* seq_arr, - int nseq, smem_aux_t* aux, void* seed_arr, mem_chain_v* chain_arr, mem_alnreg_v* reg_arr, int calc_isize, int64_t l_pac, + int nseq, smem_aux_t* aux, smem_v* smem_arr, HybSeedArr* seed_arr, mem_chain_v* chain_arr, mem_alnreg_v* reg_arr, int calc_isize, int64_t l_pac, uint64_v* isize, int tid) { int i, j, l_seq; mem_chain_v* chnp; @@ -1386,7 +1399,6 @@ void mem_core_process(const mem_opt_t* opt, const bwt_t* bwt, const HybridIndex* char* seq; if (opt->use_bwt) { - smem_v *smem_arr = (smem_v*)seed_arr; // 1. seeding PROF_START(seeding); for (i = 0; i < nseq; ++i) { @@ -1422,7 +1434,7 @@ void mem_core_process(const mem_opt_t* opt, const bwt_t* bwt, const HybridIndex* } PROF_END(tprof[T_CHAIN][tid], chain); } else { - HybSeedArr* seeds = (HybSeedArr*)seed_arr; + HybSeedArr* seeds = seed_arr; // 1. seeding PROF_START(seeding); RangeArr read_ranges = {0}; @@ -1563,7 +1575,7 @@ static void worker_smem_extension(void *data, long i, int tid) mem_worker_t *w = (mem_worker_t*)data; int start = i * w->opt->batch_size; int end = MIN(start + w->opt->batch_size, w->n_reads); - mem_core_process(w->opt, w->bwt, w->hyb, w->bns, w->pac, w->seqs + start, end - start, w->aux[tid], w->smem_arr[tid], w->chain_arr[tid], w->regs + start, + mem_core_process(w->opt, w->bwt, w->hyb, w->bns, w->pac, w->seqs + start, end - start, w->aux[tid], w->smem_arr[tid], w->seed_arr[tid], w->chain_arr[tid], w->regs + start, w->calc_isize, w->bns->l_pac, w->isize_arr[tid], tid); } @@ -1586,7 +1598,6 @@ static void worker_sam(void *data, long i, int tid) } void mem_process_seqs(const mem_opt_t* opt, mem_worker_t* w, int64_t n_processed, int n, bseq1_t* seqs, const mem_pestat_t* pes0, seq_sam_t* sams) { - extern void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n); mem_pestat_t pes[4]; double ctime, rtime; int n_batch = (n + opt->batch_size - 1) / opt->batch_size; @@ -1624,7 +1635,11 @@ void mem_process_seqs(const mem_opt_t* opt, mem_worker_t* w, int64_t n_processed PROF_END(gprof[G_MEM_PESTAT], pestat); PROF_START(gen_sam); - kt_for(opt->n_threads, worker_sam, w, (opt->flag & MEM_F_PE) ? n >> 1 : n); // generate alignment + if ((opt->flag & MEM_F_PE)) { // pair-end + gen_paired_sam(w); + } else { + kt_for(opt->n_threads, worker_sam, w, n); // generate alignment + } PROF_END(gprof[G_GEN_SAM], gen_sam); if (bwa_verbose >= 3) diff --git a/bwamem.h b/bwamem.h index 3d42c39..6c58a55 100644 --- a/bwamem.h +++ b/bwamem.h @@ -31,6 +31,8 @@ #include "bwa.h" #include "bwt.h" #include "hyb_idx.h" +#include "ksw_align_avx.h" +#include "mate_sw.h" #include "utils.h" #define MEM_MAPQ_COEF 30.0 @@ -86,6 +88,7 @@ typedef struct { int batch_size; // number of bases to process in each batch int use_bwt; // whether to use bwt index for seeding int skip_entire_match; // whether to skip the second and third seeding steps for entire matching reads + int msw_batch_size; // 每次处理的msw的个数,64的倍数 } mem_opt_t; typedef struct { @@ -147,9 +150,6 @@ typedef struct { mem_chain_t* a; } mem_chain_v; -typedef kvec_t(uint8_t) byte_v; -typedef kvec_t(byte_v) byte_vv; - typedef struct { bwtintv_v mem, mem1, *tmpv[2]; buf_t *sw_buf, *seq_buf; @@ -181,6 +181,7 @@ typedef struct { mem_chain_v** chain_arr; mem_alnreg_v* regs; uint64_v** isize_arr; + msw_data_t* msw; int64_t n_processed; int64_t n; int64_t n_reads; diff --git a/bwamem_pair.c b/bwamem_pair.c index 91c4cc2..3d83e89 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -169,7 +169,7 @@ int mem_matesw(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co int tmp, xtra = KSW_XSUBO | KSW_XSTART | (l_ms * opt->a < 250? KSW_XBYTE : 0) | (opt->min_seed_len * opt->a); aln = ksw_align2(l_ms, seq, re - rb, ref, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, xtra, 0, tid); #ifdef SHOW_DATA_PERF - tdat[TD_MATESW_CNT][tid] += 1; + tdat[TD_MSW_CNT][tid] += 1; #endif memset(&b, 0, sizeof(mem_alnreg_t)); if (aln.score >= opt->min_seed_len && aln.qb >= 0) { // something goes wrong if aln.qb < 0 diff --git a/fastmap.c b/fastmap.c index f8a3813..5345e9d 100644 --- a/fastmap.c +++ b/fastmap.c @@ -142,8 +142,9 @@ static inline void* calc_data(ktp_aux_t* aux, ktp_data_t* data) { free(sep[1]); free(ss[0]); free(ss[1]); - } else + } else { mem_process_seqs(opt, aux->w, aux->n_processed, data->n_seqs, data->seqs, aux->pes0, data->sams); + } aux->n_processed += data->n_seqs; PROF_END(gprof[G_COMPUTE], compute); @@ -169,6 +170,8 @@ static inline void* write_data(ktp_aux_t* aux, ktp_data_t* data) { buf_written = 0; } } + // 释放空间 + _destory_clear_kstring(data->sams[i].sam); } if (buf_written > 0) { err_fwrite(aux->wbuf, 1, buf_written, stdout); @@ -440,7 +443,7 @@ int main_mem(int argc, char *argv[]) __func__, pes[1].avg, pes[1].std, pes[1].high, pes[1].low); } else if (c == 'b') opt->batch_size = atoi(optarg) >> 1 << 1, opt->batch_size = opt->batch_size > 1 ? opt->batch_size : 256; - else if (c == 'g') + else if (c == 'g') // legacy bwt index opt->use_bwt = 1; else if (c == 'e') opt->skip_entire_match = 1; else return 1; diff --git a/ksw_align_avx.h b/ksw_align_avx.h new file mode 100644 index 0000000..ea3ea64 --- /dev/null +++ b/ksw_align_avx.h @@ -0,0 +1,74 @@ +#pragma once + +#include +#include +#include + +#include "ksw.h" + + +#define AMBIG_ 4 // ambiguous base +// for 16 bit +#define DUMMY1_ 4 +#define DUMMY2_ 5 +#define DUMMY3 26 +#define AMBR16 15 +#define AMBQ16 16 +// for 8-bit +#define DUMMY8 8 +#define DUMMY5 5 +#define AMBRQ 0xFF +#define AMBR 4 +#define AMBQ 8 + +#define min_(x, y) ((x) > (y) ? (y) : (x)) +#define max_(x, y) ((x) > (y) ? (x) : (y)) + +typedef struct { + int score; // best score + int te, qe; // target end and query end + int score2, te2; // second best score and ending position on the target + int tb, qb; // target start and query start +} kswr_avx_t; + +typedef struct { + int seq_len; + int ref_len; + uint8_t* H0; + uint8_t* H1; + uint8_t* Hmax; + uint8_t* F; + uint8_t* rowMax; + uint8_t* seqArr; + uint8_t* refArr; +} msw_buf_t; // inter-query算法的mate sw计算过程需要用到的缓存空间 + +#ifdef __cplusplus +extern "C" { +#endif + + +void ksw_align_avx2_u8(int qlen, uint8_t* query, int tlen, uint8_t* target, int m, const int8_t* mat, int gapo, int gape, int xtra, kswq_t** qry); +void ksw_align_avx2_i16(int qlen, uint8_t* query, int tlen, uint8_t* target, int m, const int8_t* mat, int gapo, int gape, int xtra, kswq_t** qry); + +void ksw_align_avx512_u8(int8_t w_match, // match分数,正数 + int8_t w_mismatch, // 错配罚分,负数 + int8_t o_ins, // 开始一个insert罚分,正数 + int8_t e_ins, // 延续一个insert罚分,正数 + int8_t o_del, // 开始一个delete罚分,正数 + int8_t e_del, // 延续一个delete罚分,正数 + msw_buf_t* cache, // 计算用到的一些数据 + uint8_t* seq1SoA, // ref序列,已经pack好了 + uint8_t* seq2SoA, // seq序列 + int16_t nrow, // 最长的行数,对应ref长度 + int16_t ncol, // 最长的列数,对应seq长度 + int* xtras, // 每个seq对应一个xtra + int* rlenA, // ref真实长度 + kswr_avx_t* alns, // 存放结果 + int phase); // 正向阶段0,反向阶段1 + +void ksw_align_avx512_i16(int qlen, uint8_t* query, int tlen, uint8_t* target, int m, const int8_t* mat, int gapo, int gape, int xtra, kswq_t** qry); + +#ifdef __cplusplus +} +#endif \ No newline at end of file diff --git a/ksw_align_avx2.c b/ksw_align_avx2.c new file mode 100644 index 0000000..82a9fa5 --- /dev/null +++ b/ksw_align_avx2.c @@ -0,0 +1,9 @@ +#include +#include +#include +#include +#include +#include +#include +#include "utils.h" +#include "ksw.h" diff --git a/ksw_align_avx512.c b/ksw_align_avx512.c new file mode 100644 index 0000000..bf0a386 --- /dev/null +++ b/ksw_align_avx512.c @@ -0,0 +1,362 @@ +#include "ksw_align_avx.h" + +#undef SIMD_WIDTH8 +#undef SIMD_WIDTH16 +#define SIMD_WIDTH8 64 +#define SIMD_WIDTH16 32 + +// 默认的非ACGT对应的罚分 +static const int8_t w_ambig = -1; + +#define MAIN_SAM_CODE8_OPT(s1, s2, h00, h11, e11, f11, f21, max512, sft512) \ + { \ + __m512i sbt11, xor11, or11; \ + xor11 = _mm512_xor_si512(s1, s2); \ + sbt11 = _mm512_shuffle_epi8(permSft512, xor11); \ + __mmask64 cmpq = _mm512_cmpeq_epu8_mask(s2, five512); \ + sbt11 = _mm512_mask_blend_epi8(cmpq, sbt11, sft512); \ + or11 = _mm512_or_si512(s1, s2); \ + __mmask64 cmp = _mm512_movepi8_mask(or11); \ + __m512i m11 = _mm512_adds_epu8(h00, sbt11); \ + m11 = _mm512_mask_blend_epi8(cmp, m11, zero512); \ + m11 = _mm512_subs_epu8(m11, sft512); \ + h11 = _mm512_max_epu8(m11, e11); \ + h11 = _mm512_max_epu8(h11, f11); \ + __mmask64 cmp0 = _mm512_cmpgt_epu8_mask(h11, imax512); \ + imax512 = _mm512_max_epu8(imax512, h11); \ + iqe512 = _mm512_mask_blend_epi8(cmp0, iqe512, l512); \ + __m512i gapE512 = _mm512_subs_epu8(h11, oe_ins512); \ + e11 = _mm512_subs_epu8(e11, e_ins512); \ + e11 = _mm512_max_epu8(gapE512, e11); \ + __m512i gapD512 = _mm512_subs_epu8(h11, oe_del512); \ + f21 = _mm512_subs_epu8(f11, e_del512); \ + f21 = _mm512_max_epu8(gapD512, f21); \ + } + +#define MAIN_SAM_CODE16_OPT(s1, s2, h00, h11, e11, f11, f21, max512) \ + { \ + __m512i sbt11, xor11, or11; \ + xor11 = _mm512_xor_si512(s1, s2); \ + sbt11 = _mm512_permutexvar_epi16(xor11, perm512); \ + __m512i m11 = _mm512_add_epi16(h00, sbt11); \ + or11 = _mm512_or_si512(s1, s2); \ + __mmask64 cmp = _mm512_movepi8_mask(or11); \ + m11 = _mm512_mask_blend_epi8(cmp, m11, zero512); \ + h11 = _mm512_max_epi16(m11, e11); \ + h11 = _mm512_max_epi16(h11, f11); \ + h11 = _mm512_max_epi16(h11, zero512); \ + __mmask32 cmp0 = _mm512_cmpgt_epi16_mask(h11, imax512); \ + imax512 = _mm512_max_epi16(imax512, h11); \ + iqe512 = _mm512_mask_blend_epi16(cmp0, iqe512, l512); \ + __m512i gapE512 = _mm512_sub_epi16(h11, oe_ins512); \ + e11 = _mm512_sub_epi16(e11, e_ins512); \ + e11 = _mm512_max_epi16(gapE512, e11); \ + __m512i gapD512 = _mm512_sub_epi16(h11, oe_del512); \ + f21 = _mm512_sub_epi16(f11, e_del512); \ + f21 = _mm512_max_epi16(gapD512, f21); \ + } + +void ksw_align_avx512_u8(int8_t w_match, // match分数,正数 + int8_t w_mismatch, // 错配罚分,负数 + int8_t o_ins, // 开始一个insert罚分,正数 + int8_t e_ins, // 延续一个insert罚分,正数 + int8_t o_del, // 开始一个delete罚分,正数 + int8_t e_del, // 延续一个delete罚分,正数 + msw_buf_t* cache, // 计算用到的一些数据 + uint8_t* seq1SoA, // ref序列,已经pack好了 + uint8_t* seq2SoA, // seq序列 + int16_t nrow, // 最长的行数,对应ref长度 + int16_t ncol, // 最长的列数,对应seq长度 + int* xtras, // 每个seq对应一个xtra + int* rlenA, // ref真实长度 + kswr_avx_t* alns, // 存放结果 + int phase) { // 正向阶段0,反向阶段1 + + int g_qmax = max_(w_match, w_mismatch); + g_qmax = max_(g_qmax, w_ambig); + + uint8_t minsc[SIMD_WIDTH8] __attribute__((aligned(64))) = {0}; // min score ? + uint8_t endsc[SIMD_WIDTH8] __attribute__((aligned(64))) = {0}; // ending position score ? + + __m512i zero512 = _mm512_setzero_si512(); + __m512i one512 = _mm512_set1_epi8(1); + + int8_t temp[SIMD_WIDTH8] __attribute((aligned(64))) = {0}; // 应该是用来根据比较结果赋分值的 + + uint8_t shift = 127, mdiff = 0; + mdiff = max_(w_match, (int8_t)w_mismatch); + mdiff = max_(mdiff, (int8_t)w_ambig); + shift = min_(w_match, (int8_t)w_mismatch); + shift = min_((int8_t)shift, w_ambig); + + shift = 256 - (uint8_t)shift; + mdiff += shift; + + temp[0] = w_match; // states: 1. matches + temp[1] = temp[2] = temp[3] = w_mismatch; // 2. mis-matches + temp[4] = temp[5] = temp[6] = temp[7] = w_ambig; // 3. beyond boundary + temp[8] = temp[9] = temp[10] = temp[11] = w_ambig; // 4. 0 - sse2 region + temp[12] = w_ambig; // 5. ambig + + for (int i = 0; i < 16; i++) // for shuffle_epi8 + temp[i] += shift; + + int pos = 0; + for (int i = 16; i < SIMD_WIDTH8; i++) { + temp[i] = temp[pos++]; + if (pos % 16 == 0) + pos = 0; + } + + __m512i permSft512 = _mm512_load_si512(temp); + __m512i sft512 = _mm512_set1_epi8(shift); + __m512i cmax512 = _mm512_set1_epi8(255); + + // __m512i minsc512, endsc512; + __mmask64 minsc_msk_a = 0x0000, endsc_msk_a = 0x0000; + int val = 0; + for (int i = 0; i < SIMD_WIDTH8; i++) { + int xtra = xtras[i]; + val = (xtra & KSW_XSUBO) ? xtra & 0xffff : 0x10000; + if (val <= 255) { + minsc[i] = val; + minsc_msk_a |= (0x1L << i); + } + // msc_mask; + val = (xtra & KSW_XSTOP) ? xtra & 0xffff : 0x10000; + if (val <= 255) { + endsc[i] = val; + endsc_msk_a |= (0x1L << i); + } + } + + __m512i minsc512 = _mm512_load_si512((__m512i*)minsc); + __m512i endsc512 = _mm512_load_si512((__m512i*)endsc); + + __m512i e_del512 = _mm512_set1_epi8(e_del); + __m512i oe_del512 = _mm512_set1_epi8(o_del + e_del); + __m512i e_ins512 = _mm512_set1_epi8(e_ins); + __m512i oe_ins512 = _mm512_set1_epi8(o_ins + e_ins); + __m512i five512 = _mm512_set1_epi8(DUMMY5); // ambig mapping element + __m512i gmax512 = zero512; // exit1 = zero512; + __m512i te512 = _mm512_set1_epi16(-1); // changed to -1 + __m512i te512_ = _mm512_set1_epi16(-1); // changed to -1 + + __mmask64 exit0 = 0xFFFFFFFFFFFFFFFF; + + // 计算过程用到的一些数据,用cache预先开辟的空间 + uint8_t* H0 = cache->H0; + uint8_t* H1 = cache->H1; + uint8_t* Hmax = cache->Hmax; + uint8_t* F = cache->F; + uint8_t* rowMax = cache->rowMax; + + for (int i = 0; i <= ncol; i++) { + _mm512_store_si512((__m512*)(H0 + i * SIMD_WIDTH8), zero512); + _mm512_store_si512((__m512*)(Hmax + i * SIMD_WIDTH8), zero512); + _mm512_store_si512((__m512*)(F + i * SIMD_WIDTH8), zero512); + } +#if 1 + __m512i max512 = zero512, imax512, pimax512 = zero512; + __mmask64 mask512 = 0x0000; + __mmask64 minsc_msk = 0x0000; + + __m512i qe512 = _mm512_set1_epi8(0); + _mm512_store_si512((__m512i*)(H0), zero512); + _mm512_store_si512((__m512i*)(H1), zero512); +#endif +#if 1 + int i, limit = nrow; + for (i = 0; i < nrow; i++) { + __m512i e11 = zero512; + __m512i h00, h11, s1; + __m512i i512 = _mm512_set1_epi16(i); + int j; + + s1 = _mm512_load_si512((__m512i*)(seq1SoA + (i + 0) * SIMD_WIDTH8)); + imax512 = zero512; + __m512i iqe512 = _mm512_set1_epi8(-1); + + __m512i l512 = zero512; + for (j = 0; j < ncol; j++) { + __m512i f11, s2, f21; + h00 = _mm512_load_si512((__m512i*)(H0 + j * SIMD_WIDTH8)); // check for col "0" + s2 = _mm512_load_si512((__m512i*)(seq2SoA + (j)*SIMD_WIDTH8)); + f11 = _mm512_load_si512((__m512i*)(F + (j + 1) * SIMD_WIDTH8)); + + MAIN_SAM_CODE8_OPT(s1, s2, h00, h11, e11, f11, f21, max512, sft512); + + _mm512_store_si512((__m512i*)(H1 + (j + 1) * SIMD_WIDTH8), h11); // check for col "0" + _mm512_store_si512((__m512i*)(F + (j + 1) * SIMD_WIDTH8), f21); + l512 = _mm512_add_epi8(l512, one512); + } + + // Block I,从第二行开始,需要和前一行比较,来计算max score + if (i > 0) { + __mmask64 msk64 = _mm512_cmpgt_epu8_mask(imax512, pimax512); + msk64 |= mask512; + pimax512 = _mm512_mask_blend_epi8(msk64, pimax512, zero512); + pimax512 = _mm512_mask_blend_epi8(minsc_msk, zero512, pimax512); + pimax512 = _mm512_mask_blend_epi8(exit0, zero512, pimax512); + + _mm512_store_si512((__m512i*)(rowMax + (i - 1) * SIMD_WIDTH8), pimax512); + mask512 = ~msk64; + } + pimax512 = imax512; + minsc_msk = _mm512_cmpge_epu8_mask(imax512, minsc512); + minsc_msk &= minsc_msk_a; + + // Block II: gmax, te + __mmask64 cmp0 = _mm512_cmpgt_epu8_mask(imax512, gmax512); + cmp0 &= exit0; + gmax512 = _mm512_mask_blend_epi8(cmp0, gmax512, imax512); + te512 = _mm512_mask_blend_epi16((__mmask32)cmp0, te512, i512); + te512_ = _mm512_mask_blend_epi16((__mmask32)(cmp0 >> SIMD_WIDTH16), te512_, i512); + qe512 = _mm512_mask_blend_epi8(cmp0, qe512, iqe512); + + cmp0 = _mm512_cmpge_epu8_mask(gmax512, endsc512); + cmp0 &= endsc_msk_a; + + __m512i left512 = _mm512_adds_epu8(gmax512, sft512); + __mmask64 cmp2 = _mm512_cmpge_epu8_mask(left512, cmax512); + + exit0 = (~(cmp0 | cmp2)) & exit0; + if (exit0 == 0) { + limit = i++; + break; + } + + uint8_t* S = H1; + H1 = H0; + H0 = S; + i512 = _mm512_add_epi16(i512, one512); + } // for nrow + + pimax512 = _mm512_mask_blend_epi8(mask512, pimax512, zero512); + pimax512 = _mm512_mask_blend_epi8(minsc_msk, zero512, pimax512); + pimax512 = _mm512_mask_blend_epi8(exit0, zero512, pimax512); + _mm512_store_si512((__m512i*)(rowMax + (i - 1) * SIMD_WIDTH8), pimax512); + + /******************* DP loop over *****************************/ + /**************** Partial output setting **********************/ + uint8_t score[SIMD_WIDTH8] __attribute((aligned(64))); + int16_t te1[SIMD_WIDTH8] __attribute((aligned(64))); + uint8_t qe[SIMD_WIDTH8] __attribute((aligned(64))); + int16_t low[SIMD_WIDTH8] __attribute((aligned(64))); + int16_t high[SIMD_WIDTH8] __attribute((aligned(64))); + + _mm512_store_si512((__m512i*)score, gmax512); + _mm512_store_si512((__m512i*)te1, te512); + _mm512_store_si512((__m512i*)(te1 + SIMD_WIDTH16), te512_); + _mm512_store_si512((__m512i*)qe, qe512); + + int live = 0; + + for (int l = 0; l < SIMD_WIDTH8; l++) { + int16_t* te; + if (i < SIMD_WIDTH16) + te = te1; + else + te = te1; + if (phase) { // 第二阶段,反向比对 + if (alns[l].score == score[l]) { + alns[l].tb = alns[l].te - te[l]; + alns[l].qb = alns[l].qe - qe[l]; + } + } else { // 第一阶段,正向比对 + alns[l].score = score[l] + shift < 255 ? score[l] : 255; + alns[l].te = te[l]; + alns[l].qe = qe[l]; + if (alns[l].score != 255) { + qe[l] = 1; + live++; + } else + qe[l] = 0; + } + } + + if (phase) + return; + + if (live == 0) + return; + + /*************** Score2 and te2 *******************/ + int qmax = g_qmax; + int maxl = 0, minh = nrow; + for (int i = 0; i < SIMD_WIDTH8; i++) { + int val = (score[i] + qmax - 1) / qmax; + int16_t* te = te1; + low[i] = te[i] - val; + high[i] = te[i] + val; + if (qe[i]) { + maxl = maxl < low[i] ? low[i] : maxl; + minh = minh > high[i] ? high[i] : minh; + } + } + + max512 = zero512; + te512 = _mm512_set1_epi16(-1); + te512_ = _mm512_set1_epi16(-1); + __m512i low512 = _mm512_load_si512((__m512i*)low); // make it int16 + __m512i high512 = _mm512_load_si512((__m512i*)high); // int16 + __m512i low512_ = _mm512_load_si512((__m512i*)(low + SIMD_WIDTH16)); // make it int16 + __m512i high512_ = _mm512_load_si512((__m512i*)(high + SIMD_WIDTH16)); // int16 + + __m512i rmax512; + for (int i = 0; i < maxl; i++) { + __m512i i512 = _mm512_set1_epi16(i); + rmax512 = _mm512_load_si512((__m512i*)(rowMax + i * SIMD_WIDTH8)); + + __mmask64 mask11 = _mm512_cmpgt_epi16_mask(low512, i512); + __mmask64 mask12 = _mm512_cmpgt_epi16_mask(low512_, i512); + __mmask64 mask2 = _mm512_cmpgt_epu8_mask(rmax512, max512); + __mmask64 mask1 = mask11 | (mask12 << SIMD_WIDTH16); + mask2 &= mask1; + max512 = _mm512_mask_blend_epi8(mask2, max512, rmax512); + te512 = _mm512_mask_blend_epi16(mask2, te512, i512); + te512_ = _mm512_mask_blend_epi16(mask2 >> SIMD_WIDTH16, te512_, i512); + } + + // Added new block -- due to bug + int16_t rlen[SIMD_WIDTH8] __attribute((aligned(64))); + for (int i = 0; i < SIMD_WIDTH8; i++) rlen[i] = rlenA[i]; + __m512i rlen512 = _mm512_load_si512(rlen); + __m512i rlen512_ = _mm512_load_si512(rlen + SIMD_WIDTH16); + + for (int i = minh + 1; i < limit; i++) { + __m512i i512 = _mm512_set1_epi16(i); + rmax512 = _mm512_load_si512((__m512i*)(rowMax + i * SIMD_WIDTH8)); + __mmask64 mask11 = _mm512_cmpgt_epi16_mask(i512, high512); + __mmask64 mask12 = _mm512_cmpgt_epi16_mask(i512, high512_); + __mmask64 mask2 = _mm512_cmpgt_epu8_mask(rmax512, max512); + __mmask64 mask1 = mask11 | (mask12 << SIMD_WIDTH16); + mask2 &= mask1; + // new, bug + __mmask64 mask11_ = _mm512_cmpgt_epi16_mask(rlen512, i512); + __mmask64 mask12_ = _mm512_cmpgt_epi16_mask(rlen512_, i512); + __mmask64 mask1_ = mask11_ | (mask12_ << SIMD_WIDTH16); + mask2 &= mask1_; + max512 = _mm512_mask_blend_epi8(mask2, max512, rmax512); + te512 = _mm512_mask_blend_epi16(mask2, te512, i512); + te512_ = _mm512_mask_blend_epi16(mask2 >> SIMD_WIDTH16, te512_, i512); + } + + int16_t temp4[SIMD_WIDTH8] __attribute((aligned(64))); + _mm512_store_si512((__m512i*)temp, max512); + _mm512_store_si512((__m512i*)temp4, te512); + _mm512_store_si512((__m512i*)(temp4 + SIMD_WIDTH16), te512_); + + for (int i = 0; i < SIMD_WIDTH8; i++) { + int16_t* te2; + te2 = temp4; + if (qe[i]) { + alns[i].score2 = (temp[i] == 0 ? (int)-1 : (uint8_t)temp[i]); + alns[i].te2 = te2[i]; + } else { + alns[i].score2 = -1; + alns[i].te2 = -1; + } + } +#endif +} \ No newline at end of file diff --git a/mate_sw.c b/mate_sw.c new file mode 100644 index 0000000..832bf31 --- /dev/null +++ b/mate_sw.c @@ -0,0 +1,91 @@ +#include "mate_sw.h" + + +// 初始化mate sw计算相关的数据 +void init_msw_data(msw_data_t* msw, int n_threads, int msw_batch_size) { +#define _alloc_msw_num(data, num, data_type) ((data) = calloc(num, sizeof(data_type))) +#define _alloc_msw(data, data_type) ((data) = calloc(n_threads, sizeof(data_type))) + int i = 0, j = 0; + + _alloc_msw(msw->t_msw_tasks, msw_task_v*); + for (i = 0; i < n_threads; ++i) { + _alloc_msw_num(msw->t_msw_tasks[i], 2, msw_task_v); + } + + _alloc_msw(msw->t_msw_buf, msw_buf_t); + _alloc_msw(msw->t_msw_stats, msw_stats_t); + _alloc_msw(msw->t_msw_ref_buf, byte_vv); + _alloc_msw(msw->t_msw_seq_buf, byte_vv); + + _alloc_msw(msw->t_msw_1_refs, msw_ref_v); + _alloc_msw(msw->t_msw_1_seqs, msw_seq_v); + _alloc_msw(msw->t_msw_1_xtras, int_v); + _alloc_msw(msw->t_msw_1_kswrs, msw_kswr_v); + + _alloc_msw(msw->t_msw_2_refs, msw_ref_v); + _alloc_msw(msw->t_msw_2_seqs, msw_seq_v); + _alloc_msw(msw->t_msw_2_xtras, int_v); + _alloc_msw(msw->t_msw_2_kswrs, msw_kswr_v); + + // 初始化缓存空间 + msw_ref_t init_ref = {0}; + msw_seq_t init_seq = {0}; + kswr_avx_t init_kswr = {0}; + byte_v init_byte_v = {0}; + + for (i = 0; i < n_threads; ++i) { + for (j = 0; j < msw_batch_size; ++j) { + // for ref seq buf + kv_push(byte_v, msw->t_msw_ref_buf[i], init_byte_v); + kv_push(byte_v, msw->t_msw_seq_buf[i], init_byte_v); + + // for msw第一阶段 + kv_push(msw_ref_t, msw->t_msw_1_refs[i], init_ref); + kv_push(msw_seq_t, msw->t_msw_1_seqs[i], init_seq); + kv_push(int, msw->t_msw_1_xtras[i], 0); + kv_push(kswr_avx_t, msw->t_msw_1_kswrs[i], init_kswr); + + // for msw第二阶段 + kv_push(msw_ref_t, msw->t_msw_2_refs[i], init_ref); + kv_push(msw_seq_t, msw->t_msw_2_seqs[i], init_seq); + kv_push(int, msw->t_msw_2_xtras[i], 0); + kv_push(kswr_avx_t, msw->t_msw_2_kswrs[i], init_kswr); + } + } +} + +void calc_msw_mem_size(msw_data_t* msw, int n_threads, int64_t* total_bytes) { + int i; + //j, k; + int64_t bytes = 0; + double gb_d = 1024.0 * 1024 * 1024; + // 1. t_msw_tasks + for (i = 0; i < n_threads; ++i) { + msw_task_v* v = msw->t_msw_tasks[i]; + bytes += sizeof(msw_task_v*); + bytes += v[0].m * sizeof(msw_task_t); // u8 + bytes += v[1].m * sizeof(msw_task_t); // i16 + } + fprintf(stderr, "t_msw_tasks: %f GB\n", bytes / gb_d); + *total_bytes += bytes; + + // 2. p_msw_tasks + bytes = 0; + bytes += msw->p_msw_tasks_u8.m * sizeof(msw_task_t*); + bytes += msw->p_msw_tasks_i16.m * sizeof(msw_task_t*); + fprintf(stderr, "p_msw_tasks: %f GB\n", bytes / gb_d); + *total_bytes += bytes; + + // 3. t_msw_buf + bytes = 0; + for (i = 0; i < n_threads; ++i) { + msw_buf_t* v = &msw->t_msw_buf[i]; + bytes += sizeof(msw_buf_t); + bytes += v->ref_len * 64 * 2; // refArr + rowMax; + bytes += v->seq_len * 64 * 5; // seqArr, H0, H1, Hmax, F + } + fprintf(stderr, "t_msw_buf: %f GB\n", bytes / gb_d); + *total_bytes += bytes; + + // 4. t_msw_stats +} \ No newline at end of file diff --git a/mate_sw.h b/mate_sw.h new file mode 100644 index 0000000..3dcfc4f --- /dev/null +++ b/mate_sw.h @@ -0,0 +1,99 @@ +/* + Description: 计算mate sw过程用到的一些数据和函数 + + Copyright : All right reserved by ICT + + Author : Zhang Zhonghai + Date : 2026/01/12 +*/ + +#pragma once + +#include + +#include "ksw_align_avx.h" +#include "kvec.h" + +// 需要做mate sw的read +typedef struct { + uint8_t is_rev; // seq是否在反向互补链上 + uint8_t skip; // 记录任务创建时候的skip状态 + int xtra; + int64_t rb, re; // ref的起始截止位置,左闭右开 + int64_t seq_id; // 对应的当前数据块的seq id + int aj; // 对应的pair-end的第几个aln; + int to; // task order,因为u8和i16分开放入的,但是他们也是有顺序的,这个用来排序 + kswr_avx_t aln; +} msw_task_t; + +// mate sw task数组 +typedef kvec_t(msw_task_t) msw_task_v; + +// mate sw task的指针数组 +typedef kvec_t(msw_task_t*) msw_task_ptr_v; + +// 每个seq保留task所在的线程,和数组idx +typedef struct { + int thread_idx; // 在哪个线程 + int arr_idx; // 是u8还是i16 + int task_idx; // 数组内的索引 +} msw_seq_task_t; + +typedef kvec_t(msw_seq_task_t) msw_seq_task_v; + +////////////////////////////////////////////// + +// 用来存放获取的mate sw阶段的ref序列 +// typedef kvec_t(byte_v) byte_vv; + +// 记录一些用到的数据统计,比如最长ref长度,最长seq长度等 +typedef struct { + int max_seq_len; + int max_ref_len; // mate sw里的最长的ref长度 +} msw_stats_t; + +// 用于记录参与计算的query,只保存指针 +typedef struct { + uint8_t* seq; // 只是指向seq的指针 + int l_seq; // seq长度 +} msw_seq_t; +typedef kvec_t(msw_seq_t) msw_seq_v; + +// 用于记录参与计算的ref,只保存指针 +typedef struct { + uint8_t* ref; + int l_ref; +} msw_ref_t; +typedef kvec_t(msw_ref_t) msw_ref_v; + +// 记录结果 +typedef kvec_t(kswr_avx_t) msw_kswr_v; + +//////////////////////////////////////////// + +// mate sw相关的所有数据 +typedef struct { + msw_task_v** t_msw_tasks; // 0是u8, 1是i16,线程内收集需要做mate sw的任务的数据 + msw_task_ptr_v p_msw_tasks_u8; // 线程共享的mate sw task指针 + msw_task_ptr_v p_msw_tasks_i16; + msw_buf_t* t_msw_buf; // 每个线程一个,mate sw 缓存 + msw_stats_t* t_msw_stats; // 每个线程一个,统计信息 + msw_stats_t p_msw_stats; //只是用来开辟空间 + byte_vv* t_msw_ref_buf; // ref的缓存 + byte_vv* t_msw_seq_buf; // seq的缓存,因为有些seq需要反向互补,而且用了buf之后,第二阶段反转不需要再还原了 + + msw_ref_v* t_msw_1_refs; // msw 第一阶段数据 + msw_seq_v* t_msw_1_seqs; + int_v* t_msw_1_xtras; + msw_kswr_v* t_msw_1_kswrs; + + msw_ref_v* t_msw_2_refs; // msw 第一阶段数据 + msw_seq_v* t_msw_2_seqs; + int_v* t_msw_2_xtras; + msw_kswr_v* t_msw_2_kswrs; + +} msw_data_t; + +void init_msw_data(msw_data_t* msw, int n_threads, int msw_batch_size); + +void calc_msw_mem_size(msw_data_t* msw, int n_threads, int64_t *total_bytes); \ No newline at end of file diff --git a/paired_sam.c b/paired_sam.c new file mode 100644 index 0000000..e356fee --- /dev/null +++ b/paired_sam.c @@ -0,0 +1,892 @@ +#include "paired_sam.h" + +#include +#include +#include + +#include "ksw_align_avx.h" +#include "kvec.h" +#include "debug.h" +#include "mate_sw.h" +#include "profiling.h" + +// 原版生成sam函数 +static void worker_sam(void* data, long i, int tid) { + mem_worker_t* w = (mem_worker_t*)data; + if (bwa_verbose >= 4) + printf("=====> Finalizing read pair '%s' <=====\n", w->seqs[i << 1 | 0].name); + mem_sam_pe(w->opt, w->bns, w->pac, w->pes, (w->n_processed >> 1) + i, &w->seqs[i << 1], &w->regs[i << 1], &w->sams[i << 1], tid); + free(w->regs[i << 1 | 0].a); + free(w->regs[i << 1 | 1].a); +} + +// 应该是推测read的方向,正链还是反向互补等等 +static inline int mem_infer_dir(int64_t l_pac, int64_t b1, int64_t b2, int64_t* dist) { + int64_t p2; + int r1 = (b1 >= l_pac), r2 = (b2 >= l_pac); + p2 = r1 == r2 ? b2 : (l_pac << 1) - 1 - b2; // p2 is the coordinate of read 2 on the read 1 strand + *dist = p2 > b1 ? p2 - b1 : b1 - p2; + return (r1 == r2 ? 0 : 1) ^ (p2 > b1 ? 0 : 3); +} + +// 根据ref的begin和end计算对应的rid和修正之后的end +static int get_rid_range(const bntseq_t* bns, const uint8_t* pac, int64_t* beg, int64_t mid, int64_t* end) { + int64_t far_beg, far_end; + int is_rev; + int rid; + + if (*end < *beg) + *end ^= *beg, *beg ^= *end, *end ^= *beg; // if end is smaller, swap + assert(*beg <= mid && mid < *end); + rid = bns_pos2rid(bns, bns_depos(bns, mid, &is_rev)); + far_beg = bns->anns[rid].offset; + far_end = far_beg + bns->anns[rid].len; + if (is_rev) { // flip to the reverse strand + int64_t tmp = far_beg; + far_beg = (bns->l_pac << 1) - far_end; + far_end = (bns->l_pac << 1) - tmp; + } + *beg = *beg > far_beg ? *beg : far_beg; + *end = *end < far_end ? *end : far_end; + return rid; +} + +// 获取ref序列 +static inline uint8_t* get_ref_seq(const bntseq_t* bns, const uint8_t* pac, int64_t beg, int64_t end, byte_v *buf) { + bns_get_seq_no_alloc(bns->l_pac, pac, beg, end, &buf->n, &buf->m, &buf->a); + return buf->a; +} + +// 将query序列放入缓存 +static inline void get_query_seq(int is_rev, int l_seq, uint8_t *seq, byte_v *query) { + int i = 0; + if (query->m < l_seq) { + kv_resize(uint8_t, *query, l_seq); + } + query->n = l_seq; + if (is_rev) { + for (i = 0; i < l_seq; ++i) query->a[l_seq - 1 - i] = seq[i] < 4 ? 3 - seq[i] : 4; + } else { + for (i = 0; i < l_seq; ++i) query->a[i] = seq[i]; + } +} + +// 翻转seq +static inline void revseq(int l, uint8_t* s) { + int i, t; + for (i = 0; i < l >> 1; ++i) t = s[i], s[i] = s[l - 1 - i], s[l - 1 - i] = t; +} + +// 计算当前seq是否需要做matesw,需要的话保存必要的数据 +static void get_matesw_tasks(const mem_opt_t* opt, const bntseq_t* bns, const uint8_t* pac, const mem_pestat_t pes[4], const mem_alnreg_t* a, int a_j, + bseq1_t* seq, mem_alnreg_v* ma, int64_t sid, msw_stats_t* stats, msw_task_v* msw_tasks, int64_t n_processed, int tid) { + int64_t l_pac = bns->l_pac; + int l_ms = seq->l_seq; + int i, r, skip[4], rid; + stats->max_seq_len = max_(stats->max_seq_len, l_ms); + for (r = 0; r < 4; ++r) skip[r] = pes[r].failed ? 1 : 0; + for (i = 0; i < ma->n; ++i) { // check which orinentation has been found + int64_t dist; + r = mem_infer_dir(l_pac, a->rb, ma->a[i].rb, &dist); + if (dist >= pes[r].low && dist <= pes[r].high) + skip[r] = 1; + } + if (skip[0] + skip[1] + skip[2] + skip[3] == 4) + return; // consistent pair exist; no need to perform SW + int task_order = 0; + for (r = 0; r < 4; ++r) { + uint8_t is_rev, is_larger; + int64_t rb, re; // 左闭右开 + if (skip[r]) + continue; + is_rev = (r >> 1 != (r & 1)); // whether to reverse complement the mate + is_larger = !(r >> 1); // whether the mate has larger coordinate + if (!is_rev) { + rb = is_larger ? a->rb + pes[r].low : a->rb - pes[r].high; + // if on the same strand, end position should be larger to make room for the seq length + re = (is_larger ? a->rb + pes[r].high : a->rb - pes[r].low) + l_ms; + } else { + rb = (is_larger ? a->rb + pes[r].low : a->rb - pes[r].high) - l_ms; // similarly on opposite strands + re = is_larger ? a->rb + pes[r].high : a->rb - pes[r].low; + } + if (rb < 0) + rb = 0; + if (re > l_pac << 1) + re = l_pac << 1; + rid = get_rid_range(bns, pac, &rb, (rb + re) >> 1, &re); // 计算ref对应的染色体id和区间起始终止位置 + if (a->rid == rid && re - rb >= opt->min_seed_len) { // no funny things happening + tdat[TD_MSW_CNT][tid] += 1; + + stats->max_ref_len = max_(stats->max_ref_len, re - rb); + // fprintf(stderr, "zzh here\n"); + int xtra = KSW_XSUBO | KSW_XSTART | (l_ms * opt->a < 250 ? KSW_XBYTE : 0) | (opt->min_seed_len * opt->a); + msw_task_t* p; + int msw_idx = 0; // u8 + if (!(xtra & KSW_XBYTE)) + msw_idx = 1; // i16 + msw_task_v* task_arr = &msw_tasks[msw_idx]; + p = kv_pushp(msw_task_t, *task_arr); + p->is_rev = is_rev; + p->skip = skip[0] | skip[1] << 1 | skip[2] << 2 | skip[3] << 3; + p->xtra = xtra; + p->rb = rb; + p->re = re; + p->seq_id = sid; + p->aj = a_j; + p->to = task_order++; // 有啥用? 应该是用来合并u8和i16的时候排序用 + + msw_seq_task_t seq_task = {tid, msw_idx, task_arr->n - 1}; + kv_push(msw_seq_task_t, seq->msw_task, seq_task); + // 将matesw任务和对应的seq关联起来,这里放指针是不行的,因为指针位置会变,要保存offset才行 + } + } +} +// 先计算哪些read需要做matesw +static void worker_matesw_tasks(void* data, long idx, int tid) { + mem_worker_t* w = (mem_worker_t*)data; + const mem_opt_t* opt = w->opt; + int startIdx = idx * opt->batch_size; + int endIdx = (idx + 1) * opt->batch_size; + if (endIdx > w->n_reads) + endIdx = w->n_reads; + + int id = 0; + mem_alnreg_v b[2]; + int_v aj[2]; + kv_init(aj[0]); + kv_init(aj[1]); + kv_init(b[0]); + kv_init(b[1]); + + for (id = startIdx; id < endIdx; id += 2) { + int64_t i_s = id; + bseq1_t* s = &w->seqs[i_s]; + mem_alnreg_v* a = &w->regs[i_s]; + int i, j; + + _clear_vec(b[0]); + _clear_vec(b[1]); + _clear_vec(aj[0]); + _clear_vec(aj[1]); + // fprintf(stderr, "zzh test\n"); + for (i = 0; i < 2; ++i) + for (j = 0; j < a[i].n; ++j) + if (a[i].a[j].score >= a[i].a[0].score - opt->pen_unpaired) { + kv_push(mem_alnreg_t, b[i], a[i].a[j]); + kv_push(int, aj[i], j); + } + for (i = 0; i < 2; ++i) + for (j = 0; j < b[i].n && j < opt->max_matesw; ++j) + get_matesw_tasks(opt, w->bns, w->pac, w->pes, &b[i].a[j], aj[i].a[j], &s[!i], &a[!i], i_s + !i, &w->msw->t_msw_stats[tid], + w->msw->t_msw_tasks[tid], w->n_processed, tid); + } + free(b[0].a); + free(b[1].a); + free(aj[0].a); + free(aj[1].a); +} + +static void msw_avx512_u8(const mem_opt_t* opt, int num_tasks, msw_buf_t* msw_buf, msw_ref_v* refs, msw_seq_v* seqs, int_v* xtras, + msw_kswr_v* kswrs, int phase, int tid) { + int i = 0, j = 0, k = 0; + int refLen[SIMD512_WIDTH8] = {0}; + + // 准备数据并进行计算 + for (i = 0; i < num_tasks; i += SIMD512_WIDTH8) { + // 将ref和seq赋值给对应的用来计算的缓存 + uint8_t* mySeq1SoA = msw_buf->refArr; + uint8_t* mySeq2SoA = msw_buf->seqArr; + int maxSeqLen = 0, maxRefLen = 0; + + // for test +#if 0 + for (j = 0; j < SIMD512_WIDTH8; ++j) { + fprintf(stderr, "r: %d, s: %d\n", refs->a[i + j].l_ref, seqs->a[i + j].l_seq); + fflush(stderr); + } +#endif + + PROF_START(msw_pack_seq); + // 处理ref + for (j = 0; j < SIMD512_WIDTH8; ++j) { + msw_ref_t* ref = &refs->a[i + j]; + uint8_t* seq1 = ref->ref; + refLen[j] = ref->l_ref; + // 处理ref + for (k = 0; k < ref->l_ref; ++k) { + mySeq1SoA[k * SIMD512_WIDTH8 + j] = (seq1[k] == AMBIG_ ? AMBR : seq1[k]); + } + maxRefLen = max_(maxRefLen, ref->l_ref); + } + + for (j = 0; j < SIMD512_WIDTH8; ++j) { + msw_ref_t* ref = &refs->a[i + j]; + for (k = ref->l_ref; k <= maxRefLen; ++k) { + mySeq1SoA[k * SIMD512_WIDTH8 + j] = 0xFF; + } + } + + // 处理query + for (j = 0; j < SIMD512_WIDTH8; ++j) { + msw_seq_t* seq = &seqs->a[i + j]; + uint8_t* seq2 = seq->seq; + int quanta = ((seq->l_seq + 16 - 1) / 16) * 16; // based on SSE-8 bit lane + for (k = 0; k < seq->l_seq; k++) { + mySeq2SoA[k * SIMD512_WIDTH8 + j] = (seq2[k] == AMBIG_ ? AMBQ : seq2[k]); + } + for (; k < quanta; ++k) { + mySeq2SoA[k * SIMD512_WIDTH8 + j] = DUMMY5; // SSE quanta + } + maxSeqLen = max_(maxSeqLen, quanta); + } + for (j = 0; j < SIMD512_WIDTH8; ++j) { + msw_seq_t* seq = &seqs->a[i + j]; + int quanta = ((seq->l_seq + 16 - 1) / 16) * 16; // based on SSE-8 bit lane + for (k = quanta; k <= maxSeqLen; k++) { + mySeq2SoA[k * SIMD512_WIDTH8 + j] = 0xFF; + } + } + PROF_END(tprof[T_MSW_PACK_SEQ][tid], msw_pack_seq); + + if (phase == 0) + tdat[TD_ALIGN_1_CNT][tid] += 1; + else + tdat[TD_ALIGN_2_CNT][tid] += 1; + + // 利用smid指令计算 + ksw_align_avx512_u8(opt->a, -1 * opt->b, opt->o_ins, opt->e_ins, opt->o_del, opt->e_del, msw_buf, mySeq1SoA, mySeq2SoA, maxRefLen, maxSeqLen, + &xtras->a[i], refLen, &kswrs->a[i], phase); + } +} + +// 再多线程计算matesw,利用inter-query的simd并行 +static void worker_calc_matesw_avx512_u8(void* data, long idx, int tid) { + mem_worker_t* w = (mem_worker_t*)data; + const mem_opt_t* opt = w->opt; + int startIdx = idx * w->opt->msw_batch_size; + int endIdx = (idx + 1) * w->opt->msw_batch_size; + if (endIdx > w->msw->p_msw_tasks_u8.n) + endIdx = w->msw->p_msw_tasks_u8.n; + int roundEndIdx = ((endIdx + SIMD512_WIDTH8 - 1) / SIMD512_WIDTH8) * SIMD512_WIDTH8; + + int i = 0, j = 0, k = 0; + + msw_buf_t* msw_buf = &w->msw->t_msw_buf[tid]; // 缓冲区 + byte_vv* ref_bufs = &w->msw->t_msw_ref_buf[tid]; + byte_vv* seq_bufs = &w->msw->t_msw_seq_buf[tid]; + + msw_ref_v* refs = &w->msw->t_msw_1_refs[tid]; + msw_seq_v* seqs = &w->msw->t_msw_1_seqs[tid]; + int_v* xtras = &w->msw->t_msw_1_xtras[tid]; + msw_kswr_v* kswrs = &w->msw->t_msw_1_kswrs[tid]; + + // 获取ref和seq数据 + PROF_START(msw_get_ref); + for (i = startIdx, j = 0; i < endIdx; ++i, ++j) { + msw_task_t* task = w->msw->p_msw_tasks_u8.a[i]; + // 1. 获取对应的ref + byte_v* ref_buf = &ref_bufs->a[j]; + get_ref_seq(w->bns, w->pac, task->rb, task->re, ref_buf); + refs->a[j].l_ref = ref_buf->n; + refs->a[j].ref = ref_buf->a; + // 2. 获取对应的seq + byte_v* seq_buf = &seq_bufs->a[j]; + get_query_seq(task->is_rev, w->seqs[task->seq_id].l_seq, (uint8_t*)w->seqs[task->seq_id].seq, seq_buf); + seqs->a[j].l_seq = seq_buf->n; + seqs->a[j].seq = seq_buf->a; + // 3. 其他数据 + xtras->a[j] = task->xtra; + kswrs->a[j].tb = kswrs->a[j].qb = -1; + } + for (; i < roundEndIdx; ++i, ++j) { + refs->a[j].l_ref = 0; + seqs->a[j].l_seq = 0; + xtras->a[j] = 0; + } + PROF_END(tprof[T_MSW_GET_REF][tid], msw_get_ref); + + PROF_START(msw_1); + msw_avx512_u8(opt, endIdx - startIdx, msw_buf, refs, seqs, xtras, kswrs, 0, tid); // 第一阶段 + PROF_END(tprof[T_MSW_1][tid], msw_1); + + // 第二阶段计算 + msw_ref_v* refs2 = &w->msw->t_msw_2_refs[tid]; + msw_seq_v* seqs2 = &w->msw->t_msw_2_seqs[tid]; + int_v* xtras2 = &w->msw->t_msw_2_xtras[tid]; + msw_kswr_v* kswrs2 = &w->msw->t_msw_2_kswrs[tid]; + int_v msw_task_ids = {0}; // 用于第二阶段任务统计 + + // 保存结果,并筛选第二阶段msw计算的任务 + for (i = startIdx, j = 0, k = 0; i < endIdx; ++i, ++j) { + msw_task_t* task = w->msw->p_msw_tasks_u8.a[i]; + kswr_avx_t r = kswrs->a[j]; + task->aln = r; + // 打印测试 + // kswr_avx_t aln = r; + //fprintf(gf[0], "id-%ld score-%d te-%d qe-%d score2-%d te2-%d tb-%d qb-%d\n", task->seq_id + w->n_processed, aln.score, aln.te, aln.qe, + // aln.score2, aln.te2, aln.tb, aln.qb); + + if ((task->xtra & KSW_XSTART) == 0 || ((task->xtra & KSW_XSUBO) && r.score < (task->xtra & 0xffff))) + continue; + task->xtra = KSW_XSTOP | r.score; + kv_push(int, msw_task_ids, i); + // 1. 获取对应的ref + refs2->a[k] = refs->a[j]; + revseq(r.te + 1, refs2->a[k].ref); + // refs2->a[k].l_ref = r.te + 1; // zzh add, for test + // 2. 获取对应的seq + seqs2->a[k] = seqs->a[j]; + revseq(r.qe + 1, seqs2->a[k].seq); + seqs2->a[k].l_seq = r.qe + 1; + // 3. 其他数据 + xtras2->a[k] = task->xtra; + kswrs2->a[k] = r; + ++k; + } + roundEndIdx = ((msw_task_ids.n + SIMD512_WIDTH8 - 1) / SIMD512_WIDTH8) * SIMD512_WIDTH8; + for (; k < roundEndIdx; ++k) { + refs2->a[k].l_ref = 0; + seqs2->a[k].l_seq = 0; + } + PROF_START(msw_2); + msw_avx512_u8(opt, msw_task_ids.n, msw_buf, refs2, seqs2, xtras2, kswrs2, 1, tid); // 第二阶段 + PROF_END(tprof[T_MSW_2][tid], msw_2); + + // 结果赋值 + for (k = 0; k < msw_task_ids.n; ++k) { + i = msw_task_ids.a[k]; + msw_task_t* task = w->msw->p_msw_tasks_u8.a[i]; + task->aln = kswrs2->a[k]; + // kswr_avx_t aln = task->aln; + //fprintf(gf[1], "id-%ld score-%d te-%d qe-%d score2-%d te2-%d tb-%d qb-%d\n", task->seq_id + w->n_processed, aln.score, aln.te, aln.qe, + // aln.score2, aln.te2, aln.tb, aln.qb); + } + + kv_destroy(msw_task_ids); +} + +static void worker_calc_matesw_avx512_i16(void* data, long idx, int tid) {} + +// 检测添加新的align +static int check_add_align(const mem_opt_t* opt, kswr_avx_t aln, int is_rev, int64_t l_pac, mem_alnreg_t* a, int l_ms, uint8_t* ms, + mem_alnreg_v* ma, int64_t rb) { + int res = 0; + if (aln.score >= opt->min_seed_len && aln.qb >= 0) { // something goes wrong if aln.qb < 0 + mem_alnreg_t b; + memset(&b, 0, sizeof(mem_alnreg_t)); + b.rid = a->rid; + b.is_alt = a->is_alt; + b.qb = is_rev ? l_ms - (aln.qe + 1) : aln.qb; + b.qe = is_rev ? l_ms - aln.qb : aln.qe + 1; + b.rb = is_rev ? (l_pac << 1) - (rb + aln.te + 1) : rb + aln.tb; + b.re = is_rev ? (l_pac << 1) - (rb + aln.tb) : rb + aln.te + 1; + b.score = aln.score; + b.csub = aln.score2; + b.secondary = -1; + b.seedcov = (b.re - b.rb < b.qe - b.qb ? b.re - b.rb : b.qe - b.qb) >> 1; + // printf("*** %d, [%lld,%lld], %d:%d, (%lld,%lld), (%lld,%lld) == (%lld,%lld)\n", aln.score, rb, re, is_rev, + // is_larger, a->rb, a->re, ma->a[0].rb, ma->a[0].re, b.rb, b.re); + kv_push(mem_alnreg_t, *ma, b); // make room for a new element + +#ifdef SAM_EXACT + // move b s.t. ma is sorted + int i, tmp; + for (i = 0; i < ma->n - 1; ++i) // find the insertion point + if (ma->a[i].score < b.score) + break; + tmp = i; + for (i = ma->n - 1; i > tmp; --i) ma->a[i] = ma->a[i - 1]; + ma->a[i] = b; + ma->n = mem_sort_dedup_patch(opt, 0, 0, 0, ma->n, ma->a); +#endif + + res = 1; + } + return res; +} + +#define raw_mapq(diff, a) ((int)(6.02 * (diff) / (a) + .499)) + +// 根据比对结果生成sam +void generate_sam(const mem_opt_t* opt, const bntseq_t* bns, const uint8_t* pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2], + seq_sam_t ss[2], int64_t n_processed, int tid) { + + int i, j, z[2], o, subo, n_sub, extra_flag = 1, n_pri[2], n_aa[2]; + kstring_t str; + mem_aln_t h[2], g[2], aa[2][2]; + + // int cmp = strcmp("ERR194147.17699", s[0].name); + + str.l = str.m = 0; + str.s = 0; + memset(h, 0, sizeof(mem_aln_t) * 2); + memset(g, 0, sizeof(mem_aln_t) * 2); + n_aa[0] = n_aa[1] = 0; + n_pri[0] = mem_mark_primary_se(opt, a[0].n, a[0].a, id << 1 | 0); + n_pri[1] = mem_mark_primary_se(opt, a[1].n, a[1].a, id << 1 | 1); + if (opt->flag & MEM_F_PRIMARY5) { + mem_reorder_primary5(opt->T, &a[0]); + mem_reorder_primary5(opt->T, &a[1]); + } + if (opt->flag & MEM_F_NOPAIRING) + goto no_pairing; + // pairing single-end hits + if (n_pri[0] && n_pri[1] && (o = mem_pair(opt, bns, pac, pes, s, a, id, &subo, &n_sub, z, n_pri)) > 0) { + int is_multi[2], q_pe, score_un, q_se[2]; + char** XA[2]; + // check if an end has multiple hits even after mate-SW + for (i = 0; i < 2; ++i) { + for (j = 1; j < n_pri[i]; ++j) + if (a[i].a[j].secondary < 0 && a[i].a[j].score >= opt->T) + break; + is_multi[i] = j < n_pri[i] ? 1 : 0; + } + if (is_multi[0] || is_multi[1]) + goto no_pairing; // TODO: in rare cases, the true hit may be long but with low score + // compute mapQ for the best SE hit + score_un = a[0].a[0].score + a[1].a[0].score - opt->pen_unpaired; + // q_pe = o && subo < o? (int)(MEM_MAPQ_COEF * (1. - (double)subo / o) * log(a[0].a[z[0]].seedcov + a[1].a[z[1]].seedcov) + .499) : 0; + subo = subo > score_un ? subo : score_un; + q_pe = raw_mapq(o - subo, opt->a); + if (n_sub > 0) + q_pe -= (int)(4.343 * log(n_sub + 1) + .499); + if (q_pe < 0) + q_pe = 0; + if (q_pe > 60) + q_pe = 60; + q_pe = (int)(q_pe * (1. - .5 * (a[0].a[0].frac_rep + a[1].a[0].frac_rep)) + .499); + // the following assumes no split hits + if (o > score_un) { // paired alignment is preferred + mem_alnreg_t* c[2]; + c[0] = &a[0].a[z[0]]; + c[1] = &a[1].a[z[1]]; + for (i = 0; i < 2; ++i) { + if (c[i]->secondary >= 0) + c[i]->sub = a[i].a[c[i]->secondary].score, c[i]->secondary = -2; + q_se[i] = mem_approx_mapq_se(opt, c[i]); + } + q_se[0] = q_se[0] > q_pe ? q_se[0] : q_pe < q_se[0] + 40 ? q_pe : q_se[0] + 40; + q_se[1] = q_se[1] > q_pe ? q_se[1] : q_pe < q_se[1] + 40 ? q_pe : q_se[1] + 40; + extra_flag |= 2; + // cap at the tandem repeat score + q_se[0] = q_se[0] < raw_mapq(c[0]->score - c[0]->csub, opt->a) ? q_se[0] : raw_mapq(c[0]->score - c[0]->csub, opt->a); + q_se[1] = q_se[1] < raw_mapq(c[1]->score - c[1]->csub, opt->a) ? q_se[1] : raw_mapq(c[1]->score - c[1]->csub, opt->a); + } else { // the unpaired alignment is preferred + z[0] = z[1] = 0; + q_se[0] = mem_approx_mapq_se(opt, &a[0].a[0]); + q_se[1] = mem_approx_mapq_se(opt, &a[1].a[0]); + } + for (i = 0; i < 2; ++i) { + int k = a[i].a[z[i]].secondary_all; + if (k >= 0 && k < n_pri[i]) { // switch secondary and primary if both of them are non-ALT + assert(a[i].a[k].secondary_all < 0); + for (j = 0; j < a[i].n; ++j) + if (a[i].a[j].secondary_all == k || j == k) + a[i].a[j].secondary_all = z[i]; + a[i].a[z[i]].secondary_all = -1; + } + } + + if (!(opt->flag & MEM_F_ALL)) { + for (i = 0; i < 2; ++i) XA[i] = mem_gen_alt(opt, bns, pac, &a[i], s[i].l_seq, s[i].seq); + } else + XA[0] = XA[1] = 0; + + // write SAM + + for (i = 0; i < 2; ++i) { + h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[z[i]]); + h[i].mapq = q_se[i]; + h[i].flag |= 0x40 << i | extra_flag; + h[i].XA = XA[i] ? XA[i][z[i]] : 0; + aa[i][n_aa[i]++] = h[i]; + if (n_pri[i] < a[i].n) { // the read has ALT hits + mem_alnreg_t* p = &a[i].a[n_pri[i]]; + if (p->score < opt->T || p->secondary >= 0 || !p->is_alt) + continue; + g[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, p); + g[i].flag |= 0x800 | 0x40 << i | extra_flag; + g[i].XA = XA[i] ? XA[i][n_pri[i]] : 0; + aa[i][n_aa[i]++] = g[i]; + } + } + ss[0].sam.l = 0; + // PROF_START(aln2sam); + for (i = 0; i < n_aa[0]; ++i) mem_aln2sam(opt, bns, &ss[0].sam, &s[0], n_aa[0], aa[0], i, &h[1]); // write read1 hits + ss[1].sam.l = 0; + for (i = 0; i < n_aa[1]; ++i) mem_aln2sam(opt, bns, &ss[1].sam, &s[1], n_aa[1], aa[1], i, &h[0]); // write read2 hits + if (strcmp(s[0].name, s[1].name) != 0) + err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name); + // free + for (i = 0; i < 2; ++i) { + free(h[i].cigar); + free(g[i].cigar); + if (XA[i] == 0) + continue; + for (j = 0; j < a[i].n; ++j) free(XA[i][j]); + free(XA[i]); + } + } else + goto no_pairing; + + goto end_clear; + +no_pairing: + for (i = 0; i < 2; ++i) { + int which = -1; + if (a[i].n) { + if (a[i].a[0].score >= opt->T) + which = 0; + else if (n_pri[i] < a[i].n && a[i].a[n_pri[i]].score >= opt->T) + which = n_pri[i]; + } + if (which >= 0) + h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[which]); + else + h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, 0); + } + if (!(opt->flag & MEM_F_NOPAIRING) && h[0].rid == h[1].rid && + h[0].rid >= 0) { // if the top hits from the two ends constitute a proper pair, flag it. + int64_t dist; + int d; + d = mem_infer_dir(bns->l_pac, a[0].a[0].rb, a[1].a[0].rb, &dist); + if (!pes[d].failed && dist >= pes[d].low && dist <= pes[d].high) + extra_flag |= 2; + } + mem_reg2sam(opt, bns, pac, &s[0], &a[0], 0x41 | extra_flag, &h[1], &ss[0]); + mem_reg2sam(opt, bns, pac, &s[1], &a[1], 0x81 | extra_flag, &h[0], &ss[1]); + if (strcmp(s[0].name, s[1].name) != 0) + err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name); + free(h[0].cigar); + free(h[1].cigar); + +end_clear: + +// 清理seq相关的内存空间 + _destory_clear_vec(s[0].msw_task); + _destory_clear_vec(s[1].msw_task); + + free(a[0].a); + free(a[1].a); +} + +// 最后再计算并生成sam数据 +static void workder_gen_sam(void* data, long idx, int tid) { + mem_worker_t* w = (mem_worker_t*)data; + const mem_opt_t* opt = w->opt; + const bntseq_t* bns = w->bns; + const uint8_t* pac = w->pac; + const mem_pestat_t *pes = w->pes; + + int startIdx = idx * opt->batch_size; + int endIdx = (idx + 1) * opt->batch_size; + if (endIdx > w->n_reads) + endIdx = w->n_reads; + + int id = 0, i = 0, j = 0, k = 0; + +#ifdef SAM_EXACT + mem_alnreg_v bb[2]; + kv_init(bb[0]); kv_init(bb[1]); +#endif + for (id = startIdx; id < endIdx; id += 2) { + // 初始化变量 + bseq1_t* s = &w->seqs[id]; + mem_alnreg_v* a = &w->regs[id]; + seq_sam_t* ss = &w->sams[id]; + +#if 0 + int cmp = strcmp("ERR194147.1629456", s[0].name) == 0; + // int cmp = (id + w->n_processed) == 202924; + if (cmp == 1) { + fprintf(stderr, "%s\n", s[0].name); + fflush(stderr); + //exit(0); + } +#endif + + int n[2] = {0}; +#ifndef SAM_EXACT + int nn[2] = {0}; +#endif + +#ifdef SAM_EXACT + _clear_vec(bb[0]); + _clear_vec(bb[1]); + for (i = 0; i < 2; ++i) { + for (k = 0; k < s[!i].msw_task.n; ++k) { + msw_seq_task_t st = s[!i].msw_task.a[k]; + msw_task_t* t = &w->msw->t_msw_tasks[st.thread_idx][st.arr_idx].a[st.task_idx]; + kv_push(mem_alnreg_t, bb[i], a[i].a[t->aj]); + } + } +#endif + + for (i = 0; i < 2; ++i) { + int si = !i; +#ifndef SAM_EXACT + int origin_n = a[si].n; +#endif + // 这里应该先给task排序,因为u8和i16是分开排序的,需要合在一起,不用了,添加task的时候已经排序过了 + for (k = 0; k < s[si].msw_task.n; ++k) { + msw_seq_task_t st = s[si].msw_task.a[k]; + msw_task_t* t = &w->msw->t_msw_tasks[st.thread_idx][st.arr_idx].a[st.task_idx]; +#ifdef SAM_EXACT + mem_alnreg_t* b = &bb[i].a[k]; +#else + mem_alnreg_t* b = &a[i].a[t->aj]; +#endif + mem_alnreg_v* ma = &a[si]; + uint8_t skip = 0; + if (n[si]) { // 添加了新的aln,需要检测skip +#ifdef SAM_EXACT + for (j = 0; j < ma->n; ++j) { // check which orinentation has been found +#else + for (j = origin_n; j < ma->n; ++j) { +#endif + int64_t dist; + int r; + r = mem_infer_dir(bns->l_pac, b->rb, ma->a[j].rb, &dist); + if (dist >= pes[r].low && dist <= pes[r].high) + skip |= 1 << r; + } + } + // 检查新添加的aln是不是把当前的任务skip掉了 + if ((t->skip | skip) == 15) + continue; +#if 0 + kswr_avx_t aln = t->aln; // fprintf(gf[0], "id-%ld score-%d te-%d qe-%d score2-%d te2-%d tb-%d qb-%d\n", s[si].id + + // w->n_processed, aln.score, aln.te, aln.qe, aln.score2, aln.te2, aln.tb, aln.qb); + if (cmp) { + fprintf(stderr, "id-%ld score-%d te-%d qe-%d score2-%d te2-%d tb-%d qb-%d\n", s[si].id + w->n_processed, aln.score, aln.te, + aln.qe, aln.score2, aln.te2, aln.tb, aln.qb); fflush(stderr); + } +#endif +#ifndef SAM_EXACT + nn[si] += 1; +#endif + n[si] += check_add_align(opt, t->aln, t->is_rev, bns->l_pac, b, s[si].l_seq, (uint8_t*)s[si].seq, ma, t->rb); + } + } + // 处理完2个pair read之后再排序,因为上面操作有插入,排序后之前记录的索引就无效了,所以上面不能排序,要两个pair read处理完之后再排序 +#ifndef SAM_EXACT + for (i = 0; i < 2; ++i) { + if (nn[i] > 0) { + a[i].n = mem_sort_dedup_patch(opt, 0, 0, 0, a[i].n, a[i].a); + } + } +#endif + + //////////////////////////////////////////////////////////// + // 这里需要传入全局id + generate_sam(opt, bns, pac, pes, (w->n_processed + id) >> 1, s, a, ss, w->n_processed, tid); + } + +#ifdef SAM_EXACT + _destory_clear_vec(bb[0]); + _destory_clear_vec(bb[1]); +#endif +} + +// 划分matesw任务 +static void gather_matesw_task(mem_worker_t* w, msw_task_v** thread_tasks) { + //_clear_vec(w->msw->p_msw_tasks_u8); + //_clear_vec(w->msw->p_msw_tasks_i16); + _destory_clear_vec(w->msw->p_msw_tasks_u8); + _destory_clear_vec(w->msw->p_msw_tasks_i16); + int i = 0, j = 0; + for (i = 0; i < w->opt->n_threads; ++i) { + for (j = 0; j < thread_tasks[i][0].n; ++j) { + msw_task_t* tp = &thread_tasks[i][0].a[j]; + kv_push(msw_task_t*, w->msw->p_msw_tasks_u8, tp); + } + for (j = 0; j < thread_tasks[i][1].n; ++j) { + msw_task_t* tp = &thread_tasks[i][1].a[j]; + kv_push(msw_task_t*, w->msw->p_msw_tasks_i16, tp); + } + } +} + +// 更新stats +static void update_msw_stats(mem_worker_t* w) { + int i = 0, max_seq_len = 0, max_ref_len = 0; + for (i = 0; i < w->opt->n_threads; ++i) { + max_ref_len = max_(max_ref_len, w->msw->t_msw_stats[i].max_ref_len); + max_seq_len = max_(max_seq_len, w->msw->t_msw_stats[i].max_seq_len); + } + int quanta = ((max_seq_len + 16 - 1) / 16) * 16; // based on SSE-8 bit lane + max_seq_len = quanta + 1; // 这里需要加一,因为ksw_avx512里赋值的时候是 <= ncol + + w->msw->p_msw_stats.max_ref_len = max_ref_len + 1; + w->msw->p_msw_stats.max_seq_len = max_seq_len; +} + +// 开辟缓冲区 +static void alloc_update_cache_avx512(mem_worker_t* w) { + int i = 0; + for (i = 0; i < w->opt->n_threads; ++i) { + msw_buf_t* b = &w->msw->t_msw_buf[i]; + // 更新跟ref len有关的缓冲区 + if (b->ref_len < w->msw->p_msw_stats.max_ref_len) { + b->ref_len = w->msw->p_msw_stats.max_ref_len; + if (b->refArr) _mm_free(b->refArr); + if (b->rowMax) _mm_free(b->rowMax); + b->refArr = (uint8_t*)_mm_malloc(b->ref_len * SIMD512_WIDTH8 * sizeof(uint8_t), 64); + b->rowMax = (uint8_t*)_mm_malloc(b->ref_len * SIMD512_WIDTH8 * sizeof(uint8_t), 64); + } + // 更新跟seq len有关的缓冲区 + if (b->seq_len < w->msw->p_msw_stats.max_seq_len) { + b->seq_len = w->msw->p_msw_stats.max_seq_len; + if (b->seqArr) _mm_free(b->seqArr); + if (b->H0) _mm_free(b->H0); + if (b->H1) _mm_free(b->H1); + if (b->Hmax) _mm_free(b->Hmax); + if (b->F) _mm_free(b->F); + b->seqArr = (uint8_t*)_mm_malloc(b->seq_len * SIMD512_WIDTH8 * sizeof(uint8_t), 64); + b->H0 = (uint8_t*)_mm_malloc(b->seq_len * SIMD512_WIDTH8 * sizeof(uint8_t), 64); + b->H1 = (uint8_t*)_mm_malloc(b->seq_len * SIMD512_WIDTH8 * sizeof(uint8_t), 64); + b->Hmax = (uint8_t*)_mm_malloc(b->seq_len * SIMD512_WIDTH8 * sizeof(uint8_t), 64); + b->F = (uint8_t*)_mm_malloc(b->seq_len * SIMD512_WIDTH8 * sizeof(uint8_t), 64); + } + } +} + +void calc_worker_mem(mem_worker_t* w) { + // 1. seqs 所占空间 + int64_t bytes = 0; + int64_t all = 0; + double gb_d = 1024.0 * 1024 * 1024; + int i, j, k; + for (i = 0; in_reads; ++i) { + bseq1_t* s = &w->seqs[i]; + bytes += sizeof(*s) + s->m_seq + s->m_name + s->m_comment + s->m_seq + s->m_qual + s->msw_task.m * sizeof(msw_seq_task_t); + } + fprintf(stderr, "seqs: %f GB\n", bytes / gb_d); + all += bytes; + // 2. sams 内存 + bytes = 0; + for (i = 0; i < w->n_reads; ++i) { + seq_sam_t* s = &w->sams[i]; + bytes += sizeof(*s) + s->sam.m; + } + fprintf(stderr, "sams: %f GB\n", bytes / gb_d); + all += bytes; + // 3. smem_arr + bytes = 0; + for (i = 0; i < w->opt->n_threads; ++i) { + bytes += sizeof(smem_v*); + bytes += w->opt->batch_size * sizeof(smem_v); + for (j = 0; j < w->opt->batch_size; ++j) { + smem_v* sv = &w->smem_arr[i][j]; + bytes += sizeof(bwtintv_v) + sv->mem.m * sizeof(bwtintv_t); + bytes += sizeof(uint64_v) + sv->pos_arr.m * sizeof(uint64_t); + } + bytes += w->opt->batch_size * sizeof(uint64_t); + } + fprintf(stderr, "smem_arr: %f GB\n", bytes / gb_d); + all += bytes; + // 4. seed_arr + bytes = 0; + for (i = 0; i < w->opt->n_threads; ++i) { + bytes += sizeof(HybSeedArr*); + bytes += w->opt->batch_size * sizeof(HybSeedArr); + for (j = 0; j < w->opt->batch_size; ++j) { + HybSeedArr* hsa = &w->seed_arr[i][j]; + for (k = 0; k < hsa->m; ++k) { + bytes += sizeof(HybSeed) + hsa->a[k].ref_pos_arr.m * sizeof(uint64_t); + } + } + } + fprintf(stderr, "seed_arr: %f GB\n", bytes / gb_d); + all += bytes; + // 5. chain_arr + bytes = 0; + for (i = 0; i < w->opt->n_threads; ++i) { + bytes += sizeof(mem_chain_v*); + bytes += w->opt->batch_size * sizeof(mem_chain_v); + for (j = 0; j < w->opt->batch_size; ++j) { + mem_chain_v* cv = &w->chain_arr[i][j]; + for (k = 0; k < cv->m; ++k) { + bytes += sizeof(mem_chain_t) + cv->a[k].m * sizeof(mem_seed_t); + } + } + } + fprintf(stderr, "chain_arr: %f GB\n", bytes / gb_d); + all += bytes; + // 6. regs + bytes = 0; + for (i = 0; i < w->n_reads; ++i) { + bytes += sizeof(mem_alnreg_v) + w->regs[i].m * sizeof(mem_alnreg_t); + } + fprintf(stderr, "regs: %f GB\n", bytes / gb_d); + all += bytes; + // 7. isize_arr + bytes = 0; + for (i = 0; i < w->opt->n_threads; ++i) { + bytes += sizeof(uint64_v*) + w->isize_arr[i]->m * sizeof(uint64_t); + } + fprintf(stderr, "isize_arr: %f GB\n", bytes / gb_d); + all += bytes; + // 8. msw + bytes = 0; + calc_msw_mem_size(w->msw, w->opt->n_threads, &bytes); + all += bytes; + + fprintf(stderr, "msw: %f GB\n", bytes / gb_d); + + fprintf(stderr, "[all]: %f GB\n", all / gb_d); +} + +// 针对pair end数据,生成sam的过程 +void gen_paired_sam(mem_worker_t* w) { + // calc_worker_mem(w); + if (w->opt->flag & MEM_F_NO_RESCUE) { + kt_for(w->opt->n_threads, worker_sam, w, w->n_reads >> 1); // generate alignment + } else { + // 清空一下任务数组,不能放到线程里去做 + int i = 0; + for (i = 0; i < w->opt->n_threads; ++i) { + //_clear_vec(w->msw->t_msw_tasks[i][0]); + //_clear_vec(w->msw->t_msw_tasks[i][1]); + _destory_clear_vec(w->msw->t_msw_tasks[i][0]); + _destory_clear_vec(w->msw->t_msw_tasks[i][1]); + } + int batch_n = (w->n_reads + w->opt->batch_size - 1) / w->opt->batch_size; + + // 1. 计算哪些read需要matesw + PROF_START(get_matesw_data); + kt_for(w->opt->n_threads, worker_matesw_tasks, w, batch_n); + PROF_END(gprof[G_get_matesw_data], get_matesw_data); + + // 更新stats + PROF_START(update_stats_cache); + update_msw_stats(w); + // 开辟缓冲区 + alloc_update_cache_avx512(w); + PROF_END(gprof[G_update_stats_cache], update_stats_cache); + + // 2. 收集每个线程中的msw任务 + PROF_START(gather_matesw_task); + gather_matesw_task(w, w->msw->t_msw_tasks); + PROF_END(gprof[G_gather_matesw_task], gather_matesw_task); + + PROF_START(calc_matesw); + int msw_batch_n = 0; + // 3. 处理msw任务 + if (w->msw->p_msw_tasks_u8.n > 0) { + msw_batch_n = (w->msw->p_msw_tasks_u8.n + w->opt->msw_batch_size - 1) / w->opt->msw_batch_size; + kt_for(w->opt->n_threads, worker_calc_matesw_avx512_u8, w, msw_batch_n); + } + if (w->msw->p_msw_tasks_i16.n > 0) { + msw_batch_n = (w->msw->p_msw_tasks_i16.n + w->opt->msw_batch_size - 1) / w->opt->msw_batch_size; + kt_for(w->opt->n_threads, worker_calc_matesw_avx512_i16, w, msw_batch_n); + } + PROF_END(gprof[G_calc_matesw], calc_matesw); + + // 4. 生成sam + PROF_START(gen_sam); + kt_for(w->opt->n_threads, workder_gen_sam, w, batch_n); + PROF_END(gprof[G_gen_sam], gen_sam); + } + fprintf(stderr, "zzh : u8: %ld i16: %ld\n", w->msw->p_msw_tasks_u8.n, w->msw->p_msw_tasks_i16.n); +} \ No newline at end of file diff --git a/paired_sam.h b/paired_sam.h new file mode 100644 index 0000000..41dfd9d --- /dev/null +++ b/paired_sam.h @@ -0,0 +1,44 @@ +/* + Description: 针对paired数据,计算生成sam步骤的优化 + + Copyright : All right reserved by ICT + + Author : Zhang Zhonghai + Date : 2026/01/08 +*/ +#pragma once + +#include "bwamem.h" + +// for avx512 +#define SIMD512_WIDTH8 64 +#define SIMD512_WIDTH16 32 + + + +void gen_paired_sam(mem_worker_t* w); + +extern int mem_sam_pe(const mem_opt_t* opt, const bntseq_t* bns, const uint8_t* pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], + mem_alnreg_v a[2], seq_sam_t ss[2], int tid); + +extern void mem_reg2ovlp(const mem_opt_t* opt, const bntseq_t* bns, const uint8_t* pac, bseq1_t* s, mem_alnreg_v* a); + +extern void kt_for(int n_threads, void (*func)(void*, long, int), void* data, int n); + +extern int mem_sort_dedup_patch(const mem_opt_t* opt, const bntseq_t* bns, const uint8_t* pac, uint8_t* query, int n, mem_alnreg_t* a); + +extern void mem_reg2sam(const mem_opt_t* opt, const bntseq_t* bns, const uint8_t* pac, bseq1_t* s, mem_alnreg_v* a, int extra_flag, + const mem_aln_t* m, seq_sam_t* ss); + +extern void mem_aln2sam(const mem_opt_t* opt, const bntseq_t* bns, kstring_t* str, bseq1_t* s, int n, const mem_aln_t* list, int which, + const mem_aln_t* m); + +extern int mem_mark_primary_se(const mem_opt_t* opt, int n, mem_alnreg_t* a, int64_t id); +extern int mem_approx_mapq_se(const mem_opt_t* opt, const mem_alnreg_t* a); + +extern char** mem_gen_alt(const mem_opt_t* opt, const bntseq_t* bns, const uint8_t* pac, const mem_alnreg_v* a, int l_query, const char* query); + +extern void mem_reorder_primary5(int T, mem_alnreg_v* a); + +extern int mem_pair(const mem_opt_t* opt, const bntseq_t* bns, const uint8_t* pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, + int* sub, int* n_sub, int z[2], int n_pri[2]); \ No newline at end of file diff --git a/profiling.c b/profiling.c index fd23eed..1a8c78f 100644 --- a/profiling.c +++ b/profiling.c @@ -56,9 +56,9 @@ int find_opt(uint64_t *a, int len, double *max, double *min, double *avg) return 1; } -uint64_t get_sum(uint64_t *a, int len) { +int64_t get_sum(int64_t *a, int len) { int i = 0; - uint64_t all = 0; + int64_t all = 0; for (i = 0; i < len; i++) { all += a[i]; } @@ -143,6 +143,12 @@ int display_stats(int nthreads) // for gen-sam FORMAT_PERF_OUT("gen-sam", gprof[G_GEN_SAM] * 1.0 / proc_freq, 0); + FORMAT_PERF_OUT("get_matesw_data", gprof[G_get_matesw_data] * 1.0 / proc_freq, 1); + FORMAT_PERF_OUT("update_stats_cache", gprof[G_update_stats_cache] * 1.0 / proc_freq, 1); + FORMAT_PERF_OUT("gather_matesw_task", gprof[G_gather_matesw_task] * 1.0 / proc_freq, 1); + FORMAT_PERF_OUT("calc_matesw", gprof[G_calc_matesw] * 1.0 / proc_freq, 1); + FORMAT_PERF_OUT("gen_sam", gprof[G_gen_sam] * 1.0 / proc_freq, 1); + FORMAT_PERF_OUT_3("sam_mate_sw", tprof[T_SAM_MATESW], 1); FORMAT_PERF_OUT_3("mate_sw_1", tprof[T_MSW_1], 2); FORMAT_PERF_OUT_3("mate_sw_2", tprof[T_MSW_2], 2); @@ -246,7 +252,7 @@ int display_stats(int nthreads) #ifdef SHOW_DATA_PERF fprintf(stderr, "\n"); fprintf(stderr, "average seed cnt: %0.2lf\n", get_sum(tdat[TD_SEED_CNT], nthreads) * 1.0 / gdat[GD_READ_CNT]); - fprintf(stderr, "average matesw cnt: %0.2lf\n", get_sum(tdat[TD_MATESW_CNT], nthreads) * 1.0 / gdat[GD_READ_CNT]); + fprintf(stderr, "average matesw cnt: %0.2lf\n", get_sum(tdat[TD_MSW_CNT], nthreads) * 1.0 / gdat[GD_READ_CNT]); fprintf(stderr, "align 1 cnt: %ld\n", get_sum(tdat[TD_ALIGN_1_CNT], nthreads)); fprintf(stderr, "align 2 cnt: %ld\n", get_sum(tdat[TD_ALIGN_2_CNT], nthreads)); @@ -254,4 +260,4 @@ int display_stats(int nthreads) fprintf(stderr, "\n"); return 0; -} \ No newline at end of file +} diff --git a/profiling.h b/profiling.h index 71b28f7..98ad174 100644 --- a/profiling.h +++ b/profiling.h @@ -56,9 +56,9 @@ enum { TD_SEED_1_4, TD_SEED_1_5, TD_SEED_CNT, - TD_MATESW_CNT, TD_ALIGN_1_CNT, TD_ALIGN_2_CNT, + TD_MSW_CNT, }; // gdat @@ -77,7 +77,12 @@ enum { G_SEED_AND_EXT, G_MEM_PESTAT, G_GEN_SAM, - G_UNCOMPRESS + G_UNCOMPRESS, + G_gather_matesw_task, + G_calc_matesw, + G_get_matesw_data, + G_gen_sam, + G_update_stats_cache, }; // THREAD @@ -99,6 +104,10 @@ enum { T_SORT_DEDUP, T_GEN_SAM, T_MEM_REG2ALN, + T_MSW_1, + T_MSW_2, + T_MSW_GET_REF, + T_MSW_PACK_SEQ, T_CHAIN_ALL, T_ALN_ALL, @@ -143,8 +152,6 @@ enum { T_SEED_3_3_0, T_SEED_3_3_1, T_SEED_3_3_2, - T_MSW_1, - T_MSW_2, }; int display_stats(int); diff --git a/utils.h b/utils.h index ee236ed..aa4d2ff 100644 --- a/utils.h +++ b/utils.h @@ -35,6 +35,7 @@ #include #include "debug.h" +#include "kvec.h" #include "profiling.h" #ifdef __GNUC__ @@ -57,6 +58,43 @@ ///////////// added for hyb-align ///////////// +// rdtsc +#if defined(__GNUC__) && __GNUC__ < 11 && !defined(__clang__) +#if defined(__i386__) +static inline unsigned long long __rdtsc(void) { + unsigned long long int x; + __asm__ volatile(".byte 0x0f, 0x31" : "=A"(x)); + return x; +} +#elif defined(__x86_64__) +static inline unsigned long long __rdtsc(void) { + unsigned hi, lo; + __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi)); + return ((unsigned long long)lo) | (((unsigned long long)hi) << 32); +} +#endif +#endif +// end of rdtsc + +#define _destory_clear_vec(arr) \ + do { \ + free((arr).a); \ + (arr).a = 0; \ + (arr).m = (arr).n = 0; \ + } while (0) + +#define _destory_clear_kstring(arr) \ + do { \ + free((arr).s); \ + (arr).s = 0; \ + (arr).m = (arr).l = 0; \ + } while (0) + +#define _clear_vec(arr) \ + do { \ + (arr).n = 0; \ + } while (0) + #undef MAX #undef MIN #define MAX(x, y) ((x) > (y) ? (x) : (y)) @@ -125,6 +163,11 @@ typedef struct { typedef struct { size_t n, m; uint64_t *a; } uint64_v; typedef struct { size_t n, m; pair64_t *a; } pair64_v; +typedef kvec_t(uint32_t) uint32_v; +typedef kvec_t(int) int_v; +typedef kvec_t(uint8_t) byte_v; +typedef kvec_t(byte_v) byte_vv; + typedef struct { size_t m; uint8_t* addr;