diff --git a/.gitignore b/.gitignore index 2421f71..72181d9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +*.log + # Prerequisites *.d fastalign diff --git a/.vscode/launch.json b/.vscode/launch.json index 905ef22..86d6645 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -17,12 +17,11 @@ "-M", "-R", "'@RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa'", - "~/data1/fmt_ref/human_g1k_v37_decoy.fasta", - "~/data/fastq/dataset/na12878_wes_144/1w_1.fq", - "~/data/fastq/dataset/na12878_wes_144/1w_2.fq", + "~/data/reference/fmt/human_g1k_v37_decoy.fasta", + "~/data/dataset/D1/n1.fq.gz", + "~/data/dataset/D1/n2.fq.gz", "-o", "/dev/null", - "-Z" ], "cwd": "${workspaceFolder}", // 当前工作路径:当前文件所在的工作空间 }, diff --git a/Makefile b/Makefile index a13c7a8..ac5935f 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ CC= gcc -CFLAGS= -g -Wall -Wno-unused-function -mavx2 #-O2 +CFLAGS= -g -Wall -Wno-unused-function -mavx2 -mavx512bw -O2 WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS SHOW_PERF= -DSHOW_PERF @@ -16,7 +16,7 @@ AOBJS= bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ bwtsw2_chain.o fastmap.o bwtsw2_pair.o profiling.o \ fmt_idx.o ksw_extend2_avx2.o ksw_extend2_avx2_u8.o \ - debug.o + debug.o paired_sam.o ksw_align_avx2.o ksw_align_avx512.o PROG= fastalign INCLUDES= LIBS= -lm -lz -lpthread -ldl diff --git a/bntseq.c b/bntseq.c index b2163e9..0322040 100644 --- a/bntseq.c +++ b/bntseq.c @@ -400,6 +400,7 @@ int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id) return nn; } +// todo, 修改成不开辟内存 uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len) { uint8_t *seq = 0; diff --git a/bwa.h b/bwa.h index 0059fa0..989d049 100644 --- a/bwa.h +++ b/bwa.h @@ -32,6 +32,9 @@ #include "kstring.h" #include "bwt.h" #include "fmt_idx.h" +#include "kvec.h" +#include "ksw.h" +#include "ksw_align_avx.h" #define BWA_IDX_BWT 0x1 #define BWA_IDX_BNS 0x2 @@ -48,32 +51,40 @@ #define BWA_DBG_QNAME 0x1 -typedef struct { - uint64_t *kmer_offsets; // ert kmer - uint8_t *mlt_table; // ert mlt - uint8_t *bref; // binary ref - uint64_t kmer_size; - uint64_t mlt_size; - uint64_t bref_size; -} ERT; - typedef struct { bwt_t *bwt; // FM-index FMTIndex *fmt;// FMT-index bntseq_t *bns; // information on the reference sequences uint8_t *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base - ERT *ert; int is_shm; int64_t l_mem; uint8_t *mem; } bwaidx_t; + +// 需要做mate sw的read typedef struct { - int l_seq, id; + int is_rev; // seq是否在反向互补链上 + int xtra; + int64_t rb, re; // ref的起始截止位置,左闭右开 + int64_t seq_id; // 对应的当前数据块的seq id + kswr_avx_t aln; +} matesw_data_t; + +typedef struct { + size_t n, m; + matesw_data_t* a; +} matesw_data_v; + +typedef kvec_t(matesw_data_t*) matesw_ptr_v; + +typedef struct { + int l_seq, id; int m_name, m_comment, m_seq, m_qual; char *name, *comment, *seq, *qual; - kstring_t sam; + matesw_ptr_v msw; + kstring_t sam; } bseq1_t; typedef struct { diff --git a/bwamem.c b/bwamem.c index 649d588..b169cb0 100644 --- a/bwamem.c +++ b/bwamem.c @@ -45,6 +45,7 @@ #include "fmt_idx.h" #include "profiling.h" #include "debug.h" +#include "paired_sam.h" #ifdef USE_MALLOC_WRAPPERS # include "malloc_wrap.h" @@ -529,7 +530,7 @@ int mem_sort_dedup_patch(const mem_opt_t *opt, const bntseq_t *bns, const uint8_ return m; } -typedef kvec_t(int) int_v; +// typedef kvec_t(int) int_v; static void mem_mark_primary_se_core(const mem_opt_t *opt, int n, mem_alnreg_t *a, int_v *z) { // similar to the loop in mem_chain_flt() @@ -954,7 +955,7 @@ void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq if ((p->cigar[p->n_cigar-1]&0xf) == 4 || (p->cigar[p->n_cigar-1]&0xf) == 3) qb += p->cigar[p->n_cigar-1]>>4; } ks_resize(str, str->l + (qe - qb) + 1); - for (i = qe-1; i >= qb; --i) str->s[str->l++] = "TGCAN"[(int)s->seq[i]]; + for (i = qe-1; i >= qb; --i) str->s[str->l++] = "TGCAN"[(int)s->seq[i]]; // 反向互补 kputc('\t', str); if (s->qual) { // printf qual ks_resize(str, str->l + (qe - qb) + 1); @@ -1016,17 +1017,18 @@ void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq * Integrated interface * ************************/ +// 计算匹配质量分数 int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) { int mapq, l, sub = a->sub? a->sub : opt->min_seed_len * opt->a; double identity; sub = a->csub > sub? a->csub : sub; - if (sub >= a->score) return 0; - l = a->qe - a->qb > a->re - a->rb? a->qe - a->qb : a->re - a->rb; - identity = 1. - (double)(l * opt->a - a->score) / (opt->a + opt->b) / l; + if (sub >= a->score) return 0; // score还没有次优高,那就匹配质量分数为0 + l = a->qe - a->qb > a->re - a->rb? a->qe - a->qb : a->re - a->rb; // 匹配的seq和ref长度较长的那个 + identity = 1. - (double)(l * opt->a - a->score) / (opt->a + opt->b) / l; // 应该是跟全匹配比较一下差多少 if (a->score == 0) { mapq = 0; - } else if (opt->mapQ_coef_len > 0) { + } else if (opt->mapQ_coef_len > 0) { // 匹配质量分数相关系数长度? double tmp; tmp = l < opt->mapQ_coef_len? 1. : opt->mapQ_coef_fac / log(l); tmp *= identity * identity; @@ -1038,7 +1040,7 @@ int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) if (a->sub_n > 0) mapq -= (int)(4.343 * log(a->sub_n+1) + .499); if (mapq > 60) mapq = 60; if (mapq < 0) mapq = 0; - mapq = (int)(mapq * (1. - a->frac_rep) + .499); + mapq = (int)(mapq * (1. - a->frac_rep) + .499); // 重叠区会调低匹配质量分数 return mapq; } @@ -1080,7 +1082,7 @@ void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, kv_init(aa); // str.l = str.m = 0; str.s = 0; ss->sam.l = 0; - for (k = l = 0; k < a->n; ++k) { + for (k = l = 0; k < a->n; ++k) { // 处理当前seq的所有匹配结果 mem_alnreg_t *p = &a->a[k]; mem_aln_t *q; if (p->score < opt->T) continue; @@ -1154,6 +1156,7 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const FMTIndex *fmt, const bn return regs; } +// 局部结果转成最终匹配结果 mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar) { mem_aln_t a; @@ -1166,14 +1169,14 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * a.rid = -1; a.pos = -1; a.flag |= 0x4; return a; } - qb = ar->qb, qe = ar->qe; - rb = ar->rb, re = ar->re; + qb = ar->qb, qe = ar->qe; // query的起始终止位置 + rb = ar->rb, re = ar->re; // reference的起始终止位置 query = malloc(l_query); for (i = 0; i < l_query; ++i) // convert to the nt4 encoding - query[i] = query_[i] < 5? query_[i] : nst_nt4_table[(int)query_[i]]; - a.mapq = ar->secondary < 0? mem_approx_mapq_se(opt, ar) : 0; + query[i] = query_[i] < 5? query_[i] : nst_nt4_table[(int)query_[i]]; // 这里不用了吧,seeding的时候都已经这样处理过了 + a.mapq = ar->secondary < 0? mem_approx_mapq_se(opt, ar) : 0; // 是首选匹配就计算质量分数 if (ar->secondary >= 0) a.flag |= 0x100; // secondary alignment - tmp = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->o_del, opt->e_del); + tmp = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->o_del, opt->e_del); // 计算banded width w2 = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->o_ins, opt->e_ins); w2 = w2 > tmp? w2 : tmp; if (bwa_verbose >= 4) printf("* Band width: inferred=%d, cmd_opt=%d, alnreg=%d\n", w2, opt->w, ar->w); @@ -1189,10 +1192,10 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * w2 <<= 1; } while (++i < 3 && score < ar->truesc - opt->a); l_MD = strlen((char*)(a.cigar + a.n_cigar)) + 1; - a.NM = NM; + a.NM = NM; // 编辑距离 pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev); a.is_rev = is_rev; - if (a.n_cigar > 0) { // squeeze out leading or trailing deletions + if (a.n_cigar > 0) { // squeeze out leading or trailing deletions,去除首尾的deletion if ((a.cigar[0]&0xf) == 2) { pos += a.cigar[0]>>4; --a.n_cigar; @@ -1202,7 +1205,7 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * memmove(a.cigar + a.n_cigar, a.cigar + a.n_cigar + 1, l_MD); // MD needs to be moved accordingly } } - if (qb != 0 || qe != l_query) { // add clipping to CIGAR + if (qb != 0 || qe != l_query) { // add clipping to CIGAR,添加首尾的cliping int clip5, clip3; clip5 = is_rev? l_query - qe : qb; clip3 = is_rev? qb : l_query - qe; @@ -1244,6 +1247,18 @@ static void worker_sam(void *data, int i, int tid) } } +static void worker_sam_single_end(void *data, int i, int tid) +{ + extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2], seq_sam_t ss[2], int tid); + extern void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a); + mem_worker_t *w = (mem_worker_t*)data; + if (bwa_verbose >= 4) printf("=====> Finalizing read '%s' <=====\n", w->seqs[i].name); + mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a, w->n_processed + i); + if (w->opt->flag & MEM_F_PRIMARY5) mem_reorder_primary5(w->opt->T, &w->regs[i]); + mem_reg2sam(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0, &w->sams[i]); + free(w->regs[i].a); +} + static smem_aux_t *smem_aux_init() { smem_aux_t *a; @@ -1264,29 +1279,35 @@ static void smem_aux_destroy(smem_aux_t *a) } // -mem_worker_t *init_mem_worker(const mem_opt_t *opt, const FMTIndex *fmt, const ERT *ert, const bntseq_t *bns, const uint8_t *pac, int useERT) +mem_worker_t *init_mem_worker(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t *bns, const uint8_t *pac) { int i = opt->n_threads, j; mem_worker_t *w = calloc(1, sizeof(mem_worker_t)); - w->opt = opt; w->bns = bns; w->pac = pac; w->fmt = fmt; w->ert = ert; w->useERT = useERT; + w->opt = opt; w->bns = bns; w->pac = pac; w->fmt = fmt; w->calc_isize = 0; w->n = 0; w->regs = 0; w->aux = malloc(i * sizeof(smem_aux_t)); w->smem_arr = malloc(i * sizeof(smem_v *)); w->chain_arr = malloc(i * sizeof(mem_chain_v *)); w->isize_arr = malloc(i * sizeof(uint64_v *)); + w->matesw_arr8 = malloc(i * sizeof(matesw_data_v*)); + w->matesw_arr16 = malloc(i * sizeof(matesw_data_v*)); + w->msw_buf = calloc(i, sizeof(matesw_buf_t)); + w->mem_stats = calloc(i, sizeof(mem_stats_t)); - for (i = 0; i < opt->n_threads; ++i) { - w->aux[i] = smem_aux_init(); + for (i = 0; i < opt->n_threads; ++i) { + w->aux[i] = smem_aux_init(); w->smem_arr[i] = malloc(opt->batch_size * sizeof(smem_v)); w->chain_arr[i] = malloc(opt->batch_size * sizeof(mem_chain_v)); w->isize_arr[i] = calloc(4, sizeof(uint64_v)); - for (j = 0; j < opt->batch_size; ++j) { + w->matesw_arr8[i] = calloc(1, sizeof(matesw_data_v)); + w->matesw_arr16[i] = calloc(1, sizeof(matesw_data_v)); + for (j = 0; j < opt->batch_size; ++j) { kv_init(w->smem_arr[i][j].mem); kv_init(w->smem_arr[i][j].pos_arr); kv_init(w->chain_arr[i][j]); } - } - return w; + } + return w; } static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, int len, const uint8_t *seq, smem_v *smem, smem_aux_t *a, int tid) @@ -1649,13 +1670,20 @@ void mem_process_seqs(const mem_opt_t *opt, mem_worker_t *w, int64_t n_processed if (opt->flag&MEM_F_PE) { // infer insert sizes if not provided if (pes0) memcpy(pes, pes0, 4 * sizeof(mem_pestat_t)); // if pes0 != NULL, set the insert-size distribution as pes0 else mem_pestat(opt, w->bns->l_pac, n, w->isize_arr, pes); // otherwise, infer the insert size distribution from data - //else mem_pestat_old(opt, w->bns->l_pac, n, w->regs, pes); } PROF_END(gprof[G_MEM_PESTAT], pestat); PROF_START(mem_sam); +#if 0 kt_for(opt->n_threads, worker_sam, w, (opt->flag&MEM_F_PE)? n>>1 : n); // generate alignment - PROF_END(gprof[G_MEM_SAM], mem_sam); +#else + if ((opt->flag & MEM_F_PE)) { // pair-end + gen_paired_sam(w); + } else { + kt_for(opt->n_threads, worker_sam_single_end, w, n); // generate alignment + } +#endif + PROF_END(gprof[G_MEM_SAM], mem_sam); //free(w.regs); if (bwa_verbose >= 3) diff --git a/bwamem.h b/bwamem.h index e7fe380..fa9c0bd 100644 --- a/bwamem.h +++ b/bwamem.h @@ -93,7 +93,7 @@ typedef struct { int score; // best local SW score int truesc; // actual score corresponding to the aligned region; possibly smaller than $score int sub; // 2nd best SW score - int alt_sc; + int alt_sc; // 应该是与truesc对应的,2nd true sc int csub; // SW score of a tandem hit int sub_n; // approximate number of suboptimal hits int w; // actual band width used in extension @@ -102,9 +102,9 @@ typedef struct { int secondary_all; int seedlen0; // length of the starting seed int n_comp:30, is_alt:2; // number of sub-alignments chained together - float frac_rep; + float frac_rep; // 好像只有seed的occ超过一定次数才进行计算,有啥用? 应该跟重复区有关 uint64_t hash; -} mem_alnreg_t; +} mem_alnreg_t; // 这个reg到底是啥? region? 局部比对结果? typedef struct { size_t n, m; mem_alnreg_t *a; } mem_alnreg_v; @@ -152,12 +152,25 @@ typedef struct { uint64_v pos_arr; } smem_v; +typedef struct { + int start_tid; + int start_idx; + int end_tid; + int end_idx; +} matesw_range_t; + +typedef kvec_t(matesw_range_t) matesw_range_v; + +typedef struct { + int max_seq_len; + int max_msw_ref_len; // mate sw里的最长的ref长度 +} mem_stats_t; // 记录一些用到的数据统计,比如最长ref长度,最长seq长度等 + typedef struct { int calc_isize; const mem_opt_t *opt; const bwt_t *bwt; const FMTIndex *fmt; - const ERT *ert; const bntseq_t *bns; const uint8_t *pac; const mem_pestat_t *pes; @@ -168,16 +181,23 @@ typedef struct { mem_chain_v **chain_arr; mem_alnreg_v *regs; uint64_v **isize_arr; - int64_t n_processed; - int64_t n; + matesw_data_v** matesw_arr8; // 收集需要做mate sw的任务的数据 + matesw_data_v** matesw_arr16; + //matesw_range_v msw_range_arr8; + //matesw_range_v msw_range_arr16; + matesw_ptr_v msw_tasks8; // 需要做mates sw的任务的指针 + matesw_ptr_v msw_tasks16; + matesw_buf_t* msw_buf; // mate sw过程用到的缓存空间 + mem_stats_t* mem_stats; // 记录一些开辟空间等用到的统计数据,如seq长度,ref长度等 + int64_t n_processed; + int64_t n; // regs元素个数,可动态扩展 int64_t n_reads; - int useERT; } mem_worker_t; #ifdef __cplusplus extern "C" { #endif - mem_worker_t *init_mem_worker(const mem_opt_t *opt, const FMTIndex *fmt, const ERT *ert, const bntseq_t *bns, const uint8_t *pac, int useERT); + mem_worker_t *init_mem_worker(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t *bns, const uint8_t *pac); smem_i *smem_itr_init(const bwt_t *bwt); void smem_itr_destroy(smem_i *itr); diff --git a/bwamem_pair.c b/bwamem_pair.c index 7974447..cebfdf5 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -233,7 +233,7 @@ int mem_matesw(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co for (r = 0; r < 4; ++r) { int is_rev, is_larger; uint8_t *seq, *rev = 0, *ref = 0; - int64_t rb, re; + int64_t rb, re; // 左闭右开 if (skip[r]) continue; is_rev = (r>>1 != (r&1)); // whether to reverse complement the mate is_larger = !(r>>1); // whether the mate has larger coordinate @@ -256,7 +256,6 @@ int mem_matesw(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co kswr_t aln; mem_alnreg_t b; int tmp, xtra = KSW_XSUBO | KSW_XSTART | (l_ms * opt->a < 250? KSW_XBYTE : 0) | (opt->min_seed_len * opt->a); - //fprintf(stderr, "%d\n", xtra); aln = ksw_align2(l_ms, seq, re - rb, ref, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, xtra, 0); memset(&b, 0, sizeof(mem_alnreg_t)); if (aln.score >= opt->min_seed_len && aln.qb >= 0) { // something goes wrong if aln.qb < 0 diff --git a/bwashm.c b/bwashm.c index 4aa1a3b..d280373 100644 --- a/bwashm.c +++ b/bwashm.c @@ -9,7 +9,7 @@ #include #include "bwa.h" -int bwa_shm_stage(bwaidx_t *idx, const char *hint, int useERT) +int bwa_shm_stage(bwaidx_t *idx, const char *hint) { const char *name; diff --git a/bwtindex.c b/bwtindex.c index 2670243..25e7897 100644 --- a/bwtindex.c +++ b/bwtindex.c @@ -343,7 +343,7 @@ int bwa_index(int argc, char *argv[]) // the "index" command fprintf(stderr, "Usage: fastbwa index [options] \n\n"); fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw, is or rb2 [auto]\n"); fprintf(stderr, " -p STR prefix of the index [same as fasta name]\n"); - fprintf(stderr, " -t INT number of threads for ERT index building [%d]\n", num_threads); + fprintf(stderr, " -t INT number of threads for index building [%d]\n", num_threads); fprintf(stderr, " -b INT block size for the bwtsw algorithm (effective with -a bwtsw) [%d]\n", block_size); fprintf(stderr, " -6 index files named as .64.* instead of .* \n"); fprintf(stderr, "\n"); diff --git a/fastmap.c b/fastmap.c index 2185e2e..8f08852 100644 --- a/fastmap.c +++ b/fastmap.c @@ -24,26 +24,28 @@ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include -#include -#include -#include -#include -#include #include +#include +#include +#include #include #include -#include +#include +#include +#include +#include +#include + +#include "bntseq.h" #include "bwa.h" #include "bwamem.h" -#include "kvec.h" -#include "utils.h" -#include "bntseq.h" -#include "kseq.h" -#include "fmt_idx.h" -#include "yarn.h" -#include "profiling.h" #include "debug.h" +#include "fmt_idx.h" +#include "kseq.h" +#include "kvec.h" +#include "profiling.h" +#include "utils.h" +#include "yarn.h" KSEQ_DECLARE(gzFile) extern unsigned char nst_nt4_table[256]; @@ -382,7 +384,6 @@ int main_mem(int argc, char *argv[]) void *ko = 0, *ko2 = 0; mem_pestat_t pes[4]; ktp_aux_t aux; - int useERT = 0; memset(&aux, 0, sizeof(ktp_aux_t)); memset(pes, 0, 4 * sizeof(mem_pestat_t)); @@ -639,7 +640,7 @@ int main_mem(int argc, char *argv[]) } //fprintf(stderr, "%ld %ld %ld %ld %ld\n", aux.idx->fmt->L2[0], aux.idx->fmt->L2[1], aux.idx->fmt->L2[2], aux.idx->fmt->L2[3], aux.idx->fmt->L2[4]); - aux.w = init_mem_worker(opt, aux.idx->fmt, aux.idx->ert, aux.idx->bns, aux.idx->pac, useERT); + aux.w = init_mem_worker(opt, aux.idx->fmt, aux.idx->bns, aux.idx->pac); aux.data = calloc(2, sizeof(ktp_data_t)); bwa_print_sam_hdr(aux.idx->bns, hdr_line); diff --git a/ksw_align_avx.h b/ksw_align_avx.h new file mode 100644 index 0000000..0ff01c2 --- /dev/null +++ b/ksw_align_avx.h @@ -0,0 +1,73 @@ +#pragma once + +#include +#include +#include + +#include "ksw.h" + +#define AMBIG_ 4 // ambiguous base +// for 16 bit +#define DUMMY1_ 4 +#define DUMMY2_ 5 +#define DUMMY3 26 +#define AMBR16 15 +#define AMBQ16 16 +// for 8-bit +#define DUMMY8 8 +#define DUMMY5 5 +#define AMBRQ 0xFF +#define AMBR 4 +#define AMBQ 8 + +#define min_(x, y) ((x) > (y) ? (y) : (x)) +#define max_(x, y) ((x) > (y) ? (x) : (y)) + +typedef struct { + int score; // best score + int te, qe; // target end and query end + int score2, te2; // second best score and ending position on the target + int tb, qb; // target start and query start +} kswr_avx_t; + +typedef struct { + int seq_len; + int ref_len; + uint8_t* H0; + uint8_t* H1; + uint8_t* Hmax; + uint8_t* F; + uint8_t* rowMax; + uint8_t* seqArr; + uint8_t* refArr; +} matesw_buf_t; // inter-query算法的mate sw计算过程需要用到的缓存空间 + +#ifdef __cplusplus +extern "C" { +#endif + + +void ksw_align_avx2_u8(int qlen, uint8_t* query, int tlen, uint8_t* target, int m, const int8_t* mat, int gapo, int gape, int xtra, kswq_t** qry); +void ksw_align_avx2_i16(int qlen, uint8_t* query, int tlen, uint8_t* target, int m, const int8_t* mat, int gapo, int gape, int xtra, kswq_t** qry); + +void ksw_align_avx512_u8(int8_t w_match, // match分数,正数 + int8_t w_mismatch, // 错配罚分,负数 + int8_t o_ins, // 开始一个insert罚分,正数 + int8_t e_ins, // 延续一个insert罚分,正数 + int8_t o_del, // 开始一个delete罚分,正数 + int8_t e_del, // 延续一个delete罚分,正数 + matesw_buf_t* cache, // 计算用到的一些数据 + uint8_t* seq1SoA, // ref序列,已经pack好了 + uint8_t* seq2SoA, // seq序列 + int16_t nrow, // 最长的行数,对应ref长度 + int16_t ncol, // 最长的列数,对应seq长度 + int* xtras, // 每个seq对应一个xtra + int* rlenA, // ref真实长度 + kswr_avx_t* alns, // 存放结果 + int phase); // 正向阶段0,反向阶段1 + +void ksw_align_avx512_i16(int qlen, uint8_t* query, int tlen, uint8_t* target, int m, const int8_t* mat, int gapo, int gape, int xtra, kswq_t** qry); + +#ifdef __cplusplus +} +#endif \ No newline at end of file diff --git a/ksw_align_avx2.c b/ksw_align_avx2.c index 7ccdcaa..82a9fa5 100644 --- a/ksw_align_avx2.c +++ b/ksw_align_avx2.c @@ -7,69 +7,3 @@ #include #include "utils.h" #include "ksw.h" - -// static const kswr_t g_defr = { 0, -1, -1, -1, -1, -1, -1 }; - -typedef struct _kswq_t { - int qlen, slen; - uint8_t shift, mdiff, max, size; - __m256i *qp, *H0, *H1, *E, *Hmax; -} kswq_t; - -/** - * Initialize the query data structure - * - * @param size Number of bytes used to store a score; valid valures are 1 or 2 - * @param qlen Length of the query sequence - * @param query Query sequence - * @param m Size of the alphabet (ACGTN) - * @param mat Scoring matrix in a one-dimension array - * - * @return Query data structure - */ -kswq_t *ksw_qinit_avx2(int size, int qlen, const uint8_t *query, int m, const int8_t *mat) -{ - kswq_t *q; - int slen, a, tmp, p; - size = size > 1 ? 2 : 1; - p = 16 * (3 - size); // # values per __m256i - slen = (qlen + p - 1) / p; // segmented length - q = (kswq_t *)malloc(sizeof(kswq_t) + 512 + 32 * slen * (m + 4)); // a single block of memory - q->qp = (__m256i *)(((size_t)q + sizeof(kswq_t) + 31) >> 5 << 5); // align memory - q->H0 = q->qp + slen * m; - q->H1 = q->H0 + slen; - q->E = q->H1 + slen; - q->Hmax = q->E + slen; - q->slen = slen; q->qlen = qlen; q->size = size; - // compute shift - tmp = m * m; - for (a = 0, q->shift = 127, q->mdiff = 0; a < tmp; ++a) { // find the minimum and maximum score - if (mat[a] < (int8_t)q->shift) q->shift = mat[a]; - if (mat[a] > (int8_t)q->mdiff) q->mdiff = mat[a]; - } - q->max = q->mdiff; - q->shift = 256 - q->shift; // NB: q->shift is uint8_t - q->mdiff += q->shift; // this is the difference between the min and max scores - // An example: p=8, qlen=19, slen=3 and segmentation: - // {{0,3,6,9,12,15,18,-1},{1,4,7,10,13,16,-1,-1},{2,5,8,11,14,17,-1,-1}} - if (size == 1) { - int8_t *t = (int8_t*)q->qp; - for (a = 0; a < m; ++a) { - int i, k, nlen = slen * p; - const int8_t *ma = mat + a * m; - for (i = 0; i < slen; ++i) - for (k = i; k < nlen; k += slen) // p iterations - *t++ = (k >= qlen? 0 : ma[query[k]]) + q->shift; - } - } else { - int16_t *t = (int16_t*)q->qp; - for (a = 0; a < m; ++a) { - int i, k, nlen = slen * p; - const int8_t *ma = mat + a * m; - for (i = 0; i < slen; ++i) - for (k = i; k < nlen; k += slen) // p iterations - *t++ = (k >= qlen? 0 : ma[query[k]]); - } - } - return q; -} \ No newline at end of file diff --git a/ksw_align_avx512.c b/ksw_align_avx512.c new file mode 100644 index 0000000..320e6da --- /dev/null +++ b/ksw_align_avx512.c @@ -0,0 +1,362 @@ +#include "ksw_align_avx.h" + +#undef SIMD_WIDTH8 +#undef SIMD_WIDTH16 +#define SIMD_WIDTH8 64 +#define SIMD_WIDTH16 32 + +// 默认的非ACGT对应的罚分 +static const int8_t w_ambig = -1; + +#define MAIN_SAM_CODE8_OPT(s1, s2, h00, h11, e11, f11, f21, max512, sft512) \ + { \ + __m512i sbt11, xor11, or11; \ + xor11 = _mm512_xor_si512(s1, s2); \ + sbt11 = _mm512_shuffle_epi8(permSft512, xor11); \ + __mmask64 cmpq = _mm512_cmpeq_epu8_mask(s2, five512); \ + sbt11 = _mm512_mask_blend_epi8(cmpq, sbt11, sft512); \ + or11 = _mm512_or_si512(s1, s2); \ + __mmask64 cmp = _mm512_movepi8_mask(or11); \ + __m512i m11 = _mm512_adds_epu8(h00, sbt11); \ + m11 = _mm512_mask_blend_epi8(cmp, m11, zero512); \ + m11 = _mm512_subs_epu8(m11, sft512); \ + h11 = _mm512_max_epu8(m11, e11); \ + h11 = _mm512_max_epu8(h11, f11); \ + __mmask64 cmp0 = _mm512_cmpgt_epu8_mask(h11, imax512); \ + imax512 = _mm512_max_epu8(imax512, h11); \ + iqe512 = _mm512_mask_blend_epi8(cmp0, iqe512, l512); \ + __m512i gapE512 = _mm512_subs_epu8(h11, oe_ins512); \ + e11 = _mm512_subs_epu8(e11, e_ins512); \ + e11 = _mm512_max_epu8(gapE512, e11); \ + __m512i gapD512 = _mm512_subs_epu8(h11, oe_del512); \ + f21 = _mm512_subs_epu8(f11, e_del512); \ + f21 = _mm512_max_epu8(gapD512, f21); \ + } + +#define MAIN_SAM_CODE16_OPT(s1, s2, h00, h11, e11, f11, f21, max512) \ + { \ + __m512i sbt11, xor11, or11; \ + xor11 = _mm512_xor_si512(s1, s2); \ + sbt11 = _mm512_permutexvar_epi16(xor11, perm512); \ + __m512i m11 = _mm512_add_epi16(h00, sbt11); \ + or11 = _mm512_or_si512(s1, s2); \ + __mmask64 cmp = _mm512_movepi8_mask(or11); \ + m11 = _mm512_mask_blend_epi8(cmp, m11, zero512); \ + h11 = _mm512_max_epi16(m11, e11); \ + h11 = _mm512_max_epi16(h11, f11); \ + h11 = _mm512_max_epi16(h11, zero512); \ + __mmask32 cmp0 = _mm512_cmpgt_epi16_mask(h11, imax512); \ + imax512 = _mm512_max_epi16(imax512, h11); \ + iqe512 = _mm512_mask_blend_epi16(cmp0, iqe512, l512); \ + __m512i gapE512 = _mm512_sub_epi16(h11, oe_ins512); \ + e11 = _mm512_sub_epi16(e11, e_ins512); \ + e11 = _mm512_max_epi16(gapE512, e11); \ + __m512i gapD512 = _mm512_sub_epi16(h11, oe_del512); \ + f21 = _mm512_sub_epi16(f11, e_del512); \ + f21 = _mm512_max_epi16(gapD512, f21); \ + } + +void ksw_align_avx512_u8(int8_t w_match, // match分数,正数 + int8_t w_mismatch, // 错配罚分,负数 + int8_t o_ins, // 开始一个insert罚分,正数 + int8_t e_ins, // 延续一个insert罚分,正数 + int8_t o_del, // 开始一个delete罚分,正数 + int8_t e_del, // 延续一个delete罚分,正数 + matesw_buf_t* cache, // 计算用到的一些数据 + uint8_t* seq1SoA, // ref序列,已经pack好了 + uint8_t* seq2SoA, // seq序列 + int16_t nrow, // 最长的行数,对应ref长度 + int16_t ncol, // 最长的列数,对应seq长度 + int* xtras, // 每个seq对应一个xtra + int* rlenA, // ref真实长度 + kswr_avx_t* alns, // 存放结果 + int phase) { // 正向阶段0,反向阶段1 + + int g_qmax = max_(w_match, w_mismatch); + g_qmax = max_(g_qmax, w_ambig); + + uint8_t minsc[SIMD_WIDTH8] __attribute__((aligned(64))) = {0}; // min score ? + uint8_t endsc[SIMD_WIDTH8] __attribute__((aligned(64))) = {0}; // ending position score ? + + __m512i zero512 = _mm512_setzero_si512(); + __m512i one512 = _mm512_set1_epi8(1); + + int8_t temp[SIMD_WIDTH8] __attribute((aligned(64))) = {0}; // 应该是用来根据比较结果赋分值的 + + uint8_t shift = 127, mdiff = 0; + mdiff = max_(w_match, (int8_t)w_mismatch); + mdiff = max_(mdiff, (int8_t)w_ambig); + shift = min_(w_match, (int8_t)w_mismatch); + shift = min_((int8_t)shift, w_ambig); + + shift = 256 - (uint8_t)shift; + mdiff += shift; + + temp[0] = w_match; // states: 1. matches + temp[1] = temp[2] = temp[3] = w_mismatch; // 2. mis-matches + temp[4] = temp[5] = temp[6] = temp[7] = w_ambig; // 3. beyond boundary + temp[8] = temp[9] = temp[10] = temp[11] = w_ambig; // 4. 0 - sse2 region + temp[12] = w_ambig; // 5. ambig + + for (int i = 0; i < 16; i++) // for shuffle_epi8 + temp[i] += shift; + + int pos = 0; + for (int i = 16; i < SIMD_WIDTH8; i++) { + temp[i] = temp[pos++]; + if (pos % 16 == 0) + pos = 0; + } + + __m512i permSft512 = _mm512_load_si512(temp); + __m512i sft512 = _mm512_set1_epi8(shift); + __m512i cmax512 = _mm512_set1_epi8(255); + + // __m512i minsc512, endsc512; + __mmask64 minsc_msk_a = 0x0000, endsc_msk_a = 0x0000; + int val = 0; + for (int i = 0; i < SIMD_WIDTH8; i++) { + int xtra = xtras[i]; + val = (xtra & KSW_XSUBO) ? xtra & 0xffff : 0x10000; + if (val <= 255) { + minsc[i] = val; + minsc_msk_a |= (0x1L << i); + } + // msc_mask; + val = (xtra & KSW_XSTOP) ? xtra & 0xffff : 0x10000; + if (val <= 255) { + endsc[i] = val; + endsc_msk_a |= (0x1L << i); + } + } + + __m512i minsc512 = _mm512_load_si512((__m512i*)minsc); + __m512i endsc512 = _mm512_load_si512((__m512i*)endsc); + + __m512i e_del512 = _mm512_set1_epi8(e_del); + __m512i oe_del512 = _mm512_set1_epi8(o_del + e_del); + __m512i e_ins512 = _mm512_set1_epi8(e_ins); + __m512i oe_ins512 = _mm512_set1_epi8(o_ins + e_ins); + __m512i five512 = _mm512_set1_epi8(DUMMY5); // ambig mapping element + __m512i gmax512 = zero512; // exit1 = zero512; + __m512i te512 = _mm512_set1_epi16(-1); // changed to -1 + __m512i te512_ = _mm512_set1_epi16(-1); // changed to -1 + + __mmask64 exit0 = 0xFFFFFFFFFFFFFFFF; + + // 计算过程用到的一些数据,用cache预先开辟的空间 + uint8_t* H0 = cache->H0; + uint8_t* H1 = cache->H1; + uint8_t* Hmax = cache->Hmax; + uint8_t* F = cache->F; + uint8_t* rowMax = cache->rowMax; + + for (int i = 0; i <= ncol; i++) { + _mm512_store_si512((__m512*)(H0 + i * SIMD_WIDTH8), zero512); + _mm512_store_si512((__m512*)(Hmax + i * SIMD_WIDTH8), zero512); + _mm512_store_si512((__m512*)(F + i * SIMD_WIDTH8), zero512); + } +#if 1 + __m512i max512 = zero512, imax512, pimax512 = zero512; + __mmask64 mask512 = 0x0000; + __mmask64 minsc_msk = 0x0000; + + __m512i qe512 = _mm512_set1_epi8(0); + _mm512_store_si512((__m512i*)(H0), zero512); + _mm512_store_si512((__m512i*)(H1), zero512); +#endif +#if 1 + int i, limit = nrow; + for (i = 0; i < nrow; i++) { + __m512i e11 = zero512; + __m512i h00, h11, s1; + __m512i i512 = _mm512_set1_epi16(i); + int j; + + s1 = _mm512_load_si512((__m512i*)(seq1SoA + (i + 0) * SIMD_WIDTH8)); + imax512 = zero512; + __m512i iqe512 = _mm512_set1_epi8(-1); + + __m512i l512 = zero512; + for (j = 0; j < ncol; j++) { + __m512i f11, s2, f21; + h00 = _mm512_load_si512((__m512i*)(H0 + j * SIMD_WIDTH8)); // check for col "0" + s2 = _mm512_load_si512((__m512i*)(seq2SoA + (j)*SIMD_WIDTH8)); + f11 = _mm512_load_si512((__m512i*)(F + (j + 1) * SIMD_WIDTH8)); + + MAIN_SAM_CODE8_OPT(s1, s2, h00, h11, e11, f11, f21, max512, sft512); + + _mm512_store_si512((__m512i*)(H1 + (j + 1) * SIMD_WIDTH8), h11); // check for col "0" + _mm512_store_si512((__m512i*)(F + (j + 1) * SIMD_WIDTH8), f21); + l512 = _mm512_add_epi8(l512, one512); + } + + // Block I,从第二行开始,需要和前一行比较,来计算max score + if (i > 0) { + __mmask64 msk64 = _mm512_cmpgt_epu8_mask(imax512, pimax512); + msk64 |= mask512; + pimax512 = _mm512_mask_blend_epi8(msk64, pimax512, zero512); + pimax512 = _mm512_mask_blend_epi8(minsc_msk, zero512, pimax512); + pimax512 = _mm512_mask_blend_epi8(exit0, zero512, pimax512); + + _mm512_store_si512((__m512i*)(rowMax + (i - 1) * SIMD_WIDTH8), pimax512); + mask512 = ~msk64; + } + pimax512 = imax512; + minsc_msk = _mm512_cmpge_epu8_mask(imax512, minsc512); + minsc_msk &= minsc_msk_a; + + // Block II: gmax, te + __mmask64 cmp0 = _mm512_cmpgt_epu8_mask(imax512, gmax512); + cmp0 &= exit0; + gmax512 = _mm512_mask_blend_epi8(cmp0, gmax512, imax512); + te512 = _mm512_mask_blend_epi16((__mmask32)cmp0, te512, i512); + te512_ = _mm512_mask_blend_epi16((__mmask32)(cmp0 >> SIMD_WIDTH16), te512_, i512); + qe512 = _mm512_mask_blend_epi8(cmp0, qe512, iqe512); + + cmp0 = _mm512_cmpge_epu8_mask(gmax512, endsc512); + cmp0 &= endsc_msk_a; + + __m512i left512 = _mm512_adds_epu8(gmax512, sft512); + __mmask64 cmp2 = _mm512_cmpge_epu8_mask(left512, cmax512); + + exit0 = (~(cmp0 | cmp2)) & exit0; + if (exit0 == 0) { + limit = i++; + break; + } + + uint8_t* S = H1; + H1 = H0; + H0 = S; + i512 = _mm512_add_epi16(i512, one512); + } // for nrow + + pimax512 = _mm512_mask_blend_epi8(mask512, pimax512, zero512); + pimax512 = _mm512_mask_blend_epi8(minsc_msk, zero512, pimax512); + pimax512 = _mm512_mask_blend_epi8(exit0, zero512, pimax512); + _mm512_store_si512((__m512i*)(rowMax + (i - 1) * SIMD_WIDTH8), pimax512); + + /******************* DP loop over *****************************/ + /**************** Partial output setting **********************/ + uint8_t score[SIMD_WIDTH8] __attribute((aligned(64))); + int16_t te1[SIMD_WIDTH8] __attribute((aligned(64))); + uint8_t qe[SIMD_WIDTH8] __attribute((aligned(64))); + int16_t low[SIMD_WIDTH8] __attribute((aligned(64))); + int16_t high[SIMD_WIDTH8] __attribute((aligned(64))); + + _mm512_store_si512((__m512i*)score, gmax512); + _mm512_store_si512((__m512i*)te1, te512); + _mm512_store_si512((__m512i*)(te1 + SIMD_WIDTH16), te512_); + _mm512_store_si512((__m512i*)qe, qe512); + + int live = 0; + + for (int l = 0; l < SIMD_WIDTH8; l++) { + int16_t* te; + if (i < SIMD_WIDTH16) + te = te1; + else + te = te1; + if (phase) { // 第二阶段,反向比对 + if (alns[l].score == score[l]) { + alns[l].tb = alns[l].te - te[l]; + alns[l].qb = alns[l].qe - qe[l]; + } + } else { // 第一阶段,正向比对 + alns[l].score = score[l] + shift < 255 ? score[l] : 255; + alns[l].te = te[l]; + alns[l].qe = qe[l]; + if (alns[l].score != 255) { + qe[l] = 1; + live++; + } else + qe[l] = 0; + } + } + + if (phase) + return; + + if (live == 0) + return; + + /*************** Score2 and te2 *******************/ + int qmax = g_qmax; + int maxl = 0, minh = nrow; + for (int i = 0; i < SIMD_WIDTH8; i++) { + int val = (score[i] + qmax - 1) / qmax; + int16_t* te = te1; + low[i] = te[i] - val; + high[i] = te[i] + val; + if (qe[i]) { + maxl = maxl < low[i] ? low[i] : maxl; + minh = minh > high[i] ? high[i] : minh; + } + } + + max512 = zero512; + te512 = _mm512_set1_epi16(-1); + te512_ = _mm512_set1_epi16(-1); + __m512i low512 = _mm512_load_si512((__m512i*)low); // make it int16 + __m512i high512 = _mm512_load_si512((__m512i*)high); // int16 + __m512i low512_ = _mm512_load_si512((__m512i*)(low + SIMD_WIDTH16)); // make it int16 + __m512i high512_ = _mm512_load_si512((__m512i*)(high + SIMD_WIDTH16)); // int16 + + __m512i rmax512; + for (int i = 0; i < maxl; i++) { + __m512i i512 = _mm512_set1_epi16(i); + rmax512 = _mm512_load_si512((__m512i*)(rowMax + i * SIMD_WIDTH8)); + + __mmask64 mask11 = _mm512_cmpgt_epi16_mask(low512, i512); + __mmask64 mask12 = _mm512_cmpgt_epi16_mask(low512_, i512); + __mmask64 mask2 = _mm512_cmpgt_epu8_mask(rmax512, max512); + __mmask64 mask1 = mask11 | (mask12 << SIMD_WIDTH16); + mask2 &= mask1; + max512 = _mm512_mask_blend_epi8(mask2, max512, rmax512); + te512 = _mm512_mask_blend_epi16(mask2, te512, i512); + te512_ = _mm512_mask_blend_epi16(mask2 >> SIMD_WIDTH16, te512_, i512); + } + + // Added new block -- due to bug + int16_t rlen[SIMD_WIDTH8] __attribute((aligned(64))); + for (int i = 0; i < SIMD_WIDTH8; i++) rlen[i] = rlenA[i]; + __m512i rlen512 = _mm512_load_si512(rlen); + __m512i rlen512_ = _mm512_load_si512(rlen + SIMD_WIDTH16); + + for (int i = minh + 1; i < limit; i++) { + __m512i i512 = _mm512_set1_epi16(i); + rmax512 = _mm512_load_si512((__m512i*)(rowMax + i * SIMD_WIDTH8)); + __mmask64 mask11 = _mm512_cmpgt_epi16_mask(i512, high512); + __mmask64 mask12 = _mm512_cmpgt_epi16_mask(i512, high512_); + __mmask64 mask2 = _mm512_cmpgt_epu8_mask(rmax512, max512); + __mmask64 mask1 = mask11 | (mask12 << SIMD_WIDTH16); + mask2 &= mask1; + // new, bug + __mmask64 mask11_ = _mm512_cmpgt_epi16_mask(rlen512, i512); + __mmask64 mask12_ = _mm512_cmpgt_epi16_mask(rlen512_, i512); + __mmask64 mask1_ = mask11_ | (mask12_ << SIMD_WIDTH16); + mask2 &= mask1_; + max512 = _mm512_mask_blend_epi8(mask2, max512, rmax512); + te512 = _mm512_mask_blend_epi16(mask2, te512, i512); + te512_ = _mm512_mask_blend_epi16(mask2 >> SIMD_WIDTH16, te512_, i512); + } + + int16_t temp4[SIMD_WIDTH8] __attribute((aligned(64))); + _mm512_store_si512((__m512i*)temp, max512); + _mm512_store_si512((__m512i*)temp4, te512); + _mm512_store_si512((__m512i*)(temp4 + SIMD_WIDTH16), te512_); + + for (int i = 0; i < SIMD_WIDTH8; i++) { + int16_t* te2; + te2 = temp4; + if (qe[i]) { + alns[i].score2 = (temp[i] == 0 ? (int)-1 : (uint8_t)temp[i]); + alns[i].te2 = te2[i]; + } else { + alns[i].score2 = -1; + alns[i].te2 = -1; + } + } +#endif +} \ No newline at end of file diff --git a/paired_sam.c b/paired_sam.c new file mode 100644 index 0000000..4999230 --- /dev/null +++ b/paired_sam.c @@ -0,0 +1,315 @@ +#include "paired_sam.h" + +#include +#include + +#include "ksw_align_avx.h" +#include "kvec.h" + +// 原版生成sam函数 +static void worker_sam(void* data, int i, int tid) { + mem_worker_t* w = (mem_worker_t*)data; + if (bwa_verbose >= 4) + printf("=====> Finalizing read pair '%s' <=====\n", w->seqs[i << 1 | 0].name); + mem_sam_pe(w->opt, w->bns, w->pac, w->pes, (w->n_processed >> 1) + i, &w->seqs[i << 1], &w->regs[i << 1], &w->sams[i << 1], tid); + free(w->regs[i << 1 | 0].a); + free(w->regs[i << 1 | 1].a); +} + +// 应该是推测read的方向,正链还是反向互补等等 +static inline int mem_infer_dir(int64_t l_pac, int64_t b1, int64_t b2, int64_t* dist) { + int64_t p2; + int r1 = (b1 >= l_pac), r2 = (b2 >= l_pac); + p2 = r1 == r2 ? b2 : (l_pac << 1) - 1 - b2; // p2 is the coordinate of read 2 on the read 1 strand + *dist = p2 > b1 ? p2 - b1 : b1 - p2; + return (r1 == r2 ? 0 : 1) ^ (p2 > b1 ? 0 : 3); +} + +// 根据ref的begin和end计算对应的rid和修正之后的end +static int get_rid_range(const bntseq_t* bns, const uint8_t* pac, int64_t* beg, int64_t mid, int64_t* end) { + int64_t far_beg, far_end; + int is_rev; + int rid; + + if (*end < *beg) + *end ^= *beg, *beg ^= *end, *end ^= *beg; // if end is smaller, swap + assert(*beg <= mid && mid < *end); + rid = bns_pos2rid(bns, bns_depos(bns, mid, &is_rev)); + far_beg = bns->anns[rid].offset; + far_end = far_beg + bns->anns[rid].len; + if (is_rev) { // flip to the reverse strand + int64_t tmp = far_beg; + far_beg = (bns->l_pac << 1) - far_end; + far_end = (bns->l_pac << 1) - tmp; + } + *beg = *beg > far_beg ? *beg : far_beg; + *end = *end < far_end ? *end : far_end; + return rid; +} + +// 获取ref序列 +static inline uint8_t* get_ref_seq(const bntseq_t* bns, const uint8_t* pac, int64_t beg, int64_t end) { + int64_t len; + return bns_get_seq(bns->l_pac, pac, beg, end, &len); +} + +// 计算当前seq是否需要做matesw,需要的话保存必要的数据 +static void get_matesw_data(const mem_opt_t* opt, const bntseq_t* bns, const uint8_t* pac, const mem_pestat_t pes[4], const mem_alnreg_t* a, + bseq1_t* seq, mem_alnreg_v* ma, int64_t sid, mem_stats_t *stats, matesw_data_v* msw8, matesw_data_v* msw16) { + int64_t l_pac = bns->l_pac; + int l_ms = seq->l_seq; + int i, r, skip[4], rid; + stats->max_seq_len = max_(stats->max_seq_len, l_ms); + for (r = 0; r < 4; ++r) skip[r] = pes[r].failed ? 1 : 0; + for (i = 0; i < ma->n; ++i) { // check which orinentation has been found + int64_t dist; + r = mem_infer_dir(l_pac, a->rb, ma->a[i].rb, &dist); + if (dist >= pes[r].low && dist <= pes[r].high) + skip[r] = 1; + } + if (skip[0] + skip[1] + skip[2] + skip[3] == 4) + return; // consistent pair exist; no need to perform SW + for (r = 0; r < 4; ++r) { + int is_rev, is_larger; + int64_t rb, re; // 左闭右开 + if (skip[r]) + continue; + is_rev = (r >> 1 != (r & 1)); // whether to reverse complement the mate + is_larger = !(r >> 1); // whether the mate has larger coordinate + if (!is_rev) { + rb = is_larger ? a->rb + pes[r].low : a->rb - pes[r].high; + // if on the same strand, end position should be larger to make room for the seq length + re = (is_larger ? a->rb + pes[r].high : a->rb - pes[r].low) + l_ms; + } else { + rb = (is_larger ? a->rb + pes[r].low : a->rb - pes[r].high) - l_ms; // similarly on opposite strands + re = is_larger ? a->rb + pes[r].high : a->rb - pes[r].low; + } + if (rb < 0) + rb = 0; + if (re > l_pac << 1) + re = l_pac << 1; + rid = get_rid_range(bns, pac, &rb, (rb + re) >> 1, &re); // 计算ref对应的染色体id和区间起始终止位置 + if (a->rid == rid && re - rb >= opt->min_seed_len) { // no funny things happening + stats->max_msw_ref_len = max_(stats->max_msw_ref_len, re - rb); + // fprintf(stderr, "zzh here\n"); + int xtra = KSW_XSUBO | KSW_XSTART | (l_ms * opt->a < 250 ? KSW_XBYTE : 0) | (opt->min_seed_len * opt->a); + matesw_data_t* p; + if ((xtra & KSW_XBYTE)) + p = kv_pushp(matesw_data_t, *msw8); + else + p = kv_pushp(matesw_data_t, *msw16); + p->is_rev = is_rev; + p->xtra = xtra; + p->rb = rb; + p->re = re; + p->seq_id = sid; + kv_push(matesw_data_t*, seq->msw, p); // 将matesw任务和对应的seq关联起来 + } + } +} +// 先计算哪些read需要做matesw +static void worker_matesw_data(void* data, int idx, int tid) { + mem_worker_t* w = (mem_worker_t*)data; + const mem_opt_t* opt = w->opt; + int64_t i_s = idx << 1; + bseq1_t* s = &w->seqs[i_s]; + mem_alnreg_v* a = &w->regs[i_s]; + s->msw.n = 0; // 清空之前的matesw数据 + int i, j; + mem_alnreg_v b[2]; + kv_init(b[0]); + kv_init(b[1]); + // fprintf(stderr, "zzh test\n"); + for (i = 0; i < 2; ++i) + for (j = 0; j < a[i].n; ++j) + if (a[i].a[j].score >= a[i].a[0].score - opt->pen_unpaired) + kv_push(mem_alnreg_t, b[i], a[i].a[j]); + for (i = 0; i < 2; ++i) + for (j = 0; j < b[i].n && j < opt->max_matesw; ++j) + get_matesw_data(opt, w->bns, w->pac, w->pes, &b[i].a[j], &s[!i], &a[!i], i_s + !i, &w->mem_stats[tid], w->matesw_arr8[tid], w->matesw_arr16[tid]); + free(b[0].a); + free(b[1].a); +} + +// 再多线程计算matesw,利用inter-query的simd并行 +static void worker_calc_matesw_avx512_u8(void* data, int idx, int tid) { + mem_worker_t* w = (mem_worker_t*)data; + int startIdx = idx * SIMD512_WIDTH8; + int endIdx = (idx + 1) * SIMD512_WIDTH8; + if (endIdx > w->msw_tasks8.n) + endIdx = w->msw_tasks8.n; + int i = 0, j = 0, k = 0; + int maxSeqLen = 0, maxRefLen = 0; + uint8_t* refArr[SIMD512_WIDTH8] = {0}; + uint8_t* seqArr[SIMD512_WIDTH8] = {0}; + int refLen[SIMD512_WIDTH8] = {0}; + int seqLen[SIMD512_WIDTH8] = {0}; + int seqQuantaLen[SIMD512_WIDTH8] = {0}; + int xtraA[SIMD512_WIDTH8] = {0}; + kswr_avx_t alns[SIMD512_WIDTH8] = {0}; + + matesw_buf_t* msw_buf = &w->msw_buf[tid]; // 缓冲区 + + for (i = startIdx, j = 0; i < endIdx; ++i, ++j) { + matesw_data_t* task = kv_A(w->msw_tasks8, i); + // 1. 获取对应的ref + refArr[j] = get_ref_seq(w->bns, w->pac, task->rb, task->re); + refLen[j] = task->re - task->rb; + maxRefLen = max_(maxRefLen, refLen[j]); + // 2. 获取对应的seq + seqArr[j] = (uint8_t*)w->seqs[task->seq_id].seq; + seqLen[j] = w->seqs[task->seq_id].l_seq; + int quanta = (seqLen[j] + 16 - 1) / 16; // based on SSE-8 bit lane + quanta *= 16; + seqQuantaLen[j] = quanta; + maxSeqLen = max_(maxSeqLen, quanta); + // 3. 其他数据 + xtraA[j] = task->xtra; + } + // 如果不到64个,补全剩下的 + for (; j < SIMD512_WIDTH8; ++j) { + refArr[j] = refArr[0]; + refLen[j] = refLen[0]; + seqArr[j] = seqArr[0]; + seqLen[j] = seqLen[0]; + xtraA[j] = xtraA[0]; + } + // 将ref和seq赋值给对应的用来计算的缓存 + uint8_t* mySeq1SoA = msw_buf->refArr; + uint8_t* mySeq2SoA = msw_buf->seqArr; + + for (j = 0; j < SIMD512_WIDTH8; ++j) { + uint8_t* seq1 = refArr[j]; + uint8_t* seq2 = seqArr[j]; + // 处理ref + for (k = 0; k < refLen[j]; ++k) { + mySeq1SoA[k * SIMD512_WIDTH8 + j] = (seq1[k] == AMBIG_ ? AMBR : seq1[k]); + } + for (; k < maxRefLen; ++k) { + mySeq1SoA[k * SIMD512_WIDTH8 + j] = 0xFF; + } + // 处理seq + for (k = 0; k < seqLen[j]; ++k) { + mySeq2SoA[k * SIMD512_WIDTH8 + j] = (seq2[k] == AMBIG_ ? AMBR : seq2[k]); + } + for (; k < seqQuantaLen[j]; ++k) { + mySeq2SoA[k * SIMD512_WIDTH8 + j] = DUMMY5; + } + for (; k < maxSeqLen; ++k) { + mySeq2SoA[k * SIMD512_WIDTH8 + j] = 0xFF; + } + } + + const mem_opt_t* opt = w->opt; + // 利用smid指令计算 + ksw_align_avx512_u8(opt->a, -1 * opt->b, opt->o_ins, opt->e_ins, opt->o_del, opt->e_del, msw_buf, mySeq1SoA, mySeq2SoA, maxRefLen, maxSeqLen, + xtraA, refLen, alns, 0); + // 保存结果 + for (i = startIdx, j = 0; i < endIdx; ++i, ++j) { + matesw_data_t* task = kv_A(w->msw_tasks8, i); + task->aln = alns[j]; + free(refArr[j]); + } +} + +static void worker_calc_matesw16(void* data, int i, int tid) {} + +// 最后再计算并生成sam数据 +static void workder_gen_sam(void* data, int i, int tid) { + mem_worker_t* w = (mem_worker_t*)data; + free(w->regs[i << 1 | 0].a); + free(w->regs[i << 1 | 1].a); +} + +// 划分matesw任务 +static void gather_matesw_task(mem_worker_t* w, matesw_data_v** msw_arr, matesw_ptr_v* tasks) { + tasks->n = 0; + int i = 0, j = 0; + for (i = 0; i < w->opt->n_threads; ++i) { + for (j = 0; j < msw_arr[i]->n; ++j) + kv_push(matesw_data_t*, *tasks, &kv_A(*msw_arr[i], j)); + } +} + +// 更新stats +static void update_mem_stats(mem_worker_t* w) { + int i = 0, max_seq_len = 0, max_ref_len = 0; + for (i = 0; i < w->opt->n_threads; ++i) { + max_seq_len = max_(max_seq_len, w->mem_stats[i].max_seq_len); + max_ref_len = max_(max_ref_len, w->mem_stats[i].max_msw_ref_len); + } + int quanta = (max_seq_len + 16 - 1) / 16; // based on SSE-8 bit lane + quanta *= 16; + max_seq_len = quanta + 1; + for (i = 0; i < w->opt->n_threads; ++i) { + w->mem_stats[i].max_seq_len = max_seq_len; + w->mem_stats[i].max_msw_ref_len = max_ref_len; + } +} + +// 开辟缓冲区 +static void alloc_update_cache_avx512(mem_worker_t* w) { + int i = 0; + for (i = 0; i < w->opt->n_threads; ++i) { + matesw_buf_t* b = &w->msw_buf[i]; + // 更新跟ref len有关的缓冲区 + if (b->ref_len < w->mem_stats[i].max_msw_ref_len) { + b->ref_len = w->mem_stats[i].max_msw_ref_len; + if (b->refArr) _mm_free(b->refArr); + if (b->rowMax) _mm_free(b->rowMax); + b->refArr = (uint8_t*)_mm_malloc(b->ref_len * SIMD512_WIDTH8 * sizeof(uint8_t), 64); + b->rowMax = (uint8_t*)_mm_malloc(b->ref_len * SIMD512_WIDTH8 * sizeof(uint8_t), 64); + } + // 更新跟seq len有关的缓冲区 + if (b->seq_len < w->mem_stats[i].max_seq_len) { + b->seq_len = w->mem_stats[i].max_seq_len; + if (b->seqArr) _mm_free(b->seqArr); + if (b->H0) _mm_free(b->H0); + if (b->H1) _mm_free(b->H1); + if (b->Hmax) _mm_free(b->Hmax); + if (b->F) _mm_free(b->F); + b->seqArr = (uint8_t*)_mm_malloc(b->seq_len * SIMD512_WIDTH8 * sizeof(uint8_t), 64); + b->H0 = (uint8_t*)_mm_malloc(b->seq_len * SIMD512_WIDTH8 * sizeof(uint8_t), 64); + b->H1 = (uint8_t*)_mm_malloc(b->seq_len * SIMD512_WIDTH8 * sizeof(uint8_t), 64); + b->Hmax = (uint8_t*)_mm_malloc(b->seq_len * SIMD512_WIDTH8 * sizeof(uint8_t), 64); + b->F = (uint8_t*)_mm_malloc(b->seq_len * SIMD512_WIDTH8 * sizeof(uint8_t), 64); + } + } +} + +// 针对pair end数据,生成sam的过程 +void gen_paired_sam(mem_worker_t* w) { + int i = 0; + for (i = 0; i < w->opt->n_threads; ++i) { + w->matesw_arr8[i]->n = 0; // 清空之前的数据 + w->matesw_arr16[i]->n = 0; + } + if (w->opt->flag & MEM_F_NO_RESCUE) { + kt_for(w->opt->n_threads, worker_sam, w, w->n_reads >> 1); // generate alignment + } else { + // 1. 计算哪些read需要matesw + PROF_START(get_matesw_data); + kt_for(w->opt->n_threads, worker_matesw_data, w, w->n_reads >> 1); + PROF_END(gprof[G_get_matesw_data], get_matesw_data); + // 更新stats + PROF_START(update_stats_cache); + update_mem_stats(w); + // 开辟缓冲区 + alloc_update_cache_avx512(w); + PROF_END(gprof[G_update_stats_cache], update_stats_cache); + // 2. matesw计算过程 + PROF_START(gather_matesw_task); + gather_matesw_task(w, w->matesw_arr8, &w->msw_tasks8); + gather_matesw_task(w, w->matesw_arr16, &w->msw_tasks16); + PROF_END(gprof[G_gather_matesw_task], gather_matesw_task); + PROF_START(calc_matesw); + if (w->msw_tasks8.n > 0) + kt_for(w->opt->n_threads, worker_calc_matesw_avx512_u8, w, (w->msw_tasks8.n + SIMD512_WIDTH8 - 1) / SIMD512_WIDTH8); + if (w->msw_tasks16.n > 0) + kt_for(w->opt->n_threads, worker_calc_matesw16, w, (w->msw_tasks16.n + SIMD512_WIDTH16 - 1) / SIMD512_WIDTH16); + PROF_END(gprof[G_calc_matesw], calc_matesw); + // 3. + kt_for(w->opt->n_threads, workder_gen_sam, w, w->n_reads >> 1); + } + fprintf(stderr, "zzh %d : 8: %ld 16: %ld\n", i, w->msw_tasks8.n, w->msw_tasks16.n); +} \ No newline at end of file diff --git a/paired_sam.h b/paired_sam.h new file mode 100644 index 0000000..337f314 --- /dev/null +++ b/paired_sam.h @@ -0,0 +1,24 @@ +/* + Description: 针对paired数据,计算生成sam步骤的优化 + + Copyright : All right reserved by ICT + + Author : Zhang Zhonghai + Date : 2026/01/08 +*/ +#pragma once + +#include "bwamem.h" + +// for avx512 +#define SIMD512_WIDTH8 64 +#define SIMD512_WIDTH16 32 + +void gen_paired_sam(mem_worker_t* w); + +extern int mem_sam_pe(const mem_opt_t* opt, const bntseq_t* bns, const uint8_t* pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], + mem_alnreg_v a[2], seq_sam_t ss[2], int tid); + +extern void mem_reg2ovlp(const mem_opt_t* opt, const bntseq_t* bns, const uint8_t* pac, bseq1_t* s, mem_alnreg_v* a); + +extern void kt_for(int n_threads, void (*func)(void*, int, int), void* data, int n); \ No newline at end of file diff --git a/profiling.c b/profiling.c index 4c0d7f3..833d254 100644 --- a/profiling.c +++ b/profiling.c @@ -120,6 +120,12 @@ int display_stats(int nthreads) // fprintf(stderr, "real_cal num: %ld\n", calc_num); + fprintf(stderr, "get_matesw_data: %0.2lf s\n", gprof[G_get_matesw_data] * 1.0 / proc_freq); + fprintf(stderr, "update_stats_cache: %0.2lf s\n", gprof[G_update_stats_cache] * 1.0 / proc_freq); + fprintf(stderr, "gather_matesw_task: %0.2lf s\n", gprof[G_gather_matesw_task] * 1.0 / proc_freq); + fprintf(stderr, "calc_matesw: %0.2lf s\n", gprof[G_calc_matesw] * 1.0 / proc_freq); + fprintf(stderr, "gen_sam: %0.2lf s\n", gprof[G_gen_sam] * 1.0 / proc_freq); + fprintf(stderr, "more num: %ld\n", more_num); fprintf(stderr, "no more num: %ld\n", not_more_num); diff --git a/profiling.h b/profiling.h index dd4f826..ffad5e2 100644 --- a/profiling.h +++ b/profiling.h @@ -69,7 +69,12 @@ enum { G_comp_wait_1, G_comp_wait_2, G_write_wait_1, - G_write_wait_2 + G_write_wait_2, + G_gather_matesw_task, + G_calc_matesw, + G_get_matesw_data, + G_gen_sam, + G_update_stats_cache, }; // THREAD diff --git a/run.sh b/run.sh index 4ae04ee..756933b 100755 --- a/run.sh +++ b/run.sh @@ -1,13 +1,18 @@ -thread=128 +thread=32 -n1=~/data/dataset/real/D1/n1.fq.gz -n2=~/data/dataset/real/D1/n2.fq.gz +make -j 16 -reference=~/data/fmt_ref/human_g1k_v37_decoy.fasta +n1=~/data/dataset/D1/n1.fq.gz +n2=~/data/dataset/D1/n2.fq.gz -out=./out.sam +reference=~/data/reference/fmt/human_g1k_v37_decoy.fasta -time ./fastbwa mem -t $thread -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \ +out=/dev/null +#out=./out.sam +prog=./fastalign +#prog=/home/zzh/fastbwa + +time $prog mem -t $thread -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \ $reference \ $n1 \ $n2 \ diff --git a/utils.h b/utils.h index ddf81b9..826c5c2 100644 --- a/utils.h +++ b/utils.h @@ -32,6 +32,7 @@ #include #include #include "profiling.h" +#include "kvec.h" #undef MAX #undef MIN @@ -91,6 +92,9 @@ typedef struct { typedef struct { size_t n, m; uint64_t *a; } uint64_v; typedef struct { size_t n, m; uint32_t *a; } uint32_v; typedef struct { size_t n, m; pair64_t *a; } pair64_v; +// typedef struct { size_t n, m; int *a; } int_v; +typedef kvec_t(int) int_v; +typedef kvec_t(uint8_t) byte_v; typedef struct { size_t m; uint8_t *addr; } buf_t;