粗糙的完成了mate sw计算,应该还有bug
This commit is contained in:
parent
48fa3703ba
commit
64b6f1fab2
5
bntseq.c
5
bntseq.c
|
|
@ -424,7 +424,7 @@ uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end
|
||||||
return seq;
|
return seq;
|
||||||
}
|
}
|
||||||
|
|
||||||
void bns_get_seq_no_alloc(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len, size_t *m_seq, uint8_t **seqp)
|
void bns_get_seq_no_alloc(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, size_t *len, size_t *m_seq, uint8_t **seqp)
|
||||||
{
|
{
|
||||||
if (end < beg) end ^= beg, beg ^= end, end ^= beg; // if end is smaller, swap
|
if (end < beg) end ^= beg, beg ^= end, end ^= beg; // if end is smaller, swap
|
||||||
if (end > l_pac<<1) end = l_pac<<1;
|
if (end > l_pac<<1) end = l_pac<<1;
|
||||||
|
|
@ -477,7 +477,8 @@ uint8_t *bns_fetch_seq(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, in
|
||||||
|
|
||||||
void bns_fetch_seq_no_alloc(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, int64_t mid, int64_t *end, int *rid, size_t *m_seq, uint8_t **seqp)
|
void bns_fetch_seq_no_alloc(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, int64_t mid, int64_t *end, int *rid, size_t *m_seq, uint8_t **seqp)
|
||||||
{
|
{
|
||||||
int64_t far_beg, far_end, len;
|
int64_t far_beg, far_end;
|
||||||
|
size_t len;
|
||||||
int is_rev;
|
int is_rev;
|
||||||
|
|
||||||
if (*end < *beg) *end ^= *beg, *beg ^= *end, *end ^= *beg; // if end is smaller, swap
|
if (*end < *beg) *end ^= *beg, *beg ^= *end, *end ^= *beg; // if end is smaller, swap
|
||||||
|
|
|
||||||
4
bntseq.h
4
bntseq.h
|
|
@ -79,7 +79,9 @@ extern "C" {
|
||||||
uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len);
|
uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len);
|
||||||
uint8_t *bns_fetch_seq(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, int64_t mid, int64_t *end, int *rid);
|
uint8_t *bns_fetch_seq(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, int64_t mid, int64_t *end, int *rid);
|
||||||
int bns_intv2rid(const bntseq_t *bns, int64_t rb, int64_t re);
|
int bns_intv2rid(const bntseq_t *bns, int64_t rb, int64_t re);
|
||||||
void bns_fetch_seq_no_alloc(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, int64_t mid, int64_t *end, int *rid, size_t *m_seq, uint8_t **seqp);
|
void bns_get_seq_no_alloc(int64_t l_pac, const uint8_t* pac, int64_t beg, int64_t end, size_t* len, size_t* m_seq, uint8_t** seqp);
|
||||||
|
void bns_fetch_seq_no_alloc(const bntseq_t* bns, const uint8_t* pac, int64_t* beg, int64_t mid, int64_t* end, int* rid, size_t* m_seq,
|
||||||
|
uint8_t** seqp);
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
|
||||||
12
bwamem.c
12
bwamem.c
|
|
@ -111,6 +111,7 @@ mem_opt_t *mem_opt_init()
|
||||||
o->max_chain_extend = 1<<30;
|
o->max_chain_extend = 1<<30;
|
||||||
o->mapQ_coef_len = 50; o->mapQ_coef_fac = log(o->mapQ_coef_len);
|
o->mapQ_coef_len = 50; o->mapQ_coef_fac = log(o->mapQ_coef_len);
|
||||||
bwa_fill_scmat(o->a, o->b, o->mat);
|
bwa_fill_scmat(o->a, o->b, o->mat);
|
||||||
|
o->msw_batch_size = 1024;
|
||||||
return o;
|
return o;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -1293,6 +1294,16 @@ mem_worker_t *init_mem_worker(const mem_opt_t *opt, const FMTIndex *fmt, const b
|
||||||
w->matesw_arr16 = malloc(i * sizeof(matesw_data_v*));
|
w->matesw_arr16 = malloc(i * sizeof(matesw_data_v*));
|
||||||
w->msw_buf = calloc(i, sizeof(matesw_buf_t));
|
w->msw_buf = calloc(i, sizeof(matesw_buf_t));
|
||||||
w->mem_stats = calloc(i, sizeof(mem_stats_t));
|
w->mem_stats = calloc(i, sizeof(mem_stats_t));
|
||||||
|
w->msw_2_arr8 = malloc(i * sizeof(matesw_ptr_v*));
|
||||||
|
w->msw_ref = calloc(i, sizeof(byte_vv));
|
||||||
|
w->msw_seq = calloc(i, sizeof(msw_seq_v));
|
||||||
|
w->msw_kswr = calloc(i, sizeof(msw_kswr_v));
|
||||||
|
w->msw_xtra = calloc(i, sizeof(int_v));
|
||||||
|
|
||||||
|
w->msw_2_ref = calloc(i, sizeof(msw_ref_v));
|
||||||
|
w->msw_2_seq = calloc(i, sizeof(msw_seq_v));
|
||||||
|
w->msw_2_kswr = calloc(i, sizeof(msw_kswr_v));
|
||||||
|
w->msw_2_xtra = calloc(i, sizeof(int_v));
|
||||||
|
|
||||||
for (i = 0; i < opt->n_threads; ++i) {
|
for (i = 0; i < opt->n_threads; ++i) {
|
||||||
w->aux[i] = smem_aux_init();
|
w->aux[i] = smem_aux_init();
|
||||||
|
|
@ -1301,6 +1312,7 @@ mem_worker_t *init_mem_worker(const mem_opt_t *opt, const FMTIndex *fmt, const b
|
||||||
w->isize_arr[i] = calloc(4, sizeof(uint64_v));
|
w->isize_arr[i] = calloc(4, sizeof(uint64_v));
|
||||||
w->matesw_arr8[i] = calloc(1, sizeof(matesw_data_v));
|
w->matesw_arr8[i] = calloc(1, sizeof(matesw_data_v));
|
||||||
w->matesw_arr16[i] = calloc(1, sizeof(matesw_data_v));
|
w->matesw_arr16[i] = calloc(1, sizeof(matesw_data_v));
|
||||||
|
w->msw_2_arr8[i] = calloc(1, sizeof(matesw_data_t*));
|
||||||
for (j = 0; j < opt->batch_size; ++j) {
|
for (j = 0; j < opt->batch_size; ++j) {
|
||||||
kv_init(w->smem_arr[i][j].mem);
|
kv_init(w->smem_arr[i][j].mem);
|
||||||
kv_init(w->smem_arr[i][j].pos_arr);
|
kv_init(w->smem_arr[i][j].pos_arr);
|
||||||
|
|
|
||||||
34
bwamem.h
34
bwamem.h
|
|
@ -32,6 +32,7 @@
|
||||||
#include "bwa.h"
|
#include "bwa.h"
|
||||||
#include "fmt_idx.h"
|
#include "fmt_idx.h"
|
||||||
#include "utils.h"
|
#include "utils.h"
|
||||||
|
#include "ksw_align_avx.h"
|
||||||
|
|
||||||
#define MEM_MAPQ_COEF 30.0
|
#define MEM_MAPQ_COEF 30.0
|
||||||
#define MEM_MAPQ_MAX 60
|
#define MEM_MAPQ_MAX 60
|
||||||
|
|
@ -84,6 +85,7 @@ typedef struct {
|
||||||
int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end
|
int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end
|
||||||
int max_XA_hits, max_XA_hits_alt; // if there are max_hits or fewer, output them all
|
int max_XA_hits, max_XA_hits_alt; // if there are max_hits or fewer, output them all
|
||||||
int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset
|
int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset
|
||||||
|
int msw_batch_size; // 每次处理的msw的个数,64的倍数
|
||||||
} mem_opt_t;
|
} mem_opt_t;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
|
|
@ -166,6 +168,24 @@ typedef struct {
|
||||||
int max_msw_ref_len; // mate sw里的最长的ref长度
|
int max_msw_ref_len; // mate sw里的最长的ref长度
|
||||||
} mem_stats_t; // 记录一些用到的数据统计,比如最长ref长度,最长seq长度等
|
} mem_stats_t; // 记录一些用到的数据统计,比如最长ref长度,最长seq长度等
|
||||||
|
|
||||||
|
typedef kvec_t(byte_v) byte_vv;
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
uint8_t* seq; // 只是指向seq的指针
|
||||||
|
int l_seq; // seq长度
|
||||||
|
} msw_seq_t;
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
uint8_t* ref;
|
||||||
|
int l_ref;
|
||||||
|
} msw_ref_t;
|
||||||
|
|
||||||
|
typedef kvec_t(msw_seq_t) msw_seq_v;
|
||||||
|
|
||||||
|
typedef kvec_t(msw_ref_t) msw_ref_v;
|
||||||
|
|
||||||
|
typedef kvec_t(kswr_avx_t) msw_kswr_v;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
int calc_isize;
|
int calc_isize;
|
||||||
const mem_opt_t *opt;
|
const mem_opt_t *opt;
|
||||||
|
|
@ -182,13 +202,27 @@ typedef struct {
|
||||||
mem_alnreg_v *regs;
|
mem_alnreg_v *regs;
|
||||||
uint64_v **isize_arr;
|
uint64_v **isize_arr;
|
||||||
matesw_data_v** matesw_arr8; // 收集需要做mate sw的任务的数据
|
matesw_data_v** matesw_arr8; // 收集需要做mate sw的任务的数据
|
||||||
|
matesw_ptr_v** msw_2_arr8; // 第二阶段任务,每个线程单独收集
|
||||||
matesw_data_v** matesw_arr16;
|
matesw_data_v** matesw_arr16;
|
||||||
//matesw_range_v msw_range_arr8;
|
//matesw_range_v msw_range_arr8;
|
||||||
//matesw_range_v msw_range_arr16;
|
//matesw_range_v msw_range_arr16;
|
||||||
matesw_ptr_v msw_tasks8; // 需要做mates sw的任务的指针
|
matesw_ptr_v msw_tasks8; // 需要做mates sw的任务的指针
|
||||||
matesw_ptr_v msw_tasks16;
|
matesw_ptr_v msw_tasks16;
|
||||||
|
matesw_ptr_v msw_2_tasks8; // msw第二阶段任务
|
||||||
matesw_buf_t* msw_buf; // mate sw过程用到的缓存空间
|
matesw_buf_t* msw_buf; // mate sw过程用到的缓存空间
|
||||||
mem_stats_t* mem_stats; // 记录一些开辟空间等用到的统计数据,如seq长度,ref长度等
|
mem_stats_t* mem_stats; // 记录一些开辟空间等用到的统计数据,如seq长度,ref长度等
|
||||||
|
|
||||||
|
byte_vv* msw_ref; // 获取ref时的缓存空间,避免频繁的内存开辟和释放
|
||||||
|
msw_seq_v* msw_seq; // seq的缓存空间,个数由msw_batch_size决定,上面的ref一样
|
||||||
|
msw_kswr_v* msw_kswr; // 计算结果
|
||||||
|
int_v* msw_xtra; // msw额外参数
|
||||||
|
|
||||||
|
// msw第二阶段
|
||||||
|
msw_ref_v* msw_2_ref;
|
||||||
|
msw_seq_v* msw_2_seq;
|
||||||
|
msw_kswr_v* msw_2_kswr;
|
||||||
|
int_v* msw_2_xtra;
|
||||||
|
|
||||||
int64_t n_processed;
|
int64_t n_processed;
|
||||||
int64_t n; // regs元素个数,可动态扩展
|
int64_t n; // regs元素个数,可动态扩展
|
||||||
int64_t n_reads;
|
int64_t n_reads;
|
||||||
|
|
|
||||||
303
paired_sam.c
303
paired_sam.c
|
|
@ -48,9 +48,9 @@ static int get_rid_range(const bntseq_t* bns, const uint8_t* pac, int64_t* beg,
|
||||||
}
|
}
|
||||||
|
|
||||||
// 获取ref序列
|
// 获取ref序列
|
||||||
static inline uint8_t* get_ref_seq(const bntseq_t* bns, const uint8_t* pac, int64_t beg, int64_t end) {
|
static inline uint8_t* get_ref_seq(const bntseq_t* bns, const uint8_t* pac, int64_t beg, int64_t end, byte_v *buf) {
|
||||||
int64_t len;
|
bns_get_seq_no_alloc(bns->l_pac, pac, beg, end, &buf->n, &buf->m, &buf->a);
|
||||||
return bns_get_seq(bns->l_pac, pac, beg, end, &len);
|
return buf->a;
|
||||||
}
|
}
|
||||||
|
|
||||||
// 计算当前seq是否需要做matesw,需要的话保存必要的数据
|
// 计算当前seq是否需要做matesw,需要的话保存必要的数据
|
||||||
|
|
@ -111,7 +111,14 @@ static void get_matesw_data(const mem_opt_t* opt, const bntseq_t* bns, const uin
|
||||||
static void worker_matesw_data(void* data, int idx, int tid) {
|
static void worker_matesw_data(void* data, int idx, int tid) {
|
||||||
mem_worker_t* w = (mem_worker_t*)data;
|
mem_worker_t* w = (mem_worker_t*)data;
|
||||||
const mem_opt_t* opt = w->opt;
|
const mem_opt_t* opt = w->opt;
|
||||||
int64_t i_s = idx << 1;
|
int startIdx = idx * opt->batch_size;
|
||||||
|
int endIdx = (idx + 1) * opt->batch_size;
|
||||||
|
if (endIdx > w->n_reads)
|
||||||
|
endIdx = w->n_reads;
|
||||||
|
|
||||||
|
int id = 0;
|
||||||
|
for (id = startIdx; id < endIdx; id += 2) {
|
||||||
|
int64_t i_s = id;
|
||||||
bseq1_t* s = &w->seqs[i_s];
|
bseq1_t* s = &w->seqs[i_s];
|
||||||
mem_alnreg_v* a = &w->regs[i_s];
|
mem_alnreg_v* a = &w->regs[i_s];
|
||||||
s->msw.n = 0; // 清空之前的matesw数据
|
s->msw.n = 0; // 清空之前的matesw数据
|
||||||
|
|
@ -126,90 +133,210 @@ static void worker_matesw_data(void* data, int idx, int tid) {
|
||||||
kv_push(mem_alnreg_t, b[i], a[i].a[j]);
|
kv_push(mem_alnreg_t, b[i], a[i].a[j]);
|
||||||
for (i = 0; i < 2; ++i)
|
for (i = 0; i < 2; ++i)
|
||||||
for (j = 0; j < b[i].n && j < opt->max_matesw; ++j)
|
for (j = 0; j < b[i].n && j < opt->max_matesw; ++j)
|
||||||
get_matesw_data(opt, w->bns, w->pac, w->pes, &b[i].a[j], &s[!i], &a[!i], i_s + !i, &w->mem_stats[tid], w->matesw_arr8[tid], w->matesw_arr16[tid]);
|
get_matesw_data(opt, w->bns, w->pac, w->pes, &b[i].a[j], &s[!i], &a[!i], i_s + !i, &w->mem_stats[tid], w->matesw_arr8[tid],
|
||||||
|
w->matesw_arr16[tid]);
|
||||||
free(b[0].a);
|
free(b[0].a);
|
||||||
free(b[1].a);
|
free(b[1].a);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// 再多线程计算matesw,利用inter-query的simd并行
|
// 再多线程计算matesw,利用inter-query的simd并行
|
||||||
static void worker_calc_matesw_avx512_u8(void* data, int idx, int tid) {
|
static void worker_calc_matesw_avx512_u8(void* data, int idx, int tid) {
|
||||||
mem_worker_t* w = (mem_worker_t*)data;
|
mem_worker_t* w = (mem_worker_t*)data;
|
||||||
int startIdx = idx * SIMD512_WIDTH8;
|
const mem_opt_t* opt = w->opt;
|
||||||
int endIdx = (idx + 1) * SIMD512_WIDTH8;
|
int startIdx = idx * w->opt->msw_batch_size;
|
||||||
|
int endIdx = (idx + 1) * w->opt->msw_batch_size;
|
||||||
if (endIdx > w->msw_tasks8.n)
|
if (endIdx > w->msw_tasks8.n)
|
||||||
endIdx = w->msw_tasks8.n;
|
endIdx = w->msw_tasks8.n;
|
||||||
|
int roundEndIdx = ((endIdx + SIMD512_WIDTH8 - 1) / SIMD512_WIDTH8) * SIMD512_WIDTH8;
|
||||||
|
|
||||||
int i = 0, j = 0, k = 0;
|
int i = 0, j = 0, k = 0;
|
||||||
int maxSeqLen = 0, maxRefLen = 0;
|
|
||||||
uint8_t* refArr[SIMD512_WIDTH8] = {0};
|
|
||||||
uint8_t* seqArr[SIMD512_WIDTH8] = {0};
|
|
||||||
int refLen[SIMD512_WIDTH8] = {0};
|
int refLen[SIMD512_WIDTH8] = {0};
|
||||||
int seqLen[SIMD512_WIDTH8] = {0};
|
|
||||||
int seqQuantaLen[SIMD512_WIDTH8] = {0};
|
|
||||||
int xtraA[SIMD512_WIDTH8] = {0};
|
|
||||||
kswr_avx_t alns[SIMD512_WIDTH8] = {0};
|
|
||||||
|
|
||||||
matesw_buf_t* msw_buf = &w->msw_buf[tid]; // 缓冲区
|
matesw_buf_t* msw_buf = &w->msw_buf[tid]; // 缓冲区
|
||||||
|
byte_vv* refs = &w->msw_ref[tid];
|
||||||
|
msw_seq_v* seqs = &w->msw_seq[tid];
|
||||||
|
int_v* xtras = &w->msw_xtra[tid];
|
||||||
|
msw_kswr_v* kswrs = &w->msw_kswr[tid];
|
||||||
|
|
||||||
|
// 获取ref和seq数据
|
||||||
|
PROF_START(msw_get_ref);
|
||||||
for (i = startIdx, j = 0; i < endIdx; ++i, ++j) {
|
for (i = startIdx, j = 0; i < endIdx; ++i, ++j) {
|
||||||
matesw_data_t* task = kv_A(w->msw_tasks8, i);
|
matesw_data_t* task = kv_A(w->msw_tasks8, i);
|
||||||
// 1. 获取对应的ref
|
// 1. 获取对应的ref
|
||||||
refArr[j] = get_ref_seq(w->bns, w->pac, task->rb, task->re);
|
byte_v* ref = &kv_A(*refs, j);
|
||||||
refLen[j] = task->re - task->rb;
|
get_ref_seq(w->bns, w->pac, task->rb, task->re, ref);
|
||||||
maxRefLen = max_(maxRefLen, refLen[j]);
|
|
||||||
// 2. 获取对应的seq
|
// 2. 获取对应的seq
|
||||||
seqArr[j] = (uint8_t*)w->seqs[task->seq_id].seq;
|
msw_seq_t* seq = &kv_A(*seqs, j);
|
||||||
seqLen[j] = w->seqs[task->seq_id].l_seq;
|
seq->seq = (uint8_t*)w->seqs[task->seq_id].seq;
|
||||||
int quanta = (seqLen[j] + 16 - 1) / 16; // based on SSE-8 bit lane
|
seq->l_seq = w->seqs[task->seq_id].l_seq;
|
||||||
quanta *= 16;
|
|
||||||
seqQuantaLen[j] = quanta;
|
|
||||||
maxSeqLen = max_(maxSeqLen, quanta);
|
|
||||||
// 3. 其他数据
|
// 3. 其他数据
|
||||||
xtraA[j] = task->xtra;
|
xtras->a[j] = task->xtra;
|
||||||
}
|
}
|
||||||
// 如果不到64个,补全剩下的
|
for (; i < roundEndIdx; ++i, ++j) {
|
||||||
for (; j < SIMD512_WIDTH8; ++j) {
|
byte_v* ref = &kv_A(*refs, j);
|
||||||
refArr[j] = refArr[0];
|
ref->n = 0;
|
||||||
refLen[j] = refLen[0];
|
msw_seq_t* seq = &kv_A(*seqs, j);
|
||||||
seqArr[j] = seqArr[0];
|
seq->l_seq = 0;
|
||||||
seqLen[j] = seqLen[0];
|
xtras->a[j] = 0;
|
||||||
xtraA[j] = xtraA[0];
|
|
||||||
}
|
}
|
||||||
|
PROF_END(tprof[T_MSW_GET_REF][tid], msw_get_ref);
|
||||||
|
|
||||||
|
// 准备数据并进行计算
|
||||||
|
for (i = 0; i < endIdx - startIdx; i += SIMD512_WIDTH8) {
|
||||||
// 将ref和seq赋值给对应的用来计算的缓存
|
// 将ref和seq赋值给对应的用来计算的缓存
|
||||||
uint8_t* mySeq1SoA = msw_buf->refArr;
|
uint8_t* mySeq1SoA = msw_buf->refArr;
|
||||||
uint8_t* mySeq2SoA = msw_buf->seqArr;
|
uint8_t* mySeq2SoA = msw_buf->seqArr;
|
||||||
|
int maxSeqLen = 0, maxRefLen = 0;
|
||||||
|
|
||||||
for (j = 0; j < SIMD512_WIDTH8; ++j) {
|
PROF_START(msw_pack_seq);
|
||||||
uint8_t* seq1 = refArr[j];
|
|
||||||
uint8_t* seq2 = seqArr[j];
|
|
||||||
// 处理ref
|
// 处理ref
|
||||||
for (k = 0; k < refLen[j]; ++k) {
|
for (j = 0; j < SIMD512_WIDTH8; ++j) {
|
||||||
|
byte_v* ref = &refs->a[i + j];
|
||||||
|
uint8_t* seq1 = ref->a;
|
||||||
|
refLen[j] = ref->n;
|
||||||
|
// 处理ref
|
||||||
|
for (k = 0; k < ref->n; ++k) {
|
||||||
mySeq1SoA[k * SIMD512_WIDTH8 + j] = (seq1[k] == AMBIG_ ? AMBR : seq1[k]);
|
mySeq1SoA[k * SIMD512_WIDTH8 + j] = (seq1[k] == AMBIG_ ? AMBR : seq1[k]);
|
||||||
}
|
}
|
||||||
for (; k < maxRefLen; ++k) {
|
maxRefLen = max_(maxRefLen, ref->n);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (j = 0; j < SIMD512_WIDTH8; ++j) {
|
||||||
|
byte_v* ref = &refs->a[i + j];
|
||||||
|
for (k = ref->n; k < maxRefLen; ++k) {
|
||||||
mySeq1SoA[k * SIMD512_WIDTH8 + j] = 0xFF;
|
mySeq1SoA[k * SIMD512_WIDTH8 + j] = 0xFF;
|
||||||
}
|
}
|
||||||
// 处理seq
|
|
||||||
for (k = 0; k < seqLen[j]; ++k) {
|
|
||||||
mySeq2SoA[k * SIMD512_WIDTH8 + j] = (seq2[k] == AMBIG_ ? AMBR : seq2[k]);
|
|
||||||
}
|
}
|
||||||
for (; k < seqQuantaLen[j]; ++k) {
|
|
||||||
mySeq2SoA[k * SIMD512_WIDTH8 + j] = DUMMY5;
|
// 处理query
|
||||||
|
for (j = 0; j < SIMD512_WIDTH8; ++j) {
|
||||||
|
msw_seq_t* seq = &kv_A(*seqs, i + j);
|
||||||
|
uint8_t* seq2 = seq->seq;
|
||||||
|
int quanta = ((seq->l_seq + 16 - 1) / 16) * 16; // based on SSE-8 bit lane
|
||||||
|
for (k = 0; k < seq->l_seq; k++) {
|
||||||
|
mySeq2SoA[k * SIMD512_WIDTH8 + j] = (seq2[k] == AMBIG_ ? AMBQ : seq2[k]);
|
||||||
}
|
}
|
||||||
for (; k < maxSeqLen; ++k) {
|
for (; k < quanta; ++k) {
|
||||||
|
mySeq2SoA[k * SIMD512_WIDTH8 + j] = DUMMY5; // SSE quanta
|
||||||
|
}
|
||||||
|
maxSeqLen = max_(maxSeqLen, quanta);
|
||||||
|
}
|
||||||
|
for (j = 0; j < SIMD512_WIDTH8; ++j) {
|
||||||
|
msw_seq_t* seq = &kv_A(*seqs, i + j);
|
||||||
|
int quanta = ((seq->l_seq + 16 - 1) / 16) * 16; // based on SSE-8 bit lane
|
||||||
|
for (k = quanta; k < maxSeqLen; k++) {
|
||||||
mySeq2SoA[k * SIMD512_WIDTH8 + j] = 0xFF;
|
mySeq2SoA[k * SIMD512_WIDTH8 + j] = 0xFF;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
PROF_END(tprof[T_MSW_PACK_SEQ][tid], msw_pack_seq);
|
||||||
|
|
||||||
const mem_opt_t* opt = w->opt;
|
|
||||||
// 利用smid指令计算
|
// 利用smid指令计算
|
||||||
|
PROF_START(msw_1);
|
||||||
ksw_align_avx512_u8(opt->a, -1 * opt->b, opt->o_ins, opt->e_ins, opt->o_del, opt->e_del, msw_buf, mySeq1SoA, mySeq2SoA, maxRefLen, maxSeqLen,
|
ksw_align_avx512_u8(opt->a, -1 * opt->b, opt->o_ins, opt->e_ins, opt->o_del, opt->e_del, msw_buf, mySeq1SoA, mySeq2SoA, maxRefLen, maxSeqLen,
|
||||||
xtraA, refLen, alns, 0);
|
&xtras->a[i], refLen, &kswrs->a[i], 0);
|
||||||
// 保存结果
|
PROF_END(tprof[T_MSW_1][tid], msw_1);
|
||||||
for (i = startIdx, j = 0; i < endIdx; ++i, ++j) {
|
|
||||||
matesw_data_t* task = kv_A(w->msw_tasks8, i);
|
|
||||||
task->aln = alns[j];
|
|
||||||
free(refArr[j]);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if 1
|
||||||
|
msw_ref_v* refs2 = &w->msw_2_ref[tid];
|
||||||
|
msw_seq_v* seqs2 = &w->msw_2_seq[tid];
|
||||||
|
int_v* xtras2 = &w->msw_2_xtra[tid];
|
||||||
|
msw_kswr_v* kswrs2 = &w->msw_2_kswr[tid];
|
||||||
|
|
||||||
|
int_v msw_task_ids; // 用于第二阶段任务统计
|
||||||
|
kv_init(msw_task_ids);
|
||||||
|
// 保存结果,并筛选第二阶段msw计算的任务
|
||||||
|
for (i = startIdx, j = 0, k = 0; i < endIdx; ++i, ++j) {
|
||||||
|
matesw_data_t* task = kv_A(w->msw_tasks8, i);
|
||||||
|
task->aln = kswrs->a[j];
|
||||||
|
if ((task->xtra & KSW_XSTART) == 0 || ((task->xtra & KSW_XSUBO) && task->aln.score < (task->xtra & 0xffff)))
|
||||||
|
continue;
|
||||||
|
task->xtra = KSW_XSTOP | task->aln.score;
|
||||||
|
kv_push(int, msw_task_ids, j);
|
||||||
|
|
||||||
|
// 1. 获取对应的ref
|
||||||
|
byte_v* ref = &kv_A(*refs, j);
|
||||||
|
msw_ref_t* ref2 = &kv_A(*refs2, k);
|
||||||
|
ref2->l_ref = ref->n;
|
||||||
|
ref2->ref = ref->a;
|
||||||
|
// 2. 获取对应的seq
|
||||||
|
msw_seq_t* seq = &kv_A(*seqs, j);
|
||||||
|
msw_seq_t* seq2 = &kv_A(*seqs2, k);
|
||||||
|
seq2->seq = seq->seq;
|
||||||
|
seq2->l_seq = seq->l_seq;
|
||||||
|
|
||||||
|
// 3. 其他数据
|
||||||
|
xtras2->a[k] = task->xtra;
|
||||||
|
++k;
|
||||||
|
}
|
||||||
|
roundEndIdx = ((msw_task_ids.n + SIMD512_WIDTH8 - 1) / SIMD512_WIDTH8) * SIMD512_WIDTH8;
|
||||||
|
for (; k < roundEndIdx; ++k) {
|
||||||
|
msw_ref_t* ref2 = &kv_A(*refs2, k);
|
||||||
|
ref2->l_ref = 0;
|
||||||
|
msw_seq_t* seq2 = &kv_A(*seqs2, k);
|
||||||
|
seq2->l_seq = 0;
|
||||||
|
xtras2->a[k] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 准备数据并进行计算
|
||||||
|
for (i = 0; i < msw_task_ids.n; i += SIMD512_WIDTH8) {
|
||||||
|
// 将ref和seq赋值给对应的用来计算的缓存
|
||||||
|
uint8_t* mySeq1SoA = msw_buf->refArr;
|
||||||
|
uint8_t* mySeq2SoA = msw_buf->seqArr;
|
||||||
|
int maxSeqLen = 0, maxRefLen = 0;
|
||||||
|
|
||||||
|
PROF_START(msw_pack_seq);
|
||||||
|
// 处理ref
|
||||||
|
for (j = 0; j < SIMD512_WIDTH8; ++j) {
|
||||||
|
msw_ref_t* ref = &refs2->a[i + j];
|
||||||
|
uint8_t* seq1 = ref->ref;
|
||||||
|
refLen[j] = ref->l_ref;
|
||||||
|
// 处理ref
|
||||||
|
for (k = 0; k < ref->l_ref; ++k) {
|
||||||
|
mySeq1SoA[k * SIMD512_WIDTH8 + j] = (seq1[k] == AMBIG_ ? AMBR : seq1[k]);
|
||||||
|
}
|
||||||
|
maxRefLen = max_(maxRefLen, ref->l_ref);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (j = 0; j < SIMD512_WIDTH8; ++j) {
|
||||||
|
msw_ref_t* ref = &refs2->a[i + j];
|
||||||
|
for (k = ref->l_ref; k < maxRefLen; ++k) {
|
||||||
|
mySeq1SoA[k * SIMD512_WIDTH8 + j] = 0xFF;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 处理query
|
||||||
|
for (j = 0; j < SIMD512_WIDTH8; ++j) {
|
||||||
|
msw_seq_t* seq = &kv_A(*seqs2, i + j);
|
||||||
|
uint8_t* seq2 = seq->seq;
|
||||||
|
int quanta = ((seq->l_seq + 16 - 1) / 16) * 16; // based on SSE-8 bit lane
|
||||||
|
for (k = 0; k < seq->l_seq; k++) {
|
||||||
|
mySeq2SoA[k * SIMD512_WIDTH8 + j] = (seq2[k] == AMBIG_ ? AMBQ : seq2[k]);
|
||||||
|
}
|
||||||
|
for (; k < quanta; ++k) {
|
||||||
|
mySeq2SoA[k * SIMD512_WIDTH8 + j] = DUMMY5; // SSE quanta
|
||||||
|
}
|
||||||
|
maxSeqLen = max_(maxSeqLen, quanta);
|
||||||
|
}
|
||||||
|
for (j = 0; j < SIMD512_WIDTH8; ++j) {
|
||||||
|
msw_seq_t* seq = &kv_A(*seqs2, i + j);
|
||||||
|
int quanta = ((seq->l_seq + 16 - 1) / 16) * 16; // based on SSE-8 bit lane
|
||||||
|
for (k = quanta; k < maxSeqLen; k++) {
|
||||||
|
mySeq2SoA[k * SIMD512_WIDTH8 + j] = 0xFF;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
PROF_END(tprof[T_MSW_PACK_SEQ][tid], msw_pack_seq);
|
||||||
|
|
||||||
|
// 利用smid指令计算
|
||||||
|
PROF_START(msw_2);
|
||||||
|
ksw_align_avx512_u8(opt->a, -1 * opt->b, opt->o_ins, opt->e_ins, opt->o_del, opt->e_del, msw_buf, mySeq1SoA, mySeq2SoA, maxRefLen, maxSeqLen,
|
||||||
|
&xtras2->a[i], refLen, &kswrs2->a[i], 1);
|
||||||
|
PROF_END(tprof[T_MSW_2][tid], msw_2);
|
||||||
|
}
|
||||||
|
|
||||||
|
kv_destroy(msw_task_ids);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
static void worker_calc_matesw16(void* data, int i, int tid) {}
|
static void worker_calc_matesw16(void* data, int i, int tid) {}
|
||||||
|
|
@ -249,8 +376,71 @@ static void update_mem_stats(mem_worker_t* w) {
|
||||||
|
|
||||||
// 开辟缓冲区
|
// 开辟缓冲区
|
||||||
static void alloc_update_cache_avx512(mem_worker_t* w) {
|
static void alloc_update_cache_avx512(mem_worker_t* w) {
|
||||||
int i = 0;
|
int i = 0, j = 0;
|
||||||
for (i = 0; i < w->opt->n_threads; ++i) {
|
for (i = 0; i < w->opt->n_threads; ++i) {
|
||||||
|
// 检查ref缓存空间
|
||||||
|
byte_vv* refs = &w->msw_ref[i];
|
||||||
|
if (refs->n < w->opt->msw_batch_size) { // 初始化ref缓存
|
||||||
|
for (j = refs->n; j < w->opt->msw_batch_size; ++j) {
|
||||||
|
byte_v *p = kv_pushp(byte_v, *refs);
|
||||||
|
kv_init(*p);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
msw_ref_v* refs2 = &w->msw_2_ref[i];
|
||||||
|
if (refs2->n < w->opt->msw_batch_size) { // 初始化seq缓存
|
||||||
|
for (j = refs2->n; j < w->opt->msw_batch_size; ++j) {
|
||||||
|
msw_ref_t* p = kv_pushp(msw_ref_t, *refs2);
|
||||||
|
p->l_ref = 0;
|
||||||
|
p->ref = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 检查seq缓存空间
|
||||||
|
msw_seq_v* seqs = &w->msw_seq[i];
|
||||||
|
if (seqs->n < w->opt->msw_batch_size) { // 初始化seq缓存
|
||||||
|
for (j = seqs->n; j < w->opt->msw_batch_size; ++j) {
|
||||||
|
msw_seq_t* p = kv_pushp(msw_seq_t, *seqs);
|
||||||
|
p->l_seq = 0;
|
||||||
|
p->seq = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
msw_seq_v* seqs2 = &w->msw_2_seq[i];
|
||||||
|
if (seqs2->n < w->opt->msw_batch_size) { // 初始化seq缓存
|
||||||
|
for (j = seqs2->n; j < w->opt->msw_batch_size; ++j) {
|
||||||
|
msw_seq_t* p = kv_pushp(msw_seq_t, *seqs2);
|
||||||
|
p->l_seq = 0;
|
||||||
|
p->seq = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
msw_kswr_v* kswrs = &w->msw_kswr[i]; // 初始化msw结果
|
||||||
|
if (kswrs->n < w->opt->msw_batch_size) {
|
||||||
|
for (j = kswrs->n; j < w->opt->msw_batch_size; ++j) {
|
||||||
|
kv_pushp(kswr_avx_t, *kswrs);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
msw_kswr_v* kswrs2 = &w->msw_2_kswr[i]; // 初始化msw结果
|
||||||
|
if (kswrs2->n < w->opt->msw_batch_size) {
|
||||||
|
for (j = kswrs2->n; j < w->opt->msw_batch_size; ++j) {
|
||||||
|
kv_pushp(kswr_avx_t, *kswrs2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int_v* xtras = &w->msw_xtra[i]; // 初始化xtra
|
||||||
|
if (xtras->n < w->opt->msw_batch_size) {
|
||||||
|
for (j = xtras->n; j < w->opt->msw_batch_size; ++j) {
|
||||||
|
kv_pushp(int, *xtras);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int_v* xtras2 = &w->msw_2_xtra[i]; // 初始化xtra
|
||||||
|
if (xtras2->n < w->opt->msw_batch_size) {
|
||||||
|
for (j = xtras2->n; j < w->opt->msw_batch_size; ++j) {
|
||||||
|
kv_pushp(int, *xtras2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
matesw_buf_t* b = &w->msw_buf[i];
|
matesw_buf_t* b = &w->msw_buf[i];
|
||||||
// 更新跟ref len有关的缓冲区
|
// 更新跟ref len有关的缓冲区
|
||||||
if (b->ref_len < w->mem_stats[i].max_msw_ref_len) {
|
if (b->ref_len < w->mem_stats[i].max_msw_ref_len) {
|
||||||
|
|
@ -259,6 +449,11 @@ static void alloc_update_cache_avx512(mem_worker_t* w) {
|
||||||
if (b->rowMax) _mm_free(b->rowMax);
|
if (b->rowMax) _mm_free(b->rowMax);
|
||||||
b->refArr = (uint8_t*)_mm_malloc(b->ref_len * SIMD512_WIDTH8 * sizeof(uint8_t), 64);
|
b->refArr = (uint8_t*)_mm_malloc(b->ref_len * SIMD512_WIDTH8 * sizeof(uint8_t), 64);
|
||||||
b->rowMax = (uint8_t*)_mm_malloc(b->ref_len * SIMD512_WIDTH8 * sizeof(uint8_t), 64);
|
b->rowMax = (uint8_t*)_mm_malloc(b->ref_len * SIMD512_WIDTH8 * sizeof(uint8_t), 64);
|
||||||
|
for (j = 0; j < w->opt->msw_batch_size; ++j) {
|
||||||
|
byte_v* ref = &kv_A(*refs, j);
|
||||||
|
kv_resize(uint8_t, *ref, b->ref_len);
|
||||||
|
// ref->n = ref->m;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
// 更新跟seq len有关的缓冲区
|
// 更新跟seq len有关的缓冲区
|
||||||
if (b->seq_len < w->mem_stats[i].max_seq_len) {
|
if (b->seq_len < w->mem_stats[i].max_seq_len) {
|
||||||
|
|
@ -287,9 +482,10 @@ void gen_paired_sam(mem_worker_t* w) {
|
||||||
if (w->opt->flag & MEM_F_NO_RESCUE) {
|
if (w->opt->flag & MEM_F_NO_RESCUE) {
|
||||||
kt_for(w->opt->n_threads, worker_sam, w, w->n_reads >> 1); // generate alignment
|
kt_for(w->opt->n_threads, worker_sam, w, w->n_reads >> 1); // generate alignment
|
||||||
} else {
|
} else {
|
||||||
|
int batch_n = (w->n_reads + w->opt->batch_size - 1) / w->opt->batch_size;
|
||||||
// 1. 计算哪些read需要matesw
|
// 1. 计算哪些read需要matesw
|
||||||
PROF_START(get_matesw_data);
|
PROF_START(get_matesw_data);
|
||||||
kt_for(w->opt->n_threads, worker_matesw_data, w, w->n_reads >> 1);
|
kt_for(w->opt->n_threads, worker_matesw_data, w, batch_n);
|
||||||
PROF_END(gprof[G_get_matesw_data], get_matesw_data);
|
PROF_END(gprof[G_get_matesw_data], get_matesw_data);
|
||||||
// 更新stats
|
// 更新stats
|
||||||
PROF_START(update_stats_cache);
|
PROF_START(update_stats_cache);
|
||||||
|
|
@ -303,10 +499,11 @@ void gen_paired_sam(mem_worker_t* w) {
|
||||||
gather_matesw_task(w, w->matesw_arr16, &w->msw_tasks16);
|
gather_matesw_task(w, w->matesw_arr16, &w->msw_tasks16);
|
||||||
PROF_END(gprof[G_gather_matesw_task], gather_matesw_task);
|
PROF_END(gprof[G_gather_matesw_task], gather_matesw_task);
|
||||||
PROF_START(calc_matesw);
|
PROF_START(calc_matesw);
|
||||||
|
int msw_batch_n = (w->msw_tasks8.n + w->opt->msw_batch_size - 1) / w->opt->msw_batch_size;
|
||||||
if (w->msw_tasks8.n > 0)
|
if (w->msw_tasks8.n > 0)
|
||||||
kt_for(w->opt->n_threads, worker_calc_matesw_avx512_u8, w, (w->msw_tasks8.n + SIMD512_WIDTH8 - 1) / SIMD512_WIDTH8);
|
kt_for(w->opt->n_threads, worker_calc_matesw_avx512_u8, w, msw_batch_n);
|
||||||
if (w->msw_tasks16.n > 0)
|
if (w->msw_tasks16.n > 0)
|
||||||
kt_for(w->opt->n_threads, worker_calc_matesw16, w, (w->msw_tasks16.n + SIMD512_WIDTH16 - 1) / SIMD512_WIDTH16);
|
kt_for(w->opt->n_threads, worker_calc_matesw16, w, msw_batch_n);
|
||||||
PROF_END(gprof[G_calc_matesw], calc_matesw);
|
PROF_END(gprof[G_calc_matesw], calc_matesw);
|
||||||
// 3.
|
// 3.
|
||||||
kt_for(w->opt->n_threads, workder_gen_sam, w, w->n_reads >> 1);
|
kt_for(w->opt->n_threads, workder_gen_sam, w, w->n_reads >> 1);
|
||||||
|
|
|
||||||
14
profiling.c
14
profiling.c
|
|
@ -126,6 +126,20 @@ int display_stats(int nthreads)
|
||||||
fprintf(stderr, "calc_matesw: %0.2lf s\n", gprof[G_calc_matesw] * 1.0 / proc_freq);
|
fprintf(stderr, "calc_matesw: %0.2lf s\n", gprof[G_calc_matesw] * 1.0 / proc_freq);
|
||||||
fprintf(stderr, "gen_sam: %0.2lf s\n", gprof[G_gen_sam] * 1.0 / proc_freq);
|
fprintf(stderr, "gen_sam: %0.2lf s\n", gprof[G_gen_sam] * 1.0 / proc_freq);
|
||||||
|
|
||||||
|
find_opt(tprof[T_MSW_GET_REF], nthreads, &max, &min, &avg);
|
||||||
|
fprintf(stderr, "msw_get_ref: %0.2lf s\n", avg);
|
||||||
|
|
||||||
|
find_opt(tprof[T_MSW_PACK_SEQ], nthreads, &max, &min, &avg);
|
||||||
|
fprintf(stderr, "msw_pack_seq: %0.2lf s\n", avg);
|
||||||
|
|
||||||
|
find_opt(tprof[T_MSW_1], nthreads, &max, &min, &avg);
|
||||||
|
fprintf(stderr, "msw_1: %0.2lf s\n", avg);
|
||||||
|
|
||||||
|
find_opt(tprof[T_MSW_2], nthreads, &max, &min, &avg);
|
||||||
|
fprintf(stderr, "msw_2: %0.2lf s\n", avg);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
fprintf(stderr, "more num: %ld\n", more_num);
|
fprintf(stderr, "more num: %ld\n", more_num);
|
||||||
fprintf(stderr, "no more num: %ld\n", not_more_num);
|
fprintf(stderr, "no more num: %ld\n", not_more_num);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -102,6 +102,10 @@ enum
|
||||||
T_SEEDING,
|
T_SEEDING,
|
||||||
T_EXTENSION,
|
T_EXTENSION,
|
||||||
T_SAM,
|
T_SAM,
|
||||||
|
T_MSW_1,
|
||||||
|
T_MSW_2,
|
||||||
|
T_MSW_GET_REF,
|
||||||
|
T_MSW_PACK_SEQ,
|
||||||
};
|
};
|
||||||
|
|
||||||
int display_stats(int);
|
int display_stats(int);
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue