sw_perf/align.c

278 lines
13 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

/*********************************************************************************************
Description: stripped sw align functions in bwa-mem
Copyright : All right reserved by NCIC.ICT
Author : Zhang Zhonghai
Date : 2024/04/11
***********************************************************************************************/
// u8和i16 score2 结果有不同,不是算法问题,而是边界未清零
#include <stdio.h>
#include <stdlib.h>
#include "byte_alloc.h"
#include "utils.h"
#include "profiling.h"
#include "align.h"
#include "debug.h"
const kswr_t g_defr = {0, -1, -1, -1, -1, -1, -1};
#define ALIGN_PERFORMANCE_TEST(kernel_id, nbyte, func, sp, ep) \
do \
{ \
PROF_START(align); \
int i; \
kswr_t score; \
for (i = sp; i < ep; ++i) \
{ if (kv_A(kv_A(i_arr, i), 0) < 144) continue; \
kswq_sse_t *q = aln_sse_qinit(&bmem[0], nbyte, kv_A(kv_A(i_arr, i), 0), kv_A(q_arr, i).a, 5, mat); \
score = func(&bmem[1], q, \
kv_A(kv_A(i_arr, i), 1), kv_A(t_arr, i).a, \
6, 1, 6, 1, xtra); \
score_total[kernel_id] += score.score; \
byte_mem_clear(&bmem[0]); /* free(q); */ \
} \
PROF_END(gprof[kernel_prof_idx[kernel_id]], align); \
} while (0)
#define ALIGN_PERFORMANCE_TEST_AVX2(kernel_id, nbyte, func, sp, ep) \
do { \
PROF_START(align); \
int i; \
kswr_t score; \
for (i = sp; i < ep; ++i) { \
if (kv_A(kv_A(i_arr, i), 0) < 144) \
continue; \
kswq_avx2_t *q = aln_avx2_qinit(&bmem[0], nbyte, kv_A(kv_A(i_arr, i), 0), kv_A(q_arr, i).a, 5, mat); \
score = func(&bmem[1], q, kv_A(kv_A(i_arr, i), 1), kv_A(t_arr, i).a, 6, 1, 6, 1, xtra); \
score_total[kernel_id] += score.score; \
byte_mem_clear(&bmem[0]); /* free(q); */ \
} \
PROF_END(gprof[kernel_prof_idx[kernel_id]], align); \
} while (0)
// sse ksw init
/**
* Initialize the query data structure
*
* @param size Number of bytes used to store a score; valid valures are 1 or 2
* @param qlen Length of the query sequence
* @param query Query sequence
* @param m Size of the alphabet
* @param mat Scoring matrix in a one-dimension array
*
* @return Query data structure
*/
kswq_sse_t *aln_sse_qinit(byte_mem_t *bmem, int size, int qlen, const uint8_t *query, int m, const int8_t *mat)
{
kswq_sse_t *q;
int slen, a, tmp, p, memsize;
size = size > 1? 2 : 1;
p = 8 * (3 - size); // # values per __m128i
qlen = 144;
slen = (qlen + p - 1) / p; // segmented length
memsize = sizeof(kswq_sse_t) + 256 + 16 * slen * (m + 4);
q = (kswq_sse_t *)malloc(memsize); // a single block of memory
//q = (kswq_sse_t *)byte_mem_request_and_clean(bmem, memsize);
//q = (kswq_sse_t *)byte_mem_request(bmem, memsize);
q->qp = (__m128i *)(((size_t)q + sizeof(kswq_sse_t) + 15) >> 4 << 4); // align memory
q->H0 = q->qp + slen * m;
q->H1 = q->H0 + slen;
q->E = q->H1 + slen;
q->Hmax = q->E + slen;
q->slen = slen; q->qlen = qlen; q->size = size;
//fprintf(stderr, "%d %d\n", slen, qlen);
// compute shift
tmp = m * m;
for (a = 0, q->shift = 127, q->mdiff = 0; a < tmp; ++a) { // find the minimum and maximum score
if (mat[a] < (int8_t)q->shift) q->shift = mat[a];
if (mat[a] > (int8_t)q->mdiff) q->mdiff = mat[a];
}
q->max = q->mdiff;
q->shift = 256 - q->shift; // NB: q->shift is uint8_t
q->mdiff += q->shift; // this is the difference between the min and max scores
// An example: p=8, qlen=19, slen=3 and segmentation:
// {{0,3,6,9,12,15,18,-1},{1,4,7,10,13,16,-1,-1},{2,5,8,11,14,17,-1,-1}}
if (size == 1) {
int8_t *t = (int8_t*)q->qp;
for (a = 0; a < m; ++a) {
int i, k, nlen = slen * p;
const int8_t *ma = mat + a * m;
for (i = 0; i < slen; ++i)
for (k = i; k < nlen; k += slen) // p iterations
*t++ = (k >= qlen? 0 : ma[query[k]]) + q->shift;
}
} else {
int16_t *t = (int16_t*)q->qp;
for (a = 0; a < m; ++a) {
int i, k, nlen = slen * p;
const int8_t *ma = mat + a * m;
for (i = 0; i < slen; ++i)
for (k = i; k < nlen; k += slen) // p iterations
*t++ = (k >= qlen? 0 : ma[query[k]]);
}
}
return q;
}
/**
* Initialize the query data structure
*
* @param size Number of bytes used to store a score; valid valures are 1 or 2
* @param qlen Length of the query sequence
* @param query Query sequence
* @param m Size of the alphabet (ACGTN)
* @param mat Scoring matrix in a one-dimension array
*
* @return Query data structure
*/
kswq_avx2_t *aln_avx2_qinit(byte_mem_t *bmem, int size, int qlen, const uint8_t *query, int m, const int8_t *mat) {
kswq_avx2_t *q;
int slen, a, tmp, p;
size = size > 1 ? 2 : 1;
p = 16 * (3 - size); // # values per __m256ii16是16个u8是32个
slen = (qlen + p - 1) / p; // segmented length
q = (kswq_avx2_t *)malloc(sizeof(kswq_avx2_t) + 512 + 32 * slen * (m + 4)); // a single block of memory
q->qp = (__m256i *)(((size_t)q + sizeof(kswq_avx2_t) + 31) >> 5 << 5); // align memory32字节对齐
q->H0 = q->qp + slen * m;
q->H1 = q->H0 + slen;
q->E = q->H1 + slen;
q->Hmax = q->E + slen;
q->slen = slen;
q->qlen = qlen;
q->size = size;
// compute shift
tmp = m * m;
for (a = 0, q->shift = 127, q->mdiff = 0; a < tmp; ++a) { // find the minimum and maximum score
if (mat[a] < (int8_t)q->shift)
q->shift = mat[a];
if (mat[a] > (int8_t)q->mdiff)
q->mdiff = mat[a];
}
q->max = q->mdiff; // (=1)
q->shift = 256 - q->shift; // NB: q->shift is uint8_t // 主要用来处理uint8_t类型数据,保证score不小于0i16不用 =4
q->mdiff += q->shift; // this is the difference between the min and max scores =1
// An example: p=8, qlen=19, slen=3 and segmentation:
// {{0,3,6,9,12,15,18,-1},{1,4,7,10,13,16,-1,-1},{2,5,8,11,14,17,-1,-1}}
if (size == 1) {
int8_t *t = (int8_t *)q->qp;
for (a = 0; a < m; ++a) {
int i, k, nlen = slen * p;
const int8_t *ma = mat + a * m;
for (i = 0; i < slen; ++i)
for (k = i; k < nlen; k += slen) // p iterations
*t++ = (k >= qlen ? 0 : ma[query[k]]) + q->shift;
}
} else {
int16_t *t = (int16_t *)q->qp;
for (a = 0; a < m; ++a) {
int i, k, nlen = slen * p;
const int8_t *ma = mat + a * m;
for (i = 0; i < slen; ++i)
for (k = i; k < nlen; k += slen) // p iterations
*t++ = (k >= qlen ? 0 : ma[query[k]]);
}
}
return q;
}
/******
* 输入说明三个文件query.fa, target.fa, info.txt
* query.fa: 每一行代表一个query序列由ACGTN组成
* target.fa: 每一行代表一个referencetarget序列由ACGTN组成
* info.txt: 每一行由两个数字组成分别代表query序列长度target序列长度align不需要初始分数
*/
int main_align(int argc, char *argv[])
{
if (argc < 3)
{
fprintf(stderr, "Need 3 files: query, target, info.\n");
return -1;
}
const char *qf_path = argv[0];
const char *tf_path = argv[1];
const char *if_path = argv[2];
FILE *qfp = fopen(qf_path, "r");
FILE *tfp = fopen(tf_path, "r");
FILE *ifp = fopen(if_path, "r");
buf_t read_buf = {0};
seq_v q_arr = {0};
seq_v t_arr = {0};
qti_v i_arr = {0};
uint64_t score_total[ALIGN_FUNC_NUM] = {0};
const int kmax_row = 3000000;
/* read input files */
int query_read_row = read_seq(&q_arr, &read_buf, kmax_row, qfp);
int target_read_row = read_seq(&t_arr, &read_buf, kmax_row, tfp);
int info_read_row = read_qt_info(&i_arr, &read_buf, kmax_row, 2, ifp);
// fprintf(stderr, "read row: %d\t%d\t%d\n", query_read_row, target_read_row, info_read_row);
/* init parameters */
int8_t mat[25] = {1, -4, -4, -4, -1,
-4, 1, -4, -4, -1,
-4, -4, 1, -4, -1,
-4, -4, -4, 1, -1,
-1, -1, -1, -1, -1};
int xtra = 851987;
int kernel_prof_idx[] = {G_ALN_I16, G_ALN_U8, G_ALN_AVX2_I16, G_ALN_AVX2_U8};
byte_mem_t bmem[2] = {{0}, {0}};
byte_mem_init_alloc(&bmem[0], 1024*4);
byte_mem_init_alloc(&bmem[1], 1024*4);
int align_lines = MIN(MIN(query_read_row, target_read_row), info_read_row);
// open_qti_files();
// open_debug_files();
/* convert query sequence for sse or avx2 */
fprintf(stderr, "excute nums: %d\n", align_lines);
ALIGN_PERFORMANCE_TEST(0, 2, align_sse_i16, 0, align_lines);
ALIGN_PERFORMANCE_TEST(1, 1, align_sse_u8, 0, align_lines);
ALIGN_PERFORMANCE_TEST_AVX2(2, 2, align_avx2_i16, 0, align_lines);
ALIGN_PERFORMANCE_TEST_AVX2(3, 1, align_avx2_u8, 0, align_lines);
#if 0
// compare the score2 of i16 and u8
{
int i;
kswr_t si16, su8;
for (i = 0; i < align_lines; ++i)
{
kswq_sse_t *qi16 = aln_sse_qinit(&bmem[0], 2, kv_A(kv_A(i_arr, i), 0), kv_A(q_arr, i).a, 5, mat);
si16 = align_sse_i16(&bmem[1], qi16, kv_A(kv_A(i_arr, i), 1), kv_A(t_arr, i).a, 6, 1, 6, 1, xtra);
byte_mem_clear(&bmem[0]);
fprintf(stderr, "split\n\n");
kswq_sse_t *qu8 = aln_sse_qinit(&bmem[0], 1, kv_A(kv_A(i_arr, i), 0), kv_A(q_arr, i).a, 5, mat);
su8 = align_sse_u8(&bmem[1], qu8, kv_A(kv_A(i_arr, i), 1), kv_A(t_arr, i).a, 6, 1, 6, 1, xtra);
byte_mem_clear(&bmem[0]);
if (si16.score2 != su8.score2) {
fprintf(stderr, "%d %d\n", si16.score2, su8.score2);
// write_query_target_sequence(kv_A(kv_A(i_arr, i), 0), kv_A(q_arr, i).a, kv_A(kv_A(i_arr, i), 1), kv_A(t_arr, i).a, 0, 0);
}
}
}
#endif
#if 1
int i = 0;
for (; i < ARRAY_SIZE(kernel_prof_idx); ++i)
{
gdata[kernel_prof_idx[i]] = score_total[i];
}
#ifdef SHOW_PERF
fprintf(stderr, "[align avx i16] time: %9.6lf s; score: %ld\n", gprof[G_ALN_I16] / TIME_DIVIDE_BY, gdata[G_ALN_I16]);
fprintf(stderr, "[align avx u8 ] time: %9.6lf s; score: %ld\n", gprof[G_ALN_U8] / TIME_DIVIDE_BY, gdata[G_ALN_U8]);
fprintf(stderr, "[align avx2 i16] time: %9.6lf s; score: %ld\n", gprof[G_ALN_AVX2_I16] / TIME_DIVIDE_BY, gdata[G_ALN_AVX2_I16]);
fprintf(stderr, "[align avx2 u8 ] time: %9.6lf s; score: %ld\n", gprof[G_ALN_AVX2_U8] / TIME_DIVIDE_BY, gdata[G_ALN_AVX2_U8]);
#endif
#endif
// close_files();
fclose(qfp);
fclose(tfp);
fclose(ifp);
return 0;
}