2025-09-18 15:09:48 +08:00
|
|
|
|
/*********************************************************************************************
|
|
|
|
|
|
Description: stripped sw align functions in bwa-mem
|
|
|
|
|
|
|
|
|
|
|
|
Copyright : All right reserved by NCIC.ICT
|
|
|
|
|
|
|
|
|
|
|
|
Author : Zhang Zhonghai
|
|
|
|
|
|
Date : 2024/04/11
|
|
|
|
|
|
***********************************************************************************************/
|
|
|
|
|
|
// u8和i16 score2 结果有不同,不是算法问题,而是边界未清零
|
|
|
|
|
|
#include <stdio.h>
|
|
|
|
|
|
#include <stdlib.h>
|
|
|
|
|
|
|
|
|
|
|
|
#include "byte_alloc.h"
|
|
|
|
|
|
#include "utils.h"
|
|
|
|
|
|
#include "profiling.h"
|
|
|
|
|
|
#include "align.h"
|
|
|
|
|
|
#include "debug.h"
|
|
|
|
|
|
|
|
|
|
|
|
const kswr_t g_defr = {0, -1, -1, -1, -1, -1, -1};
|
|
|
|
|
|
|
|
|
|
|
|
#define ALIGN_PERFORMANCE_TEST(kernel_id, nbyte, func, sp, ep) \
|
|
|
|
|
|
do \
|
|
|
|
|
|
{ \
|
|
|
|
|
|
PROF_START(align); \
|
|
|
|
|
|
int i; \
|
|
|
|
|
|
kswr_t score; \
|
|
|
|
|
|
for (i = sp; i < ep; ++i) \
|
|
|
|
|
|
{ if (kv_A(kv_A(i_arr, i), 0) < 144) continue; \
|
|
|
|
|
|
kswq_sse_t *q = aln_sse_qinit(&bmem[0], nbyte, kv_A(kv_A(i_arr, i), 0), kv_A(q_arr, i).a, 5, mat); \
|
|
|
|
|
|
score = func(&bmem[1], q, \
|
|
|
|
|
|
kv_A(kv_A(i_arr, i), 1), kv_A(t_arr, i).a, \
|
|
|
|
|
|
6, 1, 6, 1, xtra); \
|
|
|
|
|
|
score_total[kernel_id] += score.score; \
|
|
|
|
|
|
byte_mem_clear(&bmem[0]); /* free(q); */ \
|
|
|
|
|
|
} \
|
|
|
|
|
|
PROF_END(gprof[kernel_prof_idx[kernel_id]], align); \
|
|
|
|
|
|
} while (0)
|
|
|
|
|
|
|
|
|
|
|
|
#define ALIGN_PERFORMANCE_TEST_AVX2(kernel_id, nbyte, func, sp, ep) \
|
|
|
|
|
|
do { \
|
|
|
|
|
|
PROF_START(align); \
|
|
|
|
|
|
int i; \
|
|
|
|
|
|
kswr_t score; \
|
|
|
|
|
|
for (i = sp; i < ep; ++i) { \
|
|
|
|
|
|
/*if (kv_A(kv_A(i_arr, i), 0) < 144) \
|
|
|
|
|
|
continue; */ \
|
|
|
|
|
|
kswq_avx2_t *q = aln_avx2_qinit(&bmem[0], nbyte, kv_A(kv_A(i_arr, i), 0), kv_A(q_arr, i).a, 5, mat); \
|
|
|
|
|
|
score = func(&bmem[1], q, kv_A(kv_A(i_arr, i), 1), kv_A(t_arr, i).a, 6, 1, 6, 1, xtra); \
|
|
|
|
|
|
score_total[kernel_id] += score.score; \
|
|
|
|
|
|
byte_mem_clear(&bmem[0]); /* free(q); */ \
|
|
|
|
|
|
} \
|
|
|
|
|
|
PROF_END(gprof[kernel_prof_idx[kernel_id]], align); \
|
|
|
|
|
|
} while (0)
|
|
|
|
|
|
|
|
|
|
|
|
// sse ksw init
|
|
|
|
|
|
/**
|
|
|
|
|
|
* Initialize the query data structure
|
|
|
|
|
|
*
|
|
|
|
|
|
* @param size Number of bytes used to store a score; valid valures are 1 or 2
|
|
|
|
|
|
* @param qlen Length of the query sequence
|
|
|
|
|
|
* @param query Query sequence
|
|
|
|
|
|
* @param m Size of the alphabet
|
|
|
|
|
|
* @param mat Scoring matrix in a one-dimension array
|
|
|
|
|
|
*
|
|
|
|
|
|
* @return Query data structure
|
|
|
|
|
|
*/
|
|
|
|
|
|
kswq_sse_t *aln_sse_qinit(byte_mem_t *bmem, int size, int qlen, const uint8_t *query, int m, const int8_t *mat)
|
|
|
|
|
|
{
|
|
|
|
|
|
kswq_sse_t *q;
|
|
|
|
|
|
int slen, a, tmp, p, memsize;
|
|
|
|
|
|
|
|
|
|
|
|
size = size > 1? 2 : 1;
|
|
|
|
|
|
p = 8 * (3 - size); // # values per __m128i
|
|
|
|
|
|
qlen = 144;
|
|
|
|
|
|
slen = (qlen + p - 1) / p; // segmented length
|
|
|
|
|
|
memsize = sizeof(kswq_sse_t) + 256 + 16 * slen * (m + 4);
|
|
|
|
|
|
q = (kswq_sse_t *)malloc(memsize); // a single block of memory
|
|
|
|
|
|
//q = (kswq_sse_t *)byte_mem_request_and_clean(bmem, memsize);
|
|
|
|
|
|
//q = (kswq_sse_t *)byte_mem_request(bmem, memsize);
|
|
|
|
|
|
q->qp = (__m128i *)(((size_t)q + sizeof(kswq_sse_t) + 15) >> 4 << 4); // align memory
|
|
|
|
|
|
q->H0 = q->qp + slen * m;
|
|
|
|
|
|
q->H1 = q->H0 + slen;
|
|
|
|
|
|
q->E = q->H1 + slen;
|
|
|
|
|
|
q->Hmax = q->E + slen;
|
|
|
|
|
|
q->slen = slen; q->qlen = qlen; q->size = size;
|
|
|
|
|
|
//fprintf(stderr, "%d %d\n", slen, qlen);
|
|
|
|
|
|
// compute shift
|
|
|
|
|
|
tmp = m * m;
|
|
|
|
|
|
for (a = 0, q->shift = 127, q->mdiff = 0; a < tmp; ++a) { // find the minimum and maximum score
|
|
|
|
|
|
if (mat[a] < (int8_t)q->shift) q->shift = mat[a];
|
|
|
|
|
|
if (mat[a] > (int8_t)q->mdiff) q->mdiff = mat[a];
|
|
|
|
|
|
}
|
|
|
|
|
|
q->max = q->mdiff;
|
|
|
|
|
|
q->shift = 256 - q->shift; // NB: q->shift is uint8_t
|
|
|
|
|
|
q->mdiff += q->shift; // this is the difference between the min and max scores
|
|
|
|
|
|
// An example: p=8, qlen=19, slen=3 and segmentation:
|
|
|
|
|
|
// {{0,3,6,9,12,15,18,-1},{1,4,7,10,13,16,-1,-1},{2,5,8,11,14,17,-1,-1}}
|
|
|
|
|
|
if (size == 1) {
|
|
|
|
|
|
int8_t *t = (int8_t*)q->qp;
|
|
|
|
|
|
for (a = 0; a < m; ++a) {
|
|
|
|
|
|
int i, k, nlen = slen * p;
|
|
|
|
|
|
const int8_t *ma = mat + a * m;
|
|
|
|
|
|
for (i = 0; i < slen; ++i)
|
|
|
|
|
|
for (k = i; k < nlen; k += slen) // p iterations
|
|
|
|
|
|
*t++ = (k >= qlen? 0 : ma[query[k]]) + q->shift;
|
|
|
|
|
|
}
|
|
|
|
|
|
} else {
|
|
|
|
|
|
int16_t *t = (int16_t*)q->qp;
|
|
|
|
|
|
for (a = 0; a < m; ++a) {
|
|
|
|
|
|
int i, k, nlen = slen * p;
|
|
|
|
|
|
const int8_t *ma = mat + a * m;
|
|
|
|
|
|
for (i = 0; i < slen; ++i)
|
|
|
|
|
|
for (k = i; k < nlen; k += slen) // p iterations
|
|
|
|
|
|
*t++ = (k >= qlen? 0 : ma[query[k]]);
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
return q;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
|
* Initialize the query data structure
|
|
|
|
|
|
*
|
|
|
|
|
|
* @param size Number of bytes used to store a score; valid valures are 1 or 2
|
|
|
|
|
|
* @param qlen Length of the query sequence
|
|
|
|
|
|
* @param query Query sequence
|
|
|
|
|
|
* @param m Size of the alphabet (ACGTN)
|
|
|
|
|
|
* @param mat Scoring matrix in a one-dimension array
|
|
|
|
|
|
*
|
|
|
|
|
|
* @return Query data structure
|
|
|
|
|
|
*/
|
|
|
|
|
|
kswq_avx2_t *aln_avx2_qinit(byte_mem_t *bmem, int size, int qlen, const uint8_t *query, int m, const int8_t *mat) {
|
|
|
|
|
|
kswq_avx2_t *q;
|
|
|
|
|
|
int slen, a, tmp, p;
|
|
|
|
|
|
size = size > 1 ? 2 : 1;
|
|
|
|
|
|
p = 16 * (3 - size); // # values per __m256i,i16是16个,u8是32个
|
|
|
|
|
|
slen = (qlen + p - 1) / p; // segmented length
|
|
|
|
|
|
q = (kswq_avx2_t *)malloc(sizeof(kswq_avx2_t) + 512 + 32 * slen * (m + 4)); // a single block of memory
|
|
|
|
|
|
q->qp = (__m256i *)(((size_t)q + sizeof(kswq_avx2_t) + 31) >> 5 << 5); // align memory,32字节对齐
|
|
|
|
|
|
q->H0 = q->qp + slen * m;
|
|
|
|
|
|
q->H1 = q->H0 + slen;
|
|
|
|
|
|
q->E = q->H1 + slen;
|
|
|
|
|
|
q->Hmax = q->E + slen;
|
|
|
|
|
|
q->slen = slen;
|
|
|
|
|
|
q->qlen = qlen;
|
|
|
|
|
|
q->size = size;
|
|
|
|
|
|
// compute shift
|
|
|
|
|
|
tmp = m * m;
|
|
|
|
|
|
for (a = 0, q->shift = 127, q->mdiff = 0; a < tmp; ++a) { // find the minimum and maximum score
|
|
|
|
|
|
if (mat[a] < (int8_t)q->shift)
|
|
|
|
|
|
q->shift = mat[a];
|
|
|
|
|
|
if (mat[a] > (int8_t)q->mdiff)
|
|
|
|
|
|
q->mdiff = mat[a];
|
|
|
|
|
|
}
|
|
|
|
|
|
q->max = q->mdiff; // (=1)
|
|
|
|
|
|
q->shift = 256 - q->shift; // NB: q->shift is uint8_t // 主要用来处理uint8_t类型数据,保证score不小于0,i16不用 (=4)
|
|
|
|
|
|
q->mdiff += q->shift; // this is the difference between the min and max scores (=1)
|
|
|
|
|
|
// An example: p=8, qlen=19, slen=3 and segmentation:
|
|
|
|
|
|
// {{0,3,6,9,12,15,18,-1},{1,4,7,10,13,16,-1,-1},{2,5,8,11,14,17,-1,-1}}
|
|
|
|
|
|
if (size == 1) {
|
|
|
|
|
|
int8_t *t = (int8_t *)q->qp;
|
|
|
|
|
|
for (a = 0; a < m; ++a) {
|
|
|
|
|
|
int i, k, nlen = slen * p;
|
|
|
|
|
|
const int8_t *ma = mat + a * m;
|
|
|
|
|
|
for (i = 0; i < slen; ++i)
|
|
|
|
|
|
for (k = i; k < nlen; k += slen) // p iterations
|
|
|
|
|
|
*t++ = (k >= qlen ? 0 : ma[query[k]]) + q->shift;
|
|
|
|
|
|
}
|
|
|
|
|
|
} else {
|
|
|
|
|
|
int16_t *t = (int16_t *)q->qp;
|
|
|
|
|
|
for (a = 0; a < m; ++a) {
|
|
|
|
|
|
int i, k, nlen = slen * p;
|
|
|
|
|
|
const int8_t *ma = mat + a * m;
|
|
|
|
|
|
for (i = 0; i < slen; ++i)
|
|
|
|
|
|
for (k = i; k < nlen; k += slen) // p iterations
|
|
|
|
|
|
*t++ = (k >= qlen ? 0 : ma[query[k]]);
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
return q;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/******
|
|
|
|
|
|
* 输入说明:三个文件,query.fa, target.fa, info.txt
|
|
|
|
|
|
* query.fa: 每一行代表一个query序列,由ACGTN组成
|
|
|
|
|
|
* target.fa: 每一行代表一个reference(target)序列,由ACGTN组成
|
|
|
|
|
|
* info.txt: 每一行由两个数字组成,分别代表query序列长度,target序列长度(align不需要初始分数)
|
|
|
|
|
|
*/
|
|
|
|
|
|
int main_align(int argc, char *argv[])
|
|
|
|
|
|
{
|
|
|
|
|
|
if (argc < 3)
|
|
|
|
|
|
{
|
|
|
|
|
|
fprintf(stderr, "Need 3 files: query, target, info.\n");
|
|
|
|
|
|
return -1;
|
|
|
|
|
|
}
|
|
|
|
|
|
const char *qf_path = argv[0];
|
|
|
|
|
|
const char *tf_path = argv[1];
|
|
|
|
|
|
const char *if_path = argv[2];
|
|
|
|
|
|
|
|
|
|
|
|
FILE *qfp = fopen(qf_path, "r");
|
|
|
|
|
|
FILE *tfp = fopen(tf_path, "r");
|
|
|
|
|
|
FILE *ifp = fopen(if_path, "r");
|
|
|
|
|
|
|
|
|
|
|
|
buf_t read_buf = {0};
|
|
|
|
|
|
seq_v q_arr = {0};
|
|
|
|
|
|
seq_v t_arr = {0};
|
|
|
|
|
|
qti_v i_arr = {0};
|
|
|
|
|
|
uint64_t score_total[ALIGN_FUNC_NUM] = {0};
|
|
|
|
|
|
const int kmax_row = 3000000;
|
|
|
|
|
|
|
|
|
|
|
|
/* read input files */
|
|
|
|
|
|
int query_read_row = read_seq(&q_arr, &read_buf, kmax_row, qfp);
|
|
|
|
|
|
int target_read_row = read_seq(&t_arr, &read_buf, kmax_row, tfp);
|
|
|
|
|
|
int info_read_row = read_qt_info(&i_arr, &read_buf, kmax_row, 2, ifp);
|
|
|
|
|
|
// fprintf(stderr, "read row: %d\t%d\t%d\n", query_read_row, target_read_row, info_read_row);
|
|
|
|
|
|
/* init parameters */
|
|
|
|
|
|
int8_t mat[25] = {1, -4, -4, -4, -1,
|
|
|
|
|
|
-4, 1, -4, -4, -1,
|
|
|
|
|
|
-4, -4, 1, -4, -1,
|
|
|
|
|
|
-4, -4, -4, 1, -1,
|
|
|
|
|
|
-1, -1, -1, -1, -1};
|
|
|
|
|
|
int xtra = 851987;
|
|
|
|
|
|
int kernel_prof_idx[] = {G_ALN_I16, G_ALN_U8, G_ALN_AVX2_I16, G_ALN_AVX2_U8};
|
|
|
|
|
|
byte_mem_t bmem[2] = {{0}, {0}};
|
|
|
|
|
|
byte_mem_init_alloc(&bmem[0], 1024*4);
|
|
|
|
|
|
byte_mem_init_alloc(&bmem[1], 1024*4);
|
|
|
|
|
|
int align_lines = MIN(MIN(query_read_row, target_read_row), info_read_row);
|
|
|
|
|
|
// open_qti_files();
|
|
|
|
|
|
// open_debug_files();
|
|
|
|
|
|
/* convert query sequence for sse or avx2 */
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
fprintf(stderr, "excute nums: %d\n", align_lines);
|
|
|
|
|
|
ALIGN_PERFORMANCE_TEST(0, 2, align_sse_i16, 0, align_lines);
|
|
|
|
|
|
ALIGN_PERFORMANCE_TEST(1, 1, align_sse_u8, 0, align_lines);
|
|
|
|
|
|
ALIGN_PERFORMANCE_TEST_AVX2(2, 2, align_avx2_i16, 0, align_lines);
|
|
|
|
|
|
ALIGN_PERFORMANCE_TEST_AVX2(3, 1, align_avx2_u8, 0, align_lines);
|
|
|
|
|
|
#if 0
|
|
|
|
|
|
// compare the score2 of i16 and u8
|
|
|
|
|
|
{
|
|
|
|
|
|
int i;
|
|
|
|
|
|
kswr_t si16, su8;
|
|
|
|
|
|
for (i = 0; i < align_lines; ++i)
|
|
|
|
|
|
{
|
|
|
|
|
|
kswq_sse_t *qi16 = aln_sse_qinit(&bmem[0], 2, kv_A(kv_A(i_arr, i), 0), kv_A(q_arr, i).a, 5, mat);
|
|
|
|
|
|
si16 = align_sse_i16(&bmem[1], qi16, kv_A(kv_A(i_arr, i), 1), kv_A(t_arr, i).a, 6, 1, 6, 1, xtra);
|
|
|
|
|
|
byte_mem_clear(&bmem[0]);
|
|
|
|
|
|
fprintf(stderr, "split\n\n");
|
|
|
|
|
|
kswq_sse_t *qu8 = aln_sse_qinit(&bmem[0], 1, kv_A(kv_A(i_arr, i), 0), kv_A(q_arr, i).a, 5, mat);
|
|
|
|
|
|
su8 = align_sse_u8(&bmem[1], qu8, kv_A(kv_A(i_arr, i), 1), kv_A(t_arr, i).a, 6, 1, 6, 1, xtra);
|
|
|
|
|
|
byte_mem_clear(&bmem[0]);
|
|
|
|
|
|
if (si16.score2 != su8.score2) {
|
|
|
|
|
|
fprintf(stderr, "%d %d\n", si16.score2, su8.score2);
|
|
|
|
|
|
// write_query_target_sequence(kv_A(kv_A(i_arr, i), 0), kv_A(q_arr, i).a, kv_A(kv_A(i_arr, i), 1), kv_A(t_arr, i).a, 0, 0);
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
#endif
|
|
|
|
|
|
#if 1
|
|
|
|
|
|
int i = 0;
|
|
|
|
|
|
for (; i < ARRAY_SIZE(kernel_prof_idx); ++i)
|
|
|
|
|
|
{
|
|
|
|
|
|
gdata[kernel_prof_idx[i]] = score_total[i];
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
#ifdef SHOW_PERF
|
|
|
|
|
|
fprintf(stderr, "[align sse i16] time: %9.6lf s; score: %ld\n", gprof[G_ALN_I16] / TIME_DIVIDE_BY, gdata[G_ALN_I16]);
|
|
|
|
|
|
fprintf(stderr, "[align sse u8 ] time: %9.6lf s; score: %ld\n", gprof[G_ALN_U8] / TIME_DIVIDE_BY, gdata[G_ALN_U8]);
|
|
|
|
|
|
fprintf(stderr, "[align avx2 i16] time: %9.6lf s; score: %ld\n", gprof[G_ALN_AVX2_I16] / TIME_DIVIDE_BY, gdata[G_ALN_AVX2_I16]);
|
|
|
|
|
|
fprintf(stderr, "[align avx2 u8 ] time: %9.6lf s; score: %ld\n", gprof[G_ALN_AVX2_U8] / TIME_DIVIDE_BY, gdata[G_ALN_AVX2_U8]);
|
|
|
|
|
|
#endif
|
|
|
|
|
|
#endif
|
|
|
|
|
|
// close_files();
|
|
|
|
|
|
fclose(qfp);
|
|
|
|
|
|
fclose(tfp);
|
|
|
|
|
|
fclose(ifp);
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
}
|