完成avx2的align代码

This commit is contained in:
zzh 2025-09-18 15:09:48 +08:00
parent 845a1156b8
commit 338c56e6ed
19 changed files with 994 additions and 119 deletions

View File

@ -23,6 +23,8 @@
"utility": "c",
"byte_alloc.h": "c",
"debug.h": "c",
"limits": "c"
"limits": "c",
"align.h": "c",
"extend.h": "c"
}
}

276
align.c
View File

@ -0,0 +1,276 @@
/*********************************************************************************************
Description: stripped sw align functions in bwa-mem
Copyright : All right reserved by NCIC.ICT
Author : Zhang Zhonghai
Date : 2024/04/11
***********************************************************************************************/
// u8和i16 score2 结果有不同,不是算法问题,而是边界未清零
#include <stdio.h>
#include <stdlib.h>
#include "byte_alloc.h"
#include "utils.h"
#include "profiling.h"
#include "align.h"
#include "debug.h"
const kswr_t g_defr = {0, -1, -1, -1, -1, -1, -1};
#define ALIGN_PERFORMANCE_TEST(kernel_id, nbyte, func, sp, ep) \
do \
{ \
PROF_START(align); \
int i; \
kswr_t score; \
for (i = sp; i < ep; ++i) \
{ if (kv_A(kv_A(i_arr, i), 0) < 144) continue; \
kswq_sse_t *q = aln_sse_qinit(&bmem[0], nbyte, kv_A(kv_A(i_arr, i), 0), kv_A(q_arr, i).a, 5, mat); \
score = func(&bmem[1], q, \
kv_A(kv_A(i_arr, i), 1), kv_A(t_arr, i).a, \
6, 1, 6, 1, xtra); \
score_total[kernel_id] += score.score; \
byte_mem_clear(&bmem[0]); /* free(q); */ \
} \
PROF_END(gprof[kernel_prof_idx[kernel_id]], align); \
} while (0)
#define ALIGN_PERFORMANCE_TEST_AVX2(kernel_id, nbyte, func, sp, ep) \
do { \
PROF_START(align); \
int i; \
kswr_t score; \
for (i = sp; i < ep; ++i) { \
/*if (kv_A(kv_A(i_arr, i), 0) < 144) \
continue; */ \
kswq_avx2_t *q = aln_avx2_qinit(&bmem[0], nbyte, kv_A(kv_A(i_arr, i), 0), kv_A(q_arr, i).a, 5, mat); \
score = func(&bmem[1], q, kv_A(kv_A(i_arr, i), 1), kv_A(t_arr, i).a, 6, 1, 6, 1, xtra); \
score_total[kernel_id] += score.score; \
byte_mem_clear(&bmem[0]); /* free(q); */ \
} \
PROF_END(gprof[kernel_prof_idx[kernel_id]], align); \
} while (0)
// sse ksw init
/**
* Initialize the query data structure
*
* @param size Number of bytes used to store a score; valid valures are 1 or 2
* @param qlen Length of the query sequence
* @param query Query sequence
* @param m Size of the alphabet
* @param mat Scoring matrix in a one-dimension array
*
* @return Query data structure
*/
kswq_sse_t *aln_sse_qinit(byte_mem_t *bmem, int size, int qlen, const uint8_t *query, int m, const int8_t *mat)
{
kswq_sse_t *q;
int slen, a, tmp, p, memsize;
size = size > 1? 2 : 1;
p = 8 * (3 - size); // # values per __m128i
qlen = 144;
slen = (qlen + p - 1) / p; // segmented length
memsize = sizeof(kswq_sse_t) + 256 + 16 * slen * (m + 4);
q = (kswq_sse_t *)malloc(memsize); // a single block of memory
//q = (kswq_sse_t *)byte_mem_request_and_clean(bmem, memsize);
//q = (kswq_sse_t *)byte_mem_request(bmem, memsize);
q->qp = (__m128i *)(((size_t)q + sizeof(kswq_sse_t) + 15) >> 4 << 4); // align memory
q->H0 = q->qp + slen * m;
q->H1 = q->H0 + slen;
q->E = q->H1 + slen;
q->Hmax = q->E + slen;
q->slen = slen; q->qlen = qlen; q->size = size;
//fprintf(stderr, "%d %d\n", slen, qlen);
// compute shift
tmp = m * m;
for (a = 0, q->shift = 127, q->mdiff = 0; a < tmp; ++a) { // find the minimum and maximum score
if (mat[a] < (int8_t)q->shift) q->shift = mat[a];
if (mat[a] > (int8_t)q->mdiff) q->mdiff = mat[a];
}
q->max = q->mdiff;
q->shift = 256 - q->shift; // NB: q->shift is uint8_t
q->mdiff += q->shift; // this is the difference between the min and max scores
// An example: p=8, qlen=19, slen=3 and segmentation:
// {{0,3,6,9,12,15,18,-1},{1,4,7,10,13,16,-1,-1},{2,5,8,11,14,17,-1,-1}}
if (size == 1) {
int8_t *t = (int8_t*)q->qp;
for (a = 0; a < m; ++a) {
int i, k, nlen = slen * p;
const int8_t *ma = mat + a * m;
for (i = 0; i < slen; ++i)
for (k = i; k < nlen; k += slen) // p iterations
*t++ = (k >= qlen? 0 : ma[query[k]]) + q->shift;
}
} else {
int16_t *t = (int16_t*)q->qp;
for (a = 0; a < m; ++a) {
int i, k, nlen = slen * p;
const int8_t *ma = mat + a * m;
for (i = 0; i < slen; ++i)
for (k = i; k < nlen; k += slen) // p iterations
*t++ = (k >= qlen? 0 : ma[query[k]]);
}
}
return q;
}
/**
* Initialize the query data structure
*
* @param size Number of bytes used to store a score; valid valures are 1 or 2
* @param qlen Length of the query sequence
* @param query Query sequence
* @param m Size of the alphabet (ACGTN)
* @param mat Scoring matrix in a one-dimension array
*
* @return Query data structure
*/
kswq_avx2_t *aln_avx2_qinit(byte_mem_t *bmem, int size, int qlen, const uint8_t *query, int m, const int8_t *mat) {
kswq_avx2_t *q;
int slen, a, tmp, p;
size = size > 1 ? 2 : 1;
p = 16 * (3 - size); // # values per __m256ii16是16个u8是32个
slen = (qlen + p - 1) / p; // segmented length
q = (kswq_avx2_t *)malloc(sizeof(kswq_avx2_t) + 512 + 32 * slen * (m + 4)); // a single block of memory
q->qp = (__m256i *)(((size_t)q + sizeof(kswq_avx2_t) + 31) >> 5 << 5); // align memory32字节对齐
q->H0 = q->qp + slen * m;
q->H1 = q->H0 + slen;
q->E = q->H1 + slen;
q->Hmax = q->E + slen;
q->slen = slen;
q->qlen = qlen;
q->size = size;
// compute shift
tmp = m * m;
for (a = 0, q->shift = 127, q->mdiff = 0; a < tmp; ++a) { // find the minimum and maximum score
if (mat[a] < (int8_t)q->shift)
q->shift = mat[a];
if (mat[a] > (int8_t)q->mdiff)
q->mdiff = mat[a];
}
q->max = q->mdiff; // (=1)
q->shift = 256 - q->shift; // NB: q->shift is uint8_t // 主要用来处理uint8_t类型数据,保证score不小于0i16不用 =4
q->mdiff += q->shift; // this is the difference between the min and max scores =1
// An example: p=8, qlen=19, slen=3 and segmentation:
// {{0,3,6,9,12,15,18,-1},{1,4,7,10,13,16,-1,-1},{2,5,8,11,14,17,-1,-1}}
if (size == 1) {
int8_t *t = (int8_t *)q->qp;
for (a = 0; a < m; ++a) {
int i, k, nlen = slen * p;
const int8_t *ma = mat + a * m;
for (i = 0; i < slen; ++i)
for (k = i; k < nlen; k += slen) // p iterations
*t++ = (k >= qlen ? 0 : ma[query[k]]) + q->shift;
}
} else {
int16_t *t = (int16_t *)q->qp;
for (a = 0; a < m; ++a) {
int i, k, nlen = slen * p;
const int8_t *ma = mat + a * m;
for (i = 0; i < slen; ++i)
for (k = i; k < nlen; k += slen) // p iterations
*t++ = (k >= qlen ? 0 : ma[query[k]]);
}
}
return q;
}
/******
* query.fa, target.fa, info.txt
* query.fa: queryACGTN
* target.fa: referencetargetACGTN
* info.txt: querytargetalign
*/
int main_align(int argc, char *argv[])
{
if (argc < 3)
{
fprintf(stderr, "Need 3 files: query, target, info.\n");
return -1;
}
const char *qf_path = argv[0];
const char *tf_path = argv[1];
const char *if_path = argv[2];
FILE *qfp = fopen(qf_path, "r");
FILE *tfp = fopen(tf_path, "r");
FILE *ifp = fopen(if_path, "r");
buf_t read_buf = {0};
seq_v q_arr = {0};
seq_v t_arr = {0};
qti_v i_arr = {0};
uint64_t score_total[ALIGN_FUNC_NUM] = {0};
const int kmax_row = 3000000;
/* read input files */
int query_read_row = read_seq(&q_arr, &read_buf, kmax_row, qfp);
int target_read_row = read_seq(&t_arr, &read_buf, kmax_row, tfp);
int info_read_row = read_qt_info(&i_arr, &read_buf, kmax_row, 2, ifp);
// fprintf(stderr, "read row: %d\t%d\t%d\n", query_read_row, target_read_row, info_read_row);
/* init parameters */
int8_t mat[25] = {1, -4, -4, -4, -1,
-4, 1, -4, -4, -1,
-4, -4, 1, -4, -1,
-4, -4, -4, 1, -1,
-1, -1, -1, -1, -1};
int xtra = 851987;
int kernel_prof_idx[] = {G_ALN_I16, G_ALN_U8, G_ALN_AVX2_I16, G_ALN_AVX2_U8};
byte_mem_t bmem[2] = {{0}, {0}};
byte_mem_init_alloc(&bmem[0], 1024*4);
byte_mem_init_alloc(&bmem[1], 1024*4);
int align_lines = MIN(MIN(query_read_row, target_read_row), info_read_row);
// open_qti_files();
// open_debug_files();
/* convert query sequence for sse or avx2 */
fprintf(stderr, "excute nums: %d\n", align_lines);
ALIGN_PERFORMANCE_TEST(0, 2, align_sse_i16, 0, align_lines);
ALIGN_PERFORMANCE_TEST(1, 1, align_sse_u8, 0, align_lines);
ALIGN_PERFORMANCE_TEST_AVX2(2, 2, align_avx2_i16, 0, align_lines);
ALIGN_PERFORMANCE_TEST_AVX2(3, 1, align_avx2_u8, 0, align_lines);
#if 0
// compare the score2 of i16 and u8
{
int i;
kswr_t si16, su8;
for (i = 0; i < align_lines; ++i)
{
kswq_sse_t *qi16 = aln_sse_qinit(&bmem[0], 2, kv_A(kv_A(i_arr, i), 0), kv_A(q_arr, i).a, 5, mat);
si16 = align_sse_i16(&bmem[1], qi16, kv_A(kv_A(i_arr, i), 1), kv_A(t_arr, i).a, 6, 1, 6, 1, xtra);
byte_mem_clear(&bmem[0]);
fprintf(stderr, "split\n\n");
kswq_sse_t *qu8 = aln_sse_qinit(&bmem[0], 1, kv_A(kv_A(i_arr, i), 0), kv_A(q_arr, i).a, 5, mat);
su8 = align_sse_u8(&bmem[1], qu8, kv_A(kv_A(i_arr, i), 1), kv_A(t_arr, i).a, 6, 1, 6, 1, xtra);
byte_mem_clear(&bmem[0]);
if (si16.score2 != su8.score2) {
fprintf(stderr, "%d %d\n", si16.score2, su8.score2);
// write_query_target_sequence(kv_A(kv_A(i_arr, i), 0), kv_A(q_arr, i).a, kv_A(kv_A(i_arr, i), 1), kv_A(t_arr, i).a, 0, 0);
}
}
}
#endif
#if 1
int i = 0;
for (; i < ARRAY_SIZE(kernel_prof_idx); ++i)
{
gdata[kernel_prof_idx[i]] = score_total[i];
}
#ifdef SHOW_PERF
fprintf(stderr, "[align sse i16] time: %9.6lf s; score: %ld\n", gprof[G_ALN_I16] / TIME_DIVIDE_BY, gdata[G_ALN_I16]);
fprintf(stderr, "[align sse u8 ] time: %9.6lf s; score: %ld\n", gprof[G_ALN_U8] / TIME_DIVIDE_BY, gdata[G_ALN_U8]);
fprintf(stderr, "[align avx2 i16] time: %9.6lf s; score: %ld\n", gprof[G_ALN_AVX2_I16] / TIME_DIVIDE_BY, gdata[G_ALN_AVX2_I16]);
fprintf(stderr, "[align avx2 u8 ] time: %9.6lf s; score: %ld\n", gprof[G_ALN_AVX2_U8] / TIME_DIVIDE_BY, gdata[G_ALN_AVX2_U8]);
#endif
#endif
// close_files();
fclose(qfp);
fclose(tfp);
fclose(ifp);
return 0;
}

49
align.h
View File

@ -1,15 +1,56 @@
/*********************************************************************************************
Description: common defines and declarations
Description: stripped sw align functions in bwa-mem
Copyright : All right reserved by NCIC.ICT
Author : Zhang Zhonghai
Date : 2023/08/26
Date : 2024/04/11
***********************************************************************************************/
#ifndef __COMMON_H
#define __COMMON_H
#ifndef __ALIGN_H
#define __ALIGN_H
#include <stdio.h>
#include <emmintrin.h>
#include <immintrin.h>
#include "byte_alloc.h"
#define KSW_XBYTE 0x10000
#define KSW_XSTOP 0x20000
#define KSW_XSUBO 0x40000
#define KSW_XSTART 0x80000
typedef struct
{
int score; // best score
int te, qe; // target end and query end
int score2, te2; // second best score and ending position on the target
int tb, qb; // target start and query start
} kswr_t;
extern const kswr_t g_defr;
typedef struct _kswq_t
{
int qlen, slen;
uint8_t shift, mdiff, max, size;
__m128i *qp, *H0, *H1, *E, *Hmax;
} kswq_sse_t;
typedef struct _kswq_avx2_t {
int qlen, slen;
uint8_t shift, mdiff, max, size;
__m256i *qp, *H0, *H1, *E, *Hmax;
} kswq_avx2_t;
#define ALIGN_FUNC_NUM 4
kswq_sse_t *aln_sse_qinit(byte_mem_t *bmem, int size, int qlen, const uint8_t *query, int m, const int8_t *mat);
kswr_t align_sse_i16(byte_mem_t *bmem, kswq_sse_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins, int _e_ins, int xtra); // the first gap costs -(_o+_e)
kswr_t align_sse_u8(byte_mem_t *bmem, kswq_sse_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins, int _e_ins, int xtra);
kswq_avx2_t *aln_avx2_qinit(byte_mem_t *bmem, int size, int qlen, const uint8_t *query, int m, const int8_t *mat);
kswr_t align_avx2_u8(byte_mem_t *bmem, kswq_avx2_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins,
int _e_ins, int xtra);
kswr_t align_avx2_i16(byte_mem_t *bmem, kswq_avx2_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins,
int _e_ins, int xtra);
int main_align(int argc, char *argv[]);
#endif

View File

@ -0,0 +1,137 @@
#include <stdint.h>
#include <stdlib.h>
#include "align.h"
#include "utils.h"
kswr_t align_avx2_i16(byte_mem_t *bmem, kswq_avx2_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins,
int _e_ins, int xtra) {
int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc;
uint64_t *b;
__m256i zero, oe_del, e_del, oe_ins, e_ins, *H0, *H1, *E, *Hmax;
kswr_t r;
#define __max_16(ret, xx) \
do { \
int16_t *maxVal = (int16_t *)&(xx); \
(xx) = _mm256_max_epi16((xx), _mm256_srli_si256((xx), 8)); \
(xx) = _mm256_max_epi16((xx), _mm256_srli_si256((xx), 4)); \
(xx) = _mm256_max_epi16((xx), _mm256_srli_si256((xx), 2)); \
(ret) = MAX(maxVal[0], maxVal[8]); \
} while (0)
// Given entries all either 0x0000 or 0xffff, return whether they are all 0x0000
#define allzero_16(xx) (!_mm256_movemask_epi8((xx)))
#define avx2_slli_i16(xx, imm) \
do { \
int16_t *arr = (int16_t *)&(xx); \
int16_t val = arr[7]; \
(xx) = _mm256_slli_si256((xx), (imm)); \
arr[8] = val; \
} while (0)
// initialization
r = g_defr;
minsc = (xtra & KSW_XSUBO) ? xtra & 0xffff : 0x10000;
endsc = (xtra & KSW_XSTOP) ? xtra & 0xffff : 0x10000;
m_b = n_b = 0;
b = 0;
zero = _mm256_set1_epi32(0);
oe_del = _mm256_set1_epi16(_o_del + _e_del);
e_del = _mm256_set1_epi16(_e_del);
oe_ins = _mm256_set1_epi16(_o_ins + _e_ins);
e_ins = _mm256_set1_epi16(_e_ins);
H0 = q->H0;
H1 = q->H1;
E = q->E;
Hmax = q->Hmax;
slen = q->slen;
for (i = 0; i < slen; ++i) {
_mm256_store_si256(E + i, zero);
_mm256_store_si256(H0 + i, zero);
_mm256_store_si256(Hmax + i, zero);
}
// the core loop
for (i = 0; i < tlen; ++i) {
int j, k, imax;
__m256i e, t, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
h = _mm256_load_si256(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
avx2_slli_i16(h, 2);
for (j = 0; LIKELY(j < slen); ++j) {
h = _mm256_adds_epi16(h, _mm256_load_si256(S++));
e = _mm256_load_si256(E + j);
h = _mm256_max_epi16(h, e);
h = _mm256_max_epi16(h, f);
max = _mm256_max_epi16(max, h);
_mm256_store_si256(H1 + j, h);
e = _mm256_subs_epu16(e, e_del);
t = _mm256_subs_epu16(h, oe_del);
e = _mm256_max_epi16(e, t);
_mm256_store_si256(E + j, e);
f = _mm256_subs_epu16(f, e_ins);
t = _mm256_subs_epu16(h, oe_ins);
f = _mm256_max_epi16(f, t);
h = _mm256_load_si256(H0 + j);
}
for (k = 0; LIKELY(k < 16); ++k) {
avx2_slli_i16(f, 2);
for (j = 0; LIKELY(j < slen); ++j) {
h = _mm256_load_si256(H1 + j);
h = _mm256_max_epi16(h, f);
_mm256_store_si256(H1 + j, h);
h = _mm256_subs_epu16(h, oe_ins);
f = _mm256_subs_epu16(f, e_ins);
if (UNLIKELY(allzero_16(_mm256_cmpgt_epi16(f, h))))
goto end_loop16;
}
}
end_loop16:
__max_16(imax, max);
if (imax >= minsc) {
if (n_b == 0 || (int32_t)b[n_b - 1] + 1 != i) {
if (n_b == m_b) {
m_b = m_b ? m_b << 1 : 8;
b = (uint64_t *)realloc(b, 8 * m_b);
}
b[n_b++] = (uint64_t)imax << 32 | i;
} else if ((int)(b[n_b - 1] >> 32) < imax)
b[n_b - 1] = (uint64_t)imax << 32 | i; // modify the last
}
if (imax > gmax) {
gmax = imax;
te = i;
for (j = 0; LIKELY(j < slen); ++j) _mm256_store_si256(Hmax + j, _mm256_load_si256(H1 + j));
if (gmax >= endsc)
break;
}
S = H1;
H1 = H0;
H0 = S;
}
r.score = gmax;
r.te = te;
{
int max = -1, tmp, low, high, qlen = slen * 16;
uint16_t *t = (uint16_t *)Hmax;
for (i = 0, r.qe = -1; i < qlen; ++i, ++t)
if ((int)*t > max)
max = *t, r.qe = i / 16 + i % 16 * slen;
else if ((int)*t == max && (tmp = i / 16 + i % 16 * slen) < r.qe)
r.qe = tmp;
if (b) {
i = (r.score + q->max - 1) / q->max;
low = te - i;
high = te + i;
for (i = 0; i < n_b; ++i) {
int e = (int32_t)b[i];
if ((e < low || e > high) && (int)(b[i] >> 32) > r.score2)
r.score2 = b[i] >> 32, r.te2 = e;
}
}
}
free(b);
return r;
}

View File

@ -0,0 +1,154 @@
#include <stdint.h>
#include <stdlib.h>
#include "align.h"
#include "utils.h"
kswr_t align_avx2_u8(byte_mem_t *bmem, kswq_avx2_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins,
int _e_ins, int xtra) // the first gap costs -(_o+_e)
{
int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc;
uint64_t *b;
__m256i zero, oe_del, e_del, oe_ins, e_ins, shift, *H0, *H1, *E, *Hmax;
kswr_t r;
#define __max_32(ret, xx) \
do { \
uint8_t *maxVal = (uint8_t *)&(xx); \
(xx) = _mm256_max_epu8((xx), _mm256_srli_si256((xx), 8)); \
(xx) = _mm256_max_epu8((xx), _mm256_srli_si256((xx), 4)); \
(xx) = _mm256_max_epu8((xx), _mm256_srli_si256((xx), 2)); \
(xx) = _mm256_max_epu8((xx), _mm256_srli_si256((xx), 1)); \
(ret) = MAX(maxVal[0], maxVal[16]); \
} while (0)
// Given entries with arbitrary values, return whether they are all 0x00
#define allzero_32(xx) (_mm256_movemask_epi8(_mm256_cmpeq_epi8((xx), zero)) == 0xffffffff)
#define avx2_slli_wrap(xx, imm) \
do { \
uint8_t *arr = (uint8_t *)&(xx); \
uint8_t val = arr[15]; \
(xx) = _mm256_slli_si256((xx), (imm)); \
arr[16] = val; \
} while (0)
// initialization
r = g_defr;
minsc = (xtra & KSW_XSUBO) ? xtra & 0xffff : 0x10000;
endsc = (xtra & KSW_XSTOP) ? xtra & 0xffff : 0x10000;
m_b = n_b = 0;
b = 0;
zero = _mm256_set1_epi32(0);
oe_del = _mm256_set1_epi8(_o_del + _e_del);
e_del = _mm256_set1_epi8(_e_del);
oe_ins = _mm256_set1_epi8(_o_ins + _e_ins);
e_ins = _mm256_set1_epi8(_e_ins);
shift = _mm256_set1_epi8(q->shift);
H0 = q->H0;
H1 = q->H1;
E = q->E;
Hmax = q->Hmax;
slen = q->slen;
for (i = 0; i < slen; ++i) {
_mm256_store_si256(E + i, zero);
_mm256_store_si256(H0 + i, zero);
_mm256_store_si256(Hmax + i, zero);
}
// the core loop
for (i = 0; i < tlen; ++i) {
int j, k, imax;
__m256i e, h, t, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
h = _mm256_load_si256(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
avx2_slli_wrap(h, 1); // h=H(i-1,-1); << instead of >> because x64 is little-endian
for (j = 0; LIKELY(j < slen); ++j) {
/* SW cells are computed in the following order:
* H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)}
* E(i+1,j) = max{H(i,j)-q, E(i,j)-r}
* F(i,j+1) = max{H(i,j)-q, F(i,j)-r}
*/
// compute H'(i,j); note that at the beginning, h=H'(i-1,j-1)
h = _mm256_adds_epu8(h, _mm256_load_si256(S + j));
h = _mm256_subs_epu8(h, shift); // h=H'(i-1,j-1)+S(i,j)
e = _mm256_load_si256(E + j); // e=E'(i,j)
h = _mm256_max_epu8(h, e);
h = _mm256_max_epu8(h, f); // h=H'(i,j)
max = _mm256_max_epu8(max, h); // set max
_mm256_store_si256(H1 + j, h); // save to H'(i,j)
// now compute E'(i+1,j)
e = _mm256_subs_epu8(e, e_del); // e=E'(i,j) - e_del
t = _mm256_subs_epu8(h, oe_del); // h=H'(i,j) - o_del - e_del
e = _mm256_max_epu8(e, t); // e=E'(i+1,j)
_mm256_store_si256(E + j, e); // save to E'(i+1,j)
// now compute F'(i,j+1)
f = _mm256_subs_epu8(f, e_ins);
t = _mm256_subs_epu8(h, oe_ins); // h=H'(i,j) - o_ins - e_ins
f = _mm256_max_epu8(f, t);
// get H'(i-1,j) and prepare for the next j
h = _mm256_load_si256(H0 + j); // h=H'(i-1,j)
}
// NB: we do not need to set E(i,j) as we disallow adjecent insertion and then deletion
for (k = 0; LIKELY(k < 32); ++k) { // this block mimics SWPS3; NB: H(i,j) updated in the lazy-F loop cannot exceed max
avx2_slli_wrap(f, 1);
for (j = 0; LIKELY(j < slen); ++j) {
h = _mm256_load_si256(H1 + j);
h = _mm256_max_epu8(h, f); // h=H'(i,j)
_mm256_store_si256(H1 + j, h);
h = _mm256_subs_epu8(h, oe_ins);
f = _mm256_subs_epu8(f, e_ins);
if (UNLIKELY(allzero_32(_mm256_subs_epu8(f, h))))
goto end_loop32;
}
}
end_loop32:
__max_32(imax, max); // imax is the maximum number in max
if (imax >= minsc) { // write the b array; this condition adds branching unfornately
if (n_b == 0 ||
(int32_t)b[n_b - 1] + 1 !=
i) { // then append 新的max用新的数值记录用来记录次优分值不能与最优分值相邻否则就是连续匹配的
if (n_b == m_b) {
m_b = m_b ? m_b << 1 : 8;
b = (uint64_t *)realloc(b, 8 * m_b);
}
b[n_b++] = (uint64_t)imax << 32 | i;
} else if ((int)(b[n_b - 1] >> 32) < imax)
b[n_b - 1] = (uint64_t)imax << 32 | i; // modify the last
}
if (imax > gmax) {
gmax = imax;
te = i; // te is the end position on the target
for (j = 0; LIKELY(j < slen); ++j) // keep the H1 vector
_mm256_store_si256(Hmax + j, _mm256_load_si256(H1 + j));
if (gmax + q->shift >= 255 || gmax >= endsc)
break; // 最大分值溢出了,结束运算
}
S = H1;
H1 = H0;
H0 = S; // swap H0 and H1
}
// fprintf(gf[0], "%d\n", i);
r.score = gmax + q->shift < 255 ? gmax : 255;
r.te = te;
if (r.score != 255) { // get a->qe, the end of query match; find the 2nd best score
int max = -1, tmp, low, high, qlen = slen * 32;
uint8_t *t = (uint8_t *)Hmax;
for (i = 0; i < qlen; ++i, ++t)
if ((int)*t > max)
max = *t, r.qe = i / 32 + i % 32 * slen;
else if ((int)*t == max && (tmp = i / 32 + i % 32 * slen) < r.qe)
r.qe = tmp;
if (b) {
i = (r.score + q->max - 1) / q->max;
low = te - i;
high = te + i;
for (i = 0; i < n_b; ++i) {
int e = (int32_t)b[i];
if ((e < low || e > high) && (int)(b[i] >> 32) > r.score2)
r.score2 = b[i] >> 32, r.te2 = e;
}
}
}
free(b);
return r;
}

View File

@ -0,0 +1,134 @@
/*********************************************************************************************
Description: sw align functions in bwa-mem
Copyright : All right reserved by NCIC.ICT
Author : Zhang Zhonghai
Date : 2024/04/11
***********************************************************************************************/
#include <stdlib.h>
#include <stdint.h>
#include "utils.h"
#include "align.h"
kswr_t align_sse_i16(byte_mem_t *bmem, kswq_sse_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins, int _e_ins, int xtra) // the first gap costs -(_o+_e)
{
int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc;
uint64_t *b;
__m128i zero, oe_del, e_del, oe_ins, e_ins, *H0, *H1, *E, *Hmax;
kswr_t r;
#if defined __SSE2__
#define __max_8(ret, xx) do { \
(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \
(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 4)); \
(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 2)); \
(ret) = _mm_extract_epi16((xx), 0); \
} while (0)
// Given entries all either 0x0000 or 0xffff, return whether they are all 0x0000
#define allzero_0f_8(xx) (!_mm_movemask_epi8((xx)))
#elif defined __ARM_NEON
#define __max_8(ret, xx) (ret) = vmaxvq_s16(vreinterpretq_s16_u8((xx)))
#define allzero_0f_8(xx) (vmaxvq_u16(vreinterpretq_u16_u8((xx))) == 0)
#else
#define __max_8(ret, xx) (ret) = m128i_max_s16((xx))
#define allzero_0f_8(xx) (m128i_allzero((xx)))
#endif
// initialization
r = g_defr;
minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000;
endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000;
m_b = n_b = 0; b = 0;
zero = _mm_set1_epi32(0);
oe_del = _mm_set1_epi16(_o_del + _e_del);
e_del = _mm_set1_epi16(_e_del);
oe_ins = _mm_set1_epi16(_o_ins + _e_ins);
e_ins = _mm_set1_epi16(_e_ins);
H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
slen = q->slen;
for (i = 0; i < slen; ++i) {
_mm_store_si128(E + i, zero);
_mm_store_si128(H0 + i, zero);
_mm_store_si128(Hmax + i, zero);
}
// the core loop
for (i = 0; i < tlen; ++i) {
int j, k, imax;
__m128i e, t, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
h = _mm_slli_si128(h, 2);
for (j = 0; LIKELY(j < slen); ++j) {
h = _mm_adds_epi16(h, _mm_load_si128(S++));
e = _mm_load_si128(E + j);
h = _mm_max_epi16(h, e);
h = _mm_max_epi16(h, f);
max = _mm_max_epi16(max, h);
_mm_store_si128(H1 + j, h);
e = _mm_subs_epu16(e, e_del);
t = _mm_subs_epu16(h, oe_del);
e = _mm_max_epi16(e, t);
_mm_store_si128(E + j, e);
f = _mm_subs_epu16(f, e_ins);
t = _mm_subs_epu16(h, oe_ins);
f = _mm_max_epi16(f, t);
h = _mm_load_si128(H0 + j);
}
for (k = 0; LIKELY(k < 16); ++k) {
f = _mm_slli_si128(f, 2);
for (j = 0; LIKELY(j < slen); ++j) {
h = _mm_load_si128(H1 + j);
h = _mm_max_epi16(h, f);
_mm_store_si128(H1 + j, h);
h = _mm_subs_epu16(h, oe_ins);
f = _mm_subs_epu16(f, e_ins);
if(UNLIKELY(allzero_0f_8(_mm_cmpgt_epi16(f, h)))) goto end_loop8;
}
}
end_loop8:
__max_8(imax, max);
//fprintf(stderr, "%d\n", imax);
if (imax >= minsc) {
//fprintf(stderr, "%d\n", imax);
if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) {
if (n_b == m_b) {
m_b = m_b? m_b<<1 : 8;
b = (uint64_t*)realloc(b, 8 * m_b);
// b = (uint64_t *)byte_mem_request(bmem, 8 * m_b);
}
b[n_b++] = (uint64_t)imax<<32 | i;
} else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last
}
if (imax > gmax) {
gmax = imax; te = i;
for (j = 0; LIKELY(j < slen); ++j)
_mm_store_si128(Hmax + j, _mm_load_si128(H1 + j));
if (gmax >= endsc) break;
}
S = H1; H1 = H0; H0 = S;
}
r.score = gmax; r.te = te;
{
int max = -1, tmp, low, high, qlen = slen * 8;
uint16_t *t = (uint16_t*)Hmax;
for (i = 0, r.qe = -1; i < qlen; ++i, ++t)
if ((int)*t > max) max = *t, r.qe = i / 8 + i % 8 * slen;
else if ((int)*t == max && (tmp = i / 8 + i % 8 * slen) < r.qe) r.qe = tmp;
if (b) {
i = (r.score + q->max - 1) / q->max;
low = te - i; high = te + i;
for (i = 0; i < n_b; ++i) {
int e = (int32_t)b[i];
if ((e < low || e > high) && (int)(b[i]>>32) > r.score2)
r.score2 = b[i]>>32, r.te2 = e;
}
}
}
free(b);
// byte_mem_clear(bmem);
return r;
}

View File

@ -0,0 +1,149 @@
/*********************************************************************************************
Description: sw align functions in bwa-mem
Copyright : All right reserved by NCIC.ICT
Author : Zhang Zhonghai
Date : 2024/04/11
***********************************************************************************************/
#include <stdlib.h>
#include <stdint.h>
#include "utils.h"
#include "align.h"
kswr_t align_sse_u8(byte_mem_t *bmem, kswq_sse_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins, int _e_ins, int xtra) // the first gap costs -(_o+_e)
{
int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc;
uint64_t *b;
__m128i zero, oe_del, e_del, oe_ins, e_ins, shift, *H0, *H1, *E, *Hmax;
kswr_t r;
#if defined __SSE2__
#define __max_16(ret, xx) do { \
(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 8)); \
(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 4)); \
(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 2)); \
(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 1)); \
(ret) = _mm_extract_epi16((xx), 0) & 0x00ff; \
} while (0)
// Given entries with arbitrary values, return whether they are all 0x00
#define allzero_16(xx) (_mm_movemask_epi8(_mm_cmpeq_epi8((xx), zero)) == 0xffff)
#elif defined __ARM_NEON
#define __max_16(ret, xx) (ret) = vmaxvq_u8((xx))
#define allzero_16(xx) (vmaxvq_u8((xx)) == 0)
#else
#define __max_16(ret, xx) (ret) = m128i_max_u8((xx))
#define allzero_16(xx) (m128i_allzero((xx)))
#endif
// initialization
r = g_defr;
minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000;
endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000;
m_b = n_b = 0; b = 0;
zero = _mm_set1_epi32(0);
oe_del = _mm_set1_epi8(_o_del + _e_del);
e_del = _mm_set1_epi8(_e_del);
oe_ins = _mm_set1_epi8(_o_ins + _e_ins);
e_ins = _mm_set1_epi8(_e_ins);
shift = _mm_set1_epi8(q->shift);
H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
slen = q->slen;
for (i = 0; i < slen; ++i) {
_mm_store_si128(E + i, zero);
_mm_store_si128(H0 + i, zero);
_mm_store_si128(Hmax + i, zero);
}
// the core loop
for (i = 0; i < tlen; ++i) {
int j, k, imax;
__m128i e, h, t, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
h = _mm_slli_si128(h, 1); // h=H(i-1,-1); << instead of >> because x64 is little-endian
for (j = 0; LIKELY(j < slen); ++j) {
/* SW cells are computed in the following order:
* H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)}
* E(i+1,j) = max{H(i,j)-q, E(i,j)-r}
* F(i,j+1) = max{H(i,j)-q, F(i,j)-r}
*/
// compute H'(i,j); note that at the beginning, h=H'(i-1,j-1)
h = _mm_adds_epu8(h, _mm_load_si128(S + j));
h = _mm_subs_epu8(h, shift); // h=H'(i-1,j-1)+S(i,j)
e = _mm_load_si128(E + j); // e=E'(i,j)
h = _mm_max_epu8(h, e);
h = _mm_max_epu8(h, f); // h=H'(i,j)
max = _mm_max_epu8(max, h); // set max
_mm_store_si128(H1 + j, h); // save to H'(i,j)
// now compute E'(i+1,j)
e = _mm_subs_epu8(e, e_del); // e=E'(i,j) - e_del
t = _mm_subs_epu8(h, oe_del); // h=H'(i,j) - o_del - e_del
e = _mm_max_epu8(e, t); // e=E'(i+1,j)
_mm_store_si128(E + j, e); // save to E'(i+1,j)
// now compute F'(i,j+1)
f = _mm_subs_epu8(f, e_ins);
t = _mm_subs_epu8(h, oe_ins); // h=H'(i,j) - o_ins - e_ins
f = _mm_max_epu8(f, t);
// get H'(i-1,j) and prepare for the next j
h = _mm_load_si128(H0 + j); // h=H'(i-1,j)
}
// NB: we do not need to set E(i,j) as we disallow adjecent insertion and then deletion
for (k = 0; LIKELY(k < 16); ++k) { // this block mimics SWPS3; NB: H(i,j) updated in the lazy-F loop cannot exceed max
f = _mm_slli_si128(f, 1);
for (j = 0; LIKELY(j < slen); ++j) {
h = _mm_load_si128(H1 + j);
h = _mm_max_epu8(h, f); // h=H'(i,j)
_mm_store_si128(H1 + j, h);
h = _mm_subs_epu8(h, oe_ins);
f = _mm_subs_epu8(f, e_ins);
if (UNLIKELY(allzero_16(_mm_subs_epu8(f, h)))) goto end_loop16;
}
}
end_loop16:
//int k;for (k=0;k<16;++k)printf("%d ", ((uint8_t*)&max)[k]);printf("\n");
__max_16(imax, max); // imax is the maximum number in max
//fprintf(stderr, "%d\n", imax);
if (imax >= minsc) { // write the b array; this condition adds branching unfornately
//fprintf(stderr, "%d\n", imax);
if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { // then append
if (n_b == m_b) {
m_b = m_b? m_b<<1 : 8;
b = (uint64_t*)realloc(b, 8 * m_b);
//b = (uint64_t *)byte_mem_request(bmem, 8 * m_b);
}
b[n_b++] = (uint64_t)imax<<32 | i;
} else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last
}
if (imax > gmax) {
gmax = imax; te = i; // te is the end position on the target
for (j = 0; LIKELY(j < slen); ++j) // keep the H1 vector
_mm_store_si128(Hmax + j, _mm_load_si128(H1 + j));
if (gmax + q->shift >= 255 || gmax >= endsc) break;
}
S = H1; H1 = H0; H0 = S; // swap H0 and H1
}
r.score = gmax + q->shift < 255? gmax : 255;
r.te = te;
if (r.score != 255) { // get a->qe, the end of query match; find the 2nd best score
int max = -1, tmp, low, high, qlen = slen * 16;
uint8_t *t = (uint8_t*)Hmax;
for (i = 0; i < qlen; ++i, ++t)
if ((int)*t > max) max = *t, r.qe = i / 16 + i % 16 * slen;
else if ((int)*t == max && (tmp = i / 16 + i % 16 * slen) < r.qe) r.qe = tmp;
//printf("%d,%d\n", max, gmax);
if (b) {
i = (r.score + q->max - 1) / q->max;
low = te - i; high = te + i;
for (i = 0; i < n_b; ++i) {
int e = (int32_t)b[i];
if ((e < low || e > high) && (int)(b[i]>>32) > r.score2)
r.score2 = b[i]>>32, r.te2 = e;
}
}
}
free(b);
//byte_mem_clear(bmem);
return r;
}

View File

@ -39,31 +39,33 @@ void byte_mem_init_alloc(byte_mem_t *bmem, size_t byte_cnt)
void *byte_mem_request(byte_mem_t *bmem, size_t byte_cnt)
{
// fprintf(stderr, "capacity:%ld, occupied: %ld, byte_cnt: %ld\n", bmem->capacity, bmem->occupied, byte_cnt);
void *ret_mem = 0;
//void *ret_mem = 0;
if (bmem == 0)
{
ret_mem = 0;
// ret_mem = 0;
return 0;
}
else if (bmem->capacity == 0)
{
bmem->capacity = byte_cnt;
bmem->mem = malloc(bmem->capacity);
bmem->occupied = byte_cnt;
ret_mem = bmem->mem;
//ret_mem = bmem->mem;
}
else if (bmem->capacity - bmem->occupied >= byte_cnt)
{
ret_mem = bmem->mem + bmem->occupied;
//ret_mem = bmem->mem + bmem->occupied;
bmem->occupied += byte_cnt;
}
else
{
bmem->capacity = bmem->occupied + byte_cnt;
bmem->mem = realloc(bmem->mem, bmem->capacity);
ret_mem = bmem->mem + bmem->occupied;
//ret_mem = bmem->mem + bmem->occupied;
bmem->occupied += byte_cnt;
}
return ret_mem;
// return ret_mem;
return bmem->mem;
}
void *byte_mem_request_and_clean(byte_mem_t *bmem, size_t byte_cnt)

View File

@ -28,7 +28,7 @@
kv_A(kv_A(i_arr, i), 0), kv_A(q_arr, i).a, \
kv_A(kv_A(i_arr, i), 1), kv_A(t_arr, i).a, \
5, mat, 6, 1, 6, 1, 100, 5, 100, \
kv_A(kv_A(i_arr, i), 2), \
/*kv_A(kv_A(i_arr, i), 2)*/10, \
&qle, &tle, &gtle, &gscore, &max_off[0]); \
score_total[kernel_id] += score; \
} \
@ -60,10 +60,11 @@ int main_extend(int argc, char *argv[])
seq_v t_arr = {0};
qti_v i_arr = {0};
uint64_t score_total[EXTEND_FUNC_NUM] = {0};
const int kmax_row = 3000000;
int query_read_row = read_seq(&q_arr, &read_buf, 3000000, qfp);
int target_read_row = read_seq(&t_arr, &read_buf, 3000000, tfp);
int info_read_row = read_qt_info(&i_arr, &read_buf, 3000000, 3, ifp);
int query_read_row = read_seq(&q_arr, &read_buf, kmax_row, qfp);
int target_read_row = read_seq(&t_arr, &read_buf, kmax_row, tfp);
int info_read_row = read_qt_info(&i_arr, &read_buf, kmax_row, 3, ifp);
// fprintf(stderr, "read row: %d\t%d\t%d\n", query_read_row, target_read_row, info_read_row);
int8_t mat[25] = {1, -4, -4, -4, -1,
@ -84,9 +85,14 @@ int main_extend(int argc, char *argv[])
EXTEND_PERFORMANCE_TEST(2, extend_avx2_u8, 0, excute_lines);
EXTEND_PERFORMANCE_TEST(3, extend_avx2_i16_sp, 0, excute_lines);
int i = 0; for(; i<ARRAY_SIZE(kernel_prof_idx); ++i) { gdata[kernel_prof_idx[i]] = score_total[i]; }
display_extend_stats();
close_files();
#ifdef SHOW_PERF
fprintf(stderr, "[extend scalar ] time: %9.6lf s; score: %ld\n", gprof[G_EXT_SCALAR] / TIME_DIVIDE_BY, gdata[G_EXT_SCALAR]);
fprintf(stderr, "[extend avx i16] time: %9.6lf s; score: %ld\n", gprof[G_EXT_AVX2_I16] / TIME_DIVIDE_BY, gdata[G_EXT_AVX2_I16]);
fprintf(stderr, "[extend avx u8 ] time: %9.6lf s; score: %ld\n", gprof[G_EXT_AVX2_U8] / TIME_DIVIDE_BY, gdata[G_EXT_AVX2_U8]);
fprintf(stderr, "[extend avx sp ] time: %9.6lf s; score: %ld\n", gprof[G_EXT_AVX2_I16_SP] / TIME_DIVIDE_BY, gdata[G_EXT_AVX2_I16_SP]);
#endif
// close_files();
fclose(qfp);
fclose(tfp);
fclose(ifp);

View File

@ -23,19 +23,11 @@
#define ARR_FOR_EQUAL_CHECK 1
#define ELIMINATE_DIFF_3 0
#define PRUNING 1
#define PRUNING 0
#define REVERSE_TARGET 0
#define NO_VAL -1
#ifdef __GNUC__
#define LIKELY(x) __builtin_expect((x),1)
#define UNLIKELY(x) __builtin_expect((x),0)
#else
#define LIKELY(x) (x)
#define UNLIKELY(x) (x)
#endif
#define SIMD_WIDTH 16
static int extend_scalar_inner(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del,
@ -186,27 +178,6 @@ static const uint16_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = {
tmp = fA1; fA1 = fA2; fA2 = tmp; \
tmp = mA1; mA1 = mA2; mA2 = tmp;
static void write_query_target_sequence(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int h0, int fnum)
{
#ifdef DEBUG_FILE_OUTPUT
// 写到三个文件里query.fatarget.fa每行一个序列info.txt包含前缀得分h0和长度信息qlentlen
FILE *query_f = gfq[fnum],
*target_f = gft[fnum],
*info_f = gfi[fnum];
const char seq_map[5] = {'A', 'C', 'G', 'T', 'N'};
int i;
// 处理query
for (i = 0; i < qlen; ++i)
fprintf(query_f, "%c", seq_map[query[i]]);
fprintf(query_f, "\n");
// 处理target
for (i = 0; i < tlen; ++i)
fprintf(target_f, "%c", seq_map[target[i]]);
fprintf(target_f, "\n");
// 处理其他信息
fprintf(info_f, "%-8d%-8d%-8d\n", qlen, tlen, h0);
#endif
}
int extend_avx2_i16(byte_mem_t *bmem,
int qlen, // query length 待匹配段碱基的query长度

View File

@ -27,14 +27,6 @@
#define NO_VAL -1
#ifdef __GNUC__
#define LIKELY(x) __builtin_expect((x),1)
#define UNLIKELY(x) __builtin_expect((x),0)
#else
#define LIKELY(x) (x)
#define UNLIKELY(x) (x)
#endif
#define SIMD_WIDTH 16
static int extend_scalar_inner(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del,
@ -175,28 +167,6 @@ static const uint16_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = {
tmp = fA1; fA1 = fA2; fA2 = tmp; \
tmp = mA1; mA1 = mA2; mA2 = tmp;
static void write_query_target_sequence(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int h0, int fnum)
{
#ifdef DEBUG_FILE_OUTPUT
// 写到三个文件里query.fatarget.fa每行一个序列info.txt包含前缀得分h0和长度信息qlentlen
FILE *query_f = gfq[fnum],
*target_f = gft[fnum],
*info_f = gfi[fnum];
const char seq_map[5] = {'A', 'C', 'G', 'T', 'N'};
int i;
// 处理query
for (i = 0; i < qlen; ++i)
fprintf(query_f, "%c", seq_map[query[i]]);
fprintf(query_f, "\n");
// 处理target
for (i = 0; i < tlen; ++i)
fprintf(target_f, "%c", seq_map[target[i]]);
fprintf(target_f, "\n");
// 处理其他信息
fprintf(info_f, "%-8d%-8d%-8d\n", qlen, tlen, h0);
#endif
}
int extend_avx2_i16_sp(byte_mem_t *bmem,
int qlen, // query length 待匹配段碱基的query长度
const uint8_t *query, // read碱基序列

View File

@ -17,18 +17,10 @@
#include "byte_alloc.h"
#include "utils.h"
#ifdef __GNUC__
#define LIKELY(x) __builtin_expect((x), 1)
#define UNLIKELY(x) __builtin_expect((x), 0)
#else
#define LIKELY(x) (x)
#define UNLIKELY(x) (x)
#endif
#define KSW_EQUAL
#define ARR_FOR_EQUAL_CHECK 1
#define PRUNING 1
#define PRUNING 0
#define REVERSE_TARGET 0
#define SIMD_WIDTH 32
@ -429,23 +421,23 @@ int extend_avx2_u8(byte_mem_t *bmem,
#if ARR_FOR_EQUAL_CHECK
if (hA1[0] < b && checkspecial)
{
int mi;
int mii;
if (hA1[0] == b - 1)
{
icheck = iend + 1;
}
for (mi = 0; mi < b - 1; ++mi)
for (mii = 0; mii < b - 1; ++mii)
{
if (midx - mi > 0)
marr[mi] = MAX(marr[mi], hA2[midx - mi]);
if (midx - mii > 0)
marr[mii] = MAX(marr[mii], hA2[midx - mii]);
}
midx += 1;
if (ibeg > icheck)
{
int stopCalc = 0;
for (mi = 0; mi < b - 1; ++mi)
for (mii = 0; mii < b - 1; ++mii)
{
stopCalc |= !marr[mi];
stopCalc |= !marr[mii];
}
if (stopCalc)
break;

View File

@ -21,7 +21,7 @@
#define UNLIKELY(x) (x)
#endif
#define PRUNING 1
#define PRUNING 0
typedef struct
{
@ -37,8 +37,11 @@ int extend_scalar(byte_mem_t *bmem, int qlen, const uint8_t *query, int tlen, co
assert(h0 > 0);
// qp = malloc(qlen * m);
// eh = calloc(qlen + 1, 8);
qp = byte_mem_request(bmem, qlen * m);
eh = byte_mem_request_and_clean(bmem, (qlen + 1) * 8);
int qp_size = qlen * m;
int eh_size = (qlen + 1) * 8;
qp = byte_mem_request(bmem, qp_size + eh_size);
eh = (eh_t*)((uint8_t*)qp + qp_size);
memset(eh, 0, eh_size);
// generate the query profile
for (k = i = 0; k < m; ++k)
{

13
main.c
View File

@ -15,19 +15,24 @@
#include <sys/time.h>
#include "byte_alloc.h"
#include "utils.h"
#include "extend.h"
#include "profiling.h"
#include "extend.h"
#include "align.h"
#define TEST_EXTEND
#define TEST_EXTEND 0
#define TEST_ALIGN 1
// 程序执行入口
int main(int argc, char *argv[])
{
#ifdef TEST_EXTEND
#if TEST_EXTEND
main_extend(argc - 1, argv + 1);
#endif
#if TEST_ALIGN
main_align(argc - 1, argv + 1);
#endif
return 0;
}

View File

@ -28,17 +28,4 @@ uint64_t get_msec()
// gettimeofday(&tv, NULL);
// return (uint64_t)1000 * (tv.tv_sec + ((1e-6) * tv.tv_usec));
return clock();
}
/* show excution time */
int display_extend_stats()
{
#ifdef SHOW_PERF
fprintf(stderr, "[extend scalar ] time: %9.6lf s; score: %ld\n", gprof[G_EXT_SCALAR] / TIME_DIVIDE_BY, gdata[G_EXT_SCALAR]);
fprintf(stderr, "[extend avx i16] time: %9.6lf s; score: %ld\n", gprof[G_EXT_AVX2_I16] / TIME_DIVIDE_BY, gdata[G_EXT_AVX2_I16]);
fprintf(stderr, "[extend avx u8 ] time: %9.6lf s; score: %ld\n", gprof[G_EXT_AVX2_U8] / TIME_DIVIDE_BY, gdata[G_EXT_AVX2_U8]);
fprintf(stderr, "[extend avx sp ] time: %9.6lf s; score: %ld\n", gprof[G_EXT_AVX2_I16_SP] / TIME_DIVIDE_BY, gdata[G_EXT_AVX2_I16_SP]);
#endif
return 0;
}

View File

@ -8,6 +8,7 @@
#ifndef __PROFILING_H
#define __PROFILING_H
#include <stdio.h>
#include <time.h>
#define TIME_DIVIDE_BY (CLOCKS_PER_SEC * 1.0)
@ -19,7 +20,7 @@ extern FILE *ins_f_arr[LIM_TYPE],
*score_f_arr[LIM_TYPE],
*retval_f_arr[LIM_TYPE];
// GLOBAL
// GLOBAL performance info
enum
{
G_ALL = 0,
@ -27,6 +28,10 @@ enum
G_EXT_AVX2_I16,
G_EXT_AVX2_U8,
G_EXT_AVX2_I16_SP,
G_ALN_I16,
G_ALN_U8,
G_ALN_AVX2_I16,
G_ALN_AVX2_U8
};
#ifdef SHOW_PERF
@ -39,6 +44,4 @@ enum
// get current milli seconds
uint64_t get_msec();
// show excution time
int display_extend_stats();
#endif

14
run.sh
View File

@ -1,6 +1,12 @@
#!/bin/bash
#./sw_perf ./input/diff_ext/q0.txt ./input/diff_ext/t0.txt ./input/diff_ext/i0.txt
./sw_perf ./input/ext/q_0.fa ./input/ext/t_0.fa ./input/ext/i_0.txt
./sw_perf ./input/ext/q_1.fa ./input/ext/t_1.fa ./input/ext/i_1.txt
./sw_perf ./input/ext/q_2.fa ./input/ext/t_2.fa ./input/ext/i_2.txt
./sw_perf ./input/ext/q_3.fa ./input/ext/t_3.fa ./input/ext/i_3.txt
#./sw_perf ./input/ext/q_0.fa ./input/ext/t_0.fa ./input/ext/i_0.txt
#./sw_perf ./input/ext/q_1.fa ./input/ext/t_1.fa ./input/ext/i_1.txt
#./sw_perf ./input/ext/q_2.fa ./input/ext/t_2.fa ./input/ext/i_2.txt
#./sw_perf ./input/ext/q_3.fa ./input/ext/t_3.fa ./input/ext/i_3.txt
#./sw_perf ./input/align/q_0.fa ./input/align/t_0.fa ./input/align/i_0.txt
./sw_perf ./input/align/q_0.fa ./input/align/t_0.fa ./input/align/i_0_ext.txt
# debug
#./sw_perf ./input/diff_align/q0.txt ./input/diff_align/t0.txt ./input/diff_align/i0.txt

26
utils.c
View File

@ -12,6 +12,7 @@
#include <string.h>
#include "utils.h"
#include "debug.h"
unsigned char nst_nt4_table[256] = {
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
@ -112,4 +113,29 @@ int read_seq(seq_v *seq_arr, buf_t *buf, int row_num_to_read, FILE *fp)
++read_row_num;
}
return read_row_num;
}
void write_query_target_sequence(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int h0, int fnum)
{
#ifdef DEBUG_FILE_OUTPUT
// 写到三个文件里query.fatarget.fa每行一个序列info.txt包含前缀得分h0和长度信息qlentlen
FILE *query_f = gfq[fnum],
*target_f = gft[fnum],
*info_f = gfi[fnum];
const char seq_map[5] = {'A', 'C', 'G', 'T', 'N'};
int i;
// 处理query
for (i = 0; i < qlen; ++i)
fprintf(query_f, "%c", seq_map[query[i]]);
fprintf(query_f, "\n");
// 处理target
for (i = 0; i < tlen; ++i)
fprintf(target_f, "%c", seq_map[target[i]]);
fprintf(target_f, "\n");
// 处理其他信息
if (h0 > 0)
fprintf(info_f, "%-8d%-8d%-8d\n", qlen, tlen, h0);
else
fprintf(info_f, "%-8d%-8d\n", qlen, tlen);
#endif
}

11
utils.h
View File

@ -10,6 +10,7 @@
#define __UTILS_H
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include "kvec.h"
@ -18,6 +19,14 @@
#define MAX(x, y) ((x) > (y) ? (x) : (y))
#define MIN(x, y) ((x) < (y) ? (x) : (y))
#ifdef __GNUC__
#define LIKELY(x) __builtin_expect((x), 1)
#define UNLIKELY(x) __builtin_expect((x), 0)
#else
#define LIKELY(x) (x)
#define UNLIKELY(x) (x)
#endif
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
#define BUF_SIZE 1048576
@ -42,4 +51,6 @@ int read_qt_info(qti_v *qti_arr, buf_t *buf, int row_num_to_read, int info_num,
int read_seq(seq_v *seq_arr, buf_t *buf, int row_num_to_read, FILE *fp);
void write_query_target_sequence(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int h0, int fnum);
#endif