2025-09-18 15:09:48 +08:00
/*********************************************************************************************
Description : sw align functions in bwa - mem
Copyright : All right reserved by NCIC . ICT
Author : Zhang Zhonghai
Date : 2024 / 04 / 11
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
# include <stdlib.h>
# include <stdint.h>
# include "utils.h"
# include "align.h"
kswr_t align_sse_u8 ( byte_mem_t * bmem , kswq_sse_t * q , int tlen , const uint8_t * target , int _o_del , int _e_del , int _o_ins , int _e_ins , int xtra ) // the first gap costs -(_o+_e)
{
int slen , i , m_b , n_b , te = - 1 , gmax = 0 , minsc , endsc ;
uint64_t * b ;
__m128i zero , oe_del , e_del , oe_ins , e_ins , shift , * H0 , * H1 , * E , * Hmax ;
kswr_t r ;
# if defined __SSE2__
# define __max_16(ret, xx) do { \
( xx ) = _mm_max_epu8 ( ( xx ) , _mm_srli_si128 ( ( xx ) , 8 ) ) ; \
( xx ) = _mm_max_epu8 ( ( xx ) , _mm_srli_si128 ( ( xx ) , 4 ) ) ; \
( xx ) = _mm_max_epu8 ( ( xx ) , _mm_srli_si128 ( ( xx ) , 2 ) ) ; \
( xx ) = _mm_max_epu8 ( ( xx ) , _mm_srli_si128 ( ( xx ) , 1 ) ) ; \
( ret ) = _mm_extract_epi16 ( ( xx ) , 0 ) & 0x00ff ; \
} while ( 0 )
// Given entries with arbitrary values, return whether they are all 0x00
# define allzero_16(xx) (_mm_movemask_epi8(_mm_cmpeq_epi8((xx), zero)) == 0xffff)
# elif defined __ARM_NEON
# define __max_16(ret, xx) (ret) = vmaxvq_u8((xx))
# define allzero_16(xx) (vmaxvq_u8((xx)) == 0)
# else
# define __max_16(ret, xx) (ret) = m128i_max_u8((xx))
# define allzero_16(xx) (m128i_allzero((xx)))
# endif
// initialization
r = g_defr ;
minsc = ( xtra & KSW_XSUBO ) ? xtra & 0xffff : 0x10000 ;
endsc = ( xtra & KSW_XSTOP ) ? xtra & 0xffff : 0x10000 ;
m_b = n_b = 0 ; b = 0 ;
zero = _mm_set1_epi32 ( 0 ) ;
oe_del = _mm_set1_epi8 ( _o_del + _e_del ) ;
e_del = _mm_set1_epi8 ( _e_del ) ;
oe_ins = _mm_set1_epi8 ( _o_ins + _e_ins ) ;
e_ins = _mm_set1_epi8 ( _e_ins ) ;
shift = _mm_set1_epi8 ( q - > shift ) ;
H0 = q - > H0 ; H1 = q - > H1 ; E = q - > E ; Hmax = q - > Hmax ;
slen = q - > slen ;
for ( i = 0 ; i < slen ; + + i ) {
_mm_store_si128 ( E + i , zero ) ;
_mm_store_si128 ( H0 + i , zero ) ;
_mm_store_si128 ( Hmax + i , zero ) ;
}
// the core loop
for ( i = 0 ; i < tlen ; + + i ) {
int j , k , imax ;
__m128i e , h , t , f = zero , max = zero , * S = q - > qp + target [ i ] * slen ; // s is the 1st score vector
h = _mm_load_si128 ( H0 + slen - 1 ) ; // h={2,5,8,11,14,17,-1,-1} in the above example
h = _mm_slli_si128 ( h , 1 ) ; // h=H(i-1,-1); << instead of >> because x64 is little-endian
for ( j = 0 ; LIKELY ( j < slen ) ; + + j ) {
/* SW cells are computed in the following order:
* H ( i , j ) = max { H ( i - 1 , j - 1 ) + S ( i , j ) , E ( i , j ) , F ( i , j ) }
* E ( i + 1 , j ) = max { H ( i , j ) - q , E ( i , j ) - r }
* F ( i , j + 1 ) = max { H ( i , j ) - q , F ( i , j ) - r }
*/
// compute H'(i,j); note that at the beginning, h=H'(i-1,j-1)
h = _mm_adds_epu8 ( h , _mm_load_si128 ( S + j ) ) ;
h = _mm_subs_epu8 ( h , shift ) ; // h=H'(i-1,j-1)+S(i,j)
e = _mm_load_si128 ( E + j ) ; // e=E'(i,j)
h = _mm_max_epu8 ( h , e ) ;
h = _mm_max_epu8 ( h , f ) ; // h=H'(i,j)
max = _mm_max_epu8 ( max , h ) ; // set max
_mm_store_si128 ( H1 + j , h ) ; // save to H'(i,j)
// now compute E'(i+1,j)
e = _mm_subs_epu8 ( e , e_del ) ; // e=E'(i,j) - e_del
t = _mm_subs_epu8 ( h , oe_del ) ; // h=H'(i,j) - o_del - e_del
e = _mm_max_epu8 ( e , t ) ; // e=E'(i+1,j)
_mm_store_si128 ( E + j , e ) ; // save to E'(i+1,j)
// now compute F'(i,j+1)
f = _mm_subs_epu8 ( f , e_ins ) ;
t = _mm_subs_epu8 ( h , oe_ins ) ; // h=H'(i,j) - o_ins - e_ins
f = _mm_max_epu8 ( f , t ) ;
// get H'(i-1,j) and prepare for the next j
h = _mm_load_si128 ( H0 + j ) ; // h=H'(i-1,j)
}
// NB: we do not need to set E(i,j) as we disallow adjecent insertion and then deletion
for ( k = 0 ; LIKELY ( k < 16 ) ; + + k ) { // this block mimics SWPS3; NB: H(i,j) updated in the lazy-F loop cannot exceed max
f = _mm_slli_si128 ( f , 1 ) ;
for ( j = 0 ; LIKELY ( j < slen ) ; + + j ) {
h = _mm_load_si128 ( H1 + j ) ;
h = _mm_max_epu8 ( h , f ) ; // h=H'(i,j)
_mm_store_si128 ( H1 + j , h ) ;
h = _mm_subs_epu8 ( h , oe_ins ) ;
f = _mm_subs_epu8 ( f , e_ins ) ;
if ( UNLIKELY ( allzero_16 ( _mm_subs_epu8 ( f , h ) ) ) ) goto end_loop16 ;
}
}
end_loop16 :
//int k;for (k=0;k<16;++k)printf("%d ", ((uint8_t*)&max)[k]);printf("\n");
__max_16 ( imax , max ) ; // imax is the maximum number in max
//fprintf(stderr, "%d\n", imax);
if ( imax > = minsc ) { // write the b array; this condition adds branching unfornately
//fprintf(stderr, "%d\n", imax);
if ( n_b = = 0 | | ( int32_t ) b [ n_b - 1 ] + 1 ! = i ) { // then append
if ( n_b = = m_b ) {
m_b = m_b ? m_b < < 1 : 8 ;
b = ( uint64_t * ) realloc ( b , 8 * m_b ) ;
//b = (uint64_t *)byte_mem_request(bmem, 8 * m_b);
}
b [ n_b + + ] = ( uint64_t ) imax < < 32 | i ;
} else if ( ( int ) ( b [ n_b - 1 ] > > 32 ) < imax ) b [ n_b - 1 ] = ( uint64_t ) imax < < 32 | i ; // modify the last
}
if ( imax > gmax ) {
gmax = imax ; te = i ; // te is the end position on the target
for ( j = 0 ; LIKELY ( j < slen ) ; + + j ) // keep the H1 vector
_mm_store_si128 ( Hmax + j , _mm_load_si128 ( H1 + j ) ) ;
if ( gmax + q - > shift > = 255 | | gmax > = endsc ) break ;
}
S = H1 ; H1 = H0 ; H0 = S ; // swap H0 and H1
}
r . score = gmax + q - > shift < 255 ? gmax : 255 ;
r . te = te ;
if ( r . score ! = 255 ) { // get a->qe, the end of query match; find the 2nd best score
int max = - 1 , tmp , low , high , qlen = slen * 16 ;
uint8_t * t = ( uint8_t * ) Hmax ;
for ( i = 0 ; i < qlen ; + + i , + + t )
if ( ( int ) * t > max ) max = * t , r . qe = i / 16 + i % 16 * slen ;
else if ( ( int ) * t = = max & & ( tmp = i / 16 + i % 16 * slen ) < r . qe ) r . qe = tmp ;
//printf("%d,%d\n", max, gmax);
if ( b ) {
i = ( r . score + q - > max - 1 ) / q - > max ;
low = te - i ; high = te + i ;
for ( i = 0 ; i < n_b ; + + i ) {
int e = ( int32_t ) b [ i ] ;
if ( ( e < low | | e > high ) & & ( int ) ( b [ i ] > > 32 ) > r . score2 )
r . score2 = b [ i ] > > 32 , r . te2 = e ;
}
}
}
free ( b ) ;
//byte_mem_clear(bmem);
return r ;
}