2025-09-18 15:09:48 +08:00
/*********************************************************************************************
Description : sw align functions in bwa - mem
Copyright : All right reserved by NCIC . ICT
Author : Zhang Zhonghai
Date : 2024 / 04 / 11
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
# include <stdlib.h>
# include <stdint.h>
# include "utils.h"
# include "align.h"
kswr_t align_sse_i16 ( byte_mem_t * bmem , kswq_sse_t * q , int tlen , const uint8_t * target , int _o_del , int _e_del , int _o_ins , int _e_ins , int xtra ) // the first gap costs -(_o+_e)
{
int slen , i , m_b , n_b , te = - 1 , gmax = 0 , minsc , endsc ;
uint64_t * b ;
__m128i zero , oe_del , e_del , oe_ins , e_ins , * H0 , * H1 , * E , * Hmax ;
kswr_t r ;
# if defined __SSE2__
# define __max_8(ret, xx) do { \
( xx ) = _mm_max_epi16 ( ( xx ) , _mm_srli_si128 ( ( xx ) , 8 ) ) ; \
( xx ) = _mm_max_epi16 ( ( xx ) , _mm_srli_si128 ( ( xx ) , 4 ) ) ; \
( xx ) = _mm_max_epi16 ( ( xx ) , _mm_srli_si128 ( ( xx ) , 2 ) ) ; \
( ret ) = _mm_extract_epi16 ( ( xx ) , 0 ) ; \
} while ( 0 )
// Given entries all either 0x0000 or 0xffff, return whether they are all 0x0000
# define allzero_0f_8(xx) (!_mm_movemask_epi8((xx)))
# elif defined __ARM_NEON
# define __max_8(ret, xx) (ret) = vmaxvq_s16(vreinterpretq_s16_u8((xx)))
# define allzero_0f_8(xx) (vmaxvq_u16(vreinterpretq_u16_u8((xx))) == 0)
# else
# define __max_8(ret, xx) (ret) = m128i_max_s16((xx))
# define allzero_0f_8(xx) (m128i_allzero((xx)))
# endif
// initialization
r = g_defr ;
minsc = ( xtra & KSW_XSUBO ) ? xtra & 0xffff : 0x10000 ;
endsc = ( xtra & KSW_XSTOP ) ? xtra & 0xffff : 0x10000 ;
m_b = n_b = 0 ; b = 0 ;
zero = _mm_set1_epi32 ( 0 ) ;
oe_del = _mm_set1_epi16 ( _o_del + _e_del ) ;
e_del = _mm_set1_epi16 ( _e_del ) ;
oe_ins = _mm_set1_epi16 ( _o_ins + _e_ins ) ;
e_ins = _mm_set1_epi16 ( _e_ins ) ;
H0 = q - > H0 ; H1 = q - > H1 ; E = q - > E ; Hmax = q - > Hmax ;
slen = q - > slen ;
for ( i = 0 ; i < slen ; + + i ) {
_mm_store_si128 ( E + i , zero ) ;
_mm_store_si128 ( H0 + i , zero ) ;
_mm_store_si128 ( Hmax + i , zero ) ;
}
// the core loop
for ( i = 0 ; i < tlen ; + + i ) {
int j , k , imax ;
__m128i e , t , h , f = zero , max = zero , * S = q - > qp + target [ i ] * slen ; // s is the 1st score vector
h = _mm_load_si128 ( H0 + slen - 1 ) ; // h={2,5,8,11,14,17,-1,-1} in the above example
h = _mm_slli_si128 ( h , 2 ) ;
for ( j = 0 ; LIKELY ( j < slen ) ; + + j ) {
h = _mm_adds_epi16 ( h , _mm_load_si128 ( S + + ) ) ;
e = _mm_load_si128 ( E + j ) ;
h = _mm_max_epi16 ( h , e ) ;
h = _mm_max_epi16 ( h , f ) ;
max = _mm_max_epi16 ( max , h ) ;
_mm_store_si128 ( H1 + j , h ) ;
e = _mm_subs_epu16 ( e , e_del ) ;
t = _mm_subs_epu16 ( h , oe_del ) ;
e = _mm_max_epi16 ( e , t ) ;
_mm_store_si128 ( E + j , e ) ;
f = _mm_subs_epu16 ( f , e_ins ) ;
t = _mm_subs_epu16 ( h , oe_ins ) ;
f = _mm_max_epi16 ( f , t ) ;
h = _mm_load_si128 ( H0 + j ) ;
}
for ( k = 0 ; LIKELY ( k < 16 ) ; + + k ) {
f = _mm_slli_si128 ( f , 2 ) ;
for ( j = 0 ; LIKELY ( j < slen ) ; + + j ) {
h = _mm_load_si128 ( H1 + j ) ;
h = _mm_max_epi16 ( h , f ) ;
_mm_store_si128 ( H1 + j , h ) ;
h = _mm_subs_epu16 ( h , oe_ins ) ;
f = _mm_subs_epu16 ( f , e_ins ) ;
if ( UNLIKELY ( allzero_0f_8 ( _mm_cmpgt_epi16 ( f , h ) ) ) ) goto end_loop8 ;
}
}
end_loop8 :
__max_8 ( imax , max ) ;
//fprintf(stderr, "%d\n", imax);
if ( imax > = minsc ) {
//fprintf(stderr, "%d\n", imax);
if ( n_b = = 0 | | ( int32_t ) b [ n_b - 1 ] + 1 ! = i ) {
if ( n_b = = m_b ) {
m_b = m_b ? m_b < < 1 : 8 ;
b = ( uint64_t * ) realloc ( b , 8 * m_b ) ;
// b = (uint64_t *)byte_mem_request(bmem, 8 * m_b);
}
b [ n_b + + ] = ( uint64_t ) imax < < 32 | i ;
} else if ( ( int ) ( b [ n_b - 1 ] > > 32 ) < imax ) b [ n_b - 1 ] = ( uint64_t ) imax < < 32 | i ; // modify the last
}
if ( imax > gmax ) {
gmax = imax ; te = i ;
for ( j = 0 ; LIKELY ( j < slen ) ; + + j )
_mm_store_si128 ( Hmax + j , _mm_load_si128 ( H1 + j ) ) ;
if ( gmax > = endsc ) break ;
}
S = H1 ; H1 = H0 ; H0 = S ;
}
r . score = gmax ; r . te = te ;
{
int max = - 1 , tmp , low , high , qlen = slen * 8 ;
uint16_t * t = ( uint16_t * ) Hmax ;
for ( i = 0 , r . qe = - 1 ; i < qlen ; + + i , + + t )
if ( ( int ) * t > max ) max = * t , r . qe = i / 8 + i % 8 * slen ;
else if ( ( int ) * t = = max & & ( tmp = i / 8 + i % 8 * slen ) < r . qe ) r . qe = tmp ;
if ( b ) {
i = ( r . score + q - > max - 1 ) / q - > max ;
low = te - i ; high = te + i ;
for ( i = 0 ; i < n_b ; + + i ) {
int e = ( int32_t ) b [ i ] ;
if ( ( e < low | | e > high ) & & ( int ) ( b [ i ] > > 32 ) > r . score2 )
r . score2 = b [ i ] > > 32 , r . te2 = e ;
}
}
}
free ( b ) ;
// byte_mem_clear(bmem);
return r ;
}