加了对find_max剪枝的优化

2023-08-20 02:16:25 +08:00 · 2023-08-20 02:16:25 +08:00 · 775297a813
parent c9e4d9645c
commit 775297a813
5 changed files with 288 additions and 142 deletions
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@ -1,5 +1,6 @@
 {
    "files.associations": {
-        "functional": "c"
+        "functional": "c",
+        "random": "c"
    }
 }
--- a/bsw_avx2.c
+++ b/bsw_avx2.c
@ -20,6 +20,7 @@
 #define MIN(x, y) ((x) < (y) ? (x) : (y))
 #define SIMD_WIDTH 16

+/* 去掉多余计算的值 */
 static const uint16_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = {
    {0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    {0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
@ -38,28 +39,28 @@ static const uint16_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = {
    {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0},
    {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}};

-// const int permute_mask = _MM_SHUFFLE(0, 1, 2, 3);
-#define permute_mask _MM_SHUFFLE(0, 1, 2, 3)
+// #define permute_mask _MM_SHUFFLE(0, 1, 2, 3)
+#define permute_mask 27
 // 初始化变量
-#define SIMD_INIT                                       \
-    int oe_del = o_del + e_del, oe_ins = o_ins + e_ins; \
-    __m256i zero_vec;                                   \
-    __m256i max_vec;                                    \
-    __m256i oe_del_vec;                                 \
-    __m256i oe_ins_vec;                                 \
-    __m256i e_del_vec;                                  \
-    __m256i e_ins_vec;                                  \
-    __m256i h_vec_mask[SIMD_WIDTH];                     \
-    zero_vec = _mm256_setzero_si256();                  \
-    oe_del_vec = _mm256_set1_epi16(-oe_del);            \
-    oe_ins_vec = _mm256_set1_epi16(-oe_ins);            \
-    e_del_vec = _mm256_set1_epi16(-e_del);              \
-    e_ins_vec = _mm256_set1_epi16(-e_ins);              \
-    __m256i match_sc_vec = _mm256_set1_epi16(a);        \
-    __m256i mis_sc_vec = _mm256_set1_epi16(-b);         \
-    __m256i amb_sc_vec = _mm256_set1_epi16(-1);         \
-    __m256i amb_vec = _mm256_set1_epi16(4);             \
-    for (i = 0; i < SIMD_WIDTH; ++i)                    \
+#define SIMD_INIT                                          \
+    int oe_del = o_del + e_del, oe_ins = o_ins + e_ins;    \
+    __m256i zero_vec;                                      \
+    __m256i max_vec, last_max_vec = _mm256_set1_epi16(h0); \
+    __m256i oe_del_vec;                                    \
+    __m256i oe_ins_vec;                                    \
+    __m256i e_del_vec;                                     \
+    __m256i e_ins_vec;                                     \
+    __m256i h_vec_mask[SIMD_WIDTH];                        \
+    zero_vec = _mm256_setzero_si256();                     \
+    oe_del_vec = _mm256_set1_epi16(-oe_del);               \
+    oe_ins_vec = _mm256_set1_epi16(-oe_ins);               \
+    e_del_vec = _mm256_set1_epi16(-e_del);                 \
+    e_ins_vec = _mm256_set1_epi16(-e_ins);                 \
+    __m256i match_sc_vec = _mm256_set1_epi16(a);           \
+    __m256i mis_sc_vec = _mm256_set1_epi16(-b);            \
+    __m256i amb_sc_vec = _mm256_set1_epi16(-1);            \
+    __m256i amb_vec = _mm256_set1_epi16(4);                \
+    for (i = 0; i < SIMD_WIDTH; ++i)                       \
        h_vec_mask[i] = _mm256_loadu_si256((__m256i *)(&h_vec_int_mask[i]));

 /*
@ -79,19 +80,21 @@ static const uint16_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = {
    __m256i ts_vec = _mm256_loadu_si256((__m256i *)(&ref[i]));

 // 比对ref和seq的序列，计算罚分
-#define SIMD_CMP_SEQ                                                         \
-    ts_vec = _mm256_permute4x64_epi64(ts_vec, permute_mask);                 \
-    ts_vec = _mm256_shufflelo_epi16(ts_vec, permute_mask);                   \
-    ts_vec = _mm256_shufflehi_epi16(ts_vec, permute_mask);                   \
-    __m256i match_mask_vec = _mm256_cmpeq_epi16(qs_vec, ts_vec);             \
-    __m256i mis_score_vec = _mm256_andnot_si256(match_mask_vec, mis_sc_vec); \
-    __m256i score_vec = _mm256_and_si256(match_sc_vec, match_mask_vec);      \
-    score_vec = _mm256_or_si256(score_vec, mis_score_vec);                   \
-    __m256i q_amb_mask_vec = _mm256_cmpeq_epi16(qs_vec, amb_vec);            \
-    __m256i t_amb_mask_vec = _mm256_cmpeq_epi16(ts_vec, amb_vec);            \
-    __m256i amb_mask_vec = _mm256_or_si256(q_amb_mask_vec, t_amb_mask_vec);  \
-    score_vec = _mm256_andnot_si256(amb_mask_vec, score_vec);                \
-    __m256i amb_score_vec = _mm256_and_si256(amb_mask_vec, amb_sc_vec);      \
+#define SIMD_CMP_SEQ                                                                                                         \
+    /* 将待比对的target序列逆序排列 */                                                                            \
+    ts_vec = _mm256_permute4x64_epi64(ts_vec, permute_mask);                                                                 \
+    ts_vec = _mm256_shufflelo_epi16(ts_vec, permute_mask);                                                                   \
+    ts_vec = _mm256_shufflehi_epi16(ts_vec, permute_mask);                                                                   \
+    __m256i match_mask_vec = _mm256_cmpeq_epi16(qs_vec, ts_vec);             /* 比对query和target字符序列 */          \
+    __m256i mis_score_vec = _mm256_andnot_si256(match_mask_vec, mis_sc_vec); /* 未匹配上的位置赋值mismatch分数 */ \
+    __m256i score_vec = _mm256_and_si256(match_sc_vec, match_mask_vec);      /* 匹配上的位置赋值match分数 */       \
+    score_vec = _mm256_or_si256(score_vec, mis_score_vec);                                                                   \
+    /* 计算模棱两可的字符（N)的位置的分数 */                                                                 \
+    __m256i q_amb_mask_vec = _mm256_cmpeq_epi16(qs_vec, amb_vec);                                                            \
+    __m256i t_amb_mask_vec = _mm256_cmpeq_epi16(ts_vec, amb_vec);                                                            \
+    __m256i amb_mask_vec = _mm256_or_si256(q_amb_mask_vec, t_amb_mask_vec);                                                  \
+    score_vec = _mm256_andnot_si256(amb_mask_vec, score_vec);                                                                \
+    __m256i amb_score_vec = _mm256_and_si256(amb_mask_vec, amb_sc_vec);                                                      \
    score_vec = _mm256_or_si256(score_vec, amb_score_vec);

 // 向量化计算h, e, f, m
@ -113,8 +116,18 @@ static const uint16_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = {
    hn_vec = _mm256_max_epi16(hn_vec, zero_vec);

 // 存储向量化结果
+// #define SIMD_STORE
+
+//    __m256i cur_max_vec = _mm256_max_epu8(max_vec, hn_vec); \
+//    __m256i vcmp = _mm256_cmpgt_epi8(cur_max_vec, max_vec); \
+//    uint32_t mask = _mm256_movemask_epi8(vcmp);             \
+//    if (mask > 0)                                           \
+//    {                                                       \
+//        simd_i = i;                                         \
+//        simd_j = j;                                         \
+//    }
 #define SIMD_STORE                                   \
-    max_vec = _mm256_max_epi16(max_vec, hn_vec);     \
+    max_vec = _mm256_max_epu8(max_vec, hn_vec);      \
    _mm256_storeu_si256((__m256i *)&eA2[j], en_vec); \
    _mm256_storeu_si256((__m256i *)&fA2[j], fn_vec); \
    _mm256_storeu_si256((__m256i *)&mA2[j], mn_vec); \
@ -128,28 +141,43 @@ static const uint16_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = {
    hn_vec = _mm256_and_si256(hn_vec, h_vec_mask[end - j]);

 // 找最大值和位置
-#define SIMD_FIND_MAX                                                                       \
-    max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 2));           \
-    max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 4));           \
-    max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 6));           \
-    max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 8));           \
-    max_vec = _mm256_max_epu16(max_vec, _mm256_permute2x128_si256(max_vec, max_vec, 0x01)); \
-    int16_t *maxVal = (int16_t *)&max_vec;                                                  \
-    m = maxVal[0];                                                                          \
-    if (m > 0)                                                                              \
-    {                                                                                       \
-        for (j = beg, i = iend; j <= end; j += SIMD_WIDTH, i -= SIMD_WIDTH)                 \
-        {                                                                                   \
-            __m256i h2_vec = _mm256_loadu_si256((__m256i *)(&hA2[j]));                      \
-            __m256i vcmp = _mm256_cmpeq_epi16(h2_vec, max_vec);                             \
-            uint32_t mask = _mm256_movemask_epi8(vcmp);                                     \
-            if (mask > 0)                                                                   \
-            {                                                                               \
-                int pos = SIMD_WIDTH - 1 - ((__builtin_clz(mask)) >> 1);                    \
-                mj = j - 1 + pos;                                                           \
-                mi = i - 1 - pos;                                                           \
-            }                                                                               \
-        }                                                                                   \
+// last_max_vec = max_vec;
+#define SIMD_FIND_MAX                                                                           \
+    __m256i cmp_max = _mm256_cmpgt_epi16(max_vec, last_max_vec);                                \
+    uint32_t cmp_result = _mm256_movemask_epi8(cmp_max);                                        \
+    if (cmp_result > 0)                                                                         \
+    {                                                                                           \
+        max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 2));           \
+        max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 4));           \
+        max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 6));           \
+        max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 8));           \
+        max_vec = _mm256_max_epu16(max_vec, _mm256_permute2x128_si256(max_vec, max_vec, 0x01)); \
+        int16_t *maxVal = (int16_t *)&max_vec;                                                  \
+        m = maxVal[0];                                                                          \
+        for (j = beg, i = iend; j <= end; j += SIMD_WIDTH, i -= SIMD_WIDTH)                     \
+        {                                                                                       \
+            __m256i h2_vec = _mm256_loadu_si256((__m256i *)(&hA2[j]));                          \
+            __m256i vcmp = _mm256_cmpeq_epi16(h2_vec, max_vec);                                 \
+            uint32_t mask = _mm256_movemask_epi8(vcmp);                                         \
+            if (mask > 0)                                                                       \
+            {                                                                                   \
+                int pos = SIMD_WIDTH - 1 - ((__builtin_clz(mask)) >> 1);                        \
+                mj = j - 1 + pos;                                                               \
+                mi = i - 1 - pos;                                                               \
+                for (; mj + 1 < qlen && mi + 1 < tlen; mj++, mi++)                              \
+                {                                                                               \
+                    if (seq[mj + 1] == ref[mi + 1 + SIMD_WIDTH])                                \
+                    {                                                                           \
+                        m += a;                                                                 \
+                    }                                                                           \
+                    else                                                                        \
+                    {                                                                           \
+                        break;                                                                  \
+                    }                                                                           \
+                }                                                                               \
+            }                                                                                   \
+        }                                                                                       \
+        last_max_vec = _mm256_set1_epi16(m);                                                    \
    }

 // 每轮迭代后，交换数组
@ -202,6 +230,7 @@ int bsw_avx2(int qlen,              // query length  待匹配段碱基的query
    int col_size = qlen + 2 + SIMD_WIDTH;
    int val_mem_size = (col_size * 9 * 2 + 31) >> 5 << 5; // 32字节的整数倍
    int mem_size = (seq_size + ref_size) * 2 + val_mem_size;
+    int simd_i = -1, simd_j = -1;

    SIMD_INIT; // 初始化simd用的数据

@ -365,8 +394,8 @@ int bsw_avx2(int qlen,              // query length  待匹配段碱基的query
            max_ie = gscore > hA2[qlen] ? max_ie : iStart;
            gscore = gscore > hA2[qlen] ? gscore : hA2[qlen];
        }
-        // if (m == 0 && m_last == 0)
-        //     break; // 一定要注意，斜对角遍历和按列遍历的不同点
+        //       if (m == 0 && m_last == 0)
+        //           break; // 一定要注意，斜对角遍历和按列遍历的不同点
        if (m > max)
        {
            max = m, max_i = mi, max_j = mj;
@ -387,30 +416,124 @@ int bsw_avx2(int qlen,              // query length  待匹配段碱基的query
        }

        // 调整计算的边界
-        /*        for (j = beg; LIKELY(j <= end); ++j)
+
+        // fprintf(stderr, "beg: %d, end: %d ", beg, end);
+
+        /*        for (j = beg; j <= end; j += SIMD_WIDTH)
                {
-                    int has_val = hA1[j - 1] | hA2[j];
-                    if (has_val)
+                    __m256i h1 = _mm256_loadu_si256((__m256i *)(&hA1[j - 1]));
+                    __m256i h2 = _mm256_loadu_si256((__m256i *)(&hA2[j]));
+                    __m256i orvec = _mm256_or_si256(h1, h2);
+                    __m256i vcmp = _mm256_cmpgt_epi16(orvec, zero_vec);
+                    uint32_t mask = _mm256_movemask_epi8(vcmp);
+                    if (mask > 0)
+                    {
+                        // vcmp = _mm256_permute4x64_epi64(vcmp, permute_mask);
+                        // vcmp = _mm256_shufflelo_epi16(vcmp, permute_mask);
+                        // vcmp = _mm256_shufflehi_epi16(vcmp, permute_mask);
+                        // mask = _mm256_movemask_epi8(vcmp);
+                        //// int pos = SIMD_WIDTH - 1 - ((__builtin_clz(mask)) >> 1);
+                        // int pos = ((__builtin_clz(mask)) >> 1);
+                        // beg = j + pos;
+                        int pos = __builtin_ctz(mask) >> 1;
+                        beg = j + pos;
+                        // if (beg > end)
+                        //     beg = end;
+                        //  beg = j + pos;
+                        //  beg = 0;
                        break;
+                    }
                }
-                beg = j;
-                for (j = end + 1; LIKELY(j >= beg); --j)
-                {
-                    int has_val = hA1[j - 1] | hA2[j];
-                    if (has_val)
-                        break;
-                    else
-                        hA0[j - 1] = 0;
-                }
-                end = j + 1 <= qlen ? j + 1 : qlen;
        */
-        beg = 0;
-        end = qlen; // uncomment this line for debugging
+        // beg = 0;
+        // fprintf(stderr, "new beg: %d\n", beg);
+        // int pos = 0;
+        // for (j = beg; j <= end; j += SIMD_WIDTH)
+        //{
+        //    __m256i h1 = _mm256_loadu_si256((__m256i *)(&hA1[j - 1]));
+        //    __m256i h2 = _mm256_loadu_si256((__m256i *)(&hA2[j]));
+        //    __m256i orvec = _mm256_or_si256(h1, h2);
+        //    int *val = (int *)&orvec;
+        //    for (i = 0; i < SIMD_WIDTH; ++i)
+        //        if (val[i])
+        //        {
+        //            pos = SIMD_WIDTH - 1 - i;
+        //            break;
+        //        }
+        //}
+        // beg = j;
+        for (j = beg; LIKELY(j <= end); ++j)
+        {
+            int has_val = hA1[j - 1] | hA2[j];
+            if (has_val)
+            {
+                break;
+            }
+        }
+        beg = j;
+
+        hA2[end + 1] = 0;
+        for (j = end + 1; LIKELY(j >= beg); --j)
+        {
+            int has_val = hA1[j - 1] | hA2[j];
+            if (has_val)
+            {
+                break;
+            }
+            // else
+            //     hA0[j - 1] = 0;
+        }
+        end = j + 1 <= qlen ? j + 1 : qlen;
+
+        /*       for (j = end + 1; j >= beg; j -= SIMD_WIDTH) // 没有考虑beg附近，且长度小于SIMD_WIDTH的数据
+               {
+                   __m256i h1 = _mm256_loadu_si256((__m256i *)(&hA1[j - 1]));
+                   __m256i h2 = _mm256_loadu_si256((__m256i *)(&hA2[j]));
+                   __m256i orvec = _mm256_or_si256(h1, h2);
+                   __m256i vcmp = _mm256_cmpgt_epi16(orvec, zero_vec);
+                   uint32_t mask = _mm256_movemask_epi8(vcmp);
+                   if (mask > 0)
+                   {
+                       int pos = __builtin_clz(mask) >> 1;
+                       const int new_end = j + SIMD_WIDTH - pos;
+                       end = new_end <= qlen ? new_end : qlen;
+                       break;
+                   }
+                   else
+                   {
+                       _mm256_storeu_si256((__m256i *)&hA0[j - 1], zero_vec);
+                   }
+               }
+       */
+        // beg = 0;
+        // end = qlen; // uncomment this line for debugging
        m_last = m;
        // swap m, h, e, f
        SWAP_DATA_POINTER;
    }

+    //    __m256i origin_max_vec = max_vec;
+    //    max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 2));
+    //    max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 4));
+    //    max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 6));
+    //    max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 8));
+    //    max_vec = _mm256_max_epu16(max_vec, _mm256_permute2x128_si256(max_vec, max_vec, 0x01));
+    //    int16_t *maxVal = (int16_t *)&max_vec;
+    //    max = maxVal[0];
+    //    if (max > 0)
+    //    {
+    //        __m256i vcmp = _mm256_cmpeq_epi16(origin_max_vec, max_vec);
+    //        uint32_t mask = _mm256_movemask_epi8(vcmp);
+    //        if (mask > 0)
+    //        {
+    //            int pos = SIMD_WIDTH - 1 - ((__builtin_clz(mask)) >> 1);
+    //            int mj = simd_j - 1 + pos;
+    //            int mi = simd_i - 1 - pos;
+    //            max_i = mi, max_j = mj;
+    //            max_off = max_off > abs(mj - mi) ? max_off : abs(mj - mi);
+    //        }
+    //    }
+
    free(mem);
    if (_qle)
        *_qle = max_j + 1;
--- a/ksw_avx2_u8.c
+++ b/ksw_avx2_u8.c
@ -58,12 +58,13 @@ static const uint8_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = {
 static const uint8_t reverse_mask[SIMD_WIDTH] = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8};

 // const int permute_mask = _MM_SHUFFLE(0, 1, 2, 3);
-#define permute_mask _MM_SHUFFLE(0, 1, 2, 3)
+// #define permute_mask _MM_SHUFFLE(0, 1, 2, 3)
+#define permute_mask 27
 // 初始化变量
 #define SIMD_INIT                                                     \
    int oe_del = o_del + e_del, oe_ins = o_ins + e_ins;               \
    __m256i zero_vec;                                                 \
-    __m256i max_vec;                                                  \
+    __m256i max_vec, last_max_vec = _mm256_set1_epi8(h0);             \
    __m256i oe_del_vec;                                               \
    __m256i oe_ins_vec;                                               \
    __m256i e_del_vec;                                                \
@ -149,33 +150,51 @@ static const uint8_t reverse_mask[SIMD_WIDTH] = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14,
    mn_vec = _mm256_and_si256(mn_vec, h_vec_mask[end - j]); \
    hn_vec = _mm256_and_si256(hn_vec, h_vec_mask[end - j]);

+// cmp_max = _mm256_xor_si256(last_max_vec, cmp_max);
+//           last_max_vec = _mm256_set1_epi8(m);
 // 找最大值和位置
-#define SIMD_FIND_MAX                                                                      \
-    uint8_t *maxVal = (uint8_t *)&max_vec;                                                 \
-    max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 1));           \
-    max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 2));           \
-    max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 3));           \
-    max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 4));           \
-    max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 5));           \
-    max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 6));           \
-    max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 7));           \
-    max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 8));           \
-    max_vec = _mm256_max_epu8(max_vec, _mm256_permute2x128_si256(max_vec, max_vec, 0x01)); \
-    m = maxVal[0];                                                                         \
-    if (m > 0)                                                                             \
-    {                                                                                      \
-        for (j = beg, i = iend; j <= end; j += SIMD_WIDTH, i -= SIMD_WIDTH)                \
-        {                                                                                  \
-            __m256i h2_vec = _mm256_loadu_si256((__m256i *)(&hA2[j]));                     \
-            __m256i vcmp = _mm256_cmpeq_epi8(h2_vec, max_vec);                             \
-            uint32_t mask = _mm256_movemask_epi8(vcmp);                                    \
-            if (mask > 0)                                                                  \
-            {                                                                              \
-                int pos = SIMD_WIDTH - 1 - __builtin_clz(mask);                            \
-                mj = j - 1 + pos;                                                          \
-                mi = i - 1 - pos;                                                          \
-            }                                                                              \
-        }                                                                                  \
+#define SIMD_FIND_MAX                                                                          \
+    __m256i cmp_max = _mm256_max_epu8(max_vec, last_max_vec);                                  \
+    cmp_max = _mm256_xor_si256(cmp_max, last_max_vec);                                         \
+    cmp_max = _mm256_cmpeq_epi8(cmp_max, zero_vec);                                            \
+    uint32_t cmp_result = _mm256_movemask_epi8(cmp_max);                                       \
+    if (cmp_result != 4294967296)                                                              \
+    {                                                                                          \
+        uint8_t *maxVal = (uint8_t *)&max_vec;                                                 \
+        max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 1));           \
+        max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 2));           \
+        max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 3));           \
+        max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 4));           \
+        max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 5));           \
+        max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 6));           \
+        max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 7));           \
+        max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 8));           \
+        max_vec = _mm256_max_epu8(max_vec, _mm256_permute2x128_si256(max_vec, max_vec, 0x01)); \
+        m = maxVal[0];                                                                         \
+        for (j = beg, i = iend; j <= end; j += SIMD_WIDTH, i -= SIMD_WIDTH)                    \
+        {                                                                                      \
+            __m256i h2_vec = _mm256_loadu_si256((__m256i *)(&hA2[j]));                         \
+            __m256i vcmp = _mm256_cmpeq_epi8(h2_vec, max_vec);                                 \
+            uint32_t mask = _mm256_movemask_epi8(vcmp);                                        \
+            if (mask > 0)                                                                      \
+            {                                                                                  \
+                int pos = SIMD_WIDTH - 1 - __builtin_clz(mask);                                \
+                mj = j - 1 + pos;                                                              \
+                mi = i - 1 - pos;                                                              \
+                for (; mj + 1 < qlen && mi + 1 < tlen; mj++, mi++)                             \
+                {                                                                              \
+                    if (seq[mj + 1] == ref[mi + 1 + SIMD_WIDTH])                               \
+                    {                                                                          \
+                        m += a;                                                                \
+                    }                                                                          \
+                    else                                                                       \
+                    {                                                                          \
+                        break;                                                                 \
+                    }                                                                          \
+                }                                                                              \
+            }                                                                                  \
+        }                                                                                      \
+        last_max_vec = _mm256_set1_epi8(m);                                                    \
    }

 // 每轮迭代后，交换数组
@ -217,13 +236,14 @@ int ksw_avx2_u8(int qlen,              // query length  待匹配段碱基的que
                int *_gscore,  // query的端到端匹配得分
                int *_max_off) // 取得最大得分时在query和reference上位置差的 最大值
 {
+    return h0;
 #ifdef SHOW_PERF
-    extern int64_t time_bsw_init;
-    extern int64_t time_bsw_main_loop;
-    extern int64_t time_bsw_find_max;
-    extern int64_t time_bsw_adjust_bound;
-    extern int64_t time_compare;
-    int64_t start_time = get_mseconds();
+//    extern int64_t time_bsw_init;
+//    extern int64_t time_bsw_main_loop;
+//    extern int64_t time_bsw_find_max;
+//    extern int64_t time_bsw_adjust_bound;
+//    extern int64_t time_compare;
+//   int64_t start_time = get_mseconds();
 #endif
    uint8_t *mA, *hA, *eA, *fA, *mA1, *mA2, *hA0, *hA1, *eA1, *fA1, *hA2, *eA2, *fA2; // hA0保存上上个col的H，其他的保存上个H E F M
    uint8_t *seq, *ref;
@ -316,13 +336,13 @@ int ksw_avx2_u8(int qlen,              // query length  待匹配段碱基的que
    int m_last = 0;
    int iend;
 #ifdef SHOW_PERF
-    time_bsw_init += get_mseconds() - start_time;
+//    time_bsw_init += get_mseconds() - start_time;
 #endif

    for (D = 1; LIKELY(D < Dloop); ++D)
    {
 #ifdef SHOW_PERF
-        start_time = get_mseconds();
+//        start_time = get_mseconds();
 #endif
        // 边界条件一定要注意！ tlen 大于，等于，小于 qlen时的情况
        if (D > tlen)
@ -395,17 +415,17 @@ int ksw_avx2_u8(int qlen,              // query length  待匹配段碱基的que
            SIMD_STORE;
        }
 #ifdef SHOW_PERF
-        time_bsw_main_loop += get_mseconds() - start_time;
+//        time_bsw_main_loop += get_mseconds() - start_time;
 #endif
 #ifdef SHOW_PERF
-        start_time = get_mseconds();
+//        start_time = get_mseconds();
 #endif
        SIMD_FIND_MAX;
 #ifdef SHOW_PERF
-        time_bsw_find_max += get_mseconds() - start_time;
+//        time_bsw_find_max += get_mseconds() - start_time;
 #endif
 #ifdef SHOW_PERF
-        start_time = get_mseconds();
+//        start_time = get_mseconds();
 #endif
        // 注意最后跳出循环j的值
        j = end + 1;
@ -444,13 +464,14 @@ int ksw_avx2_u8(int qlen,              // query length  待匹配段碱基的que
                break;
        }
        beg = j;
+        hA2[end + 1] = 0;
        for (j = end + 1; LIKELY(j >= beg); --j)
        {
            int has_val = hA1[j - 1] | hA2[j];
            if (has_val)
                break;
-            else
-                hA0[j - 1] = 0;
+            // else
+            //     hA0[j - 1] = 0;
        }
        end = j + 1 <= qlen ? j + 1 : qlen;

@ -461,9 +482,9 @@ int ksw_avx2_u8(int qlen,              // query length  待匹配段碱基的que
        // swap m, h, e, f
        SWAP_DATA_POINTER;
 #ifdef SHOW_PERF
-        time_bsw_adjust_bound += get_mseconds() - start_time;
-        start_time = get_mseconds();
-        time_compare += get_mseconds() - start_time;
+//        time_bsw_adjust_bound += get_mseconds() - start_time;
+//        start_time = get_mseconds();
+//        time_compare += get_mseconds() - start_time;
 #endif
    }

--- a/ksw_normal.c
+++ b/ksw_normal.c
@ -18,6 +18,7 @@ typedef struct

 int ksw_normal(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off)
 {
+	// return h0;
 	eh_t *eh;	// score array
 	int8_t *qp; // query profile
 	int i, j, k, oe_del = o_del + e_del, oe_ins = o_ins + e_ins, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off;
@ -106,14 +107,14 @@ int ksw_normal(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
 			max_ie = gscore > h1 ? max_ie : i; // max_ie表示取得全局最大分值时，target字符串的位置
 			gscore = gscore > h1 ? gscore : h1;
 		}
-		//		if (m == 0) // 遍历完query之后，当前轮次的最大分值为0，则跳出循环
-		//			break;
+		if (m == 0) // 遍历完query之后，当前轮次的最大分值为0，则跳出循环
+			break;
 		if (m > max) // 当前轮最大分值大于之前的最大分值
 		{
 			max = m, max_i = i, max_j = mj;							 // 更新取得最大值的target和query的位置
 			max_off = max_off > abs(mj - i) ? max_off : abs(mj - i); // 取得最大分值时，query和target对应字符串坐标的差值
 		}
-		else if (0) // zdrop > 0) // 当前轮匹配之后取得的最大分值没有大于之前的最大值，而且zdrop值大于0
+		else if (zdrop > 0) // 当前轮匹配之后取得的最大分值没有大于之前的最大值，而且zdrop值大于0
 		{
 			if (i - max_i > mj - max_j)
 			{
@ -127,15 +128,15 @@ int ksw_normal(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
 			}
 		}
 		// update beg and end for the next round
-   // 	for (j = beg; LIKELY(j < end) && eh[j].h == 0 && eh[j].e == 0; ++j)
-   // 		;
-   // 	beg = j;
-   // 	for (j = end; LIKELY(j >= beg) && eh[j].h == 0 && eh[j].e == 0; --j)
-   // 		;
-   // 	end = j + 2 < qlen ? j + 2 : qlen; // 剪枝没考虑f，即insert
-		beg = 0, end = qlen;			   // uncomment this line for debugging
-										   //  fprintf(stderr, "\n");
-										   //  fprintf(stderr, "%d\n", end);
+		for (j = beg; LIKELY(j < end) && eh[j].h == 0 && eh[j].e == 0; ++j)
+			;
+		beg = j;
+		for (j = end; LIKELY(j >= beg) && eh[j].h == 0 && eh[j].e == 0; --j)
+			;
+		end = j + 2 < qlen ? j + 2 : qlen; // 剪枝没考虑f，即insert
+										   // beg = 0, end = qlen;			   // uncomment this line for debugging
+										   //   fprintf(stderr, "\n");
+										   //   fprintf(stderr, "%d\n", end);
 	}
 	free(eh);
 	free(qp);
--- a/main.c
+++ b/main.c
@ -127,12 +127,12 @@ int main(int argc, char *argv[])
    // const char *qf_path = "/home/zzh/data/sw/q_s.fa";
    // const char *tf_path = "/home/zzh/data/sw/t_s.fa";
    // const char *if_path = "/home/zzh/data/sw/i_s.txt";
-    const char *qf_path = "/home/zzh/data/sw/q_m.fa";
-    const char *tf_path = "/home/zzh/data/sw/t_m.fa";
-    const char *if_path = "/home/zzh/data/sw/i_m.txt";
-    // const char *qf_path = "/home/zzh/data/sw/q_l.fa";
-    // const char *tf_path = "/home/zzh/data/sw/t_l.fa";
-    // const char *if_path = "/home/zzh/data/sw/i_l.txt";
+    // const char *qf_path = "/home/zzh/data/sw/q_m.fa";
+    // const char *tf_path = "/home/zzh/data/sw/t_m.fa";
+    // const char *if_path = "/home/zzh/data/sw/i_m.txt";
+    const char *qf_path = "/home/zzh/data/sw/q_l.fa";
+    const char *tf_path = "/home/zzh/data/sw/t_l.fa";
+    const char *if_path = "/home/zzh/data/sw/i_l.txt";
    //  const char *qf_path = "/public/home/zzh/data/sw/query.fa";
    //   const char *tf_path = "/public/home/zzh/data/sw/target.fa";
    //   const char *if_path = "/public/home/zzh/data/sw/info.txt";
@ -386,10 +386,10 @@ int main(int argc, char *argv[])
    // fprintf(stderr, "time_bsw_main_loop: %f s\n", (time_bsw_main_loop) / DIVIDE_BY);
    // fprintf(stderr, "time_bsw_find_max: %f s\n", (time_bsw_find_max) / DIVIDE_BY);
    // fprintf(stderr, "time_bsw_adjust_bound: %f s\n", (time_bsw_adjust_bound) / DIVIDE_BY);
-    fprintf(stderr, "time_bsw_main_loop: %f s\n", (time_bsw_main_loop - time_compare) / DIVIDE_BY);
-    fprintf(stderr, "time_bsw_find_max: %f s\n", (time_bsw_find_max - time_compare) / DIVIDE_BY);
-    fprintf(stderr, "time_bsw_adjust_bound: %f s\n", (time_bsw_adjust_bound - time_compare) / DIVIDE_BY);
-    fprintf(stderr, "time_compare: %f s\n", time_compare / DIVIDE_BY);
+    // fprintf(stderr, "time_bsw_main_loop: %f s\n", (time_bsw_main_loop - time_compare) / DIVIDE_BY);
+    // fprintf(stderr, "time_bsw_find_max: %f s\n", (time_bsw_find_max - time_compare) / DIVIDE_BY);
+    // fprintf(stderr, "time_bsw_adjust_bound: %f s\n", (time_bsw_adjust_bound - time_compare) / DIVIDE_BY);
+    // fprintf(stderr, "time_compare: %f s\n", time_compare / DIVIDE_BY);
 #endif

    if (query_f != 0)