sw_perf/ksw_normal.c

154 lines
6.6 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#include <stdint.h>
#include <stdlib.h>
#include <assert.h>
#include <stdio.h>
#ifdef __GNUC__
#define LIKELY(x) __builtin_expect((x), 1)
#define UNLIKELY(x) __builtin_expect((x), 0)
#else
#define LIKELY(x) (x)
#define UNLIKELY(x) (x)
#endif
typedef struct
{
int32_t h, e;
} eh_t;
int ksw_normal(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off)
{
eh_t *eh; // score array
int8_t *qp; // query profile
int i, j, k, oe_del = o_del + e_del, oe_ins = o_ins + e_ins, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off;
assert(h0 > 0);
qp = malloc(qlen * m);
eh = calloc(qlen + 1, 8);
// generate the query profile
for (k = i = 0; k < m; ++k)
{
const int8_t *p = &mat[k * m];
for (j = 0; j < qlen; ++j)
qp[i++] = p[query[j]]; // 对于qp数组第0到qlen个元素表示和A比对的分值第qlen到2*qlen表示和C比对的分值以此类推
}
// fill the first row
// 初始化第一行分值
eh[0].h = h0;
eh[1].h = h0 > oe_ins ? h0 - oe_ins : 0;
for (j = 2; j <= qlen && eh[j - 1].h > e_ins; ++j)
eh[j].h = eh[j - 1].h - e_ins;
// adjust $w if it is too large
k = m * m; // 字符矩阵
for (i = 0, max = 0; i < k; ++i) // get the max score
max = max > mat[i] ? max : mat[i];
max_ins = (int)((double)(qlen * max + end_bonus - o_ins) / e_ins + 1.); // 最大可插入的长度?
max_ins = max_ins > 1 ? max_ins : 1;
w = w < max_ins ? w : max_ins;
max_del = (int)((double)(qlen * max + end_bonus - o_del) / e_del + 1.);
max_del = max_del > 1 ? max_del : 1;
w = w < max_del ? w : max_del; // TODO: is this necessary? 上述几行代码都是为了看看能否缩小窗口,减小计算
// DP loop
max = h0, max_i = max_j = -1;
max_ie = -1, gscore = -1;
max_off = 0;
beg = 0, end = qlen;
for (i = 0; LIKELY(i < tlen); ++i) // 对target逐个字符进行遍历
{
int t, f = 0, h1, m = 0, mj = -1;
int8_t *q = &qp[target[i] * qlen]; // 对于target第i个字符query中每个字符的分值只有匹配和不匹配
// apply the band and the constraint (if provided)
if (beg < i - w) // 检查开始点是否可以缩小一些
beg = i - w;
if (end > i + w + 1) // 检查终点是否可以缩小,使得整体的遍历范围缩小
end = i + w + 1;
if (end > qlen) // 终点不超过query长度
end = qlen;
// compute the first column
if (beg == 0)
{
h1 = h0 - (o_del + e_del * (i + 1)); // 只消耗了target序列query从第一个字符开始匹配第i个target字符
if (h1 < 0)
h1 = 0;
}
else
h1 = 0;
for (j = beg; LIKELY(j < end); ++j) // 遍历query字符序列
{
// fprintf(stderr, "%-3d", h1);
// At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1)
// Similar to SSE2-SW, cells are computed in the following order:
// H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)}
// E(i+1,j) = max{H(i,j)-gapo, E(i,j)} - gape // E表示delete只消耗target
// F(i,j+1) = max{H(i,j)-gapo, F(i,j)} - gape // F表示insert只消耗querytarget的row id固定query的col index一直增加
eh_t *p = &eh[j];
int h, M = p->h, e = p->e; // get H(i-1,j-1) and E(i-1,j) // 获取上一轮h值和e值
p->h = h1; // set H(i,j-1) for the next row // h1是上一轮计算的结果
M = M ? M + q[j] : 0; // separating H and M to disallow a cigar like "100M3I3D20M" // M大于0则当前两个字符进行匹配无论是否相等将分值加到M上此时M可能变为负数
h = M > e ? M : e; // e and f are guaranteed to be non-negative, so h>=0 even if M<0 // e和f保证是非负的所以h肯定非负即使M可能是负数因为h取e,f和M的最大值
h = h > f ? h : f;
h1 = h; // save H(i,j) to h1 for the next column // 用h1来保存当前表格i,j)对应的分值,用来下次计算
mj = m > h ? mj : j; // record the position where max score is achieved // 记录取得最大值时query的字符位置
m = m > h ? m : h; // m is stored at eh[mj+1] 因为eh[mj+1]->h表示的是H(i, mj)及上一轮记录的h
t = M - oe_del; // 用来计算delete假设当前字符(i,j)匹配无论match还是mismatchtarget下一个字符串被空消耗delete)的分值F(i+1, j)
t = t > 0 ? t : 0;
e -= e_del; // 假设当前query字符
e = e > t ? e : t; // computed E(i+1,j) // t表示(i,j)强行匹配,(i+1, j)是delete的分数此前e表示(i+1,j)继续delete的分数
p->e = e; // save E(i+1,j) for the next row
t = M - oe_ins;
t = t > 0 ? t : 0;
f -= e_ins;
f = f > t ? f : t; // computed F(i,j+1)
}
eh[end].h = h1; // end是query序列之外的位置
eh[end].e = 0;
if (j == qlen) // 此轮遍历到了query的最后一个字符
{
max_ie = gscore > h1 ? max_ie : i; // max_ie表示取得全局最大分值时target字符串的位置
gscore = gscore > h1 ? gscore : h1;
}
// if (m == 0) // 遍历完query之后当前轮次的最大分值为0则跳出循环
// break;
if (m > max) // 当前轮最大分值大于之前的最大分值
{
max = m, max_i = i, max_j = mj; // 更新取得最大值的target和query的位置
max_off = max_off > abs(mj - i) ? max_off : abs(mj - i); // 取得最大分值时query和target对应字符串坐标的差值
}
else if (0) // zdrop > 0) // 当前轮匹配之后取得的最大分值没有大于之前的最大值而且zdrop值大于0
{
if (i - max_i > mj - max_j)
{
if (max - m - ((i - max_i) - (mj - max_j)) * e_del > zdrop) // 之前最大分值 -从取得最大值的点出发当前的delete总长度对应的分值 + 当前轮取得的最大值) > zdrop
break;
}
else
{
if (max - m - ((mj - max_j) - (i - max_i)) * e_ins > zdrop) // 同上不过这次是insert可能是说明有很多mismatch
break;
}
}
// update beg and end for the next round
for (j = beg; LIKELY(j < end) && eh[j].h == 0 && eh[j].e == 0; ++j)
;
beg = j;
for (j = end; LIKELY(j >= beg) && eh[j].h == 0 && eh[j].e == 0; --j)
;
end = j + 2 < qlen ? j + 2 : qlen; // 剪枝没考虑f即insert
beg = 0, end = qlen; // uncomment this line for debugging
// fprintf(stderr, "\n");
// fprintf(stderr, "%d\n", end);
}
free(eh);
free(qp);
if (_qle)
*_qle = max_j + 1;
if (_tle)
*_tle = max_i + 1;
if (_gtle)
*_gtle = max_ie + 1;
if (_gscore)
*_gscore = gscore;
if (_max_off)
*_max_off = max_off;
return max;
}