fixed a band boundary bug in ksw2

This commit is contained in:
Heng Li 2017-07-16 17:52:57 -04:00
parent b4280d186f
commit f42790398d
3 changed files with 29 additions and 12 deletions

View File

@ -19,7 +19,7 @@ Minimap2 is a fast sequence mapping and alignment program that can find
overlaps between long noisy reads, or map long reads or their assemblies to a
reference genome optionally with detailed alignment (i.e. CIGAR). At present,
it works efficiently with query sequences from a few kilobases to ~100
megabases in length at a error rate ~15%. Minimap2 outputs in the [PAF][paf] or
megabases in length at an error rate ~15%. Minimap2 outputs in the [PAF][paf] or
the [SAM format][sam]. On limited test data sets, minimap2 is over 20 times
faster than most other long-read aligners.

View File

@ -42,14 +42,17 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
a2= _mm_sub_epi8(a2, tmp); \
b2= _mm_sub_epi8(b2, tmp);
int r, t, qe = q + e, n_col_, *off = 0, tlen_, qlen_, last_st, last_en, wl, wr, max_sc, long_thres, long_diff;
int r, t, qe = q + e, n_col_, *off = 0, tlen_, qlen_, last_st, last_en, wl, wr, max_sc, min_sc, long_thres, long_diff;
int with_cigar = !(flag&KSW_EZ_SCORE_ONLY), approx_max = !!(flag&KSW_EZ_APPROX_MAX);
int32_t *H = 0, H0 = 0, last_H0_t = 0;
uint8_t *qr, *sf, *mem, *mem2 = 0;
__m128i q_, q2_, qe_, qe2_, zero_, sc_mch_, sc_mis_, m1_;
__m128i *u, *v, *x, *y, *x2, *y2, *s, *p = 0;
if (m <= 0 || qlen <= 0 || tlen <= 0 || w < 0) return;
ksw_reset_extz(ez);
if (m <= 1 || qlen <= 0 || tlen <= 0 || w < 0) return;
if (q2 + e2 < q + e) t = q, q = q2, q2 = t, t = e, e = e2, e2 = t; // make sure q+e no larger than q2+e2
zero_ = _mm_set1_epi8(0);
q_ = _mm_set1_epi8(q);
@ -60,16 +63,17 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
sc_mis_ = _mm_set1_epi8(mat[1]);
m1_ = _mm_set1_epi8(m - 1); // wildcard
ksw_reset_extz(ez);
wl = wr = w;
tlen_ = (tlen + 15) / 16;
n_col_ = ((w + 1 < tlen? (w + 1 < qlen? w + 1 : qlen): tlen) + 15) / 16 + 1;
qlen_ = (qlen + 15) / 16;
for (t = 1, max_sc = mat[0]; t < m * m; ++t)
for (t = 1, max_sc = mat[0], min_sc = mat[1]; t < m * m; ++t) {
max_sc = max_sc > mat[t]? max_sc : mat[t];
min_sc = min_sc < mat[t]? min_sc : mat[t];
}
if (-min_sc > 2 * (q + e)) return; // otherwise, we won't see any mismatches
long_thres = (q2 - q) / (e - e2) - 1;
long_thres = e != e2? (q2 - q) / (e - e2) - 1 : 0;
if (q2 + e2 + long_thres * e2 > q + e + long_thres * e)
++long_thres;
long_diff = long_thres * (e - e2) - (q2 - q) - e2;
@ -164,6 +168,7 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
z = _mm_max_epi8(z, b);
z = _mm_max_epi8(z, a2);
z = _mm_max_epi8(z, b2);
z = _mm_min_epi8(z, sc_mch_);
__dp_code_block2; // save u[] and v[]; update a, b, a2 and b2
_mm_store_si128(&x[t], _mm_sub_epi8(_mm_max_epi8(a, zero_), qe_));
_mm_store_si128(&y[t], _mm_sub_epi8(_mm_max_epi8(b, zero_), qe_));
@ -178,6 +183,8 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a2));
tmp = _mm_cmpgt_epi8(b2, z);
z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b2));
tmp = _mm_cmplt_epi8(sc_mch_, z);
z = _mm_or_si128(_mm_and_si128(tmp, sc_mch_), _mm_andnot_si128(tmp, z));
__dp_code_block2;
tmp = _mm_cmpgt_epi8(a, zero_);
_mm_store_si128(&x[t], _mm_sub_epi8(_mm_and_si128(tmp, a), qe_));
@ -215,6 +222,7 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
z = _mm_max_epi8(z, a2);
d = _mm_blendv_epi8(d, _mm_set1_epi8(4), _mm_cmpgt_epi8(b2, z)); // d = b2 > z? 4 : d
z = _mm_max_epi8(z, b2);
z = _mm_min_epi8(z, sc_mch_);
#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
tmp = _mm_cmpgt_epi8(a, z);
d = _mm_and_si128(tmp, _mm_set1_epi8(1));
@ -228,6 +236,8 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
tmp = _mm_cmpgt_epi8(b2, z);
d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(4)));
z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b2));
tmp = _mm_cmplt_epi8(sc_mch_, z);
z = _mm_or_si128(_mm_and_si128(tmp, sc_mch_), _mm_andnot_si128(tmp, z));
#endif
__dp_code_block2;
tmp = _mm_cmpgt_epi8(a, zero_);
@ -270,6 +280,7 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
z = _mm_max_epi8(z, a2);
d = _mm_blendv_epi8(_mm_set1_epi8(4), d, _mm_cmpgt_epi8(z, b2)); // d = z > b2? d : 4
z = _mm_max_epi8(z, b2);
z = _mm_min_epi8(z, sc_mch_);
#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
tmp = _mm_cmpgt_epi8(z, a);
d = _mm_andnot_si128(tmp, _mm_set1_epi8(1));
@ -283,6 +294,8 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
tmp = _mm_cmpgt_epi8(z, b2);
d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(4)));
z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, b2));
tmp = _mm_cmplt_epi8(sc_mch_, z);
z = _mm_or_si128(_mm_and_si128(tmp, sc_mch_), _mm_andnot_si128(tmp, z));
#endif
__dp_code_block2;
tmp = _mm_cmpgt_epi8(zero_, a);

View File

@ -26,19 +26,21 @@ void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
#define __dp_code_block2 \
z = _mm_max_epu8(z, b); /* z = max(z, b); this works because both are non-negative */ \
z = _mm_min_epu8(z, max_sc_); \
_mm_store_si128(&u[t], _mm_sub_epi8(z, vt1)); /* u[r][t..t+15] <- z - v[r-1][t-1..t+14] */ \
_mm_store_si128(&v[t], _mm_sub_epi8(z, ut)); /* v[r][t..t+15] <- z - u[r-1][t..t+15] */ \
z = _mm_sub_epi8(z, q_); \
a = _mm_sub_epi8(a, z); \
b = _mm_sub_epi8(b, z);
int r, t, qe = q + e, n_col_, *off = 0, tlen_, qlen_, last_st, last_en, wl, wr, max_sc;
int r, t, qe = q + e, n_col_, *off = 0, tlen_, qlen_, last_st, last_en, wl, wr, max_sc, min_sc;
int with_cigar = !(flag&KSW_EZ_SCORE_ONLY), approx_max = !!(flag&KSW_EZ_APPROX_MAX);
int32_t *H = 0, H0 = 0, last_H0_t = 0;
uint8_t *qr, *sf, *mem, *mem2 = 0;
__m128i q_, qe2_, zero_, flag1_, flag2_, flag8_, flag16_, sc_mch_, sc_mis_, m1_;
__m128i q_, qe2_, zero_, flag1_, flag2_, flag8_, flag16_, sc_mch_, sc_mis_, m1_, max_sc_;
__m128i *u, *v, *x, *y, *s, *p = 0;
ksw_reset_extz(ez);
if (m <= 0 || qlen <= 0 || tlen <= 0 || w < 0) return;
zero_ = _mm_set1_epi8(0);
@ -51,15 +53,17 @@ void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
sc_mch_ = _mm_set1_epi8(mat[0]);
sc_mis_ = _mm_set1_epi8(mat[1]);
m1_ = _mm_set1_epi8(m - 1); // wildcard
ksw_reset_extz(ez);
max_sc_ = _mm_set1_epi8(mat[0] + (q + e) * 2);
wl = wr = w;
tlen_ = (tlen + 15) / 16;
n_col_ = ((w + 1 < tlen? (w + 1 < qlen? w + 1 : qlen): tlen) + 15) / 16 + 1;
qlen_ = (qlen + 15) / 16;
for (t = 1, max_sc = mat[0]; t < m * m; ++t)
for (t = 1, max_sc = mat[0], min_sc = mat[1]; t < m * m; ++t) {
max_sc = max_sc > mat[t]? max_sc : mat[t];
min_sc = min_sc < mat[t]? min_sc : mat[t];
}
if (-min_sc > 2 * (q + e)) return; // otherwise, we won't see any mismatches
mem = (uint8_t*)kcalloc(km, tlen_ * 6 + qlen_ + 1, 16);
u = (__m128i*)(((size_t)mem + 15) >> 4 << 4); // 16-byte aligned