diff --git a/README.md b/README.md index ffdd328..da6fcc5 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ Minimap2 is a fast sequence mapping and alignment program that can find overlaps between long noisy reads, or map long reads or their assemblies to a reference genome optionally with detailed alignment (i.e. CIGAR). At present, it works efficiently with query sequences from a few kilobases to ~100 -megabases in length at a error rate ~15%. Minimap2 outputs in the [PAF][paf] or +megabases in length at an error rate ~15%. Minimap2 outputs in the [PAF][paf] or the [SAM format][sam]. On limited test data sets, minimap2 is over 20 times faster than most other long-read aligners. diff --git a/ksw2_extd2_sse.c b/ksw2_extd2_sse.c index af2b85c..c638738 100644 --- a/ksw2_extd2_sse.c +++ b/ksw2_extd2_sse.c @@ -42,14 +42,17 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin a2= _mm_sub_epi8(a2, tmp); \ b2= _mm_sub_epi8(b2, tmp); - int r, t, qe = q + e, n_col_, *off = 0, tlen_, qlen_, last_st, last_en, wl, wr, max_sc, long_thres, long_diff; + int r, t, qe = q + e, n_col_, *off = 0, tlen_, qlen_, last_st, last_en, wl, wr, max_sc, min_sc, long_thres, long_diff; int with_cigar = !(flag&KSW_EZ_SCORE_ONLY), approx_max = !!(flag&KSW_EZ_APPROX_MAX); int32_t *H = 0, H0 = 0, last_H0_t = 0; uint8_t *qr, *sf, *mem, *mem2 = 0; __m128i q_, q2_, qe_, qe2_, zero_, sc_mch_, sc_mis_, m1_; __m128i *u, *v, *x, *y, *x2, *y2, *s, *p = 0; - if (m <= 0 || qlen <= 0 || tlen <= 0 || w < 0) return; + ksw_reset_extz(ez); + if (m <= 1 || qlen <= 0 || tlen <= 0 || w < 0) return; + + if (q2 + e2 < q + e) t = q, q = q2, q2 = t, t = e, e = e2, e2 = t; // make sure q+e no larger than q2+e2 zero_ = _mm_set1_epi8(0); q_ = _mm_set1_epi8(q); @@ -60,16 +63,17 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin sc_mis_ = _mm_set1_epi8(mat[1]); m1_ = _mm_set1_epi8(m - 1); // wildcard - ksw_reset_extz(ez); - wl = wr = w; tlen_ = (tlen + 15) / 16; n_col_ = ((w + 1 < tlen? (w + 1 < qlen? w + 1 : qlen): tlen) + 15) / 16 + 1; qlen_ = (qlen + 15) / 16; - for (t = 1, max_sc = mat[0]; t < m * m; ++t) + for (t = 1, max_sc = mat[0], min_sc = mat[1]; t < m * m; ++t) { max_sc = max_sc > mat[t]? max_sc : mat[t]; + min_sc = min_sc < mat[t]? min_sc : mat[t]; + } + if (-min_sc > 2 * (q + e)) return; // otherwise, we won't see any mismatches - long_thres = (q2 - q) / (e - e2) - 1; + long_thres = e != e2? (q2 - q) / (e - e2) - 1 : 0; if (q2 + e2 + long_thres * e2 > q + e + long_thres * e) ++long_thres; long_diff = long_thres * (e - e2) - (q2 - q) - e2; @@ -164,6 +168,7 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin z = _mm_max_epi8(z, b); z = _mm_max_epi8(z, a2); z = _mm_max_epi8(z, b2); + z = _mm_min_epi8(z, sc_mch_); __dp_code_block2; // save u[] and v[]; update a, b, a2 and b2 _mm_store_si128(&x[t], _mm_sub_epi8(_mm_max_epi8(a, zero_), qe_)); _mm_store_si128(&y[t], _mm_sub_epi8(_mm_max_epi8(b, zero_), qe_)); @@ -178,6 +183,8 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a2)); tmp = _mm_cmpgt_epi8(b2, z); z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b2)); + tmp = _mm_cmplt_epi8(sc_mch_, z); + z = _mm_or_si128(_mm_and_si128(tmp, sc_mch_), _mm_andnot_si128(tmp, z)); __dp_code_block2; tmp = _mm_cmpgt_epi8(a, zero_); _mm_store_si128(&x[t], _mm_sub_epi8(_mm_and_si128(tmp, a), qe_)); @@ -215,6 +222,7 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin z = _mm_max_epi8(z, a2); d = _mm_blendv_epi8(d, _mm_set1_epi8(4), _mm_cmpgt_epi8(b2, z)); // d = a2 > z? 3 : d z = _mm_max_epi8(z, b2); + z = _mm_min_epi8(z, sc_mch_); #else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8() tmp = _mm_cmpgt_epi8(a, z); d = _mm_and_si128(tmp, _mm_set1_epi8(1)); @@ -228,6 +236,8 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin tmp = _mm_cmpgt_epi8(b2, z); d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(4))); z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b2)); + tmp = _mm_cmplt_epi8(sc_mch_, z); + z = _mm_or_si128(_mm_and_si128(tmp, sc_mch_), _mm_andnot_si128(tmp, z)); #endif __dp_code_block2; tmp = _mm_cmpgt_epi8(a, zero_); @@ -270,6 +280,7 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin z = _mm_max_epi8(z, a2); d = _mm_blendv_epi8(_mm_set1_epi8(4), d, _mm_cmpgt_epi8(z, b2)); // d = z > b2? d : 4 z = _mm_max_epi8(z, b2); + z = _mm_min_epi8(z, sc_mch_); #else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8() tmp = _mm_cmpgt_epi8(z, a); d = _mm_andnot_si128(tmp, _mm_set1_epi8(1)); @@ -283,6 +294,8 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin tmp = _mm_cmpgt_epi8(z, b2); d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(4))); z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, b2)); + tmp = _mm_cmplt_epi8(sc_mch_, z); + z = _mm_or_si128(_mm_and_si128(tmp, sc_mch_), _mm_andnot_si128(tmp, z)); #endif __dp_code_block2; tmp = _mm_cmpgt_epi8(zero_, a); diff --git a/ksw2_extz2_sse.c b/ksw2_extz2_sse.c index ab26747..16f4fcd 100644 --- a/ksw2_extz2_sse.c +++ b/ksw2_extz2_sse.c @@ -26,19 +26,21 @@ void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin #define __dp_code_block2 \ z = _mm_max_epu8(z, b); /* z = max(z, b); this works because both are non-negative */ \ + z = _mm_min_epu8(z, max_sc_); \ _mm_store_si128(&u[t], _mm_sub_epi8(z, vt1)); /* u[r][t..t+15] <- z - v[r-1][t-1..t+14] */ \ _mm_store_si128(&v[t], _mm_sub_epi8(z, ut)); /* v[r][t..t+15] <- z - u[r-1][t..t+15] */ \ z = _mm_sub_epi8(z, q_); \ a = _mm_sub_epi8(a, z); \ b = _mm_sub_epi8(b, z); - int r, t, qe = q + e, n_col_, *off = 0, tlen_, qlen_, last_st, last_en, wl, wr, max_sc; + int r, t, qe = q + e, n_col_, *off = 0, tlen_, qlen_, last_st, last_en, wl, wr, max_sc, min_sc; int with_cigar = !(flag&KSW_EZ_SCORE_ONLY), approx_max = !!(flag&KSW_EZ_APPROX_MAX); int32_t *H = 0, H0 = 0, last_H0_t = 0; uint8_t *qr, *sf, *mem, *mem2 = 0; - __m128i q_, qe2_, zero_, flag1_, flag2_, flag8_, flag16_, sc_mch_, sc_mis_, m1_; + __m128i q_, qe2_, zero_, flag1_, flag2_, flag8_, flag16_, sc_mch_, sc_mis_, m1_, max_sc_; __m128i *u, *v, *x, *y, *s, *p = 0; + ksw_reset_extz(ez); if (m <= 0 || qlen <= 0 || tlen <= 0 || w < 0) return; zero_ = _mm_set1_epi8(0); @@ -51,15 +53,17 @@ void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin sc_mch_ = _mm_set1_epi8(mat[0]); sc_mis_ = _mm_set1_epi8(mat[1]); m1_ = _mm_set1_epi8(m - 1); // wildcard - - ksw_reset_extz(ez); + max_sc_ = _mm_set1_epi8(mat[0] + (q + e) * 2); wl = wr = w; tlen_ = (tlen + 15) / 16; n_col_ = ((w + 1 < tlen? (w + 1 < qlen? w + 1 : qlen): tlen) + 15) / 16 + 1; qlen_ = (qlen + 15) / 16; - for (t = 1, max_sc = mat[0]; t < m * m; ++t) + for (t = 1, max_sc = mat[0], min_sc = mat[1]; t < m * m; ++t) { max_sc = max_sc > mat[t]? max_sc : mat[t]; + min_sc = min_sc < mat[t]? min_sc : mat[t]; + } + if (-min_sc > 2 * (q + e)) return; // otherwise, we won't see any mismatches mem = (uint8_t*)kcalloc(km, tlen_ * 6 + qlen_ + 1, 16); u = (__m128i*)(((size_t)mem + 15) >> 4 << 4); // 16-byte aligned