diff --git a/README.md b/README.md
index ffdd328..da6fcc5 100644
--- a/README.md
+++ b/README.md
@@ -19,7 +19,7 @@ Minimap2 is a fast sequence mapping and alignment program that can find
 overlaps between long noisy reads, or map long reads or their assemblies to a
 reference genome optionally with detailed alignment (i.e. CIGAR). At present,
 it works efficiently with query sequences from a few kilobases to ~100
-megabases in length at a error rate ~15%. Minimap2 outputs in the [PAF][paf] or
+megabases in length at an error rate ~15%. Minimap2 outputs in the [PAF][paf] or
 the [SAM format][sam]. On limited test data sets, minimap2 is over 20 times
 faster than most other long-read aligners.
 
diff --git a/ksw2_extd2_sse.c b/ksw2_extd2_sse.c
index af2b85c..c638738 100644
--- a/ksw2_extd2_sse.c
+++ b/ksw2_extd2_sse.c
@@ -42,14 +42,17 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
 	a2= _mm_sub_epi8(a2, tmp); \
 	b2= _mm_sub_epi8(b2, tmp);
 
-	int r, t, qe = q + e, n_col_, *off = 0, tlen_, qlen_, last_st, last_en, wl, wr, max_sc, long_thres, long_diff;
+	int r, t, qe = q + e, n_col_, *off = 0, tlen_, qlen_, last_st, last_en, wl, wr, max_sc, min_sc, long_thres, long_diff;
 	int with_cigar = !(flag&KSW_EZ_SCORE_ONLY), approx_max = !!(flag&KSW_EZ_APPROX_MAX);
 	int32_t *H = 0, H0 = 0, last_H0_t = 0;
 	uint8_t *qr, *sf, *mem, *mem2 = 0;
 	__m128i q_, q2_, qe_, qe2_, zero_, sc_mch_, sc_mis_, m1_;
 	__m128i *u, *v, *x, *y, *x2, *y2, *s, *p = 0;
 
-	if (m <= 0 || qlen <= 0 || tlen <= 0 || w < 0) return;
+	ksw_reset_extz(ez);
+	if (m <= 1 || qlen <= 0 || tlen <= 0 || w < 0) return;
+
+	if (q2 + e2 < q + e) t = q, q = q2, q2 = t, t = e, e = e2, e2 = t; // make sure q+e no larger than q2+e2
 
 	zero_   = _mm_set1_epi8(0);
 	q_      = _mm_set1_epi8(q);
@@ -60,16 +63,17 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
 	sc_mis_ = _mm_set1_epi8(mat[1]);
 	m1_     = _mm_set1_epi8(m - 1); // wildcard
 
-	ksw_reset_extz(ez);
-
 	wl = wr = w;
 	tlen_ = (tlen + 15) / 16;
 	n_col_ = ((w + 1 < tlen? (w + 1 < qlen? w + 1 : qlen): tlen) + 15) / 16 + 1;
 	qlen_ = (qlen + 15) / 16;
-	for (t = 1, max_sc = mat[0]; t < m * m; ++t)
+	for (t = 1, max_sc = mat[0], min_sc = mat[1]; t < m * m; ++t) {
 		max_sc = max_sc > mat[t]? max_sc : mat[t];
+		min_sc = min_sc < mat[t]? min_sc : mat[t];
+	}
+	if (-min_sc > 2 * (q + e)) return; // otherwise, we won't see any mismatches
 
-	long_thres = (q2 - q) / (e - e2) - 1;
+	long_thres = e != e2? (q2 - q) / (e - e2) - 1 : 0;
 	if (q2 + e2 + long_thres * e2 > q + e + long_thres * e)
 		++long_thres;
 	long_diff = long_thres * (e - e2) - (q2 - q) - e2;
@@ -164,6 +168,7 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
 				z = _mm_max_epi8(z, b);
 				z = _mm_max_epi8(z, a2);
 				z = _mm_max_epi8(z, b2);
+				z = _mm_min_epi8(z, sc_mch_);
 				__dp_code_block2; // save u[] and v[]; update a, b, a2 and b2
 				_mm_store_si128(&x[t],  _mm_sub_epi8(_mm_max_epi8(a,  zero_), qe_));
 				_mm_store_si128(&y[t],  _mm_sub_epi8(_mm_max_epi8(b,  zero_), qe_));
@@ -178,6 +183,8 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
 				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a2));
 				tmp = _mm_cmpgt_epi8(b2, z);
 				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b2));
+				tmp = _mm_cmplt_epi8(sc_mch_, z);
+				z = _mm_or_si128(_mm_and_si128(tmp, sc_mch_), _mm_andnot_si128(tmp, z));
 				__dp_code_block2;
 				tmp = _mm_cmpgt_epi8(a, zero_);
 				_mm_store_si128(&x[t],  _mm_sub_epi8(_mm_and_si128(tmp, a),  qe_));
@@ -215,6 +222,7 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
 				z = _mm_max_epi8(z, a2);
 				d = _mm_blendv_epi8(d, _mm_set1_epi8(4), _mm_cmpgt_epi8(b2, z)); // d = a2 > z? 3 : d
 				z = _mm_max_epi8(z, b2);
+				z = _mm_min_epi8(z, sc_mch_);
 #else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
 				tmp = _mm_cmpgt_epi8(a,  z);
 				d = _mm_and_si128(tmp, _mm_set1_epi8(1));
@@ -228,6 +236,8 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
 				tmp = _mm_cmpgt_epi8(b2, z);
 				d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(4)));
 				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b2));
+				tmp = _mm_cmplt_epi8(sc_mch_, z);
+				z = _mm_or_si128(_mm_and_si128(tmp, sc_mch_), _mm_andnot_si128(tmp, z));
 #endif
 				__dp_code_block2;
 				tmp = _mm_cmpgt_epi8(a, zero_);
@@ -270,6 +280,7 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
 				z = _mm_max_epi8(z, a2);
 				d = _mm_blendv_epi8(_mm_set1_epi8(4), d, _mm_cmpgt_epi8(z, b2)); // d = z > b2? d : 4
 				z = _mm_max_epi8(z, b2);
+				z = _mm_min_epi8(z, sc_mch_);
 #else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
 				tmp = _mm_cmpgt_epi8(z, a);
 				d = _mm_andnot_si128(tmp, _mm_set1_epi8(1));
@@ -283,6 +294,8 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
 				tmp = _mm_cmpgt_epi8(z, b2);
 				d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(4)));
 				z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, b2));
+				tmp = _mm_cmplt_epi8(sc_mch_, z);
+				z = _mm_or_si128(_mm_and_si128(tmp, sc_mch_), _mm_andnot_si128(tmp, z));
 #endif
 				__dp_code_block2;
 				tmp = _mm_cmpgt_epi8(zero_, a);
diff --git a/ksw2_extz2_sse.c b/ksw2_extz2_sse.c
index ab26747..16f4fcd 100644
--- a/ksw2_extz2_sse.c
+++ b/ksw2_extz2_sse.c
@@ -26,19 +26,21 @@ void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
 
 #define __dp_code_block2 \
 	z = _mm_max_epu8(z, b);                          /* z = max(z, b); this works because both are non-negative */ \
+	z = _mm_min_epu8(z, max_sc_); \
 	_mm_store_si128(&u[t], _mm_sub_epi8(z, vt1));    /* u[r][t..t+15] <- z - v[r-1][t-1..t+14] */ \
 	_mm_store_si128(&v[t], _mm_sub_epi8(z, ut));     /* v[r][t..t+15] <- z - u[r-1][t..t+15] */ \
 	z = _mm_sub_epi8(z, q_); \
 	a = _mm_sub_epi8(a, z); \
 	b = _mm_sub_epi8(b, z);
 
-	int r, t, qe = q + e, n_col_, *off = 0, tlen_, qlen_, last_st, last_en, wl, wr, max_sc;
+	int r, t, qe = q + e, n_col_, *off = 0, tlen_, qlen_, last_st, last_en, wl, wr, max_sc, min_sc;
 	int with_cigar = !(flag&KSW_EZ_SCORE_ONLY), approx_max = !!(flag&KSW_EZ_APPROX_MAX);
 	int32_t *H = 0, H0 = 0, last_H0_t = 0;
 	uint8_t *qr, *sf, *mem, *mem2 = 0;
-	__m128i q_, qe2_, zero_, flag1_, flag2_, flag8_, flag16_, sc_mch_, sc_mis_, m1_;
+	__m128i q_, qe2_, zero_, flag1_, flag2_, flag8_, flag16_, sc_mch_, sc_mis_, m1_, max_sc_;
 	__m128i *u, *v, *x, *y, *s, *p = 0;
 
+	ksw_reset_extz(ez);
 	if (m <= 0 || qlen <= 0 || tlen <= 0 || w < 0) return;
 
 	zero_   = _mm_set1_epi8(0);
@@ -51,15 +53,17 @@ void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
 	sc_mch_ = _mm_set1_epi8(mat[0]);
 	sc_mis_ = _mm_set1_epi8(mat[1]);
 	m1_     = _mm_set1_epi8(m - 1); // wildcard
-
-	ksw_reset_extz(ez);
+	max_sc_ = _mm_set1_epi8(mat[0] + (q + e) * 2);
 
 	wl = wr = w;
 	tlen_ = (tlen + 15) / 16;
 	n_col_ = ((w + 1 < tlen? (w + 1 < qlen? w + 1 : qlen): tlen) + 15) / 16 + 1;
 	qlen_ = (qlen + 15) / 16;
-	for (t = 1, max_sc = mat[0]; t < m * m; ++t)
+	for (t = 1, max_sc = mat[0], min_sc = mat[1]; t < m * m; ++t) {
 		max_sc = max_sc > mat[t]? max_sc : mat[t];
+		min_sc = min_sc < mat[t]? min_sc : mat[t];
+	}
+	if (-min_sc > 2 * (q + e)) return; // otherwise, we won't see any mismatches
 
 	mem = (uint8_t*)kcalloc(km, tlen_ * 6 + qlen_ + 1, 16);
 	u = (__m128i*)(((size_t)mem + 15) >> 4 << 4); // 16-byte aligned