diff --git a/Makefile b/Makefile
index 60c2104..78ee00b 100644
--- a/Makefile
+++ b/Makefile
@@ -4,7 +4,7 @@ CFLAGS=		-g -Wall -Wno-unused-function -O2
 WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
 AR=			ar
 DFLAGS=		-DHAVE_PTHREAD $(WRAP_MALLOC)
-LOBJS=		utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o
+LOBJS=		utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o intel_ext.o ed_intrav.o
 AOBJS=		QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
 			is.o bwtindex.o bwape.o kopen.o pemerge.o \
 			bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
@@ -14,18 +14,21 @@ INCLUDES=
 LIBS=		-lm -lz -lpthread
 SUBDIRS=	.
 
-.SUFFIXES:.c .o .cc
+.SUFFIXES:.c .o .cc .cpp
 
 .c.o:
 		$(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@
 
+.cpp.o:
+		$(CXX) -c $(CXXFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@
+
 all:$(PROG)
 
 bwa:libbwa.a $(AOBJS) main.o
-		$(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS)
+		$(CXX) $(DFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS)
 
 bwamem-lite:libbwa.a example.o
-		$(CC) $(CFLAGS) $(DFLAGS) example.o -o $@ -L. -lbwa $(LIBS)
+		$(CXX) $(DFLAGS) example.o -o $@ -L. -lbwa $(LIBS)
 
 libbwa.a:$(LOBJS)
 		$(AR) -csru $@ $(LOBJS)
diff --git a/bwamem.c b/bwamem.c
index e9d9304..ae5d98e 100644
--- a/bwamem.c
+++ b/bwamem.c
@@ -15,6 +15,8 @@
 #include "ksort.h"
 #include "utils.h"
 
+#include "intel_ext.h"
+
 #ifdef USE_MALLOC_WRAPPERS
 #  include "malloc_wrap.h"
 #endif
@@ -664,6 +666,21 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac
 			tmp = s->rbeg - rmax[0];
 			rs = malloc(tmp);
 			for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i];
+			if (opt->flag & MEM_F_FASTFLT) {
+				int alignedQLen, alignedRlen, score;
+				float confidence;
+				intel_filter(rs, tmp, qs, s->qbeg, s->len, 0, &alignedQLen, &alignedRlen, &score, &confidence);
+				if (confidence == 1.0) {
+					if (alignedQLen == tmp) {
+						a->score = a->truesc = score * opt->a;
+						a->qb = 0; a->rb = s->rbeg - alignedRlen;
+					} else if (alignedQLen == 0) {
+						a->score = a->truesc = s->len * opt->a;
+						a->qb = s->qbeg; a->rb = s->rbeg;
+					}
+					goto end_left_extend;
+				}
+			}
 			for (i = 0; i < MAX_BAND_TRY; ++i) {
 				int prev = a->score;
 				aw[0] = opt->w << i;
@@ -684,6 +701,8 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac
 				a->qb = 0, a->rb = s->rbeg - gtle;
 				a->truesc = gscore;
 			}
+
+end_left_extend:
 			free(qs); free(rs);
 		} else a->score = a->truesc = s->len * opt->a, a->qb = 0, a->rb = s->rbeg;
 
@@ -692,6 +711,20 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac
 			qe = s->qbeg + s->len;
 			re = s->rbeg + s->len - rmax[0];
 			assert(re >= 0);
+			if (opt->flag & MEM_F_FASTFLT) {
+				int alignedQLen, alignedRlen, score;
+				float confidence;
+				intel_filter(rseq + re, rmax[1] - rmax[0] - re, query + qe, l_query - qe, a->score / opt->a, 0, &alignedQLen, &alignedRlen, &score, &confidence);
+				if (confidence == 1.0) {
+					if (alignedQLen == tmp) {
+						a->score = a->truesc = score * opt->a;
+						a->qe = l_query; a->re = rmax[0] + re + alignedRlen;
+					} else if (alignedQLen == 0) {
+						a->qe = qe; a->rb = rmax[0] + re;
+					}
+					goto end_right_extend;
+				}
+			}
 			for (i = 0; i < MAX_BAND_TRY; ++i) {
 				int prev = a->score;
 				aw[1] = opt->w << i;
@@ -715,6 +748,7 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac
 		} else a->qe = l_query, a->re = s->rbeg + s->len;
 		if (bwa_verbose >= 4) printf("*** Added alignment region: [%d,%d) <=> [%ld,%ld); score=%d; {left,right}_bandwidth={%d,%d}\n", a->qb, a->qe, (long)a->rb, (long)a->re, a->score, aw[0], aw[1]);
 
+end_right_extend:
 		// compute seedcov
 		for (i = 0, a->seedcov = 0; i < c->n; ++i) {
 			const mem_seed_t *t = &c->seeds[i];
diff --git a/bwamem.h b/bwamem.h
index 7b14ca8..80d4617 100644
--- a/bwamem.h
+++ b/bwamem.h
@@ -19,6 +19,7 @@ typedef struct __smem_i smem_i;
 #define MEM_F_SELF_OVLP 0x40
 #define MEM_F_ALN_REG   0x80
 #define MEM_F_SOFTCLIP  0x200
+#define MEM_F_FASTFLT   0x400
 
 typedef struct {
 	int a, b;               // match score and mismatch penalty
diff --git a/ed_fine.h b/ed_fine.h
new file mode 100644
index 0000000..07b38b5
--- /dev/null
+++ b/ed_fine.h
@@ -0,0 +1,115 @@
+/* 
+The MIT License (MIT)
+Copyright (c) 2014 Intel Corp.
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+
+#include <stdint.h>
+#include <iostream>
+
+
+class BitCount8 {
+  static uint8_t lut_[256] ;
+
+ public:
+
+  static void init_lut() ;
+  
+  static int getCnt(uint8_t bitVec) {
+    return lut_[bitVec] ;
+  }
+
+} ;
+
+
+class BitCount16 {
+  static uint8_t lut_[256] ;
+
+ public:
+
+  static void init_lut() ;
+  
+  static int getCnt(uint16_t bitVec) {
+    return BitCount8::getCnt((uint8_t) (bitVec >> 8)) + 
+      BitCount8::getCnt((uint8_t) bitVec) ;
+  }
+  
+} ;
+
+
+
+// Fine-grain alignment for up to the first 16 bases.
+// Considers only alignments where qlen=rlen.
+class FineAlignment16 {
+
+  int queryLen_, endBonus_ ;
+  
+  int bestAlignedLenLow_, bestAlignedLenHigh_ ;
+  int maxScoreLow_, maxScoreHigh_ ;
+
+  int mismatchWtP1Low_, mismatchWtP1High_ ;
+  int ambigWtP1_ ;
+  
+
+ public:
+
+ FineAlignment16(int queryLen, int endBonus, int mismatchWtLow, int mismatchWtHigh, int ambigWt)
+   : queryLen_(queryLen), endBonus_(endBonus), 
+    mismatchWtP1Low_(mismatchWtLow+1), mismatchWtP1High_(mismatchWtHigh+1), 
+    ambigWtP1_(ambigWt+1),
+    bestAlignedLenLow_(0), bestAlignedLenHigh_(0), maxScoreLow_(0), maxScoreHigh_(0) {}
+  
+  void update(uint16_t UP0, uint16_t UP1, int partialLen, int ambigCnt) {
+
+    uint16_t mask = ((uint32_t)(1) << partialLen) - 1 ; // clear higher bits beyond partialLen
+    UP0 &= mask ;
+    UP1 &= mask ;
+
+    int editDist = (partialLen <= 8) ? 
+      (BitCount8::getCnt((uint8_t)UP0) - BitCount8::getCnt((uint8_t)UP1)) :
+      (BitCount16::getCnt(UP0) - BitCount16::getCnt(UP1)) ;
+
+    int bonus = (partialLen == queryLen_) ? endBonus_ : 0 ;
+
+    int scoreLow = partialLen - editDist * mismatchWtP1High_ - ambigCnt * ambigWtP1_ + bonus ;
+    int scoreHigh = partialLen - editDist * mismatchWtP1Low_ - ambigCnt * ambigWtP1_ + bonus ;
+
+    //cout << std::hex << mask << " " << UP0 << " " << UP1 << std::dec << endl ;
+    //cout << BitCount16::getCnt(UP0) << " " << BitCount16::getCnt(UP1) << endl ;
+    //cout << std::hex << (int) ((uint8_t) (UP0 >> 8)) << " " << (int) ((uint8_t) UP0) << endl ;
+    //cout << "###################len = " << partialLen << ": " << ambigCnt << " " << editDist
+    //	 << " " << scoreLow << " " << scoreHigh << endl ;
+
+    if (scoreLow > maxScoreLow_) {
+      maxScoreLow_ = scoreLow ;
+      bestAlignedLenLow_ = partialLen ;
+    }
+
+    if (scoreHigh > maxScoreHigh_) {
+      maxScoreHigh_ = scoreHigh ;
+      bestAlignedLenHigh_ = partialLen ;
+    }
+
+  }
+
+  void getBestLow(int& alignedLen, int& score) {
+    alignedLen = bestAlignedLenLow_ ;
+    score = maxScoreLow_ ;
+  }
+
+  void getBestHigh(int& alignedLen, int& score) {
+    alignedLen = bestAlignedLenHigh_ ;
+    score = maxScoreHigh_ ;
+  }
+
+  friend ostream& operator<< (ostream& os, const FineAlignment16& a) {
+    os << "{maxScores = (" << a.maxScoreLow_ << ", " << a.maxScoreHigh_ << "); best cols: "
+       << a.bestAlignedLenLow_ << ", " << a.bestAlignedLenHigh_ << "}" ;
+
+    return os ;
+  }
+
+} ;
diff --git a/ed_intrav.cpp b/ed_intrav.cpp
new file mode 100644
index 0000000..f686db2
--- /dev/null
+++ b/ed_intrav.cpp
@@ -0,0 +1,1280 @@
+/* 
+The MIT License (MIT)
+Copyright (c) 2014 Intel Corp.
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+
+#include <iostream>
+#include <stdint.h>
+#include <assert.h>
+#include <stdlib.h>
+
+#include "ed_intrav64.h"
+#include "ed_intrav64x2.h"
+
+#include "ed_intrav.h"
+#include "ed_fine.h"
+#include "filter.h"
+
+
+//using namespace std ;
+
+#define DIST_TYPE uint16_t
+
+//#define MAIN
+//#define DEBUG0
+//#define DEBUG1
+//#define DEBUG2
+
+#define FINE_ALIGN
+
+#define FILTER_USING_HAMMING_DISTANCE
+
+
+#ifdef SW_FILTER_AND_EXTEND
+#define SW_FEEDBACK 
+#endif
+
+
+int run_ksw_extend(uint8_t* refSeq, int refLen, uint8_t* querySeq, int queryLen,
+		   int bandW, int initScore, int endBonus, int zdrop, int costMatrixRowCnt,
+		   const int8_t* costMatrix, int gapo, int gape,
+		   int& alignedQLen, int& alignedRLen) {
+
+  int qle, tle, gtle, gscore, max_off ;
+
+  int pscore = ksw_extend(queryLen, querySeq, refLen, refSeq, costMatrixRowCnt, costMatrix,
+			  gapo, gape, bandW, endBonus, zdrop, initScore,
+			  &qle, &tle, &gtle, &gscore, &max_off) ;
+
+  if (gscore <= 0 || gscore <= pscore - endBonus) {
+    alignedQLen = qle ;
+    alignedRLen = tle ;
+    return pscore ;
+  }
+  else {
+    alignedQLen = queryLen ;
+    alignedRLen = gtle ;
+    return gscore ;
+  }
+}
+
+
+#ifdef SW_FEEDBACK
+
+template<class EDVec>
+void compute_sw_feedback(const EDVec& bestLowScoreVec, const EDVec& bestColVecLow,
+			 const EDVec& bestHighScoreVec, const EDVec& bestColVecHigh,
+			 const EDVec& queryLenVec, const EDVec& gapWtVec, 
+			 const EDVec& gapOpenWtVec, const EDVec& ambigBaseCntsVec,
+			 const EDVec& ambigWtP1Vec, 
+			 int mismatchWtLow, int mismatchWtHigh, int validDistCnt,
+			 SWFeedback& feedback) {
+
+  EDVec deleteCntLow = bestColVecLow.subSat(queryLenVec) ;
+  EDVec insertCntLow = queryLenVec.subSat(bestColVecLow) ;
+  EDVec deleteCntHigh = bestColVecHigh.subSat(queryLenVec) ;
+  EDVec insertCntHigh = queryLenVec.subSat(bestColVecHigh) ;
+
+  // Recompute the real edit distance.
+  // score =  - insertCnt * (gapWt+1)     
+  //          - deleteCnt * gapWt         
+  //          - (gapCnt > 0) ? gapOpenWt  
+  //          - editDist * (mismatchWt+1) 
+  //          + qlen - ambigCnt * (ambigWt+1) 
+  //
+  // edistDist * (mismatchWt+1) = qlen - ambigCnt * (ambigWt+1)
+  //                              - deleteCnt * (gapWt+1)     
+  //                              - insertCnt * gapWt    
+  //                              - (gapCnt > 0) ? gapOpenWt 
+  //                              - score 
+
+  EDVec gapCntLow = insertCntLow + deleteCntLow ;
+  EDVec gapCntHigh = insertCntHigh + deleteCntHigh ;
+  EDVec zeroVec(0) ;
+
+  EDVec gapPenaltyVecLow = gapCntLow * gapWtVec + insertCntLow ;
+  gapPenaltyVecLow.addThirdIfFirstGTSecond(gapCntLow, zeroVec, gapOpenWtVec, zeroVec) ;
+
+  EDVec gapPenaltyVecHigh = gapCntHigh * gapWtVec + insertCntHigh ;
+  gapPenaltyVecHigh.addThirdIfFirstGTSecond(gapCntHigh, zeroVec, gapOpenWtVec, zeroVec) ;
+
+  // Best scores were computed using the following formula:
+  // score = deltaScore - gapPenalty - accumDist * mismatchWtP1 ;
+  // Then:
+  // accumDist * mismatchWtP1 = deltaScore - gapPenalty - score 
+  //
+
+  EDVec ambigPenaltyVec = ambigBaseCntsVec * ambigWtP1Vec ;
+
+  EDVec weightedEDLow = queryLenVec - ambigPenaltyVec - gapPenaltyVecHigh - bestLowScoreVec ;
+  EDVec weightedEDHigh = queryLenVec - ambigPenaltyVec - gapPenaltyVecLow - bestHighScoreVec ;
+
+  //cout << queryLenVec << endl << ambigPenaltyVec << endl << gapPenaltyVecHigh << endl
+  //     << bestLowScoreVec << endl << weightedEDLow << endl ;
+
+  feedback.maxQLen = 0 ;
+  feedback.maxRLen = 0 ;
+  feedback.maxBand = 0 ;
+
+  for (int iter=0; iter < 2; ++iter) {
+    const EDVec& bestScoreVec = (iter == 0) ? bestLowScoreVec : bestHighScoreVec ;
+    const EDVec& weightedEDVec = (iter == 0) ? weightedEDLow : weightedEDHigh ;
+    const EDVec& bestColVec = (iter == 0) ? bestColVecLow : bestColVecHigh ;
+    int mismatchWt = ((iter == 0) ? mismatchWtHigh : mismatchWtLow) + 1 ;
+
+    for (int di=0; di < validDistCnt; ++di) {
+
+      // We will consider only the entries with a non-negative score
+      if (bestScoreVec.getWord(di) > 0) {
+
+#ifdef DEBUG1
+	cout << "Edit distance of " << ((iter==0) ? "low" : "high") << " score for qlen = "
+	     << queryLenVec.getWord(di) << " is " << weightedEDVec.getWord(di) / mismatchWt
+	     << ", where mismatchWt = " << mismatchWt << endl ;
+#endif
+	
+	feedback.maxBand = std::max(feedback.maxBand, 
+				    weightedEDVec.getWord(di) / mismatchWt) ;
+
+	feedback.maxQLen = std::max(feedback.maxQLen, queryLenVec.getWord(di)) ;
+	feedback.maxRLen = std::max(feedback.maxRLen, bestColVec.getWord(di)) ;
+	
+      }
+    }
+  }
+
+  feedback.maxBand += 1 ; // band should be at least 1 more than the edit distance
+}
+
+#endif // #ifdef SW_FEEDBACK
+
+template<class BitVec>
+int getValueForDebug(const BitVec& B0, const BitVec& B1, int index) {
+  int bit0 = (int) B0.getBit(index) ;
+  int bit1 = (int) B1.getBit(index) ;
+  //assert (bit0 == 0 || bit1 == 0) ;
+  //assert (bit0 <= 1 && bit1 <= 1) ;
+
+  return (bit0 == 1) ? 1 : ((bit1 == 1) ? -1 : 0) ;
+  //return (bit1 << 1) | bit0 ;
+}
+
+template<class BitVec>
+int getValueForDebug(const BitVec& B0, const BitVec& B1, const BitVec& B2, int index) {
+  int bit0 = (int) B0.getBit(index) ;
+  int bit1 = (int) B1.getBit(index) ;
+  int bit2 = (int) B2.getBit(index) ;
+  //assert (bit0 == 0 || bit1 == 0) ;
+  //assert (bit0 <= 1 && bit1 <= 1) ;
+
+  //return (bit0 == 1) ? 1 : ((bit1 == 1) ? -1 : 0) ;
+  return (bit2 << 2) | (bit1 << 1) | bit0 ;
+}
+
+
+template<class BitVec>
+void printVecForDebug(const BitVec& B0, const BitVec& B1, int cnt) {
+  for (int i=0; i < cnt; ++i) {
+    cout << getValueForDebug(B0, B1, i) ;
+    if (i % 8 == 7)
+      cout << "|" ;
+    if (i % 16 == 15)
+      cout << "|" ;
+    if (i % 32 == 31)
+      cout << "|" ;
+    if (i % 64 == 63)
+      cout << "|" ;
+    cout << " " ;
+  }
+}
+
+template<class BitVec>
+void printVecForDebug(const char* msg, const BitVec& B0, int cnt) {
+
+  cout << msg ;
+  for (int i=0; i < cnt; ++i) {
+    cout << B0.getBit(i) << " " ;
+  }
+  cout << endl ;
+}
+
+
+
+
+template<class BitVec>
+void printVecForDebug(const BitVec& B0, const BitVec& B1, const BitVec& B2, int cnt) {
+  for (int i=0; i < cnt; ++i) {
+    cout << getValueForDebug(B0, B1, i) << " " ;
+  }
+}
+
+
+template<class BitVec>
+void printVecsForDebug(const BitVec& L0, const BitVec& L1,
+		       const BitVec& U0, const BitVec& U1, int cnt) {
+
+  cout << "L: " ;
+  printVecForDebug(L0, L1, cnt) ;
+  cout << endl ;
+
+  cout << "U: " ;
+  printVecForDebug(U0, U1, cnt) ;
+  cout << endl ;
+}
+
+inline void orBitAtPosition(uint64_t& w, bool newBit, int index) {
+  w |= (((uint64_t)newBit) << index) ;
+}
+
+template<class BitVec>
+inline BitVec computeMatchVec(const BitVec& Q0, const BitVec& Q1, const BitVec& Q2,
+			      const BitVec& R0, const BitVec& R1, const BitVec& R2) {
+
+#ifdef MAIN
+  return Q2 | R2 | (Q1 ^ R1) | (Q0 ^ R0) ;
+#else
+  return ~Q2 & ~R2 & ((Q1 ^ R1) | (Q0 ^ R0)) ;
+#endif
+}
+
+/*
+template<class BitVec, class EDVec>
+class ScoreProbe {
+  DistVec<BitVec,EDVec> accumDist ;
+  EDVec ambigBaseCnts ;
+  
+  EDVec bestLowScoreVec, bestHighScoreVec, bestColVecLow, bestColVecHigh, currNegLowScoreVec,
+    currNegHighScoreVec, currColVec ;
+  EDVec queryLenVec ;
+} ;
+*/
+
+int computeHammingDistance(const uint8_t* refSeq, int refLen, uint8_t* querySeq, int queryLen, int upperBound, int& lastConsecutiveMatching) {
+  
+  int cnt = min(refLen, queryLen) ;
+  int dist = 0 ;
+  lastConsecutiveMatching = 0 ;
+  for (int ci=0; ci < cnt; ++ci) {
+
+    ++lastConsecutiveMatching ;
+    if (refSeq[ci] != querySeq[ci]) {
+      ++dist ;
+      lastConsecutiveMatching = 0 ;
+      if (dist >= upperBound)
+	return dist ;
+    }
+  }
+
+  return dist ;
+}
+
+
+template<class BitVec, class EDVec>
+DIST_TYPE edit_dist(uint8_t* refSeq, int refLen, uint8_t* querySeq, int queryLen,
+		    int initScore, int endBonus, 
+		    int& alignedQLenLow, int& alignedQLenHigh,
+		    int& alignedRLenLow, int& alignedRLenHigh,
+		    int& maxScoreLow, int& maxScoreHigh, SWFeedback& swfb,
+		    bool printDebug = false) {
+
+  assert (BitVec::isQueryLengthOk(queryLen)) ;
+
+#ifdef DEBUG0
+  cout << "Query and ref lens for SW problem: " << queryLen << " " << refLen << endl ;
+#endif
+
+#ifdef FILTER_USING_HAMMING_DISTANCE
+  
+  if (queryLen >= 20) {
+    int lastConsecutiveMatching=0 ;
+    int upperBound = queryLen / 6 ;
+
+    int hdist = computeHammingDistance(refSeq, refLen, querySeq, queryLen, upperBound, lastConsecutiveMatching) ;
+    
+    if (hdist < upperBound && lastConsecutiveMatching > 6) {
+#ifdef DEBUG0
+      cout << "The Hamming distance is " << hdist  << " with "
+	   << lastConsecutiveMatching
+	   << " bases matches at the end of query. " << endl ;
+      cout << "Returning full alignment directly." << endl ;
+#endif
+      alignedQLenLow = alignedQLenHigh = queryLen ;
+      alignedRLenLow = alignedRLenHigh = queryLen ;
+      maxScoreLow = queryLen - hdist - hdist * 6 ;
+      maxScoreHigh = queryLen - hdist - hdist * 4 ;
+
+      return hdist ;
+    }
+  }
+  
+  
+
+
+#endif // #ifdef FILTER_USING_HAMMING_DISTANCE
+
+
+  BitVec L0, L1, U0, U1, UP0, UP1 ;
+  BitVec Q0, Q1, Q2 ;
+  BitVec D[5] ; // profile vecs
+
+#ifdef MAIN
+  bool initL0 = 1 ;
+  bool initL1 = 0 ;
+
+  bool initU0 = 1 ;
+  bool initU1 = 0 ;
+#else
+  bool initL0 = 0 ;
+  bool initL1 = 0 ;
+
+  bool initU0 = 0 ;
+  bool initU1 = 0 ;
+#endif
+
+
+#ifdef FINE_ALIGN
+  const int FINE_ALIGN_CNT = 16 ; // align the first 16 bases in a fine-grain way
+  assert (FINE_ALIGN_CNT <= 16) ; // the rest of the code assumes this.
+  int ambigBaseCntsFineArr[FINE_ALIGN_CNT] ;
+  for (int i=0; i < FINE_ALIGN_CNT; ++i)
+    ambigBaseCntsFineArr[i] = 0 ;
+#endif
+
+  EDVec ambigBaseCnts(0) ;
+  int ambigBaseCntsArr[EDVec::WORD_CNT()] ;
+  for (int i=0; i < EDVec::WORD_CNT(); ++i)
+    ambigBaseCntsArr[i] = 0 ;
+  
+  int ambigWordIndex = 0 ;
+  //int totalAmbigBaseCnt = 0 ;
+  //int lastWordUpdated = 0 ;
+  // Store the query in bit vectors
+  for (int wIndex=0; wIndex * 63 < queryLen; ++wIndex) {
+    uint64_t bit0Partial = 0x0 ;
+    uint64_t bit1Partial = 0x0 ;
+    uint64_t bit2Partial = 0x0 ;
+
+    for (int bIndex=0; bIndex < 63; ++bIndex) {
+      int qi = wIndex * 63 + bIndex ;
+      if (qi >= queryLen)
+	break ;
+      uint8_t qbase = querySeq[qi] ;
+      if (qbase == 4) {
+	//lastWordUpdated = EDVec::getWordIndexFor(qi) ;
+	//ambigBaseCntsArr[lastWordUpdated] = ++totalAmbigBaseCnt ;
+	//++ambigBaseCntsArr[EDVec::getWordIndexFor(qi)] ;
+
+	while (qi >= EDVec::getDistAt(ambigWordIndex, queryLen))
+	  ++ambigWordIndex ;
+	++ambigBaseCntsArr[ambigWordIndex] ;
+
+	//cout << "Ambig base at " << wIndex << " " << bIndex << " " << qi << endl ;
+	//cout << "Word index for ambig base: " << ambigWordIndex << endl ;
+
+#ifdef FINE_ALIGN
+	if (qi < FINE_ALIGN_CNT)
+	  ++ambigBaseCntsFineArr[qi] ;
+#endif
+      }
+
+      orBitAtPosition(bit0Partial, qbase & 0x1, bIndex) ;
+      orBitAtPosition(bit1Partial, (qbase & 0x2) >> 1, bIndex) ;
+      orBitAtPosition(bit2Partial, (qbase & 0x4) >> 2, bIndex) ;
+    }
+    Q0.setWord(wIndex, bit0Partial) ;
+    Q1.setWord(wIndex, bit1Partial) ;
+    Q2.setWord(wIndex, bit2Partial) ;
+  }
+  
+  
+  // Compute running sum of ambig base cnts
+  for (int wi=1; wi < EDVec::WORD_CNT(); ++wi) {
+    ambigBaseCntsArr[wi] += ambigBaseCntsArr[wi-1] ;
+  }
+  ambigBaseCnts.setWords(ambigBaseCntsArr) ;
+  
+
+#ifdef FINE_ALIGN
+  for (int wi=1; wi < FINE_ALIGN_CNT; ++wi) {
+    ambigBaseCntsFineArr[wi] += ambigBaseCntsFineArr[wi-1] ;
+  }
+#endif
+
+  // pre-compute the profile vectors
+  BitVec B0, B1 ;
+  B0.setAllZeroes() ;
+  B1.setAllOnes() ;
+  
+  D[0] = computeMatchVec(Q0, Q1, Q2, B0, B0, B0) ;
+  D[1] = computeMatchVec(Q0, Q1, Q2, B1, B0, B0) ;
+  D[2] = computeMatchVec(Q0, Q1, Q2, B0, B1, B0) ;
+  D[3] = computeMatchVec(Q0, Q1, Q2, B1, B1, B0) ;
+  D[4] = computeMatchVec(Q0, Q1, Q2, B0, B0, B1) ;
+  
+
+  // Initialize boundary conditions
+  BitVec initL0Vec, initL1Vec ;
+  initL0Vec.setLSBClearRest(initL0) ;
+  initL1Vec.setLSBClearRest(initL1) ;
+
+  U0.setAllBits(initU0) ;
+  U1.setAllBits(initU1) ;
+
+  // Init the edit distance vec.
+  DistVec<BitVec,EDVec> accumDist(queryLen) ;
+  accumDist.initDist(initU0, initU1) ;
+  EDVec bestAccumDistLow = accumDist.getVec() ;
+  EDVec bestAccumDistHigh = accumDist.getVec() ;
+
+  int INF_SCORE = EDVec::getMaxWordVal() ;
+
+  EDVec currColVec(0) ;
+  EDVec bestLowScore(0), bestHighScore(0), bestColVecLow(0), bestColVecHigh(0) ;
+
+  EDVec queryLenVec ;
+  queryLenVec.setWordsAsDist(queryLen, 1) ;
+
+  int mismatchWtLow = 3; // 3 ;
+  int mismatchWtHigh = 6 ; // 6 ;
+  int gapWt = 1 ;
+  int gapOpenWt = 6 ; // 2
+  int ambigWt = 1 ;
+
+#ifdef FINE_ALIGN
+  FineAlignment16 fineAlign(queryLen, endBonus, mismatchWtLow, mismatchWtHigh, ambigWt) ;
+#endif
+
+  EDVec mismatchWtLowVec(mismatchWtLow), mismatchWtHighVec(mismatchWtHigh),
+    mismatchWtLowP1Vec(mismatchWtLow+1), mismatchWtHighP1Vec(mismatchWtHigh+1), // P1: plus 1
+    gapWtVec(gapWt), gapWtP1Vec(gapWt+1), gapOpenWtVec(gapOpenWt), ambigWtP1Vec(ambigWt+1), 
+    zeroVec(0) ;
+  
+  initScore -= (6-gapOpenWt) ;
+  initScore += endBonus ;
+
+  EDVec endBonusVec(0), deltaScoreVec(initScore), badScoreVec(0) ;
+  endBonusVec.setWordsAsEndBonus(queryLen, endBonus) ;
+  deltaScoreVec = deltaScoreVec + queryLenVec - ambigBaseCnts * ambigWtP1Vec + endBonusVec ;
+  badScoreVec.setWordsAsBadScore(queryLen, 0, INF_SCORE) ;
+
+  EDVec deltaCostInsert((gapWt+1)) ; // insert costs decrease as ci increases
+  EDVec deltaCostDelete(0) ; // delete costs increase as ci increases
+  EDVec deltaCostDistWtLow(-(mismatchWtLow+1)), deltaCostDistWtHigh(-(mismatchWtHigh+1)) ;
+  EDVec deltaColVec(1) ;
+  
+  EDVec insertVecModifier(0), deleteVecModifier(0), gapOpenDelta(0) ;
+  insertVecModifier.setFirstWord(-(gapWt+1)) ; // will cancel one entry when added
+  deleteVecModifier.setFirstWord(-gapWt) ; // will add deleteWt to one entry
+  gapOpenDelta.setFirstWord(gapOpenWt) ; // will remove/add gapOpenWt from/to one diag entry
+
+  EDVec firstColCost = deltaScoreVec - queryLenVec * gapWtP1Vec - gapOpenWtVec ;
+
+  EDVec currLowScore = firstColCost ;
+  EDVec currHighScore = firstColCost ;
+  EDVec deltaMismatch(0) ;
+
+#ifdef DEBUG1
+  cout << " The init score is: " << initScore << endl ;
+  cout << " The cost of the 0th column: " << firstColCost << endl ;
+#endif
+
+
+
+  // Score is computed as follows:
+  // (Note that ambigCnt and gapCnt are NOT included in editDist.)
+  // (If the gap type is deletion and there are ambig bases in query, the score estimated
+  //  is lower than the actual score. Some of the ambig bases may be skipped in deletions, but
+  //  we don't capture this. This can be fixed later by subtracting delete-cnt from ambigCnt
+  //  at every column less than qlen.)
+  // score =  (qlen - editDist - ambigCnt - insertCnt) // score corresponding to matches
+  //          - ambigCnt * ambigWt // score corresponding to ambig bases
+  //          - (deleteCnt+insertCnt) * gapWt // score corresponding to gap extends
+  //          - editDist * mismatchWt // score corresponding to mismatches
+  // Reordering the constant terms, we have:
+  // score =  - insertCnt * (gapWt+1)     // variable term
+  //          - deleteCnt * gapWt         // variable term
+  //          - editDist * (mismatchWt+1) // variable term
+  //          + qlen - ambigCnt * (ambigWt+1) // constant terms
+
+
+#define PRINT_UPDATE_COST_DEBUG(__ci) {					\
+    cout << "In iteration " << __ci << ", the cost values are updated to:" << endl ; \
+    cout << "L0: " << std::hex << L0 << std::dec << endl ;		\
+    cout << "L1: " << std::hex << L1 << std::dec << endl ;		\
+    cout << "currColVec: " << currColVec << endl ;			\
+    cout << "queryLenVec: " << queryLenVec << endl ;			\
+    cout << "deltaCostInsert: " << deltaCostInsert << endl ;	        \
+    cout << "deltaCostDelete: " << deltaCostDelete << endl ;      	\
+    cout << "deltaMismatch: " << deltaMismatch << endl ;		\
+    cout << "deltaCostDistWtHigh: " << deltaCostDistWtHigh << endl ;	\
+    cout << "deltaCostDistWtLow: " << deltaCostDistWtLow << endl ;	\
+    cout << "currLowScore: " << currLowScore << endl ;			\
+    cout << "currHighScore: " << currHighScore << endl ;		\
+    cout << "bestColLow: " << bestColVecLow << endl ;			\
+    cout << "bestColHigh: " << bestColVecHigh << endl ;			\
+    cout << "bestLowScore: " << bestLowScore << endl ;			\
+    cout << "bestHighScore: " << bestHighScore << endl ;		\
+  }
+
+#define PRINT_DELTA_MODIFICATIONS_DEBUG(__ci) {			       \
+    cout << "The delta costs have been updated after processing column " << __ci << endl ; \
+    cout << "New delta cost insert: " << deltaCostInsert << endl ;	\
+    cout << "New delta cost delete: " << deltaCostDelete << endl ;	\
+    cout << "using the modifiers: " << endl ;				\
+    cout << "Insert vec modifier: " << insertVecModifier << endl ;	\
+    cout << "Delete vec modifier: " << deleteVecModifier << endl ;	\
+  }
+
+
+#define COMPUTE_LU(__ci) {			\
+    uint8_t refChar = refSeq[ci] ;		\
+    const BitVec& currD = D[refChar] ;		\
+    BitVec SL1 = U0.andnot(currD) ;		\
+    SL1.shiftLeftAndInsert(initL1) ;		\
+						\
+    BitVec INJ = U0 & SL1 ;			\
+    BitVec SUM = INJ + U0 ;			\
+    BitVec CIN = SUM ^ INJ ^ U0 ;		\
+						\
+    L1 = SL1 | CIN ;				\
+    L0 = U1 | currD.andnot(L1).andnot(U0) ;	\
+    L0.shiftLeftAndInsert(initL0) ;		\
+						\
+    BitVec TU = currD.andnot(U1) ;		\
+    U1 = L0.andnot(TU) ;			\
+    U0 = L1 | TU.andnot(L0) ;			\
+						\
+    /* Add delta costs corresponding to the current mismatch delta (-1, 0, or 1) */ \
+    deltaMismatch = accumDist.getDeltaDistVec(L0, L1) ;			\
+    accumDist.addDist(deltaMismatch) ;					\
+    									\
+  }
+
+#define UPDATE_COSTS(__ci) {						\
+    currColVec += deltaColVec ;						\
+    EDVec gapDelta = deltaCostInsert + deltaCostDelete ;		\
+    /*gapDelta.addThirdIfFirstGTSecond(gapDelta, zeroVec, gapOpenWtVec, zeroVec) ; */ \
+    currLowScore += gapDelta ;						\
+    currHighScore += gapDelta ;						\
+									\
+    currLowScore += (deltaMismatch * deltaCostDistWtHigh) ;		\
+    currHighScore += (deltaMismatch * deltaCostDistWtLow) ;		\
+    bestLowScore.setMax(currLowScore, bestColVecLow, currColVec) ;	\
+    bestHighScore.setMax(currHighScore, bestColVecHigh, currColVec) ;	\
+  }
+
+#define UPDATE_COSTS_AND_CHECK(__ci) {					\
+    currColVec += deltaColVec ;						\
+    EDVec gapDelta = deltaCostInsert + deltaCostDelete ;		\
+    /*gapDelta.addThirdIfFirstGTSecond(gapDelta, zeroVec, gapOpenWtVec, zeroVec) ; */ \
+    currLowScore += gapDelta ;						\
+    currHighScore += gapDelta ;						\
+									\
+    /* Add delta costs corresponding to the current mismatch delta (-1, 0, or 1) */  \
+    deltaMismatch = accumDist.getDeltaDistVec(L0, L1) ;		\
+									\
+    currLowScore += (deltaMismatch * deltaCostDistWtHigh) ;		\
+    currHighScore += (deltaMismatch * deltaCostDistWtLow) ;		\
+    updateFlag |= bestLowScore.setMaxAndReturnFlag(currLowScore, bestColVecLow, currColVec) ; \
+    updateFlag |= bestHighScore.setMaxAndReturnFlag(currHighScore, bestColVecHigh, currColVec) ; \
+  }
+
+#define UPDATE_COSTS_AND_STORE_BEST_ACCUM_DIST(__ci) {			\
+    currColVec += deltaColVec ;						\
+    EDVec gapDelta = deltaCostInsert + deltaCostDelete ;		\
+    /*gapDelta.addThirdIfFirstGTSecond(gapDelta, zeroVec, gapOpenWtVec, zeroVec) ; */ \
+    currLowScore += gapDelta ;						\
+    currHighScore += gapDelta ;						\
+									\
+    /* Add delta costs corresponding to the current mismatch delta (-1, 0, or 1) */  \
+    deltaMismatch = accumDist.getDeltaDistVec(L0, L1) ;		\
+									\
+    currLowScore += (deltaMismatch * deltaCostDistWtHigh) ;		\
+    currHighScore += (deltaMismatch * deltaCostDistWtLow) ;		\
+    bestLowScore.setMax(currLowScore, bestColVecLow, currColVec, bestAccumDistLow, accumDist.getVec()) ; \
+    bestHighScore.setMax(currHighScore, bestColVecHigh, currColVec, bestAccumDistHigh, accumDist.getVec()) ; \
+  }
+
+#define UPDATE_COSTS_AND_CHECK_AND_STORE_BEST_ACCUM_DIST(__ci) {	\
+    currColVec += deltaColVec ;						\
+    EDVec gapDelta = deltaCostInsert + deltaCostDelete ;		\
+    /*gapDelta.addThirdIfFirstGTSecond(gapDelta, zeroVec, gapOpenWtVec, zeroVec) ; */ \
+    currLowScore += gapDelta ;						\
+    currHighScore += gapDelta ;						\
+									\
+    /* Add delta costs corresponding to the current mismatch delta (-1, 0, or 1) */  \
+    deltaMismatch = accumDist.getDeltaDistVec(L0, L1) ;		\
+									\
+    currLowScore += (deltaMismatch * deltaCostDistWtHigh) ;		\
+    currHighScore += (deltaMismatch * deltaCostDistWtLow) ;		\
+    updateFlag |= bestLowScore.setMaxAndReturnFlag(currLowScore, bestColVecLow, currColVec, bestAccumDistLow, accumDist.getVec()) ; \
+    updateFlag |= bestHighScore.setMaxAndReturnFlag(currHighScore, bestColVecHigh, currColVec, bestAccumDistHigh, accumDist.getVec()) ; \
+  }
+
+
+#define REMOVE_GAP_OPEN_PENALTY_FOR_DIAG(__ci) {			\
+    currLowScore += gapOpenDelta ;					\
+    currHighScore += gapOpenDelta ;					\
+  }
+
+#define ADD_GAP_OPEN_PENALTY_TO_DIAG(__ci) {				\
+    currLowScore -= gapOpenDelta ;					\
+    currHighScore -= gapOpenDelta ;					\
+    									\
+    /* shift the modifier so that they modify the next lane in the next update */ \
+    gapOpenDelta.shiftWordsLeftByOne() ;				\
+  }
+
+#define PERFORM_DELTA_MODIFICATIONS(__ci) {				\
+    deltaCostInsert += insertVecModifier ;				\
+    deltaCostDelete += deleteVecModifier ;				\
+    									\
+    /* shift the modifiers so that they modify the next lane in the next update */ \
+    insertVecModifier.shiftWordsLeftByOne() ;				\
+    deleteVecModifier.shiftWordsLeftByOne() ;				\
+    									\
+  }
+
+#define UPDATE_FINE_ALIGN(__ci) {		\
+    if (__ci < FINE_ALIGN_CNT)						\
+      fineAlign.update(U0.getLow16Bits(), U1.getLow16Bits(), __ci+1,	\
+		       ambigBaseCntsFineArr[__ci]) ;			\
+    									\
+  }
+
+#define CHECK_TERMINATION(__ci) {				\
+    if (currHighScore.allLessThanOrEqualTo(badScoreVec)) {	\
+      goto loopEnd ;						\
+    }								\
+  }
+
+#define CHECK_COSTS(__ci) {						\
+    EDVec deleteCntVec = currColVec.subSat(queryLenVec) ;		\
+    EDVec insertCntVec = queryLenVec.subSat(currColVec) ;		\
+    EDVec gapCntVec = insertCntVec + deleteCntVec ;			\
+									\
+    EDVec gapPenaltyVec = gapCntVec * gapWtVec + insertCntVec ;		\
+    gapPenaltyVec.addThirdIfFirstGTSecond(gapCntVec, zeroVec, gapOpenWtVec, zeroVec) ; \
+									\
+    EDVec currLowScoreVecTest =						\
+      deltaScoreVec - gapPenaltyVec - accumDist.getVec() * mismatchWtHighP1Vec ; \
+    EDVec currHighScoreVecTest =							\
+      deltaScoreVec - gapPenaltyVec - accumDist.getVec() * mismatchWtLowP1Vec ;	\
+									\
+    if (!(currLowScoreVecTest == currLowScore) || !(currHighScoreVecTest == currHighScore)) { \
+      cout << "Error: Mismatch in cost values computed using two methods at column " << ci << endl ; \
+      cout << "deleteCntVec: " << deleteCntVec << endl ;		\
+      cout << "insertCntVec: " << insertCntVec << endl ;		\
+      cout << "gapPenaltyVec: " << gapPenaltyVec << endl ;		\
+      cout << "low score (test): " << currLowScoreVecTest << endl ;	\
+      cout << "high score (test): " << currHighScoreVecTest << endl ;	\
+      exit(0) ;								\
+    }									\
+  }
+
+  uint16_t flagMask = (1 << (EDVec::getLastWordIndexFor(queryLen)+1)) - 1 ;
+  int firstProbeOffset = accumDist.getProbeColumn(0) ;
+  int numFullIters ;
+  
+  int ci = 0 ;
+  for (; ci < firstProbeOffset; ++ci) {
+    COMPUTE_LU(ci) ;
+
+#ifdef SW_FEEDBACK
+    UPDATE_COSTS_AND_STORE_BEST_ACCUM_DIST(ci) ;
+#else
+    UPDATE_COSTS(ci) ;
+#endif
+
+#ifdef FINE_ALIGN
+    UPDATE_FINE_ALIGN(ci) ;
+#endif
+#ifdef DEBUG1
+    PRINT_UPDATE_COST_DEBUG(ci) ;
+#endif
+    //CHECK_COSTS(ci) ;
+  }
+  
+  REMOVE_GAP_OPEN_PENALTY_FOR_DIAG(ci) ;
+  
+  if (firstProbeOffset >= 0) {
+    COMPUTE_LU(ci) ;
+
+#ifdef SW_FEEDBACK
+    UPDATE_COSTS_AND_STORE_BEST_ACCUM_DIST(ci) ;
+#else
+    UPDATE_COSTS(ci) ;
+#endif
+
+#ifdef FINE_ALIGN
+    UPDATE_FINE_ALIGN(ci) ;
+#endif
+#ifdef DEBUG1
+    PRINT_UPDATE_COST_DEBUG(ci) ;
+#endif
+    //CHECK_COSTS(ci) ;
+    ++ci ; 
+  }
+  ADD_GAP_OPEN_PENALTY_TO_DIAG(ci-1) ;
+
+  PERFORM_DELTA_MODIFICATIONS(ci-1) ;
+#ifdef DEBUG1
+  PRINT_DELTA_MODIFICATIONS_DEBUG(ci-1) ;
+#endif
+  CHECK_TERMINATION(ci-1) ;
+
+  numFullIters = (min(queryLen, refLen)-ci+1) /EDVec::PERIOD() ;
+  for (int iterIndex = 0; iterIndex < numFullIters; ++iterIndex) {
+
+    //cout << "OUTER ITERATION: " << iterIndex << " of " << numFullIters << endl ;
+
+    for (int subIterIndex=0; subIterIndex < EDVec::PERIOD()-1; ++subIterIndex, ++ci) {
+      COMPUTE_LU(ci) ;
+#ifdef SW_FEEDBACK
+    UPDATE_COSTS_AND_STORE_BEST_ACCUM_DIST(ci) ;
+#else
+    UPDATE_COSTS(ci) ;
+#endif
+
+#ifdef FINE_ALIGN
+      UPDATE_FINE_ALIGN(ci) ;
+#endif
+#ifdef DEBUG1
+      PRINT_UPDATE_COST_DEBUG(ci) ;
+#endif
+      //CHECK_COSTS(ci) ;
+    }
+    REMOVE_GAP_OPEN_PENALTY_FOR_DIAG(ci) ;
+    COMPUTE_LU(ci) ;
+
+#ifdef SW_FEEDBACK
+    UPDATE_COSTS_AND_STORE_BEST_ACCUM_DIST(ci) ;
+#else
+    UPDATE_COSTS(ci) ;
+#endif
+
+#ifdef FINE_ALIGN
+    UPDATE_FINE_ALIGN(ci) ;
+#endif
+#ifdef DEBUG1
+    PRINT_UPDATE_COST_DEBUG(ci) ;
+#endif
+    //CHECK_COSTS(ci) ;
+    ADD_GAP_OPEN_PENALTY_TO_DIAG(ci) ;
+
+    PERFORM_DELTA_MODIFICATIONS(ci) ;
+#ifdef DEBUG1
+    PRINT_DELTA_MODIFICATIONS_DEBUG(ci) ;
+#endif
+
+    CHECK_TERMINATION(ci) ;
+    ++ci ;
+  }
+
+
+  // If there are 3 words in EDVec, flagMask will be 0....0111 in binary
+  
+  // Next, process the columns with indices greater than or equal to query length
+  numFullIters = (refLen-ci) / EDVec::PERIOD() ;
+  for (int iterIndex = 0; iterIndex < numFullIters; ++iterIndex) {
+    uint16_t updateFlag = 0x0 ;
+
+    for (int subIterIndex=0; subIterIndex < EDVec::PERIOD(); ++subIterIndex, ++ci) {
+      COMPUTE_LU(ci) ;
+
+#ifdef SW_FEEDBACK
+      UPDATE_COSTS_AND_CHECK_AND_STORE_BEST_ACCUM_DIST(ci) ;
+#else
+      UPDATE_COSTS_AND_CHECK(ci) ;
+#endif
+
+
+#ifdef DEBUG1
+      PRINT_UPDATE_COST_DEBUG(ci) ;
+#endif
+      //CHECK_COSTS(ci) ; // will fail for the padded entries
+    }
+
+    updateFlag &= flagMask ;
+
+    //cout << ci << " " << queryLen << " " << refLen << " " 
+    //	 << std::hex << updateFlag << std::dec << endl ;
+    if (!updateFlag) // if the best score hasn't been updated in any iteration
+      goto loopEnd ;
+
+    CHECK_TERMINATION(ci) ;
+
+
+
+  }
+
+
+
+
+  // Next, process the remaining columns
+  for (; ci < refLen; ++ci) {
+    COMPUTE_LU(ci) ;
+
+#ifdef SW_FEEDBACK
+    UPDATE_COSTS_AND_STORE_BEST_ACCUM_DIST(ci) ;
+#else
+    UPDATE_COSTS(ci) ;
+#endif
+
+#ifdef DEBUG1
+    PRINT_UPDATE_COST_DEBUG(ci) ;
+#endif
+    // CHECK_COSTS(ci) ; // will fail for the padded entries
+  }
+
+ loopEnd:
+#ifdef DEBUG0
+  if (ci < refLen)
+    cout << "Loop terminated after iteration " << ci << endl ;
+#endif
+  
+  //maxScoreHigh = initScore ;
+  //maxScoreLow = initScore ;
+
+  //const int SCORE_INF = 10000 ;
+  //maxScoreHigh = -SCORE_INF ;
+  //maxScoreLow = -SCORE_INF ;
+
+  maxScoreHigh = initScore ;
+  maxScoreLow = initScore ;
+
+  alignedQLenLow = 0 ;
+  alignedQLenHigh = 0 ;
+  alignedRLenLow = 0 ;
+  alignedRLenHigh = 0 ;
+
+#ifdef FINE_ALIGN
+  int alignedLenLowFine, alignedLenHighFine, scoreLowFine, scoreHighFine ;
+  fineAlign.getBestLow(alignedLenLowFine, scoreLowFine) ;
+  fineAlign.getBestHigh(alignedLenHighFine, scoreHighFine) ;
+
+  scoreLowFine += initScore ;
+  scoreHighFine += initScore ;
+
+  if (scoreLowFine > maxScoreLow) {
+    maxScoreLow = scoreLowFine ;
+    alignedQLenLow = alignedLenLowFine ;
+    alignedRLenLow = alignedLenLowFine ;
+  }
+
+  if (scoreHighFine > maxScoreHigh) {
+    maxScoreHigh = scoreHighFine ;
+    alignedQLenHigh = alignedLenHighFine ;
+    alignedRLenHigh = alignedLenHighFine ;
+  }
+
+#ifdef DEBUG0 
+  cout << "The fine-grain alignment: " << fineAlign << endl ;
+#endif
+
+#endif // #ifdef FINE_ALIGN
+
+
+#ifdef SW_FEEDBACK
+  swfb = SWFeedback(alignedQLenHigh, alignedRLenHigh, 1) ;
+  const int scoreTolerance = EDVec::PERIOD() ;
+#endif
+
+  for (int di=0; di < accumDist.getValidDistCnt(); ++di) {
+  //for (int di=0; di < 1; ++di) {
+
+    int currQLen = queryLenVec.getWord(di) ;
+    int currRLenLow = bestColVecLow.getWord(di) ;
+    int currRLenHigh = bestColVecHigh.getWord(di) ;
+
+    // if (currQLen < 5) {
+    //  continue ; // skip the partial alignments that are too short
+    //}
+    
+    int currLowScore = bestLowScore.getWord(di) ; 
+    //currLowScore -= (currQLen > 16) ? 5 : 0 ;
+
+    if (currLowScore > maxScoreLow) {
+      maxScoreLow = currLowScore ;
+      alignedQLenLow = currQLen ;
+      alignedRLenLow = currRLenLow ;
+    }
+
+    int currHighScore = bestHighScore.getWord(di) ;//+ ((currQLen == queryLen) ? endBonus : 0) ;
+    //currHighScore += (currQLen > 16) ? 5 : 0 ;
+
+    if (currHighScore > maxScoreHigh) {
+      maxScoreHigh = currHighScore ;
+      alignedQLenHigh = currQLen ; 
+      alignedRLenHigh = currRLenHigh ; 
+    }
+
+#ifdef SW_FEEDBACK
+
+    int currDistLow = bestAccumDistLow.getWord(di) ;
+    int currDistHigh = bestAccumDistHigh.getWord(di) ;
+    int currGapLow = (currQLen > currRLenLow) ? (currQLen - currRLenLow) : (currRLenLow - currQLen) ;
+    int currGapHigh = (currQLen > currRLenHigh) ? (currQLen - currRLenHigh) : (currRLenHigh - currQLen) ;
+
+    int currQLenUpdated = min(queryLen, currQLen+EDVec::PERIOD()-1) ;
+
+    int currRLenLowUpdated = min(refLen, max(currRLenLow, currQLenUpdated)+currDistLow) ;
+    int currRLenHighUpdated = min(refLen, max(currRLenHigh, currQLenUpdated)+currDistHigh) ;
+
+
+
+    if (currLowScore > maxScoreLow - scoreTolerance) {
+      swfb.updateMax(currQLenUpdated, currRLenLowUpdated, currDistLow + currGapLow + 1) ;
+    }
+    if (currHighScore > maxScoreHigh - scoreTolerance) {
+      swfb.updateMax(currQLenUpdated, currRLenHighUpdated, currDistHigh + currGapHigh + 1) ;
+    }
+#endif
+
+#ifdef DEBUG0
+    cout << "For partialQLen " << queryLenVec.getWord(di) << ": "
+	 << "score = [" << currLowScore << ", " << currHighScore << "], "
+ 	 << "partial ref len = [" << bestColVecLow.getWord(di) << ", "
+	 << bestColVecHigh.getWord(di) << "], "
+	 << "qlen in vector = " << queryLenVec.getWord(di) << " "
+	 << "ambigBaseCnt = " << ambigBaseCnts.getWord(di) << ", "
+#ifdef SW_FEEDBACK
+	 << "editDist = [" << currDistLow << ", " << currDistHigh << "]"
+#endif
+	 << endl ;
+
+#ifdef SW_FEEDBACK
+    cout << "The feedback to be sent: " << swfb << endl ;
+#endif
+
+#endif
+  }
+
+  
+  // Adjust the alignedRLen value for full alignment for obvious cases
+  if (alignedQLenHigh == alignedQLenLow && alignedQLenHigh == queryLen && 
+      alignedRLenHigh == alignedRLenLow && alignedQLenHigh >= 4 && alignedRLenHigh >= 5) {
+
+    int origMatchCnt = 
+      (refSeq[alignedRLenHigh-1] == querySeq[alignedQLenHigh-1])
+      + (refSeq[alignedRLenHigh-2] == querySeq[alignedQLenHigh-2])
+      + (refSeq[alignedRLenHigh-3] == querySeq[alignedQLenHigh-3])
+      + (refSeq[alignedRLenHigh-4] == querySeq[alignedQLenHigh-4]) ;
+
+    int back1MatchCnt =
+      (refSeq[alignedRLenHigh-2] == querySeq[alignedQLenHigh-1])
+      + (refSeq[alignedRLenHigh-3] == querySeq[alignedQLenHigh-2])
+      + (refSeq[alignedRLenHigh-4] == querySeq[alignedQLenHigh-3]) 
+      + (refSeq[alignedRLenHigh-5] == querySeq[alignedQLenHigh-4]) ;
+    
+    int forw1MatchCnt = 0 ;
+    if (alignedRLenHigh < refLen-1) {
+      forw1MatchCnt = 
+	(refSeq[alignedRLenHigh] == querySeq[alignedQLenHigh-1])
+	+ (refSeq[alignedRLenHigh-1] == querySeq[alignedQLenHigh-2])
+	+ (refSeq[alignedRLenHigh-2] == querySeq[alignedQLenHigh-3])
+	+ (refSeq[alignedRLenHigh-3] == querySeq[alignedQLenHigh-4]) ;
+    }
+
+    if (back1MatchCnt > origMatchCnt && back1MatchCnt >= forw1MatchCnt) {
+      --alignedRLenHigh ;
+      --alignedRLenLow ;
+    }
+    else if (forw1MatchCnt > origMatchCnt) {
+      ++alignedRLenHigh ;
+      ++alignedRLenLow ;
+    }
+    
+#ifdef DEBUG0
+    cout << "Orig/Back-by-1/Forward-by-1 match counts: " << origMatchCnt << " "
+	 << back1MatchCnt << " " << forw1MatchCnt << endl ;
+    cout << "The aligned ref len is now: " << alignedRLenHigh << endl ;
+#endif
+
+  }
+
+
+#ifdef DEBUG0
+  cout << "Init score = " << initScore << endl ;
+  cout << "Max score-high = " << maxScoreHigh << " for alignedQLen = " 
+       << alignedQLenHigh << ", alignedRLen = " << alignedRLenHigh << endl ;
+  cout << "Max score-low = " << maxScoreLow << " for alignedQLen = " 
+       << alignedQLenLow << ", alignedRLen = " << alignedRLenLow << endl ;
+#endif
+
+#if 0
+#ifdef SW_FEEDBACK
+  if (!((alignedQLenHigh == queryLen && alignedQLenLow == queryLen) ||
+	(alignedQLenHigh == 0 && alignedQLenLow == 0))) {
+
+    // TODO: Also consider FINE_ALIGN
+    compute_sw_feedback<EDVec>(bestLowScore, bestColVecLow, bestHighScore, bestColVecHigh,
+			       queryLenVec, gapWtVec, gapOpenWtVec, ambigBaseCnts,
+			       ambigWtP1Vec, mismatchWtLow, mismatchWtHigh, 
+			       accumDist.getValidDistCnt(), swFeedback) ;
+
+#ifdef DEBUG0
+    cout << "The feedback sent to SW: " << swFeedback.maxQLen << " " << swFeedback.maxRLen
+	 << " " << swFeedback.maxBand << endl ;
+#endif
+
+  }
+#endif
+#endif
+
+  return accumDist.getTotalDist() ;
+}
+
+
+bool unit_test1() {
+
+  uint8_t ref[] = {4, 4, 1, 3, 2, 2} ;
+  uint8_t query[] = {4, 3, 0, 2, 2} ;
+
+  //uint8_t ref[] = {1, 3, 3, 2, 2} ;
+  //uint8_t query[] = {3, 3, 2, 2} ;
+
+  //uint8_t ref[]   = {3, 3, 2, 2} ;
+  //uint8_t query[] = {3, 3, 2, 2} ;
+
+  //uint8_t ref[] = {2} ;
+  //uint8_t query[] = {2} ;
+
+  int alignedQLenLow=0, alignedQLenHigh=0, alignedRLenLow=0, alignedRLenHigh=0 ;
+  int scoreLow, scoreHigh ;
+  SWFeedback swFeedback ;
+  int dist = edit_dist<BitVec64, EDVec32Every4>(ref, 6, query, 5, 0, 0, alignedQLenLow, alignedQLenHigh, alignedRLenLow, alignedRLenHigh, scoreLow, scoreHigh, swFeedback, false) ;
+
+  cout << "Unit test1 result: " << dist << endl ;
+
+  return dist == 4 ;
+}
+
+bool unit_test2() {
+
+  //uint8_t query[] = {3,1,3,0,2,0,2,2,0,2,2,0,0} ;
+  //uint8_t ref[]   = {0,1,3,0,2,0,2,2,0,2,2,0,0} ;
+
+  uint8_t query[] = {3,1,3} ; 
+  uint8_t ref[]   = {0,1,3} ; 
+
+  int alignedQLenLow=0, alignedQLenHigh=0, alignedRLenLow=0, alignedRLenHigh=0 ;
+  int scoreLow, scoreHigh ;
+  SWFeedback swFeedback ;
+  int dist = edit_dist<BitVec64, EDVec32Every4>(ref, 3, query, 3, 0, 0, alignedQLenLow, alignedQLenHigh, alignedRLenLow, alignedRLenHigh, scoreLow, scoreHigh, swFeedback, false) ;
+  cout << "Unit test2 result: " << dist << endl ;
+
+  return dist == 1 ;
+
+}
+
+bool unit_test3() {
+
+  uint8_t query[] = {2,0,3,3,3,1,3,3,1,0,2,3,0,2,2,0,1,1,2,0,2,3,1} ;
+  uint8_t ref[] =   {3,0,3,3,3,1,3,3,1,0,2,3,0,0,2,0,1,0,2,0,2,1,1} ;
+  // qlen = rlen = 23
+
+  //uint8_t query[] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} ;
+  //uint8_t ref[] =   {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} ;
+  // qlen = rlen = 17
+
+  int alignedQLenLow=0, alignedQLenHigh=0, alignedRLenLow=0, alignedRLenHigh=0 ;
+  int scoreLow, scoreHigh ;
+  SWFeedback swFeedback ;
+  int dist = edit_dist<BitVec64x2, EDVec128Every16>(ref, 23, query, 23, 21, 5, alignedQLenLow, alignedQLenHigh, alignedRLenLow, alignedRLenHigh, scoreLow, scoreHigh, swFeedback, false) ; 
+
+  cout << "Unit test3 result: " << dist << endl ;
+
+  return true ;
+}
+
+
+DIST_TYPE edit_dist_u32(uint8_t* refSeq, int refLen, uint8_t* querySeq, int queryLen, 
+			int initScore, int endBonus, 
+			int& alignedQLenLow, int& alignedQLenHigh, 
+			int& alignedRLenLow, int& alignedRLenHigh, 
+			int& scoreLow, int& scoreHigh, SWFeedback& swFeedback,
+			bool printDebug=false) {
+
+  return edit_dist<BitVec64, EDVec32Every4>(refSeq, refLen, querySeq, queryLen, 
+  //return edit_dist<BitVec64, EDVec128Every16>(refSeq, refLen, querySeq, queryLen, 
+					    initScore, endBonus, 
+					    alignedQLenLow, alignedQLenHigh, 
+					    alignedRLenLow, alignedRLenHigh, 
+					    scoreLow, scoreHigh, 
+					    swFeedback, printDebug) ;
+}
+
+
+DIST_TYPE edit_dist_u64(uint8_t* refSeq, int refLen, uint8_t* querySeq, int queryLen, 
+			int initScore, int endBonus, 
+			int& alignedQLenLow, int& alignedQLenHigh, 
+			int& alignedRLenLow, int& alignedRLenHigh, 
+			int& scoreLow, int& scoreHigh, SWFeedback& swFeedback,
+			bool printDebug=false) {
+
+  return edit_dist<BitVec64, EDVec64Every8>(refSeq, refLen, querySeq, queryLen, 
+  //return edit_dist<BitVec64, EDVec128Every16>(refSeq, refLen, querySeq, queryLen, 
+					    initScore, endBonus, 
+					    alignedQLenLow, alignedQLenHigh, 
+					    alignedRLenLow, alignedRLenHigh, 
+					    scoreLow, scoreHigh, 
+					    swFeedback, printDebug) ;
+}
+
+
+DIST_TYPE edit_dist_u64x2(uint8_t* refSeq, int refLen, uint8_t* querySeq, int queryLen,
+			  int initScore, int endBonus, 
+			  int& alignedQLenLow, int& alignedQLenHigh, 
+			  int& alignedRLenLow, int& alignedRLenHigh, 
+			  int& scoreLow, int& scoreHigh, SWFeedback& swFeedback,
+			  bool printDebug=false) {
+
+  return edit_dist<BitVec64x2, EDVec128Every16>(refSeq, refLen, querySeq, queryLen, 
+  //return edit_dist<BitVec64x2, EDVec64x2>(refSeq, refLen, querySeq, queryLen, 
+						initScore, endBonus, 
+						alignedQLenLow, alignedQLenHigh, 
+						alignedRLenLow, alignedRLenHigh, 
+						scoreLow, scoreHigh, 
+						swFeedback, printDebug) ;
+}
+
+void edit_dist(uint8_t* refSeq, int refLen, uint8_t* querySeq, int queryLen,
+		  int initScore, int endBonus, 
+		  int& alignedQLenLow, int& alignedQLenHigh, 
+		  int& alignedRLenLow, int& alignedRLenHigh, 
+		  int& scoreLow, int& scoreHigh, SWFeedback& swFeedback,
+		  bool printDebug=false) {
+  
+  if (queryLen < 15) {
+    cout << "Warning: It is not recommended to use SW filtering API for queries shorter than 15 bases!" << endl ;
+  }
+
+  if (queryLen <= 32)
+    edit_dist_u32(refSeq, refLen, querySeq, queryLen, initScore, endBonus,
+		  alignedQLenLow, alignedQLenHigh, alignedRLenLow, alignedRLenHigh,
+		  scoreLow, scoreHigh, swFeedback, printDebug) ;
+  else if (queryLen <= 63)
+    edit_dist_u64(refSeq, refLen, querySeq, queryLen, initScore, endBonus,
+		  alignedQLenLow, alignedQLenHigh, alignedRLenLow, alignedRLenHigh,
+		  scoreLow, scoreHigh, swFeedback, printDebug) ;
+  else if (queryLen <= 126)
+    edit_dist_u64x2(refSeq, refLen, querySeq, queryLen, initScore, endBonus,
+		    alignedQLenLow, alignedQLenHigh, alignedRLenLow, alignedRLenHigh,
+		    scoreLow, scoreHigh, swFeedback, printDebug) ;
+  else {
+    alignedQLenLow = alignedRLenLow = 0 ;
+    alignedQLenHigh = queryLen ;
+    alignedRLenHigh = refLen ;
+    scoreLow = 0 ;
+    scoreHigh = queryLen ;
+    cout << "Warning: Skipping edit distance computation because reads longer than 126 bases not supported yet" << endl ;
+  }
+  
+}
+
+
+void extend_with_edit_dist(uint8_t* refSeq, int refLen, uint8_t* querySeq, int queryLen,
+			   int initScore, int endBonus, 
+			   int& alignedQLen, int& alignedRLen, int& score,
+			   float& confidence) {
+  
+  int alignedQLenLow, alignedQLenHigh, alignedRLenLow, alignedRLenHigh, scoreLow, scoreHigh ;
+  SWFeedback swFeedback ;
+
+  edit_dist(refSeq, refLen, querySeq, queryLen, initScore, endBonus,
+	    alignedQLenLow, alignedQLenHigh, alignedRLenLow, alignedRLenHigh,
+	    scoreLow, scoreHigh, swFeedback) ;
+  
+  alignedQLen = (alignedQLenLow + alignedQLenHigh)/2 ;
+  alignedRLen = (alignedRLenLow + alignedRLenHigh)/2 ;
+  score = (scoreLow + scoreHigh)/2 ;
+  confidence = 
+    (alignedQLenLow == alignedQLenHigh && alignedRLenLow == alignedRLenHigh) ? 1.0 : 0.0 ;
+
+}
+
+void filter_and_extend(uint8_t* refSeq, int refLen, uint8_t* querySeq, int queryLen,
+		       int initScore, int endBonus, int zdrop,
+		       int& alignedQLen, int& alignedRLen, int& score) {
+
+  int alignedQLenLow, alignedQLenHigh, alignedRLenLow, alignedRLenHigh, scoreLow, scoreHigh ;
+  SWFeedback swFeedback ;
+
+  edit_dist(refSeq, refLen, querySeq, queryLen, initScore, endBonus,
+	    alignedQLenLow, alignedQLenHigh, alignedRLenLow, alignedRLenHigh,
+	    scoreLow, scoreHigh, swFeedback) ;
+  
+  score = (scoreLow + scoreHigh) / 2 ;
+  alignedQLen = (alignedQLenLow + alignedQLenHigh)/2 ;
+  alignedRLen = (alignedRLenLow + alignedRLenHigh)/2 ;  
+  
+  bool partialQ = (alignedQLenLow != alignedQLenHigh) || 
+    (alignedQLenLow > 0 && alignedQLenLow < queryLen) ;
+  bool partialR = (alignedRLenLow != alignedRLenHigh) ;
+
+  if (partialQ || partialR) {
+    int endBonusUpdated = (swFeedback.maxQLen == queryLen) ? endBonus : 0 ;
+    score = run_ksw_extend(refSeq, swFeedback.maxRLen, querySeq, swFeedback.maxQLen,
+			   swFeedback.maxBand, initScore, endBonusUpdated, zdrop, 
+			   costMatrixRowCnt, costMatrix, gapo, gape,
+			   alignedQLen, alignedRLen) ;
+  }
+
+}
+
+
+
+void init_ed_dist() {
+
+#ifdef FINE_ALIGN
+  BitCount8::init_lut() ;
+#endif
+  
+}
+
+
+#ifdef MAIN
+int main() {
+  
+  init_ed_dist() ;
+
+
+
+  if (!unit_test1()) {
+    cout << "Failed unit test 1" << endl ;
+    exit(0) ;
+  }
+
+  if (!unit_test2()) {
+    cout << "Failed unit test 2" << endl ;
+    exit(0) ;
+  }
+  
+  unit_test3() ;
+
+  return 0 ;
+}
+#endif
+
+uint8_t BitCount8::lut_[] ;
+void BitCount8::init_lut() {
+
+  for (int bitV=0; bitV < 256; ++bitV) {
+
+    // The following is from Brian Kernighan
+    // See: "Bit Twiddling Hacks"
+    int cnt ;
+    int val = bitV ;
+    for (cnt=0; val ; ++cnt) {
+      val &= val - 1 ; // clear the least significant bit set
+    }
+    lut_[bitV] = cnt ;
+  }
+
+}
+
diff --git a/ed_intrav.h b/ed_intrav.h
new file mode 100644
index 0000000..e4328cd
--- /dev/null
+++ b/ed_intrav.h
@@ -0,0 +1,131 @@
+/* 
+The MIT License (MIT)
+Copyright (c) 2014 Intel Corp.
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#ifndef ED_INTRAV_H
+#define ED_INTRAV_H
+
+#include "ed_intravED.h"
+
+const int costMatrixRowCnt = 5 ;
+const int8_t costMatrix[] = {
+  1, -4, -4, -4, -1,
+  -4, 1, -4, -4, -1,
+  -4, -4, 1, -4, -1,
+  -4, -4, -4, 1, -1,
+  -1, -1, -1, -1, -1 
+} ;
+
+const int gapo = 6 ;
+const int gape = 1 ; 
+
+int run_ksw_extend(uint8_t* refSeq, int refLen, uint8_t* querySeq, int queryLen,
+		   int bandW, int initScore, int endBonus, int zdrop, int costMatrixRowCnt,
+		   const int8_t* costMatrix, int gapo, int gape,
+		   int& alignedQLen, int& alignedRLen) ;
+
+struct SWFeedback {
+  int maxQLen ;
+  int maxRLen ;
+  int maxBand ;
+
+  SWFeedback():maxQLen(0), maxRLen(0), maxBand(0) {} ;
+
+  SWFeedback(int qlen, int rlen, int band): maxQLen(qlen), maxRLen(rlen), maxBand(band) {}
+  void updateMax(int qlen, int rlen, int band) {
+    maxQLen = std::max(maxQLen, qlen) ;
+    maxRLen = std::max(maxRLen, rlen) ;
+    maxBand = std::max(maxBand, band) ;
+  }
+} ;
+
+inline ostream& operator<<(ostream& os, const SWFeedback& swfb) {
+  os << "{" << swfb.maxQLen << " " << swfb.maxRLen << " " << swfb.maxBand << "}" ;
+  return os ;
+}
+
+template<class BitVec, class EDVec>
+class DistVec {
+
+  EDVec dist_, mask_ ;
+  int queryLen_, msWordIndex_, probeOffset_ ;
+  
+ public: 
+  
+ DistVec(int queryLen):
+  queryLen_(queryLen), msWordIndex_(EDVec::getLastWordIndexFor(queryLen)),
+    probeOffset_(EDVec::getProbeOffsetFor(queryLen)) {
+
+    mask_.setWordsAsMask() ;
+  }
+
+  const EDVec& getVec() const {
+    return dist_ ;
+  }
+
+  void initDist(int initU0, int initU1) {
+    dist_.setWordsAsDist(queryLen_, initU0 - initU1) ;
+  }
+
+  void addDist (const BitVec& LP0, const BitVec& LP1) {
+
+    /*
+    cout << std::hex << "+++++" << LP0 << " " << LP1 << " " << std::dec << probeOffset_ << endl
+	 << std::hex << (EDVec(LP0.shiftProbesRight(probeOffset_))) << " "
+	 << ((EDVec(LP0.shiftProbesRight(probeOffset_))) & mask_) << endl 
+	 << std::hex << (EDVec(LP1.shiftProbesRight(probeOffset_))) << " "
+	 << ((EDVec(LP1.shiftProbesRight(probeOffset_))) & mask_)
+	 << std::dec << endl ;
+    cout << "Dist before: " << dist_ << endl ;
+    */
+
+    dist_ += (EDVec(LP0.shiftProbesRight(probeOffset_)) & mask_) ;
+    dist_ -= (EDVec(LP1.shiftProbesRight(probeOffset_)) & mask_) ;
+
+    //cout << "Dist after: " << dist_ << endl ;
+
+  }
+
+  EDVec getDeltaDistVec(const BitVec& LP0, const BitVec& LP1) {
+    EDVec r = EDVec(LP0.shiftProbesRight(probeOffset_)) & mask_ ;
+    r -= EDVec(LP1.shiftProbesRight(probeOffset_)) & mask_ ;
+
+    return r ;
+  }
+
+  void addDist (const EDVec& deltaDist) {
+    dist_ += deltaDist ;
+  }
+
+  int getValidDistCnt() const {
+    return msWordIndex_ + 1 ;
+  }
+
+  int getProbeColumn(int index) const {
+    return EDVec::getDistAt(index, queryLen_) - 1 ;
+  }
+
+  int getDist(int index) const {
+    return (int) dist_.getWord(index) ;
+  }
+
+  int getTotalDist() const {
+    return (int) dist_.getWord(msWordIndex_) ;
+  }
+
+  void setMin (const DistVec& other) {
+    this->dist_.setMin(other.dist_) ;
+  }
+
+  friend std::ostream& operator<< (std::ostream& os, const DistVec& d) {
+    os << d.dist_ ;
+    return os ;
+  }
+
+} ;
+
+#endif // #ifndef ED_INTRAV_H
diff --git a/ed_intrav64.h b/ed_intrav64.h
new file mode 100644
index 0000000..e8a495f
--- /dev/null
+++ b/ed_intrav64.h
@@ -0,0 +1,308 @@
+/* 
+The MIT License (MIT)
+Copyright (c) 2014 Intel Corp.
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+
+#ifndef _ED_INTRAV64_H
+#define _ED_INTRAV64_H
+
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <limits>
+
+class BitVec64 {
+
+  uint64_t bitV ;
+
+  explicit BitVec64(uint64_t val): bitV(val) {}
+  //  cout << "val = " << val << ", bitV = " << bitV << endl ;
+
+public:
+  
+  BitVec64(): bitV(0x0) {}
+
+  inline bool getLSB() const {
+    return bitV & 0x1 ;
+  }
+  
+  inline uint16_t getLow16Bits() const {
+    return (uint16_t) bitV ;
+  }
+
+  inline void setAllOnes () {
+    bitV = 0xFFFFFFFFFFFFFFFF ;
+  }
+
+  inline void setAllZeroes () {
+    bitV = 0x0000000000000000 ;
+  }
+
+  inline void setAllBits (bool bit) {
+    bitV = bit ? 0xFFFFFFFFFFFFFFFF : 0x0 ;
+  }
+
+  inline void setLSBClearRest (bool bit) {
+    bitV = bit ;
+  }
+
+  inline void setBit(int bitIndex, bool bit) {
+    uint64_t mask = ((uint64_t)0x1) << bitIndex ;
+    bitV = bit ? (bitV | mask) : (bitV & ~mask) ;
+  }
+
+  inline void shiftLeftAndInsert (bool newBit) {
+    bitV = (bitV << 1) | ((uint8_t) newBit) ;
+  }
+
+  inline void shiftLeft () {
+    bitV <<= 1 ;
+  }
+
+  /*
+  inline BitVec64 shiftBitsRight(int offset) const {
+    return BitVec64(bitV >> offset) ;
+  }
+  */
+
+  inline BitVec64 shiftProbesRight(int probeOffset) const {
+    // The carryout bit is at location probeOffset+1
+    return BitVec64(bitV >> (probeOffset+1)) ;
+  }
+
+
+  void setWord (int index, uint64_t w) {
+    bitV = w ;
+  }
+
+  inline void orBitAtPosition (bool newBit, int index) {
+    bitV |= (((uint64_t)newBit) << index) ;
+  }
+  
+  inline bool getMSB() const {
+    return bitV & 0x8000000000000000 ;
+  }
+
+  inline BitVec64 operator~ () const {
+    return BitVec64(~this->bitV) ;
+  }
+
+  inline BitVec64 operator& (const BitVec64& other) const {
+    return BitVec64(this->bitV & other.bitV) ;
+  }
+
+  inline BitVec64 andnot (const BitVec64& other) const {
+    return *this & (~other) ;
+  }
+
+  inline BitVec64 operator| (const BitVec64& other) const {
+    return BitVec64(this->bitV | other.bitV) ;
+  }
+
+  inline BitVec64 operator^ (const BitVec64& other) const {
+    return BitVec64(this->bitV ^ other.bitV) ;
+  }
+
+  inline BitVec64 operator+ (const BitVec64& other) const {
+    return BitVec64(this->bitV + other.bitV) ;
+  }
+
+  //inline BitVec64 operator>>(int shiftVal) const {
+  //  return BitVec64(this->bitV >> shiftVal) ;
+  //}
+
+  static bool isQueryLengthOk(int qlen) {
+    return qlen < 64 ;
+  }
+
+  bool getBit (int index) const {
+    return (bitV >> index) & 0x1 ;
+  }
+
+  friend std::ostream& operator<< (std::ostream& os, const BitVec64& b) {
+    os << b.bitV ;
+    return os ;
+  }
+
+  friend class EDVec64 ;
+  friend class EDVec128Every16 ;
+  friend class EDVec64Every8 ;
+  friend class EDVec32Every4 ;
+  friend class EDVec16Every1 ;
+
+} ;
+
+
+class EDVec64 {
+  int64_t val_ ;
+  
+ public:
+  EDVec64() {}
+  explicit EDVec64(int val): val_(val) {}
+  explicit EDVec64(const BitVec64& bv): val_(bv.bitV) {}
+
+  inline void setAll(int val) {
+    val_ = val ;
+  }
+
+  static int16_t getMaxWordVal() { return std::numeric_limits<int16_t>::max() ;}
+
+  inline int getWord(int index) const {
+    return (int) val_ ;
+  }
+
+  bool allLessThanOrEqualTo(const EDVec64& other) const {
+    return val_ <= other.val_ ;
+  }
+
+  inline void setWords (int* vals) {
+    val_ = vals[0] ;
+  }
+
+  inline void setFirstWord(int val) {
+    val_ = val ;
+  }
+
+  inline void setWordsAsMask () {
+    val_ = 0x1 ;
+  }
+
+  static int getLastWordIndexFor(int queryLen) {
+    return 0 ;
+  }
+  
+  static int getProbeOffsetFor (int queryLen) {
+    return queryLen-1 ;
+  }
+
+  static int getBitOffsetFor(int baseIndex) {
+    return baseIndex ;
+  }
+
+  void setWordsAsEndBonus(int queryLen, int endBonus) {
+    val_ = endBonus ;
+  }
+ 
+
+  void setWordsAsDist (int queryLen, int distWt) {
+    val_ = queryLen * distWt ;
+  }
+
+  void setWordsAsBadScore(int queryLen, int thr, int infScore) {
+    val_ = infScore ;
+  }
+
+  static int getDistAt(int wordIndex, int queryLen) {
+    return queryLen ;
+  }
+
+  void setMin(const EDVec64& other) {
+    this->val_ = std::min(this->val_, other.val_) ;
+  }
+
+  void setMin(const EDVec64& other, EDVec64& bestIndices, const EDVec64& otherIndices) {
+    if (other.val_ < this->val_) {
+      this->val_ = other.val_ ;
+      bestIndices.val_ = otherIndices.val_ ;
+    }
+  }
+
+  void setMax(const EDVec64& other, EDVec64& bestIndices, const EDVec64& otherIndices) {
+    if (other.val_ > this->val_) {
+      this->val_ = other.val_ ;
+      bestIndices.val_ = otherIndices.val_ ;
+    }
+  }
+
+  uint16_t setMaxAndReturnFlag(const EDVec64& other, EDVec64& bestIndices, const EDVec64& otherIndices) {
+    if (other.val_ > this->val_) {
+      this->val_ = other.val_ ;
+      bestIndices.val_ = otherIndices.val_ ;
+      return 0x1 ;
+    }
+    
+    return 0x0 ;
+  }
+  
+
+  void addThirdIfFirstGTSecond(const EDVec64& first, const EDVec64& second, 
+			       const EDVec64& third, const EDVec64& zero) {
+    if (first.val_ > second.val_)
+      this->val_ += third.val_ ;
+  }
+
+  /*
+  EDVec64 abs(const EDVec64& other) const {
+    return EDVec64(labs((uint64_t)this->val_ - other.val_)) ;
+  }
+
+  EDVec64 abs() const {
+    return EDVec64(labs(this->val_)) ;
+  }
+  */
+
+  EDVec64 subSat(const EDVec64& other) const {
+    int64_t diff = this->val_ - other.val_ ;
+
+    return EDVec64(diff < 0 ? 0 : diff) ;
+  }
+
+  inline EDVec64 shiftBitsRightWithinWords(int shiftVal) {
+    return EDVec64(this->val_ >> shiftVal) ;
+  }
+
+  inline void shiftWordsLeftByOne() {
+    val_ = 0 ;
+  }
+
+  inline bool operator == (const EDVec64& other) const {
+    return this->val_ == other.val_ ;
+  }
+
+  inline EDVec64& operator += (const EDVec64& other) {
+    this->val_ += other.val_ ;
+    return *this ;
+  }
+
+  inline EDVec64& operator -= (const EDVec64& other) {
+    this->val_ -= other.val_ ;
+    return *this ;
+  }
+
+  inline EDVec64 operator+ (const EDVec64& other) const {
+    return EDVec64(this->val_ + other.val_) ;
+  }
+
+  inline EDVec64 operator- (const EDVec64& other) const {
+    return EDVec64(this->val_ - other.val_) ;
+  }
+
+  inline EDVec64 operator* (const EDVec64& other) const {
+    return EDVec64(this->val_ * other.val_) ;
+  }
+
+  inline EDVec64 operator& (const EDVec64& other) const {
+    return EDVec64(this->val_ & other.val_) ;
+  }
+
+  static int PERIOD() {
+    return 64 ;
+  }
+
+  static int WORD_CNT() {
+    return 1 ;
+  }
+
+  friend std::ostream& operator<< (std::ostream& os, const EDVec64& v) {
+    os << v.val_ ;
+    return os ;
+  }
+
+} ;
+
+
+#endif // #ifndef _ED_INTRAV64_H
diff --git a/ed_intrav64x2.h b/ed_intrav64x2.h
new file mode 100644
index 0000000..05403b3
--- /dev/null
+++ b/ed_intrav64x2.h
@@ -0,0 +1,408 @@
+/* 
+The MIT License (MIT)
+Copyright (c) 2014 Intel Corp.
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+
+#ifndef _ED_INTRAV64x2_H
+#define _ED_INTRAV64x2_H
+
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <iostream>
+
+using namespace std ;
+
+class BitVec64x2 {
+
+  uint64_t bitV[2] ;
+
+
+public:
+  
+  BitVec64x2() {
+    bitV[0] = 0x0 ;
+    bitV[1] = 0x0 ;
+  }
+
+  inline bool getLSB() const {
+    return bitV[0] & 0x1 ;
+  }
+
+  inline uint16_t getLow16Bits() const {
+    return (uint16_t) bitV[0] ;
+  }
+
+  inline void setAllOnes () {
+    bitV[0] = 0xFFFFFFFFFFFFFFFF ;
+    bitV[1] = 0xFFFFFFFFFFFFFFFF ;
+  }
+
+  inline void setAllZeroes () {
+    bitV[0] = 0x0000000000000000 ;
+    bitV[1] = 0x0000000000000000 ;
+  }
+
+  inline void setAllBits (bool bit) {
+    if (bit) setAllOnes() ;
+    else setAllZeroes() ;
+  }
+
+  inline void setLSBClearRest (bool bit) {
+    bitV[0] = bit ;
+    bitV[1] = 0x0 ;
+  }
+
+  inline void shiftLeftAndInsert (bool newBit) {
+    bitV[0] <<= 1 ;
+    bitV[1] <<= 1 ;
+    bitV[1] |= (bitV[0] >> 63) ;
+    bitV[0] |= ((uint8_t) newBit) ;
+  }
+
+  inline void shiftLeft () {
+    bitV[0] <<= 1 ;
+    bitV[1] <<= 1 ;
+    bitV[1] |= (bitV[0] >> 63) ;
+  }
+
+  /*
+  inline BitVec64x2 shiftBitsRight(int offset) const {
+    BitVec64x2 r ;
+    r.bitV[0] = this->bitV[0] >> offset ;
+    r.bitV[1] = this->bitV[1] >> offset ;
+    return r ;
+  }
+  */
+
+  inline BitVec64x2 shiftProbesRight(int probeOffset) const {
+    BitVec64x2 r ;
+    
+    // Copy LSB of bitV[1] to MSB of bitV[0] in case probeOffset is 15
+    // L' of base 62 is equal to L of base 63, which is stored at the LSB of bitV[1]
+    r.bitV[0] = (this->bitV[0] & 0x7FFFFFFFFFFFFFFF) | (this->bitV[1] << 63) ;
+
+    // The carryout bit is at location probeOffset+1
+    r.bitV[0] = r.bitV[0] >> probeOffset ; // probes in the lower word have offset-1
+    r.bitV[1] = this->bitV[1] >> (probeOffset+1) ; 
+    return r ;
+  }
+
+
+  void setWord (int index, uint64_t w) {
+    bitV[index] = w ;
+  }
+
+  inline BitVec64x2 operator~ () const {
+    BitVec64x2 r ;
+    r.bitV[0] = ~this->bitV[0] ;
+    r.bitV[1] = ~this->bitV[1] ;
+    return r ;
+  }
+
+  inline BitVec64x2 operator& (const BitVec64x2& other) const {
+    BitVec64x2 r ;
+    r.bitV[0] = this->bitV[0] & other.bitV[0] ;
+    r.bitV[1] = this->bitV[1] & other.bitV[1] ;
+    return r ;
+  }
+
+  inline BitVec64x2 andnot (const BitVec64x2& other) const {
+    return *this & (~other) ;
+  }
+
+  inline BitVec64x2 operator| (const BitVec64x2& other) const {
+    BitVec64x2 r ;
+    r.bitV[0] = this->bitV[0] | other.bitV[0] ;
+    r.bitV[1] = this->bitV[1] | other.bitV[1] ;
+    return r ;
+  }
+
+  inline BitVec64x2 operator^ (const BitVec64x2& other) const {
+    BitVec64x2 r ;
+    r.bitV[0] = this->bitV[0] ^ other.bitV[0] ;
+    r.bitV[1] = this->bitV[1] ^ other.bitV[1] ;
+    return r ;
+  }
+
+  inline BitVec64x2 operator+ (const BitVec64x2& other) const {
+    BitVec64x2 r ;
+    r.bitV[0] = (this->bitV[0] & 0x7FFFFFFFFFFFFFFF) 
+      +         (other.bitV[0] & 0x7FFFFFFFFFFFFFFF) ;
+
+    r.bitV[1] = this->bitV[1] + other.bitV[1] ;
+
+    r.bitV[1] += ((r.bitV[0] >> 63) & 0x1) ;
+
+    /*
+    cout << "Add operation: " << endl ;
+    cout << std::hex << this->bitV[0] << " + " << other.bitV[0] << endl ;
+    cout << this->bitV[1] << " + " << other.bitV[1] << endl ;
+    cout << "Output: " << endl ;
+    cout << r.bitV[0] << endl ;
+    cout << r.bitV[1] << std::dec << endl ;
+    */
+
+    return r ;
+  }
+
+
+  static bool isQueryLengthOk(int qlen) {
+    return qlen <= 126 ;
+  }
+
+  bool getBit (int bitIndex) const {
+    const uint64_t& bitVchosen = (bitIndex < 63) ? bitV[0] : bitV[1] ;
+    int bitIndexMod = (bitIndex < 63) ? bitIndex : (bitIndex - 63) ;
+
+    return (bitVchosen >> bitIndexMod) & 0x1 ;
+  }
+
+  friend std::ostream& operator<< (std::ostream& os, const BitVec64x2& b) {
+    os << "{" << b.bitV[0] << ", " << b.bitV[1] << "}" ;
+    return os ;
+  }
+
+  friend class EDVec64x2 ;
+  friend class EDVec128Every16 ;
+} ;
+
+
+
+class EDVec64x2 {
+  int64_t vals_[2] ;
+
+ public:
+  EDVec64x2() {} 
+
+  explicit EDVec64x2(int val) {
+    vals_[0] = val ;
+    vals_[1] = val ;
+  }
+  
+  explicit EDVec64x2(const BitVec64x2& bv) {
+    vals_[0] = bv.bitV[0] ;
+    vals_[1] = bv.bitV[1] ;
+  }
+
+  inline void set(int val, int index) {
+    vals_[index] = val ;
+  }
+
+  static int16_t getMaxWordVal() { return numeric_limits<int16_t>::max() ;}
+
+  inline int getWord(int index) const {
+    return (int) vals_[index] ;
+  }
+
+  bool allLessThanOrEqualTo(const EDVec64x2& other) const {
+    return vals_[0] <= other.vals_[0] && vals_[1] <= other.vals_[1] ;
+  }
+
+  inline void setWords (int* vals) {
+    vals_[0] = vals[0] ;
+    vals_[1] = vals[1] ;
+  }
+
+  inline void setFirstWord(int val) {
+    vals_[0] = val ;
+  }
+
+  inline void setWordsAsMask () {
+    vals_[0] = 0x1 ;
+    vals_[1] = 0x1 ;
+  }
+
+  static int getLastWordIndexFor(int queryLen) {
+    int baseIndex = queryLen - 1 ;
+    return (baseIndex < 63) ? 0 : 1 ;
+  }
+
+  static int getProbeOffsetFor (int queryLen) {
+    return (queryLen-1) % 63 ;
+  }
+  
+  static int getBitOffsetFor(int baseIndex) {
+    return (baseIndex < 63) ? baseIndex : (baseIndex - 63) ;
+  }
+
+  void setWordsAsEndBonus(int queryLen, int endBonus) {
+    vals_[0] = vals_[1] = 0 ;
+    if (queryLen <= 63)
+      vals_[0] = endBonus ;
+    else
+      vals_[1] = endBonus ;
+  }
+
+
+  inline void setWordsAsDist(int queryLen, int distWt) {
+    int bitOffset = getBitOffsetFor(queryLen-1) ;
+    vals_[0] = (bitOffset + 1) * distWt ;
+    vals_[1] = (63 + bitOffset + 1) * distWt ;
+  }
+
+  void setWordsAsBadScore(int queryLen, int thr, int infScore) {
+    vals_[0] = thr ;
+    vals_[1] = thr ;
+    vals_[getLastWordIndexFor(queryLen-1)] = infScore ;
+  }
+
+  static int getDistAt(int wordIndex, int queryLen) {
+    return wordIndex * PERIOD() + getProbeOffsetFor(queryLen) ;
+  }
+
+  void setMin(const EDVec64x2& other) {
+    this->vals_[0] = min(this->vals_[0], other.vals_[0]) ;
+    this->vals_[1] = min(this->vals_[1], other.vals_[1]) ;
+  }
+  
+  void setMin(const EDVec64x2& other, EDVec64x2& bestIndices, const EDVec64x2& otherIndices) {
+    for (int i=0; i < 2; ++i) {
+      if (other.vals_[i] < this->vals_[i]) {
+	this->vals_[i] = other.vals_[i] ;
+	bestIndices.vals_[i] = otherIndices.vals_[i] ;
+      }
+    }
+  }
+
+  void setMax(const EDVec64x2& other, EDVec64x2& bestIndices, const EDVec64x2& otherIndices) {
+    for (int i=0; i < 2; ++i) {
+      if (other.vals_[i] > this->vals_[i]) {
+	this->vals_[i] = other.vals_[i] ;
+	bestIndices.vals_[i] = otherIndices.vals_[i] ;
+      }
+    }
+  }
+
+  uint16_t setMaxAndReturnFlag(const EDVec64x2& other, EDVec64x2& bestIndices, const EDVec64x2& otherIndices) {
+
+    uint16_t flag = 0x0 ;
+    
+    for (int i=0; i < 2; ++i) {
+      if (other.vals_[i] > this->vals_[i]) {
+	this->vals_[i] = other.vals_[i] ;
+	bestIndices.vals_[i] = otherIndices.vals_[i] ;
+	flag |= (0x1 << i) ;
+      }
+    }
+
+    return flag ;
+  }
+
+
+  void addThirdIfFirstGTSecond(const EDVec64x2& first, const EDVec64x2& second, 
+			       const EDVec64x2& third, const EDVec64x2& zero) {
+    
+    for (int i=0; i < 2; ++i) {
+      if (first.vals_[i] > second.vals_[i])
+	this->vals_[i] += third.vals_[i] ;
+    }
+  }
+
+
+  /*
+  EDVec64x2 abs(const EDVec64x2& other) const {
+    EDVec64x2 r ;
+    r.vals_[0] = labs(this->vals_[0] - other.vals_[0]) ;
+    r.vals_[1] = labs(this->vals_[1] - other.vals_[1]) ;
+
+    return r ;
+  }
+
+  EDVec64x2 abs() const {
+    EDVec64x2 r ;
+    r.vals_[0] = labs(this->vals_[0]) ;
+    r.vals_[1] = labs(this->vals_[1]) ;
+    return r ;
+  }
+  */
+
+  EDVec64x2 subSat(const EDVec64x2& other) const {
+    EDVec64x2 r ;
+    int64_t diff0 = this->vals_[0] - other.vals_[0] ;
+    int64_t diff1 = this->vals_[1] - other.vals_[1] ;
+    
+    r.vals_[0] = (diff0 < 0 ? 0 : diff0) ;
+    r.vals_[1] = (diff1 < 0 ? 0 : diff1) ;
+    return r ;
+  }
+
+  inline EDVec64x2 shiftBitsRightWithinWords(int shiftVal) {
+    EDVec64x2 r ;
+    r.vals_[0] = this->vals_[0] >> shiftVal ;
+    r.vals_[1] = this->vals_[1] >> shiftVal ;
+    return r ;
+  }
+
+  inline void shiftWordsLeftByOne() {
+    vals_[1] = vals_[0] ;
+    vals_[0] = 0 ;
+  }
+
+  inline bool operator == (const EDVec64x2& other) const {
+    return this->vals_[0] == other.vals_[0] && this->vals_[1] == other.vals_[1] ;
+  }
+
+  inline EDVec64x2& operator += (const EDVec64x2& other) {
+    this->vals_[0] += other.vals_[0] ;
+    this->vals_[1] += other.vals_[1] ;
+    return *this ;
+  }
+
+  inline EDVec64x2& operator -= (const EDVec64x2& other) {
+    this->vals_[0] -= other.vals_[0] ;
+    this->vals_[1] -= other.vals_[1] ;
+    return *this ;
+  }
+
+
+  inline EDVec64x2 operator+ (const EDVec64x2& other) const {
+    EDVec64x2 r ;
+    r.vals_[0] = this->vals_[0] + other.vals_[0] ;
+    r.vals_[1] = this->vals_[1] + other.vals_[1] ;
+    return r ;
+  }
+
+  inline EDVec64x2 operator- (const EDVec64x2& other) const {
+    EDVec64x2 r ;
+    r.vals_[0] = this->vals_[0] - other.vals_[0] ;
+    r.vals_[1] = this->vals_[1] - other.vals_[1] ;
+    return r ;
+  }
+
+
+  inline EDVec64x2 operator* (const EDVec64x2& other) const {
+    EDVec64x2 r ;
+    r.vals_[0] = this->vals_[0] * other.vals_[0] ;
+    r.vals_[1] = this->vals_[1] * other.vals_[1] ;
+    return r ;
+  }
+
+  inline EDVec64x2 operator& (const EDVec64x2& other) const {
+    EDVec64x2 r ;
+    r.vals_[0] = this->vals_[0] & other.vals_[0] ;
+    r.vals_[1] = this->vals_[1] & other.vals_[1] ;
+    return r ;
+  }
+
+  static int PERIOD() {
+    return 64 ;
+  }
+
+  static int WORD_CNT() {
+    return 2 ;
+  }
+
+  friend std::ostream& operator<< (std::ostream& os, const EDVec64x2& v) {
+    os << "{" << v.vals_[0] << ", " << v.vals_[1] << "}" ;
+    return os ;
+  }
+
+} ;
+
+#endif // #ifndef _ED_INTRAV64x2_H
+
diff --git a/ed_intravED.h b/ed_intravED.h
new file mode 100644
index 0000000..fc0da41
--- /dev/null
+++ b/ed_intravED.h
@@ -0,0 +1,1079 @@
+/* 
+The MIT License (MIT)
+Copyright (c) 2014 Intel Corp.
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+
+
+#ifndef _ED_INTRAVED_H
+#define _ED_INTRAVED_H
+
+
+#include <immintrin.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+#ifdef USE_SSE4
+#include <tmmintrin.h>
+#include <smmintrin.h>
+
+#define BLENDV(__v1, __v2, __pred)		\
+  _mm_blendv_epi8(__v1, __v2, __pred)
+
+#else // #ifdef USE_SSE4
+
+#define BLENDV(__v1, __v2, __pred)					\
+  _mm_or_si128(_mm_andnot_si128(__pred, __v1), _mm_and_si128(__v2, __pred))
+
+#endif // #ifdef USE_SSE4
+
+#include "ed_intrav64.h"
+#include "ed_intrav64x2.h"
+
+		    /*
+inline void printVec8(const __m128i& v) {
+  
+  cout << "{" ;
+  cout << " " << _mm_extract_epi8(v, 0) ;
+  cout << " " <<_mm_extract_epi8(v, 1) ;
+  cout << " " <<_mm_extract_epi8(v, 2) ;
+  cout << " " << _mm_extract_epi8(v, 3) ;
+  cout << " " <<_mm_extract_epi8(v, 4) ;
+  cout << " " <<_mm_extract_epi8(v, 5) ;
+  cout << " " <<_mm_extract_epi8(v, 6) ;
+  cout << " " <<_mm_extract_epi8(v, 7) ;
+  cout << " " <<_mm_extract_epi8(v, 8) ;
+  cout << " " <<_mm_extract_epi8(v, 9) ;
+  cout << " " <<_mm_extract_epi8(v, 10) ;
+  cout << " " <<_mm_extract_epi8(v, 11) ;
+  cout << " " <<_mm_extract_epi8(v, 12) ;
+  cout << " " <<_mm_extract_epi8(v, 13) ;
+  cout << " " <<_mm_extract_epi8(v, 14) ;
+  cout << " " <<_mm_extract_epi8(v, 15) ;
+
+  cout << "}" << endl ;
+}
+		    */
+inline void printVec16(const __m128i& v) {
+  
+  cout << "{" ;
+  cout << " " << _mm_extract_epi16(v, 0) ;
+  cout << " " << _mm_extract_epi16(v, 1) ;
+  cout << " " << _mm_extract_epi16(v, 2) ;
+  cout << " " << _mm_extract_epi16(v, 3) ;
+  cout << " " <<_mm_extract_epi16(v, 4) ;
+  cout << " " <<_mm_extract_epi16(v, 5) ;
+  cout << " " <<_mm_extract_epi16(v, 6) ;
+  cout << " " <<_mm_extract_epi16(v, 7) ;
+
+  cout << "}" << endl ;
+}
+
+
+class EDVec128Every16 {
+  __m128i vec_ ;
+  static const int WORD_CNT_ = 8 ;
+  static const int PERIOD_ = 16 ;
+
+  // Update equality operator if more members are added
+
+ public:
+  EDVec128Every16() {}
+
+  explicit EDVec128Every16(__m128i v):vec_(v) {}
+  explicit EDVec128Every16(int16_t val):vec_(_mm_set1_epi16(val)) {}
+  explicit EDVec128Every16(const BitVec64& bv): vec_(_mm_set_epi64x(0, bv.bitV)) {}
+  explicit EDVec128Every16(const BitVec64x2& bv): vec_(_mm_set_epi64x(bv.bitV[1], bv.bitV[0])) {}
+
+  inline void setAll(int16_t val) {
+    vec_ = _mm_set1_epi16(val) ;    
+  }
+
+  static int16_t getMaxWordVal() { return numeric_limits<int16_t>::max() ;}
+
+  inline int getWord(int index) const {
+    switch(index) {
+    case 0: return _mm_extract_epi16(vec_, 0) ;
+    case 1: return  _mm_extract_epi16(vec_, 1) ;
+    case 2: return  _mm_extract_epi16(vec_, 2) ;
+    case 3: return  _mm_extract_epi16(vec_, 3) ;
+    case 4: return  _mm_extract_epi16(vec_, 4) ;
+    case 5: return  _mm_extract_epi16(vec_, 5) ;
+    case 6: return  _mm_extract_epi16(vec_, 6) ;
+    case 7: return  _mm_extract_epi16(vec_, 7) ;
+    default: assert(false); return 0 ;
+    }
+  }
+
+  bool allLessThanOrEqualTo(const EDVec128Every16& other) const {
+    // each elt in pred will be set to 0xFFFF if the element in this vector 
+    // is greater than the corresponding element in other
+    __m128i pred = _mm_cmpgt_epi16(vec_, other.vec_) ; 
+    int flags = _mm_movemask_epi8(pred) ; // creates a mask from MSB of each 8-bit elt in pred
+
+    return flags == 0 ; // if there's at least one elt greater than "other", flags != 0
+  }
+
+  inline void setWords (int* v) {
+    vec_ = _mm_set_epi16(v[7], v[6], v[5], v[4], v[3], v[2], v[1], v[0]) ;
+  }
+
+  inline void setFirstWord(int val) {
+    vec_ = _mm_insert_epi16(vec_, val, 0) ;
+  }
+
+  inline void setWordsAsMask () {
+    vec_ = _mm_set1_epi16(0x1) ;
+  }
+
+  static int getLastWordIndexFor (int queryLen) {
+    int baseIndex = queryLen - 1 ;
+    return (baseIndex >= 63) ? 
+      (((baseIndex - 63) / PERIOD_) + WORD_CNT_/2) :
+      (baseIndex/PERIOD_) ;
+  }
+
+  // If the query length is greater than 63, we need two 64-words.
+  // In that case, we will have:
+  // bitOffset(lowerWord probes) = bitOffset(upper word probes)-1
+  // Consider the following example:
+  // query len = 79, probeOffset = (79-64) % 16 = 15. The bit offset of the 
+  // probes in the upper 64-bit word will be 15, while the bit offsets of the probes 
+  // in the lower 64-bit word will be 14 in this case. So, the probes will be at
+  // indices: 78 (in upper word), 62, 46, 30, 14 (in lower word). Note that if the
+  // bit offset were also 15 in the lower word, one probe would correspond to position
+  // 63, which has no corresponding base (because bit 63 stores the carryout bit of the
+  // lower word). We can generalize this notion for all query lengths. 
+  // As a potential corner case, consider query length of 64. In this case, the 
+  // probes in the upper 64-bit word will have bitOffset = 0, and the lower
+  // ones will have bitOffset = -1 (conceptually). The probe locations will be at:
+  // 63 (upper word), 47, 31, 15, -1 (lower word). The corresponding lengths of
+  // the subsequences will be: 64, 48, 32, 16, 0. Hence the last probe will be ignored.
+  //
+  // In the following function, the probe offset corresponding to the upper word will be
+  // returned. Internally, we'll use probe offset one less for the lower word.
+  // If the query length is less than 63, we will have only the lower word, and the
+  // last probe location should match the MSB. So, the return value of the following 
+  // function should be one more than the real probe bit offset.
+  // Consider an example where query length is 60. The probe locations should be at 
+  // bit offsets: 59, 43, 27, 11. The return value of the following function in this 
+  // case will be 12, but internally, bit offset 11 will be used downstream.
+  static int getProbeOffsetFor (int queryLen) {
+    return queryLen % PERIOD_ ;
+    // It is not (queryLen-1) % PERIOD, because:
+    // For queries longer than 63, one bit in the bit vector is unused (the one at position 63).
+    // For queries shorter than or equal to 63, the value returned is one more than the real
+    // offset (see the comments above).
+  }
+
+  void setWordsAsEndBonus(int queryLen, int endBonus) {
+    int v[WORD_CNT_] = {0, 0, 0, 0, 0, 0, 0, 0} ;
+    
+    int lastWordIndex = getLastWordIndexFor(queryLen) ;    
+    v[lastWordIndex] = endBonus ;
+
+    setWords(v) ;
+  }
+
+  void setWordsAsBadScore(int queryLen, int thr, int infScore) {
+    int v[WORD_CNT_] = {thr, thr, thr, thr, thr, thr, thr, thr} ;
+    
+    int lastWordIndex = getLastWordIndexFor(queryLen) ;    
+    for (int i=lastWordIndex+1; i < WORD_CNT_; ++i) {
+      v[lastWordIndex] = infScore ;
+    }
+
+    setWords(v) ;
+  }
+
+  static int getDistAt(int wordIndex, int queryLen) {
+    return wordIndex * PERIOD_ + getProbeOffsetFor(queryLen) ;
+  }
+
+  void setWordsAsDist (int queryLen, int distWt) {
+    //int16_t v[WORD_CNT_] = {0, 0, 0, 0, 0, 0, 0, 0} ;
+    int v[WORD_CNT_] ;
+    int bitOffset = getProbeOffsetFor(queryLen) ;
+
+    for (int wi=0; wi < WORD_CNT_; ++wi) {
+      v[wi] = distWt * (wi * PERIOD_ + bitOffset) ;
+    }
+    
+    /*
+    for (int wi=0; wi < WORD_CNT_/2 - 1; ++wi) {
+      v[wi] = distWt * (wi * PERIOD_ + bitOffset + 1) ;
+    }
+
+    for (int wi=WORD_CNT_/2-1; wi < WORD_CNT_; ++wi) {
+      v[wi] = distWt * (wi * PERIOD_ + bitOffset) ; // ... + PERIOD_-1 + bitOffset+1
+    }
+    */
+
+    setWords(v) ;
+  }
+
+  void setMin(const EDVec128Every16& other) {
+    vec_ = _mm_min_epi16(vec_, other.vec_) ;
+  }
+
+  void setMin(const EDVec128Every16& other, EDVec128Every16& bestIndices, const EDVec128Every16& otherIndices) {
+    __m128i pred = _mm_cmplt_epi16(other.vec_, vec_) ; // pred is 0xFFFF if other < this
+    vec_ = BLENDV(vec_, other.vec_, pred) ; // if pred is 0, choose this; o/w choose other
+    bestIndices.vec_ = BLENDV(bestIndices.vec_, otherIndices.vec_, pred) ;
+  }
+
+  void setMax(const EDVec128Every16& other, EDVec128Every16& bestIndices, const EDVec128Every16& otherIndices) {
+    __m128i pred = _mm_cmpgt_epi16(other.vec_, vec_) ; // pred is 0xFFFF if other > this
+    vec_ = BLENDV(vec_, other.vec_, pred) ; // if pred is 0, choose this; o/w choose other
+    bestIndices.vec_ = BLENDV(bestIndices.vec_, otherIndices.vec_, pred) ;
+  }
+
+  void setMax(const EDVec128Every16& other, EDVec128Every16& bestIndices, const EDVec128Every16& otherIndices, EDVec128Every16& bestAccumDist, const EDVec128Every16& otherAccumDist) {
+    __m128i pred = _mm_cmpgt_epi16(other.vec_, vec_) ; // pred is 0xFFFF if other > this
+    vec_ = BLENDV(vec_, other.vec_, pred) ; // if pred is 0, choose this; o/w choose other
+    bestIndices.vec_ = BLENDV(bestIndices.vec_, otherIndices.vec_, pred) ;
+    bestAccumDist.vec_ = BLENDV(bestAccumDist.vec_, otherAccumDist.vec_, pred) ;
+  }
+
+  // The return value will be nonzero iff at least one of the entries is updated
+  uint16_t setMaxAndReturnFlag(const EDVec128Every16& other, EDVec128Every16& bestIndices, const EDVec128Every16& otherIndices) {
+    __m128i pred = _mm_cmpgt_epi16(other.vec_, vec_) ; // pred is 0xFFFF if other > this
+    vec_ = BLENDV(vec_, other.vec_, pred) ; // if pred is 0, choose this; o/w choose other
+    bestIndices.vec_ = BLENDV(bestIndices.vec_, otherIndices.vec_, pred) ;
+    
+    return (uint16_t) _mm_movemask_epi8(pred) ;
+  }
+
+  // The return value will be nonzero iff at least one of the entries is updated
+  uint16_t setMaxAndReturnFlag(const EDVec128Every16& other, EDVec128Every16& bestIndices, const EDVec128Every16& otherIndices, EDVec128Every16& bestAccumDist, const EDVec128Every16& otherAccumDist) {
+    __m128i pred = _mm_cmpgt_epi16(other.vec_, vec_) ; // pred is 0xFFFF if other > this
+    vec_ = BLENDV(vec_, other.vec_, pred) ; // if pred is 0, choose this; o/w choose other
+    bestIndices.vec_ = BLENDV(bestIndices.vec_, otherIndices.vec_, pred) ;
+    bestAccumDist.vec_ = BLENDV(bestAccumDist.vec_, otherAccumDist.vec_, pred) ;
+    
+    return (uint16_t) _mm_movemask_epi8(pred) ;
+  }
+
+  void addThirdIfFirstGTSecond(const EDVec128Every16& first, const EDVec128Every16& second, 
+			       const EDVec128Every16& third, const EDVec128Every16& zero) {
+
+    __m128i pred = _mm_cmpgt_epi16(first.vec_, second.vec_) ; 
+    // pred is 0xFFFF if first > second
+
+    // If pred is 0, then blendv will add zero; o/w it will add third
+    vec_ = _mm_add_epi16(vec_, BLENDV(zero.vec_, third.vec_, pred)) ;
+  }
+
+  /*
+  EDVec128Every16 abs(const EDVec128Every16& other) const {
+    return EDVec128Every16(_mm_abs_epi16(_mm_sub_epi16(this->vec_, other.vec_))) ;
+  }
+
+  EDVec128Every16 abs() const {
+    return EDVec128Every16(_mm_abs_epi16(this->vec_)) ;
+  }
+  */
+
+  EDVec128Every16 subSat(const EDVec128Every16& other) const {
+    return EDVec128Every16(_mm_subs_epu16(this->vec_, other.vec_)) ;
+  }
+
+  inline EDVec128Every16 shiftBitsRightWithinWords(int shiftVal) {
+    return EDVec128Every16(_mm_srli_epi16(vec_, shiftVal)) ;
+  }
+
+  inline void shiftWordsLeftByOne() {
+    vec_ = _mm_slli_si128(vec_, 2) ; // 16-bit shift = 2 * 8-bit shift
+  }
+
+  inline bool operator == (const EDVec128Every16& other) {
+    return _mm_movemask_epi8(_mm_cmpeq_epi16(vec_, other.vec_)) == 0xFFFF ;
+  }
+
+  inline EDVec128Every16& operator += (const EDVec128Every16& other) {
+    vec_ = _mm_add_epi16(vec_, other.vec_) ;
+    return *this ;
+  }
+
+  inline EDVec128Every16& operator -= (const EDVec128Every16& other) {
+    vec_ = _mm_sub_epi16(vec_, other.vec_) ;
+    return *this ;
+  }
+
+  inline EDVec128Every16 operator+ (const EDVec128Every16& other) const {
+    return EDVec128Every16(_mm_add_epi16(vec_, other.vec_)) ;
+  }
+
+  inline EDVec128Every16 operator- (const EDVec128Every16& other) const {
+    return EDVec128Every16(_mm_sub_epi16(vec_, other.vec_)) ;
+  }
+
+  inline EDVec128Every16 operator* (const EDVec128Every16& other) const {
+    return EDVec128Every16(_mm_mullo_epi16(vec_, other.vec_)) ;
+  }
+
+  inline EDVec128Every16 operator& (const EDVec128Every16& other) const {
+    return EDVec128Every16(_mm_and_si128(vec_, other.vec_)) ;
+  }
+
+  static int WORD_CNT() {
+    return WORD_CNT_ ;
+  }
+
+  static int PERIOD() {
+    return PERIOD_ ;
+  }
+
+  friend std::ostream& operator<< (std::ostream& os, const EDVec128Every16& v) {
+    os << "{" ;
+    for (int i=0; i < WORD_CNT_; ++i) {
+      os << v.getWord(i) << " " ;
+    }
+    os << "}" ;
+    return os ;
+  }
+
+} ;
+
+
+//
+// Edit distance vector for a 64-bit query.
+// 16-bit words will be stored in an SSE vector in interleaved order as follows:
+//           w7 w5 w3 w1 w6 w4 w2 w0
+//
+class EDVec64Every8 {
+  __m128i vec_ ;
+  __m128i shiftLeftIndices_ ; // can be made static later
+  static const int WORD_CNT_ = 8 ;
+  static const int PERIOD_ = 8 ;
+
+  void init_() {
+    shiftLeftIndices_ = _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 13, 12, 11, 10, 9, 8, 0xFF, 0xFF) ;
+  }
+
+
+ public:
+  EDVec64Every8() {}
+
+  explicit EDVec64Every8(__m128i v):vec_(v) {init_() ;}
+  explicit EDVec64Every8(int16_t val):vec_(_mm_set1_epi16(val)) {init_() ;}
+  explicit EDVec64Every8(const BitVec64& bv): vec_(_mm_set_epi64x(bv.bitV>>8, bv.bitV)) {init_() ;}
+
+
+  inline void setAll(int16_t val) {
+    vec_ = _mm_set1_epi16(val) ;    
+  }
+
+  static int16_t getMaxWordVal() { return numeric_limits<int16_t>::max() ;}
+
+  inline int getWord(int index) const {
+    switch(index) {
+    case 0: return _mm_extract_epi16(vec_, 0) ;
+    case 1: return  _mm_extract_epi16(vec_, 4) ;
+    case 2: return  _mm_extract_epi16(vec_, 1) ;
+    case 3: return  _mm_extract_epi16(vec_, 5) ;
+    case 4: return  _mm_extract_epi16(vec_, 2) ;
+    case 5: return  _mm_extract_epi16(vec_, 6) ;
+    case 6: return  _mm_extract_epi16(vec_, 3) ;
+    case 7: return  _mm_extract_epi16(vec_, 7) ;
+    default: assert(false); return 0 ;
+    }
+  }
+
+  bool allLessThanOrEqualTo(const EDVec64Every8& other) const {
+    // each elt in pred will be set to 0xFFFF if the element in this vector 
+    // is greater than the corresponding element in other
+    __m128i pred = _mm_cmpgt_epi16(vec_, other.vec_) ; 
+    int flags = _mm_movemask_epi8(pred) ; // creates a mask from MSB of each 8-bit elt in pred
+
+    return flags == 0 ; // if there's at least one elt greater than "other", flags != 0
+  }
+
+
+  inline void setWords (int* v) {
+    vec_ = _mm_set_epi16(v[7], v[5], v[3], v[1], v[6], v[4], v[2], v[0]) ;
+  }
+
+  inline void setFirstWord(int val) {
+    vec_ = _mm_insert_epi16(vec_, val, 0) ;
+  }
+
+  inline void setWordsAsMask () {
+    vec_ = _mm_set1_epi16(0x1) ;
+  }
+
+  static int getLastWordIndexFor (int queryLen) {
+    return (queryLen-1) / PERIOD_ ; // logical word index - no interleaving at this level
+  }
+
+  static int getProbeOffsetFor (int queryLen) {
+    return (queryLen-1) % PERIOD_ ;
+  }
+
+  void setWordsAsEndBonus(int queryLen, int endBonus) {
+    int v[WORD_CNT_] = {0, 0, 0, 0, 0, 0, 0, 0} ;
+    
+    int lastWordIndex = getLastWordIndexFor(queryLen) ;    
+    v[lastWordIndex] = endBonus ;
+
+    setWords(v) ;
+  }
+
+  void setWordsAsBadScore(int queryLen, int thr, int infScore) {
+    int v[WORD_CNT_] = {thr, thr, thr, thr, thr, thr, thr, thr} ;
+    
+    int lastWordIndex = getLastWordIndexFor(queryLen) ;    
+    for (int i=lastWordIndex+1; i < WORD_CNT_; ++i) {
+      v[i] = infScore ;
+    }
+
+    setWords(v) ;
+  }
+
+  static int getDistAt(int wordIndex, int queryLen) {
+    return wordIndex * PERIOD_ + getProbeOffsetFor(queryLen) + 1 ;
+  }
+
+  void setWordsAsDist (int queryLen, int distWt) {
+    int v[WORD_CNT_] ;
+    int bitOffset = getProbeOffsetFor(queryLen) ;
+    
+    for (int wi=0; wi < WORD_CNT_; ++wi) {
+      v[wi] = distWt * (wi * PERIOD_ + bitOffset + 1) ;
+    }
+
+    setWords(v) ;
+  }
+
+  void setMin(const EDVec64Every8& other) {
+    vec_ = _mm_min_epi16(vec_, other.vec_) ;
+  }
+
+  void setMin(const EDVec64Every8& other, EDVec64Every8& bestIndices, const EDVec64Every8& otherIndices) {
+    __m128i pred = _mm_cmplt_epi16(other.vec_, vec_) ; // pred is 0xFFFF if other < this
+    vec_ = BLENDV(vec_, other.vec_, pred) ; // if pred is 0, choose this; o/w choose other
+    bestIndices.vec_ = BLENDV(bestIndices.vec_, otherIndices.vec_, pred) ;
+  }
+
+  void setMax(const EDVec64Every8& other, EDVec64Every8& bestIndices, const EDVec64Every8& otherIndices) {
+    __m128i pred = _mm_cmpgt_epi16(other.vec_, vec_) ; // pred is 0xFFFF if other > this
+    vec_ = BLENDV(vec_, other.vec_, pred) ; // if pred is 0, choose this; o/w choose other
+    bestIndices.vec_ = BLENDV(bestIndices.vec_, otherIndices.vec_, pred) ;
+  }
+
+  void setMax(const EDVec64Every8& other, EDVec64Every8& bestIndices, const EDVec64Every8& otherIndices, EDVec64Every8& bestAccumDist, const EDVec64Every8& otherAccumDist) {
+    __m128i pred = _mm_cmpgt_epi16(other.vec_, vec_) ; // pred is 0xFFFF if other > this
+    vec_ = BLENDV(vec_, other.vec_, pred) ; // if pred is 0, choose this; o/w choose other
+    bestIndices.vec_ = BLENDV(bestIndices.vec_, otherIndices.vec_, pred) ;
+    bestAccumDist.vec_ = BLENDV(bestAccumDist.vec_, otherAccumDist.vec_, pred) ;
+  }
+
+
+  // The return value will be nonzero iff at least one of the entries is updated
+  uint16_t setMaxAndReturnFlag(const EDVec64Every8& other, EDVec64Every8& bestIndices, const EDVec64Every8& otherIndices) {
+    __m128i pred = _mm_cmpgt_epi16(other.vec_, vec_) ; // pred is 0xFFFF if other > this
+    vec_ = BLENDV(vec_, other.vec_, pred) ; // if pred is 0, choose this; o/w choose other
+    bestIndices.vec_ = BLENDV(bestIndices.vec_, otherIndices.vec_, pred) ;
+    return (uint16_t) _mm_movemask_epi8(pred) ;
+  }
+
+  // The return value will be nonzero iff at least one of the entries is updated
+  uint16_t setMaxAndReturnFlag(const EDVec64Every8& other, EDVec64Every8& bestIndices, const EDVec64Every8& otherIndices, EDVec64Every8& bestAccumDist, const EDVec64Every8& otherAccumDist) {
+    __m128i pred = _mm_cmpgt_epi16(other.vec_, vec_) ; // pred is 0xFFFF if other > this
+    vec_ = BLENDV(vec_, other.vec_, pred) ; // if pred is 0, choose this; o/w choose other
+    bestIndices.vec_ = BLENDV(bestIndices.vec_, otherIndices.vec_, pred) ;
+    bestAccumDist.vec_ = BLENDV(bestAccumDist.vec_, otherAccumDist.vec_, pred) ;
+ 
+   return (uint16_t) _mm_movemask_epi8(pred) ;
+  }
+
+
+
+  void addThirdIfFirstGTSecond(const EDVec64Every8& first, const EDVec64Every8& second, 
+			       const EDVec64Every8& third, const EDVec64Every8& zero) {
+
+    __m128i pred = _mm_cmpgt_epi16(first.vec_, second.vec_) ; 
+    // pred is 0xFFFF if first > second
+
+    // If pred is 0, then blendv will add zero; o/w it will add third
+    vec_ = _mm_add_epi16(vec_, BLENDV(zero.vec_, third.vec_, pred)) ;
+  }
+
+
+  /*
+  EDVec64Every8 abs(const EDVec64Every8& other) const {
+    return EDVec64Every8(_mm_abs_epi16(_mm_sub_epi16(this->vec_, other.vec_))) ;
+  }
+
+  EDVec64Every8 abs() const {
+    return EDVec64Every8(_mm_abs_epi16(this->vec_)) ;
+  }
+  */
+
+  EDVec64Every8 subSat(const EDVec64Every8& other) const {
+    return EDVec64Every8(_mm_subs_epu16(this->vec_, other.vec_)) ;
+  }
+
+  inline EDVec64Every8 shiftBitsRightWithinWords(int shiftVal) {
+    return EDVec64Every8(_mm_srli_epi16(vec_, shiftVal)) ;
+  }
+
+  inline void shiftWordsLeftByOne() {
+    //vec_ = _mm_slli_si128(vec_, 2) ; // 16-bit shift = 2 * 8-bit shift
+
+#ifdef USE_SSE4
+    vec_ = _mm_shuffle_epi8(vec_, shiftLeftIndices_) ;
+#else
+
+    // The layout of the logical 16-bit words before shift:
+    // 7 5 3 1 6 4 2 0
+    // Consider the physical 16-bit words before shift: 
+    // 7 6 5 4 3 2 1 0
+    // These words should be shuffled into the following for logical shift:
+    // 3 2 1 0 6 5 4 X
+    
+    // First, do the following: 7 6 5 4 3 2 1 0 --> 6 5 4 7 3 2 1 0
+    // The binary imm value: 10 01 00 11 (i.e. 2 1 0 3)
+    vec_ = _mm_shufflehi_epi16(vec_, 0x93) ;
+
+    // Then, do the following: 6 5 4 7 3 2 1 0 --> 3 2 1 0 6 5 4 7
+    // The binary imm value: 01 00 11 10 (i.e. 1 0 3 2)
+    vec_ = _mm_shuffle_epi32(vec_, 0x4E) ;
+
+#endif
+
+  }
+
+  inline bool operator == (const EDVec64Every8& other) {
+    return _mm_movemask_epi8(_mm_cmpeq_epi16(vec_, other.vec_)) == 0xFFFF ;
+  }
+
+  inline EDVec64Every8& operator += (const EDVec64Every8& other) {
+    vec_ = _mm_add_epi16(vec_, other.vec_) ;
+    return *this ;
+  }
+
+  inline EDVec64Every8& operator -= (const EDVec64Every8& other) {
+    vec_ = _mm_sub_epi16(vec_, other.vec_) ;
+    return *this ;
+  }
+
+  inline EDVec64Every8 operator+ (const EDVec64Every8& other) const {
+    return EDVec64Every8(_mm_add_epi16(vec_, other.vec_)) ;
+  }
+
+  inline EDVec64Every8 operator- (const EDVec64Every8& other) const {
+    return EDVec64Every8(_mm_sub_epi16(vec_, other.vec_)) ;
+  }
+
+  inline EDVec64Every8 operator* (const EDVec64Every8& other) const {
+    return EDVec64Every8(_mm_mullo_epi16(vec_, other.vec_)) ;
+  }
+
+  inline EDVec64Every8 operator& (const EDVec64Every8& other) const {
+    return EDVec64Every8(_mm_and_si128(vec_, other.vec_)) ;
+  }
+
+  static int WORD_CNT() {
+    return WORD_CNT_ ;
+  }
+
+  static int PERIOD() {
+    return PERIOD_ ;
+  }
+
+  friend std::ostream& operator<< (std::ostream& os, const EDVec64Every8& v) {
+    os << "{" ;
+    for (int i=0; i < WORD_CNT_; ++i) {
+      os << v.getWord(i) << " " ;
+    }
+    os << "}" ;
+    return os ;
+  }
+
+} ;
+
+
+//
+// Edit distance vector for a 32-bit query.
+// 16-bit words will be stored in an SSE vector in interleaved order as follows:
+//           w7 w3 w6 w2 w5 w1 w4 w0
+//
+class EDVec32Every4 {
+  __m128i vec_ ;
+  __m128i shiftLeftIndices_ ; // can be made static later
+  static const int WORD_CNT_ = 8 ;
+  static const int PERIOD_ = 4 ;
+
+  void init_() {
+    shiftLeftIndices_ = _mm_set_epi8(11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 13, 12, 0xFF, 0xFF) ;
+  }
+
+ public:
+  EDVec32Every4() {}
+
+  explicit EDVec32Every4(__m128i v):vec_(v) {init_();}
+  explicit EDVec32Every4(int16_t val):vec_(_mm_set1_epi16(val)) {init_();}
+  explicit EDVec32Every4(const BitVec64& bv)
+    : vec_(_mm_set_epi32(bv.bitV>>12, bv.bitV>>8, bv.bitV>>4, bv.bitV)) {init_();}
+
+  inline void setAll(int16_t val) {
+    vec_ = _mm_set1_epi16(val) ;    
+  }
+
+  static int16_t getMaxWordVal() { return numeric_limits<int16_t>::max() ;}
+
+  inline int getWord(int index) const {
+    switch(index) {
+    case 0: return _mm_extract_epi16(vec_, 0) ;
+    case 1: return  _mm_extract_epi16(vec_, 2) ;
+    case 2: return  _mm_extract_epi16(vec_, 4) ;
+    case 3: return  _mm_extract_epi16(vec_, 6) ;
+    case 4: return  _mm_extract_epi16(vec_, 1) ;
+    case 5: return  _mm_extract_epi16(vec_, 3) ;
+    case 6: return  _mm_extract_epi16(vec_, 5) ;
+    case 7: return  _mm_extract_epi16(vec_, 7) ;
+    default: assert(false); return 0 ;
+    }
+  }
+
+  bool allLessThanOrEqualTo(const EDVec32Every4& other) const {
+    // each elt in pred will be set to 0xFFFF if the element in this vector 
+    // is greater than the corresponding element in other
+    __m128i pred = _mm_cmpgt_epi16(vec_, other.vec_) ; 
+    int flags = _mm_movemask_epi8(pred) ; // creates a mask from MSB of each 8-bit elt in pred
+
+    return flags == 0 ; // if there's at least one elt greater than "other", flags != 0
+  }
+
+
+  inline void setWords (int* v) {
+    vec_ = _mm_set_epi16(v[7], v[3], v[6], v[2], v[5], v[1], v[4], v[0]) ;
+  }
+
+  inline void setFirstWord(int val) {
+    vec_ = _mm_insert_epi16(vec_, val, 0) ;
+  }
+
+  inline void setWordsAsMask () {
+    vec_ = _mm_set1_epi16(0x1) ;
+  }
+
+  static int getLastWordIndexFor (int queryLen) {
+    return (queryLen-1) / PERIOD_ ; // logical word index - no interleaving at this level
+  }
+
+  static int getProbeOffsetFor (int queryLen) {
+    return (queryLen-1) % PERIOD_ ;
+  }
+
+  void setWordsAsEndBonus(int queryLen, int endBonus) {
+    int v[WORD_CNT_] = {0, 0, 0, 0, 0, 0, 0, 0} ;
+    
+    int lastWordIndex = getLastWordIndexFor(queryLen) ;    
+    v[lastWordIndex] = endBonus ;
+
+    setWords(v) ;
+  }
+
+  void setWordsAsBadScore(int queryLen, int thr, int infScore) {
+    int v[WORD_CNT_] = {thr, thr, thr, thr, thr, thr, thr, thr} ;
+    
+    int lastWordIndex = getLastWordIndexFor(queryLen) ;    
+    for (int i=lastWordIndex+1; i < WORD_CNT_; ++i)
+      v[lastWordIndex] = infScore ;
+
+    setWords(v) ;
+  }
+
+  static int getDistAt(int wordIndex, int queryLen) {
+    return wordIndex * PERIOD_ + getProbeOffsetFor(queryLen) + 1 ;
+  }
+
+  void setWordsAsDist (int queryLen, int distWt) {
+    int v[WORD_CNT_] ;
+    int bitOffset = getProbeOffsetFor(queryLen) ;
+    
+    for (int wi=0; wi < WORD_CNT_; ++wi) {
+      v[wi] = distWt * (wi * PERIOD_ + bitOffset + 1) ;
+    }
+
+    setWords(v) ;
+  }
+
+  void setMin(const EDVec32Every4& other) {
+    vec_ = _mm_min_epi16(vec_, other.vec_) ;
+  }
+
+  void setMin(const EDVec32Every4& other, EDVec32Every4& bestIndices, const EDVec32Every4& otherIndices) {
+    __m128i pred = _mm_cmplt_epi16(other.vec_, vec_) ; // pred is 0xFFFF if other < this
+    vec_ = BLENDV(vec_, other.vec_, pred) ; // if pred is 0, choose this; o/w choose other
+    bestIndices.vec_ = BLENDV(bestIndices.vec_, otherIndices.vec_, pred) ;
+  }
+
+  void setMax(const EDVec32Every4& other, EDVec32Every4& bestIndices, const EDVec32Every4& otherIndices) {
+    __m128i pred = _mm_cmpgt_epi16(other.vec_, vec_) ; // pred is 0xFFFF if other > this
+    vec_ = BLENDV(vec_, other.vec_, pred) ; // if pred is 0, choose this; o/w choose other
+    bestIndices.vec_ = BLENDV(bestIndices.vec_, otherIndices.vec_, pred) ;
+  }
+
+  void setMax(const EDVec32Every4& other, EDVec32Every4& bestIndices, const EDVec32Every4& otherIndices, EDVec32Every4& bestAccumDist, const EDVec32Every4& otherAccumDist) {
+    __m128i pred = _mm_cmpgt_epi16(other.vec_, vec_) ; // pred is 0xFFFF if other > this
+    vec_ = BLENDV(vec_, other.vec_, pred) ; // if pred is 0, choose this; o/w choose other
+    bestIndices.vec_ = BLENDV(bestIndices.vec_, otherIndices.vec_, pred) ;
+    bestAccumDist.vec_ = BLENDV(bestAccumDist.vec_, otherAccumDist.vec_, pred) ;
+  }
+
+  // The return value will be nonzero iff at least one of the entries is updated
+  uint16_t setMaxAndReturnFlag(const EDVec32Every4& other, EDVec32Every4& bestIndices, const EDVec32Every4& otherIndices) {
+    __m128i pred = _mm_cmpgt_epi16(other.vec_, vec_) ; // pred is 0xFFFF if other > this
+    vec_ = BLENDV(vec_, other.vec_, pred) ; // if pred is 0, choose this; o/w choose other
+    bestIndices.vec_ = BLENDV(bestIndices.vec_, otherIndices.vec_, pred) ;
+    return (uint16_t) _mm_movemask_epi8(pred) ;
+  }
+
+  // The return value will be nonzero iff at least one of the entries is updated
+  uint16_t setMaxAndReturnFlag(const EDVec32Every4& other, EDVec32Every4& bestIndices, const EDVec32Every4& otherIndices, EDVec32Every4& bestAccumDist, const EDVec32Every4& otherAccumDist) {
+    __m128i pred = _mm_cmpgt_epi16(other.vec_, vec_) ; // pred is 0xFFFF if other > this
+    vec_ = BLENDV(vec_, other.vec_, pred) ; // if pred is 0, choose this; o/w choose other
+    bestIndices.vec_ = BLENDV(bestIndices.vec_, otherIndices.vec_, pred) ;
+    bestAccumDist.vec_ = BLENDV(bestAccumDist.vec_, otherAccumDist.vec_, pred) ; 
+
+   return (uint16_t) _mm_movemask_epi8(pred) ;
+  }
+
+
+  void addThirdIfFirstGTSecond(const EDVec32Every4& first, const EDVec32Every4& second, 
+			       const EDVec32Every4& third, const EDVec32Every4& zero) {
+
+    __m128i pred = _mm_cmpgt_epi16(first.vec_, second.vec_) ; 
+    // pred is 0xFFFF if first > second
+
+    // If pred is 0, then blendv will add zero; o/w it will add third
+    vec_ = _mm_add_epi16(vec_, BLENDV(zero.vec_, third.vec_, pred)) ;
+  }
+
+
+  /*
+  EDVec32Every4 abs(const EDVec32Every4& other) const {
+    return EDVec32Every4(_mm_abs_epi16(_mm_sub_epi16(this->vec_, other.vec_))) ;
+  }
+
+  EDVec32Every4 abs() const {
+    return EDVec32Every4(_mm_abs_epi16(this->vec_)) ;
+  }
+  */
+
+  EDVec32Every4 subSat(const EDVec32Every4& other) const {
+    return EDVec32Every4(_mm_subs_epu16(this->vec_, other.vec_)) ;
+  }
+
+  inline EDVec32Every4 shiftBitsRightWithinWords(int shiftVal) {
+    return EDVec32Every4(_mm_srli_epi16(vec_, shiftVal)) ;
+  }
+
+  inline void shiftWordsLeftByOne() {
+    //vec_ = _mm_slli_si128(vec_, 2) ; // 16-bit shift = 2 * 8-bit shift
+
+#ifdef USE_SSE4
+    vec_ = _mm_shuffle_epi8(vec_, shiftLeftIndices_) ;
+
+#else
+
+    // The layout of the logical 16-bit words before shift:
+    // 7 3 6 2 5 1 4 0
+    // Consider the physical 16-bit words before shift: 
+    // 7 6 5 4 3 2 1 0
+    // These words should be shuffled into the following for logical shift:
+    // 5 4 3 2 1 0 6 X
+    
+    // First, do the following: 7 6 5 4 3 2 1 0 --> 5 4 3 2 1 0 7 6
+    // The binary imm value: 10 01 00 11 (i.e. 2 1 0 3)
+    vec_ = _mm_shuffle_epi32(vec_, 0x93) ;
+
+    // Then, do the following: 5 4 3 2 1 0 7 6 --> 5 4 3 2 1 0 6 7
+    // The binary imm value: 11 10 00 01 (i.e. 3 2 0 1)
+    vec_ = _mm_shufflelo_epi16(vec_, 0xE1) ;
+#endif
+
+  }
+
+  inline bool operator == (const EDVec32Every4& other) {
+    //cout << "Comparing: " << endl ;
+    //cout << *this << endl ;
+    //cout << other << endl ;
+    //cout << EDVec32Every4(_mm_cmpeq_epi16(vec_, other.vec_)) << endl ;
+    //cout << std::hex << _mm_movemask_epi8(_mm_cmpeq_epi16(vec_, other.vec_))
+    //     << std::dec << endl ;
+
+    return _mm_movemask_epi8(_mm_cmpeq_epi16(vec_, other.vec_)) == 0xFFFF ;
+  }
+
+  inline EDVec32Every4& operator += (const EDVec32Every4& other) {
+    vec_ = _mm_add_epi16(vec_, other.vec_) ;
+    return *this ;
+  }
+
+  inline EDVec32Every4& operator -= (const EDVec32Every4& other) {
+    vec_ = _mm_sub_epi16(vec_, other.vec_) ;
+    return *this ;
+  }
+
+  inline EDVec32Every4 operator+ (const EDVec32Every4& other) const {
+    return EDVec32Every4(_mm_add_epi16(vec_, other.vec_)) ;
+  }
+
+  inline EDVec32Every4 operator- (const EDVec32Every4& other) const {
+    return EDVec32Every4(_mm_sub_epi16(vec_, other.vec_)) ;
+  }
+
+  inline EDVec32Every4 operator* (const EDVec32Every4& other) const {
+    return EDVec32Every4(_mm_mullo_epi16(vec_, other.vec_)) ;
+  }
+
+  inline EDVec32Every4 operator& (const EDVec32Every4& other) const {
+    return EDVec32Every4(_mm_and_si128(vec_, other.vec_)) ;
+  }
+
+  static int WORD_CNT() {
+    return WORD_CNT_ ;
+  }
+
+  static int PERIOD() {
+    return PERIOD_ ;
+  }
+
+  friend std::ostream& operator<< (std::ostream& os, const EDVec32Every4& v) {
+    os << "{" ;
+    for (int i=0; i < WORD_CNT_; ++i) {
+      os << v.getWord(i) << " " ;
+    }
+    os << "}" ;
+    return os ;
+  }
+
+} ;
+
+#if 0
+//
+// Edit distance vector for a 16-bit query.
+// 8-bit words will be stored in an SSE vector in interleaved order as follows:
+//           w7 w3 w6 w2 w5 w1 w4 w0
+//
+class EDVec16Every1 {
+  __m128i vec_ ;
+  static const int WORD_CNT_ = 16 ;
+  static const int PERIOD_ = 1 ;
+
+  static uint64_t expand8To64LUT_[256] ; // to expand 8 bits to 64 bits
+
+ public:
+  EDVec16Every1() {}
+
+  explicit EDVec16Every1(__m128i v):vec_(v) {}
+  explicit EDVec16Every1(int16_t val):vec_(_mm_set1_epi8((int8_t) val)) {}
+
+  explicit EDVec16Every1(const BitVec64& bv)
+    :vec_(_mm_set_epi64(expand8To64LUT_[bv.bitV >> 8], expand8To64LUT_[bv.bitV])) {}
+
+  /*
+  explicit EDVec16Every1(const BitVec64& bv):vec_(_mm_set_epi64(bv.bitV >> 8, bv.bitV)) {
+    // The initialization above separates the 16-bit vector into upper and lower 64-bits.
+    // The 8-bit words of 128-bit vec look like this:
+    // _ _ _ _ _ _ _ b[15:8] _ _ _ _ _ _ _ b[7:0]
+
+    // Further separate them into 4-bit words:
+    vec_ = _mm_or_si128(vec_, _mm_slli_si128(_mm_srli_epi16(vec_, 4), 4)) ;
+    // Now, we have:
+    // _ _ _ b[15:12] _ _ _ b[11:8] _ _ _ b[7:4] _ _ _ b[3:0]
+    
+    // Further separate them into 2-bit words:
+    vec_ = _mm_or_si128(vec_, _mm_slli_si128(_mm_srli_epi16(vec_, 2), 2)) ;
+    // Now, we have:
+    // _ b[15:14] _ b[13:12] _ b[11:10] _ b[9:8] _ b[7:6] _ b[5:4] _ b[3:2] _ b[1:0]
+
+    // Further separate them into 1-bit words:
+    vec_ = _mm_or_si128(vec_, _mm_slli_si128(_mm_srli_epi16(vec_, 1), 1)) ;
+
+  }
+  */
+
+  static void initLUT() ;
+
+  inline void setAll(int16_t val) {
+    vec_ = _mm_set1_epi8((int8_t)val) ;    
+  }
+
+  static int16_t getMaxWordVal() { return numeric_limits<int8_t>::max() ;}
+
+  inline int getWord(int index) const {
+    switch(index) {
+    case 0: return _mm_extract_epi8(vec_, 0) ;
+    case 1: return  _mm_extract_epi8(vec_, 1) ;
+    case 2: return  _mm_extract_epi8(vec_, 2) ;
+    case 3: return  _mm_extract_epi8(vec_, 3) ;
+    case 4: return  _mm_extract_epi8(vec_, 4) ;
+    case 5: return  _mm_extract_epi8(vec_, 5) ;
+    case 6: return  _mm_extract_epi8(vec_, 6) ;
+    case 7: return  _mm_extract_epi8(vec_, 7) ;
+    case 8: return  _mm_extract_epi8(vec_, 8) ;
+    case 9: return  _mm_extract_epi8(vec_, 9) ;
+    case 10: return  _mm_extract_epi8(vec_, 10) ;
+    case 11: return  _mm_extract_epi8(vec_, 11) ;
+    case 12: return  _mm_extract_epi8(vec_, 12) ;
+    case 13: return  _mm_extract_epi8(vec_, 13) ;
+    case 14: return  _mm_extract_epi8(vec_, 14) ;
+    case 15: return  _mm_extract_epi8(vec_, 15) ;
+
+    default: assert(false); return 0 ;
+    }
+  }
+
+  inline void setWords (int* v) {
+    vec_ = _mm_set_epi8(v[15], v[14], v[13], v[12], v[11], v[10], v[9], v[8],
+			v[7], v[6], v[5], v[4], v[3], v[2], v[1], v[0]) ;
+  }
+
+  inline void setFirstWord(int val) {
+    vec_ = _mm_insert_epi16(vec_, val, 0) ;
+  }
+
+  inline void setWordsAsMask () {
+    vec_ = _mm_set1_epi8(0x1) ;
+  }
+
+  static int getLastWordIndexFor (int queryLen) {
+    return (queryLen-1) / PERIOD_ ; // logical word index - no interleaving at this level
+  }
+
+  static int getProbeOffsetFor (int queryLen) {
+    return (queryLen-1) % PERIOD_ ;
+  }
+
+  void setWordsAsEndBonus(int queryLen, int endBonus) {
+    int v[WORD_CNT_] = {0, 0, 0, 0, 0, 0, 0, 0} ;
+    
+    int lastWordIndex = getLastWordIndexFor(queryLen) ;    
+    v[lastWordIndex] = endBonus ;
+
+    setWords(v) ;
+  }
+
+  void setWordsAsBadScore(int queryLen, int thr, int infScore) {
+    int v[WORD_CNT_] = {thr, thr, thr, thr, thr, thr, thr, thr} ;
+    
+    int lastWordIndex = getLastWordIndexFor(queryLen) ;    
+    for (int i=lastWordIndex+1; i < WORD_CNT_; ++i)
+      v[lastWordIndex] = infScore ;
+
+    setWords(v) ;
+  }
+
+  static int getDistAt(int wordIndex, int queryLen) {
+    return wordIndex * PERIOD_ + getProbeOffsetFor(queryLen) + 1 ;
+  }
+
+  void setWordsAsDist (int queryLen, int distWt) {
+    int v[WORD_CNT_] ;
+    int bitOffset = getProbeOffsetFor(queryLen) ;
+    
+    for (int wi=0; wi < WORD_CNT_; ++wi) {
+      v[wi] = distWt * (wi * PERIOD_ + bitOffset + 1) ;
+    }
+
+    setWords(v) ;
+  }
+
+  void setMin(const EDVec16Every1& other) {
+    vec_ = _mm_min_epi8(vec_, other.vec_) ;
+  }
+
+  void setMin(const EDVec16Every1& other, EDVec16Every1& bestIndices, const EDVec16Every1& otherIndices) {
+    __m128i pred = _mm_cmplt_epi8(other.vec_, vec_) ; // pred is 0xFFFF if other < this
+    vec_ = BLENDV(vec_, other.vec_, pred) ; // if pred is 0, choose this; o/w choose other
+    bestIndices.vec_ = BLENDV(bestIndices.vec_, otherIndices.vec_, pred) ;
+  }
+
+  void setMax(const EDVec16Every1& other, EDVec16Every1& bestIndices, const EDVec16Every1& otherIndices) {
+    __m128i pred = _mm_cmpgt_epi8(other.vec_, vec_) ; // pred is 0xFFFF if other > this
+    vec_ = BLENDV(vec_, other.vec_, pred) ; // if pred is 0, choose this; o/w choose other
+    bestIndices.vec_ = BLENDV(bestIndices.vec_, otherIndices.vec_, pred) ;
+  }
+
+  void setMax(const EDVec16Every1& other, EDVec16Every1& bestIndices, const EDVec16Every1& otherIndices, EDVecEvery1& bestAccumDist, const EDVecEvery1& otherAccumDist) {
+    __m128i pred = _mm_cmpgt_epi8(other.vec_, vec_) ; // pred is 0xFFFF if other > this
+    vec_ = BLENDV(vec_, other.vec_, pred) ; // if pred is 0, choose this; o/w choose other
+    bestIndices.vec_ = BLENDV(bestIndices.vec_, otherIndices.vec_, pred) ;
+    bestAccumDist.vec_ = BLENDV(bestAccumDist.vec_, otherAccumDist.vec_, pred) ;    
+  }
+
+
+  // The return value will be nonzero iff at least one of the entries is updated
+  uint16_t setMaxAndReturnFlag(const EDVec16Every1& other, EDVec16Every1& bestIndices, const EDVec16Every1& otherIndices) {
+    __m128i pred = _mm_cmpgt_epi8(other.vec_, vec_) ; // pred is 0xFFFF if other > this
+    vec_ = BLENDV(vec_, other.vec_, pred) ; // if pred is 0, choose this; o/w choose other
+    bestIndices.vec_ = BLENDV(bestIndices.vec_, otherIndices.vec_, pred) ;
+    return (uint16_t) _mm_movemask_epi8(pred) ;
+  }
+
+  EDVec16Every1 subSat(const EDVec16Every1& other) const {
+    return EDVec16Every1(_mm_subs_epu8(this->vec_, other.vec_)) ;
+  }
+
+  inline bool operator == (const EDVec16Every1& other) {
+    return _mm_movemask_epi8(_mm_cmpeq_epi16(vec_, other.vec_)) == 0xFFFFFFFF ;
+  }
+
+  inline EDVec16Every1& operator += (const EDVec16Every1& other) {
+    vec_ = _mm_add_epi8(vec_, other.vec_) ;
+    return *this ;
+  }
+
+  inline EDVec16Every1& operator -= (const EDVec16Every1& other) {
+    vec_ = _mm_sub_epi8(vec_, other.vec_) ;
+    return *this ;
+  }
+
+  inline EDVec16Every1 operator+ (const EDVec16Every1& other) const {
+    return EDVec16Every1(_mm_add_epi8(vec_, other.vec_)) ;
+  }
+
+  inline EDVec16Every1 operator- (const EDVec16Every1& other) const {
+    return EDVec16Every1(_mm_sub_epi8(vec_, other.vec_)) ;
+  }
+
+  inline EDVec16Every1 operator* (const EDVec16Every1& other) const {
+    return EDVec16Every1(_mm_mullo_epi8(vec_, other.vec_)) ;
+  }
+
+  inline EDVec16Every1 operator& (const EDVec16Every1& other) const {
+    return EDVec16Every1(_mm_and_si128(vec_, other.vec_)) ;
+  }
+
+  static int WORD_CNT() {
+    return WORD_CNT_ ;
+  }
+
+  static int PERIOD() {
+    return PERIOD_ ;
+  }
+
+  friend std::ostream& operator<< (std::ostream& os, const EDVec16Every1& v) {
+    os << "{" ;
+    for (int i=0; i < WORD_CNT_; ++i) {
+      os << v.getWord(i) << " " ;
+    }
+    os << "}" ;
+    return os ;
+  }
+
+} ;
+
+#endif // #if 0
+
+
+
+#endif // #ifndef _ED_INTRAVED_H
diff --git a/fastmap.c b/fastmap.c
index 479e878..6ea50a0 100644
--- a/fastmap.c
+++ b/fastmap.c
@@ -11,6 +11,7 @@
 #include "utils.h"
 #include "kseq.h"
 #include "utils.h"
+#include "intel_ext.h"
 KSEQ_DECLARE(gzFile)
 
 extern unsigned char nst_nt4_table[256];
@@ -51,6 +52,7 @@ int main_mem(int argc, char *argv[])
 	memset(pes, 0, 4 * sizeof(mem_pestat_t));
 	for (i = 0; i < 4; ++i) pes[i].failed = 1;
 
+	intel_init();
 	opt = mem_opt_init();
 	memset(&opt0, 0, sizeof(mem_opt_t));
 	while ((c = getopt(argc, argv, "epaFMCSPHYk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:")) >= 0) {
diff --git a/filter.h b/filter.h
new file mode 100644
index 0000000..864bfaa
--- /dev/null
+++ b/filter.h
@@ -0,0 +1,56 @@
+
+// Needs to be called only once per program to set up static arrays
+void init_ed_dist() ;
+
+
+
+
+// The filtering function:
+// Inputs: Reference and query sequences,	
+//         Initial score (i.e. h0),
+//         endBonus (i.e. the extra score if the whole query is aligned)
+// Outputs:
+//         Alignment length in query
+//         Alignment length in reference
+//         Alignment score
+//         Confidence: For now, it's either 0.0 or 1.0, corresponding to no/full confidence in outputs
+// Usage:
+//         If confidence == 0.0: Partial alignment - need to rerun ksw_extend(...)
+//         If confidence == 1.0
+//               if alignedQLen == queryLen: Full alignment
+//               if alignedQLen == 0: No alignment
+// Notes: 
+//        For now, the costMatrix and gap penalties are hardcoded. 
+//
+void extend_with_edit_dist(uint8_t* refSeq, int refLen, uint8_t* querySeq, int queryLen,
+     			   int initScore, int endBonus, 
+                           int& alignedQLen, int& alignedRLen, int& score, float& confidence) ;
+
+
+// Filter-and-extend function:
+// Inputs: Reference and query sequences,	
+//         Initial score (i.e. h0),
+//         endBonus (i.e. the extra score if the whole query is aligned)
+//         zdrop value passed to ksw_extend
+// Outputs:
+//         Alignment length in query
+//         Alignment length in reference
+//         Alignment score
+// Behavior:
+//         The filtering function will be called internally first.
+//         If there is an obvious result, it will be returned.
+//         If not, ksw_extend() will be called with feedback from filtering function,
+//         and its result will be returned.
+// Notes: 
+//        For now, the costMatrix and gap penalties are hardcoded. 
+//        It is assumed that ksw_extend(...) function is linked. In this version,
+//        it is defined in bwa_extend.cpp file.
+void filter_and_extend(uint8_t* refSeq, int refLen, uint8_t* querySeq, int queryLen,
+		       int initScore, int endBonus, int zdrop,
+		       int& alignedQLen, int& alignedRLen, int& score) ;
+
+
+// Declaration needed for filter_and_extend(), defined in bwa_extend.cpp
+extern "C" {
+int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off) ;
+}
diff --git a/intel_ext.cpp b/intel_ext.cpp
new file mode 100644
index 0000000..3c6d3b4
--- /dev/null
+++ b/intel_ext.cpp
@@ -0,0 +1,60 @@
+#include "intel_ext.h"
+
+// Needs to be called only once per program to set up static arrays
+void init_ed_dist() ;
+
+// The filtering function:
+// Inputs: Reference and query sequences,	
+//         Initial score (i.e. h0),
+//         endBonus (i.e. the extra score if the whole query is aligned)
+// Outputs:
+//         Alignment length in query
+//         Alignment length in reference
+//         Alignment score
+//         Confidence: For now, it's either 0.0 or 1.0, corresponding to no/full confidence in outputs
+// Usage:
+//         If confidence == 0.0: Partial alignment - need to rerun ksw_extend(...)
+//         If confidence == 1.0
+//               if alignedQLen == queryLen: Full alignment
+//               if alignedQLen == 0: No alignment
+// Notes: 
+//        For now, the costMatrix and gap penalties are hardcoded. 
+//
+void extend_with_edit_dist(uint8_t* refSeq, int refLen, uint8_t* querySeq, int queryLen,
+     			   int initScore, int endBonus, 
+                           int& alignedQLen, int& alignedRLen, int& score, float& confidence) ;
+
+
+// Filter-and-extend function:
+// Inputs: Reference and query sequences,	
+//         Initial score (i.e. h0),
+//         endBonus (i.e. the extra score if the whole query is aligned)
+//         zdrop value passed to ksw_extend
+// Outputs:
+//         Alignment length in query
+//         Alignment length in reference
+//         Alignment score
+// Behavior:
+//         The filtering function will be called internally first.
+//         If there is an obvious result, it will be returned.
+//         If not, ksw_extend() will be called with feedback from filtering function,
+//         and its result will be returned.
+// Notes: 
+//        For now, the costMatrix and gap penalties are hardcoded. 
+//        It is assumed that ksw_extend(...) function is linked. In this version,
+//        it is defined in bwa_extend.cpp file.
+void filter_and_extend(uint8_t* refSeq, int refLen, uint8_t* querySeq, int queryLen,
+		       int initScore, int endBonus, int zdrop,
+		       int& alignedQLen, int& alignedRLen, int& score) ;
+
+
+void intel_init()
+{	
+	init_ed_dist();
+}
+
+void intel_filter(uint8_t *refSeq, int refLen, uint8_t *querySeq, int queryLen, int initScore, int endBonus,
+				  int *alignedQLen, int *alignedRLen, int *score, float *confidence)
+{
+	extend_with_edit_dist(refSeq, refLen, querySeq, queryLen, initScore, endBonus, *alignedQLen, *alignedRLen, *score, *confidence);
+}
diff --git a/intel_ext.h b/intel_ext.h
new file mode 100644
index 0000000..0dd93a5
--- /dev/null
+++ b/intel_ext.h
@@ -0,0 +1,18 @@
+#ifndef INTEL_EXT_H
+#define INTEL_EXT_H
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+	void intel_init();
+	void intel_filter(uint8_t *refSeq, int refLen, uint8_t *querySeq, int queryLen, int initScore, int endBonus,
+					  int *alignedQLen, int *alignedRLen, int *score, float *confidence);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif