diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..a957929 --- /dev/null +++ b/Makefile @@ -0,0 +1,41 @@ +CC= gcc +CFLAGS= -g -Wall -O2 -Wc++-compat -Wno-unused-function +CPPFLAGS= +INCLUDES= -I. +OBJS= kalloc.o kthread.o bseq.o sketch.o sdust.o index.o +PROG= minimap2 +PROG_EXTRA= sdust +LIBS= -lm -lz -lpthread + +.SUFFIXES:.c .o + +.c.o: + $(CC) -c $(CFLAGS) $(CPPFLAGS) $(INCLUDES) $< -o $@ + +all:$(PROG) + +extra:all $(PROG_EXTRA) + +minimap2:libminimap2.a + $(CC) $(CFLAGS) $< -o $@ -L. -lminimap2 $(LIBS) + +libminimap2.a:$(OBJS) + $(AR) -csru $@ $(OBJS) + +sdust:sdust.c kalloc.o kdq.h kvec.h kseq.h sdust.h + $(CC) -D_SDUST_MAIN $(CFLAGS) $< kalloc.o -o $@ -lz + +clean: + rm -fr gmon.out *.o a.out $(PROG) $(PROG_EXTRA) *~ *.a *.dSYM session* + +depend: + (LC_ALL=C; export LC_ALL; makedepend -Y -- $(CFLAGS) $(DFLAGS) -- *.c) + +# DO NOT DELETE + +bseq.o: bseq.h kseq.h +index.o: kthread.h bseq.h minimap.h kvec.h kalloc.h khash.h +kalloc.o: kalloc.h +misc.o: minimap.h ksort.h +sdust.o: kalloc.h kdq.h kvec.h sdust.h +sketch.o: kvec.h kalloc.h minimap.h diff --git a/bseq.c b/bseq.c index 3cde6c8..3179fa1 100644 --- a/bseq.c +++ b/bseq.c @@ -7,8 +7,6 @@ #include "kseq.h" KSEQ_INIT(gzFile, gzread) -extern unsigned char seq_nt4_table[256]; - struct bseq_file_s { int is_eof; gzFile fp; diff --git a/bseq.h b/bseq.h index daf1fc5..0595d98 100644 --- a/bseq.h +++ b/bseq.h @@ -16,4 +16,6 @@ void bseq_close(bseq_file_t *fp); bseq1_t *bseq_read(bseq_file_t *fp, int chunk_size, int *n_); int bseq_eof(bseq_file_t *fp); +extern unsigned char seq_nt4_table[256]; + #endif diff --git a/index.c b/index.c new file mode 100644 index 0000000..52af8e4 --- /dev/null +++ b/index.c @@ -0,0 +1,355 @@ +#include +#include +#include +#include "kthread.h" +#include "bseq.h" +#include "minimap.h" +#include "kvec.h" +#include "khash.h" + +#define idx_hash(a) ((a)>>1) +#define idx_eq(a, b) ((a)>>1 == (b)>>1) +KHASH_INIT(idx, uint64_t, uint64_t, 1, idx_hash, idx_eq) +typedef 
khash_t(idx) idxhash_t;
+
+#define kroundup64(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, (x)|=(x)>>32, ++(x))
+
+mm_idx_t *mm_idx_init(int w, int k, int b)
+{
+	mm_idx_t *mi;
+	if (k*2 < b) b = k * 2;
+	if (w < 1) w = 1;
+	mi = (mm_idx_t*)calloc(1, sizeof(mm_idx_t));
+	mi->w = w, mi->k = k, mi->b = b;
+	mi->B = (mm_idx_bucket_t*)calloc(1<<b, sizeof(mm_idx_bucket_t));
+	return mi;
+}
+
+void mm_idx_destroy(mm_idx_t *mi)
+{
+	uint32_t i;
+	if (mi == 0) return;
+	for (i = 0; i < 1U<<mi->b; ++i) {
+		free(mi->B[i].p);
+		free(mi->B[i].a.a);
+		kh_destroy(idx, (idxhash_t*)mi->B[i].h);
+	}
+	for (i = 0; i < mi->n_seq; ++i)
+		free(mi->seq[i].name);
+	// NOTE(review): mi->S (packed sequence, realloc'd in worker_pipeline) is not freed here — confirm against upstream
+	free(mi->seq); free(mi->B); free(mi);
+}
+
+const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n)
+{
+	int mask = (1<<mi->b) - 1;
+	khint_t k;
+	mm_idx_bucket_t *b = &mi->B[minier&mask];
+	idxhash_t *h = (idxhash_t*)b->h;
+	*n = 0;
+	if (h == 0) return 0;
+	k = kh_get(idx, h, minier>>mi->b<<1);
+	if (k == kh_end(h)) return 0;
+	if (kh_key(h, k)&1) {
+		*n = 1;
+		return &kh_val(h, k);
+	} else {
+		*n = (uint32_t)kh_val(h, k);
+		return &b->p[kh_val(h, k)>>32];
+	}
+}
+
+uint32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f)
+{
+	int i;
+	size_t n = 0;
+	uint32_t thres;
+	khint_t *a, k;
+	if (f <= 0.) return UINT32_MAX;
+	for (i = 0; i < 1<<mi->b; ++i)
+		if (mi->B[i].h) n += kh_size((idxhash_t*)mi->B[i].h);
+	a = (uint32_t*)malloc(n * 4);
+	for (i = n = 0; i < 1<<mi->b; ++i) {
+		idxhash_t *h = (idxhash_t*)mi->B[i].h;
+		if (h == 0) continue;
+		for (k = 0; k < kh_end(h); ++k) {
+			if (!kh_exist(h, k)) continue;
+			a[n++] = kh_key(h, k)&1? 1 : (uint32_t)kh_val(h, k);
+		}
+	}
+	thres = ks_ksmall_uint32_t(n, a, (uint32_t)((1.
- f) * n)) + 1;
+	free(a);
+	return thres;
+}
+
+/*********************************
+ * Sort and generate hash tables *
+ *********************************/
+
+static void worker_post(void *g, long i, int tid)
+{
+	int j, start_a, start_p, n, n_keys;
+	idxhash_t *h;
+	mm_idx_t *mi = (mm_idx_t*)g;
+	mm_idx_bucket_t *b = &mi->B[i];
+	if (b->a.n == 0) return;
+
+	// sort by minimizer
+	radix_sort_128x(b->a.a, b->a.a + b->a.n);
+
+	// count and preallocate
+	for (j = 1, n = 1, n_keys = 0, b->n = 0; j <= b->a.n; ++j) {
+		if (j == b->a.n || b->a.a[j].x>>8 != b->a.a[j-1].x>>8) {
+			++n_keys;
+			if (n > 1) b->n += n;
+			n = 1;
+		} else ++n;
+	}
+	h = kh_init(idx);
+	kh_resize(idx, h, n_keys);
+	b->p = (uint64_t*)calloc(b->n, 8);
+
+	// create the hash table
+	for (j = 1, n = 1, start_a = start_p = 0; j <= b->a.n; ++j) {
+		if (j == b->a.n || b->a.a[j].x>>8 != b->a.a[j-1].x>>8) {
+			khint_t itr;
+			int absent;
+			mm128_t *p = &b->a.a[j-1];
+			itr = kh_put(idx, h, p->x>>8>>mi->b<<1, &absent);
+			assert(absent && j - start_a == n);
+			if (n == 1) {
+				kh_key(h, itr) |= 1;
+				kh_val(h, itr) = p->y;
+			} else {
+				int k;
+				for (k = 0; k < n; ++k)
+					b->p[start_p + k] = b->a.a[start_a + k].y;
+				kh_val(h, itr) = (uint64_t)start_p<<32 | n;
+				start_p += n;
+			}
+			start_a = j, n = 1;
+		} else ++n;
+	}
+	b->h = h;
+	assert(b->n == start_p);
+
+	// deallocate and clear b->a
+	free(b->a.a);
+	b->a.n = b->a.m = 0, b->a.a = 0;
+}
+
+static void mm_idx_post(mm_idx_t *mi, int n_threads)
+{
+	kt_for(n_threads, worker_post, mi, 1<<mi->b);
+}
+
+/******************
+ * Generate index *
+ ******************/
+
+#include <string.h> /* NOTE(review): include targets lost to tag-stripping; reconstructed — confirm */
+#include <zlib.h>
+#include "bseq.h"
+
+typedef struct {
+	int mini_batch_size, keep_name, is_hpc;
+	uint64_t batch_size;
+	bseq_file_t *fp;
+	mm_idx_t *mi;
+} pipeline_t;
+
+typedef struct {
+    int n_seq;
+	bseq1_t *seq;
+	mm128_v a;
+} step_t;
+
+static void mm_idx_add(mm_idx_t *mi, int n, const mm128_t *a)
+{
+	int i, mask = (1<<mi->b) - 1;
+	for (i = 0; i < n; ++i) {
+		mm128_v *p = &mi->B[a[i].x&mask].a;
+		
kv_push(mm128_t, 0, *p, a[i]);
+	}
+}
+
+static void *worker_pipeline(void *shared, int step, void *in)
+{
+	int i;
+	pipeline_t *p = (pipeline_t*)shared;
+	if (step == 0) { // step 0: read sequences
+		step_t *s;
+		if (p->mi->sum_len > p->batch_size) return 0;
+		s = (step_t*)calloc(1, sizeof(step_t));
+		s->seq = bseq_read(p->fp, p->mini_batch_size, &s->n_seq); // read a mini-batch
+		if (s->seq) {
+			uint32_t old_m, m;
+			uint64_t sum_len, old_max_len, max_len;
+			assert((uint64_t)p->mi->n_seq + s->n_seq <= UINT32_MAX); // to prevent integer overflow
+			// make room for p->mi->seq
+			old_m = p->mi->n_seq, m = p->mi->n_seq + s->n_seq;
+			kroundup32(m); kroundup32(old_m);
+			if (old_m != m)
+				p->mi->seq = (mm_idx_seq_t*)realloc(p->mi->seq, m * sizeof(mm_idx_seq_t));
+			// make room for p->mi->S
+			for (i = 0, sum_len = 0; i < s->n_seq; ++i) sum_len += s->seq[i].l_seq;
+			old_max_len = (p->mi->sum_len + 7) / 8;
+			max_len = (p->mi->sum_len + sum_len + 7) / 8;
+			kroundup64(old_max_len); kroundup64(max_len);
+			if (old_max_len != max_len) {
+				p->mi->S = (uint32_t*)realloc(p->mi->S, max_len * 4);
+				memset(&p->mi->S[old_max_len], 0, 4 * (max_len - old_max_len));
+			}
+			// populate p->mi->seq
+			for (i = 0; i < s->n_seq; ++i) {
+				mm_idx_seq_t *seq = &p->mi->seq[p->mi->n_seq]; // fixed: no ++ here — n_seq is incremented once, at the rid assignment below
+				uint32_t j;
+				if (p->keep_name) {
+					assert(strlen(s->seq[i].name) <= 254); // a long query name breaks BAM
+					seq->name = strdup(s->seq[i].name);
+				} else seq->name = 0;
+				seq->len = s->seq[i].l_seq;
+				seq->offset = p->mi->sum_len;
+				// copy the sequence
+				for (j = 0; j < seq->len; ++j) { // TODO: this is not the fastest way, but let's first see if speed matters here
+					uint64_t o = p->mi->sum_len + j;
+					int c = seq_nt4_table[(uint8_t)s->seq[i].seq[j]];
+					mm_seq4_set(p->mi->S, o, c);
+				}
+				// update p->mi->{sum_len,n_seq}
+				p->mi->sum_len += seq->len;
+				s->seq[i].rid = p->mi->n_seq++;
+			}
+			return s;
+		} else free(s);
+	} else if (step == 1) { // step 1: compute sketch
+		step_t *s = (step_t*)in;
+		for (i = 0; i <
s->n_seq; ++i) { + bseq1_t *t = &s->seq[i]; + mm_sketch(0, t->seq, t->l_seq, p->mi->w, p->mi->k, t->rid, p->is_hpc, &s->a); + free(t->seq); free(t->name); + } + free(s->seq); s->seq = 0; + return s; + } else if (step == 2) { // dispatch sketch to buckets + step_t *s = (step_t*)in; + mm_idx_add(p->mi, s->a.n, s->a.a); + free(s->a.a); free(s); + } + return 0; +} + +mm_idx_t *mm_idx_gen(bseq_file_t *fp, int w, int k, int b, int is_hpc, int mini_batch_size, int n_threads, uint64_t batch_size, int keep_name) +{ + pipeline_t pl; + memset(&pl, 0, sizeof(pipeline_t)); + pl.mini_batch_size = mini_batch_size; + pl.keep_name = keep_name; + pl.is_hpc = is_hpc; + pl.batch_size = batch_size; + pl.fp = fp; + if (pl.fp == 0) return 0; + pl.mi = mm_idx_init(w, k, b); + + kt_pipeline(n_threads < 3? n_threads : 3, worker_pipeline, &pl, 3); + if (mm_verbose >= 3) + fprintf(stderr, "[M::%s::%.3f*%.2f] collected minimizers\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0)); + + mm_idx_post(pl.mi, n_threads); + if (mm_verbose >= 3) + fprintf(stderr, "[M::%s::%.3f*%.2f] sorted minimizers\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0)); + + return pl.mi; +} + +mm_idx_t *mm_idx_build(const char *fn, int w, int k, int is_hpc, int n_threads) // a simpler interface +{ + bseq_file_t *fp; + mm_idx_t *mi; + fp = bseq_open(fn); + if (fp == 0) return 0; + mi = mm_idx_gen(fp, w, k, MM_IDX_DEF_B, is_hpc, 1<<18, n_threads, UINT64_MAX, 1); + bseq_close(fp); + return mi; +} + +/************* + * index I/O * + *************/ + +#define MM_IDX_MAGIC "MMI\2" + +void mm_idx_dump(FILE *fp, const mm_idx_t *mi) +{ + uint32_t x[4]; + int i; + x[0] = mi->w, x[1] = mi->k, x[2] = mi->b, x[3] = mi->n_seq; + fwrite(MM_IDX_MAGIC, 1, 4, fp); + fwrite(x, 4, 4, fp); + for (i = 0; i < mi->n_seq; ++i) { + uint8_t l; + l = strlen(mi->seq[i].name); + fwrite(&l, 1, 1, fp); + fwrite(mi->seq[i].name, 1, l, fp); + fwrite(&mi->seq[i].len, 4, 1, fp); + } + for (i = 
0; i < 1<<mi->b; ++i) {
+		mm_idx_bucket_t *b = &mi->B[i];
+		khint_t k;
+		idxhash_t *h = (idxhash_t*)b->h;
+		uint32_t size = h? h->size : 0;
+		fwrite(&b->n, 4, 1, fp);
+		fwrite(b->p, 8, b->n, fp);
+		fwrite(&size, 4, 1, fp);
+		if (size == 0) continue;
+		for (k = 0; k < kh_end(h); ++k) {
+			uint64_t x[2];
+			if (!kh_exist(h, k)) continue;
+			x[0] = kh_key(h, k), x[1] = kh_val(h, k);
+			fwrite(x, 8, 2, fp);
+		}
+	}
+}
+
+mm_idx_t *mm_idx_load(FILE *fp)
+{
+	int i;
+	char magic[4];
+	uint32_t x[4];
+	mm_idx_t *mi;
+	if (fread(magic, 1, 4, fp) != 4) return 0;
+	if (strncmp(magic, MM_IDX_MAGIC, 4) != 0) return 0;
+	if (fread(x, 4, 4, fp) != 4) return 0; // fixed: fread returns the number of items read (at most 4); "!= 6" made loading always fail
+	mi = mm_idx_init(x[0], x[1], x[2]);
+	mi->n_seq = x[3];
+	mi->seq = (mm_idx_seq_t*)calloc(mi->n_seq, sizeof(mm_idx_seq_t));
+	for (i = 0; i < mi->n_seq; ++i) {
+		uint8_t l;
+		fread(&l, 1, 1, fp);
+		mi->seq[i].name = (char*)malloc(l + 1);
+		fread(mi->seq[i].name, 1, l, fp);
+		mi->seq[i].name[l] = 0;
+		fread(&mi->seq[i].len, 4, 1, fp);
+	}
+	for (i = 0; i < 1<<mi->b; ++i) {
+		mm_idx_bucket_t *b = &mi->B[i];
+		uint32_t j, size;
+		khint_t k;
+		idxhash_t *h;
+		fread(&b->n, 4, 1, fp);
+		b->p = (uint64_t*)malloc(b->n * 8);
+		fread(b->p, 8, b->n, fp);
+		fread(&size, 4, 1, fp);
+		if (size == 0) continue;
+		b->h = h = kh_init(idx);
+		kh_resize(idx, h, size);
+		for (j = 0; j < size; ++j) {
+			uint64_t x[2];
+			int absent;
+			fread(x, 8, 2, fp);
+			k = kh_put(idx, h, x[0], &absent);
+			assert(absent);
+			kh_val(h, k) = x[1];
+		}
+	}
+	return mi;
+}
diff --git a/khash.h b/khash.h
new file mode 100644
index 0000000..6373a93
--- /dev/null
+++ b/khash.h
@@ -0,0 +1,615 @@
+/* The MIT License
+
+   Copyright (c) 2008, 2009, 2011 by Attractive Chaos
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell
copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* + An example: + +#include "khash.h" +KHASH_MAP_INIT_INT(32, char) +int main() { + int ret, is_missing; + khiter_t k; + khash_t(32) *h = kh_init(32); + k = kh_put(32, h, 5, &ret); + kh_value(h, k) = 10; + k = kh_get(32, h, 10); + is_missing = (k == kh_end(h)); + k = kh_get(32, h, 5); + kh_del(32, h, k); + for (k = kh_begin(h); k != kh_end(h); ++k) + if (kh_exist(h, k)) kh_value(h, k) = 1; + kh_destroy(32, h); + return 0; +} +*/ + +/* + 2013-05-02 (0.2.8): + + * Use quadratic probing. When the capacity is power of 2, stepping function + i*(i+1)/2 guarantees to traverse each bucket. It is better than double + hashing on cache performance and is more robust than linear probing. + + In theory, double hashing should be more robust than quadratic probing. + However, my implementation is probably not for large hash tables, because + the second hash function is closely tied to the first hash function, + which reduce the effectiveness of double hashing. + + Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php + + 2011-12-29 (0.2.7): + + * Minor code clean up; no actual effect. + + 2011-09-16 (0.2.6): + + * The capacity is a power of 2. This seems to dramatically improve the + speed for simple keys. 
Thank Zilong Tan for the suggestion. Reference: + + - http://code.google.com/p/ulib/ + - http://nothings.org/computer/judy/ + + * Allow to optionally use linear probing which usually has better + performance for random input. Double hashing is still the default as it + is more robust to certain non-random input. + + * Added Wang's integer hash function (not used by default). This hash + function is more robust to certain non-random input. + + 2011-02-14 (0.2.5): + + * Allow to declare global functions. + + 2009-09-26 (0.2.4): + + * Improve portability + + 2008-09-19 (0.2.3): + + * Corrected the example + * Improved interfaces + + 2008-09-11 (0.2.2): + + * Improved speed a little in kh_put() + + 2008-09-10 (0.2.1): + + * Added kh_clear() + * Fixed a compiling error + + 2008-09-02 (0.2.0): + + * Changed to token concatenation which increases flexibility. + + 2008-08-31 (0.1.2): + + * Fixed a bug in kh_get(), which has not been tested previously. + + 2008-08-31 (0.1.1): + + * Added destructor +*/ + + +#ifndef __AC_KHASH_H +#define __AC_KHASH_H + +/*! + @header + + Generic hash table library. 
+ */

+#define AC_VERSION_KHASH_H "0.2.8"
+
+#include <stdlib.h> /* NOTE(review): include targets lost to tag-stripping; reconstructed from upstream klib khash.h */
+#include <string.h>
+#include <limits.h>
+#include "kalloc.h"
+
+/* compiler specific configuration */
+
+#if UINT_MAX == 0xffffffffu
+typedef unsigned int khint32_t;
+#elif ULONG_MAX == 0xffffffffu
+typedef unsigned long khint32_t;
+#endif
+
+#if ULONG_MAX == ULLONG_MAX
+typedef unsigned long khint64_t;
+#else
+typedef unsigned long long khint64_t;
+#endif
+
+#ifndef kh_inline
+#ifdef _MSC_VER
+#define kh_inline __inline
+#else
+#define kh_inline inline
+#endif
+#endif /* kh_inline */
+
+#ifndef klib_unused
+#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3)
+#define klib_unused __attribute__ ((__unused__))
+#else
+#define klib_unused
+#endif
+#endif /* klib_unused */
+
+typedef khint32_t khint_t;
+typedef khint_t khiter_t;
+
+#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)
+#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1)
+#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3)
+#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1)))
+#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1)))
+#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1)))
+#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1))
+
+#define __ac_fsize(m) ((m) < 16?
1 : (m)>>4) + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +static const double __ac_HASH_UPPER = 0.77; + +#define __KHASH_TYPE(name, khkey_t, khval_t) \ + typedef struct kh_##name##_s { \ + khint_t n_buckets, size, n_occupied, upper_bound; \ + khint32_t *flags; \ + khkey_t *keys; \ + khval_t *vals; \ + } kh_##name##_t; + +#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \ + extern kh_##name##_t *kh_init_##name(void); \ + extern void kh_destroy_##name(kh_##name##_t *h); \ + extern void kh_clear_##name(kh_##name##_t *h); \ + extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ + extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \ + extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ + extern void kh_del_##name(kh_##name##_t *h, khint_t x); + +#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + SCOPE kh_##name##_t *kh_init_##name(void) { \ + return (kh_##name##_t*)kcalloc(0, 1, sizeof(kh_##name##_t)); \ + } \ + SCOPE void kh_destroy_##name(kh_##name##_t *h) \ + { \ + if (h) { \ + kfree(0, (void *)h->keys); kfree(0, h->flags); \ + kfree(0, (void *)h->vals); \ + kfree(0, h); \ + } \ + } \ + SCOPE void kh_clear_##name(kh_##name##_t *h) \ + { \ + if (h && h->flags) { \ + memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \ + h->size = h->n_occupied = 0; \ + } \ + } \ + SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ + { \ + if (h->n_buckets) { \ + khint_t k, i, last, mask, step = 0; \ + mask = h->n_buckets - 1; \ + k = __hash_func(key); i = k & mask; \ + last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + i = (i + (++step)) & mask; \ + if (i == last) return h->n_buckets; \ + } \ + return __ac_iseither(h->flags, i)? 
h->n_buckets : i; \ + } else return 0; \ + } \ + SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ + { /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \ + khint32_t *new_flags = 0; \ + khint_t j = 1; \ + { \ + kroundup32(new_n_buckets); \ + if (new_n_buckets < 4) new_n_buckets = 4; \ + if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \ + else { /* hash table size to be changed (shrink or expand); rehash */ \ + new_flags = (khint32_t*)kmalloc(0, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + if (!new_flags) return -1; \ + memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + if (h->n_buckets < new_n_buckets) { /* expand */ \ + khkey_t *new_keys = (khkey_t*)krealloc(0, (void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (!new_keys) { kfree(0, new_flags); return -1; } \ + h->keys = new_keys; \ + if (kh_is_map) { \ + khval_t *new_vals = (khval_t*)krealloc(0, (void *)h->vals, new_n_buckets * sizeof(khval_t)); \ + if (!new_vals) { kfree(0, new_flags); return -1; } \ + h->vals = new_vals; \ + } \ + } /* otherwise shrink */ \ + } \ + } \ + if (j) { /* rehashing is needed */ \ + for (j = 0; j != h->n_buckets; ++j) { \ + if (__ac_iseither(h->flags, j) == 0) { \ + khkey_t key = h->keys[j]; \ + khval_t val; \ + khint_t new_mask; \ + new_mask = new_n_buckets - 1; \ + if (kh_is_map) val = h->vals[j]; \ + __ac_set_isdel_true(h->flags, j); \ + while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ + khint_t k, i, step = 0; \ + k = __hash_func(key); \ + i = k & new_mask; \ + while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \ + __ac_set_isempty_false(new_flags, i); \ + if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \ + { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ + if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] 
= val; val = tmp; } \ + __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \ + } else { /* write the element and jump out of the loop */ \ + h->keys[i] = key; \ + if (kh_is_map) h->vals[i] = val; \ + break; \ + } \ + } \ + } \ + } \ + if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \ + h->keys = (khkey_t*)krealloc(0, (void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) h->vals = (khval_t*)krealloc(0, (void *)h->vals, new_n_buckets * sizeof(khval_t)); \ + } \ + kfree(0, h->flags); /* free the working space */ \ + h->flags = new_flags; \ + h->n_buckets = new_n_buckets; \ + h->n_occupied = h->size; \ + h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ + } \ + return 0; \ + } \ + SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ + { \ + khint_t x; \ + if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \ + if (h->n_buckets > (h->size<<1)) { \ + if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \ + *ret = -1; return h->n_buckets; \ + } \ + } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \ + *ret = -1; return h->n_buckets; \ + } \ + } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ + { \ + khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \ + x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \ + if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \ + else { \ + last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + if (__ac_isdel(h->flags, i)) site = i; \ + i = (i + (++step)) & mask; \ + if (i == last) { x = site; break; } \ + } \ + if (x == h->n_buckets) { \ + if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \ + else x = i; \ + } \ + } \ + } \ + if (__ac_isempty(h->flags, x)) { /* not present at all */ \ + h->keys[x] = key; \ + 
__ac_set_isboth_false(h->flags, x); \ + ++h->size; ++h->n_occupied; \ + *ret = 1; \ + } else if (__ac_isdel(h->flags, x)) { /* deleted */ \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; \ + *ret = 2; \ + } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \ + return x; \ + } \ + SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \ + { \ + if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ + __ac_set_isdel_true(h->flags, x); \ + --h->size; \ + } \ + } + +#define KHASH_DECLARE(name, khkey_t, khval_t) \ + __KHASH_TYPE(name, khkey_t, khval_t) \ + __KHASH_PROTOTYPES(name, khkey_t, khval_t) + +#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + __KHASH_TYPE(name, khkey_t, khval_t) \ + __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + +#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + KHASH_INIT2(name, static kh_inline klib_unused, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + +/* --- BEGIN OF HASH FUNCTIONS --- */ + +/*! @function + @abstract Integer hash function + @param key The integer [khint32_t] + @return The hash value [khint_t] + */ +#define kh_int_hash_func(key) (khint32_t)(key) +/*! @function + @abstract Integer comparison function + */ +#define kh_int_hash_equal(a, b) ((a) == (b)) +/*! @function + @abstract 64-bit integer hash function + @param key The integer [khint64_t] + @return The hash value [khint_t] + */ +#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11) +/*! @function + @abstract 64-bit integer comparison function + */ +#define kh_int64_hash_equal(a, b) ((a) == (b)) +/*! 
@function + @abstract const char* hash function + @param s Pointer to a null terminated string + @return The hash value + */ +static kh_inline khint_t __ac_X31_hash_string(const char *s) +{ + khint_t h = (khint_t)*s; + if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s; + return h; +} +/*! @function + @abstract Another interface to const char* hash function + @param key Pointer to a null terminated string [const char*] + @return The hash value [khint_t] + */ +#define kh_str_hash_func(key) __ac_X31_hash_string(key) +/*! @function + @abstract Const char* comparison function + */ +#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) + +static kh_inline khint_t __ac_Wang_hash(khint_t key) +{ + key += ~(key << 15); + key ^= (key >> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); + return key; +} +#define kh_int_hash_func2(key) __ac_Wang_hash((khint_t)key) + +/* --- END OF HASH FUNCTIONS --- */ + +/* Other convenient macros... */ + +/*! + @abstract Type of the hash table. + @param name Name of the hash table [symbol] + */ +#define khash_t(name) kh_##name##_t + +/*! @function + @abstract Initiate a hash table. + @param name Name of the hash table [symbol] + @return Pointer to the hash table [khash_t(name)*] + */ +#define kh_init(name) kh_init_##name() + +/*! @function + @abstract Destroy a hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + */ +#define kh_destroy(name, h) kh_destroy_##name(h) + +/*! @function + @abstract Reset a hash table without deallocating memory. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + */ +#define kh_clear(name, h) kh_clear_##name(h) + +/*! @function + @abstract Resize a hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param s New size [khint_t] + */ +#define kh_resize(name, h, s) kh_resize_##name(h, s) + +/*! 
@function + @abstract Insert a key to the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @param r Extra return code: -1 if the operation failed; + 0 if the key is present in the hash table; + 1 if the bucket is empty (never used); 2 if the element in + the bucket has been deleted [int*] + @return Iterator to the inserted element [khint_t] + */ +#define kh_put(name, h, k, r) kh_put_##name(h, k, r) + +/*! @function + @abstract Retrieve a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @return Iterator to the found element, or kh_end(h) if the element is absent [khint_t] + */ +#define kh_get(name, h, k) kh_get_##name(h, k) + +/*! @function + @abstract Remove a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Iterator to the element to be deleted [khint_t] + */ +#define kh_del(name, h, k) kh_del_##name(h, k) + +/*! @function + @abstract Test whether a bucket contains data. + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return 1 if containing data; 0 otherwise [int] + */ +#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x))) + +/*! @function + @abstract Get key given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Key [type of keys] + */ +#define kh_key(h, x) ((h)->keys[x]) + +/*! @function + @abstract Get value given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Value [type of values] + @discussion For hash sets, calling this results in segfault. + */ +#define kh_val(h, x) ((h)->vals[x]) + +/*! 
@function + @abstract Alias of kh_val() + */ +#define kh_value(h, x) ((h)->vals[x]) + +/*! @function + @abstract Get the start iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The start iterator [khint_t] + */ +#define kh_begin(h) (khint_t)(0) + +/*! @function + @abstract Get the end iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The end iterator [khint_t] + */ +#define kh_end(h) ((h)->n_buckets) + +/*! @function + @abstract Get the number of elements in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of elements in the hash table [khint_t] + */ +#define kh_size(h) ((h)->size) + +/*! @function + @abstract Get the number of buckets in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of buckets in the hash table [khint_t] + */ +#define kh_n_buckets(h) ((h)->n_buckets) + +/*! @function + @abstract Iterate over the entries in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @param kvar Variable to which key will be assigned + @param vvar Variable to which value will be assigned + @param code Block of code to execute + */ +#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \ + for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ + if (!kh_exist(h,__i)) continue; \ + (kvar) = kh_key(h,__i); \ + (vvar) = kh_val(h,__i); \ + code; \ + } } + +/*! @function + @abstract Iterate over the values in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @param vvar Variable to which value will be assigned + @param code Block of code to execute + */ +#define kh_foreach_value(h, vvar, code) { khint_t __i; \ + for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ + if (!kh_exist(h,__i)) continue; \ + (vvar) = kh_val(h,__i); \ + code; \ + } } + +/* More conenient interfaces */ + +/*! 
@function + @abstract Instantiate a hash set containing integer keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_INT(name) \ + KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT(name, khval_t) \ + KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing 64-bit integer keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_INT64(name) \ + KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing 64-bit integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT64(name, khval_t) \ + KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) + +typedef const char *kh_cstr_t; +/*! @function + @abstract Instantiate a hash map containing const char* keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_STR(name) \ + KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal) + +/*! 
@function + @abstract Instantiate a hash map containing const char* keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_STR(name, khval_t) \ + KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) + +#endif /* __AC_KHASH_H */ diff --git a/ksort.h b/ksort.h new file mode 100644 index 0000000..b2ffc1a --- /dev/null +++ b/ksort.h @@ -0,0 +1,159 @@ +/* The MIT License + + Copyright (c) 2008, 2011 Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+*/ + +// This is a simplified version of ksort.h + +#ifndef AC_KSORT_H +#define AC_KSORT_H + +#include +#include + +typedef struct { + void *left, *right; + int depth; +} ks_isort_stack_t; + +#define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; } + +#define KSORT_INIT(name, type_t, __sort_lt) \ + size_t ks_lis_##name(size_t n, const type_t *a, size_t *b, size_t *_p) \ + { /* translated from: http://www.algorithmist.com/index.php/Longest_Increasing_Subsequence.cpp */ \ + size_t i, u, v, *top = b, *p; \ + if (n == 0) return 0; \ + p = _p? _p : (size_t*)calloc(n, sizeof(size_t)); \ + *top++ = 0; \ + for (i = 1; i < n; i++) { \ + if (__sort_lt(a[*(top-1)], a[i])) { \ + p[i] = *(top-1); \ + *top++ = i; \ + continue; \ + } \ + for (u = 0, v = top - b - 1; u < v;) { \ + size_t c = (u + v) >> 1; \ + if (__sort_lt(a[b[c]], a[i])) u = c + 1; \ + else v = c; \ + } \ + if (__sort_lt(a[i], a[b[u]])) { \ + if (u > 0) p[i] = b[u-1]; \ + b[u] = i; \ + } \ + } \ + for (u = top - b, v = *(top-1); u--; v = p[v]) b[u] = v; \ + if (!_p) free(p); \ + return top - b; \ + } \ + type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk) \ + { \ + type_t *low, *high, *k, *ll, *hh, *mid; \ + low = arr; high = arr + n - 1; k = arr + kk; \ + for (;;) { \ + if (high <= low) return *k; \ + if (high == low + 1) { \ + if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ + return *k; \ + } \ + mid = low + (high - low) / 2; \ + if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \ + if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ + if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low); \ + KSORT_SWAP(type_t, *mid, *(low+1)); \ + ll = low + 1; hh = high; \ + for (;;) { \ + do ++ll; while (__sort_lt(*ll, *low)); \ + do --hh; while (__sort_lt(*low, *hh)); \ + if (hh < ll) break; \ + KSORT_SWAP(type_t, *ll, *hh); \ + } \ + KSORT_SWAP(type_t, *low, *hh); \ + if (hh <= k) low = ll; \ + if (hh >= k) high = hh - 1; \ + } \ + } \ + +#define 
ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k) + +#define ks_lt_generic(a, b) ((a) < (b)) +#define ks_lt_str(a, b) (strcmp((a), (b)) < 0) + +typedef const char *ksstr_t; + +#define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic) +#define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str) + +#define RS_MIN_SIZE 64 + +#define KRADIX_SORT_INIT(name, rstype_t, rskey, sizeof_key) \ + typedef struct { \ + rstype_t *b, *e; \ + } rsbucket_##name##_t; \ + void rs_insertsort_##name(rstype_t *beg, rstype_t *end) \ + { \ + rstype_t *i; \ + for (i = beg + 1; i < end; ++i) \ + if (rskey(*i) < rskey(*(i - 1))) { \ + rstype_t *j, tmp = *i; \ + for (j = i; j > beg && rskey(tmp) < rskey(*(j-1)); --j) \ + *j = *(j - 1); \ + *j = tmp; \ + } \ + } \ + void rs_sort_##name(rstype_t *beg, rstype_t *end, int n_bits, int s) \ + { \ + rstype_t *i; \ + int size = 1<b = k->e = beg; \ + for (i = beg; i != end; ++i) ++b[rskey(*i)>>s&m].e; \ + for (k = b + 1; k != be; ++k) \ + k->e += (k-1)->e - beg, k->b = (k-1)->e; \ + for (k = b; k != be;) { \ + if (k->b != k->e) { \ + rsbucket_##name##_t *l; \ + if ((l = b + (rskey(*k->b)>>s&m)) != k) { \ + rstype_t tmp = *k->b, swap; \ + do { \ + swap = tmp; tmp = *l->b; *l->b++ = swap; \ + l = b + (rskey(tmp)>>s&m); \ + } while (l != k); \ + *k->b++ = tmp; \ + } else ++k->b; \ + } else ++k; \ + } \ + for (b->b = beg, k = b + 1; k != be; ++k) k->b = (k-1)->e; \ + if (s) { \ + s = s > n_bits? 
s - n_bits : 0; \ + for (k = b; k != be; ++k) \ + if (k->e - k->b > RS_MIN_SIZE) rs_sort_##name(k->b, k->e, n_bits, s); \ + else if (k->e - k->b > 1) rs_insertsort_##name(k->b, k->e); \ + } \ + } \ + void radix_sort_##name(rstype_t *beg, rstype_t *end) \ + { \ + if (end - beg <= RS_MIN_SIZE) rs_insertsort_##name(beg, end); \ + else rs_sort_##name(beg, end, 8, sizeof_key * 8 - 8); \ + } + +#endif diff --git a/kthread.c b/kthread.c new file mode 100644 index 0000000..f991714 --- /dev/null +++ b/kthread.c @@ -0,0 +1,151 @@ +#include +#include +#include + +/************ + * kt_for() * + ************/ + +struct kt_for_t; + +typedef struct { + struct kt_for_t *t; + long i; +} ktf_worker_t; + +typedef struct kt_for_t { + int n_threads; + long n; + ktf_worker_t *w; + void (*func)(void*,long,int); + void *data; +} kt_for_t; + +static inline long steal_work(kt_for_t *t) +{ + int i, min_i = -1; + long k, min = LONG_MAX; + for (i = 0; i < t->n_threads; ++i) + if (min > t->w[i].i) min = t->w[i].i, min_i = i; + k = __sync_fetch_and_add(&t->w[min_i].i, t->n_threads); + return k >= t->n? 
-1 : k; +} + +static void *ktf_worker(void *data) +{ + ktf_worker_t *w = (ktf_worker_t*)data; + long i; + for (;;) { + i = __sync_fetch_and_add(&w->i, w->t->n_threads); + if (i >= w->t->n) break; + w->t->func(w->t->data, i, w - w->t->w); + } + while ((i = steal_work(w->t)) >= 0) + w->t->func(w->t->data, i, w - w->t->w); + pthread_exit(0); +} + +void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n) +{ + if (n_threads > 1) { + int i; + kt_for_t t; + pthread_t *tid; + t.func = func, t.data = data, t.n_threads = n_threads, t.n = n; + t.w = (ktf_worker_t*)alloca(n_threads * sizeof(ktf_worker_t)); + tid = (pthread_t*)alloca(n_threads * sizeof(pthread_t)); + for (i = 0; i < n_threads; ++i) + t.w[i].t = &t, t.w[i].i = i; + for (i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, ktf_worker, &t.w[i]); + for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0); + } else { + long j; + for (j = 0; j < n; ++j) func(data, j, 0); + } +} + +/***************** + * kt_pipeline() * + *****************/ + +struct ktp_t; + +typedef struct { + struct ktp_t *pl; + int64_t index; + int step; + void *data; +} ktp_worker_t; + +typedef struct ktp_t { + void *shared; + void *(*func)(void*, int, void*); + int64_t index; + int n_workers, n_steps; + ktp_worker_t *workers; + pthread_mutex_t mutex; + pthread_cond_t cv; +} ktp_t; + +static void *ktp_worker(void *data) +{ + ktp_worker_t *w = (ktp_worker_t*)data; + ktp_t *p = w->pl; + while (w->step < p->n_steps) { + // test whether we can kick off the job with this worker + pthread_mutex_lock(&p->mutex); + for (;;) { + int i; + // test whether another worker is doing the same step + for (i = 0; i < p->n_workers; ++i) { + if (w == &p->workers[i]) continue; // ignore itself + if (p->workers[i].step <= w->step && p->workers[i].index < w->index) + break; + } + if (i == p->n_workers) break; // no workers with smaller indices are doing w->step or the previous steps + pthread_cond_wait(&p->cv, &p->mutex); + } + 
pthread_mutex_unlock(&p->mutex); + + // working on w->step + w->data = p->func(p->shared, w->step, w->step? w->data : 0); // for the first step, input is NULL + + // update step and let other workers know + pthread_mutex_lock(&p->mutex); + w->step = w->step == p->n_steps - 1 || w->data? (w->step + 1) % p->n_steps : p->n_steps; + if (w->step == 0) w->index = p->index++; + pthread_cond_broadcast(&p->cv); + pthread_mutex_unlock(&p->mutex); + } + pthread_exit(0); +} + +void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps) +{ + ktp_t aux; + pthread_t *tid; + int i; + + if (n_threads < 1) n_threads = 1; + aux.n_workers = n_threads; + aux.n_steps = n_steps; + aux.func = func; + aux.shared = shared_data; + aux.index = 0; + pthread_mutex_init(&aux.mutex, 0); + pthread_cond_init(&aux.cv, 0); + + aux.workers = (ktp_worker_t*)alloca(n_threads * sizeof(ktp_worker_t)); + for (i = 0; i < n_threads; ++i) { + ktp_worker_t *w = &aux.workers[i]; + w->step = 0; w->pl = &aux; w->data = 0; + w->index = aux.index++; + } + + tid = (pthread_t*)alloca(n_threads * sizeof(pthread_t)); + for (i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, ktp_worker, &aux.workers[i]); + for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0); + + pthread_mutex_destroy(&aux.mutex); + pthread_cond_destroy(&aux.cv); +} diff --git a/kthread.h b/kthread.h new file mode 100644 index 0000000..c3cd165 --- /dev/null +++ b/kthread.h @@ -0,0 +1,15 @@ +#ifndef KTHREAD_H +#define KTHREAD_H + +#ifdef __cplusplus +extern "C" { +#endif + +void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n); +void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/minimap.h b/minimap.h index 1376968..5728cf2 100644 --- a/minimap.h +++ b/minimap.h @@ -4,7 +4,6 @@ #include #include #include -#include "bseq.h" #define MM_IDX_DEF_B 14 #define MM_DEREP_Q50 5.0 @@ -30,13 +29,18 
@@ typedef struct { } mm_idx_bucket_t; typedef struct { - int b, w, k; - uint32_t n; // number of reference sequences - mm_idx_bucket_t *B; - uint32_t max_occ; - float freq_thres; - int32_t *len; // length of each reference sequence - char **name; // TODO: if this uses too much RAM, switch to one concatenated string + char *name; // name of the db sequence + uint64_t offset; // offset in mm_idx_t::seq16 + uint32_t len; // length +} mm_idx_seq_t; + +typedef struct { + int32_t b, w, k; + uint64_t sum_len; // sum of lengths + uint32_t n_seq; // number of reference sequences + mm_idx_seq_t *seq; // sequence name, length and offset + uint32_t *S; // 4-bit packed sequence + mm_idx_bucket_t *B; // index } mm_idx_t; typedef struct { @@ -62,6 +66,11 @@ extern double mm_realtime0; struct mm_tbuf_s; typedef struct mm_tbuf_s mm_tbuf_t; +struct bseq_file_s; + +#define mm_seq4_set(s, i, c) ((s)[(i)>>8] |= (uint32_t)(c) << (((i)&7)<<2)) +#define mm_seq4_get(s, i) ((s)[(i)>>8] >> (((i)&7)<<2) & 0xf) + #ifdef __cplusplus extern "C" { #endif @@ -72,11 +81,11 @@ void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, i // minimizer indexing mm_idx_t *mm_idx_init(int w, int k, int b); void mm_idx_destroy(mm_idx_t *mi); -mm_idx_t *mm_idx_gen(bseq_file_t *fp, int w, int k, int b, int tbatch_size, int n_threads, uint64_t ibatch_size, int keep_name); +mm_idx_t *mm_idx_gen(struct bseq_file_s *fp, int w, int k, int b, int is_hpc, int mini_batch_size, int n_threads, uint64_t batch_size, int keep_name); void mm_idx_set_max_occ(mm_idx_t *mi, float f); const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n); -mm_idx_t *mm_idx_build(const char *fn, int w, int k, int n_threads); +mm_idx_t *mm_idx_build(const char *fn, int w, int k, int is_hpc, int n_threads); // minimizer index I/O void mm_idx_dump(FILE *fp, const mm_idx_t *mi); diff --git a/misc.c b/misc.c new file mode 100644 index 0000000..3f8bf52 --- /dev/null +++ b/misc.c @@ -0,0 +1,26 @@ +#include 
+#include +#include "minimap.h" + +int mm_verbose = 3; +double mm_realtime0; + +double cputime() +{ + struct rusage r; + getrusage(RUSAGE_SELF, &r); + return r.ru_utime.tv_sec + r.ru_stime.tv_sec + 1e-6 * (r.ru_utime.tv_usec + r.ru_stime.tv_usec); +} + +double realtime() +{ + struct timeval tp; + struct timezone tzp; + gettimeofday(&tp, &tzp); + return tp.tv_sec + tp.tv_usec * 1e-6; +} + +#include "ksort.h" +#define sort_key_128x(a) ((a).x) +KRADIX_SORT_INIT(128x, mm128_t, sort_key_128x, 8) +KSORT_INIT_GENERIC(uint32_t)