Add third-party libraries such as log and argparse; update the code accordingly

This commit is contained in:
zzh 2024-12-14 12:24:19 +08:00
parent 1c35dafbf8
commit 27e0af955e
198 changed files with 49032 additions and 0 deletions

.gitignore vendored (3 lines changed)

@@ -1,6 +1,9 @@
# ---> C++
# Prerequisites
*.d
/.vscode
/build
build.sh
# Compiled Object files
*.slo

CMakeLists.txt

@@ -0,0 +1,8 @@
CMAKE_MINIMUM_REQUIRED(VERSION 3.0)
project(FastDup)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
# set(CMAKE_BUILD_TYPE Debug)
# set(CMAKE_BUILD_TYPE Release)
ADD_SUBDIRECTORY(src)

File diff suppressed because it is too large.

ext/klib/.gitignore vendored 100644 (40 lines)

@@ -0,0 +1,40 @@
# General
*.a
*.dSYM/
*.la
*.lo
*.o
*.opensdf
*.orig
*.sdf
*.suo
*.swp
*.tests
*.vcxproj.filters
*.vcxproj.user
*~
.git
TAGS
# Mac/Xcode-specific
xcuserdata
contents.xcworkspacedata
.DS_Store
._*
# Test byproducts
test/kbtree_test
test/khash_keith
test/khash_keith2
test/khash_test
test/klist_test
test/kmin_test
test/kseq_bench
test/kseq_bench2
test/kseq_test
test/ksort_test
test/ksort_test-stl
test/kstring_bench
test/kstring_bench2
test/kstring_test
test/kvec_test

@@ -0,0 +1,23 @@
The MIT License
Copyright (c) 2008- Attractive Chaos <attractor@live.co.uk>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

ext/klib/README.md 100644 (243 lines)

@@ -0,0 +1,243 @@
# Klib: a Generic Library in C
## <a name="overview"></a>Overview
Klib is a standalone and lightweight C library distributed under [MIT/X11
license][1]. Most components are independent of external libraries, except the
standard C library, and independent of each other. To use a component of this
library, you only need to copy a couple of files to your source code tree
without worrying about library dependencies.
Klib strives for efficiency and a small memory footprint. Some components, such
as khash.h, kbtree.h, ksort.h and kvec.h, are among the most efficient
implementations of similar algorithms or data structures in all programming
languages, in terms of both speed and memory use.
New documentation is available [here](http://attractivechaos.github.io/klib/),
which includes most of the information in this README file.
#### Common components
* [khash.h][khash]: generic [hash table][2] with open addressing.
* [kbtree.h][kbtree]: generic search tree based on [B-tree][3].
* [kavl.h][kavl]: generic intrusive [AVL tree][wiki-avl].
* [ksort.h][ksort]: generic sort, including [introsort][4], [merge sort][5], [heap sort][6], [comb sort][7], [Knuth shuffle][8] and the [k-small][9] algorithm.
* [kseq.h][kseq]: generic stream buffer and a [FASTA][10]/[FASTQ][11] format parser.
* kvec.h: generic dynamic array.
* klist.h: generic single-linked list and [memory pool][12].
* kstring.{h,c}: basic string library.
* kmath.{h,c}: numerical routines including [MT19937-64][13] [pseudorandom generator][14], basic [nonlinear programming][15] and a few special math functions.
* [ketopt.h][ketopt]: portable command-line argument parser with getopt\_long-like API.
#### Components for more specific use cases
* ksa.c: constructing [suffix arrays][16] for strings with multiple sentinels, based on a revised [SAIS algorithm][17].
* knetfile.{h,c}: random access to remote files on HTTP or FTP.
* kopen.c: smart stream opening.
* khmm.{h,c}: basic [HMM][18] library.
* ksw.{h,c}: striped [Smith-Waterman algorithm][19].
* knhx.{h,c}: [Newick tree format][20] parser.
## <a name="methodology"></a>Methodology
For the implementation of generic [containers][21], klib extensively uses C
macros. To use these data structures, we usually need to instantiate methods by
expanding a long macro. This makes the source code look unusual or even ugly
and adds difficulty to debugging. Unfortunately, for efficient generic
programming in C, which lacks [templates][22], using macros is the only
solution. Only with macros can we write a generic container that, once
instantiated, competes with a type-specific container in efficiency. Some
generic libraries in C, such as [Glib][23], use the `void*` type to implement
containers. These implementations are usually slower and use more memory than
klib (see [this benchmark][31]).
To effectively use klib, it is important to understand how it achieves generic
programming. We will use the hash table library as an example:

    #include "khash.h"
    KHASH_MAP_INIT_INT(m32, char)        // instantiate structs and methods
    int main() {
        int ret, is_missing;
        khint_t k;
        khash_t(m32) *h = kh_init(m32);  // allocate a hash table
        k = kh_put(m32, h, 5, &ret);     // insert a key to the hash table
        if (!ret) kh_del(m32, h, k);
        kh_value(h, k) = 10;             // set the value
        k = kh_get(m32, h, 10);          // query the hash table
        is_missing = (k == kh_end(h));   // test if the key is present
        k = kh_get(m32, h, 5);
        kh_del(m32, h, k);               // remove a key-value pair
        for (k = kh_begin(h); k != kh_end(h); ++k)  // traverse
            if (kh_exist(h, k))          // test if a bucket contains data
                kh_value(h, k) = 1;
        kh_destroy(m32, h);              // deallocate the hash table
        return 0;
    }

In this example, the second line instantiates a hash table with `unsigned` as
the key type and `char` as the value type. `m32` names such a type of hash table.
All types and functions associated with this name are macros, which will be
explained later. Macro `kh_init()` initiates a hash table and `kh_destroy()`
frees it. `kh_put()` inserts a key and returns the iterator (or the position)
in the hash table. `kh_get()` and `kh_del()` get a key and delete an element,
respectively. Macro `kh_exist()` tests if an iterator (or a position) is filled
with data.
An immediate question is that this piece of code does not look like a valid C
program (e.g. the missing semicolons, the assignment to an _apparent_ function
call, and the _apparent_ use of an undefined `m32` 'variable'). To understand
why the code is correct,
let's go a bit further into the source code of `khash.h`, whose skeleton looks
like:

    #define KHASH_INIT(name, SCOPE, key_t, val_t, is_map, _hashf, _hasheq) \
        typedef struct { \
            int n_buckets, size, n_occupied, upper_bound; \
            unsigned *flags; \
            key_t *keys; \
            val_t *vals; \
        } kh_##name##_t; \
        SCOPE inline kh_##name##_t *init_##name() { \
            return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \
        } \
        SCOPE inline int get_##name(kh_##name##_t *h, key_t k) \
        ... \
        SCOPE inline void destroy_##name(kh_##name##_t *h) { \
            if (h) { \
                free(h->keys); free(h->flags); free(h->vals); free(h); \
            } \
        }

    #define _int_hf(key) (unsigned)(key)
    #define _int_heq(a, b) (a == b)
    #define khash_t(name) kh_##name##_t
    #define kh_value(h, k) ((h)->vals[k])
    #define kh_begin(h, k) 0
    #define kh_end(h) ((h)->n_buckets)
    #define kh_init(name) init_##name()
    #define kh_get(name, h, k) get_##name(h, k)
    #define kh_destroy(name, h) destroy_##name(h)
    ...
    #define KHASH_MAP_INIT_INT(name, val_t) \
        KHASH_INIT(name, static, unsigned, val_t, is_map, _int_hf, _int_heq)

`KHASH_INIT()` is a huge macro defining all the structs and methods. When this
macro is called, all the code inside it will be inserted by the [C
preprocessor][37] at the place where it is called. If the macro is called
multiple times, multiple copies of the code will be inserted. To avoid naming
conflicts between hash tables with different key-value types, the library uses [token
concatenation][36], which is a preprocessor feature whereby we can substitute
part of a symbol based on the parameter of the macro. In the end, the C
preprocessor will generate the following code and feed it to the compiler
(macro `kh_exist(h,k)` is a little complex and not expanded for simplicity):

    typedef struct {
        int n_buckets, size, n_occupied, upper_bound;
        unsigned *flags;
        unsigned *keys;
        char *vals;
    } kh_m32_t;
    static inline kh_m32_t *init_m32() {
        return (kh_m32_t*)calloc(1, sizeof(kh_m32_t));
    }
    static inline int get_m32(kh_m32_t *h, unsigned k)
    ...
    static inline void destroy_m32(kh_m32_t *h) {
        if (h) {
            free(h->keys); free(h->flags); free(h->vals); free(h);
        }
    }

    int main() {
        int ret, is_missing;
        khint_t k;
        kh_m32_t *h = init_m32();
        k = put_m32(h, 5, &ret);
        if (!ret) del_m32(h, k);
        h->vals[k] = 10;
        k = get_m32(h, 10);
        is_missing = (k == h->n_buckets);
        k = get_m32(h, 5);
        del_m32(h, k);
        for (k = 0; k != h->n_buckets; ++k)
            if (kh_exist(h, k)) h->vals[k] = 1;
        destroy_m32(h);
        return 0;
    }

This is the C program we know.
From this example, we can see that macros and the C preprocessor play a key
role in klib. Klib is fast partly because the compiler knows the key-value
type at compile time and is able to optimize the code to the same level
as type-specific code. A generic library written with `void*` will not get such
a performance boost.
Massively inserting code upon instantiation may remind us of C++'s slow
compilation and huge binaries when STL/boost is in use. Klib is much better in
this respect due to its small code size and component independence. Inserting
several hundred lines of code won't make compilation noticeably slower.
## <a name="resources"></a>Resources
* Library documentation, if present, is available in the header files. Examples
can be found in the [test/][24] directory.
* **Obsolete** documentation of the hash table library can be found at
[SourceForge][25]. This README is partly adapted from the old documentation.
* [Blog post][26] describing the hash table library.
* [Blog post][27] on why using `void*` for generic programming may be inefficient.
* [Blog post][28] on the generic stream buffer.
* [Blog post][29] evaluating the performance of `kvec.h`.
* [Blog post][30] arguing B-tree may be a better data structure than a binary search tree.
* [Blog post][31] evaluating the performance of `khash.h` and `kbtree.h` among many other implementations.
[An older version][33] of the benchmark is also available.
* [Blog post][34] benchmarking internal sorting algorithms and implementations.
* [Blog post][32] on the k-small algorithm.
* [Blog post][35] on the Hooke-Jeeves algorithm for nonlinear programming.
[1]: http://en.wikipedia.org/wiki/MIT_License
[2]: https://en.wikipedia.org/wiki/Hash_table
[3]: http://en.wikipedia.org/wiki/B-tree
[4]: http://en.wikipedia.org/wiki/Introsort
[5]: http://en.wikipedia.org/wiki/Merge_sort
[6]: http://en.wikipedia.org/wiki/Heapsort
[7]: http://en.wikipedia.org/wiki/Comb_sort
[8]: http://en.wikipedia.org/wiki/Fisher-Yates_shuffle
[9]: http://en.wikipedia.org/wiki/Selection_algorithm
[10]: http://en.wikipedia.org/wiki/FASTA_format
[11]: http://en.wikipedia.org/wiki/FASTQ_format
[12]: http://en.wikipedia.org/wiki/Memory_pool
[13]: http://en.wikipedia.org/wiki/Mersenne_twister
[14]: http://en.wikipedia.org/wiki/Pseudorandom_generator
[15]: http://en.wikipedia.org/wiki/Nonlinear_programming
[16]: http://en.wikipedia.org/wiki/Suffix_array
[17]: https://sites.google.com/site/yuta256/sais
[18]: http://en.wikipedia.org/wiki/Hidden_Markov_model
[19]: http://en.wikipedia.org/wiki/Smith-Waterman_algorithm
[20]: http://en.wikipedia.org/wiki/Newick_format
[21]: http://en.wikipedia.org/wiki/Container_(abstract_data_type)
[22]: http://en.wikipedia.org/wiki/Template_(C%2B%2B)
[23]: http://en.wikipedia.org/wiki/GLib
[24]: https://github.com/attractivechaos/klib/tree/master/test
[25]: http://klib.sourceforge.net/
[26]: http://attractivechaos.wordpress.com/2008/09/02/implementing-generic-hash-library-in-c/
[27]: http://attractivechaos.wordpress.com/2008/10/02/using-void-in-generic-c-programming-may-be-inefficient/
[28]: http://attractivechaos.wordpress.com/2008/10/11/a-generic-buffered-stream-wrapper/
[29]: http://attractivechaos.wordpress.com/2008/09/19/c-array-vs-c-vector/
[30]: http://attractivechaos.wordpress.com/2008/09/24/b-tree-vs-binary-search-tree/
[31]: http://attractivechaos.wordpress.com/2008/10/07/another-look-at-my-old-benchmark/
[32]: http://attractivechaos.wordpress.com/2008/09/13/calculating-median/
[33]: http://attractivechaos.wordpress.com/2008/08/28/comparison-of-hash-table-libraries/
[34]: http://attractivechaos.wordpress.com/2008/08/28/comparison-of-internal-sorting-algorithms/
[35]: http://attractivechaos.wordpress.com/2008/08/24/derivative-free-optimization-dfo/
[36]: http://en.wikipedia.org/wiki/C_preprocessor#Token_concatenation
[37]: http://en.wikipedia.org/wiki/C_preprocessor
[wiki-avl]: https://en.wikipedia.org/wiki/AVL_tree
[kbtree]: http://attractivechaos.github.io/klib/#KBtree%3A%20generic%20ordered%20map:%5B%5BKBtree%3A%20generic%20ordered%20map%5D%5D
[khash]: http://attractivechaos.github.io/klib/#Khash%3A%20generic%20hash%20table:%5B%5BKhash%3A%20generic%20hash%20table%5D%5D
[kseq]: http://attractivechaos.github.io/klib/#Kseq%3A%20stream%20buffer%20and%20FASTA%2FQ%20parser:%5B%5BKseq%3A%20stream%20buffer%20and%20FASTA%2FQ%20parser%5D%5D
[ksort]: http://attractivechaos.github.io/klib/#Ksort%3A%20sorting%2C%20shuffling%2C%20heap%20and%20k-small:%5B%5BKsort%3A%20sorting%2C%20shuffling%2C%20heap%20and%20k-small%5D%5D
[kavl]: http://attractivechaos.github.io/klib/#KAVL%3A%20generic%20intrusive%20AVL%20tree
[ketopt]: http://attractivechaos.github.io/klib/#Ketopt%3A%20parsing%20command-line%20arguments

ext/klib/bgzf.c 100644 (555 lines)

@@ -0,0 +1,555 @@
/* The MIT License
Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
2011 Attractive Chaos <attractor@live.co.uk>
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <assert.h>
#include <sys/types.h>
#include "bgzf.h"
#ifdef _USE_KNETFILE
#include "knetfile.h"
typedef knetFile *_bgzf_file_t;
#define _bgzf_open(fn, mode) knet_open(fn, mode)
#define _bgzf_dopen(fp, mode) knet_dopen(fp, mode)
#define _bgzf_close(fp) knet_close(fp)
#define _bgzf_fileno(fp) ((fp)->fd)
#define _bgzf_tell(fp) knet_tell(fp)
#define _bgzf_seek(fp, offset, whence) knet_seek(fp, offset, whence)
#define _bgzf_read(fp, buf, len) knet_read(fp, buf, len)
#define _bgzf_write(fp, buf, len) knet_write(fp, buf, len)
#else // ~defined(_USE_KNETFILE)
#if defined(_WIN32) || defined(_MSC_VER)
#define ftello(fp) ftell(fp)
#define fseeko(fp, offset, whence) fseek(fp, offset, whence)
#else // ~defined(_WIN32)
extern off_t ftello(FILE *stream);
extern int fseeko(FILE *stream, off_t offset, int whence);
#endif // ~defined(_WIN32)
typedef FILE *_bgzf_file_t;
#define _bgzf_open(fn, mode) fopen(fn, mode)
#define _bgzf_dopen(fp, mode) fdopen(fp, mode)
#define _bgzf_close(fp) fclose(fp)
#define _bgzf_fileno(fp) fileno(fp)
#define _bgzf_tell(fp) ftello(fp)
#define _bgzf_seek(fp, offset, whence) fseeko(fp, offset, whence)
#define _bgzf_read(fp, buf, len) fread(buf, 1, len, fp)
#define _bgzf_write(fp, buf, len) fwrite(buf, 1, len, fp)
#endif // ~defined(_USE_KNETFILE)
#define BLOCK_HEADER_LENGTH 18
#define BLOCK_FOOTER_LENGTH 8
/* BGZF/GZIP header (specialized from RFC 1952; little endian):
 +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
 | 31|139|  8|  4|  0|  0|  0|  0|  0|255|  6|  0| 66| 67|  2|  0|BLK_LEN|
 +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
*/
static const uint8_t g_magic[19] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\0\0";
#ifdef BGZF_CACHE
typedef struct {
int size;
uint8_t *block;
int64_t end_offset;
} cache_t;
#include "khash.h"
KHASH_MAP_INIT_INT64(cache, cache_t)
#endif
static inline void packInt16(uint8_t *buffer, uint16_t value)
{
buffer[0] = value;
buffer[1] = value >> 8;
}
static inline int unpackInt16(const uint8_t *buffer)
{
return buffer[0] | buffer[1] << 8;
}
static inline void packInt32(uint8_t *buffer, uint32_t value)
{
buffer[0] = value;
buffer[1] = value >> 8;
buffer[2] = value >> 16;
buffer[3] = value >> 24;
}
static BGZF *bgzf_read_init()
{
BGZF *fp;
fp = calloc(1, sizeof(BGZF));
fp->open_mode = 'r';
fp->uncompressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
fp->compressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
#ifdef BGZF_CACHE
fp->cache = kh_init(cache);
#endif
return fp;
}
static BGZF *bgzf_write_init(int compress_level) // compress_level==-1 for the default level
{
BGZF *fp;
fp = calloc(1, sizeof(BGZF));
fp->open_mode = 'w';
fp->uncompressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
fp->compressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
fp->compress_level = compress_level < 0? Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1
if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION;
return fp;
}
// get the compress level from the mode string
static int mode2level(const char *__restrict mode)
{
int i, compress_level = -1;
for (i = 0; mode[i]; ++i)
if (mode[i] >= '0' && mode[i] <= '9') break;
if (mode[i]) compress_level = (int)mode[i] - '0';
if (strchr(mode, 'u')) compress_level = 0;
return compress_level;
}
BGZF *bgzf_open(const char *path, const char *mode)
{
BGZF *fp = 0;
if (strchr(mode, 'r') || strchr(mode, 'R')) {
_bgzf_file_t fpr;
if ((fpr = _bgzf_open(path, "r")) == 0) return 0;
fp = bgzf_read_init();
fp->fp = fpr;
} else if (strchr(mode, 'w') || strchr(mode, 'W')) {
FILE *fpw;
if ((fpw = fopen(path, "w")) == 0) return 0;
fp = bgzf_write_init(mode2level(mode));
fp->fp = fpw;
}
return fp;
}
BGZF *bgzf_dopen(int fd, const char *mode)
{
BGZF *fp = 0;
if (strchr(mode, 'r') || strchr(mode, 'R')) {
_bgzf_file_t fpr;
if ((fpr = _bgzf_dopen(fd, "r")) == 0) return 0;
fp = bgzf_read_init();
fp->fp = fpr;
} else if (strchr(mode, 'w') || strchr(mode, 'W')) {
FILE *fpw;
if ((fpw = fdopen(fd, "w")) == 0) return 0;
fp = bgzf_write_init(mode2level(mode));
fp->fp = fpw;
}
return fp;
}
// Deflate the block in fp->uncompressed_block into fp->compressed_block. Also adds an extra field that stores the compressed block length.
static int deflate_block(BGZF *fp, int block_length)
{
uint8_t *buffer = fp->compressed_block;
int buffer_size = BGZF_BLOCK_SIZE;
int input_length = block_length;
int compressed_length = 0;
int remaining;
uint32_t crc;
assert(block_length <= BGZF_BLOCK_SIZE); // guaranteed by the caller
memcpy(buffer, g_magic, BLOCK_HEADER_LENGTH); // the last two bytes are a place holder for the length of the block
while (1) { // loop to retry for blocks that do not compress enough
int status;
z_stream zs;
zs.zalloc = NULL;
zs.zfree = NULL;
zs.next_in = fp->uncompressed_block;
zs.avail_in = input_length;
zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH];
zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
status = deflateInit2(&zs, fp->compress_level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY); // -15 to disable zlib header/footer
if (status != Z_OK) {
fp->errcode |= BGZF_ERR_ZLIB;
return -1;
}
status = deflate(&zs, Z_FINISH);
if (status != Z_STREAM_END) { // not compressed enough
deflateEnd(&zs); // reset the stream
if (status == Z_OK) { // reduce the size and recompress
input_length -= 1024;
assert(input_length > 0); // logically, this should not happen
continue;
}
fp->errcode |= BGZF_ERR_ZLIB;
return -1;
}
if (deflateEnd(&zs) != Z_OK) {
fp->errcode |= BGZF_ERR_ZLIB;
return -1;
}
compressed_length = zs.total_out;
compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
assert(compressed_length <= BGZF_BLOCK_SIZE);
break;
}
assert(compressed_length > 0);
packInt16((uint8_t*)&buffer[16], compressed_length - 1); // write the compressed_length; -1 to fit 2 bytes
crc = crc32(0L, NULL, 0L);
crc = crc32(crc, fp->uncompressed_block, input_length);
packInt32((uint8_t*)&buffer[compressed_length-8], crc);
packInt32((uint8_t*)&buffer[compressed_length-4], input_length);
remaining = block_length - input_length;
if (remaining > 0) {
assert(remaining <= input_length);
memcpy(fp->uncompressed_block, fp->uncompressed_block + input_length, remaining);
}
fp->block_offset = remaining;
return compressed_length;
}
// Inflate the block in fp->compressed_block into fp->uncompressed_block
static int inflate_block(BGZF* fp, int block_length)
{
z_stream zs;
zs.zalloc = NULL;
zs.zfree = NULL;
zs.next_in = fp->compressed_block + 18;
zs.avail_in = block_length - 16;
zs.next_out = fp->uncompressed_block;
zs.avail_out = BGZF_BLOCK_SIZE;
if (inflateInit2(&zs, -15) != Z_OK) {
fp->errcode |= BGZF_ERR_ZLIB;
return -1;
}
if (inflate(&zs, Z_FINISH) != Z_STREAM_END) {
inflateEnd(&zs);
fp->errcode |= BGZF_ERR_ZLIB;
return -1;
}
if (inflateEnd(&zs) != Z_OK) {
fp->errcode |= BGZF_ERR_ZLIB;
return -1;
}
return zs.total_out;
}
static int check_header(const uint8_t *header)
{
return (header[0] == 31 && header[1] == 139 && header[2] == 8 && (header[3] & 4) != 0
&& unpackInt16((uint8_t*)&header[10]) == 6
&& header[12] == 'B' && header[13] == 'C'
&& unpackInt16((uint8_t*)&header[14]) == 2);
}
#ifdef BGZF_CACHE
static void free_cache(BGZF *fp)
{
khint_t k;
khash_t(cache) *h = (khash_t(cache)*)fp->cache;
if (fp->open_mode != 'r') return;
for (k = kh_begin(h); k < kh_end(h); ++k)
if (kh_exist(h, k)) free(kh_val(h, k).block);
kh_destroy(cache, h);
}
static int load_block_from_cache(BGZF *fp, int64_t block_address)
{
khint_t k;
cache_t *p;
khash_t(cache) *h = (khash_t(cache)*)fp->cache;
k = kh_get(cache, h, block_address);
if (k == kh_end(h)) return 0;
p = &kh_val(h, k);
if (fp->block_length != 0) fp->block_offset = 0;
fp->block_address = block_address;
fp->block_length = p->size;
memcpy(fp->uncompressed_block, p->block, BGZF_BLOCK_SIZE);
_bgzf_seek((_bgzf_file_t)fp->fp, p->end_offset, SEEK_SET);
return p->size;
}
static void cache_block(BGZF *fp, int size)
{
int ret;
khint_t k;
cache_t *p;
khash_t(cache) *h = (khash_t(cache)*)fp->cache;
if (BGZF_BLOCK_SIZE >= fp->cache_size) return;
if ((kh_size(h) + 1) * BGZF_BLOCK_SIZE > fp->cache_size) {
/* A better way would be to remove the oldest block in the
* cache, but here we remove a random one for simplicity. This
* should not have a big impact on performance. */
for (k = kh_begin(h); k < kh_end(h); ++k)
if (kh_exist(h, k)) break;
if (k < kh_end(h)) {
free(kh_val(h, k).block);
kh_del(cache, h, k);
}
}
k = kh_put(cache, h, fp->block_address, &ret);
if (ret == 0) return; // if this happens, a bug!
p = &kh_val(h, k);
p->size = fp->block_length;
p->end_offset = fp->block_address + size;
p->block = malloc(BGZF_BLOCK_SIZE);
memcpy(kh_val(h, k).block, fp->uncompressed_block, BGZF_BLOCK_SIZE);
}
#else
static void free_cache(BGZF *fp) {}
static int load_block_from_cache(BGZF *fp, int64_t block_address) {return 0;}
static void cache_block(BGZF *fp, int size) {}
#endif
int bgzf_read_block(BGZF *fp)
{
uint8_t header[BLOCK_HEADER_LENGTH], *compressed_block;
int count, size = 0, block_length, remaining;
int64_t block_address;
block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
if (load_block_from_cache(fp, block_address)) return 0;
count = _bgzf_read(fp->fp, header, sizeof(header));
if (count == 0) { // no data read
fp->block_length = 0;
return 0;
}
if (count != sizeof(header) || !check_header(header)) {
fp->errcode |= BGZF_ERR_HEADER;
return -1;
}
size = count;
block_length = unpackInt16((uint8_t*)&header[16]) + 1; // +1 because when writing this number, we used "-1"
compressed_block = (uint8_t*)fp->compressed_block;
memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
remaining = block_length - BLOCK_HEADER_LENGTH;
count = _bgzf_read(fp->fp, &compressed_block[BLOCK_HEADER_LENGTH], remaining);
if (count != remaining) {
fp->errcode |= BGZF_ERR_IO;
return -1;
}
size += count;
if ((count = inflate_block(fp, block_length)) < 0) return -1;
if (fp->block_length != 0) fp->block_offset = 0; // Do not reset offset if this read follows a seek.
fp->block_address = block_address;
fp->block_length = count;
cache_block(fp, size);
return 0;
}
ssize_t bgzf_read(BGZF *fp, void *data, ssize_t length)
{
ssize_t bytes_read = 0;
uint8_t *output = data;
if (length <= 0) return 0;
assert(fp->open_mode == 'r');
while (bytes_read < length) {
int copy_length, available = fp->block_length - fp->block_offset;
uint8_t *buffer;
if (available <= 0) {
if (bgzf_read_block(fp) != 0) return -1;
available = fp->block_length - fp->block_offset;
if (available <= 0) break;
}
copy_length = length - bytes_read < available? length - bytes_read : available;
buffer = fp->uncompressed_block;
memcpy(output, buffer + fp->block_offset, copy_length);
fp->block_offset += copy_length;
output += copy_length;
bytes_read += copy_length;
}
if (fp->block_offset == fp->block_length) {
fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
fp->block_offset = fp->block_length = 0;
}
return bytes_read;
}
int bgzf_flush(BGZF *fp)
{
assert(fp->open_mode == 'w');
while (fp->block_offset > 0) {
int block_length;
block_length = deflate_block(fp, fp->block_offset);
if (block_length < 0) return -1;
if (fwrite(fp->compressed_block, 1, block_length, fp->fp) != block_length) {
fp->errcode |= BGZF_ERR_IO; // possibly truncated file
return -1;
}
fp->block_address += block_length;
}
return 0;
}
int bgzf_flush_try(BGZF *fp, ssize_t size)
{
if (fp->block_offset + size > BGZF_BLOCK_SIZE)
return bgzf_flush(fp);
return -1;
}
ssize_t bgzf_write(BGZF *fp, const void *data, ssize_t length)
{
const uint8_t *input = data;
int block_length = BGZF_BLOCK_SIZE, bytes_written;
assert(fp->open_mode == 'w');
input = data;
bytes_written = 0;
while (bytes_written < length) {
uint8_t* buffer = fp->uncompressed_block;
int copy_length = block_length - fp->block_offset < length - bytes_written? block_length - fp->block_offset : length - bytes_written;
memcpy(buffer + fp->block_offset, input, copy_length);
fp->block_offset += copy_length;
input += copy_length;
bytes_written += copy_length;
if (fp->block_offset == block_length && bgzf_flush(fp)) break;
}
return bytes_written;
}
int bgzf_close(BGZF* fp)
{
int ret, count, block_length;
if (fp == 0) return -1;
if (fp->open_mode == 'w') {
if (bgzf_flush(fp) != 0) return -1;
block_length = deflate_block(fp, 0); // write an empty block
count = fwrite(fp->compressed_block, 1, block_length, fp->fp);
if (fflush(fp->fp) != 0) {
fp->errcode |= BGZF_ERR_IO;
return -1;
}
}
ret = fp->open_mode == 'w'? fclose(fp->fp) : _bgzf_close(fp->fp);
if (ret != 0) return -1;
free(fp->uncompressed_block);
free(fp->compressed_block);
free_cache(fp);
free(fp);
return 0;
}
void bgzf_set_cache_size(BGZF *fp, int cache_size)
{
if (fp) fp->cache_size = cache_size;
}
int bgzf_check_EOF(BGZF *fp)
{
static uint8_t magic[28] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0";
uint8_t buf[28];
off_t offset;
offset = _bgzf_tell((_bgzf_file_t)fp->fp);
if (_bgzf_seek(fp->fp, -28, SEEK_END) < 0) return 0;
_bgzf_read(fp->fp, buf, 28);
_bgzf_seek(fp->fp, offset, SEEK_SET);
return (memcmp(magic, buf, 28) == 0)? 1 : 0;
}
int64_t bgzf_seek(BGZF* fp, int64_t pos, int where)
{
int block_offset;
int64_t block_address;
if (fp->open_mode != 'r' || where != SEEK_SET) {
fp->errcode |= BGZF_ERR_MISUSE;
return -1;
}
block_offset = pos & 0xFFFF;
block_address = pos >> 16;
if (_bgzf_seek(fp->fp, block_address, SEEK_SET) < 0) {
fp->errcode |= BGZF_ERR_IO;
return -1;
}
fp->block_length = 0; // indicates current block has not been loaded
fp->block_address = block_address;
fp->block_offset = block_offset;
return 0;
}
int bgzf_is_bgzf(const char *fn)
{
uint8_t buf[16];
int n;
_bgzf_file_t fp;
if ((fp = _bgzf_open(fn, "r")) == 0) return 0;
n = _bgzf_read(fp, buf, 16);
_bgzf_close(fp);
if (n != 16) return 0;
return memcmp(g_magic, buf, 16) == 0? 1 : 0;
}
int bgzf_getc(BGZF *fp)
{
int c;
if (fp->block_offset >= fp->block_length) {
if (bgzf_read_block(fp) != 0) return -2; /* error */
if (fp->block_length == 0) return -1; /* end-of-file */
}
c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++];
if (fp->block_offset == fp->block_length) {
fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
fp->block_offset = 0;
fp->block_length = 0;
}
return c;
}
#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif
int bgzf_getline(BGZF *fp, int delim, kstring_t *str)
{
int l, state = 0;
unsigned char *buf = (unsigned char*)fp->uncompressed_block;
str->l = 0;
do {
if (fp->block_offset >= fp->block_length) {
if (bgzf_read_block(fp) != 0) { state = -2; break; }
if (fp->block_length == 0) { state = -1; break; }
}
for (l = fp->block_offset; l < fp->block_length && buf[l] != delim; ++l);
if (l < fp->block_length) state = 1;
l -= fp->block_offset;
if (str->l + l + 1 >= str->m) {
str->m = str->l + l + 2;
kroundup32(str->m);
str->s = (char*)realloc(str->s, str->m);
}
memcpy(str->s + str->l, buf + fp->block_offset, l);
str->l += l;
fp->block_offset += l + 1;
if (fp->block_offset >= fp->block_length) {
fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
fp->block_offset = 0;
fp->block_length = 0;
}
} while (state == 0);
if (str->l == 0 && state < 0) return state;
str->s[str->l] = 0;
return str->l;
}

ext/klib/bgzf.h 100644 (196 lines)

@@ -0,0 +1,196 @@
/* The MIT License
Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
2011 Attractive Chaos <attractor@live.co.uk>
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* The BGZF library was originally written by Bob Handsaker from the Broad
* Institute. It was later improved by the SAMtools developers. */
#ifndef __BGZF_H
#define __BGZF_H
#include <stdint.h>
#include <stdio.h>
#include <zlib.h>
#define BGZF_BLOCK_SIZE 0x10000
#define BGZF_MAX_BLOCK_SIZE 0x10000
#define BGZF_ERR_ZLIB 1
#define BGZF_ERR_HEADER 2
#define BGZF_ERR_IO 4
#define BGZF_ERR_MISUSE 8
typedef struct {
int open_mode:8, compress_level:8, errcode:16;
int cache_size;
int block_length, block_offset;
int64_t block_address;
void *uncompressed_block, *compressed_block;
void *cache; // a pointer to a hash table
void *fp; // actual file handler; FILE* on writing; FILE* or knetFile* on reading
} BGZF;
#ifndef KSTRING_T
#define KSTRING_T kstring_t
typedef struct __kstring_t {
size_t l, m;
char *s;
} kstring_t;
#endif
#ifdef __cplusplus
extern "C" {
#endif
/******************
* Basic routines *
******************/
/**
* Open an existing file descriptor for reading or writing.
*
* @param fd file descriptor
* @param mode mode matching /[rwu0-9]+/: 'r' for reading, 'w' for writing and a digit specifies
* the zlib compression level; if both 'r' and 'w' are present, 'w' is ignored.
* @return BGZF file handler; 0 on error
*/
BGZF* bgzf_dopen(int fd, const char *mode);
#define bgzf_fdopen(fd, mode) bgzf_dopen((fd), (mode)) // for backward compatibility
/**
* Open the specified file for reading or writing.
*/
BGZF* bgzf_open(const char* path, const char *mode);
/**
* Close the BGZF and free all associated resources.
*
* @param fp BGZF file handler
* @return 0 on success and -1 on error
*/
int bgzf_close(BGZF *fp);
/**
* Read up to _length_ bytes from the file storing into _data_.
*
* @param fp BGZF file handler
* @param data data array to read into
* @param length size of data to read
* @return number of bytes actually read; 0 on end-of-file and -1 on error
*/
ssize_t bgzf_read(BGZF *fp, void *data, ssize_t length);
/**
* Write _length_ bytes from _data_ to the file.
*
* @param fp BGZF file handler
* @param data data array to write
* @param length size of data to write
* @return number of bytes actually written; -1 on error
*/
ssize_t bgzf_write(BGZF *fp, const void *data, ssize_t length);
/**
* Write the data in the buffer to the file.
*/
int bgzf_flush(BGZF *fp);
/**
* Return a virtual file pointer to the current location in the file.
* No interpretation of the value should be made, other than a subsequent
* call to bgzf_seek can be used to position the file at the same point.
* Return value is non-negative on success.
*/
#define bgzf_tell(fp) ((fp->block_address << 16) | (fp->block_offset & 0xFFFF))
/**
* Set the file to read from the location specified by _pos_.
*
* @param fp BGZF file handler
* @param pos virtual file offset returned by bgzf_tell()
* @param whence must be SEEK_SET
* @return 0 on success and -1 on error
*/
int64_t bgzf_seek(BGZF *fp, int64_t pos, int whence);
/**
* Check if the BGZF end-of-file (EOF) marker is present
*
* @param fp BGZF file handler opened for reading
* @return 1 if EOF is present; 0 if not or on I/O error
*/
int bgzf_check_EOF(BGZF *fp);
/**
* Check if a file is in the BGZF format
*
* @param fn file name
* @return 1 if _fn_ is BGZF; 0 if not or on I/O error
*/
int bgzf_is_bgzf(const char *fn);
/*********************
* Advanced routines *
*********************/
/**
* Set the cache size. Only effective when compiled with -DBGZF_CACHE.
*
* @param fp BGZF file handler
* @param size size of cache in bytes; 0 to disable caching (default)
*/
void bgzf_set_cache_size(BGZF *fp, int size);
/**
* Flush the file if the remaining buffer size is smaller than _size_
*/
int bgzf_flush_try(BGZF *fp, ssize_t size);
/**
* Read one byte from a BGZF file. It is faster than bgzf_read()
* @param fp BGZF file handler
* @return byte read; -1 on end-of-file or error
*/
int bgzf_getc(BGZF *fp);
/**
* Read one line from a BGZF file. It is faster than bgzf_getc()
*
* @param fp BGZF file handler
* @param delim delimiter
* @param str string to write to; must be initialized
* @return length of the string; 0 on end-of-file; negative on error
*/
int bgzf_getline(BGZF *fp, int delim, kstring_t *str);
/**
* Read the next BGZF block.
*/
int bgzf_read_block(BGZF *fp);
#ifdef __cplusplus
}
#endif
#endif

ext/klib/kavl.hpp 100644

@ -0,0 +1,202 @@
#ifndef KAVL_HPP
#define KAVL_HPP
#include <functional>
namespace klib {
template<class T, typename Less = std::less<T> >
class Avl {
static const int MAX_DEPTH = 64;
struct Node {
T data;
signed char balance;
unsigned size;
Node *p[2];
};
Node *root;
inline int cmp_func(const T &x, const T &y) {
return Less()(y, x) - Less()(x, y);
}
inline unsigned child_size(Node *p, int dir) {
return p->p[dir]? p->p[dir]->size : 0;
};
// one rotation: (a,(b,c)q)p => ((a,b)p,c)q
inline Node *rotate1(Node *p, int dir) { // dir=0 to left; dir=1 to right
int opp = 1 - dir; // opposite direction
Node *q = p->p[opp];
unsigned size_p = p->size;
p->size -= q->size - child_size(q, dir);
q->size = size_p;
p->p[opp] = q->p[dir];
q->p[dir] = p;
return q;
};
// two consecutive rotations: (a,((b,c)r,d)q)p => ((a,b)p,(c,d)q)r
inline Node *rotate2(Node *p, int dir) {
int b1, opp = 1 - dir;
Node *q = p->p[opp], *r = q->p[dir];
unsigned size_x_dir = child_size(r, dir);
r->size = p->size;
p->size -= q->size - size_x_dir;
q->size -= size_x_dir + 1;
p->p[opp] = r->p[dir];
r->p[dir] = p;
q->p[dir] = r->p[opp];
r->p[opp] = q;
b1 = dir == 0? +1 : -1;
if (r->balance == b1) q->balance = 0, p->balance = -b1;
else if (r->balance == 0) q->balance = p->balance = 0;
else q->balance = b1, p->balance = 0;
r->balance = 0;
return r;
};
void destroy(Node *r) {
Node *p, *q;
for (p = r; p; p = q) {
if (p->p[0] == 0) {
q = p->p[1];
delete p;
} else {
q = p->p[0];
p->p[0] = q->p[1];
q->p[1] = p;
}
}
};
public:
Avl() : root(NULL) {};
~Avl() { destroy(root); };
unsigned size() const { return root? root->size : 0; }
T *find(const T &data, unsigned *cnt_ = NULL) {
Node *p = root;
unsigned cnt = 0;
while (p != 0) {
int cmp = cmp_func(data, p->data);
if (cmp >= 0) cnt += child_size(p, 0) + 1;
if (cmp < 0) p = p->p[0];
else if (cmp > 0) p = p->p[1];
else break;
}
if (cnt_) *cnt_ = cnt;
return p? &p->data : NULL;
};
T *insert(const T &data, bool *is_new = NULL, unsigned *cnt_ = NULL) {
unsigned char stack[MAX_DEPTH];
Node *path[MAX_DEPTH];
Node *bp, *bq;
Node *x, *p, *q, *r = 0; // _r_ is potentially the new root
int i, which = 0, top, b1, path_len;
unsigned cnt = 0;
bp = root, bq = 0;
if (is_new) *is_new = false;
// find the insertion location
for (p = bp, q = bq, top = path_len = 0; p; q = p, p = p->p[which]) {
int cmp = cmp_func(data, p->data);
if (cmp >= 0) cnt += child_size(p, 0) + 1;
if (cmp == 0) {
if (cnt_) *cnt_ = cnt;
return &p->data;
}
if (p->balance != 0)
bq = q, bp = p, top = 0;
stack[top++] = which = (cmp > 0);
path[path_len++] = p;
}
if (cnt_) *cnt_ = cnt;
x = new Node;
x->data = data, x->balance = 0, x->size = 1, x->p[0] = x->p[1] = 0;
if (is_new) *is_new = true;
if (q == 0) root = x;
else q->p[which] = x;
if (bp == 0) return &x->data;
for (i = 0; i < path_len; ++i) ++path[i]->size;
for (p = bp, top = 0; p != x; p = p->p[stack[top]], ++top) /* update balance factors */
if (stack[top] == 0) --p->balance;
else ++p->balance;
if (bp->balance > -2 && bp->balance < 2) return &x->data; /* no re-balance needed */
// re-balance
which = (bp->balance < 0);
b1 = which == 0? +1 : -1;
q = bp->p[1 - which];
if (q->balance == b1) {
r = rotate1(bp, which);
q->balance = bp->balance = 0;
} else r = rotate2(bp, which);
if (bq == 0) root = r;
else bq->p[bp != bq->p[0]] = r;
return &x->data;
};
bool erase(const T &data) {
Node *p, *path[MAX_DEPTH], fake;
unsigned char dir[MAX_DEPTH];
int i, d = 0, cmp;
fake.p[0] = root, fake.p[1] = 0;
for (cmp = -1, p = &fake; cmp; cmp = cmp_func(data, p->data)) {
int which = (cmp > 0);
dir[d] = which;
path[d++] = p;
p = p->p[which];
if (p == 0) return false;
}
for (i = 1; i < d; ++i) --path[i]->size;
if (p->p[1] == 0) { // ((1,.)2,3)4 => (1,3)4; p=2
path[d-1]->p[dir[d-1]] = p->p[0];
} else {
Node *q = p->p[1];
if (q->p[0] == 0) { // ((1,2)3,4)5 => ((1)2,4)5; p=3
q->p[0] = p->p[0];
q->balance = p->balance;
path[d-1]->p[dir[d-1]] = q;
path[d] = q, dir[d++] = 1;
q->size = p->size - 1;
} else { // ((1,((.,2)3,4)5)6,7)8 => ((1,(2,4)5)3,7)8; p=6
Node *r;
int e = d++; // backup _d_
for (;;) {
dir[d] = 0;
path[d++] = q;
r = q->p[0];
if (r->p[0] == 0) break;
q = r;
}
r->p[0] = p->p[0];
q->p[0] = r->p[1];
r->p[1] = p->p[1];
r->balance = p->balance;
path[e-1]->p[dir[e-1]] = r;
path[e] = r, dir[e] = 1;
for (i = e + 1; i < d; ++i) --path[i]->size;
r->size = p->size - 1;
}
}
while (--d > 0) {
Node *q = path[d];
int which, other, b1 = 1, b2 = 2;
which = dir[d], other = 1 - which;
if (which) b1 = -b1, b2 = -b2;
q->balance += b1;
if (q->balance == b1) break;
else if (q->balance == b2) {
Node *r = q->p[other];
if (r->balance == -b1) {
path[d-1]->p[dir[d-1]] = rotate2(q, which);
} else {
path[d-1]->p[dir[d-1]] = rotate1(q, which);
if (r->balance == 0) {
r->balance = -b1;
q->balance = b1;
break;
} else r->balance = q->balance = 0;
}
}
}
root = fake.p[0];
delete p;
return true;
};
};
} // end of namespace klib
#endif

ext/klib/khash.hpp 100644

@ -0,0 +1,163 @@
#ifndef KHASH_HPP
#define KHASH_HPP
#include <memory>
#include <functional>
#include <cstdlib> // for malloc() etc
#include <cstring> // for memset()
#include <stdint.h> // for uint32_t
namespace klib {
#ifndef kroundup32 // FIXME: doesn't work for 64-bit integers
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif
#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)
#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1)
#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3)
#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1)))
#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1)))
#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1)))
#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1))
#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4)
template<class T, class Hash, class Eq = std::equal_to<T>, typename khint_t = uint32_t>
class KHash {
khint_t n_buckets, count, n_occupied, upper_bound;
uint32_t *flags;
T *keys;
public:
KHash() : n_buckets(0), count(0), n_occupied(0), upper_bound(0), flags(NULL), keys(NULL) {};
~KHash() { std::free(flags); std::free(keys); };
khint_t capacity(void) const { return n_buckets; };
khint_t size(void) const { return count; };
khint_t begin(void) const { return 0; };
khint_t end(void) const { return n_buckets; };
bool exist(khint_t x) const { return !__ac_iseither(flags, x); };
T &at(khint_t x) { return keys[x]; };
khint_t get(const T &key) const {
if (n_buckets) {
khint_t k, i, last, mask, step = 0;
mask = n_buckets - 1;
k = Hash()(key); i = k & mask;
last = i;
while (!__ac_isempty(flags, i) && (__ac_isdel(flags, i) || !Eq()(keys[i], key))) {
i = (i + (++step)) & mask;
if (i == last) return n_buckets;
}
return __ac_iseither(flags, i)? n_buckets : i;
} else return 0;
};
int resize(khint_t new_n_buckets) {
uint32_t *new_flags = 0;
khint_t j = 1;
{
kroundup32(new_n_buckets);
if (new_n_buckets < 4) new_n_buckets = 4;
if (count >= (new_n_buckets>>1) + (new_n_buckets>>2)) j = 0; /* requested size is too small */
else { /* hash table count to be changed (shrink or expand); rehash */
new_flags = (uint32_t*)std::malloc(__ac_fsize(new_n_buckets) * sizeof(uint32_t));
if (!new_flags) return -1;
::memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(uint32_t));
if (n_buckets < new_n_buckets) { /* expand */
T *new_keys = (T*)std::realloc((void *)keys, new_n_buckets * sizeof(T));
if (!new_keys) { std::free(new_flags); return -1; }
keys = new_keys;
} /* otherwise shrink */
}
}
if (j) { /* rehashing is needed */
for (j = 0; j != n_buckets; ++j) {
if (__ac_iseither(flags, j) == 0) {
T key = keys[j];
khint_t new_mask;
new_mask = new_n_buckets - 1;
__ac_set_isdel_true(flags, j);
while (1) { /* kick-out process; sort of like in Cuckoo hashing */
khint_t k, i, step = 0;
k = Hash()(key);
i = k & new_mask;
while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask;
__ac_set_isempty_false(new_flags, i);
if (i < n_buckets && __ac_iseither(flags, i) == 0) { /* kick out the existing element */
{ T tmp = keys[i]; keys[i] = key; key = tmp; }
__ac_set_isdel_true(flags, i); /* mark it as deleted in the old hash table */
} else { /* write the element and jump out of the loop */
keys[i] = key;
break;
}
}
}
}
if (n_buckets > new_n_buckets) /* shrink the hash table */
keys = (T*)std::realloc((void *)keys, new_n_buckets * sizeof(T));
std::free(flags); /* free the working space */
flags = new_flags;
n_buckets = new_n_buckets;
n_occupied = count;
upper_bound = (n_buckets>>1) + (n_buckets>>2);
}
return 0;
};
khint_t put(const T &key, int *ret) {
khint_t x;
if (n_occupied >= upper_bound) { /* update the hash table */
if (n_buckets > (count<<1)) {
if (resize(n_buckets - 1) < 0) { /* clear "deleted" elements */
*ret = -1; return n_buckets;
}
} else if (resize(n_buckets + 1) < 0) { /* expand the hash table */
*ret = -1; return n_buckets;
}
} /* TODO: to implement automatic shrinking; resize() already supports shrinking */
{
khint_t k, i, site, last, mask = n_buckets - 1, step = 0;
x = site = n_buckets; k = Hash()(key); i = k & mask;
if (__ac_isempty(flags, i)) x = i; /* for speed up */
else {
last = i;
while (!__ac_isempty(flags, i) && (__ac_isdel(flags, i) || !Eq()(keys[i], key))) {
if (__ac_isdel(flags, i)) site = i;
i = (i + (++step)) & mask;
if (i == last) { x = site; break; }
}
if (x == n_buckets) {
if (__ac_isempty(flags, i) && site != n_buckets) x = site;
else x = i;
}
}
}
if (__ac_isempty(flags, x)) { /* not present at all */
keys[x] = key;
__ac_set_isboth_false(flags, x);
++count; ++n_occupied;
*ret = 1;
} else if (__ac_isdel(flags, x)) { /* deleted */
keys[x] = key;
__ac_set_isboth_false(flags, x);
++count;
*ret = 2;
} else *ret = 0; /* Don't touch keys[x] if present and not deleted */
return x;
};
void del(khint_t x) {
if (x != n_buckets && !__ac_iseither(flags, x)) {
__ac_set_isdel_true(flags, x);
--count;
}
};
};
} // end of namespace klib
#endif

ext/klib/khashl.hpp 100644

@ -0,0 +1,258 @@
#ifndef __AC_KHASHL_HPP
#define __AC_KHASHL_HPP
#include <functional> // for std::equal_to
#include <cstdlib> // for malloc() etc
#include <cstring> // for memset()
#include <stdint.h> // for uint32_t
/* // ==> Code example <==
#include <cstdio>
#include "khashl.hpp"
int main(void)
{
klib::KHashMap<uint32_t, int, std::hash<uint32_t> > h; // NB: C++98 doesn't have std::hash
uint32_t k;
int absent;
h[43] = 1, h[53] = 2, h[63] = 3, h[73] = 4; // one way to insert
k = h.put(53, &absent), h.value(k) = -2; // another way to insert
if (!absent) printf("already in the table\n"); // which allows to test presence
if (h.get(33) == h.end()) printf("not found!\n"); // test presence without insertion
h.del(h.get(43)); // deletion
for (k = 0; k != h.end(); ++k) // traversal
if (h.occupied(k)) // some buckets are not occupied; skip them
printf("%u => %d\n", h.key(k), h.value(k));
return 0;
}
*/
namespace klib {
/***********
* HashSet *
***********/
template<class T, class Hash, class Eq = std::equal_to<T>, typename khint_t = uint32_t>
class KHashSet {
khint_t bits, count;
uint32_t *used;
T *keys;
static inline uint32_t __kh_used(const uint32_t *flag, khint_t i) { return flag[i>>5] >> (i&0x1fU) & 1U; };
static inline void __kh_set_used(uint32_t *flag, khint_t i) { flag[i>>5] |= 1U<<(i&0x1fU); };
static inline void __kh_set_unused(uint32_t *flag, khint_t i) { flag[i>>5] &= ~(1U<<(i&0x1fU)); };
static inline khint_t __kh_fsize(khint_t m) { return m<32? 1 : m>>5; }
static inline khint_t __kh_h2b(uint32_t hash, khint_t bits) { return hash * 2654435769U >> (32 - bits); }
static inline khint_t __kh_h2b(uint64_t hash, khint_t bits) { return hash * 11400714819323198485ULL >> (64 - bits); }
public:
KHashSet() : bits(0), count(0), used(0), keys(0) {};
~KHashSet() { std::free(used); std::free(keys); };
inline khint_t n_buckets() const { return used? khint_t(1) << bits : 0; }
inline khint_t end() const { return n_buckets(); }
inline khint_t size() const { return count; }
inline T &key(khint_t x) { return keys[x]; };
inline bool occupied(khint_t x) const { return (__kh_used(used, x) != 0); }
void clear(void) {
if (!used) return;
memset(used, 0, __kh_fsize(n_buckets()) * sizeof(uint32_t));
count = 0;
}
khint_t get(const T &key) const {
khint_t i, last, mask, nb;
if (keys == 0) return 0;
nb = n_buckets();
mask = nb - khint_t(1);
i = last = __kh_h2b(Hash()(key), bits);
while (__kh_used(used, i) && !Eq()(keys[i], key)) {
i = (i + khint_t(1)) & mask;
if (i == last) return nb;
}
return !__kh_used(used, i)? nb : i;
}
int resize(khint_t new_nb) {
uint32_t *new_used = 0;
khint_t j = 0, x = new_nb, nb, new_bits, new_mask;
while ((x >>= khint_t(1)) != 0) ++j;
if (new_nb & (new_nb - 1)) ++j;
new_bits = j > 2? j : 2;
new_nb = khint_t(1) << new_bits;
if (count > (new_nb>>1) + (new_nb>>2)) return 0; /* requested size is too small */
new_used = (uint32_t*)std::malloc(__kh_fsize(new_nb) * sizeof(uint32_t));
if (!new_used) return -1; /* not enough memory */
memset(new_used, 0, __kh_fsize(new_nb) * sizeof(uint32_t));
nb = n_buckets();
if (nb < new_nb) { /* expand */
T *new_keys = (T*)std::realloc(keys, new_nb * sizeof(T));
if (!new_keys) { std::free(new_used); return -1; }
keys = new_keys;
} /* otherwise shrink */
new_mask = new_nb - 1;
for (j = 0; j != nb; ++j) {
if (!__kh_used(used, j)) continue;
T key = keys[j];
__kh_set_unused(used, j);
while (1) { /* kick-out process; sort of like in Cuckoo hashing */
khint_t i;
i = __kh_h2b(Hash()(key), new_bits);
while (__kh_used(new_used, i)) i = (i + khint_t(1)) & new_mask;
__kh_set_used(new_used, i);
if (i < nb && __kh_used(used, i)) { /* kick out the existing element */
{ T tmp = keys[i]; keys[i] = key; key = tmp; }
__kh_set_unused(used, i); /* mark it as deleted in the old hash table */
} else { /* write the element and jump out of the loop */
keys[i] = key;
break;
}
}
}
if (nb > new_nb) /* shrink the hash table */
keys = (T*)std::realloc(keys, new_nb * sizeof(T));
std::free(used); /* free the working space */
used = new_used, bits = new_bits;
return 0;
}
khint_t put(const T &key, int *absent_ = 0) {
khint_t nb, i, last, mask;
int absent = -1;
nb = n_buckets();
if (count >= (nb>>1) + (nb>>2)) { /* rehashing */
if (resize(nb + khint_t(1)) < 0) {
if (absent_) *absent_ = -1;
return nb;
}
nb = n_buckets();
} /* TODO: to implement automatic shrinking; resize() already supports shrinking */
mask = nb - 1;
i = last = __kh_h2b(Hash()(key), bits);
while (__kh_used(used, i) && !Eq()(keys[i], key)) {
i = (i + 1U) & mask;
if (i == last) break;
}
if (!__kh_used(used, i)) { /* not present at all */
keys[i] = key;
__kh_set_used(used, i);
++count, absent = 1;
} else absent = 0; /* Don't touch keys[i] if present */
if (absent_) *absent_ = absent;
return i;
}
int del(khint_t i) {
khint_t j = i, k, mask, nb = n_buckets();
if (keys == 0 || i >= nb) return 0;
mask = nb - khint_t(1);
while (1) {
j = (j + khint_t(1)) & mask;
if (j == i || !__kh_used(used, j)) break; /* j==i only when the table is completely full */
k = __kh_h2b(Hash()(keys[j]), bits);
if ((j > i && (k <= i || k > j)) || (j < i && (k <= i && k > j)))
keys[i] = keys[j], i = j;
}
__kh_set_unused(used, i);
--count;
return 1;
}
};
/***********
* HashMap *
***********/
template<class KType, class VType>
struct KHashMapBucket { KType key; VType val; };
template<class T, class Hash, typename khint_t>
struct KHashMapHash { khint_t operator() (const T &a) const { return Hash()(a.key); } };
template<class T, class Eq>
struct KHashMapEq { bool operator() (const T &a, const T &b) const { return Eq()(a.key, b.key); } };
template<class KType, class VType, class Hash, class Eq=std::equal_to<KType>, typename khint_t=uint32_t>
class KHashMap : public KHashSet<KHashMapBucket<KType, VType>,
KHashMapHash<KHashMapBucket<KType, VType>, Hash, khint_t>,
KHashMapEq<KHashMapBucket<KType, VType>, Eq>, khint_t>
{
typedef KHashMapBucket<KType, VType> bucket_t;
typedef KHashSet<bucket_t, KHashMapHash<bucket_t, Hash, khint_t>, KHashMapEq<bucket_t, Eq>, khint_t> hashset_t;
public:
khint_t get(const KType &key) const {
bucket_t t = { key, VType() };
return hashset_t::get(t);
}
khint_t put(const KType &key, int *absent) {
bucket_t t = { key, VType() };
return hashset_t::put(t, absent);
}
inline KType &key(khint_t i) { return hashset_t::key(i).key; }
inline VType &value(khint_t i) { return hashset_t::key(i).val; }
inline VType &operator[] (const KType &key) {
bucket_t t = { key, VType() };
return value(hashset_t::put(t));
}
};
/****************************
* HashSet with cached hash *
****************************/
template<class KType, typename khint_t>
struct KHashSetCachedBucket { KType key; khint_t hash; };
template<class T, typename khint_t>
struct KHashCachedHash { khint_t operator() (const T &a) const { return a.hash; } };
template<class T, class Eq>
struct KHashCachedEq { bool operator() (const T &a, const T &b) const { return a.hash == b.hash && Eq()(a.key, b.key); } };
template<class KType, class Hash, class Eq = std::equal_to<KType>, typename khint_t = uint32_t>
class KHashSetCached : public KHashSet<KHashSetCachedBucket<KType, khint_t>,
KHashCachedHash<KHashSetCachedBucket<KType, khint_t>, khint_t>,
KHashCachedEq<KHashSetCachedBucket<KType, khint_t>, Eq>, khint_t>
{
typedef KHashSetCachedBucket<KType, khint_t> bucket_t;
typedef KHashSet<bucket_t, KHashCachedHash<bucket_t, khint_t>, KHashCachedEq<bucket_t, Eq>, khint_t> hashset_t;
public:
khint_t get(const KType &key) const {
bucket_t t = { key, Hash()(key) };
return hashset_t::get(t);
}
khint_t put(const KType &key, int *absent) {
bucket_t t = { key, Hash()(key) };
return hashset_t::put(t, absent);
}
inline KType &key(khint_t i) { return hashset_t::key(i).key; }
};
/****************************
* HashMap with cached hash *
****************************/
template<class KType, class VType, typename khint_t>
struct KHashMapCachedBucket { KType key; VType val; khint_t hash; };
template<class KType, class VType, class Hash, class Eq = std::equal_to<KType>, typename khint_t = uint32_t>
class KHashMapCached : public KHashSet<KHashMapCachedBucket<KType, VType, khint_t>,
KHashCachedHash<KHashMapCachedBucket<KType, VType, khint_t>, khint_t>,
KHashCachedEq<KHashMapCachedBucket<KType, VType, khint_t>, Eq>, khint_t>
{
typedef KHashMapCachedBucket<KType, VType, khint_t> bucket_t;
typedef KHashSet<bucket_t, KHashCachedHash<bucket_t, khint_t>, KHashCachedEq<bucket_t, Eq>, khint_t> hashset_t;
public:
khint_t get(const KType &key) const {
bucket_t t = { key, VType(), Hash()(key) };
return hashset_t::get(t);
}
khint_t put(const KType &key, int *absent) {
bucket_t t = { key, VType(), Hash()(key) };
return hashset_t::put(t, absent);
}
inline KType &key(khint_t i) { return hashset_t::key(i).key; }
inline VType &value(khint_t i) { return hashset_t::key(i).val; }
inline VType &operator[] (const KType &key) {
bucket_t t = { key, VType(), Hash()(key) };
return value(hashset_t::put(t));
}
};
}
#endif /* __AC_KHASHL_HPP */

ext/klib/kalloc.c 100644

@ -0,0 +1,224 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "kalloc.h"
/* In kalloc, a *core* is a large chunk of contiguous memory. Each core is
* associated with a master header, which keeps the size of the current core
* and the pointer to next core. Kalloc allocates small *blocks* of memory from
* the cores and organizes free memory blocks in a circular single-linked list.
*
* In the following diagram, "@" stands for the header of a free block (of type
* header_t), "#" for the header of an allocated block (of type size_t), "-"
* for free memory, and "+" for allocated memory.
*
* master This region is core 1. master This region is core 2.
* | |
* *@-------#++++++#++++++++++++@-------- *@----------#++++++++++++#+++++++@------------
* | | | |
* p=p->ptr->ptr->ptr->ptr p->ptr p->ptr->ptr p->ptr->ptr->ptr
*/
typedef struct header_t {
size_t size;
struct header_t *ptr;
} header_t;
typedef struct {
void *par;
size_t min_core_size;
header_t base, *loop_head, *core_head; /* base is a zero-sized block always kept in the loop */
} kmem_t;
static void panic(const char *s)
{
fprintf(stderr, "%s\n", s);
abort();
}
void *km_init2(void *km_par, size_t min_core_size)
{
kmem_t *km;
km = (kmem_t*)kcalloc(km_par, 1, sizeof(kmem_t));
km->par = km_par;
if (km_par) km->min_core_size = min_core_size > 0? min_core_size : ((kmem_t*)km_par)->min_core_size - 2;
else km->min_core_size = min_core_size > 0? min_core_size : 0x80000;
return (void*)km;
}
void *km_init(void) { return km_init2(0, 0); }
void km_destroy(void *_km)
{
kmem_t *km = (kmem_t*)_km;
void *km_par;
header_t *p, *q;
if (km == NULL) return;
km_par = km->par;
for (p = km->core_head; p != NULL;) {
q = p->ptr;
kfree(km_par, p);
p = q;
}
kfree(km_par, km);
}
static header_t *morecore(kmem_t *km, size_t nu)
{
header_t *q;
size_t bytes, *p;
nu = (nu + 1 + (km->min_core_size - 1)) / km->min_core_size * km->min_core_size; /* the first +1 for core header */
bytes = nu * sizeof(header_t);
q = (header_t*)kmalloc(km->par, bytes);
if (!q) panic("[morecore] insufficient memory");
q->ptr = km->core_head, q->size = nu, km->core_head = q;
p = (size_t*)(q + 1);
*p = nu - 1; /* the size of the free block; -1 because the first unit is used for the core header */
kfree(km, p + 1); /* initialize the new "core"; NB: the core header is not looped. */
return km->loop_head;
}
void kfree(void *_km, void *ap) /* kfree() also adds a new core to the circular list */
{
header_t *p, *q;
kmem_t *km = (kmem_t*)_km;
if (!ap) return;
if (km == NULL) {
free(ap);
return;
}
p = (header_t*)((size_t*)ap - 1);
p->size = *((size_t*)ap - 1);
/* Find the pointer that points to the block to be freed. The following loop can stop on two conditions:
*
* a) "p>q && p<q->ptr": @------#++++++++#+++++++@------- @---------------#+++++++@-------
* (can also be in | | | -> | |
* two cores) q p q->ptr q q->ptr
*
* @-------- #+++++++++@-------- @-------- @------------------
* | | | -> | |
* q p q->ptr q q->ptr
*
* b) "q>=q->ptr && (p>q || p<q->ptr)": @-------#+++++ @--------#+++++++ @-------#+++++ @----------------
* | | | -> | |
* q->ptr q p q->ptr q
*
* #+++++++@----- #++++++++@------- @------------- #++++++++@-------
* | | | -> | |
* p q->ptr q q->ptr q
*/
for (q = km->loop_head; !(p > q && p < q->ptr); q = q->ptr)
if (q >= q->ptr && (p > q || p < q->ptr)) break;
if (p + p->size == q->ptr) { /* two adjacent blocks, merge p and q->ptr (the 2nd and 4th cases) */
p->size += q->ptr->size;
p->ptr = q->ptr->ptr;
} else if (p + p->size > q->ptr && q->ptr >= p) {
panic("[kfree] The end of the allocated block enters a free block.");
} else p->ptr = q->ptr; /* backup q->ptr */
if (q + q->size == p) { /* two adjacent blocks, merge q and p (the other two cases) */
q->size += p->size;
q->ptr = p->ptr;
km->loop_head = q;
} else if (q + q->size > p && p >= q) {
panic("[kfree] The end of a free block enters the allocated block.");
} else km->loop_head = p, q->ptr = p; /* in two cores, cannot be merged; create a new block in the list */
}
void *kmalloc(void *_km, size_t n_bytes)
{
kmem_t *km = (kmem_t*)_km;
size_t n_units;
header_t *p, *q;
if (n_bytes == 0) return 0;
if (km == NULL) return malloc(n_bytes);
n_units = (n_bytes + sizeof(size_t) + sizeof(header_t) - 1) / sizeof(header_t); /* header+n_bytes requires at least this number of units */
if (!(q = km->loop_head)) /* the first time when kmalloc() is called, initialize it */
q = km->loop_head = km->base.ptr = &km->base;
for (p = q->ptr;; q = p, p = p->ptr) { /* search for a suitable block */
if (p->size >= n_units) { /* p->size is the size of the current block. This line means the current block is large enough. */
if (p->size == n_units) q->ptr = p->ptr; /* no need to split the block */
else { /* split the block. NB: memory is allocated at the end of the block! */
p->size -= n_units; /* reduce the size of the free block */
p += p->size; /* p points to the allocated block */
*(size_t*)p = n_units; /* set the size */
}
km->loop_head = q; /* set the end of chain */
return (size_t*)p + 1;
}
if (p == km->loop_head) { /* then ask for more "cores" */
if ((p = morecore(km, n_units)) == 0) return 0;
}
}
}
void *kcalloc(void *_km, size_t count, size_t size)
{
kmem_t *km = (kmem_t*)_km;
void *p;
if (size == 0 || count == 0) return 0;
if (km == NULL) return calloc(count, size);
p = kmalloc(km, count * size);
memset(p, 0, count * size);
return p;
}
void *krealloc(void *_km, void *ap, size_t n_bytes) // TODO: this can be made more efficient in principle
{
kmem_t *km = (kmem_t*)_km;
size_t cap, *p, *q;
if (n_bytes == 0) {
kfree(km, ap); return 0;
}
if (km == NULL) return realloc(ap, n_bytes);
if (ap == NULL) return kmalloc(km, n_bytes);
p = (size_t*)ap - 1;
cap = (*p) * sizeof(header_t) - sizeof(size_t);
if (cap >= n_bytes) return ap; /* TODO: this prevents shrinking */
q = (size_t*)kmalloc(km, n_bytes);
memcpy(q, ap, cap);
kfree(km, ap);
return q;
}
void *krelocate(void *km, void *ap, size_t n_bytes)
{
void *p;
if (km == 0 || ap == 0) return ap;
p = kmalloc(km, n_bytes);
memcpy(p, ap, n_bytes);
kfree(km, ap);
return p;
}
void km_stat(const void *_km, km_stat_t *s)
{
kmem_t *km = (kmem_t*)_km;
header_t *p;
memset(s, 0, sizeof(km_stat_t));
if (km == NULL || km->loop_head == NULL) return;
for (p = km->loop_head;; p = p->ptr) {
s->available += p->size * sizeof(header_t);
if (p->size != 0) ++s->n_blocks; /* &kmem_t::base is always one of the cores. It is zero-sized. */
if (p->ptr > p && p + p->size > p->ptr)
panic("[km_stat] The end of a free block enters another free block.");
if (p->ptr == km->loop_head) break;
}
for (p = km->core_head; p != NULL; p = p->ptr) {
size_t size = p->size * sizeof(header_t);
++s->n_cores;
s->capacity += size;
s->largest = s->largest > size? s->largest : size;
}
}
void km_stat_print(const void *km)
{
km_stat_t st;
km_stat(km, &st);
fprintf(stderr, "[km_stat] cap=%ld, avail=%ld, largest=%ld, n_core=%ld, n_block=%ld\n",
st.capacity, st.available, st.largest, st.n_cores, st.n_blocks);
}

ext/klib/kalloc.h 100644

@ -0,0 +1,87 @@
#ifndef _KALLOC_H_
#define _KALLOC_H_
#include <stddef.h> /* for size_t */
#ifdef __cplusplus
extern "C" {
#endif
typedef struct {
size_t capacity, available, n_blocks, n_cores, largest;
} km_stat_t;
void *kmalloc(void *km, size_t size);
void *krealloc(void *km, void *ptr, size_t size);
void *krelocate(void *km, void *ap, size_t n_bytes);
void *kcalloc(void *km, size_t count, size_t size);
void kfree(void *km, void *ptr);
void *km_init(void);
void *km_init2(void *km_par, size_t min_core_size);
void km_destroy(void *km);
void km_stat(const void *_km, km_stat_t *s);
void km_stat_print(const void *km);
#ifdef __cplusplus
}
#endif
#define Kmalloc(km, type, cnt) ((type*)kmalloc((km), (cnt) * sizeof(type)))
#define Kcalloc(km, type, cnt) ((type*)kcalloc((km), (cnt), sizeof(type)))
#define Krealloc(km, type, ptr, cnt) ((type*)krealloc((km), (ptr), (cnt) * sizeof(type)))
#define Kexpand(km, type, a, m) do { \
(m) = (m) >= 4? (m) + ((m)>>1) : 16; \
(a) = Krealloc(km, type, (a), (m)); \
} while (0)
#define KMALLOC(km, ptr, len) ((ptr) = (__typeof__(ptr))kmalloc((km), (len) * sizeof(*(ptr))))
#define KCALLOC(km, ptr, len) ((ptr) = (__typeof__(ptr))kcalloc((km), (len), sizeof(*(ptr))))
#define KREALLOC(km, ptr, len) ((ptr) = (__typeof__(ptr))krealloc((km), (ptr), (len) * sizeof(*(ptr))))
#define KEXPAND(km, a, m) do { \
(m) = (m) >= 4? (m) + ((m)>>1) : 16; \
KREALLOC((km), (a), (m)); \
} while (0)
#ifndef klib_unused
#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3)
#define klib_unused __attribute__ ((__unused__))
#else
#define klib_unused
#endif
#endif /* klib_unused */
#define KALLOC_POOL_INIT2(SCOPE, name, kmptype_t) \
typedef struct { \
size_t cnt, n, max; \
kmptype_t **buf; \
void *km; \
} kmp_##name##_t; \
SCOPE kmp_##name##_t *kmp_init_##name(void *km) { \
kmp_##name##_t *mp; \
mp = Kcalloc(km, kmp_##name##_t, 1); \
mp->km = km; \
return mp; \
} \
SCOPE void kmp_destroy_##name(kmp_##name##_t *mp) { \
size_t k; \
for (k = 0; k < mp->n; ++k) kfree(mp->km, mp->buf[k]); \
kfree(mp->km, mp->buf); kfree(mp->km, mp); \
} \
SCOPE kmptype_t *kmp_alloc_##name(kmp_##name##_t *mp) { \
++mp->cnt; \
if (mp->n == 0) return (kmptype_t*)kcalloc(mp->km, 1, sizeof(kmptype_t)); \
return mp->buf[--mp->n]; \
} \
SCOPE void kmp_free_##name(kmp_##name##_t *mp, kmptype_t *p) { \
--mp->cnt; \
if (mp->n == mp->max) Kexpand(mp->km, kmptype_t*, mp->buf, mp->max); \
mp->buf[mp->n++] = p; \
}
#define KALLOC_POOL_INIT(name, kmptype_t) \
KALLOC_POOL_INIT2(static inline klib_unused, name, kmptype_t)
#endif


@ -0,0 +1,306 @@
/* The MIT License
Copyright (c) 2021 by Attractive Chaos <attractor@live.co.uk>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
/* An example:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "kavl-lite.h"
struct my_node {
char key;
KAVLL_HEAD(struct my_node) head;
};
#define my_cmp(p, q) (((q)->key < (p)->key) - ((p)->key < (q)->key))
KAVLL_INIT(my, struct my_node, head, my_cmp)
int main(void) {
const char *str = "MNOLKQOPHIA"; // from wiki, except a duplicate
struct my_node *root = 0;
int i, l = strlen(str);
for (i = 0; i < l; ++i) { // insert in the input order
struct my_node *q, *p = malloc(sizeof(*p));
p->key = str[i];
q = my_insert(&root, p);
if (p != q) free(p); // if already present, free
}
my_itr_t itr;
my_itr_first(root, &itr); // place at first
do { // traverse
const struct my_node *p = kavll_at(&itr);
putchar(p->key);
free((void*)p); // free node
} while (my_itr_next(&itr));
putchar('\n');
return 0;
}
*/
#ifndef KAVL_LITE_H
#define KAVL_LITE_H
#ifdef __STRICT_ANSI__
#define inline __inline__
#endif
#define KAVLL_MAX_DEPTH 64
#define KAVLL_HEAD(__type) \
struct { \
__type *p[2]; \
signed char balance; /* balance factor */ \
}
#define __KAVLL_FIND(pre, __scope, __type, __head, __cmp) \
__scope __type *pre##_find(const __type *root, const __type *x) { \
const __type *p = root; \
while (p != 0) { \
int cmp; \
cmp = __cmp(x, p); \
if (cmp < 0) p = p->__head.p[0]; \
else if (cmp > 0) p = p->__head.p[1]; \
else break; \
} \
return (__type*)p; \
}
#define __KAVLL_ROTATE(pre, __type, __head) \
/* one rotation: (a,(b,c)q)p => ((a,b)p,c)q */ \
static inline __type *pre##_rotate1(__type *p, int dir) { /* dir=0 to left; dir=1 to right */ \
int opp = 1 - dir; /* opposite direction */ \
__type *q = p->__head.p[opp]; \
p->__head.p[opp] = q->__head.p[dir]; \
q->__head.p[dir] = p; \
return q; \
} \
/* two consecutive rotations: (a,((b,c)r,d)q)p => ((a,b)p,(c,d)q)r */ \
static inline __type *pre##_rotate2(__type *p, int dir) { \
int b1, opp = 1 - dir; \
__type *q = p->__head.p[opp], *r = q->__head.p[dir]; \
p->__head.p[opp] = r->__head.p[dir]; \
r->__head.p[dir] = p; \
q->__head.p[dir] = r->__head.p[opp]; \
r->__head.p[opp] = q; \
b1 = dir == 0? +1 : -1; \
if (r->__head.balance == b1) q->__head.balance = 0, p->__head.balance = -b1; \
else if (r->__head.balance == 0) q->__head.balance = p->__head.balance = 0; \
else q->__head.balance = b1, p->__head.balance = 0; \
r->__head.balance = 0; \
return r; \
}
#define __KAVLL_INSERT(pre, __scope, __type, __head, __cmp) \
__scope __type *pre##_insert(__type **root_, __type *x) { \
unsigned char stack[KAVLL_MAX_DEPTH]; \
__type *path[KAVLL_MAX_DEPTH]; \
__type *bp, *bq; \
__type *p, *q, *r = 0; /* _r_ is potentially the new root */ \
int which = 0, top, b1, path_len; \
bp = *root_, bq = 0; \
/* find the insertion location */ \
for (p = bp, q = bq, top = path_len = 0; p; q = p, p = p->__head.p[which]) { \
int cmp; \
cmp = __cmp(x, p); \
if (cmp == 0) return p; \
if (p->__head.balance != 0) \
bq = q, bp = p, top = 0; \
stack[top++] = which = (cmp > 0); \
path[path_len++] = p; \
} \
x->__head.balance = 0, x->__head.p[0] = x->__head.p[1] = 0; \
if (q == 0) *root_ = x; \
else q->__head.p[which] = x; \
if (bp == 0) return x; \
for (p = bp, top = 0; p != x; p = p->__head.p[stack[top]], ++top) /* update balance factors */ \
if (stack[top] == 0) --p->__head.balance; \
else ++p->__head.balance; \
if (bp->__head.balance > -2 && bp->__head.balance < 2) return x; /* no re-balance needed */ \
/* re-balance */ \
which = (bp->__head.balance < 0); \
b1 = which == 0? +1 : -1; \
q = bp->__head.p[1 - which]; \
if (q->__head.balance == b1) { \
r = pre##_rotate1(bp, which); \
q->__head.balance = bp->__head.balance = 0; \
} else r = pre##_rotate2(bp, which); \
if (bq == 0) *root_ = r; \
else bq->__head.p[bp != bq->__head.p[0]] = r; \
return x; \
}
#define __KAVLL_ERASE(pre, __scope, __type, __head, __cmp) \
__scope __type *pre##_erase(__type **root_, const __type *x) { \
__type *p, *path[KAVLL_MAX_DEPTH], fake; \
unsigned char dir[KAVLL_MAX_DEPTH]; \
int d = 0, cmp; \
fake.__head.p[0] = *root_, fake.__head.p[1] = 0; \
if (x) { \
for (cmp = -1, p = &fake; cmp; cmp = __cmp(x, p)) { \
int which = (cmp > 0); \
dir[d] = which; \
path[d++] = p; \
p = p->__head.p[which]; \
if (p == 0) return 0; \
} \
} else { \
for (p = &fake; p; p = p->__head.p[0]) \
dir[d] = 0, path[d++] = p; \
p = path[--d]; \
} \
if (p->__head.p[1] == 0) { /* ((1,.)2,3)4 => (1,3)4; p=2 */ \
path[d-1]->__head.p[dir[d-1]] = p->__head.p[0]; \
} else { \
__type *q = p->__head.p[1]; \
if (q->__head.p[0] == 0) { /* ((1,2)3,4)5 => ((1)2,4)5; p=3 */ \
q->__head.p[0] = p->__head.p[0]; \
q->__head.balance = p->__head.balance; \
path[d-1]->__head.p[dir[d-1]] = q; \
path[d] = q, dir[d++] = 1; \
} else { /* ((1,((.,2)3,4)5)6,7)8 => ((1,(2,4)5)3,7)8; p=6 */ \
__type *r; \
int e = d++; /* backup _d_ */\
for (;;) { \
dir[d] = 0; \
path[d++] = q; \
r = q->__head.p[0]; \
if (r->__head.p[0] == 0) break; \
q = r; \
} \
r->__head.p[0] = p->__head.p[0]; \
q->__head.p[0] = r->__head.p[1]; \
r->__head.p[1] = p->__head.p[1]; \
r->__head.balance = p->__head.balance; \
path[e-1]->__head.p[dir[e-1]] = r; \
path[e] = r, dir[e] = 1; \
} \
} \
while (--d > 0) { \
__type *q = path[d]; \
int which, other, b1 = 1, b2 = 2; \
which = dir[d], other = 1 - which; \
if (which) b1 = -b1, b2 = -b2; \
q->__head.balance += b1; \
if (q->__head.balance == b1) break; \
else if (q->__head.balance == b2) { \
__type *r = q->__head.p[other]; \
if (r->__head.balance == -b1) { \
path[d-1]->__head.p[dir[d-1]] = pre##_rotate2(q, which); \
} else { \
path[d-1]->__head.p[dir[d-1]] = pre##_rotate1(q, which); \
if (r->__head.balance == 0) { \
r->__head.balance = -b1; \
q->__head.balance = b1; \
break; \
} else r->__head.balance = q->__head.balance = 0; \
} \
} \
} \
*root_ = fake.__head.p[0]; \
return p; \
}
#define kavll_free(__type, __head, __root, __free) do { \
__type *_p, *_q; \
for (_p = __root; _p; _p = _q) { \
if (_p->__head.p[0] == 0) { \
_q = _p->__head.p[1]; \
__free(_p); \
} else { \
_q = _p->__head.p[0]; \
_p->__head.p[0] = _q->__head.p[1]; \
_q->__head.p[1] = _p; \
} \
} \
} while (0)
#define kavll_size(__type, __head, __root, __cnt) do { \
__type *_p, *_q; \
*(__cnt) = 0; \
for (_p = __root; _p; _p = _q) { \
if (_p->__head.p[0] == 0) { \
_q = _p->__head.p[1]; \
++*(__cnt); \
} else { \
_q = _p->__head.p[0]; \
_p->__head.p[0] = _q->__head.p[1]; \
_q->__head.p[1] = _p; \
} \
} \
} while (0)
#define __KAVLL_ITR(pre, __scope, __type, __head, __cmp) \
typedef struct pre##_itr_t { \
const __type *stack[KAVLL_MAX_DEPTH], **top, *right; /* _right_ points to the right child of *top */ \
} pre##_itr_t; \
__scope void pre##_itr_first(const __type *root, struct pre##_itr_t *itr) { \
const __type *p; \
for (itr->top = itr->stack - 1, p = root; p; p = p->__head.p[0]) \
*++itr->top = p; \
itr->right = (*itr->top)->__head.p[1]; \
} \
__scope int pre##_itr_find(const __type *root, const __type *x, struct pre##_itr_t *itr) { \
const __type *p = root; \
itr->top = itr->stack - 1; \
while (p != 0) { \
int cmp; \
cmp = __cmp(x, p); \
if (cmp < 0) *++itr->top = p, p = p->__head.p[0]; \
else if (cmp > 0) p = p->__head.p[1]; \
else break; \
} \
if (p) { \
*++itr->top = p; \
itr->right = p->__head.p[1]; \
return 1; \
} else if (itr->top >= itr->stack) { \
itr->right = (*itr->top)->__head.p[1]; \
return 0; \
} else return 0; \
} \
__scope int pre##_itr_next(struct pre##_itr_t *itr) { \
for (;;) { \
const __type *p; \
for (p = itr->right, --itr->top; p; p = p->__head.p[0]) \
*++itr->top = p; \
if (itr->top < itr->stack) return 0; \
itr->right = (*itr->top)->__head.p[1]; \
return 1; \
} \
}
#define kavll_at(itr) ((itr)->top < (itr)->stack? 0 : *(itr)->top)
#define KAVLL_INIT2(pre, __scope, __type, __head, __cmp) \
__KAVLL_FIND(pre, __scope, __type, __head, __cmp) \
__KAVLL_ROTATE(pre, __type, __head) \
__KAVLL_INSERT(pre, __scope, __type, __head, __cmp) \
__KAVLL_ERASE(pre, __scope, __type, __head, __cmp) \
__KAVLL_ITR(pre, __scope, __type, __head, __cmp)
#define KAVLL_INIT(pre, __type, __head, __cmp) \
KAVLL_INIT2(pre,, __type, __head, __cmp)
#endif

ext/klib/kavl.h 100644

@ -0,0 +1,400 @@
/* The MIT License
Copyright (c) 2018 by Attractive Chaos <attractor@live.co.uk>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
/* An example:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "kavl.h"
struct my_node {
char key;
KAVL_HEAD(struct my_node) head;
};
#define my_cmp(p, q) (((q)->key < (p)->key) - ((p)->key < (q)->key))
KAVL_INIT(my, struct my_node, head, my_cmp)
int main(void) {
const char *str = "MNOLKQOPHIA"; // from wiki, except a duplicate
struct my_node *root = 0;
int i, l = strlen(str);
for (i = 0; i < l; ++i) { // insert in the input order
struct my_node *q, *p = malloc(sizeof(*p));
p->key = str[i];
q = kavl_insert(my, &root, p, 0);
if (p != q) free(p); // if already present, free
}
kavl_itr_t(my) itr;
kavl_itr_first(my, root, &itr); // place at first
do { // traverse
const struct my_node *p = kavl_at(&itr);
putchar(p->key);
free((void*)p); // free node
} while (kavl_itr_next(my, &itr));
putchar('\n');
return 0;
}
*/
#ifndef KAVL_H
#define KAVL_H
#ifdef __STRICT_ANSI__
#define inline __inline__
#endif
#define KAVL_MAX_DEPTH 64
#define kavl_size(head, p) ((p)? (p)->head.size : 0)
#define kavl_size_child(head, q, i) ((q)->head.p[(i)]? (q)->head.p[(i)]->head.size : 0)
#define KAVL_HEAD(__type) \
struct { \
__type *p[2]; \
signed char balance; /* balance factor */ \
unsigned size; /* #elements in subtree */ \
}
#define __KAVL_FIND(suf, __scope, __type, __head, __cmp) \
__scope __type *kavl_find_##suf(const __type *root, const __type *x, unsigned *cnt_) { \
const __type *p = root; \
unsigned cnt = 0; \
while (p != 0) { \
int cmp; \
cmp = __cmp(x, p); \
if (cmp >= 0) cnt += kavl_size_child(__head, p, 0) + 1; \
if (cmp < 0) p = p->__head.p[0]; \
else if (cmp > 0) p = p->__head.p[1]; \
else break; \
} \
if (cnt_) *cnt_ = cnt; \
return (__type*)p; \
}
#define __KAVL_ROTATE(suf, __type, __head) \
/* one rotation: (a,(b,c)q)p => ((a,b)p,c)q */ \
static inline __type *kavl_rotate1_##suf(__type *p, int dir) { /* dir=0 to left; dir=1 to right */ \
int opp = 1 - dir; /* opposite direction */ \
__type *q = p->__head.p[opp]; \
unsigned size_p = p->__head.size; \
p->__head.size -= q->__head.size - kavl_size_child(__head, q, dir); \
q->__head.size = size_p; \
p->__head.p[opp] = q->__head.p[dir]; \
q->__head.p[dir] = p; \
return q; \
} \
/* two consecutive rotations: (a,((b,c)r,d)q)p => ((a,b)p,(c,d)q)r */ \
static inline __type *kavl_rotate2_##suf(__type *p, int dir) { \
int b1, opp = 1 - dir; \
__type *q = p->__head.p[opp], *r = q->__head.p[dir]; \
unsigned size_x_dir = kavl_size_child(__head, r, dir); \
r->__head.size = p->__head.size; \
p->__head.size -= q->__head.size - size_x_dir; \
q->__head.size -= size_x_dir + 1; \
p->__head.p[opp] = r->__head.p[dir]; \
r->__head.p[dir] = p; \
q->__head.p[dir] = r->__head.p[opp]; \
r->__head.p[opp] = q; \
b1 = dir == 0? +1 : -1; \
if (r->__head.balance == b1) q->__head.balance = 0, p->__head.balance = -b1; \
else if (r->__head.balance == 0) q->__head.balance = p->__head.balance = 0; \
else q->__head.balance = b1, p->__head.balance = 0; \
r->__head.balance = 0; \
return r; \
}
#define __KAVL_INSERT(suf, __scope, __type, __head, __cmp) \
__scope __type *kavl_insert_##suf(__type **root_, __type *x, unsigned *cnt_) { \
unsigned char stack[KAVL_MAX_DEPTH]; \
__type *path[KAVL_MAX_DEPTH]; \
__type *bp, *bq; \
__type *p, *q, *r = 0; /* _r_ is potentially the new root */ \
int i, which = 0, top, b1, path_len; \
unsigned cnt = 0; \
bp = *root_, bq = 0; \
/* find the insertion location */ \
for (p = bp, q = bq, top = path_len = 0; p; q = p, p = p->__head.p[which]) { \
int cmp; \
cmp = __cmp(x, p); \
if (cmp >= 0) cnt += kavl_size_child(__head, p, 0) + 1; \
if (cmp == 0) { \
if (cnt_) *cnt_ = cnt; \
return p; \
} \
if (p->__head.balance != 0) \
bq = q, bp = p, top = 0; \
stack[top++] = which = (cmp > 0); \
path[path_len++] = p; \
} \
if (cnt_) *cnt_ = cnt; \
x->__head.balance = 0, x->__head.size = 1, x->__head.p[0] = x->__head.p[1] = 0; \
if (q == 0) *root_ = x; \
else q->__head.p[which] = x; \
if (bp == 0) return x; \
for (i = 0; i < path_len; ++i) ++path[i]->__head.size; \
for (p = bp, top = 0; p != x; p = p->__head.p[stack[top]], ++top) /* update balance factors */ \
if (stack[top] == 0) --p->__head.balance; \
else ++p->__head.balance; \
if (bp->__head.balance > -2 && bp->__head.balance < 2) return x; /* no re-balance needed */ \
/* re-balance */ \
which = (bp->__head.balance < 0); \
b1 = which == 0? +1 : -1; \
q = bp->__head.p[1 - which]; \
if (q->__head.balance == b1) { \
r = kavl_rotate1_##suf(bp, which); \
q->__head.balance = bp->__head.balance = 0; \
} else r = kavl_rotate2_##suf(bp, which); \
if (bq == 0) *root_ = r; \
else bq->__head.p[bp != bq->__head.p[0]] = r; \
return x; \
}
#define __KAVL_ERASE(suf, __scope, __type, __head, __cmp) \
__scope __type *kavl_erase_##suf(__type **root_, const __type *x, unsigned *cnt_) { \
__type *p, *path[KAVL_MAX_DEPTH], fake; \
unsigned char dir[KAVL_MAX_DEPTH]; \
int i, d = 0, cmp; \
unsigned cnt = 0; \
fake.__head.p[0] = *root_, fake.__head.p[1] = 0; \
if (cnt_) *cnt_ = 0; \
if (x) { \
for (cmp = -1, p = &fake; cmp; cmp = __cmp(x, p)) { \
int which = (cmp > 0); \
if (cmp > 0) cnt += kavl_size_child(__head, p, 0) + 1; \
dir[d] = which; \
path[d++] = p; \
p = p->__head.p[which]; \
if (p == 0) { \
if (cnt_) *cnt_ = 0; \
return 0; \
} \
} \
cnt += kavl_size_child(__head, p, 0) + 1; /* because p==x is not counted */ \
} else { \
for (p = &fake, cnt = 1; p; p = p->__head.p[0]) \
dir[d] = 0, path[d++] = p; \
p = path[--d]; \
} \
if (cnt_) *cnt_ = cnt; \
for (i = 1; i < d; ++i) --path[i]->__head.size; \
if (p->__head.p[1] == 0) { /* ((1,.)2,3)4 => (1,3)4; p=2 */ \
path[d-1]->__head.p[dir[d-1]] = p->__head.p[0]; \
} else { \
__type *q = p->__head.p[1]; \
if (q->__head.p[0] == 0) { /* ((1,2)3,4)5 => ((1)2,4)5; p=3 */ \
q->__head.p[0] = p->__head.p[0]; \
q->__head.balance = p->__head.balance; \
path[d-1]->__head.p[dir[d-1]] = q; \
path[d] = q, dir[d++] = 1; \
q->__head.size = p->__head.size - 1; \
} else { /* ((1,((.,2)3,4)5)6,7)8 => ((1,(2,4)5)3,7)8; p=6 */ \
__type *r; \
int e = d++; /* backup _d_ */\
for (;;) { \
dir[d] = 0; \
path[d++] = q; \
r = q->__head.p[0]; \
if (r->__head.p[0] == 0) break; \
q = r; \
} \
r->__head.p[0] = p->__head.p[0]; \
q->__head.p[0] = r->__head.p[1]; \
r->__head.p[1] = p->__head.p[1]; \
r->__head.balance = p->__head.balance; \
path[e-1]->__head.p[dir[e-1]] = r; \
path[e] = r, dir[e] = 1; \
for (i = e + 1; i < d; ++i) --path[i]->__head.size; \
r->__head.size = p->__head.size - 1; \
} \
} \
while (--d > 0) { \
__type *q = path[d]; \
int which, other, b1 = 1, b2 = 2; \
which = dir[d], other = 1 - which; \
if (which) b1 = -b1, b2 = -b2; \
q->__head.balance += b1; \
if (q->__head.balance == b1) break; \
else if (q->__head.balance == b2) { \
__type *r = q->__head.p[other]; \
if (r->__head.balance == -b1) { \
path[d-1]->__head.p[dir[d-1]] = kavl_rotate2_##suf(q, which); \
} else { \
path[d-1]->__head.p[dir[d-1]] = kavl_rotate1_##suf(q, which); \
if (r->__head.balance == 0) { \
r->__head.balance = -b1; \
q->__head.balance = b1; \
break; \
} else r->__head.balance = q->__head.balance = 0; \
} \
} \
} \
*root_ = fake.__head.p[0]; \
return p; \
}
#define kavl_free(__type, __head, __root, __free) do { \
__type *_p, *_q; \
for (_p = __root; _p; _p = _q) { \
if (_p->__head.p[0] == 0) { \
_q = _p->__head.p[1]; \
__free(_p); \
} else { \
_q = _p->__head.p[0]; \
_p->__head.p[0] = _q->__head.p[1]; \
_q->__head.p[1] = _p; \
} \
} \
} while (0)
#define __KAVL_ITR(suf, __scope, __type, __head, __cmp) \
struct kavl_itr_##suf { \
const __type *stack[KAVL_MAX_DEPTH], **top, *right; /* _right_ points to the right child of *top */ \
}; \
__scope void kavl_itr_first_##suf(const __type *root, struct kavl_itr_##suf *itr) { \
const __type *p; \
for (itr->top = itr->stack - 1, p = root; p; p = p->__head.p[0]) \
*++itr->top = p; \
itr->right = (*itr->top)->__head.p[1]; \
} \
__scope int kavl_itr_find_##suf(const __type *root, const __type *x, struct kavl_itr_##suf *itr) { \
const __type *p = root; \
itr->top = itr->stack - 1; \
while (p != 0) { \
int cmp; \
cmp = __cmp(x, p); \
if (cmp < 0) *++itr->top = p, p = p->__head.p[0]; \
else if (cmp > 0) p = p->__head.p[1]; \
else break; \
} \
if (p) { \
*++itr->top = p; \
itr->right = p->__head.p[1]; \
return 1; \
} else if (itr->top >= itr->stack) { \
itr->right = (*itr->top)->__head.p[1]; \
return 0; \
} else return 0; \
} \
__scope int kavl_itr_next_##suf(struct kavl_itr_##suf *itr) { \
for (;;) { \
const __type *p; \
for (p = itr->right, --itr->top; p; p = p->__head.p[0]) \
*++itr->top = p; \
if (itr->top < itr->stack) return 0; \
itr->right = (*itr->top)->__head.p[1]; \
return 1; \
} \
}
/**
* Insert a node to the tree
*
* @param suf name suffix used in KAVL_INIT()
* @param proot pointer to the root of the tree (in/out: root may change)
* @param x node to insert (in)
* @param cnt number of nodes smaller than or equal to _x_; can be NULL (out)
*
* @return _x_ if not present in the tree, or the node equal to x.
*/
#define kavl_insert(suf, proot, x, cnt) kavl_insert_##suf(proot, x, cnt)
/**
* Find a node in the tree
*
* @param suf name suffix used in KAVL_INIT()
* @param root root of the tree
* @param x node value to find (in)
* @param cnt number of nodes smaller than or equal to _x_; can be NULL (out)
*
* @return node equal to _x_ if present, or NULL if absent
*/
#define kavl_find(suf, root, x, cnt) kavl_find_##suf(root, x, cnt)
/**
* Delete a node from the tree
*
* @param suf name suffix used in KAVL_INIT()
* @param proot pointer to the root of the tree (in/out: root may change)
* @param x node value to delete; if NULL, delete the first node (in)
*
* @return node removed from the tree if present, or NULL if absent
*/
#define kavl_erase(suf, proot, x, cnt) kavl_erase_##suf(proot, x, cnt)
#define kavl_erase_first(suf, proot) kavl_erase_##suf(proot, 0, 0)
#define kavl_itr_t(suf) struct kavl_itr_##suf
/**
* Place the iterator at the smallest object
*
* @param suf name suffix used in KAVL_INIT()
* @param root root of the tree
* @param itr iterator
*/
#define kavl_itr_first(suf, root, itr) kavl_itr_first_##suf(root, itr)
/**
* Place the iterator at the object equal to or greater than the query
*
* @param suf name suffix used in KAVL_INIT()
* @param root root of the tree
* @param x query (in)
* @param itr iterator (out)
*
 * @return 1 if found; 0 otherwise. kavl_at(itr) is NULL if and only if the query
 *         is larger than all objects in the tree
*/
#define kavl_itr_find(suf, root, x, itr) kavl_itr_find_##suf(root, x, itr)
/**
* Move to the next object in order
*
* @param itr iterator (modified)
*
* @return 1 if there is a next object; 0 otherwise
*/
#define kavl_itr_next(suf, itr) kavl_itr_next_##suf(itr)
/**
* Return the pointer at the iterator
*
* @param itr iterator
*
* @return pointer if present; NULL otherwise
*/
#define kavl_at(itr) ((itr)->top < (itr)->stack? 0 : *(itr)->top)
#define KAVL_INIT2(suf, __scope, __type, __head, __cmp) \
__KAVL_FIND(suf, __scope, __type, __head, __cmp) \
__KAVL_ROTATE(suf, __type, __head) \
__KAVL_INSERT(suf, __scope, __type, __head, __cmp) \
__KAVL_ERASE(suf, __scope, __type, __head, __cmp) \
__KAVL_ITR(suf, __scope, __type, __head, __cmp)
#define KAVL_INIT(suf, __type, __head, __cmp) \
KAVL_INIT2(suf,, __type, __head, __cmp)
#endif

ext/klib/kbit.h 100644

@ -0,0 +1,30 @@
#ifndef KBIT_H
#define KBIT_H
#include <stdint.h>
static inline uint64_t kbi_popcount64(uint64_t y) // standard popcount; from wikipedia
{
y -= ((y >> 1) & 0x5555555555555555ull);
y = (y & 0x3333333333333333ull) + (y >> 2 & 0x3333333333333333ull);
return ((y + (y >> 4)) & 0xf0f0f0f0f0f0f0full) * 0x101010101010101ull >> 56;
}
static inline uint64_t kbi_DNAcount64(uint64_t y, int c) // count #A/C/G/T from a 2-bit encoded integer; from BWA
{
// reduce nucleotide counting to bits counting
y = ((c&2)? y : ~y) >> 1 & ((c&1)? y : ~y) & 0x5555555555555555ull;
// count the number of 1s in y
y = (y & 0x3333333333333333ull) + (y >> 2 & 0x3333333333333333ull);
return ((y + (y >> 4)) & 0xf0f0f0f0f0f0f0full) * 0x101010101010101ull >> 56;
}
#ifndef kroundup32 // round a 32-bit integer up to the next power of 2; from "bit twiddling hacks"
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif
#ifndef kbi_swap
#define kbi_swap(a, b) (((a) ^= (b)), ((b) ^= (a)), ((a) ^= (b))) // from "bit twiddling hacks"
#endif
#endif

ext/klib/kbtree.h 100644

@ -0,0 +1,437 @@
/*-
* Copyright 1997-1999, 2001, John-Mark Gurney.
* 2008-2009, Attractive Chaos <attractor@live.co.uk>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#ifndef __AC_KBTREE_H
#define __AC_KBTREE_H
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#define KB_MAX_DEPTH 64
typedef struct {
int32_t is_internal:1, n:31;
} kbnode_t;
typedef struct {
kbnode_t *x;
int i;
} kbpos_t;
typedef struct {
kbpos_t stack[KB_MAX_DEPTH], *p;
} kbitr_t;
#define __KB_KEY(type, x) ((type*)((char*)x + 4))
#define __KB_PTR(btr, x) ((kbnode_t**)((char*)x + btr->off_ptr))
#define __KB_TREE_T(name) \
typedef struct { \
kbnode_t *root; \
int off_key, off_ptr, ilen, elen; \
int n, t; \
int n_keys, n_nodes; \
} kbtree_##name##_t;
#define __KB_INIT(name, key_t) \
kbtree_##name##_t *kb_init_##name(int size) \
{ \
kbtree_##name##_t *b; \
b = (kbtree_##name##_t*)calloc(1, sizeof(kbtree_##name##_t)); \
b->t = ((size - 4 - sizeof(void*)) / (sizeof(void*) + sizeof(key_t)) + 1) >> 1; \
if (b->t < 2) { \
free(b); return 0; \
} \
b->n = 2 * b->t - 1; \
b->off_ptr = 4 + b->n * sizeof(key_t); \
b->ilen = (4 + sizeof(void*) + b->n * (sizeof(void*) + sizeof(key_t)) + 3) >> 2 << 2; \
b->elen = (b->off_ptr + 3) >> 2 << 2; \
b->root = (kbnode_t*)calloc(1, b->ilen); \
++b->n_nodes; \
return b; \
}
#define __kb_destroy(b) do { \
int i, max = 8; \
kbnode_t *x, **top, **stack = 0; \
if (b) { \
top = stack = (kbnode_t**)calloc(max, sizeof(kbnode_t*)); \
*top++ = (b)->root; \
while (top != stack) { \
x = *--top; \
if (x->is_internal == 0) { free(x); continue; } \
for (i = 0; i <= x->n; ++i) \
if (__KB_PTR(b, x)[i]) { \
if (top - stack == max) { \
max <<= 1; \
stack = (kbnode_t**)realloc(stack, max * sizeof(kbnode_t*)); \
top = stack + (max>>1); \
} \
*top++ = __KB_PTR(b, x)[i]; \
} \
free(x); \
} \
} \
free(b); free(stack); \
} while (0)
#define __KB_GET_AUX1(name, key_t, __cmp) \
static inline int __kb_getp_aux_##name(const kbnode_t * __restrict x, const key_t * __restrict k, int *r) \
{ \
int tr, *rr, begin = 0, end = x->n; \
if (x->n == 0) return -1; \
rr = r? r : &tr; \
while (begin < end) { \
int mid = (begin + end) >> 1; \
if (__cmp(__KB_KEY(key_t, x)[mid], *k) < 0) begin = mid + 1; \
else end = mid; \
} \
if (begin == x->n) { *rr = 1; return x->n - 1; } \
if ((*rr = __cmp(*k, __KB_KEY(key_t, x)[begin])) < 0) --begin; \
return begin; \
}
#define __KB_GET(name, key_t) \
static key_t *kb_getp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \
{ \
int i, r = 0; \
kbnode_t *x = b->root; \
while (x) { \
i = __kb_getp_aux_##name(x, k, &r); \
if (i >= 0 && r == 0) return &__KB_KEY(key_t, x)[i]; \
if (x->is_internal == 0) return 0; \
x = __KB_PTR(b, x)[i + 1]; \
} \
return 0; \
} \
static inline key_t *kb_get_##name(kbtree_##name##_t *b, const key_t k) \
{ \
return kb_getp_##name(b, &k); \
}
#define __KB_INTERVAL(name, key_t) \
static void kb_intervalp_##name(kbtree_##name##_t *b, const key_t * __restrict k, key_t **lower, key_t **upper) \
{ \
int i, r = 0; \
kbnode_t *x = b->root; \
*lower = *upper = 0; \
while (x) { \
i = __kb_getp_aux_##name(x, k, &r); \
if (i >= 0 && r == 0) { \
*lower = *upper = &__KB_KEY(key_t, x)[i]; \
return; \
} \
if (i >= 0) *lower = &__KB_KEY(key_t, x)[i]; \
if (i < x->n - 1) *upper = &__KB_KEY(key_t, x)[i + 1]; \
if (x->is_internal == 0) return; \
x = __KB_PTR(b, x)[i + 1]; \
} \
} \
static inline void kb_interval_##name(kbtree_##name##_t *b, const key_t k, key_t **lower, key_t **upper) \
{ \
kb_intervalp_##name(b, &k, lower, upper); \
}
#define __KB_PUT(name, key_t, __cmp) \
/* x must be an internal node */ \
static void __kb_split_##name(kbtree_##name##_t *b, kbnode_t *x, int i, kbnode_t *y) \
{ \
kbnode_t *z; \
z = (kbnode_t*)calloc(1, y->is_internal? b->ilen : b->elen); \
++b->n_nodes; \
z->is_internal = y->is_internal; \
z->n = b->t - 1; \
memcpy(__KB_KEY(key_t, z), __KB_KEY(key_t, y) + b->t, sizeof(key_t) * (b->t - 1)); \
if (y->is_internal) memcpy(__KB_PTR(b, z), __KB_PTR(b, y) + b->t, sizeof(void*) * b->t); \
y->n = b->t - 1; \
memmove(__KB_PTR(b, x) + i + 2, __KB_PTR(b, x) + i + 1, sizeof(void*) * (x->n - i)); \
__KB_PTR(b, x)[i + 1] = z; \
memmove(__KB_KEY(key_t, x) + i + 1, __KB_KEY(key_t, x) + i, sizeof(key_t) * (x->n - i)); \
__KB_KEY(key_t, x)[i] = __KB_KEY(key_t, y)[b->t - 1]; \
++x->n; \
} \
static key_t *__kb_putp_aux_##name(kbtree_##name##_t *b, kbnode_t *x, const key_t * __restrict k) \
{ \
int i = x->n - 1; \
key_t *ret; \
if (x->is_internal == 0) { \
i = __kb_getp_aux_##name(x, k, 0); \
if (i != x->n - 1) \
memmove(__KB_KEY(key_t, x) + i + 2, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \
ret = &__KB_KEY(key_t, x)[i + 1]; \
*ret = *k; \
++x->n; \
} else { \
i = __kb_getp_aux_##name(x, k, 0) + 1; \
if (__KB_PTR(b, x)[i]->n == 2 * b->t - 1) { \
__kb_split_##name(b, x, i, __KB_PTR(b, x)[i]); \
if (__cmp(*k, __KB_KEY(key_t, x)[i]) > 0) ++i; \
} \
ret = __kb_putp_aux_##name(b, __KB_PTR(b, x)[i], k); \
} \
return ret; \
} \
static key_t *kb_putp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \
{ \
kbnode_t *r, *s; \
++b->n_keys; \
r = b->root; \
if (r->n == 2 * b->t - 1) { \
++b->n_nodes; \
s = (kbnode_t*)calloc(1, b->ilen); \
b->root = s; s->is_internal = 1; s->n = 0; \
__KB_PTR(b, s)[0] = r; \
__kb_split_##name(b, s, 0, r); \
r = s; \
} \
return __kb_putp_aux_##name(b, r, k); \
} \
static inline void kb_put_##name(kbtree_##name##_t *b, const key_t k) \
{ \
kb_putp_##name(b, &k); \
}
#define __KB_DEL(name, key_t) \
static key_t __kb_delp_aux_##name(kbtree_##name##_t *b, kbnode_t *x, const key_t * __restrict k, int s) \
{ \
int yn, zn, i, r = 0; \
kbnode_t *xp, *y, *z; \
key_t kp; \
if (x == 0) return *k; \
if (s) { /* s can only be 0, 1 or 2 */ \
r = x->is_internal == 0? 0 : s == 1? 1 : -1; \
i = s == 1? x->n - 1 : -1; \
} else i = __kb_getp_aux_##name(x, k, &r); \
if (x->is_internal == 0) { \
if (s == 2) ++i; \
kp = __KB_KEY(key_t, x)[i]; \
memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \
--x->n; \
return kp; \
} \
if (r == 0) { \
if ((yn = __KB_PTR(b, x)[i]->n) >= b->t) { \
xp = __KB_PTR(b, x)[i]; \
kp = __KB_KEY(key_t, x)[i]; \
__KB_KEY(key_t, x)[i] = __kb_delp_aux_##name(b, xp, 0, 1); \
return kp; \
} else if ((zn = __KB_PTR(b, x)[i + 1]->n) >= b->t) { \
xp = __KB_PTR(b, x)[i + 1]; \
kp = __KB_KEY(key_t, x)[i]; \
__KB_KEY(key_t, x)[i] = __kb_delp_aux_##name(b, xp, 0, 2); \
return kp; \
} else if (yn == b->t - 1 && zn == b->t - 1) { \
y = __KB_PTR(b, x)[i]; z = __KB_PTR(b, x)[i + 1]; \
__KB_KEY(key_t, y)[y->n++] = *k; \
memmove(__KB_KEY(key_t, y) + y->n, __KB_KEY(key_t, z), z->n * sizeof(key_t)); \
if (y->is_internal) memmove(__KB_PTR(b, y) + y->n, __KB_PTR(b, z), (z->n + 1) * sizeof(void*)); \
y->n += z->n; \
memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \
memmove(__KB_PTR(b, x) + i + 1, __KB_PTR(b, x) + i + 2, (x->n - i - 1) * sizeof(void*)); \
--x->n; \
free(z); \
return __kb_delp_aux_##name(b, y, k, s); \
} \
} \
++i; \
if ((xp = __KB_PTR(b, x)[i])->n == b->t - 1) { \
if (i > 0 && (y = __KB_PTR(b, x)[i - 1])->n >= b->t) { \
memmove(__KB_KEY(key_t, xp) + 1, __KB_KEY(key_t, xp), xp->n * sizeof(key_t)); \
if (xp->is_internal) memmove(__KB_PTR(b, xp) + 1, __KB_PTR(b, xp), (xp->n + 1) * sizeof(void*)); \
__KB_KEY(key_t, xp)[0] = __KB_KEY(key_t, x)[i - 1]; \
__KB_KEY(key_t, x)[i - 1] = __KB_KEY(key_t, y)[y->n - 1]; \
if (xp->is_internal) __KB_PTR(b, xp)[0] = __KB_PTR(b, y)[y->n]; \
--y->n; ++xp->n; \
} else if (i < x->n && (y = __KB_PTR(b, x)[i + 1])->n >= b->t) { \
__KB_KEY(key_t, xp)[xp->n++] = __KB_KEY(key_t, x)[i]; \
__KB_KEY(key_t, x)[i] = __KB_KEY(key_t, y)[0]; \
if (xp->is_internal) __KB_PTR(b, xp)[xp->n] = __KB_PTR(b, y)[0]; \
--y->n; \
memmove(__KB_KEY(key_t, y), __KB_KEY(key_t, y) + 1, y->n * sizeof(key_t)); \
if (y->is_internal) memmove(__KB_PTR(b, y), __KB_PTR(b, y) + 1, (y->n + 1) * sizeof(void*)); \
} else if (i > 0 && (y = __KB_PTR(b, x)[i - 1])->n == b->t - 1) { \
__KB_KEY(key_t, y)[y->n++] = __KB_KEY(key_t, x)[i - 1]; \
memmove(__KB_KEY(key_t, y) + y->n, __KB_KEY(key_t, xp), xp->n * sizeof(key_t)); \
if (y->is_internal) memmove(__KB_PTR(b, y) + y->n, __KB_PTR(b, xp), (xp->n + 1) * sizeof(void*)); \
y->n += xp->n; \
memmove(__KB_KEY(key_t, x) + i - 1, __KB_KEY(key_t, x) + i, (x->n - i) * sizeof(key_t)); \
memmove(__KB_PTR(b, x) + i, __KB_PTR(b, x) + i + 1, (x->n - i) * sizeof(void*)); \
--x->n; \
free(xp); \
xp = y; \
} else if (i < x->n && (y = __KB_PTR(b, x)[i + 1])->n == b->t - 1) { \
__KB_KEY(key_t, xp)[xp->n++] = __KB_KEY(key_t, x)[i]; \
memmove(__KB_KEY(key_t, xp) + xp->n, __KB_KEY(key_t, y), y->n * sizeof(key_t)); \
if (xp->is_internal) memmove(__KB_PTR(b, xp) + xp->n, __KB_PTR(b, y), (y->n + 1) * sizeof(void*)); \
xp->n += y->n; \
memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \
memmove(__KB_PTR(b, x) + i + 1, __KB_PTR(b, x) + i + 2, (x->n - i - 1) * sizeof(void*)); \
--x->n; \
free(y); \
} \
} \
return __kb_delp_aux_##name(b, xp, k, s); \
} \
static key_t kb_delp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \
{ \
kbnode_t *x; \
key_t ret; \
ret = __kb_delp_aux_##name(b, b->root, k, 0); \
--b->n_keys; \
if (b->root->n == 0 && b->root->is_internal) { \
--b->n_nodes; \
x = b->root; \
b->root = __KB_PTR(b, x)[0]; \
free(x); \
} \
return ret; \
} \
static inline key_t kb_del_##name(kbtree_##name##_t *b, const key_t k) \
{ \
return kb_delp_##name(b, &k); \
}
#define __KB_ITR(name, key_t) \
static inline void kb_itr_first_##name(kbtree_##name##_t *b, kbitr_t *itr) \
{ \
itr->p = 0; \
if (b->n_keys == 0) return; \
itr->p = itr->stack; \
itr->p->x = b->root; itr->p->i = 0; \
while (itr->p->x->is_internal && __KB_PTR(b, itr->p->x)[0] != 0) { \
kbnode_t *x = itr->p->x; \
++itr->p; \
itr->p->x = __KB_PTR(b, x)[0]; itr->p->i = 0; \
} \
} \
static int kb_itr_get_##name(kbtree_##name##_t *b, const key_t * __restrict k, kbitr_t *itr) \
{ \
int i, r = 0; \
itr->p = itr->stack; \
itr->p->x = b->root; itr->p->i = 0; \
while (itr->p->x) { \
i = __kb_getp_aux_##name(itr->p->x, k, &r); \
if (i >= 0 && r == 0) return 0; \
if (itr->p->x->is_internal == 0) return -1; \
itr->p[1].x = __KB_PTR(b, itr->p->x)[i + 1]; \
itr->p[1].i = i; \
++itr->p; \
} \
return -1; \
} \
static inline int kb_itr_next_##name(kbtree_##name##_t *b, kbitr_t *itr) \
{ \
if (itr->p < itr->stack) return 0; \
for (;;) { \
++itr->p->i; \
while (itr->p->x && itr->p->i <= itr->p->x->n) { \
itr->p[1].i = 0; \
itr->p[1].x = itr->p->x->is_internal? __KB_PTR(b, itr->p->x)[itr->p->i] : 0; \
++itr->p; \
} \
--itr->p; \
if (itr->p < itr->stack) return 0; \
if (itr->p->x && itr->p->i < itr->p->x->n) return 1; \
} \
}
#define KBTREE_INIT(name, key_t, __cmp) \
__KB_TREE_T(name) \
__KB_INIT(name, key_t) \
__KB_GET_AUX1(name, key_t, __cmp) \
__KB_GET(name, key_t) \
__KB_INTERVAL(name, key_t) \
__KB_PUT(name, key_t, __cmp) \
__KB_DEL(name, key_t) \
__KB_ITR(name, key_t)
#define KB_DEFAULT_SIZE 512
#define kbtree_t(name) kbtree_##name##_t
#define kb_init(name, s) kb_init_##name(s)
#define kb_destroy(name, b) __kb_destroy(b)
#define kb_get(name, b, k) kb_get_##name(b, k)
#define kb_put(name, b, k) kb_put_##name(b, k)
#define kb_del(name, b, k) kb_del_##name(b, k)
#define kb_interval(name, b, k, l, u) kb_interval_##name(b, k, l, u)
#define kb_getp(name, b, k) kb_getp_##name(b, k)
#define kb_putp(name, b, k) kb_putp_##name(b, k)
#define kb_delp(name, b, k) kb_delp_##name(b, k)
#define kb_intervalp(name, b, k, l, u) kb_intervalp_##name(b, k, l, u)
#define kb_itr_first(name, b, i) kb_itr_first_##name(b, i)
#define kb_itr_get(name, b, k, i) kb_itr_get_##name(b, k, i)
#define kb_itr_next(name, b, i) kb_itr_next_##name(b, i)
#define kb_itr_key(type, itr) __KB_KEY(type, (itr)->p->x)[(itr)->p->i]
#define kb_itr_valid(itr) ((itr)->p >= (itr)->stack)
#define kb_size(b) ((b)->n_keys)
#define kb_generic_cmp(a, b) (((b) < (a)) - ((a) < (b)))
#define kb_str_cmp(a, b) strcmp(a, b)
/* The following is *DEPRECATED*!!! Use the iterator interface instead! */
typedef struct {
kbnode_t *x;
int i;
} __kbstack_t;
#define __kb_traverse(key_t, b, __func) do { \
int __kmax = 8; \
__kbstack_t *__kstack, *__kp; \
__kp = __kstack = (__kbstack_t*)calloc(__kmax, sizeof(__kbstack_t)); \
__kp->x = (b)->root; __kp->i = 0; \
for (;;) { \
while (__kp->x && __kp->i <= __kp->x->n) { \
if (__kp - __kstack == __kmax - 1) { \
__kmax <<= 1; \
__kstack = (__kbstack_t*)realloc(__kstack, __kmax * sizeof(__kbstack_t)); \
__kp = __kstack + (__kmax>>1) - 1; \
} \
(__kp+1)->i = 0; (__kp+1)->x = __kp->x->is_internal? __KB_PTR(b, __kp->x)[__kp->i] : 0; \
++__kp; \
} \
--__kp; \
if (__kp >= __kstack) { \
if (__kp->x && __kp->i < __kp->x->n) __func(&__KB_KEY(key_t, __kp->x)[__kp->i]); \
++__kp->i; \
} else break; \
} \
free(__kstack); \
} while (0)
#define __kb_get_first(key_t, b, ret) do { \
kbnode_t *__x = (b)->root; \
while (__KB_PTR(b, __x)[0] != 0) \
__x = __KB_PTR(b, __x)[0]; \
(ret) = __KB_KEY(key_t, __x)[0]; \
} while (0)
#endif

ext/klib/kdq.h 100644
@@ -0,0 +1,128 @@
#ifndef __AC_KDQ_H
#define __AC_KDQ_H
#include <stdlib.h>
#include <string.h>
#define __KDQ_TYPE(type) \
typedef struct { \
size_t front:58, bits:6, count, mask; \
type *a; \
} kdq_##type##_t;
#define kdq_t(type) kdq_##type##_t
#define kdq_size(q) ((q)->count)
#define kdq_first(q) ((q)->a[(q)->front])
#define kdq_last(q) ((q)->a[((q)->front + (q)->count - 1) & (q)->mask])
#define kdq_at(q, i) ((q)->a[((q)->front + (i)) & (q)->mask])
#define __KDQ_IMPL(type, SCOPE) \
SCOPE kdq_##type##_t *kdq_init_##type() \
{ \
kdq_##type##_t *q; \
q = (kdq_##type##_t*)calloc(1, sizeof(kdq_##type##_t)); \
q->bits = 2, q->mask = (1ULL<<q->bits) - 1; \
q->a = (type*)malloc((1<<q->bits) * sizeof(type)); \
return q; \
} \
SCOPE void kdq_destroy_##type(kdq_##type##_t *q) \
{ \
if (q == 0) return; \
free(q->a); free(q); \
} \
SCOPE int kdq_resize_##type(kdq_##type##_t *q, int new_bits) \
{ \
size_t new_size = 1ULL<<new_bits, old_size = 1ULL<<q->bits; \
if (new_size < q->count) { /* not big enough */ \
int i; \
for (i = 0; i < 64; ++i) \
if (1ULL<<i > q->count) break; \
new_bits = i, new_size = 1ULL<<new_bits; \
} \
if (new_bits == q->bits) return q->bits; /* unchanged */ \
if (new_bits > q->bits) q->a = (type*)realloc(q->a, (1ULL<<new_bits) * sizeof(type)); \
if (q->front + q->count <= old_size) { /* unwrapped */ \
if (q->front + q->count > new_size) /* only happens for shrinking */ \
memmove(q->a, q->a + new_size, (q->front + q->count - new_size) * sizeof(type)); \
} else { /* wrapped */ \
memmove(q->a + (new_size - (old_size - q->front)), q->a + q->front, (old_size - q->front) * sizeof(type)); \
q->front = new_size - (old_size - q->front); \
} \
	if (new_bits < q->bits) q->a = (type*)realloc(q->a, (1ULL<<new_bits) * sizeof(type)); /* shrink only after the data have been moved */ \
	q->bits = new_bits, q->mask = (1ULL<<q->bits) - 1; \
return q->bits; \
} \
SCOPE type *kdq_pushp_##type(kdq_##type##_t *q) \
{ \
if (q->count == 1ULL<<q->bits) kdq_resize_##type(q, q->bits + 1); \
return &q->a[((q->count++) + q->front) & (q)->mask]; \
} \
SCOPE void kdq_push_##type(kdq_##type##_t *q, type v) \
{ \
if (q->count == 1ULL<<q->bits) kdq_resize_##type(q, q->bits + 1); \
q->a[((q->count++) + q->front) & (q)->mask] = v; \
} \
SCOPE type *kdq_unshiftp_##type(kdq_##type##_t *q) \
{ \
if (q->count == 1ULL<<q->bits) kdq_resize_##type(q, q->bits + 1); \
++q->count; \
q->front = q->front? q->front - 1 : (1ULL<<q->bits) - 1; \
return &q->a[q->front]; \
} \
SCOPE void kdq_unshift_##type(kdq_##type##_t *q, type v) \
{ \
type *p; \
p = kdq_unshiftp_##type(q); \
*p = v; \
} \
SCOPE type *kdq_pop_##type(kdq_##type##_t *q) \
{ \
return q->count? &q->a[((--q->count) + q->front) & q->mask] : 0; \
} \
SCOPE type *kdq_shift_##type(kdq_##type##_t *q) \
{ \
type *d = 0; \
if (q->count == 0) return 0; \
d = &q->a[q->front++]; \
q->front &= q->mask; \
--q->count; \
return d; \
}
#define KDQ_INIT2(type, SCOPE) \
__KDQ_TYPE(type) \
__KDQ_IMPL(type, SCOPE)
#ifndef klib_unused
#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3)
#define klib_unused __attribute__ ((__unused__))
#else
#define klib_unused
#endif
#endif /* klib_unused */
#define KDQ_INIT(type) KDQ_INIT2(type, static inline klib_unused)
#define KDQ_DECLARE(type) \
__KDQ_TYPE(type) \
kdq_##type##_t *kdq_init_##type(); \
void kdq_destroy_##type(kdq_##type##_t *q); \
int kdq_resize_##type(kdq_##type##_t *q, int new_bits); \
type *kdq_pushp_##type(kdq_##type##_t *q); \
void kdq_push_##type(kdq_##type##_t *q, type v); \
type *kdq_unshiftp_##type(kdq_##type##_t *q); \
void kdq_unshift_##type(kdq_##type##_t *q, type v); \
type *kdq_pop_##type(kdq_##type##_t *q); \
type *kdq_shift_##type(kdq_##type##_t *q);
#define kdq_init(type) kdq_init_##type()
#define kdq_destroy(type, q) kdq_destroy_##type(q)
#define kdq_resize(type, q, new_bits) kdq_resize_##type(q, new_bits)
#define kdq_pushp(type, q) kdq_pushp_##type(q)
#define kdq_push(type, q, v) kdq_push_##type(q, v)
#define kdq_pop(type, q) kdq_pop_##type(q)
#define kdq_unshiftp(type, q) kdq_unshiftp_##type(q)
#define kdq_unshift(type, q, v) kdq_unshift_##type(q, v)
#define kdq_shift(type, q) kdq_shift_##type(q)
#endif

ext/klib/keigen.c 100644
@@ -0,0 +1,186 @@
#include <math.h>
#include <stdlib.h>
#include "keigen.h"
void ke_core_strq(int n, double *q, double *b, double *c)
{
int i, j, k, u, v;
double h, f, g, h2;
for (i = n - 1; i >= 1; i--) {
h = 0.0;
if (i > 1)
for (k = 0; k < i; k++) {
u = i * n + k;
h = h + q[u] * q[u];
}
if (h + 1.0 == 1.0) {
c[i] = 0.0;
if (i == 1)
c[i] = q[i * n + i - 1];
b[i] = 0.0;
} else {
c[i] = sqrt(h);
u = i * n + i - 1;
if (q[u] > 0.0)
c[i] = -c[i];
h = h - q[u] * c[i];
q[u] = q[u] - c[i];
f = 0.0;
for (j = 0; j < i; j++) {
q[j * n + i] = q[i * n + j] / h;
g = 0.0;
for (k = 0; k <= j; k++)
g = g + q[j * n + k] * q[i * n + k];
if (j + 1 < i)
for (k = j + 1; k <= i - 1; k++)
g = g + q[k * n + j] * q[i * n + k];
c[j] = g / h;
f = f + g * q[j * n + i];
}
h2 = f / (h + h);
for (j = 0; j < i; j++) {
f = q[i * n + j];
g = c[j] - h2 * f;
c[j] = g;
for (k = 0; k <= j; k++) {
u = j * n + k;
q[u] = q[u] - f * c[k] - g * q[i * n + k];
}
}
b[i] = h;
}
}
for (i = 0; i < n - 1; i++)
c[i] = c[i + 1];
c[n - 1] = 0.0;
b[0] = 0.0;
for (i = 0; i < n; i++) {
if (b[i] != 0.0 && i - 1 >= 0)
for (j = 0; j < i; j++) {
g = 0.0;
for (k = 0; k < i; k++)
g = g + q[i * n + k] * q[k * n + j];
for (k = 0; k < i; k++) {
u = k * n + j;
q[u] = q[u] - g * q[k * n + i];
}
}
u = i * n + i;
b[i] = q[u];
q[u] = 1.0;
if (i - 1 >= 0)
for (j = 0; j < i; j++) {
q[i * n + j] = 0.0;
q[j * n + i] = 0.0;
}
}
}
int ke_core_sstq(int n, double *b, double *c, double *q, int cal_ev, double eps, int l)
{
int i, j, k, m, it, u, v;
double d, f, h, g, p, r, e, s;
c[n - 1] = 0.0;
d = 0.0;
f = 0.0;
for (j = 0; j < n; j++) {
it = 0;
h = eps * (fabs(b[j]) + fabs(c[j]));
if (h > d)
d = h;
m = j;
while (m < n && fabs(c[m]) > d)
m = m + 1;
if (m != j) {
do {
if (it == l) return KE_EXCESS_ITER;
it = it + 1;
g = b[j];
p = (b[j + 1] - g) / (2.0 * c[j]);
r = sqrt(p * p + 1.0);
if (p >= 0.0)
b[j] = c[j] / (p + r);
else
b[j] = c[j] / (p - r);
h = g - b[j];
for (i = j + 1; i < n; i++)
b[i] = b[i] - h;
f = f + h;
p = b[m];
e = 1.0;
s = 0.0;
for (i = m - 1; i >= j; i--) {
g = e * c[i];
h = e * p;
if (fabs(p) >= fabs(c[i])) {
e = c[i] / p;
r = sqrt(e * e + 1.0);
c[i + 1] = s * p * r;
s = e / r;
e = 1.0 / r;
} else {
e = p / c[i];
r = sqrt(e * e + 1.0);
c[i + 1] = s * c[i] * r;
s = 1.0 / r;
e = e / r;
}
p = e * b[i] - s * g;
b[i + 1] = h + s * (e * g + s * b[i]);
if (cal_ev) {
for (k = 0; k < n; k++) {
u = k * n + i + 1;
v = u - 1;
h = q[u];
q[u] = s * q[v] + e * h;
q[v] = e * q[v] - s * h;
}
}
}
c[j] = s * p;
b[j] = e * p;
}
while (fabs(c[j]) > d);
}
b[j] = b[j] + f;
}
for (i = 0; i < n; i++) {
k = i;
p = b[i];
if (i + 1 < n) {
j = i + 1;
while (j < n && b[j] <= p) {
k = j;
p = b[j];
j = j + 1;
}
}
if (k != i) {
b[k] = b[i];
b[i] = p;
for (j = 0; j < n; j++) {
u = j * n + i;
v = j * n + k;
p = q[u];
q[u] = q[v];
q[v] = p;
}
}
}
return 0;
}
#define MALLOC(type, size) ((type*)malloc((size) * sizeof(type)))
int ke_eigen_sd(int n, double *a, double *v, int cal_ev, double eps, int max_iter)
{
double *c;
int r;
if (1.0 + eps <= 1.0) eps = 1e-7;
if (max_iter <= 0) max_iter = 50;
c = MALLOC(double, n);
ke_core_strq(n, a, v, c);
r = ke_core_sstq(n, v, c, a, cal_ev, eps, max_iter);
free(c);
return r;
}

ext/klib/keigen.h 100644
@@ -0,0 +1,53 @@
#ifndef KEIGEN_H
#define KEIGEN_H
#define KE_EXCESS_ITER (-1)
#ifdef __cplusplus
extern "C" {
#endif
/**
* Compute eigenvalues/vectors for a dense symmetric matrix
*
* @param n dimension
 * @param a input matrix, overwritten with the eigenvectors on return ([n*n]; in & out)
* @param v eigenvalues ([n]; out)
 * @param cal_ev compute eigenvectors or not (faster without vectors)
* @param eps precision (<=0 for default)
 * @param max_iter max iterations (<=0 for default)
*
* @return 0 on success; KE_EXCESS_ITER if too many iterations
*/
int ke_eigen_sd(int n, double *a, double *v, int cal_ev, double eps, int max_iter);
/**
* Transform a real symmetric matrix to a tridiagonal matrix
*
* @param n dimension
* @param q input matrix and transformation matrix ([n*n]; in & out)
* @param b diagonal ([n]; out)
* @param c subdiagonal ([n]; out)
*/
void ke_core_strq(int n, double *q, double *b, double *c);
/**
* Compute eigenvalues and eigenvectors for a tridiagonal matrix
*
* @param n dimension
* @param b diagonal and eigenvalues on return ([n]; in & out)
* @param c subdiagonal ([n]; in)
* @param q transformation matrix and eigenvectors on return ([n*n]; in & out)
* @param cal_ev compute eigenvectors or not (faster without vectors)
* @param eps precision
* @param l max iterations
*
* @return 0 on success; KE_EXCESS_ITER if too many iterations
*/
int ke_core_sstq(int n, double *b, double *c, double *q, int cal_ev, double eps, int l);
#ifdef __cplusplus
}
#endif
#endif

ext/klib/ketopt.h 100644
@@ -0,0 +1,120 @@
#ifndef KETOPT_H
#define KETOPT_H
#include <string.h> /* for strchr() and strncmp() */
#define ko_no_argument 0
#define ko_required_argument 1
#define ko_optional_argument 2
typedef struct {
int ind; /* equivalent to optind */
int opt; /* equivalent to optopt */
char *arg; /* equivalent to optarg */
int longidx; /* index of a long option; or -1 if short */
/* private variables not intended for external uses */
int i, pos, n_args;
} ketopt_t;
typedef struct {
char *name;
int has_arg;
int val;
} ko_longopt_t;
static ketopt_t KETOPT_INIT = { 1, 0, 0, -1, 1, 0, 0 };
static void ketopt_permute(char *argv[], int j, int n) /* move argv[j] over n elements to the left */
{
int k;
char *p = argv[j];
for (k = 0; k < n; ++k)
argv[j - k] = argv[j - k - 1];
argv[j - k] = p;
}
/**
* Parse command-line options and arguments
*
 * This function has a similar interface to GNU's getopt_long(). Each call
* parses one option and returns the option name. s->arg points to the option
* argument if present. The function returns -1 when all command-line arguments
* are parsed. In this case, s->ind is the index of the first non-option
* argument.
*
* @param s status; shall be initialized to KETOPT_INIT on the first call
* @param argc length of argv[]
* @param argv list of command-line arguments; argv[0] is ignored
* @param permute non-zero to move options ahead of non-option arguments
* @param ostr option string
* @param longopts long options
*
* @return ASCII for a short option; ko_longopt_t::val for a long option; -1 if
* argv[] is fully processed; '?' for an unknown option or an ambiguous
* long option; ':' if an option argument is missing
*/
static int ketopt(ketopt_t *s, int argc, char *argv[], int permute, const char *ostr, const ko_longopt_t *longopts)
{
int opt = -1, i0, j;
if (permute) {
while (s->i < argc && (argv[s->i][0] != '-' || argv[s->i][1] == '\0'))
++s->i, ++s->n_args;
}
s->arg = 0, s->longidx = -1, i0 = s->i;
if (s->i >= argc || argv[s->i][0] != '-' || argv[s->i][1] == '\0') {
s->ind = s->i - s->n_args;
return -1;
}
if (argv[s->i][0] == '-' && argv[s->i][1] == '-') { /* "--" or a long option */
if (argv[s->i][2] == '\0') { /* a bare "--" */
ketopt_permute(argv, s->i, s->n_args);
++s->i, s->ind = s->i - s->n_args;
return -1;
}
s->opt = 0, opt = '?', s->pos = -1;
if (longopts) { /* parse long options */
int k, n_exact = 0, n_partial = 0;
const ko_longopt_t *o = 0, *o_exact = 0, *o_partial = 0;
for (j = 2; argv[s->i][j] != '\0' && argv[s->i][j] != '='; ++j) {} /* find the end of the option name */
for (k = 0; longopts[k].name != 0; ++k)
if (strncmp(&argv[s->i][2], longopts[k].name, j - 2) == 0) {
if (longopts[k].name[j - 2] == 0) ++n_exact, o_exact = &longopts[k];
else ++n_partial, o_partial = &longopts[k];
}
if (n_exact > 1 || (n_exact == 0 && n_partial > 1)) return '?';
o = n_exact == 1? o_exact : n_partial == 1? o_partial : 0;
if (o) {
s->opt = opt = o->val, s->longidx = o - longopts;
if (argv[s->i][j] == '=') s->arg = &argv[s->i][j + 1];
if (o->has_arg == 1 && argv[s->i][j] == '\0') {
if (s->i < argc - 1) s->arg = argv[++s->i];
else opt = ':'; /* missing option argument */
}
}
}
} else { /* a short option */
const char *p;
if (s->pos == 0) s->pos = 1;
opt = s->opt = argv[s->i][s->pos++];
p = strchr((char*)ostr, opt);
if (p == 0) {
opt = '?'; /* unknown option */
} else if (p[1] == ':') {
if (argv[s->i][s->pos] == 0) {
if (s->i < argc - 1) s->arg = argv[++s->i];
else opt = ':'; /* missing option argument */
} else s->arg = &argv[s->i][s->pos];
s->pos = -1;
}
}
if (s->pos < 0 || argv[s->i][s->pos] == 0) {
++s->i, s->pos = 0;
if (s->n_args > 0) /* permute */
for (j = i0; j < s->i; ++j)
ketopt_permute(argv, j, s->n_args);
}
s->ind = s->i - s->n_args;
return opt;
}
#endif

ext/klib/kexpr.c 100644
@@ -0,0 +1,586 @@
#include <string.h>
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <stdio.h>
#include <ctype.h>
#include <math.h>
#include "kexpr.h"
/***************
* Definitions *
***************/
#define KEO_NULL 0
#define KEO_POS 1
#define KEO_NEG 2
#define KEO_BNOT 3
#define KEO_LNOT 4
#define KEO_POW 5
#define KEO_MUL 6
#define KEO_DIV 7
#define KEO_IDIV 8
#define KEO_MOD 9
#define KEO_ADD 10
#define KEO_SUB 11
#define KEO_LSH 12
#define KEO_RSH 13
#define KEO_LT 14
#define KEO_LE 15
#define KEO_GT 16
#define KEO_GE 17
#define KEO_EQ 18
#define KEO_NE 19
#define KEO_BAND 20
#define KEO_BXOR 21
#define KEO_BOR 22
#define KEO_LAND 23
#define KEO_LOR 24
#define KET_NULL 0
#define KET_VAL 1
#define KET_OP 2
#define KET_FUNC 3
#define KEF_NULL 0
#define KEF_REAL 1
struct ke1_s;
typedef struct ke1_s {
uint32_t ttype:16, vtype:10, assigned:1, user_func:5; // ttype: token type; vtype: value type
int32_t op:8, n_args:24; // op: operator, n_args: number of arguments
char *name; // variable name or function name
union {
void (*builtin)(struct ke1_s *a, struct ke1_s *b); // execution function
double (*real_func1)(double);
double (*real_func2)(double, double);
} f;
double r;
int64_t i;
char *s;
} ke1_t;
static int ke_op[25] = {
0,
1<<1|1, 1<<1|1, 1<<1|1, 1<<1|1, // unary operators
2<<1|1, // pow()
3<<1, 3<<1, 3<<1, 3<<1, // * / // %
4<<1, 4<<1, // + and -
5<<1, 5<<1, // << and >>
6<<1, 6<<1, 6<<1, 6<<1, // < > <= >=
7<<1, 7<<1, // == !=
8<<1, // &
9<<1, // ^
10<<1,// |
11<<1,// &&
12<<1 // ||
};
static const char *ke_opstr[] = {
"",
"+(1)", "-(1)", "~", "!",
"**",
"*", "/", "//", "%",
"+", "-",
"<<", ">>",
"<", "<=", ">", ">=",
"==", "!=",
"&",
"^",
"|",
"&&",
"||"
};
struct kexpr_s {
int n;
ke1_t *e;
};
/**********************
* Operator functions *
**********************/
#define KE_GEN_CMP(_type, _op) \
static void ke_op_##_type(ke1_t *p, ke1_t *q) { \
if (p->vtype == KEV_STR && q->vtype == KEV_STR) p->i = (strcmp(p->s, q->s) _op 0); \
else p->i = p->vtype == KEV_REAL || q->vtype == KEV_REAL? (p->r _op q->r) : (p->i _op q->i); \
p->r = (double)p->i; \
p->vtype = KEV_INT; \
}
KE_GEN_CMP(KEO_LT, <)
KE_GEN_CMP(KEO_LE, <=)
KE_GEN_CMP(KEO_GT, >)
KE_GEN_CMP(KEO_GE, >=)
KE_GEN_CMP(KEO_EQ, ==)
KE_GEN_CMP(KEO_NE, !=)
#define KE_GEN_BIN_INT(_type, _op) \
static void ke_op_##_type(ke1_t *p, ke1_t *q) { \
p->i _op q->i; p->r = (double)p->i; \
p->vtype = KEV_INT; \
}
KE_GEN_BIN_INT(KEO_BAND, &=)
KE_GEN_BIN_INT(KEO_BOR, |=)
KE_GEN_BIN_INT(KEO_BXOR, ^=)
KE_GEN_BIN_INT(KEO_LSH, <<=)
KE_GEN_BIN_INT(KEO_RSH, >>=)
KE_GEN_BIN_INT(KEO_MOD, %=)
KE_GEN_BIN_INT(KEO_IDIV, /=)
#define KE_GEN_BIN_BOTH(_type, _op) \
static void ke_op_##_type(ke1_t *p, ke1_t *q) { \
p->i _op q->i; p->r _op q->r; \
p->vtype = p->vtype == KEV_REAL || q->vtype == KEV_REAL? KEV_REAL : KEV_INT; \
}
KE_GEN_BIN_BOTH(KEO_ADD, +=)
KE_GEN_BIN_BOTH(KEO_SUB, -=)
KE_GEN_BIN_BOTH(KEO_MUL, *=)
static void ke_op_KEO_DIV(ke1_t *p, ke1_t *q) { p->r /= q->r, p->i = (int64_t)(p->r + .5); p->vtype = KEV_REAL; }
static void ke_op_KEO_LAND(ke1_t *p, ke1_t *q) { p->i = (p->i && q->i); p->r = p->i; p->vtype = KEV_INT; }
static void ke_op_KEO_LOR(ke1_t *p, ke1_t *q) { p->i = (p->i || q->i); p->r = p->i; p->vtype = KEV_INT; }
static void ke_op_KEO_POW(ke1_t *p, ke1_t *q) { p->r = pow(p->r, q->r), p->i = (int64_t)(p->r + .5); p->vtype = p->vtype == KEV_REAL || q->vtype == KEV_REAL? KEV_REAL : KEV_INT; }
static void ke_op_KEO_BNOT(ke1_t *p, ke1_t *q) { p->i = ~p->i; p->r = (double)p->i; p->vtype = KEV_INT; }
static void ke_op_KEO_LNOT(ke1_t *p, ke1_t *q) { p->i = !p->i; p->r = (double)p->i; p->vtype = KEV_INT; }
static void ke_op_KEO_POS(ke1_t *p, ke1_t *q) { } // do nothing
static void ke_op_KEO_NEG(ke1_t *p, ke1_t *q) { p->i = -p->i, p->r = -p->r; }
static void ke_func1_abs(ke1_t *p, ke1_t *q) { if (p->vtype == KEV_INT) p->i = llabs(p->i), p->r = (double)p->i; else p->r = fabs(p->r), p->i = (int64_t)(p->r + .5); } // llabs, not abs: p->i is int64_t
/**********
* Parser *
**********/
static inline char *mystrndup(const char *src, int n)
{
char *dst;
dst = (char*)calloc(n + 1, 1);
strncpy(dst, src, n);
return dst;
}
// parse a token except "(", ")" and ","
static ke1_t ke_read_token(char *p, char **r, int *err, int last_is_val) // it doesn't parse parentheses
{
char *q = p;
ke1_t e;
memset(&e, 0, sizeof(ke1_t));
if (isalpha(*p) || *p == '_') { // a variable or a function
for (; *p && (*p == '_' || isalnum(*p)); ++p);
if (*p == '(') e.ttype = KET_FUNC, e.n_args = 1;
else e.ttype = KET_VAL, e.vtype = KEV_REAL;
e.name = mystrndup(q, p - q);
e.i = 0, e.r = 0.;
*r = p;
} else if (isdigit(*p) || *p == '.') { // a number
long x;
double y;
char *pp;
e.ttype = KET_VAL;
y = strtod(q, &p);
x = strtol(q, &pp, 0); // FIXME: check int/double parsing errors
if (q == p && q == pp) { // parse error
*err |= KEE_NUM;
} else if (p > pp) { // has "." or "[eE]"; then it is a real number
e.vtype = KEV_REAL;
e.i = (int64_t)(y + .5), e.r = y;
*r = p;
} else {
e.vtype = KEV_INT;
e.i = x, e.r = y;
*r = pp;
}
} else if (*p == '"' || *p == '\'') { // a string value
int c = *p;
for (++p; *p && *p != c; ++p)
if (*p == '\\') ++p; // escaping
if (*p == c) {
e.ttype = KET_VAL, e.vtype = KEV_STR;
e.s = mystrndup(q + 1, p - q - 1);
*r = p + 1;
} else *err |= KEE_UNQU, *r = p;
} else { // an operator
e.ttype = KET_OP;
if (*p == '*' && p[1] == '*') e.op = KEO_POW, e.f.builtin = ke_op_KEO_POW, e.n_args = 2, *r = q + 2;
else if (*p == '*') e.op = KEO_MUL, e.f.builtin = ke_op_KEO_MUL, e.n_args = 2, *r = q + 1; // FIXME: NOT working for unary operators
else if (*p == '/' && p[1] == '/') e.op = KEO_IDIV, e.f.builtin = ke_op_KEO_IDIV, e.n_args = 2, *r = q + 2;
else if (*p == '/') e.op = KEO_DIV, e.f.builtin = ke_op_KEO_DIV, e.n_args = 2, *r = q + 1;
else if (*p == '%') e.op = KEO_MOD, e.f.builtin = ke_op_KEO_MOD, e.n_args = 2, *r = q + 1;
else if (*p == '+') {
if (last_is_val) e.op = KEO_ADD, e.f.builtin = ke_op_KEO_ADD, e.n_args = 2;
else e.op = KEO_POS, e.f.builtin = ke_op_KEO_POS, e.n_args = 1;
*r = q + 1;
} else if (*p == '-') {
if (last_is_val) e.op = KEO_SUB, e.f.builtin = ke_op_KEO_SUB, e.n_args = 2;
else e.op = KEO_NEG, e.f.builtin = ke_op_KEO_NEG, e.n_args = 1;
*r = q + 1;
} else if (*p == '=' && p[1] == '=') e.op = KEO_EQ, e.f.builtin = ke_op_KEO_EQ, e.n_args = 2, *r = q + 2;
else if (*p == '!' && p[1] == '=') e.op = KEO_NE, e.f.builtin = ke_op_KEO_NE, e.n_args = 2, *r = q + 2;
else if (*p == '<' && p[1] == '>') e.op = KEO_NE, e.f.builtin = ke_op_KEO_NE, e.n_args = 2, *r = q + 2;
else if (*p == '>' && p[1] == '=') e.op = KEO_GE, e.f.builtin = ke_op_KEO_GE, e.n_args = 2, *r = q + 2;
else if (*p == '<' && p[1] == '=') e.op = KEO_LE, e.f.builtin = ke_op_KEO_LE, e.n_args = 2, *r = q + 2;
else if (*p == '>' && p[1] == '>') e.op = KEO_RSH, e.f.builtin = ke_op_KEO_RSH, e.n_args = 2, *r = q + 2;
else if (*p == '<' && p[1] == '<') e.op = KEO_LSH, e.f.builtin = ke_op_KEO_LSH, e.n_args = 2, *r = q + 2;
else if (*p == '>') e.op = KEO_GT, e.f.builtin = ke_op_KEO_GT, e.n_args = 2, *r = q + 1;
else if (*p == '<') e.op = KEO_LT, e.f.builtin = ke_op_KEO_LT, e.n_args = 2, *r = q + 1;
else if (*p == '|' && p[1] == '|') e.op = KEO_LOR, e.f.builtin = ke_op_KEO_LOR, e.n_args = 2, *r = q + 2;
else if (*p == '&' && p[1] == '&') e.op = KEO_LAND, e.f.builtin = ke_op_KEO_LAND, e.n_args = 2, *r = q + 2;
else if (*p == '|') e.op = KEO_BOR, e.f.builtin = ke_op_KEO_BOR, e.n_args = 2, *r = q + 1;
else if (*p == '&') e.op = KEO_BAND, e.f.builtin = ke_op_KEO_BAND, e.n_args = 2, *r = q + 1;
else if (*p == '^') e.op = KEO_BXOR, e.f.builtin = ke_op_KEO_BXOR, e.n_args = 2, *r = q + 1;
else if (*p == '~') e.op = KEO_BNOT, e.f.builtin = ke_op_KEO_BNOT, e.n_args = 1, *r = q + 1;
else if (*p == '!') e.op = KEO_LNOT, e.f.builtin = ke_op_KEO_LNOT, e.n_args = 1, *r = q + 1;
else e.ttype = KET_NULL, *err |= KEE_UNOP;
}
return e;
}
static inline ke1_t *push_back(ke1_t **a, int *n, int *m)
{
if (*n == *m) {
int old_m = *m;
*m = *m? *m<<1 : 8;
*a = (ke1_t*)realloc(*a, *m * sizeof(ke1_t));
memset(*a + old_m, 0, (*m - old_m) * sizeof(ke1_t));
}
return &(*a)[(*n)++];
}
static ke1_t *ke_parse_core(const char *_s, int *_n, int *err)
{
char *s, *p, *q;
int n_out, m_out, n_op, m_op, last_is_val = 0;
ke1_t *out, *op, *t, *u;
*err = 0; *_n = 0;
s = strdup(_s); // make a copy
for (p = q = s; *p; ++p) // squeeze out spaces
if (!isspace(*p)) *q++ = *p;
*q++ = 0;
out = op = 0;
n_out = m_out = n_op = m_op = 0;
p = s;
while (*p) {
if (*p == '(') {
t = push_back(&op, &n_op, &m_op); // push to the operator stack
			t->op = -1, t->ttype = KET_NULL; // ->op < 0 for a left parenthesis
++p;
} else if (*p == ')') {
while (n_op > 0 && op[n_op-1].op >= 0) { // move operators to the output until we see a left parenthesis
u = push_back(&out, &n_out, &m_out);
*u = op[--n_op];
}
if (n_op == 0) { // error: extra right parenthesis
*err |= KEE_UNRP;
break;
} else --n_op; // pop out '('
if (n_op > 0 && op[n_op-1].ttype == KET_FUNC) { // the top of the operator stack is a function
u = push_back(&out, &n_out, &m_out); // move it to the output
*u = op[--n_op];
if (u->n_args == 1 && strcmp(u->name, "abs") == 0) u->f.builtin = ke_func1_abs;
}
++p;
} else if (*p == ',') { // function arguments separator
while (n_op > 0 && op[n_op-1].op >= 0) {
u = push_back(&out, &n_out, &m_out);
*u = op[--n_op];
}
if (n_op < 2 || op[n_op-2].ttype != KET_FUNC) { // we should at least see a function and a left parenthesis
*err |= KEE_FUNC;
break;
}
++op[n_op-2].n_args;
++p;
} else { // output-able token
ke1_t v;
v = ke_read_token(p, &p, err, last_is_val);
if (*err) break;
if (v.ttype == KET_VAL) {
u = push_back(&out, &n_out, &m_out);
*u = v;
last_is_val = 1;
} else if (v.ttype == KET_FUNC) {
t = push_back(&op, &n_op, &m_op);
*t = v;
last_is_val = 0;
} else if (v.ttype == KET_OP) {
int oi = ke_op[v.op];
while (n_op > 0 && op[n_op-1].ttype == KET_OP) {
int pre = ke_op[op[n_op-1].op]>>1;
if (((oi&1) && oi>>1 <= pre) || (!(oi&1) && oi>>1 < pre)) break;
u = push_back(&out, &n_out, &m_out);
*u = op[--n_op];
}
t = push_back(&op, &n_op, &m_op);
*t = v;
last_is_val = 0;
}
}
}
if (*err == 0) {
while (n_op > 0 && op[n_op-1].op >= 0) {
u = push_back(&out, &n_out, &m_out);
*u = op[--n_op];
}
if (n_op > 0) *err |= KEE_UNLP;
}
if (*err == 0) { // then check if the number of args is correct
int i, n;
for (i = n = 0; i < n_out; ++i) {
ke1_t *e = &out[i];
if (e->ttype == KET_VAL) ++n;
else n -= e->n_args - 1;
}
if (n != 1) *err |= KEE_ARG;
}
free(op); free(s);
if (*err) {
free(out);
return 0;
}
*_n = n_out;
return out;
}
kexpr_t *ke_parse(const char *_s, int *err)
{
int n;
ke1_t *e;
kexpr_t *ke;
e = ke_parse_core(_s, &n, err);
if (*err) return 0;
ke = (kexpr_t*)calloc(1, sizeof(kexpr_t));
ke->n = n, ke->e = e;
return ke;
}
int ke_eval(const kexpr_t *ke, int64_t *_i, double *_r, const char **_p, int *ret_type)
{
ke1_t *stack, *p, *q;
int i, top = 0, err = 0;
*_i = 0, *_r = 0., *ret_type = 0;
for (i = 0; i < ke->n; ++i) {
ke1_t *e = &ke->e[i];
if ((e->ttype == KET_OP || e->ttype == KET_FUNC) && e->f.builtin == 0) err |= KEE_UNFUNC;
else if (e->ttype == KET_VAL && e->name && e->assigned == 0) err |= KEE_UNVAR;
}
stack = (ke1_t*)malloc(ke->n * sizeof(ke1_t));
for (i = 0; i < ke->n; ++i) {
ke1_t *e = &ke->e[i];
if (e->ttype == KET_OP || e->ttype == KET_FUNC) {
if (e->n_args == 2 && e->f.builtin) {
q = &stack[--top], p = &stack[top-1];
if (e->user_func) {
if (e->user_func == KEF_REAL)
p->r = e->f.real_func2(p->r, q->r), p->i = (int64_t)(p->r + .5), p->vtype = KEV_REAL;
} else e->f.builtin(p, q);
} else if (e->n_args == 1 && e->f.builtin) {
p = &stack[top-1];
if (e->user_func) {
if (e->user_func == KEF_REAL)
p->r = e->f.real_func1(p->r), p->i = (int64_t)(p->r + .5), p->vtype = KEV_REAL;
} else e->f.builtin(&stack[top-1], 0);
} else top -= e->n_args - 1;
} else stack[top++] = *e;
}
*ret_type = stack->vtype;
*_i = stack->i, *_r = stack->r, *_p = stack->s;
free(stack);
return err;
}
int64_t ke_eval_int(const kexpr_t *ke, int *err)
{
int int_ret;
int64_t i;
double r;
const char *s;
*err = ke_eval(ke, &i, &r, &s, &int_ret);
return i;
}
double ke_eval_real(const kexpr_t *ke, int *err)
{
int int_ret;
int64_t i;
double r;
const char *s;
*err = ke_eval(ke, &i, &r, &s, &int_ret);
return r;
}
void ke_destroy(kexpr_t *ke)
{
int i;
if (ke == 0) return;
for (i = 0; i < ke->n; ++i) {
free(ke->e[i].name);
free(ke->e[i].s);
}
free(ke->e); free(ke);
}
int ke_set_int(kexpr_t *ke, const char *var, int64_t y)
{
int i, n = 0;
double yy = (double)y;
for (i = 0; i < ke->n; ++i) {
ke1_t *e = &ke->e[i];
if (e->ttype == KET_VAL && e->name && strcmp(e->name, var) == 0)
e->i = y, e->r = yy, e->vtype = KEV_INT, e->assigned = 1, ++n;
}
return n;
}
int ke_set_real(kexpr_t *ke, const char *var, double x)
{
int i, n = 0;
int64_t xx = (int64_t)(x + .5);
for (i = 0; i < ke->n; ++i) {
ke1_t *e = &ke->e[i];
if (e->ttype == KET_VAL && e->name && strcmp(e->name, var) == 0)
e->r = x, e->i = xx, e->vtype = KEV_REAL, e->assigned = 1, ++n;
}
return n;
}
int ke_set_str(kexpr_t *ke, const char *var, const char *x)
{
int i, n = 0;
for (i = 0; i < ke->n; ++i) {
ke1_t *e = &ke->e[i];
if (e->ttype == KET_VAL && e->name && strcmp(e->name, var) == 0) {
if (e->vtype == KEV_STR) free(e->s);
e->s = strdup(x);
e->i = 0, e->r = 0., e->assigned = 1;
e->vtype = KEV_STR;
++n;
}
}
return n;
}
int ke_set_real_func1(kexpr_t *ke, const char *name, double (*func)(double))
{
int i, n = 0;
for (i = 0; i < ke->n; ++i) {
ke1_t *e = &ke->e[i];
if (e->ttype == KET_FUNC && e->n_args == 1 && strcmp(e->name, name) == 0)
e->f.real_func1 = func, e->user_func = KEF_REAL, ++n;
}
return n;
}
int ke_set_real_func2(kexpr_t *ke, const char *name, double (*func)(double, double))
{
int i, n = 0;
for (i = 0; i < ke->n; ++i) {
ke1_t *e = &ke->e[i];
if (e->ttype == KET_FUNC && e->n_args == 2 && strcmp(e->name, name) == 0)
e->f.real_func2 = func, e->user_func = KEF_REAL, ++n;
}
return n;
}
int ke_set_default_func(kexpr_t *ke)
{
int n = 0;
n += ke_set_real_func1(ke, "exp", exp);
n += ke_set_real_func1(ke, "log", log);
n += ke_set_real_func1(ke, "log10", log10);
n += ke_set_real_func1(ke, "sqrt", sqrt);
n += ke_set_real_func1(ke, "sin", sin);
n += ke_set_real_func1(ke, "cos", cos);
n += ke_set_real_func1(ke, "tan", tan);
n += ke_set_real_func2(ke, "pow", pow);
return n;
}
void ke_unset(kexpr_t *ke)
{
int i;
for (i = 0; i < ke->n; ++i) {
ke1_t *e = &ke->e[i];
if (e->ttype == KET_VAL && e->name) e->assigned = 0;
}
}
void ke_print(const kexpr_t *ke)
{
int i;
if (ke == 0) return;
for (i = 0; i < ke->n; ++i) {
const ke1_t *u = &ke->e[i];
if (i) putchar(' ');
if (u->ttype == KET_VAL) {
if (u->name) printf("%s", u->name);
else if (u->vtype == KEV_REAL) printf("%g", u->r);
else if (u->vtype == KEV_INT) printf("%lld", (long long)u->i);
else if (u->vtype == KEV_STR) printf("\"%s\"", u->s);
} else if (u->ttype == KET_OP) {
printf("%s", ke_opstr[u->op]);
} else if (u->ttype == KET_FUNC) {
printf("%s(%d)", u->name, u->n_args);
}
}
putchar('\n');
}
#ifdef KE_MAIN
#include <unistd.h>
int main(int argc, char *argv[])
{
int c, err, to_print = 0, is_int = 0;
kexpr_t *ke;
while ((c = getopt(argc, argv, "pi")) >= 0) {
if (c == 'p') to_print = 1;
else if (c == 'i') is_int = 1;
}
if (optind == argc) {
fprintf(stderr, "Usage: %s [-pi] <expr>\n", argv[0]);
return 1;
}
ke = ke_parse(argv[optind], &err);
ke_set_default_func(ke);
if (err) {
fprintf(stderr, "Parse error: 0x%x\n", err);
return 1;
}
if (!to_print) {
int64_t vi;
double vr;
const char *vs;
int i, ret_type;
if (argc - optind > 1) {
for (i = optind + 1; i < argc; ++i) {
char *p, *s = argv[i];
for (p = s; *p && *p != '='; ++p);
if (*p == 0) continue; // not an assignment
*p = 0;
ke_set_real(ke, s, strtod(p+1, &p));
}
}
err |= ke_eval(ke, &vi, &vr, &vs, &ret_type);
if (err & KEE_UNFUNC)
fprintf(stderr, "Evaluation warning: an undefined function returns the first function argument.\n");
if (err & KEE_UNVAR) fprintf(stderr, "Evaluation warning: unassigned variables are set to 0.\n");
if (ret_type == KEV_INT) printf("%lld\n", (long long)vi);
else if (ret_type == KEV_REAL) printf("%g\n", vr);
else printf("%s\n", vs);
} else ke_print(ke);
ke_destroy(ke);
return 0;
}
#endif

ext/klib/kexpr.h 100644

@ -0,0 +1,68 @@
#ifndef KEXPR_H
#define KEXPR_H
#include <stdint.h>
struct kexpr_s;
typedef struct kexpr_s kexpr_t;
// Parse errors
#define KEE_UNQU 0x01 // unmatched quotation marks
#define KEE_UNLP 0x02 // unmatched left parentheses
#define KEE_UNRP 0x04 // unmatched right parentheses
#define KEE_UNOP 0x08 // unknown operators
#define KEE_FUNC 0x10 // wrong function syntax
#define KEE_ARG 0x20 // wrong number of function arguments
#define KEE_NUM 0x40 // fail to parse a number
// Evaluation errors
#define KEE_UNFUNC 0x40 // undefined function
#define KEE_UNVAR 0x80 // unassigned variable
// Return type
#define KEV_REAL 1
#define KEV_INT 2
#define KEV_STR 3
#ifdef __cplusplus
extern "C" {
#endif
// parse an expression and return errors in $err
kexpr_t *ke_parse(const char *_s, int *err);
// free memory allocated during parsing
void ke_destroy(kexpr_t *ke);
// set a variable to integer value and return the occurrence of the variable
int ke_set_int(kexpr_t *ke, const char *var, int64_t x);
// set a variable to real value and return the occurrence of the variable
int ke_set_real(kexpr_t *ke, const char *var, double x);
// set a variable to string value and return the occurrence of the variable
int ke_set_str(kexpr_t *ke, const char *var, const char *x);
// set a user-defined function
int ke_set_real_func1(kexpr_t *ke, const char *name, double (*func)(double));
int ke_set_real_func2(kexpr_t *ke, const char *name, double (*func)(double, double));
// set default math functions
int ke_set_default_func(kexpr_t *ke);
// mark all variable as unset
void ke_unset(kexpr_t *e);
// evaluate expression; return error code; final value is returned via pointers
int ke_eval(const kexpr_t *ke, int64_t *_i, double *_r, const char **_s, int *ret_type);
int64_t ke_eval_int(const kexpr_t *ke, int *err);
double ke_eval_real(const kexpr_t *ke, int *err);
// print the expression in Reverse Polish notation (RPN)
void ke_print(const kexpr_t *ke);
#ifdef __cplusplus
}
#endif
#endif

ext/klib/kgraph.h 100644

@ -0,0 +1,79 @@
#ifndef AC_KGRAPH_H
#define AC_KGRAPH_H
#include <stdint.h>
#include <stdlib.h>
#include "khash.h"
#include "kbtree.h"
typedef unsigned kgint_t;
#define kgraph_t(name) kh_##name##_t
#define __KG_BASIC(name, SCOPE, vertex_t, arc_t, ehn) \
SCOPE kgraph_t(name) *kg_init_##name(void) { return kh_init(name); } \
SCOPE void kg_destroy_##name(kgraph_t(name) *g) { \
khint_t k; \
if (g == 0) return; \
for (k = kh_begin(g); k != kh_end(g); ++k) \
if (kh_exist(g, k)) kh_destroy(ehn, kh_val(g, k)._arc); \
kh_destroy(name, g); \
} \
SCOPE vertex_t *kg_get_v_##name(kgraph_t(name) *g, kgint_t v) { \
khint_t k = kh_get(name, g, v); \
return k == kh_end(g)? 0 : &kh_val(g, k); \
} \
SCOPE vertex_t *kg_put_v_##name(kgraph_t(name) *g, kgint_t v, int *absent) { \
khint_t k; \
k = kh_put(name, g, v, absent); \
if (*absent) kh_val(g, k)._arc = kh_init(ehn); \
return &kh_val(g, k); \
} \
SCOPE void kg_put_a_##name(kgraph_t(name) *g, kgint_t vbeg, kgint_t vend, int dir, arc_t **pb, arc_t **pe) { \
vertex_t *p; \
khint_t k; \
int absent; \
p = kg_put_v_##name(g, vbeg, &absent); \
k = kh_put(ehn, p->_arc, vend<<2|dir, &absent); \
*pb = &kh_val(p->_arc, k); \
p = kg_put_v_##name(g, vend, &absent); \
k = kh_put(ehn, p->_arc, vbeg<<2|(~dir&3), &absent); \
*pe = &kh_val(p->_arc, k); \
} \
SCOPE vertex_t *kg_del_v_##name(kgraph_t(name) *g, kgint_t v) { \
khint_t k, k0, k2, k3; \
khash_t(ehn) *h; \
k0 = k = kh_get(name, g, v); \
if (k == kh_end(g)) return 0; /* not present in the graph */ \
h = kh_val(g, k)._arc; \
for (k = kh_begin(h); k != kh_end(h); ++k) /* remove v from its neighbors */ \
if (kh_exist(h, k)) { \
k2 = kh_get(name, g, kh_key(h, k)>>2); \
/* assert(k2 != kh_end(g)); */ \
k3 = kh_get(ehn, kh_val(g, k2)._arc, v<<2|(~kh_key(h, k)&3)); \
/* assert(k3 != kh_end(kh_val(g, k2)._arc)); */ \
kh_del(ehn, kh_val(g, k2)._arc, k3); \
} \
kh_destroy(ehn, h); \
kh_del(name, g, k0); \
return &kh_val(g, k0); \
}
#define KGRAPH_PRINT(name, SCOPE) \
SCOPE void kg_print_##name(kgraph_t(name) *g) { \
khint_t k, k2; \
for (k = kh_begin(g); k != kh_end(g); ++k) \
if (kh_exist(g, k)) { \
printf("v %u\n", kh_key(g, k)); \
for (k2 = kh_begin(kh_val(g, k)._arc); k2 != kh_end(kh_val(g, k)._arc); ++k2) \
if (kh_exist(kh_val(g, k)._arc, k2) && kh_key(g, k) < kh_key(kh_val(g, k)._arc, k2)>>2) \
printf("a %u%c%c%u\n", kh_key(g, k), "><"[kh_key(kh_val(g, k)._arc, k2)>>1&1], \
"><"[kh_key(kh_val(g, k)._arc, k2)&1], kh_key(kh_val(g, k)._arc, k2)>>2); \
} \
}
#define KGRAPH_INIT(name, SCOPE, vertex_t, arc_t, ehn) \
KHASH_INIT2(name, SCOPE, kgint_t, vertex_t, 1, kh_int_hash_func, kh_int_hash_equal) \
__KG_BASIC(name, SCOPE, vertex_t, arc_t, ehn)
#endif

ext/klib/khash.h 100644

@ -0,0 +1,627 @@
/* The MIT License
Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor@live.co.uk>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
/*
An example:
#include "khash.h"
KHASH_MAP_INIT_INT(32, char)
int main() {
int ret, is_missing;
khiter_t k;
khash_t(32) *h = kh_init(32);
k = kh_put(32, h, 5, &ret);
kh_value(h, k) = 10;
k = kh_get(32, h, 10);
is_missing = (k == kh_end(h));
k = kh_get(32, h, 5);
kh_del(32, h, k);
for (k = kh_begin(h); k != kh_end(h); ++k)
if (kh_exist(h, k)) kh_value(h, k) = 1;
kh_destroy(32, h);
return 0;
}
*/
/*
2013-05-02 (0.2.8):
	* Use quadratic probing. When the capacity is a power of 2, the stepping
	  function i*(i+1)/2 is guaranteed to traverse each bucket. It is better
	  than double hashing on cache performance and is more robust than linear
	  probing. In theory, double hashing should be more robust than quadratic
	  probing. However, my implementation is probably not for large hash
	  tables, because the second hash function is closely tied to the first
	  hash function, which reduces the effectiveness of double hashing.
Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php
2011-12-29 (0.2.7):
* Minor code clean up; no actual effect.
2011-09-16 (0.2.6):
* The capacity is a power of 2. This seems to dramatically improve the
speed for simple keys. Thank Zilong Tan for the suggestion. Reference:
- http://code.google.com/p/ulib/
- http://nothings.org/computer/judy/
* Allow to optionally use linear probing which usually has better
performance for random input. Double hashing is still the default as it
is more robust to certain non-random input.
* Added Wang's integer hash function (not used by default). This hash
function is more robust to certain non-random input.
2011-02-14 (0.2.5):
* Allow to declare global functions.
2009-09-26 (0.2.4):
* Improve portability
2008-09-19 (0.2.3):
* Corrected the example
* Improved interfaces
2008-09-11 (0.2.2):
* Improved speed a little in kh_put()
2008-09-10 (0.2.1):
* Added kh_clear()
* Fixed a compiling error
2008-09-02 (0.2.0):
* Changed to token concatenation which increases flexibility.
2008-08-31 (0.1.2):
* Fixed a bug in kh_get(), which has not been tested previously.
2008-08-31 (0.1.1):
* Added destructor
*/
#ifndef __AC_KHASH_H
#define __AC_KHASH_H
/*!
@header
Generic hash table library.
*/
#define AC_VERSION_KHASH_H "0.2.8"
#include <stdlib.h>
#include <string.h>
#include <limits.h>
/* compiler specific configuration */
#if UINT_MAX == 0xffffffffu
typedef unsigned int khint32_t;
#elif ULONG_MAX == 0xffffffffu
typedef unsigned long khint32_t;
#endif
#if ULONG_MAX == ULLONG_MAX
typedef unsigned long khint64_t;
#else
typedef unsigned long long khint64_t;
#endif
#ifndef kh_inline
#ifdef _MSC_VER
#define kh_inline __inline
#else
#define kh_inline inline
#endif
#endif /* kh_inline */
#ifndef klib_unused
#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3)
#define klib_unused __attribute__ ((__unused__))
#else
#define klib_unused
#endif
#endif /* klib_unused */
typedef khint32_t khint_t;
typedef khint_t khiter_t;
#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)
#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1)
#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3)
#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1)))
#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1)))
#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1)))
#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1))
#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4)
#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif
#ifndef kcalloc
#define kcalloc(N,Z) calloc(N,Z)
#endif
#ifndef kmalloc
#define kmalloc(Z) malloc(Z)
#endif
#ifndef krealloc
#define krealloc(P,Z) realloc(P,Z)
#endif
#ifndef kfree
#define kfree(P) free(P)
#endif
static const double __ac_HASH_UPPER = 0.77;
#define __KHASH_TYPE(name, khkey_t, khval_t) \
typedef struct kh_##name##_s { \
khint_t n_buckets, size, n_occupied, upper_bound; \
khint32_t *flags; \
khkey_t *keys; \
khval_t *vals; \
} kh_##name##_t;
#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \
extern kh_##name##_t *kh_init_##name(void); \
extern void kh_destroy_##name(kh_##name##_t *h); \
extern void kh_clear_##name(kh_##name##_t *h); \
extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \
extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \
extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
extern void kh_del_##name(kh_##name##_t *h, khint_t x);
#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
SCOPE kh_##name##_t *kh_init_##name(void) { \
return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \
} \
SCOPE void kh_destroy_##name(kh_##name##_t *h) \
{ \
if (h) { \
kfree((void *)h->keys); kfree(h->flags); \
kfree((void *)h->vals); \
kfree(h); \
} \
} \
SCOPE void kh_clear_##name(kh_##name##_t *h) \
{ \
if (h && h->flags) { \
memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \
h->size = h->n_occupied = 0; \
} \
} \
SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
{ \
if (h->n_buckets) { \
khint_t k, i, last, mask, step = 0; \
mask = h->n_buckets - 1; \
k = __hash_func(key); i = k & mask; \
last = i; \
while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
i = (i + (++step)) & mask; \
if (i == last) return h->n_buckets; \
} \
return __ac_iseither(h->flags, i)? h->n_buckets : i; \
} else return 0; \
} \
SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
{ /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \
khint32_t *new_flags = 0; \
khint_t j = 1; \
{ \
kroundup32(new_n_buckets); \
if (new_n_buckets < 4) new_n_buckets = 4; \
if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \
else { /* hash table size to be changed (shrink or expand); rehash */ \
new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
if (!new_flags) return -1; \
memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
if (h->n_buckets < new_n_buckets) { /* expand */ \
khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
if (!new_keys) { kfree(new_flags); return -1; } \
h->keys = new_keys; \
if (kh_is_map) { \
khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
if (!new_vals) { kfree(new_flags); return -1; } \
h->vals = new_vals; \
} \
} /* otherwise shrink */ \
} \
} \
if (j) { /* rehashing is needed */ \
for (j = 0; j != h->n_buckets; ++j) { \
if (__ac_iseither(h->flags, j) == 0) { \
khkey_t key = h->keys[j]; \
khval_t val; \
khint_t new_mask; \
new_mask = new_n_buckets - 1; \
if (kh_is_map) val = h->vals[j]; \
__ac_set_isdel_true(h->flags, j); \
while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
khint_t k, i, step = 0; \
k = __hash_func(key); \
i = k & new_mask; \
while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \
__ac_set_isempty_false(new_flags, i); \
if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
{ khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
__ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \
} else { /* write the element and jump out of the loop */ \
h->keys[i] = key; \
if (kh_is_map) h->vals[i] = val; \
break; \
} \
} \
} \
} \
if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
} \
kfree(h->flags); /* free the working space */ \
h->flags = new_flags; \
h->n_buckets = new_n_buckets; \
h->n_occupied = h->size; \
h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
} \
return 0; \
} \
SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
{ \
khint_t x; \
if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \
if (h->n_buckets > (h->size<<1)) { \
if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \
*ret = -1; return h->n_buckets; \
} \
} else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \
*ret = -1; return h->n_buckets; \
} \
} /* TODO: to implement automatically shrinking; resize() already support shrinking */ \
{ \
khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \
x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \
else { \
last = i; \
while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
if (__ac_isdel(h->flags, i)) site = i; \
i = (i + (++step)) & mask; \
if (i == last) { x = site; break; } \
} \
if (x == h->n_buckets) { \
if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
else x = i; \
} \
} \
} \
if (__ac_isempty(h->flags, x)) { /* not present at all */ \
h->keys[x] = key; \
__ac_set_isboth_false(h->flags, x); \
++h->size; ++h->n_occupied; \
*ret = 1; \
} else if (__ac_isdel(h->flags, x)) { /* deleted */ \
h->keys[x] = key; \
__ac_set_isboth_false(h->flags, x); \
++h->size; \
*ret = 2; \
} else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
return x; \
} \
SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \
{ \
if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \
__ac_set_isdel_true(h->flags, x); \
--h->size; \
} \
}
#define KHASH_DECLARE(name, khkey_t, khval_t) \
__KHASH_TYPE(name, khkey_t, khval_t) \
__KHASH_PROTOTYPES(name, khkey_t, khval_t)
#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
__KHASH_TYPE(name, khkey_t, khval_t) \
__KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
KHASH_INIT2(name, static kh_inline klib_unused, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
/* --- BEGIN OF HASH FUNCTIONS --- */
/*! @function
@abstract Integer hash function
@param key The integer [khint32_t]
@return The hash value [khint_t]
*/
#define kh_int_hash_func(key) (khint32_t)(key)
/*! @function
@abstract Integer comparison function
*/
#define kh_int_hash_equal(a, b) ((a) == (b))
/*! @function
@abstract 64-bit integer hash function
@param key The integer [khint64_t]
@return The hash value [khint_t]
*/
#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11)
/*! @function
@abstract 64-bit integer comparison function
*/
#define kh_int64_hash_equal(a, b) ((a) == (b))
/*! @function
@abstract const char* hash function
@param s Pointer to a null terminated string
@return The hash value
*/
static kh_inline khint_t __ac_X31_hash_string(const char *s)
{
khint_t h = (khint_t)*s;
if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s;
return h;
}
/*! @function
@abstract Another interface to const char* hash function
@param key Pointer to a null terminated string [const char*]
@return The hash value [khint_t]
*/
#define kh_str_hash_func(key) __ac_X31_hash_string(key)
/*! @function
@abstract Const char* comparison function
*/
#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
static kh_inline khint_t __ac_Wang_hash(khint_t key)
{
key += ~(key << 15);
key ^= (key >> 10);
key += (key << 3);
key ^= (key >> 6);
key += ~(key << 11);
key ^= (key >> 16);
return key;
}
#define kh_int_hash_func2(key) __ac_Wang_hash((khint_t)key)
/* --- END OF HASH FUNCTIONS --- */
/* Other convenient macros... */
/*!
@abstract Type of the hash table.
@param name Name of the hash table [symbol]
*/
#define khash_t(name) kh_##name##_t
/*! @function
@abstract Initiate a hash table.
@param name Name of the hash table [symbol]
@return Pointer to the hash table [khash_t(name)*]
*/
#define kh_init(name) kh_init_##name()
/*! @function
@abstract Destroy a hash table.
@param name Name of the hash table [symbol]
@param h Pointer to the hash table [khash_t(name)*]
*/
#define kh_destroy(name, h) kh_destroy_##name(h)
/*! @function
@abstract Reset a hash table without deallocating memory.
@param name Name of the hash table [symbol]
@param h Pointer to the hash table [khash_t(name)*]
*/
#define kh_clear(name, h) kh_clear_##name(h)
/*! @function
@abstract Resize a hash table.
@param name Name of the hash table [symbol]
@param h Pointer to the hash table [khash_t(name)*]
@param s New size [khint_t]
*/
#define kh_resize(name, h, s) kh_resize_##name(h, s)
/*! @function
@abstract Insert a key to the hash table.
@param name Name of the hash table [symbol]
@param h Pointer to the hash table [khash_t(name)*]
@param k Key [type of keys]
@param r Extra return code: -1 if the operation failed;
0 if the key is present in the hash table;
1 if the bucket is empty (never used); 2 if the element in
the bucket has been deleted [int*]
@return Iterator to the inserted element [khint_t]
*/
#define kh_put(name, h, k, r) kh_put_##name(h, k, r)
/*! @function
@abstract Retrieve a key from the hash table.
@param name Name of the hash table [symbol]
@param h Pointer to the hash table [khash_t(name)*]
@param k Key [type of keys]
@return Iterator to the found element, or kh_end(h) if the element is absent [khint_t]
*/
#define kh_get(name, h, k) kh_get_##name(h, k)
/*! @function
@abstract Remove a key from the hash table.
@param name Name of the hash table [symbol]
@param h Pointer to the hash table [khash_t(name)*]
@param k Iterator to the element to be deleted [khint_t]
*/
#define kh_del(name, h, k) kh_del_##name(h, k)
/*! @function
@abstract Test whether a bucket contains data.
@param h Pointer to the hash table [khash_t(name)*]
@param x Iterator to the bucket [khint_t]
@return 1 if containing data; 0 otherwise [int]
*/
#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))
/*! @function
@abstract Get key given an iterator
@param h Pointer to the hash table [khash_t(name)*]
@param x Iterator to the bucket [khint_t]
@return Key [type of keys]
*/
#define kh_key(h, x) ((h)->keys[x])
/*! @function
@abstract Get value given an iterator
@param h Pointer to the hash table [khash_t(name)*]
@param x Iterator to the bucket [khint_t]
@return Value [type of values]
@discussion For hash sets, calling this results in segfault.
*/
#define kh_val(h, x) ((h)->vals[x])
/*! @function
@abstract Alias of kh_val()
*/
#define kh_value(h, x) ((h)->vals[x])
/*! @function
@abstract Get the start iterator
@param h Pointer to the hash table [khash_t(name)*]
@return The start iterator [khint_t]
*/
#define kh_begin(h) (khint_t)(0)
/*! @function
@abstract Get the end iterator
@param h Pointer to the hash table [khash_t(name)*]
@return The end iterator [khint_t]
*/
#define kh_end(h) ((h)->n_buckets)
/*! @function
@abstract Get the number of elements in the hash table
@param h Pointer to the hash table [khash_t(name)*]
@return Number of elements in the hash table [khint_t]
*/
#define kh_size(h) ((h)->size)
/*! @function
@abstract Get the number of buckets in the hash table
@param h Pointer to the hash table [khash_t(name)*]
@return Number of buckets in the hash table [khint_t]
*/
#define kh_n_buckets(h) ((h)->n_buckets)
/*! @function
@abstract Iterate over the entries in the hash table
@param h Pointer to the hash table [khash_t(name)*]
@param kvar Variable to which key will be assigned
@param vvar Variable to which value will be assigned
@param code Block of code to execute
*/
#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \
for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
if (!kh_exist(h,__i)) continue; \
(kvar) = kh_key(h,__i); \
(vvar) = kh_val(h,__i); \
code; \
} }
/*! @function
@abstract Iterate over the values in the hash table
@param h Pointer to the hash table [khash_t(name)*]
@param vvar Variable to which value will be assigned
@param code Block of code to execute
*/
#define kh_foreach_value(h, vvar, code) { khint_t __i; \
for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
if (!kh_exist(h,__i)) continue; \
(vvar) = kh_val(h,__i); \
code; \
} }
/* More convenient interfaces */
/*! @function
@abstract Instantiate a hash set containing integer keys
@param name Name of the hash table [symbol]
*/
#define KHASH_SET_INIT_INT(name) \
KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
/*! @function
@abstract Instantiate a hash map containing integer keys
@param name Name of the hash table [symbol]
@param khval_t Type of values [type]
*/
#define KHASH_MAP_INIT_INT(name, khval_t) \
KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
/*! @function
@abstract Instantiate a hash set containing 64-bit integer keys
@param name Name of the hash table [symbol]
*/
#define KHASH_SET_INIT_INT64(name) \
KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
/*! @function
@abstract Instantiate a hash map containing 64-bit integer keys
@param name Name of the hash table [symbol]
@param khval_t Type of values [type]
*/
#define KHASH_MAP_INIT_INT64(name, khval_t) \
KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
typedef const char *kh_cstr_t;
/*! @function
@abstract Instantiate a hash map containing const char* keys
@param name Name of the hash table [symbol]
*/
#define KHASH_SET_INIT_STR(name) \
KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
/*! @function
@abstract Instantiate a hash map containing const char* keys
@param name Name of the hash table [symbol]
@param khval_t Type of values [type]
*/
#define KHASH_MAP_INIT_STR(name, khval_t) \
KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
#endif /* __AC_KHASH_H */

ext/klib/khashl.h 100644

@ -0,0 +1,446 @@
/* The MIT License
Copyright (c) 2019-2024 by Attractive Chaos <attractor@live.co.uk>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
#ifndef __AC_KHASHL_H
#define __AC_KHASHL_H
#define AC_VERSION_KHASHL_H "r20"
#include <stdlib.h>
#include <string.h>
#include <limits.h>
/************************************
* Compiler specific configurations *
************************************/
#if UINT_MAX == 0xffffffffu
typedef unsigned int khint32_t;
#elif ULONG_MAX == 0xffffffffu
typedef unsigned long khint32_t;
#endif
#if ULONG_MAX == ULLONG_MAX
typedef unsigned long khint64_t;
#else
typedef unsigned long long khint64_t;
#endif
#ifndef kh_inline
#ifdef _MSC_VER
#define kh_inline __inline
#else
#define kh_inline inline
#endif
#endif /* kh_inline */
#ifndef klib_unused
#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3)
#define klib_unused __attribute__ ((__unused__))
#else
#define klib_unused
#endif
#endif /* klib_unused */
#define KH_LOCAL static kh_inline klib_unused
typedef khint32_t khint_t;
typedef const char *kh_cstr_t;
/***********************
* Configurable macros *
***********************/
#ifndef kh_max_count
#define kh_max_count(cap) (((cap)>>1) + ((cap)>>2)) /* default load factor: 75% */
#endif
#ifndef kh_packed
#define kh_packed __attribute__ ((__packed__))
#endif
#ifndef kcalloc
#define kcalloc(N,Z) calloc(N,Z)
#endif
#ifndef kmalloc
#define kmalloc(Z) malloc(Z)
#endif
#ifndef krealloc
#define krealloc(P,Z) realloc(P,Z)
#endif
#ifndef kfree
#define kfree(P) free(P)
#endif
/****************************
* Simple private functions *
****************************/
#define __kh_used(flag, i) (flag[i>>5] >> (i&0x1fU) & 1U)
#define __kh_set_used(flag, i) (flag[i>>5] |= 1U<<(i&0x1fU))
#define __kh_set_unused(flag, i) (flag[i>>5] &= ~(1U<<(i&0x1fU)))
#define __kh_fsize(m) ((m) < 32? 1 : (m)>>5)
static kh_inline khint_t __kh_h2b(khint_t hash, khint_t bits) { return hash * 2654435769U >> (32 - bits); }
/*******************
* Hash table base *
*******************/
#define __KHASHL_TYPE(HType, khkey_t) \
typedef struct HType { \
khint_t bits, count; \
khint32_t *used; \
khkey_t *keys; \
} HType;
#define __KHASHL_PROTOTYPES(HType, prefix, khkey_t) \
extern HType *prefix##_init(void); \
extern void prefix##_destroy(HType *h); \
extern void prefix##_clear(HType *h); \
extern khint_t prefix##_getp(const HType *h, const khkey_t *key); \
extern int prefix##_resize(HType *h, khint_t new_n_buckets); \
extern khint_t prefix##_putp(HType *h, const khkey_t *key, int *absent); \
extern int prefix##_del(HType *h, khint_t k);
#define __KHASHL_IMPL_BASIC(SCOPE, HType, prefix) \
SCOPE HType *prefix##_init(void) { \
return (HType*)kcalloc(1, sizeof(HType)); \
} \
SCOPE void prefix##_destroy(HType *h) { \
if (!h) return; \
kfree((void *)h->keys); kfree(h->used); \
kfree(h); \
} \
SCOPE void prefix##_clear(HType *h) { \
if (h && h->used) { \
khint_t n_buckets = (khint_t)1U << h->bits; \
memset(h->used, 0, __kh_fsize(n_buckets) * sizeof(khint32_t)); \
h->count = 0; \
} \
}
#define __KHASHL_IMPL_GET(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \
SCOPE khint_t prefix##_getp_core(const HType *h, const khkey_t *key, khint_t hash) { \
khint_t i, last, n_buckets, mask; \
if (h->keys == 0) return 0; \
n_buckets = (khint_t)1U << h->bits; \
mask = n_buckets - 1U; \
i = last = __kh_h2b(hash, h->bits); \
while (__kh_used(h->used, i) && !__hash_eq(h->keys[i], *key)) { \
i = (i + 1U) & mask; \
if (i == last) return n_buckets; \
} \
return !__kh_used(h->used, i)? n_buckets : i; \
} \
SCOPE khint_t prefix##_getp(const HType *h, const khkey_t *key) { return prefix##_getp_core(h, key, __hash_fn(*key)); } \
SCOPE khint_t prefix##_get(const HType *h, khkey_t key) { return prefix##_getp_core(h, &key, __hash_fn(key)); }
#define __KHASHL_IMPL_RESIZE(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \
SCOPE int prefix##_resize(HType *h, khint_t new_n_buckets) { \
khint32_t *new_used = 0; \
khint_t j = 0, x = new_n_buckets, n_buckets, new_bits, new_mask; \
while ((x >>= 1) != 0) ++j; \
if (new_n_buckets & (new_n_buckets - 1)) ++j; \
new_bits = j > 2? j : 2; \
new_n_buckets = (khint_t)1U << new_bits; \
if (h->count > kh_max_count(new_n_buckets)) return 0; /* requested size is too small */ \
new_used = (khint32_t*)kmalloc(__kh_fsize(new_n_buckets) * sizeof(khint32_t)); \
if (!new_used) return -1; /* not enough memory */ \
memset(new_used, 0, __kh_fsize(new_n_buckets) * sizeof(khint32_t)); \
n_buckets = h->keys? (khint_t)1U<<h->bits : 0U; \
if (n_buckets < new_n_buckets) { /* expand */ \
khkey_t *new_keys = (khkey_t*)krealloc((void*)h->keys, new_n_buckets * sizeof(khkey_t)); \
if (!new_keys) { kfree(new_used); return -1; } \
h->keys = new_keys; \
} /* otherwise shrink */ \
new_mask = new_n_buckets - 1; \
for (j = 0; j != n_buckets; ++j) { \
khkey_t key; \
if (!__kh_used(h->used, j)) continue; \
key = h->keys[j]; \
__kh_set_unused(h->used, j); \
while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
khint_t i; \
i = __kh_h2b(__hash_fn(key), new_bits); \
while (__kh_used(new_used, i)) i = (i + 1) & new_mask; \
__kh_set_used(new_used, i); \
if (i < n_buckets && __kh_used(h->used, i)) { /* kick out the existing element */ \
{ khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
__kh_set_unused(h->used, i); /* mark it as deleted in the old hash table */ \
} else { /* write the element and jump out of the loop */ \
h->keys[i] = key; \
break; \
} \
} \
} \
if (n_buckets > new_n_buckets) /* shrink the hash table */ \
h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
kfree(h->used); /* free the working space */ \
h->used = new_used, h->bits = new_bits; \
return 0; \
}
#define __KHASHL_IMPL_PUT(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \
SCOPE khint_t prefix##_putp_core(HType *h, const khkey_t *key, khint_t hash, int *absent) { \
khint_t n_buckets, i, last, mask; \
n_buckets = h->keys? (khint_t)1U<<h->bits : 0U; \
*absent = -1; \
if (h->count >= kh_max_count(n_buckets)) { /* rehashing */ \
if (prefix##_resize(h, n_buckets + 1U) < 0) \
return n_buckets; \
n_buckets = (khint_t)1U<<h->bits; \
} /* TODO: implement automatic shrinking; resize() already supports it */ \
mask = n_buckets - 1; \
i = last = __kh_h2b(hash, h->bits); \
while (__kh_used(h->used, i) && !__hash_eq(h->keys[i], *key)) { \
i = (i + 1U) & mask; \
if (i == last) break; \
} \
if (!__kh_used(h->used, i)) { /* not present at all */ \
h->keys[i] = *key; \
__kh_set_used(h->used, i); \
++h->count; \
*absent = 1; \
} else *absent = 0; /* Don't touch h->keys[i] if present */ \
return i; \
} \
SCOPE khint_t prefix##_putp(HType *h, const khkey_t *key, int *absent) { return prefix##_putp_core(h, key, __hash_fn(*key), absent); } \
SCOPE khint_t prefix##_put(HType *h, khkey_t key, int *absent) { return prefix##_putp_core(h, &key, __hash_fn(key), absent); }
#define __KHASHL_IMPL_DEL(SCOPE, HType, prefix, khkey_t, __hash_fn) \
SCOPE int prefix##_del(HType *h, khint_t i) { \
khint_t j = i, k, mask, n_buckets; \
if (h->keys == 0) return 0; \
n_buckets = (khint_t)1U<<h->bits; \
mask = n_buckets - 1U; \
while (1) { \
j = (j + 1U) & mask; \
if (j == i || !__kh_used(h->used, j)) break; /* j==i only when the table is completely full */ \
k = __kh_h2b(__hash_fn(h->keys[j]), h->bits); \
if ((j > i && (k <= i || k > j)) || (j < i && (k <= i && k > j))) \
h->keys[i] = h->keys[j], i = j; \
} \
__kh_set_unused(h->used, i); \
--h->count; \
return 1; \
}
#define KHASHL_DECLARE(HType, prefix, khkey_t) \
__KHASHL_TYPE(HType, khkey_t) \
__KHASHL_PROTOTYPES(HType, prefix, khkey_t)
#define KHASHL_INIT(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \
__KHASHL_TYPE(HType, khkey_t) \
__KHASHL_IMPL_BASIC(SCOPE, HType, prefix) \
__KHASHL_IMPL_GET(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \
__KHASHL_IMPL_RESIZE(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \
__KHASHL_IMPL_PUT(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \
__KHASHL_IMPL_DEL(SCOPE, HType, prefix, khkey_t, __hash_fn)
/***************************
* Ensemble of hash tables *
***************************/
typedef struct {
khint_t sub, pos;
} kh_ensitr_t;
#define KHASHE_INIT(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \
KHASHL_INIT(KH_LOCAL, HType##_sub, prefix##_sub, khkey_t, __hash_fn, __hash_eq) \
typedef struct HType { \
khint64_t count:54, bits:8; \
HType##_sub *sub; \
} HType; \
SCOPE HType *prefix##_init(int bits) { \
HType *g; \
g = (HType*)kcalloc(1, sizeof(*g)); \
g->bits = bits; \
g->sub = (HType##_sub*)kcalloc(1U<<bits, sizeof(*g->sub)); \
return g; \
} \
SCOPE void prefix##_destroy(HType *g) { \
int t; \
if (!g) return; \
for (t = 0; t < 1<<g->bits; ++t) { kfree((void*)g->sub[t].keys); kfree(g->sub[t].used); } \
kfree(g->sub); kfree(g); \
} \
SCOPE kh_ensitr_t prefix##_getp(const HType *g, const khkey_t *key) { \
khint_t hash, low, ret; \
kh_ensitr_t r; \
HType##_sub *h; \
hash = __hash_fn(*key); \
low = hash & ((1U<<g->bits) - 1); \
h = &g->sub[low]; \
ret = prefix##_sub_getp_core(h, key, hash); \
if (ret == kh_end(h)) r.sub = low, r.pos = (khint_t)-1; \
else r.sub = low, r.pos = ret; \
return r; \
} \
SCOPE kh_ensitr_t prefix##_get(const HType *g, const khkey_t key) { return prefix##_getp(g, &key); } \
SCOPE kh_ensitr_t prefix##_putp(HType *g, const khkey_t *key, int *absent) { \
khint_t hash, low, ret; \
kh_ensitr_t r; \
HType##_sub *h; \
hash = __hash_fn(*key); \
low = hash & ((1U<<g->bits) - 1); \
h = &g->sub[low]; \
ret = prefix##_sub_putp_core(h, key, hash, absent); \
if (*absent) ++g->count; \
r.sub = low, r.pos = ret; \
return r; \
} \
SCOPE kh_ensitr_t prefix##_put(HType *g, const khkey_t key, int *absent) { return prefix##_putp(g, &key, absent); } \
SCOPE int prefix##_del(HType *g, kh_ensitr_t itr) { \
HType##_sub *h = &g->sub[itr.sub]; \
int ret; \
ret = prefix##_sub_del(h, itr.pos); \
if (ret) --g->count; \
return ret; \
}
/*****************************
* More convenient interface *
*****************************/
#define __kh_cached_hash(x) ((x).hash)
#define KHASHL_SET_INIT(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \
KHASHL_INIT(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq)
#define KHASHL_MAP_INIT(SCOPE, HType, prefix, khkey_t, kh_val_t, __hash_fn, __hash_eq) \
typedef struct { khkey_t key; kh_val_t val; } kh_packed HType##_m_bucket_t; \
static kh_inline khint_t prefix##_m_hash(HType##_m_bucket_t x) { return __hash_fn(x.key); } \
static kh_inline int prefix##_m_eq(HType##_m_bucket_t x, HType##_m_bucket_t y) { return __hash_eq(x.key, y.key); } \
KHASHL_INIT(KH_LOCAL, HType, prefix##_m, HType##_m_bucket_t, prefix##_m_hash, prefix##_m_eq) \
SCOPE HType *prefix##_init(void) { return prefix##_m_init(); } \
SCOPE void prefix##_destroy(HType *h) { prefix##_m_destroy(h); } \
SCOPE khint_t prefix##_get(const HType *h, khkey_t key) { HType##_m_bucket_t t; t.key = key; return prefix##_m_getp(h, &t); } \
SCOPE int prefix##_del(HType *h, khint_t k) { return prefix##_m_del(h, k); } \
SCOPE khint_t prefix##_put(HType *h, khkey_t key, int *absent) { HType##_m_bucket_t t; t.key = key; return prefix##_m_putp(h, &t, absent); }
#define KHASHL_CSET_INIT(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \
typedef struct { khkey_t key; khint_t hash; } kh_packed HType##_cs_bucket_t; \
static kh_inline int prefix##_cs_eq(HType##_cs_bucket_t x, HType##_cs_bucket_t y) { return x.hash == y.hash && __hash_eq(x.key, y.key); } \
KHASHL_INIT(KH_LOCAL, HType, prefix##_cs, HType##_cs_bucket_t, __kh_cached_hash, prefix##_cs_eq) \
SCOPE HType *prefix##_init(void) { return prefix##_cs_init(); } \
SCOPE void prefix##_destroy(HType *h) { prefix##_cs_destroy(h); } \
SCOPE khint_t prefix##_get(const HType *h, khkey_t key) { HType##_cs_bucket_t t; t.key = key; t.hash = __hash_fn(key); return prefix##_cs_getp(h, &t); } \
SCOPE int prefix##_del(HType *h, khint_t k) { return prefix##_cs_del(h, k); } \
SCOPE khint_t prefix##_put(HType *h, khkey_t key, int *absent) { HType##_cs_bucket_t t; t.key = key, t.hash = __hash_fn(key); return prefix##_cs_putp(h, &t, absent); }
#define KHASHL_CMAP_INIT(SCOPE, HType, prefix, khkey_t, kh_val_t, __hash_fn, __hash_eq) \
typedef struct { khkey_t key; kh_val_t val; khint_t hash; } kh_packed HType##_cm_bucket_t; \
static kh_inline int prefix##_cm_eq(HType##_cm_bucket_t x, HType##_cm_bucket_t y) { return x.hash == y.hash && __hash_eq(x.key, y.key); } \
KHASHL_INIT(KH_LOCAL, HType, prefix##_cm, HType##_cm_bucket_t, __kh_cached_hash, prefix##_cm_eq) \
SCOPE HType *prefix##_init(void) { return prefix##_cm_init(); } \
SCOPE void prefix##_destroy(HType *h) { prefix##_cm_destroy(h); } \
SCOPE khint_t prefix##_get(const HType *h, khkey_t key) { HType##_cm_bucket_t t; t.key = key; t.hash = __hash_fn(key); return prefix##_cm_getp(h, &t); } \
SCOPE int prefix##_del(HType *h, khint_t k) { return prefix##_cm_del(h, k); } \
SCOPE khint_t prefix##_put(HType *h, khkey_t key, int *absent) { HType##_cm_bucket_t t; t.key = key, t.hash = __hash_fn(key); return prefix##_cm_putp(h, &t, absent); }
#define KHASHE_SET_INIT(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \
KHASHE_INIT(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq)
#define KHASHE_MAP_INIT(SCOPE, HType, prefix, khkey_t, kh_val_t, __hash_fn, __hash_eq) \
typedef struct { khkey_t key; kh_val_t val; } kh_packed HType##_m_bucket_t; \
static kh_inline khint_t prefix##_m_hash(HType##_m_bucket_t x) { return __hash_fn(x.key); } \
static kh_inline int prefix##_m_eq(HType##_m_bucket_t x, HType##_m_bucket_t y) { return __hash_eq(x.key, y.key); } \
KHASHE_INIT(KH_LOCAL, HType, prefix##_m, HType##_m_bucket_t, prefix##_m_hash, prefix##_m_eq) \
SCOPE HType *prefix##_init(int bits) { return prefix##_m_init(bits); } \
SCOPE void prefix##_destroy(HType *h) { prefix##_m_destroy(h); } \
SCOPE kh_ensitr_t prefix##_get(const HType *h, khkey_t key) { HType##_m_bucket_t t; t.key = key; return prefix##_m_getp(h, &t); } \
SCOPE int prefix##_del(HType *h, kh_ensitr_t k) { return prefix##_m_del(h, k); } \
SCOPE kh_ensitr_t prefix##_put(HType *h, khkey_t key, int *absent) { HType##_m_bucket_t t; t.key = key; return prefix##_m_putp(h, &t, absent); }
/**************************
* Public macro functions *
**************************/
#define kh_bucket(h, x) ((h)->keys[x])
#define kh_size(h) ((h)->count)
#define kh_capacity(h) ((h)->keys? 1U<<(h)->bits : 0U)
#define kh_end(h) kh_capacity(h)
#define kh_key(h, x) ((h)->keys[x].key)
#define kh_val(h, x) ((h)->keys[x].val)
#define kh_exist(h, x) __kh_used((h)->used, (x))
#define kh_foreach(h, x) for ((x) = 0; (x) != kh_end(h); ++(x)) if (kh_exist((h), (x)))
#define kh_ens_key(g, x) kh_key(&(g)->sub[(x).sub], (x).pos)
#define kh_ens_val(g, x) kh_val(&(g)->sub[(x).sub], (x).pos)
#define kh_ens_exist(g, x) kh_exist(&(g)->sub[(x).sub], (x).pos)
#define kh_ens_is_end(x) ((x).pos == (khint_t)-1)
#define kh_ens_size(g) ((g)->count)
#define kh_ens_foreach(g, x) for ((x).sub = 0; (x).sub != 1<<(g)->bits; ++(x).sub) for ((x).pos = 0; (x).pos != kh_end(&(g)->sub[(x).sub]); ++(x).pos) if (kh_ens_exist((g), (x)))
/**************************************
* Common hash and equality functions *
**************************************/
#define kh_eq_generic(a, b) ((a) == (b))
#define kh_eq_str(a, b) (strcmp((a), (b)) == 0)
#define kh_hash_dummy(x) ((khint_t)(x))
static kh_inline khint_t kh_hash_uint32(khint_t x) { /* murmur finishing */
x ^= x >> 16;
x *= 0x85ebca6bU;
x ^= x >> 13;
x *= 0xc2b2ae35U;
x ^= x >> 16;
return x;
}
static kh_inline khint_t kh_hash_uint64(khint64_t x) { /* splitmix64; see https://nullprogram.com/blog/2018/07/31/ for inversion */
x ^= x >> 30;
x *= 0xbf58476d1ce4e5b9ULL;
x ^= x >> 27;
x *= 0x94d049bb133111ebULL;
x ^= x >> 31;
return (khint_t)x;
}
#define KH_FNV_SEED 11
static kh_inline khint_t kh_hash_str(kh_cstr_t s) { /* FNV1a */
khint_t h = KH_FNV_SEED ^ 2166136261U;
const unsigned char *t = (const unsigned char*)s;
for (; *t; ++t)
h ^= *t, h *= 16777619;
return h;
}
static kh_inline khint_t kh_hash_bytes(int len, const unsigned char *s) {
khint_t h = KH_FNV_SEED ^ 2166136261U;
int i;
for (i = 0; i < len; ++i)
h ^= s[i], h *= 16777619;
return h;
}
#endif /* __AC_KHASHL_H */

ext/klib/khmm.c 100644
@@ -0,0 +1,423 @@
#include <math.h>
#include <stdio.h>
#include <assert.h>
#include <string.h>
#include <stdlib.h>
#include "khmm.h"
// new/delete hmm_par_t
hmm_par_t *hmm_new_par(int m, int n)
{
hmm_par_t *hp;
int i;
assert(m > 0 && n > 0);
hp = (hmm_par_t*)calloc(1, sizeof(hmm_par_t));
hp->m = m; hp->n = n;
hp->a0 = (FLOAT*)calloc(n, sizeof(FLOAT));
hp->a = (FLOAT**)calloc2(n, n, sizeof(FLOAT));
hp->e = (FLOAT**)calloc2(m + 1, n, sizeof(FLOAT));
hp->ae = (FLOAT**)calloc2((m + 1) * n, n, sizeof(FLOAT));
for (i = 0; i != n; ++i) hp->e[m][i] = 1.0;
return hp;
}
void hmm_delete_par(hmm_par_t *hp)
{
int i;
if (hp == 0) return;
for (i = 0; i != hp->n; ++i) free(hp->a[i]);
for (i = 0; i <= hp->m; ++i) free(hp->e[i]);
for (i = 0; i < (hp->m + 1) * hp->n; ++i) free(hp->ae[i]);
free(hp->a); free(hp->e); free(hp->a0); free(hp->ae);
free(hp);
}
// new/delete hmm_data_t
hmm_data_t *hmm_new_data(int L, const char *seq, const hmm_par_t *hp)
{
hmm_data_t *hd;
hd = (hmm_data_t*)calloc(1, sizeof(hmm_data_t));
hd->L = L;
hd->seq = (char*)malloc(L + 1);
memcpy(hd->seq + 1, seq, L);
return hd;
}
void hmm_delete_data(hmm_data_t *hd)
{
int i;
if (hd == 0) return;
for (i = 0; i <= hd->L; ++i) {
if (hd->f) free(hd->f[i]);
if (hd->b) free(hd->b[i]);
}
free(hd->f); free(hd->b); free(hd->s); free(hd->v); free(hd->p); free(hd->seq);
free(hd);
}
// new/delete hmm_exp_t
hmm_exp_t *hmm_new_exp(const hmm_par_t *hp)
{
hmm_exp_t *he;
assert(hp);
he = (hmm_exp_t*)calloc(1, sizeof(hmm_exp_t));
he->m = hp->m; he->n = hp->n;
he->A0 = (FLOAT*)calloc(hp->n, sizeof(FLOAT));
he->A = (FLOAT**)calloc2(hp->n, hp->n, sizeof(FLOAT));
he->E = (FLOAT**)calloc2(hp->m + 1, hp->n, sizeof(FLOAT));
return he;
}
void hmm_delete_exp(hmm_exp_t *he)
{
int i;
if (he == 0) return;
for (i = 0; i != he->n; ++i) free(he->A[i]);
for (i = 0; i <= he->m; ++i) free(he->E[i]);
free(he->A); free(he->E); free(he->A0);
free(he);
}
// Viterbi algorithm
FLOAT hmm_Viterbi(const hmm_par_t *hp, hmm_data_t *hd)
{
FLOAT **la, **le, *preV, *curV, max;
int **Vmax, max_l; // backtrace matrix
int k, l, b, u;
if (hd->v) free(hd->v);
hd->v = (int*)calloc(hd->L+1, sizeof(int));
la = (FLOAT**)calloc2(hp->n, hp->n, sizeof(FLOAT));
le = (FLOAT**)calloc2(hp->m + 1, hp->n, sizeof(FLOAT));
Vmax = (int**)calloc2(hd->L+1, hp->n, sizeof(int));
preV = (FLOAT*)malloc(sizeof(FLOAT) * hp->n);
curV = (FLOAT*)malloc(sizeof(FLOAT) * hp->n);
for (k = 0; k != hp->n; ++k)
for (l = 0; l != hp->n; ++l)
la[k][l] = log(hp->a[l][k]); // this is not a bug
for (b = 0; b != hp->m; ++b)
for (k = 0; k != hp->n; ++k)
le[b][k] = log(hp->e[b][k]);
for (k = 0; k != hp->n; ++k) le[hp->m][k] = 0.0;
// V_k(1)
for (k = 0; k != hp->n; ++k) {
preV[k] = le[(int)hd->seq[1]][k] + log(hp->a0[k]);
Vmax[1][k] = 0;
}
// all the rest
for (u = 2; u <= hd->L; ++u) {
FLOAT *tmp, *leu = le[(int)hd->seq[u]];
for (k = 0; k != hp->n; ++k) {
FLOAT *laa = la[k];
for (l = 0, max = -HMM_INF, max_l = -1; l != hp->n; ++l) {
if (max < preV[l] + laa[l]) {
max = preV[l] + laa[l];
max_l = l;
}
}
assert(max_l >= 0); // a maximum must have been found
curV[k] = leu[k] + max;
Vmax[u][k] = max_l;
}
tmp = curV; curV = preV; preV = tmp; // swap
}
// backtrace
for (k = 0, max_l = -1, max = -HMM_INF; k != hp->n; ++k) {
if (max < preV[k]) {
max = preV[k]; max_l = k;
}
}
assert(max_l >= 0); // a maximum must have been found
hd->v[hd->L] = max_l;
for (u = hd->L; u >= 1; --u)
hd->v[u-1] = Vmax[u][hd->v[u]];
for (k = 0; k != hp->n; ++k) free(la[k]);
for (b = 0; b <= hp->m; ++b) free(le[b]); // le has m+1 rows
for (u = 0; u <= hd->L; ++u) free(Vmax[u]);
free(la); free(le); free(Vmax); free(preV); free(curV);
hd->status |= HMM_VITERBI;
return max;
}
// forward algorithm
void hmm_forward(const hmm_par_t *hp, hmm_data_t *hd)
{
FLOAT sum, tmp, **at;
int u, k, l;
int n, m, L;
assert(hp && hd);
// allocate memory for hd->f and hd->s
n = hp->n; m = hp->m; L = hd->L;
if (hd->s) free(hd->s);
if (hd->f) {
for (k = 0; k <= hd->L; ++k) free(hd->f[k]);
free(hd->f);
}
hd->f = (FLOAT**)calloc2(hd->L+1, hp->n, sizeof(FLOAT));
hd->s = (FLOAT*)calloc(hd->L+1, sizeof(FLOAT));
hd->status &= ~(unsigned)HMM_FORWARD;
// at[][] array helps to improve the cache efficiency
at = (FLOAT**)calloc2(n, n, sizeof(FLOAT));
// transpose a[][]
for (k = 0; k != n; ++k)
for (l = 0; l != n; ++l)
at[k][l] = hp->a[l][k];
// f[0], but it should never be used
hd->s[0] = 1.0;
for (k = 0; k != n; ++k) hd->f[0][k] = 0.0;
// f[1]
for (k = 0, sum = 0.0; k != n; ++k)
sum += (hd->f[1][k] = hp->a0[k] * hp->e[(int)hd->seq[1]][k]);
for (k = 0; k != n; ++k) hd->f[1][k] /= sum;
hd->s[1] = sum;
// f[2..hmmL], the core loop
for (u = 2; u <= L; ++u) {
FLOAT *fu = hd->f[u], *fu1 = hd->f[u-1], *eu = hp->e[(int)hd->seq[u]];
for (k = 0, sum = 0.0; k != n; ++k) {
FLOAT *aa = at[k];
for (l = 0, tmp = 0.0; l != n; ++l) tmp += fu1[l] * aa[l];
sum += (fu[k] = eu[k] * tmp);
}
for (k = 0; k != n; ++k) fu[k] /= sum;
hd->s[u] = sum;
}
// free at array
for (k = 0; k != hp->n; ++k) free(at[k]);
free(at);
hd->status |= HMM_FORWARD;
}
// precalculate hp->ae
void hmm_pre_backward(hmm_par_t *hp)
{
int m, n, b, k, l;
assert(hp);
m = hp->m; n = hp->n;
for (b = 0; b <= m; ++b) {
for (k = 0; k != n; ++k) {
FLOAT *p = hp->ae[b * hp->n + k];
for (l = 0; l != n; ++l)
p[l] = hp->e[b][l] * hp->a[k][l];
}
}
}
// backward algorithm
void hmm_backward(const hmm_par_t *hp, hmm_data_t *hd)
{
FLOAT tmp;
int k, l, u;
int m, n, L;
assert(hp && hd);
assert(hd->status & HMM_FORWARD);
// allocate memory for hd->b
m = hp->m; n = hp->n; L = hd->L;
if (hd->b) {
for (k = 0; k <= hd->L; ++k) free(hd->b[k]);
free(hd->b);
}
hd->status &= ~(unsigned)HMM_BACKWARD;
hd->b = (FLOAT**)calloc2(L+1, hp->n, sizeof(FLOAT));
// b[L]
for (k = 0; k != hp->n; ++k) hd->b[L][k] = 1.0 / hd->s[L];
// b[1..L-1], the core loop
for (u = L-1; u >= 1; --u) {
FLOAT *bu1 = hd->b[u+1], **p = hp->ae + (int)hd->seq[u+1] * n;
for (k = 0; k != n; ++k) {
FLOAT *q = p[k];
for (l = 0, tmp = 0.0; l != n; ++l) tmp += q[l] * bu1[l];
hd->b[u][k] = tmp / hd->s[u];
}
}
hd->status |= HMM_BACKWARD;
for (l = 0, tmp = 0.0; l != n; ++l)
tmp += hp->a0[l] * hd->b[1][l] * hp->e[(int)hd->seq[1]][l];
if (tmp > 1.0 + 1e-6 || tmp < 1.0 - 1e-6) // in theory, tmp should always equal 1.0
fprintf(stderr, "++ Underflow may have happened (%lg).\n", tmp);
}
// log-likelihood of the observation
FLOAT hmm_lk(const hmm_data_t *hd)
{
FLOAT sum = 0.0, prod = 1.0;
int u, L;
L = hd->L;
assert(hd->status & HMM_FORWARD);
for (u = 1; u <= L; ++u) {
prod *= hd->s[u];
if (prod < HMM_TINY || prod >= 1.0/HMM_TINY) { // reset
sum += log(prod);
prod = 1.0;
}
}
sum += log(prod);
return sum;
}
// posterior decoding
void hmm_post_decode(const hmm_par_t *hp, hmm_data_t *hd)
{
int u, k;
assert(hd->status & HMM_BACKWARD);
if (hd->p) free(hd->p);
hd->p = (int*)calloc(hd->L + 1, sizeof(int));
for (u = 1; u <= hd->L; ++u) {
FLOAT prob, max, *fu = hd->f[u], *bu = hd->b[u], su = hd->s[u];
int max_k;
for (k = 0, max = -1.0, max_k = -1; k != hp->n; ++k) {
if (max < (prob = fu[k] * bu[k] * su)) {
max = prob; max_k = k;
}
}
assert(max_k >= 0);
hd->p[u] = max_k;
}
hd->status |= HMM_POSTDEC;
}
// posterior probability of states
FLOAT hmm_post_state(const hmm_par_t *hp, const hmm_data_t *hd, int u, FLOAT *prob)
{
FLOAT sum = 0.0, ss = hd->s[u], *fu = hd->f[u], *bu = hd->b[u];
int k;
for (k = 0; k != hp->n; ++k)
sum += (prob[k] = fu[k] * bu[k] * ss);
return sum; // in theory, this should always equal 1.0
}
// expected counts
hmm_exp_t *hmm_expect(const hmm_par_t *hp, const hmm_data_t *hd)
{
int k, l, u, b, m, n;
hmm_exp_t *he;
assert(hd->status & HMM_BACKWARD);
he = hmm_new_exp(hp);
// initialization
m = hp->m; n = hp->n;
for (k = 0; k != n; ++k)
for (l = 0; l != n; ++l) he->A[k][l] = HMM_TINY;
for (b = 0; b <= m; ++b)
for (l = 0; l != n; ++l) he->E[b][l] = HMM_TINY;
// calculate A_{kl} and E_k(b), k,l\in[0,n)
for (u = 1; u < hd->L; ++u) {
FLOAT *fu = hd->f[u], *bu = hd->b[u], *bu1 = hd->b[u+1], ss = hd->s[u];
FLOAT *Ec = he->E[(int)hd->seq[u]], **p = hp->ae + (int)hd->seq[u+1] * n;
for (k = 0; k != n; ++k) {
FLOAT *q = p[k], *AA = he->A[k], fuk = fu[k];
for (l = 0; l != n; ++l) // this is cache-efficient
AA[l] += fuk * q[l] * bu1[l];
Ec[k] += fuk * bu[k] * ss;
}
}
// calculate A0_l
for (l = 0; l != n; ++l)
he->A0[l] += hp->a0[l] * hp->e[(int)hd->seq[1]][l] * hd->b[1][l];
return he;
}
FLOAT hmm_Q0(const hmm_par_t *hp, hmm_exp_t *he)
{
int k, l, b;
FLOAT sum = 0.0;
for (k = 0; k != hp->n; ++k) {
FLOAT tmp;
for (b = 0, tmp = 0.0; b != hp->m; ++b) tmp += he->E[b][k];
for (b = 0; b != hp->m; ++b)
sum += he->E[b][k] * log(he->E[b][k] / tmp);
}
for (k = 0; k != hp->n; ++k) {
FLOAT tmp, *A = he->A[k];
for (l = 0, tmp = 0.0; l != hp->n; ++l) tmp += A[l];
for (l = 0; l != hp->n; ++l) sum += A[l] * log(A[l] / tmp);
}
return (he->Q0 = sum);
}
// add he0 to he1
void hmm_add_expect(const hmm_exp_t *he0, hmm_exp_t *he1)
{
int b, k, l;
assert(he0->m == he1->m && he0->n == he1->n);
for (k = 0; k != he1->n; ++k) {
he1->A0[k] += he0->A0[k];
for (l = 0; l != he1->n; ++l)
he1->A[k][l] += he0->A[k][l];
}
for (b = 0; b != he1->m; ++b) {
for (l = 0; l != he1->n; ++l)
he1->E[b][l] += he0->E[b][l];
}
}
// the EM-Q function
FLOAT hmm_Q(const hmm_par_t *hp, const hmm_exp_t *he)
{
FLOAT sum = 0.0;
int bb, k, l;
for (bb = 0; bb != he->m; ++bb) {
FLOAT *eb = hp->e[bb], *Eb = he->E[bb];
for (k = 0; k != hp->n; ++k) {
if (eb[k] <= 0.0) return -HMM_INF;
sum += Eb[k] * log(eb[k]);
}
}
for (k = 0; k != he->n; ++k) {
FLOAT *Ak = he->A[k], *ak = hp->a[k];
for (l = 0; l != he->n; ++l) {
if (ak[l] <= 0.0) return -HMM_INF;
sum += Ak[l] * log(ak[l]);
}
}
return (sum -= he->Q0);
}
// simulate sequence
char *hmm_simulate(const hmm_par_t *hp, int L)
{
int i, k, l, b;
FLOAT x, y, **et;
char *seq;
seq = (char*)calloc(L+1, 1);
// calculate the transpose of hp->e[][]
et = (FLOAT**)calloc2(hp->n, hp->m, sizeof(FLOAT));
for (k = 0; k != hp->n; ++k)
for (b = 0; b != hp->m; ++b)
et[k][b] = hp->e[b][k];
// the initial state, drawn from a0[]
x = drand48();
for (k = 0, y = 0.0; k != hp->n; ++k) {
y += hp->a0[k];
if (y >= x) break;
}
// main loop
for (i = 0; i != L; ++i) {
FLOAT *el, *ak = hp->a[k];
x = drand48();
for (l = 0, y = 0.0; l != hp->n; ++l) {
y += ak[l];
if (y >= x) break;
}
el = et[l];
x = drand48();
for (b = 0, y = 0.0; b != hp->m; ++b) {
y += el[b];
if (y >= x) break;
}
seq[i] = b;
k = l;
}
for (k = 0; k != hp->n; ++k) free(et[k]);
free(et);
return seq;
}

ext/klib/khmm.h 100644
@@ -0,0 +1,107 @@
#ifndef AC_SCHMM_H_
#define AC_SCHMM_H_
/*
* Last Modified: 2008-03-10
* Version: 0.1.0-8
*
* 2008-03-10, 0.1.0-8: make icc report two more "VECTORIZED"
* 2008-03-10, 0.1.0-7: accelerate for some CPU
* 2008-02-07, 0.1.0-6: simulate sequences
* 2008-01-15, 0.1.0-5: goodness of fit
* 2007-11-20, 0.1.0-4: add function declaration of hmm_post_decode()
* 2007-11-09: fix a memory leak
*/
#include <stdlib.h>
#define HMM_VERSION "0.1.0-8"
#define HMM_FORWARD 0x02
#define HMM_BACKWARD 0x04
#define HMM_VITERBI 0x40
#define HMM_POSTDEC 0x80
#ifndef FLOAT
#define FLOAT double
#endif
#define HMM_TINY 1e-25
#define HMM_INF 1e300
typedef struct
{
int m, n; // number of symbols, number of states
FLOAT **a, **e; // transition matrix and emission probabilities
FLOAT **ae; // auxiliary array for acceleration, should be calculated by hmm_pre_backward()
FLOAT *a0; // transition probabilities from the start state
} hmm_par_t;
typedef struct
{
int L;
unsigned status;
char *seq;
FLOAT **f, **b, *s;
int *v; // Viterbi path
int *p; // posterior decoding
} hmm_data_t;
typedef struct
{
int m, n;
FLOAT Q0, **A, **E, *A0;
} hmm_exp_t;
typedef struct
{
int l, *obs;
FLOAT *thr;
} hmm_gof_t;
#ifdef __cplusplus
extern "C" {
#endif
/* initialize and destroy hmm_par_t */
hmm_par_t *hmm_new_par(int m, int n);
void hmm_delete_par(hmm_par_t *hp);
/* initialize and destroy hmm_data_t */
hmm_data_t *hmm_new_data(int L, const char *seq, const hmm_par_t *hp);
void hmm_delete_data(hmm_data_t *hd);
/* initialize and destroy hmm_exp_t */
hmm_exp_t *hmm_new_exp(const hmm_par_t *hp);
void hmm_delete_exp(hmm_exp_t *he);
/* Viterbi, forward and backward algorithms */
FLOAT hmm_Viterbi(const hmm_par_t *hp, hmm_data_t *hd);
void hmm_pre_backward(hmm_par_t *hp);
void hmm_forward(const hmm_par_t *hp, hmm_data_t *hd);
void hmm_backward(const hmm_par_t *hp, hmm_data_t *hd);
/* log-likelihood of the observations (natural based) */
FLOAT hmm_lk(const hmm_data_t *hd);
/* posterior probability at the position on the sequence */
FLOAT hmm_post_state(const hmm_par_t *hp, const hmm_data_t *hd, int u, FLOAT *prob);
/* posterior decoding */
void hmm_post_decode(const hmm_par_t *hp, hmm_data_t *hd);
/* expected counts of transitions and emissions */
hmm_exp_t *hmm_expect(const hmm_par_t *hp, const hmm_data_t *hd);
/* add he0 counts to he1 counts*/
void hmm_add_expect(const hmm_exp_t *he0, hmm_exp_t *he1);
/* the Q function that should be maximized in EM */
FLOAT hmm_Q(const hmm_par_t *hp, const hmm_exp_t *he);
FLOAT hmm_Q0(const hmm_par_t *hp, hmm_exp_t *he);
/* simulate sequences */
char *hmm_simulate(const hmm_par_t *hp, int L);
#ifdef __cplusplus
}
#endif
static inline void **calloc2(int n_row, int n_col, int size)
{
char **p;
int k;
p = (char**)malloc(sizeof(char*) * n_row);
for (k = 0; k != n_row; ++k)
p[k] = (char*)calloc(n_col, size);
return (void**)p;
}
#endif

ext/klib/klist.h 100644
@@ -0,0 +1,135 @@
/* The MIT License
Copyright (c) 2008-2009, by Attractive Chaos <attractor@live.co.uk>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
#ifndef _AC_KLIST_H
#define _AC_KLIST_H
#include <stdlib.h>
#ifndef klib_unused
#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3)
#define klib_unused __attribute__ ((__unused__))
#else
#define klib_unused
#endif
#endif /* klib_unused */
#define KMEMPOOL_INIT2(SCOPE, name, kmptype_t, kmpfree_f) \
typedef struct { \
size_t cnt, n, max; \
kmptype_t **buf; \
} kmp_##name##_t; \
SCOPE kmp_##name##_t *kmp_init_##name(void) { \
return calloc(1, sizeof(kmp_##name##_t)); \
} \
SCOPE void kmp_destroy_##name(kmp_##name##_t *mp) { \
size_t k; \
for (k = 0; k < mp->n; ++k) { \
kmpfree_f(mp->buf[k]); free(mp->buf[k]); \
} \
free(mp->buf); free(mp); \
} \
SCOPE kmptype_t *kmp_alloc_##name(kmp_##name##_t *mp) { \
++mp->cnt; \
if (mp->n == 0) return calloc(1, sizeof(kmptype_t)); \
return mp->buf[--mp->n]; \
} \
SCOPE void kmp_free_##name(kmp_##name##_t *mp, kmptype_t *p) { \
--mp->cnt; \
if (mp->n == mp->max) { \
mp->max = mp->max? mp->max<<1 : 16; \
mp->buf = realloc(mp->buf, sizeof(kmptype_t *) * mp->max); \
} \
mp->buf[mp->n++] = p; \
}
#define KMEMPOOL_INIT(name, kmptype_t, kmpfree_f) \
KMEMPOOL_INIT2(static inline klib_unused, name, kmptype_t, kmpfree_f)
#define kmempool_t(name) kmp_##name##_t
#define kmp_init(name) kmp_init_##name()
#define kmp_destroy(name, mp) kmp_destroy_##name(mp)
#define kmp_alloc(name, mp) kmp_alloc_##name(mp)
#define kmp_free(name, mp, p) kmp_free_##name(mp, p)
#define KLIST_INIT2(SCOPE, name, kltype_t, kmpfree_t) \
struct __kl1_##name { \
kltype_t data; \
struct __kl1_##name *next; \
}; \
typedef struct __kl1_##name kl1_##name; \
KMEMPOOL_INIT2(SCOPE, name, kl1_##name, kmpfree_t) \
typedef struct { \
kl1_##name *head, *tail; \
kmp_##name##_t *mp; \
size_t size; \
} kl_##name##_t; \
SCOPE kl_##name##_t *kl_init_##name(void) { \
kl_##name##_t *kl = calloc(1, sizeof(kl_##name##_t)); \
kl->mp = kmp_init(name); \
kl->head = kl->tail = kmp_alloc(name, kl->mp); \
kl->head->next = 0; \
return kl; \
} \
SCOPE void kl_destroy_##name(kl_##name##_t *kl) { \
kl1_##name *p; \
for (p = kl->head; p != kl->tail; p = p->next) \
kmp_free(name, kl->mp, p); \
kmp_free(name, kl->mp, p); \
kmp_destroy(name, kl->mp); \
free(kl); \
} \
SCOPE kltype_t *kl_pushp_##name(kl_##name##_t *kl) { \
kl1_##name *q, *p = kmp_alloc(name, kl->mp); \
q = kl->tail; p->next = 0; kl->tail->next = p; kl->tail = p; \
++kl->size; \
return &q->data; \
} \
SCOPE int kl_shift_##name(kl_##name##_t *kl, kltype_t *d) { \
kl1_##name *p; \
if (kl->head->next == 0) return -1; \
--kl->size; \
p = kl->head; kl->head = kl->head->next; \
if (d) *d = p->data; \
kmp_free(name, kl->mp, p); \
return 0; \
}
#define KLIST_INIT(name, kltype_t, kmpfree_t) \
KLIST_INIT2(static inline klib_unused, name, kltype_t, kmpfree_t)
#define kliter_t(name) kl1_##name
#define klist_t(name) kl_##name##_t
#define kl_val(iter) ((iter)->data)
#define kl_next(iter) ((iter)->next)
#define kl_begin(kl) ((kl)->head)
#define kl_end(kl) ((kl)->tail)
#define kl_init(name) kl_init_##name()
#define kl_destroy(name, kl) kl_destroy_##name(kl)
#define kl_pushp(name, kl) kl_pushp_##name(kl)
#define kl_shift(name, kl, d) kl_shift_##name(kl, d)
#endif

ext/klib/kmath.c 100644
@@ -0,0 +1,447 @@
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "kmath.h"
/******************************
*** Non-linear programming ***
******************************/
/* Hooke-Jeeves algorithm for nonlinear minimization
Based on the pseudocodes by Bell and Pike (CACM 9(9):684-685), and
the revision by Tomlin and Smith (CACM 12(11):637-638). Both of the
papers are comments on Kaupe's Algorithm 178 "Direct Search" (ACM
6(6):313-314). The original algorithm was designed by Hooke and
Jeeves (ACM 8:212-229). This program is further revised according to
Johnson's implementation at Netlib (opt/hooke.c).
The Hooke-Jeeves algorithm is very simple and works quite well on a
few examples. However, it might fail to converge due to its heuristic
nature. A possible improvement, suggested by Johnson, is to choose a
small r at the beginning to approach the minimum quickly, and a
larger r at later steps to home in on the minimum.
*/
static double __kmin_hj_aux(kmin_f func, int n, double *x1, void *data, double fx1, double *dx, int *n_calls)
{
int k, j = *n_calls;
double ftmp;
for (k = 0; k != n; ++k) {
x1[k] += dx[k];
ftmp = func(n, x1, data); ++j;
if (ftmp < fx1) fx1 = ftmp;
else { /* search the opposite direction */
dx[k] = 0.0 - dx[k];
x1[k] += dx[k] + dx[k];
ftmp = func(n, x1, data); ++j;
if (ftmp < fx1) fx1 = ftmp;
else x1[k] -= dx[k]; /* back to the original x[k] */
}
}
*n_calls = j;
return fx1; /* here: fx1=f(n,x1) */
}
double kmin_hj(kmin_f func, int n, double *x, void *data, double r, double eps, int max_calls)
{
double fx, fx1, *x1, *dx, radius;
int k, n_calls = 0;
x1 = (double*)calloc(n, sizeof(double));
dx = (double*)calloc(n, sizeof(double));
for (k = 0; k != n; ++k) { /* initial directions, based on MGJ */
dx[k] = fabs(x[k]) * r;
if (dx[k] == 0) dx[k] = r;
}
radius = r;
fx1 = fx = func(n, x, data); ++n_calls;
for (;;) {
memcpy(x1, x, n * sizeof(double)); /* x1 = x */
fx1 = __kmin_hj_aux(func, n, x1, data, fx, dx, &n_calls);
while (fx1 < fx) {
for (k = 0; k != n; ++k) {
double t = x[k];
dx[k] = x1[k] > x[k]? fabs(dx[k]) : 0.0 - fabs(dx[k]);
x[k] = x1[k];
x1[k] = x1[k] + x1[k] - t;
}
fx = fx1;
if (n_calls >= max_calls) break;
fx1 = func(n, x1, data); ++n_calls;
fx1 = __kmin_hj_aux(func, n, x1, data, fx1, dx, &n_calls);
if (fx1 >= fx) break;
for (k = 0; k != n; ++k)
if (fabs(x1[k] - x[k]) > .5 * fabs(dx[k])) break;
if (k == n) break;
}
if (radius >= eps) {
if (n_calls >= max_calls) break;
radius *= r;
for (k = 0; k != n; ++k) dx[k] *= r;
} else break; /* converge */
}
free(x1); free(dx);
return fx1;
}
// I copied this function somewhere several years ago with some of my modifications, but I forgot the source.
double kmin_brent(kmin1_f func, double a, double b, void *data, double tol, double *xmin)
{
double bound, u, r, q, fu, tmp, fa, fb, fc, c;
const double gold1 = 1.6180339887;
const double gold2 = 0.3819660113;
const double tiny = 1e-20;
const int max_iter = 100;
double e, d, w, v, mid, tol1, tol2, p, eold, fv, fw;
int iter;
fa = func(a, data); fb = func(b, data);
if (fb > fa) { // swap, such that f(a) > f(b)
tmp = a; a = b; b = tmp;
tmp = fa; fa = fb; fb = tmp;
}
c = b + gold1 * (b - a), fc = func(c, data); // golden section extrapolation
while (fb > fc) {
bound = b + 100.0 * (c - b); // the farthest point where we want to go
r = (b - a) * (fb - fc);
q = (b - c) * (fb - fa);
if (fabs(q - r) < tiny) { // avoid 0 denominator
tmp = q > r? tiny : 0.0 - tiny;
} else tmp = q - r;
u = b - ((b - c) * q - (b - a) * r) / (2.0 * tmp); // u is the parabolic extrapolation point
if ((b > u && u > c) || (b < u && u < c)) { // u lies between b and c
fu = func(u, data);
if (fu < fc) { // (b,u,c) bracket the minimum
a = b; b = u; fa = fb; fb = fu;
break;
} else if (fu > fb) { // (a,b,u) bracket the minimum
c = u; fc = fu;
break;
}
u = c + gold1 * (c - b); fu = func(u, data); // golden section extrapolation
} else if ((c > u && u > bound) || (c < u && u < bound)) { // u lies between c and bound
fu = func(u, data);
if (fu < fc) { // fb > fc > fu
b = c; c = u; u = c + gold1 * (c - b);
fb = fc; fc = fu; fu = func(u, data);
} else { // (b,c,u) bracket the minimum
a = b; b = c; c = u;
fa = fb; fb = fc; fc = fu;
break;
}
} else if ((u > bound && bound > c) || (u < bound && bound < c)) { // u goes beyond the bound
u = bound; fu = func(u, data);
} else { // u goes the other way around, use golden section extrapolation
u = c + gold1 * (c - b); fu = func(u, data);
}
a = b; b = c; c = u;
fa = fb; fb = fc; fc = fu;
}
if (a > c) u = a, a = c, c = u; // swap
// now, a<b<c, fa>fb and fb<fc, move on to Brent's algorithm
e = d = 0.0;
w = v = b; fv = fw = fb;
for (iter = 0; iter != max_iter; ++iter) {
mid = 0.5 * (a + c);
tol2 = 2.0 * (tol1 = tol * fabs(b) + tiny);
if (fabs(b - mid) <= (tol2 - 0.5 * (c - a))) {
*xmin = b; return fb; // found
}
if (fabs(e) > tol1) {
// related to parabolic interpolation
r = (b - w) * (fb - fv);
q = (b - v) * (fb - fw);
p = (b - v) * q - (b - w) * r;
q = 2.0 * (q - r);
if (q > 0.0) p = 0.0 - p;
else q = 0.0 - q;
eold = e; e = d;
if (fabs(p) >= fabs(0.5 * q * eold) || p <= q * (a - b) || p >= q * (c - b)) {
d = gold2 * (e = (b >= mid ? a - b : c - b));
} else {
d = p / q; u = b + d; // actual parabolic interpolation happens here
if (u - a < tol2 || c - u < tol2)
d = (mid > b)? tol1 : 0.0 - tol1;
}
} else d = gold2 * (e = (b >= mid ? a - b : c - b)); // golden section interpolation
u = fabs(d) >= tol1 ? b + d : b + (d > 0.0? tol1 : -tol1);
fu = func(u, data);
if (fu <= fb) { // u is the minimum point so far
if (u >= b) a = b;
else c = b;
v = w; w = b; b = u; fv = fw; fw = fb; fb = fu;
} else { // adjust (a,c) and (u,v,w)
if (u < b) a = u;
else c = u;
if (fu <= fw || w == b) {
v = w; w = u;
fv = fw; fw = fu;
} else if (fu <= fv || v == b || v == w) {
v = u; fv = fu;
}
}
}
*xmin = b;
return fb;
}
static inline double SIGN(double a, double b) // double, to avoid precision loss in krf_brent below
{
	return b >= 0 ? (a >= 0 ? a : -a) : (a >= 0 ? -a : a);
}
double krf_brent(double x1, double x2, double tol, double (*func)(double, void*), void *data, int *err)
{
const int max_iter = 100;
const double eps = 3e-8f;
int i;
double a = x1, b = x2, c = x2, d, e, min1, min2;
double fa, fb, fc, p, q, r, s, tol1, xm;
*err = 0;
fa = func(a, data), fb = func(b, data);
if ((fa > 0.0f && fb > 0.0f) || (fa < 0.0f && fb < 0.0f)) {
*err = -1;
return 0.0f;
}
fc = fb;
for (i = 0; i < max_iter; ++i) {
if ((fb > 0.0f && fc > 0.0f) || (fb < 0.0f && fc < 0.0f)) {
c = a;
fc = fa;
e = d = b - a;
}
if (fabs(fc) < fabs(fb)) {
a = b, b = c, c = a;
fa = fb, fb = fc, fc = fa;
}
tol1 = 2.0f * eps * fabs(b) + 0.5f * tol;
xm = 0.5f * (c - b);
if (fabs(xm) <= tol1 || fb == 0.0f)
return b;
if (fabs(e) >= tol1 && fabs(fa) > fabs(fb)) {
s = fb / fa;
if (a == c) {
p = 2.0f * xm * s;
q = 1.0f - s;
} else {
q = fa / fc;
r = fb / fc;
p = s * (2.0f * xm * q * (q - r) - (b - a) * (r - 1.0f));
q = (q - 1.0f) * (r - 1.0f) * (s - 1.0f);
}
if (p > 0.0f) q = -q;
p = fabs(p);
min1 = 3.0f * xm * q - fabs(tol1 * q);
min2 = fabs(e * q);
if (2.0f * p < (min1 < min2 ? min1 : min2)) {
e = d;
d = p / q;
} else {
d = xm;
e = d;
}
} else {
d = xm;
e = d;
}
a = b;
fa = fb;
if (fabs(d) > tol1) b += d;
else b += SIGN(tol1, xm);
fb = func(b, data);
}
*err = -2;
return 0.0;
}
/*************************
*** Special functions ***
*************************/
/* Log gamma function
* \log{\Gamma(z)}
* AS245, 2nd algorithm, http://lib.stat.cmu.edu/apstat/245
*/
double kf_lgamma(double z)
{
double x = 0;
x += 0.1659470187408462e-06 / (z+7);
x += 0.9934937113930748e-05 / (z+6);
x -= 0.1385710331296526 / (z+5);
x += 12.50734324009056 / (z+4);
x -= 176.6150291498386 / (z+3);
x += 771.3234287757674 / (z+2);
x -= 1259.139216722289 / (z+1);
x += 676.5203681218835 / z;
x += 0.9999999999995183;
return log(x) - 5.58106146679532777 - z + (z-0.5) * log(z+6.5);
}
/* complementary error function
* \frac{2}{\sqrt{\pi}} \int_x^{\infty} e^{-t^2} dt
* AS66, 2nd algorithm, http://lib.stat.cmu.edu/apstat/66
*/
double kf_erfc(double x)
{
const double p0 = 220.2068679123761;
const double p1 = 221.2135961699311;
const double p2 = 112.0792914978709;
const double p3 = 33.912866078383;
const double p4 = 6.37396220353165;
const double p5 = .7003830644436881;
const double p6 = .03526249659989109;
const double q0 = 440.4137358247522;
const double q1 = 793.8265125199484;
const double q2 = 637.3336333788311;
const double q3 = 296.5642487796737;
const double q4 = 86.78073220294608;
const double q5 = 16.06417757920695;
const double q6 = 1.755667163182642;
const double q7 = .08838834764831844;
double expntl, z, p;
z = fabs(x) * M_SQRT2;
if (z > 37.) return x > 0.? 0. : 2.;
expntl = exp(z * z * - .5);
if (z < 10. / M_SQRT2) // for small z
p = expntl * ((((((p6 * z + p5) * z + p4) * z + p3) * z + p2) * z + p1) * z + p0)
/ (((((((q7 * z + q6) * z + q5) * z + q4) * z + q3) * z + q2) * z + q1) * z + q0);
else p = expntl / 2.506628274631001 / (z + 1. / (z + 2. / (z + 3. / (z + 4. / (z + .65)))));
return x > 0.? 2. * p : 2. * (1. - p);
}
/* The following computes regularized incomplete gamma functions.
* Formulas are taken from Wiki, with additional input from Numerical
* Recipes in C (for modified Lentz's algorithm) and AS245
* (http://lib.stat.cmu.edu/apstat/245).
*
* A good online calculator is available at:
*
* http://www.danielsoper.com/statcalc/calc23.aspx
*
* It calculates upper incomplete gamma function, which equals
* kf_gammaq(s,z)*tgamma(s).
*/
#define KF_GAMMA_EPS 1e-14
#define KF_TINY 1e-290
// regularized lower incomplete gamma function, by series expansion
static double _kf_gammap(double s, double z)
{
double sum, x;
int k;
for (k = 1, sum = x = 1.; k < 100; ++k) {
sum += (x *= z / (s + k));
if (x / sum < KF_GAMMA_EPS) break;
}
return exp(s * log(z) - z - kf_lgamma(s + 1.) + log(sum));
}
// regularized upper incomplete gamma function, by continued fraction
static double _kf_gammaq(double s, double z)
{
int j;
double C, D, f;
f = 1. + z - s; C = f; D = 0.;
// Modified Lentz's algorithm for computing continued fraction
// See Numerical Recipes in C, 2nd edition, section 5.2
for (j = 1; j < 100; ++j) {
double a = j * (s - j), b = (j<<1) + 1 + z - s, d;
D = b + a * D;
if (D < KF_TINY) D = KF_TINY;
C = b + a / C;
if (C < KF_TINY) C = KF_TINY;
D = 1. / D;
d = C * D;
f *= d;
if (fabs(d - 1.) < KF_GAMMA_EPS) break;
}
return exp(s * log(z) - z - kf_lgamma(s) - log(f));
}
double kf_gammap(double s, double z)
{
return z <= 1. || z < s? _kf_gammap(s, z) : 1. - _kf_gammaq(s, z);
}
double kf_gammaq(double s, double z)
{
return z <= 1. || z < s? 1. - _kf_gammap(s, z) : _kf_gammaq(s, z);
}
/* Regularized incomplete beta function. The method is taken from
* Numerical Recipe in C, 2nd edition, section 6.4. The following web
* page calculates the incomplete beta function, which equals
* kf_betai(a,b,x) * gamma(a) * gamma(b) / gamma(a+b):
*
* http://www.danielsoper.com/statcalc/calc36.aspx
*/
static double kf_betai_aux(double a, double b, double x)
{
double C, D, f;
int j;
if (x == 0.) return 0.;
if (x == 1.) return 1.;
f = 1.; C = f; D = 0.;
// Modified Lentz's algorithm for computing continued fraction
for (j = 1; j < 200; ++j) {
double aa, d;
int m = j>>1;
aa = (j&1)? -(a + m) * (a + b + m) * x / ((a + 2*m) * (a + 2*m + 1))
: m * (b - m) * x / ((a + 2*m - 1) * (a + 2*m));
D = 1. + aa * D;
if (D < KF_TINY) D = KF_TINY;
C = 1. + aa / C;
if (C < KF_TINY) C = KF_TINY;
D = 1. / D;
d = C * D;
f *= d;
if (fabs(d - 1.) < KF_GAMMA_EPS) break;
}
return exp(kf_lgamma(a+b) - kf_lgamma(a) - kf_lgamma(b) + a * log(x) + b * log(1.-x)) / a / f;
}
double kf_betai(double a, double b, double x)
{
return x < (a + 1.) / (a + b + 2.)? kf_betai_aux(a, b, x) : 1. - kf_betai_aux(b, a, 1. - x);
}
/******************
*** Statistics ***
******************/
double km_ks_dist(int na, const double a[], int nb, const double b[]) // a[] and b[] MUST BE sorted
{
int ia = 0, ib = 0;
double fa = 0, fb = 0, sup = 0, na1 = 1. / na, nb1 = 1. / nb;
while (ia < na || ib < nb) {
if (ia == na) fb += nb1, ++ib;
else if (ib == nb) fa += na1, ++ia;
else if (a[ia] < b[ib]) fa += na1, ++ia;
else if (a[ia] > b[ib]) fb += nb1, ++ib;
else fa += na1, fb += nb1, ++ia, ++ib;
if (sup < fabs(fa - fb)) sup = fabs(fa - fb);
}
return sup;
}
#ifdef KF_MAIN
#include <stdio.h>
#include "ksort.h"
KSORT_INIT_GENERIC(double)
int main(int argc, char *argv[])
{
double x = 5.5, y = 3;
double a, b;
double xx[] = {0.22, -0.87, -2.39, -1.79, 0.37, -1.54, 1.28, -0.31, -0.74, 1.72, 0.38, -0.17, -0.62, -1.10, 0.30, 0.15, 2.30, 0.19, -0.50, -0.09};
double yy[] = {-5.13, -2.19, -2.43, -3.83, 0.50, -3.25, 4.32, 1.63, 5.18, -0.43, 7.11, 4.87, -3.10, -5.81, 3.76, 6.31, 2.58, 0.07, 5.76, 3.50};
ks_introsort(double, 20, xx); ks_introsort(double, 20, yy);
printf("K-S distance: %f\n", km_ks_dist(20, xx, 20, yy));
printf("erfc(%lg): %lg, %lg\n", x, erfc(x), kf_erfc(x));
printf("upper-gamma(%lg,%lg): %lg\n", x, y, kf_gammaq(y, x)*tgamma(y));
a = 2; b = 2; x = 0.5;
printf("incomplete-beta(%lg,%lg,%lg): %lg\n", a, b, x, kf_betai(a, b, x) / exp(kf_lgamma(a+b) - kf_lgamma(a) - kf_lgamma(b)));
return 0;
}
#endif

ext/klib/kmath.h
#ifndef AC_KMATH_H
#define AC_KMATH_H
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
/**************************
* Non-linear programming *
**************************/
#define KMIN_RADIUS 0.5
#define KMIN_EPS 1e-7
#define KMIN_MAXCALL 50000
typedef double (*kmin_f)(int, double*, void*);
typedef double (*kmin1_f)(double, void*);
double kmin_hj(kmin_f func, int n, double *x, void *data, double r, double eps, int max_calls); // Hooke-Jeeves'
double kmin_brent(kmin1_f func, double a, double b, void *data, double tol, double *xmin); // Brent's one-dimensional minimizer
/*********************
* Special functions *
*********************/
double kf_lgamma(double z); // log gamma function
double kf_erfc(double x); // complementary error function
double kf_gammap(double s, double z); // regularized lower incomplete gamma function
double kf_gammaq(double s, double z); // regularized upper incomplete gamma function
double kf_betai(double a, double b, double x); // regularized incomplete beta function
#ifdef __cplusplus
}
#endif
#endif

ext/klib/knetfile.c
/* The MIT License
Copyright (c) 2008 by Genome Research Ltd (GRL).
2010 by Attractive Chaos <attractor@live.co.uk>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
/* Probably I will not do socket programming in the next few years and
therefore I decided to annotate this file heavily, for Linux and
Windows as well. -ac */
#include <time.h>
#include <stdio.h>
#include <ctype.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <sys/types.h>
#ifndef _WIN32
#include <netdb.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#endif
#include "knetfile.h"
/* In winsock.h, the type of a socket is SOCKET, which is: "typedef
* u_int SOCKET". An invalid SOCKET is: "(SOCKET)(~0)", or signed
* integer -1. In knetfile.c, I use "int" for socket type
* throughout. This should be improved to avoid confusion.
*
* In Linux/Mac, recv() and read() do almost the same thing. You can see
* in the header file that netread() is simply an alias of read(). In
* Windows, however, they are different and using recv() is mandatory.
*/
/* This function tests if the file handler is ready for reading (or
* writing if is_read==0). */
static int socket_wait(int fd, int is_read)
{
fd_set fds, *fdr = 0, *fdw = 0;
struct timeval tv;
int ret;
tv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out
FD_ZERO(&fds);
FD_SET(fd, &fds);
if (is_read) fdr = &fds;
else fdw = &fds;
ret = select(fd+1, fdr, fdw, 0, &tv);
#ifndef _WIN32
if (ret == -1) perror("select");
#else
if (ret == 0)
fprintf(stderr, "select time-out\n");
else if (ret == SOCKET_ERROR)
fprintf(stderr, "select: %d\n", WSAGetLastError());
#endif
return ret;
}
#ifndef _WIN32
/* This function does not work with Windows due to the lack of
* getaddrinfo() in winsock. It is adapted from an example in "Beej's
* Guide to Network Programming" (http://beej.us/guide/bgnet/). */
static int socket_connect(const char *host, const char *port)
{
#define __err_connect(func) do { perror(func); freeaddrinfo(res); return -1; } while (0)
int ai_err, on = 1, fd;
struct linger lng = { 0, 0 };
struct addrinfo hints, *res = 0;
memset(&hints, 0, sizeof(struct addrinfo));
hints.ai_family = AF_UNSPEC;
hints.ai_socktype = SOCK_STREAM;
/* In Unix/Mac, getaddrinfo() is the most convenient way to get
* server information. */
if ((ai_err = getaddrinfo(host, port, &hints, &res)) != 0) { fprintf(stderr, "can't resolve %s:%s: %s\n", host, port, gai_strerror(ai_err)); return -1; }
if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket");
/* The following two setsockopt() are used by ftplib
* (http://nbpfaus.net/~pfau/ftplib/). I am not sure if they are
* necessary. */
if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt");
if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_connect("setsockopt");
if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) __err_connect("connect");
freeaddrinfo(res);
return fd;
}
#else
/* MinGW's printf has a problem with "%lld" */
char *int64tostr(char *buf, int64_t x)
{
int cnt;
int i = 0;
do {
buf[i++] = '0' + x % 10;
x /= 10;
} while (x);
buf[i] = 0;
for (cnt = i, i = 0; i < cnt/2; ++i) {
int c = buf[i]; buf[i] = buf[cnt-i-1]; buf[cnt-i-1] = c;
}
return buf;
}
int64_t strtoint64(const char *buf)
{
int64_t x;
for (x = 0; *buf != '\0'; ++buf)
x = x * 10 + ((int64_t) *buf - 48);
return x;
}
/* In windows, the first thing is to establish the TCP connection. */
int knet_win32_init()
{
WSADATA wsaData;
return WSAStartup(MAKEWORD(2, 2), &wsaData);
}
void knet_win32_destroy()
{
WSACleanup();
}
/* A slightly modified version of the following function also works on
* Mac (and presumably Linux). However, this function is not stable on
* my Mac. It sometimes works fine but sometimes does not. Therefore for
* non-Windows OS, I do not use this one. */
static SOCKET socket_connect(const char *host, const char *port)
{
#define __err_connect(func) \
do { \
fprintf(stderr, "%s: %d\n", func, WSAGetLastError()); \
return -1; \
} while (0)
int on = 1;
SOCKET fd;
struct linger lng = { 0, 0 };
struct sockaddr_in server;
struct hostent *hp = 0;
// open socket
if ((fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == INVALID_SOCKET) __err_connect("socket");
if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char*)&on, sizeof(on)) == -1) __err_connect("setsockopt");
if (setsockopt(fd, SOL_SOCKET, SO_LINGER, (char*)&lng, sizeof(lng)) == -1) __err_connect("setsockopt");
// get host info
if (isalpha(host[0])) hp = gethostbyname(host);
else {
struct in_addr addr;
addr.s_addr = inet_addr(host);
hp = gethostbyaddr((char*)&addr, 4, AF_INET);
}
if (hp == 0) __err_connect("gethost");
// connect
server.sin_addr.s_addr = *((unsigned long*)hp->h_addr);
server.sin_family= AF_INET;
server.sin_port = htons(atoi(port));
if (connect(fd, (struct sockaddr*)&server, sizeof(server)) != 0) __err_connect("connect");
// freehostent(hp); // strangely in MSDN, hp is NOT freed (memory leak?!)
return fd;
}
#endif
static off_t my_netread(int fd, void *buf, off_t len)
{
off_t rest = len, curr, l = 0;
/* recv() and read() may not read the required length of data with
* one call. They have to be called repeatedly. */
while (rest) {
if (socket_wait(fd, 1) <= 0) break; // socket is not ready for reading
curr = netread(fd, buf + l, rest);
/* According to the glibc manual, section 13.2, a zero returned
* value indicates end-of-file (EOF), which should mean that
* read() will not return zero if EOF has not been met but data
* are not immediately available. */
if (curr == 0) break;
l += curr; rest -= curr;
}
return l;
}
/*************************
* FTP specific routines *
*************************/
static int kftp_get_response(knetFile *ftp)
{
#ifndef _WIN32
unsigned char c;
#else
char c;
#endif
int n = 0;
char *p;
if (socket_wait(ftp->ctrl_fd, 1) <= 0) return 0;
while (netread(ftp->ctrl_fd, &c, 1)) { // FIXME: this is *VERY BAD* for unbuffered I/O
//fputc(c, stderr);
if (n >= ftp->max_response) {
ftp->max_response = ftp->max_response? ftp->max_response<<1 : 256;
ftp->response = (char*)realloc(ftp->response, ftp->max_response);
}
ftp->response[n++] = c;
if (c == '\n') {
if (n >= 4 && isdigit(ftp->response[0]) && isdigit(ftp->response[1]) && isdigit(ftp->response[2])
&& ftp->response[3] != '-') break;
n = 0;
continue;
}
}
if (n < 2) return -1;
ftp->response[n-2] = 0;
return strtol(ftp->response, &p, 0);
}
static int kftp_send_cmd(knetFile *ftp, const char *cmd, int is_get)
{
if (socket_wait(ftp->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing
netwrite(ftp->ctrl_fd, cmd, strlen(cmd));
return is_get? kftp_get_response(ftp) : 0;
}
static int kftp_pasv_prep(knetFile *ftp)
{
char *p;
int v[6];
kftp_send_cmd(ftp, "PASV\r\n", 1);
for (p = ftp->response; *p && *p != '('; ++p);
if (*p != '(') return -1;
++p;
sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]);
memcpy(ftp->pasv_ip, v, 4 * sizeof(int));
ftp->pasv_port = (v[4]<<8&0xff00) + v[5];
return 0;
}
static int kftp_pasv_connect(knetFile *ftp)
{
char host[80], port[10];
if (ftp->pasv_port == 0) {
fprintf(stderr, "[kftp_pasv_connect] kftp_pasv_prep() was not called beforehand.\n");
return -1;
}
sprintf(host, "%d.%d.%d.%d", ftp->pasv_ip[0], ftp->pasv_ip[1], ftp->pasv_ip[2], ftp->pasv_ip[3]);
sprintf(port, "%d", ftp->pasv_port);
ftp->fd = socket_connect(host, port);
if (ftp->fd == -1) return -1;
return 0;
}
int kftp_connect(knetFile *ftp)
{
ftp->ctrl_fd = socket_connect(ftp->host, ftp->port);
if (ftp->ctrl_fd == -1) return -1;
kftp_get_response(ftp);
kftp_send_cmd(ftp, "USER anonymous\r\n", 1);
kftp_send_cmd(ftp, "PASS kftp@\r\n", 1);
kftp_send_cmd(ftp, "TYPE I\r\n", 1);
return 0;
}
int kftp_reconnect(knetFile *ftp)
{
if (ftp->ctrl_fd != -1) {
netclose(ftp->ctrl_fd);
ftp->ctrl_fd = -1;
}
netclose(ftp->fd);
ftp->fd = -1;
return kftp_connect(ftp);
}
// initialize ->type, ->host, ->retr and ->size
knetFile *kftp_parse_url(const char *fn, const char *mode)
{
knetFile *fp;
char *p;
int l;
if (strstr(fn, "ftp://") != fn) return 0;
for (p = (char*)fn + 6; *p && *p != '/'; ++p);
if (*p != '/') return 0;
l = p - fn - 6;
fp = (knetFile*)calloc(1, sizeof(knetFile));
fp->type = KNF_TYPE_FTP;
fp->fd = -1;
/* the Linux/Mac version of socket_connect() also recognizes a port
* like "ftp", but the Windows version does not. */
fp->port = strdup("21");
fp->host = (char*)calloc(l + 1, 1);
if (strchr(mode, 'c')) fp->no_reconnect = 1;
strncpy(fp->host, fn + 6, l);
fp->retr = (char*)calloc(strlen(p) + 8, 1);
sprintf(fp->retr, "RETR %s\r\n", p);
fp->size_cmd = (char*)calloc(strlen(p) + 8, 1);
sprintf(fp->size_cmd, "SIZE %s\r\n", p);
fp->seek_offset = 0;
return fp;
}
// place ->fd at offset off
int kftp_connect_file(knetFile *fp)
{
int ret;
long long file_size;
if (fp->fd != -1) {
netclose(fp->fd);
if (fp->no_reconnect) kftp_get_response(fp);
}
kftp_pasv_prep(fp);
kftp_send_cmd(fp, fp->size_cmd, 1);
#ifndef _WIN32
if ( sscanf(fp->response,"%*d %lld", &file_size) != 1 )
{
fprintf(stderr,"[kftp_connect_file] %s\n", fp->response);
return -1;
}
#else
const char *p = fp->response;
while (*p != ' ') ++p;
while (*p < '0' || *p > '9') ++p;
file_size = strtoint64(p);
#endif
fp->file_size = file_size;
if (fp->offset>=0) {
char tmp[32];
#ifndef _WIN32
sprintf(tmp, "REST %lld\r\n", (long long)fp->offset);
#else
strcpy(tmp, "REST ");
int64tostr(tmp + 5, fp->offset);
strcat(tmp, "\r\n");
#endif
kftp_send_cmd(fp, tmp, 1);
}
kftp_send_cmd(fp, fp->retr, 0);
kftp_pasv_connect(fp);
ret = kftp_get_response(fp);
if (ret != 150) {
fprintf(stderr, "[kftp_connect_file] %s\n", fp->response);
netclose(fp->fd);
fp->fd = -1;
return -1;
}
fp->is_ready = 1;
return 0;
}
/**************************
* HTTP specific routines *
**************************/
knetFile *khttp_parse_url(const char *fn, const char *mode)
{
knetFile *fp;
char *p, *proxy, *q;
int l;
if (strstr(fn, "http://") != fn) return 0;
// set ->http_host
for (p = (char*)fn + 7; *p && *p != '/'; ++p);
l = p - fn - 7;
fp = (knetFile*)calloc(1, sizeof(knetFile));
fp->http_host = (char*)calloc(l + 1, 1);
strncpy(fp->http_host, fn + 7, l);
fp->http_host[l] = 0;
for (q = fp->http_host; *q && *q != ':'; ++q);
if (*q == ':') *q++ = 0;
// get http_proxy
proxy = getenv("http_proxy");
// set ->host, ->port and ->path
if (proxy == 0) {
fp->host = strdup(fp->http_host); // when there is no proxy, server name is identical to http_host name.
fp->port = strdup(*q? q : "80");
fp->path = strdup(*p? p : "/");
} else {
fp->host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy);
for (q = fp->host; *q && *q != ':'; ++q);
if (*q == ':') *q++ = 0;
fp->port = strdup(*q? q : "80");
fp->path = strdup(fn);
}
fp->type = KNF_TYPE_HTTP;
fp->ctrl_fd = fp->fd = -1;
fp->seek_offset = 0;
return fp;
}
int khttp_connect_file(knetFile *fp)
{
int ret, l = 0;
char *buf, *p;
if (fp->fd != -1) netclose(fp->fd);
fp->fd = socket_connect(fp->host, fp->port);
buf = (char*)calloc(0x10000, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough.
l += sprintf(buf + l, "GET %s HTTP/1.0\r\nHost: %s\r\n", fp->path, fp->http_host);
l += sprintf(buf + l, "Range: bytes=%lld-\r\n", (long long)fp->offset);
l += sprintf(buf + l, "\r\n");
netwrite(fp->fd, buf, l);
l = 0;
while (netread(fp->fd, buf + l, 1)) { // read HTTP header; FIXME: bad efficiency
if (buf[l] == '\n' && l >= 3)
if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break;
++l;
}
buf[l] = 0;
if (l < 14) { // premature header
netclose(fp->fd);
fp->fd = -1;
return -1;
}
ret = strtol(buf + 8, &p, 0); // HTTP return code
if (ret == 200 && fp->offset>0) { // 200 (complete result); then skip beginning of the file
off_t rest = fp->offset;
while (rest) {
off_t l = rest < 0x10000? rest : 0x10000;
rest -= my_netread(fp->fd, buf, l);
}
} else if (ret != 206 && ret != 200) {
free(buf);
fprintf(stderr, "[khttp_connect_file] fail to open file (HTTP code: %d).\n", ret);
netclose(fp->fd);
fp->fd = -1;
return -1;
}
free(buf);
fp->is_ready = 1;
return 0;
}
/********************
* Generic routines *
********************/
knetFile *knet_open(const char *fn, const char *mode)
{
knetFile *fp = 0;
if (mode[0] != 'r') {
fprintf(stderr, "[knet_open] only mode \"r\" is supported.\n");
return 0;
}
if (strstr(fn, "ftp://") == fn) {
fp = kftp_parse_url(fn, mode);
if (fp == 0) return 0;
if (kftp_connect(fp) == -1) {
knet_close(fp);
return 0;
}
kftp_connect_file(fp);
} else if (strstr(fn, "http://") == fn) {
fp = khttp_parse_url(fn, mode);
if (fp == 0) return 0;
khttp_connect_file(fp);
} else { // local file
#ifdef _WIN32
/* On Windows, O_BINARY is necessary. On Linux/Mac, O_BINARY may
* be undefined on some systems, although it is defined on my
* Mac and on the Linux systems I have tested. */
int fd = open(fn, O_RDONLY | O_BINARY);
#else
int fd = open(fn, O_RDONLY);
#endif
if (fd == -1) {
perror("open");
return 0;
}
fp = (knetFile*)calloc(1, sizeof(knetFile));
fp->type = KNF_TYPE_LOCAL;
fp->fd = fd;
fp->ctrl_fd = -1;
}
if (fp && fp->fd == -1) {
knet_close(fp);
return 0;
}
return fp;
}
knetFile *knet_dopen(int fd, const char *mode)
{
knetFile *fp = (knetFile*)calloc(1, sizeof(knetFile));
fp->type = KNF_TYPE_LOCAL;
fp->fd = fd;
return fp;
}
off_t knet_read(knetFile *fp, void *buf, off_t len)
{
off_t l = 0;
if (fp->fd == -1) return 0;
if (fp->type == KNF_TYPE_FTP) {
if (fp->is_ready == 0) {
if (!fp->no_reconnect) kftp_reconnect(fp);
kftp_connect_file(fp);
}
} else if (fp->type == KNF_TYPE_HTTP) {
if (fp->is_ready == 0)
khttp_connect_file(fp);
}
if (fp->type == KNF_TYPE_LOCAL) { // on Windows, the following block is necessary; not on UNIX
off_t rest = len, curr;
while (rest) {
do {
curr = read(fp->fd, buf + l, rest);
} while (curr < 0 && EINTR == errno);
if (curr < 0) return -1;
if (curr == 0) break;
l += curr; rest -= curr;
}
} else l = my_netread(fp->fd, buf, len);
fp->offset += l;
return l;
}
off_t knet_seek(knetFile *fp, int64_t off, int whence)
{
if (whence == SEEK_SET && off == fp->offset) return 0;
if (fp->type == KNF_TYPE_LOCAL) {
/* Be aware that lseek() returns the offset after seeking,
* while fseek() returns zero on success. */
off_t offset = lseek(fp->fd, off, whence);
if (offset == -1) {
// Be silent, it is OK for knet_seek to fail when the file is streamed
// fprintf(stderr,"[knet_seek] %s\n", strerror(errno));
return -1;
}
fp->offset = offset;
return off;
} else if (fp->type == KNF_TYPE_FTP) {
if (whence==SEEK_CUR)
fp->offset += off;
else if (whence==SEEK_SET)
fp->offset = off;
else if ( whence==SEEK_END)
fp->offset = fp->file_size+off;
fp->is_ready = 0;
return off;
} else if (fp->type == KNF_TYPE_HTTP) {
if (whence == SEEK_END) { // FIXME: can we allow SEEK_END in future?
fprintf(stderr, "[knet_seek] SEEK_END is not supported for HTTP. Offset is unchanged.\n");
errno = ESPIPE;
return -1;
}
if (whence==SEEK_CUR)
fp->offset += off;
else if (whence==SEEK_SET)
fp->offset = off;
fp->is_ready = 0;
return off;
}
errno = EINVAL;
fprintf(stderr,"[knet_seek] %s\n", strerror(errno));
return -1;
}
int knet_close(knetFile *fp)
{
if (fp == 0) return 0;
if (fp->ctrl_fd != -1) netclose(fp->ctrl_fd); // FTP specific
if (fp->fd != -1) {
/* On Linux/Mac, netclose() is an alias of close(), but on
* Windows, it is an alias of closesocket(). */
if (fp->type == KNF_TYPE_LOCAL) close(fp->fd);
else netclose(fp->fd);
}
free(fp->host); free(fp->port);
free(fp->response); free(fp->retr); // FTP specific
free(fp->path); free(fp->http_host); // HTTP specific
free(fp);
return 0;
}
#ifdef KNETFILE_MAIN
int main(void)
{
char *buf;
knetFile *fp;
int type = 4, l;
#ifdef _WIN32
knet_win32_init();
#endif
buf = calloc(0x100000, 1);
if (type == 0) {
fp = knet_open("knetfile.c", "r");
knet_seek(fp, 1000, SEEK_SET);
} else if (type == 1) { // NCBI FTP, large file
fp = knet_open("ftp://ftp.ncbi.nih.gov/1000genomes/ftp/data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam", "r");
knet_seek(fp, 2500000000ll, SEEK_SET);
l = knet_read(fp, buf, 255);
} else if (type == 2) {
fp = knet_open("ftp://ftp.sanger.ac.uk/pub4/treefam/tmp/index.shtml", "r");
knet_seek(fp, 1000, SEEK_SET);
} else if (type == 3) {
fp = knet_open("http://www.sanger.ac.uk/Users/lh3/index.shtml", "r");
knet_seek(fp, 1000, SEEK_SET);
} else if (type == 4) {
fp = knet_open("http://www.sanger.ac.uk/Users/lh3/ex1.bam", "r");
knet_read(fp, buf, 10000);
knet_seek(fp, 20000, SEEK_SET);
knet_seek(fp, 10000, SEEK_SET);
l = knet_read(fp, buf+10000, 10000000) + 10000;
}
if (type != 4 && type != 1) {
knet_read(fp, buf, 255);
buf[255] = 0;
printf("%s\n", buf);
} else write(fileno(stdout), buf, l);
knet_close(fp);
free(buf);
return 0;
}
#endif

ext/klib/knetfile.h
#ifndef KNETFILE_H
#define KNETFILE_H
#include <stdint.h>
#include <fcntl.h>
#ifndef _WIN32
#define netread(fd, ptr, len) read(fd, ptr, len)
#define netwrite(fd, ptr, len) write(fd, ptr, len)
#define netclose(fd) close(fd)
#else
#include <winsock2.h>
#define netread(fd, ptr, len) recv(fd, ptr, len, 0)
#define netwrite(fd, ptr, len) send(fd, ptr, len, 0)
#define netclose(fd) closesocket(fd)
#endif
// FIXME: currently I/O is unbuffered
#define KNF_TYPE_LOCAL 1
#define KNF_TYPE_FTP 2
#define KNF_TYPE_HTTP 3
typedef struct knetFile_s {
int type, fd;
int64_t offset;
char *host, *port;
// the following are for FTP only
int ctrl_fd, pasv_ip[4], pasv_port, max_response, no_reconnect, is_ready;
char *response, *retr, *size_cmd;
int64_t seek_offset; // for lazy seek
int64_t file_size;
// the following are for HTTP only
char *path, *http_host;
} knetFile;
#define knet_tell(fp) ((fp)->offset)
#define knet_fileno(fp) ((fp)->fd)
#ifdef __cplusplus
extern "C" {
#endif
#ifdef _WIN32
int knet_win32_init();
void knet_win32_destroy();
#endif
knetFile *knet_open(const char *fn, const char *mode);
/*
This only works with local files.
*/
knetFile *knet_dopen(int fd, const char *mode);
/*
If ->is_ready==0, this routine updates ->fd; otherwise, it simply
reads from ->fd.
*/
off_t knet_read(knetFile *fp, void *buf, off_t len);
/*
This routine only sets ->offset and ->is_ready=0. It does not
communicate with the FTP server.
*/
off_t knet_seek(knetFile *fp, int64_t off, int whence);
int knet_close(knetFile *fp);
#ifdef __cplusplus
}
#endif
#endif

ext/klib/knhx.c
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "knhx.h"
typedef struct {
int error, n, max;
knhx1_t *node;
} knaux_t;
static inline char *add_node(const char *s, knaux_t *aux, int x)
{
char *p, *nbeg, *nend = 0;
knhx1_t *r;
if (aux->n == aux->max) {
aux->max = aux->max? aux->max<<1 : 8;
aux->node = (knhx1_t*)realloc(aux->node, sizeof(knhx1_t) * aux->max);
}
r = aux->node + (aux->n++);
r->n = x; r->parent = -1;
for (p = (char*)s, nbeg = p, r->d = -1.0; *p && *p != ',' && *p != ')'; ++p) {
if (*p == '[') {
if (nend == 0) nend = p;
do ++p; while (*p && *p != ']');
if (*p == 0) {
aux->error |= KNERR_BRACKET;
break;
}
} else if (*p == ':') {
if (nend == 0) nend = p;
r->d = strtod(p + 1, &p);
--p;
} else if (!isgraph(*p)) if (nend == 0) nend = p;
}
if (nend == 0) nend = p;
if (nend != nbeg) {
r->name = (char*)calloc(nend - nbeg + 1, 1);
strncpy(r->name, nbeg, nend - nbeg);
} else r->name = strdup("");
return p;
}
knhx1_t *kn_parse(const char *nhx, int *_n, int *_error)
{
char *p;
int *stack, top, max;
knaux_t *aux;
knhx1_t *ret;
#define __push_back(y) do { \
if (top == max) { \
max = max? max<<1 : 16; \
stack = (int*)realloc(stack, sizeof(int) * max); \
} \
stack[top++] = (y); \
} while (0) \
stack = 0; top = max = 0;
p = (char*)nhx;
aux = (knaux_t*)calloc(1, sizeof(knaux_t));
while (*p) {
while (*p && !isgraph(*p)) ++p;
if (*p == 0) break;
if (*p == ',') ++p;
else if (*p == '(') {
__push_back(-1);
++p;
} else if (*p == ')') {
int x = aux->n, m, i;
for (i = top - 1; i >= 0; --i)
if (stack[i] < 0) break;
m = top - 1 - i;
p = add_node(p + 1, aux, m);
aux->node[x].child = (int*)calloc(m, sizeof(int));
for (i = top - 1, m = m - 1; m >= 0; --m, --i) {
aux->node[x].child[m] = stack[i];
aux->node[stack[i]].parent = x;
}
top = i;
__push_back(x);
} else {
__push_back(aux->n);
p = add_node(p, aux, 0);
}
}
*_n = aux->n;
*_error = aux->error;
ret = aux->node;
free(aux); free(stack);
return ret;
}
#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif
static inline int kputsn(const char *p, int l, kstring_t *s)
{
if (s->l + l + 1 >= s->m) {
s->m = s->l + l + 2;
kroundup32(s->m);
s->s = (char*)realloc(s->s, s->m);
}
memcpy(s->s + s->l, p, l);
s->l += l; s->s[s->l] = 0;
return l;
}
static inline int kputc(int c, kstring_t *s)
{
if (s->l + 1 >= s->m) {
s->m = s->l + 2;
kroundup32(s->m);
s->s = (char*)realloc(s->s, s->m);
}
s->s[s->l++] = c; s->s[s->l] = 0;
return c;
}
static void format_node_recur(const knhx1_t *node, const knhx1_t *p, kstring_t *s, char *numbuf)
{
if (p->n) {
int i;
kputc('(', s);
for (i = 0; i < p->n; ++i) {
if (i) kputc(',', s);
format_node_recur(node, &node[p->child[i]], s, numbuf);
}
kputc(')', s);
if (p->name) kputsn(p->name, strlen(p->name), s);
if (p->d >= 0) {
sprintf(numbuf, ":%g", p->d);
kputsn(numbuf, strlen(numbuf), s);
}
} else {
kputsn(p->name, strlen(p->name), s);
if (p->d >= 0) {
sprintf(numbuf, ":%g", p->d);
kputsn(numbuf, strlen(numbuf), s);
}
}
}
void kn_format(const knhx1_t *node, int root, kstring_t *s) // TODO: get rid of recursion
{
char numbuf[128];
format_node_recur(node, &node[root], s, numbuf);
}
#ifdef KNHX_MAIN
int main(int argc, char *argv[])
{
char *s = "((a[abc],d1)x:0.5,((b[&&NHX:S=MOUSE],h2)[&&NHX:S=HUMAN:B=99][blabla][&&NHX:K=foo],c))";
knhx1_t *node;
int i, j, n, error;
kstring_t str;
node = kn_parse(s, &n, &error);
for (i = 0; i < n; ++i) {
knhx1_t *p = node + i;
printf("[%d] %s\t%d\t%d\t%g", i, p->name, p->parent, p->n, p->d);
for (j = 0; j < p->n; ++j)
printf("\t%d", p->child[j]);
putchar('\n');
}
str.l = str.m = 0; str.s = 0;
kn_format(node, n-1, &str);
puts(str.s);
free(str.s);
return 0;
}
#endif

35
ext/klib/knhx.h 100644
View File

@ -0,0 +1,35 @@
#ifndef KNHX_H_
#define KNHX_H_
#define KNERR_MISSING_LEFT 0x01
#define KNERR_MISSING_RGHT 0x02
#define KNERR_BRACKET 0x04
#define KNERR_COLON 0x08
typedef struct {
int parent, n;
int *child;
char *name;
double d;
} knhx1_t;
#ifndef KSTRING_T
#define KSTRING_T kstring_t
typedef struct __kstring_t {
size_t l, m;
char *s;
} kstring_t;
#endif
#ifdef __cplusplus
extern "C" {
#endif
knhx1_t *kn_parse(const char *nhx, int *_n, int *_error);
void kn_format(const knhx1_t *node, int root, kstring_t *s);
#ifdef __cplusplus
}
#endif
#endif

343
ext/klib/kopen.c 100644
View File

@ -0,0 +1,343 @@
#include <stdio.h>
#include <fcntl.h>
#include <errno.h>
#include <ctype.h>
#include <unistd.h>
#include <string.h>
#include <stdlib.h>
#include <signal.h>
#include <sys/types.h>
#include <sys/wait.h> /* for waitpid() used in kclose() */
#ifndef _WIN32
#include <netdb.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#endif
#ifdef _WIN32
#define _KO_NO_NET
#endif
#ifndef _KO_NO_NET
static int socket_wait(int fd, int is_read)
{
fd_set fds, *fdr = 0, *fdw = 0;
struct timeval tv;
int ret;
tv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out
FD_ZERO(&fds);
FD_SET(fd, &fds);
if (is_read) fdr = &fds;
else fdw = &fds;
ret = select(fd+1, fdr, fdw, 0, &tv);
if (ret == -1) perror("select");
return ret;
}
static int socket_connect(const char *host, const char *port)
{
#define __err_connect(func) do { perror(func); freeaddrinfo(res); return -1; } while (0)
int ai_err, on = 1, fd;
struct linger lng = { 0, 0 };
struct addrinfo hints, *res = 0;
memset(&hints, 0, sizeof(struct addrinfo));
hints.ai_family = AF_UNSPEC;
hints.ai_socktype = SOCK_STREAM;
if ((ai_err = getaddrinfo(host, port, &hints, &res)) != 0) { fprintf(stderr, "can't resolve %s:%s: %s\n", host, port, gai_strerror(ai_err)); return -1; }
if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket");
if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt");
if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_connect("setsockopt");
if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) __err_connect("connect");
freeaddrinfo(res);
return fd;
#undef __err_connect
}
static int http_open(const char *fn)
{
char *p, *proxy, *q, *http_host, *host, *port, *path, *buf;
int fd, ret, l;
/* parse URL; adapted from khttp_parse_url() in knetfile.c */
if (strstr(fn, "http://") != fn) return 0;
// set ->http_host
for (p = (char*)fn + 7; *p && *p != '/'; ++p);
l = p - fn - 7;
http_host = calloc(l + 1, 1);
strncpy(http_host, fn + 7, l);
http_host[l] = 0;
for (q = http_host; *q && *q != ':'; ++q);
if (*q == ':') *q++ = 0;
// get http_proxy
proxy = getenv("http_proxy");
// set host, port and path
if (proxy == 0) {
host = strdup(http_host); // when there is no proxy, server name is identical to http_host name.
port = strdup(*q? q : "80");
path = strdup(*p? p : "/");
} else {
host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy);
for (q = host; *q && *q != ':'; ++q);
if (*q == ':') *q++ = 0;
port = strdup(*q? q : "80");
path = strdup(fn);
}
/* connect; adapted from khttp_connect() in knetfile.c */
l = 0;
fd = socket_connect(host, port);
buf = calloc(0x10000, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough.
l += sprintf(buf + l, "GET %s HTTP/1.0\r\nHost: %s\r\n", path, http_host);
l += sprintf(buf + l, "\r\n");
write(fd, buf, l);
l = 0;
while (read(fd, buf + l, 1)) { // read HTTP header; FIXME: bad efficiency
if (buf[l] == '\n' && l >= 3)
if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break;
++l;
}
buf[l] = 0;
if (l < 14) { // premature header
close(fd);
fd = -1;
} else {
ret = strtol(buf + 8, &p, 0); // HTTP return code
if (ret != 200) {
close(fd);
fd = -1;
}
}
free(buf); free(http_host); free(host); free(port); free(path);
return fd;
}
typedef struct {
int max_response, ctrl_fd;
char *response;
} ftpaux_t;
static int kftp_get_response(ftpaux_t *aux)
{
unsigned char c;
int n = 0;
char *p;
if (socket_wait(aux->ctrl_fd, 1) <= 0) return 0;
while (read(aux->ctrl_fd, &c, 1)) { // FIXME: this is *VERY BAD* for unbuffered I/O
if (n >= aux->max_response) {
aux->max_response = aux->max_response? aux->max_response<<1 : 256;
aux->response = realloc(aux->response, aux->max_response);
}
aux->response[n++] = c;
if (c == '\n') {
if (n >= 4 && isdigit(aux->response[0]) && isdigit(aux->response[1]) && isdigit(aux->response[2])
&& aux->response[3] != '-') break;
n = 0;
continue;
}
}
if (n < 2) return -1;
aux->response[n-2] = 0;
return strtol(aux->response, &p, 0);
}
static int kftp_send_cmd(ftpaux_t *aux, const char *cmd, int is_get)
{
if (socket_wait(aux->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing
write(aux->ctrl_fd, cmd, strlen(cmd));
return is_get? kftp_get_response(aux) : 0;
}
static int ftp_open(const char *fn)
{
char *p, *host = 0, *port = 0, *retr = 0;
char host2[80], port2[10];
int v[6], l, fd = -1, ret, pasv_port, pasv_ip[4];
ftpaux_t aux;
/* parse URL */
if (strstr(fn, "ftp://") != fn) return 0;
for (p = (char*)fn + 6; *p && *p != '/'; ++p);
if (*p != '/') return 0;
l = p - fn - 6;
port = strdup("21");
host = calloc(l + 1, 1);
strncpy(host, fn + 6, l);
retr = calloc(strlen(p) + 8, 1);
sprintf(retr, "RETR %s\r\n", p);
/* connect to ctrl */
memset(&aux, 0, sizeof(ftpaux_t));
aux.ctrl_fd = socket_connect(host, port);
if (aux.ctrl_fd == -1) goto ftp_open_end; /* fail to connect ctrl */
/* connect to the data stream */
kftp_get_response(&aux);
kftp_send_cmd(&aux, "USER anonymous\r\n", 1);
kftp_send_cmd(&aux, "PASS kopen@\r\n", 1);
kftp_send_cmd(&aux, "TYPE I\r\n", 1);
kftp_send_cmd(&aux, "PASV\r\n", 1);
for (p = aux.response; *p && *p != '('; ++p);
if (*p != '(') goto ftp_open_end;
++p;
sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]);
memcpy(pasv_ip, v, 4 * sizeof(int));
pasv_port = (v[4]<<8&0xff00) + v[5];
kftp_send_cmd(&aux, retr, 0);
sprintf(host2, "%d.%d.%d.%d", pasv_ip[0], pasv_ip[1], pasv_ip[2], pasv_ip[3]);
sprintf(port2, "%d", pasv_port);
fd = socket_connect(host2, port2);
if (fd == -1) goto ftp_open_end;
ret = kftp_get_response(&aux);
if (ret != 150) {
close(fd);
fd = -1;
}
close(aux.ctrl_fd);
ftp_open_end:
free(host); free(port); free(retr); free(aux.response);
return fd;
}
#endif /* !defined(_KO_NO_NET) */
static char **cmd2argv(const char *cmd)
{
int i, beg, end, argc;
char **argv, *p, *q, *str;
end = strlen(cmd);
for (i = end - 1; i >= 0; --i)
if (!isspace(cmd[i])) break;
end = i + 1;
for (beg = 0; beg < end; ++beg)
if (!isspace(cmd[beg])) break;
if (beg == end) return 0;
for (i = beg + 1, argc = 0; i < end; ++i)
if (isspace(cmd[i]) && !isspace(cmd[i-1]))
++argc;
argv = (char**)calloc(argc + 2, sizeof(void*));
argv[0] = str = (char*)calloc(end - beg + 1, 1);
strncpy(argv[0], cmd + beg, end - beg);
for (i = argc = 1, q = p = str; i < end - beg; ++i)
if (isspace(str[i])) str[i] = 0;
else if (str[i] && str[i-1] == 0) argv[argc++] = &str[i];
return argv;
}
#define KO_STDIN 1
#define KO_FILE 2
#define KO_PIPE 3
#define KO_HTTP 4
#define KO_FTP 5
typedef struct {
int type, fd;
pid_t pid;
} koaux_t;
void *kopen(const char *fn, int *_fd)
{
koaux_t *aux = 0;
*_fd = -1;
if (strstr(fn, "http://") == fn) {
aux = calloc(1, sizeof(koaux_t));
aux->type = KO_HTTP;
aux->fd = http_open(fn);
} else if (strstr(fn, "ftp://") == fn) {
aux = calloc(1, sizeof(koaux_t));
aux->type = KO_FTP;
aux->fd = ftp_open(fn);
} else if (strcmp(fn, "-") == 0) {
aux = calloc(1, sizeof(koaux_t));
aux->type = KO_STDIN;
aux->fd = STDIN_FILENO;
} else {
const char *p, *q;
for (p = fn; *p; ++p)
if (!isspace(*p)) break;
if (*p == '<') { // pipe open
int need_shell, pfd[2];
pid_t pid;
// a simple check to see if we need to invoke a shell; not always working
for (q = p + 1; *q; ++q)
if (ispunct(*q) && *q != '.' && *q != '_' && *q != '-' && *q != ':')
break;
need_shell = (*q != 0);
pipe(pfd);
pid = vfork();
if (pid == -1) { /* vfork() error */
close(pfd[0]); close(pfd[1]);
return 0;
}
if (pid == 0) { /* the child process */
char **argv; /* FIXME: I do not know if this will lead to a memory leak */
close(pfd[0]);
dup2(pfd[1], STDOUT_FILENO);
close(pfd[1]);
if (!need_shell) {
argv = cmd2argv(p + 1);
execvp(argv[0], argv);
free(argv[0]); free(argv);
} else execl("/bin/sh", "sh", "-c", p + 1, NULL);
exit(1);
} else { /* parent process */
close(pfd[1]);
aux = calloc(1, sizeof(koaux_t));
aux->type = KO_PIPE;
aux->fd = pfd[0];
aux->pid = pid;
}
} else {
#ifdef _WIN32
*_fd = open(fn, O_RDONLY | O_BINARY);
#else
*_fd = open(fn, O_RDONLY);
#endif
if (*_fd >= 0) { /* open(2) returns -1 on failure; fd 0 is a valid descriptor */
aux = calloc(1, sizeof(koaux_t));
aux->type = KO_FILE;
aux->fd = *_fd;
}
}
}
if (aux) *_fd = aux->fd; /* aux may be NULL if the open/connect failed */
return aux;
}
int kclose(void *a)
{
koaux_t *aux = (koaux_t*)a;
if (aux->type == KO_PIPE) {
int status;
pid_t pid;
pid = waitpid(aux->pid, &status, WNOHANG);
if (pid != aux->pid) kill(aux->pid, 15);
}
free(aux); /* release the bookkeeping struct allocated by kopen() */
return 0;
}
#ifdef _KO_MAIN
#define BUF_SIZE 0x10000
int main(int argc, char *argv[])
{
void *x;
int l, fd;
unsigned char buf[BUF_SIZE];
FILE *fp;
if (argc == 1) {
fprintf(stderr, "Usage: kopen <file>\n");
return 1;
}
x = kopen(argv[1], &fd);
fp = fdopen(fd, "r");
if (fp == 0) {
fprintf(stderr, "ERROR: fail to open the input\n");
return 1;
}
do {
if ((l = fread(buf, 1, BUF_SIZE, fp)) != 0)
fwrite(buf, 1, l, stdout);
} while (l == BUF_SIZE);
fclose(fp);
kclose(x);
return 0;
}
#endif

474
ext/klib/krmq.h 100644
View File

@ -0,0 +1,474 @@
/* The MIT License
Copyright (c) 2019 by Attractive Chaos <attractor@live.co.uk>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
/* An example:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "krmq.h"
struct my_node {
char key;
KRMQ_HEAD(struct my_node) head;
};
#define my_cmp(p, q) (((q)->key < (p)->key) - ((p)->key < (q)->key))
KRMQ_INIT(my, struct my_node, head, my_cmp)
int main(void) {
const char *str = "MNOLKQOPHIA"; // from wiki, except a duplicate
struct my_node *root = 0;
int i, l = strlen(str);
for (i = 0; i < l; ++i) { // insert in the input order
struct my_node *q, *p = malloc(sizeof(*p));
p->key = str[i];
q = krmq_insert(my, &root, p, 0);
if (p != q) free(p); // if already present, free
}
krmq_itr_t(my) itr;
krmq_itr_first(my, root, &itr); // place at first
do { // traverse
const struct my_node *p = krmq_at(&itr);
putchar(p->key);
free((void*)p); // free node
} while (krmq_itr_next(my, &itr));
putchar('\n');
return 0;
}
*/
#ifndef KRMQ_H
#define KRMQ_H
#ifdef __STRICT_ANSI__
#define inline __inline__
#endif
#define KRMQ_MAX_DEPTH 64
#define krmq_size(head, p) ((p)? (p)->head.size : 0)
#define krmq_size_child(head, q, i) ((q)->head.p[(i)]? (q)->head.p[(i)]->head.size : 0)
#define KRMQ_HEAD(__type) \
struct { \
__type *p[2], *s; \
signed char balance; /* balance factor */ \
unsigned size; /* #elements in subtree */ \
}
#define __KRMQ_FIND(suf, __scope, __type, __head, __cmp) \
__scope __type *krmq_find_##suf(const __type *root, const __type *x, unsigned *cnt_) { \
const __type *p = root; \
unsigned cnt = 0; \
while (p != 0) { \
int cmp; \
cmp = __cmp(x, p); \
if (cmp >= 0) cnt += krmq_size_child(__head, p, 0) + 1; \
if (cmp < 0) p = p->__head.p[0]; \
else if (cmp > 0) p = p->__head.p[1]; \
else break; \
} \
if (cnt_) *cnt_ = cnt; \
return (__type*)p; \
} \
__scope __type *krmq_interval_##suf(const __type *root, const __type *x, __type **lower, __type **upper) { \
const __type *p = root, *l = 0, *u = 0; \
while (p != 0) { \
int cmp; \
cmp = __cmp(x, p); \
if (cmp < 0) u = p, p = p->__head.p[0]; \
else if (cmp > 0) l = p, p = p->__head.p[1]; \
else { l = u = p; break; } \
} \
if (lower) *lower = (__type*)l; \
if (upper) *upper = (__type*)u; \
return (__type*)p; \
}
#define __KRMQ_RMQ(suf, __scope, __type, __head, __cmp, __lt2) \
__scope __type *krmq_rmq_##suf(const __type *root, const __type *lo, const __type *up) { /* CLOSED interval */ \
const __type *p = root, *path[2][KRMQ_MAX_DEPTH], *min; \
int plen[2] = {0, 0}, pcmp[2][KRMQ_MAX_DEPTH], i, cmp, lca; \
if (root == 0) return 0; \
while (p) { \
cmp = __cmp(lo, p); \
path[0][plen[0]] = p, pcmp[0][plen[0]++] = cmp; \
if (cmp < 0) p = p->__head.p[0]; \
else if (cmp > 0) p = p->__head.p[1]; \
else break; \
} \
p = root; \
while (p) { \
cmp = __cmp(up, p); \
path[1][plen[1]] = p, pcmp[1][plen[1]++] = cmp; \
if (cmp < 0) p = p->__head.p[0]; \
else if (cmp > 0) p = p->__head.p[1]; \
else break; \
} \
for (i = 0; i < plen[0] && i < plen[1]; ++i) /* find the LCA */ \
if (path[0][i] == path[1][i] && pcmp[0][i] <= 0 && pcmp[1][i] >= 0) \
break; \
if (i == plen[0] || i == plen[1]) return 0; /* no elements in the closed interval */ \
lca = i, min = path[0][lca]; \
for (i = lca + 1; i < plen[0]; ++i) { \
if (pcmp[0][i] <= 0) { \
if (__lt2(path[0][i], min)) min = path[0][i]; \
if (path[0][i]->__head.p[1] && __lt2(path[0][i]->__head.p[1]->__head.s, min)) \
min = path[0][i]->__head.p[1]->__head.s; \
} \
} \
for (i = lca + 1; i < plen[1]; ++i) { \
if (pcmp[1][i] >= 0) { \
if (__lt2(path[1][i], min)) min = path[1][i]; \
if (path[1][i]->__head.p[0] && __lt2(path[1][i]->__head.p[0]->__head.s, min)) \
min = path[1][i]->__head.p[0]->__head.s; \
} \
} \
return (__type*)min; \
}
#define __KRMQ_ROTATE(suf, __type, __head, __lt2) \
/* */ \
static inline void krmq_update_min_##suf(__type *p, const __type *q, const __type *r) { \
p->__head.s = !q || __lt2(p, q->__head.s)? p : q->__head.s; \
p->__head.s = !r || __lt2(p->__head.s, r->__head.s)? p->__head.s : r->__head.s; \
} \
/* one rotation: (a,(b,c)q)p => ((a,b)p,c)q */ \
static inline __type *krmq_rotate1_##suf(__type *p, int dir) { /* dir=0 to left; dir=1 to right */ \
int opp = 1 - dir; /* opposite direction */ \
__type *q = p->__head.p[opp], *s = p->__head.s; \
unsigned size_p = p->__head.size; \
p->__head.size -= q->__head.size - krmq_size_child(__head, q, dir); \
q->__head.size = size_p; \
krmq_update_min_##suf(p, p->__head.p[dir], q->__head.p[dir]); \
q->__head.s = s; \
p->__head.p[opp] = q->__head.p[dir]; \
q->__head.p[dir] = p; \
return q; \
} \
/* two consecutive rotations: (a,((b,c)r,d)q)p => ((a,b)p,(c,d)q)r */ \
static inline __type *krmq_rotate2_##suf(__type *p, int dir) { \
int b1, opp = 1 - dir; \
__type *q = p->__head.p[opp], *r = q->__head.p[dir], *s = p->__head.s; \
unsigned size_x_dir = krmq_size_child(__head, r, dir); \
r->__head.size = p->__head.size; \
p->__head.size -= q->__head.size - size_x_dir; \
q->__head.size -= size_x_dir + 1; \
krmq_update_min_##suf(p, p->__head.p[dir], r->__head.p[dir]); \
krmq_update_min_##suf(q, q->__head.p[opp], r->__head.p[opp]); \
r->__head.s = s; \
p->__head.p[opp] = r->__head.p[dir]; \
r->__head.p[dir] = p; \
q->__head.p[dir] = r->__head.p[opp]; \
r->__head.p[opp] = q; \
b1 = dir == 0? +1 : -1; \
if (r->__head.balance == b1) q->__head.balance = 0, p->__head.balance = -b1; \
else if (r->__head.balance == 0) q->__head.balance = p->__head.balance = 0; \
else q->__head.balance = b1, p->__head.balance = 0; \
r->__head.balance = 0; \
return r; \
}
#define __KRMQ_INSERT(suf, __scope, __type, __head, __cmp, __lt2) \
__scope __type *krmq_insert_##suf(__type **root_, __type *x, unsigned *cnt_) { \
unsigned char stack[KRMQ_MAX_DEPTH]; \
__type *path[KRMQ_MAX_DEPTH]; \
__type *bp, *bq; \
__type *p, *q, *r = 0; /* _r_ is potentially the new root */ \
int i, which = 0, top, b1, path_len; \
unsigned cnt = 0; \
bp = *root_, bq = 0; \
/* find the insertion location */ \
for (p = bp, q = bq, top = path_len = 0; p; q = p, p = p->__head.p[which]) { \
int cmp; \
cmp = __cmp(x, p); \
if (cmp >= 0) cnt += krmq_size_child(__head, p, 0) + 1; \
if (cmp == 0) { \
if (cnt_) *cnt_ = cnt; \
return p; \
} \
if (p->__head.balance != 0) \
bq = q, bp = p, top = 0; \
stack[top++] = which = (cmp > 0); \
path[path_len++] = p; \
} \
if (cnt_) *cnt_ = cnt; \
x->__head.balance = 0, x->__head.size = 1, x->__head.p[0] = x->__head.p[1] = 0, x->__head.s = x; \
if (q == 0) *root_ = x; \
else q->__head.p[which] = x; \
if (bp == 0) return x; \
for (i = 0; i < path_len; ++i) ++path[i]->__head.size; \
for (i = path_len - 1; i >= 0; --i) { \
krmq_update_min_##suf(path[i], path[i]->__head.p[0], path[i]->__head.p[1]); \
if (path[i]->__head.s != x) break; \
} \
for (p = bp, top = 0; p != x; p = p->__head.p[stack[top]], ++top) /* update balance factors */ \
if (stack[top] == 0) --p->__head.balance; \
else ++p->__head.balance; \
if (bp->__head.balance > -2 && bp->__head.balance < 2) return x; /* no re-balance needed */ \
/* re-balance */ \
which = (bp->__head.balance < 0); \
b1 = which == 0? +1 : -1; \
q = bp->__head.p[1 - which]; \
if (q->__head.balance == b1) { \
r = krmq_rotate1_##suf(bp, which); \
q->__head.balance = bp->__head.balance = 0; \
} else r = krmq_rotate2_##suf(bp, which); \
if (bq == 0) *root_ = r; \
else bq->__head.p[bp != bq->__head.p[0]] = r; \
return x; \
}
#define __KRMQ_ERASE(suf, __scope, __type, __head, __cmp, __lt2) \
__scope __type *krmq_erase_##suf(__type **root_, const __type *x, unsigned *cnt_) { \
__type *p, *path[KRMQ_MAX_DEPTH], fake; \
unsigned char dir[KRMQ_MAX_DEPTH]; \
int i, d = 0, cmp; \
unsigned cnt = 0; \
fake.__head.p[0] = *root_, fake.__head.p[1] = 0; \
if (cnt_) *cnt_ = 0; \
if (x) { \
for (cmp = -1, p = &fake; cmp; cmp = __cmp(x, p)) { \
int which = (cmp > 0); \
if (cmp > 0) cnt += krmq_size_child(__head, p, 0) + 1; \
dir[d] = which; \
path[d++] = p; \
p = p->__head.p[which]; \
if (p == 0) { \
if (cnt_) *cnt_ = 0; \
return 0; \
} \
} \
cnt += krmq_size_child(__head, p, 0) + 1; /* because p==x is not counted */ \
} else { \
for (p = &fake, cnt = 1; p; p = p->__head.p[0]) \
dir[d] = 0, path[d++] = p; \
p = path[--d]; \
} \
if (cnt_) *cnt_ = cnt; \
for (i = 1; i < d; ++i) --path[i]->__head.size; \
if (p->__head.p[1] == 0) { /* ((1,.)2,3)4 => (1,3)4; p=2 */ \
path[d-1]->__head.p[dir[d-1]] = p->__head.p[0]; \
} else { \
__type *q = p->__head.p[1]; \
if (q->__head.p[0] == 0) { /* ((1,2)3,4)5 => ((1)2,4)5; p=3,q=2 */ \
q->__head.p[0] = p->__head.p[0]; \
q->__head.balance = p->__head.balance; \
path[d-1]->__head.p[dir[d-1]] = q; \
path[d] = q, dir[d++] = 1; \
q->__head.size = p->__head.size - 1; \
} else { /* ((1,((.,2)3,4)5)6,7)8 => ((1,(2,4)5)3,7)8; p=6 */ \
__type *r; \
int e = d++; /* backup _d_ */\
for (;;) { \
dir[d] = 0; \
path[d++] = q; \
r = q->__head.p[0]; \
if (r->__head.p[0] == 0) break; \
q = r; \
} \
r->__head.p[0] = p->__head.p[0]; \
q->__head.p[0] = r->__head.p[1]; \
r->__head.p[1] = p->__head.p[1]; \
r->__head.balance = p->__head.balance; \
path[e-1]->__head.p[dir[e-1]] = r; \
path[e] = r, dir[e] = 1; \
for (i = e + 1; i < d; ++i) --path[i]->__head.size; \
r->__head.size = p->__head.size - 1; \
} \
} \
for (i = d - 1; i >= 0; --i) /* not sure why adding condition "path[i]->__head.s==p" doesn't work */ \
krmq_update_min_##suf(path[i], path[i]->__head.p[0], path[i]->__head.p[1]); \
while (--d > 0) { \
__type *q = path[d]; \
int which, other, b1 = 1, b2 = 2; \
which = dir[d], other = 1 - which; \
if (which) b1 = -b1, b2 = -b2; \
q->__head.balance += b1; \
if (q->__head.balance == b1) break; \
else if (q->__head.balance == b2) { \
__type *r = q->__head.p[other]; \
if (r->__head.balance == -b1) { \
path[d-1]->__head.p[dir[d-1]] = krmq_rotate2_##suf(q, which); \
} else { \
path[d-1]->__head.p[dir[d-1]] = krmq_rotate1_##suf(q, which); \
if (r->__head.balance == 0) { \
r->__head.balance = -b1; \
q->__head.balance = b1; \
break; \
} else r->__head.balance = q->__head.balance = 0; \
} \
} \
} \
*root_ = fake.__head.p[0]; \
return p; \
}
#define krmq_free(__type, __head, __root, __free) do { \
__type *_p, *_q; \
for (_p = __root; _p; _p = _q) { \
if (_p->__head.p[0] == 0) { \
_q = _p->__head.p[1]; \
__free(_p); \
} else { \
_q = _p->__head.p[0]; \
_p->__head.p[0] = _q->__head.p[1]; \
_q->__head.p[1] = _p; \
} \
} \
} while (0)
#define __KRMQ_ITR(suf, __scope, __type, __head, __cmp) \
struct krmq_itr_##suf { \
const __type *stack[KRMQ_MAX_DEPTH], **top; \
}; \
__scope void krmq_itr_first_##suf(const __type *root, struct krmq_itr_##suf *itr) { \
const __type *p; \
for (itr->top = itr->stack - 1, p = root; p; p = p->__head.p[0]) \
*++itr->top = p; \
} \
__scope int krmq_itr_find_##suf(const __type *root, const __type *x, struct krmq_itr_##suf *itr) { \
const __type *p = root; \
itr->top = itr->stack - 1; \
while (p != 0) { \
int cmp; \
*++itr->top = p; \
cmp = __cmp(x, p); \
if (cmp < 0) p = p->__head.p[0]; \
else if (cmp > 0) p = p->__head.p[1]; \
else break; \
} \
return p? 1 : 0; \
} \
__scope int krmq_itr_next_bidir_##suf(struct krmq_itr_##suf *itr, int dir) { \
const __type *p; \
if (itr->top < itr->stack) return 0; \
dir = !!dir; \
p = (*itr->top)->__head.p[dir]; \
if (p) { /* go down */ \
for (; p; p = p->__head.p[!dir]) \
*++itr->top = p; \
return 1; \
} else { /* go up */ \
do { \
p = *itr->top--; \
} while (itr->top >= itr->stack && p == (*itr->top)->__head.p[dir]); \
return itr->top < itr->stack? 0 : 1; \
} \
} \
/**
* Insert a node to the tree
*
* @param suf name suffix used in KRMQ_INIT()
* @param proot pointer to the root of the tree (in/out: root may change)
* @param x node to insert (in)
* @param cnt number of nodes smaller than or equal to _x_; can be NULL (out)
*
* @return _x_ if not present in the tree, or the node equal to x.
*/
#define krmq_insert(suf, proot, x, cnt) krmq_insert_##suf(proot, x, cnt)
/**
* Find a node in the tree
*
* @param suf name suffix used in KRMQ_INIT()
* @param root root of the tree
* @param x node value to find (in)
* @param cnt number of nodes smaller than or equal to _x_; can be NULL (out)
*
* @return node equal to _x_ if present, or NULL if absent
*/
#define krmq_find(suf, root, x, cnt) krmq_find_##suf(root, x, cnt)
#define krmq_interval(suf, root, x, lower, upper) krmq_interval_##suf(root, x, lower, upper)
#define krmq_rmq(suf, root, lo, up) krmq_rmq_##suf(root, lo, up)
/**
* Delete a node from the tree
*
* @param suf name suffix used in KRMQ_INIT()
* @param proot pointer to the root of the tree (in/out: root may change)
* @param x node value to delete; if NULL, delete the first node (in)
* @param cnt number of nodes smaller than or equal to _x_; can be NULL (out)
*
* @return node removed from the tree if present, or NULL if absent
*/
#define krmq_erase(suf, proot, x, cnt) krmq_erase_##suf(proot, x, cnt)
#define krmq_erase_first(suf, proot) krmq_erase_##suf(proot, 0, 0)
#define krmq_itr_t(suf) struct krmq_itr_##suf
/**
* Place the iterator at the smallest object
*
* @param suf name suffix used in KRMQ_INIT()
* @param root root of the tree
* @param itr iterator
*/
#define krmq_itr_first(suf, root, itr) krmq_itr_first_##suf(root, itr)
/**
* Place the iterator at the object equal to or greater than the query
*
* @param suf name suffix used in KRMQ_INIT()
* @param root root of the tree
* @param x query (in)
* @param itr iterator (out)
*
* @return 1 if found; 0 otherwise. krmq_at(itr) is NULL if and only if the query
* is larger than all objects in the tree
*/
#define krmq_itr_find(suf, root, x, itr) krmq_itr_find_##suf(root, x, itr)
/**
* Move to the next object in order
*
* @param itr iterator (modified)
*
* @return 1 if there is a next object; 0 otherwise
*/
#define krmq_itr_next(suf, itr) krmq_itr_next_bidir_##suf(itr, 1)
#define krmq_itr_prev(suf, itr) krmq_itr_next_bidir_##suf(itr, 0)
/**
* Return the pointer at the iterator
*
* @param itr iterator
*
* @return pointer if present; NULL otherwise
*/
#define krmq_at(itr) ((itr)->top < (itr)->stack? 0 : *(itr)->top)
#define KRMQ_INIT2(suf, __scope, __type, __head, __cmp, __lt2) \
__KRMQ_FIND(suf, __scope, __type, __head, __cmp) \
__KRMQ_RMQ(suf, __scope, __type, __head, __cmp, __lt2) \
__KRMQ_ROTATE(suf, __type, __head, __lt2) \
__KRMQ_INSERT(suf, __scope, __type, __head, __cmp, __lt2) \
__KRMQ_ERASE(suf, __scope, __type, __head, __cmp, __lt2) \
__KRMQ_ITR(suf, __scope, __type, __head, __cmp)
#define KRMQ_INIT(suf, __type, __head, __cmp, __lt2) \
KRMQ_INIT2(suf,, __type, __head, __cmp, __lt2)
#endif

54
ext/klib/krng.h 100644
View File

@ -0,0 +1,54 @@
#ifndef KRNG_H
#define KRNG_H
#include <stdint.h> /* for uint64_t */
typedef struct {
uint64_t s[2];
} krng_t;
static inline uint64_t kr_splitmix64(uint64_t x)
{
uint64_t z = (x += 0x9E3779B97F4A7C15ULL);
z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ULL;
z = (z ^ (z >> 27)) * 0x94D049BB133111EBULL;
return z ^ (z >> 31);
}
static inline uint64_t kr_rand_r(krng_t *r)
{
const uint64_t s0 = r->s[0];
uint64_t s1 = r->s[1];
const uint64_t result = s0 + s1;
s1 ^= s0;
r->s[0] = (s0 << 55 | s0 >> 9) ^ s1 ^ (s1 << 14);
r->s[1] = s0 << 36 | s0 >> 28;
return result;
}
static inline void kr_jump_r(krng_t *r)
{
static const uint64_t JUMP[] = { 0xbeac0467eba5facbULL, 0xd86b048b86aa9922ULL };
uint64_t s0 = 0, s1 = 0;
int i, b;
for (i = 0; i < 2; ++i)
for (b = 0; b < 64; b++) {
if (JUMP[i] & 1ULL << b)
s0 ^= r->s[0], s1 ^= r->s[1];
kr_rand_r(r);
}
r->s[0] = s0, r->s[1] = s1;
}
static inline void kr_srand_r(krng_t *r, uint64_t seed)
{
r->s[0] = kr_splitmix64(seed);
r->s[1] = kr_splitmix64(r->s[0]);
}
static inline double kr_drand_r(krng_t *r)
{
union { uint64_t i; double d; } u;
u.i = 0x3FFULL << 52 | kr_rand_r(r) >> 12;
return u.d - 1.0;
}
#endif

242
ext/klib/ksa.c 100644
View File

@ -0,0 +1,242 @@
/*
* Copyright (c) 2008 Yuta Mori All Rights Reserved.
* 2011 Attractive Chaos <attractor@live.co.uk>
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/* This is a library for constructing the suffix array for a string containing
* multiple sentinels with sentinels all represented by 0. The last symbol in
* the string must be a sentinel. The library is modified from an early version
* of Yuta Mori's SAIS library, but is slower than the latest SAIS by about
* 30%, partly due to the recent optimization Yuta has applied and partly due
* to the extra comparisons between sentinels. This is not the first effort in
* supporting multi-sentinel strings, but is probably the easiest to use. */
#include <stdlib.h>
#ifdef _KSA64
#include <stdint.h>
typedef int64_t saint_t;
#define SAINT_MAX INT64_MAX
#define SAIS_CORE ksa_core64
#define SAIS_BWT ksa_bwt64
#define SAIS_MAIN ksa_sa64
#else
#include <limits.h>
typedef int saint_t;
#define SAINT_MAX INT_MAX
#define SAIS_CORE ksa_core
#define SAIS_BWT ksa_bwt
#define SAIS_MAIN ksa_sa
#endif
/* T is of type "const unsigned char*". If T[i] is a sentinel, chr(i) takes a negative value */
#define chr(i) (cs == sizeof(saint_t) ? ((const saint_t *)T)[i] : (T[i]? (saint_t)T[i] : i - SAINT_MAX))
/** Count the occurrences of each symbol */
static void getCounts(const unsigned char *T, saint_t *C, saint_t n, saint_t k, int cs)
{
saint_t i;
for (i = 0; i < k; ++i) C[i] = 0;
for (i = 0; i < n; ++i) {
saint_t c = chr(i);
++C[c > 0? c : 0];
}
}
/**
* Find the end of each bucket
*
* @param C occurrences computed by getCounts(); input
* @param B start/end of each bucket; output
* @param k size of alphabet
* @param end compute the end of each bucket if true; otherwise compute the start
*/
static inline void getBuckets(const saint_t *C, saint_t *B, saint_t k, saint_t end)
{
saint_t i, sum = 0;
if (end) for (i = 0; i < k; ++i) sum += C[i], B[i] = sum;
else for (i = 0; i < k; ++i) sum += C[i], B[i] = sum - C[i];
}
/** Induced sort */
static void induceSA(const unsigned char *T, saint_t *SA, saint_t *C, saint_t *B, saint_t n, saint_t k, saint_t cs)
{
saint_t *b, i, j;
saint_t c0, c1;
/* left-to-right induced sort (for L-type) */
if (C == B) getCounts(T, C, n, k, cs);
getBuckets(C, B, k, 0); /* find starts of buckets */
for (i = 0, b = 0, c1 = -1; i < n; ++i) {
j = SA[i], SA[i] = ~j;
if (0 < j) { /* >0 if j-1 is L-type; <0 if S-type; ==0 undefined */
--j;
if ((c0 = chr(j)) != c1) {
B[c1 > 0? c1 : 0] = b - SA;
c1 = c0;
b = SA + B[c1 > 0? c1 : 0];
}
*b++ = (0 < j && chr(j - 1) < c1) ? ~j : j;
}
}
/* right-to-left induced sort (for S-type) */
if (C == B) getCounts(T, C, n, k, cs);
getBuckets(C, B, k, 1); /* find ends of buckets */
for (i = n - 1, b = 0, c1 = -1; 0 <= i; --i) {
if (0 < (j = SA[i])) { /* the prefix is S-type */
--j;
if ((c0 = chr(j)) != c1) {
B[c1 > 0? c1 : 0] = b - SA;
c1 = c0;
b = SA + B[c1 > 0? c1 : 0];
}
if (c0 > 0) *--b = (j == 0 || chr(j - 1) > c1) ? ~j : j;
} else SA[i] = ~j; /* if L-type, change the sign */
}
}
/**
* Recursively construct the suffix array for a string containing multiple
* sentinels. NULL is taken as the sentinel.
*
* @param T NULL terminated input string (there can be multiple NULLs)
* @param SA output suffix array
* @param fs working space available in SA (typically 0 when first called)
* @param n length of T, including the trailing NULL
* @param k size of the alphabet (typically 256 when first called)
* @param cs # bytes per element in T; 1 or sizeof(saint_t) (typically 1 when first called)
*
* @return 0 upon success
*/
int SAIS_CORE(const unsigned char *T, saint_t *SA, saint_t fs, saint_t n, saint_t k, int cs)
{
saint_t *C, *B;
saint_t i, j, c, m, q, qlen, name;
saint_t c0, c1;
/* STAGE I: reduce the problem by at least 1/2; sort all the S-substrings */
if (k <= fs) C = SA + n, B = (k <= fs - k) ? C + k : C;
else {
if ((C = (saint_t*)malloc(k * (1 + (cs == 1)) * sizeof(saint_t))) == NULL) return -2;
B = cs == 1? C + k : C;
}
getCounts(T, C, n, k, cs);
getBuckets(C, B, k, 1); /* find ends of buckets */
for (i = 0; i < n; ++i) SA[i] = 0;
/* mark L and S (the t array in Nong et al.), and keep the positions of LMS in the buckets */
for (i = n - 2, c = 1, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) {
if ((c0 = chr(i)) < c1 + c) c = 1; /* c1 = chr(i+1); c==1 if in an S run */
else if (c) SA[--B[c1 > 0? c1 : 0]] = i + 1, c = 0;
}
induceSA(T, SA, C, B, n, k, cs);
if (fs < k) free(C);
/* pack all the sorted LMS into the first m items of SA
2*m must not be larger than n (see Nong et al. for the proof) */
for (i = 0, m = 0; i < n; ++i) {
saint_t p = SA[i];
if (p == n - 1) SA[m++] = p;
else if (0 < p && chr(p - 1) > (c0 = chr(p))) {
for (j = p + 1; j < n && c0 == (c1 = chr(j)); ++j);
if (j < n && c0 < c1) SA[m++] = p;
}
}
for (i = m; i < n; ++i) SA[i] = 0; /* init the name array buffer */
/* store the length of all substrings */
for (i = n - 2, j = n, c = 1, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) {
if ((c0 = chr(i)) < c1 + c) c = 1; /* c1 = chr(i+1) */
else if (c) SA[m + ((i + 1) >> 1)] = j - i - 1, j = i + 1, c = 0;
}
/* find the lexicographic names of all substrings */
for (i = 0, name = 0, q = n, qlen = 0; i < m; ++i) {
saint_t p = SA[i], plen = SA[m + (p >> 1)], diff = 1;
if (plen == qlen) {
for (j = 0; j < plen && chr(p + j) == chr(q + j); j++);
if (j == plen) diff = 0;
}
if (diff) ++name, q = p, qlen = plen;
SA[m + (p >> 1)] = name;
}
/* STAGE II: solve the reduced problem; recurse if names are not yet unique */
if (name < m) {
saint_t *RA = SA + n + fs - m - 1;
for (i = n - 1, j = m - 1; m <= i; --i)
if (SA[i] != 0) RA[j--] = SA[i];
RA[m] = 0; // add a sentinel; in the resulting SA, SA[0]==m always stands
if (SAIS_CORE((unsigned char *)RA, SA, fs + n - m * 2 - 2, m + 1, name + 1, sizeof(saint_t)) != 0) return -2;
for (i = n - 2, j = m - 1, c = 1, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) {
if ((c0 = chr(i)) < c1 + c) c = 1;
else if (c) RA[j--] = i + 1, c = 0; /* get p1 */
}
for (i = 0; i < m; ++i) SA[i] = RA[SA[i+1]]; /* get index */
}
/* STAGE III: induce the result for the original problem */
if (k <= fs) C = SA + n, B = (k <= fs - k) ? C + k : C;
else {
if ((C = (saint_t*)malloc(k * (1 + (cs == 1)) * sizeof(saint_t))) == NULL) return -2;
B = cs == 1? C + k : C;
}
/* put all LMS characters into their buckets */
getCounts(T, C, n, k, cs);
getBuckets(C, B, k, 1); /* find ends of buckets */
for (i = m; i < n; ++i) SA[i] = 0; /* init SA[m..n-1] */
for (i = m - 1; 0 <= i; --i) {
j = SA[i], SA[i] = 0;
c = chr(j);
SA[--B[c > 0? c : 0]] = j;
}
induceSA(T, SA, C, B, n, k, cs);
if (fs < k) free(C);
return 0;
}
/**
* Construct the suffix array for a NULL terminated string possibly containing
* multiple sentinels (NULLs).
*
* @param T[0..n-1] NULL terminated input string
* @param SA[0..n-1] output suffix array
* @param n length of the given string, including NULL
* @param k size of the alphabet including the sentinel; no more than 256
* @return 0 upon success
*/
int SAIS_MAIN(const unsigned char *T, saint_t *SA, saint_t n, int k)
{
if (T == NULL || SA == NULL || n <= 0 || T[n - 1] != '\0') return -1;
if (k < 0 || k > 256) k = 256;
return SAIS_CORE(T, SA, 0, n, (saint_t)k, 1);
}
int SAIS_BWT(unsigned char *T, saint_t n, int k)
{
saint_t *SA, i;
int ret;
if ((SA = (saint_t*)malloc(n * sizeof(saint_t))) == 0) return -1;
if ((ret = SAIS_MAIN(T, SA, n, k)) != 0) return ret;
for (i = 0; i < n; ++i)
if (SA[i]) SA[i] = T[SA[i] - 1]; // if SA[i]==0, SA[i]=0
for (i = 0; i < n; ++i) T[i] = SA[i];
free(SA);
return 0;
}

ext/klib/kseq.h 100644

@@ -0,0 +1,242 @@
/* The MIT License
Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor@live.co.uk>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
/* Last Modified: 05MAR2012 */
#ifndef AC_KSEQ_H
#define AC_KSEQ_H
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
#define KS_SEP_TAB 1 // isspace() && !' '
#define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows)
#define KS_SEP_MAX 2
#define __KS_TYPE(type_t) \
typedef struct __kstream_t { \
unsigned char *buf; \
int begin, end, is_eof; \
type_t f; \
} kstream_t;
#define ks_err(ks) ((ks)->end == -1)
#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
#define __KS_BASIC(type_t, __bufsize) \
static inline kstream_t *ks_init(type_t f) \
{ \
kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
ks->f = f; \
ks->buf = (unsigned char*)malloc(__bufsize); \
return ks; \
} \
static inline void ks_destroy(kstream_t *ks) \
{ \
if (ks) { \
free(ks->buf); \
free(ks); \
} \
}
#define __KS_GETC(__read, __bufsize) \
static inline int ks_getc(kstream_t *ks) \
{ \
if (ks_err(ks)) return -3; \
if (ks->is_eof && ks->begin >= ks->end) return -1; \
if (ks->begin >= ks->end) { \
ks->begin = 0; \
ks->end = __read(ks->f, ks->buf, __bufsize); \
if (ks->end == 0) { ks->is_eof = 1; return -1;} \
if (ks->end == -1) { ks->is_eof = 1; return -3;}\
} \
return (int)ks->buf[ks->begin++]; \
}
#ifndef KSTRING_T
#define KSTRING_T kstring_t
typedef struct __kstring_t {
size_t l, m;
char *s;
} kstring_t;
#endif
#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif
#define __KS_GETUNTIL(__read, __bufsize) \
static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
{ \
int gotany = 0; \
if (dret) *dret = 0; \
str->l = append? str->l : 0; \
for (;;) { \
int i; \
if (ks_err(ks)) return -3; \
if (ks->begin >= ks->end) { \
if (!ks->is_eof) { \
ks->begin = 0; \
ks->end = __read(ks->f, ks->buf, __bufsize); \
if (ks->end == 0) { ks->is_eof = 1; break; } \
if (ks->end == -1) { ks->is_eof = 1; return -3; } \
} else break; \
} \
if (delimiter == KS_SEP_LINE) { \
unsigned char *sep = memchr(ks->buf + ks->begin, '\n', ks->end - ks->begin); \
i = sep != NULL ? sep - ks->buf : ks->end; \
} else if (delimiter > KS_SEP_MAX) { \
unsigned char *sep = memchr(ks->buf + ks->begin, delimiter, ks->end - ks->begin); \
i = sep != NULL ? sep - ks->buf : ks->end; \
} else if (delimiter == KS_SEP_SPACE) { \
for (i = ks->begin; i < ks->end; ++i) \
if (isspace(ks->buf[i])) break; \
} else if (delimiter == KS_SEP_TAB) { \
for (i = ks->begin; i < ks->end; ++i) \
if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
} else i = 0; /* should never get here */ \
if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \
str->m = str->l + (i - ks->begin) + 1; \
kroundup32(str->m); \
str->s = (char*)realloc(str->s, str->m); \
} \
gotany = 1; \
memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
str->l = str->l + (i - ks->begin); \
ks->begin = i + 1; \
if (i < ks->end) { \
if (dret) *dret = ks->buf[i]; \
break; \
} \
} \
if (!gotany && ks_eof(ks)) return -1; \
if (str->s == 0) { \
str->m = 1; \
str->s = (char*)calloc(1, 1); \
} else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
str->s[str->l] = '\0'; \
return str->l; \
} \
static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
{ return ks_getuntil2(ks, delimiter, str, dret, 0); }
#define KSTREAM_INIT(type_t, __read, __bufsize) \
__KS_TYPE(type_t) \
__KS_BASIC(type_t, __bufsize) \
__KS_GETC(__read, __bufsize) \
__KS_GETUNTIL(__read, __bufsize)
#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)
#define __KSEQ_BASIC(SCOPE, type_t) \
SCOPE kseq_t *kseq_init(type_t fd) \
{ \
kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
s->f = ks_init(fd); \
return s; \
} \
SCOPE void kseq_destroy(kseq_t *ks) \
{ \
if (!ks) return; \
free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
ks_destroy(ks->f); \
free(ks); \
}
/* Return value:
>=0 length of the sequence (normal)
-1 end-of-file
-2 truncated quality string
-3 error reading stream
*/
#define __KSEQ_READ(SCOPE) \
SCOPE int kseq_read(kseq_t *seq) \
{ \
int c,r; \
kstream_t *ks = seq->f; \
if (seq->last_char == 0) { /* then jump to the next header line */ \
while ((c = ks_getc(ks)) >= 0 && c != '>' && c != '@'); \
if (c < 0) return c; /* end of file or error*/ \
seq->last_char = c; \
} /* else: the first header char has been read in the previous call */ \
seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
if ((r=ks_getuntil(ks, 0, &seq->name, &c)) < 0) return r; /* normal exit: EOF or error */ \
if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
seq->seq.m = 256; \
seq->seq.s = (char*)malloc(seq->seq.m); \
} \
while ((c = ks_getc(ks)) >= 0 && c != '>' && c != '+' && c != '@') { \
if (c == '\n') continue; /* skip empty lines */ \
seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
} \
if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
seq->seq.m = seq->seq.l + 2; \
kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
} \
seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
if (c != '+') return seq->seq.l; /* FASTA */ \
if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \
seq->qual.m = seq->seq.m; \
seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
} \
while ((c = ks_getc(ks)) >= 0 && c != '\n'); /* skip the rest of '+' line */ \
if (c == -1) return -2; /* error: no quality string */ \
while ((c = ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1)) >= 0 && seq->qual.l < seq->seq.l); \
if (c == -3) return -3; /* stream error */ \
seq->last_char = 0; /* we have not come to the next header line */ \
if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
return seq->seq.l; \
}
#define __KSEQ_TYPE(type_t) \
typedef struct { \
kstring_t name, comment, seq, qual; \
int last_char; \
kstream_t *f; \
} kseq_t;
#define KSEQ_INIT2(SCOPE, type_t, __read) \
KSTREAM_INIT(type_t, __read, 16384) \
__KSEQ_TYPE(type_t) \
__KSEQ_BASIC(SCOPE, type_t) \
__KSEQ_READ(SCOPE)
#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
#define KSEQ_DECLARE(type_t) \
__KS_TYPE(type_t) \
__KSEQ_TYPE(type_t) \
extern kseq_t *kseq_init(type_t fd); \
void kseq_destroy(kseq_t *ks); \
int kseq_read(kseq_t *seq);
#endif

ext/klib/kson.c 100644

@@ -0,0 +1,253 @@
#include <string.h>
#include <stdlib.h>
#include <stdarg.h>
#include <assert.h>
#include <ctype.h>
#include <stdio.h>
#include "kson.h"
/*************
*** Parse ***
*************/
kson_node_t *kson_parse_core(const char *json, long *_n, int *error, long *parsed_len)
{
long *stack = 0, top = 0, max = 0, n_a = 0, m_a = 0, i, j;
kson_node_t *a = 0, *u;
const char *p, *q;
size_t *tmp;
#define __push_back(y) do { \
if (top == max) { \
max = max? max<<1 : 4; \
stack = (long*)realloc(stack, sizeof(long) * max); \
} \
stack[top++] = (y); \
} while (0)
#define __new_node(z) do { \
if (n_a == m_a) { \
long old_m = m_a; \
m_a = m_a? m_a<<1 : 4; \
a = (kson_node_t*)realloc(a, sizeof(kson_node_t) * m_a); \
memset(a + old_m, 0, sizeof(kson_node_t) * (m_a - old_m)); \
} \
*(z) = &a[n_a++]; \
} while (0)
assert(sizeof(size_t) == sizeof(kson_node_t*));
*error = KSON_OK;
for (p = json; *p; ++p) {
while (*p && isspace(*p)) ++p;
if (*p == 0) break;
if (*p == ',') { // comma is somewhat redundant
} else if (*p == '[' || *p == '{') {
int t = *p == '['? -1 : -2;
if (top < 2 || stack[top-1] != -3) { // unnamed internal node
__push_back(n_a);
__new_node(&u);
__push_back(t);
} else stack[top-1] = t; // named internal node
} else if (*p == ']' || *p == '}') {
long i, start, t = *p == ']'? -1 : -2;
for (i = top - 1; i >= 0 && stack[i] != t; --i);
if (i < 0) { // error: an extra right bracket
*error = KSON_ERR_EXTRA_RIGHT;
break;
}
start = i;
u = &a[stack[start-1]];
u->key = u->v.str;
u->n = top - 1 - start;
u->v.child = (kson_node_t**)malloc(u->n * sizeof(kson_node_t*));
tmp = (size_t*)u->v.child;
for (i = start + 1; i < top; ++i)
tmp[i - start - 1] = stack[i];
u->type = *p == ']'? KSON_TYPE_BRACKET : KSON_TYPE_BRACE;
if ((top = start) == 1) break; // completed one object; remaining characters discarded
} else if (*p == ':') {
if (top == 0 || stack[top-1] == -3) {
*error = KSON_ERR_NO_KEY;
break;
}
__push_back(-3);
} else {
int c = *p;
// get the node to modify
if (top >= 2 && stack[top-1] == -3) { // we have a key:value pair here
--top;
u = &a[stack[top-1]];
u->key = u->v.str; // move old value to key
} else { // don't know if this is a bare value or a key:value pair; keep it as a value for now
__push_back(n_a);
__new_node(&u);
}
// parse string
if (c == '\'' || c == '"') {
for (q = ++p; *q && *q != c; ++q)
if (*q == '\\') ++q;
} else {
for (q = p; *q && *q != ']' && *q != '}' && *q != ',' && *q != ':' && *q != '\n'; ++q)
if (*q == '\\') ++q;
}
u->v.str = (char*)malloc(q - p + 1); strncpy(u->v.str, p, q - p); u->v.str[q-p] = 0; // equivalent to u->v.str=strndup(p, q-p)
u->type = c == '\''? KSON_TYPE_SGL_QUOTE : c == '"'? KSON_TYPE_DBL_QUOTE : KSON_TYPE_NO_QUOTE;
p = c == '\'' || c == '"'? q : q - 1;
}
}
while (*p && isspace(*p)) ++p; // skip trailing blanks
if (parsed_len) *parsed_len = p - json;
if (top != 1) *error = KSON_ERR_EXTRA_LEFT;
for (i = 0; i < n_a; ++i)
for (j = 0, u = &a[i], tmp = (size_t*)u->v.child; j < (long)u->n; ++j)
u->v.child[j] = &a[tmp[j]];
free(stack);
*_n = n_a;
return a;
}
void kson_destroy(kson_t *kson)
{
long i;
if (kson == 0) return;
for (i = 0; i < kson->n_nodes; ++i) {
free(kson->root[i].key); free(kson->root[i].v.str);
}
free(kson->root); free(kson);
}
kson_t *kson_parse(const char *json)
{
kson_t *kson;
int error;
kson = (kson_t*)calloc(1, sizeof(kson_t));
kson->root = kson_parse_core(json, &kson->n_nodes, &error, 0);
if (error) {
kson_destroy(kson);
return 0;
}
return kson;
}
/*************
*** Query ***
*************/
const kson_node_t *kson_by_path(const kson_node_t *p, int depth, ...)
{
va_list ap;
va_start(ap, depth);
while (p && depth > 0) {
if (p->type == KSON_TYPE_BRACE) {
p = kson_by_key(p, va_arg(ap, const char*));
} else if (p->type == KSON_TYPE_BRACKET) {
p = kson_by_index(p, va_arg(ap, long));
} else break;
--depth;
}
va_end(ap);
return p;
}
/**************
*** Format ***
**************/
void kson_format_recur(const kson_node_t *p, int depth)
{
long i;
if (p->key) printf("\"%s\":", p->key);
if (p->type == KSON_TYPE_BRACKET || p->type == KSON_TYPE_BRACE) {
putchar(p->type == KSON_TYPE_BRACKET? '[' : '{');
if (p->n) {
putchar('\n'); for (i = 0; i <= depth; ++i) fputs(" ", stdout);
for (i = 0; i < (long)p->n; ++i) {
if (i) {
int i;
putchar(',');
putchar('\n'); for (i = 0; i <= depth; ++i) fputs(" ", stdout);
}
kson_format_recur(p->v.child[i], depth + 1);
}
putchar('\n'); for (i = 0; i < depth; ++i) fputs(" ", stdout);
}
putchar(p->type == KSON_TYPE_BRACKET? ']' : '}');
} else {
if (p->type != KSON_TYPE_NO_QUOTE)
putchar(p->type == KSON_TYPE_SGL_QUOTE? '\'' : '"');
fputs(p->v.str, stdout);
if (p->type != KSON_TYPE_NO_QUOTE)
putchar(p->type == KSON_TYPE_SGL_QUOTE? '\'' : '"');
}
}
void kson_format(const kson_node_t *root)
{
kson_format_recur(root, 0);
putchar('\n');
}
/*********************
*** Main function ***
*********************/
#ifdef KSON_MAIN
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
int main(int argc, char *argv[])
{
kson_t *kson = 0;
if (argc > 1) {
FILE *fp;
int len = 0, max = 0, tmp, i;
char *json = 0, buf[0x10000];
if ((fp = fopen(argv[1], "rb")) != 0) {
// read the entire file into a string
while ((tmp = fread(buf, 1, 0x10000, fp)) != 0) {
if (len + tmp + 1 > max) {
max = len + tmp + 1;
kroundup32(max);
json = (char*)realloc(json, max);
}
memcpy(json + len, buf, tmp);
len += tmp;
}
fclose(fp);
// parse
kson = kson_parse(json);
free(json);
if (kson) {
kson_format(kson->root);
if (argc > 2) {
// path finding
const kson_node_t *p = kson->root;
for (i = 2; i < argc && p; ++i) {
if (p->type == KSON_TYPE_BRACKET)
p = kson_by_index(p, atoi(argv[i]));
else if (p->type == KSON_TYPE_BRACE)
p = kson_by_key(p, argv[i]);
else p = 0;
}
if (p) {
if (kson_is_internal(p)) printf("Reached an internal node\n");
else printf("Value: %s\n", p->v.str);
} else printf("Failed to find the slot\n");
}
} else printf("Failed to parse\n");
}
} else {
kson = kson_parse("{'a' : 1,'b':[0,'isn\\'t',true],'d':[{\n\n\n}]}");
if (kson) {
const kson_node_t *p = kson_by_path(kson->root, 2, "b", 1);
if (p) printf("*** %s\n", p->v.str);
else printf("!!! not found\n");
kson_format(kson->root);
} else {
printf("Failed to parse\n");
}
}
kson_destroy(kson);
return 0;
}
#endif

ext/klib/kson.h 100644

@@ -0,0 +1,64 @@
#ifndef KSON_H
#define KSON_H
#include <string.h>
#define KSON_TYPE_NO_QUOTE 1
#define KSON_TYPE_SGL_QUOTE 2
#define KSON_TYPE_DBL_QUOTE 3
#define KSON_TYPE_BRACKET 4
#define KSON_TYPE_BRACE 5
#define KSON_OK 0
#define KSON_ERR_EXTRA_LEFT 1
#define KSON_ERR_EXTRA_RIGHT 2
#define KSON_ERR_NO_KEY 3
typedef struct kson_node_s {
unsigned long long type:3, n:61;
char *key;
union {
struct kson_node_s **child;
char *str;
} v;
} kson_node_t;
typedef struct {
long n_nodes;
kson_node_t *root;
} kson_t;
#ifdef __cplusplus
extern "C" {
#endif
kson_t *kson_parse(const char *json);
void kson_destroy(kson_t *kson);
const kson_node_t *kson_by_path(const kson_node_t *root, int path_len, ...);
void kson_format(const kson_node_t *root);
#ifdef __cplusplus
}
#endif
#define kson_is_internal(p) ((p)->type == KSON_TYPE_BRACKET || (p)->type == KSON_TYPE_BRACE)
static inline const kson_node_t *kson_by_key(const kson_node_t *p, const char *key)
{
long i;
if (!kson_is_internal(p)) return 0;
for (i = 0; i < (long)p->n; ++i) {
const kson_node_t *q = p->v.child[i];
if (q->key && strcmp(q->key, key) == 0)
return q;
}
return 0;
}
static inline const kson_node_t *kson_by_index(const kson_node_t *p, long i)
{
if (!kson_is_internal(p)) return 0;
return 0 <= i && i < (long)p->n? p->v.child[i] : 0;
}
#endif

ext/klib/ksort.h 100644

@@ -0,0 +1,353 @@
/* The MIT License
Copyright (c) 2008, 2011 Attractive Chaos <attractor@live.co.uk>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
/*
2011-04-10 (0.1.6):
* Added sample
2011-03 (0.1.5):
* Added shuffle/permutation
2008-11-16 (0.1.4):
* Fixed a bug in introsort() that happens in rare cases.
2008-11-05 (0.1.3):
* Fixed a bug in introsort() for complex comparisons.
* Fixed a bug in mergesort(). The previous version is not stable.
2008-09-15 (0.1.2):
* Accelerated introsort. On my Mac (not on another Linux machine),
my implementation is as fast as std::sort on random input.
* Added combsort and in introsort, switch to combsort if the
recursion is too deep.
2008-09-13 (0.1.1):
* Added k-small algorithm
2008-09-05 (0.1.0):
* Initial version
*/
#ifndef AC_KSORT_H
#define AC_KSORT_H
#include <stdlib.h>
#include <string.h>
#include <assert.h> /* used by KRADIX_SORT_INIT */
typedef struct {
void *left, *right;
int depth;
} ks_isort_stack_t;
#define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; }
#define KSORT_INIT(name, type_t, __sort_lt) \
void ks_mergesort_##name(size_t n, type_t array[], type_t temp[]) \
{ \
type_t *a2[2], *a, *b; \
int curr, shift; \
\
a2[0] = array; \
a2[1] = temp? temp : (type_t*)malloc(sizeof(type_t) * n); \
for (curr = 0, shift = 0; (1ul<<shift) < n; ++shift) { \
a = a2[curr]; b = a2[1-curr]; \
if (shift == 0) { \
type_t *p = b, *i, *eb = a + n; \
for (i = a; i < eb; i += 2) { \
if (i == eb - 1) *p++ = *i; \
else { \
if (__sort_lt(*(i+1), *i)) { \
*p++ = *(i+1); *p++ = *i; \
} else { \
*p++ = *i; *p++ = *(i+1); \
} \
} \
} \
} else { \
size_t i, step = 1ul<<shift; \
for (i = 0; i < n; i += step<<1) { \
type_t *p, *j, *k, *ea, *eb; \
if (n < i + step) { \
ea = a + n; eb = a; \
} else { \
ea = a + i + step; \
eb = a + (n < i + (step<<1)? n : i + (step<<1)); \
} \
j = a + i; k = a + i + step; p = b + i; \
while (j < ea && k < eb) { \
if (__sort_lt(*k, *j)) *p++ = *k++; \
else *p++ = *j++; \
} \
while (j < ea) *p++ = *j++; \
while (k < eb) *p++ = *k++; \
} \
} \
curr = 1 - curr; \
} \
if (curr == 1) { \
type_t *p = a2[0], *i = a2[1], *eb = array + n; \
for (; p < eb; ++i) *p++ = *i; \
} \
if (temp == 0) free(a2[1]); \
} \
void ks_heapadjust_##name(size_t i, size_t n, type_t l[]) \
{ \
size_t k = i; \
type_t tmp = l[i]; \
while ((k = (k << 1) + 1) < n) { \
if (k != n - 1 && __sort_lt(l[k], l[k+1])) ++k; \
if (__sort_lt(l[k], tmp)) break; \
l[i] = l[k]; i = k; \
} \
l[i] = tmp; \
} \
void ks_heapmake_##name(size_t lsize, type_t l[]) \
{ \
size_t i; \
for (i = (lsize >> 1) - 1; i != (size_t)(-1); --i) \
ks_heapadjust_##name(i, lsize, l); \
} \
void ks_heapsort_##name(size_t lsize, type_t l[]) \
{ \
size_t i; \
for (i = lsize - 1; i > 0; --i) { \
type_t tmp; \
tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); \
} \
} \
static inline void __ks_insertsort_##name(type_t *s, type_t *t) \
{ \
type_t *i, *j, swap_tmp; \
for (i = s + 1; i < t; ++i) \
for (j = i; j > s && __sort_lt(*j, *(j-1)); --j) { \
swap_tmp = *j; *j = *(j-1); *(j-1) = swap_tmp; \
} \
} \
void ks_combsort_##name(size_t n, type_t a[]) \
{ \
const double shrink_factor = 1.2473309501039786540366528676643; \
int do_swap; \
size_t gap = n; \
type_t tmp, *i, *j; \
do { \
if (gap > 2) { \
gap = (size_t)(gap / shrink_factor); \
if (gap == 9 || gap == 10) gap = 11; \
} \
do_swap = 0; \
for (i = a; i < a + n - gap; ++i) { \
j = i + gap; \
if (__sort_lt(*j, *i)) { \
tmp = *i; *i = *j; *j = tmp; \
do_swap = 1; \
} \
} \
} while (do_swap || gap > 2); \
if (gap != 1) __ks_insertsort_##name(a, a + n); \
} \
void ks_introsort_##name(size_t n, type_t a[]) \
{ \
int d; \
ks_isort_stack_t *top, *stack; \
type_t rp, swap_tmp; \
type_t *s, *t, *i, *j, *k; \
\
if (n < 1) return; \
else if (n == 2) { \
if (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = swap_tmp; } \
return; \
} \
for (d = 2; 1ul<<d < n; ++d); \
stack = (ks_isort_stack_t*)malloc(sizeof(ks_isort_stack_t) * ((sizeof(size_t)*d)+2)); \
top = stack; s = a; t = a + (n-1); d <<= 1; \
while (1) { \
if (s < t) { \
if (--d == 0) { \
ks_combsort_##name(t - s + 1, s); \
t = s; \
continue; \
} \
i = s; j = t; k = i + ((j-i)>>1) + 1; \
if (__sort_lt(*k, *i)) { \
if (__sort_lt(*k, *j)) k = j; \
} else k = __sort_lt(*j, *i)? i : j; \
rp = *k; \
if (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; } \
for (;;) { \
do ++i; while (__sort_lt(*i, rp)); \
do --j; while (i <= j && __sort_lt(rp, *j)); \
if (j <= i) break; \
swap_tmp = *i; *i = *j; *j = swap_tmp; \
} \
swap_tmp = *i; *i = *t; *t = swap_tmp; \
if (i-s > t-i) { \
if (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \
s = t-i > 16? i+1 : t; \
} else { \
if (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \
t = i-s > 16? i-1 : s; \
} \
} else { \
if (top == stack) { \
free(stack); \
__ks_insertsort_##name(a, a+n); \
return; \
} else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \
} \
} \
} \
/* This function is adapted from: http://ndevilla.free.fr/median/ */ \
/* 0 <= kk < n */ \
type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk) \
{ \
type_t *low, *high, *k, *ll, *hh, *mid; \
low = arr; high = arr + n - 1; k = arr + kk; \
for (;;) { \
if (high <= low) return *k; \
if (high == low + 1) { \
if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
return *k; \
} \
mid = low + (high - low) / 2; \
if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \
if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low); \
KSORT_SWAP(type_t, *mid, *(low+1)); \
ll = low + 1; hh = high; \
for (;;) { \
do ++ll; while (__sort_lt(*ll, *low)); \
do --hh; while (__sort_lt(*low, *hh)); \
if (hh < ll) break; \
KSORT_SWAP(type_t, *ll, *hh); \
} \
KSORT_SWAP(type_t, *low, *hh); \
if (hh <= k) low = ll; \
if (hh >= k) high = hh - 1; \
} \
} \
void ks_shuffle_##name(size_t n, type_t a[]) \
{ \
int i, j; \
for (i = n; i > 1; --i) { \
type_t tmp; \
j = (int)(drand48() * i); \
tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp; \
} \
} \
void ks_sample_##name(size_t n, size_t r, type_t a[]) /* FIXME: NOT TESTED!!! */ \
{ /* reference: http://code.activestate.com/recipes/272884/ */ \
int i, k, pop = n; \
for (i = (int)r, k = 0; i >= 0; --i) { \
double z = 1., x = drand48(); \
type_t tmp; \
while (x < z) z -= z * i / (pop--); \
if (k != n - pop - 1) tmp = a[k], a[k] = a[n-pop-1], a[n-pop-1] = tmp; \
++k; \
} \
}
#define ks_mergesort(name, n, a, t) ks_mergesort_##name(n, a, t)
#define ks_introsort(name, n, a) ks_introsort_##name(n, a)
#define ks_combsort(name, n, a) ks_combsort_##name(n, a)
#define ks_heapsort(name, n, a) ks_heapsort_##name(n, a)
#define ks_heapmake(name, n, a) ks_heapmake_##name(n, a)
#define ks_heapadjust(name, i, n, a) ks_heapadjust_##name(i, n, a)
#define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k)
#define ks_shuffle(name, n, a) ks_shuffle_##name(n, a)
#define ks_lt_generic(a, b) ((a) < (b))
#define ks_lt_str(a, b) (strcmp((a), (b)) < 0)
typedef const char *ksstr_t;
#define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic)
#define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str)
#define RS_MIN_SIZE 64
#define RS_MAX_BITS 8
#define KRADIX_SORT_INIT(name, rstype_t, rskey, sizeof_key) \
typedef struct { \
rstype_t *b, *e; \
} rsbucket_##name##_t; \
void rs_insertsort_##name(rstype_t *beg, rstype_t *end) \
{ \
rstype_t *i; \
for (i = beg + 1; i < end; ++i) \
if (rskey(*i) < rskey(*(i - 1))) { \
rstype_t *j, tmp = *i; \
for (j = i; j > beg && rskey(tmp) < rskey(*(j-1)); --j) \
*j = *(j - 1); \
*j = tmp; \
} \
} \
void rs_sort_##name(rstype_t *beg, rstype_t *end, int n_bits, int s) \
{ \
rstype_t *i; \
int size = 1<<n_bits, m = size - 1; \
rsbucket_##name##_t *k, b[1<<RS_MAX_BITS], *be = b + size; \
assert(n_bits <= RS_MAX_BITS); \
for (k = b; k != be; ++k) k->b = k->e = beg; \
for (i = beg; i != end; ++i) ++b[rskey(*i)>>s&m].e; \
for (k = b + 1; k != be; ++k) \
k->e += (k-1)->e - beg, k->b = (k-1)->e; \
for (k = b; k != be;) { \
if (k->b != k->e) { \
rsbucket_##name##_t *l; \
if ((l = b + (rskey(*k->b)>>s&m)) != k) { \
rstype_t tmp = *k->b, swap; \
do { \
swap = tmp; tmp = *l->b; *l->b++ = swap; \
l = b + (rskey(tmp)>>s&m); \
} while (l != k); \
*k->b++ = tmp; \
} else ++k->b; \
} else ++k; \
} \
for (b->b = beg, k = b + 1; k != be; ++k) k->b = (k-1)->e; \
if (s) { \
s = s > n_bits? s - n_bits : 0; \
for (k = b; k != be; ++k) \
if (k->e - k->b > RS_MIN_SIZE) rs_sort_##name(k->b, k->e, n_bits, s); \
else if (k->e - k->b > 1) rs_insertsort_##name(k->b, k->e); \
} \
} \
void radix_sort_##name(rstype_t *beg, rstype_t *end) \
{ \
if (end - beg <= RS_MIN_SIZE) rs_insertsort_##name(beg, end); \
else rs_sort_##name(beg, end, RS_MAX_BITS, (sizeof_key - 1) * RS_MAX_BITS); \
}
#endif

ext/klib/kstring.c 100644

@@ -0,0 +1,250 @@
#include <stdarg.h>
#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <stdint.h>
#include "kstring.h"
int kvsprintf(kstring_t *s, const char *fmt, va_list ap)
{
va_list args;
int l;
va_copy(args, ap);
l = vsnprintf(s->s + s->l, s->m - s->l, fmt, args); // This line does not work with glibc 2.0. See `man snprintf'.
va_end(args);
if (l + 1 > s->m - s->l) {
s->m = s->l + l + 2;
kroundup32(s->m);
s->s = (char*)realloc(s->s, s->m);
va_copy(args, ap);
l = vsnprintf(s->s + s->l, s->m - s->l, fmt, args);
va_end(args);
}
s->l += l;
return l;
}
int ksprintf(kstring_t *s, const char *fmt, ...)
{
va_list ap;
int l;
va_start(ap, fmt);
l = kvsprintf(s, fmt, ap);
va_end(ap);
return l;
}
char *kstrtok(const char *str, const char *sep_in, ks_tokaux_t *aux)
{
const unsigned char *p, *start, *sep = (unsigned char *) sep_in;
if (sep) { // set up the table
if (str == 0 && aux->finished) return 0; // no need to set up if we have finished
aux->finished = 0;
if (sep[0] && sep[1]) {
aux->sep = -1;
aux->tab[0] = aux->tab[1] = aux->tab[2] = aux->tab[3] = 0;
for (p = sep; *p; ++p) aux->tab[*p>>6] |= 1ull<<(*p&0x3f);
} else aux->sep = sep[0];
}
if (aux->finished) return 0;
else if (str) start = (unsigned char *) str, aux->finished = 0;
else start = (unsigned char *) aux->p + 1;
if (aux->sep < 0) {
for (p = start; *p; ++p)
if (aux->tab[*p>>6]>>(*p&0x3f)&1) break;
} else {
for (p = start; *p; ++p)
if (*p == aux->sep) break;
}
aux->p = (const char *) p; // end of token
if (*p == 0) aux->finished = 1; // no more tokens
return (char*)start;
}
// s MUST BE a null terminated string; l = strlen(s)
int ksplit_core(char *s, int delimiter, int *_max, int **_offsets)
{
int i, n, max, last_char, last_start, *offsets, l;
n = 0; max = *_max; offsets = *_offsets;
l = strlen(s);
#define __ksplit_aux do { \
if (_offsets) { \
s[i] = 0; \
if (n == max) { \
int *tmp; \
max = max? max<<1 : 2; \
if ((tmp = (int*)realloc(offsets, sizeof(int) * max))) { \
offsets = tmp; \
} else { \
free(offsets); \
*_offsets = NULL; \
return 0; \
} \
} \
offsets[n++] = last_start; \
} else ++n; \
} while (0)
for (i = 0, last_char = last_start = 0; i <= l; ++i) {
if (delimiter == 0) {
if (isspace(s[i]) || s[i] == 0) {
if (isgraph(last_char)) __ksplit_aux; // the end of a field
} else {
if (isspace(last_char) || last_char == 0) last_start = i;
}
} else {
if (s[i] == delimiter || s[i] == 0) {
if (last_char != 0 && last_char != delimiter) __ksplit_aux; // the end of a field
} else {
if (last_char == delimiter || last_char == 0) last_start = i;
}
}
last_char = s[i];
}
*_max = max; *_offsets = offsets;
return n;
}
int kgetline(kstring_t *s, kgets_func *fgets_fn, void *fp)
{
size_t l0 = s->l;
while (s->l == l0 || s->s[s->l-1] != '\n') {
if (s->m - s->l < 200) ks_resize(s, s->m + 200);
if (fgets_fn(s->s + s->l, s->m - s->l, fp) == NULL) break;
s->l += strlen(s->s + s->l);
}
if (s->l == l0) return EOF;
if (s->l > l0 && s->s[s->l-1] == '\n') {
s->l--;
if (s->l > l0 && s->s[s->l-1] == '\r') s->l--;
}
s->s[s->l] = '\0';
return 0;
}
/**********************
* Boyer-Moore search *
**********************/
typedef unsigned char ubyte_t;
// reference: http://www-igm.univ-mlv.fr/~lecroq/string/node14.html
static int *ksBM_prep(const ubyte_t *pat, int m)
{
int i, *suff, *prep, *bmGs, *bmBc;
prep = (int*)calloc(m + 256, sizeof(int));
bmGs = prep; bmBc = prep + m;
{ // preBmBc()
for (i = 0; i < 256; ++i) bmBc[i] = m;
for (i = 0; i < m - 1; ++i) bmBc[pat[i]] = m - i - 1;
}
suff = (int*)calloc(m, sizeof(int));
{ // suffixes()
int f = 0, g;
suff[m - 1] = m;
g = m - 1;
for (i = m - 2; i >= 0; --i) {
if (i > g && suff[i + m - 1 - f] < i - g)
suff[i] = suff[i + m - 1 - f];
else {
if (i < g) g = i;
f = i;
while (g >= 0 && pat[g] == pat[g + m - 1 - f]) --g;
suff[i] = f - g;
}
}
}
{ // preBmGs()
int j = 0;
for (i = 0; i < m; ++i) bmGs[i] = m;
for (i = m - 1; i >= 0; --i)
if (suff[i] == i + 1)
for (; j < m - 1 - i; ++j)
if (bmGs[j] == m)
bmGs[j] = m - 1 - i;
for (i = 0; i <= m - 2; ++i)
bmGs[m - 1 - suff[i]] = m - 1 - i;
}
free(suff);
return prep;
}
void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep)
{
int i, j, *prep = 0, *bmGs, *bmBc;
const ubyte_t *str, *pat;
str = (const ubyte_t*)_str; pat = (const ubyte_t*)_pat;
prep = (_prep == 0 || *_prep == 0)? ksBM_prep(pat, m) : *_prep;
if (_prep && *_prep == 0) *_prep = prep;
bmGs = prep; bmBc = prep + m;
j = 0;
while (j <= n - m) {
for (i = m - 1; i >= 0 && pat[i] == str[i+j]; --i);
if (i >= 0) {
int max = bmBc[str[i+j]] - m + 1 + i;
if (max < bmGs[i]) max = bmGs[i];
j += max;
} else return (void*)(str + j);
}
if (_prep == 0) free(prep);
return 0;
}
char *kstrstr(const char *str, const char *pat, int **_prep)
{
return (char*)kmemmem(str, strlen(str), pat, strlen(pat), _prep);
}
char *kstrnstr(const char *str, const char *pat, int n, int **_prep)
{
return (char*)kmemmem(str, n, pat, strlen(pat), _prep);
}
/***********************
* The main() function *
***********************/
#ifdef KSTRING_MAIN
#include <stdio.h>
int main()
{
kstring_t *s;
int *fields, n, i;
ks_tokaux_t aux;
char *p;
s = (kstring_t*)calloc(1, sizeof(kstring_t));
// test ksprintf()
ksprintf(s, " abcdefg: %d ", 100);
printf("'%s'\n", s->s);
// test ksplit()
fields = ksplit(s, 0, &n);
for (i = 0; i < n; ++i)
printf("field[%d] = '%s'\n", i, s->s + fields[i]);
// test kstrtok()
s->l = 0;
for (p = kstrtok("ab:cde:fg/hij::k", ":/", &aux); p; p = kstrtok(0, 0, &aux)) {
kputsn(p, aux.p - p, s);
kputc('\n', s);
}
printf("%s", s->s);
// free
free(s->s); free(s); free(fields);
{
static char *str = "abcdefgcdgcagtcakcdcd";
static char *pat = "cd";
char *ret, *s = str;
int *prep = 0;
while ((ret = kstrstr(s, pat, &prep)) != 0) {
printf("match: %s\n", ret);
s = ret + prep[0];
}
free(prep);
}
return 0;
}
#endif

ext/klib/kstring.h 100644

@@ -0,0 +1,277 @@
/* The MIT License
Copyright (c) by Attractive Chaos <attractor@live.co.uk>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
#ifndef KSTRING_H
#define KSTRING_H
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif
#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ > 4)
#define KS_ATTR_PRINTF(fmt, arg) __attribute__((__format__ (__printf__, fmt, arg)))
#else
#define KS_ATTR_PRINTF(fmt, arg)
#endif
/* kstring_t is a simple non-opaque type whose fields are likely to be
* used directly by user code (but see also ks_str() and ks_len() below).
* A kstring_t object is initialised by either of
* kstring_t str = { 0, 0, NULL };
* kstring_t str; ...; str.l = str.m = 0; str.s = NULL;
* and either ownership of the underlying buffer should be given away before
* the object disappears (see ks_release() below) or the kstring_t should be
* destroyed with free(str.s); */
#ifndef KSTRING_T
#define KSTRING_T kstring_t
typedef struct __kstring_t {
size_t l, m;
char *s;
} kstring_t;
#endif
typedef struct {
uint64_t tab[4];
int sep, finished;
const char *p; // end of the current token
} ks_tokaux_t;
#ifdef __cplusplus
extern "C" {
#endif
int kvsprintf(kstring_t *s, const char *fmt, va_list ap) KS_ATTR_PRINTF(2,0);
int ksprintf(kstring_t *s, const char *fmt, ...) KS_ATTR_PRINTF(2,3);
int ksplit_core(char *s, int delimiter, int *_max, int **_offsets);
char *kstrstr(const char *str, const char *pat, int **_prep);
char *kstrnstr(const char *str, const char *pat, int n, int **_prep);
void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep);
/* kstrtok() is similar to strtok_r() except that str is not
* modified and both str and sep can be NULL. For efficiency, it is
* actually recommended to set both to NULL in the subsequent calls
* if sep is not changed. */
char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux);
/* kgetline() uses the supplied fgets()-like function to read a "\n"-
* or "\r\n"-terminated line from fp. The line read is appended to the
* kstring without its terminator and 0 is returned; EOF is returned at
* EOF or on error (determined by querying fp, as per fgets()). */
typedef char *kgets_func(char *, int, void *);
int kgetline(kstring_t *s, kgets_func *fgets, void *fp);
#ifdef __cplusplus
}
#endif
static inline int ks_resize(kstring_t *s, size_t size)
{
if (s->m < size) {
char *tmp;
s->m = size;
kroundup32(s->m);
if ((tmp = (char*)realloc(s->s, s->m)))
s->s = tmp;
else
return -1;
}
return 0;
}
static inline char *ks_str(kstring_t *s)
{
return s->s;
}
static inline size_t ks_len(kstring_t *s)
{
return s->l;
}
// Give ownership of the underlying buffer away to something else (making
// that something else responsible for freeing it), leaving the kstring_t
// empty and ready to be used again, or ready to go out of scope without
// needing free(str.s) to prevent a memory leak.
static inline char *ks_release(kstring_t *s)
{
char *ss = s->s;
s->l = s->m = 0;
s->s = NULL;
return ss;
}
static inline int kputsn(const char *p, int l, kstring_t *s)
{
if (s->l + l + 1 >= s->m) {
char *tmp;
s->m = s->l + l + 2;
kroundup32(s->m);
if ((tmp = (char*)realloc(s->s, s->m)))
s->s = tmp;
else
return EOF;
}
memcpy(s->s + s->l, p, l);
s->l += l;
s->s[s->l] = 0;
return l;
}
static inline int kputs(const char *p, kstring_t *s)
{
return kputsn(p, strlen(p), s);
}
static inline int kputc(int c, kstring_t *s)
{
if (s->l + 1 >= s->m) {
char *tmp;
s->m = s->l + 2;
kroundup32(s->m);
if ((tmp = (char*)realloc(s->s, s->m)))
s->s = tmp;
else
return EOF;
}
s->s[s->l++] = c;
s->s[s->l] = 0;
return c;
}
static inline int kputc_(int c, kstring_t *s)
{
if (s->l + 1 > s->m) {
char *tmp;
s->m = s->l + 1;
kroundup32(s->m);
if ((tmp = (char*)realloc(s->s, s->m)))
s->s = tmp;
else
return EOF;
}
s->s[s->l++] = c;
return 1;
}
static inline int kputsn_(const void *p, int l, kstring_t *s)
{
if (s->l + l > s->m) {
char *tmp;
s->m = s->l + l;
kroundup32(s->m);
if ((tmp = (char*)realloc(s->s, s->m)))
s->s = tmp;
else
return EOF;
}
memcpy(s->s + s->l, p, l);
s->l += l;
return l;
}
static inline int kputw(int c, kstring_t *s)
{
char buf[16];
int i, l = 0;
unsigned int x = c;
if (c < 0) x = -x;
do { buf[l++] = x%10 + '0'; x /= 10; } while (x > 0);
if (c < 0) buf[l++] = '-';
if (s->l + l + 1 >= s->m) {
char *tmp;
s->m = s->l + l + 2;
kroundup32(s->m);
if ((tmp = (char*)realloc(s->s, s->m)))
s->s = tmp;
else
return EOF;
}
for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i];
s->s[s->l] = 0;
return 0;
}
static inline int kputuw(unsigned c, kstring_t *s)
{
char buf[16];
int l, i;
unsigned x;
if (c == 0) return kputc('0', s);
for (l = 0, x = c; x > 0; x /= 10) buf[l++] = x%10 + '0';
if (s->l + l + 1 >= s->m) {
char *tmp;
s->m = s->l + l + 2;
kroundup32(s->m);
if ((tmp = (char*)realloc(s->s, s->m)))
s->s = tmp;
else
return EOF;
}
for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i];
s->s[s->l] = 0;
return 0;
}
static inline int kputl(long c, kstring_t *s)
{
char buf[32];
int i, l = 0;
unsigned long x = c;
if (c < 0) x = -x;
do { buf[l++] = x%10 + '0'; x /= 10; } while (x > 0);
if (c < 0) buf[l++] = '-';
if (s->l + l + 1 >= s->m) {
char *tmp;
s->m = s->l + l + 2;
kroundup32(s->m);
if ((tmp = (char*)realloc(s->s, s->m)))
s->s = tmp;
else
return EOF;
}
for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i];
s->s[s->l] = 0;
return 0;
}
/*
* Returns 's' split by delimiter, with *n being the number of components;
 * NULL on failure.
*/
static inline int *ksplit(kstring_t *s, int delimiter, int *n)
{
int max = 0, *offsets = 0;
*n = ksplit_core(s->s, delimiter, &max, &offsets);
return offsets;
}
#endif

ext/klib/ksw.c 100644

@@ -0,0 +1,633 @@
/* The MIT License
Copyright (c) 2011 by Attractive Chaos <attractor@live.co.uk>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
#include <stdlib.h>
#include <stdint.h>
#include <emmintrin.h>
#include "ksw.h"
#ifdef __GNUC__
#define LIKELY(x) __builtin_expect((x),1)
#define UNLIKELY(x) __builtin_expect((x),0)
#else
#define LIKELY(x) (x)
#define UNLIKELY(x) (x)
#endif
const kswr_t g_defr = { 0, -1, -1, -1, -1, -1, -1 };
struct _kswq_t {
int qlen, slen;
uint8_t shift, mdiff, max, size;
__m128i *qp, *H0, *H1, *E, *Hmax;
};
/**
* Initialize the query data structure
*
 * @param size   Number of bytes used to store a score; valid values are 1 or 2
* @param qlen Length of the query sequence
* @param query Query sequence
* @param m Size of the alphabet
 * @param mat    Scoring matrix in a one-dimensional array
*
* @return Query data structure
*/
kswq_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat)
{
kswq_t *q;
int slen, a, tmp, p;
size = size > 1? 2 : 1;
p = 8 * (3 - size); // # values per __m128i
slen = (qlen + p - 1) / p; // segmented length
q = (kswq_t*)malloc(sizeof(kswq_t) + 256 + 16 * slen * (m + 4)); // a single block of memory
q->qp = (__m128i*)(((size_t)q + sizeof(kswq_t) + 15) >> 4 << 4); // align memory
q->H0 = q->qp + slen * m;
q->H1 = q->H0 + slen;
q->E = q->H1 + slen;
q->Hmax = q->E + slen;
q->slen = slen; q->qlen = qlen; q->size = size;
// compute shift
tmp = m * m;
for (a = 0, q->shift = 127, q->mdiff = 0; a < tmp; ++a) { // find the minimum and maximum score
if (mat[a] < (int8_t)q->shift) q->shift = mat[a];
if (mat[a] > (int8_t)q->mdiff) q->mdiff = mat[a];
}
q->max = q->mdiff;
q->shift = 256 - q->shift; // NB: q->shift is uint8_t
q->mdiff += q->shift; // this is the difference between the min and max scores
// An example: p=8, qlen=19, slen=3 and segmentation:
// {{0,3,6,9,12,15,18,-1},{1,4,7,10,13,16,-1,-1},{2,5,8,11,14,17,-1,-1}}
if (size == 1) {
int8_t *t = (int8_t*)q->qp;
for (a = 0; a < m; ++a) {
int i, k, nlen = slen * p;
const int8_t *ma = mat + a * m;
for (i = 0; i < slen; ++i)
for (k = i; k < nlen; k += slen) // p iterations
*t++ = (k >= qlen? 0 : ma[query[k]]) + q->shift;
}
} else {
int16_t *t = (int16_t*)q->qp;
for (a = 0; a < m; ++a) {
int i, k, nlen = slen * p;
const int8_t *ma = mat + a * m;
for (i = 0; i < slen; ++i)
for (k = i; k < nlen; k += slen) // p iterations
*t++ = (k >= qlen? 0 : ma[query[k]]);
}
}
return q;
}
kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, int xtra) // the first gap costs -(_o+_e)
{
int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc;
uint64_t *b;
__m128i zero, gapoe, gape, shift, *H0, *H1, *E, *Hmax;
kswr_t r;
#define __max_16(ret, xx) do { \
(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 8)); \
(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 4)); \
(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 2)); \
(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 1)); \
(ret) = _mm_extract_epi16((xx), 0) & 0x00ff; \
} while (0)
// initialization
r = g_defr;
minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000;
endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000;
m_b = n_b = 0; b = 0;
zero = _mm_set1_epi32(0);
gapoe = _mm_set1_epi8(_gapo + _gape);
gape = _mm_set1_epi8(_gape);
shift = _mm_set1_epi8(q->shift);
H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
slen = q->slen;
for (i = 0; i < slen; ++i) {
_mm_store_si128(E + i, zero);
_mm_store_si128(H0 + i, zero);
_mm_store_si128(Hmax + i, zero);
}
// the core loop
for (i = 0; i < tlen; ++i) {
int j, k, cmp, imax;
__m128i e, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
h = _mm_slli_si128(h, 1); // h=H(i-1,-1); << instead of >> because x64 is little-endian
for (j = 0; LIKELY(j < slen); ++j) {
/* SW cells are computed in the following order:
* H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)}
* E(i+1,j) = max{H(i,j)-q, E(i,j)-r}
* F(i,j+1) = max{H(i,j)-q, F(i,j)-r}
*/
// compute H'(i,j); note that at the beginning, h=H'(i-1,j-1)
h = _mm_adds_epu8(h, _mm_load_si128(S + j));
h = _mm_subs_epu8(h, shift); // h=H'(i-1,j-1)+S(i,j)
e = _mm_load_si128(E + j); // e=E'(i,j)
h = _mm_max_epu8(h, e);
h = _mm_max_epu8(h, f); // h=H'(i,j)
max = _mm_max_epu8(max, h); // set max
_mm_store_si128(H1 + j, h); // save to H'(i,j)
// now compute E'(i+1,j)
h = _mm_subs_epu8(h, gapoe); // h=H'(i,j)-gapo
e = _mm_subs_epu8(e, gape); // e=E'(i,j)-gape
e = _mm_max_epu8(e, h); // e=E'(i+1,j)
_mm_store_si128(E + j, e); // save to E'(i+1,j)
// now compute F'(i,j+1)
f = _mm_subs_epu8(f, gape);
f = _mm_max_epu8(f, h);
// get H'(i-1,j) and prepare for the next j
h = _mm_load_si128(H0 + j); // h=H'(i-1,j)
}
		// NB: we do not need to set E(i,j) as we disallow adjacent insertion and then deletion
for (k = 0; LIKELY(k < 16); ++k) { // this block mimics SWPS3; NB: H(i,j) updated in the lazy-F loop cannot exceed max
f = _mm_slli_si128(f, 1);
for (j = 0; LIKELY(j < slen); ++j) {
h = _mm_load_si128(H1 + j);
h = _mm_max_epu8(h, f); // h=H'(i,j)
_mm_store_si128(H1 + j, h);
h = _mm_subs_epu8(h, gapoe);
f = _mm_subs_epu8(f, gape);
cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_subs_epu8(f, h), zero));
if (UNLIKELY(cmp == 0xffff)) goto end_loop16;
}
}
end_loop16:
//int k;for (k=0;k<16;++k)printf("%d ", ((uint8_t*)&max)[k]);printf("\n");
__max_16(imax, max); // imax is the maximum number in max
		if (imax >= minsc) { // write the b array; this condition adds branching unfortunately
if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { // then append
if (n_b == m_b) {
m_b = m_b? m_b<<1 : 8;
b = (uint64_t*)realloc(b, 8 * m_b);
}
b[n_b++] = (uint64_t)imax<<32 | i;
} else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last
}
if (imax > gmax) {
gmax = imax; te = i; // te is the end position on the target
for (j = 0; LIKELY(j < slen); ++j) // keep the H1 vector
_mm_store_si128(Hmax + j, _mm_load_si128(H1 + j));
if (gmax + q->shift >= 255 || gmax >= endsc) break;
}
S = H1; H1 = H0; H0 = S; // swap H0 and H1
}
r.score = gmax + q->shift < 255? gmax : 255;
r.te = te;
if (r.score != 255) { // get a->qe, the end of query match; find the 2nd best score
int max = -1, low, high, qlen = slen * 16;
uint8_t *t = (uint8_t*)Hmax;
for (i = 0; i < qlen; ++i, ++t)
if ((int)*t > max) max = *t, r.qe = i / 16 + i % 16 * slen;
//printf("%d,%d\n", max, gmax);
if (b) {
i = (r.score + q->max - 1) / q->max;
low = te - i; high = te + i;
for (i = 0; i < n_b; ++i) {
int e = (int32_t)b[i];
if ((e < low || e > high) && (int)(b[i]>>32) > r.score2)
r.score2 = b[i]>>32, r.te2 = e;
}
}
}
free(b);
return r;
}
kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, int xtra) // the first gap costs -(_o+_e)
{
int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc;
uint64_t *b;
__m128i zero, gapoe, gape, *H0, *H1, *E, *Hmax;
kswr_t r;
#define __max_8(ret, xx) do { \
(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \
(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 4)); \
(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 2)); \
(ret) = _mm_extract_epi16((xx), 0); \
} while (0)
// initialization
r = g_defr;
minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000;
endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000;
m_b = n_b = 0; b = 0;
zero = _mm_set1_epi32(0);
gapoe = _mm_set1_epi16(_gapo + _gape);
gape = _mm_set1_epi16(_gape);
H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
slen = q->slen;
for (i = 0; i < slen; ++i) {
_mm_store_si128(E + i, zero);
_mm_store_si128(H0 + i, zero);
_mm_store_si128(Hmax + i, zero);
}
// the core loop
for (i = 0; i < tlen; ++i) {
int j, k, imax;
__m128i e, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
h = _mm_slli_si128(h, 2);
for (j = 0; LIKELY(j < slen); ++j) {
h = _mm_adds_epi16(h, *S++);
e = _mm_load_si128(E + j);
h = _mm_max_epi16(h, e);
h = _mm_max_epi16(h, f);
max = _mm_max_epi16(max, h);
_mm_store_si128(H1 + j, h);
h = _mm_subs_epu16(h, gapoe);
e = _mm_subs_epu16(e, gape);
e = _mm_max_epi16(e, h);
_mm_store_si128(E + j, e);
f = _mm_subs_epu16(f, gape);
f = _mm_max_epi16(f, h);
h = _mm_load_si128(H0 + j);
}
for (k = 0; LIKELY(k < 16); ++k) {
f = _mm_slli_si128(f, 2);
for (j = 0; LIKELY(j < slen); ++j) {
h = _mm_load_si128(H1 + j);
h = _mm_max_epi16(h, f);
_mm_store_si128(H1 + j, h);
h = _mm_subs_epu16(h, gapoe);
f = _mm_subs_epu16(f, gape);
if(UNLIKELY(!_mm_movemask_epi8(_mm_cmpgt_epi16(f, h)))) goto end_loop8;
}
}
end_loop8:
__max_8(imax, max);
if (imax >= minsc) {
if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) {
if (n_b == m_b) {
m_b = m_b? m_b<<1 : 8;
b = (uint64_t*)realloc(b, 8 * m_b);
}
b[n_b++] = (uint64_t)imax<<32 | i;
} else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last
}
if (imax > gmax) {
gmax = imax; te = i;
for (j = 0; LIKELY(j < slen); ++j)
_mm_store_si128(Hmax + j, _mm_load_si128(H1 + j));
if (gmax >= endsc) break;
}
S = H1; H1 = H0; H0 = S;
}
r.score = gmax; r.te = te;
{
int max = -1, low, high, qlen = slen * 8;
uint16_t *t = (uint16_t*)Hmax;
for (i = 0, r.qe = -1; i < qlen; ++i, ++t)
if ((int)*t > max) max = *t, r.qe = i / 8 + i % 8 * slen;
if (b) {
i = (r.score + q->max - 1) / q->max;
low = te - i; high = te + i;
for (i = 0; i < n_b; ++i) {
int e = (int32_t)b[i];
if ((e < low || e > high) && (int)(b[i]>>32) > r.score2)
r.score2 = b[i]>>32, r.te2 = e;
}
}
}
free(b);
return r;
}
static void revseq(int l, uint8_t *s)
{
int i, t;
for (i = 0; i < l>>1; ++i)
t = s[i], s[i] = s[l - 1 - i], s[l - 1 - i] = t;
}
kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry)
{
int size;
kswq_t *q;
kswr_t r, rr;
kswr_t (*func)(kswq_t*, int, const uint8_t*, int, int, int);
q = (qry && *qry)? *qry : ksw_qinit((xtra&KSW_XBYTE)? 1 : 2, qlen, query, m, mat);
if (qry && *qry == 0) *qry = q;
func = q->size == 2? ksw_i16 : ksw_u8;
size = q->size;
r = func(q, tlen, target, gapo, gape, xtra);
if (qry == 0) free(q);
if ((xtra&KSW_XSTART) == 0 || ((xtra&KSW_XSUBO) && r.score < (xtra&0xffff))) return r;
revseq(r.qe + 1, query); revseq(r.te + 1, target); // +1 because qe/te points to the exact end, not the position after the end
q = ksw_qinit(size, r.qe + 1, query, m, mat);
rr = func(q, tlen, target, gapo, gape, KSW_XSTOP | r.score);
revseq(r.qe + 1, query); revseq(r.te + 1, target);
free(q);
if (r.score == rr.score)
r.tb = r.te - rr.te, r.qb = r.qe - rr.qe;
return r;
}
/********************
*** SW extension ***
********************/
typedef struct {
int32_t h, e;
} eh_t;
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle)
{
eh_t *eh; // score array
int8_t *qp; // query profile
int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap;
if (h0 < 0) h0 = 0;
// allocate memory
qp = malloc(qlen * m);
eh = calloc(qlen + 1, 8);
// generate the query profile
for (k = i = 0; k < m; ++k) {
const int8_t *p = &mat[k * m];
for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]];
}
// fill the first row
eh[0].h = h0; eh[1].h = h0 > gapoe? h0 - gapoe : 0;
for (j = 2; j <= qlen && eh[j-1].h > gape; ++j)
eh[j].h = eh[j-1].h - gape;
// adjust $w if it is too large
k = m * m;
for (i = 0, max = 0; i < k; ++i) // get the max score
max = max > mat[i]? max : mat[i];
max_gap = (int)((double)(qlen * max - gapo) / gape + 1.);
max_gap = max_gap > 1? max_gap : 1;
w = w < max_gap? w : max_gap;
// DP loop
max = h0, max_i = max_j = -1;
beg = 0, end = qlen;
for (i = 0; LIKELY(i < tlen); ++i) {
int f = 0, h1, m = 0, mj = -1;
int8_t *q = &qp[target[i] * qlen];
// compute the first column
h1 = h0 - (gapo + gape * (i + 1));
if (h1 < 0) h1 = 0;
// apply the band and the constraint (if provided)
if (beg < i - w) beg = i - w;
if (end > i + w + 1) end = i + w + 1;
if (end > qlen) end = qlen;
for (j = beg; LIKELY(j < end); ++j) {
// At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1)
// Similar to SSE2-SW, cells are computed in the following order:
// H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)}
// E(i+1,j) = max{H(i,j)-gapo, E(i,j)} - gape
// F(i,j+1) = max{H(i,j)-gapo, F(i,j)} - gape
eh_t *p = &eh[j];
int h = p->h, e = p->e; // get H(i-1,j-1) and E(i-1,j)
p->h = h1; // set H(i,j-1) for the next row
h += q[j];
h = h > e? h : e;
h = h > f? h : f;
h1 = h; // save H(i,j) to h1 for the next column
mj = m > h? mj : j;
m = m > h? m : h; // m is stored at eh[mj+1]
h -= gapoe;
h = h > 0? h : 0;
e -= gape;
e = e > h? e : h; // computed E(i+1,j)
p->e = e; // save E(i+1,j) for the next row
f -= gape;
f = f > h? f : h; // computed F(i,j+1)
}
eh[end].h = h1; eh[end].e = 0;
if (m == 0) break;
if (m > max) max = m, max_i = i, max_j = mj;
// update beg and end for the next round
for (j = mj; j >= beg && eh[j].h; --j);
beg = j + 1;
for (j = mj + 2; j <= end && eh[j].h; ++j);
end = j;
//beg = 0; end = qlen; // uncomment this line for debugging
}
free(eh); free(qp);
if (_qle) *_qle = max_j + 1;
if (_tle) *_tle = max_i + 1;
return max;
}
/********************
* Global alignment *
********************/
#define MINUS_INF -0x40000000
static inline uint32_t *push_cigar(int *n_cigar, int *m_cigar, uint32_t *cigar, int op, int len)
{
if (*n_cigar == 0 || op != (cigar[(*n_cigar) - 1]&0xf)) {
if (*n_cigar == *m_cigar) {
*m_cigar = *m_cigar? (*m_cigar)<<1 : 4;
cigar = realloc(cigar, (*m_cigar) << 2);
}
cigar[(*n_cigar)++] = len<<4 | op;
} else cigar[(*n_cigar)-1] += len<<4;
return cigar;
}
int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar_, uint32_t **cigar_)
{
eh_t *eh;
int8_t *qp; // query profile
int i, j, k, gapoe = gapo + gape, score, n_col;
uint8_t *z; // backtrack matrix; in each cell: f<<4|e<<2|h; in principle, we can halve the memory, but backtrack will be a little more complex
if (n_cigar_) *n_cigar_ = 0;
// allocate memory
n_col = qlen < 2*w+1? qlen : 2*w+1; // maximum #columns of the backtrack matrix
z = malloc(n_col * tlen);
qp = malloc(qlen * m);
eh = calloc(qlen + 1, 8);
// generate the query profile
for (k = i = 0; k < m; ++k) {
const int8_t *p = &mat[k * m];
for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]];
}
// fill the first row
eh[0].h = 0; eh[0].e = MINUS_INF;
for (j = 1; j <= qlen && j <= w; ++j)
eh[j].h = -(gapo + gape * j), eh[j].e = MINUS_INF;
for (; j <= qlen; ++j) eh[j].h = eh[j].e = MINUS_INF; // everything is -inf outside the band
// DP loop
for (i = 0; LIKELY(i < tlen); ++i) { // target sequence is in the outer loop
int32_t f = MINUS_INF, h1, beg, end;
int8_t *q = &qp[target[i] * qlen];
uint8_t *zi = &z[i * n_col];
beg = i > w? i - w : 0;
end = i + w + 1 < qlen? i + w + 1 : qlen; // only loop through [beg,end) of the query sequence
h1 = beg == 0? -(gapo + gape * (i + 1)) : MINUS_INF;
for (j = beg; LIKELY(j < end); ++j) {
// This loop is organized in a similar way to ksw_extend() and ksw_sse2(), except:
// 1) not checking h>0; 2) recording direction for backtracking
eh_t *p = &eh[j];
int32_t h = p->h, e = p->e;
uint8_t d; // direction
p->h = h1;
h += q[j];
d = h > e? 0 : 1;
h = h > e? h : e;
d = h > f? d : 2;
h = h > f? h : f;
h1 = h;
h -= gapoe;
e -= gape;
d |= e > h? 1<<2 : 0;
e = e > h? e : h;
p->e = e;
f -= gape;
d |= f > h? 2<<4 : 0; // if we want to halve the memory, use one bit only, instead of two
f = f > h? f : h;
zi[j - beg] = d; // z[i,j] keeps h for the current cell and e/f for the next cell
}
eh[end].h = h1; eh[end].e = MINUS_INF;
}
score = eh[qlen].h;
if (n_cigar_ && cigar_) { // backtrack
int n_cigar = 0, m_cigar = 0, which = 0;
uint32_t *cigar = 0, tmp;
i = tlen - 1; k = (i + w + 1 < qlen? i + w + 1 : qlen) - 1; // (i,k) points to the last cell
while (i >= 0 && k >= 0) {
which = z[i * n_col + (k - (i > w? i - w : 0))] >> (which<<1) & 3;
if (which == 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 0, 1), --i, --k;
else if (which == 1) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, 1), --i;
else cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, 1), --k;
}
if (i >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, i + 1);
if (k >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, k + 1);
for (i = 0; i < n_cigar>>1; ++i) // reverse CIGAR
tmp = cigar[i], cigar[i] = cigar[n_cigar-1-i], cigar[n_cigar-1-i] = tmp;
*n_cigar_ = n_cigar, *cigar_ = cigar;
}
free(eh); free(qp); free(z);
return score;
}
/*******************************************
* Main function (not compiled by default) *
*******************************************/
#ifdef _KSW_MAIN
#include <unistd.h>
#include <stdio.h>
#include <zlib.h>
#include "kseq.h"
KSEQ_INIT(gzFile, gzread)
unsigned char seq_nt4_table[256] = {
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
};
int main(int argc, char *argv[])
{
int c, sa = 1, sb = 3, i, j, k, forward_only = 0, max_rseq = 0;
int8_t mat[25];
int gapo = 5, gape = 2, minsc = 0, xtra = KSW_XSTART;
uint8_t *rseq = 0;
gzFile fpt, fpq;
kseq_t *kst, *ksq;
// parse command line
while ((c = getopt(argc, argv, "a:b:q:r:ft:1")) >= 0) {
switch (c) {
case 'a': sa = atoi(optarg); break;
case 'b': sb = atoi(optarg); break;
case 'q': gapo = atoi(optarg); break;
case 'r': gape = atoi(optarg); break;
case 't': minsc = atoi(optarg); break;
case 'f': forward_only = 1; break;
case '1': xtra |= KSW_XBYTE; break;
}
}
if (optind + 2 > argc) {
fprintf(stderr, "Usage: ksw [-1] [-f] [-a%d] [-b%d] [-q%d] [-r%d] [-t%d] <target.fa> <query.fa>\n", sa, sb, gapo, gape, minsc);
return 1;
}
if (minsc > 0xffff) minsc = 0xffff;
xtra |= KSW_XSUBO | minsc;
// initialize scoring matrix
for (i = k = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j)
mat[k++] = i == j? sa : -sb;
mat[k++] = 0; // ambiguous base
}
for (j = 0; j < 5; ++j) mat[k++] = 0;
// open file
fpt = gzopen(argv[optind], "r"); kst = kseq_init(fpt);
fpq = gzopen(argv[optind+1], "r"); ksq = kseq_init(fpq);
// all-pair alignment
while (kseq_read(ksq) > 0) {
kswq_t *q[2] = {0, 0};
kswr_t r;
for (i = 0; i < (int)ksq->seq.l; ++i) ksq->seq.s[i] = seq_nt4_table[(int)ksq->seq.s[i]];
if (!forward_only) { // reverse
if ((int)ksq->seq.m > max_rseq) {
max_rseq = ksq->seq.m;
rseq = (uint8_t*)realloc(rseq, max_rseq);
}
for (i = 0, j = ksq->seq.l - 1; i < (int)ksq->seq.l; ++i, --j)
rseq[j] = ksq->seq.s[i] == 4? 4 : 3 - ksq->seq.s[i];
}
gzrewind(fpt); kseq_rewind(kst);
while (kseq_read(kst) > 0) {
for (i = 0; i < (int)kst->seq.l; ++i) kst->seq.s[i] = seq_nt4_table[(int)kst->seq.s[i]];
r = ksw_align(ksq->seq.l, (uint8_t*)ksq->seq.s, kst->seq.l, (uint8_t*)kst->seq.s, 5, mat, gapo, gape, xtra, &q[0]);
if (r.score >= minsc)
printf("%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n", kst->name.s, r.tb, r.te+1, ksq->name.s, r.qb, r.qe+1, r.score, r.score2, r.te2);
if (rseq) {
r = ksw_align(ksq->seq.l, rseq, kst->seq.l, (uint8_t*)kst->seq.s, 5, mat, gapo, gape, xtra, &q[1]);
if (r.score >= minsc)
printf("%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n", kst->name.s, r.tb, r.te+1, ksq->name.s, (int)ksq->seq.l - r.qb, (int)ksq->seq.l - 1 - r.qe, r.score, r.score2, r.te2);
}
}
free(q[0]); free(q[1]);
}
free(rseq);
kseq_destroy(kst); gzclose(fpt);
kseq_destroy(ksq); gzclose(fpq);
return 0;
}
#endif

ext/klib/ksw.h 100644

@@ -0,0 +1,72 @@
#ifndef __AC_KSW_H
#define __AC_KSW_H
#include <stdint.h>
#define KSW_XBYTE 0x10000
#define KSW_XSTOP 0x20000
#define KSW_XSUBO 0x40000
#define KSW_XSTART 0x80000
struct _kswq_t;
typedef struct _kswq_t kswq_t;
typedef struct {
int score; // best score
int te, qe; // target end and query end
int score2, te2; // second best score and ending position on the target
int tb, qb; // target start and query start
} kswr_t;
#ifdef __cplusplus
extern "C" {
#endif
/**
* Aligning two sequences
*
* @param qlen length of the query sequence (typically <tlen)
* @param query query sequence with 0 <= query[i] < m
* @param tlen length of the target sequence
* @param target target sequence
* @param m number of residue types
 * @param mat     m*m scoring matrix in a one-dimensional array
* @param gapo gap open penalty; a gap of length l cost "-(gapo+l*gape)"
* @param gape gap extension penalty
* @param xtra extra information (see below)
* @param qry query profile (see below)
*
* @return alignment information in a struct; unset values to -1
*
* When xtra==0, ksw_align() uses a signed two-byte integer to store a
* score and only finds the best score and the end positions. The 2nd best
* score or the start positions are not attempted. The default behavior can
* be tuned by setting KSW_X* flags:
*
* KSW_XBYTE: use an unsigned byte to store a score. If overflow occurs,
* kswr_t::score will be set to 255
*
* KSW_XSUBO: track the 2nd best score and the ending position on the
* target if the 2nd best is higher than (xtra&0xffff)
*
* KSW_XSTOP: stop if the maximum score is above (xtra&0xffff)
*
* KSW_XSTART: find the start positions
*
* When *qry==NULL, ksw_align() will compute and allocate the query profile
* and when the function returns, *qry will point to the profile, which can
* be deallocated simply by free(). If one query is aligned against multiple
* target sequences, *qry should be set to NULL during the first call and
* freed after the last call. Note that qry can equal 0. In this case, the
* query profile will be deallocated in ksw_align().
*/
kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry);
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle);
int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *_n_cigar, uint32_t **_cigar);
#ifdef __cplusplus
}
#endif
#endif

256
ext/klib/kthread.c 100644

@ -0,0 +1,256 @@
#include <pthread.h>
#include <stdlib.h>
#include <limits.h>
/************
* kt_for() *
************/
struct kt_for_t;
typedef struct {
struct kt_for_t *t;
long i;
} ktf_worker_t;
typedef struct kt_for_t {
int n_threads;
long n;
ktf_worker_t *w;
void (*func)(void*,long,int);
void *data;
} kt_for_t;
static inline long steal_work(kt_for_t *t)
{
int i, min_i = -1;
long k, min = LONG_MAX;
for (i = 0; i < t->n_threads; ++i)
if (min > t->w[i].i) min = t->w[i].i, min_i = i;
k = __sync_fetch_and_add(&t->w[min_i].i, t->n_threads);
return k >= t->n? -1 : k;
}
static void *ktf_worker(void *data)
{
ktf_worker_t *w = (ktf_worker_t*)data;
long i;
for (;;) {
i = __sync_fetch_and_add(&w->i, w->t->n_threads);
if (i >= w->t->n) break;
w->t->func(w->t->data, i, w - w->t->w);
}
while ((i = steal_work(w->t)) >= 0)
w->t->func(w->t->data, i, w - w->t->w);
pthread_exit(0);
}
void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n)
{
if (n_threads > 1) {
int i;
kt_for_t t;
pthread_t *tid;
t.func = func, t.data = data, t.n_threads = n_threads, t.n = n;
t.w = (ktf_worker_t*)alloca(n_threads * sizeof(ktf_worker_t));
tid = (pthread_t*)alloca(n_threads * sizeof(pthread_t));
for (i = 0; i < n_threads; ++i)
t.w[i].t = &t, t.w[i].i = i;
for (i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, ktf_worker, &t.w[i]);
for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0);
} else {
long j;
for (j = 0; j < n; ++j) func(data, j, 0);
}
}
/***************************
* kt_for with thread pool *
***************************/
struct kt_forpool_t;
typedef struct {
struct kt_forpool_t *t;
long i;
int action;
} kto_worker_t;
typedef struct kt_forpool_t {
int n_threads, n_pending;
long n;
pthread_t *tid;
kto_worker_t *w;
void (*func)(void*,long,int);
void *data;
pthread_mutex_t mutex;
pthread_cond_t cv_m, cv_s;
} kt_forpool_t;
static inline long kt_fp_steal_work(kt_forpool_t *t)
{
int i, min_i = -1;
long k, min = LONG_MAX;
for (i = 0; i < t->n_threads; ++i)
if (min > t->w[i].i) min = t->w[i].i, min_i = i;
k = __sync_fetch_and_add(&t->w[min_i].i, t->n_threads);
return k >= t->n? -1 : k;
}
static void *kt_fp_worker(void *data)
{
kto_worker_t *w = (kto_worker_t*)data;
kt_forpool_t *fp = w->t;
for (;;) {
long i;
int action;
pthread_mutex_lock(&fp->mutex);
if (--fp->n_pending == 0)
pthread_cond_signal(&fp->cv_m);
w->action = 0;
while (w->action == 0) pthread_cond_wait(&fp->cv_s, &fp->mutex);
action = w->action;
pthread_mutex_unlock(&fp->mutex);
if (action < 0) break;
for (;;) { // process jobs allocated to this worker
i = __sync_fetch_and_add(&w->i, fp->n_threads);
if (i >= fp->n) break;
fp->func(fp->data, i, w - fp->w);
}
while ((i = kt_fp_steal_work(fp)) >= 0) // steal jobs allocated to other workers
fp->func(fp->data, i, w - fp->w);
}
pthread_exit(0);
}
void *kt_forpool_init(int n_threads)
{
kt_forpool_t *fp;
int i;
fp = (kt_forpool_t*)calloc(1, sizeof(kt_forpool_t));
fp->n_threads = fp->n_pending = n_threads;
fp->tid = (pthread_t*)calloc(fp->n_threads, sizeof(pthread_t));
fp->w = (kto_worker_t*)calloc(fp->n_threads, sizeof(kto_worker_t));
for (i = 0; i < fp->n_threads; ++i) fp->w[i].t = fp;
pthread_mutex_init(&fp->mutex, 0);
pthread_cond_init(&fp->cv_m, 0);
pthread_cond_init(&fp->cv_s, 0);
for (i = 0; i < fp->n_threads; ++i) pthread_create(&fp->tid[i], 0, kt_fp_worker, &fp->w[i]);
pthread_mutex_lock(&fp->mutex);
while (fp->n_pending) pthread_cond_wait(&fp->cv_m, &fp->mutex);
pthread_mutex_unlock(&fp->mutex);
return fp;
}
void kt_forpool_destroy(void *_fp)
{
kt_forpool_t *fp = (kt_forpool_t*)_fp;
int i;
for (i = 0; i < fp->n_threads; ++i) fp->w[i].action = -1;
pthread_cond_broadcast(&fp->cv_s);
for (i = 0; i < fp->n_threads; ++i) pthread_join(fp->tid[i], 0);
pthread_cond_destroy(&fp->cv_s);
pthread_cond_destroy(&fp->cv_m);
pthread_mutex_destroy(&fp->mutex);
free(fp->w); free(fp->tid); free(fp);
}
void kt_forpool(void *_fp, void (*func)(void*,long,int), void *data, long n)
{
kt_forpool_t *fp = (kt_forpool_t*)_fp;
long i;
if (fp && fp->n_threads > 1) {
fp->n = n, fp->func = func, fp->data = data, fp->n_pending = fp->n_threads;
for (i = 0; i < fp->n_threads; ++i) fp->w[i].i = i, fp->w[i].action = 1;
pthread_mutex_lock(&fp->mutex);
pthread_cond_broadcast(&fp->cv_s);
while (fp->n_pending) pthread_cond_wait(&fp->cv_m, &fp->mutex);
pthread_mutex_unlock(&fp->mutex);
} else for (i = 0; i < n; ++i) func(data, i, 0);
}
/*****************
* kt_pipeline() *
*****************/
struct ktp_t;
typedef struct {
struct ktp_t *pl;
int64_t index;
int step;
void *data;
} ktp_worker_t;
typedef struct ktp_t {
void *shared;
void *(*func)(void*, int, void*);
int64_t index;
int n_workers, n_steps;
ktp_worker_t *workers;
pthread_mutex_t mutex;
pthread_cond_t cv;
} ktp_t;
static void *ktp_worker(void *data)
{
ktp_worker_t *w = (ktp_worker_t*)data;
ktp_t *p = w->pl;
while (w->step < p->n_steps) {
// test whether we can kick off the job with this worker
pthread_mutex_lock(&p->mutex);
for (;;) {
int i;
// test whether another worker is doing the same step
for (i = 0; i < p->n_workers; ++i) {
if (w == &p->workers[i]) continue; // ignore itself
if (p->workers[i].step <= w->step && p->workers[i].index < w->index)
break;
}
if (i == p->n_workers) break; // no workers with smaller indices are doing w->step or the previous steps
pthread_cond_wait(&p->cv, &p->mutex);
}
pthread_mutex_unlock(&p->mutex);
// working on w->step
w->data = p->func(p->shared, w->step, w->step? w->data : 0); // for the first step, input is NULL
// update step and let other workers know
pthread_mutex_lock(&p->mutex);
w->step = w->step == p->n_steps - 1 || w->data? (w->step + 1) % p->n_steps : p->n_steps;
if (w->step == 0) w->index = p->index++;
pthread_cond_broadcast(&p->cv);
pthread_mutex_unlock(&p->mutex);
}
pthread_exit(0);
}
void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps)
{
ktp_t aux;
pthread_t *tid;
int i;
if (n_threads < 1) n_threads = 1;
aux.n_workers = n_threads;
aux.n_steps = n_steps;
aux.func = func;
aux.shared = shared_data;
aux.index = 0;
pthread_mutex_init(&aux.mutex, 0);
pthread_cond_init(&aux.cv, 0);
aux.workers = (ktp_worker_t*)alloca(n_threads * sizeof(ktp_worker_t));
for (i = 0; i < n_threads; ++i) {
ktp_worker_t *w = &aux.workers[i];
w->step = 0; w->pl = &aux; w->data = 0;
w->index = aux.index++;
}
tid = (pthread_t*)alloca(n_threads * sizeof(pthread_t));
for (i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, ktp_worker, &aux.workers[i]);
for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0);
pthread_mutex_destroy(&aux.mutex);
pthread_cond_destroy(&aux.cv);
}

19
ext/klib/kthread.h 100644

@ -0,0 +1,19 @@
#ifndef KTHREAD_H
#define KTHREAD_H
#ifdef __cplusplus
extern "C" {
#endif
void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n);
void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps);
void *kt_forpool_init(int n_threads);
void kt_forpool_destroy(void *_fp);
void kt_forpool(void *_fp, void (*func)(void*,long,int), void *data, long n);
#ifdef __cplusplus
}
#endif
#endif

583
ext/klib/kurl.c 100644

@ -0,0 +1,583 @@
#include <stdio.h>
#include <fcntl.h>
#include <ctype.h>
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <curl/curl.h>
#include "kurl.h"
/**********************
*** Core kurl APIs ***
**********************/
#define KU_DEF_BUFLEN 0x8000
#define KU_MAX_SKIP (KU_DEF_BUFLEN<<1) // if seek step is smaller than this, skip
#define kurl_isfile(u) ((u)->fd >= 0)
#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif
struct kurl_t {
CURLM *multi; // cURL multi handler
CURL *curl; // cURL easy handle
uint8_t *buf; // buffer
off_t off0; // offset of the first byte in the buffer; the actual file offset equals off0 + p_buf
int fd; // file descriptor for a normal file; <0 for a remote file
int m_buf; // max buffer size; for a remote file, CURL_MAX_WRITE_SIZE*2 is recommended
int l_buf; // length of the buffer; l_buf == 0 iff the input read entirely; l_buf <= m_buf
int p_buf; // file position in the buffer; p_buf <= l_buf
int done_reading; // true if we can read nothing from the file; buffer may not be empty even if done_reading is set
int err; // error code
struct curl_slist *hdr;
};
typedef struct {
char *url, *date, *auth;
} s3aux_t;
int kurl_init(void) // required for SSL and win32 socket; NOT thread safe
{
return curl_global_init(CURL_GLOBAL_DEFAULT);
}
void kurl_destroy(void)
{
curl_global_cleanup();
}
static int prepare(kurl_t *ku, int do_seek)
{
if (kurl_isfile(ku)) {
if (do_seek && lseek(ku->fd, ku->off0, SEEK_SET) != ku->off0)
return -1;
} else { // FIXME: for S3, we need to re-authorize
int rc;
rc = curl_multi_remove_handle(ku->multi, ku->curl);
rc = curl_easy_setopt(ku->curl, CURLOPT_RESUME_FROM, ku->off0);
rc = curl_multi_add_handle(ku->multi, ku->curl);
}
ku->p_buf = ku->l_buf = 0; // empty the buffer
return 0;
}
static size_t write_cb(char *ptr, size_t size, size_t nmemb, void *data) // callback required by cURL
{
kurl_t *ku = (kurl_t*)data;
ssize_t nbytes = size * nmemb;
if (nbytes + ku->l_buf > ku->m_buf)
return CURL_WRITEFUNC_PAUSE;
memcpy(ku->buf + ku->l_buf, ptr, nbytes);
ku->l_buf += nbytes;
return nbytes;
}
static int fill_buffer(kurl_t *ku) // fill the buffer
{
assert(ku->p_buf == ku->l_buf); // buffer is always used up when fill_buffer() is called; otherwise a bug
ku->off0 += ku->l_buf;
ku->p_buf = ku->l_buf = 0;
if (ku->done_reading) return 0;
if (kurl_isfile(ku)) {
// The following block is equivalent to "ku->l_buf = read(ku->fd, ku->buf, ku->m_buf)" on Mac.
// On Linux, the man page does not specify whether read() guarantees to read ku->m_buf bytes
// even if ->fd references a normal file with sufficient remaining bytes.
while (ku->l_buf < ku->m_buf) {
int l;
l = read(ku->fd, ku->buf + ku->l_buf, ku->m_buf - ku->l_buf);
if (l == 0) break;
ku->l_buf += l;
}
if (ku->l_buf < ku->m_buf) ku->done_reading = 1;
} else {
int n_running, rc;
fd_set fdr, fdw, fde;
do {
int maxfd = -1;
long curl_to = -1;
struct timeval to;
// the following is adapted from docs/examples/fopen.c
to.tv_sec = 10, to.tv_usec = 0; // 10 seconds
curl_multi_timeout(ku->multi, &curl_to);
if (curl_to >= 0) {
to.tv_sec = curl_to / 1000;
if (to.tv_sec > 1) to.tv_sec = 1;
else to.tv_usec = (curl_to % 1000) * 1000;
}
FD_ZERO(&fdr); FD_ZERO(&fdw); FD_ZERO(&fde);
curl_multi_fdset(ku->multi, &fdr, &fdw, &fde, &maxfd); // FIXME: check return code
if (maxfd >= 0 && (rc = select(maxfd+1, &fdr, &fdw, &fde, &to)) < 0) break;
if (maxfd < 0) { // check curl_multi_fdset.3 about why we wait for 100ms here
struct timespec req, rem;
req.tv_sec = 0; req.tv_nsec = 100000000; // this is 100ms
nanosleep(&req, &rem);
}
curl_easy_pause(ku->curl, CURLPAUSE_CONT);
rc = curl_multi_perform(ku->multi, &n_running); // FIXME: check return code
} while (n_running && ku->l_buf < ku->m_buf - CURL_MAX_WRITE_SIZE);
if (ku->l_buf < ku->m_buf - CURL_MAX_WRITE_SIZE) ku->done_reading = 1;
}
return ku->l_buf;
}
int kurl_close(kurl_t *ku)
{
if (ku == 0) return 0;
if (ku->fd < 0) {
curl_multi_remove_handle(ku->multi, ku->curl);
curl_easy_cleanup(ku->curl);
curl_multi_cleanup(ku->multi);
if (ku->hdr) curl_slist_free_all(ku->hdr);
} else close(ku->fd);
free(ku->buf);
free(ku);
return 0;
}
kurl_t *kurl_open(const char *url, kurl_opt_t *opt)
{
extern s3aux_t s3_parse(const char *url, const char *_id, const char *_secret, const char *fn);
const char *p, *q;
kurl_t *ku;
int fd = -1, is_file = 1, failed = 0;
p = strstr(url, "://");
if (p && *p) {
for (q = url; q != p; ++q)
if (!isalnum(*q)) break;
if (q == p) is_file = 0;
}
if (is_file && (fd = open(url, O_RDONLY)) < 0) return 0;
ku = (kurl_t*)calloc(1, sizeof(kurl_t));
ku->fd = is_file? fd : -1;
if (!kurl_isfile(ku)) {
ku->multi = curl_multi_init();
ku->curl = curl_easy_init();
if (strstr(url, "s3://") == url) {
s3aux_t a;
a = s3_parse(url, (opt? opt->s3keyid : 0), (opt? opt->s3secretkey : 0), (opt? opt->s3key_fn : 0));
if (a.url == 0 || a.date == 0 || a.auth == 0) {
kurl_close(ku);
return 0;
}
ku->hdr = curl_slist_append(ku->hdr, a.date);
ku->hdr = curl_slist_append(ku->hdr, a.auth);
curl_easy_setopt(ku->curl, CURLOPT_URL, a.url);
curl_easy_setopt(ku->curl, CURLOPT_HTTPHEADER, ku->hdr);
free(a.date); free(a.auth); free(a.url);
} else curl_easy_setopt(ku->curl, CURLOPT_URL, url);
curl_easy_setopt(ku->curl, CURLOPT_WRITEDATA, ku);
curl_easy_setopt(ku->curl, CURLOPT_VERBOSE, 0L);
curl_easy_setopt(ku->curl, CURLOPT_NOSIGNAL, 1L);
curl_easy_setopt(ku->curl, CURLOPT_WRITEFUNCTION, write_cb);
curl_easy_setopt(ku->curl, CURLOPT_SSL_VERIFYPEER, 0L);
curl_easy_setopt(ku->curl, CURLOPT_SSL_VERIFYHOST, 0L);
curl_easy_setopt(ku->curl, CURLOPT_FOLLOWLOCATION, 1L);
}
ku->m_buf = KU_DEF_BUFLEN;
if (!kurl_isfile(ku) && ku->m_buf < CURL_MAX_WRITE_SIZE * 2)
ku->m_buf = CURL_MAX_WRITE_SIZE * 2; // for remote files, the buffer is set to at least 2*CURL_MAX_WRITE_SIZE
ku->buf = (uint8_t*)calloc(ku->m_buf, 1);
if (kurl_isfile(ku)) failed = (fill_buffer(ku) <= 0);
else failed = (prepare(ku, 0) < 0 || fill_buffer(ku) <= 0);
if (failed) {
kurl_close(ku);
return 0;
}
return ku;
}
kurl_t *kurl_dopen(int fd)
{
kurl_t *ku;
ku = (kurl_t*)calloc(1, sizeof(kurl_t));
ku->fd = fd;
ku->m_buf = KU_DEF_BUFLEN;
ku->buf = (uint8_t*)calloc(ku->m_buf, 1);
if (prepare(ku, 0) < 0 || fill_buffer(ku) <= 0) {
kurl_close(ku);
return 0;
}
return ku;
}
int kurl_buflen(kurl_t *ku, int len)
{
if (len <= 0 || len < ku->l_buf) return ku->m_buf;
if (!kurl_isfile(ku) && len < CURL_MAX_WRITE_SIZE * 2) return ku->m_buf;
ku->m_buf = len;
kroundup32(ku->m_buf);
ku->buf = (uint8_t*)realloc(ku->buf, ku->m_buf);
return ku->m_buf;
}
ssize_t kurl_read(kurl_t *ku, void *buf, size_t nbytes)
{
ssize_t rest = nbytes;
if (ku->l_buf == 0) return 0; // end-of-file
while (rest) {
if (ku->l_buf - ku->p_buf >= rest) {
if (buf) memcpy((uint8_t*)buf + (nbytes - rest), ku->buf + ku->p_buf, rest);
ku->p_buf += rest;
rest = 0;
} else {
int ret;
if (buf && ku->l_buf > ku->p_buf)
memcpy((uint8_t*)buf + (nbytes - rest), ku->buf + ku->p_buf, ku->l_buf - ku->p_buf);
rest -= ku->l_buf - ku->p_buf;
ku->p_buf = ku->l_buf;
ret = fill_buffer(ku);
if (ret <= 0) break;
}
}
return nbytes - rest;
}
off_t kurl_seek(kurl_t *ku, off_t offset, int whence) // FIXME: sometimes when seek() fails, read() will fail as well.
{
off_t new_off = -1, cur_off;
int failed = 0, seek_end = 0;
if (ku == 0) return -1;
cur_off = ku->off0 + ku->p_buf;
if (whence == SEEK_SET) new_off = offset;
else if (whence == SEEK_CUR) new_off = cur_off + offset;
else if (whence == SEEK_END && kurl_isfile(ku)) new_off = lseek(ku->fd, offset, SEEK_END), seek_end = 1;
else { // not supported whence
ku->err = KURL_INV_WHENCE;
return -1;
}
if (new_off < 0) { // negative absolute offset
ku->err = KURL_SEEK_OUT;
return -1;
}
if (!seek_end && new_off >= cur_off && new_off - cur_off + ku->p_buf < ku->l_buf) {
ku->p_buf += new_off - cur_off;
return ku->off0 + ku->p_buf;
}
if (seek_end || new_off < cur_off || new_off - cur_off > KU_MAX_SKIP) { // if jump is large, do actual seek
ku->off0 = new_off;
ku->done_reading = 0;
if (prepare(ku, 1) < 0 || fill_buffer(ku) <= 0) failed = 1;
} else { // if jump is small, read through
off_t r;
r = kurl_read(ku, 0, new_off - cur_off);
if (r + cur_off != new_off) failed = 1; // out of range
}
if (failed) ku->err = KURL_SEEK_OUT, ku->l_buf = ku->p_buf = 0, new_off = -1;
return new_off;
}
off_t kurl_tell(const kurl_t *ku)
{
if (ku == 0) return -1;
return ku->off0 + ku->p_buf;
}
int kurl_eof(const kurl_t *ku)
{
if (ku == 0) return 1;
return (ku->l_buf == 0); // unless at end-of-file, the buffer should never be empty
}
int kurl_fileno(const kurl_t *ku)
{
if (ku == 0) return -1;
return ku->fd;
}
int kurl_error(const kurl_t *ku)
{
if (ku == 0) return KURL_NULL;
return ku->err;
}
/*****************
*** HMAC-SHA1 ***
*****************/
/* This code is public-domain - it is based on libcrypt placed in the public domain by Wei Dai and other contributors. */
#define HASH_LENGTH 20
#define BLOCK_LENGTH 64
typedef struct sha1nfo {
union { uint8_t b[BLOCK_LENGTH]; uint32_t w[BLOCK_LENGTH/4]; } buf;
uint8_t bufOffset;
union { uint8_t b[HASH_LENGTH]; uint32_t w[HASH_LENGTH/4]; } state;
uint32_t byteCount;
uint8_t keyBuffer[BLOCK_LENGTH];
uint8_t innerHash[HASH_LENGTH];
} sha1nfo;
void sha1_init(sha1nfo *s)
{
const uint8_t table[] = { 0x01,0x23,0x45,0x67, 0x89,0xab,0xcd,0xef, 0xfe,0xdc,0xba,0x98, 0x76,0x54,0x32,0x10, 0xf0,0xe1,0xd2,0xc3 };
memcpy(s->state.b, table, HASH_LENGTH);
s->byteCount = 0;
s->bufOffset = 0;
}
#define rol32(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits))))
static void sha1_hashBlock(sha1nfo *s)
{
uint32_t i, t, a = s->state.w[0], b = s->state.w[1], c = s->state.w[2], d = s->state.w[3], e = s->state.w[4];
for (i = 0; i < 80; i++) {
if (i >= 16) {
t = s->buf.w[(i+13)&15] ^ s->buf.w[(i+8)&15] ^ s->buf.w[(i+2)&15] ^ s->buf.w[i&15];
s->buf.w[i&15] = rol32(t, 1);
}
if (i < 20) t = 0x5a827999 + (d ^ (b & (c ^ d)));
else if (i < 40) t = 0x6ed9eba1 + (b ^ c ^ d);
else if (i < 60) t = 0x8f1bbcdc + ((b & c) | (d & (b | c)));
else t = 0xca62c1d6 + (b ^ c ^ d);
t += rol32(a, 5) + e + s->buf.w[i&15];
e = d; d = c; c = rol32(b, 30); b = a; a = t;
}
s->state.w[0] += a; s->state.w[1] += b; s->state.w[2] += c; s->state.w[3] += d; s->state.w[4] += e;
}
static inline void sha1_add(sha1nfo *s, uint8_t data)
{
s->buf.b[s->bufOffset ^ 3] = data;
if (++s->bufOffset == BLOCK_LENGTH) {
sha1_hashBlock(s);
s->bufOffset = 0;
}
}
void sha1_write1(sha1nfo *s, uint8_t data)
{
++s->byteCount;
sha1_add(s, data);
}
void sha1_write(sha1nfo *s, const char *data, size_t len)
{
while (len--) sha1_write1(s, (uint8_t)*data++);
}
const uint8_t *sha1_final(sha1nfo *s)
{
int i;
sha1_add(s, 0x80);
while (s->bufOffset != 56) sha1_add(s, 0);
sha1_add(s, 0);
sha1_add(s, 0);
sha1_add(s, 0);
sha1_add(s, s->byteCount >> 29);
sha1_add(s, s->byteCount >> 21);
sha1_add(s, s->byteCount >> 13);
sha1_add(s, s->byteCount >> 5);
sha1_add(s, s->byteCount << 3);
for (i = 0; i < 5; ++i) {
uint32_t a = s->state.w[i];
s->state.w[i] = a<<24 | (a<<8&0x00ff0000) | (a>>8&0x0000ff00) | a>>24;
}
return s->state.b;
}
#define HMAC_IPAD 0x36
#define HMAC_OPAD 0x5c
void sha1_init_hmac(sha1nfo *s, const uint8_t* key, int l_key)
{
uint8_t i;
memset(s->keyBuffer, 0, BLOCK_LENGTH);
if (l_key > BLOCK_LENGTH) {
sha1_init(s);
while (l_key--) sha1_write1(s, *key++);
memcpy(s->keyBuffer, sha1_final(s), HASH_LENGTH);
} else memcpy(s->keyBuffer, key, l_key);
sha1_init(s);
for (i = 0; i < BLOCK_LENGTH; ++i)
sha1_write1(s, s->keyBuffer[i] ^ HMAC_IPAD);
}
const uint8_t *sha1_final_hmac(sha1nfo *s)
{
uint8_t i;
memcpy(s->innerHash, sha1_final(s), HASH_LENGTH);
sha1_init(s);
for (i = 0; i < BLOCK_LENGTH; ++i) sha1_write1(s, s->keyBuffer[i] ^ HMAC_OPAD);
for (i = 0; i < HASH_LENGTH; ++i) sha1_write1(s, s->innerHash[i]);
return sha1_final(s);
}
/*******************
*** S3 protocol ***
*******************/
#include <time.h>
#include <ctype.h>
static void s3_sign(const char *key, const char *data, char out[29])
{
const char *b64tab = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
const uint8_t *digest;
int i, j, rest;
sha1nfo s;
sha1_init_hmac(&s, (uint8_t*)key, strlen(key));
sha1_write(&s, data, strlen(data));
digest = sha1_final_hmac(&s);
for (j = i = 0, rest = 8; i < 20; ++j) { // base64 encoding
if (rest <= 6) {
int next = i < 19? digest[i+1] : 0;
out[j] = b64tab[(int)(digest[i] << (6-rest) & 0x3f) | next >> (rest+2)], ++i, rest += 2;
} else out[j] = b64tab[(int)digest[i] >> (rest-6) & 0x3f], rest -= 6;
}
out[j++] = '='; out[j] = 0; // SHA1 digest always has 160 bits, or 20 bytes. We need one '=' at the end.
}
static char *s3_read_awssecret(const char *fn)
{
char *p, *secret, buf[128], *path;
FILE *fp;
int l;
if (fn == 0) {
char *home;
home = getenv("HOME");
if (home == 0) return 0;
l = strlen(home) + 12;
path = (char*)malloc(strlen(home) + 12);
strcat(strcpy(path, home), "/.awssecret");
} else path = (char*)fn;
fp = fopen(path, "r");
if (path != fn) free(path);
if (fp == 0) return 0;
l = fread(buf, 1, 127, fp);
fclose(fp);
buf[l] = 0;
for (p = buf; *p != 0 && *p != '\n'; ++p);
if (*p == 0) return 0;
*p = 0; secret = p + 1;
for (++p; *p != 0 && *p != '\n'; ++p);
*p = 0;
l = p - buf + 1;
p = (char*)malloc(l);
memcpy(p, buf, l);
return p;
}
typedef struct { int l, m; char *s; } kstring_t;
static inline int kputsn(const char *p, int l, kstring_t *s)
{
if (s->l + l + 1 >= s->m) {
s->m = s->l + l + 2;
kroundup32(s->m);
s->s = (char*)realloc(s->s, s->m);
}
memcpy(s->s + s->l, p, l);
s->l += l;
s->s[s->l] = 0;
return l;
}
s3aux_t s3_parse(const char *url, const char *_id, const char *_secret, const char *fn_secret)
{
const char *id, *secret, *bucket, *obj;
char *id_secret = 0, date[64], sig[29];
time_t t;
struct tm tmt;
s3aux_t a = {0,0,0};
kstring_t str = {0,0,0};
// parse URL
if (strstr(url, "s3://") != url) return a;
bucket = url + 5;
for (obj = bucket; *obj && *obj != '/'; ++obj);
if (*obj == 0) return a; // no object
// acquire AWS credential and time
if (_id == 0 || _secret == 0) {
id_secret = s3_read_awssecret(fn_secret);
if (id_secret == 0) return a; // fail to read the AWS credential
id = id_secret;
secret = id_secret + strlen(id) + 1;
} else id = _id, secret = _secret;
// compose URL for curl
kputsn("https://", 8, &str);
kputsn(bucket, obj - bucket, &str);
kputsn(".s3.amazonaws.com", 17, &str);
kputsn(obj, strlen(obj), &str);
a.url = str.s;
// compose the Date line
str.l = str.m = 0; str.s = 0;
t = time(0);
strftime(date, 64, "%a, %d %b %Y %H:%M:%S +0000", gmtime_r(&t, &tmt));
kputsn("Date: ", 6, &str);
kputsn(date, strlen(date), &str);
a.date = str.s;
// compose the string to sign and sign it
str.l = str.m = 0; str.s = 0;
kputsn("GET\n\n\n", 6, &str);
kputsn(date, strlen(date), &str);
kputsn("\n", 1, &str);
kputsn(bucket-1, strlen(bucket-1), &str);
s3_sign(secret, str.s, sig);
// compose the Authorization line
str.l = 0;
kputsn("Authorization: AWS ", 19, &str);
kputsn(id, strlen(id), &str);
kputsn(":", 1, &str);
kputsn(sig, strlen(sig), &str);
a.auth = str.s;
// printf("curl -H '%s' -H '%s' %s\n", a.date, a.auth, a.url);
return a;
}
/*********************
*** Main function ***
*********************/
#ifdef KURL_MAIN
int main(int argc, char *argv[])
{
kurl_t *f;
int c, l, l_buf = 0x10000;
off_t start = 0, rest = -1;
uint8_t *buf;
char *p;
kurl_opt_t opt;
memset(&opt, 0, sizeof(kurl_opt_t));
while ((c = getopt(argc, argv, "c:l:a:")) >= 0) {
if (c == 'c') start = strtol(optarg, &p, 0);
else if (c == 'l') rest = strtol(optarg, &p, 0);
else if (c == 'a') opt.s3key_fn = optarg;
}
if (optind == argc) {
fprintf(stderr, "Usage: kurl [-c start] [-l length] [-a awsSecretFile] <url>\n");
return 1;
}
kurl_init();
f = kurl_open(argv[optind], &opt);
if (f == 0) {
fprintf(stderr, "ERROR: fail to open URL\n");
return 2;
}
if (start > 0) {
if (kurl_seek(f, start, SEEK_SET) < 0) {
kurl_close(f);
fprintf(stderr, "ERROR: fail to seek\n");
return 3;
}
}
buf = (uint8_t*)calloc(l_buf, 1);
while (rest != 0) {
int to_read = rest > 0 && rest < l_buf? rest : l_buf;
l = kurl_read(f, buf, to_read);
if (l == 0) break;
fwrite(buf, 1, l, stdout);
rest -= l;
}
free(buf);
kurl_close(f);
kurl_destroy();
return 0;
}
#endif

57
ext/klib/kurl.h 100644

@ -0,0 +1,57 @@
#ifndef KURL_H
#define KURL_H
#include <sys/types.h>
#define KURL_NULL 1
#define KURL_INV_WHENCE 2
#define KURL_SEEK_OUT 3
#define KURL_NO_AUTH 4
struct kurl_t;
typedef struct kurl_t kurl_t;
typedef struct {
const char *s3keyid;
const char *s3secretkey;
const char *s3key_fn;
} kurl_opt_t;
#ifdef __cplusplus
extern "C" {
#endif
int kurl_init(void);
void kurl_destroy(void);
kurl_t *kurl_open(const char *url, kurl_opt_t *opt);
kurl_t *kurl_dopen(int fd);
int kurl_close(kurl_t *ku);
ssize_t kurl_read(kurl_t *ku, void *buf, size_t nbytes);
off_t kurl_seek(kurl_t *ku, off_t offset, int whence);
int kurl_buflen(kurl_t *ku, int len);
off_t kurl_tell(const kurl_t *ku);
int kurl_eof(const kurl_t *ku);
int kurl_fileno(const kurl_t *ku);
int kurl_error(const kurl_t *ku);
#ifdef __cplusplus
}
#endif
#ifndef KNETFILE_H
#define KNETFILE_H
typedef kurl_t knetFile;
#define knet_open(fn, mode) kurl_open(fn, 0)
#define knet_dopen(fd, mode) kurl_dopen(fd)
#define knet_close(fp) kurl_close(fp)
#define knet_read(fp, buf, len) kurl_read(fp, buf, len)
#define knet_seek(fp, off, whence) kurl_seek(fp, off, whence)
#define knet_tell(fp) kurl_tell(fp)
#define knet_fileno(fp) kurl_fileno(fp)
#define knet_win32_init() kurl_init()
#define knet_win32_destroy() kurl_destroy()
#endif
#endif

90
ext/klib/kvec.h 100644

@ -0,0 +1,90 @@
/* The MIT License
Copyright (c) 2008, by Attractive Chaos <attractor@live.co.uk>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
/*
An example:
#include "kvec.h"
int main() {
kvec_t(int) array;
kv_init(array);
kv_push(int, array, 10); // append
kv_a(int, array, 20) = 5; // dynamic
kv_A(array, 20) = 4; // static
kv_destroy(array);
return 0;
}
*/
/*
2008-09-22 (0.1.0):
* The initial version.
*/
#ifndef AC_KVEC_H
#define AC_KVEC_H
#include <stdlib.h>
#define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#define kvec_t(type) struct { size_t n, m; type *a; }
#define kv_init(v) ((v).n = (v).m = 0, (v).a = 0)
#define kv_destroy(v) free((v).a)
#define kv_A(v, i) ((v).a[(i)])
#define kv_pop(v) ((v).a[--(v).n])
#define kv_size(v) ((v).n)
#define kv_max(v) ((v).m)
#define kv_resize(type, v, s) ((v).m = (s), (v).a = (type*)realloc((v).a, sizeof(type) * (v).m))
#define kv_copy(type, v1, v0) do { \
if ((v1).m < (v0).n) kv_resize(type, v1, (v0).n); \
(v1).n = (v0).n; \
memcpy((v1).a, (v0).a, sizeof(type) * (v0).n); \
} while (0)
#define kv_push(type, v, x) do { \
if ((v).n == (v).m) { \
(v).m = (v).m? (v).m<<1 : 2; \
(v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \
} \
(v).a[(v).n++] = (x); \
} while (0)
#define kv_pushp(type, v) (((v).n == (v).m)? \
((v).m = ((v).m? (v).m<<1 : 2), \
(v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \
: 0), ((v).a + ((v).n++))
#define kv_a(type, v, i) (((v).m <= (size_t)(i)? \
((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \
(v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \
: (v).n <= (size_t)(i)? (v).n = (i) + 1 \
: 0), (v).a[(i)])
#endif


@ -0,0 +1,149 @@
-- bioinformatics routines
-- Description: read a fasta/fastq file
local function readseq(fp)
local finished, last = false, nil;
return function()
local match;
if finished then return nil end
if (last == nil) then -- the first record or a record following a fastq
for l in fp:lines() do
if l:byte(1) == 62 or l:byte(1) == 64 then -- ">" || "@"
last = l;
break;
end
end
if last == nil then
finished = true;
return nil;
end
end
local tmp = last:find("%s");
local name = (tmp and last:sub(2, tmp-1)) or last:sub(2); -- sequence name
local seqs = {};
local c; -- the first character of the last line
last = nil;
for l in fp:lines() do -- read sequence
c = l:byte(1);
if c == 62 or c == 64 or c == 43 then
last = l;
break;
end
table.insert(seqs, l);
end
if last == nil then finished = true end -- end of file
if c ~= 43 then return name, table.concat(seqs) end -- a fasta record
local seq, len = table.concat(seqs), 0; -- prepare to parse quality
seqs = {};
for l in fp:lines() do -- read quality
table.insert(seqs, l);
len = len + #l;
if len >= #seq then
last = nil;
return name, seq, table.concat(seqs);
end
end
finished = true;
return name, seq;
end
end
-- extract a subsequence from a fasta file indexed by samtools faidx
local function faidxsub(fn)
local fpidx = io.open(fn .. ".fai");
if fpidx == nil then
io.stderr:write("[faidxsub] fail to open the FASTA index file.\n");
return nil
end
local idx = {};
for l in fpidx:lines() do
local name, len, offset, line_blen, line_len = l:match("(%S+)%s(%d+)%s(%d+)%s(%d+)%s(%d+)");
if name then
idx[name] = {tonumber(len), offset, line_blen, line_len};
end
end
fpidx:close();
local fp = io.open(fn);
return function(name, beg_, end_) -- 0-based coordinate
if name == nil then fp:close(); return nil; end
if idx[name] then
local a = idx[name];
beg_ = beg_ or 0;
end_ = end_ or a[1];
end_ = (end_ <= a[1] and end_) or a[1];
local fb, fe = math.floor(beg_ / a[3]), math.floor(end_ / a[3]);
local qb, qe = beg_ - fb * a[3], end_ - fe * a[3];
fp:seek("set", a[2] + fb * a[4] + qb);
local s = fp:read((fe - fb) * a[4] + (qe - qb)):gsub("%s", "");
return s;
end
end
end
--Description: Index a list of intervals and test if a given interval overlaps with the list
--Example: lua -lbio -e 'a={{100,201},{200,300},{400,600}};f=bio.intvovlp(a);print(f(600,700))'
--[[
By default, for each tiling window of 8192bp we keep the interval that
overlaps the window and has the smallest start position. This method may not
work well when most intervals are small but a few of them span a long
distance.
]]--
local function intvovlp(intv, bits)
bits = bits or 13 -- the default bin size is 8192 = 1<<13
table.sort(intv, function(a,b) return a[1] < b[1] end) -- sort by the start
-- merge intervals; the step speeds up testing, but can be skipped
local b, e, k = -1, -1, 1
for i = 1, #intv do
if e < intv[i][1] then
if e >= 0 then intv[k], k = {b, e}, k + 1 end
b, e = intv[i][1], intv[i][2]
else e = intv[i][2] end
end
if e >= 0 then intv[k] = {b, e} end
while #a > k do table.remove(a) end -- truncate the interval list
-- build the index for the list of intervals
local idx, size, max = {}, math.pow(2, bits), 0
for i = 1, #intv do
b = math.modf(intv[i][1] / size)
e = math.modf(intv[i][2] / size)
if b == e then idx[b] = idx[b] or i
else for j = b, e do idx[j] = idx[j] or i end end
max = (max > e and max) or e
end
-- return a function (closure)
return function(_beg, _end)
local x = math.modf(_beg / size)
if x > max then return false end
local off = idx[x]; -- the start bin
if off == nil then -- the following is not the best in efficiency
for i = x - 1, 0, -1 do -- find the minimum bin with a value
if idx[i] ~= nil then off = idx[i]; break; end
end
if off == nil then return false end
end
for i = off, #intv do -- start from off and search for overlaps
if intv[i][1] >= _end then return false
elseif intv[i][2] > _beg then return true end
end
return false
end
end
bio = {
readseq = readseq,
faidxsub = faidxsub,
intvovlp = intvovlp
}
bio.nt16 = {
[0]=15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, 15,15, 5, 6, 8,15, 7, 9, 0,10,15,15, 15,15,15,15,
15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, 15,15, 5, 6, 8,15, 7, 9, 0,10,15,15, 15,15,15,15,
15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
}
bio.ntcnt = { [0]=4, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }
bio.ntcomp = { [0]=0, 8, 4, 12, 2, 10, 9, 14, 1, 6, 5, 13, 3, 11, 7, 15 }
bio.ntrev = 'XACMGRSVTWYHKDBN'

View File

@@ -0,0 +1,677 @@
--[[
The MIT License
Copyright (c) 2011, Attractive Chaos <attractor@live.co.uk>
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
]]--
--[[
This is a Lua library, more exactly a collection of Lua snippets, covering
utilities (e.g. getopt), string operations (e.g. split), statistics (e.g.
Fisher's exact test), special functions (e.g. logarithm gamma) and matrix
operations (e.g. Gauss-Jordan elimination). The routines are designed to be
as independent as possible, such that one can copy-paste relevant pieces of
code without worrying about additional library dependencies.
If you use routines from this library, please include the licensing
information above where appropriate.
]]--
--[[
Library functions and dependencies. "a>b" means "a is required by b"; "b<a"
means "b depends on a".
os.getopt()
string:split()
io.xopen()
table.ksmall()
table.shuffle()
math.lgamma() >math.lbinom() >math.igamma()
math.igamma() <math.lgamma() >matrix.chi2()
math.erfc()
math.lbinom() <math.lgamma() >math.fisher_exact()
math.bernstein_poly() <math.lbinom()
math.fisher_exact() <math.lbinom()
math.jackknife()
math.pearson()
math.spearman()
math.fmin()
matrix
matrix.add()
matrix.T() >matrix.mul()
matrix.mul() <matrix.T()
matrix.tostring()
matrix.chi2() <math.igamma()
matrix.solve()
]]--
-- Description: getopt() translated from the BSD getopt(); compatible with the default Unix getopt()
--[[ Example:
for o, a in os.getopt(arg, 'a:b') do
print(o, a)
end
]]--
function os.getopt(args, ostr)
local arg, place = nil, 0;
return function ()
if place == 0 then -- update scanning pointer
place = 1
if #args == 0 or args[1]:sub(1, 1) ~= '-' then place = 0; return nil end
if #args[1] >= 2 then
place = place + 1
if args[1]:sub(2, 2) == '-' then -- found "--"
place = 0
table.remove(args, 1);
return nil;
end
end
end
local optopt = args[1]:sub(place, place);
place = place + 1;
local oli = ostr:find(optopt);
if optopt == ':' or oli == nil then -- unknown option
if optopt == '-' then return nil end
if place > #args[1] then
table.remove(args, 1);
place = 0;
end
return '?';
end
oli = oli + 1;
if ostr:sub(oli, oli) ~= ':' then -- do not need argument
arg = nil;
if place > #args[1] then
table.remove(args, 1);
place = 0;
end
else -- need an argument
if place <= #args[1] then -- no white space
arg = args[1]:sub(place);
else
table.remove(args, 1);
if #args == 0 then -- an option requiring argument is the last one
place = 0;
if ostr:sub(1, 1) == ':' then return ':' end
return '?';
else arg = args[1] end
end
table.remove(args, 1);
place = 0;
end
return optopt, arg;
end
end
-- Description: string split
function string:split(sep, n)
local a, start = {}, 1;
sep = sep or "%s+";
repeat
local b, e = self:find(sep, start);
if b == nil then
table.insert(a, self:sub(start));
break
end
a[#a+1] = self:sub(start, b - 1);
start = e + 1;
if n and #a == n then
table.insert(a, self:sub(start));
break
end
until start > #self;
return a;
end
-- Description: smart file open
function io.xopen(fn, mode)
mode = mode or 'r';
if fn == nil then return io.stdin;
elseif fn == '-' then return (mode == 'r' and io.stdin) or io.stdout;
elseif fn:sub(-3) == '.gz' then return (mode == 'r' and io.popen('gzip -dc ' .. fn, 'r')) or io.popen('gzip > ' .. fn, 'w');
	elseif fn:sub(-4) == '.bz2' then return (mode == 'r' and io.popen('bzip2 -dc ' .. fn, 'r')) or io.popen('bzip2 > ' .. fn, 'w');
else return io.open(fn, mode) end
end
-- Description: find the k-th smallest element in an array (Ref. http://ndevilla.free.fr/median/)
function table.ksmall(arr, k)
local low, high = 1, #arr;
while true do
if high <= low then return arr[k] end
if high == low + 1 then
if arr[high] < arr[low] then arr[high], arr[low] = arr[low], arr[high] end;
return arr[k];
end
local mid = math.floor((high + low) / 2);
if arr[high] < arr[mid] then arr[mid], arr[high] = arr[high], arr[mid] end
if arr[high] < arr[low] then arr[low], arr[high] = arr[high], arr[low] end
if arr[low] < arr[mid] then arr[low], arr[mid] = arr[mid], arr[low] end
arr[mid], arr[low+1] = arr[low+1], arr[mid];
local ll, hh = low + 1, high;
while true do
repeat ll = ll + 1 until arr[ll] >= arr[low]
repeat hh = hh - 1 until arr[low] >= arr[hh]
if hh < ll then break end
arr[ll], arr[hh] = arr[hh], arr[ll];
end
arr[low], arr[hh] = arr[hh], arr[low];
if hh <= k then low = ll end
if hh >= k then high = hh - 1 end
end
end
-- Description: shuffle/permutate an array
function table.shuffle(a)
for i = #a, 1, -1 do
local j = math.random(i)
a[j], a[i] = a[i], a[j]
end
end
--
-- Mathematics
--
-- Description: log gamma function
-- Required by: math.lbinom()
-- Reference: AS245, 2nd algorithm, http://lib.stat.cmu.edu/apstat/245
function math.lgamma(z)
local x;
x = 0.1659470187408462e-06 / (z+7);
x = x + 0.9934937113930748e-05 / (z+6);
x = x - 0.1385710331296526 / (z+5);
x = x + 12.50734324009056 / (z+4);
x = x - 176.6150291498386 / (z+3);
x = x + 771.3234287757674 / (z+2);
x = x - 1259.139216722289 / (z+1);
x = x + 676.5203681218835 / z;
x = x + 0.9999999999995183;
return math.log(x) - 5.58106146679532777 - z + (z-0.5) * math.log(z+6.5);
end
-- Description: regularized incomplete gamma function
-- Dependent on: math.lgamma()
--[[
Formulas are taken from Wiki, with additional input from Numerical
Recipes in C (for modified Lentz's algorithm) and AS245
(http://lib.stat.cmu.edu/apstat/245).
A good online calculator is available at:
http://www.danielsoper.com/statcalc/calc23.aspx
It calculates upper incomplete gamma function, which equals
math.igamma(s,z,true)*math.exp(math.lgamma(s))
]]--
function math.igamma(s, z, complement)
local function _kf_gammap(s, z)
local sum, x = 1, 1;
for k = 1, 100 do
x = x * z / (s + k);
sum = sum + x;
if x / sum < 1e-14 then break end
end
return math.exp(s * math.log(z) - z - math.lgamma(s + 1.) + math.log(sum));
end
local function _kf_gammaq(s, z)
local C, D, f, TINY;
f = 1. + z - s; C = f; D = 0.; TINY = 1e-290;
-- Modified Lentz's algorithm for computing continued fraction. See Numerical Recipes in C, 2nd edition, section 5.2
for j = 1, 100 do
local d;
local a, b = j * (s - j), j*2 + 1 + z - s;
D = b + a * D;
if D < TINY then D = TINY end
C = b + a / C;
if C < TINY then C = TINY end
D = 1. / D;
d = C * D;
f = f * d;
if math.abs(d - 1) < 1e-14 then break end
end
return math.exp(s * math.log(z) - z - math.lgamma(s) - math.log(f));
end
if complement then
return ((z <= 1 or z < s) and 1 - _kf_gammap(s, z)) or _kf_gammaq(s, z);
else
return ((z <= 1 or z < s) and _kf_gammap(s, z)) or (1 - _kf_gammaq(s, z));
end
end
math.M_SQRT2 = 1.41421356237309504880 -- sqrt(2)
math.M_SQRT1_2 = 0.70710678118654752440 -- 1/sqrt(2)
-- Description: complement error function erfc(x): \Phi(x) = 0.5 * erfc(-x/M_SQRT2)
function math.erfc(x)
local z = math.abs(x) * math.M_SQRT2
if z > 37 then return (x > 0 and 0) or 2 end
local expntl = math.exp(-0.5 * z * z)
local p
if z < 10. / math.M_SQRT2 then -- for small z
p = expntl * ((((((.03526249659989109 * z + .7003830644436881) * z + 6.37396220353165) * z + 33.912866078383)
* z + 112.0792914978709) * z + 221.2135961699311) * z + 220.2068679123761)
/ (((((((.08838834764831844 * z + 1.755667163182642) * z + 16.06417757920695) * z + 86.78073220294608)
* z + 296.5642487796737) * z + 637.3336333788311) * z + 793.8265125199484) * z + 440.4137358247522);
else p = expntl / 2.506628274631001 / (z + 1. / (z + 2. / (z + 3. / (z + 4. / (z + .65))))) end
return (x > 0 and 2 * p) or 2 * (1 - p)
end
-- Description: log binomial coefficient
-- Dependent on: math.lgamma()
-- Required by: math.fisher_exact()
function math.lbinom(n, m)
if m == nil then
local a = {};
a[0], a[n] = 0, 0;
local t = math.lgamma(n+1);
for m = 1, n-1 do a[m] = t - math.lgamma(m+1) - math.lgamma(n-m+1) end
return a;
else return math.lgamma(n+1) - math.lgamma(m+1) - math.lgamma(n-m+1) end
end
-- Description: Bernstein polynomials (mainly for Bezier curves)
-- Dependent on: math.lbinom()
-- Note: to compute derivative: let beta_new[i]=beta[i+1]-beta[i]
function math.bernstein_poly(beta)
local n = #beta - 1;
local lbc = math.lbinom(n); -- log binomial coefficients
return function (t)
assert(t >= 0 and t <= 1);
if t == 0 then return beta[1] end
if t == 1 then return beta[n+1] end
local sum, logt, logt1 = 0, math.log(t), math.log(1-t);
for i = 0, n do sum = sum + beta[i+1] * math.exp(lbc[i] + i * logt + (n-i) * logt1) end
return sum;
end
end
-- Description: Fisher's exact test
-- Dependent on: math.lbinom()
-- Return: left-, right- and two-tail P-values
--[[
Fisher's exact test for 2x2 contingency tables:
n11 n12 | n1_
n21 n22 | n2_
-----------+----
n_1 n_2 | n
Reference: http://www.langsrud.com/fisher.htm
]]--
function math.fisher_exact(n11, n12, n21, n22)
local aux; -- keep the states of n* for acceleration
-- Description: hypergeometric function
local function hypergeo(n11, n1_, n_1, n)
return math.exp(math.lbinom(n1_, n11) + math.lbinom(n-n1_, n_1-n11) - math.lbinom(n, n_1));
end
-- Description: incremental hypergeometric function
-- Note: aux = {n11, n1_, n_1, n, p}
local function hypergeo_inc(n11, n1_, n_1, n)
if n1_ ~= 0 or n_1 ~= 0 or n ~= 0 then
aux = {n11, n1_, n_1, n, 1};
else -- then only n11 is changed
local mod;
_, mod = math.modf(n11 / 11);
if mod ~= 0 and n11 + aux[4] - aux[2] - aux[3] ~= 0 then
if n11 == aux[1] + 1 then -- increase by 1
aux[5] = aux[5] * (aux[2] - aux[1]) / n11 * (aux[3] - aux[1]) / (n11 + aux[4] - aux[2] - aux[3]);
aux[1] = n11;
return aux[5];
end
if n11 == aux[1] - 1 then -- decrease by 1
aux[5] = aux[5] * aux[1] / (aux[2] - n11) * (aux[1] + aux[4] - aux[2] - aux[3]) / (aux[3] - n11);
aux[1] = n11;
return aux[5];
end
end
aux[1] = n11;
end
aux[5] = hypergeo(aux[1], aux[2], aux[3], aux[4]);
return aux[5];
end
-- Description: computing the P-value by Fisher's exact test
local max, min, left, right, n1_, n_1, n, two, p, q, i, j;
n1_, n_1, n = n11 + n12, n11 + n21, n11 + n12 + n21 + n22;
max = (n_1 < n1_ and n_1) or n1_; -- max n11, for the right tail
min = n1_ + n_1 - n;
if min < 0 then min = 0 end -- min n11, for the left tail
two, left, right = 1, 1, 1;
if min == max then return 1 end -- no need to do test
q = hypergeo_inc(n11, n1_, n_1, n); -- the probability of the current table
-- left tail
i, left, p = min + 1, 0, hypergeo_inc(min, 0, 0, 0);
while p < 0.99999999 * q do
left, p, i = left + p, hypergeo_inc(i, 0, 0, 0), i + 1;
end
i = i - 1;
if p < 1.00000001 * q then left = left + p;
else i = i - 1 end
-- right tail
j, right, p = max - 1, 0, hypergeo_inc(max, 0, 0, 0);
while p < 0.99999999 * q do
right, p, j = right + p, hypergeo_inc(j, 0, 0, 0), j - 1;
end
j = j + 1;
if p < 1.00000001 * q then right = right + p;
else j = j + 1 end
-- two-tail
two = left + right;
if two > 1 then two = 1 end
-- adjust left and right
if math.abs(i - n11) < math.abs(j - n11) then right = 1 - left + q;
else left = 1 - right + q end
return left, right, two;
end
-- Description: Delete-m Jackknife
--[[
Given g groups of values with a statistic estimated from m[i] samples in
the i-th group being t[i], compute the jackknife mean and variance. t0
below is the estimate from all samples. Reference:
Busing et al. (1999) Delete-m Jackknife for unequal m. Statistics and Computing, 9:3-8.
]]--
function math.jackknife(g, m, t, t0)
local h, n, sum = {}, 0, 0;
for j = 1, g do n = n + m[j] end
if t0 == nil then -- When t0 is absent, estimate it in a naive way
t0 = 0;
for j = 1, g do t0 = t0 + m[j] * t[j] end
t0 = t0 / n;
end
local mean, var = 0, 0;
for j = 1, g do
h[j] = n / m[j];
mean = mean + (1 - m[j] / n) * t[j];
end
mean = g * t0 - mean; -- Eq. (8)
for j = 1, g do
local x = h[j] * t0 - (h[j] - 1) * t[j] - mean;
var = var + 1 / (h[j] - 1) * x * x;
end
var = var / g;
return mean, var;
end
-- Description: Pearson correlation coefficient
-- Input: a is an n*2 table
function math.pearson(a)
-- compute the mean
local x1, y1 = 0, 0
for _, v in pairs(a) do
x1, y1 = x1 + v[1], y1 + v[2]
end
-- compute the coefficient
x1, y1 = x1 / #a, y1 / #a
local x2, y2, xy = 0, 0, 0
for _, v in pairs(a) do
local tx, ty = v[1] - x1, v[2] - y1
xy, x2, y2 = xy + tx * ty, x2 + tx * tx, y2 + ty * ty
end
return xy / math.sqrt(x2) / math.sqrt(y2)
end
-- Description: Spearman correlation coefficient
function math.spearman(a)
local function aux_func(t) -- auxiliary function
return (t == 1 and 0) or (t*t - 1) * t / 12
end
for _, v in pairs(a) do v.r = {} end
local T, S = {}, {}
-- compute the rank
for k = 1, 2 do
table.sort(a, function(u,v) return u[k]<v[k] end)
local same = 1
T[k] = 0
for i = 2, #a + 1 do
if i <= #a and a[i-1][k] == a[i][k] then same = same + 1
else
local rank = (i-1) * 2 - same + 1
for j = i - same, i - 1 do a[j].r[k] = rank end
if same > 1 then T[k], same = T[k] + aux_func(same), 1 end
end
end
S[k] = aux_func(#a) - T[k]
end
-- compute the coefficient
local sum = 0
for _, v in pairs(a) do -- TODO: use nested loops to reduce loss of precision
local t = (v.r[1] - v.r[2]) / 2
sum = sum + t * t
end
return (S[1] + S[2] - sum) / 2 / math.sqrt(S[1] * S[2])
end
-- Description: Hooke-Jeeves derivative-free optimization
function math.fmin(func, x, data, r, eps, max_calls)
local n, n_calls = #x, 0;
r = r or 0.5;
eps = eps or 1e-7;
max_calls = max_calls or 50000
local function fmin_aux(x1, data, fx1, dx) -- auxiliary function
local ftmp;
for k = 1, n do
x1[k] = x1[k] + dx[k];
ftmp = func(x1, data); n_calls = n_calls + 1;
if ftmp < fx1 then fx1 = ftmp;
else -- search the opposite direction
dx[k] = -dx[k];
x1[k] = x1[k] + dx[k] + dx[k];
ftmp = func(x1, data); n_calls = n_calls + 1;
if ftmp < fx1 then fx1 = ftmp
else x1[k] = x1[k] - dx[k] end -- back to the original x[k]
end
end
return fx1; -- here: fx1=f(n,x1)
end
local dx, x1 = {}, {};
for k = 1, n do -- initial directions, based on MGJ
dx[k] = math.abs(x[k]) * r;
if dx[k] == 0 then dx[k] = r end;
end
local radius = r;
local fx1, fx;
fx = func(x, data); fx1 = fx; n_calls = n_calls + 1;
while true do
for i = 1, n do x1[i] = x[i] end; -- x1 = x
fx1 = fmin_aux(x1, data, fx, dx);
while fx1 < fx do
for k = 1, n do
local t = x[k];
dx[k] = (x1[k] > x[k] and math.abs(dx[k])) or -math.abs(dx[k]);
x[k] = x1[k];
x1[k] = x1[k] + x1[k] - t;
end
fx = fx1;
if n_calls >= max_calls then break end
fx1 = func(x1, data); n_calls = n_calls + 1;
fx1 = fmin_aux(x1, data, fx1, dx);
if fx1 >= fx then break end
local kk = n;
for k = 1, n do
if math.abs(x1[k] - x[k]) > .5 * math.abs(dx[k]) then
kk = k;
break;
end
end
if kk == n then break end
end
if radius >= eps then
if n_calls >= max_calls then break end
radius = radius * r;
for k = 1, n do dx[k] = dx[k] * r end
else break end
end
return fx1, n_calls;
end
--
-- Matrix
--
matrix = {}
-- Description: matrix transpose
-- Required by: matrix.mul()
function matrix.T(a)
local m, n, x = #a, #a[1], {};
for i = 1, n do
x[i] = {};
for j = 1, m do x[i][j] = a[j][i] end
end
return x;
end
-- Description: matrix add
function matrix.add(a, b)
assert(#a == #b and #a[1] == #b[1]);
local m, n, x = #a, #a[1], {};
for i = 1, m do
x[i] = {};
local ai, bi, xi = a[i], b[i], x[i];
for j = 1, n do xi[j] = ai[j] + bi[j] end
end
return x;
end
-- Description: matrix mul
-- Dependent on: matrix.T()
-- Note: much slower without transpose
function matrix.mul(a, b)
assert(#a[1] == #b);
local m, n, p, x = #a, #a[1], #b[1], {};
local c = matrix.T(b); -- transpose for efficiency
for i = 1, m do
x[i] = {}
local xi = x[i];
for j = 1, p do
local sum, ai, cj = 0, a[i], c[j];
for k = 1, n do sum = sum + ai[k] * cj[k] end
xi[j] = sum;
end
end
return x;
end
-- Description: matrix print
function matrix.tostring(a)
local z = {};
for i = 1, #a do
z[i] = table.concat(a[i], "\t");
end
return table.concat(z, "\n");
end
-- Description: chi^2 test for contingency tables
-- Dependent on: math.igamma()
function matrix.chi2(a)
if #a == 2 and #a[1] == 2 then -- 2x2 table
local x, z
x = (a[1][1] + a[1][2]) * (a[2][1] + a[2][2]) * (a[1][1] + a[2][1]) * (a[1][2] + a[2][2])
if x == 0 then return 0, 1, false end
z = a[1][1] * a[2][2] - a[1][2] * a[2][1]
z = (a[1][1] + a[1][2] + a[2][1] + a[2][2]) * z * z / x
return z, math.igamma(.5, .5 * z, true), true
else -- generic table
local rs, cs, n, m, N, z = {}, {}, #a, #a[1], 0, 0
for i = 1, n do rs[i] = 0 end
for j = 1, m do cs[j] = 0 end
for i = 1, n do -- compute column sum and row sum
for j = 1, m do cs[j], rs[i] = cs[j] + a[i][j], rs[i] + a[i][j] end
end
for i = 1, n do N = N + rs[i] end
for i = 1, n do -- compute the chi^2 statistics
for j = 1, m do
local E = rs[i] * cs[j] / N;
z = z + (a[i][j] - E) * (a[i][j] - E) / E
end
end
return z, math.igamma(.5 * (n-1) * (m-1), .5 * z, true), true;
end
end
-- Description: Gauss-Jordan elimination (solving equations; computing inverse)
-- Note: on return, a[n][n] is the inverse; b[n][m] is the solution
-- Reference: Section 2.1, Numerical Recipes in C, 2nd edition
function matrix.solve(a, b)
assert(#a == #a[1]);
local n, m = #a, (b and #b[1]) or 0;
local xc, xr, ipiv = {}, {}, {};
local ic, ir;
for j = 1, n do ipiv[j] = 0 end
for i = 1, n do
local big = 0;
for j = 1, n do
local aj = a[j];
if ipiv[j] ~= 1 then
for k = 1, n do
if ipiv[k] == 0 then
if math.abs(aj[k]) >= big then
big = math.abs(aj[k]);
ir, ic = j, k;
end
elseif ipiv[k] > 1 then return -2 end -- singular matrix
end
end
end
ipiv[ic] = ipiv[ic] + 1;
if ir ~= ic then
for l = 1, n do a[ir][l], a[ic][l] = a[ic][l], a[ir][l] end
if b then
for l = 1, m do b[ir][l], b[ic][l] = b[ic][l], b[ir][l] end
end
end
xr[i], xc[i] = ir, ic;
if a[ic][ic] == 0 then return -3 end -- singular matrix
local pivinv = 1 / a[ic][ic];
a[ic][ic] = 1;
for l = 1, n do a[ic][l] = a[ic][l] * pivinv end
if b then
for l = 1, m do b[ic][l] = b[ic][l] * pivinv end
end
for ll = 1, n do
if ll ~= ic then
local tmp = a[ll][ic];
a[ll][ic] = 0;
local all, aic = a[ll], a[ic];
for l = 1, n do all[l] = all[l] - aic[l] * tmp end
if b then
local bll, bic = b[ll], b[ic];
for l = 1, m do bll[l] = bll[l] - bic[l] * tmp end
end
end
end
end
for l = n, 1, -1 do
if xr[l] ~= xc[l] then
for k = 1, n do a[k][xr[l]], a[k][xc[l]] = a[k][xc[l]], a[k][xr[l]] end
end
end
return 0;
end

View File

@@ -0,0 +1,72 @@
CC=gcc
CXX=g++
CFLAGS=-g -Wall -O2 -I..
CXXFLAGS=$(CFLAGS)
PROGS=kbtree_test khash_keith khash_keith2 khash_test klist_test kseq_test kseq_bench \
kseq_bench2 ksort_test ksort_test-stl kvec_test kmin_test kstring_bench kstring_bench2 kstring_test \
kavl_test kavl-lite_test kthread_test2
all:$(PROGS)
clean:
rm -fr $(PROGS) *.dSYM a.out *.o
kavl_test:kavl_test.c ../kavl.h
$(CC) $(CFLAGS) -o $@ kavl_test.c
kavl-lite_test:kavl-lite_test.c ../kavl-lite.h
$(CC) $(CFLAGS) -o $@ kavl-lite_test.c
kbtree_test:kbtree_test.c ../kbtree.h
$(CC) $(CFLAGS) -o $@ kbtree_test.c
khash_keith:khash_keith.c ../khash.h
$(CC) $(CFLAGS) -o $@ khash_keith.c
khash_keith2:khash_keith2.c ../khash.h
$(CC) $(CFLAGS) -o $@ khash_keith2.c
khash_test:khash_test.c ../khash.h
$(CC) $(CFLAGS) -o $@ khash_test.c
klist_test:klist_test.c ../klist.h
$(CC) $(CFLAGS) -o $@ klist_test.c
kseq_test:kseq_test.c ../kseq.h
$(CC) $(CFLAGS) -o $@ kseq_test.c -lz
kseq_bench:kseq_bench.c ../kseq.h
$(CC) $(CFLAGS) -o $@ kseq_bench.c -lz
kseq_bench2:kseq_bench2.c ../kseq.h
$(CC) $(CFLAGS) -o $@ kseq_bench2.c -lz
ksort_test:ksort_test.c ../ksort.h
$(CC) $(CFLAGS) -o $@ ksort_test.c
ksort_test-stl:ksort_test.cc ../ksort.h
$(CXX) $(CXXFLAGS) -o $@ ksort_test.cc
kvec_test:kvec_test.cc ../kvec.h
$(CXX) $(CXXFLAGS) -o $@ kvec_test.cc
kmin_test:kmin_test.c ../kmath.h ../kmath.c
$(CC) $(CFLAGS) -o $@ kmin_test.c ../kmath.c
kstring_bench:kstring_bench.c ../kstring.h ../kstring.c
$(CC) $(CFLAGS) -o $@ kstring_bench.c ../kstring.c
kstring_bench2:kstring_bench2.c ../kstring.h ../kstring.c
$(CC) $(CFLAGS) -o $@ kstring_bench2.c ../kstring.c
kstring_test:kstring_test.c ../kstring.h ../kstring.c
$(CC) $(CFLAGS) -o $@ kstring_test.c ../kstring.c
kthread_test:kthread_test.c ../kthread.c
$(CC) $(CFLAGS) -fopenmp -o $@ kthread_test.c ../kthread.c
kthread_test2:kthread_test2.c ../kthread.c
$(CC) $(CFLAGS) -o $@ kthread_test2.c ../kthread.c
ketopt_test:ketopt_test.c ../ketopt.h
$(CC) $(CFLAGS) -o $@ ketopt_test.c

View File

@@ -0,0 +1,60 @@
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include "kavl-lite.h"
#define CALLOC(type, num) ((type*)calloc(num, sizeof(type)))
struct my_node {
int key;
KAVLL_HEAD(struct my_node) head;
};
#define my_cmp(p, q) (((p)->key > (q)->key) - ((p)->key < (q)->key))
KAVLL_INIT(my, struct my_node, head, my_cmp)
void shuffle(int n, char a[])
{
int i, j;
for (i = n; i > 1; --i) {
char tmp;
j = (int)(drand48() * i);
tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp;
}
}
int main(void)
{
char buf[256];
int i, n;
struct my_node *root = 0;
struct my_node *p, *q, t;
my_itr_t itr;
for (i = 33, n = 0; i <= 126; ++i)
if (i != '(' && i != ')' && i != '.' && i != ';')
buf[n++] = i;
shuffle(n, buf);
for (i = 0; i < n; ++i) {
p = CALLOC(struct my_node, 1);
p->key = buf[i];
q = my_insert(&root, p);
if (p != q) free(p);
}
shuffle(n, buf);
for (i = 0; i < n/2; ++i) {
t.key = buf[i];
q = my_erase(&root, &t);
if (q) free(q);
}
my_itr_first(root, &itr);
do {
const struct my_node *r = kavll_at(&itr);
putchar(r->key);
free((void*)r);
} while (my_itr_next(&itr));
putchar('\n');
return 0;
}

View File

@@ -0,0 +1,104 @@
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include "kavl.h"
#define CALLOC(type, num) ((type*)calloc(num, sizeof(type)))
struct my_node {
int key;
KAVL_HEAD(struct my_node) head;
};
#define my_cmp(p, q) (((p)->key > (q)->key) - ((p)->key < (q)->key))
KAVL_INIT(my, struct my_node, head, my_cmp)
int check(struct my_node *p, int *hh)
{
int c = 1, h[2] = {0, 0};
*hh = 0;
if (p) {
if (p->head.p[0]) c += check(p->head.p[0], &h[0]);
if (p->head.p[1]) c += check(p->head.p[1], &h[1]);
*hh = (h[0] > h[1]? h[0] : h[1]) + 1;
if (h[1] - h[0] != (int)p->head.balance)
fprintf(stderr, "%d - %d != %d at %c\n", h[1], h[0], p->head.balance, p->key);
if (c != (int)p->head.size)
fprintf(stderr, "%d != %d at %c\n", p->head.size, c, p->key);
return c;
} else return 0;
}
/*
int print_tree(const struct my_node *p)
{
int c = 1;
if (p == 0) return 0;
if (p->head.p[0] || p->head.p[1]) {
putchar('(');
if (p->head.p[0]) c += print_tree(p->head.p[0]);
else putchar('.');
putchar(',');
if (p->head.p[1]) c += print_tree(p->head.p[1]);
else putchar('.');
putchar(')');
}
putchar(p->key);
return c;
}
void check_and_print(struct my_node *root)
{
int h;
check(root, &h);
print_tree(root);
putchar('\n');
}
*/
void shuffle(int n, char a[])
{
int i, j;
for (i = n; i > 1; --i) {
char tmp;
j = (int)(drand48() * i);
tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp;
}
}
int main(void)
{
char buf[256];
int i, n, h;
struct my_node *root = 0;
struct my_node *p, *q, t;
kavl_itr_t(my) itr;
unsigned cnt;
for (i = 33, n = 0; i <= 126; ++i)
if (i != '(' && i != ')' && i != '.' && i != ';')
buf[n++] = i;
shuffle(n, buf);
for (i = 0; i < n; ++i) {
p = CALLOC(struct my_node, 1);
p->key = buf[i];
q = kavl_insert(my, &root, p, &cnt);
if (p != q) free(p);
check(root, &h);
}
shuffle(n, buf);
for (i = 0; i < n/2; ++i) {
t.key = buf[i];
q = kavl_erase(my, &root, &t, 0);
if (q) free(q);
check(root, &h);
}
kavl_itr_first(my, root, &itr);
do {
const struct my_node *r = kavl_at(&itr);
putchar(r->key);
free((void*)r);
} while (kavl_itr_next(my, &itr));
putchar('\n');
return 0;
}

View File

@@ -0,0 +1,137 @@
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <emmintrin.h>
#include "kbit.h"
// from bowtie-0.9.8.1
inline static int bt1_pop64(uint64_t x) // the kbi_popcount64() equivalent; similar to popcount_2() in wiki
{
x -= ((x >> 1) & 0x5555555555555555llu);
x = (x & 0x3333333333333333llu) + ((x >> 2) & 0x3333333333333333llu);
x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0Fllu;
x = x + (x >> 8);
x = x + (x >> 16);
x = x + (x >> 32);
return x & 0x3F;
}
inline static int bt1_countInU64(uint64_t dw, int c) // the kbi_DNAcount64() equivalent
{
uint64_t dwA = dw & 0xAAAAAAAAAAAAAAAAllu;
uint64_t dwNA = dw & ~0xAAAAAAAAAAAAAAAAllu;
uint64_t tmp;
switch (c) {
case 0: tmp = (dwA >> 1) | dwNA; break;
case 1: tmp = ~(dwA >> 1) & dwNA; break;
case 2: tmp = (dwA >> 1) & ~dwNA; break;
default: tmp = (dwA >> 1) & dwNA;
}
tmp = bt1_pop64(tmp);
if (c == 0) tmp = 32 - tmp;
return (int)tmp;
}
// from bigmagic
static uint32_t sse2_bit_count32(const __m128i* block, const __m128i* block_end)
{
const unsigned mu1 = 0x55555555;
const unsigned mu2 = 0x33333333;
const unsigned mu3 = 0x0F0F0F0F;
const unsigned mu4 = 0x0000003F;
uint32_t tcnt[4];
// Loading masks
__m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
__m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
__m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
__m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
__m128i mcnt;
mcnt = _mm_xor_si128(m1, m1); // cnt = 0
__m128i tmp1, tmp2;
do
{
__m128i b = _mm_load_si128(block);
++block;
// b = (b & 0x55555555) + (b >> 1 & 0x55555555);
tmp1 = _mm_srli_epi32(b, 1); // tmp1 = (b >> 1 & 0x55555555)
tmp1 = _mm_and_si128(tmp1, m1);
tmp2 = _mm_and_si128(b, m1); // tmp2 = (b & 0x55555555)
b = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2
// b = (b & 0x33333333) + (b >> 2 & 0x33333333);
tmp1 = _mm_srli_epi32(b, 2); // (b >> 2 & 0x33333333)
tmp1 = _mm_and_si128(tmp1, m2);
tmp2 = _mm_and_si128(b, m2); // (b & 0x33333333)
b = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2
// b = (b + (b >> 4)) & 0x0F0F0F0F;
tmp1 = _mm_srli_epi32(b, 4); // tmp1 = b >> 4
b = _mm_add_epi32(b, tmp1); // b = b + (b >> 4)
b = _mm_and_si128(b, m3); // & 0x0F0F0F0F
// b = b + (b >> 8);
tmp1 = _mm_srli_epi32 (b, 8); // tmp1 = b >> 8
b = _mm_add_epi32(b, tmp1); // b = b + (b >> 8)
// b = (b + (b >> 16)) & 0x0000003F;
tmp1 = _mm_srli_epi32 (b, 16); // b >> 16
b = _mm_add_epi32(b, tmp1); // b + (b >> 16)
b = _mm_and_si128(b, m4); // (b >> 16) & 0x0000003F;
mcnt = _mm_add_epi32(mcnt, b); // mcnt += b
} while (block < block_end);
_mm_store_si128((__m128i*)tcnt, mcnt);
return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
}
int main(void)
{
int i, N = 100000000;
uint64_t *x, cnt;
clock_t t;
int c = 1;
x = (uint64_t*)calloc(N, 8);
srand48(11);
for (i = 0; i < N; ++i)
x[i] = (uint64_t)lrand48() << 32 ^ lrand48();
fprintf(stderr, "\n===> Calculate # of 1 in an integer (popcount) <===\n");
t = clock(); cnt = 0;
for (i = 0; i < N; ++i) cnt += kbi_popcount64(x[i]);
fprintf(stderr, "%20s\t%20ld\t%10.6f\n", "kbit", (long)cnt, (double)(clock() - t) / CLOCKS_PER_SEC);
t = clock(); cnt = 0;
for (i = 0; i < N; ++i) cnt += bt1_pop64(x[i]);
fprintf(stderr, "%20s\t%20ld\t%10.6f\n", "wiki-popcount_2", (long)cnt, (double)(clock() - t) / CLOCKS_PER_SEC);
t = clock(); cnt = 0;
for (i = 0; i < N; ++i) cnt += __builtin_popcountl(x[i]);
fprintf(stderr, "%20s\t%20ld\t%10.6f\n", "__builtin_popcountl", (long)cnt, (double)(clock() - t) / CLOCKS_PER_SEC);
t = clock(); cnt = 0;
cnt += sse2_bit_count32((__m128i*)x, (__m128i*)(x+N));
fprintf(stderr, "%20s\t%20ld\t%10.6f\n", "SSE2-32bit", (long)cnt, (double)(clock() - t) / CLOCKS_PER_SEC);
fprintf(stderr, "\n===> Count '%c' in 2-bit encoded integers <===\n", "ACGT"[c]);
t = clock(); cnt = 0;
for (i = 0; i < N; ++i) cnt += kbi_DNAcount64(x[i], c);
fprintf(stderr, "%20s\t%20ld\t%10.6f\n", "kbit", (long)cnt, (double)(clock() - t) / CLOCKS_PER_SEC);
t = clock(); cnt = 0;
for (i = 0; i < N; ++i) cnt += bt1_countInU64(x[i], c);
fprintf(stderr, "%20s\t%20ld\t%10.6f\n", "bowtie1", (long)cnt, (double)(clock() - t) / CLOCKS_PER_SEC);
fprintf(stderr, "\n");
free(x);
return 0;
}

View File

@@ -0,0 +1,94 @@
#include <stdio.h>
#include <assert.h>
#include <time.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
typedef const char *str_t;
#include "kbtree.h"
KBTREE_INIT(int, uint32_t, kb_generic_cmp)
KBTREE_INIT(str, str_t, kb_str_cmp)
static int data_size = 5000000;
static unsigned *int_data;
static char **str_data;
void ht_init_data()
{
int i;
char buf[256];
printf("--- generating data... ");
srand48(11);
int_data = (unsigned*)calloc(data_size, sizeof(unsigned));
str_data = (char**)calloc(data_size, sizeof(char*));
for (i = 0; i < data_size; ++i) {
int_data[i] = (unsigned)(data_size * drand48() / 4) * 271828183u;
sprintf(buf, "%x", int_data[i]);
str_data[i] = strdup(buf);
}
printf("done!\n");
}
void ht_destroy_data()
{
int i;
for (i = 0; i < data_size; ++i) free(str_data[i]);
free(str_data); free(int_data);
}
void ht_khash_int()
{
int i;
unsigned *data = int_data;
uint32_t *l, *u;
kbtree_t(int) *h;
h = kb_init(int, KB_DEFAULT_SIZE);
for (i = 0; i < data_size; ++i) {
if (kb_get(int, h, data[i]) == 0) kb_put(int, h, data[i]);
else kb_del(int, h, data[i]);
}
printf("[ht_khash_int] size: %d\n", kb_size(h));
if (1) {
int cnt = 0;
uint32_t x, y;
kb_interval(int, h, 2174625464u, &l, &u);
printf("interval for 2174625464: (%u, %u)\n", l? *l : 0, u? *u : 0);
#define traverse_f(p) { if (cnt == 0) y = *p; ++cnt; }
__kb_traverse(uint32_t, h, traverse_f);
__kb_get_first(uint32_t, h, x);
printf("# of elements from traversal: %d\n", cnt);
printf("first element: %d == %d\n", x, y);
}
__kb_destroy(h);
}
void ht_khash_str()
{
int i;
char **data = str_data;
kbtree_t(str) *h;
h = kb_init(str, KB_DEFAULT_SIZE);
for (i = 0; i < data_size; ++i) {
if (kb_get(str, h, data[i]) == 0) kb_put(str, h, data[i]);
else kb_del(str, h, data[i]);
}
printf("[ht_khash_str] size: %d\n", kb_size(h));
__kb_destroy(h);
}
void ht_timing(void (*f)(void))
{
clock_t t = clock();
(*f)();
printf("[ht_timing] %.3lf sec\n", (double)(clock() - t) / CLOCKS_PER_SEC);
}
int main(int argc, char *argv[])
{
if (argc > 1) data_size = atoi(argv[1]);
ht_init_data();
ht_timing(ht_khash_int);
ht_timing(ht_khash_str);
ht_destroy_data();
return 0;
}


@ -0,0 +1,89 @@
#include <stdio.h>
#include <stdlib.h>
#include <getopt.h>
#include "ketopt.h"
static void test_opt(int c, int opt, const char *arg)
{
if (c == 'x') fprintf(stderr, "-x\n");
else if (c == 'y') fprintf(stderr, "-y %s\n", arg);
else if (c == 301) fprintf(stderr, "--foo\n");
else if (c == 302) fprintf(stderr, "--bar %s\n", arg? arg : "(null)");
else if (c == 303) fprintf(stderr, "--opt %s\n", arg? arg : "(null)");
else if (c == '?') fprintf(stderr, "unknown option -%c\n", opt? opt : ':');
else if (c == ':') fprintf(stderr, "missing option argument: -%c\n", opt? opt : ':');
}
static void print_cmd(int argc, char *argv[], int ind)
{
int i;
fprintf(stderr, "CMD: %s", argv[0]);
if (ind > 1) {
fputs(" [", stderr);
for (i = 1; i < ind; ++i) {
if (i != 1) fputc(' ', stderr);
fputs(argv[i], stderr);
}
fputc(']', stderr);
}
for (i = ind; i < argc; ++i)
fprintf(stderr, " %s", argv[i]);
fputc('\n', stderr);
}
static void test_ketopt(int argc, char *argv[])
{
static ko_longopt_t longopts[] = {
{ "foo", ko_no_argument, 301 },
{ "bar", ko_required_argument, 302 },
{ "opt", ko_optional_argument, 303 },
{ NULL, 0, 0 }
};
ketopt_t opt = KETOPT_INIT;
int c;
fprintf(stderr, "===> ketopt() <===\n");
while ((c = ketopt(&opt, argc, argv, 1, "xy:", longopts)) >= 0)
test_opt(c, opt.opt, opt.arg);
print_cmd(argc, argv, opt.ind);
}
static void test_getopt(int argc, char *argv[])
{
static struct option long_options[] = {
{ "foo", no_argument, 0, 301 },
{ "bar", required_argument, 0, 302 },
{ "opt", optional_argument, 0, 303 },
{0, 0, 0, 0}
};
int c, option_index;
fprintf(stderr, "===> getopt() <===\n");
while ((c = getopt_long(argc, argv, ":xy:", long_options, &option_index)) >= 0)
test_opt(c, optopt, optarg);
print_cmd(argc, argv, optind);
}
int main(int argc, char *argv[])
{
int i;
char **argv2;
if (argc == 1) {
fprintf(stderr, "Usage: ketopt_test [options] <argument> [...]\n");
fprintf(stderr, "Options:\n");
fprintf(stderr, " -x no argument\n");
fprintf(stderr, " -y STR required argument\n");
fprintf(stderr, " --foo no argument\n");
fprintf(stderr, " --bar=STR required argument\n");
fprintf(stderr, " --opt[=STR] optional argument\n");
fprintf(stderr, "\nExamples:\n");
fprintf(stderr, " ketopt_test -xy1 -x arg1 -y -x -- arg2 -x\n");
fprintf(stderr, " ketopt_test --foo --bar=1 --bar 2 --opt arg1 --opt=3\n");
fprintf(stderr, " ketopt_test arg1 -y\n");
return 1;
}
argv2 = (char**)malloc(sizeof(char*) * argc);
for (i = 0; i < argc; ++i) argv2[i] = argv[i];
test_ketopt(argc, argv);
test_getopt(argc, argv2);
free(argv2);
return 0;
}


@ -0,0 +1,26 @@
#include <stdio.h>
#include "kgraph.h"
KHASH_INIT2(e32, extern, uint32_t, int, 1, kh_int_hash_func, kh_int_hash_equal)
typedef struct {
int i;
khash_t(e32) *_arc;
} vertex_t;
KGRAPH_INIT(g, extern, vertex_t, int, e32)
KGRAPH_PRINT(g, extern)
int main()
{
int *pb, *pe;
kgraph_t(g) *g;
g = kg_init_g();
kg_put_a_g(g, 10, 20, 0, &pb, &pe);
kg_put_a_g(g, 20, 30, 0, &pb, &pe);
kg_put_a_g(g, 30, 10, 1, &pb, &pe);
kg_del_v_g(g, 20);
kg_print_g(g);
kg_destroy_g(g);
return 0;
}


@ -0,0 +1,95 @@
/*
* This is an optimized version of the following C++ program:
*
* http://keithlea.com/javabench/src/cpp/hash.cpp
*
* Keith in his benchmark (http://keithlea.com/javabench/data) showed that the
* Java implementation is twice as fast as the C++ version. In fact, this is
* only because the C++ implementation is substandard. Most importantly, Keith
* is using "sprintf()" to convert an integer to a string, which is known to be
* extremely inefficient.
*/
#include <stdio.h>
#include "khash.h"
KHASH_MAP_INIT_STR(str, int)
inline void int2str(int c, int base, char *ret)
{
const char *tab = "0123456789abcdef";
if (c == 0) ret[0] = '0', ret[1] = 0;
else {
int l, x, y;
char buf[16];
for (l = 0, x = c < 0? -c : c; x > 0; x /= base) buf[l++] = tab[x%base];
if (c < 0) buf[l++] = '-';
for (x = l - 1, y = 0; x >= 0; --x) ret[y++] = buf[x];
ret[y] = 0;
}
}
#ifndef _USE_STRDUP
#define BLOCK_SIZE 0x100000
int main(int argc, char *argv[])
{
char **mem = 0;
int i, l, n = 1000000, ret, block_end = 0, curr = 0, c = 0;
khash_t(str) *h;
h = kh_init(str);
if (argc > 1) n = atoi(argv[1]);
mem = malloc(sizeof(void*));
mem[0] = malloc(BLOCK_SIZE); // memory buffer to avoid memory fragmentation
curr = block_end = 0;
for (i = 1; i <= n; ++i) {
char buf[16];
int2str(i, 16, buf);
khint_t k = kh_put(str, h, buf, &ret);
l = strlen(buf) + 1;
if (block_end + l > BLOCK_SIZE) {
++curr; block_end = 0;
mem = realloc(mem, (curr + 1) * sizeof(void*));
mem[curr] = malloc(BLOCK_SIZE);
}
memcpy(mem[curr] + block_end, buf, l);
kh_key(h, k) = mem[curr] + block_end;
block_end += l;
kh_val(h, k) = i;
}
for (i = 1; i <= n; ++i) {
char buf[16];
int2str(i, 10, buf);
khint_t k = kh_get(str, h, buf);
if (k != kh_end(h)) ++c;
}
printf("%d\n", c);
for (ret = 0; ret <= curr; ++ret) free(mem[ret]);
free(mem);
kh_destroy(str, h);
return 0;
}
#else // _USE_STRDUP
int main(int argc, char *argv[])
{
int i, l, n = 1000000, ret, c = 0;
khash_t(str) *h;
khint_t k;
h = kh_init(str);
if (argc > 1) n = atoi(argv[1]);
for (i = 1; i <= n; ++i) {
char buf[16];
int2str(i, 16, buf);
k = kh_put(str, h, strdup(buf), &ret);
kh_val(h, k) = i;
}
for (i = 1; i <= n; ++i) {
char buf[16];
int2str(i, 10, buf);
k = kh_get(str, h, buf);
if (k != kh_end(h)) ++c;
}
for (k = kh_begin(h); k != kh_end(h); ++k) // explicitly freeing memory takes 10-20% CPU time.
if (kh_exist(h, k)) free((char*)kh_key(h, k));
printf("%d\n", c);
kh_destroy(str, h);
return 0;
}
#endif


@ -0,0 +1,67 @@
/*
* This is an optimized version of the following C++ program:
*
* http://keithlea.com/javabench/src/cpp/hash.cpp
*
* Keith in his benchmark (http://keithlea.com/javabench/data) showed that the
* Java implementation is twice as fast as the C++ version. In fact, this is
* only because the C++ implementation is substandard. Most importantly, Keith
* is using "sprintf()" to convert an integer to a string, which is known to be
* extremely inefficient.
*/
#include <stdio.h>
#include "khash.h"
KHASH_MAP_INIT_STR(str, int)
inline void int2str(int c, int base, char *ret)
{
const char *tab = "0123456789abcdef";
if (c == 0) ret[0] = '0', ret[1] = 0;
else {
int l, x, y;
char buf[16];
for (l = 0, x = c < 0? -c : c; x > 0; x /= base) buf[l++] = tab[x%base];
if (c < 0) buf[l++] = '-';
for (x = l - 1, y = 0; x >= 0; --x) ret[y++] = buf[x];
ret[y] = 0;
}
}
int main(int argc, char *argv[])
{
int i, l, n = 1000, ret;
khash_t(str) *h, *h2;
khint_t k;
h = kh_init(str);
h2 = kh_init(str);
if (argc > 1) n = atoi(argv[1]);
for (i = 0; i < 10000; ++i) {
char buf[32];
strcpy(buf, "foo_");
int2str(i, 10, buf+4);
k = kh_put(str, h, strdup(buf), &ret);
kh_val(h, k) = i;
}
for (i = 0; i < n; ++i) {
for (k = kh_begin(h); k != kh_end(h); ++k) {
if (kh_exist(h, k)) {
khint_t k2 = kh_put(str, h2, kh_key(h, k), &ret);
if (ret) { // absent
kh_key(h2, k2) = strdup(kh_key(h, k));
kh_val(h2, k2) = kh_val(h, k);
} else kh_val(h2, k2) += kh_val(h, k);
}
}
}
k = kh_get(str, h, "foo_1"); printf("%d", kh_val(h, k));
k = kh_get(str, h, "foo_9999"); printf(" %d", kh_val(h, k));
k = kh_get(str, h2, "foo_1"); printf(" %d", kh_val(h2, k));
k = kh_get(str, h2, "foo_9999"); printf(" %d\n", kh_val(h2, k));
for (k = kh_begin(h); k != kh_end(h); ++k)
if (kh_exist(h, k)) free((char*)kh_key(h, k));
for (k = kh_begin(h2); k != kh_end(h2); ++k)
if (kh_exist(h2, k)) free((char*)kh_key(h2, k));
kh_destroy(str, h);
kh_destroy(str, h2);
return 0;
}


@ -0,0 +1,141 @@
#include <stdio.h>
#include <assert.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>
#include "khash.h"
KHASH_SET_INIT_STR(str)
KHASH_MAP_INIT_INT(int, unsigned char)
typedef struct {
unsigned key;
unsigned char val;
} int_unpack_t;
typedef struct {
unsigned key;
unsigned char val;
} __attribute__ ((__packed__)) int_packed_t;
#define hash_eq(a, b) ((a).key == (b).key)
#define hash_func(a) ((a).key)
KHASH_INIT(iun, int_unpack_t, char, 0, hash_func, hash_eq)
KHASH_INIT(ipk, int_packed_t, char, 0, hash_func, hash_eq)
static int data_size = 5000000;
static unsigned *int_data;
static char **str_data;
void ht_init_data()
{
int i;
char buf[256];
khint32_t x = 11;
printf("--- generating data... ");
int_data = (unsigned*)calloc(data_size, sizeof(unsigned));
str_data = (char**)calloc(data_size, sizeof(char*));
for (i = 0; i < data_size; ++i) {
int_data[i] = (unsigned)(data_size * ((double)x / UINT_MAX) / 4) * 271828183u;
sprintf(buf, "%x", int_data[i]);
str_data[i] = strdup(buf);
x = 1664525L * x + 1013904223L;
}
printf("done!\n");
}
void ht_destroy_data()
{
int i;
for (i = 0; i < data_size; ++i) free(str_data[i]);
free(str_data); free(int_data);
}
void ht_khash_int()
{
int i, ret;
unsigned *data = int_data;
khash_t(int) *h;
unsigned k;
h = kh_init(int);
for (i = 0; i < data_size; ++i) {
k = kh_put(int, h, data[i], &ret);
kh_val(h, k) = i&0xff;
if (!ret) kh_del(int, h, k);
}
printf("[ht_khash_int] size: %u\n", kh_size(h));
kh_destroy(int, h);
}
void ht_khash_str()
{
int i, ret;
char **data = str_data;
khash_t(str) *h;
unsigned k;
h = kh_init(str);
for (i = 0; i < data_size; ++i) {
k = kh_put(str, h, data[i], &ret);
if (!ret) kh_del(str, h, k);
}
printf("[ht_khash_str] size: %u\n", kh_size(h));
kh_destroy(str, h);
}
void ht_khash_unpack()
{
int i, ret;
unsigned *data = int_data;
khash_t(iun) *h;
unsigned k;
h = kh_init(iun);
for (i = 0; i < data_size; ++i) {
int_unpack_t x;
x.key = data[i]; x.val = i&0xff;
k = kh_put(iun, h, x, &ret);
if (!ret) kh_del(iun, h, k);
}
printf("[ht_khash_unpack] size: %u (sizeof=%ld)\n", kh_size(h), (long)sizeof(int_unpack_t));
kh_destroy(iun, h);
}
void ht_khash_packed()
{
int i, ret;
unsigned *data = int_data;
khash_t(ipk) *h;
unsigned k;
h = kh_init(ipk);
for (i = 0; i < data_size; ++i) {
int_packed_t x;
x.key = data[i]; x.val = i&0xff;
k = kh_put(ipk, h, x, &ret);
if (!ret) kh_del(ipk, h, k);
}
printf("[ht_khash_packed] size: %u (sizeof=%ld)\n", kh_size(h), (long)sizeof(int_packed_t));
kh_destroy(ipk, h);
}
void ht_timing(void (*f)(void))
{
clock_t t = clock();
(*f)();
printf("[ht_timing] %.3lf sec\n", (double)(clock() - t) / CLOCKS_PER_SEC);
}
int main(int argc, char *argv[])
{
if (argc > 1) data_size = atoi(argv[1]);
ht_init_data();
ht_timing(ht_khash_int);
ht_timing(ht_khash_str);
ht_timing(ht_khash_unpack);
ht_timing(ht_khash_packed);
ht_destroy_data();
return 0;
}


@ -0,0 +1,19 @@
#include <stdio.h>
#include "klist.h"
#define __int_free(x)
KLIST_INIT(32, int, __int_free)
int main()
{
klist_t(32) *kl;
kliter_t(32) *p;
kl = kl_init(32);
*kl_pushp(32, kl) = 1;
*kl_pushp(32, kl) = 10;
kl_shift(32, kl, 0);
for (p = kl_begin(kl); p != kl_end(kl); p = kl_next(p))
printf("%d\n", kl_val(p));
kl_destroy(32, kl);
return 0;
}


@ -0,0 +1,48 @@
#include <stdio.h>
#include <math.h>
#include "kmath.h"
static int n_evals;
double f_Chebyquad(int n, double *x, void *data)
{
int i, j;
double y[20][20], f;
int np, iw;
double sum;
for (j = 0; j != n; ++j) {
y[0][j] = 1.;
y[1][j] = 2. * x[j] - 1.;
}
for (i = 1; i != n; ++i)
for (j = 0; j != n; ++j)
y[i+1][j] = 2. * y[1][j] * y[i][j] - y[i-1][j];
f = 0.;
np = n + 1;
iw = 1;
for (i = 0; i != np; ++i) {
sum = 0.;
for (j = 0; j != n; ++j) sum += y[i][j];
sum /= n;
if (iw > 0) sum += 1. / ((i - 1) * (i + 1));
iw = -iw;
f += sum * sum;
}
++n_evals;
return f;
}
int main()
{
double x[20], y;
int n, i;
printf("\nMinimizer: Hooke-Jeeves\n");
for (n = 2; n <= 8; n += 2) {
for (i = 0; i != n; ++i) x[i] = (double)(i + 1) / n;
n_evals = 0;
y = kmin_hj(f_Chebyquad, n, x, 0, KMIN_RADIUS, KMIN_EPS, KMIN_MAXCALL);
printf("n=%d,min=%.8lg,n_evals=%d\n", n, y, n_evals);
}
printf("\n");
return 0;
}


@ -0,0 +1,151 @@
#include <stdio.h>
#include <assert.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <limits.h>
#include "krmq.h"
#define CALLOC(type, num) ((type*)calloc(num, sizeof(type)))
struct my_node {
int key;
int val;
KRMQ_HEAD(struct my_node) head;
};
#define my_cmp(p, q) (((p)->key > (q)->key) - ((p)->key < (q)->key))
#define my_lt2(p, q) ((p)->val < (q)->val)
KRMQ_INIT(my, struct my_node, head, my_cmp, my_lt2)
int check(struct my_node *p, int *hh, int *min)
{
*hh = 0, *min = INT_MAX;
if (p) {
int c = 1, h[2] = {0, 0}, m[2] = {INT_MAX, INT_MAX};
*min = p->val;
if (p->head.p[0]) c += check(p->head.p[0], &h[0], &m[0]);
if (p->head.p[1]) c += check(p->head.p[1], &h[1], &m[1]);
*min = *min < m[0]? *min : m[0];
*min = *min < m[1]? *min : m[1];
*hh = (h[0] > h[1]? h[0] : h[1]) + 1;
if (*min != p->head.s->val)
fprintf(stderr, "min %d != %d at %c\n", *min, p->head.s->val, p->key);
if (h[1] - h[0] != (int)p->head.balance)
fprintf(stderr, "%d - %d != %d at %c\n", h[1], h[0], p->head.balance, p->key);
if (c != (int)p->head.size)
fprintf(stderr, "%d != %d at %c\n", p->head.size, c, p->key);
return c;
} else return 0;
}
int check_rmq(const struct my_node *root, int lo, int hi)
{
struct my_node s, t, *p, *q;
krmq_itr_t(my) itr;
int min = INT_MAX;
s.key = lo, t.key = hi;
p = krmq_rmq(my, root, &s, &t);
krmq_interval(my, root, &s, 0, &q);
if (p == 0) return -1;
krmq_itr_find(my, root, q, &itr);
do {
const struct my_node *r = krmq_at(&itr);
if (r->key > hi) break;
//fprintf(stderr, "%c\t%d\n", r->key, r->val);
if (r->val < min) min = r->val;
} while (krmq_itr_next(my, &itr));
assert((min == INT_MAX && p == 0) || (min < INT_MAX && p));
if (min != p->val) fprintf(stderr, "rmq_min %d != %d\n", p->val, min);
return min;
}
int print_tree(const struct my_node *p)
{
int c = 1;
if (p == 0) return 0;
if (p->head.p[0] || p->head.p[1]) {
putchar('(');
if (p->head.p[0]) c += print_tree(p->head.p[0]);
else putchar('.');
putchar(',');
if (p->head.p[1]) c += print_tree(p->head.p[1]);
else putchar('.');
putchar(')');
}
printf("%c:%d/%d", p->key, p->val, p->head.s->val);
return c;
}
void check_and_print(struct my_node *root)
{
int h, min;
check(root, &h, &min);
print_tree(root);
putchar('\n');
}
void shuffle(int n, char a[])
{
int i, j;
for (i = n; i > 1; --i) {
char tmp;
j = (int)(drand48() * i);
tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp;
}
}
int main(void)
{
char buf[256];
int i, n, h, min;
struct my_node *root = 0;
struct my_node *p, *q, t;
krmq_itr_t(my) itr;
unsigned cnt;
srand48(123);
for (i = 33, n = 0; i <= 126; ++i)
if (i != '(' && i != ')' && i != '.' && i != ';')
buf[n++] = i;
shuffle(n, buf);
for (i = 0; i < n; ++i) {
p = CALLOC(struct my_node, 1);
p->key = buf[i];
p->val = i;
q = krmq_insert(my, &root, p, &cnt);
if (p != q) free(p);
check(root, &h, &min);
}
shuffle(n, buf);
for (i = 0; i < n/2; ++i) {
t.key = buf[i];
//fprintf(stderr, "i=%d, key=%c, n/2=%d\n", i, t.key, n/2);
q = krmq_erase(my, &root, &t, 0);
if (q) free(q);
check(root, &h, &min);
}
check_and_print(root);
check_rmq(root, '0', '9');
check_rmq(root, '!', '~');
check_rmq(root, 'A', 'Z');
check_rmq(root, 'F', 'G');
check_rmq(root, 'a', 'z');
for (i = 0; i < n; ++i) {
int lo, hi;
lo = (int)(drand48() * n);
hi = (int)(drand48() * n);
check_rmq(root, lo, hi);
}
krmq_itr_first(my, root, &itr);
do {
const struct my_node *r = krmq_at(&itr);
putchar(r->key);
} while (krmq_itr_next(my, &itr));
putchar('\n');
krmq_free(struct my_node, head, root, free);
return 0;
}


@ -0,0 +1,69 @@
#include <zlib.h>
#include <stdio.h>
#include <time.h>
#include <stdint.h>
#include <stdlib.h>
#include "kseq.h"
#define BUF_SIZE 4096
KSTREAM_INIT(gzFile, gzread, BUF_SIZE)
int main(int argc, char *argv[])
{
gzFile fp;
clock_t t;
if (argc == 1) {
fprintf(stderr, "Usage: kseq_bench <in.gz>\n");
return 1;
}
{
uint8_t *buf = malloc(BUF_SIZE);
fp = gzopen(argv[1], "r");
t = clock();
while (gzread(fp, buf, BUF_SIZE) > 0);
fprintf(stderr, "[gzread] %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
gzclose(fp);
free(buf);
}
{
kstream_t *ks;
fp = gzopen(argv[1], "r");
ks = ks_init(fp);
t = clock();
while (ks_getc(ks) >= 0);
fprintf(stderr, "[ks_getc] %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
ks_destroy(ks);
gzclose(fp);
}
{
kstream_t *ks;
kstring_t *s;
int dret;
s = calloc(1, sizeof(kstring_t));
fp = gzopen(argv[1], "r");
ks = ks_init(fp);
t = clock();
while (ks_getuntil(ks, '\n', s, &dret) >= 0);
fprintf(stderr, "[ks_getuntil] %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
ks_destroy(ks);
gzclose(fp);
free(s->s); free(s);
}
if (argc == 2) {
fp = gzopen(argv[1], "r");
t = clock();
while (gzgetc(fp) >= 0);
fprintf(stderr, "[gzgetc] %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
gzclose(fp);
}
if (argc == 2) {
char *buf = malloc(BUF_SIZE);
fp = gzopen(argv[1], "r");
t = clock();
while (gzgets(fp, buf, BUF_SIZE) != NULL); /* gzgets returns char*, not int */
fprintf(stderr, "[gzgets] %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
gzclose(fp);
free(buf);
}
return 0;
}


@ -0,0 +1,44 @@
#include <stdio.h>
#include <time.h>
#include <stdint.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include "kseq.h"
KSTREAM_INIT(int, read, 4096)
#define BUF_SIZE 65536
int main(int argc, char *argv[])
{
clock_t t;
if (argc == 1) {
fprintf(stderr, "Usage: %s <in.txt>\n", argv[0]);
return 1;
}
{
FILE *fp;
char *s;
t = clock();
s = malloc(BUF_SIZE);
fp = fopen(argv[1], "r");
while (fgets(s, BUF_SIZE, fp));
fclose(fp);
fprintf(stderr, "[fgets] %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
}
{
int fd, dret;
kstream_t *ks;
kstring_t s;
t = clock();
s.l = s.m = 0; s.s = 0;
fd = open(argv[1], O_RDONLY);
ks = ks_init(fd);
while (ks_getuntil(ks, '\n', &s, &dret) >= 0);
free(s.s);
ks_destroy(ks);
close(fd);
fprintf(stderr, "[kstream] %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
}
return 0;
}


@ -0,0 +1,27 @@
#include <zlib.h>
#include <stdio.h>
#include "kseq.h"
KSEQ_INIT(gzFile, gzread)
int main(int argc, char *argv[])
{
gzFile fp;
kseq_t *seq;
int l;
if (argc == 1) {
fprintf(stderr, "Usage: %s <in.fasta>\n", argv[0]);
return 1;
}
fp = gzopen(argv[1], "r");
seq = kseq_init(fp);
while ((l = kseq_read(seq)) >= 0) {
printf("name: %s\n", seq->name.s);
if (seq->comment.l) printf("comment: %s\n", seq->comment.s);
printf("seq: %s\n", seq->seq.s);
if (seq->qual.l) printf("qual: %s\n", seq->qual.s);
}
printf("return value: %d\n", l);
kseq_destroy(seq);
gzclose(fp);
return 0;
}


@ -0,0 +1,12 @@
>1
acgtacgtacgtagc
>2 test
acgatcgatc
@3 test2
cgctagcatagc
cgatatgactta
+
78wo82usd980
d88fau
238ud8


@ -0,0 +1,104 @@
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <time.h>
#include "ksort.h"
KSORT_INIT_GENERIC(int)
int main(int argc, char *argv[])
{
int i, N = 10000000;
int *array, x;
clock_t t1, t2;
if (argc > 1) N = atoi(argv[1]);
array = (int*)malloc(sizeof(int) * N);
srand48(11);
for (i = 0; i < N; ++i) array[i] = (int)lrand48();
t1 = clock();
x = ks_ksmall(int, N, array, 10500);
t2 = clock();
fprintf(stderr, "ksmall [%d]: %.3lf\n", x, (double)(t2-t1)/CLOCKS_PER_SEC);
srand48(11);
for (i = 0; i < N; ++i) array[i] = (int)lrand48();
t1 = clock();
ks_introsort(int, N, array);
t2 = clock();
fprintf(stderr, "introsort [%d]: %.3lf\n", array[10500], (double)(t2-t1)/CLOCKS_PER_SEC);
for (i = 0; i < N-1; ++i) {
if (array[i] > array[i+1]) {
fprintf(stderr, "Bug in introsort!\n");
exit(1);
}
}
#ifndef _ALIGNED_ONLY
{ // test unaligned ksmall
srand48(11);
unsigned char *a;
int *b;
a = malloc(N * sizeof(int) + 1);
b = (int*)(a + 1);
for (i = 0; i < N; ++i) b[i] = (int)lrand48();
t1 = clock();
ks_introsort(int, N, b);
t2 = clock();
fprintf(stderr, "introsort [%d]: %.3lf (unaligned: 0x%lx) \n", b[10500], (double)(t2-t1)/CLOCKS_PER_SEC, (size_t)b);
}
#endif
t1 = clock();
ks_introsort(int, N, array);
t2 = clock();
fprintf(stderr, "introsort (sorted): %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
srand48(11);
for (i = 0; i < N; ++i) array[i] = (int)lrand48();
t1 = clock();
ks_combsort(int, N, array);
t2 = clock();
fprintf(stderr, "combsort: %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
for (i = 0; i < N-1; ++i) {
if (array[i] > array[i+1]) {
fprintf(stderr, "Bug in combsort!\n");
exit(1);
}
}
srand48(11);
for (i = 0; i < N; ++i) array[i] = (int)lrand48();
t1 = clock();
ks_mergesort(int, N, array, 0);
t2 = clock();
fprintf(stderr, "mergesort: %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
for (i = 0; i < N-1; ++i) {
if (array[i] > array[i+1]) {
fprintf(stderr, "Bug in mergesort!\n");
exit(1);
}
}
t1 = clock();
ks_mergesort(int, N, array, 0);
t2 = clock();
fprintf(stderr, "mergesort (sorted): %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
srand48(11);
for (i = 0; i < N; ++i) array[i] = (int)lrand48();
t1 = clock();
ks_heapmake(int, N, array);
ks_heapsort(int, N, array);
t2 = clock();
fprintf(stderr, "heapsort: %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
for (i = 0; i < N-1; ++i) {
if (array[i] > array[i+1]) {
fprintf(stderr, "Bug in heapsort!\n");
exit(1);
}
}
free(array);
return 0;
}


@ -0,0 +1,997 @@
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <time.h>
#include <algorithm>
#include "ksort.h"
KSORT_INIT_GENERIC(int)
using namespace std;
/**********************************
* BEGIN OF PAUL'S IMPLEMENTATION *
**********************************/
/* Attractive Chaos: I have added inline where necessary. */
/*
Copyright (c) 2004 Paul Hsieh
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
Neither the name of sorttest nor the names of its contributors may be
used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
/*
Recommended flags:
------------------
Intel C/C++:
icl /O2 /G6 /Qaxi /Qxi /Qip sorttest.c
WATCOM C/C++:
wcl386 /otexan /6r sorttest.c
GCC:
gcc -O3 -mcpu=athlon-xp -march=athlon-xp sorttest.c
MSVC:
cl /O2 /Ot /Og /G6 sorttest.c
*/
static inline void sort2 (int * numbers) {
int tmp;
if (numbers[0] <= numbers[1]) return;
tmp = numbers[0];
numbers[0] = numbers[1];
numbers[1] = tmp;
}
static inline void sort3 (int * numbers) {
int tmp;
if (numbers[0] <= numbers[1]) {
if (numbers[1] <= numbers[2]) return;
if (numbers[2] <= numbers[0]) {
tmp = numbers[0];
numbers[0] = numbers[2];
numbers[2] = numbers[1];
numbers[1] = tmp;
return;
}
tmp = numbers[1];
} else {
tmp = numbers[0];
if (numbers[0] <= numbers[2]) {
numbers[0] = numbers[1];
numbers[1] = tmp;
return;
}
if (numbers[2] <= numbers[1]) {
numbers[0] = numbers[2];
numbers[2] = tmp;
return;
}
numbers[0] = numbers[1];
}
numbers[1] = numbers[2];
numbers[2] = tmp;
}
static inline void sort4 (int * num) {
int tmp;
if (num[0] < num[1]) {
if (num[1] < num[2]) {
if (num[1] < num[3]) {
if (num[2] >= num[3]) {
tmp = num[2];
num[2] = num[3];
num[3] = tmp;
}
} else {
tmp = num[1];
if (num[0] < num[3]) {
num[1] = num[3];
} else {
num[1] = num[0];
num[0] = num[3];
}
num[3] = num[2];
num[2] = tmp;
}
} else {
if (num[0] < num[2]) {
if (num[2] < num[3]) {
if (num[1] < num[3]) {
tmp = num[1];
} else {
tmp = num[3];
num[3] = num[1];
}
num[1] = num[2];
num[2] = tmp;
} else {
if (num[0] < num[3]) {
tmp = num[3];
} else {
tmp = num[0];
num[0] = num[3];
}
num[3] = num[1];
num[1] = tmp;
}
} else {
if (num[0] < num[3]) {
tmp = num[0];
num[0] = num[2];
if (num[1] < num[3]) {
num[2] = num[1];
} else {
num[2] = num[3];
num[3] = num[1];
}
num[1] = tmp;
} else {
if (num[2] < num[3]) {
tmp = num[0];
num[0] = num[2];
num[2] = tmp;
tmp = num[1];
num[1] = num[3];
} else {
tmp = num[1];
num[1] = num[2];
num[2] = num[0];
num[0] = num[3];
}
num[3] = tmp;
}
}
}
} else {
tmp = num[0];
if (tmp < num[2]) {
if (tmp < num[3]) {
num[0] = num[1];
num[1] = tmp;
if (num[2] >= num[3]) {
tmp = num[2];
num[2] = num[3];
num[3] = tmp;
}
} else {
if (num[1] < num[3]) {
num[0] = num[1];
num[1] = num[3];
} else {
num[0] = num[3];
}
num[3] = num[2];
num[2] = tmp;
}
} else {
if (num[1] < num[2]) {
if (num[2] < num[3]) {
num[0] = num[1];
num[1] = num[2];
if (tmp < num[3]) {
num[2] = tmp;
} else {
num[2] = num[3];
num[3] = tmp;
}
} else {
if (num[1] < num[3]) {
num[0] = num[1];
num[1] = num[3];
} else {
num[0] = num[3];
}
num[3] = tmp;
}
} else {
if (num[1] < num[3]) {
num[0] = num[2];
if (tmp < num[3]) {
num[2] = tmp;
} else {
num[2] = num[3];
num[3] = tmp;
}
} else {
if (num[2] < num[3]) {
num[0] = num[2];
num[2] = num[1];
num[1] = num[3];
num[3] = tmp;
} else {
num[0] = num[3];
num[3] = tmp;
tmp = num[1];
num[1] = num[2];
num[2] = tmp;
}
}
}
}
}
}
static inline void sortAlt2 (int * numbers, int * altNumbers) {
if (numbers[0] <= numbers[1]) {
altNumbers[0] = numbers[0];
altNumbers[1] = numbers[1];
} else {
altNumbers[0] = numbers[1];
altNumbers[1] = numbers[0];
}
}
static inline void sortAlt3 (int * numbers, int * altNumbers) {
if (numbers[0] <= numbers[1]) {
if (numbers[1] <= numbers[2]) {
altNumbers[0] = numbers[0];
altNumbers[1] = numbers[1];
altNumbers[2] = numbers[2];
} else if (numbers[2] <= numbers[0]) {
altNumbers[0] = numbers[2];
altNumbers[1] = numbers[0];
altNumbers[2] = numbers[1];
} else {
altNumbers[0] = numbers[0];
altNumbers[1] = numbers[2];
altNumbers[2] = numbers[1];
}
} else {
if (numbers[0] <= numbers[2]) {
altNumbers[0] = numbers[1];
altNumbers[1] = numbers[0];
altNumbers[2] = numbers[2];
} else if (numbers[2] <= numbers[1]) {
altNumbers[0] = numbers[2];
altNumbers[1] = numbers[1];
altNumbers[2] = numbers[0];
} else {
altNumbers[0] = numbers[1];
altNumbers[1] = numbers[2];
altNumbers[2] = numbers[0];
}
}
}
/*
* Insert Sort
*/
inline void insertSort (int numbers[], int qty) {
int i, j, idx, q4;
int tmp;
if (qty <= 4) {
if (qty == 4) sort4 (numbers);
else if (qty == 3) sort3 (numbers);
else if (qty == 2) sort2 (numbers);
return;
}
q4 = qty - 4;
for (i=0; i < q4; i++) {
idx = i;
for (j=i+1; j < qty; j++) {
if (numbers[j] < numbers[idx]) idx = j;
}
if (idx != i) {
tmp = numbers[idx];
numbers[idx] = numbers[i];
numbers[i] = tmp;
}
}
sort4 (numbers + q4);
}
/*
* Heap Sort
*/
/* Assure the heap property for entries from top to last */
static void siftDown (int numbers[], int top, int last) {
int tmp = numbers[top];
int maxIdx = top;
while (last >= (maxIdx += maxIdx)) {
/* This is where the comparison occurs and where a sufficiently
good compiler can use a computed conditional result rather
than using control logic. */
if (maxIdx != last && numbers[maxIdx] < numbers[maxIdx + 1]) maxIdx++;
if (tmp >= numbers[maxIdx]) break;
numbers[top] = numbers[maxIdx];
top = maxIdx;
}
numbers[top] = tmp;
}
/* Peel off the top siftDown operation since its parameters are trivial to
fill in directly (and this saves us some moves.) */
static void siftDown0 (int numbers[], int last) {
int tmp;
if (numbers[0] < numbers[1]) {
tmp = numbers[1];
numbers[1] = numbers[0];
siftDown (numbers, 1, last);
} else {
tmp = numbers[0];
}
numbers[0] = numbers[last];
numbers[last] = tmp;
}
void heapSort (int numbers[], int qty) {
int i;
if (qty <= 4) {
if (qty == 4) sort4 (numbers);
else if (qty == 3) sort3 (numbers);
else if (qty == 2) sort2 (numbers);
return;
}
i = qty / 2;
/* Enforce the heap property for each position in the tree */
for ( qty--; i > 0; i--) siftDown (numbers, i, qty);
for (i = qty; i > 0; i--) siftDown0 (numbers, i);
}
/*
* Quick Sort
*/
static int medianOf3 (int * numbers, int i, int j) {
int tmp;
if (numbers[0] <= numbers[i]) {
if (numbers[j] <= numbers[0]) return numbers[0]; /* j 0 i */
if (numbers[i] <= numbers[j]) j = i; /* 0 i j */
/* 0 j i */
} else {
if (numbers[0] <= numbers[j]) return numbers[0]; /* i 0 j */
if (numbers[j] <= numbers[i]) j = i; /* j i 0 */
/* i j 0 */
}
tmp = numbers[j];
numbers[j] = numbers[0];
numbers[0] = tmp;
return tmp;
}
static void quickSortRecurse (int * numbers, int left, int right) {
int pivot, lTmp, rTmp;
qsrStart:;
#if defined(__GNUC__)
if (right <= left + 8) {
insertSort (numbers + left, right - left + 1);
return;
}
#else
if (right <= left + 3) {
if (right == left + 1) {
sort2 (numbers + left);
} else if (right == left + 2) {
sort3 (numbers + left);
} else if (right == left + 3) {
sort4 (numbers + left);
}
return;
}
#endif
lTmp = left;
rTmp = right;
pivot = medianOf3 (numbers + left, (right-left) >> 1, right-1-left);
goto QStart;
while (1) {
do {
right--;
if (left >= right) goto QEnd;
QStart:;
} while (numbers[right] > pivot);
numbers[left] = numbers[right];
do {
left++;
if (left >= right) {
left = right;
goto QEnd;
}
} while (numbers[ left] < pivot);
numbers[right] = numbers[left];
}
QEnd:;
numbers[left] = pivot;
/* Only recurse the smaller partition */
if (left-1 - lTmp <= rTmp - left - 1) {
if (lTmp < left) quickSortRecurse (numbers, lTmp, left-1);
/* Set up for larger partition */
left++;
right = rTmp;
} else {
if (rTmp > left) quickSortRecurse (numbers, left+1, rTmp);
/* Set up for larger partition */
right = left - 1;
left = lTmp;
}
/* Rerun with larger partition (recursion not required.) */
goto qsrStart;
}
void quickSort (int numbers[], int qty) {
if (qty < 2) return;
quickSortRecurse (numbers, 0, qty - 1);
}
/*
* Merge Sort
*/
static void mergesortInPlace (int * numbers, int * altNumbers, int qty);
/* Perform mergesort, but store results in altNumbers */
static void mergesortExchange (int * numbers, int * altNumbers, int qty) {
int half, i0, i1, i;
if (qty == 2) {
sortAlt2 (numbers, altNumbers);
return;
}
if (qty == 3) {
sortAlt3 (numbers, altNumbers);
return;
}
half = (qty + 1)/2;
mergesortInPlace (numbers, altNumbers, half);
mergesortInPlace (numbers + half, altNumbers, qty - half);
i0 = 0; i1 = half;
for (i=0; i < qty; i++) {
if (i1 >= qty || (i0 < half && numbers[i0] < numbers[i1])) {
altNumbers[i] = numbers[i0];
i0++;
} else {
altNumbers[i] = numbers[i1];
i1++;
}
}
}
/* Perform mergesort and store results in numbers */
static void mergesortInPlace (int * numbers, int * altNumbers, int qty) {
int half, i0, i1, i;
#if 0
if (qty == 2) {
sort2 (numbers);
return;
}
if (qty == 3) {
sort3 (numbers);
return;
}
if (qty == 4) {
sort4 (numbers);
return;
}
#else
if (qty <= 12) {
insertSort (numbers, qty);
return;
}
#endif
half = (qty + 1)/2;
mergesortExchange (numbers, altNumbers, half);
mergesortExchange (numbers + half, altNumbers + half, qty - half);
i0 = 0; i1 = half;
for (i=0; i < qty; i++) {
if (i1 >= qty || (i0 < half && altNumbers[i0] < altNumbers[i1])) {
numbers[i] = altNumbers[i0];
i0++;
} else {
numbers[i] = altNumbers[i1];
i1++;
}
}
}
#include <stdlib.h>
void mergeSort (int numbers[], int qty) {
int * tmpArray;
if (qty <= 12) {
insertSort (numbers, qty);
return;
}
tmpArray = (int *) malloc (qty * sizeof (int));
mergesortInPlace (numbers, tmpArray, qty);
free (tmpArray);
}
/********************************
* END OF PAUL'S IMPLEMENTATION *
********************************/
/*************************************************
*** Implementation 1: faster on sorted arrays ***
*************************************************/
#define rstype_t unsigned
#define rskey(x) (x)
#define RS_MIN_SIZE 64
typedef struct {
rstype_t *b, *e;
} rsbucket_t;
void rs_sort(rstype_t *beg, rstype_t *end, int n_bits, int s)
{
rstype_t *i;
int size = 1<<n_bits, m = size - 1;
rsbucket_t *k, b[size], *be = b + size;
for (k = b; k != be; ++k) k->b = k->e = beg;
for (i = beg; i != end; ++i) ++b[rskey(*i)>>s&m].e;
for (k = b + 1; k != be; ++k)
k->e += (k-1)->e - beg, k->b = (k-1)->e;
for (k = b; k != be;) {
if (k->b != k->e) {
rsbucket_t *l;
if ((l = b + (rskey(*k->b)>>s&m)) != k) {
rstype_t tmp = *k->b, swap;
do {
swap = tmp; tmp = *l->b; *l->b++ = swap;
l = b + (rskey(tmp)>>s&m);
} while (l != k);
*k->b++ = tmp;
} else ++k->b;
} else ++k;
}
for (b->b = beg, k = b + 1; k != be; ++k) k->b = (k-1)->e;
if (s) {
s = s > n_bits? s - n_bits : 0;
for (k = b; k != be; ++k)
if (k->e - k->b > RS_MIN_SIZE) rs_sort(k->b, k->e, n_bits, s);
else if (k->e - k->b > 1)
for (i = k->b + 1; i < k->e; ++i)
if (rskey(*i) < rskey(*(i - 1))) {
rstype_t *j, tmp = *i;
for (j = i; j > k->b && rskey(tmp) < rskey(*(j-1)); --j)
*j = *(j - 1);
*j = tmp;
}
}
}
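The bucket set-up at the top of rs_sort() above is a counting pass over one 8-bit digit followed by a prefix sum that turns counts into [begin, end) bucket boundaries. A standalone sketch of just that step, under illustrative names (digit_bounds, counts, bounds are not identifiers from rs_sort):

```cpp
#include <cassert>
#include <cstddef>

// Compute, for one 8-bit digit selected by `shift`, the exclusive prefix
// sums that delimit the 256 buckets: bucket d occupies [bounds[d], bounds[d+1]).
static void digit_bounds(const unsigned *beg, const unsigned *end, int shift,
                         std::size_t bounds[257])
{
    std::size_t counts[256] = {0};
    for (const unsigned *i = beg; i != end; ++i)
        ++counts[(*i >> shift) & 0xFF];   // count occurrences of each digit
    bounds[0] = 0;
    for (int d = 0; d < 256; ++d)         // exclusive prefix sum over counts
        bounds[d + 1] = bounds[d] + counts[d];
}
```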
/*************************************************
*** Implementation 2: faster on random arrays ***
*************************************************/
static inline void rs_insertsort(rstype_t *s, rstype_t *t)
{
rstype_t *i;
for (i = s + 1; i < t; ++i) {
if (rskey(*i) < rskey(*(i - 1))) {
rstype_t *j, tmp = *i;
for (j = i; j > s && rskey(tmp) < rskey(*(j-1)); --j)
*j = *(j - 1);
*j = tmp;
}
}
}
/*
void rs_sort2(rstype_t *beg, rstype_t *end, int n_bits, int s)
{
int j, size = 1<<n_bits, m = size - 1;
unsigned long c[size];
rstype_t *i, *b[size], *e[size];
for (j = 0; j < size; ++j) c[j] = 0;
for (i = beg; i != end; ++i) ++c[rskey(*i)>>s&m];
b[0] = e[0] = beg;
for (j = 1; j != size; ++j) b[j] = e[j] = b[j - 1] + c[j - 1];
for (i = beg, j = 0; i != end;) {
rstype_t tmp = *i, swap;
int x;
for (;;) {
x = rskey(tmp)>>s&m;
if (e[x] == i) break;
swap = tmp; tmp = *e[x]; *e[x]++ = swap;
}
*i++ = tmp;
++e[x];
while (j != size && i >= b[j]) ++j;
while (j != size && e[j-1] == b[j]) ++j;
if (i < e[j-1]) i = e[j-1];
}
if (s) {
s = s > n_bits? s - n_bits : 0;
for (j = 0; j < size; ++j) {
if (c[j] >= RS_MIN_SIZE) rs_sort2(b[j], e[j], n_bits, s);
else if (c[j] >= 2) rs_insertsort(b[j], e[j]);
}
}
}
*/
void radix_sort(unsigned *array, int offset, int end, int shift) {
int x, y, value, temp;
int last[256] = { 0 }, pointer[256];
for (x=offset; x<end; ++x) {
++last[(array[x] >> shift) & 0xFF];
}
last[0] += offset;
pointer[0] = offset;
for (x=1; x<256; ++x) {
pointer[x] = last[x-1];
last[x] += last[x-1];
}
for (x=0; x<256; ++x) {
while (pointer[x] != last[x]) {
value = array[pointer[x]];
y = (value >> shift) & 0xFF;
while (x != y) {
temp = array[pointer[y]];
array[pointer[y]++] = value;
value = temp;
y = (value >> shift) & 0xFF;
}
array[pointer[x]++] = value;
}
}
if (shift > 0) {
shift -= 8;
for (x=0; x<256; ++x) {
temp = x > 0 ? pointer[x] - pointer[x-1] : pointer[0] - offset;
if (temp > 64) {
radix_sort(array, pointer[x] - temp, pointer[x], shift);
} else if (temp > 1) rs_insertsort(array + pointer[x] - temp, array + pointer[x]);
}
}
}
/*************************
*** END OF RADIX SORT ***
*************************/
template< class _Type, unsigned long PowerOfTwoRadix, unsigned long Log2ofPowerOfTwoRadix, long Threshold >
inline void _RadixSort_Unsigned_PowerOf2Radix_1( _Type* a, long last, _Type bitMask, unsigned long shiftRightAmount )
{
const unsigned long numberOfBins = PowerOfTwoRadix;
unsigned long count[ numberOfBins ];
for( unsigned long i = 0; i < numberOfBins; i++ )
count[ i ] = 0;
for ( long _current = 0; _current <= last; _current++ ) // Scan the array and count the number of times each value appears
{
unsigned long digit = (unsigned long)(( a[ _current ] & bitMask ) >> shiftRightAmount ); // extract the digit we are sorting based on
count[ digit ]++;
}
long startOfBin[ numberOfBins ], endOfBin[ numberOfBins ], nextBin;
startOfBin[ 0 ] = endOfBin[ 0 ] = nextBin = 0;
for( unsigned long i = 1; i < numberOfBins; i++ )
startOfBin[ i ] = endOfBin[ i ] = startOfBin[ i - 1 ] + count[ i - 1 ];
for ( long _current = 0; _current <= last; )
{
unsigned long digit;
_Type tmp = a[ _current ]; // get the compiler to recognize that a register can be used for the loop instead of a[_current] memory location
while ( true ) {
digit = (unsigned long)(( tmp & bitMask ) >> shiftRightAmount ); // extract the digit we are sorting based on
if ( endOfBin[ digit ] == _current )
break;
_Type tmp2;
//_swap( tmp, a[ endOfBin[ digit ] ] );
tmp2 = a[endOfBin[digit]]; a[endOfBin[digit]] = tmp; tmp = tmp2;
endOfBin[ digit ]++;
}
a[ _current ] = tmp;
endOfBin[ digit ]++; // leave the element at its location and grow the bin
_current++; // advance the current pointer to the next element
while( _current >= startOfBin[ nextBin ] && nextBin < numberOfBins )
nextBin++;
while( endOfBin[ nextBin - 1 ] == startOfBin[ nextBin ] && nextBin < numberOfBins )
nextBin++;
if ( _current < endOfBin[ nextBin - 1 ] )
_current = endOfBin[ nextBin - 1 ];
}
bitMask >>= Log2ofPowerOfTwoRadix;
if ( bitMask != 0 ) // end recursion when all the bits have been processed
{
if ( shiftRightAmount >= Log2ofPowerOfTwoRadix ) shiftRightAmount -= Log2ofPowerOfTwoRadix;
else shiftRightAmount = 0;
for( unsigned long i = 0; i < numberOfBins; i++ )
{
long numberOfElements = endOfBin[ i ] - startOfBin[ i ];
if ( numberOfElements >= Threshold ) // endOfBin actually points to one beyond the bin
_RadixSort_Unsigned_PowerOf2Radix_1< _Type, PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold >( &a[ startOfBin[ i ]], numberOfElements - 1, bitMask, shiftRightAmount );
else if ( numberOfElements >= 2 )
rs_insertsort(&a[ startOfBin[ i ]], &a[ endOfBin[ i ]]);
}
}
}
inline void RadixSortInPlace_HybridUnsigned_Radix256( unsigned* a, unsigned long a_size )
{
if ( a_size < 2 ) return;
unsigned long bitMask = 0xFF000000; // bitMask controls how many bits we process at a time
unsigned long shiftRightAmount = 24;
if ( a_size >= 32 )
_RadixSort_Unsigned_PowerOf2Radix_1<unsigned, 256, 8, 32>(a, a_size - 1, bitMask, shiftRightAmount );
else
rs_insertsort(a, a + a_size);
}
struct intcmp_t {
inline int operator() (int a, int b) const {
return a < b? -1 : a > b? 1 : 0;
}
};
int compare_int(int a, int b)
{
return a < b? -1 : a > b? 1 : 0;
}
int compare(const void *a, const void *b)
{
return *((int*)a) - *((int*)b);
}
int main(int argc, char *argv[])
{
int i, N = 50000000;
int *array, *temp;
clock_t t1, t2;
if (argc == 1) fprintf(stderr, "Usage: %s [%d]\n", argv[0], N);
if (argc > 1) N = atoi(argv[1]);
temp = (int*)malloc(sizeof(int) * N);
array = (int*)malloc(sizeof(int) * N);
srand48(11);
for (i = 0; i < N; ++i) array[i] = (int)lrand48();
t1 = clock();
rs_sort((unsigned*)array, (unsigned*)array + N, 8, 24);
t2 = clock();
fprintf(stderr, "radix sort: %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
for (i = 0; i < N-1; ++i) {
if (array[i] > array[i+1]) {
fprintf(stderr, "Bug in radix sort!\n");
exit(1);
}
}
t1 = clock();
rs_sort((unsigned*)array, (unsigned*)array + N, 8, 24);
t2 = clock();
fprintf(stderr, "radix sort (sorted): %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
srand48(11);
for (i = 0; i < N; ++i) array[i] = (int)lrand48();
t1 = clock();
RadixSortInPlace_HybridUnsigned_Radix256((unsigned*)array, N);
// radix_sort((unsigned*)array, 0, N, 24);
t2 = clock();
fprintf(stderr, "vd's radix sort: %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
for (i = 0; i < N-1; ++i) {
if (array[i] > array[i+1]) {
fprintf(stderr, "Bug in radix sort!\n");
exit(1);
}
}
t1 = clock();
RadixSortInPlace_HybridUnsigned_Radix256((unsigned*)array, N);
// radix_sort((unsigned*)array, 0, N, 24);
t2 = clock();
fprintf(stderr, "vd's radix sort (sorted): %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
srand48(11);
for (i = 0; i < N; ++i) array[i] = (int)lrand48();
t1 = clock();
sort(array, array+N);
t2 = clock();
fprintf(stderr, "STL introsort: %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
t1 = clock();
sort(array, array+N);
t2 = clock();
fprintf(stderr, "STL introsort (sorted): %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
srand48(11);
for (i = 0; i < N; ++i) array[i] = (int)lrand48();
t1 = clock();
stable_sort(array, array+N);
t2 = clock();
fprintf(stderr, "STL stablesort: %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
t1 = clock();
stable_sort(array, array+N);
t2 = clock();
fprintf(stderr, "STL stablesort (sorted): %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
srand48(11);
for (i = 0; i < N; ++i) array[i] = (int)lrand48();
t1 = clock();
make_heap(array, array+N);
sort_heap(array, array+N);
t2 = clock();
fprintf(stderr, "STL heapsort: %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
for (i = 0; i < N-1; ++i) {
if (array[i] > array[i+1]) {
fprintf(stderr, "Bug in heap_sort!\n");
exit(1);
}
}
t1 = clock();
make_heap(array, array+N);
sort_heap(array, array+N);
t2 = clock();
fprintf(stderr, "STL heapsort (sorted): %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
srand48(11);
for (i = 0; i < N; ++i) array[i] = (int)lrand48();
t1 = clock();
ks_combsort(int, N, array);
t2 = clock();
fprintf(stderr, "combsort: %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
for (i = 0; i < N-1; ++i) {
if (array[i] > array[i+1]) {
fprintf(stderr, "Bug in combsort!\n");
exit(1);
}
}
srand48(11);
for (i = 0; i < N; ++i) array[i] = (int)lrand48();
t1 = clock();
qsort(array, N, sizeof(int), compare);
t2 = clock();
fprintf(stderr, "libc qsort: %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
srand48(11);
for (i = 0; i < N; ++i) array[i] = (int)lrand48();
t1 = clock();
ks_introsort(int, N, array);
t2 = clock();
fprintf(stderr, "my introsort: %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
for (i = 0; i < N-1; ++i) {
if (array[i] > array[i+1]) {
fprintf(stderr, "Bug in intro_sort!\n");
exit(1);
}
}
t1 = clock();
ks_introsort(int, N, array);
t2 = clock();
fprintf(stderr, "introsort (sorted): %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
srand48(11);
for (i = 0; i < N; ++i) array[i] = (int)lrand48();
t1 = clock();
ks_mergesort(int, N, array, 0);
t2 = clock();
fprintf(stderr, "iterative mergesort: %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
for (i = 0; i < N-1; ++i) {
if (array[i] > array[i+1]) {
fprintf(stderr, "Bug in merge_sort!\n");
exit(1);
}
}
t1 = clock();
ks_mergesort(int, N, array, 0);
t2 = clock();
fprintf(stderr, "iterative mergesort (sorted): %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
srand48(11);
for (i = 0; i < N; ++i) array[i] = (int)lrand48();
t1 = clock();
ks_heapmake(int, N, array);
ks_heapsort(int, N, array);
t2 = clock();
fprintf(stderr, "my heapsort: %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
for (i = 0; i < N-1; ++i) {
if (array[i] > array[i+1]) {
fprintf(stderr, "Bug in heap_sort!\n");
exit(1);
}
}
t1 = clock();
ks_heapmake(int, N, array);
ks_heapsort(int, N, array);
t2 = clock();
fprintf(stderr, "heapsort (sorted): %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
srand48(11);
for (i = 0; i < N; ++i) array[i] = (int)lrand48();
t1 = clock();
heapSort(array, N);
t2 = clock();
fprintf(stderr, "Paul's heapsort: %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
for (i = 0; i < N-1; ++i) {
if (array[i] > array[i+1]) {
fprintf(stderr, "Bug in Paul's heapsort!\n");
exit(1);
}
}
srand48(11);
for (i = 0; i < N; ++i) array[i] = (int)lrand48();
t1 = clock();
quickSort(array, N);
t2 = clock();
fprintf(stderr, "Paul's quicksort: %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
for (i = 0; i < N-1; ++i) {
if (array[i] > array[i+1]) {
fprintf(stderr, "Bug in Paul's quicksort!\n");
exit(1);
}
}
srand48(11);
for (i = 0; i < N; ++i) array[i] = (int)lrand48();
t1 = clock();
mergeSort(array, N);
t2 = clock();
fprintf(stderr, "Paul's mergesort: %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
for (i = 0; i < N-1; ++i) {
if (array[i] > array[i+1]) {
fprintf(stderr, "Bug in Paul's mergesort!\n");
exit(1);
}
}
free(array); free(temp);
return 0;
}
@@ -0,0 +1,51 @@
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "kstring.h"
#define N 10000000
int main()
{
int i;
clock_t t;
kstring_t s, s2;
srand48(11);
s.l = s.m = 0; s.s = 0;
t = clock();
for (i = 0; i < N; ++i) {
int x = lrand48();
s.l = 0;
kputw(x, &s);
}
fprintf(stderr, "kputw: %lf\n", (double)(clock() - t) / CLOCKS_PER_SEC);
srand48(11);
t = clock();
for (i = 0; i < N; ++i) {
int x = lrand48();
s.l = 0;
ksprintf(&s, "%d", x);
}
fprintf(stderr, "ksprintf: %lf\n", (double)(clock() - t) / CLOCKS_PER_SEC);
srand48(11);
s2.l = s2.m = 0; s2.s = 0;
t = clock();
for (i = 0; i < N; ++i) {
int x = lrand48();
s2.l = s.l = 0;
kputw(x, &s2);
kputs(s2.s, &s);
}
fprintf(stderr, "kputw+kputs: %lf\n", (double)(clock() - t) / CLOCKS_PER_SEC);
srand48(11);
t = clock();
for (i = 0; i < N; ++i) {
int x = lrand48();
s2.l = s.l = 0;
kputw(x, &s2);
ksprintf(&s, "%s", s2.s);
}
fprintf(stderr, "kputw+ksprintf: %lf\n", (double)(clock() - t) / CLOCKS_PER_SEC);
return 0;
}
@@ -0,0 +1,131 @@
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "kstring.h"
#ifdef __APPLE__
#define HAVE_STRNSTR
#endif
#ifdef __linux__
#define HAVE_MEMMEM
#endif
static int str_len = 1024*1024*128;
static int pat_len = 30;
static int alphabet = 2;
static int repeat = 50;
char *gen_data(int len, int a)
{
char *data;
int i;
long x;
srand48(11);
data = malloc(len);
for (i = 0; i < len; ++i)
data[i] = (int)(a * drand48()) + '!';
data[len - 1] = 0; /* NUL-terminate so the str* searches below stay in bounds */
return data;
}
// http://srcvault.scali.eu.org/cgi-bin/Syntax/c/BoyerMoore.c
char *BoyerMoore( unsigned char *data, unsigned int dataLength, unsigned char *string, unsigned int strLength )
{
unsigned int skipTable[256], i;
unsigned char *search;
register unsigned char lastChar;
if (strLength == 0)
return NULL;
for (i = 0; i < 256; i++)
skipTable[i] = strLength;
search = string;
i = --strLength;
do {
skipTable[*search++] = i;
} while (i--);
lastChar = *--search;
search = data + strLength;
dataLength -= strLength+(strLength-1);
while ((int)dataLength > 0 ) {
unsigned int skip;
skip = skipTable[*search];
search += skip;
dataLength -= skip;
skip = skipTable[*search];
search += skip;
dataLength -= skip;
skip = skipTable[*search];
if (*search != lastChar) {
search += skip;
dataLength -= skip;
continue;
}
i = strLength;
do {
if (i-- == 0) return (char *) search;
} while (*--search == string[i]);
search += (strLength - i + 1);
dataLength--;
}
return NULL;
}
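The skip table built at the top of BoyerMoore() above follows the Horspool convention: every byte defaults to the full pattern length, and a byte that occurs in the pattern skips by its distance from the last position it occupies, so the final pattern byte gets 0. A standalone sketch under illustrative names (build_skip is not a function from the code above):

```cpp
#include <cassert>

// Build a Horspool-style bad-character table for `pat` of length `m`:
// absent bytes skip by m; present bytes skip by m - 1 - (last position),
// with later occurrences overwriting earlier ones.
static void build_skip(const char *pat, unsigned m, unsigned skip[256])
{
    for (int c = 0; c < 256; ++c)
        skip[c] = m;                          // default: byte not in pattern
    for (unsigned k = 0; k < m; ++k)          // last occurrence wins
        skip[(unsigned char)pat[k]] = m - 1 - k;
}
```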
int main()
{
char *data;
int i;
clock_t t;
t = clock();
data = gen_data(str_len, alphabet);
fprintf(stderr, "Generate data in %.3f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
{
t = clock(); srand48(1331);
for (i = 0; i < repeat; ++i) {
int y = lrand48() % (str_len - pat_len);
char *ret;
ret = kmemmem(data, str_len, data + y, pat_len, 0);
// printf("%d, %d\n", (int)(ret - data), y);
}
fprintf(stderr, "Search patterns in %.3f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
}
if (1) {
t = clock(); srand48(1331);
for (i = 0; i < repeat; ++i) {
int y = lrand48() % (str_len - pat_len);
char *ret;
ret = BoyerMoore((unsigned char *)data, str_len, (unsigned char *)(data + y), pat_len);
// printf("%d, %d\n", (int)(ret - data), y);
}
fprintf(stderr, "Search patterns in %.3f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
}
#ifdef HAVE_STRNSTR
if (1) {
char *tmp;
t = clock(); srand48(1331);
tmp = calloc(pat_len+1, 1);
for (i = 0; i < repeat; ++i) {
int y = lrand48() % (str_len - pat_len);
char *ret;
memcpy(tmp, data + y, pat_len);
ret = strnstr(data, tmp, str_len);
}
fprintf(stderr, "Search patterns in %.3f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
}
#endif
#ifdef HAVE_MEMMEM
if (1) {
t = clock(); srand48(1331);
for (i = 0; i < repeat; ++i) {
int y = lrand48() % (str_len - pat_len);
char *ret;
ret = memmem(data, str_len, data + y, pat_len);
// printf("%d, %d\n", (int)(ret - data), y);
}
fprintf(stderr, "Search patterns in %.3f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
}
#endif
return 0;
}
@@ -0,0 +1,132 @@
#include <limits.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "kstring.h"
int nfail = 0;
void check(const char *what, const kstring_t *ks, const char *correct)
{
if (ks->l != strlen(correct) || strcmp(ks->s, correct) != 0) {
fprintf(stderr, "%s produced \"%.*s\" (\"%s\" is correct)\tFAIL\n", what, (int)(ks->l), ks->s, correct);
nfail++;
}
}
void test_kputw(kstring_t *ks, int n)
{
char buf[16];
ks->l = 0;
kputw(n, ks);
sprintf(buf, "%d", n);
check("kputw()", ks, buf);
}
void test_kputl(kstring_t *ks, long n)
{
char buf[24];
ks->l = 0;
kputl(n, ks);
sprintf(buf, "%ld", n);
check("kputl()", ks, buf);
}
static char *mem_gets(char *buf, int buflen, void *vtextp)
{
const char **textp = (const char **) vtextp;
const char *nl = strchr(*textp, '\n');
size_t n = nl? nl - *textp + 1 : strlen(*textp);
if (n == 0) return NULL;
if (n > buflen-1) n = buflen-1;
memcpy(buf, *textp, n);
buf[n] = '\0';
*textp += n;
return buf;
}
void test_kgetline(kstring_t *ks, const char *text, ...)
{
const char *exp;
va_list arg;
va_start(arg, text);
while ((exp = va_arg(arg, const char *)) != NULL) {
ks->l = 0;
if (kgetline(ks, mem_gets, &text) != 0) kputs("EOF", ks);
check("kgetline()", ks, exp);
}
va_end(arg);
ks->l = 0;
if (kgetline(ks, mem_gets, &text) == 0) check("kgetline()", ks, "EOF");
}
int main(int argc, char **argv)
{
kstring_t ks;
ks.l = ks.m = 0;
ks.s = NULL;
test_kputw(&ks, 0);
test_kputw(&ks, 1);
test_kputw(&ks, 37);
test_kputw(&ks, 12345);
test_kputw(&ks, -12345);
test_kputw(&ks, INT_MAX);
test_kputw(&ks, -INT_MAX);
test_kputw(&ks, INT_MIN);
test_kputl(&ks, 0);
test_kputl(&ks, 1);
test_kputl(&ks, 37);
test_kputl(&ks, 12345);
test_kputl(&ks, -12345);
test_kputl(&ks, INT_MAX);
test_kputl(&ks, -INT_MAX);
test_kputl(&ks, INT_MIN);
test_kputl(&ks, LONG_MAX);
test_kputl(&ks, -LONG_MAX);
test_kputl(&ks, LONG_MIN);
test_kgetline(&ks, "", NULL);
test_kgetline(&ks, "apple", "apple", NULL);
test_kgetline(&ks, "banana\n", "banana", NULL);
test_kgetline(&ks, "carrot\r\n", "carrot", NULL);
test_kgetline(&ks, "\n", "", NULL);
test_kgetline(&ks, "\n\n", "", "", NULL);
test_kgetline(&ks, "foo\nbar", "foo", "bar", NULL);
test_kgetline(&ks, "foo\nbar\n", "foo", "bar", NULL);
test_kgetline(&ks,
"abcdefghijklmnopqrstuvwxyz0123456789\nABCDEFGHIJKLMNOPQRSTUVWXYZ\n",
"abcdefghijklmnopqrstuvwxyz0123456789",
"ABCDEFGHIJKLMNOPQRSTUVWXYZ", NULL);
if (argc > 1) {
FILE *f = fopen(argv[1], "r");
if (f) {
for (ks.l = 0; kgetline(&ks, (kgets_func *)fgets, f) == 0; ks.l = 0)
puts(ks.s);
fclose(f);
}
}
free(ks.s);
if (nfail > 0) {
fprintf(stderr, "Total failures: %d\n", nfail);
return EXIT_FAILURE;
}
return EXIT_SUCCESS;
}
@@ -0,0 +1,69 @@
#include <stdio.h>
#include <assert.h>
#include <stdlib.h>
#include <pthread.h>
#if HAVE_CILK
#include <cilk/cilk.h>
#include <cilk/cilk_api.h>
#endif
typedef struct {
int max_iter, w, h;
double xmin, xmax, ymin, ymax;
int *k;
} global_t;
static void compute(void *_g, int i, int tid)
{
global_t *g = (global_t*)_g;
double x, x0 = g->xmin + (g->xmax - g->xmin) * (i%g->w) / g->w;
double y, y0 = g->ymin + (g->ymax - g->ymin) * (i/g->w) / g->h;
int k;
assert(g->k[i] < 0);
x = x0, y = y0;
for (k = 0; k < g->max_iter; ++k) {
double z = x * y;
x *= x; y *= y;
if (x + y >= 4) break;
x = x - y + x0;
y = z + z + y0;
}
g->k[i] = k;
}
void kt_for(int n_threads, int n_items, void (*func)(void*,int,int), void *data);
int main(int argc, char *argv[])
{
int i, tmp, tot, type = 0, n_threads = 2;
global_t global = { 10240*100, 800, 600, -2., -1.2, -1.2, 1.2, 0 };
// global_t global = { 10240*1, 8, 6, -2., -1.2, -1.2, 1.2, 0 };
if (argc > 1) {
type = argv[1][0] == 'o'? 2 : argv[1][0] == 'c'? 3 : argv[1][0] == 'n'? 1 : 0;
if (argv[1][0] >= '0' && argv[1][0] <= '9')
n_threads = atoi(argv[1]);
} else {
fprintf(stderr, "Usage: ./a.out [openmp | cilk | #threads]\n");
}
tot = global.w * global.h;
global.k = calloc(tot, sizeof(int));
for (i = 0; i < tot; ++i) global.k[i] = -1;
if (type == 0) {
kt_for(n_threads, tot, compute, &global);
} else if (type == 2) {
#pragma omp parallel for
for (i = 0; i < tot; ++i)
compute(&global, i, 0);
} else if (type == 3) {
#if HAVE_CILK
cilk_for (i = 0; i < tot; ++i)
compute(&global, i, 0);
#endif
}
for (i = tmp = 0; i < tot; ++i) tmp += (global.k[i] < 0);
free(global.k);
assert(tmp == 0);
return 0;
}
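The loop in compute() above is the standard Mandelbrot escape-time iteration z ← z² + c, written to save a multiply: it caches x·y before squaring, so x_next = x² − y² + x0 and y_next = 2xy + y0. A plain restatement of the same recurrence (escape_iters is a hypothetical helper, not part of the test above):

```cpp
#include <cassert>

// Count iterations until |z|^2 >= 4 for c = (x0, y0), up to max_iter.
// Uses the same algebra as compute(): square parts once, reuse x*y.
static int escape_iters(double x0, double y0, int max_iter)
{
    double x = x0, y = y0;
    for (int k = 0; k < max_iter; ++k) {
        double xy = x * y;                 // cached before x and y are squared
        double xx = x * x, yy = y * y;
        if (xx + yy >= 4.0) return k;      // |z|^2 >= 4: the orbit escapes
        x = xx - yy + x0;                  // Re(z^2 + c)
        y = xy + xy + y0;                  // Im(z^2 + c) = 2xy + y0
    }
    return max_iter;                       // treated as bounded
}
```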
@@ -0,0 +1,80 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n);
void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps);
typedef struct {
FILE *fp;
int max_lines, buf_size, n_threads;
char *buf;
} pipeline_t;
typedef struct {
int n_lines;
char **lines;
} step_t;
static void worker_for(void *_data, long i, int tid) // kt_for() callback
{
step_t *step = (step_t*)_data;
char *s = step->lines[i];
int t, l, j;
l = strlen(s) - 1;
assert(s[l] == '\n'); // not supporting long lines
for (j = 0; j < l>>1; ++j)
t = s[j], s[j] = s[l - 1 - j], s[l - 1 - j] = t;
}
static void *worker_pipeline(void *shared, int step, void *in) // kt_pipeline() callback
{
pipeline_t *p = (pipeline_t*)shared;
if (step == 0) { // step 0: read lines into the buffer
step_t *s;
s = calloc(1, sizeof(step_t));
s->lines = calloc(p->max_lines, sizeof(char*));
while (fgets(p->buf, p->buf_size, p->fp) != 0) {
s->lines[s->n_lines] = strdup(p->buf);
if (++s->n_lines >= p->max_lines)
break;
}
if (s->n_lines) return s;
} else if (step == 1) { // step 1: reverse lines
kt_for(p->n_threads, worker_for, in, ((step_t*)in)->n_lines);
return in;
} else if (step == 2) { // step 2: write the buffer to output
step_t *s = (step_t*)in;
while (s->n_lines > 0) {
fputs(s->lines[--s->n_lines], stdout);
free(s->lines[s->n_lines]);
}
free(s->lines); free(s);
}
return 0;
}
int main(int argc, char *argv[])
{
pipeline_t pl;
int pl_threads;
if (argc == 1) {
fprintf(stderr, "Usage: reverse <in.txt> [pipeline_threads [for_threads]]\n");
return 1;
}
pl.fp = strcmp(argv[1], "-")? fopen(argv[1], "r") : stdin;
if (pl.fp == 0) {
fprintf(stderr, "ERROR: failed to open the input file.\n");
return 1;
}
pl_threads = argc > 2? atoi(argv[2]) : 3;
pl.max_lines = 4096;
pl.buf_size = 0x10000;
pl.n_threads = argc > 3? atoi(argv[3]) : 1;
pl.buf = calloc(pl.buf_size, 1);
kt_pipeline(pl_threads, worker_pipeline, &pl, 3);
free(pl.buf);
if (pl.fp != stdin) fclose(pl.fp);
return 0;
}
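The per-line work done by worker_for() above can be isolated: the trailing '\n' stays in place and the characters before it are reversed by swapping from both ends. A standalone sketch (reverse_line is an illustrative name, not an identifier from the demo):

```cpp
#include <cassert>
#include <cstring>

// Reverse the characters of a '\n'-terminated line in place,
// leaving the newline itself where it is.
static void reverse_line(char *s)
{
    int l = (int)strlen(s) - 1;            // index of the '\n'
    for (int j = 0; j < l >> 1; ++j) {     // swap symmetric pairs
        char t = s[j];
        s[j] = s[l - 1 - j];
        s[l - 1 - j] = t;
    }
}
```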
@@ -0,0 +1,69 @@
#include <vector>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include "kvec.h"
int main()
{
int M = 10, N = 20000000, i, j;
clock_t t;
t = clock();
for (i = 0; i < M; ++i) {
int *array = (int*)malloc(N * sizeof(int));
for (j = 0; j < N; ++j) array[j] = j;
free(array);
}
printf("C array, preallocated: %.3f sec\n",
(float)(clock() - t) / CLOCKS_PER_SEC);
t = clock();
for (i = 0; i < M; ++i) {
int *array = 0, max = 0;
for (j = 0; j < N; ++j) {
if (j == max) {
max = !max? 1 : max << 1;
array = (int*)realloc(array, sizeof(int)*max);
}
array[j] = j;
}
free(array);
}
printf("C array, dynamic: %.3f sec\n",
(float)(clock() - t) / CLOCKS_PER_SEC);
t = clock();
for (i = 0; i < M; ++i) {
kvec_t(int) array;
kv_init(array);
kv_resize(int, array, N);
for (j = 0; j < N; ++j) kv_a(int, array, j) = j;
kv_destroy(array);
}
printf("C vector, dynamic(kv_a): %.3f sec\n",
(float)(clock() - t) / CLOCKS_PER_SEC);
t = clock();
for (i = 0; i < M; ++i) {
kvec_t(int) array;
kv_init(array);
for (j = 0; j < N; ++j)
kv_push(int, array, j);
kv_destroy(array);
}
printf("C vector, dynamic(kv_push): %.3f sec\n",
(float)(clock() - t) / CLOCKS_PER_SEC);
t = clock();
for (i = 0; i < M; ++i) {
std::vector<int> array;
array.resize(N); // reserve() alone leaves size()==0; indexing past size() is undefined behavior
for (j = 0; j < N; ++j) array[j] = j;
}
printf("C++ vector, preallocated: %.3f sec\n",
(float)(clock() - t) / CLOCKS_PER_SEC);
t = clock();
for (i = 0; i < M; ++i) {
std::vector<int> array;
for (j = 0; j < N; ++j) array.push_back(j);
}
printf("C++ vector, dynamic: %.3f sec\n",
(float)(clock() - t) / CLOCKS_PER_SEC);
return 0;
}
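The "C array, dynamic" benchmark above grows its buffer with `max = !max? 1 : max << 1`, i.e. capacity 1, 2, 4, 8, …, so n pushes trigger only O(log n) reallocations. A small sketch of that doubling rule (next_cap and realloc_count are illustrative helpers, not part of the benchmark):

```cpp
#include <cassert>

// Next capacity under the doubling rule used in the benchmark above.
static int next_cap(int max) { return !max ? 1 : max << 1; }

// Count how many reallocations n sequential pushes would trigger.
static int realloc_count(int n)
{
    int max = 0, cnt = 0;
    for (int j = 0; j < n; ++j) {
        if (j == max) {        // buffer full: grow, as in the benchmark loop
            max = next_cap(max);
            ++cnt;
        }
    }
    return cnt;
}
```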
@@ -0,0 +1,416 @@
/**
* MIT License
*
* Copyright (c) 2017 Thibaut Goetghebuer-Planchon <tessil@gmx.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#ifndef TSL_ROBIN_GROWTH_POLICY_H
#define TSL_ROBIN_GROWTH_POLICY_H
#include <algorithm>
#include <array>
#include <climits>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <iterator>
#include <limits>
#include <ratio>
#include <stdexcept>
// A change of the major version indicates an API and/or ABI break (change of
// in-memory layout of the data structure)
#define TSL_RH_VERSION_MAJOR 1
// A change of the minor version indicates the addition of a feature without
// impact on the API/ABI
#define TSL_RH_VERSION_MINOR 3
// A change of the patch version indicates a bugfix without additional
// functionality
#define TSL_RH_VERSION_PATCH 0
#ifdef TSL_DEBUG
#define tsl_rh_assert(expr) assert(expr)
#else
#define tsl_rh_assert(expr) (static_cast<void>(0))
#endif
/**
* If exceptions are enabled, throw the exception passed in parameter, otherwise
* call std::terminate.
*/
#if (defined(__cpp_exceptions) || defined(__EXCEPTIONS) || \
(defined(_MSC_VER) && defined(_CPPUNWIND))) && \
!defined(TSL_NO_EXCEPTIONS)
#define TSL_RH_THROW_OR_TERMINATE(ex, msg) throw ex(msg)
#else
#define TSL_RH_NO_EXCEPTIONS
#ifdef TSL_DEBUG
#include <iostream>
#define TSL_RH_THROW_OR_TERMINATE(ex, msg) \
do { \
std::cerr << msg << std::endl; \
std::terminate(); \
} while (0)
#else
#define TSL_RH_THROW_OR_TERMINATE(ex, msg) std::terminate()
#endif
#endif
#if defined(__GNUC__) || defined(__clang__)
#define TSL_RH_LIKELY(exp) (__builtin_expect(!!(exp), true))
#else
#define TSL_RH_LIKELY(exp) (exp)
#endif
#define TSL_RH_UNUSED(x) static_cast<void>(x)
namespace tsl {
namespace rh {
/**
* Grow the hash table by a factor of GrowthFactor keeping the bucket count to a
* power of two. It allows the table to use a mask operation instead of a modulo
* operation to map a hash to a bucket.
*
* GrowthFactor must be a power of two >= 2.
*/
template <std::size_t GrowthFactor>
class power_of_two_growth_policy {
public:
/**
* Called on the hash table creation and on rehash. The number of buckets for
* the table is passed in parameter. This number is a minimum, the policy may
* update this value with a higher value if needed (but not lower).
*
* If 0 is given, min_bucket_count_in_out must still be 0 after the policy
* creation and bucket_for_hash must always return 0 in this case.
*/
explicit power_of_two_growth_policy(std::size_t& min_bucket_count_in_out) {
if (min_bucket_count_in_out > max_bucket_count()) {
TSL_RH_THROW_OR_TERMINATE(std::length_error,
"The hash table exceeds its maximum size.");
}
if (min_bucket_count_in_out > 0) {
min_bucket_count_in_out =
round_up_to_power_of_two(min_bucket_count_in_out);
m_mask = min_bucket_count_in_out - 1;
} else {
m_mask = 0;
}
}
/**
* Return the bucket [0, bucket_count()) to which the hash belongs.
* If bucket_count() is 0, it must always return 0.
*/
std::size_t bucket_for_hash(std::size_t hash) const noexcept {
return hash & m_mask;
}
/**
* Return the number of buckets that should be used on next growth.
*/
std::size_t next_bucket_count() const {
if ((m_mask + 1) > max_bucket_count() / GrowthFactor) {
TSL_RH_THROW_OR_TERMINATE(std::length_error,
"The hash table exceeds its maximum size.");
}
return (m_mask + 1) * GrowthFactor;
}
/**
* Return the maximum number of buckets supported by the policy.
*/
std::size_t max_bucket_count() const {
// Largest power of two.
return (std::numeric_limits<std::size_t>::max() / 2) + 1;
}
/**
* Reset the growth policy as if it was created with a bucket count of 0.
* After a clear, the policy must always return 0 when bucket_for_hash is
* called.
*/
void clear() noexcept { m_mask = 0; }
private:
static std::size_t round_up_to_power_of_two(std::size_t value) {
if (is_power_of_two(value)) {
return value;
}
if (value == 0) {
return 1;
}
--value;
for (std::size_t i = 1; i < sizeof(std::size_t) * CHAR_BIT; i *= 2) {
value |= value >> i;
}
return value + 1;
}
static constexpr bool is_power_of_two(std::size_t value) {
return value != 0 && (value & (value - 1)) == 0;
}
protected:
static_assert(is_power_of_two(GrowthFactor) && GrowthFactor >= 2,
"GrowthFactor must be a power of two >= 2.");
std::size_t m_mask;
};
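The two ideas behind power_of_two_growth_policy above can be demonstrated standalone: rounding a requested bucket count up to a power of two by bit smearing, and mapping a hash with a mask, which agrees with the modulo exactly when the bucket count is a power of two. This sketch uses illustrative names (round_up_pow2, bucket_masked), not the class's own members:

```cpp
#include <cassert>
#include <climits>
#include <cstddef>

// Round `v` up to the next power of two by smearing the highest set bit
// down through all lower positions, then adding one.
static std::size_t round_up_pow2(std::size_t v)
{
    if (v == 0) return 1;
    --v;                                       // handle exact powers of two
    for (std::size_t i = 1; i < sizeof(std::size_t) * CHAR_BIT; i *= 2)
        v |= v >> i;                           // smear high bit downward
    return v + 1;
}

// Mask-based bucket mapping; valid only when n_buckets is a power of two,
// in which case it equals hash % n_buckets.
static std::size_t bucket_masked(std::size_t hash, std::size_t n_buckets)
{
    return hash & (n_buckets - 1);
}
```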
/**
* Grow the hash table by GrowthFactor::num / GrowthFactor::den and use a modulo
* to map a hash to a bucket. Slower but it can be useful if you want a slower
* growth.
*/
template <class GrowthFactor = std::ratio<3, 2>>
class mod_growth_policy {
public:
explicit mod_growth_policy(std::size_t& min_bucket_count_in_out) {
if (min_bucket_count_in_out > max_bucket_count()) {
TSL_RH_THROW_OR_TERMINATE(std::length_error,
"The hash table exceeds its maximum size.");
}
if (min_bucket_count_in_out > 0) {
m_mod = min_bucket_count_in_out;
} else {
m_mod = 1;
}
}
std::size_t bucket_for_hash(std::size_t hash) const noexcept {
return hash % m_mod;
}
std::size_t next_bucket_count() const {
if (m_mod == max_bucket_count()) {
TSL_RH_THROW_OR_TERMINATE(std::length_error,
"The hash table exceeds its maximum size.");
}
const double next_bucket_count =
std::ceil(double(m_mod) * REHASH_SIZE_MULTIPLICATION_FACTOR);
if (!std::isnormal(next_bucket_count)) {
TSL_RH_THROW_OR_TERMINATE(std::length_error,
"The hash table exceeds its maximum size.");
}
if (next_bucket_count > double(max_bucket_count())) {
return max_bucket_count();
} else {
return std::size_t(next_bucket_count);
}
}
std::size_t max_bucket_count() const { return MAX_BUCKET_COUNT; }
void clear() noexcept { m_mod = 1; }
private:
static constexpr double REHASH_SIZE_MULTIPLICATION_FACTOR =
1.0 * GrowthFactor::num / GrowthFactor::den;
static const std::size_t MAX_BUCKET_COUNT =
std::size_t(double(std::numeric_limits<std::size_t>::max() /
REHASH_SIZE_MULTIPLICATION_FACTOR));
static_assert(REHASH_SIZE_MULTIPLICATION_FACTOR >= 1.1,
"Growth factor should be >= 1.1.");
std::size_t m_mod;
};
namespace detail {
#if SIZE_MAX >= ULLONG_MAX
#define TSL_RH_NB_PRIMES 51
#elif SIZE_MAX >= ULONG_MAX
#define TSL_RH_NB_PRIMES 40
#else
#define TSL_RH_NB_PRIMES 23
#endif
static constexpr const std::array<std::size_t, TSL_RH_NB_PRIMES> PRIMES = {{
1u,
5u,
17u,
29u,
37u,
53u,
67u,
79u,
97u,
131u,
193u,
257u,
389u,
521u,
769u,
1031u,
1543u,
2053u,
3079u,
6151u,
12289u,
24593u,
49157u,
#if SIZE_MAX >= ULONG_MAX
98317ul,
196613ul,
393241ul,
786433ul,
1572869ul,
3145739ul,
6291469ul,
12582917ul,
25165843ul,
50331653ul,
100663319ul,
201326611ul,
402653189ul,
805306457ul,
1610612741ul,
3221225473ul,
4294967291ul,
#endif
#if SIZE_MAX >= ULLONG_MAX
6442450939ull,
12884901893ull,
25769803751ull,
51539607551ull,
103079215111ull,
206158430209ull,
412316860441ull,
824633720831ull,
1649267441651ull,
3298534883309ull,
6597069766657ull,
#endif
}};
template <unsigned int IPrime>
static constexpr std::size_t mod(std::size_t hash) {
return hash % PRIMES[IPrime];
}
// MOD_PRIME[iprime](hash) returns hash % PRIMES[iprime]. This table allows for
// faster modulo as the compiler can optimize the modulo code better with a
// constant known at the compilation.
static constexpr const std::array<std::size_t (*)(std::size_t),
TSL_RH_NB_PRIMES>
MOD_PRIME = {{
&mod<0>, &mod<1>, &mod<2>, &mod<3>, &mod<4>, &mod<5>,
&mod<6>, &mod<7>, &mod<8>, &mod<9>, &mod<10>, &mod<11>,
&mod<12>, &mod<13>, &mod<14>, &mod<15>, &mod<16>, &mod<17>,
&mod<18>, &mod<19>, &mod<20>, &mod<21>, &mod<22>,
#if SIZE_MAX >= ULONG_MAX
&mod<23>, &mod<24>, &mod<25>, &mod<26>, &mod<27>, &mod<28>,
&mod<29>, &mod<30>, &mod<31>, &mod<32>, &mod<33>, &mod<34>,
&mod<35>, &mod<36>, &mod<37>, &mod<38>, &mod<39>,
#endif
#if SIZE_MAX >= ULLONG_MAX
&mod<40>, &mod<41>, &mod<42>, &mod<43>, &mod<44>, &mod<45>,
&mod<46>, &mod<47>, &mod<48>, &mod<49>, &mod<50>,
#endif
}};
} // namespace detail
/**
* Grow the hash table by using prime numbers as bucket count. Slower than
* tsl::rh::power_of_two_growth_policy in general but will probably distribute
* the values around better in the buckets with a poor hash function.
*
* To allow the compiler to optimize the modulo operation, a lookup table is
 * used with constant prime numbers.
*
* With a switch the code would look like:
* \code
* switch(iprime) { // iprime is the current prime of the hash table
* case 0: hash % 5ul;
* break;
* case 1: hash % 17ul;
* break;
* case 2: hash % 29ul;
* break;
* ...
* }
* \endcode
*
 * Due to the constant in the modulo, the compiler is able to optimize the
 * operation into a series of multiplications, subtractions and shifts.
 *
 * The 'hash % 5' could become something like 'hash - ((hash * 0xCCCCCCCD)
 * >> 34) * 5' in a 64-bit environment.
*/
class prime_growth_policy {
public:
explicit prime_growth_policy(std::size_t& min_bucket_count_in_out) {
auto it_prime = std::lower_bound(
detail::PRIMES.begin(), detail::PRIMES.end(), min_bucket_count_in_out);
if (it_prime == detail::PRIMES.end()) {
TSL_RH_THROW_OR_TERMINATE(std::length_error,
"The hash table exceeds its maximum size.");
}
m_iprime = static_cast<unsigned int>(
std::distance(detail::PRIMES.begin(), it_prime));
if (min_bucket_count_in_out > 0) {
min_bucket_count_in_out = *it_prime;
} else {
min_bucket_count_in_out = 0;
}
}
std::size_t bucket_for_hash(std::size_t hash) const noexcept {
return detail::MOD_PRIME[m_iprime](hash);
}
std::size_t next_bucket_count() const {
if (m_iprime + 1 >= detail::PRIMES.size()) {
TSL_RH_THROW_OR_TERMINATE(std::length_error,
"The hash table exceeds its maximum size.");
}
return detail::PRIMES[m_iprime + 1];
}
std::size_t max_bucket_count() const { return detail::PRIMES.back(); }
void clear() noexcept { m_iprime = 0; }
private:
unsigned int m_iprime;
static_assert(std::numeric_limits<decltype(m_iprime)>::max() >=
detail::PRIMES.size(),
"The type of m_iprime is not big enough.");
};
} // namespace rh
} // namespace tsl
#endif

File diff suppressed because it is too large


@ -0,0 +1,815 @@
/**
* MIT License
*
* Copyright (c) 2017 Thibaut Goetghebuer-Planchon <tessil@gmx.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#ifndef TSL_ROBIN_MAP_H
#define TSL_ROBIN_MAP_H
#include <cstddef>
#include <functional>
#include <initializer_list>
#include <memory>
#include <type_traits>
#include <utility>
#include "robin_hash.h"
namespace tsl {
/**
* Implementation of a hash map using open-addressing and the robin hood hashing
* algorithm with backward shift deletion.
*
* For operations modifying the hash map (insert, erase, rehash, ...), the
* strong exception guarantee is only guaranteed when the expression
* `std::is_nothrow_swappable<std::pair<Key, T>>::value &&
* std::is_nothrow_move_constructible<std::pair<Key, T>>::value` is true,
* otherwise if an exception is thrown during the swap or the move, the hash map
 * may end up in an undefined state. Per the standard, a `Key` or `T` with a
* noexcept copy constructor and no move constructor also satisfies the
* `std::is_nothrow_move_constructible<std::pair<Key, T>>::value` criterion (and
* will thus guarantee the strong exception for the map).
*
* When `StoreHash` is true, 32 bits of the hash are stored alongside the
* values. It can improve the performance during lookups if the `KeyEqual`
* function takes time (if it engenders a cache-miss for example) as we then
* compare the stored hashes before comparing the keys. When
* `tsl::rh::power_of_two_growth_policy` is used as `GrowthPolicy`, it may also
 * speed-up the rehash process as we can avoid recalculating the hash. When it
* is detected that storing the hash will not incur any memory penalty due to
* alignment (i.e. `sizeof(tsl::detail_robin_hash::bucket_entry<ValueType,
* true>) == sizeof(tsl::detail_robin_hash::bucket_entry<ValueType, false>)`)
* and `tsl::rh::power_of_two_growth_policy` is used, the hash will be stored
* even if `StoreHash` is false so that we can speed-up the rehash (but it will
* not be used on lookups unless `StoreHash` is true).
*
* `GrowthPolicy` defines how the map grows and consequently how a hash value is
* mapped to a bucket. By default the map uses
* `tsl::rh::power_of_two_growth_policy`. This policy keeps the number of
* buckets to a power of two and uses a mask to map the hash to a bucket instead
* of the slow modulo. Other growth policies are available and you may define
* your own growth policy, check `tsl::rh::power_of_two_growth_policy` for the
* interface.
*
* `std::pair<Key, T>` must be swappable.
*
* `Key` and `T` must be copy and/or move constructible.
*
* If the destructor of `Key` or `T` throws an exception, the behaviour of the
* class is undefined.
*
* Iterators invalidation:
* - clear, operator=, reserve, rehash: always invalidate the iterators.
* - insert, emplace, emplace_hint, operator[]: if there is an effective
* insert, invalidate the iterators.
* - erase: always invalidate the iterators.
*/
template <class Key, class T, class Hash = std::hash<Key>,
class KeyEqual = std::equal_to<Key>,
class Allocator = std::allocator<std::pair<Key, T>>,
bool StoreHash = false,
class GrowthPolicy = tsl::rh::power_of_two_growth_policy<2>>
class robin_map {
private:
template <typename U>
using has_is_transparent = tsl::detail_robin_hash::has_is_transparent<U>;
class KeySelect {
public:
using key_type = Key;
const key_type& operator()(
const std::pair<Key, T>& key_value) const noexcept {
return key_value.first;
}
key_type& operator()(std::pair<Key, T>& key_value) noexcept {
return key_value.first;
}
};
class ValueSelect {
public:
using value_type = T;
const value_type& operator()(
const std::pair<Key, T>& key_value) const noexcept {
return key_value.second;
}
value_type& operator()(std::pair<Key, T>& key_value) noexcept {
return key_value.second;
}
};
using ht = detail_robin_hash::robin_hash<std::pair<Key, T>, KeySelect,
ValueSelect, Hash, KeyEqual,
Allocator, StoreHash, GrowthPolicy>;
public:
using key_type = typename ht::key_type;
using mapped_type = T;
using value_type = typename ht::value_type;
using size_type = typename ht::size_type;
using difference_type = typename ht::difference_type;
using hasher = typename ht::hasher;
using key_equal = typename ht::key_equal;
using allocator_type = typename ht::allocator_type;
using reference = typename ht::reference;
using const_reference = typename ht::const_reference;
using pointer = typename ht::pointer;
using const_pointer = typename ht::const_pointer;
using iterator = typename ht::iterator;
using const_iterator = typename ht::const_iterator;
public:
/*
* Constructors
*/
robin_map() : robin_map(ht::DEFAULT_INIT_BUCKETS_SIZE) {}
explicit robin_map(size_type bucket_count, const Hash& hash = Hash(),
const KeyEqual& equal = KeyEqual(),
const Allocator& alloc = Allocator())
: m_ht(bucket_count, hash, equal, alloc) {}
robin_map(size_type bucket_count, const Allocator& alloc)
: robin_map(bucket_count, Hash(), KeyEqual(), alloc) {}
robin_map(size_type bucket_count, const Hash& hash, const Allocator& alloc)
: robin_map(bucket_count, hash, KeyEqual(), alloc) {}
explicit robin_map(const Allocator& alloc)
: robin_map(ht::DEFAULT_INIT_BUCKETS_SIZE, alloc) {}
template <class InputIt>
robin_map(InputIt first, InputIt last,
size_type bucket_count = ht::DEFAULT_INIT_BUCKETS_SIZE,
const Hash& hash = Hash(), const KeyEqual& equal = KeyEqual(),
const Allocator& alloc = Allocator())
: robin_map(bucket_count, hash, equal, alloc) {
insert(first, last);
}
template <class InputIt>
robin_map(InputIt first, InputIt last, size_type bucket_count,
const Allocator& alloc)
: robin_map(first, last, bucket_count, Hash(), KeyEqual(), alloc) {}
template <class InputIt>
robin_map(InputIt first, InputIt last, size_type bucket_count,
const Hash& hash, const Allocator& alloc)
: robin_map(first, last, bucket_count, hash, KeyEqual(), alloc) {}
robin_map(std::initializer_list<value_type> init,
size_type bucket_count = ht::DEFAULT_INIT_BUCKETS_SIZE,
const Hash& hash = Hash(), const KeyEqual& equal = KeyEqual(),
const Allocator& alloc = Allocator())
: robin_map(init.begin(), init.end(), bucket_count, hash, equal, alloc) {}
robin_map(std::initializer_list<value_type> init, size_type bucket_count,
const Allocator& alloc)
: robin_map(init.begin(), init.end(), bucket_count, Hash(), KeyEqual(),
alloc) {}
robin_map(std::initializer_list<value_type> init, size_type bucket_count,
const Hash& hash, const Allocator& alloc)
: robin_map(init.begin(), init.end(), bucket_count, hash, KeyEqual(),
alloc) {}
robin_map& operator=(std::initializer_list<value_type> ilist) {
m_ht.clear();
m_ht.reserve(ilist.size());
m_ht.insert(ilist.begin(), ilist.end());
return *this;
}
allocator_type get_allocator() const { return m_ht.get_allocator(); }
/*
* Iterators
*/
iterator begin() noexcept { return m_ht.begin(); }
const_iterator begin() const noexcept { return m_ht.begin(); }
const_iterator cbegin() const noexcept { return m_ht.cbegin(); }
iterator end() noexcept { return m_ht.end(); }
const_iterator end() const noexcept { return m_ht.end(); }
const_iterator cend() const noexcept { return m_ht.cend(); }
/*
* Capacity
*/
bool empty() const noexcept { return m_ht.empty(); }
size_type size() const noexcept { return m_ht.size(); }
size_type max_size() const noexcept { return m_ht.max_size(); }
/*
* Modifiers
*/
void clear() noexcept { m_ht.clear(); }
std::pair<iterator, bool> insert(const value_type& value) {
return m_ht.insert(value);
}
template <class P, typename std::enable_if<std::is_constructible<
value_type, P&&>::value>::type* = nullptr>
std::pair<iterator, bool> insert(P&& value) {
return m_ht.emplace(std::forward<P>(value));
}
std::pair<iterator, bool> insert(value_type&& value) {
return m_ht.insert(std::move(value));
}
iterator insert(const_iterator hint, const value_type& value) {
return m_ht.insert_hint(hint, value);
}
template <class P, typename std::enable_if<std::is_constructible<
value_type, P&&>::value>::type* = nullptr>
iterator insert(const_iterator hint, P&& value) {
return m_ht.emplace_hint(hint, std::forward<P>(value));
}
iterator insert(const_iterator hint, value_type&& value) {
return m_ht.insert_hint(hint, std::move(value));
}
template <class InputIt>
void insert(InputIt first, InputIt last) {
m_ht.insert(first, last);
}
void insert(std::initializer_list<value_type> ilist) {
m_ht.insert(ilist.begin(), ilist.end());
}
template <class M>
std::pair<iterator, bool> insert_or_assign(const key_type& k, M&& obj) {
return m_ht.insert_or_assign(k, std::forward<M>(obj));
}
template <class M>
std::pair<iterator, bool> insert_or_assign(key_type&& k, M&& obj) {
return m_ht.insert_or_assign(std::move(k), std::forward<M>(obj));
}
template <class M>
iterator insert_or_assign(const_iterator hint, const key_type& k, M&& obj) {
return m_ht.insert_or_assign(hint, k, std::forward<M>(obj));
}
template <class M>
iterator insert_or_assign(const_iterator hint, key_type&& k, M&& obj) {
return m_ht.insert_or_assign(hint, std::move(k), std::forward<M>(obj));
}
/**
* Due to the way elements are stored, emplace will need to move or copy the
* key-value once. The method is equivalent to
* insert(value_type(std::forward<Args>(args)...));
*
* Mainly here for compatibility with the std::unordered_map interface.
*/
template <class... Args>
std::pair<iterator, bool> emplace(Args&&... args) {
return m_ht.emplace(std::forward<Args>(args)...);
}
/**
* Due to the way elements are stored, emplace_hint will need to move or copy
* the key-value once. The method is equivalent to insert(hint,
* value_type(std::forward<Args>(args)...));
*
* Mainly here for compatibility with the std::unordered_map interface.
*/
template <class... Args>
iterator emplace_hint(const_iterator hint, Args&&... args) {
return m_ht.emplace_hint(hint, std::forward<Args>(args)...);
}
template <class... Args>
std::pair<iterator, bool> try_emplace(const key_type& k, Args&&... args) {
return m_ht.try_emplace(k, std::forward<Args>(args)...);
}
template <class... Args>
std::pair<iterator, bool> try_emplace(key_type&& k, Args&&... args) {
return m_ht.try_emplace(std::move(k), std::forward<Args>(args)...);
}
template <class... Args>
iterator try_emplace(const_iterator hint, const key_type& k, Args&&... args) {
return m_ht.try_emplace_hint(hint, k, std::forward<Args>(args)...);
}
template <class... Args>
iterator try_emplace(const_iterator hint, key_type&& k, Args&&... args) {
return m_ht.try_emplace_hint(hint, std::move(k),
std::forward<Args>(args)...);
}
iterator erase(iterator pos) { return m_ht.erase(pos); }
iterator erase(const_iterator pos) { return m_ht.erase(pos); }
iterator erase(const_iterator first, const_iterator last) {
return m_ht.erase(first, last);
}
size_type erase(const key_type& key) { return m_ht.erase(key); }
/**
* Erase the element at position 'pos'. In contrast to the regular erase()
* function, erase_fast() does not return an iterator. This allows it to be
* faster especially in hash tables with a low load factor, where finding the
* next nonempty bucket would be costly.
*/
void erase_fast(iterator pos) { return m_ht.erase_fast(pos); }
/**
* Use the hash value 'precalculated_hash' instead of hashing the key. The
* hash value should be the same as hash_function()(key). Useful to speed-up
* the lookup to the value if you already have the hash.
*/
size_type erase(const key_type& key, std::size_t precalculated_hash) {
return m_ht.erase(key, precalculated_hash);
}
/**
* This overload only participates in the overload resolution if the typedef
* KeyEqual::is_transparent exists. If so, K must be hashable and comparable
* to Key.
*/
template <
class K, class KE = KeyEqual,
typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
size_type erase(const K& key) {
return m_ht.erase(key);
}
/**
* @copydoc erase(const K& key)
*
* Use the hash value 'precalculated_hash' instead of hashing the key. The
* hash value should be the same as hash_function()(key). Useful to speed-up
* the lookup to the value if you already have the hash.
*/
template <
class K, class KE = KeyEqual,
typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
size_type erase(const K& key, std::size_t precalculated_hash) {
return m_ht.erase(key, precalculated_hash);
}
void swap(robin_map& other) { other.m_ht.swap(m_ht); }
/*
* Lookup
*/
T& at(const Key& key) { return m_ht.at(key); }
/**
* Use the hash value 'precalculated_hash' instead of hashing the key. The
* hash value should be the same as hash_function()(key). Useful to speed-up
* the lookup if you already have the hash.
*/
T& at(const Key& key, std::size_t precalculated_hash) {
return m_ht.at(key, precalculated_hash);
}
const T& at(const Key& key) const { return m_ht.at(key); }
/**
* @copydoc at(const Key& key, std::size_t precalculated_hash)
*/
const T& at(const Key& key, std::size_t precalculated_hash) const {
return m_ht.at(key, precalculated_hash);
}
/**
* This overload only participates in the overload resolution if the typedef
* KeyEqual::is_transparent exists. If so, K must be hashable and comparable
* to Key.
*/
template <
class K, class KE = KeyEqual,
typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
T& at(const K& key) {
return m_ht.at(key);
}
/**
* @copydoc at(const K& key)
*
* Use the hash value 'precalculated_hash' instead of hashing the key. The
* hash value should be the same as hash_function()(key). Useful to speed-up
* the lookup if you already have the hash.
*/
template <
class K, class KE = KeyEqual,
typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
T& at(const K& key, std::size_t precalculated_hash) {
return m_ht.at(key, precalculated_hash);
}
/**
* @copydoc at(const K& key)
*/
template <
class K, class KE = KeyEqual,
typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
const T& at(const K& key) const {
return m_ht.at(key);
}
/**
* @copydoc at(const K& key, std::size_t precalculated_hash)
*/
template <
class K, class KE = KeyEqual,
typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
const T& at(const K& key, std::size_t precalculated_hash) const {
return m_ht.at(key, precalculated_hash);
}
T& operator[](const Key& key) { return m_ht[key]; }
T& operator[](Key&& key) { return m_ht[std::move(key)]; }
size_type count(const Key& key) const { return m_ht.count(key); }
/**
* Use the hash value 'precalculated_hash' instead of hashing the key. The
* hash value should be the same as hash_function()(key). Useful to speed-up
* the lookup if you already have the hash.
*/
size_type count(const Key& key, std::size_t precalculated_hash) const {
return m_ht.count(key, precalculated_hash);
}
/**
* This overload only participates in the overload resolution if the typedef
* KeyEqual::is_transparent exists. If so, K must be hashable and comparable
* to Key.
*/
template <
class K, class KE = KeyEqual,
typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
size_type count(const K& key) const {
return m_ht.count(key);
}
/**
* @copydoc count(const K& key) const
*
* Use the hash value 'precalculated_hash' instead of hashing the key. The
* hash value should be the same as hash_function()(key). Useful to speed-up
* the lookup if you already have the hash.
*/
template <
class K, class KE = KeyEqual,
typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
size_type count(const K& key, std::size_t precalculated_hash) const {
return m_ht.count(key, precalculated_hash);
}
iterator find(const Key& key) { return m_ht.find(key); }
/**
* Use the hash value 'precalculated_hash' instead of hashing the key. The
* hash value should be the same as hash_function()(key). Useful to speed-up
* the lookup if you already have the hash.
*/
iterator find(const Key& key, std::size_t precalculated_hash) {
return m_ht.find(key, precalculated_hash);
}
const_iterator find(const Key& key) const { return m_ht.find(key); }
/**
* @copydoc find(const Key& key, std::size_t precalculated_hash)
*/
const_iterator find(const Key& key, std::size_t precalculated_hash) const {
return m_ht.find(key, precalculated_hash);
}
/**
* This overload only participates in the overload resolution if the typedef
* KeyEqual::is_transparent exists. If so, K must be hashable and comparable
* to Key.
*/
template <
class K, class KE = KeyEqual,
typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
iterator find(const K& key) {
return m_ht.find(key);
}
/**
* @copydoc find(const K& key)
*
* Use the hash value 'precalculated_hash' instead of hashing the key. The
* hash value should be the same as hash_function()(key). Useful to speed-up
* the lookup if you already have the hash.
*/
template <
class K, class KE = KeyEqual,
typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
iterator find(const K& key, std::size_t precalculated_hash) {
return m_ht.find(key, precalculated_hash);
}
/**
* @copydoc find(const K& key)
*/
template <
class K, class KE = KeyEqual,
typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
const_iterator find(const K& key) const {
return m_ht.find(key);
}
/**
* @copydoc find(const K& key)
*
* Use the hash value 'precalculated_hash' instead of hashing the key. The
* hash value should be the same as hash_function()(key). Useful to speed-up
* the lookup if you already have the hash.
*/
template <
class K, class KE = KeyEqual,
typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
const_iterator find(const K& key, std::size_t precalculated_hash) const {
return m_ht.find(key, precalculated_hash);
}
bool contains(const Key& key) const { return m_ht.contains(key); }
/**
* Use the hash value 'precalculated_hash' instead of hashing the key. The
* hash value should be the same as hash_function()(key). Useful to speed-up
* the lookup if you already have the hash.
*/
bool contains(const Key& key, std::size_t precalculated_hash) const {
return m_ht.contains(key, precalculated_hash);
}
/**
* This overload only participates in the overload resolution if the typedef
* KeyEqual::is_transparent exists. If so, K must be hashable and comparable
* to Key.
*/
template <
class K, class KE = KeyEqual,
typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
bool contains(const K& key) const {
return m_ht.contains(key);
}
/**
* @copydoc contains(const K& key) const
*
* Use the hash value 'precalculated_hash' instead of hashing the key. The
* hash value should be the same as hash_function()(key). Useful to speed-up
* the lookup if you already have the hash.
*/
template <
class K, class KE = KeyEqual,
typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
bool contains(const K& key, std::size_t precalculated_hash) const {
return m_ht.contains(key, precalculated_hash);
}
std::pair<iterator, iterator> equal_range(const Key& key) {
return m_ht.equal_range(key);
}
/**
* Use the hash value 'precalculated_hash' instead of hashing the key. The
* hash value should be the same as hash_function()(key). Useful to speed-up
* the lookup if you already have the hash.
*/
std::pair<iterator, iterator> equal_range(const Key& key,
std::size_t precalculated_hash) {
return m_ht.equal_range(key, precalculated_hash);
}
std::pair<const_iterator, const_iterator> equal_range(const Key& key) const {
return m_ht.equal_range(key);
}
/**
* @copydoc equal_range(const Key& key, std::size_t precalculated_hash)
*/
std::pair<const_iterator, const_iterator> equal_range(
const Key& key, std::size_t precalculated_hash) const {
return m_ht.equal_range(key, precalculated_hash);
}
/**
* This overload only participates in the overload resolution if the typedef
* KeyEqual::is_transparent exists. If so, K must be hashable and comparable
* to Key.
*/
template <
class K, class KE = KeyEqual,
typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
std::pair<iterator, iterator> equal_range(const K& key) {
return m_ht.equal_range(key);
}
/**
* @copydoc equal_range(const K& key)
*
* Use the hash value 'precalculated_hash' instead of hashing the key. The
* hash value should be the same as hash_function()(key). Useful to speed-up
* the lookup if you already have the hash.
*/
template <
class K, class KE = KeyEqual,
typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
std::pair<iterator, iterator> equal_range(const K& key,
std::size_t precalculated_hash) {
return m_ht.equal_range(key, precalculated_hash);
}
/**
* @copydoc equal_range(const K& key)
*/
template <
class K, class KE = KeyEqual,
typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
std::pair<const_iterator, const_iterator> equal_range(const K& key) const {
return m_ht.equal_range(key);
}
/**
* @copydoc equal_range(const K& key, std::size_t precalculated_hash)
*/
template <
class K, class KE = KeyEqual,
typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
std::pair<const_iterator, const_iterator> equal_range(
const K& key, std::size_t precalculated_hash) const {
return m_ht.equal_range(key, precalculated_hash);
}
/*
* Bucket interface
*/
size_type bucket_count() const { return m_ht.bucket_count(); }
size_type max_bucket_count() const { return m_ht.max_bucket_count(); }
/*
* Hash policy
*/
float load_factor() const { return m_ht.load_factor(); }
float min_load_factor() const { return m_ht.min_load_factor(); }
float max_load_factor() const { return m_ht.max_load_factor(); }
/**
* Set the `min_load_factor` to `ml`. When the `load_factor` of the map goes
* below `min_load_factor` after some erase operations, the map will be
* shrunk when an insertion occurs. The erase method itself never shrinks
* the map.
*
* The default value of `min_load_factor` is 0.0f, the map never shrinks by
* default.
*/
void min_load_factor(float ml) { m_ht.min_load_factor(ml); }
void max_load_factor(float ml) { m_ht.max_load_factor(ml); }
void rehash(size_type count_) { m_ht.rehash(count_); }
void reserve(size_type count_) { m_ht.reserve(count_); }
/*
* Observers
*/
hasher hash_function() const { return m_ht.hash_function(); }
key_equal key_eq() const { return m_ht.key_eq(); }
/*
* Other
*/
/**
* Convert a const_iterator to an iterator.
*/
iterator mutable_iterator(const_iterator pos) {
return m_ht.mutable_iterator(pos);
}
/**
* Serialize the map through the `serializer` parameter.
*
* The `serializer` parameter must be a function object that supports the
* following call:
* - `template<typename U> void operator()(const U& value);` where the types
* `std::int16_t`, `std::uint32_t`, `std::uint64_t`, `float` and
* `std::pair<Key, T>` must be supported for U.
*
* The implementation leaves binary compatibility (endianness, IEEE 754 for
* floats, ...) of the types it serializes in the hands of the `Serializer`
* function object if compatibility is required.
*/
template <class Serializer>
void serialize(Serializer& serializer) const {
m_ht.serialize(serializer);
}
/**
* Deserialize a previously serialized map through the `deserializer`
* parameter.
*
* The `deserializer` parameter must be a function object that supports the
* following call:
* - `template<typename U> U operator()();` where the types `std::int16_t`,
* `std::uint32_t`, `std::uint64_t`, `float` and `std::pair<Key, T>` must be
* supported for U.
*
* If the deserialized hash map type is hash compatible with the serialized
* map, the deserialization process can be sped up by setting
* `hash_compatible` to true. To be hash compatible, the Hash, KeyEqual and
 * GrowthPolicy must behave the same way as the ones used on the serialized
* map and the StoreHash must have the same value. The `std::size_t` must also
* be of the same size as the one on the platform used to serialize the map.
* If these criteria are not met, the behaviour is undefined with
 * `hash_compatible` set to true.
*
* The behaviour is undefined if the type `Key` and `T` of the `robin_map` are
* not the same as the types used during serialization.
*
* The implementation leaves binary compatibility (endianness, IEEE 754 for
* floats, size of int, ...) of the types it deserializes in the hands of the
* `Deserializer` function object if compatibility is required.
*/
template <class Deserializer>
static robin_map deserialize(Deserializer& deserializer,
bool hash_compatible = false) {
robin_map map(0);
map.m_ht.deserialize(deserializer, hash_compatible);
return map;
}
friend bool operator==(const robin_map& lhs, const robin_map& rhs) {
if (lhs.size() != rhs.size()) {
return false;
}
for (const auto& element_lhs : lhs) {
const auto it_element_rhs = rhs.find(element_lhs.first);
if (it_element_rhs == rhs.cend() ||
element_lhs.second != it_element_rhs->second) {
return false;
}
}
return true;
}
friend bool operator!=(const robin_map& lhs, const robin_map& rhs) {
return !operator==(lhs, rhs);
}
friend void swap(robin_map& lhs, robin_map& rhs) { lhs.swap(rhs); }
private:
ht m_ht;
};
/**
* Same as `tsl::robin_map<Key, T, Hash, KeyEqual, Allocator, StoreHash,
* tsl::rh::prime_growth_policy>`.
*/
template <class Key, class T, class Hash = std::hash<Key>,
class KeyEqual = std::equal_to<Key>,
class Allocator = std::allocator<std::pair<Key, T>>,
bool StoreHash = false>
using robin_pg_map = robin_map<Key, T, Hash, KeyEqual, Allocator, StoreHash,
tsl::rh::prime_growth_policy>;
} // end namespace tsl
#endif


@ -0,0 +1,668 @@
/**
* MIT License
*
* Copyright (c) 2017 Thibaut Goetghebuer-Planchon <tessil@gmx.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#ifndef TSL_ROBIN_SET_H
#define TSL_ROBIN_SET_H
#include <cstddef>
#include <functional>
#include <initializer_list>
#include <memory>
#include <type_traits>
#include <utility>
#include "robin_hash.h"
namespace tsl {
/**
* Implementation of a hash set using open-addressing and the robin hood hashing
* algorithm with backward shift deletion.
*
* For operations modifying the hash set (insert, erase, rehash, ...), the
* strong exception guarantee is only guaranteed when the expression
* `std::is_nothrow_swappable<Key>::value &&
* std::is_nothrow_move_constructible<Key>::value` is true, otherwise if an
 * exception is thrown during the swap or the move, the hash set may end up in
 * an undefined state. Per the standard, a `Key` with a noexcept copy constructor
* and no move constructor also satisfies the
* `std::is_nothrow_move_constructible<Key>::value` criterion (and will thus
* guarantee the strong exception for the set).
*
* When `StoreHash` is true, 32 bits of the hash are stored alongside the
* values. It can improve the performance during lookups if the `KeyEqual`
* function takes time (or engenders a cache-miss for example) as we then
* compare the stored hashes before comparing the keys. When
* `tsl::rh::power_of_two_growth_policy` is used as `GrowthPolicy`, it may also
 * speed-up the rehash process as we can avoid recalculating the hash. When it
* is detected that storing the hash will not incur any memory penalty due to
* alignment (i.e. `sizeof(tsl::detail_robin_hash::bucket_entry<ValueType,
* true>) == sizeof(tsl::detail_robin_hash::bucket_entry<ValueType, false>)`)
* and `tsl::rh::power_of_two_growth_policy` is used, the hash will be stored
* even if `StoreHash` is false so that we can speed-up the rehash (but it will
* not be used on lookups unless `StoreHash` is true).
*
* `GrowthPolicy` defines how the set grows and consequently how a hash value is
* mapped to a bucket. By default the set uses
* `tsl::rh::power_of_two_growth_policy`. This policy keeps the number of
 * buckets to a power of two and uses a mask to map the hash to a bucket instead
* of the slow modulo. Other growth policies are available and you may define
* your own growth policy, check `tsl::rh::power_of_two_growth_policy` for the
* interface.
*
* `Key` must be swappable.
*
* `Key` must be copy and/or move constructible.
*
* If the destructor of `Key` throws an exception, the behaviour of the class is
* undefined.
*
* Iterators invalidation:
* - clear, operator=, reserve, rehash: always invalidate the iterators.
* - insert, emplace, emplace_hint, operator[]: if there is an effective
* insert, invalidate the iterators.
* - erase: always invalidate the iterators.
*/
template <class Key, class Hash = std::hash<Key>,
class KeyEqual = std::equal_to<Key>,
class Allocator = std::allocator<Key>, bool StoreHash = false,
class GrowthPolicy = tsl::rh::power_of_two_growth_policy<2>>
class robin_set {
private:
template <typename U>
using has_is_transparent = tsl::detail_robin_hash::has_is_transparent<U>;
class KeySelect {
public:
using key_type = Key;
const key_type& operator()(const Key& key) const noexcept { return key; }
key_type& operator()(Key& key) noexcept { return key; }
};
using ht = detail_robin_hash::robin_hash<Key, KeySelect, void, Hash, KeyEqual,
Allocator, StoreHash, GrowthPolicy>;
public:
using key_type = typename ht::key_type;
using value_type = typename ht::value_type;
using size_type = typename ht::size_type;
using difference_type = typename ht::difference_type;
using hasher = typename ht::hasher;
using key_equal = typename ht::key_equal;
using allocator_type = typename ht::allocator_type;
using reference = typename ht::reference;
using const_reference = typename ht::const_reference;
using pointer = typename ht::pointer;
using const_pointer = typename ht::const_pointer;
using iterator = typename ht::iterator;
using const_iterator = typename ht::const_iterator;
/*
* Constructors
*/
robin_set() : robin_set(ht::DEFAULT_INIT_BUCKETS_SIZE) {}
explicit robin_set(size_type bucket_count, const Hash& hash = Hash(),
const KeyEqual& equal = KeyEqual(),
const Allocator& alloc = Allocator())
: m_ht(bucket_count, hash, equal, alloc) {}
robin_set(size_type bucket_count, const Allocator& alloc)
: robin_set(bucket_count, Hash(), KeyEqual(), alloc) {}
robin_set(size_type bucket_count, const Hash& hash, const Allocator& alloc)
: robin_set(bucket_count, hash, KeyEqual(), alloc) {}
explicit robin_set(const Allocator& alloc)
: robin_set(ht::DEFAULT_INIT_BUCKETS_SIZE, alloc) {}
template <class InputIt>
robin_set(InputIt first, InputIt last,
size_type bucket_count = ht::DEFAULT_INIT_BUCKETS_SIZE,
const Hash& hash = Hash(), const KeyEqual& equal = KeyEqual(),
const Allocator& alloc = Allocator())
: robin_set(bucket_count, hash, equal, alloc) {
insert(first, last);
}
template <class InputIt>
robin_set(InputIt first, InputIt last, size_type bucket_count,
const Allocator& alloc)
: robin_set(first, last, bucket_count, Hash(), KeyEqual(), alloc) {}
template <class InputIt>
robin_set(InputIt first, InputIt last, size_type bucket_count,
const Hash& hash, const Allocator& alloc)
: robin_set(first, last, bucket_count, hash, KeyEqual(), alloc) {}
robin_set(std::initializer_list<value_type> init,
size_type bucket_count = ht::DEFAULT_INIT_BUCKETS_SIZE,
const Hash& hash = Hash(), const KeyEqual& equal = KeyEqual(),
const Allocator& alloc = Allocator())
: robin_set(init.begin(), init.end(), bucket_count, hash, equal, alloc) {}
robin_set(std::initializer_list<value_type> init, size_type bucket_count,
const Allocator& alloc)
: robin_set(init.begin(), init.end(), bucket_count, Hash(), KeyEqual(),
alloc) {}
robin_set(std::initializer_list<value_type> init, size_type bucket_count,
const Hash& hash, const Allocator& alloc)
: robin_set(init.begin(), init.end(), bucket_count, hash, KeyEqual(),
alloc) {}
robin_set& operator=(std::initializer_list<value_type> ilist) {
m_ht.clear();
m_ht.reserve(ilist.size());
m_ht.insert(ilist.begin(), ilist.end());
return *this;
}
allocator_type get_allocator() const { return m_ht.get_allocator(); }
/*
* Iterators
*/
iterator begin() noexcept { return m_ht.begin(); }
const_iterator begin() const noexcept { return m_ht.begin(); }
const_iterator cbegin() const noexcept { return m_ht.cbegin(); }
iterator end() noexcept { return m_ht.end(); }
const_iterator end() const noexcept { return m_ht.end(); }
const_iterator cend() const noexcept { return m_ht.cend(); }
/*
* Capacity
*/
bool empty() const noexcept { return m_ht.empty(); }
size_type size() const noexcept { return m_ht.size(); }
size_type max_size() const noexcept { return m_ht.max_size(); }
/*
* Modifiers
*/
void clear() noexcept { m_ht.clear(); }
std::pair<iterator, bool> insert(const value_type& value) {
return m_ht.insert(value);
}
std::pair<iterator, bool> insert(value_type&& value) {
return m_ht.insert(std::move(value));
}
iterator insert(const_iterator hint, const value_type& value) {
return m_ht.insert_hint(hint, value);
}
iterator insert(const_iterator hint, value_type&& value) {
return m_ht.insert_hint(hint, std::move(value));
}
template <class InputIt>
void insert(InputIt first, InputIt last) {
m_ht.insert(first, last);
}
void insert(std::initializer_list<value_type> ilist) {
m_ht.insert(ilist.begin(), ilist.end());
}
/**
* Due to the way elements are stored, emplace will need to move or copy the
* key-value once. The method is equivalent to
* insert(value_type(std::forward<Args>(args)...));
*
* Mainly here for compatibility with the std::unordered_map interface.
*/
template <class... Args>
std::pair<iterator, bool> emplace(Args&&... args) {
return m_ht.emplace(std::forward<Args>(args)...);
}
/**
* Due to the way elements are stored, emplace_hint will need to move or copy
* the key-value once. The method is equivalent to insert(hint,
* value_type(std::forward<Args>(args)...));
*
* Mainly here for compatibility with the std::unordered_map interface.
*/
template <class... Args>
iterator emplace_hint(const_iterator hint, Args&&... args) {
return m_ht.emplace_hint(hint, std::forward<Args>(args)...);
}
iterator erase(iterator pos) { return m_ht.erase(pos); }
iterator erase(const_iterator pos) { return m_ht.erase(pos); }
iterator erase(const_iterator first, const_iterator last) {
return m_ht.erase(first, last);
}
size_type erase(const key_type& key) { return m_ht.erase(key); }
/**
* Erase the element at position 'pos'. In contrast to the regular erase()
* function, erase_fast() does not return an iterator. This allows it to be
* faster especially in hash sets with a low load factor, where finding the
* next nonempty bucket would be costly.
*/
void erase_fast(iterator pos) { return m_ht.erase_fast(pos); }
/**
* Use the hash value 'precalculated_hash' instead of hashing the key. The
* hash value should be the same as hash_function()(key). Useful to speed-up
* the lookup to the value if you already have the hash.
*/
size_type erase(const key_type& key, std::size_t precalculated_hash) {
return m_ht.erase(key, precalculated_hash);
}
/**
* This overload only participates in the overload resolution if the typedef
* KeyEqual::is_transparent exists. If so, K must be hashable and comparable
* to Key.
*/
template <
class K, class KE = KeyEqual,
typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
size_type erase(const K& key) {
return m_ht.erase(key);
}
/**
* @copydoc erase(const K& key)
*
* Use the hash value 'precalculated_hash' instead of hashing the key. The
* hash value should be the same as hash_function()(key). Useful to speed-up
* the lookup to the value if you already have the hash.
*/
template <
class K, class KE = KeyEqual,
typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
size_type erase(const K& key, std::size_t precalculated_hash) {
return m_ht.erase(key, precalculated_hash);
}
void swap(robin_set& other) { other.m_ht.swap(m_ht); }
/*
* Lookup
*/
size_type count(const Key& key) const { return m_ht.count(key); }
/**
* Use the hash value 'precalculated_hash' instead of hashing the key. The
* hash value should be the same as hash_function()(key). Useful to speed-up
* the lookup if you already have the hash.
*/
size_type count(const Key& key, std::size_t precalculated_hash) const {
return m_ht.count(key, precalculated_hash);
}
/**
* This overload only participates in the overload resolution if the typedef
* KeyEqual::is_transparent exists. If so, K must be hashable and comparable
* to Key.
*/
template <
class K, class KE = KeyEqual,
typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
size_type count(const K& key) const {
return m_ht.count(key);
}
/**
* @copydoc count(const K& key) const
*
* Use the hash value 'precalculated_hash' instead of hashing the key. The
* hash value should be the same as hash_function()(key). Useful to speed-up
* the lookup if you already have the hash.
*/
template <
class K, class KE = KeyEqual,
typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
size_type count(const K& key, std::size_t precalculated_hash) const {
return m_ht.count(key, precalculated_hash);
}
iterator find(const Key& key) { return m_ht.find(key); }
/**
* Use the hash value 'precalculated_hash' instead of hashing the key. The
* hash value should be the same as hash_function()(key). Useful to speed-up
* the lookup if you already have the hash.
*/
iterator find(const Key& key, std::size_t precalculated_hash) {
return m_ht.find(key, precalculated_hash);
}
const_iterator find(const Key& key) const { return m_ht.find(key); }
/**
* @copydoc find(const Key& key, std::size_t precalculated_hash)
*/
const_iterator find(const Key& key, std::size_t precalculated_hash) const {
return m_ht.find(key, precalculated_hash);
}
/**
* This overload only participates in the overload resolution if the typedef
* KeyEqual::is_transparent exists. If so, K must be hashable and comparable
* to Key.
*/
template <
class K, class KE = KeyEqual,
typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
iterator find(const K& key) {
return m_ht.find(key);
}
/**
* @copydoc find(const K& key)
*
* Use the hash value 'precalculated_hash' instead of hashing the key. The
* hash value should be the same as hash_function()(key). Useful to speed-up
* the lookup if you already have the hash.
*/
template <
class K, class KE = KeyEqual,
typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
iterator find(const K& key, std::size_t precalculated_hash) {
return m_ht.find(key, precalculated_hash);
}
/**
* @copydoc find(const K& key)
*/
template <
class K, class KE = KeyEqual,
typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
const_iterator find(const K& key) const {
return m_ht.find(key);
}
/**
* @copydoc find(const K& key)
*
* Use the hash value 'precalculated_hash' instead of hashing the key. The
* hash value should be the same as hash_function()(key). Useful to speed-up
* the lookup if you already have the hash.
*/
template <
class K, class KE = KeyEqual,
typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
const_iterator find(const K& key, std::size_t precalculated_hash) const {
return m_ht.find(key, precalculated_hash);
}
bool contains(const Key& key) const { return m_ht.contains(key); }
/**
* Use the hash value 'precalculated_hash' instead of hashing the key. The
* hash value should be the same as hash_function()(key). Useful to speed-up
* the lookup if you already have the hash.
*/
bool contains(const Key& key, std::size_t precalculated_hash) const {
return m_ht.contains(key, precalculated_hash);
}
/**
* This overload only participates in the overload resolution if the typedef
* KeyEqual::is_transparent exists. If so, K must be hashable and comparable
* to Key.
*/
template <
class K, class KE = KeyEqual,
typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
bool contains(const K& key) const {
return m_ht.contains(key);
}
/**
* @copydoc contains(const K& key) const
*
* Use the hash value 'precalculated_hash' instead of hashing the key. The
* hash value should be the same as hash_function()(key). Useful to speed-up
* the lookup if you already have the hash.
*/
template <
class K, class KE = KeyEqual,
typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
bool contains(const K& key, std::size_t precalculated_hash) const {
return m_ht.contains(key, precalculated_hash);
}
std::pair<iterator, iterator> equal_range(const Key& key) {
return m_ht.equal_range(key);
}
/**
* Use the hash value 'precalculated_hash' instead of hashing the key. The
* hash value should be the same as hash_function()(key). Useful to speed-up
* the lookup if you already have the hash.
*/
std::pair<iterator, iterator> equal_range(const Key& key,
std::size_t precalculated_hash) {
return m_ht.equal_range(key, precalculated_hash);
}
std::pair<const_iterator, const_iterator> equal_range(const Key& key) const {
return m_ht.equal_range(key);
}
/**
* @copydoc equal_range(const Key& key, std::size_t precalculated_hash)
*/
std::pair<const_iterator, const_iterator> equal_range(
const Key& key, std::size_t precalculated_hash) const {
return m_ht.equal_range(key, precalculated_hash);
}
/**
* This overload only participates in the overload resolution if the typedef
* KeyEqual::is_transparent exists. If so, K must be hashable and comparable
* to Key.
*/
template <
class K, class KE = KeyEqual,
typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
std::pair<iterator, iterator> equal_range(const K& key) {
return m_ht.equal_range(key);
}
/**
* @copydoc equal_range(const K& key)
*
* Use the hash value 'precalculated_hash' instead of hashing the key. The
* hash value should be the same as hash_function()(key). Useful to speed-up
* the lookup if you already have the hash.
*/
template <
class K, class KE = KeyEqual,
typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
std::pair<iterator, iterator> equal_range(const K& key,
std::size_t precalculated_hash) {
return m_ht.equal_range(key, precalculated_hash);
}
/**
* @copydoc equal_range(const K& key)
*/
template <
class K, class KE = KeyEqual,
typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
std::pair<const_iterator, const_iterator> equal_range(const K& key) const {
return m_ht.equal_range(key);
}
/**
* @copydoc equal_range(const K& key, std::size_t precalculated_hash)
*/
template <
class K, class KE = KeyEqual,
typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
std::pair<const_iterator, const_iterator> equal_range(
const K& key, std::size_t precalculated_hash) const {
return m_ht.equal_range(key, precalculated_hash);
}
/*
* Bucket interface
*/
size_type bucket_count() const { return m_ht.bucket_count(); }
size_type max_bucket_count() const { return m_ht.max_bucket_count(); }
/*
* Hash policy
*/
float load_factor() const { return m_ht.load_factor(); }
float min_load_factor() const { return m_ht.min_load_factor(); }
float max_load_factor() const { return m_ht.max_load_factor(); }
/**
* Set the `min_load_factor` to `ml`. When the `load_factor` of the set goes
* below `min_load_factor` after some erase operations, the set will be
* shrunk when an insertion occurs. The erase method itself never shrinks
* the set.
*
* The default value of `min_load_factor` is 0.0f, the set never shrinks by
* default.
*/
void min_load_factor(float ml) { m_ht.min_load_factor(ml); }
void max_load_factor(float ml) { m_ht.max_load_factor(ml); }
void rehash(size_type count_) { m_ht.rehash(count_); }
void reserve(size_type count_) { m_ht.reserve(count_); }
/*
* Observers
*/
hasher hash_function() const { return m_ht.hash_function(); }
key_equal key_eq() const { return m_ht.key_eq(); }
/*
* Other
*/
/**
* Convert a const_iterator to an iterator.
*/
iterator mutable_iterator(const_iterator pos) {
return m_ht.mutable_iterator(pos);
}
friend bool operator==(const robin_set& lhs, const robin_set& rhs) {
if (lhs.size() != rhs.size()) {
return false;
}
for (const auto& element_lhs : lhs) {
const auto it_element_rhs = rhs.find(element_lhs);
if (it_element_rhs == rhs.cend()) {
return false;
}
}
return true;
}
/**
* Serialize the set through the `serializer` parameter.
*
* The `serializer` parameter must be a function object that supports the
* following call:
* - `template<typename U> void operator()(const U& value);` where the types
* `std::int16_t`, `std::uint32_t`, `std::uint64_t`, `float` and `Key` must be
* supported for U.
*
* The implementation leaves binary compatibility (endianness, IEEE 754 for
* floats, ...) of the types it serializes in the hands of the `Serializer`
* function object if compatibility is required.
*/
template <class Serializer>
void serialize(Serializer& serializer) const {
m_ht.serialize(serializer);
}
/**
* Deserialize a previously serialized set through the `deserializer`
* parameter.
*
* The `deserializer` parameter must be a function object that supports the
* following call:
* - `template<typename U> U operator()();` where the types `std::int16_t`,
* `std::uint32_t`, `std::uint64_t`, `float` and `Key` must be supported for
* U.
*
* If the deserialized hash set type is hash compatible with the serialized
* set, the deserialization process can be sped up by setting
* `hash_compatible` to true. To be hash compatible, the Hash, KeyEqual and
 * GrowthPolicy must behave the same way as the ones used on the serialized
* set and the StoreHash must have the same value. The `std::size_t` must also
* be of the same size as the one on the platform used to serialize the set.
* If these criteria are not met, the behaviour is undefined with
 * `hash_compatible` set to true.
*
* The behaviour is undefined if the type `Key` of the `robin_set` is not the
* same as the type used during serialization.
*
* The implementation leaves binary compatibility (endianness, IEEE 754 for
* floats, size of int, ...) of the types it deserializes in the hands of the
* `Deserializer` function object if compatibility is required.
*/
template <class Deserializer>
static robin_set deserialize(Deserializer& deserializer,
bool hash_compatible = false) {
robin_set set(0);
set.m_ht.deserialize(deserializer, hash_compatible);
return set;
}
friend bool operator!=(const robin_set& lhs, const robin_set& rhs) {
return !operator==(lhs, rhs);
}
friend void swap(robin_set& lhs, robin_set& rhs) { lhs.swap(rhs); }
private:
ht m_ht;
};
/**
* Same as `tsl::robin_set<Key, Hash, KeyEqual, Allocator, StoreHash,
* tsl::rh::prime_growth_policy>`.
*/
template <class Key, class Hash = std::hash<Key>,
class KeyEqual = std::equal_to<Key>,
class Allocator = std::allocator<Key>, bool StoreHash = false>
using robin_pg_set = robin_set<Key, Hash, KeyEqual, Allocator, StoreHash,
tsl::rh::prime_growth_policy>;
} // end namespace tsl
#endif

ext/spdlog/async.h 100644

@@ -0,0 +1,100 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
//
// Async logging using global thread pool
// All loggers created here share same global thread pool.
// Each log message is pushed to a queue along with a shared pointer to the
// logger.
// If a logger is deleted while it still has pending messages in the queue, its
// actual destruction is deferred until all of its messages are processed by the
// thread pool.
// This is because each message in the queue holds a shared_ptr to the
// originating logger.
#include <spdlog/async_logger.h>
#include <spdlog/details/registry.h>
#include <spdlog/details/thread_pool.h>
#include <functional>
#include <memory>
#include <mutex>
namespace spdlog {
namespace details {
static const size_t default_async_q_size = 8192;
}
// async logger factory - creates async loggers backed with thread pool.
// if a global thread pool doesn't already exist, create it with default queue
// size of 8192 items and single thread.
template <async_overflow_policy OverflowPolicy = async_overflow_policy::block>
struct async_factory_impl {
template <typename Sink, typename... SinkArgs>
static std::shared_ptr<async_logger> create(std::string logger_name, SinkArgs &&...args) {
auto &registry_inst = details::registry::instance();
        // create the global thread pool if one does not already exist..
auto &mutex = registry_inst.tp_mutex();
std::lock_guard<std::recursive_mutex> tp_lock(mutex);
auto tp = registry_inst.get_tp();
if (tp == nullptr) {
tp = std::make_shared<details::thread_pool>(details::default_async_q_size, 1U);
registry_inst.set_tp(tp);
}
auto sink = std::make_shared<Sink>(std::forward<SinkArgs>(args)...);
auto new_logger = std::make_shared<async_logger>(std::move(logger_name), std::move(sink),
std::move(tp), OverflowPolicy);
registry_inst.initialize_logger(new_logger);
return new_logger;
}
};
using async_factory = async_factory_impl<async_overflow_policy::block>;
using async_factory_nonblock = async_factory_impl<async_overflow_policy::overrun_oldest>;
template <typename Sink, typename... SinkArgs>
inline std::shared_ptr<spdlog::logger> create_async(std::string logger_name,
SinkArgs &&...sink_args) {
return async_factory::create<Sink>(std::move(logger_name),
std::forward<SinkArgs>(sink_args)...);
}
template <typename Sink, typename... SinkArgs>
inline std::shared_ptr<spdlog::logger> create_async_nb(std::string logger_name,
SinkArgs &&...sink_args) {
return async_factory_nonblock::create<Sink>(std::move(logger_name),
std::forward<SinkArgs>(sink_args)...);
}
// set global thread pool.
inline void init_thread_pool(size_t q_size,
size_t thread_count,
std::function<void()> on_thread_start,
std::function<void()> on_thread_stop) {
auto tp = std::make_shared<details::thread_pool>(q_size, thread_count, on_thread_start,
on_thread_stop);
details::registry::instance().set_tp(std::move(tp));
}
inline void init_thread_pool(size_t q_size,
size_t thread_count,
std::function<void()> on_thread_start) {
init_thread_pool(q_size, thread_count, on_thread_start, [] {});
}
inline void init_thread_pool(size_t q_size, size_t thread_count) {
init_thread_pool(
q_size, thread_count, [] {}, [] {});
}
// get the global thread pool.
inline std::shared_ptr<spdlog::details::thread_pool> thread_pool() {
return details::registry::instance().get_tp();
}
} // namespace spdlog


@@ -0,0 +1,84 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
#ifndef SPDLOG_HEADER_ONLY
#include <spdlog/async_logger.h>
#endif
#include <spdlog/details/thread_pool.h>
#include <spdlog/sinks/sink.h>
#include <memory>
#include <string>
SPDLOG_INLINE spdlog::async_logger::async_logger(std::string logger_name,
sinks_init_list sinks_list,
std::weak_ptr<details::thread_pool> tp,
async_overflow_policy overflow_policy)
: async_logger(std::move(logger_name),
sinks_list.begin(),
sinks_list.end(),
std::move(tp),
overflow_policy) {}
SPDLOG_INLINE spdlog::async_logger::async_logger(std::string logger_name,
sink_ptr single_sink,
std::weak_ptr<details::thread_pool> tp,
async_overflow_policy overflow_policy)
: async_logger(
std::move(logger_name), {std::move(single_sink)}, std::move(tp), overflow_policy) {}
// send the log message to the thread pool
SPDLOG_INLINE void spdlog::async_logger::sink_it_(const details::log_msg &msg){
SPDLOG_TRY{if (auto pool_ptr = thread_pool_.lock()){
pool_ptr->post_log(shared_from_this(), msg, overflow_policy_);
}
else {
throw_spdlog_ex("async log: thread pool doesn't exist anymore");
}
}
SPDLOG_LOGGER_CATCH(msg.source)
}
// send flush request to the thread pool
SPDLOG_INLINE void spdlog::async_logger::flush_(){
SPDLOG_TRY{if (auto pool_ptr = thread_pool_.lock()){
pool_ptr->post_flush(shared_from_this(), overflow_policy_);
}
else {
throw_spdlog_ex("async flush: thread pool doesn't exist anymore");
}
}
SPDLOG_LOGGER_CATCH(source_loc())
}
//
// backend functions - called from the thread pool to do the actual job
//
SPDLOG_INLINE void spdlog::async_logger::backend_sink_it_(const details::log_msg &msg) {
for (auto &sink : sinks_) {
if (sink->should_log(msg.level)) {
SPDLOG_TRY { sink->log(msg); }
SPDLOG_LOGGER_CATCH(msg.source)
}
}
if (should_flush_(msg)) {
backend_flush_();
}
}
SPDLOG_INLINE void spdlog::async_logger::backend_flush_() {
for (auto &sink : sinks_) {
SPDLOG_TRY { sink->flush(); }
SPDLOG_LOGGER_CATCH(source_loc())
}
}
SPDLOG_INLINE std::shared_ptr<spdlog::logger> spdlog::async_logger::clone(std::string new_name) {
auto cloned = std::make_shared<spdlog::async_logger>(*this);
cloned->name_ = std::move(new_name);
return cloned;
}


@@ -0,0 +1,74 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
// Fast asynchronous logger.
// Uses a pre-allocated queue.
// Creates a single background thread to pop messages from the queue and log them.
//
// Upon each log write the logger:
// 1. Checks if its log level is enough to log the message
// 2. Push a new copy of the message to a queue (or block the caller until
// space is available in the queue)
// Upon destruction, logs all remaining messages in the queue before
// destructing..
#include <spdlog/logger.h>
namespace spdlog {
// Async overflow policy - block by default.
enum class async_overflow_policy {
block, // Block until message can be enqueued
overrun_oldest, // Discard oldest message in the queue if full when trying to
// add new item.
discard_new // Discard new message if the queue is full when trying to add new item.
};
namespace details {
class thread_pool;
}
class SPDLOG_API async_logger final : public std::enable_shared_from_this<async_logger>,
public logger {
friend class details::thread_pool;
public:
template <typename It>
async_logger(std::string logger_name,
It begin,
It end,
std::weak_ptr<details::thread_pool> tp,
async_overflow_policy overflow_policy = async_overflow_policy::block)
: logger(std::move(logger_name), begin, end),
thread_pool_(std::move(tp)),
overflow_policy_(overflow_policy) {}
async_logger(std::string logger_name,
sinks_init_list sinks_list,
std::weak_ptr<details::thread_pool> tp,
async_overflow_policy overflow_policy = async_overflow_policy::block);
async_logger(std::string logger_name,
sink_ptr single_sink,
std::weak_ptr<details::thread_pool> tp,
async_overflow_policy overflow_policy = async_overflow_policy::block);
std::shared_ptr<logger> clone(std::string new_name) override;
protected:
void sink_it_(const details::log_msg &msg) override;
void flush_() override;
void backend_sink_it_(const details::log_msg &incoming_log_msg);
void backend_flush_();
private:
std::weak_ptr<details::thread_pool> thread_pool_;
async_overflow_policy overflow_policy_;
};
} // namespace spdlog
#ifdef SPDLOG_HEADER_ONLY
#include "async_logger-inl.h"
#endif


@@ -0,0 +1,40 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
#include <spdlog/cfg/helpers.h>
#include <spdlog/details/registry.h>
//
// Init log levels using each argv entry that starts with "SPDLOG_LEVEL="
//
// set all loggers to debug level:
// example.exe "SPDLOG_LEVEL=debug"
// set logger1 to trace level
// example.exe "SPDLOG_LEVEL=logger1=trace"
// turn off all logging except for logger1 and logger2:
// example.exe "SPDLOG_LEVEL=off,logger1=debug,logger2=info"
namespace spdlog {
namespace cfg {
// search for SPDLOG_LEVEL= in the args and use it to init the levels
inline void load_argv_levels(int argc, const char **argv) {
const std::string spdlog_level_prefix = "SPDLOG_LEVEL=";
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
if (arg.find(spdlog_level_prefix) == 0) {
auto levels_string = arg.substr(spdlog_level_prefix.size());
helpers::load_levels(levels_string);
}
}
}
inline void load_argv_levels(int argc, char **argv) {
load_argv_levels(argc, const_cast<const char **>(argv));
}
} // namespace cfg
} // namespace spdlog


@@ -0,0 +1,36 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
#include <spdlog/cfg/helpers.h>
#include <spdlog/details/os.h>
#include <spdlog/details/registry.h>
//
// Init levels and patterns from env variables SPDLOG_LEVEL
// Inspired from Rust's "env_logger" crate (https://crates.io/crates/env_logger).
// Note - fallback to "info" level on unrecognized levels
//
// Examples:
//
// set global level to debug:
// export SPDLOG_LEVEL=debug
//
// turn off all logging except for logger1:
// export SPDLOG_LEVEL="*=off,logger1=debug"
//
// turn off all logging except for logger1 and logger2:
// export SPDLOG_LEVEL="off,logger1=debug,logger2=info"
namespace spdlog {
namespace cfg {
inline void load_env_levels() {
auto env_val = details::os::getenv("SPDLOG_LEVEL");
if (!env_val.empty()) {
helpers::load_levels(env_val);
}
}
} // namespace cfg
} // namespace spdlog


@@ -0,0 +1,107 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
#ifndef SPDLOG_HEADER_ONLY
#include <spdlog/cfg/helpers.h>
#endif
#include <spdlog/details/os.h>
#include <spdlog/details/registry.h>
#include <spdlog/spdlog.h>
#include <algorithm>
#include <sstream>
#include <string>
#include <utility>
namespace spdlog {
namespace cfg {
namespace helpers {
// inplace convert to lowercase
inline std::string &to_lower_(std::string &str) {
std::transform(str.begin(), str.end(), str.begin(), [](char ch) {
return static_cast<char>((ch >= 'A' && ch <= 'Z') ? ch + ('a' - 'A') : ch);
});
return str;
}
// inplace trim spaces
inline std::string &trim_(std::string &str) {
const char *spaces = " \n\r\t";
str.erase(str.find_last_not_of(spaces) + 1);
str.erase(0, str.find_first_not_of(spaces));
return str;
}
// return (name,value) trimmed pair from given "name=value" string.
// return empty string on missing parts
// "key=val" => ("key", "val")
// " key = val " => ("key", "val")
// "key=" => ("key", "")
// "val" => ("", "val")
inline std::pair<std::string, std::string> extract_kv_(char sep, const std::string &str) {
auto n = str.find(sep);
std::string k, v;
if (n == std::string::npos) {
v = str;
} else {
k = str.substr(0, n);
v = str.substr(n + 1);
}
return std::make_pair(trim_(k), trim_(v));
}
// return vector of key/value pairs from sequence of "K1=V1,K2=V2,.."
// "a=AAA,b=BBB,c=CCC,.." => {("a","AAA"),("b","BBB"),("c", "CCC"),...}
inline std::unordered_map<std::string, std::string> extract_key_vals_(const std::string &str) {
std::string token;
std::istringstream token_stream(str);
std::unordered_map<std::string, std::string> rv{};
while (std::getline(token_stream, token, ',')) {
if (token.empty()) {
continue;
}
auto kv = extract_kv_('=', token);
rv[kv.first] = kv.second;
}
return rv;
}
SPDLOG_INLINE void load_levels(const std::string &input) {
if (input.empty() || input.size() > 512) {
return;
}
auto key_vals = extract_key_vals_(input);
std::unordered_map<std::string, level::level_enum> levels;
level::level_enum global_level = level::info;
bool global_level_found = false;
for (auto &name_level : key_vals) {
auto &logger_name = name_level.first;
auto level_name = to_lower_(name_level.second);
auto level = level::from_str(level_name);
// ignore unrecognized level names
if (level == level::off && level_name != "off") {
continue;
}
        if (logger_name.empty())  // no logger name indicates the global level
{
global_level_found = true;
global_level = level;
} else {
levels[logger_name] = level;
}
}
details::registry::instance().set_levels(std::move(levels),
global_level_found ? &global_level : nullptr);
}
} // namespace helpers
} // namespace cfg
} // namespace spdlog


@@ -0,0 +1,29 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
#include <spdlog/common.h>
#include <unordered_map>
namespace spdlog {
namespace cfg {
namespace helpers {
//
// Init levels from given string
//
// Examples:
//
// set global level to debug: "debug"
// turn off all logging except for logger1: "off,logger1=debug"
// turn off all logging except for logger1 and logger2: "off,logger1=debug,logger2=info"
//
SPDLOG_API void load_levels(const std::string &txt);
} // namespace helpers
} // namespace cfg
} // namespace spdlog
#ifdef SPDLOG_HEADER_ONLY
#include "helpers-inl.h"
#endif // SPDLOG_HEADER_ONLY


@ -0,0 +1,68 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
#ifndef SPDLOG_HEADER_ONLY
#include <spdlog/common.h>
#endif
#include <algorithm>
#include <iterator>
namespace spdlog {
namespace level {
#if __cplusplus >= 201703L
constexpr
#endif
static string_view_t level_string_views[] SPDLOG_LEVEL_NAMES;
static const char *short_level_names[] SPDLOG_SHORT_LEVEL_NAMES;
SPDLOG_INLINE const string_view_t &to_string_view(spdlog::level::level_enum l) SPDLOG_NOEXCEPT {
return level_string_views[l];
}
SPDLOG_INLINE const char *to_short_c_str(spdlog::level::level_enum l) SPDLOG_NOEXCEPT {
return short_level_names[l];
}
SPDLOG_INLINE spdlog::level::level_enum from_str(const std::string &name) SPDLOG_NOEXCEPT {
auto it = std::find(std::begin(level_string_views), std::end(level_string_views), name);
if (it != std::end(level_string_views))
return static_cast<level::level_enum>(std::distance(std::begin(level_string_views), it));
// check also for "warn" and "err" before giving up..
if (name == "warn") {
return level::warn;
}
if (name == "err") {
return level::err;
}
return level::off;
}
} // namespace level
SPDLOG_INLINE spdlog_ex::spdlog_ex(std::string msg)
: msg_(std::move(msg)) {}
SPDLOG_INLINE spdlog_ex::spdlog_ex(const std::string &msg, int last_errno) {
#ifdef SPDLOG_USE_STD_FORMAT
msg_ = std::system_error(std::error_code(last_errno, std::generic_category()), msg).what();
#else
memory_buf_t outbuf;
fmt::format_system_error(outbuf, last_errno, msg.c_str());
msg_ = fmt::to_string(outbuf);
#endif
}
SPDLOG_INLINE const char *spdlog_ex::what() const SPDLOG_NOEXCEPT { return msg_.c_str(); }
SPDLOG_INLINE void throw_spdlog_ex(const std::string &msg, int last_errno) {
SPDLOG_THROW(spdlog_ex(msg, last_errno));
}
SPDLOG_INLINE void throw_spdlog_ex(std::string msg) { SPDLOG_THROW(spdlog_ex(std::move(msg))); }
} // namespace spdlog

ext/spdlog/common.h 100644

@ -0,0 +1,411 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
#include <spdlog/details/null_mutex.h>
#include <spdlog/tweakme.h>
#include <atomic>
#include <chrono>
#include <cstdio>
#include <exception>
#include <functional>
#include <initializer_list>
#include <memory>
#include <string>
#include <type_traits>
#ifdef SPDLOG_USE_STD_FORMAT
#include <version>
#if __cpp_lib_format >= 202207L
#include <format>
#else
#include <string_view>
#endif
#endif
#ifdef SPDLOG_COMPILED_LIB
#undef SPDLOG_HEADER_ONLY
#if defined(SPDLOG_SHARED_LIB)
#if defined(_WIN32)
#ifdef spdlog_EXPORTS
#define SPDLOG_API __declspec(dllexport)
#else // !spdlog_EXPORTS
#define SPDLOG_API __declspec(dllimport)
#endif
#else // !defined(_WIN32)
#define SPDLOG_API __attribute__((visibility("default")))
#endif
#else // !defined(SPDLOG_SHARED_LIB)
#define SPDLOG_API
#endif
#define SPDLOG_INLINE
#else // !defined(SPDLOG_COMPILED_LIB)
#define SPDLOG_API
#define SPDLOG_HEADER_ONLY
#define SPDLOG_INLINE inline
#endif // #ifdef SPDLOG_COMPILED_LIB
#include <spdlog/fmt/fmt.h>
#if !defined(SPDLOG_USE_STD_FORMAT) && \
FMT_VERSION >= 80000 // backward compatibility with fmt versions older than 8
#define SPDLOG_FMT_RUNTIME(format_string) fmt::runtime(format_string)
#define SPDLOG_FMT_STRING(format_string) FMT_STRING(format_string)
#if defined(SPDLOG_WCHAR_FILENAMES) || defined(SPDLOG_WCHAR_TO_UTF8_SUPPORT)
#include <spdlog/fmt/xchar.h>
#endif
#else
#define SPDLOG_FMT_RUNTIME(format_string) format_string
#define SPDLOG_FMT_STRING(format_string) format_string
#endif
// visual studio up to 2013 does not support noexcept nor constexpr
#if defined(_MSC_VER) && (_MSC_VER < 1900)
#define SPDLOG_NOEXCEPT _NOEXCEPT
#define SPDLOG_CONSTEXPR
#else
#define SPDLOG_NOEXCEPT noexcept
#define SPDLOG_CONSTEXPR constexpr
#endif
// If building with std::format, can just use constexpr, otherwise if building with fmt
// SPDLOG_CONSTEXPR_FUNC needs to be set the same as FMT_CONSTEXPR to avoid situations where
// a constexpr function in spdlog could end up calling a non-constexpr function in fmt
// depending on the compiler
// If fmt determines it can't use constexpr, we should inline the function instead
#ifdef SPDLOG_USE_STD_FORMAT
#define SPDLOG_CONSTEXPR_FUNC constexpr
#else // Being built with fmt
#if FMT_USE_CONSTEXPR
#define SPDLOG_CONSTEXPR_FUNC FMT_CONSTEXPR
#else
#define SPDLOG_CONSTEXPR_FUNC inline
#endif
#endif
#if defined(__GNUC__) || defined(__clang__)
#define SPDLOG_DEPRECATED __attribute__((deprecated))
#elif defined(_MSC_VER)
#define SPDLOG_DEPRECATED __declspec(deprecated)
#else
#define SPDLOG_DEPRECATED
#endif
// disable thread local on msvc 2013
#ifndef SPDLOG_NO_TLS
#if (defined(_MSC_VER) && (_MSC_VER < 1900)) || defined(__cplusplus_winrt)
#define SPDLOG_NO_TLS 1
#endif
#endif
#ifndef SPDLOG_FUNCTION
#define SPDLOG_FUNCTION static_cast<const char *>(__FUNCTION__)
#endif
#ifdef SPDLOG_NO_EXCEPTIONS
#define SPDLOG_TRY
#define SPDLOG_THROW(ex) \
do { \
printf("spdlog fatal error: %s\n", ex.what()); \
std::abort(); \
} while (0)
#define SPDLOG_CATCH_STD
#else
#define SPDLOG_TRY try
#define SPDLOG_THROW(ex) throw(ex)
#define SPDLOG_CATCH_STD \
catch (const std::exception &) { \
}
#endif
namespace spdlog {
class formatter;
namespace sinks {
class sink;
}
#if defined(_WIN32) && defined(SPDLOG_WCHAR_FILENAMES)
using filename_t = std::wstring;
// allow macro expansion to occur in SPDLOG_FILENAME_T
#define SPDLOG_FILENAME_T_INNER(s) L##s
#define SPDLOG_FILENAME_T(s) SPDLOG_FILENAME_T_INNER(s)
#else
using filename_t = std::string;
#define SPDLOG_FILENAME_T(s) s
#endif
using log_clock = std::chrono::system_clock;
using sink_ptr = std::shared_ptr<sinks::sink>;
using sinks_init_list = std::initializer_list<sink_ptr>;
using err_handler = std::function<void(const std::string &err_msg)>;
#ifdef SPDLOG_USE_STD_FORMAT
namespace fmt_lib = std;
using string_view_t = std::string_view;
using memory_buf_t = std::string;
template <typename... Args>
#if __cpp_lib_format >= 202207L
using format_string_t = std::format_string<Args...>;
#else
using format_string_t = std::string_view;
#endif
template <class T, class Char = char>
struct is_convertible_to_basic_format_string
: std::integral_constant<bool, std::is_convertible<T, std::basic_string_view<Char>>::value> {};
#if defined(SPDLOG_WCHAR_FILENAMES) || defined(SPDLOG_WCHAR_TO_UTF8_SUPPORT)
using wstring_view_t = std::wstring_view;
using wmemory_buf_t = std::wstring;
template <typename... Args>
#if __cpp_lib_format >= 202207L
using wformat_string_t = std::wformat_string<Args...>;
#else
using wformat_string_t = std::wstring_view;
#endif
#endif
#define SPDLOG_BUF_TO_STRING(x) x
#else // use fmt lib instead of std::format
namespace fmt_lib = fmt;
using string_view_t = fmt::basic_string_view<char>;
using memory_buf_t = fmt::basic_memory_buffer<char, 250>;
template <typename... Args>
using format_string_t = fmt::format_string<Args...>;
template <class T>
using remove_cvref_t = typename std::remove_cv<typename std::remove_reference<T>::type>::type;
template <typename Char>
#if FMT_VERSION >= 90101
using fmt_runtime_string = fmt::runtime_format_string<Char>;
#else
using fmt_runtime_string = fmt::basic_runtime<Char>;
#endif
// clang doesn't like SFINAE disabled constructor in std::is_convertible<> so have to repeat the
// condition from basic_format_string here, in addition, fmt::basic_runtime<Char> is only
// convertible to basic_format_string<Char> but not basic_string_view<Char>
template <class T, class Char = char>
struct is_convertible_to_basic_format_string
: std::integral_constant<bool,
std::is_convertible<T, fmt::basic_string_view<Char>>::value ||
std::is_same<remove_cvref_t<T>, fmt_runtime_string<Char>>::value> {
};
#if defined(SPDLOG_WCHAR_FILENAMES) || defined(SPDLOG_WCHAR_TO_UTF8_SUPPORT)
using wstring_view_t = fmt::basic_string_view<wchar_t>;
using wmemory_buf_t = fmt::basic_memory_buffer<wchar_t, 250>;
template <typename... Args>
using wformat_string_t = fmt::wformat_string<Args...>;
#endif
#define SPDLOG_BUF_TO_STRING(x) fmt::to_string(x)
#endif
#ifdef SPDLOG_WCHAR_TO_UTF8_SUPPORT
#ifndef _WIN32
#error SPDLOG_WCHAR_TO_UTF8_SUPPORT only supported on windows
#endif // _WIN32
#endif // SPDLOG_WCHAR_TO_UTF8_SUPPORT
template <class T>
struct is_convertible_to_any_format_string
: std::integral_constant<bool,
is_convertible_to_basic_format_string<T, char>::value ||
is_convertible_to_basic_format_string<T, wchar_t>::value> {};
#if defined(SPDLOG_NO_ATOMIC_LEVELS)
using level_t = details::null_atomic_int;
#else
using level_t = std::atomic<int>;
#endif
#define SPDLOG_LEVEL_TRACE 0
#define SPDLOG_LEVEL_DEBUG 1
#define SPDLOG_LEVEL_INFO 2
#define SPDLOG_LEVEL_WARN 3
#define SPDLOG_LEVEL_ERROR 4
#define SPDLOG_LEVEL_CRITICAL 5
#define SPDLOG_LEVEL_OFF 6
#if !defined(SPDLOG_ACTIVE_LEVEL)
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_INFO
#endif
// Log level enum
namespace level {
enum level_enum : int {
trace = SPDLOG_LEVEL_TRACE,
debug = SPDLOG_LEVEL_DEBUG,
info = SPDLOG_LEVEL_INFO,
warn = SPDLOG_LEVEL_WARN,
err = SPDLOG_LEVEL_ERROR,
critical = SPDLOG_LEVEL_CRITICAL,
off = SPDLOG_LEVEL_OFF,
n_levels
};
#define SPDLOG_LEVEL_NAME_TRACE spdlog::string_view_t("trace", 5)
#define SPDLOG_LEVEL_NAME_DEBUG spdlog::string_view_t("debug", 5)
#define SPDLOG_LEVEL_NAME_INFO spdlog::string_view_t("info", 4)
#define SPDLOG_LEVEL_NAME_WARNING spdlog::string_view_t("warning", 7)
#define SPDLOG_LEVEL_NAME_ERROR spdlog::string_view_t("error", 5)
#define SPDLOG_LEVEL_NAME_CRITICAL spdlog::string_view_t("critical", 8)
#define SPDLOG_LEVEL_NAME_OFF spdlog::string_view_t("off", 3)
#if !defined(SPDLOG_LEVEL_NAMES)
#define SPDLOG_LEVEL_NAMES \
{ \
SPDLOG_LEVEL_NAME_TRACE, SPDLOG_LEVEL_NAME_DEBUG, SPDLOG_LEVEL_NAME_INFO, \
SPDLOG_LEVEL_NAME_WARNING, SPDLOG_LEVEL_NAME_ERROR, SPDLOG_LEVEL_NAME_CRITICAL, \
SPDLOG_LEVEL_NAME_OFF \
}
#endif
#if !defined(SPDLOG_SHORT_LEVEL_NAMES)
#define SPDLOG_SHORT_LEVEL_NAMES \
{ "T", "D", "I", "W", "E", "C", "O" }
#endif
SPDLOG_API const string_view_t &to_string_view(spdlog::level::level_enum l) SPDLOG_NOEXCEPT;
SPDLOG_API const char *to_short_c_str(spdlog::level::level_enum l) SPDLOG_NOEXCEPT;
SPDLOG_API spdlog::level::level_enum from_str(const std::string &name) SPDLOG_NOEXCEPT;
} // namespace level
//
// Color mode used by sinks with color support.
//
enum class color_mode { always, automatic, never };
//
// Pattern time - which time zone pattern_formatter uses for timestamps.
// local time by default
//
enum class pattern_time_type {
local, // log localtime
utc // log utc
};
//
// Log exception
//
class SPDLOG_API spdlog_ex : public std::exception {
public:
explicit spdlog_ex(std::string msg);
spdlog_ex(const std::string &msg, int last_errno);
const char *what() const SPDLOG_NOEXCEPT override;
private:
std::string msg_;
};
[[noreturn]] SPDLOG_API void throw_spdlog_ex(const std::string &msg, int last_errno);
[[noreturn]] SPDLOG_API void throw_spdlog_ex(std::string msg);
struct source_loc {
SPDLOG_CONSTEXPR source_loc() = default;
SPDLOG_CONSTEXPR source_loc(const char *filename_in, int line_in, const char *funcname_in)
: filename{filename_in},
line{line_in},
funcname{funcname_in} {}
SPDLOG_CONSTEXPR bool empty() const SPDLOG_NOEXCEPT { return line <= 0; }
const char *filename{nullptr};
int line{0};
const char *funcname{nullptr};
};
struct file_event_handlers {
file_event_handlers()
: before_open(nullptr),
after_open(nullptr),
before_close(nullptr),
after_close(nullptr) {}
std::function<void(const filename_t &filename)> before_open;
std::function<void(const filename_t &filename, std::FILE *file_stream)> after_open;
std::function<void(const filename_t &filename, std::FILE *file_stream)> before_close;
std::function<void(const filename_t &filename)> after_close;
};
namespace details {
// to_string_view
SPDLOG_CONSTEXPR_FUNC spdlog::string_view_t to_string_view(const memory_buf_t &buf)
SPDLOG_NOEXCEPT {
return spdlog::string_view_t{buf.data(), buf.size()};
}
SPDLOG_CONSTEXPR_FUNC spdlog::string_view_t to_string_view(spdlog::string_view_t str)
SPDLOG_NOEXCEPT {
return str;
}
#if defined(SPDLOG_WCHAR_FILENAMES) || defined(SPDLOG_WCHAR_TO_UTF8_SUPPORT)
SPDLOG_CONSTEXPR_FUNC spdlog::wstring_view_t to_string_view(const wmemory_buf_t &buf)
SPDLOG_NOEXCEPT {
return spdlog::wstring_view_t{buf.data(), buf.size()};
}
SPDLOG_CONSTEXPR_FUNC spdlog::wstring_view_t to_string_view(spdlog::wstring_view_t str)
SPDLOG_NOEXCEPT {
return str;
}
#endif
#ifndef SPDLOG_USE_STD_FORMAT
template <typename T, typename... Args>
inline fmt::basic_string_view<T> to_string_view(fmt::basic_format_string<T, Args...> fmt) {
return fmt;
}
#elif __cpp_lib_format >= 202207L
template <typename T, typename... Args>
SPDLOG_CONSTEXPR_FUNC std::basic_string_view<T> to_string_view(
std::basic_format_string<T, Args...> fmt) SPDLOG_NOEXCEPT {
return fmt.get();
}
#endif
// make_unique support for pre c++14
#if __cplusplus >= 201402L // C++14 and beyond
using std::enable_if_t;
using std::make_unique;
#else
template <bool B, class T = void>
using enable_if_t = typename std::enable_if<B, T>::type;
template <typename T, typename... Args>
std::unique_ptr<T> make_unique(Args &&...args) {
static_assert(!std::is_array<T>::value, "arrays not supported");
return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
}
#endif
// to avoid useless casts (see https://github.com/nlohmann/json/issues/2893#issuecomment-889152324)
template <typename T, typename U, enable_if_t<!std::is_same<T, U>::value, int> = 0>
constexpr T conditional_static_cast(U value) {
return static_cast<T>(value);
}
template <typename T, typename U, enable_if_t<std::is_same<T, U>::value, int> = 0>
constexpr T conditional_static_cast(U value) {
return value;
}
} // namespace details
} // namespace spdlog
#ifdef SPDLOG_HEADER_ONLY
#include "common-inl.h"
#endif


@ -0,0 +1,63 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
#ifndef SPDLOG_HEADER_ONLY
#include <spdlog/details/backtracer.h>
#endif
namespace spdlog {
namespace details {
SPDLOG_INLINE backtracer::backtracer(const backtracer &other) {
std::lock_guard<std::mutex> lock(other.mutex_);
enabled_ = other.enabled();
messages_ = other.messages_;
}
SPDLOG_INLINE backtracer::backtracer(backtracer &&other) SPDLOG_NOEXCEPT {
std::lock_guard<std::mutex> lock(other.mutex_);
enabled_ = other.enabled();
messages_ = std::move(other.messages_);
}
SPDLOG_INLINE backtracer &backtracer::operator=(backtracer other) {
std::lock_guard<std::mutex> lock(mutex_);
enabled_ = other.enabled();
messages_ = std::move(other.messages_);
return *this;
}
SPDLOG_INLINE void backtracer::enable(size_t size) {
std::lock_guard<std::mutex> lock{mutex_};
enabled_.store(true, std::memory_order_relaxed);
messages_ = circular_q<log_msg_buffer>{size};
}
SPDLOG_INLINE void backtracer::disable() {
std::lock_guard<std::mutex> lock{mutex_};
enabled_.store(false, std::memory_order_relaxed);
}
SPDLOG_INLINE bool backtracer::enabled() const { return enabled_.load(std::memory_order_relaxed); }
SPDLOG_INLINE void backtracer::push_back(const log_msg &msg) {
std::lock_guard<std::mutex> lock{mutex_};
messages_.push_back(log_msg_buffer{msg});
}
SPDLOG_INLINE bool backtracer::empty() const {
std::lock_guard<std::mutex> lock{mutex_};
return messages_.empty();
}
// pop all items in the q and apply the given fun on each of them.
SPDLOG_INLINE void backtracer::foreach_pop(std::function<void(const details::log_msg &)> fun) {
std::lock_guard<std::mutex> lock{mutex_};
while (!messages_.empty()) {
auto &front_msg = messages_.front();
fun(front_msg);
messages_.pop_front();
}
}
} // namespace details
} // namespace spdlog


@ -0,0 +1,45 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
#include <spdlog/details/circular_q.h>
#include <spdlog/details/log_msg_buffer.h>
#include <atomic>
#include <functional>
#include <mutex>
// Store log messages in circular buffer.
// Useful for storing debug data in case of error/warning happens.
namespace spdlog {
namespace details {
class SPDLOG_API backtracer {
mutable std::mutex mutex_;
std::atomic<bool> enabled_{false};
circular_q<log_msg_buffer> messages_;
public:
backtracer() = default;
backtracer(const backtracer &other);
backtracer(backtracer &&other) SPDLOG_NOEXCEPT;
backtracer &operator=(backtracer other);
void enable(size_t size);
void disable();
bool enabled() const;
void push_back(const log_msg &msg);
bool empty() const;
// pop all items in the q and apply the given fun on each of them.
void foreach_pop(std::function<void(const details::log_msg &)> fun);
};
} // namespace details
} // namespace spdlog
#ifdef SPDLOG_HEADER_ONLY
#include "backtracer-inl.h"
#endif


@ -0,0 +1,115 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
// circular q view of std::vector.
#pragma once
#include <cassert>
#include <vector>
#include "spdlog/common.h"
namespace spdlog {
namespace details {
template <typename T>
class circular_q {
size_t max_items_ = 0;
typename std::vector<T>::size_type head_ = 0;
typename std::vector<T>::size_type tail_ = 0;
size_t overrun_counter_ = 0;
std::vector<T> v_;
public:
using value_type = T;
// empty ctor - create a disabled queue with no elements allocated at all
circular_q() = default;
explicit circular_q(size_t max_items)
: max_items_(max_items + 1) // one item is reserved as marker for full q
,
v_(max_items_) {}
circular_q(const circular_q &) = default;
circular_q &operator=(const circular_q &) = default;
// move cannot be default,
// since we need to reset head_, tail_, etc to zero in the moved object
circular_q(circular_q &&other) SPDLOG_NOEXCEPT { copy_moveable(std::move(other)); }
circular_q &operator=(circular_q &&other) SPDLOG_NOEXCEPT {
copy_moveable(std::move(other));
return *this;
}
// push back, overrun (oldest) item if no room left
void push_back(T &&item) {
if (max_items_ > 0) {
v_[tail_] = std::move(item);
tail_ = (tail_ + 1) % max_items_;
if (tail_ == head_) // overrun last item if full
{
head_ = (head_ + 1) % max_items_;
++overrun_counter_;
}
}
}
// Return reference to the front item.
// If there are no elements in the container, the behavior is undefined.
const T &front() const { return v_[head_]; }
T &front() { return v_[head_]; }
// Return number of elements actually stored
size_t size() const {
if (tail_ >= head_) {
return tail_ - head_;
} else {
return max_items_ - (head_ - tail_);
}
}
// Return const reference to item by index.
// If index is out of range 0…size()-1, the behavior is undefined.
const T &at(size_t i) const {
assert(i < size());
return v_[(head_ + i) % max_items_];
}
// Pop item from front.
// If there are no elements in the container, the behavior is undefined.
void pop_front() { head_ = (head_ + 1) % max_items_; }
bool empty() const { return tail_ == head_; }
bool full() const {
// head is ahead of the tail by 1
if (max_items_ > 0) {
return ((tail_ + 1) % max_items_) == head_;
}
return false;
}
size_t overrun_counter() const { return overrun_counter_; }
void reset_overrun_counter() { overrun_counter_ = 0; }
private:
// copy from other&& and reset it to disabled state
void copy_moveable(circular_q &&other) SPDLOG_NOEXCEPT {
max_items_ = other.max_items_;
head_ = other.head_;
tail_ = other.tail_;
overrun_counter_ = other.overrun_counter_;
v_ = std::move(other.v_);
// put &&other in disabled, but valid state
other.max_items_ = 0;
other.head_ = other.tail_ = 0;
other.overrun_counter_ = 0;
}
};
} // namespace details
} // namespace spdlog


@ -0,0 +1,28 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
#include <mutex>
#include <spdlog/details/null_mutex.h>
namespace spdlog {
namespace details {
struct console_mutex {
using mutex_t = std::mutex;
static mutex_t &mutex() {
static mutex_t s_mutex;
return s_mutex;
}
};
struct console_nullmutex {
using mutex_t = null_mutex;
static mutex_t &mutex() {
static mutex_t s_mutex;
return s_mutex;
}
};
} // namespace details
} // namespace spdlog


@ -0,0 +1,153 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
#ifndef SPDLOG_HEADER_ONLY
#include <spdlog/details/file_helper.h>
#endif
#include <spdlog/common.h>
#include <spdlog/details/os.h>
#include <cerrno>
#include <chrono>
#include <cstdio>
#include <string>
#include <thread>
#include <tuple>
namespace spdlog {
namespace details {
SPDLOG_INLINE file_helper::file_helper(const file_event_handlers &event_handlers)
: event_handlers_(event_handlers) {}
SPDLOG_INLINE file_helper::~file_helper() { close(); }
SPDLOG_INLINE void file_helper::open(const filename_t &fname, bool truncate) {
close();
filename_ = fname;
auto *mode = SPDLOG_FILENAME_T("ab");
auto *trunc_mode = SPDLOG_FILENAME_T("wb");
if (event_handlers_.before_open) {
event_handlers_.before_open(filename_);
}
for (int tries = 0; tries < open_tries_; ++tries) {
// create containing folder if not exists already.
os::create_dir(os::dir_name(fname));
if (truncate) {
// Truncate by opening-and-closing a tmp file in "wb" mode, always
// opening the actual log-we-write-to in "ab" mode, since that
// interacts more politely with eternal processes that might
// rotate/truncate the file underneath us.
std::FILE *tmp;
if (os::fopen_s(&tmp, fname, trunc_mode)) {
continue;
}
std::fclose(tmp);
}
if (!os::fopen_s(&fd_, fname, mode)) {
if (event_handlers_.after_open) {
event_handlers_.after_open(filename_, fd_);
}
return;
}
details::os::sleep_for_millis(open_interval_);
}
throw_spdlog_ex("Failed opening file " + os::filename_to_str(filename_) + " for writing",
errno);
}
SPDLOG_INLINE void file_helper::reopen(bool truncate) {
if (filename_.empty()) {
throw_spdlog_ex("Failed re opening file - was not opened before");
}
this->open(filename_, truncate);
}
SPDLOG_INLINE void file_helper::flush() {
if (std::fflush(fd_) != 0) {
throw_spdlog_ex("Failed flush to file " + os::filename_to_str(filename_), errno);
}
}
SPDLOG_INLINE void file_helper::sync() {
if (!os::fsync(fd_)) {
throw_spdlog_ex("Failed to fsync file " + os::filename_to_str(filename_), errno);
}
}
SPDLOG_INLINE void file_helper::close() {
if (fd_ != nullptr) {
if (event_handlers_.before_close) {
event_handlers_.before_close(filename_, fd_);
}
std::fclose(fd_);
fd_ = nullptr;
if (event_handlers_.after_close) {
event_handlers_.after_close(filename_);
}
}
}
SPDLOG_INLINE void file_helper::write(const memory_buf_t &buf) {
if (fd_ == nullptr) return;
size_t msg_size = buf.size();
auto data = buf.data();
if (!details::os::fwrite_bytes(data, msg_size, fd_)) {
throw_spdlog_ex("Failed writing to file " + os::filename_to_str(filename_), errno);
}
}
SPDLOG_INLINE size_t file_helper::size() const {
if (fd_ == nullptr) {
throw_spdlog_ex("Cannot use size() on closed file " + os::filename_to_str(filename_));
}
return os::filesize(fd_);
}
SPDLOG_INLINE const filename_t &file_helper::filename() const { return filename_; }
//
// return file path and its extension:
//
// "mylog.txt" => ("mylog", ".txt")
// "mylog" => ("mylog", "")
// "mylog." => ("mylog.", "")
// "/dir1/dir2/mylog.txt" => ("/dir1/dir2/mylog", ".txt")
//
// the starting dot in filenames is ignored (hidden files):
//
// ".mylog" => (".mylog", "")
// "my_folder/.mylog" => ("my_folder/.mylog", "")
// "my_folder/.mylog.txt" => ("my_folder/.mylog", ".txt")
SPDLOG_INLINE std::tuple<filename_t, filename_t> file_helper::split_by_extension(
const filename_t &fname) {
auto ext_index = fname.rfind('.');
// no valid extension found - return whole path and empty string as
// extension
if (ext_index == filename_t::npos || ext_index == 0 || ext_index == fname.size() - 1) {
return std::make_tuple(fname, filename_t());
}
// treat cases like "/etc/rc.d/somelogfile" or "/abc/.hiddenfile"
auto folder_index = fname.find_last_of(details::os::folder_seps_filename);
if (folder_index != filename_t::npos && folder_index >= ext_index - 1) {
return std::make_tuple(fname, filename_t());
}
// finally - return a valid base and extension tuple
return std::make_tuple(fname.substr(0, ext_index), fname.substr(ext_index));
}
} // namespace details
} // namespace spdlog


@ -0,0 +1,61 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
#include <spdlog/common.h>
#include <tuple>
namespace spdlog {
namespace details {
// Helper class for file sinks.
// When failing to open a file, retry several times(5) with a delay interval(10 ms).
// Throw spdlog_ex exception on errors.
class SPDLOG_API file_helper {
public:
file_helper() = default;
explicit file_helper(const file_event_handlers &event_handlers);
file_helper(const file_helper &) = delete;
file_helper &operator=(const file_helper &) = delete;
~file_helper();
void open(const filename_t &fname, bool truncate = false);
void reopen(bool truncate);
void flush();
void sync();
void close();
void write(const memory_buf_t &buf);
size_t size() const;
const filename_t &filename() const;
//
// return file path and its extension:
//
// "mylog.txt" => ("mylog", ".txt")
// "mylog" => ("mylog", "")
// "mylog." => ("mylog.", "")
// "/dir1/dir2/mylog.txt" => ("/dir1/dir2/mylog", ".txt")
//
// the starting dot in filenames is ignored (hidden files):
//
// ".mylog" => (".mylog", "")
// "my_folder/.mylog" => ("my_folder/.mylog", "")
// "my_folder/.mylog.txt" => ("my_folder/.mylog", ".txt")
static std::tuple<filename_t, filename_t> split_by_extension(const filename_t &fname);
private:
const int open_tries_ = 5;
const unsigned int open_interval_ = 10;
std::FILE *fd_{nullptr};
filename_t filename_;
file_event_handlers event_handlers_;
};
} // namespace details
} // namespace spdlog
#ifdef SPDLOG_HEADER_ONLY
#include "file_helper-inl.h"
#endif


@ -0,0 +1,141 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
#include <chrono>
#include <iterator>
#include <spdlog/common.h>
#include <spdlog/fmt/fmt.h>
#include <type_traits>
#ifdef SPDLOG_USE_STD_FORMAT
#include <charconv>
#include <limits>
#endif
// Some fmt helpers to efficiently format and pad ints and strings
namespace spdlog {
namespace details {
namespace fmt_helper {
inline void append_string_view(spdlog::string_view_t view, memory_buf_t &dest) {
auto *buf_ptr = view.data();
dest.append(buf_ptr, buf_ptr + view.size());
}
#ifdef SPDLOG_USE_STD_FORMAT
template <typename T>
inline void append_int(T n, memory_buf_t &dest) {
// Buffer should be large enough to hold all digits (digits10 + 1) and a sign
SPDLOG_CONSTEXPR const auto BUF_SIZE = std::numeric_limits<T>::digits10 + 2;
char buf[BUF_SIZE];
auto [ptr, ec] = std::to_chars(buf, buf + BUF_SIZE, n, 10);
if (ec == std::errc()) {
dest.append(buf, ptr);
} else {
throw_spdlog_ex("Failed to format int", static_cast<int>(ec));
}
}
#else
template <typename T>
inline void append_int(T n, memory_buf_t &dest) {
fmt::format_int i(n);
dest.append(i.data(), i.data() + i.size());
}
#endif
template <typename T>
SPDLOG_CONSTEXPR_FUNC unsigned int count_digits_fallback(T n) {
// taken from fmt: https://github.com/fmtlib/fmt/blob/8.0.1/include/fmt/format.h#L899-L912
unsigned int count = 1;
for (;;) {
// Integer division is slow so do it for a group of four digits instead
// of for every digit. The idea comes from the talk by Alexandrescu
// "Three Optimization Tips for C++". See speed-test for a comparison.
if (n < 10) return count;
if (n < 100) return count + 1;
if (n < 1000) return count + 2;
if (n < 10000) return count + 3;
n /= 10000u;
count += 4;
}
}
template <typename T>
inline unsigned int count_digits(T n) {
using count_type =
typename std::conditional<(sizeof(T) > sizeof(uint32_t)), uint64_t, uint32_t>::type;
#ifdef SPDLOG_USE_STD_FORMAT
return count_digits_fallback(static_cast<count_type>(n));
#else
return static_cast<unsigned int>(fmt::
// fmt 7.0.0 renamed the internal namespace to detail.
// See: https://github.com/fmtlib/fmt/issues/1538
#if FMT_VERSION < 70000
internal
#else
detail
#endif
::count_digits(static_cast<count_type>(n)));
#endif
}
inline void pad2(int n, memory_buf_t &dest) {
if (n >= 0 && n < 100) // 0-99
{
dest.push_back(static_cast<char>('0' + n / 10));
dest.push_back(static_cast<char>('0' + n % 10));
} else // unlikely, but just in case, let fmt deal with it
{
fmt_lib::format_to(std::back_inserter(dest), SPDLOG_FMT_STRING("{:02}"), n);
}
}
template <typename T>
inline void pad_uint(T n, unsigned int width, memory_buf_t &dest) {
static_assert(std::is_unsigned<T>::value, "pad_uint must get unsigned T");
for (auto digits = count_digits(n); digits < width; digits++) {
dest.push_back('0');
}
append_int(n, dest);
}
template <typename T>
inline void pad3(T n, memory_buf_t &dest) {
static_assert(std::is_unsigned<T>::value, "pad3 must get unsigned T");
if (n < 1000) {
dest.push_back(static_cast<char>(n / 100 + '0'));
n = n % 100;
dest.push_back(static_cast<char>((n / 10) + '0'));
dest.push_back(static_cast<char>((n % 10) + '0'));
} else {
append_int(n, dest);
}
}
template <typename T>
inline void pad6(T n, memory_buf_t &dest) {
pad_uint(n, 6, dest);
}
template <typename T>
inline void pad9(T n, memory_buf_t &dest) {
pad_uint(n, 9, dest);
}
// return fraction of a second of the given time_point.
// e.g.
// fraction<std::milliseconds>(tp) -> will return the millis part of the second
template <typename ToDuration>
inline ToDuration time_fraction(log_clock::time_point tp) {
using std::chrono::duration_cast;
using std::chrono::seconds;
auto duration = tp.time_since_epoch();
auto secs = duration_cast<seconds>(duration);
return duration_cast<ToDuration>(duration) - duration_cast<ToDuration>(secs);
}
} // namespace fmt_helper
} // namespace details
} // namespace spdlog


@ -0,0 +1,44 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
#ifndef SPDLOG_HEADER_ONLY
#include <spdlog/details/log_msg.h>
#endif
#include <spdlog/details/os.h>
namespace spdlog {
namespace details {
SPDLOG_INLINE log_msg::log_msg(spdlog::log_clock::time_point log_time,
spdlog::source_loc loc,
string_view_t a_logger_name,
spdlog::level::level_enum lvl,
spdlog::string_view_t msg)
: logger_name(a_logger_name),
level(lvl),
time(log_time)
#ifndef SPDLOG_NO_THREAD_ID
,
thread_id(os::thread_id())
#endif
,
source(loc),
payload(msg) {
}
SPDLOG_INLINE log_msg::log_msg(spdlog::source_loc loc,
string_view_t a_logger_name,
spdlog::level::level_enum lvl,
spdlog::string_view_t msg)
: log_msg(os::now(), loc, a_logger_name, lvl, msg) {}
SPDLOG_INLINE log_msg::log_msg(string_view_t a_logger_name,
spdlog::level::level_enum lvl,
spdlog::string_view_t msg)
: log_msg(os::now(), source_loc{}, a_logger_name, lvl, msg) {}
} // namespace details
} // namespace spdlog

Some files were not shown because too many files have changed in this diff.