From: Andreas Tille <tille@debian.org>
Date: Sat, 14 Dec 2024 18:47:40 +0000 (+0100)
Subject: Import fermi-lite_0.1+git20221215.85f159e.orig.tar.xz
X-Git-Tag: archive/raspbian/0.1+git20221215.85f159e-1+rpi1^2~9
X-Git-Url: https://dgit.raspbian.org/?a=commitdiff_plain;h=8e09405be7caab1d1db1e89ad5071ebbc42cb9b8;p=fermi-lite.git

Import fermi-lite_0.1+git20221215.85f159e.orig.tar.xz

[dgit import orig fermi-lite_0.1+git20221215.85f159e.orig.tar.xz]
---

8e09405be7caab1d1db1e89ad5071ebbc42cb9b8
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..282c932
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+*.o
+*.a
+.*.swp
diff --git a/LICENSE.txt b/LICENSE.txt
new file mode 100644
index 0000000..d423a43
--- /dev/null
+++ b/LICENSE.txt
@@ -0,0 +1,23 @@
+The MIT License
+
+Copyright (c) 2016 Broad Institute
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..edda713
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,46 @@
+CC=			gcc
+CFLAGS=		-g -Wall -O2 -Wno-unused-function #-fno-inline-functions -fno-inline-functions-called-once
+CPPFLAGS=
+INCLUDES=	
+OBJS=		kthread.o misc.o \
+			bseq.o htab.o bfc.o \
+			rle.o rope.o mrope.o rld0.o \
+			unitig.o mag.o bubble.o ksw.o
+PROG=		fml-asm
+LIBS=		-lm -lz -lpthread
+
+.SUFFIXES:.c .o
+
+.c.o:
+		$(CC) -c $(CFLAGS) $(CPPFLAGS) $(INCLUDES) $< -o $@
+
+all:$(PROG)
+
+fml-asm:libfml.a example.o
+		$(CC) $(CFLAGS) $^ -o $@ -L. -lfml $(LIBS)
+
+libfml.a:$(OBJS)
+		$(AR) -csru $@ $(OBJS)
+
+clean:
+		rm -fr gmon.out *.o ext/*.o a.out $(PROG) *~ *.a *.dSYM session*
+
+depend:
+		(LC_ALL=C; export LC_ALL; makedepend -Y -- $(CFLAGS) $(DFLAGS) -- *.c)
+
+# DO NOT DELETE
+
+bfc.o: htab.h kmer.h internal.h fml.h kvec.h ksort.h
+bseq.o: fml.h kseq.h
+bubble.o: mag.h kstring.h fml.h kvec.h ksw.h internal.h khash.h
+example.o: fml.h
+htab.o: htab.h kmer.h khash.h
+ksw.o: ksw.h
+mag.o: mag.h kstring.h fml.h kvec.h internal.h kseq.h khash.h ksort.h
+misc.o: internal.h fml.h kstring.h rle.h mrope.h rope.h rld0.h mag.h kvec.h
+misc.o: htab.h kmer.h khash.h
+mrope.o: mrope.h rope.h
+rld0.o: rld0.h
+rle.o: rle.h
+rope.o: rle.h rope.h
+unitig.o: kvec.h kstring.h rld0.h mag.h fml.h internal.h ksort.h
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..6dc7ff9
--- /dev/null
+++ b/README.md
@@ -0,0 +1,96 @@
+## Getting Started
+```sh
+git clone https://github.com/lh3/fermi-lite
+cd fermi-lite && make
+./fml-asm test/MT-simu.fq.gz > MT.fq
+# to compile your program:
+gcc -Wall -O2 prog.c -o prog -L/path/to/fermi-lite -lfml -lz -lm -lpthread
+```
+
+## Introduction
+
+Fermi-lite is a standalone C library as well as a command-line tool for
+assembling Illumina short reads in regions from 100bp to 10 million bp in size.
+It is largely a light-weight in-memory version of [fermikit][fk] without
+generating any intermediate files. It inherits the performance, the relatively
+small memory footprint and the features of fermikit. In particular, fermi-lite
+is able to retain heterozygous events and thus can be used to assemble diploid
+regions for the purpose of variant calling. It is one of the limited choices
+for local re-assembly and arguably the easiest to interface.
+
+If you use fermi-lite in your work, please cite the FermiKit paper:
+
+> Li H (2015) FermiKit: assembly-based variant calling for Illumina
+> resequencing data, *Bioinformatics*, **31**:3694-6.
+
+## Usage
+
+For now, see [example.c][example] for the basic use of the library. Here is a
+sketch of the example:
+```cpp
+#include <stdio.h>                      // for printf()
+#include "fml.h"                        // only one header file required
+
+int main(int argc, char *argv[])
+{
+	int i, n_seqs, n_utgs;
+	bseq1_t *seqs;                      // array of input sequences
+	fml_utg_t *utgs;                    // array of output unitigs
+	fml_opt_t opt;
+	if (argc == 1) return 1;            // do nothing if there is no input file
+	seqs = bseq_read(argv[1], &n_seqs); // or fill the array with callers' functions
+	fml_opt_init(&opt);                 // initialize parameters
+	utgs = fml_assemble(&opt, n_seqs, seqs, &n_utgs); // assemble!
+	for (i = 0; i < n_utgs; ++i)        // output in fasta
+		printf(">%d\n%s\n", i+1, utgs[i].seq);
+	fml_utg_destroy(n_utgs, utgs);      // deallocate unitigs
+	return 0;
+}
+```
+The `fml_assemble()` output is in fact a graph. You may have a look at the
+`fml_utg_print_gfa()` function in [misc.c][misc] about how to derive a
+[GFA][gfa] representation from an array of `fml_utg_t` objects.
+
+## Overview of the Assembly Algorithm
+
+Fermi-lite is an overlap-based assembler. Given a set of input reads, it counts
+*k*-mers, estimates the *k*-mer coverage, sets a threshold on *k*-mer
+occurrences to determine solid *k*-mers and then use them correct sequencing
+errors ([Li, 2015][bfc-paper]). After error correction, fermi-lite trims a read
+at an *l*-mer unique to the read. It then constructs an FM-index for trimmed
+reads ([Li, 2014][rb2-paper]) and builds a transitively reduced overlap graph from the
+FM-index ([Simpson and Durbin, 2010][sga-paper]; [Li, 2012][fm1-paper]),
+requiring at least *l*-bp overlaps. In this graph, fermi-lite trims tips and
+pops bubbles caused by uncorrected errors. If a sequence in the graph has
+multiple overlaps, fermi-lite discards overlaps significantly shorter than the
+longest overlap -- this is a technique applied to overlap graph only. The graph
+after these procedure is the final output. Sequences in this graph are unitigs.
+
+## Limitations
+
+1. Fermi-lite can efficiently assemble bacterial genomes. However, it has not
+   been carefully tuned for this type of assembly. While on a few GAGE-B data
+   sets fermi-lite appears to work well, it may not compete with recent
+   mainstream assemblers in general.
+
+2. Fermi-lite does not work with genomes more than tens of megabases as a
+   whole. It would take too much memory to stage all data in memory. For large
+   genomes, please use [fermikit][fk] instead.
+
+3. This is the first iteration of fermi-lite. It is still immarture. In
+   particular, I hope fermi-lite can be smart enough to automatically figure
+   out various parameters based on input, which is very challenging given the
+   high variability of input data.
+
+[sga-paper]: http://www.ncbi.nlm.nih.gov/pubmed/20529929
+[bfc-paper]: http://www.ncbi.nlm.nih.gov/pubmed/25953801
+[rb2-paper]: http://www.ncbi.nlm.nih.gov/pubmed/25107872
+[fm1-paper]: http://www.ncbi.nlm.nih.gov/pubmed/22569178
+[bfc]: http://github.com/lh3/bfc
+[rb2]: http://github.com/lh3/ropebwt2
+[fm2]: http://github.com/lh3/fermi2
+[fk]: http://github.com/lh3/fermikit
+[example]: https://github.com/lh3/fermi-lite/blob/master/example.c
+[header]: https://github.com/lh3/fermi-lite/blob/master/fml.h
+[misc]: https://github.com/lh3/fermi-lite/blob/master/misc.c
+[gfa]: https://github.com/pmelsted/GFA-spec
diff --git a/bfc.c b/bfc.c
new file mode 100644
index 0000000..27ccfde
--- /dev/null
+++ b/bfc.c
@@ -0,0 +1,674 @@
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <limits.h>
+#include <stdio.h>
+#include "htab.h"
+#include "kmer.h"
+#include "internal.h"
+#include "fml.h"
+
+/*******************
+ *** BFC options ***
+ *******************/
+
+typedef struct {
+	int n_threads, q, k, l_pre;
+	int min_cov; // a k-mer is considered solid if the count is no less than this
+
+	int max_end_ext;
+	int win_multi_ec;
+	float min_trim_frac;
+
+	// these ec options cannot be changed on the command line
+	int w_ec, w_ec_high, w_absent, w_absent_high;
+	int max_path_diff, max_heap;
+} bfc_opt_t;
+
+void bfc_opt_init(bfc_opt_t *opt)
+{
+	memset(opt, 0, sizeof(bfc_opt_t));
+	opt->n_threads = 1;
+	opt->q = 20;
+	opt->k = -1;
+	opt->l_pre = -1;
+
+	opt->min_cov = 4; // in BFC, this defaults to 3 because it has Bloom pre-filter
+	opt->win_multi_ec = 10;
+	opt->max_end_ext = 5;
+	opt->min_trim_frac = .8;
+
+	opt->w_ec = 1;
+	opt->w_ec_high = 7;
+	opt->w_absent = 3;
+	opt->w_absent_high = 1;
+	opt->max_path_diff = 15;
+	opt->max_heap = 100;
+}
+
+/**********************
+ *** K-mer counting ***
+ **********************/
+
+#define CNT_BUF_SIZE 256
+
+typedef struct { // cache to reduce locking
+	uint64_t y[2];
+	int is_high;
+} insbuf_t;
+
+typedef struct {
+	int k, q;
+	int n_seqs;
+	const bseq1_t *seqs;
+	bfc_ch_t *ch;
+	int *n_buf;
+	insbuf_t **buf;
+} cnt_step_t;
+
+bfc_kmer_t bfc_kmer_null = {{0,0,0,0}};
+
+static int bfc_kmer_bufclear(cnt_step_t *cs, int forced, int tid)
+{
+	int i, k, r;
+	if (cs->ch == 0) return 0;
+	for (i = k = 0; i < cs->n_buf[tid]; ++i) {
+		r = bfc_ch_insert(cs->ch, cs->buf[tid][i].y, cs->buf[tid][i].is_high, forced);
+		if (r < 0) cs->buf[tid][k++] = cs->buf[tid][i];
+	}
+	cs->n_buf[tid] = k;
+	return k;
+}
+
+static void bfc_kmer_insert(cnt_step_t *cs, const bfc_kmer_t *x, int is_high, int tid)
+{
+	int k = cs->k;
+	uint64_t y[2], hash;
+	hash = bfc_kmer_hash(k, x->x, y);
+	if (bfc_ch_insert(cs->ch, y, is_high, 0) < 0) {
+		insbuf_t *p;
+		if (bfc_kmer_bufclear(cs, 0, tid) == CNT_BUF_SIZE)
+			bfc_kmer_bufclear(cs, 1, tid);
+		p = &cs->buf[tid][cs->n_buf[tid]++];
+		p->y[0] = y[0], p->y[1] = y[1], p->is_high = is_high;
+	}
+}
+
+static void worker_count(void *_data, long k, int tid)
+{
+	cnt_step_t *cs = (cnt_step_t*)_data;
+	const bseq1_t *s = &cs->seqs[k];
+	int i, l;
+	bfc_kmer_t x = bfc_kmer_null;
+	uint64_t qmer = 0, mask = (1ULL<<cs->k) - 1;
+	for (i = l = 0; i < s->l_seq; ++i) {
+		int c = seq_nt6_table[(uint8_t)s->seq[i]] - 1;
+		if (c < 4) {
+			bfc_kmer_append(cs->k, x.x, c);
+			qmer = (qmer<<1 | (s->qual == 0 || s->qual[i] - 33 >= cs->q)) & mask;
+			if (++l >= cs->k) bfc_kmer_insert(cs, &x, (qmer == mask), tid);
+		} else l = 0, qmer = 0, x = bfc_kmer_null;
+	}
+}
+
+struct bfc_ch_s *fml_count(int n, const bseq1_t *seq, int k, int q, int l_pre, int n_threads)
+{
+	int i;
+	cnt_step_t cs;
+	cs.n_seqs = n, cs.seqs = seq, cs.k = k, cs.q = q;
+	cs.ch = bfc_ch_init(cs.k, l_pre);
+	cs.n_buf = calloc(n_threads, sizeof(int));
+	cs.buf = calloc(n_threads, sizeof(void*));
+	for (i = 0; i < n_threads; ++i)
+		cs.buf[i] = malloc(CNT_BUF_SIZE * sizeof(insbuf_t));
+	kt_for(n_threads, worker_count, &cs, cs.n_seqs);
+	for (i = 0; i < n_threads; ++i) free(cs.buf[i]);
+	free(cs.buf); free(cs.n_buf);
+	return cs.ch;
+}
+
+/***************
+ *** Correct ***
+ ***************/
+
+#define BFC_MAX_KMER     63
+#define BFC_MAX_BF_SHIFT 37
+
+#define BFC_MAX_PATHS 4
+#define BFC_EC_HIST 5
+#define BFC_EC_HIST_HIGH 2
+
+#define BFC_EC_MIN_COV_COEF .1
+
+/**************************
+ * Sequence struct for ec *
+ **************************/
+
+#include "kvec.h"
+
+typedef struct { // NOTE: unaligned memory
+	uint8_t b:3, q:1, ob:3, oq:1;
+	uint8_t dummy;
+	uint16_t lcov:6, hcov:6, solid_end:1, high_end:1, ec:1, absent:1;
+	int i;
+} ecbase_t;
+
+typedef kvec_t(ecbase_t) ecseq_t;
+
+static int bfc_seq_conv(const char *s, const char *q, int qthres, ecseq_t *seq)
+{
+	int i, l;
+	l = strlen(s);
+	kv_resize(ecbase_t, *seq, l);
+	seq->n = l;
+	for (i = 0; i < l; ++i) {
+		ecbase_t *c = &seq->a[i];
+		c->b = c->ob = seq_nt6_table[(int)s[i]] - 1;
+		c->q = c->oq = !q? 1 : q[i] - 33 >= qthres? 1 : 0;
+		if (c->b > 3) c->q = c->oq = 0;
+		c->i = i;
+	}
+	return l;
+}
+
+static inline ecbase_t ecbase_comp(const ecbase_t *b)
+{
+	ecbase_t r = *b;
+	r.b = b->b < 4? 3 - b->b : 4;
+	r.ob = b->ob < 4? 3 - b->ob : 4;
+	return r;
+}
+
+static void bfc_seq_revcomp(ecseq_t *seq)
+{
+	int i;
+	for (i = 0; i < seq->n>>1; ++i) {
+		ecbase_t tmp;
+		tmp = ecbase_comp(&seq->a[i]);
+		seq->a[i] = ecbase_comp(&seq->a[seq->n - 1 - i]);
+		seq->a[seq->n - 1 - i] = tmp;
+	}
+	if (seq->n&1) seq->a[i] = ecbase_comp(&seq->a[i]);
+}
+
+/***************************
+ * Independent ec routines *
+ ***************************/
+
+int bfc_ec_greedy_k(int k, int mode, const bfc_kmer_t *x, const bfc_ch_t *ch)
+{
+	int i, j, max = 0, max_ec = -1, max2 = 0;
+	for (i = 0; i < k; ++i) {
+		int c = (x->x[1]>>i&1)<<1 | (x->x[0]>>i&1);
+		for (j = 0; j < 4; ++j) {
+			bfc_kmer_t y = *x;
+			int ret;
+			if (j == c) continue;
+			bfc_kmer_change(k, y.x, i, j);
+			ret = bfc_ch_kmer_occ(ch, &y);
+			if (ret < 0) continue;
+			if ((max&0xff) < (ret&0xff)) max2 = max, max = ret, max_ec = i<<2 | j;
+			else if ((max2&0xff) < (ret&0xff)) max2 = ret;
+		}
+	}
+	return (max&0xff) * 3 > mode && (max2&0xff) < 3? max_ec : -1;
+}
+
+int bfc_ec_first_kmer(int k, const ecseq_t *s, int start, bfc_kmer_t *x)
+{
+	int i, l;
+	*x = bfc_kmer_null;
+	for (i = start, l = 0; i < s->n; ++i) {
+		ecbase_t *c = &s->a[i];
+		if (c->b < 4) {
+			bfc_kmer_append(k, x->x, c->b);
+			if (++l == k) break;
+		} else l = 0, *x = bfc_kmer_null;
+	}
+	return i;
+}
+
+void bfc_ec_kcov(int k, int min_occ, ecseq_t *s, const bfc_ch_t *ch)
+{
+	int i, l, r, j;
+	bfc_kmer_t x = bfc_kmer_null;
+	for (i = l = 0; i < s->n; ++i) {
+		ecbase_t *c = &s->a[i];
+		c->high_end = c->solid_end = c->lcov = c->hcov = 0;
+		if (c->b < 4) {
+			bfc_kmer_append(k, x.x, c->b);
+			if (++l >= k) {
+				if ((r = bfc_ch_kmer_occ(ch, &x)) >= 0) {
+					if ((r>>8&0x3f) >= min_occ+1) c->high_end = 1;
+					if ((r&0xff) >= min_occ) {
+						c->solid_end = 1;
+						for (j = i - k + 1; j <= i; ++j)
+							++s->a[j].lcov, s->a[j].hcov += c->high_end;
+					}
+				}
+			}
+		} else l = 0, x = bfc_kmer_null;
+	}
+}
+
+uint64_t bfc_ec_best_island(int k, const ecseq_t *s)
+{ // IMPORTANT: call bfc_ec_kcov() before calling this function!
+	int i, l, max, max_i;
+	for (i = k - 1, max = l = 0, max_i = -1; i < s->n; ++i) {
+		if (!s->a[i].solid_end) {
+			if (l > max) max = l, max_i = i;
+			l = 0;
+		} else ++l;
+	}
+	if (l > max) max = l, max_i = i;
+	return max > 0? (uint64_t)(max_i - max - k + 1) << 32 | max_i : 0;
+}
+
+/********************
+ * Correct one read *
+ ********************/
+
+#include "ksort.h"
+
+#define ECCODE_MISC      1
+#define ECCODE_MANY_N    2
+#define ECCODE_NO_SOLID  3
+#define ECCODE_UNCORR_N  4
+#define ECCODE_MANY_FAIL 5
+
+typedef struct {
+	uint32_t ec_code:3, brute:1, n_ec:14, n_ec_high:14;
+	uint32_t n_absent:24, max_heap:8;
+} ecstat_t;
+
+typedef struct {
+	uint8_t ec:1, ec_high:1, absent:1, absent_high:1, b:4;
+} bfc_penalty_t;
+
+typedef struct {
+	int tot_pen;
+	int i; // base position
+	int k; // position in the stack
+	int32_t ecpos_high[BFC_EC_HIST_HIGH];
+	int32_t ecpos[BFC_EC_HIST];
+	bfc_kmer_t x;
+} echeap1_t;
+
+typedef struct {
+	int parent, i, tot_pen;
+	uint8_t b;
+	bfc_penalty_t pen;
+	uint16_t cnt;
+} ecstack1_t;
+
+typedef struct {
+	const bfc_opt_t *opt;
+	const bfc_ch_t *ch;
+	kvec_t(echeap1_t) heap;
+	kvec_t(ecstack1_t) stack;
+	ecseq_t seq, tmp, ec[2];
+	int mode;
+	ecstat_t ori_st;
+} bfc_ec1buf_t;
+
+#define heap_lt(a, b) ((a).tot_pen > (b).tot_pen)
+KSORT_INIT(ec, echeap1_t, heap_lt)
+
+static bfc_ec1buf_t *ec1buf_init(const bfc_opt_t *opt, const bfc_ch_t *ch)
+{
+	bfc_ec1buf_t *e;
+	e = calloc(1, sizeof(bfc_ec1buf_t));
+	e->opt = opt, e->ch = ch;
+	return e;
+}
+
+static void ec1buf_destroy(bfc_ec1buf_t *e)
+{	
+	free(e->heap.a); free(e->stack.a); free(e->seq.a); free(e->tmp.a); free(e->ec[0].a); free(e->ec[1].a);
+	free(e);
+}
+
+#define weighted_penalty(o, p) ((o)->w_ec * (p).ec + (o)->w_ec_high * (p).ec_high + (o)->w_absent * (p).absent + (o)->w_absent_high * (p).absent_high)
+
+static void buf_update(bfc_ec1buf_t *e, const echeap1_t *prev, bfc_penalty_t pen, int cnt)
+{
+	ecstack1_t *q;
+	echeap1_t *r;
+	const bfc_opt_t *o = e->opt;
+	int b = pen.b;
+	// update stack
+	kv_pushp(ecstack1_t, e->stack, &q);
+	q->parent = prev->k;
+	q->i = prev->i;
+	q->b = b;
+	q->pen = pen;
+	q->cnt = cnt > 0? cnt&0xff : 0;
+	q->tot_pen = prev->tot_pen + weighted_penalty(o, pen);
+	// update heap
+	kv_pushp(echeap1_t, e->heap, &r);
+	r->i = prev->i + 1;
+	r->k = e->stack.n - 1;
+	r->x = prev->x;
+	if (pen.ec_high) {
+		memcpy(r->ecpos_high + 1, prev->ecpos_high, (BFC_EC_HIST_HIGH - 1) * 4);
+		r->ecpos_high[0] = prev->i;
+	} else memcpy(r->ecpos_high, prev->ecpos_high, BFC_EC_HIST_HIGH * 4);
+	if (pen.ec) {
+		memcpy(r->ecpos + 1, prev->ecpos, (BFC_EC_HIST - 1) * 4);
+		r->ecpos[0] = prev->i;
+	} else memcpy(r->ecpos, prev->ecpos, BFC_EC_HIST * 4);
+	r->tot_pen = q->tot_pen;
+	bfc_kmer_append(e->opt->k, r->x.x, b);
+	ks_heapup_ec(e->heap.n, e->heap.a);
+}
+
+static int buf_backtrack(ecstack1_t *s, int end, const ecseq_t *seq, ecseq_t *path)
+{
+	int i, n_absent = 0;
+	kv_resize(ecbase_t, *path, seq->n);
+	path->n = seq->n;
+	while (end >= 0) {
+		if ((i = s[end].i) < seq->n) {
+			path->a[i].b = s[end].b;
+			path->a[i].ec = s[end].pen.ec;
+			path->a[i].absent = s[end].pen.absent;
+			n_absent += s[end].pen.absent;
+		}
+		end = s[end].parent;
+	}
+	return n_absent;
+}
+
+static int bfc_ec1dir(bfc_ec1buf_t *e, const ecseq_t *seq, ecseq_t *ec, int start, int end, int *max_heap)
+{
+	echeap1_t z;
+	int i, l, rv = -1, path[BFC_MAX_PATHS], n_paths = 0, min_path = -1, min_path_pen = INT_MAX, n_failures = 0;
+	assert(end <= seq->n && end - start >= e->opt->k);
+	e->heap.n = e->stack.n = 0;
+	*max_heap = 0;
+	memset(&z, 0, sizeof(echeap1_t));
+	kv_resize(ecbase_t, *ec, seq->n);
+	ec->n = seq->n;
+	for (z.i = start, l = 0; z.i < end; ++z.i) {
+		int c = seq->a[z.i].b;
+		if (c < 4) {
+			if (++l == e->opt->k) break;
+			bfc_kmer_append(e->opt->k, z.x.x, c);
+		} else l = 0, z.x = bfc_kmer_null;
+	}
+	assert(z.i < end); // before calling this function, there must be at least one solid k-mer
+	z.k = -1;
+	for (i = 0; i < BFC_EC_HIST; ++i) z.ecpos[i] = -1;
+	for (i = 0; i < BFC_EC_HIST_HIGH; ++i) z.ecpos_high[i] = -1;
+	kv_push(echeap1_t, e->heap, z);
+	for (i = 0; i < seq->n; ++i) ec->a[i].b = seq->a[i].b, ec->a[i].ob = seq->a[i].ob;
+	// exhaustive error correction
+	while (1) {
+		int stop = 0;
+		*max_heap = *max_heap > 255? 255 : *max_heap > e->heap.n? *max_heap : e->heap.n;
+		if (e->heap.n == 0) { // may happen when there is an uncorrectable "N"
+			rv = -2;
+			break;
+		}
+		z = e->heap.a[0];
+		e->heap.a[0] = kv_pop(e->heap);
+		ks_heapdown_ec(0, e->heap.n, e->heap.a);
+		if (min_path >= 0 && z.tot_pen > min_path_pen + e->opt->max_path_diff) break;
+		if (z.i - end > e->opt->max_end_ext) stop = 1;
+		if (!stop) {
+			ecbase_t *c = z.i < seq->n? &seq->a[z.i] : 0;
+			int b, os = -1, fixed = 0, other_ext = 0, n_added = 0, added_cnt[4];
+			bfc_penalty_t added[4];
+			// test if the read extension alone is enough
+			if (z.i > end) fixed = 1;
+			if (c && c->b < 4) { // A, C, G or T
+				bfc_kmer_t x = z.x;
+				bfc_kmer_append(e->opt->k, x.x, c->b);
+				os = bfc_ch_kmer_occ(e->ch, &x);
+				if (c->q && (os&0xff) >= e->opt->min_cov + 1 && c->lcov >= e->opt->min_cov + 1) fixed = 1;
+				else if (c->hcov > e->opt->k * .75) fixed = 1;
+			}
+			// extension
+			for (b = 0; b < 4; ++b) {
+				bfc_penalty_t pen;
+				if (fixed && c && b != c->b) continue;
+				if (c == 0 || b != c->b) {
+					int s;
+					bfc_kmer_t x = z.x;
+					pen.ec = 0, pen.ec_high = 0, pen.absent = 0, pen.absent_high = 0, pen.b = b;
+					if (c) { // not over the end
+						if (c->q && z.ecpos_high[BFC_EC_HIST_HIGH-1] >= 0 && z.i - z.ecpos_high[BFC_EC_HIST_HIGH-1] < e->opt->win_multi_ec) continue; // no close highQ corrections
+						if (z.ecpos[BFC_EC_HIST-1] >= 0 && z.i - z.ecpos[BFC_EC_HIST-1] < e->opt->win_multi_ec) continue; // no clustered corrections
+					}
+					bfc_kmer_append(e->opt->k, x.x, b);
+					s = bfc_ch_kmer_occ(e->ch, &x);
+					if (s < 0 || (s&0xff) < e->opt->min_cov) continue; // not solid
+					//if (os >= 0 && (s&0xff) - (os&0xff) < 2) continue; // not sufficiently better than the read path
+					pen.ec = c && c->b < 4? 1 : 0;
+					pen.ec_high = pen.ec? c->oq : 0;
+					pen.absent = 0;
+					pen.absent_high = ((s>>8&0xff) < e->opt->min_cov);
+					pen.b = b;
+					added_cnt[n_added] = s;
+					added[n_added++] = pen;
+					++other_ext;
+				} else {
+					pen.ec = pen.ec_high = 0;
+					pen.absent = (os < 0 || (os&0xff) < e->opt->min_cov);
+					pen.absent_high = (os < 0 || (os>>8&0xff) < e->opt->min_cov);
+					pen.b = b;
+					added_cnt[n_added] = os;
+					added[n_added++] = pen;
+				}
+			} // ~for(b)
+			if (fixed == 0 && other_ext == 0) ++n_failures;
+			if (n_failures > seq->n * 2) {
+				rv = -3;
+				break;
+			}
+			if (c || n_added == 1) {
+				if (n_added > 1 && e->heap.n > e->opt->max_heap) { // to prevent heap explosion
+					int min_b = -1, min = INT_MAX;
+					for (b = 0; b < n_added; ++b) {
+						int t = weighted_penalty(e->opt, added[b]);
+						if (min > t) min = t, min_b = b;
+					}
+					buf_update(e, &z, added[min_b], added_cnt[min_b]);
+				} else {
+					for (b = 0; b < n_added; ++b)
+						buf_update(e, &z, added[b], added_cnt[b]);
+				}
+			} else {
+				if (n_added == 0)
+					e->stack.a[z.k].tot_pen += e->opt->w_absent * (e->opt->max_end_ext - (z.i - end));
+				stop = 1;
+			}
+		} // ~if(!stop)
+		if (stop) {
+			if (e->stack.a[z.k].tot_pen < min_path_pen)
+				min_path_pen = e->stack.a[z.k].tot_pen, min_path = n_paths;
+			path[n_paths++] = z.k;
+			if (n_paths == BFC_MAX_PATHS) break;
+		}
+	} // ~while(1)
+	// backtrack
+	if (n_paths == 0) return rv;
+	assert(min_path >= 0 && min_path < n_paths && e->stack.a[path[min_path]].tot_pen == min_path_pen);
+	rv = buf_backtrack(e->stack.a, path[min_path], seq, ec);
+	for (i = 0; i < ec->n; ++i) // mask out uncorrected regions
+		if (i < start + e->opt->k || i >= end) ec->a[i].b = 4;
+	return rv;
+}
+
+ecstat_t bfc_ec1(bfc_ec1buf_t *e, char *seq, char *qual)
+{
+	int i, start = 0, end = 0, n_n = 0, rv[2], max_heap[2];
+	uint64_t r;
+	ecstat_t s;
+
+	s.ec_code = ECCODE_MISC, s.brute = 0, s.n_ec = s.n_ec_high = 0, s.n_absent = s.max_heap = 0;
+	bfc_seq_conv(seq, qual, e->opt->q, &e->seq);
+	for (i = 0; i < e->seq.n; ++i)
+		if (e->seq.a[i].ob > 3) ++n_n;
+	if (n_n > e->seq.n * .05) {
+		s.ec_code = ECCODE_MANY_N;
+		return s;
+	}
+	bfc_ec_kcov(e->opt->k, e->opt->min_cov, &e->seq, e->ch);
+	r = bfc_ec_best_island(e->opt->k, &e->seq);
+	if (r == 0) { // no solid k-mer
+		bfc_kmer_t x;
+		int ec = -1;
+		while ((end = bfc_ec_first_kmer(e->opt->k, &e->seq, start, &x)) < e->seq.n) {
+			ec = bfc_ec_greedy_k(e->opt->k, e->mode, &x, e->ch);
+			if (ec >= 0) break;
+			if (end + (e->opt->k>>1) >= e->seq.n) break;
+			start = end - (e->opt->k>>1);
+		}
+		if (ec >= 0) {
+			e->seq.a[end - (ec>>2)].b = ec&3;
+			++end; start = end - e->opt->k;
+			s.brute = 1;
+		} else {
+			s.ec_code = ECCODE_NO_SOLID;
+			return s;
+		}
+	} else start = r>>32, end = (uint32_t)r;
+	if ((rv[0] = bfc_ec1dir(e, &e->seq, &e->ec[0], start, e->seq.n, &max_heap[0])) < 0) {
+		s.ec_code = rv[0] == -2? ECCODE_UNCORR_N : rv[0] == -3? ECCODE_MANY_FAIL : ECCODE_MISC;
+		return s;
+	}
+	bfc_seq_revcomp(&e->seq);
+	if ((rv[1] = bfc_ec1dir(e, &e->seq, &e->ec[1], e->seq.n - end, e->seq.n, &max_heap[1])) < 0) {
+		s.ec_code = rv[1] == -2? ECCODE_UNCORR_N : rv[1] == -3? ECCODE_MANY_FAIL : ECCODE_MISC;
+		return s;
+	}
+	s.max_heap = max_heap[0] > max_heap[1]? max_heap[0] : max_heap[1];
+	s.ec_code = 0, s.n_absent = rv[0] + rv[1];
+	bfc_seq_revcomp(&e->ec[1]);
+	bfc_seq_revcomp(&e->seq);
+	for (i = 0; i < e->seq.n; ++i) {
+		ecbase_t *c = &e->seq.a[i];
+		if (e->ec[0].a[i].b == e->ec[1].a[i].b)
+			c->b = e->ec[0].a[i].b > 3? e->seq.a[i].b : e->ec[0].a[i].b;
+		else if (e->ec[1].a[i].b > 3) c->b = e->ec[0].a[i].b;
+		else if (e->ec[0].a[i].b > 3) c->b = e->ec[1].a[i].b;
+		else c->b = e->seq.a[i].ob;
+	}
+	for (i = 0; i < e->seq.n; ++i) {
+		int is_diff = !(e->seq.a[i].b == e->seq.a[i].ob);
+		if (is_diff) {
+			++s.n_ec;
+			if (e->seq.a[i].q) ++s.n_ec_high;
+		}
+		seq[i] = (is_diff? "acgtn" : "ACGTN")[e->seq.a[i].b];
+		if (qual) qual[i] = is_diff? 34 + e->seq.a[i].ob : "+?"[e->seq.a[i].q];
+	}
+	return s;
+}
+
+/********************
+ * Error correction *
+ ********************/
+
+typedef struct {
+	const bfc_opt_t *opt;
+	const bfc_ch_t *ch;
+	bfc_ec1buf_t **e;
+	int64_t n_processed;
+	int n_seqs, flt_uniq;
+	bseq1_t *seqs;
+} ec_step_t;
+
+static uint64_t max_streak(int k, const bfc_ch_t *ch, const bseq1_t *s)
+{
+	int i, l;
+	uint64_t max = 0, t = 0;
+	bfc_kmer_t x = bfc_kmer_null;
+	for (i = l = 0; i < s->l_seq; ++i) {
+		int c = seq_nt6_table[(uint8_t)s->seq[i]] - 1;
+		if (c < 4) { // not an ambiguous base
+			bfc_kmer_append(k, x.x, c);
+			if (++l >= k) { // ok, we have a k-mer now
+				if (bfc_ch_kmer_occ(ch, &x) > 0) t += 1ULL<<32;
+				else t = i + 1;
+			} else t = i + 1;
+		} else l = 0, x = bfc_kmer_null, t = i + 1;
+		max = max > t? max : t;
+	}
+	return max;
+}
+
+static void worker_ec(void *_data, long k, int tid)
+{
+	ec_step_t *es = (ec_step_t*)_data;
+	bseq1_t *s = &es->seqs[k];
+	if (es->flt_uniq) {
+		uint64_t max;
+		max = max_streak(es->opt->k, es->ch, s);
+		if (max>>32 && (double)((max>>32) + es->opt->k - 1) / s->l_seq > es->opt->min_trim_frac) {
+			int start = (uint32_t)max, end = start + (max>>32);
+			start -= es->opt->k - 1;
+			assert(start >= 0 && end <= s->l_seq);
+			memmove(s->seq, s->seq + start, end - start);
+			s->l_seq = end - start;
+			s->seq[s->l_seq] = 0;
+			if (s->qual) {
+				memmove(s->qual, s->qual + start, s->l_seq);
+				s->qual[s->l_seq] = 0;
+			}
+		} else {
+			free(s->seq); free(s->qual);
+			s->l_seq = 0, s->seq = s->qual = 0;
+		}
+	} else bfc_ec1(es->e[tid], s->seq, s->qual);
+}
+
+float fml_correct_core(const fml_opt_t *opt, int flt_uniq, int n, bseq1_t *seq)
+{
+	bfc_ch_t *ch;
+	int i, mode;
+	uint64_t hist[256], hist_high[64], tot_len = 0, sum_k = 0, tot_k = 0;
+	ec_step_t es;
+	bfc_opt_t bfc_opt;
+	float kcov;
+
+	// initialize BFC options
+	bfc_opt_init(&bfc_opt);
+	bfc_opt.n_threads = opt->n_threads; // copy from FML options
+	bfc_opt.k = flt_uniq? opt->min_asm_ovlp : opt->ec_k;
+	for (i = 0; i < n; ++i) tot_len += seq[i].l_seq; // compute total length
+	bfc_opt.l_pre = tot_len - 8 < 20? tot_len - 8 : 20;
+
+	memset(&es, 0, sizeof(ec_step_t));
+	es.opt = &bfc_opt, es.n_seqs = n, es.seqs = seq, es.flt_uniq = flt_uniq;
+
+	es.ch = ch = fml_count(n, seq, bfc_opt.k, bfc_opt.q, bfc_opt.l_pre, bfc_opt.n_threads);
+	mode = bfc_ch_hist(ch, hist, hist_high);
+	for (i = opt->min_cnt; i < 256; ++i)
+		sum_k += hist[i], tot_k += i * hist[i];
+	kcov = (float)tot_k / sum_k;
+	bfc_opt.min_cov = (int)(BFC_EC_MIN_COV_COEF * kcov + .499);
+	bfc_opt.min_cov = bfc_opt.min_cov < opt->max_cnt? bfc_opt.min_cov : opt->max_cnt;
+	bfc_opt.min_cov = bfc_opt.min_cov > opt->min_cnt? bfc_opt.min_cov : opt->min_cnt;
+
+	es.e = calloc(es.opt->n_threads, sizeof(void*));
+	for (i = 0; i < es.opt->n_threads; ++i)
+		es.e[i] = ec1buf_init(es.opt, ch), es.e[i]->mode = mode;
+	kt_for(es.opt->n_threads, worker_ec, &es, es.n_seqs);
+	for (i = 0; i < es.opt->n_threads; ++i)
+		ec1buf_destroy(es.e[i]);
+	free(es.e);
+	bfc_ch_destroy(ch);
+	return kcov;
+}
+
+float fml_correct(const fml_opt_t *opt, int n, bseq1_t *seq)
+{
+	return fml_correct_core(opt, 0, n, seq);
+}
+
+float fml_fltuniq(const fml_opt_t *opt, int n, bseq1_t *seq)
+{
+	return fml_correct_core(opt, 1, n, seq);
+}
diff --git a/bseq.c b/bseq.c
new file mode 100644
index 0000000..719bafb
--- /dev/null
+++ b/bseq.c
@@ -0,0 +1,61 @@
+#include <zlib.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "fml.h"
+#include "kseq.h"
+KSEQ_INIT(gzFile, gzread)
+
+bseq1_t *bseq_read(const char *fn, int *n_)
+{
+	gzFile fp;
+	bseq1_t *seqs;
+	kseq_t *ks;
+	int m, n;
+	uint64_t size = 0;
+
+	*n_ = 0;
+	fp = fn && strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
+	if (fp == 0) return 0;
+	ks = kseq_init(fp);
+
+	m = n = 0; seqs = 0;
+	while (kseq_read(ks) >= 0) {
+		bseq1_t *s;
+		if (n >= m) {
+			m = m? m<<1 : 256;
+			seqs = realloc(seqs, m * sizeof(bseq1_t));
+		}
+		s = &seqs[n];
+		s->seq = strdup(ks->seq.s);
+		s->qual = ks->qual.l? strdup(ks->qual.s) : 0;
+		s->l_seq = ks->seq.l;
+		size += seqs[n++].l_seq;
+	}
+	*n_ = n;
+
+	kseq_destroy(ks);
+	gzclose(fp);
+	return seqs;
+}
+
+void seq_reverse(int l, unsigned char *s)
+{
+	int i;
+	for (i = 0; i < l>>1; ++i) {
+		int tmp = s[l-1-i];
+		s[l-1-i] = s[i]; s[i] = tmp;
+	}
+}
+
+void seq_revcomp6(int l, unsigned char *s)
+{
+	int i;
+	for (i = 0; i < l>>1; ++i) {
+		int tmp = s[l-1-i];
+		tmp = (tmp >= 1 && tmp <= 4)? 5 - tmp : tmp;
+		s[l-1-i] = (s[i] >= 1 && s[i] <= 4)? 5 - s[i] : s[i];
+		s[i] = tmp;
+	}
+	if (l&1) s[i] = (s[i] >= 1 && s[i] <= 4)? 5 - s[i] : s[i];
+}
diff --git a/bubble.c b/bubble.c
new file mode 100644
index 0000000..cbe54be
--- /dev/null
+++ b/bubble.c
@@ -0,0 +1,366 @@
+#include <limits.h>
+#include <stdio.h>
+#include "mag.h"
+#include "kvec.h"
+#include "ksw.h"
+#include "internal.h"
+#include "khash.h"
+KHASH_DECLARE(64, uint64_t, uint64_t)
+
+typedef khash_t(64) hash64_t;
+
+#define MAX_N_DIFF 2.01 // for evaluating alignment after SW
+#define MAX_R_DIFF 0.1
+#define L_DIFF_COEF 0.2 // n_diff=|l_0 - l_1|*L_DIFF_COEF
+
+#define edge_mark_del(_x) ((_x).x = (uint64_t)-2, (_x).y = 0)
+#define edge_is_del(_x)   ((_x).x == (uint64_t)-2 || (_x).y == 0)
+
+/******************
+ * Closed bubbles *
+ ******************/
+
+typedef struct {
+	uint64_t id;
+	int cnt[2];
+	int n[2][2], d[2][2];
+	uint64_t v[2][2];
+} trinfo_t;
+
+const trinfo_t g_trinull = {-1, {0, 0}, {{INT_MIN, INT_MIN}, {INT_MIN, INT_MIN}}, {{INT_MIN, INT_MIN}, {INT_MIN, INT_MIN}}, {{-1, -1}, {-1, -1}}};
+
+typedef struct {
+	int n, m;
+	trinfo_t **buf;
+} tipool_t;
+
+struct mogb_aux {
+	tipool_t pool;
+	ku64_v stack;
+	hash64_t *h;
+};
+
+mogb_aux_t *mag_b_initaux(void)
+{
+	mogb_aux_t *aux = calloc(1, sizeof(mogb_aux_t));
+	aux->h = kh_init(64);
+	return aux;
+}
+
+void mag_b_destroyaux(mogb_aux_t *b)
+{
+	int i;
+	for (i = 0; i < b->pool.m; ++i)
+		free(b->pool.buf[i]);
+	free(b->pool.buf); free(b->stack.a);
+	kh_destroy(64, b->h);
+	free(b);
+}
+
+#define tiptr(p) ((trinfo_t*)(p)->ptr)
+
+static inline trinfo_t *tip_alloc(tipool_t *pool, uint32_t id)
+{ // allocate an object from the memory pool
+	trinfo_t *p;
+	if (pool->n == pool->m) {
+		int i, new_m = pool->m? pool->m<<1 : 256;
+		pool->buf = realloc(pool->buf, new_m * sizeof(void*));
+		for (i = pool->m; i < new_m; ++i)
+			pool->buf[i] = malloc(sizeof(trinfo_t));
+		pool->m = new_m;
+	}
+	p = pool->buf[pool->n++];
+	*p = g_trinull;
+	p->id = id;
+	return p;
+}
+
+static void backtrace(mag_t *g, uint64_t end, uint64_t start, hash64_t *h)
+{
+	while (end>>32 != start) {
+		int ret;
+		kh_put(64, h, end>>33, &ret);
+		end = tiptr(&g->v.a[end>>33])->v[(end>>32^1)&1][end&1];
+	}
+}
+
+void mag_vh_simplify_bubble(mag_t *g, uint64_t idd, int max_vtx, int max_dist, mogb_aux_t *a)
+{
+	int i, n_pending = 0;
+	magv_t *p, *q;
+
+	p = &g->v.a[idd>>1];
+	if (p->len < 0 || p->nei[idd&1].n < 2) return; // stop if p is deleted or it has 0 or 1 neighbor
+	// reset aux data
+	a->stack.n = a->pool.n = 0;
+	if (kh_n_buckets(a->h) >= 64) {
+		kh_destroy(64, a->h);
+		a->h = kh_init(64);
+	} else kh_clear(64, a->h);
+	// add the initial vertex
+	p->ptr = tip_alloc(&a->pool, idd>>1);
+	tiptr(p)->d[(idd&1)^1][0] = -p->len;
+	tiptr(p)->n[(idd&1)^1][0] = -p->nsr;
+	kv_push(uint64_t, a->stack, idd^1);
+	// essentially a topological sorting
+	while (a->stack.n) {
+		uint64_t x, y;
+		ku128_v *r;
+		if (a->stack.n == 1 && a->stack.a[0] != (idd^1) && n_pending == 0) break; // found the other end of the bubble
+		x = kv_pop(a->stack);
+		p = &g->v.a[x>>1];
+		//printf("%lld:%lld\n", p->k[0], p->k[1]);
+		r = &p->nei[(x&1)^1]; // we will look the the neighbors from the other end of the unitig
+		if (a->pool.n > max_vtx || tiptr(p)->d[x&1][0] > max_dist || tiptr(p)->d[x&1][1] > max_dist || r->n == 0) break; // we failed
+		// set the distance to p's neighbors
+		for (i = 0; i < r->n; ++i) {
+			int nsr, dist, which;
+			if ((int64_t)r->a[i].x < 0) continue;
+			y = mag_tid2idd(g->h, r->a[i].x);
+			if (y == (idd^1)) { // there is a loop involving the initial vertex
+				a->stack.n = 0;
+				break; // not a bubble; stop; this will jump out of the while() loop
+			}
+			q = &g->v.a[y>>1];
+			if (q->ptr == 0) { // has not been attempted
+				q->ptr = tip_alloc(&a->pool, y>>1), ++n_pending;
+				mag_v128_clean(&q->nei[y&1]); // make sure there are no deleted edges
+			}
+			nsr  = tiptr(p)->n[x&1][0] + p->nsr; which = 0;
+			dist = tiptr(p)->d[x&1][0] + p->len - r->a[i].y;
+			//printf("01 [%d]\t[%d,%d]\t[%d,%d]\n", i, tiptr(q)->n[y&1][0], tiptr(q)->n[y&1][1], tiptr(q)->d[y&1][0], tiptr(q)->d[y&1][1]);
+			// test and possibly update the tentative distance
+			if (nsr > tiptr(q)->n[y&1][0]) { // then move the best to the 2nd best and update the best
+				tiptr(q)->n[y&1][1] = tiptr(q)->n[y&1][0]; tiptr(q)->n[y&1][0] = nsr;
+				tiptr(q)->v[y&1][1] = tiptr(q)->v[y&1][0]; tiptr(q)->v[y&1][0] = (x^1)<<32|i<<1|which;
+				tiptr(q)->d[y&1][1] = tiptr(q)->d[y&1][0]; tiptr(q)->d[y&1][0] = dist;
+				nsr  = tiptr(p)->n[x&1][1] + p->nsr; which = 1; // now nsr is the 2nd best
+				dist = tiptr(p)->d[x&1][1] + p->len - r->a[i].y;
+			}
+			if (nsr > tiptr(q)->n[y&1][1]) // update the 2nd best
+				tiptr(q)->n[y&1][1] = nsr, tiptr(q)->v[y&1][1] = (x^1)<<32|i<<1|which, tiptr(q)->d[y&1][1] = dist;
+			if (++tiptr(q)->cnt[y&1] == q->nei[y&1].n) { // all q's predecessors have been processed; then push
+				kv_push(uint64_t, a->stack, y);
+				--n_pending;
+			}
+		}
+	}
+	if (n_pending == 0 && a->stack.n == 1) { // found a bubble
+		uint64_t x = a->stack.a[0];
+		p = &g->v.a[x>>1];
+		//printf("(%d,%d)\t(%d,%d)\n", tiptr(p)->n[x&1][0], tiptr(p)->n[x&1][1], tiptr(p)->d[x&1][0], tiptr(p)->d[x&1][1]);
+		backtrace(g, tiptr(p)->v[x&1][0], idd, a->h);
+		backtrace(g, tiptr(p)->v[x&1][1], idd, a->h);
+	}
+	for (i = 0; i < a->pool.n; ++i) // reset p->ptr
+		g->v.a[a->pool.buf[i]->id].ptr = 0;
+	if (kh_size(a->h)) { // bubble detected; then remove verticies not in the top two paths
+		for (i = 1; i < a->pool.n; ++i) { // i=0 corresponds to the initial vertex which we want to exclude
+			uint64_t id = a->pool.buf[i]->id;
+			if (id != a->stack.a[0]>>1 && kh_get(64, a->h, id) == kh_end(a->h)) // not in the top two paths
+				mag_v_del(g, &g->v.a[id]);
+		}
+	}
+}
+
+void mag_g_simplify_bubble(mag_t *g, int max_vtx, int max_dist)
+{
+	int64_t i;
+	mogb_aux_t *a;
+	a = mag_b_initaux();
+	for (i = 0; i < g->v.n; ++i) {
+		mag_vh_simplify_bubble(g, i<<1|0, max_vtx, max_dist, a);
+		mag_vh_simplify_bubble(g, i<<1|1, max_vtx, max_dist, a);
+	}
+	mag_b_destroyaux(a);
+	mag_g_merge(g, 0, 0);
+}
+
+int mag_vh_pop_simple(mag_t *g, uint64_t idd, float max_cov, float max_frac, int max_bdiff, int aggressive)
+{
+	magv_t *p = &g->v.a[idd>>1], *q[2];
+	ku128_v *r;
+	int i, j, k, dir[2], l[2], ret = -1;
+	char *seq[2], *cov[2];
+	float n_diff, r_diff, avg[2], max_n_diff = aggressive? MAX_N_DIFF * 2. : MAX_N_DIFF;
+
+	if (p->len < 0 || p->nei[idd&1].n != 2) return ret; // deleted or no bubble
+	r = &p->nei[idd&1];
+	for (j = 0; j < 2; ++j) {
+		uint64_t x;
+		if ((int64_t)r->a[j].x < 0) return ret;
+		x = mag_tid2idd(g->h, r->a[j].x);
+		dir[j] = x&1;
+		q[j] = &g->v.a[x>>1];
+		if (q[j]->nei[0].n != 1 || q[j]->nei[1].n != 1) return ret; // no bubble
+		l[j] = q[j]->len - (int)(q[j]->nei[0].a->y + q[j]->nei[1].a->y); // bubble length, excluding overlaps
+	}
+	if (q[0]->nei[dir[0]^1].a->x != q[1]->nei[dir[1]^1].a->x) return ret; // no bubble
+	if (l[0] - l[1] > max_bdiff || l[1] - l[0] > max_bdiff) return 1; // huge bubble differences
+	for (j = 0; j < 2; ++j) { // set seq[] and cov[], and compute avg[]
+		if (l[j] > 0) {
+			seq[j] = malloc(l[j]<<1);
+			cov[j] = seq[j] + l[j];
+			for (i = 0; i < l[j]; ++i) {
+				seq[j][i] = q[j]->seq[i + q[j]->nei[0].a->y];
+				cov[j][i] = q[j]->cov[i + q[j]->nei[0].a->y];
+			}
+			if (dir[j]) {
+				seq_revcomp6(l[j], (uint8_t*)seq[j]);
+				seq_reverse(l[j], (uint8_t*)cov[j]);
+			}
+			for (i = 0, avg[j] = 0.; i < l[j]; ++i) {
+				--seq[j][i]; // change DNA6 encoding to DNA4 for SW below
+				avg[j] += cov[j][i] - 33;
+			}
+			avg[j] /= l[j];
+		} else { // l[j] <= 0; this may happen around a tandem repeat
+			int beg, end;
+			seq[j] = cov[j] = 0;
+			beg = q[j]->nei[0].a->y; end = q[j]->len - q[j]->nei[1].a->y;
+			if (beg > end) beg ^= end, end ^= beg, beg ^= end; // swap
+			if (beg < end) {
+				for (i = beg, avg[j] = 0.; i < end; ++i)
+					avg[j] += q[j]->cov[i] - 33;
+				avg[j] /= end - beg;
+			} else avg[j] = q[j]->cov[beg] - 33; // FIXME: when q[j] is contained, weird thing may happen
+		}
+	}
+	ret = 1;
+	if (l[0] > 0 && l[1] > 0) { // then do SW to compute n_diff and r_diff
+		int8_t mat[16];
+		kswr_t aln;
+		for (i = k = 0; i < 4; ++i)
+			for (j = 0; j < 4; ++j)
+				mat[k++] = i == j? 5 : -4;
+		aln = ksw_align(l[0], (uint8_t*)seq[0], l[1], (uint8_t*)seq[1], 4, mat, 5, 2, 0, 0);
+		n_diff = ((l[0] < l[1]? l[0] : l[1]) * 5. - aln.score) / (5. + 4.); // 5: matching score; -4: mismatchig score
+		r_diff = n_diff / ((l[0] + l[1]) / 2.);
+		//fprintf(stderr, "===> %f %f <===\n", n_diff, r_diff); for (j = 0; j < 2; ++j) { for (i = 0; i < l[j]; ++i) fputc("ACGTN"[(int)seq[j][i]], stderr); fputc('\n', stderr); }
+	} else {
+		n_diff = abs(l[0] - l[1]) * L_DIFF_COEF;
+		r_diff = 1.;
+		//fprintf(stderr, "---> (%d,%d) <---\n", l[0], l[1]);
+	}
+	if (n_diff < max_n_diff || r_diff < MAX_R_DIFF) {
+		j = avg[0] < avg[1]? 0 : 1;
+		if (aggressive || (avg[j] < max_cov && avg[j] / (avg[j^1] + avg[j]) < max_frac)) {
+			mag_v_del(g, q[j]);
+			ret = 2;
+		}
+	}
+	free(seq[0]); free(seq[1]);
+	return ret;
+}
+
+void mag_g_pop_simple(mag_t *g, float max_cov, float max_frac, int min_merge_len, int max_bdiff, int aggressive)
+{
+	int64_t i, n_examined = 0, n_popped = 0;
+	int ret;
+
+	for (i = 0; i < g->v.n; ++i) {
+		ret = mag_vh_pop_simple(g, i<<1|0, max_cov, max_frac, max_bdiff, aggressive);
+		if (ret >= 1) ++n_examined;
+		if (ret >= 2) ++n_popped;
+		ret = mag_vh_pop_simple(g, i<<1|1, max_cov, max_frac, max_bdiff, aggressive);
+		if (ret >= 1) ++n_examined;
+		if (ret >= 2) ++n_popped;
+	}
+	if (fm_verbose >= 3)
+		fprintf(stderr, "[M::%s] examined %ld bubbles and popped %ld\n", __func__, (long)n_examined, (long)n_popped);
+	mag_g_merge(g, 0, min_merge_len);
+}
+
+/****************
+ * Open bubbles *
+ ****************/
+
+void mag_v_pop_open(mag_t *g, magv_t *p, int min_elen)
+{
+	int i, j, k, l, dir, max_l, l_qry;
+	magv_t *q, *t;
+	ku128_v *r, *s;
+	uint8_t *seq;
+	int8_t mat[16];
+
+	if (p->len < 0 || p->len >= min_elen) return;
+	//if (p->nei[0].n && p->nei[1].n) return; // FIXME: between this and the next line, which is better?
+	if (p->nei[0].n + p->nei[1].n != 1) return;
+	dir = p->nei[0].n? 0 : 1;
+	// initialize the scoring system
+	for (i = k = 0; i < 4; ++i)
+		for (j = 0; j < 4; ++j)
+			mat[k++] = i == j? 5 : -4;
+	
+	s = &p->nei[dir];
+	for (l = 0; l < s->n; ++l) { // if we use "if (p->nei[0].n + p->nei[1].n != 1)", s->n == 1
+		uint64_t v;
+		kswq_t *qry;
+		if ((int64_t)s->a[l].x < 0) continue;
+		v = mag_tid2idd(g->h, s->a[l].x);
+		q = &g->v.a[v>>1];
+		if (q == p || q->nei[v&1].n == 1) continue;
+		// get the query ready
+		max_l = (p->len - s->a[l].y) * 2;
+		seq = malloc(max_l + 1);
+		if (dir == 0) { // forward strand
+			for (j = s->a[l].y, k = 0; j < p->len; ++j)
+				seq[k++] = p->seq[j] - 1;
+		} else { // reverse
+			for (j = p->len - s->a[l].y - 1, k = 0; j >= 0; --j)
+				seq[k++] = 4 - p->seq[j];
+		}
+		l_qry = k;
+		qry = ksw_qinit(2, l_qry, seq, 4, mat);
+		//fprintf(stderr, "===> %lld:%lld:%d[%d], %d, %ld <===\n", p->k[0], p->k[1], s->n, l, p->nsr, q->nei[v&1].n);
+		//for (j = 0; j < k; ++j) fputc("ACGTN"[(int)seq[j]], stderr); fputc('\n', stderr);
+
+		r = &q->nei[v&1];
+		for (i = 0; i < r->n; ++i) {
+			uint64_t w;
+			kswr_t aln;
+			if (r->a[i].x == p->k[dir] || (int64_t)r->a[i].x < 0) continue;
+			w = mag_tid2idd(g->h, r->a[i].x);
+			// get the target sequence
+			t = &g->v.a[w>>1];
+			if (w&1) { // reverse strand
+				for (j = t->len - r->a[i].y - 1, k = 0; j >= 0 && k < max_l; --j)
+					seq[k++] = 4 - t->seq[j];
+			} else {
+				for (j = r->a[i].y, k = 0; j < t->len && k < max_l; ++j)
+					seq[k++] = t->seq[j] - 1;
+			}
+			aln = ksw_align(0, 0, k, seq, 4, mat, 5, 2, 0, &qry);
+			//for (j = 0; j < k; ++j) fputc("ACGTN"[(int)seq[j]], stderr); fprintf(stderr, "\t%d\t%f\n", aln.score, (l_qry * 5. - aln.score) / (5. + 4.));
+			if (aln.score >= l_qry * 5 / 2) {
+				double r_diff, n_diff;
+				n_diff = (l_qry * 5. - aln.score) / (5. + 4.); // 5: matching score; -4: mismatchig score
+				r_diff = n_diff / l_qry;
+				if (n_diff < MAX_N_DIFF || r_diff < MAX_R_DIFF) break;
+			}
+		}
+
+		if (i != r->n) {
+			// mark delete in p and delete in q
+			edge_mark_del(s->a[l]);
+			for (i = 0; i < r->n; ++i)
+				if (r->a[i].x == p->k[dir])
+					edge_mark_del(r->a[i]);
+		}
+		free(seq); free(qry);
+	}
+
+	for (i = 0; i < s->n; ++i)
+		if (!edge_is_del(s->a[i])) break;
+	if (i == s->n) mag_v_del(g, p); // p is not connected to any other vertices
+}
+
+void mag_g_pop_open(mag_t *g, int min_elen)
+{
+	int64_t i;
+	for (i = 0; i < g->v.n; ++i)
+		mag_v_pop_open(g, &g->v.a[i], min_elen);
+	if (fm_verbose >= 3)
+		fprintf(stderr, "[M:%s] popped open bubbles\n", __func__);
+	mag_g_merge(g, 0, 0);
+}
diff --git a/example.c b/example.c
new file mode 100644
index 0000000..5cf65dd
--- /dev/null
+++ b/example.c
@@ -0,0 +1,50 @@
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "fml.h"
+
+int main(int argc, char *argv[])
+{
+	fml_opt_t opt;
+	int c, n_seqs, n_utg, gfa_out = 0;
+	bseq1_t *seqs;
+	fml_utg_t *utg;
+
+	fml_opt_init(&opt);
+	while ((c = getopt(argc, argv, "gOAe:l:r:t:c:d:v:")) >= 0) {
+		if (c == 'e') opt.ec_k = atoi(optarg);
+		else if (c == 'l') opt.min_asm_ovlp = atoi(optarg);
+		else if (c == 'r') opt.mag_opt.min_dratio1 = atof(optarg);
+		else if (c == 'A') opt.mag_opt.flag |= MAG_F_AGGRESSIVE;
+		else if (c == 'O') opt.mag_opt.flag &= ~MAG_F_POPOPEN;
+		else if (c == 'd') opt.mag_opt.max_bdiff = atoi(optarg);
+		else if (c == 't') opt.n_threads = atoi(optarg);
+		else if (c == 'g') gfa_out = 1;
+		else if (c == 'v') fm_verbose = atoi(optarg);
+		else if (c == 'c') {
+			char *p;
+			opt.min_cnt = strtol(optarg, &p, 10);
+			if (*p == ',') opt.max_cnt = strtol(p + 1, &p, 10);
+		}
+	}
+	if (argc == optind) {
+		fprintf(stderr, "Usage: fml-asm [options] <in.fq>\n");
+		fprintf(stderr, "Options:\n");
+		fprintf(stderr, "  -e INT          k-mer length for error correction (0 for auto; -1 to disable) [%d]\n", opt.ec_k);
+		fprintf(stderr, "  -c INT1[,INT2]  range of k-mer & read count thresholds for ec and graph cleaning [%d,%d]\n", opt.min_cnt, opt.max_cnt);
+		fprintf(stderr, "  -l INT          min overlap length during initial assembly [%d]\n", opt.min_asm_ovlp);
+		fprintf(stderr, "  -r FLOAT        drop an overlap if its length is below maxOvlpLen*FLOAT [%g]\n", opt.mag_opt.min_dratio1);
+		fprintf(stderr, "  -t INT          number of threads (don't use multi-threading for small data sets) [%d]\n", opt.n_threads);
+		fprintf(stderr, "  -d INT          retain a bubble if one side is longer than the other side by >INT-bp [%d]\n", opt.mag_opt.max_bdiff);
+		fprintf(stderr, "  -A              discard heterozygotes (apply this to assemble bacterial genomes; override -O)\n");
+		fprintf(stderr, "  -O              don't apply aggressive tip trimming\n");
+		fprintf(stderr, "  -g              output the assembly graph in the GFA format\n");
+		return 1;
+	}
+	seqs = bseq_read(argv[optind], &n_seqs);
+	utg = fml_assemble(&opt, n_seqs, seqs, &n_utg);
+	if (!gfa_out) fml_utg_print(n_utg, utg);
+	else fml_utg_print_gfa(n_utg, utg);
+	fml_utg_destroy(n_utg, utg);
+	return 0;
+}
diff --git a/fml.h b/fml.h
new file mode 100644
index 0000000..fd7d466
--- /dev/null
+++ b/fml.h
@@ -0,0 +1,193 @@
+#ifndef FML_H
+#define FML_H
+
+#define FML_VERSION "r55"
+
+#include <stdint.h>
+
+typedef struct {
+	int32_t l_seq;
+	char *seq, *qual; // NULL-terminated strings; length expected to match $l_seq
+} bseq1_t;
+
+#define MAG_F_AGGRESSIVE 0x20 // pop variant bubbles (not default)
+#define MAG_F_POPOPEN    0x40 // aggressive tip trimming (default)
+#define MAG_F_NO_SIMPL   0x80 // skip bubble simplification (default)
+
+typedef struct {
+	int flag, min_ovlp, min_elen, min_ensr, min_insr, max_bdist, max_bdiff, max_bvtx, min_merge_len, trim_len, trim_depth;
+	float min_dratio1, max_bcov, max_bfrac;
+} magopt_t;
+
+typedef struct {
+	int n_threads;        // number of threads; don't use multi-threading for small data sets
+	int ec_k;             // k-mer length for error correction; 0 for auto estimate
+	int min_cnt, max_cnt; // both occ threshold in ec and tip threshold in cleaning lie in [min_cnt,max_cnt]
+	int min_asm_ovlp;     // min overlap length during assembly
+	int min_merge_len;    // during assembly, don't explicitly merge an overlap if shorter than this value
+	magopt_t mag_opt;     // graph cleaning options
+} fml_opt_t;
+
+struct rld_t;
+struct mag_t;
+
+typedef struct {
+	uint32_t len:31, from:1; // $from and $to: 0 meaning overlapping 5'-end; 1 overlapping 3'-end
+	uint32_t id:31, to:1;    // $id: unitig number
+} fml_ovlp_t;
+
+typedef struct {
+	int32_t len;      // length of sequence
+	int32_t nsr;      // number of supporting reads
+	char *seq;        // unitig sequence
+	char *cov;        // cov[i]-33 gives per-base coverage at i
+	int n_ovlp[2];    // number of 5'-end [0] and 3'-end [1] overlaps
+	fml_ovlp_t *ovlp; // overlaps, of size n_ovlp[0]+n_ovlp[1]
+} fml_utg_t;
+
+extern int fm_verbose;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/************************
+ * High-level functions *
+ ************************/
+
+/**
+ * Read all sequences from a FASTA/FASTQ file
+ *
+ * @param fn       filename; NULL or "-" for stdin
+ * @param n        (out) number of sequences read into RAM
+ *
+ * @return array of sequences
+ */
+bseq1_t *bseq_read(const char *fn, int *n);
+
+/**
+ * Initialize default parameters
+ *
+ * @param opt      (out) pointer to parameters
+ */
+void fml_opt_init(fml_opt_t *opt);
+
+/**
+ * Assemble a list of sequences
+ *
+ * @param opt      parameters
+ * @param n_seqs   number of input sequences
+ * @param seqs     sequences to assemble; FREED on return
+ * @param n_utg    (out) number of unitigs in return
+ *
+ * @return array of unitigs
+ */
+fml_utg_t *fml_assemble(const fml_opt_t *opt, int n_seqs, bseq1_t *seqs, int *n_utg);
+
+/**
+ * Free unitigs
+ *
+ * @param n_utg    number of unitigs
+ * @param utg      array of unitigs
+ */
+void fml_utg_destroy(int n_utg, fml_utg_t *utg);
+
+/************************************************
+ * Mid-level functions called by fml_assemble() *
+ ************************************************/
+
+/**
+ * Adjust parameters based on input sequences
+ *
+ * @param opt       parameters to update IN PLACE
+ * @param n_seqs    number of sequences
+ * @param seqs      array of sequences
+ */
+void fml_opt_adjust(fml_opt_t *opt, int n_seqs, const bseq1_t *seqs);
+
+/**
+ * Error correction
+ *
+ * @param opt       parameters
+ * @param n         number of sequences
+ * @param seq       array of sequences; corrected IN PLACE
+ *
+ * @return k-mer coverage
+ */
+float fml_correct(const fml_opt_t *opt, int n, bseq1_t *seq);
+float fml_fltuniq(const fml_opt_t *opt, int n, bseq1_t *seq);
+
+/**
+ * Construct FMD-index
+ *
+ * @param opt       parameters
+ * @param n         number of sequences
+ * @param seq       array of sequences; FREED on return
+ *
+ * @return FMD-index on success; NULL if all input sequences are zero in length
+ */
+struct rld_t *fml_seq2fmi(const fml_opt_t *opt, int n, bseq1_t *seq);
+
+/**
+ * Generate initial overlap graph
+ *
+ * @param opt       parameters
+ * @param e         FMD-index; FREED on return
+ *
+ * @return overlap graph in the "mag" structure
+ */
+struct mag_t *fml_fmi2mag(const fml_opt_t *opt, struct rld_t *e);
+
+/**
+ * Clean a mag graph
+ *
+ * @param opt       parameters
+ * @param g         overlap graph; modified IN PLACE
+ */
+void fml_mag_clean(const fml_opt_t *opt, struct mag_t *g);
+
+/**
+ * Convert a graph in mag to fml_utg_t
+ *
+ * @param g         graph in the "mag" structure; FREED on return
+ * @param n_utg     (out) number of unitigs
+ *
+ * @return array of unitigs
+ */
+fml_utg_t *fml_mag2utg(struct mag_t *g, int *n_utg);
+
+/**
+ * Output unitig graph in the mag format
+ *
+ * @param n_utg     number of unitigs
+ * @param utg       array of unitigs
+ */
+void fml_utg_print(int n_utgs, const fml_utg_t *utg);
+
+/**
+ * Output unitig graph in the GFA format
+ *
+ * @param n_utg     number of unitigs
+ * @param utg       array of unitigs
+ */
+void fml_utg_print_gfa(int n, const fml_utg_t *utg);
+
+/**
+ * Deallocate an FM-index
+ *
+ * @param e         pointer to the FM-index
+ */
+void fml_fmi_destroy(struct rld_t *e);
+
+/**
+ * Deallocate a mag graph
+ *
+ * @param g         pointer to the mag graph
+ */
+void fml_mag_destroy(struct mag_t *g);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/htab.c b/htab.c
new file mode 100644
index 0000000..13b4150
--- /dev/null
+++ b/htab.c
@@ -0,0 +1,132 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include "htab.h"
+#include "khash.h"
+
+#define _cnt_eq(a, b) ((a)>>14 == (b)>>14)
+#define _cnt_hash(a) ((a)>>14)
+KHASH_INIT(cnt, uint64_t, char, 0, _cnt_hash, _cnt_eq)
+typedef khash_t(cnt) cnthash_t;
+
+struct bfc_ch_s {
+	int k;
+	cnthash_t **h;
+	// private
+	int l_pre;
+};
+
+bfc_ch_t *bfc_ch_init(int k, int l_pre)
+{
+	bfc_ch_t *ch;
+	int i;
+	assert(k <= 63);
+	if (k * 2 - l_pre > BFC_CH_KEYBITS)
+		l_pre = k * 2 - BFC_CH_KEYBITS;
+	if (l_pre > BFC_CH_MAXPRE) l_pre = BFC_CH_MAXPRE;
+	assert(k - l_pre < BFC_CH_KEYBITS);
+	ch = calloc(1, sizeof(bfc_ch_t));
+	ch->k = k, ch->l_pre = l_pre;
+	ch->h = calloc(1<<ch->l_pre, sizeof(void*));
+	for (i = 0; i < 1<<ch->l_pre; ++i)
+		ch->h[i] = kh_init(cnt);
+	return ch;
+}
+
+void bfc_ch_destroy(bfc_ch_t *ch)
+{
+	int i;
+	if (ch == 0) return;
+	for (i = 0; i < 1<<ch->l_pre; ++i)
+		kh_destroy(cnt, ch->h[i]);
+	free(ch->h); free(ch);
+}
+
+static inline cnthash_t *get_subhash(const bfc_ch_t *ch, const uint64_t x[2], uint64_t *key)
+{
+	if (ch->k <= 32) {
+		int t = ch->k * 2 - ch->l_pre;
+		uint64_t z = x[0] << ch->k | x[1];
+		*key = (z & ((1ULL<<t) - 1)) << 14 | 1;
+		return ch->h[z>>t];
+	} else {
+		int t = ch->k - ch->l_pre;
+		int shift = t + ch->k < BFC_CH_KEYBITS? ch->k : BFC_CH_KEYBITS - t;
+		*key = ((x[0] & ((1ULL<<t) - 1)) << shift ^ x[1]) << 14 | 1;
+		return ch->h[x[0]>>t];
+	}
+}
+
+int bfc_ch_insert(bfc_ch_t *ch, const uint64_t x[2], int is_high, int forced)
+{
+	int absent;
+	uint64_t key;
+	cnthash_t *h;
+	khint_t k;
+	h = get_subhash(ch, x, &key);
+	if (__sync_lock_test_and_set(&h->lock, 1)) {
+		if (forced) // then wait until the hash table is unlocked by the thread using it
+			while (__sync_lock_test_and_set(&h->lock, 1))
+				while (h->lock); // lock
+		else return -1;
+	}
+	k = kh_put(cnt, h, key, &absent);
+	if (absent) {
+		if (is_high) kh_key(h, k) |= 1<<8;
+	} else {
+		if ((kh_key(h, k) & 0xff) != 0xff) ++kh_key(h, k);
+		if (is_high && (kh_key(h, k) >> 8 & 0x3f) != 0x3f) kh_key(h, k) += 1<<8;
+	}
+	__sync_lock_release(&h->lock); // unlock
+	return 0;
+}
+
+int bfc_ch_get(const bfc_ch_t *ch, const uint64_t x[2])
+{
+	uint64_t key;
+	cnthash_t *h;
+	khint_t itr;
+	h = get_subhash(ch, x, &key);
+	itr = kh_get(cnt, h, key);
+	return itr == kh_end(h)? -1 : kh_key(h, itr) & 0x3fff;
+}
+
+int bfc_ch_kmer_occ(const bfc_ch_t *ch, const bfc_kmer_t *z)
+{
+	uint64_t x[2];
+	bfc_kmer_hash(ch->k, z->x, x);
+	return bfc_ch_get(ch, x);
+}
+
+uint64_t bfc_ch_count(const bfc_ch_t *ch)
+{
+	int i;
+	uint64_t cnt = 0;
+	for (i = 0; i < 1<<ch->l_pre; ++i)
+		cnt += kh_size(ch->h[i]);
+	return cnt;
+}
+
+int bfc_ch_hist(const bfc_ch_t *ch, uint64_t cnt[256], uint64_t high[64])
+{
+	int i, max_i = -1;
+	uint64_t max;
+	memset(cnt, 0, 256 * 8);
+	memset(high, 0, 64 * 8);
+	for (i = 0; i < 1<<ch->l_pre; ++i) {
+		khint_t k;
+		cnthash_t *h = ch->h[i];
+		for (k = 0; k != kh_end(h); ++k)
+			if (kh_exist(h, k))
+				++cnt[kh_key(h, k) & 0xff], ++high[kh_key(h, k)>>8 & 0x3f];
+	}
+	for (i = 3, max = 0; i < 256; ++i)
+		if (cnt[i] > max)
+			max = cnt[i], max_i = i;
+	return max_i;
+}
+
+int bfc_ch_get_k(const bfc_ch_t *ch)
+{
+	return ch->k;
+}
diff --git a/htab.h b/htab.h
new file mode 100644
index 0000000..118244c
--- /dev/null
+++ b/htab.h
@@ -0,0 +1,23 @@
+#ifndef BFC_HTAB_H
+#define BFC_HTAB_H
+
+#include <stdint.h>
+#include "kmer.h"
+
+#define BFC_CH_KEYBITS 50
+#define BFC_CH_MAXPRE  20
+
+struct bfc_ch_s;
+typedef struct bfc_ch_s bfc_ch_t;
+
+bfc_ch_t *bfc_ch_init(int k, int l_pre);
+void bfc_ch_destroy(bfc_ch_t *ch);
+int bfc_ch_insert(bfc_ch_t *ch, const uint64_t x[2], int is_high, int forced);
+int bfc_ch_get(const bfc_ch_t *ch, const uint64_t x[2]);
+uint64_t bfc_ch_count(const bfc_ch_t *ch);
+int bfc_ch_hist(const bfc_ch_t *ch, uint64_t cnt[256], uint64_t high[64]);
+int bfc_ch_get_k(const bfc_ch_t *ch);
+
+int bfc_ch_kmer_occ(const bfc_ch_t *ch, const bfc_kmer_t *z);
+
+#endif
diff --git a/internal.h b/internal.h
new file mode 100644
index 0000000..37e4d96
--- /dev/null
+++ b/internal.h
@@ -0,0 +1,21 @@
+#ifndef FML_INTERNAL_H
+#define FML_INTERNAL_H
+
+#include "fml.h"
+
+extern unsigned char seq_nt6_table[256];
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n);
+void seq_reverse(int l, unsigned char *s);
+void seq_revcomp6(int l, unsigned char *s);
+struct bfc_ch_s *fml_count(int n, const bseq1_t *seq, int k, int q, int l_pre, int n_threads);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/khash.h b/khash.h
new file mode 100644
index 0000000..870ae5d
--- /dev/null
+++ b/khash.h
@@ -0,0 +1,614 @@
+/* The MIT License
+
+   Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor@live.co.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/*
+  An example:
+
+#include "khash.h"
+KHASH_MAP_INIT_INT(32, char)
+int main() {
+	int ret, is_missing;
+	khiter_t k;
+	khash_t(32) *h = kh_init(32);
+	k = kh_put(32, h, 5, &ret);
+	kh_value(h, k) = 10;
+	k = kh_get(32, h, 10);
+	is_missing = (k == kh_end(h));
+	k = kh_get(32, h, 5);
+	kh_del(32, h, k);
+	for (k = kh_begin(h); k != kh_end(h); ++k)
+		if (kh_exist(h, k)) kh_value(h, k) = 1;
+	kh_destroy(32, h);
+	return 0;
+}
+*/
+
+/*
+  2013-05-02 (0.2.8):
+
+	* Use quadratic probing. When the capacity is power of 2, stepping function
+	  i*(i+1)/2 guarantees to traverse each bucket. It is better than double
+	  hashing on cache performance and is more robust than linear probing.
+
+	  In theory, double hashing should be more robust than quadratic probing.
+	  However, my implementation is probably not for large hash tables, because
+	  the second hash function is closely tied to the first hash function,
+	  which reduce the effectiveness of double hashing.
+
+	Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php
+
+  2011-12-29 (0.2.7):
+
+    * Minor code clean up; no actual effect.
+
+  2011-09-16 (0.2.6):
+
+	* The capacity is a power of 2. This seems to dramatically improve the
+	  speed for simple keys. Thank Zilong Tan for the suggestion. Reference:
+
+	   - http://code.google.com/p/ulib/
+	   - http://nothings.org/computer/judy/
+
+	* Allow to optionally use linear probing which usually has better
+	  performance for random input. Double hashing is still the default as it
+	  is more robust to certain non-random input.
+
+	* Added Wang's integer hash function (not used by default). This hash
+	  function is more robust to certain non-random input.
+
+  2011-02-14 (0.2.5):
+
+    * Allow to declare global functions.
+
+  2009-09-26 (0.2.4):
+
+    * Improve portability
+
+  2008-09-19 (0.2.3):
+
+	* Corrected the example
+	* Improved interfaces
+
+  2008-09-11 (0.2.2):
+
+	* Improved speed a little in kh_put()
+
+  2008-09-10 (0.2.1):
+
+	* Added kh_clear()
+	* Fixed a compiling error
+
+  2008-09-02 (0.2.0):
+
+	* Changed to token concatenation which increases flexibility.
+
+  2008-08-31 (0.1.2):
+
+	* Fixed a bug in kh_get(), which has not been tested previously.
+
+  2008-08-31 (0.1.1):
+
+	* Added destructor
+*/
+
+
+#ifndef __AC_KHASH_H
+#define __AC_KHASH_H
+
+/*!
+  @header
+
+  Generic hash table library.
+ */
+
+#define AC_VERSION_KHASH_H "0.2.8"
+
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+
+/* compiler specific configuration */
+
+#if UINT_MAX == 0xffffffffu
+typedef unsigned int khint32_t;
+#elif ULONG_MAX == 0xffffffffu
+typedef unsigned long khint32_t;
+#endif
+
+#if ULONG_MAX == ULLONG_MAX
+typedef unsigned long khint64_t;
+#else
+typedef unsigned long long khint64_t;
+#endif
+
+#ifdef _MSC_VER
+#define kh_inline __inline
+#else
+#define kh_inline inline
+#endif
+
+typedef khint32_t khint_t;
+typedef khint_t khiter_t;
+
+#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)
+#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1)
+#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3)
+#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1)))
+#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1)))
+#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1)))
+#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1))
+
+#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4)
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+#ifndef kcalloc
+#define kcalloc(N,Z) calloc(N,Z)
+#endif
+#ifndef kmalloc
+#define kmalloc(Z) malloc(Z)
+#endif
+#ifndef krealloc
+#define krealloc(P,Z) realloc(P,Z)
+#endif
+#ifndef kfree
+#define kfree(P) free(P)
+#endif
+
+#define __KHASH_TYPE(name, khkey_t, khval_t) \
+	typedef struct kh_##name##_s { \
+		khint_t n_buckets, size, n_occupied; \
+		volatile int lock; \
+		khint32_t *flags; \
+		khkey_t *keys; \
+		khval_t *vals; \
+	} kh_##name##_t;
+
+#define __KHASH_PROTOTYPES(name, khkey_t, khval_t)	 					\
+	extern kh_##name##_t *kh_init_##name(void);							\
+	extern void kh_destroy_##name(kh_##name##_t *h);					\
+	extern void kh_clear_##name(kh_##name##_t *h);						\
+	extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); 	\
+	extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \
+	extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
+	extern void kh_del_##name(kh_##name##_t *h, khint_t x);
+
+#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+	SCOPE kh_##name##_t *kh_init_##name(void) {							\
+		return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t));		\
+	}																	\
+	SCOPE void kh_destroy_##name(kh_##name##_t *h)						\
+	{																	\
+		if (h) {														\
+			kfree((void *)h->keys); kfree(h->flags);					\
+			kfree((void *)h->vals);										\
+			kfree(h);													\
+		}																\
+	}																	\
+	SCOPE void kh_clear_##name(kh_##name##_t *h)						\
+	{																	\
+		if (h && h->flags) {											\
+			memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \
+			h->size = h->n_occupied = 0;								\
+		}																\
+	}																	\
+	SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) 	\
+	{																	\
+		if (h->n_buckets) {												\
+			khint_t k, i, last, mask, step = 0; \
+			mask = h->n_buckets - 1;									\
+			k = __hash_func(key); i = k & mask;							\
+			last = i; \
+			while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+				i = (i + (++step)) & mask; \
+				if (i == last) return h->n_buckets;						\
+			}															\
+			return __ac_iseither(h->flags, i)? h->n_buckets : i;		\
+		} else return 0;												\
+	}																	\
+	SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
+	{ /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \
+		khint32_t *new_flags = 0;										\
+		khint_t j = 1;													\
+		{																\
+			kroundup32(new_n_buckets); 									\
+			if (new_n_buckets < 4) new_n_buckets = 4;					\
+			if (h->size >= (new_n_buckets>>1) + (new_n_buckets>>2)) j = 0;	/* requested size is too small */ \
+			else { /* hash table size to be changed (shrink or expand); rehash */ \
+				new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t));	\
+				if (!new_flags) return -1;								\
+				memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
+				if (h->n_buckets < new_n_buckets) {	/* expand */		\
+					khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
+					if (!new_keys) return -1;							\
+					h->keys = new_keys;									\
+					if (kh_is_map) {									\
+						khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
+						if (!new_vals) return -1;						\
+						h->vals = new_vals;								\
+					}													\
+				} /* otherwise shrink */								\
+			}															\
+		}																\
+		if (j) { /* rehashing is needed */								\
+			for (j = 0; j != h->n_buckets; ++j) {						\
+				if (__ac_iseither(h->flags, j) == 0) {					\
+					khkey_t key = h->keys[j];							\
+					khval_t val;										\
+					khint_t new_mask;									\
+					new_mask = new_n_buckets - 1; 						\
+					if (kh_is_map) val = h->vals[j];					\
+					__ac_set_isdel_true(h->flags, j);					\
+					while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
+						khint_t k, i, step = 0; \
+						k = __hash_func(key);							\
+						i = k & new_mask;								\
+						while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \
+						__ac_set_isempty_false(new_flags, i);			\
+						if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
+							{ khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
+							if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
+							__ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \
+						} else { /* write the element and jump out of the loop */ \
+							h->keys[i] = key;							\
+							if (kh_is_map) h->vals[i] = val;			\
+							break;										\
+						}												\
+					}													\
+				}														\
+			}															\
+			if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
+				h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
+				if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
+			}															\
+			kfree(h->flags); /* free the working space */				\
+			h->flags = new_flags;										\
+			h->n_buckets = new_n_buckets;								\
+			h->n_occupied = h->size;									\
+		}																\
+		return 0;														\
+	}																	\
+	SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
+	{																	\
+		khint_t x;														\
+		if (h->n_occupied >= (h->n_buckets>>2) + (h->n_buckets>>1)) { /* update the hash table */ \
+			if (h->n_buckets > (h->size<<1)) {							\
+				if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \
+					*ret = -1; return h->n_buckets;						\
+				}														\
+			} else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \
+				*ret = -1; return h->n_buckets;							\
+			}															\
+		} /* TODO: to implement automatically shrinking; resize() already support shrinking */ \
+		{																\
+			khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \
+			x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
+			if (__ac_isempty(h->flags, i)) x = i; /* for speed up */	\
+			else {														\
+				last = i; \
+				while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+					if (__ac_isdel(h->flags, i)) site = i;				\
+					i = (i + (++step)) & mask; \
+					if (i == last) { x = site; break; }					\
+				}														\
+				if (x == h->n_buckets) {								\
+					if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
+					else x = i;											\
+				}														\
+			}															\
+		}																\
+		if (__ac_isempty(h->flags, x)) { /* not present at all */		\
+			h->keys[x] = key;											\
+			__ac_set_isboth_false(h->flags, x);							\
+			++h->size; ++h->n_occupied;									\
+			*ret = 1;													\
+		} else if (__ac_isdel(h->flags, x)) { /* deleted */				\
+			h->keys[x] = key;											\
+			__ac_set_isboth_false(h->flags, x);							\
+			++h->size;													\
+			*ret = 2;													\
+		} else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
+		return x;														\
+	}																	\
+	SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x)				\
+	{																	\
+		if (x != h->n_buckets && !__ac_iseither(h->flags, x)) {			\
+			__ac_set_isdel_true(h->flags, x);							\
+			--h->size;													\
+		}																\
+	}
+
+#define KHASH_DECLARE(name, khkey_t, khval_t)		 					\
+	__KHASH_TYPE(name, khkey_t, khval_t) 								\
+	__KHASH_PROTOTYPES(name, khkey_t, khval_t)
+
+#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+	__KHASH_TYPE(name, khkey_t, khval_t) 								\
+	__KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
+
+#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+	KHASH_INIT2(name, static kh_inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
+
+/* --- BEGIN OF HASH FUNCTIONS --- */
+
+/*! @function
+  @abstract     Integer hash function
+  @param  key   The integer [khint32_t]
+  @return       The hash value [khint_t]
+ */
+#define kh_int_hash_func(key) (khint32_t)(key)
+/*! @function
+  @abstract     Integer comparison function
+ */
+#define kh_int_hash_equal(a, b) ((a) == (b))
+/*! @function
+  @abstract     64-bit integer hash function
+  @param  key   The integer [khint64_t]
+  @return       The hash value [khint_t]
+ */
+#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11)
+/*! @function
+  @abstract     64-bit integer comparison function
+ */
+#define kh_int64_hash_equal(a, b) ((a) == (b))
+/*! @function
+  @abstract     const char* hash function
+  @param  s     Pointer to a null terminated string
+  @return       The hash value
+ */
+static kh_inline khint_t __ac_X31_hash_string(const char *s)
+{
+	khint_t h = (khint_t)*s;
+	if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s;
+	return h;
+}
+/*! @function
+  @abstract     Another interface to const char* hash function
+  @param  key   Pointer to a null terminated string [const char*]
+  @return       The hash value [khint_t]
+ */
+#define kh_str_hash_func(key) __ac_X31_hash_string(key)
+/*! @function
+  @abstract     Const char* comparison function
+ */
+#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
+
+static kh_inline khint_t __ac_Wang_hash(khint_t key)
+{
+    key += ~(key << 15);
+    key ^=  (key >> 10);
+    key +=  (key << 3);
+    key ^=  (key >> 6);
+    key += ~(key << 11);
+    key ^=  (key >> 16);
+    return key;
+}
+#define kh_int_hash_func2(k) __ac_Wang_hash((khint_t)key)
+
+/* --- END OF HASH FUNCTIONS --- */
+
+/* Other convenient macros... */
+
+/*!
+  @abstract Type of the hash table.
+  @param  name  Name of the hash table [symbol]
+ */
+#define khash_t(name) kh_##name##_t
+
+/*! @function
+  @abstract     Initiate a hash table.
+  @param  name  Name of the hash table [symbol]
+  @return       Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_init(name) kh_init_##name()
+
+/*! @function
+  @abstract     Destroy a hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_destroy(name, h) kh_destroy_##name(h)
+
+/*! @function
+  @abstract     Reset a hash table without deallocating memory.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_clear(name, h) kh_clear_##name(h)
+
+/*! @function
+  @abstract     Resize a hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  s     New size [khint_t]
+ */
+#define kh_resize(name, h, s) kh_resize_##name(h, s)
+
+/*! @function
+  @abstract     Insert a key to the hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  k     Key [type of keys]
+  @param  r     Extra return code: 0 if the key is present in the hash table;
+                1 if the bucket is empty (never used); 2 if the element in
+				the bucket has been deleted [int*]
+  @return       Iterator to the inserted element [khint_t]
+ */
+#define kh_put(name, h, k, r) kh_put_##name(h, k, r)
+
+/*! @function
+  @abstract     Retrieve a key from the hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  k     Key [type of keys]
+  @return       Iterator to the found element, or kh_end(h) if the element is absent [khint_t]
+ */
+#define kh_get(name, h, k) kh_get_##name(h, k)
+
+/*! @function
+  @abstract     Remove a key from the hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  k     Iterator to the element to be deleted [khint_t]
+ */
+#define kh_del(name, h, k) kh_del_##name(h, k)
+
+/*! @function
+  @abstract     Test whether a bucket contains data.
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  x     Iterator to the bucket [khint_t]
+  @return       1 if containing data; 0 otherwise [int]
+ */
+#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))
+
+/*! @function
+  @abstract     Get key given an iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  x     Iterator to the bucket [khint_t]
+  @return       Key [type of keys]
+ */
+#define kh_key(h, x) ((h)->keys[x])
+
+/*! @function
+  @abstract     Get value given an iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  x     Iterator to the bucket [khint_t]
+  @return       Value [type of values]
+  @discussion   For hash sets, calling this results in segfault.
+ */
+#define kh_val(h, x) ((h)->vals[x])
+
+/*! @function
+  @abstract     Alias of kh_val()
+ */
+#define kh_value(h, x) ((h)->vals[x])
+
+/*! @function
+  @abstract     Get the start iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       The start iterator [khint_t]
+ */
+#define kh_begin(h) (khint_t)(0)
+
+/*! @function
+  @abstract     Get the end iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       The end iterator [khint_t]
+ */
+#define kh_end(h) ((h)->n_buckets)
+
+/*! @function
+  @abstract     Get the number of elements in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       Number of elements in the hash table [khint_t]
+ */
+#define kh_size(h) ((h)->size)
+
+/*! @function
+  @abstract     Get the number of buckets in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       Number of buckets in the hash table [khint_t]
+ */
+#define kh_n_buckets(h) ((h)->n_buckets)
+
+/*! @function
+  @abstract     Iterate over the entries in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  kvar  Variable to which key will be assigned
+  @param  vvar  Variable to which value will be assigned
+  @param  code  Block of code to execute
+ */
+#define kh_foreach(h, kvar, vvar, code) { khint_t __i;		\
+	for (__i = kh_begin(h); __i != kh_end(h); ++__i) {		\
+		if (!kh_exist(h,__i)) continue;						\
+		(kvar) = kh_key(h,__i);								\
+		(vvar) = kh_val(h,__i);								\
+		code;												\
+	} }
+
+/*! @function
+  @abstract     Iterate over the values in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  vvar  Variable to which value will be assigned
+  @param  code  Block of code to execute
+ */
+#define kh_foreach_value(h, vvar, code) { khint_t __i;		\
+	for (__i = kh_begin(h); __i != kh_end(h); ++__i) {		\
+		if (!kh_exist(h,__i)) continue;						\
+		(vvar) = kh_val(h,__i);								\
+		code;												\
+	} }
+
+/* More conenient interfaces */
+
+/*! @function
+  @abstract     Instantiate a hash set containing integer keys
+  @param  name  Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_INT(name)										\
+	KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash map containing integer keys
+  @param  name  Name of the hash table [symbol]
+  @param  khval_t  Type of values [type]
+ */
+#define KHASH_MAP_INIT_INT(name, khval_t)								\
+	KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash map containing 64-bit integer keys
+  @param  name  Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_INT64(name)										\
+	KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash map containing 64-bit integer keys
+  @param  name  Name of the hash table [symbol]
+  @param  khval_t  Type of values [type]
+ */
+#define KHASH_MAP_INIT_INT64(name, khval_t)								\
+	KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
+
+typedef const char *kh_cstr_t;
+/*! @function
+  @abstract     Instantiate a hash map containing const char* keys
+  @param  name  Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_STR(name)										\
+	KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash map containing const char* keys
+  @param  name  Name of the hash table [symbol]
+  @param  khval_t  Type of values [type]
+ */
+#define KHASH_MAP_INIT_STR(name, khval_t)								\
+	KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
+
+#endif /* __AC_KHASH_H */
diff --git a/kmer.h b/kmer.h
new file mode 100644
index 0000000..b53b7d2
--- /dev/null
+++ b/kmer.h
@@ -0,0 +1,106 @@
+#ifndef BFC_KMER_H
+#define BFC_KMER_H
+
+#include <stdint.h>
+
+typedef struct {
+	uint64_t x[4];
+} bfc_kmer_t;
+
+static inline void bfc_kmer_append(int k, uint64_t x[4], int c)
+{ // IMPORTANT: 0 <= c < 4
+	uint64_t mask = (1ULL<<k) - 1;
+	x[0] = (x[0]<<1 | (c&1))  & mask;
+	x[1] = (x[1]<<1 | (c>>1)) & mask;
+	x[2] = x[2]>>1 | (1ULL^(c&1))<<(k-1);
+	x[3] = x[3]>>1 | (1ULL^c>>1) <<(k-1);
+}
+
+static inline void bfc_kmer_change(int k, uint64_t x[4], int d, int c) // d-bp from the 3'-end of k-mer; 0<=d<k
+{ // IMPORTANT: 0 <= c < 4
+	uint64_t t = ~(1ULL<<d);
+	x[0] = (uint64_t) (c&1)<<d | (x[0]&t);
+	x[1] = (uint64_t)(c>>1)<<d | (x[1]&t);
+	t = ~(1ULL<<(k-1-d));
+	x[2] = (uint64_t)(1^(c&1))<<(k-1-d) | (x[2]&t);
+	x[3] = (uint64_t)(1^ c>>1)<<(k-1-d) | (x[3]&t);
+}
+
+// Thomas Wang's integer hash functions. See <https://gist.github.com/lh3/59882d6b96166dfc3d8d> for a snapshot.
+static inline uint64_t bfc_hash_64(uint64_t key, uint64_t mask)
+{
+	key = (~key + (key << 21)) & mask; // key = (key << 21) - key - 1;
+	key = key ^ key >> 24;
+	key = ((key + (key << 3)) + (key << 8)) & mask; // key * 265
+	key = key ^ key >> 14;
+	key = ((key + (key << 2)) + (key << 4)) & mask; // key * 21
+	key = key ^ key >> 28;
+	key = (key + (key << 31)) & mask;
+	return key;
+}
+
+static inline uint64_t bfc_hash_64_inv(uint64_t key, uint64_t mask)
+{
+	uint64_t tmp;
+ 
+	// Invert key = key + (key << 31)
+	tmp = (key - (key << 31));
+	key = (key - (tmp << 31)) & mask;
+ 
+	// Invert key = key ^ (key >> 28)
+	tmp = key ^ key >> 28;
+	key = key ^ tmp >> 28;
+ 
+	// Invert key *= 21
+	key = (key * 14933078535860113213ull) & mask;
+ 
+	// Invert key = key ^ (key >> 14)
+	tmp = key ^ key >> 14;
+	tmp = key ^ tmp >> 14;
+	tmp = key ^ tmp >> 14;
+	key = key ^ tmp >> 14;
+ 
+	// Invert key *= 265
+	key = (key * 15244667743933553977ull) & mask;
+ 
+	// Invert key = key ^ (key >> 24)
+	tmp = key ^ key >> 24;
+	key = key ^ tmp >> 24;
+ 
+	// Invert key = (~key) + (key << 21)
+	tmp = ~key;
+	tmp = ~(key - (tmp << 21));
+	tmp = ~(key - (tmp << 21));
+	key = ~(key - (tmp << 21)) & mask;
+ 
+	return key;
+}
+
+static inline uint64_t bfc_kmer_hash(int k, const uint64_t x[4], uint64_t h[2])
+{
+	int t = k>>1, u = ((x[1]>>t&1) > (x[3]>>t&1)); // the middle base is always different
+	uint64_t mask = (1ULL<<k) - 1, ret;
+	h[0] = bfc_hash_64((x[u<<1|0] + x[u<<1|1]) & mask, mask);
+	h[1] = bfc_hash_64(h[0] ^ x[u<<1|1], mask);
+	ret = (h[0] ^ h[1]) << k | ((h[0] + h[1]) & mask);
+	h[0] = (h[0] + h[1]) & mask;
+	return ret;
+}
+
+static inline void bfc_kmer_hash_inv(int k, const uint64_t h[2], uint64_t y[2])
+{
+	uint64_t mask = (1ULL<<k) - 1, t = (h[0] - h[1]) & mask;
+	y[1] = bfc_hash_64_inv(h[1], mask) ^ t;
+	y[0] = (bfc_hash_64_inv(t, mask) - y[1]) & mask;
+}
+
+static inline char *bfc_kmer_2str(int k, const uint64_t y[2], char *buf)
+{
+	int l;
+	for (l = 0; l < k; ++l)
+		buf[k - 1 - l] = "ACGT"[(y[1]>>l&1)<<1 | (y[0]>>l&1)];
+	buf[k] = 0;
+	return buf;
+}
+
+#endif
diff --git a/kseq.h b/kseq.h
new file mode 100644
index 0000000..84c1fa3
--- /dev/null
+++ b/kseq.h
@@ -0,0 +1,248 @@
+/* The MIT License
+
+   Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor@live.co.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/* Last Modified: 05MAR2012 */
+
+#ifndef AC_KSEQ_H
+#define AC_KSEQ_H
+
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+
+#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
+#define KS_SEP_TAB   1 // isspace() && !' '
+#define KS_SEP_LINE  2 // line separator: "\n" (Unix) or "\r\n" (Windows)
+#define KS_SEP_MAX   2
+
+#define __KS_TYPE(type_t) \
+	typedef struct __kstream_t { \
+		int begin, end; \
+		int is_eof:2, bufsize:30; \
+		type_t f; \
+		unsigned char *buf; \
+	} kstream_t;
+
+#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
+#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
+
+#define __KS_BASIC(SCOPE, type_t, __bufsize) \
+	SCOPE kstream_t *ks_init(type_t f) \
+	{ \
+		kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
+		ks->f = f; ks->bufsize = __bufsize; \
+		ks->buf = (unsigned char*)malloc(__bufsize); \
+		return ks; \
+	} \
+	SCOPE void ks_destroy(kstream_t *ks) \
+	{ \
+		if (!ks) return; \
+		free(ks->buf); \
+		free(ks); \
+	}
+
+#define __KS_INLINED(__read) \
+	static inline int ks_getc(kstream_t *ks) \
+	{ \
+		if (ks->is_eof && ks->begin >= ks->end) return -1; \
+		if (ks->begin >= ks->end) { \
+			ks->begin = 0; \
+			ks->end = __read(ks->f, ks->buf, ks->bufsize); \
+			if (ks->end < ks->bufsize) ks->is_eof = 1; \
+			if (ks->end == 0) return -1; \
+		} \
+		return (int)ks->buf[ks->begin++]; \
+	} \
+	static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
+	{ return ks_getuntil2(ks, delimiter, str, dret, 0); }
+
+#ifndef KSTRING_T
+#define KSTRING_T kstring_t
+typedef struct __kstring_t {
+	size_t l, m;
+	char *s;
+} kstring_t;
+#endif
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+#define __KS_GETUNTIL(SCOPE, __read) \
+	SCOPE int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append)  \
+	{ \
+		if (dret) *dret = 0; \
+		str->l = append? str->l : 0; \
+		if (ks->begin >= ks->end && ks->is_eof) return -1; \
+		for (;;) { \
+			int i; \
+			if (ks->begin >= ks->end) { \
+				if (!ks->is_eof) { \
+					ks->begin = 0; \
+					ks->end = __read(ks->f, ks->buf, ks->bufsize); \
+					if (ks->end < ks->bufsize) ks->is_eof = 1; \
+					if (ks->end == 0) break; \
+				} else break; \
+			} \
+			if (delimiter == KS_SEP_LINE) {  \
+				for (i = ks->begin; i < ks->end; ++i)  \
+					if (ks->buf[i] == '\n') break; \
+			} else if (delimiter > KS_SEP_MAX) { \
+				for (i = ks->begin; i < ks->end; ++i) \
+					if (ks->buf[i] == delimiter) break; \
+			} else if (delimiter == KS_SEP_SPACE) { \
+				for (i = ks->begin; i < ks->end; ++i) \
+					if (isspace(ks->buf[i])) break; \
+			} else if (delimiter == KS_SEP_TAB) { \
+				for (i = ks->begin; i < ks->end; ++i) \
+					if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break;  \
+			} else i = 0; /* never come to here! */ \
+			if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \
+				str->m = str->l + (i - ks->begin) + 1; \
+				kroundup32(str->m); \
+				str->s = (char*)realloc(str->s, str->m); \
+			} \
+			memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin);  \
+			str->l = str->l + (i - ks->begin); \
+			ks->begin = i + 1; \
+			if (i < ks->end) { \
+				if (dret) *dret = ks->buf[i]; \
+				break; \
+			} \
+		} \
+		if (str->s == 0) { \
+			str->m = 1; \
+			str->s = (char*)calloc(1, 1); \
+		} else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
+		str->s[str->l] = '\0';											\
+		return str->l; \
+	}
+
+#define KSTREAM_INIT2(SCOPE, type_t, __read, __bufsize) \
+	__KS_TYPE(type_t) \
+	__KS_BASIC(SCOPE, type_t, __bufsize) \
+	__KS_GETUNTIL(SCOPE, __read) \
+	__KS_INLINED(__read)
+
+#define KSTREAM_INIT(type_t, __read, __bufsize) KSTREAM_INIT2(static, type_t, __read, __bufsize)
+
+#define KSTREAM_DECLARE(type_t, __read) \
+	__KS_TYPE(type_t) \
+	extern int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append); \
+	extern kstream_t *ks_init(type_t f); \
+	extern void ks_destroy(kstream_t *ks); \
+	__KS_INLINED(__read)
+
+/******************
+ * FASTA/Q parser *
+ ******************/
+
+#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)
+
+#define __KSEQ_BASIC(SCOPE, type_t)										\
+	SCOPE kseq_t *kseq_init(type_t fd)									\
+	{																	\
+		kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t));					\
+		s->f = ks_init(fd);												\
+		return s;														\
+	}																	\
+	SCOPE void kseq_destroy(kseq_t *ks)									\
+	{																	\
+		if (!ks) return;												\
+		free(ks->name.s); free(ks->comment.s); free(ks->seq.s);	free(ks->qual.s); \
+		ks_destroy(ks->f);												\
+		free(ks);														\
+	}
+
+/* Return value:
+   >=0  length of the sequence (normal)
+   -1   end-of-file
+   -2   truncated quality string
+ */
+#define __KSEQ_READ(SCOPE) \
+	SCOPE int kseq_read(kseq_t *seq) \
+	{ \
+		int c; \
+		kstream_t *ks = seq->f; \
+		if (seq->last_char == 0) { /* then jump to the next header line */ \
+			while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
+			if (c == -1) return -1; /* end of file */ \
+			seq->last_char = c; \
+		} /* else: the first header char has been read in the previous call */ \
+		seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
+		if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
+		if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
+		if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
+			seq->seq.m = 256; \
+			seq->seq.s = (char*)malloc(seq->seq.m); \
+		} \
+		while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
+			if (c == '\n') continue; /* skip empty lines */ \
+			seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
+			ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
+		} \
+		if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */	\
+		if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
+			seq->seq.m = seq->seq.l + 2; \
+			kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
+			seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
+		} \
+		seq->seq.s[seq->seq.l] = 0;	/* null terminated string */ \
+		if (c != '+') return seq->seq.l; /* FASTA */ \
+		if (seq->qual.m < seq->seq.m) {	/* allocate memory for qual in case insufficient */ \
+			seq->qual.m = seq->seq.m; \
+			seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
+		} \
+		while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
+		if (c == -1) return -2; /* error: no quality string */ \
+		while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
+		seq->last_char = 0;	/* we have not come to the next header line */ \
+		if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
+		return seq->seq.l; \
+	}
+
+#define __KSEQ_TYPE(type_t)						\
+	typedef struct {							\
+		kstring_t name, comment, seq, qual;		\
+		int last_char;							\
+		kstream_t *f;							\
+	} kseq_t;
+
+#define KSEQ_INIT2(SCOPE, type_t, __read)		\
+	KSTREAM_INIT2(SCOPE, type_t, __read, 16384)	\
+	__KSEQ_TYPE(type_t)							\
+	__KSEQ_BASIC(SCOPE, type_t)					\
+	__KSEQ_READ(SCOPE)
+
+#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
+
+#define KSEQ_DECLARE(type_t) \
+	__KS_TYPE(type_t) \
+	__KSEQ_TYPE(type_t) \
+	extern kseq_t *kseq_init(type_t fd); \
+	void kseq_destroy(kseq_t *ks); \
+	int kseq_read(kseq_t *seq);
+
+#endif
diff --git a/ksort.h b/ksort.h
new file mode 100644
index 0000000..e659b22
--- /dev/null
+++ b/ksort.h
@@ -0,0 +1,309 @@
+/* The MIT License
+
+   Copyright (c) 2008, 2011 Attractive Chaos <attractor@live.co.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/*
+  2011-04-10 (0.1.6):
+
+  	* Added sample
+
+  2011-03 (0.1.5):
+
+	* Added shuffle/permutation
+
+  2008-11-16 (0.1.4):
+
+    * Fixed a bug in introsort() that happens in rare cases.
+
+  2008-11-05 (0.1.3):
+
+    * Fixed a bug in introsort() for complex comparisons.
+
+	* Fixed a bug in mergesort(). The previous version is not stable.
+
+  2008-09-15 (0.1.2):
+
+	* Accelerated introsort. On my Mac (not on another Linux machine),
+	  my implementation is as fast as std::sort on random input.
+
+	* Added combsort and in introsort, switch to combsort if the
+	  recursion is too deep.
+
+  2008-09-13 (0.1.1):
+
+	* Added k-small algorithm
+
+  2008-09-05 (0.1.0):
+
+	* Initial version
+
+*/
+
+#ifndef AC_KSORT_H
+#define AC_KSORT_H
+
+#include <stdlib.h>
+#include <string.h>
+
+typedef struct {
+	void *left, *right;
+	int depth;
+} ks_isort_stack_t;
+
+#define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; }
+
+#define KSORT_INIT(name, type_t, __sort_lt)								\
+	void ks_mergesort_##name(size_t n, type_t array[], type_t temp[])	\
+	{																	\
+		type_t *a2[2], *a, *b;											\
+		int curr, shift;												\
+																		\
+		a2[0] = array;													\
+		a2[1] = temp? temp : (type_t*)malloc(sizeof(type_t) * n);		\
+		for (curr = 0, shift = 0; (1ul<<shift) < n; ++shift) {			\
+			a = a2[curr]; b = a2[1-curr];								\
+			if (shift == 0) {											\
+				type_t *p = b, *i, *eb = a + n;							\
+				for (i = a; i < eb; i += 2) {							\
+					if (i == eb - 1) *p++ = *i;							\
+					else {												\
+						if (__sort_lt(*(i+1), *i)) {					\
+							*p++ = *(i+1); *p++ = *i;					\
+						} else {										\
+							*p++ = *i; *p++ = *(i+1);					\
+						}												\
+					}													\
+				}														\
+			} else {													\
+				size_t i, step = 1ul<<shift;							\
+				for (i = 0; i < n; i += step<<1) {						\
+					type_t *p, *j, *k, *ea, *eb;						\
+					if (n < i + step) {									\
+						ea = a + n; eb = a;								\
+					} else {											\
+						ea = a + i + step;								\
+						eb = a + (n < i + (step<<1)? n : i + (step<<1)); \
+					}													\
+					j = a + i; k = a + i + step; p = b + i;				\
+					while (j < ea && k < eb) {							\
+						if (__sort_lt(*k, *j)) *p++ = *k++;				\
+						else *p++ = *j++;								\
+					}													\
+					while (j < ea) *p++ = *j++;							\
+					while (k < eb) *p++ = *k++;							\
+				}														\
+			}															\
+			curr = 1 - curr;											\
+		}																\
+		if (curr == 1) {												\
+			type_t *p = a2[0], *i = a2[1], *eb = array + n;				\
+			for (; p < eb; ++i) *p++ = *i;								\
+		}																\
+		if (temp == 0) free(a2[1]);										\
+	}																	\
+	void ks_heapdown_##name(size_t i, size_t n, type_t l[]) \
+	{ \
+		size_t k = i; \
+		type_t tmp = l[i]; \
+		while ((k = (k << 1) + 1) < n) { \
+			if (k != n - 1 && __sort_lt(l[k], l[k+1])) ++k; \
+			if (__sort_lt(l[k], tmp)) break; \
+			l[i] = l[k]; i = k; \
+		} \
+		l[i] = tmp; \
+	} \
+	void ks_heapup_##name(size_t n, type_t l[]) \
+	{ \
+		size_t i, k = n - 1; \
+		type_t tmp = l[k]; \
+		while (k) { \
+			i = (k - 1) >> 1; \
+			if (__sort_lt(tmp, l[i])) break; \
+			l[k] = l[i]; k = i; \
+		} \
+		l[k] = tmp; \
+	} \
+	void ks_heapmake_##name(size_t lsize, type_t l[])					\
+	{																	\
+		size_t i;														\
+		for (i = (lsize >> 1) - 1; i != (size_t)(-1); --i)				\
+			ks_heapdown_##name(i, lsize, l);							\
+	}																	\
+	void ks_heapsort_##name(size_t lsize, type_t l[])					\
+	{																	\
+		size_t i;														\
+		for (i = lsize - 1; i > 0; --i) {								\
+			type_t tmp;													\
+			tmp = *l; *l = l[i]; l[i] = tmp; ks_heapdown_##name(0, i, l); \
+		}																\
+	}																	\
+	static inline void __ks_insertsort_##name(type_t *s, type_t *t)		\
+	{																	\
+		type_t *i, *j, swap_tmp;										\
+		for (i = s + 1; i < t; ++i)										\
+			for (j = i; j > s && __sort_lt(*j, *(j-1)); --j) {			\
+				swap_tmp = *j; *j = *(j-1); *(j-1) = swap_tmp;			\
+			}															\
+	}																	\
+	void ks_combsort_##name(size_t n, type_t a[])						\
+	{																	\
+		const double shrink_factor = 1.2473309501039786540366528676643; \
+		int do_swap;													\
+		size_t gap = n;													\
+		type_t tmp, *i, *j;												\
+		do {															\
+			if (gap > 2) {												\
+				gap = (size_t)(gap / shrink_factor);					\
+				if (gap == 9 || gap == 10) gap = 11;					\
+			}															\
+			do_swap = 0;												\
+			for (i = a; i < a + n - gap; ++i) {							\
+				j = i + gap;											\
+				if (__sort_lt(*j, *i)) {								\
+					tmp = *i; *i = *j; *j = tmp;						\
+					do_swap = 1;										\
+				}														\
+			}															\
+		} while (do_swap || gap > 2);									\
+		if (gap != 1) __ks_insertsort_##name(a, a + n);					\
+	}																	\
+	void ks_introsort_##name(size_t n, type_t a[])						\
+	{																	\
+		int d;															\
+		ks_isort_stack_t *top, *stack;									\
+		type_t rp, swap_tmp;											\
+		type_t *s, *t, *i, *j, *k;										\
+																		\
+		if (n < 1) return;												\
+		else if (n == 2) {												\
+			if (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = swap_tmp; } \
+			return;														\
+		}																\
+		for (d = 2; 1ul<<d < n; ++d);									\
+		stack = (ks_isort_stack_t*)malloc(sizeof(ks_isort_stack_t) * ((sizeof(size_t)*d)+2)); \
+		top = stack; s = a; t = a + (n-1); d <<= 1;						\
+		while (1) {														\
+			if (s < t) {												\
+				if (--d == 0) {											\
+					ks_combsort_##name(t - s + 1, s);					\
+					t = s;												\
+					continue;											\
+				}														\
+				i = s; j = t; k = i + ((j-i)>>1) + 1;					\
+				if (__sort_lt(*k, *i)) {								\
+					if (__sort_lt(*k, *j)) k = j;						\
+				} else k = __sort_lt(*j, *i)? i : j;					\
+				rp = *k;												\
+				if (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; }	\
+				for (;;) {												\
+					do ++i; while (__sort_lt(*i, rp));					\
+					do --j; while (i <= j && __sort_lt(rp, *j));		\
+					if (j <= i) break;									\
+					swap_tmp = *i; *i = *j; *j = swap_tmp;				\
+				}														\
+				swap_tmp = *i; *i = *t; *t = swap_tmp;					\
+				if (i-s > t-i) {										\
+					if (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \
+					s = t-i > 16? i+1 : t;								\
+				} else {												\
+					if (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \
+					t = i-s > 16? i-1 : s;								\
+				}														\
+			} else {													\
+				if (top == stack) {										\
+					free(stack);										\
+					__ks_insertsort_##name(a, a+n);						\
+					return;												\
+				} else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \
+			}															\
+		}																\
+	}																	\
+	/* This function is adapted from: http://ndevilla.free.fr/median/ */ \
+	/* 0 <= kk < n */													\
+	type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk)			\
+	{																	\
+		type_t *low, *high, *k, *ll, *hh, *mid;							\
+		low = arr; high = arr + n - 1; k = arr + kk;					\
+		for (;;) {														\
+			if (high <= low) return *k;									\
+			if (high == low + 1) {										\
+				if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
+				return *k;												\
+			}															\
+			mid = low + (high - low) / 2;								\
+			if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \
+			if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
+			if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low);	\
+			KSORT_SWAP(type_t, *mid, *(low+1));							\
+			ll = low + 1; hh = high;									\
+			for (;;) {													\
+				do ++ll; while (__sort_lt(*ll, *low));					\
+				do --hh; while (__sort_lt(*low, *hh));					\
+				if (hh < ll) break;										\
+				KSORT_SWAP(type_t, *ll, *hh);							\
+			}															\
+			KSORT_SWAP(type_t, *low, *hh);								\
+			if (hh <= k) low = ll;										\
+			if (hh >= k) high = hh - 1;									\
+		}																\
+	}																	\
+	void ks_shuffle_##name(size_t n, type_t a[])						\
+	{																	\
+		int i, j;														\
+		for (i = n; i > 1; --i) {										\
+			type_t tmp;													\
+			j = (int)(drand48() * i);									\
+			tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp;					\
+		}																\
+	}																	\
+	void ks_sample_##name(size_t n, size_t r, type_t a[]) /* FIXME: NOT TESTED!!! */ \
+	{ /* reference: http://code.activestate.com/recipes/272884/ */ \
+		int i, k, pop = n; \
+		for (i = (int)r, k = 0; i >= 0; --i) { \
+			double z = 1., x = drand48(); \
+			type_t tmp; \
+			while (x < z) z -= z * i / (pop--); \
+			if (k != n - pop - 1) tmp = a[k], a[k] = a[n-pop-1], a[n-pop-1] = tmp; \
+			++k; \
+		} \
+	}
+
+#define ks_mergesort(name, n, a, t) ks_mergesort_##name(n, a, t)
+#define ks_introsort(name, n, a) ks_introsort_##name(n, a)
+#define ks_combsort(name, n, a) ks_combsort_##name(n, a)
+#define ks_heapsort(name, n, a) ks_heapsort_##name(n, a)
+#define ks_heapmake(name, n, a) ks_heapmake_##name(n, a)
+#define ks_heapadjust(name, i, n, a) ks_heapadjust_##name(i, n, a)
+#define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k)
+#define ks_shuffle(name, n, a) ks_shuffle_##name(n, a)
+
+#define ks_lt_generic(a, b) ((a) < (b))
+#define ks_lt_str(a, b) (strcmp((a), (b)) < 0)
+
+typedef const char *ksstr_t;
+
+#define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic)
+#define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str)
+
+#endif
diff --git a/kstring.h b/kstring.h
new file mode 100644
index 0000000..b179a8c
--- /dev/null
+++ b/kstring.h
@@ -0,0 +1,169 @@
+/* The MIT License
+
+   Copyright (c) by Attractive Chaos <attractor@live.co.uk> 
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+#ifndef KSTRING_H
+#define KSTRING_H
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+#ifndef KSTRING_T
+#define KSTRING_T kstring_t
+typedef struct __kstring_t {
+	size_t l, m;
+	char *s;
+} kstring_t;
+#endif
+
+typedef struct {
+	uint64_t tab[4];
+	int sep, finished;
+	const char *p; // end of the current token
+} ks_tokaux_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+	int ksprintf(kstring_t *s, const char *fmt, ...);
+	int ksprintf_fast(kstring_t *s, const char *fmt, ...);
+	int ksplit_core(char *s, int delimiter, int *_max, int **_offsets);
+	char *kstrstr(const char *str, const char *pat, int **_prep);
+	char *kstrnstr(const char *str, const char *pat, int n, int **_prep);
+	void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep);
+
+	/* kstrtok() is similar to strtok_r() except that str is not
+	 * modified and both str and sep can be NULL. For efficiency, it is
+	 * actually recommended to set both to NULL in the subsequent calls
+	 * if sep is not changed. */
+	char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux);
+
+#ifdef __cplusplus
+}
+#endif
+
+static inline void ks_resize(kstring_t *s, size_t size)
+{
+	if (s->m < size) {
+		s->m = size;
+		kroundup32(s->m);
+		s->s = (char*)realloc(s->s, s->m);
+	}
+}
+
+static inline int kputsn(const char *p, int l, kstring_t *s)
+{
+	if (s->l + l + 1 >= s->m) {
+		s->m = s->l + l + 2;
+		kroundup32(s->m);
+		s->s = (char*)realloc(s->s, s->m);
+	}
+	memcpy(s->s + s->l, p, l);
+	s->l += l;
+	s->s[s->l] = 0;
+	return l;
+}
+
+static inline int kputs(const char *p, kstring_t *s)
+{
+	return kputsn(p, strlen(p), s);
+}
+
+static inline int kputc(int c, kstring_t *s)
+{
+	if (s->l + 1 >= s->m) {
+		s->m = s->l + 2;
+		kroundup32(s->m);
+		s->s = (char*)realloc(s->s, s->m);
+	}
+	s->s[s->l++] = c;
+	s->s[s->l] = 0;
+	return c;
+}
+
+static inline int kputw(int c, kstring_t *s)
+{
+	char buf[16];
+	int l, x;
+	if (c == 0) return kputc('0', s);
+	for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0';
+	if (c < 0) buf[l++] = '-';
+	if (s->l + l + 1 >= s->m) {
+		s->m = s->l + l + 2;
+		kroundup32(s->m);
+		s->s = (char*)realloc(s->s, s->m);
+	}
+	for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x];
+	s->s[s->l] = 0;
+	return 0;
+}
+
+static inline int kputuw(unsigned c, kstring_t *s)
+{
+	char buf[16];
+	int l, i;
+	unsigned x;
+	if (c == 0) return kputc('0', s);
+	for (l = 0, x = c; x > 0; x /= 10) buf[l++] = x%10 + '0';
+	if (s->l + l + 1 >= s->m) {
+		s->m = s->l + l + 2;
+		kroundup32(s->m);
+		s->s = (char*)realloc(s->s, s->m);
+	}
+	for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i];
+	s->s[s->l] = 0;
+	return 0;
+}
+
+static inline int kputl(long c, kstring_t *s)
+{
+	char buf[32];
+	long l, x;
+	if (c == 0) return kputc('0', s);
+	for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0';
+	if (c < 0) buf[l++] = '-';
+	if (s->l + l + 1 >= s->m) {
+		s->m = s->l + l + 2;
+		kroundup32(s->m);
+		s->s = (char*)realloc(s->s, s->m);
+	}
+	for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x];
+	s->s[s->l] = 0;
+	return 0;
+}
+
+static inline int *ksplit(kstring_t *s, int delimiter, int *n)
+{
+	int max = 0, *offsets = 0;
+	*n = ksplit_core(s->s, delimiter, &max, &offsets);
+	return offsets;
+}
+
+#endif
diff --git a/ksw.c b/ksw.c
new file mode 100644
index 0000000..71f2635
--- /dev/null
+++ b/ksw.c
@@ -0,0 +1,352 @@
+/* The MIT License
+
+   Copyright (c) 2011 by Attractive Chaos <attractor@live.co.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <emmintrin.h>
+#include "ksw.h"
+
+#ifdef __GNUC__
+#define LIKELY(x) __builtin_expect((x),1)
+#define UNLIKELY(x) __builtin_expect((x),0)
+#else
+#define LIKELY(x) (x)
+#define UNLIKELY(x) (x)
+#endif
+
+const kswr_t g_defr = { 0, -1, -1, -1, -1, -1, -1 };
+
+struct _kswq_t {
+	int qlen, slen;
+	uint8_t shift, mdiff, max, size;
+	__m128i *qp, *H0, *H1, *E, *Hmax;
+};
+
+/**
+ * Initialize the query data structure
+ *
+ * @param size   Number of bytes used to store a score; valid valures are 1 or 2
+ * @param qlen   Length of the query sequence
+ * @param query  Query sequence
+ * @param m      Size of the alphabet
+ * @param mat    Scoring matrix in a one-dimension array
+ *
+ * @return       Query data structure
+ */
+kswq_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat)
+{
+	kswq_t *q;
+	int slen, a, tmp, p;
+
+	size = size > 1? 2 : 1;
+	p = 8 * (3 - size); // # values per __m128i
+	slen = (qlen + p - 1) / p; // segmented length
+	q = (kswq_t*)malloc(sizeof(kswq_t) + 256 + 16 * slen * (m + 4)); // a single block of memory
+	q->qp = (__m128i*)(((size_t)q + sizeof(kswq_t) + 15) >> 4 << 4); // align memory
+	q->H0 = q->qp + slen * m;
+	q->H1 = q->H0 + slen;
+	q->E  = q->H1 + slen;
+	q->Hmax = q->E + slen;
+	q->slen = slen; q->qlen = qlen; q->size = size;
+	// compute shift
+	tmp = m * m;
+	for (a = 0, q->shift = 127, q->mdiff = 0; a < tmp; ++a) { // find the minimum and maximum score
+		if (mat[a] < (int8_t)q->shift) q->shift = mat[a];
+		if (mat[a] > (int8_t)q->mdiff) q->mdiff = mat[a];
+	}
+	q->max = q->mdiff;
+	q->shift = 256 - q->shift; // NB: q->shift is uint8_t
+	q->mdiff += q->shift; // this is the difference between the min and max scores
+	// An example: p=8, qlen=19, slen=3 and segmentation:
+	//  {{0,3,6,9,12,15,18,-1},{1,4,7,10,13,16,-1,-1},{2,5,8,11,14,17,-1,-1}}
+	if (size == 1) {
+		int8_t *t = (int8_t*)q->qp;
+		for (a = 0; a < m; ++a) {
+			int i, k, nlen = slen * p;
+			const int8_t *ma = mat + a * m;
+			for (i = 0; i < slen; ++i)
+				for (k = i; k < nlen; k += slen) // p iterations
+					*t++ = (k >= qlen? 0 : ma[query[k]]) + q->shift;
+		}
+	} else {
+		int16_t *t = (int16_t*)q->qp;
+		for (a = 0; a < m; ++a) {
+			int i, k, nlen = slen * p;
+			const int8_t *ma = mat + a * m;
+			for (i = 0; i < slen; ++i)
+				for (k = i; k < nlen; k += slen) // p iterations
+					*t++ = (k >= qlen? 0 : ma[query[k]]);
+		}
+	}
+	return q;
+}
+
+kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, int xtra) // the first gap costs -(_o+_e)
+{
+	int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc;
+	uint64_t *b;
+	__m128i zero, gapoe, gape, shift, *H0, *H1, *E, *Hmax;
+	kswr_t r;
+
+#define __max_16(ret, xx) do { \
+		(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 8)); \
+		(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 4)); \
+		(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 2)); \
+		(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 1)); \
+    	(ret) = _mm_extract_epi16((xx), 0) & 0x00ff; \
+	} while (0)
+
+	// initialization
+	r = g_defr;
+	minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000;
+	endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000;
+	m_b = n_b = 0; b = 0;
+	zero = _mm_set1_epi32(0);
+	gapoe = _mm_set1_epi8(_gapo + _gape);
+	gape = _mm_set1_epi8(_gape);
+	shift = _mm_set1_epi8(q->shift);
+	H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
+	slen = q->slen;
+	for (i = 0; i < slen; ++i) {
+		_mm_store_si128(E + i, zero);
+		_mm_store_si128(H0 + i, zero);
+		_mm_store_si128(Hmax + i, zero);
+	}
+	// the core loop
+	for (i = 0; i < tlen; ++i) {
+		int j, k, cmp, imax;
+		__m128i e, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
+		h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
+		h = _mm_slli_si128(h, 1); // h=H(i-1,-1); << instead of >> because x64 is little-endian
+		for (j = 0; LIKELY(j < slen); ++j) {
+			/* SW cells are computed in the following order:
+			 *   H(i,j)   = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)}
+			 *   E(i+1,j) = max{H(i,j)-q, E(i,j)-r}
+			 *   F(i,j+1) = max{H(i,j)-q, F(i,j)-r}
+			 */
+			// compute H'(i,j); note that at the beginning, h=H'(i-1,j-1)
+			h = _mm_adds_epu8(h, _mm_load_si128(S + j));
+			h = _mm_subs_epu8(h, shift); // h=H'(i-1,j-1)+S(i,j)
+			e = _mm_load_si128(E + j); // e=E'(i,j)
+			h = _mm_max_epu8(h, e);
+			h = _mm_max_epu8(h, f); // h=H'(i,j)
+			max = _mm_max_epu8(max, h); // set max
+			_mm_store_si128(H1 + j, h); // save to H'(i,j)
+			// now compute E'(i+1,j)
+			h = _mm_subs_epu8(h, gapoe); // h=H'(i,j)-gapo
+			e = _mm_subs_epu8(e, gape); // e=E'(i,j)-gape
+			e = _mm_max_epu8(e, h); // e=E'(i+1,j)
+			_mm_store_si128(E + j, e); // save to E'(i+1,j)
+			// now compute F'(i,j+1)
+			f = _mm_subs_epu8(f, gape);
+			f = _mm_max_epu8(f, h);
+			// get H'(i-1,j) and prepare for the next j
+			h = _mm_load_si128(H0 + j); // h=H'(i-1,j)
+		}
+		// NB: we do not need to set E(i,j) as we disallow adjecent insertion and then deletion
+		for (k = 0; LIKELY(k < 16); ++k) { // this block mimics SWPS3; NB: H(i,j) updated in the lazy-F loop cannot exceed max
+			f = _mm_slli_si128(f, 1);
+			for (j = 0; LIKELY(j < slen); ++j) {
+				h = _mm_load_si128(H1 + j);
+				h = _mm_max_epu8(h, f); // h=H'(i,j)
+				_mm_store_si128(H1 + j, h);
+				h = _mm_subs_epu8(h, gapoe);
+				f = _mm_subs_epu8(f, gape);
+				cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_subs_epu8(f, h), zero));
+				if (UNLIKELY(cmp == 0xffff)) goto end_loop16;
+			}
+		}
+end_loop16:
+		//int k;for (k=0;k<16;++k)printf("%d ", ((uint8_t*)&max)[k]);printf("\n");
+		__max_16(imax, max); // imax is the maximum number in max
+		if (imax >= minsc) { // write the b array; this condition adds branching unfornately
+			if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { // then append
+				if (n_b == m_b) {
+					m_b = m_b? m_b<<1 : 8;
+					b = (uint64_t*)realloc(b, 8 * m_b);
+				}
+				b[n_b++] = (uint64_t)imax<<32 | i;
+			} else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last
+		}
+		if (imax > gmax) {
+			gmax = imax; te = i; // te is the end position on the target
+			for (j = 0; LIKELY(j < slen); ++j) // keep the H1 vector
+				_mm_store_si128(Hmax + j, _mm_load_si128(H1 + j));
+			if (gmax + q->shift >= 255 || gmax >= endsc) break;
+		}
+		S = H1; H1 = H0; H0 = S; // swap H0 and H1
+	}
+	r.score = gmax + q->shift < 255? gmax : 255;
+	r.te = te;
+	if (r.score != 255) { // get a->qe, the end of query match; find the 2nd best score
+		int max = -1, low, high, qlen = slen * 16;
+		uint8_t *t = (uint8_t*)Hmax;
+		for (i = 0; i < qlen; ++i, ++t)
+			if ((int)*t > max) max = *t, r.qe = i / 16 + i % 16 * slen;
+		//printf("%d,%d\n", max, gmax);
+		if (b) {
+			i = (r.score + q->max - 1) / q->max;
+			low = te - i; high = te + i;
+			for (i = 0; i < n_b; ++i) {
+				int e = (int32_t)b[i];
+				if ((e < low || e > high) && b[i]>>32 > (uint32_t)r.score2)
+					r.score2 = b[i]>>32, r.te2 = e;
+			}
+		}
+	}
+	free(b);
+	return r;
+}
+
+kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, int xtra) // the first gap costs -(_o+_e)
+{
+	int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc;
+	uint64_t *b;
+	__m128i zero, gapoe, gape, *H0, *H1, *E, *Hmax;
+	kswr_t r;
+
+#define __max_8(ret, xx) do { \
+		(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \
+		(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 4)); \
+		(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 2)); \
+    	(ret) = _mm_extract_epi16((xx), 0); \
+	} while (0)
+
+	// initialization
+	r = g_defr;
+	minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000;
+	endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000;
+	m_b = n_b = 0; b = 0;
+	zero = _mm_set1_epi32(0);
+	gapoe = _mm_set1_epi16(_gapo + _gape);
+	gape = _mm_set1_epi16(_gape);
+	H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
+	slen = q->slen;
+	for (i = 0; i < slen; ++i) {
+		_mm_store_si128(E + i, zero);
+		_mm_store_si128(H0 + i, zero);
+		_mm_store_si128(Hmax + i, zero);
+	}
+	// the core loop
+	for (i = 0; i < tlen; ++i) {
+		int j, k, imax;
+		__m128i e, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
+		h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
+		h = _mm_slli_si128(h, 2);
+		for (j = 0; LIKELY(j < slen); ++j) {
+			h = _mm_adds_epi16(h, *S++);
+			e = _mm_load_si128(E + j);
+			h = _mm_max_epi16(h, e);
+			h = _mm_max_epi16(h, f);
+			max = _mm_max_epi16(max, h);
+			_mm_store_si128(H1 + j, h);
+			h = _mm_subs_epu16(h, gapoe);
+			e = _mm_subs_epu16(e, gape);
+			e = _mm_max_epi16(e, h);
+			_mm_store_si128(E + j, e);
+			f = _mm_subs_epu16(f, gape);
+			f = _mm_max_epi16(f, h);
+			h = _mm_load_si128(H0 + j);
+		}
+		for (k = 0; LIKELY(k < 16); ++k) {
+			f = _mm_slli_si128(f, 2);
+			for (j = 0; LIKELY(j < slen); ++j) {
+				h = _mm_load_si128(H1 + j);
+				h = _mm_max_epi16(h, f);
+				_mm_store_si128(H1 + j, h);
+				h = _mm_subs_epu16(h, gapoe);
+				f = _mm_subs_epu16(f, gape);
+				if(UNLIKELY(!_mm_movemask_epi8(_mm_cmpgt_epi16(f, h)))) goto end_loop8;
+			}
+		}
+end_loop8:
+		__max_8(imax, max);
+		if (imax >= minsc) {
+			if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) {
+				if (n_b == m_b) {
+					m_b = m_b? m_b<<1 : 8;
+					b = (uint64_t*)realloc(b, 8 * m_b);
+				}
+				b[n_b++] = (uint64_t)imax<<32 | i;
+			} else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last
+		}
+		if (imax > gmax) {
+			gmax = imax; te = i;
+			for (j = 0; LIKELY(j < slen); ++j)
+				_mm_store_si128(Hmax + j, _mm_load_si128(H1 + j));
+			if (gmax >= endsc) break;
+		}
+		S = H1; H1 = H0; H0 = S;
+	}
+	r.score = gmax; r.te = te;
+	{
+		int max = -1, low, high, qlen = slen * 8;
+		uint16_t *t = (uint16_t*)Hmax;
+		for (i = 0, r.qe = -1; i < qlen; ++i, ++t)
+			if ((int)*t > max) max = *t, r.qe = i / 8 + i % 8 * slen;
+		if (b) {
+			i = (r.score + q->max - 1) / q->max;
+			low = te - i; high = te + i;
+			for (i = 0; i < n_b; ++i) {
+				int e = (int32_t)b[i];
+				if ((e < low || e > high) && b[i]>>32 > (uint32_t)r.score2)
+					r.score2 = b[i]>>32, r.te2 = e;
+			}
+		}
+	}
+	free(b);
+	return r;
+}
+
+static void revseq(int l, uint8_t *s)
+{
+	int i, t;
+	for (i = 0; i < l>>1; ++i)
+		t = s[i], s[i] = s[l - 1 - i], s[l - 1 - i] = t;
+}
+
+kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry)
+{
+	int size;
+	kswq_t *q;
+	kswr_t r, rr;
+	kswr_t (*func)(kswq_t*, int, const uint8_t*, int, int, int);
+
+	q = (qry && *qry)? *qry : ksw_qinit((xtra&KSW_XBYTE)? 1 : 2, qlen, query, m, mat);
+	if (qry && *qry == 0) *qry = q;
+	func = q->size == 2? ksw_i16 : ksw_u8;
+	size = q->size;
+	r = func(q, tlen, target, gapo, gape, xtra);
+	if (qry == 0) free(q);
+	if ((xtra&KSW_XSTART) == 0 || ((xtra&KSW_XSUBO) && r.score < (xtra&0xffff))) return r;
+	revseq(r.qe + 1, query); revseq(r.te + 1, target); // +1 because qe/te points to the exact end, not the position after the end
+	q = ksw_qinit(size, r.qe + 1, query, m, mat);
+	rr = func(q, tlen, target, gapo, gape, KSW_XSTOP | r.score);
+	revseq(r.qe + 1, query); revseq(r.te + 1, target);
+	free(q);
+	if (r.score == rr.score)
+		r.tb = r.te - rr.te, r.qb = r.qe - rr.qe;
+	return r;
+}
diff --git a/ksw.h b/ksw.h
new file mode 100644
index 0000000..160cee8
--- /dev/null
+++ b/ksw.h
@@ -0,0 +1,71 @@
+#ifndef __AC_KSW_H
+#define __AC_KSW_H
+
+#include <stdint.h>
+
+#define KSW_XBYTE  0x10000
+#define KSW_XSTOP  0x20000
+#define KSW_XSUBO  0x40000
+#define KSW_XSTART 0x80000
+
+struct _kswq_t;
+typedef struct _kswq_t kswq_t;
+
+typedef struct {
+	int score; // best score
+	int te, qe; // target end and query end
+	int score2, te2; // second best score and ending position on the target
+	int tb, qb; // target start and query start
+} kswr_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+	/**
+	 * Aligning two sequences
+	 *
+	 * @param qlen    length of the query sequence (typically <tlen)
+	 * @param query   query sequence with 0 <= query[i] < m
+	 * @param tlen    length of the target sequence
+	 * @param target  target sequence
+	 * @param m       number of residue types
+	 * @param mat     m*m scoring matrix in one-dimention array
+	 * @param gapo    gap open penalty; a gap of length l cost "-(gapo+l*gape)"
+	 * @param gape    gap extension penalty
+	 * @param xtra    extra information (see below)
+	 * @param qry     query profile (see below)
+	 *
+	 * @return        alignment information in a struct; unset values to -1
+	 *
+	 * When xtra==0, ksw_align() uses a signed two-byte integer to store a
+	 * score and only finds the best score and the end positions. The 2nd best
+	 * score or the start positions are not attempted. The default behavior can
+	 * be tuned by setting KSW_X* flags:
+	 *
+	 *   KSW_XBYTE:  use an unsigned byte to store a score. If overflow occurs,
+	 *               kswr_t::score will be set to 255
+	 *
+	 *   KSW_XSUBO:  track the 2nd best score and the ending position on the
+	 *               target if the 2nd best is higher than (xtra&0xffff)
+	 *
+	 *   KSW_XSTOP:  stop if the maximum score is above (xtra&0xffff)
+	 *
+	 *   KSW_XSTART: find the start positions
+	 *
+	 * When *qry==NULL, ksw_align() will compute and allocate the query profile
+	 * and when the function returns, *qry will point to the profile, which can
+	 * be deallocated simply by free(). If one query is aligned against multiple
+	 * target sequences, *qry should be set to NULL during the first call and
+	 * freed after the last call. Note that qry can equal 0. In this case, the
+	 * query profile will be deallocated in ksw_align().
+	 */
+	kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry);
+
+	kswq_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/kthread.c b/kthread.c
new file mode 100644
index 0000000..de64a9d
--- /dev/null
+++ b/kthread.c
@@ -0,0 +1,65 @@
+#include <pthread.h>
+#include <stdlib.h>
+#include <limits.h>
+
+/************
+ * kt_for() *
+ ************/
+
+struct kt_for_t;
+
+typedef struct {
+	struct kt_for_t *t;
+	long i;
+} ktf_worker_t;
+
+typedef struct kt_for_t {
+	int n_threads;
+	long n;
+	ktf_worker_t *w;
+	void (*func)(void*,long,int);
+	void *data;
+} kt_for_t;
+
+static inline long steal_work(kt_for_t *t)
+{
+	int i, min_i = -1;
+	long k, min = LONG_MAX;
+	for (i = 0; i < t->n_threads; ++i)
+		if (min > t->w[i].i) min = t->w[i].i, min_i = i;
+	k = __sync_fetch_and_add(&t->w[min_i].i, t->n_threads);
+	return k >= t->n? -1 : k;
+}
+
+static void *ktf_worker(void *data)
+{
+	ktf_worker_t *w = (ktf_worker_t*)data;
+	long i;
+	for (;;) {
+		i = __sync_fetch_and_add(&w->i, w->t->n_threads);
+		if (i >= w->t->n) break;
+		w->t->func(w->t->data, i, w - w->t->w);
+	}
+	while ((i = steal_work(w->t)) >= 0)
+		w->t->func(w->t->data, i, w - w->t->w);
+	pthread_exit(0);
+}
+
+void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n)
+{
+	if (n_threads > 1) {
+		int i;
+		kt_for_t t;
+		pthread_t *tid;
+		t.func = func, t.data = data, t.n_threads = n_threads, t.n = n;
+		t.w = (ktf_worker_t*)alloca(n_threads * sizeof(ktf_worker_t));
+		tid = (pthread_t*)alloca(n_threads * sizeof(pthread_t));
+		for (i = 0; i < n_threads; ++i)
+			t.w[i].t = &t, t.w[i].i = i;
+		for (i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, ktf_worker, &t.w[i]);
+		for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0);
+	} else {
+		long j;
+		for (j = 0; j < n; ++j) func(data, j, 0);
+	}
+}
diff --git a/kvec.h b/kvec.h
new file mode 100644
index 0000000..632fce4
--- /dev/null
+++ b/kvec.h
@@ -0,0 +1,110 @@
+/* The MIT License
+
+   Copyright (c) 2008, by Attractive Chaos <attractor@live.co.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/*
+  An example:
+
+#include "kvec.h"
+int main() {
+	kvec_t(int) array;
+	kv_init(array);
+	kv_push(int, array, 10); // append
+	kv_a(int, array, 20) = 5; // dynamic
+	kv_A(array, 20) = 4; // static
+	kv_destroy(array);
+	return 0;
+}
+*/
+
+/*
+  2008-09-22 (0.1.0):
+
+	* The initial version.
+
+*/
+
+#ifndef AC_KVEC_H
+#define AC_KVEC_H
+
+#include <stdlib.h>
+
+#define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+
+#define kvec_t(type) struct { size_t n, m; type *a; }
+#define kv_init(v) ((v).n = (v).m = 0, (v).a = 0)
+#define kv_destroy(v) free((v).a)
+#define kv_A(v, i) ((v).a[(i)])
+#define kv_pop(v) ((v).a[--(v).n])
+#define kv_size(v) ((v).n)
+#define kv_max(v) ((v).m)
+
+#define kv_resize(type, v, s) do { \
+		if ((v).m < (s)) { \
+			(v).m = (s); \
+			kv_roundup32((v).m); \
+			(v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \
+		} \
+	} while (0)
+
+#define kv_copy(type, v1, v0) do {							\
+		if ((v1).m < (v0).n) kv_resize(type, v1, (v0).n);	\
+		(v1).n = (v0).n;									\
+		memcpy((v1).a, (v0).a, sizeof(type) * (v0).n);		\
+	} while (0)												\
+
+#define kv_push(type, v, x) do {									\
+		if ((v).n == (v).m) {										\
+			(v).m = (v).m? (v).m<<1 : 2;							\
+			(v).a = (type*)realloc((v).a, sizeof(type) * (v).m);	\
+		}															\
+		(v).a[(v).n++] = (x);										\
+	} while (0)
+
+#define kv_pushp(type, v, p) do { \
+		if ((v).n == (v).m) { \
+			(v).m = (v).m? (v).m<<1 : 2; \
+			(v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \
+		} \
+		*(p) = &(v).a[(v).n++]; \
+	} while (0)
+
+#define kv_a(type, v, i) ((v).m <= (size_t)(i)?						\
+						  ((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \
+						   (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \
+						  : (v).n <= (size_t)(i)? (v).n = (i)			\
+						  : 0), (v).a[(i)]
+
+#define kv_reverse(type, v, start) do { \
+		if ((v).m > 0 && (v).n > (start)) { \
+			size_t __i, __end = (v).n - (start); \
+			type *__a = (v).a + (start); \
+			for (__i = 0; __i < __end>>1; ++__i) { \
+				type __t = __a[__end - 1 - __i]; \
+				__a[__end - 1 - __i] = __a[__i]; __a[__i] = __t; \
+			} \
+		} \
+	} while (0)
+
+#endif
diff --git a/mag.c b/mag.c
new file mode 100644
index 0000000..acdca29
--- /dev/null
+++ b/mag.c
@@ -0,0 +1,620 @@
+/* remaining problems:
+
+  1. multiedges due to tandem repeats
+*/
+
+#include <math.h>
+#include <zlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include "mag.h"
+#include "kvec.h"
+#include "internal.h"
+#include "kseq.h"
+KSEQ_DECLARE(gzFile)
+
+#include "khash.h"
+KHASH_INIT2(64,, khint64_t, uint64_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
+
+typedef khash_t(64) hash64_t;
+
+#define ku128_xlt(a, b) ((a).x < (b).x || ((a).x == (b).x && (a).y > (b).y))
+#define ku128_ylt(a, b) ((int64_t)(a).y > (int64_t)(b).y)
+#include "ksort.h"
+KSORT_INIT(128x, ku128_t, ku128_xlt)
+KSORT_INIT(128y, ku128_t, ku128_ylt)
+KSORT_INIT_GENERIC(uint64_t)
+
+#define edge_mark_del(_x) ((_x).x = (uint64_t)-2, (_x).y = 0)
+#define edge_is_del(_x)   ((_x).x == (uint64_t)-2 || (_x).y == 0)
+
+int fm_verbose = 1;
+
+/*********************
+ * Vector operations *
+ *********************/
+
+static inline void v128_clean(ku128_v *r)
+{
+	int i, j;
+	for (i = j = 0; i < r->n; ++i)
+		if (!edge_is_del(r->a[i])) { // keep this arc
+			if (j != i) r->a[j++] = r->a[i];
+			else ++j;
+		}
+	r->n = j;
+}
+
+void mag_v128_clean(ku128_v *r)
+{
+	v128_clean(r);
+}
+
+static inline void v128_rmdup(ku128_v *r)
+{
+	int l, cnt;
+	uint64_t x;
+	if (r->n > 1) ks_introsort(128x, r->n, r->a);
+	for (l = cnt = 0; l < r->n; ++l) // jump to the first node to be retained
+		if (edge_is_del(r->a[l])) ++cnt;
+		else break;
+	if (l == r->n) { // no good arcs
+		r->n = 0;
+		return;
+	}
+	x = r->a[l].x;
+	for (++l; l < r->n; ++l) { // mark duplicated node
+		if (edge_is_del(r->a[l]) || r->a[l].x == x)
+			edge_mark_del(r->a[l]), ++cnt;
+		else x = r->a[l].x;
+	}
+	if (cnt) v128_clean(r);
+}
+
+static inline void v128_cap(ku128_v *r, int max)
+{
+	int i, thres;
+	if (r->n <= max) return;
+	ks_introsort(128y, r->n, r->a);
+	thres = r->a[max].y;
+	for (i = 0; i < r->n; ++i)
+		if (r->a[i].y == thres) break;
+	r->n = i;
+}
+
+/*************************************************
+ * Mapping between vertex id and interval end id *
+ *************************************************/
+
+void mag_g_build_hash(mag_t *g)
+{
+	long i;
+	int j, ret;
+	hash64_t *h;
+	h = kh_init(64);
+	for (i = 0; i < g->v.n; ++i) {
+		const magv_t *p = &g->v.a[i];
+		for (j = 0; j < 2; ++j) {
+			khint_t k = kh_put(64, h, p->k[j], &ret);
+			if (ret == 0) {
+				if (fm_verbose >= 2)
+					fprintf(stderr, "[W::%s] terminal %ld is duplicated.\n", __func__, (long)p->k[j]);
+				kh_val(h, k) = (uint64_t)-1;
+			} else kh_val(h, k) = i<<1|j;
+		}
+	}
+	g->h = h;
+}
+
+static inline uint64_t tid2idd(hash64_t *h, uint64_t tid)
+{
+	khint_t k = kh_get(64, h, tid);
+	assert(k != kh_end(h));
+	return kh_val(h, k);
+}
+
+uint64_t mag_tid2idd(void *h, uint64_t tid) // exported version
+{
+	return tid2idd(h, tid);
+}
+
+void mag_g_amend(mag_t *g)
+{
+	int i, j, l, ll;
+	for (i = 0; i < g->v.n; ++i) {
+		magv_t *p = &g->v.a[i];
+		ku128_v *r;
+		for (j = 0; j < 2; ++j) {
+			for (l = 0; l < p->nei[j].n; ++l) {
+				khint_t k;
+				uint64_t z, x = p->nei[j].a[l].x;
+				k = kh_get(64, g->h, x);
+				if (k == kh_end((hash64_t*)g->h)) { // neighbor is not in the hash table; likely due to tip removal
+					edge_mark_del(p->nei[j].a[l]);
+					continue;
+				} else z = kh_val((hash64_t*)g->h, k);
+				r = &g->v.a[z>>1].nei[z&1];
+				for (ll = 0, z = p->k[j]; ll < r->n; ++ll)
+					if (r->a[ll].x == z) break;
+				if (ll == r->n) // not in neighbor's neighor
+					edge_mark_del(p->nei[j].a[l]);
+			}
+			v128_rmdup(&p->nei[j]);
+		}
+	}
+}
+
+/*********************************
+ * Graph I/O initialization etc. *
+ *********************************/
+
+void mag_v_write(const magv_t *p, kstring_t *out)
+{
+	int j, k;
+	if (p->len <= 0) return;
+	out->l = 0;
+	kputc('@', out); kputl(p->k[0], out); kputc(':', out); kputl(p->k[1], out);
+	kputc('\t', out); kputw(p->nsr, out);
+	for (j = 0; j < 2; ++j) {
+		const ku128_v *r = &p->nei[j];
+		kputc('\t', out);
+		for (k = 0; k < r->n; ++k) {
+			if (edge_is_del(r->a[k])) continue;
+			kputl(r->a[k].x, out); kputc(',', out); kputw((int32_t)r->a[k].y, out);
+			kputc(';', out);
+		}
+		if (p->nei[j].n == 0) kputc('.', out);
+	}
+	kputc('\n', out);
+	ks_resize(out, out->l + 2 * p->len + 5);
+	for (j = 0; j < p->len; ++j)
+		out->s[out->l++] = "ACGT"[(int)p->seq[j] - 1];
+	out->s[out->l] = 0;
+	kputsn("\n+\n", 3, out);
+	kputsn(p->cov, p->len, out);
+	kputc('\n', out);
+}
+
+void mag_g_print(const mag_t *g)
+{
+	int i;
+	kstring_t out;
+	out.l = out.m = 0; out.s = 0;
+	for (i = 0; i < g->v.n; ++i) {
+		if (g->v.a[i].len < 0) continue;
+		mag_v_write(&g->v.a[i], &out);
+		fwrite(out.s, 1, out.l, stdout);
+	}
+	free(out.s);
+	fflush(stdout);
+}
+
+/**************************
+ * Basic graph operations *
+ **************************/
+
+void mag_v_destroy(magv_t *v)
+{
+	free(v->nei[0].a); free(v->nei[1].a);
+	free(v->seq); free(v->cov);
+	memset(v, 0, sizeof(magv_t));
+	v->len = -1;
+}
+
+void mag_g_destroy(mag_t *g)
+{
+	int i;
+	kh_destroy(64, g->h);
+	for (i = 0; i < g->v.n; ++i)
+		mag_v_destroy(&g->v.a[i]);
+	free(g->v.a);
+	free(g);
+}
+
+void mag_v_copy_to_empty(magv_t *dst, const magv_t *src) // NB: memory leak if dst is allocated
+{
+	memcpy(dst, src, sizeof(magv_t));
+	dst->max_len = dst->len + 1;
+	kroundup32(dst->max_len);
+	dst->seq = calloc(dst->max_len, 1); memcpy(dst->seq, src->seq, src->len);
+	dst->cov = calloc(dst->max_len, 1); memcpy(dst->cov, src->cov, src->len);
+	kv_init(dst->nei[0]); kv_copy(ku128_t, dst->nei[0], src->nei[0]);
+	kv_init(dst->nei[1]); kv_copy(ku128_t, dst->nei[1], src->nei[1]);
+}
+
+void mag_eh_add(mag_t *g, uint64_t u, uint64_t v, int ovlp) // add v to u
+{
+	ku128_v *r;
+	ku128_t *q;
+	uint64_t idd;
+	int i;
+	if ((int64_t)u < 0) return;
+	idd = tid2idd(g->h, u);
+	r = &g->v.a[idd>>1].nei[idd&1];
+	for (i = 0; i < r->n; ++i) // no multi-edges
+		if (r->a[i].x == v) return;
+	kv_pushp(ku128_t, *r, &q);
+	q->x = v; q->y = ovlp;
+}
+
+void mag_eh_markdel(mag_t *g, uint64_t u, uint64_t v) // mark deletion of v from u
+{
+	int i;	
+	uint64_t idd;
+	if ((int64_t)u < 0) return;
+	idd = tid2idd(g->h, u);
+	ku128_v *r = &g->v.a[idd>>1].nei[idd&1];
+	for (i = 0; i < r->n; ++i)
+		if (r->a[i].x == v) edge_mark_del(r->a[i]);
+}
+
+void mag_v_del(mag_t *g, magv_t *p)
+{
+	int i, j;
+	khint_t k;
+	if (p->len < 0) return;
+	for (i = 0; i < 2; ++i) {
+		ku128_v *r = &p->nei[i];
+		for (j = 0; j < r->n; ++j)
+			if (!edge_is_del(r->a[j]) && r->a[j].x != p->k[0] && r->a[j].x != p->k[1])
+				mag_eh_markdel(g, r->a[j].x, p->k[i]);
+	}
+	for (i = 0; i < 2; ++i) {
+		k = kh_get(64, g->h, p->k[i]);
+		kh_del(64, g->h, k);
+	}
+	mag_v_destroy(p);
+}
+
+void mag_v_transdel(mag_t *g, magv_t *p, int min_ovlp)
+{
+	if (p->nei[0].n && p->nei[1].n) {
+		int i, j, ovlp;
+		for (i = 0; i < p->nei[0].n; ++i) {
+			if (edge_is_del(p->nei[0].a[i]) || p->nei[0].a[i].x == p->k[0] || p->nei[0].a[i].x == p->k[1]) continue; // due to p->p loop
+			for (j = 0; j < p->nei[1].n; ++j) {
+				if (edge_is_del(p->nei[1].a[j]) || p->nei[1].a[j].x == p->k[0] || p->nei[1].a[j].x == p->k[1]) continue;
+				ovlp = (int)(p->nei[0].a[i].y + p->nei[1].a[j].y) - p->len;
+				if (ovlp >= min_ovlp) {
+					mag_eh_add(g, p->nei[0].a[i].x, p->nei[1].a[j].x, ovlp);
+					mag_eh_add(g, p->nei[1].a[j].x, p->nei[0].a[i].x, ovlp);
+				}
+			}
+		}
+	}
+	mag_v_del(g, p);
+}
+
+void mag_v_flip(mag_t *g, magv_t *p)
+{
+	ku128_v t;
+	khint_t k;
+	hash64_t *h = (hash64_t*)g->h;
+
+	seq_revcomp6(p->len, (uint8_t*)p->seq);
+	seq_reverse(p->len, (uint8_t*)p->cov);
+	p->k[0] ^= p->k[1]; p->k[1] ^= p->k[0]; p->k[0] ^= p->k[1];
+	t = p->nei[0]; p->nei[0] = p->nei[1]; p->nei[1] = t;
+	k = kh_get(64, h, p->k[0]);
+	assert(k != kh_end(h));
+	kh_val(h, k) ^= 1;
+	k = kh_get(64, h, p->k[1]);
+	assert(k != kh_end(h));
+	kh_val(h, k) ^= 1;
+}
+
+/*********************
+ * Unambiguous merge *
+ *********************/
+
+int mag_vh_merge_try(mag_t *g, magv_t *p, int min_merge_len) // merge p's neighbor to the right-end of p
+{
+	magv_t *q;
+	khint_t kp, kq;
+	int i, j, new_l;
+	hash64_t *h = (hash64_t*)g->h;
+
+	// check if an unambiguous merge can be performed
+	if (p->nei[1].n != 1) return -1; // multiple or no neighbor; do not merge
+	if ((int64_t)p->nei[1].a[0].x < 0) return -2; // deleted neighbor
+	if ((int)p->nei[1].a[0].y < min_merge_len) return -5;
+	kq = kh_get(64, g->h, p->nei[1].a[0].x);
+	assert(kq != kh_end(h)); // otherwise the neighbor is non-existant
+	q = &g->v.a[kh_val((hash64_t*)g->h, kq)>>1];
+	if (p == q) return -3; // we have a loop p->p. We cannot merge in this case
+	if (q->nei[kh_val(h, kq)&1].n != 1) return -4; // the neighbor q has multiple neighbors. cannot be an unambiguous merge
+
+	// we can perform a merge; do further consistency check (mostly check bugs)
+	if (kh_val(h, kq)&1) mag_v_flip(g, q); // a "><" bidirectional arc; flip q
+	kp = kh_get(64, g->h, p->k[1]); assert(kp != kh_end(h)); // get the iterator to p
+	kh_del(64, g->h, kp); kh_del(64, g->h, kq); // remove the two ends of the arc in the hash table
+	assert(p->k[1] == q->nei[0].a[0].x && q->k[0] == p->nei[1].a[0].x); // otherwise inconsistent topology
+	assert(p->nei[1].a[0].y == q->nei[0].a[0].y); // the overlap length must be the same
+	assert(p->len >= p->nei[1].a[0].y && q->len >= p->nei[1].a[0].y); // and the overlap is shorter than both vertices
+
+	// update the read count and sequence length
+	p->nsr += q->nsr;
+	new_l = p->len + q->len - p->nei[1].a[0].y;
+	if (new_l + 1 > p->max_len) { // then double p->seq and p->cov
+		p->max_len = new_l + 1;
+		kroundup32(p->max_len);
+		p->seq = realloc(p->seq, p->max_len);
+		p->cov = realloc(p->cov, p->max_len);
+	}
+	// merge seq and cov
+	for (i = p->len - p->nei[1].a[0].y, j = 0; j < q->len; ++i, ++j) { // write seq and cov
+		p->seq[i] = q->seq[j];
+		if (i < p->len) {
+			if ((int)p->cov[i] + (q->cov[j] - 33) > 126) p->cov[i] = 126;
+			else p->cov[i] += q->cov[j] - 33;
+		} else p->cov[i] = q->cov[j];
+	}
+	p->seq[new_l] = p->cov[new_l] = 0;
+	p->len = new_l;
+	// merge neighbors
+	free(p->nei[1].a);
+	p->nei[1] = q->nei[1]; p->k[1] = q->k[1];
+	q->nei[1].a = 0; // to avoid freeing p->nei[1] by mag_v_destroy() below
+	// update the hash table for the right end of p
+	kp = kh_get(64, g->h, p->k[1]);
+	assert(kp != kh_end((hash64_t*)g->h));
+	kh_val(h, kp) = (p - g->v.a)<<1 | 1;
+	// clean up q
+	mag_v_destroy(q);
+	return 0;
+}
+
+void mag_g_merge(mag_t *g, int rmdup, int min_merge_len)
+{
+	int i;
+	uint64_t n = 0;
+	for (i = 0; i < g->v.n; ++i) { // remove multiedges; FIXME: should we do that?
+		if (rmdup) {
+			v128_rmdup(&g->v.a[i].nei[0]);
+			v128_rmdup(&g->v.a[i].nei[1]);
+		} else {
+			v128_clean(&g->v.a[i].nei[0]);
+			v128_clean(&g->v.a[i].nei[1]);
+		}
+	}
+	for (i = 0; i < g->v.n; ++i) {
+		magv_t *p = &g->v.a[i];
+		if (p->len < 0) continue;
+		while (mag_vh_merge_try(g, p, min_merge_len) == 0) ++n;
+		mag_v_flip(g, p);
+		while (mag_vh_merge_try(g, p, min_merge_len) == 0) ++n;
+	}
+	if (fm_verbose >= 3)
+		fprintf(stderr, "[M::%s] unambiguously merged %ld pairs of vertices\n", __func__, (long)n);
+}
+
+/*****************************
+ * Easy graph simplification *
+ *****************************/
+
+typedef magv_t *magv_p;
+
+#define mag_vlt1(a, b) ((a)->nsr < (b)->nsr || ((a)->nsr == (b)->nsr && (a)->len < (b)->len))
+KSORT_INIT(vlt1, magv_p, mag_vlt1)
+
+#define mag_vlt2(a, b) ((a)->nei[0].n + (a)->nei[1].n < (b)->nei[0].n + (b)->nei[1].n)
+KSORT_INIT(vlt2, magv_p, mag_vlt2)
+
+int mag_g_rm_vext(mag_t *g, int min_len, int min_nsr)
+{
+	int i;
+	kvec_t(magv_p) a = {0,0,0};
+
+	for (i = 0; i < g->v.n; ++i) {
+		magv_t *p = &g->v.a[i];
+		if (p->len < 0 || (p->nei[0].n > 0 && p->nei[1].n > 0)) continue;
+		if (p->len >= min_len || p->nsr >= min_nsr) continue;
+		kv_push(magv_p, a, p);
+	}
+	ks_introsort(vlt1, a.n, a.a);
+	for (i = 0; i < a.n; ++i) mag_v_del(g, a.a[i]);
+	free(a.a);
+	if (fm_verbose >= 3)
+		fprintf(stderr, "[M::%s] removed %ld tips (min_len=%d, min_nsr=%d)\n", __func__, a.n, min_len, min_nsr);
+	return a.n;
+}
+
+int mag_g_rm_vint(mag_t *g, int min_len, int min_nsr, int min_ovlp)
+{
+	int i;
+	kvec_t(magv_p) a = {0,0,0};
+
+	for (i = 0; i < g->v.n; ++i) {
+		magv_t *p = &g->v.a[i];
+		if (p->len >= 0 && p->len < min_len && p->nsr < min_nsr)
+			kv_push(magv_p, a, p);
+	}
+	ks_introsort(vlt1, a.n, a.a);
+	for (i = 0; i < a.n; ++i) mag_v_transdel(g, a.a[i], min_ovlp);
+	free(a.a);
+	if (fm_verbose >= 3)
+		fprintf(stderr, "[M::%s] removed %ld internal vertices (min_len=%d, min_nsr=%d)\n", __func__, a.n, min_len, min_nsr);
+	return a.n;
+}
+
+void mag_g_rm_edge(mag_t *g, int min_ovlp, double min_ratio, int min_len, int min_nsr)
+{
+	int i, j, k;
+	kvec_t(magv_p) a = {0,0,0};
+	uint64_t n_marked = 0;
+
+	for (i = 0; i < g->v.n; ++i) {
+		magv_t *p = &g->v.a[i];
+		if (p->len < 0) continue;
+		if ((p->nei[0].n == 0 || p->nei[1].n == 0) && p->len < min_len && p->nsr < min_nsr)
+			continue; // skip tips
+		kv_push(magv_p, a, p);
+	}
+	ks_introsort(vlt1, a.n, a.a);
+
+	for (i = a.n - 1; i >= 0; --i) {
+		magv_t *p = a.a[i];
+		for (j = 0; j < 2; ++j) {
+			ku128_v *r = &p->nei[j];
+			int max_ovlp = min_ovlp, max_k = -1;
+			if (r->n == 0) continue; // no overlapping reads
+			for (k = 0; k < r->n; ++k) // get the max overlap length
+				if (max_ovlp < r->a[k].y)
+					max_ovlp = r->a[k].y, max_k = k;
+			if (max_k >= 0) { // test if max_k is a tip
+				uint64_t x = tid2idd(g->h, r->a[max_k].x);
+				magv_t *q = &g->v.a[x>>1];
+				if (q->len >= 0 && (q->nei[0].n == 0 || q->nei[1].n == 0) && q->len < min_len && q->nsr < min_nsr)
+					max_ovlp = min_ovlp;
+			}
+			for (k = 0; k < r->n; ++k) {
+				if (edge_is_del(r->a[k])) continue;
+				if (r->a[k].y < min_ovlp || (double)r->a[k].y / max_ovlp < min_ratio) {
+					mag_eh_markdel(g, r->a[k].x, p->k[j]); // FIXME: should we check if r->a[k] is p itself?
+					edge_mark_del(r->a[k]);
+					++n_marked;
+				}
+			}
+		}
+	}
+	free(a.a);
+	if (fm_verbose >= 3)
+		fprintf(stderr, "[M::%s] removed %ld edges\n", __func__, (long)n_marked);
+}
+
+/*********************************************
+ * A-statistics and simplistic flow analysis *
+ *********************************************/
+
+#define A_THRES 20.
+#define A_MIN_SUPP 5
+
+double mag_cal_rdist(mag_t *g)
+{
+	magv_v *v = &g->v;
+	int j;
+	uint64_t *srt;
+	double rdist = -1.;
+	int64_t i, sum_n_all, sum_n, sum_l;
+
+	srt = calloc(v->n, 8);
+	for (i = 0, sum_n_all = 0; i < v->n; ++i) {
+		srt[i] = (uint64_t)v->a[i].nsr<<32 | i;
+		sum_n_all += v->a[i].nsr;
+	}
+	ks_introsort_uint64_t(v->n, srt);
+
+	for (j = 0; j < 2; ++j) {
+		sum_n = sum_l = 0;
+		for (i = v->n - 1; i >= 0; --i) {
+			const magv_t *p = &v->a[srt[i]<<32>>32];
+			int tmp1, tmp2;
+			tmp1 = tmp2 = 0;
+			if (p->nei[0].n) ++tmp1, tmp2 += p->nei[0].a[0].y;
+			if (p->nei[1].n) ++tmp1, tmp2 += p->nei[1].a[0].y;
+			if (tmp1) tmp2 /= tmp1;
+			if (rdist > 0.) {
+				double A = (p->len - tmp1) / rdist - p->nsr * M_LN2;
+				if (A < A_THRES) continue;
+			}
+			sum_n += p->nsr;
+			sum_l += p->len - tmp1;
+			if (sum_n >= sum_n_all * 0.5) break;
+		}
+		rdist = (double)sum_l / sum_n;
+	}
+	if (fm_verbose >= 3) {
+		fprintf(stderr, "[M::%s] average read distance %.3f.\n", __func__, rdist);
+		fprintf(stderr, "[M::%s] approximate genome size: %.0f (inaccurate!)\n", __func__, rdist * sum_n_all);
+	}
+
+	free(srt);
+	return rdist;
+}
+
+/**************
+ * Key portal *
+ **************/
+
+void mag_init_opt(magopt_t *o)
+{
+	memset(o, 0, sizeof(magopt_t));
+	o->trim_len = 0;
+	o->trim_depth = 6;
+
+	o->min_elen = 300;
+	o->min_ovlp = 0;
+	o->min_merge_len = 0;
+	o->min_ensr = 4;
+	o->min_insr = 3;
+	o->min_dratio1 = 0.7;
+
+	o->max_bcov = 10.;
+	o->max_bfrac = 0.15;
+	o->max_bvtx = 64;
+	o->max_bdist = 512;
+	o->max_bdiff = 50;
+}
+
+void mag_g_clean(mag_t *g, const magopt_t *opt)
+{
+	int j;
+
+	if (g->min_ovlp < opt->min_ovlp) g->min_ovlp = opt->min_ovlp;
+	for (j = 2; j <= opt->min_ensr; ++j)
+		mag_g_rm_vext(g, opt->min_elen, j);
+	mag_g_merge(g, 0, opt->min_merge_len);
+	mag_g_rm_edge(g, g->min_ovlp, opt->min_dratio1, opt->min_elen, opt->min_ensr);
+	mag_g_merge(g, 1, opt->min_merge_len);
+	for (j = 2; j <= opt->min_ensr; ++j)
+		mag_g_rm_vext(g, opt->min_elen, j);
+	mag_g_merge(g, 0, opt->min_merge_len);
+	if ((opt->flag & MAG_F_AGGRESSIVE) || (opt->flag & MAG_F_POPOPEN)) mag_g_pop_open(g, opt->min_elen);
+	if (!(opt->flag & MAG_F_NO_SIMPL)) mag_g_simplify_bubble(g, opt->max_bvtx, opt->max_bdist);
+	mag_g_pop_simple(g, opt->max_bcov, opt->max_bfrac, opt->min_merge_len, opt->max_bdiff, opt->flag & MAG_F_AGGRESSIVE);
+	mag_g_rm_vint(g, opt->min_elen, opt->min_insr, g->min_ovlp);
+	mag_g_rm_edge(g, g->min_ovlp, opt->min_dratio1, opt->min_elen, opt->min_ensr);
+	mag_g_merge(g, 1, opt->min_merge_len);
+	mag_g_rm_vext(g, opt->min_elen, opt->min_ensr);
+	mag_g_merge(g, 0, opt->min_merge_len);
+	if ((opt->flag & MAG_F_AGGRESSIVE) || (opt->flag & MAG_F_POPOPEN)) mag_g_pop_open(g, opt->min_elen);
+	mag_g_rm_vext(g, opt->min_elen, opt->min_ensr);
+	mag_g_merge(g, 0, opt->min_merge_len);
+}
+
+void mag_v_trim_open(mag_t *g, magv_t *v, int trim_len, int trim_depth)
+{
+	int i, j, tl[2];
+	if (v->nei[0].n > 0 && v->nei[1].n > 0) return; // no open end; do nothing
+	if (v->nei[0].n == 0 && v->nei[1].n == 0 && v->len < trim_len * 3) { // disconnected short vertex
+		mag_v_del(g, v);
+		return;
+	}
+	for (j = 0; j < 2; ++j) {
+		ku128_v *r = &v->nei[!j];
+		int max_ovlp = 0;
+		for (i = 0; i < r->n; ++i)
+			max_ovlp = max_ovlp > r->a[i].y? max_ovlp : r->a[i].y;
+		tl[j] = v->len - max_ovlp < trim_len? v->len - max_ovlp : trim_len;
+	}
+	if (v->nei[0].n == 0) {
+		for (i = 0; i < tl[0] && v->cov[i] - 33 < trim_depth; ++i);
+		tl[0] = i;
+		v->len -= i;
+		memmove(v->seq, v->seq + tl[0], v->len);
+		memmove(v->cov, v->cov + tl[0], v->len);
+	}
+	if (v->nei[1].n == 0) {
+		for (i = v->len - 1; i >= v->len - tl[1] && v->cov[i] - 33 < trim_depth; --i);
+		tl[1] = v->len - 1 - i;
+		v->len -= tl[1];
+	}
+}
+
+void mag_g_trim_open(mag_t *g, const magopt_t *opt)
+{
+	int i;
+	if (opt->trim_len == 0) return;
+	for (i = 0; i < g->v.n; ++i)
+		mag_v_trim_open(g, &g->v.a[i], opt->trim_len, opt->trim_depth);
+}
diff --git a/mag.h b/mag.h
new file mode 100644
index 0000000..ca76527
--- /dev/null
+++ b/mag.h
@@ -0,0 +1,69 @@
+#ifndef FM_MOG_H
+#define FM_MOG_H
+
+#include <stdint.h>
+#include <stdlib.h>
+#include "kstring.h"
+#include "fml.h"
+
+#ifndef KINT_DEF
+#define KINT_DEF
+typedef struct { uint64_t x, y; } ku128_t;
+typedef struct { size_t n, m; uint64_t *a; } ku64_v;
+typedef struct { size_t n, m; ku128_t *a; } ku128_v;
+#endif
+
+typedef struct {
+	int len, nsr;    // length; number supporting reads
+	uint32_t max_len;// allocated seq/cov size
+	uint64_t k[2];   // bi-interval
+	ku128_v nei[2];  // neighbors
+	char *seq, *cov; // sequence and coverage
+	void *ptr;       // additional information
+} magv_t;
+
+typedef struct { size_t n, m; magv_t *a; } magv_v;
+
+typedef struct mag_t {
+	magv_v v;
+	float rdist;  // read distance
+	int min_ovlp; // minimum overlap seen from the graph
+	void *h;
+} mag_t;
+
+struct mogb_aux;
+typedef struct mogb_aux mogb_aux_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+	void mag_init_opt(magopt_t *o);
+	void mag_g_clean(mag_t *g, const magopt_t *opt);
+
+	void mag_g_destroy(mag_t *g);
+	void mag_g_amend(mag_t *g);
+	void mag_g_build_hash(mag_t *g);
+	void mag_g_print(const mag_t *g);
+	int mag_g_rm_vext(mag_t *g, int min_len, int min_nsr);
+	void mag_g_rm_edge(mag_t *g, int min_ovlp, double min_ratio, int min_len, int min_nsr);
+	void mag_g_merge(mag_t *g, int rmdup, int min_merge_len);
+	void mag_g_simplify_bubble(mag_t *g, int max_vtx, int max_dist);
+	void mag_g_pop_simple(mag_t *g, float max_cov, float max_frac, int min_merge_len, int max_bdiff, int aggressive);
+	void mag_g_pop_open(mag_t *g, int min_elen);
+	void mag_g_trim_open(mag_t *g, const magopt_t *opt);
+
+	void mag_v_copy_to_empty(magv_t *dst, const magv_t *src); // NB: memory leak if dst is allocated
+	void mag_v_del(mag_t *g, magv_t *p);
+	void mag_v_write(const magv_t *p, kstring_t *out);
+	void mag_v_pop_open(mag_t *g, magv_t *p, int min_elen);
+
+	uint64_t mag_tid2idd(void *h, uint64_t tid);
+	void mag_v128_clean(ku128_v *r);
+	double mag_cal_rdist(mag_t *g);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/misc.c b/misc.c
new file mode 100644
index 0000000..6362000
--- /dev/null
+++ b/misc.c
@@ -0,0 +1,302 @@
+#include <assert.h>
+#include "internal.h"
+#include "kstring.h"
+#include "rle.h"
+#include "mrope.h"
+#include "rld0.h"
+#include "mag.h"
+#include "kvec.h"
+#include "fml.h"
+#include "htab.h"
+
+unsigned char seq_nt6_table[256] = {
+    5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,
+    5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,
+    5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,
+    5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,
+    5, 1, 5, 2,  5, 5, 5, 3,  5, 5, 5, 5,  5, 5, 5, 5,
+    5, 5, 5, 5,  4, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,
+    5, 1, 5, 2,  5, 5, 5, 3,  5, 5, 5, 5,  5, 5, 5, 5,
+    5, 5, 5, 5,  4, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,
+    5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,
+    5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,
+    5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,
+    5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,
+    5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,
+    5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,
+    5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,
+    5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5
+};
+
+void fml_opt_init(fml_opt_t *opt)
+{
+	opt->n_threads = 1;
+	opt->ec_k = 0;
+	opt->min_cnt = 4;
+	opt->max_cnt = 8;
+	opt->min_asm_ovlp = 33;
+	opt->min_merge_len = 0;
+	mag_init_opt(&opt->mag_opt);
+	opt->mag_opt.flag = MAG_F_NO_SIMPL | MAG_F_POPOPEN;
+}
+
+void fml_opt_adjust(fml_opt_t *opt, int n_seqs, const bseq1_t *seqs)
+{
+	int i, log_len;
+	uint64_t tot_len = 0;
+	if (opt->n_threads < 1) opt->n_threads = 1;
+	for (i = 0; i < n_seqs; ++i) tot_len += seqs[i].l_seq; // compute total length
+	for (log_len = 10; log_len < 32; ++log_len) // compute ceil(log2(tot_len))
+		if (1ULL<<log_len > tot_len) break;
+	if (opt->ec_k == 0) opt->ec_k = (log_len + 12) / 2;
+	if (opt->ec_k%2 == 0) ++opt->ec_k;
+	opt->mag_opt.min_elen = (int)((double)tot_len / n_seqs * 2.5 + .499);
+}
+
+static inline int is_rev_same(int l, const char *s)
+{
+	int i;
+	if (l&1) return 0;
+	for (i = 0; i < l>>1; ++i)
+		if (s[i] + s[l-1-i] != 5) break;
+	return (i == l>>1);
+}
+
+struct rld_t *fml_fmi_gen(int n, bseq1_t *seq, int is_mt)
+{
+	mrope_t *mr;
+	kstring_t str = {0,0,0};
+	mritr_t itr;
+	rlditr_t di;
+	const uint8_t *block;
+	rld_t *e = 0;
+	int k;
+
+	for (k = 0; k < n; ++k)
+		if (seq[k].l_seq > 0)
+			break;
+	if (k == n) return 0;
+
+	mr = mr_init(ROPE_DEF_MAX_NODES, ROPE_DEF_BLOCK_LEN, MR_SO_RCLO);
+	for (k = 0; k < n; ++k) {
+		int i;
+		bseq1_t *s = &seq[k];
+		if (s->l_seq == 0) continue;
+		free(s->qual);
+		for (i = 0; i < s->l_seq; ++i)
+			s->seq[i] = seq_nt6_table[(int)s->seq[i]];
+		for (i = 0; i < s->l_seq; ++i)
+			if (s->seq[i] == 5) break;
+		if (i < s->l_seq) {
+			free(s->seq);
+			continue;
+		}
+		if (is_rev_same(s->l_seq, s->seq))
+			--s->l_seq, s->seq[s->l_seq] = 0;
+		seq_reverse(s->l_seq, (uint8_t*)s->seq);
+		kputsn(s->seq, s->l_seq + 1, &str);
+		seq_revcomp6(s->l_seq, (uint8_t*)s->seq);
+		kputsn(s->seq, s->l_seq + 1, &str);
+		free(s->seq);
+	}
+	free(seq);
+	mr_insert_multi(mr, str.l, (uint8_t*)str.s, is_mt);
+	free(str.s);
+
+	e = rld_init(6, 3);
+	rld_itr_init(e, &di, 0);
+	mr_itr_first(mr, &itr, 1);
+	while ((block = mr_itr_next_block(&itr)) != 0) {
+		const uint8_t *q = block + 2, *end = block + 2 + *rle_nptr(block);
+		while (q < end) {
+			int c = 0;
+			int64_t l;
+			rle_dec1(q, c, l);
+			rld_enc(e, &di, l, c);
+		}
+	}
+	rld_enc_finish(e, &di);
+
+	mr_destroy(mr);
+	return e;
+}
+
+struct rld_t *fml_seq2fmi(const fml_opt_t *opt, int n, bseq1_t *seq)
+{
+	return fml_fmi_gen(n, seq, opt->n_threads > 1? 1 : 0);
+}
+
+void fml_fmi_destroy(rld_t *e)
+{
+	rld_destroy(e);
+}
+
+void fml_mag_clean(const fml_opt_t *opt, struct mag_t *g)
+{
+	magopt_t o = opt->mag_opt;
+	o.min_merge_len = opt->min_merge_len;
+	mag_g_merge(g, 1, opt->min_merge_len);
+	mag_g_clean(g, &o);
+	mag_g_trim_open(g, &o);
+}
+
+void fml_mag_destroy(struct mag_t *g)
+{
+	mag_g_destroy(g);
+}
+
+#include "khash.h"
+KHASH_DECLARE(64, uint64_t, uint64_t)
+
+#define edge_is_del(_x)   ((_x).x == (uint64_t)-2 || (_x).y == 0) // from mag.c
+
+fml_utg_t *fml_mag2utg(struct mag_t *g, int *n)
+{
+	size_t i, j;
+	fml_utg_t *utg;
+	khash_t(64) *h;
+	khint_t k;
+
+	h = kh_init(64);
+	for (i = j = 0; i < g->v.n; ++i) {
+		int absent;
+		magv_t *p = &g->v.a[i];
+		if (p->len < 0) continue;
+		k = kh_put(64, h, p->k[0], &absent);
+		kh_val(h, k) = j<<1 | 0;
+		k = kh_put(64, h, p->k[1], &absent);
+		kh_val(h, k) = j<<1 | 1;
+		++j;
+	}
+	*n = j;
+	kh_destroy(64, g->h);
+
+	utg = (fml_utg_t*)calloc(*n, sizeof(fml_utg_t));
+	for (i = j = 0; i < g->v.n; ++i) {
+		magv_t *p = &g->v.a[i];
+		fml_utg_t *q;
+		int from, a, b;
+		if (p->len < 0) continue;
+		q = &utg[j++];
+		q->len = p->len, q->nsr = p->nsr;
+		q->seq = p->seq, q->cov = p->cov;
+		for (a = 0; a < q->len; ++a)
+			q->seq[a] = "$ACGTN"[(int)q->seq[a]];
+		q->seq[q->len] = q->cov[q->len] = 0;
+		for (from = 0; from < 2; ++from) {
+			ku128_v *r = &p->nei[from];
+			for (b = q->n_ovlp[from] = 0; b < r->n; ++b)
+				if (!edge_is_del(r->a[b])) ++q->n_ovlp[from];
+		}
+		q->ovlp = (fml_ovlp_t*)calloc(q->n_ovlp[0] + q->n_ovlp[1], sizeof(fml_ovlp_t));
+		for (from = a = 0; from < 2; ++from) {
+			ku128_v *r = &p->nei[from];
+			for (b = 0; b < r->n; ++b) {
+				ku128_t *s = &r->a[b];
+				fml_ovlp_t *t;
+				if (edge_is_del(*s)) continue;
+				t = &q->ovlp[a++];
+				k = kh_get(64, h, s->x);
+				assert(k != kh_end(h));
+				t->id = kh_val(h, k) >> 1;
+				t->to = kh_val(h, k) & 1;
+				t->len = s->y;
+				t->from = from;
+			}
+			free(p->nei[from].a);
+		}
+	}
+	kh_destroy(64, h);
+	free(g->v.a);
+	free(g);
+	return utg;
+}
+
+void fml_utg_print(int n, const fml_utg_t *utg)
+{
+	int i, j, l;
+	kstring_t out = {0,0,0};
+	for (i = 0; i < n; ++i) {
+		const fml_utg_t *u = &utg[i];
+		out.l = 0;
+		kputc('@', &out); kputw(i<<1|0, &out); kputc(':', &out); kputw(i<<1|1, &out);
+		kputc('\t', &out); kputw(u->nsr, &out);
+		kputc('\t', &out);
+		for (j = 0; j < u->n_ovlp[0]; ++j) {
+			kputw(u->ovlp[j].id<<1|u->ovlp[j].to, &out); kputc(',', &out);
+			kputw(u->ovlp[j].len, &out); kputc(';', &out);
+		}
+		if (u->n_ovlp[0] == 0) kputc('.', &out);
+		kputc('\t', &out);
+		for (; j < u->n_ovlp[0] + u->n_ovlp[1]; ++j) {
+			kputw(u->ovlp[j].id<<1|u->ovlp[j].to, &out); kputc(',', &out);
+			kputw(u->ovlp[j].len, &out); kputc(';', &out);
+		}
+		if (u->n_ovlp[1] == 0) kputc('.', &out);
+		kputc('\n', &out);
+		l = out.l;
+		kputsn(u->seq, u->len, &out);
+		kputsn("\n+\n", 3, &out);
+		kputsn(u->cov, u->len, &out);
+		kputc('\n', &out);
+		fputs(out.s, stdout);
+	}
+	free(out.s);
+}
+
+void fml_utg_print_gfa(int n, const fml_utg_t *utg)
+{
+	int i, j;
+	FILE *fp = stdout;
+	fputs("H\tVN:Z:1.0\n", fp);
+	for (i = 0; i < n; ++i) {
+		const fml_utg_t *u = &utg[i];
+		fprintf(fp, "S\t%d\t", i);
+		fputs(u->seq, fp);
+		fprintf(fp, "\tLN:i:%d\tRC:i:%d\tPD:Z:", u->len, u->nsr);
+		fputs(u->cov, fp);
+		fputc('\n', fp);
+		for (j = 0; j < u->n_ovlp[0] + u->n_ovlp[1]; ++j) {
+			fml_ovlp_t *o = &u->ovlp[j];
+			if (i < o->id)
+				fprintf(fp, "L\t%d\t%c\t%d\t%c\t%dM\n", i, "+-"[!o->from], o->id, "+-"[o->to], o->len);
+		}
+	}
+}
+
+void fml_utg_destroy(int n, fml_utg_t *utg)
+{
+	int i;
+	for (i = 0; i < n; ++i) {
+		free(utg[i].seq);
+		free(utg[i].cov);
+		free(utg[i].ovlp);
+	}
+	free(utg);
+}
+
+#define MAG_MIN_NSR_COEF .1
+
+fml_utg_t *fml_assemble(const fml_opt_t *opt0, int n_seqs, bseq1_t *seqs, int *n_utg)
+{
+	rld_t *e;
+	mag_t *g;
+	fml_utg_t *utg;
+	fml_opt_t opt = *opt0;
+	float kcov;
+
+	*n_utg = 0;
+	fml_opt_adjust(&opt, n_seqs, seqs);
+	if (opt.ec_k >= 0) fml_correct(&opt, n_seqs, seqs);
+	kcov = fml_fltuniq(&opt, n_seqs, seqs);
+	e = fml_seq2fmi(&opt, n_seqs, seqs);
+	if (e == 0) return 0; // this may happen when all sequences are filtered out
+	g = fml_fmi2mag(&opt, e);
+	opt.mag_opt.min_ensr = opt.mag_opt.min_ensr > kcov * MAG_MIN_NSR_COEF? opt.mag_opt.min_ensr : (int)(kcov * MAG_MIN_NSR_COEF + .499);
+	opt.mag_opt.min_ensr = opt.mag_opt.min_ensr < opt0->max_cnt? opt.mag_opt.min_ensr : opt0->max_cnt;
+	opt.mag_opt.min_ensr = opt.mag_opt.min_ensr > opt0->min_cnt? opt.mag_opt.min_ensr : opt0->min_cnt;
+	opt.mag_opt.min_insr = opt.mag_opt.min_ensr - 1;
+	fml_mag_clean(&opt, g);
+	utg = fml_mag2utg(g, n_utg);
+	return utg;
+}
diff --git a/mrope.c b/mrope.c
new file mode 100644
index 0000000..5954f06
--- /dev/null
+++ b/mrope.c
@@ -0,0 +1,307 @@
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <time.h>
+#include "mrope.h"
+
+/*******************************
+ *** Single-string insertion ***
+ *******************************/
+
+mrope_t *mr_init(int max_nodes, int block_len, int sorting_order)
+{
+	int a;
+	mrope_t *r;
+	assert(sorting_order >= 0 && sorting_order <= 2);
+	r = calloc(1, sizeof(mrope_t));
+	r->so = sorting_order;
+	r->thr_min = 1000;
+	for (a = 0; a != 6; ++a)
+		r->r[a] = rope_init(max_nodes, block_len);
+	return r;
+}
+
+void mr_destroy(mrope_t *r)
+{
+	int a;
+	for (a = 0; a != 6; ++a)
+		if (r->r[a]) rope_destroy(r->r[a]);
+	free(r);
+}
+
+int mr_thr_min(mrope_t *r, int thr_min)
+{
+	if (thr_min > 0)
+		r->thr_min = thr_min;
+	return r->thr_min;
+}
+
+int64_t mr_insert1(mrope_t *r, const uint8_t *str)
+{
+	int64_t tl[6], tu[6], l, u;
+	const uint8_t *p;
+	int b, is_srt = (r->so != MR_SO_IO), is_comp = (r->so == MR_SO_RCLO);
+	for (u = 0, b = 0; b != 6; ++b) u += r->r[b]->c[0];
+	l = is_srt? 0 : u;
+	for (p = str, b = 0; *p; b = *p++) {
+		int a;
+		if (l != u) {
+			int64_t cnt = 0;
+			rope_rank2a(r->r[b], l, u, tl, tu);
+			if (is_comp && *p != 5) {
+				for (a = 4; a > *p; --a) l += tu[a] - tl[a];
+				l += tu[0] - tl[0];
+			} else for (a = 0; a < *p; ++a) l += tu[a] - tl[a];
+			rope_insert_run(r->r[b], l, *p, 1, 0);
+			while (--b >= 0) cnt += r->r[b]->c[*p];
+			l = cnt + tl[*p]; u = cnt + tu[*p];
+		} else {
+			l = rope_insert_run(r->r[b], l, *p, 1, 0);
+			while (--b >= 0) l += r->r[b]->c[*p];
+			u = l;
+		}
+	}
+	return rope_insert_run(r->r[b], l, 0, 1, 0);
+}
+
+void mr_rank2a(const mrope_t *mr, int64_t x, int64_t y, int64_t *cx, int64_t *cy)
+{
+	int a, b;
+	int64_t z, c[6], l;
+	memset(c, 0, 48);
+	for (a = 0, z = 0; a < 6; ++a) {
+		const int64_t *ca = mr->r[a]->c;
+		l = ca[0] + ca[1] + ca[2] + ca[3] + ca[4] + ca[5];
+		if (z + l >= x) break;
+		for (b = 0; b < 6; ++b) c[b] += ca[b];
+		z += l;
+	}
+	assert(a != 6);
+	if (y >= 0 && z + l >= y) { // [x,y) is in the same bucket
+		rope_rank2a(mr->r[a], x - z, y - z, cx, cy);
+		for (b = 0; b < 6; ++b)
+			cx[b] += c[b], cy[b] += c[b];
+		return;
+	}
+	if (x != z) rope_rank1a(mr->r[a], x - z, cx);
+	else memset(cx, 0, 48);
+	for (b = 0; b < 6; ++b)
+		cx[b] += c[b], c[b] += mr->r[a]->c[b];
+	if (y < 0) return;
+	for (++a, z += l; a < 6; ++a) {
+		const int64_t *ca = mr->r[a]->c;
+		l = ca[0] + ca[1] + ca[2] + ca[3] + ca[4] + ca[5];
+		if (z + l >= y) break;
+		for (b = 0; b < 6; ++b) c[b] += ca[b];
+		z += l;
+	}
+	assert(a != 6);
+	if (y != z + l) rope_rank1a(mr->r[a], y - z, cy);
+	else for (b = 0; b < 6; ++b) cy[b] = mr->r[a]->c[b];
+	for (b = 0; b < 6; ++b) cy[b] += c[b];
+}
+
+/**********************
+ *** Mrope iterator ***
+ **********************/
+
+void mr_itr_first(mrope_t *r, mritr_t *i, int to_free)
+{
+	i->a = 0; i->r = r; i->to_free = to_free;
+	rope_itr_first(i->r->r[0], &i->i);
+}
+
+const uint8_t *mr_itr_next_block(mritr_t *i)
+{
+	const uint8_t *s;
+	if (i->a >= 6) return 0;
+	while ((s = rope_itr_next_block(&i->i)) == 0) {
+		if (i->to_free) {
+			rope_destroy(i->r->r[i->a]);
+			i->r->r[i->a] = 0;
+		}
+		if (++i->a == 6) return 0;
+		rope_itr_first(i->r->r[i->a], &i->i);
+	}
+	return i->a == 6? 0 : s;
+}
+
+/*****************************************
+ *** Inserting multiple strings in RLO ***
+ *****************************************/
+
+typedef struct {
+	uint64_t l;
+	uint64_t u:61, c:3;
+	const uint8_t *p;
+} triple64_t;
+
+typedef const uint8_t *cstr_t;
+
+#define rope_comp6(c) ((c) >= 1 && (c) <= 4? 5 - (c) : (c))
+
+static void mr_insert_multi_aux(rope_t *rope, int64_t m, triple64_t *a, int is_comp)
+{
+	int64_t k, beg;
+	rpcache_t cache;
+	memset(&cache, 0, sizeof(rpcache_t));
+	for (k = 0; k != m; ++k) // set the base to insert
+		a[k].c = *a[k].p++;
+	for (k = 1, beg = 0; k <= m; ++k) {
+		if (k == m || a[k].u != a[k-1].u) {
+			int64_t x, i, l = a[beg].l, u = a[beg].u, tl[6], tu[6], c[6];
+			int start, end, step, b;
+			if (l == u && k == beg + 1) { // special case; still works without the following block
+				a[beg].l = a[beg].u = rope_insert_run(rope, l, a[beg].c, 1, &cache);
+				beg = k;
+				continue;
+			} else if (l == u) {
+				memset(tl, 0, 48);
+				memset(tu, 0, 48);
+			} else rope_rank2a(rope, l, u, tl, tu);
+			memset(c, 0, 48);
+			for (i = beg; i < k; ++i) ++c[a[i].c];
+			// insert sentinel
+			if (c[0]) rope_insert_run(rope, l, 0, c[0], &cache);
+			// insert A/C/G/T
+			x =  l + c[0] + (tu[0] - tl[0]);
+			if (is_comp) start = 4, end = 0, step = -1;
+			else start = 1, end = 5, step = 1;
+			for (b = start; b != end; b += step) {
+				int64_t size = tu[b] - tl[b];
+				if (c[b]) {
+					tl[b] = rope_insert_run(rope, x, b, c[b], &cache);
+					tu[b] = tl[b] + size;
+				}
+				x += c[b] + size;
+			}
+			// insert N
+			if (c[5]) {
+				tu[5] -= tl[5];
+				tl[5] = rope_insert_run(rope, x, 5, c[5], &cache);
+				tu[5] += tl[5];
+			}
+			// update a[]
+			for (i = beg; i < k; ++i) {
+				triple64_t *p = &a[i];
+				p->l = tl[p->c], p->u = tu[p->c];
+			}
+			beg = k;
+		}
+	}
+}
+
+typedef struct {
+	volatile int *n_fin_workers;
+	volatile int to_run;
+	int to_exit;
+	mrope_t *mr;
+	int b, is_comp;
+	int64_t m;
+	triple64_t *a;
+} worker_t;
+
+static void *worker(void *data)
+{
+	worker_t *w = (worker_t*)data;
+	struct timespec req, rem;
+	req.tv_sec = 0; req.tv_nsec = 1000000;
+	do {
+		while (!__sync_bool_compare_and_swap(&w->to_run, 1, 0)) nanosleep(&req, &rem); // wait for the signal from the master thread
+		if (w->m) mr_insert_multi_aux(w->mr->r[w->b], w->m, w->a, w->is_comp);
+		__sync_add_and_fetch(w->n_fin_workers, 1);
+	} while (!w->to_exit);
+	return 0;
+}
+
+void mr_insert_multi(mrope_t *mr, int64_t len, const uint8_t *s, int is_thr)
+{
+	int64_t k, m, n0;
+	int b, is_srt = (mr->so != MR_SO_IO), is_comp = (mr->so == MR_SO_RCLO), stop_thr = 0;
+	volatile int n_fin_workers = 0;
+	triple64_t *a[2], *curr, *prev, *swap;
+	pthread_t *tid = 0;
+	worker_t *w = 0;
+
+	if (mr->thr_min < 0) mr->thr_min = 0;
+	assert(len > 0 && s[len-1] == 0);
+	{ // split into short strings
+		cstr_t p, q, end = s + len;
+		for (p = s, m = 0; p != end; ++p) // count #sentinels
+			if (*p == 0) ++m;
+		curr = a[0] = malloc(m * sizeof(triple64_t));
+		prev = a[1] = malloc(m * sizeof(triple64_t));
+		for (p = q = s, k = 0; p != end; ++p) // find the start of each string
+			if (*p == 0) prev[k++].p = q, q = p + 1;
+	}
+
+	for (k = n0 = 0; k < 6; ++k) n0 += mr->r[k]->c[0];
+	for (k = 0; k != m; ++k) {
+		if (is_srt) prev[k].l = 0, prev[k].u = n0;
+		else prev[k].l = prev[k].u = n0 + k;
+		prev[k].c = 0;
+	}
+	mr_insert_multi_aux(mr->r[0], m, prev, is_comp); // insert the first (actually the last) column
+
+	if (is_thr) {
+		tid = alloca(4 * sizeof(pthread_t));
+		w = alloca(4 * sizeof(worker_t));
+		memset(w, 0, 4 * sizeof(worker_t));
+		for (b = 0; b < 4; ++b) {
+			w[b].mr = mr, w[b].b = b + 1, w[b].is_comp = is_comp;
+			w[b].n_fin_workers = &n_fin_workers;
+		}
+		for (b = 0; b < 4; ++b) pthread_create(&tid[b], 0, worker, &w[b]);
+	}
+
+	n0 = 0; // the number of inserted strings
+	while (m) {
+		int64_t c[6], ac[6];
+		triple64_t *q[6];
+
+		memset(c, 0, 48);
+		for (k = n0; k != m; ++k) ++c[prev[k].c]; // counting
+		for (q[0] = curr + n0, b = 1; b < 6; ++b) q[b] = q[b-1] + c[b-1];
+		if (n0 + c[0] < m) {
+			for (k = n0; k != m; ++k) *q[prev[k].c]++ = prev[k]; // sort
+			for (b = 0; b < 6; ++b) q[b] -= c[b];
+		}
+		n0 += c[0];
+
+		if (is_thr && !stop_thr) {
+			struct timespec req, rem;
+			req.tv_sec = 0; req.tv_nsec = 1000000;
+			stop_thr = (m - n0 <= mr->thr_min);
+			for (b = 0; b < 4; ++b) {
+				w[b].a = q[b+1], w[b].m = c[b+1];
+				if (stop_thr) w[b].to_exit = 1; // signal the workers to exit
+				while (!__sync_bool_compare_and_swap(&w[b].to_run, 0, 1)); // signal the workers to start
+			}
+			if (c[5]) mr_insert_multi_aux(mr->r[5], c[5], q[5], is_comp); // the master thread processes the "N" bucket
+			while (!__sync_bool_compare_and_swap(&n_fin_workers, 4, 0)) // wait until all 4 workers finish
+				nanosleep(&req, &rem);
+			if (stop_thr && n0 < m)
+				fprintf(stderr, "[M::%s] Turn off parallelization for this batch as too few strings are left.\n", __func__);
+		} else {
+			for (b = 1; b < 6; ++b)
+				if (c[b]) mr_insert_multi_aux(mr->r[b], c[b], q[b], is_comp);
+		}
+		if (n0 == m) break;
+
+		memset(ac, 0, 48);
+		for (b = 1; b < 6; ++b) { // update the intervals to account for buckets ahead
+			int a;
+			for (a = 0; a < 6; ++a) ac[a] += mr->r[b-1]->c[a];
+			for (k = 0; k < c[b]; ++k) {
+				triple64_t *p = &q[b][k];
+				p->l += ac[p->c]; p->u += ac[p->c];
+			}
+		}
+		swap = curr, curr = prev, prev = swap;
+	}
+	if (is_thr) for (b = 0; b < 4; ++b) pthread_join(tid[b], 0);
+	free(a[0]); free(a[1]);
+}
diff --git a/mrope.h b/mrope.h
new file mode 100644
index 0000000..240448c
--- /dev/null
+++ b/mrope.h
@@ -0,0 +1,114 @@
+#ifndef MROPE_H_
+#define MROPE_H_
+
+#include "rope.h"
+
+#define MR_SO_IO    0
+#define MR_SO_RLO   1
+#define MR_SO_RCLO  2
+
+typedef struct {
+	uint8_t so; // sorting order
+	int thr_min; // when there are fewer sequences than this, disable multi-threading
+	rope_t *r[6];
+} mrope_t; // multi-rope
+
+typedef struct {
+	mrope_t *r;
+	int a, to_free;
+	rpitr_t i;
+} mritr_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+	/**
+	 * Initiate a multi-rope
+	 *
+	 * @param max_nodes     maximum number of nodes in an internal node; use ROPE_DEF_MAX_NODES (64) if unsure
+	 * @param block_len     maximum block length in an external node; use ROPE_DEF_BLOCK_LEN (256) if unsure
+	 * @param sorting_order the order in which sequences are added; possible values defined by the MR_SO_* macros
+	 */
+	mrope_t *mr_init(int max_nodes, int block_len, int sorting_order);
+
+	void mr_destroy(mrope_t *r);
+
+	int mr_thr_min(mrope_t *r, int thr_min);
+
+	/**
+	 * Insert one string into the index
+	 *
+	 * @param r      multi-rope
+	 * @param str    the *reverse* of the input string (important: it is reversed!)
+	 */
+	int64_t mr_insert1(mrope_t *r, const uint8_t *str);
+
+	/**
+	 * Insert multiple strings
+	 *
+	 * @param mr       multi-rope
+	 * @param len      total length of $s
+	 * @param s        concatenated, NULL delimited, reversed input strings
+	 * @param is_thr   true to use 5 threads
+	 */
+	void mr_insert_multi(mrope_t *mr, int64_t len, const uint8_t *s, int is_thr);
+
+	void mr_rank2a(const mrope_t *mr, int64_t x, int64_t y, int64_t *cx, int64_t *cy);
+	#define mr_rank1a(mr, x, cx) mr_rank2a(mr, x, -1, cx, 0)
+
+	/**
+	 * Put the iterator at the start of the index
+	 *
+	 * @param r         multi-rope
+	 * @param i         iterator to be initialized
+	 * @param to_free   if true, free visited buckets
+	 */
+	void mr_itr_first(mrope_t *r, mritr_t *i, int to_free);
+
+	/**
+	 * Iterate to the next block
+	 *
+	 * @param i         iterator
+	 *
+	 * @return          pointer to the start of a block; see rle.h for decoding the block
+	 */
+	const uint8_t *mr_itr_next_block(mritr_t *i);
+
+#ifdef __cplusplus
+}
+#endif
+
+static inline int64_t mr_get_c(const mrope_t *mr, int64_t c[6])
+{
+	int a, b;
+	int64_t tot = 0;
+	for (a = 0; a < 6; ++a) c[a] = 0;
+	for (a = 0; a < 6; ++a) {
+		for (b = 0; b < 6; ++b)
+			c[b] += mr->r[a]->c[b];
+		tot += c[b];
+	}
+	return tot;
+}
+
+static inline int64_t mr_get_ac(const mrope_t *mr, int64_t ac[7])
+{
+	int a;
+	int64_t c[6], tot;
+	tot = mr_get_c(mr, c);
+	for (a = 1, ac[0] = 0; a <= 6; ++a) ac[a] = ac[a-1] + c[a-1];
+	return tot;
+}
+
+static inline int64_t mr_get_tot(const mrope_t *mr)
+{
+	int a, b;
+	int64_t tot = 0;
+	for (a = 0; a < 6; ++a)
+		for (b = 0; b < 6; ++b)
+			tot += mr->r[a]->c[b];
+	return tot;
+}
+
+#endif
diff --git a/rld0.c b/rld0.c
new file mode 100644
index 0000000..7f8b5ba
--- /dev/null
+++ b/rld0.c
@@ -0,0 +1,489 @@
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include "rld0.h"
+
+#define RLD_IBITS_PLUS 4
+
+#define rld_file_size(e) ((4 + (e)->asize) * 8 + (e)->n_bytes + 8 * (e)->n_frames * ((e)->asize + 1))
+
+#ifndef xcalloc
+#define xcalloc(n, s) calloc(n, s)
+#endif
+#ifndef xmalloc
+#define xmalloc(s) malloc(s)
+#endif
+
+/******************
+ * Delta encoding *
+ ******************/
+
+static const char LogTable256[256] = {
+#define LT(n) n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n
+    -1, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
+    LT(4), LT(5), LT(5), LT(6), LT(6), LT(6), LT(6),
+    LT(7), LT(7), LT(7), LT(7), LT(7), LT(7), LT(7), LT(7)
+};
+
+static inline int ilog2_32(uint32_t v)
+{
+	register uint32_t t, tt;
+	if ((tt = v>>16)) return (t = tt>>8) ? 24 + LogTable256[t] : 16 + LogTable256[tt];
+	return (t = v>>8) ? 8 + LogTable256[t] : LogTable256[v];
+}
+
+static inline int ilog2(uint64_t v)
+{
+	return v>>32? 32 + ilog2_32(v>>32) : ilog2_32(v);
+}
+
+static inline int64_t rld_delta_enc1(uint64_t x, int *width)
+{
+	int y = ilog2(x);
+	int z = ilog2_32(y + 1);
+	*width = (z<<1) + 1 + y;
+	return (x ^ (uint64_t)1<<y) | (uint64_t)(y+1)<<y;
+}
+
+/***********************************
+ * Initialization and deallocation *
+ ***********************************/
+
+rld_t *rld_init(int asize, int bbits)
+{
+	rld_t *e;
+	e = xcalloc(1, sizeof(rld_t));
+	e->n = 1;
+	e->z = xmalloc(sizeof(void*));
+	e->z[0] = xcalloc(RLD_LSIZE, 8);
+	e->ssize = 1<<bbits;
+	e->cnt = xcalloc(asize + 1, 8);
+	e->mcnt = xcalloc(asize + 1, 8);
+	e->abits = ilog2(asize) + 1;
+	e->asize = asize;
+	e->sbits = bbits;
+	e->asize1 = asize + 1;
+	e->offset0[0] = (e->asize1*16+63)/64;
+	e->offset0[1] = (e->asize1*32+63)/64;
+	e->offset0[2] = e->asize1;
+	return e;
+}
+
+void rld_destroy(rld_t *e)
+{
+	int i = 0;
+	if (e == 0) return;
+	if (e->mem) {
+		close(e->fd);
+		munmap(e->mem, rld_file_size(e));
+	} else {
+		for (i = 0; i < e->n; ++i) free(e->z[i]);
+		free(e->frame);
+	}
+	free(e->z); free(e->cnt); free(e->mcnt); free(e);
+}
+
+void rld_itr_init(const rld_t *e, rlditr_t *itr, uint64_t k)
+{
+	itr->i = e->z + (k >> RLD_LBITS);
+	itr->shead = *itr->i + k%RLD_LSIZE;
+	itr->stail = rld_get_stail(e, itr);
+	itr->p = itr->shead + e->offset0[rld_block_type(*itr->shead)];
+	itr->q = (uint8_t*)itr->p;
+	itr->r = 64;
+	itr->c = -1;
+	itr->l = 0;
+}
+
+/************
+ * Encoding *
+ ************/
+
+static inline void enc_next_block(rld_t *e, rlditr_t *itr)
+{
+	int i, type;
+	if (itr->stail + 2 - *itr->i == RLD_LSIZE) {
+		++e->n;
+		e->z = realloc(e->z, e->n * sizeof(void*));
+		itr->i = e->z + e->n - 1;
+		itr->shead = *itr->i = xcalloc(RLD_LSIZE, 8);
+	} else itr->shead += e->ssize;
+	if (e->cnt[0] - e->mcnt[0] < 0x4000) {
+		uint16_t *p = (uint16_t*)itr->shead;
+		for (i = 0; i <= e->asize; ++i) p[i] = e->cnt[i] - e->mcnt[i];
+		type = 0;
+	} else if (e->cnt[0] - e->mcnt[0] < 0x40000000) {
+		uint32_t *p = (uint32_t*)itr->shead;
+		for (i = 0; i <= e->asize; ++i) p[i] = e->cnt[i] - e->mcnt[i];
+		type = 1;
+	} else {
+		uint64_t *p = (uint64_t*)itr->shead;
+		for (i = 0; i <= e->asize; ++i) p[i] = e->cnt[i] - e->mcnt[i];
+		type = 2;
+	}
+	*itr->shead |= (uint64_t)type<<62;
+	itr->p = itr->shead + e->offset0[type];
+	itr->stail = rld_get_stail(e, itr);
+	itr->q = (uint8_t*)itr->p;
+	itr->r = 64;
+	for (i = 0; i <= e->asize; ++i) e->mcnt[i] = e->cnt[i];
+}
+
+static int rld_enc1(rld_t *e, rlditr_t *itr, int64_t l, uint8_t c)
+{
+	int w;
+	uint64_t x = rld_delta_enc1(l, &w) << e->abits | c;
+	w += e->abits;
+	if (w >= itr->r && itr->p == itr->stail) enc_next_block(e, itr);
+	if (w > itr->r) {
+		w -= itr->r;
+		*itr->p++ |= x >> w;
+		*itr->p = x << (itr->r = 64 - w);
+	} else itr->r -= w, *itr->p |= x << itr->r;
+	e->cnt[0] += l;
+	e->cnt[c + 1] += l;
+	return 0;
+}
+
+int rld_enc(rld_t *e, rlditr_t *itr, int64_t l, uint8_t c)
+{
+	if (l == 0) return 0;
+	if (itr->c != c) {
+		if (itr->l) rld_enc1(e, itr, itr->l, itr->c);
+		itr->l = l; itr->c = c;
+	} else itr->l += l;
+	return 0;
+}
+
+void rld_rank_index(rld_t *e)
+{
+	uint64_t last, n_blks, i, k, *cnt;
+	int j;
+
+	n_blks = e->n_bytes * 8 / 64 / e->ssize + 1;
+	last = rld_last_blk(e);
+	cnt = alloca(e->asize * 8);
+	e->ibits = ilog2(e->mcnt[0] / n_blks) + RLD_IBITS_PLUS;
+	e->n_frames = ((e->mcnt[0] + (1ll<<e->ibits) - 1) >> e->ibits) + 1;
+	e->frame = xcalloc(e->n_frames * e->asize1, 8);
+	e->frame[0] = 0;
+	for (j = 0; j < e->asize; ++j) cnt[j] = 0;
+	for (i = e->ssize, k = 1; i <= last; i += e->ssize) {
+		uint64_t sum, *p = rld_seek_blk(e, i);
+		int type = rld_block_type(*p);
+		if (type == 0) {
+			uint16_t *q = (uint16_t*)p;
+			for (j = 1; j <= e->asize; ++j) cnt[j-1] += q[j];
+		} else if (type == 1) {
+			uint32_t *q = (uint32_t*)p;
+			for (j = 1; j <= e->asize; ++j) cnt[j-1] += q[j] & 0x3fffffff;
+		} else {
+			uint64_t *q = (uint64_t*)p;
+			for (j = 1; j <= e->asize; ++j) cnt[j-1] += q[j];
+		}
+		for (j = 0, sum = 0; j < e->asize; ++j) sum += cnt[j];
+		while (sum >= k<<e->ibits) ++k;
+		if (k < e->n_frames) {
+			uint64_t x = k * e->asize1;
+			e->frame[x] = i;
+			for (j = 0; j < e->asize; ++j) e->frame[x + j + 1] = cnt[j];
+		}
+	}
+	assert(k >= e->n_frames - 1);
+	for (k = 1; k < e->n_frames; ++k) { // fill zero cells
+		uint64_t x = k * e->asize1;
+		if (e->frame[x] == 0) {
+			for (j = 0; j <= e->asize; ++j)
+				e->frame[x + j] = e->frame[x - e->asize1 + j];
+		}
+	}
+}
+
+uint64_t rld_enc_finish(rld_t *e, rlditr_t *itr)
+{
+	int i;
+	if (itr->l) rld_enc1(e, itr, itr->l, itr->c);
+	enc_next_block(e, itr);
+	e->n_bytes = (((uint64_t)(e->n - 1) * RLD_LSIZE) + (itr->p - *itr->i)) * 8;
+	// recompute e->cnt as the accumulative count; e->mcnt[] keeps the marginal counts
+	for (e->cnt[0] = 0, i = 1; i <= e->asize; ++i) e->cnt[i] += e->cnt[i - 1];
+	rld_rank_index(e);
+	return e->n_bytes;
+}
+
+/*****************
+ * Save and load *
+ *****************/
+
+int rld_dump(const rld_t *e, const char *fn)
+{
+	uint64_t k = 0;
+	int i;
+	uint32_t a;
+	FILE *fp;
+	fp = strcmp(fn, "-")? fopen(fn, "wb") : fdopen(fileno(stdout), "wb");
+	if (fp == 0) return -1;
+	a = e->asize<<16 | e->sbits;
+	fwrite("RLD\3", 1, 4, fp); // write magic
+	fwrite(&a, 4, 1, fp); // write sbits and asize
+	fwrite(&k, 8, 1, fp); // preserve 8 bytes for future uses
+	fwrite(&e->n_bytes, 8, 1, fp); // n_bytes can always be divided by 8
+	fwrite(&e->n_frames, 8, 1, fp); // number of frames
+	fwrite(e->mcnt + 1, 8, e->asize, fp); // write the marginal counts
+	for (i = 0, k = e->n_bytes / 8; i < e->n - 1; ++i, k -= RLD_LSIZE)
+		fwrite(e->z[i], 8, RLD_LSIZE, fp);
+	fwrite(e->z[i], 8, k, fp);
+	fwrite(e->frame, 8 * e->asize1, e->n_frames, fp);
+	fclose(fp);
+	return 0;
+}
+
+static rld_t *rld_restore_header(const char *fn, FILE **_fp)
+{
+	FILE *fp;
+	rld_t *e;
+	char magic[4];
+	uint64_t a[3];
+	int32_t i, x;
+
+	if (strcmp(fn, "-") == 0) *_fp = fp = stdin;
+	else if ((*_fp = fp = fopen(fn, "rb")) == 0) return 0;
+	fread(magic, 1, 4, fp);
+	if (strncmp(magic, "RLD\3", 4)) return 0;
+	fread(&x, 4, 1, fp);
+	e = rld_init(x>>16, x&0xffff);
+	fread(a, 8, 3, fp);
+	e->n_bytes = a[1]; e->n_frames = a[2];
+	fread(e->mcnt + 1, 8, e->asize, fp);
+	for (i = 0; i <= e->asize; ++i) e->cnt[i] = e->mcnt[i];
+	for (i = 1; i <= e->asize; ++i) e->cnt[i] += e->cnt[i - 1];
+	e->mcnt[0] = e->cnt[e->asize];
+	return e;
+}
+
+rld_t *rld_restore(const char *fn)
+{
+	FILE *fp;
+	rld_t *e;
+	uint64_t k, n_blks;
+	int32_t i;
+
+	if ((e = rld_restore_header(fn, &fp)) == 0) { // then load as plain DNA rle
+		uint8_t *buf;
+		int l;
+		rlditr_t itr;
+		buf = malloc(0x10000);
+		e = rld_init(6, 3);
+		rld_itr_init(e, &itr, 0);
+		while ((l = fread(buf, 1, 0x10000, fp)) != 0)
+			for (i = 0; i < l; ++i)
+				if (buf[i]>>3) rld_enc(e, &itr, buf[i]>>3, buf[i]&7);
+		fclose(fp);
+		free(buf);
+		rld_enc_finish(e, &itr);
+		return e;
+	}
+	if (e->n_bytes / 8 > RLD_LSIZE) { // allocate enough memory
+		e->n = (e->n_bytes / 8 + RLD_LSIZE - 1) / RLD_LSIZE;
+		e->z = realloc(e->z, e->n * sizeof(void*));
+		for (i = 1; i < e->n; ++i)
+			e->z[i] = xcalloc(RLD_LSIZE, 8);
+	}
+	for (i = 0, k = e->n_bytes / 8; i < e->n - 1; ++i, k -= RLD_LSIZE)
+		fread(e->z[i], 8, RLD_LSIZE, fp);
+	fread(e->z[i], 8, k, fp);
+	e->frame = xmalloc(e->n_frames * e->asize1 * 8);
+	fread(e->frame, 8 * e->asize1, e->n_frames, fp);
+	fclose(fp);
+	n_blks = e->n_bytes * 8 / 64 / e->ssize + 1;
+	e->ibits = ilog2(e->mcnt[0] / n_blks) + RLD_IBITS_PLUS;
+	return e;
+}
+
+rld_t *rld_restore_mmap(const char *fn)
+{
+	FILE *fp;
+	rld_t *e;
+	int i;
+	int64_t n_blks;
+
+	e = rld_restore_header(fn, &fp);
+	fclose(fp);
+	free(e->z[0]); free(e->z);
+	e->n = (e->n_bytes / 8 + RLD_LSIZE - 1) / RLD_LSIZE;
+	e->z = xcalloc(e->n, sizeof(void*));
+	e->fd = open(fn, O_RDONLY);
+	e->mem = (uint64_t*)mmap(0, rld_file_size(e), PROT_READ, MAP_PRIVATE, e->fd, 0);
+	for (i = 0; i < e->n; ++i) e->z[i] = e->mem + (4 + e->asize) + (size_t)i * RLD_LSIZE;
+	e->frame = e->mem + (4 + e->asize) + e->n_bytes/8;
+	n_blks = e->n_bytes * 8 / 64 / e->ssize + 1;
+	e->ibits = ilog2(e->mcnt[0] / n_blks) + RLD_IBITS_PLUS;
+	return e;
+}
+
+/******************
+ * Computing rank *
+ ******************/
+
+#ifdef _DNA_ONLY
+static inline int64_t rld_dec0_fast_dna(const rld_t *e, rlditr_t *itr, int *c)
+{ // This is NOT a replacement of rld_dec0(). It does not do boundary check.
+	uint64_t x = itr->r == 64? itr->p[0] : itr->p[0] << (64 - itr->r) | itr->p[1] >> itr->r;
+	if (x>>63 == 0) {
+		int64_t y;
+		int l, w = 0x333333335555779bll>>(x>>59<<2)&0xf;
+		l = (x >> (64 - w)) - 1;
+		y = x << w >> (64 - l) | 1u << l;
+		w += l;
+		*c = x << w >> 61;
+		w += 3;
+		itr->r -= w;
+		if (itr->r <= 0) ++itr->p, itr->r += 64;
+		return y;
+	} else {
+		*c = x << 1 >> 61;
+		itr->r -= 4;
+		if (itr->r <= 0) ++itr->p, itr->r += 64;
+		return 1;
+	}
+}
+#endif
+
+static inline uint64_t rld_locate_blk(const rld_t *e, rlditr_t *itr, uint64_t k, uint64_t *cnt, uint64_t *sum)
+{
+	int j;
+	uint64_t c = 0, *q, *z = e->frame + (k>>e->ibits) * e->asize1;
+	itr->i = e->z + (*z>>RLD_LBITS);
+	q = itr->p = *itr->i + (*z&RLD_LMASK);
+	for (j = 1, *sum = 0; j < e->asize1; ++j) *sum += (cnt[j-1] = z[j]);
+	while (1) { // seek to the small block
+		int type;
+		q += e->ssize;
+		if (q - *itr->i == RLD_LSIZE) q = *++itr->i;
+		type = rld_block_type(*q);
+		c = type == 2? *q&0x3fffffffffffffffULL : type == 1? *(uint32_t*)q : *(uint16_t*)q;
+		if (*sum + c > k) break;
+		if (type == 0) {
+			uint16_t *p = (uint16_t*)q + 1;
+#ifdef _DNA_ONLY
+			cnt[0] += p[0]; cnt[1] += p[1]; cnt[2] += p[2]; cnt[3] += p[3]; cnt[4] += p[4]; cnt[5] += p[5];
+#else
+			for (j = 0; j < e->asize; ++j) cnt[j] += p[j];
+#endif
+		} else if (type == 1) {
+			uint32_t *p = (uint32_t*)q + 1;
+			for (j = 0; j < e->asize; ++j) cnt[j] += p[j] & 0x3fffffff;
+		} else {
+			uint64_t *p = (uint64_t*)q + 1;
+			for (j = 0; j < e->asize; ++j) cnt[j] += p[j];
+		}
+		*sum += c;
+		itr->p = q;
+	}
+	itr->shead = itr->p;
+	itr->stail = rld_get_stail(e, itr);
+	itr->p += e->offset0[rld_block_type(*itr->shead)];
+	itr->q = (uint8_t*)itr->p;
+	itr->r = 64;
+	return c + *sum;
+}
+
+void rld_rank21(const rld_t *e, uint64_t k, uint64_t l, int c, uint64_t *ok, uint64_t *ol) // FIXME: can be faster
+{
+	*ok = rld_rank11(e, k, c);
+	*ol = rld_rank11(e, l, c);
+}
+
+int rld_rank1a(const rld_t *e, uint64_t k, uint64_t *ok)
+{
+	uint64_t z, l;
+	int a = -1;
+	rlditr_t itr;
+	if (k == 0) {
+		for (a = 0; a < e->asize; ++a) ok[a] = 0;
+		return -1;
+	}
+	rld_locate_blk(e, &itr, k-1, ok, &z);
+	while (1) {
+#ifdef _DNA_ONLY
+		l = rld_dec0_fast_dna(e, &itr, &a);
+#else
+		l = rld_dec0(e, &itr, &a);
+#endif
+		if (z + l >= k) break;
+		z += l; ok[a] += l;
+	}
+	ok[a] += k - z;
+	return a;
+}
+
+uint64_t rld_rank11(const rld_t *e, uint64_t k, int c)
+{
+	uint64_t *ok;
+	if (k == (uint64_t)-1) return 0;
+	ok = alloca(e->asize1 * 8);
+	rld_rank1a(e, k, ok);
+	return ok[c];
+}
+
+void rld_rank2a(const rld_t *e, uint64_t k, uint64_t l, uint64_t *ok, uint64_t *ol)
+{
+	uint64_t z, y, len;
+	rlditr_t itr;
+	int a = -1;
+	if (k == 0) {
+		for (a = 0; a < e->asize; ++a) ok[a] = 0;
+		rld_rank1a(e, l, ol);
+		return;
+	}
+	y = rld_locate_blk(e, &itr, k-1, ok, &z); // locate the block bracketing k
+	while (1) { // compute ok[]
+#ifdef _DNA_ONLY
+		len = rld_dec0_fast_dna(e, &itr, &a);
+#else
+		len = rld_dec0(e, &itr, &a);
+#endif
+		if (z + len >= k) break;
+		z += len; ok[a] += len;
+	}
+	if (y > l) { // we do not need to decode other blocks
+		int b;
+		for (b = 0; b < e->asize; ++b) ol[b] = ok[b]; // copy ok[] to ol[]
+		ok[a] += k - z; // finalize ok[a]
+		if (z + len < l) { // we need to decode the next run
+			z += len; ol[a] += len;
+			while (1) {
+				len = rld_dec0(e, &itr, &a);
+				if (z + len >= l) break;
+				z += len; ol[a] += len;
+			}
+		}
+		ol[a] += l - z;
+	} else { // we have to decode other blocks
+		ok[a] += k - z;
+		rld_rank1a(e, l, ol);
+	}
+}
+
+int rld_extend(const rld_t *e, const rldintv_t *ik, rldintv_t ok[6], int is_back)
+{ // TODO: this can be accelerated a little by using rld_rank1a() when ik.x[2]==1
+	uint64_t tk[6], tl[6];
+	int i;
+	rld_rank2a(e, ik->x[!is_back], ik->x[!is_back] + ik->x[2], tk, tl);
+	for (i = 0; i < 6; ++i) {
+		ok[i].x[!is_back] = e->cnt[i] + tk[i];
+		ok[i].x[2] = (tl[i] -= tk[i]);
+	}
+	ok[0].x[is_back] = ik->x[is_back];
+	ok[4].x[is_back] = ok[0].x[is_back] + tl[0];
+	ok[3].x[is_back] = ok[4].x[is_back] + tl[4];
+	ok[2].x[is_back] = ok[3].x[is_back] + tl[3];
+	ok[1].x[is_back] = ok[2].x[is_back] + tl[2];
+	ok[5].x[is_back] = ok[1].x[is_back] + tl[1];
+	return 0;
+}
diff --git a/rld0.h b/rld0.h
new file mode 100644
index 0000000..c811cad
--- /dev/null
+++ b/rld0.h
@@ -0,0 +1,137 @@
+#ifndef RLDELTA0_H
+#define RLDELTA0_H
+
+#define _DNA_ONLY
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <stdio.h>
+
+#define RLD_LBITS 23
+#define RLD_LSIZE (1<<RLD_LBITS)
+#define RLD_LMASK (RLD_LSIZE - 1)
+
+typedef struct {
+	int r, c; // $r: bits remained in the last 64-bit integer; $c: pending symbol
+	int64_t l; // $l: pending length
+	uint64_t *p, *shead, *stail, **i;
+	uint8_t *q;
+} rlditr_t;
+
+typedef struct rld_t {
+	// initialized in the constructor
+	uint8_t asize, asize1; // alphabet size; asize1=asize+1
+	int8_t abits; // bits required to store a symbol
+	int8_t sbits; // bits per small block
+	int8_t ibits; // modified during indexing; here for a better alignment
+	int8_t offset0[3]; // 0 for 16-bit blocks; 1 for 32-bit blocks; 2 for 64-bit blocks
+	int ssize; // ssize = 1<<sbits
+	// modified during encoding
+	int n; // number of blocks (unchanged in decoding)
+	uint64_t n_bytes; // total number of bits (unchanged in decoding)
+	uint64_t **z; // the actual data (unchanged in decoding)
+	uint64_t *cnt, *mcnt; // after enc_finish, cnt keeps the accumulative count and mcnt keeps the marginal
+	// modified during indexing
+	uint64_t n_frames;
+	uint64_t *frame;
+	//
+	int fd;
+	uint64_t *mem; // only used for memory mapped file
+} rld_t;
+
+typedef struct {
+	uint64_t x[3]; // 0: start of the interval, backward; 1: forward; 2: size of the interval
+	uint64_t info;
+} rldintv_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+	rld_t *rld_init(int asize, int bbits);
+	void rld_destroy(rld_t *e);
+	int rld_dump(const rld_t *e, const char *fn);
+	rld_t *rld_restore(const char *fn);
+	rld_t *rld_restore_mmap(const char *fn);
+
+	void rld_itr_init(const rld_t *e, rlditr_t *itr, uint64_t k);
+	int rld_enc(rld_t *e, rlditr_t *itr, int64_t l, uint8_t c);
+	uint64_t rld_enc_finish(rld_t *e, rlditr_t *itr);
+
+	uint64_t rld_rank11(const rld_t *e, uint64_t k, int c);
+	int rld_rank1a(const rld_t *e, uint64_t k, uint64_t *ok);
+	void rld_rank21(const rld_t *e, uint64_t k, uint64_t l, int c, uint64_t *ok, uint64_t *ol);
+	void rld_rank2a(const rld_t *e, uint64_t k, uint64_t l, uint64_t *ok, uint64_t *ol);
+
+	int rld_extend(const rld_t *e, const rldintv_t *ik, rldintv_t ok[6], int is_back);
+
+#ifdef __cplusplus
+}
+#endif
+
+#define rld_last_blk(e) ((e)->n_bytes>>3>>(e)->sbits<<(e)->sbits)
+#define rld_seek_blk(e, k) ((e)->z[(k)>>RLD_LBITS] + ((k)&RLD_LMASK))
+#define rld_get_stail(e, itr) ((itr)->shead + (e)->ssize - ((itr)->shead + (e)->ssize - *(itr)->i == RLD_LSIZE? 2 : 1))
+
+#define rld_block_type(x) ((uint64_t)(x)>>62)
+
+static inline int64_t rld_dec0(const rld_t *e, rlditr_t *itr, int *c)
+{
+	int w;
+	uint64_t x;
+	int64_t l, y = 0;
+	x = itr->p[0] << (64 - itr->r) | (itr->p != itr->stail && itr->r != 64? itr->p[1] >> itr->r : 0);
+	if (x>>63 == 0) {
+		if ((w = 0x333333335555779bll>>(x>>59<<2)&0xf) == 0xb && x>>58 == 0) return 0;
+		l = (x >> (64 - w)) - 1;
+		y = x << w >> (64 - l) | 1u << l;
+		w += l;
+	} else w = y = 1;
+	*c = x << w >> (64 - e->abits);
+	w += e->abits;
+	if (itr->r > w) itr->r -= w;
+	else ++itr->p, itr->r = 64 + itr->r - w;
+	return y;
+}
+
+static inline int64_t rld_dec(const rld_t *e, rlditr_t *itr, int *_c, int is_free)
+{
+	int64_t l = rld_dec0(e, itr, _c);
+	if (l == 0 || *_c > e->asize) {
+		uint64_t last = rld_last_blk(e);
+		if (itr->p - *itr->i > RLD_LSIZE - e->ssize) {
+			if (is_free) {
+				free(*itr->i); *itr->i = 0;
+			}
+			itr->shead = *++itr->i;
+		} else itr->shead += e->ssize;
+		if (itr->shead == rld_seek_blk(e, last)) return -1;
+		itr->p = itr->shead + e->offset0[rld_block_type(*itr->shead)];
+		itr->q = (uint8_t*)itr->p;
+		itr->stail = rld_get_stail(e, itr);
+		itr->r = 64;
+		return rld_dec0(e, itr, _c);
+	} else return l;
+}
+
+// take k symbols from e0 and write it to e
+static inline void rld_dec_enc(rld_t *e, rlditr_t *itr, const rld_t *e0, rlditr_t *itr0, int64_t k)
+{
+	if (itr0->l >= k) { // there are more pending symbols
+		rld_enc(e, itr, k, itr0->c);
+		itr0->l -= k; // l - k symbols remains
+	} else { // use up all pending symbols
+		int c = -1; // to please gcc
+		int64_t l;
+		rld_enc(e, itr, itr0->l, itr0->c); // write all pending symbols
+		k -= itr0->l;
+		for (; k > 0; k -= l) { // we always go into this loop because l0<k
+			l = rld_dec(e0, itr0, &c, 1);
+			rld_enc(e, itr, k < l? k : l, c);
+		}
+		itr0->l = -k; itr0->c = c;
+	}
+}
+
+#endif
diff --git a/rle.c b/rle.c
new file mode 100644
index 0000000..221e1cd
--- /dev/null
+++ b/rle.c
@@ -0,0 +1,191 @@
+#include <string.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "rle.h"
+
+const uint8_t rle_auxtab[8] = { 0x01, 0x11, 0x21, 0x31, 0x03, 0x13, 0x07, 0x17 };
+
+// insert symbol $a after $x symbols in $str; marginal counts added to $cnt; returns the size increase
+int rle_insert_cached(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t ec[6], int *beg, int64_t bc[6])
+{
+	uint16_t *nptr = (uint16_t*)block;
+	int diff;
+
+	block += 2; // skip the first 2 counting bytes
+	if (*nptr == 0) {
+		memset(cnt, 0, 48);
+		diff = rle_enc1(block, a, rl);
+	} else {
+		uint8_t *p, *end = block + *nptr, *q;
+		int64_t pre, z, l = 0, tot, beg_l;
+		int c = -1, n_bytes = 0, n_bytes2, t = 0;
+		uint8_t tmp[24];
+		beg_l = bc[0] + bc[1] + bc[2] + bc[3] + bc[4] + bc[5];
+		tot   = ec[0] + ec[1] + ec[2] + ec[3] + ec[4] + ec[5];
+		if (x < beg_l) {
+			beg_l = 0, *beg = 0;
+			memset(bc, 0, 48);
+		}
+		if (x == beg_l) {
+			p = q = block + (*beg); z = beg_l;
+			memcpy(cnt, bc, 48);
+		} else if (x - beg_l <= ((tot-beg_l)>>1) + ((tot-beg_l)>>3)) { // forward
+			z = beg_l; p = block + (*beg);
+			memcpy(cnt, bc, 48);
+			while (z < x) {
+				rle_dec1(p, c, l);
+				z += l; cnt[c] += l;
+			}
+			for (q = p - 1; *q>>6 == 2; --q);
+		} else { // backward
+			memcpy(cnt, ec, 48);
+			z = tot; p = end;
+			while (z >= x) {
+				--p;
+				if (*p>>6 != 2) {
+					l |= *p>>7? (int64_t)rle_auxtab[*p>>3&7]>>4 << t : *p>>3;
+					z -= l; cnt[*p&7] -= l;
+					l = 0; t = 0;
+				} else {
+					l |= (*p&0x3fL) << t;
+					t += 6;
+				}
+			}
+			q = p;
+			rle_dec1(p, c, l);
+			z += l; cnt[c] += l;
+		}
+		*beg = q - block;
+		memcpy(bc, cnt, 48);
+		bc[c] -= l;
+		n_bytes = p - q;
+		if (x == z && a != c && p < end) { // then try the next run
+			int tc;
+			int64_t tl;
+			q = p;
+			rle_dec1(q, tc, tl);
+			if (a == tc)
+				c = tc, n_bytes = q - p, l = tl, z += l, p = q, cnt[tc] += tl;
+		}
+		if (z != x) cnt[c] -= z - x;
+		pre = x - (z - l); p -= n_bytes;
+		if (a == c) { // insert to the same run
+			n_bytes2 = rle_enc1(tmp, c, l + rl);
+		} else if (x == z) { // at the end; append to the existing run
+			p += n_bytes; n_bytes = 0;
+			n_bytes2 = rle_enc1(tmp, a, rl);
+		} else { // break the current run
+			n_bytes2 = rle_enc1(tmp, c, pre);
+			n_bytes2 += rle_enc1(tmp + n_bytes2, a, rl);
+			n_bytes2 += rle_enc1(tmp + n_bytes2, c, l - pre);
+		}
+		if (n_bytes != n_bytes2 && end != p + n_bytes) // size changed
+			memmove(p + n_bytes2, p + n_bytes, end - p - n_bytes);
+		memcpy(p, tmp, n_bytes2);
+		diff = n_bytes2 - n_bytes;
+	}
+	return (*nptr += diff);
+}
+
+int rle_insert(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t ec[6])
+{
+	int beg = 0;
+	int64_t bc[6];
+	memset(bc, 0, 48);
+	return rle_insert_cached(block, x, a, rl, cnt, ec, &beg, bc);
+}
+
+void rle_split(uint8_t *block, uint8_t *new_block)
+{
+	int n = *(uint16_t*)block;
+	uint8_t *end = block + 2 + n, *q = block + 2 + (n>>1);
+	while (*q>>6 == 2) --q;
+	memcpy(new_block + 2, q, end - q);
+	*(uint16_t*)new_block = end - q;
+	*(uint16_t*)block = q - block - 2;
+}
+
+void rle_count(const uint8_t *block, int64_t cnt[6])
+{
+	const uint8_t *q = block + 2, *end = q + *(uint16_t*)block;
+	while (q < end) {
+		int c;
+		int64_t l;
+		rle_dec1(q, c, l);
+		cnt[c] += l;
+	}
+}
+
+void rle_print(const uint8_t *block, int expand)
+{
+	const uint16_t *p = (const uint16_t*)block;
+	const uint8_t *q = block + 2, *end = block + 2 + *p;
+	while (q < end) {
+		int c;
+		int64_t l, x;
+		rle_dec1(q, c, l);
+		if (expand) for (x = 0; x < l; ++x) putchar("$ACGTN"[c]);
+		else printf("%c%ld", "$ACGTN"[c], (long)l);
+	}
+	putchar('\n');
+}
+
+void rle_rank2a(const uint8_t *block, int64_t x, int64_t y, int64_t *cx, int64_t *cy, const int64_t ec[6])
+{
+	int a;
+	int64_t tot, cnt[6];
+	const uint8_t *p;
+
+	y = y >= x? y : x;
+	tot = ec[0] + ec[1] + ec[2] + ec[3] + ec[4] + ec[5];
+	if (tot == 0) return;
+	if (x <= (tot - y) + (tot>>3)) {
+		int c = 0;
+		int64_t l, z = 0;
+		memset(cnt, 0, 48);
+		p = block + 2;
+		while (z < x) {
+			rle_dec1(p, c, l);
+			z += l; cnt[c] += l;
+		}
+		for (a = 0; a != 6; ++a) cx[a] += cnt[a];
+		cx[c] -= z - x;
+		if (cy) {
+			while (z < y) {
+				rle_dec1(p, c, l);
+				z += l; cnt[c] += l;
+			}
+			for (a = 0; a != 6; ++a) cy[a] += cnt[a];
+			cy[c] -= z - y;
+		}
+	} else {
+#define move_backward(_x) \
+		while (z >= (_x)) { \
+			--p; \
+			if (*p>>6 != 2) { \
+				l |= *p>>7? (int64_t)rle_auxtab[*p>>3&7]>>4 << t : *p>>3; \
+				z -= l; cnt[*p&7] -= l; \
+				l = 0; t = 0; \
+			} else { \
+				l |= (*p&0x3fL) << t; \
+				t += 6; \
+			} \
+		} \
+
+		int t = 0;
+		int64_t l = 0, z = tot;
+		memcpy(cnt, ec, 48);
+		p = block + 2 + *(const uint16_t*)block;
+		if (cy) {
+			move_backward(y)
+			for (a = 0; a != 6; ++a) cy[a] += cnt[a];
+			cy[*p&7] += y - z;
+		}
+		move_backward(x)
+		for (a = 0; a != 6; ++a) cx[a] += cnt[a];
+		cx[*p&7] += x - z;
+
+#undef move_backward
+	}
+}
diff --git a/rle.h b/rle.h
new file mode 100644
index 0000000..4f8946d
--- /dev/null
+++ b/rle.h
@@ -0,0 +1,77 @@
+#ifndef RLE6_H_
+#define RLE6_H_
+
+#include <stdint.h>
+
+#ifdef __GNUC__
+#define LIKELY(x) __builtin_expect((x),1)
+#else
+#define LIKELY(x) (x)
+#endif
+#ifdef __cplusplus
+
+extern "C" {
+#endif
+
+	int rle_insert_cached(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t ec[6], int *beg, int64_t bc[6]);
+	int rle_insert(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t end_cnt[6]);
+	void rle_split(uint8_t *block, uint8_t *new_block);
+	void rle_count(const uint8_t *block, int64_t cnt[6]);
+	void rle_rank2a(const uint8_t *block, int64_t x, int64_t y, int64_t *cx, int64_t *cy, const int64_t ec[6]);
+	#define rle_rank1a(block, x, cx, ec) rle_rank2a(block, x, -1, cx, 0, ec)
+
+	void rle_print(const uint8_t *block, int expand);
+
+#ifdef __cplusplus
+}
+#endif
+
+/******************
+ *** 43+3 codec ***
+ ******************/
+
+extern const uint8_t rle_auxtab[8];
+
+#define RLE_MIN_SPACE 18
+#define rle_nptr(block) ((uint16_t*)(block))
+
+// decode one run (c,l) and move the pointer p
+#define rle_dec1(p, c, l) do { \
+		(c) = *(p) & 7; \
+		if (LIKELY((*(p)&0x80) == 0)) { \
+			(l) = *(p)++ >> 3; \
+		} else if (LIKELY(*(p)>>5 == 6)) { \
+			(l) = (*(p)&0x18L)<<3L | ((p)[1]&0x3fL); \
+			(p) += 2; \
+		} else { \
+			int n = ((*(p)&0x10) >> 2) + 4; \
+			(l) = *(p)++ >> 3 & 1; \
+			while (--n) (l) = ((l)<<6) | (*(p)++&0x3fL); \
+		} \
+	} while (0)
+
+static inline int rle_enc1(uint8_t *p, int c, int64_t l)
+{
+	if (l < 1LL<<4) {
+		*p = l << 3 | c;
+		return 1;
+	} else if (l < 1LL<<8) {
+		*p = 0xC0 | l >> 6 << 3 | c;
+		p[1] = 0x80 | (l & 0x3f);
+		return 2;
+	} else if (l < 1LL<<19) {
+		*p = 0xE0 | l >> 18 << 3 | c;
+		p[1] = 0x80 | (l >> 12 & 0x3f);
+		p[2] = 0x80 | (l >> 6 & 0x3f);
+		p[3] = 0x80 | (l & 0x3f);
+		return 4;
+	} else {
+		int i, shift = 36;
+		*p = 0xF0 | l >> 42 << 3 | c;
+		for (i = 1; i < 8; ++i, shift -= 6)
+			p[i] = 0x80 | (l>>shift & 0x3f);
+		return 8;
+	}
+}
+
+#endif
diff --git a/rope.c b/rope.c
new file mode 100644
index 0000000..8d21a06
--- /dev/null
+++ b/rope.c
@@ -0,0 +1,219 @@
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <stdio.h>
+#include <zlib.h>
+#include "rle.h"
+#include "rope.h"
+
+/*******************
+ *** Memory Pool ***
+ *******************/
+
+#define MP_CHUNK_SIZE 0x100000 // 1MB per chunk
+
+typedef struct { // memory pool for fast and compact memory allocation (no free)
+	int size, i, n_elems;
+	int64_t top, max;
+	uint8_t **mem;
+} mempool_t;
+
+static mempool_t *mp_init(int size)
+{
+	mempool_t *mp;
+	mp = calloc(1, sizeof(mempool_t));
+	mp->size = size;
+	mp->i = mp->n_elems = MP_CHUNK_SIZE / size;
+	mp->top = -1;
+	return mp;
+}
+
+static void mp_destroy(mempool_t *mp)
+{
+	int64_t i;
+	for (i = 0; i <= mp->top; ++i) free(mp->mem[i]);
+	free(mp->mem); free(mp);
+}
+
+static inline void *mp_alloc(mempool_t *mp)
+{
+	if (mp->i == mp->n_elems) {
+		if (++mp->top == mp->max) {
+			mp->max = mp->max? mp->max<<1 : 1;
+			mp->mem = realloc(mp->mem, sizeof(void*) * mp->max);
+		}
+		mp->mem[mp->top] = calloc(mp->n_elems, mp->size);
+		mp->i = 0;
+	}
+	return mp->mem[mp->top] + (mp->i++) * mp->size;
+}
+
+/***************
+ *** B+ rope ***
+ ***************/
+
+rope_t *rope_init(int max_nodes, int block_len)
+{
+	rope_t *rope;
+	rope = calloc(1, sizeof(rope_t));
+	if (block_len < 32) block_len = 32;
+	rope->max_nodes = (max_nodes+ 1)>>1<<1;
+	rope->block_len = (block_len + 7) >> 3 << 3;
+	rope->node = mp_init(sizeof(rpnode_t) * rope->max_nodes);
+	rope->leaf = mp_init(rope->block_len);
+	rope->root = mp_alloc(rope->node);
+	rope->root->n = 1;
+	rope->root->is_bottom = 1;
+	rope->root->p = mp_alloc(rope->leaf);
+	return rope;
+}
+
+void rope_destroy(rope_t *rope)
+{
+	mp_destroy(rope->node);
+	mp_destroy(rope->leaf);
+	free(rope);
+}
+
+static inline rpnode_t *split_node(rope_t *rope, rpnode_t *u, rpnode_t *v)
+{ // split $v's child. $u is the first node in the bucket. $v and $u are in the same bucket. IMPORTANT: there is always enough room in $u
+	int j, i = v - u;
+	rpnode_t *w; // $w is the sibling of $v
+	if (u == 0) { // only happens at the root; add a new root
+		u = v = mp_alloc(rope->node);
+		v->n = 1; v->p = rope->root; // the new root has the old root as the only child
+		memcpy(v->c, rope->c, 48);
+		for (j = 0; j < 6; ++j) v->l += v->c[j];
+		rope->root = v;
+	}
+	if (i != u->n - 1) // then make room for a new node
+		memmove(v + 2, v + 1, sizeof(rpnode_t) * (u->n - i - 1));
+	++u->n; w = v + 1;
+	memset(w, 0, sizeof(rpnode_t));
+	w->p = mp_alloc(u->is_bottom? rope->leaf : rope->node);
+	if (u->is_bottom) { // we are at the bottom level; $v->p is a string instead of a node
+		uint8_t *p = (uint8_t*)v->p, *q = (uint8_t*)w->p;
+		rle_split(p, q);
+		rle_count(q, w->c);
+	} else { // $v->p is a node, not a string
+		rpnode_t *p = v->p, *q = w->p; // $v and $w are siblings and thus $p and $q are cousins
+		p->n -= rope->max_nodes>>1;
+		memcpy(q, p + p->n, sizeof(rpnode_t) * (rope->max_nodes>>1));
+		q->n = rope->max_nodes>>1; // NB: this line must below memcpy() as $q->n and $q->is_bottom are modified by memcpy()
+		q->is_bottom = p->is_bottom;
+		for (i = 0; i < q->n; ++i)
+			for (j = 0; j < 6; ++j)
+				w->c[j] += q[i].c[j];
+	}
+	for (j = 0; j < 6; ++j) // compute $w->l and update $v->c
+		w->l += w->c[j], v->c[j] -= w->c[j];
+	v->l -= w->l; // update $v->c
+	return v;
+}
+
+int64_t rope_insert_run(rope_t *rope, int64_t x, int a, int64_t rl, rpcache_t *cache)
+{ // insert $a after $x symbols in $rope and the returns rank(a, x)
+	rpnode_t *u = 0, *v = 0, *p = rope->root; // $v is the parent of $p; $u and $v are at the same level and $u is the first node in the bucket
+	int64_t y = 0, z = 0, cnt[6];
+	int n_runs;
+	do { // top-down update. Searching and node splitting are done together in one pass.
+		if (p->n == rope->max_nodes) { // node is full; split
+			v = split_node(rope, u, v); // $v points to the parent of $p; when a new root is added, $v points to the root
+			if (y + v->l < x) // if $v is not long enough after the split, we need to move both $p and its parent $v
+				y += v->l, z += v->c[a], ++v, p = v->p;
+		}
+		u = p;
+		if (v && x - y > v->l>>1) { // then search backwardly for the right node to descend
+			p += p->n - 1; y += v->l; z += v->c[a];
+			for (; y >= x; --p) y -= p->l, z -= p->c[a];
+			++p;
+		} else for (; y + p->l < x; ++p) y += p->l, z += p->c[a]; // then search forwardly
+		assert(p - u < u->n);
+		if (v) v->c[a] += rl, v->l += rl; // we should not change p->c[a] because this may cause troubles when p's child is split
+		v = p; p = p->p; // descend
+	} while (!u->is_bottom);
+	rope->c[a] += rl; // $rope->c should be updated after the loop as adding a new root needs the old $rope->c counts
+	if (cache) {
+		if (cache->p != (uint8_t*)p) memset(cache, 0, sizeof(rpcache_t));
+		n_runs = rle_insert_cached((uint8_t*)p, x - y, a, rl, cnt, v->c, &cache->beg, cache->bc);
+		cache->p = (uint8_t*)p;
+	} else n_runs = rle_insert((uint8_t*)p, x - y, a, rl, cnt, v->c);
+	z += cnt[a];
+	v->c[a] += rl; v->l += rl; // this should be after rle_insert(); otherwise rle_insert() won't work
+	if (n_runs + RLE_MIN_SPACE > rope->block_len) {
+		split_node(rope, u, v);
+		if (cache) memset(cache, 0, sizeof(rpcache_t));
+	}
+	return z;
+}
+
+static rpnode_t *rope_count_to_leaf(const rope_t *rope, int64_t x, int64_t cx[6], int64_t *rest)
+{
+	rpnode_t *u, *v = 0, *p = rope->root;
+	int64_t y = 0;
+	int a;
+
+	memset(cx, 0, 48);
+	do {
+		u = p;
+		if (v && x - y > v->l>>1) {
+			p += p->n - 1; y += v->l;
+			for (a = 0; a != 6; ++a) cx[a] += v->c[a];
+			for (; y >= x; --p) {
+				y -= p->l;
+				for (a = 0; a != 6; ++a) cx[a] -= p->c[a];
+			}
+			++p;
+		} else {
+			for (; y + p->l < x; ++p) {
+				y += p->l;
+				for (a = 0; a != 6; ++a) cx[a] += p->c[a];
+			}
+		}
+		v = p; p = p->p;
+	} while (!u->is_bottom);
+	*rest = x - y;
+	return v;
+}
+
+void rope_rank2a(const rope_t *rope, int64_t x, int64_t y, int64_t *cx, int64_t *cy)
+{
+	rpnode_t *v;
+	int64_t rest;
+	v = rope_count_to_leaf(rope, x, cx, &rest);
+	if (y < x || cy == 0) {
+		rle_rank1a((const uint8_t*)v->p, rest, cx, v->c);
+	} else if (rest + (y - x) <= v->l) {
+		memcpy(cy, cx, 48);
+		rle_rank2a((const uint8_t*)v->p, rest, rest + (y - x), cx, cy, v->c);
+	} else {
+		rle_rank1a((const uint8_t*)v->p, rest, cx, v->c);
+		v = rope_count_to_leaf(rope, y, cy, &rest);
+		rle_rank1a((const uint8_t*)v->p, rest, cy, v->c);
+	}
+}
+
+/*********************
+ *** Rope iterator ***
+ *********************/
+
+void rope_itr_first(const rope_t *rope, rpitr_t *i)
+{
+	memset(i, 0, sizeof(rpitr_t));
+	i->rope = rope;
+	for (i->pa[i->d] = rope->root; !i->pa[i->d]->is_bottom;) // descend to the leftmost leaf
+		++i->d, i->pa[i->d] = i->pa[i->d - 1]->p;
+}
+
+const uint8_t *rope_itr_next_block(rpitr_t *i)
+{
+	const uint8_t *ret;
+	assert(i->d < ROPE_MAX_DEPTH); // a B+ tree should not be that tall
+	if (i->d < 0) return 0;
+	ret = (uint8_t*)i->pa[i->d][i->ia[i->d]].p;
+	while (i->d >= 0 && ++i->ia[i->d] == i->pa[i->d]->n) i->ia[i->d--] = 0; // backtracking
+	if (i->d >= 0)
+		while (!i->pa[i->d]->is_bottom) // descend to the leftmost leaf
+			++i->d, i->pa[i->d] = i->pa[i->d - 1][i->ia[i->d - 1]].p;
+	return ret;
+}
diff --git a/rope.h b/rope.h
new file mode 100644
index 0000000..d114713
--- /dev/null
+++ b/rope.h
@@ -0,0 +1,54 @@
+#ifndef ROPE_H_
+#define ROPE_H_
+
+#include <stdint.h>
+#include <stdio.h>
+
+#define ROPE_MAX_DEPTH 80
+#define ROPE_DEF_MAX_NODES 64
+#define ROPE_DEF_BLOCK_LEN 512
+
+typedef struct rpnode_s {
+	struct rpnode_s *p; // child; at the bottom level, $p points to a string with the first 2 bytes giving the number of runs (#runs)
+	uint64_t l:54, n:9, is_bottom:1; // $n and $is_bottom are only set for the first node in a bucket
+	int64_t c[6]; // marginal counts
+} rpnode_t;
+
+typedef struct {
+	int32_t max_nodes, block_len; // both MUST BE even numbers
+	int64_t c[6]; // marginal counts
+	rpnode_t *root;
+	void *node, *leaf; // memory pool
+} rope_t;
+
+typedef struct {
+	const rope_t *rope; // the rope
+	const rpnode_t *pa[ROPE_MAX_DEPTH]; // parent nodes
+	int ia[ROPE_MAX_DEPTH]; // index in the parent nodes
+	int d; // the current depth in the B+-tree
+} rpitr_t;
+
+typedef struct {
+	int beg;
+	int64_t bc[6];
+	uint8_t *p;
+} rpcache_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+	rope_t *rope_init(int max_nodes, int block_len);
+	void rope_destroy(rope_t *rope);
+	int64_t rope_insert_run(rope_t *rope, int64_t x, int a, int64_t rl, rpcache_t *cache);
+	void rope_rank2a(const rope_t *rope, int64_t x, int64_t y, int64_t *cx, int64_t *cy);
+	#define rope_rank1a(rope, x, cx) rope_rank2a(rope, x, -1, cx, 0)
+
+	void rope_itr_first(const rope_t *rope, rpitr_t *i);
+	const uint8_t *rope_itr_next_block(rpitr_t *i);
+	
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/test/MT-simu.fq.gz b/test/MT-simu.fq.gz
new file mode 100644
index 0000000..8d0558a
Binary files /dev/null and b/test/MT-simu.fq.gz differ
diff --git a/unitig.c b/unitig.c
new file mode 100644
index 0000000..55e2fd9
--- /dev/null
+++ b/unitig.c
@@ -0,0 +1,455 @@
+#include <assert.h>
+#include <string.h>
+#include <math.h>
+#include "kvec.h"
+#include "kstring.h"
+#include "rld0.h"
+#include "mag.h"
+#include "internal.h"
+
+/******************
+ *** From fermi ***
+ ******************/
+
+typedef struct { size_t n, m; int32_t *a; } fm32s_v;
+typedef struct { size_t n, m; rldintv_t *a; } rldintv_v;
+
+static uint64_t utg_primes[] = { 123457, 234571, 345679, 456791, 567899, 0 };
+
+#define fm6_comp(a) ((a) >= 1 && (a) <= 4? 5 - (a) : (a))
+#define fm6_set_intv(e, c, ik) ((ik).x[0] = (e)->cnt[(int)(c)], (ik).x[2] = (e)->cnt[(int)(c)+1] - (e)->cnt[(int)(c)], (ik).x[1] = (e)->cnt[fm6_comp(c)], (ik).info = 0)
+
+int rld_extend0(const rld_t *e, const rldintv_t *ik, rldintv_t *ok0, int is_back)
+{ // FIXME: this can be accelerated a little by using rld_rank1a() when ik.x[2]==1
+	uint64_t tk[6], tl[6];
+	rld_rank2a(e, ik->x[!is_back], ik->x[!is_back] + ik->x[2], tk, tl);
+	ok0->x[!is_back] = tk[0];
+	ok0->x[is_back] = ik->x[is_back];
+	ok0->x[2] = tl[0] - tk[0];
+	return 0;
+}
+
+uint64_t fm6_retrieve(const rld_t *e, uint64_t x, kstring_t *s, rldintv_t *k2, int *contained)
+{
+	uint64_t k = x, ok[6];
+	rldintv_t ok2[6];
+	s->l = 0; *contained = 0;
+	while (1) {
+		int c = rld_rank1a(e, k + 1, ok);
+		k = e->cnt[c] + ok[c] - 1;
+		if (c == 0) break;
+		if (s->l > 0) {
+			if (k2->x[2] == 1) k2->x[0] = k;
+			else {
+				rld_extend(e, k2, ok2, 1);
+				*k2 = ok2[c];
+			}
+		} else fm6_set_intv(e, c, *k2);
+		kputc(c, s);
+	}
+	if (k2->x[2] != 1) {
+		rld_extend(e, k2, ok2, 1);
+		if (ok2[0].x[2] != k2->x[2]) *contained |= 1; // left contained
+		*k2 = ok2[0];
+	} else k2->x[0] = k;
+	rld_extend(e, k2, ok2, 0);
+	if (ok2[0].x[2] != k2->x[2]) *contained |= 2; // right contained
+	*k2 = ok2[0];
+	return k;
+}
+
+/*****************
+ *** Main body ***
+ *****************/
+
+#define info_lt(a, b) ((a).info < (b).info)
+
+#include "ksort.h"
+KSORT_INIT(infocmp, rldintv_t, info_lt)
+
+static inline void set_bit(uint64_t *bits, uint64_t x)
+{
+	uint64_t *p = bits + (x>>6);
+	uint64_t z = 1LLU<<(x&0x3f);
+	__sync_fetch_and_or(p, z);
+}
+
+static inline void set_bits(uint64_t *bits, const rldintv_t *p)
+{
+	uint64_t k;
+	for (k = 0; k < p->x[2]; ++k) {
+		set_bit(bits, p->x[0] + k);
+		set_bit(bits, p->x[1] + k);
+	}
+}
+
+static rldintv_t overlap_intv(const rld_t *e, int len, const uint8_t *seq, int min, int j, int at5, rldintv_v *p, int inc_sentinel)
+{ // requirement: seq[j] matches the end of a read
+	int c, depth, dir, end;
+	rldintv_t ik, ok[6];
+	p->n = 0;
+	dir = at5? 1 : -1; // at5 is true iff we start from the 5'-end of a read
+	end = at5? len : -1;
+	c = seq[j];
+	fm6_set_intv(e, c, ik);
+	for (depth = 1, j += dir; j != end; j += dir, ++depth) {
+		c = at5? fm6_comp(seq[j]) : seq[j];
+		rld_extend(e, &ik, ok, !at5);
+		if (!ok[c].x[2]) break; // cannot be extended
+		if (depth >= min && ok[0].x[2]) {
+			if (inc_sentinel) {
+				ok[0].info = j - dir;
+				kv_push(rldintv_t, *p, ok[0]);
+			} else {
+				ik.info = j - dir;
+				kv_push(rldintv_t, *p, ik);
+			}
+		}
+		ik = ok[c];
+	}
+	kv_reverse(rldintv_t, *p, 0); // reverse the array such that the smallest interval comes first
+	return ik;
+}
+
+typedef struct {
+	const rld_t *e;
+	int min_match, min_merge_len;
+	rldintv_v a[2], nei;
+	fm32s_v cat;
+	uint64_t *used, *bend;
+	kstring_t str;
+	uint64_t n, sum, sum2, unpaired;
+} aux_t;
+
+int fm6_is_contained(const rld_t *e, int min_match, const kstring_t *s, rldintv_t *intv, rldintv_v *ovlp)
+{ // for s is a sequence in e, test if s is contained in other sequences in e; return intervals right overlapping with s
+	rldintv_t ik, ok[6];
+	int ret = 0;
+	assert(s->l > min_match);
+	ovlp->n = 0;
+	ik = overlap_intv(e, s->l, (uint8_t*)s->s, min_match, s->l - 1, 0, ovlp, 0);
+	rld_extend(e, &ik, ok, 1); assert(ok[0].x[2]);
+	if (ik.x[2] != ok[0].x[2]) ret = -1; // the sequence is left contained
+	ik = ok[0];
+	rld_extend(e, &ik, ok, 0); assert(ok[0].x[2]);
+	if (ik.x[2] != ok[0].x[2]) ret = -1; // the sequence is right contained
+	*intv = ok[0];
+	return ret;
+}
+
+int fm6_get_nei(const rld_t *e, int min_match, int beg, kstring_t *s, rldintv_v *nei, // input and output variables
+				rldintv_v *prev, rldintv_v *curr, fm32s_v *cat,                        // temporary arrays
+				uint64_t *used)                                                      // optional info
+{
+	int ori_l = s->l, j, i, c, rbeg, is_forked = 0;
+	rldintv_v *swap;
+	rldintv_t ok[6], ok0;
+
+	curr->n = nei->n = cat->n = 0;
+	if (prev->n == 0) { // when this routine is called for the seed, prev may filled by fm6_is_contained()
+		overlap_intv(e, s->l - beg, (uint8_t*)s->s + beg, min_match, s->l - beg - 1, 0, prev, 0);
+		if (prev->n == 0) return -1; // no overlap
+		for (j = 0; j < prev->n; ++j) prev->a[j].info += beg;
+	}
+	kv_resize(int, *cat, prev->m);
+	for (j = 0; j < prev->n; ++j) cat->a[j] = 0; // only one interval; all point to 0
+	while (prev->n) {
+		for (j = 0, curr->n = 0; j < prev->n; ++j) {
+			rldintv_t *p = &prev->a[j];
+			if (cat->a[j] < 0) continue;
+			rld_extend(e, p, ok, 0); // forward extension
+			if (ok[0].x[2] && ori_l != s->l) { // some (partial) reads end here
+				rld_extend0(e, &ok[0], &ok0, 1); // backward extension to look for sentinels
+				if (ok0.x[2]) { // the match is bounded by sentinels - a full-length match
+					if (ok[0].x[2] == p->x[2] && p->x[2] == ok0.x[2]) { // never consider a read contained in another read
+						int cat0 = cat->a[j]; // a category approximately corresponds to one neighbor, though not always
+						assert(j == 0 || cat->a[j] > cat->a[j-1]); // otherwise not irreducible
+						ok0.info = ori_l - (p->info&0xffffffffU);
+						for (i = j; i < prev->n && cat->a[i] == cat0; ++i) cat->a[i] = -1; // mask out other intervals of the same cat
+						kv_push(rldintv_t, *nei, ok0); // keep in the neighbor vector
+						continue; // no need to go through for(c); do NOT set "used" as this neighbor may be rejected later
+					} else if (used) set_bits(used, &ok0); // the read is contained in another read; mark it as used
+				}
+			} // ~if(ok[0].x[2])
+			if (cat->a[j] < 0) continue; // no need to proceed if we have finished this path
+			for (c = 1; c < 5; ++c) // collect extensible intervals
+				if (ok[c].x[2]) {
+					rld_extend0(e, &ok[c], &ok0, 1);
+					if (ok0.x[2]) { // do not extend intervals whose left end is not bounded by a sentinel
+						ok[c].info = (p->info&0xfffffff0ffffffffLLU) | (uint64_t)c<<32;
+						kv_push(rldintv_t, *curr, ok[c]);
+					}
+				}
+		} // ~for(j)
+		if (curr->n) { // update category
+			uint32_t last, cat0;
+			kv_resize(int, *cat, curr->m);
+			c = curr->a[0].info>>32&0xf;
+			kputc(fm6_comp(c), s);
+			ks_introsort(infocmp, curr->n, curr->a);
+			last = curr->a[0].info >> 32;
+			cat->a[0] = 0;
+			curr->a[0].info &= 0xffffffff;
+			for (j = 1, cat0 = 0; j < curr->n; ++j) { // this loop recalculate cat
+				if (curr->a[j].info>>32 != last)
+					last = curr->a[j].info>>32, cat0 = j;
+				cat->a[j] = cat0;
+				curr->a[j].info = (curr->a[j].info&0xffffffff) | (uint64_t)cat0<<36;
+			}
+			if (cat0 != 0) is_forked = 1;
+		}
+		swap = curr; curr = prev; prev = swap; // swap curr and prev
+	} // ~while(prev->n)
+	if (nei->n == 0) return -1; // no overlap
+	rbeg = ori_l - (uint32_t)nei->a[0].info;
+	if (nei->n == 1 && is_forked) { // this may happen if there are contained reads; fix this
+		fm6_set_intv(e, 0, ok0);
+		for (i = rbeg; i < ori_l; ++i) {
+			rld_extend(e, &ok0, ok, 0);
+			ok0 = ok[fm6_comp(s->s[i])];
+		}
+		for (i = ori_l; i < s->l; ++i) {
+			int c0 = -1;
+			rld_extend(e, &ok0, ok, 0);
+			for (c = 1, j = 0; c < 5; ++c)
+				if (ok[c].x[2] && ok[c].x[0] <= nei->a[0].x[0] && ok[c].x[0] + ok[c].x[2] >= nei->a[0].x[0] + nei->a[0].x[2])
+					++j, c0 = c;
+			if (j == 0 && ok[0].x[2]) break;
+			assert(j == 1);
+			s->s[i] = fm6_comp(c0);
+			ok0 = ok[c0];
+		}
+		s->l = i; s->s[s->l] = 0;
+	}
+	if (nei->n > 1) s->l = ori_l, s->s[s->l] = 0;
+	return rbeg;
+}
+
+static int try_right(aux_t *a, int beg, kstring_t *s)
+{
+	return fm6_get_nei(a->e, a->min_match, beg, s, &a->nei, &a->a[0], &a->a[1], &a->cat, a->used);
+}
+
+static int check_left_simple(aux_t *a, int beg, int rbeg, const kstring_t *s)
+{
+	rldintv_t ok[6];
+	rldintv_v *prev = &a->a[0], *curr = &a->a[1], *swap;
+	int i, j;
+
+	overlap_intv(a->e, s->l, (uint8_t*)s->s, a->min_match, rbeg, 1, prev, 1);
+	for (i = rbeg - 1; i >= beg; --i) {
+		for (j = 0, curr->n = 0; j < prev->n; ++j) {
+			rldintv_t *p = &prev->a[j];
+			rld_extend(a->e, p, ok, 1);
+			if (ok[0].x[2]) set_bits(a->used, &ok[0]); // some reads end here; they must be contained in a longer read
+			if (ok[0].x[2] + ok[(int)s->s[i]].x[2] != p->x[2]) return -1; // potential backward bifurcation
+			kv_push(rldintv_t, *curr, ok[(int)s->s[i]]);
+		}
+		swap = curr; curr = prev; prev = swap;
+	} // ~for(i)
+	return 0;
+}
+
+static int check_left(aux_t *a, int beg, int rbeg, const kstring_t *s)
+{
+	int i, ret;
+	rldintv_t tmp;
+	assert(a->nei.n == 1);
+	ret = check_left_simple(a, beg, rbeg, s);
+	if (ret == 0) return 0;
+	// when ret<0, the back fork may be caused by a contained read. we have to do more to confirm this.
+	tmp = a->nei.a[0]; // backup the neighbour as it will be overwritten by try_right()
+	a->a[0].n = a->a[1].n = a->nei.n = 0;
+	ks_resize(&a->str, s->l - rbeg + 1);
+	for (i = s->l - 1, a->str.l = 0; i >= rbeg; --i)
+		a->str.s[a->str.l++] = fm6_comp(s->s[i]);
+	a->str.s[a->str.l] = 0;
+	try_right(a, 0, &a->str);
+	assert(a->nei.n >= 1);
+	ret = a->nei.n > 1? -1 : 0;
+	a->nei.n = 1; a->nei.a[0] = tmp; // recover the original neighbour
+	return ret;
+}
+
+static int unitig_unidir(aux_t *a, kstring_t *s, kstring_t *cov, int beg0, uint64_t k0, uint64_t *end, int *is_loop)
+{
+	int i, beg = beg0, rbeg, ori_l = s->l, n_reads = 0;
+	*is_loop = 0;
+	while ((rbeg = try_right(a, beg, s)) >= 0) { // loop if there is at least one overlap
+		uint64_t k;
+		if (a->nei.n > 1) { // forward bifurcation
+			set_bit(a->bend, *end);
+			break;
+		}
+		if ((k = a->nei.a[0].x[0]) == *end) break; // a loop like b>>c>>a><a; keep the link but stop extension
+		if (((a->bend[k>>6]>>(k&0x3f)&1) || check_left(a, beg, rbeg, s) < 0)) { // backward bifurcation
+			set_bit(a->bend, k);
+			break;
+		}
+		if (k == k0) { // a loop like a>>b>>c>>a
+			*is_loop = 1;
+			break;
+		}
+		if (a->nei.a[0].x[1] == *end) { // a loop like b>>c>>a>>a; cut the last link
+			a->nei.n = 0;
+			break;
+		}
+		if ((int)a->nei.a[0].info < a->min_merge_len) break; // the overlap is not long enough
+		*end = a->nei.a[0].x[1];
+		set_bits(a->used, &a->nei.a[0]); // successful extension
+		++n_reads;
+		if (cov->m < s->m) ks_resize(cov, s->m);
+		cov->l = s->l; cov->s[cov->l] = 0;
+		for (i = rbeg; i < ori_l; ++i) // update the coverage string
+			if (cov->s[i] != '~') ++cov->s[i];
+		for (i = ori_l; i < s->l; ++i) cov->s[i] = '"';
+		beg = rbeg; ori_l = s->l; a->a[0].n = a->a[1].n = 0; // prepare for the next round of loop
+	}
+	cov->l = s->l = ori_l; s->s[ori_l] = cov->s[ori_l] = 0;
+	return n_reads;
+}
+
+static void copy_nei(ku128_v *dst, const rldintv_v *src)
+{
+	int i;
+	for (i = 0; i < src->n; ++i) {
+		ku128_t z;
+		z.x = src->a[i].x[0]; z.y = src->a[i].info;
+		kv_push(ku128_t, *dst, z);
+	}
+}
+
+static int unitig1(aux_t *a, int64_t seed, kstring_t *s, kstring_t *cov, uint64_t end[2], ku128_v nei[2], int *n_reads)
+{
+	rldintv_t intv0;
+	int seed_len, ret, is_loop, contained;
+	int64_t k;
+	size_t i;
+
+	*n_reads = nei[0].n = nei[1].n = 0;
+	if (a->used[seed>>6]>>(seed&0x3f)&1) return -2; // used
+	// retrieve the sequence pointed by seed
+	k = fm6_retrieve(a->e, seed, s, &intv0, &contained);
+	seq_reverse(s->l, (uint8_t*)s->s);
+	seed_len = s->l;
+	// check contained status
+	if (intv0.x[2] > 1 && k != intv0.x[0]) return -3; // duplicated, but not the first
+	set_bits(a->used, &intv0);
+	if (contained) return -3; // contained
+	// check length, containment and if used before
+	if (s->l <= a->min_match) return -1; // too short
+	ret = fm6_is_contained(a->e, a->min_match, s, &intv0, &a->a[0]);
+	*n_reads = 1;
+	// initialize the coverage string
+	if (cov->m < s->m) ks_resize(cov, s->m);
+	cov->l = s->l; cov->s[cov->l] = 0;
+	for (i = 0; i < cov->l; ++i) cov->s[i] = '"';
+	// left-wards extension
+	end[0] = intv0.x[1]; end[1] = intv0.x[0];
+	if (a->a[0].n) { // no need to extend to the right if there is no overlap
+		*n_reads += unitig_unidir(a, s, cov, 0, intv0.x[0], &end[0], &is_loop);
+		copy_nei(&nei[0], &a->nei);
+		if (is_loop) {
+			ku128_t z;
+			z.x = end[0]; z.y = a->nei.a[0].info;
+			kv_push(ku128_t, nei[1], z);
+			return 0;
+		}
+	}
+	// right-wards extension
+	a->a[0].n = a->a[1].n = a->nei.n = 0;
+	seq_revcomp6(s->l, (uint8_t*)s->s); // reverse complement for extension in the other direction
+	seq_reverse(cov->l, (uint8_t*)cov->s); // reverse the coverage
+	*n_reads += unitig_unidir(a, s, cov, s->l - seed_len, intv0.x[1], &end[1], &is_loop);
+	copy_nei(&nei[1], &a->nei);
+	return 0;
+}
+
+typedef struct {
+	long max_l;
+	aux_t a;
+	kstring_t str, cov;
+	magv_t z;
+	magv_v v;
+} thrdat_t;
+
+typedef struct {
+	uint64_t prime, *used, *bend, *visited;
+	const rld_t *e;
+	thrdat_t *d;
+} worker_t;
+
+static void worker(void *data, long _i, int tid)
+{
+	worker_t *w = (worker_t*)data;
+	thrdat_t *d = &w->d[tid];
+	uint64_t i = (w->prime * _i) % w->e->mcnt[1];
+	if (unitig1(&d->a, i, &d->str, &d->cov, d->z.k, d->z.nei, &d->z.nsr) >= 0) { // then we keep the unitig
+		uint64_t *p[2], x[2];
+		magv_t *q;
+		p[0] = w->visited + (d->z.k[0]>>6); x[0] = 1LLU<<(d->z.k[0]&0x3f);
+		p[1] = w->visited + (d->z.k[1]>>6); x[1] = 1LLU<<(d->z.k[1]&0x3f);
+		if ((__sync_fetch_and_or(p[0], x[0])&x[0]) || (__sync_fetch_and_or(p[1], x[1])&x[1])) return;
+		d->z.len = d->str.l;
+		if (d->max_l < d->str.m) {
+			d->max_l = d->str.m;
+			d->z.seq = realloc(d->z.seq, d->max_l);
+			d->z.cov = realloc(d->z.cov, d->max_l);
+		}
+		memcpy(d->z.seq, d->str.s, d->z.len);
+		memcpy(d->z.cov, d->cov.s, d->z.len + 1);
+		kv_pushp(magv_t, d->v, &q);
+		mag_v_copy_to_empty(q, &d->z);
+	}
+}
+
+mag_t *fml_fmi2mag_core(const rld_t *e, int min_match, int min_merge_len, int n_threads)
+{
+	extern void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n);
+	worker_t w;
+	int j;
+	mag_t *g;
+
+	w.used    = (uint64_t*)calloc((e->mcnt[1] + 63)/64, 8);
+	w.bend    = (uint64_t*)calloc((e->mcnt[1] + 63)/64, 8);
+	w.visited = (uint64_t*)calloc((e->mcnt[1] + 63)/64, 8);
+	w.e       = e;
+	assert(e->mcnt[1] >= n_threads * 2);
+	w.d = calloc(n_threads, sizeof(thrdat_t));
+	w.prime = 0;
+	for (j = 0; utg_primes[j] > 0; ++j)
+		if (e->mcnt[1] % utg_primes[j] != 0) {
+			w.prime = utg_primes[j];
+			break;
+		}
+	assert(w.prime);
+	for (j = 0; j < n_threads; ++j) {
+		w.d[j].a.e = e; w.d[j].a.min_match = min_match; w.d[j].a.min_merge_len = min_merge_len;
+		w.d[j].a.used = w.used; w.d[j].a.bend = w.bend;
+	}
+	kt_for(n_threads, worker, &w, e->mcnt[1]);
+	g = (mag_t*)calloc(1, sizeof(mag_t));
+	for (j = 0; j < n_threads; ++j) {
+		kv_resize(magv_t, g->v, g->v.n + w.d[j].v.n);
+		memcpy(g->v.a + g->v.n, w.d[j].v.a, w.d[j].v.n * sizeof(magv_t));
+		g->v.n += w.d[j].v.n;
+		free(w.d[j].v.a);
+		free(w.d[j].a.a[0].a); free(w.d[j].a.a[1].a); free(w.d[j].a.nei.a); free(w.d[j].a.cat.a);
+		free(w.d[j].z.nei[0].a); free(w.d[j].z.nei[1].a); free(w.d[j].z.seq); free(w.d[j].z.cov);
+		free(w.d[j].a.str.s); free(w.d[j].str.s); free(w.d[j].cov.s);
+	}
+	free(w.d); free(w.used); free(w.bend); free(w.visited);
+
+	mag_g_build_hash(g);
+	mag_g_amend(g);
+	g->rdist = mag_cal_rdist(g);
+	return g;
+}
+
+mag_t *fml_fmi2mag(const fml_opt_t *opt, rld_t *e)
+{
+	mag_t *g;
+	g = fml_fmi2mag_core(e, opt->min_asm_ovlp, opt->min_merge_len, opt->n_threads);
+	rld_destroy(e);
+	return g;
+}