From: Andreas Tille Date: Sat, 14 Dec 2024 18:47:40 +0000 (+0100) Subject: Import fermi-lite_0.1+git20221215.85f159e.orig.tar.xz X-Git-Tag: archive/raspbian/0.1+git20221215.85f159e-1+rpi1^2~9 X-Git-Url: https://dgit.raspbian.org/?a=commitdiff_plain;h=8e09405be7caab1d1db1e89ad5071ebbc42cb9b8;p=fermi-lite.git Import fermi-lite_0.1+git20221215.85f159e.orig.tar.xz [dgit import orig fermi-lite_0.1+git20221215.85f159e.orig.tar.xz] --- 8e09405be7caab1d1db1e89ad5071ebbc42cb9b8 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..282c932 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +*.o +*.a +.*.swp diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..d423a43 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,23 @@ +The MIT License + +Copyright (c) 2016 Broad Institute + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..edda713 --- /dev/null +++ b/Makefile @@ -0,0 +1,46 @@ +CC= gcc +CFLAGS= -g -Wall -O2 -Wno-unused-function #-fno-inline-functions -fno-inline-functions-called-once +CPPFLAGS= +INCLUDES= +OBJS= kthread.o misc.o \ + bseq.o htab.o bfc.o \ + rle.o rope.o mrope.o rld0.o \ + unitig.o mag.o bubble.o ksw.o +PROG= fml-asm +LIBS= -lm -lz -lpthread + +.SUFFIXES:.c .o + +.c.o: + $(CC) -c $(CFLAGS) $(CPPFLAGS) $(INCLUDES) $< -o $@ + +all:$(PROG) + +fml-asm:libfml.a example.o + $(CC) $(CFLAGS) $^ -o $@ -L. -lfml $(LIBS) + +libfml.a:$(OBJS) + $(AR) -csru $@ $(OBJS) + +clean: + rm -fr gmon.out *.o ext/*.o a.out $(PROG) *~ *.a *.dSYM session* + +depend: + (LC_ALL=C; export LC_ALL; makedepend -Y -- $(CFLAGS) $(DFLAGS) -- *.c) + +# DO NOT DELETE + +bfc.o: htab.h kmer.h internal.h fml.h kvec.h ksort.h +bseq.o: fml.h kseq.h +bubble.o: mag.h kstring.h fml.h kvec.h ksw.h internal.h khash.h +example.o: fml.h +htab.o: htab.h kmer.h khash.h +ksw.o: ksw.h +mag.o: mag.h kstring.h fml.h kvec.h internal.h kseq.h khash.h ksort.h +misc.o: internal.h fml.h kstring.h rle.h mrope.h rope.h rld0.h mag.h kvec.h +misc.o: htab.h kmer.h khash.h +mrope.o: mrope.h rope.h +rld0.o: rld0.h +rle.o: rle.h +rope.o: rle.h rope.h +unitig.o: kvec.h kstring.h rld0.h mag.h fml.h internal.h ksort.h diff --git a/README.md b/README.md new file mode 100644 index 0000000..6dc7ff9 --- /dev/null +++ b/README.md @@ -0,0 +1,96 @@ +## Getting Started +```sh +git clone https://github.com/lh3/fermi-lite +cd fermi-lite && make +./fml-asm test/MT-simu.fq.gz > MT.fq +# to compile your program: +gcc -Wall -O2 prog.c -o prog -L/path/to/fermi-lite -lfml -lz -lm -lpthread +``` + +## Introduction + +Fermi-lite is a standalone C library as well as a command-line tool for +assembling Illumina short reads in regions from 100bp to 10 million bp in size. +It is largely a light-weight in-memory version of [fermikit][fk] without +generating any intermediate files. It inherits the performance, the relatively +small memory footprint and the features of fermikit. In particular, fermi-lite +is able to retain heterozygous events and thus can be used to assemble diploid +regions for the purpose of variant calling. It is one of the limited choices +for local re-assembly and arguably the easiest to interface. + +If you use fermi-lite in your work, please cite the FermiKit paper: + +> Li H (2015) FermiKit: assembly-based variant calling for Illumina +> resequencing data, *Bioinformatics*, **31**:3694-6. + +## Usage + +For now, see [example.c][example] for the basic use of the library. Here is a +sketch of the example: +```cpp +#include // for printf() +#include "fml.h" // only one header file required + +int main(int argc, char *argv[]) +{ + int i, n_seqs, n_utgs; + bseq1_t *seqs; // array of input sequences + fml_utg_t *utgs; // array of output unitigs + fml_opt_t opt; + if (argc == 1) return 1; // do nothing if there is no input file + seqs = bseq_read(argv[1], &n_seqs); // or fill the array with callers' functions + fml_opt_init(&opt); // initialize parameters + utgs = fml_assemble(&opt, n_seqs, seqs, &n_utgs); // assemble! + for (i = 0; i < n_utgs; ++i) // output in fasta + printf(">%d\n%s\n", i+1, utgs[i].seq); + fml_utg_destroy(n_utgs, utgs); // deallocate unitigs + return 0; +} +``` +The `fml_assemble()` output is in fact a graph. You may have a look at the +`fml_utg_print_gfa()` function in [misc.c][misc] about how to derive a +[GFA][gfa] representation from an array of `fml_utg_t` objects. + +## Overview of the Assembly Algorithm + +Fermi-lite is an overlap-based assembler. Given a set of input reads, it counts +*k*-mers, estimates the *k*-mer coverage, sets a threshold on *k*-mer +occurrences to determine solid *k*-mers and then use them correct sequencing +errors ([Li, 2015][bfc-paper]). After error correction, fermi-lite trims a read +at an *l*-mer unique to the read. It then constructs an FM-index for trimmed +reads ([Li, 2014][rb2-paper]) and builds a transitively reduced overlap graph from the +FM-index ([Simpson and Durbin, 2010][sga-paper]; [Li, 2012][fm1-paper]), +requiring at least *l*-bp overlaps. In this graph, fermi-lite trims tips and +pops bubbles caused by uncorrected errors. If a sequence in the graph has +multiple overlaps, fermi-lite discards overlaps significantly shorter than the +longest overlap -- this is a technique applied to overlap graph only. The graph +after these procedure is the final output. Sequences in this graph are unitigs. + +## Limitations + +1. Fermi-lite can efficiently assemble bacterial genomes. However, it has not + been carefully tuned for this type of assembly. While on a few GAGE-B data + sets fermi-lite appears to work well, it may not compete with recent + mainstream assemblers in general. + +2. Fermi-lite does not work with genomes more than tens of megabases as a + whole. It would take too much memory to stage all data in memory. For large + genomes, please use [fermikit][fk] instead. + +3. This is the first iteration of fermi-lite. It is still immarture. In + particular, I hope fermi-lite can be smart enough to automatically figure + out various parameters based on input, which is very challenging given the + high variability of input data. + +[sga-paper]: http://www.ncbi.nlm.nih.gov/pubmed/20529929 +[bfc-paper]: http://www.ncbi.nlm.nih.gov/pubmed/25953801 +[rb2-paper]: http://www.ncbi.nlm.nih.gov/pubmed/25107872 +[fm1-paper]: http://www.ncbi.nlm.nih.gov/pubmed/22569178 +[bfc]: http://github.com/lh3/bfc +[rb2]: http://github.com/lh3/ropebwt2 +[fm2]: http://github.com/lh3/fermi2 +[fk]: http://github.com/lh3/fermikit +[example]: https://github.com/lh3/fermi-lite/blob/master/example.c +[header]: https://github.com/lh3/fermi-lite/blob/master/fml.h +[misc]: https://github.com/lh3/fermi-lite/blob/master/misc.c +[gfa]: https://github.com/pmelsted/GFA-spec diff --git a/bfc.c b/bfc.c new file mode 100644 index 0000000..27ccfde --- /dev/null +++ b/bfc.c @@ -0,0 +1,674 @@ +#include +#include +#include +#include +#include +#include "htab.h" +#include "kmer.h" +#include "internal.h" +#include "fml.h" + +/******************* + *** BFC options *** + *******************/ + +typedef struct { + int n_threads, q, k, l_pre; + int min_cov; // a k-mer is considered solid if the count is no less than this + + int max_end_ext; + int win_multi_ec; + float min_trim_frac; + + // these ec options cannot be changed on the command line + int w_ec, w_ec_high, w_absent, w_absent_high; + int max_path_diff, max_heap; +} bfc_opt_t; + +void bfc_opt_init(bfc_opt_t *opt) +{ + memset(opt, 0, sizeof(bfc_opt_t)); + opt->n_threads = 1; + opt->q = 20; + opt->k = -1; + opt->l_pre = -1; + + opt->min_cov = 4; // in BFC, this defaults to 3 because it has Bloom pre-filter + opt->win_multi_ec = 10; + opt->max_end_ext = 5; + opt->min_trim_frac = .8; + + opt->w_ec = 1; + opt->w_ec_high = 7; + opt->w_absent = 3; + opt->w_absent_high = 1; + opt->max_path_diff = 15; + opt->max_heap = 100; +} + +/********************** + *** K-mer counting *** + **********************/ + +#define CNT_BUF_SIZE 256 + +typedef struct { // cache to reduce locking + uint64_t y[2]; + int is_high; +} insbuf_t; + +typedef struct { + int k, q; + int n_seqs; + const bseq1_t *seqs; + bfc_ch_t *ch; + int *n_buf; + insbuf_t **buf; +} cnt_step_t; + +bfc_kmer_t bfc_kmer_null = {{0,0,0,0}}; + +static int bfc_kmer_bufclear(cnt_step_t *cs, int forced, int tid) +{ + int i, k, r; + if (cs->ch == 0) return 0; + for (i = k = 0; i < cs->n_buf[tid]; ++i) { + r = bfc_ch_insert(cs->ch, cs->buf[tid][i].y, cs->buf[tid][i].is_high, forced); + if (r < 0) cs->buf[tid][k++] = cs->buf[tid][i]; + } + cs->n_buf[tid] = k; + return k; +} + +static void bfc_kmer_insert(cnt_step_t *cs, const bfc_kmer_t *x, int is_high, int tid) +{ + int k = cs->k; + uint64_t y[2], hash; + hash = bfc_kmer_hash(k, x->x, y); + if (bfc_ch_insert(cs->ch, y, is_high, 0) < 0) { + insbuf_t *p; + if (bfc_kmer_bufclear(cs, 0, tid) == CNT_BUF_SIZE) + bfc_kmer_bufclear(cs, 1, tid); + p = &cs->buf[tid][cs->n_buf[tid]++]; + p->y[0] = y[0], p->y[1] = y[1], p->is_high = is_high; + } +} + +static void worker_count(void *_data, long k, int tid) +{ + cnt_step_t *cs = (cnt_step_t*)_data; + const bseq1_t *s = &cs->seqs[k]; + int i, l; + bfc_kmer_t x = bfc_kmer_null; + uint64_t qmer = 0, mask = (1ULL<k) - 1; + for (i = l = 0; i < s->l_seq; ++i) { + int c = seq_nt6_table[(uint8_t)s->seq[i]] - 1; + if (c < 4) { + bfc_kmer_append(cs->k, x.x, c); + qmer = (qmer<<1 | (s->qual == 0 || s->qual[i] - 33 >= cs->q)) & mask; + if (++l >= cs->k) bfc_kmer_insert(cs, &x, (qmer == mask), tid); + } else l = 0, qmer = 0, x = bfc_kmer_null; + } +} + +struct bfc_ch_s *fml_count(int n, const bseq1_t *seq, int k, int q, int l_pre, int n_threads) +{ + int i; + cnt_step_t cs; + cs.n_seqs = n, cs.seqs = seq, cs.k = k, cs.q = q; + cs.ch = bfc_ch_init(cs.k, l_pre); + cs.n_buf = calloc(n_threads, sizeof(int)); + cs.buf = calloc(n_threads, sizeof(void*)); + for (i = 0; i < n_threads; ++i) + cs.buf[i] = malloc(CNT_BUF_SIZE * sizeof(insbuf_t)); + kt_for(n_threads, worker_count, &cs, cs.n_seqs); + for (i = 0; i < n_threads; ++i) free(cs.buf[i]); + free(cs.buf); free(cs.n_buf); + return cs.ch; +} + +/*************** + *** Correct *** + ***************/ + +#define BFC_MAX_KMER 63 +#define BFC_MAX_BF_SHIFT 37 + +#define BFC_MAX_PATHS 4 +#define BFC_EC_HIST 5 +#define BFC_EC_HIST_HIGH 2 + +#define BFC_EC_MIN_COV_COEF .1 + +/************************** + * Sequence struct for ec * + **************************/ + +#include "kvec.h" + +typedef struct { // NOTE: unaligned memory + uint8_t b:3, q:1, ob:3, oq:1; + uint8_t dummy; + uint16_t lcov:6, hcov:6, solid_end:1, high_end:1, ec:1, absent:1; + int i; +} ecbase_t; + +typedef kvec_t(ecbase_t) ecseq_t; + +static int bfc_seq_conv(const char *s, const char *q, int qthres, ecseq_t *seq) +{ + int i, l; + l = strlen(s); + kv_resize(ecbase_t, *seq, l); + seq->n = l; + for (i = 0; i < l; ++i) { + ecbase_t *c = &seq->a[i]; + c->b = c->ob = seq_nt6_table[(int)s[i]] - 1; + c->q = c->oq = !q? 1 : q[i] - 33 >= qthres? 1 : 0; + if (c->b > 3) c->q = c->oq = 0; + c->i = i; + } + return l; +} + +static inline ecbase_t ecbase_comp(const ecbase_t *b) +{ + ecbase_t r = *b; + r.b = b->b < 4? 3 - b->b : 4; + r.ob = b->ob < 4? 3 - b->ob : 4; + return r; +} + +static void bfc_seq_revcomp(ecseq_t *seq) +{ + int i; + for (i = 0; i < seq->n>>1; ++i) { + ecbase_t tmp; + tmp = ecbase_comp(&seq->a[i]); + seq->a[i] = ecbase_comp(&seq->a[seq->n - 1 - i]); + seq->a[seq->n - 1 - i] = tmp; + } + if (seq->n&1) seq->a[i] = ecbase_comp(&seq->a[i]); +} + +/*************************** + * Independent ec routines * + ***************************/ + +int bfc_ec_greedy_k(int k, int mode, const bfc_kmer_t *x, const bfc_ch_t *ch) +{ + int i, j, max = 0, max_ec = -1, max2 = 0; + for (i = 0; i < k; ++i) { + int c = (x->x[1]>>i&1)<<1 | (x->x[0]>>i&1); + for (j = 0; j < 4; ++j) { + bfc_kmer_t y = *x; + int ret; + if (j == c) continue; + bfc_kmer_change(k, y.x, i, j); + ret = bfc_ch_kmer_occ(ch, &y); + if (ret < 0) continue; + if ((max&0xff) < (ret&0xff)) max2 = max, max = ret, max_ec = i<<2 | j; + else if ((max2&0xff) < (ret&0xff)) max2 = ret; + } + } + return (max&0xff) * 3 > mode && (max2&0xff) < 3? max_ec : -1; +} + +int bfc_ec_first_kmer(int k, const ecseq_t *s, int start, bfc_kmer_t *x) +{ + int i, l; + *x = bfc_kmer_null; + for (i = start, l = 0; i < s->n; ++i) { + ecbase_t *c = &s->a[i]; + if (c->b < 4) { + bfc_kmer_append(k, x->x, c->b); + if (++l == k) break; + } else l = 0, *x = bfc_kmer_null; + } + return i; +} + +void bfc_ec_kcov(int k, int min_occ, ecseq_t *s, const bfc_ch_t *ch) +{ + int i, l, r, j; + bfc_kmer_t x = bfc_kmer_null; + for (i = l = 0; i < s->n; ++i) { + ecbase_t *c = &s->a[i]; + c->high_end = c->solid_end = c->lcov = c->hcov = 0; + if (c->b < 4) { + bfc_kmer_append(k, x.x, c->b); + if (++l >= k) { + if ((r = bfc_ch_kmer_occ(ch, &x)) >= 0) { + if ((r>>8&0x3f) >= min_occ+1) c->high_end = 1; + if ((r&0xff) >= min_occ) { + c->solid_end = 1; + for (j = i - k + 1; j <= i; ++j) + ++s->a[j].lcov, s->a[j].hcov += c->high_end; + } + } + } + } else l = 0, x = bfc_kmer_null; + } +} + +uint64_t bfc_ec_best_island(int k, const ecseq_t *s) +{ // IMPORTANT: call bfc_ec_kcov() before calling this function! + int i, l, max, max_i; + for (i = k - 1, max = l = 0, max_i = -1; i < s->n; ++i) { + if (!s->a[i].solid_end) { + if (l > max) max = l, max_i = i; + l = 0; + } else ++l; + } + if (l > max) max = l, max_i = i; + return max > 0? (uint64_t)(max_i - max - k + 1) << 32 | max_i : 0; +} + +/******************** + * Correct one read * + ********************/ + +#include "ksort.h" + +#define ECCODE_MISC 1 +#define ECCODE_MANY_N 2 +#define ECCODE_NO_SOLID 3 +#define ECCODE_UNCORR_N 4 +#define ECCODE_MANY_FAIL 5 + +typedef struct { + uint32_t ec_code:3, brute:1, n_ec:14, n_ec_high:14; + uint32_t n_absent:24, max_heap:8; +} ecstat_t; + +typedef struct { + uint8_t ec:1, ec_high:1, absent:1, absent_high:1, b:4; +} bfc_penalty_t; + +typedef struct { + int tot_pen; + int i; // base position + int k; // position in the stack + int32_t ecpos_high[BFC_EC_HIST_HIGH]; + int32_t ecpos[BFC_EC_HIST]; + bfc_kmer_t x; +} echeap1_t; + +typedef struct { + int parent, i, tot_pen; + uint8_t b; + bfc_penalty_t pen; + uint16_t cnt; +} ecstack1_t; + +typedef struct { + const bfc_opt_t *opt; + const bfc_ch_t *ch; + kvec_t(echeap1_t) heap; + kvec_t(ecstack1_t) stack; + ecseq_t seq, tmp, ec[2]; + int mode; + ecstat_t ori_st; +} bfc_ec1buf_t; + +#define heap_lt(a, b) ((a).tot_pen > (b).tot_pen) +KSORT_INIT(ec, echeap1_t, heap_lt) + +static bfc_ec1buf_t *ec1buf_init(const bfc_opt_t *opt, const bfc_ch_t *ch) +{ + bfc_ec1buf_t *e; + e = calloc(1, sizeof(bfc_ec1buf_t)); + e->opt = opt, e->ch = ch; + return e; +} + +static void ec1buf_destroy(bfc_ec1buf_t *e) +{ + free(e->heap.a); free(e->stack.a); free(e->seq.a); free(e->tmp.a); free(e->ec[0].a); free(e->ec[1].a); + free(e); +} + +#define weighted_penalty(o, p) ((o)->w_ec * (p).ec + (o)->w_ec_high * (p).ec_high + (o)->w_absent * (p).absent + (o)->w_absent_high * (p).absent_high) + +static void buf_update(bfc_ec1buf_t *e, const echeap1_t *prev, bfc_penalty_t pen, int cnt) +{ + ecstack1_t *q; + echeap1_t *r; + const bfc_opt_t *o = e->opt; + int b = pen.b; + // update stack + kv_pushp(ecstack1_t, e->stack, &q); + q->parent = prev->k; + q->i = prev->i; + q->b = b; + q->pen = pen; + q->cnt = cnt > 0? cnt&0xff : 0; + q->tot_pen = prev->tot_pen + weighted_penalty(o, pen); + // update heap + kv_pushp(echeap1_t, e->heap, &r); + r->i = prev->i + 1; + r->k = e->stack.n - 1; + r->x = prev->x; + if (pen.ec_high) { + memcpy(r->ecpos_high + 1, prev->ecpos_high, (BFC_EC_HIST_HIGH - 1) * 4); + r->ecpos_high[0] = prev->i; + } else memcpy(r->ecpos_high, prev->ecpos_high, BFC_EC_HIST_HIGH * 4); + if (pen.ec) { + memcpy(r->ecpos + 1, prev->ecpos, (BFC_EC_HIST - 1) * 4); + r->ecpos[0] = prev->i; + } else memcpy(r->ecpos, prev->ecpos, BFC_EC_HIST * 4); + r->tot_pen = q->tot_pen; + bfc_kmer_append(e->opt->k, r->x.x, b); + ks_heapup_ec(e->heap.n, e->heap.a); +} + +static int buf_backtrack(ecstack1_t *s, int end, const ecseq_t *seq, ecseq_t *path) +{ + int i, n_absent = 0; + kv_resize(ecbase_t, *path, seq->n); + path->n = seq->n; + while (end >= 0) { + if ((i = s[end].i) < seq->n) { + path->a[i].b = s[end].b; + path->a[i].ec = s[end].pen.ec; + path->a[i].absent = s[end].pen.absent; + n_absent += s[end].pen.absent; + } + end = s[end].parent; + } + return n_absent; +} + +static int bfc_ec1dir(bfc_ec1buf_t *e, const ecseq_t *seq, ecseq_t *ec, int start, int end, int *max_heap) +{ + echeap1_t z; + int i, l, rv = -1, path[BFC_MAX_PATHS], n_paths = 0, min_path = -1, min_path_pen = INT_MAX, n_failures = 0; + assert(end <= seq->n && end - start >= e->opt->k); + e->heap.n = e->stack.n = 0; + *max_heap = 0; + memset(&z, 0, sizeof(echeap1_t)); + kv_resize(ecbase_t, *ec, seq->n); + ec->n = seq->n; + for (z.i = start, l = 0; z.i < end; ++z.i) { + int c = seq->a[z.i].b; + if (c < 4) { + if (++l == e->opt->k) break; + bfc_kmer_append(e->opt->k, z.x.x, c); + } else l = 0, z.x = bfc_kmer_null; + } + assert(z.i < end); // before calling this function, there must be at least one solid k-mer + z.k = -1; + for (i = 0; i < BFC_EC_HIST; ++i) z.ecpos[i] = -1; + for (i = 0; i < BFC_EC_HIST_HIGH; ++i) z.ecpos_high[i] = -1; + kv_push(echeap1_t, e->heap, z); + for (i = 0; i < seq->n; ++i) ec->a[i].b = seq->a[i].b, ec->a[i].ob = seq->a[i].ob; + // exhaustive error correction + while (1) { + int stop = 0; + *max_heap = *max_heap > 255? 255 : *max_heap > e->heap.n? *max_heap : e->heap.n; + if (e->heap.n == 0) { // may happen when there is an uncorrectable "N" + rv = -2; + break; + } + z = e->heap.a[0]; + e->heap.a[0] = kv_pop(e->heap); + ks_heapdown_ec(0, e->heap.n, e->heap.a); + if (min_path >= 0 && z.tot_pen > min_path_pen + e->opt->max_path_diff) break; + if (z.i - end > e->opt->max_end_ext) stop = 1; + if (!stop) { + ecbase_t *c = z.i < seq->n? &seq->a[z.i] : 0; + int b, os = -1, fixed = 0, other_ext = 0, n_added = 0, added_cnt[4]; + bfc_penalty_t added[4]; + // test if the read extension alone is enough + if (z.i > end) fixed = 1; + if (c && c->b < 4) { // A, C, G or T + bfc_kmer_t x = z.x; + bfc_kmer_append(e->opt->k, x.x, c->b); + os = bfc_ch_kmer_occ(e->ch, &x); + if (c->q && (os&0xff) >= e->opt->min_cov + 1 && c->lcov >= e->opt->min_cov + 1) fixed = 1; + else if (c->hcov > e->opt->k * .75) fixed = 1; + } + // extension + for (b = 0; b < 4; ++b) { + bfc_penalty_t pen; + if (fixed && c && b != c->b) continue; + if (c == 0 || b != c->b) { + int s; + bfc_kmer_t x = z.x; + pen.ec = 0, pen.ec_high = 0, pen.absent = 0, pen.absent_high = 0, pen.b = b; + if (c) { // not over the end + if (c->q && z.ecpos_high[BFC_EC_HIST_HIGH-1] >= 0 && z.i - z.ecpos_high[BFC_EC_HIST_HIGH-1] < e->opt->win_multi_ec) continue; // no close highQ corrections + if (z.ecpos[BFC_EC_HIST-1] >= 0 && z.i - z.ecpos[BFC_EC_HIST-1] < e->opt->win_multi_ec) continue; // no clustered corrections + } + bfc_kmer_append(e->opt->k, x.x, b); + s = bfc_ch_kmer_occ(e->ch, &x); + if (s < 0 || (s&0xff) < e->opt->min_cov) continue; // not solid + //if (os >= 0 && (s&0xff) - (os&0xff) < 2) continue; // not sufficiently better than the read path + pen.ec = c && c->b < 4? 1 : 0; + pen.ec_high = pen.ec? c->oq : 0; + pen.absent = 0; + pen.absent_high = ((s>>8&0xff) < e->opt->min_cov); + pen.b = b; + added_cnt[n_added] = s; + added[n_added++] = pen; + ++other_ext; + } else { + pen.ec = pen.ec_high = 0; + pen.absent = (os < 0 || (os&0xff) < e->opt->min_cov); + pen.absent_high = (os < 0 || (os>>8&0xff) < e->opt->min_cov); + pen.b = b; + added_cnt[n_added] = os; + added[n_added++] = pen; + } + } // ~for(b) + if (fixed == 0 && other_ext == 0) ++n_failures; + if (n_failures > seq->n * 2) { + rv = -3; + break; + } + if (c || n_added == 1) { + if (n_added > 1 && e->heap.n > e->opt->max_heap) { // to prevent heap explosion + int min_b = -1, min = INT_MAX; + for (b = 0; b < n_added; ++b) { + int t = weighted_penalty(e->opt, added[b]); + if (min > t) min = t, min_b = b; + } + buf_update(e, &z, added[min_b], added_cnt[min_b]); + } else { + for (b = 0; b < n_added; ++b) + buf_update(e, &z, added[b], added_cnt[b]); + } + } else { + if (n_added == 0) + e->stack.a[z.k].tot_pen += e->opt->w_absent * (e->opt->max_end_ext - (z.i - end)); + stop = 1; + } + } // ~if(!stop) + if (stop) { + if (e->stack.a[z.k].tot_pen < min_path_pen) + min_path_pen = e->stack.a[z.k].tot_pen, min_path = n_paths; + path[n_paths++] = z.k; + if (n_paths == BFC_MAX_PATHS) break; + } + } // ~while(1) + // backtrack + if (n_paths == 0) return rv; + assert(min_path >= 0 && min_path < n_paths && e->stack.a[path[min_path]].tot_pen == min_path_pen); + rv = buf_backtrack(e->stack.a, path[min_path], seq, ec); + for (i = 0; i < ec->n; ++i) // mask out uncorrected regions + if (i < start + e->opt->k || i >= end) ec->a[i].b = 4; + return rv; +} + +ecstat_t bfc_ec1(bfc_ec1buf_t *e, char *seq, char *qual) +{ + int i, start = 0, end = 0, n_n = 0, rv[2], max_heap[2]; + uint64_t r; + ecstat_t s; + + s.ec_code = ECCODE_MISC, s.brute = 0, s.n_ec = s.n_ec_high = 0, s.n_absent = s.max_heap = 0; + bfc_seq_conv(seq, qual, e->opt->q, &e->seq); + for (i = 0; i < e->seq.n; ++i) + if (e->seq.a[i].ob > 3) ++n_n; + if (n_n > e->seq.n * .05) { + s.ec_code = ECCODE_MANY_N; + return s; + } + bfc_ec_kcov(e->opt->k, e->opt->min_cov, &e->seq, e->ch); + r = bfc_ec_best_island(e->opt->k, &e->seq); + if (r == 0) { // no solid k-mer + bfc_kmer_t x; + int ec = -1; + while ((end = bfc_ec_first_kmer(e->opt->k, &e->seq, start, &x)) < e->seq.n) { + ec = bfc_ec_greedy_k(e->opt->k, e->mode, &x, e->ch); + if (ec >= 0) break; + if (end + (e->opt->k>>1) >= e->seq.n) break; + start = end - (e->opt->k>>1); + } + if (ec >= 0) { + e->seq.a[end - (ec>>2)].b = ec&3; + ++end; start = end - e->opt->k; + s.brute = 1; + } else { + s.ec_code = ECCODE_NO_SOLID; + return s; + } + } else start = r>>32, end = (uint32_t)r; + if ((rv[0] = bfc_ec1dir(e, &e->seq, &e->ec[0], start, e->seq.n, &max_heap[0])) < 0) { + s.ec_code = rv[0] == -2? ECCODE_UNCORR_N : rv[0] == -3? ECCODE_MANY_FAIL : ECCODE_MISC; + return s; + } + bfc_seq_revcomp(&e->seq); + if ((rv[1] = bfc_ec1dir(e, &e->seq, &e->ec[1], e->seq.n - end, e->seq.n, &max_heap[1])) < 0) { + s.ec_code = rv[1] == -2? ECCODE_UNCORR_N : rv[1] == -3? ECCODE_MANY_FAIL : ECCODE_MISC; + return s; + } + s.max_heap = max_heap[0] > max_heap[1]? max_heap[0] : max_heap[1]; + s.ec_code = 0, s.n_absent = rv[0] + rv[1]; + bfc_seq_revcomp(&e->ec[1]); + bfc_seq_revcomp(&e->seq); + for (i = 0; i < e->seq.n; ++i) { + ecbase_t *c = &e->seq.a[i]; + if (e->ec[0].a[i].b == e->ec[1].a[i].b) + c->b = e->ec[0].a[i].b > 3? e->seq.a[i].b : e->ec[0].a[i].b; + else if (e->ec[1].a[i].b > 3) c->b = e->ec[0].a[i].b; + else if (e->ec[0].a[i].b > 3) c->b = e->ec[1].a[i].b; + else c->b = e->seq.a[i].ob; + } + for (i = 0; i < e->seq.n; ++i) { + int is_diff = !(e->seq.a[i].b == e->seq.a[i].ob); + if (is_diff) { + ++s.n_ec; + if (e->seq.a[i].q) ++s.n_ec_high; + } + seq[i] = (is_diff? "acgtn" : "ACGTN")[e->seq.a[i].b]; + if (qual) qual[i] = is_diff? 34 + e->seq.a[i].ob : "+?"[e->seq.a[i].q]; + } + return s; +} + +/******************** + * Error correction * + ********************/ + +typedef struct { + const bfc_opt_t *opt; + const bfc_ch_t *ch; + bfc_ec1buf_t **e; + int64_t n_processed; + int n_seqs, flt_uniq; + bseq1_t *seqs; +} ec_step_t; + +static uint64_t max_streak(int k, const bfc_ch_t *ch, const bseq1_t *s) +{ + int i, l; + uint64_t max = 0, t = 0; + bfc_kmer_t x = bfc_kmer_null; + for (i = l = 0; i < s->l_seq; ++i) { + int c = seq_nt6_table[(uint8_t)s->seq[i]] - 1; + if (c < 4) { // not an ambiguous base + bfc_kmer_append(k, x.x, c); + if (++l >= k) { // ok, we have a k-mer now + if (bfc_ch_kmer_occ(ch, &x) > 0) t += 1ULL<<32; + else t = i + 1; + } else t = i + 1; + } else l = 0, x = bfc_kmer_null, t = i + 1; + max = max > t? max : t; + } + return max; +} + +static void worker_ec(void *_data, long k, int tid) +{ + ec_step_t *es = (ec_step_t*)_data; + bseq1_t *s = &es->seqs[k]; + if (es->flt_uniq) { + uint64_t max; + max = max_streak(es->opt->k, es->ch, s); + if (max>>32 && (double)((max>>32) + es->opt->k - 1) / s->l_seq > es->opt->min_trim_frac) { + int start = (uint32_t)max, end = start + (max>>32); + start -= es->opt->k - 1; + assert(start >= 0 && end <= s->l_seq); + memmove(s->seq, s->seq + start, end - start); + s->l_seq = end - start; + s->seq[s->l_seq] = 0; + if (s->qual) { + memmove(s->qual, s->qual + start, s->l_seq); + s->qual[s->l_seq] = 0; + } + } else { + free(s->seq); free(s->qual); + s->l_seq = 0, s->seq = s->qual = 0; + } + } else bfc_ec1(es->e[tid], s->seq, s->qual); +} + +float fml_correct_core(const fml_opt_t *opt, int flt_uniq, int n, bseq1_t *seq) +{ + bfc_ch_t *ch; + int i, mode; + uint64_t hist[256], hist_high[64], tot_len = 0, sum_k = 0, tot_k = 0; + ec_step_t es; + bfc_opt_t bfc_opt; + float kcov; + + // initialize BFC options + bfc_opt_init(&bfc_opt); + bfc_opt.n_threads = opt->n_threads; // copy from FML options + bfc_opt.k = flt_uniq? opt->min_asm_ovlp : opt->ec_k; + for (i = 0; i < n; ++i) tot_len += seq[i].l_seq; // compute total length + bfc_opt.l_pre = tot_len - 8 < 20? tot_len - 8 : 20; + + memset(&es, 0, sizeof(ec_step_t)); + es.opt = &bfc_opt, es.n_seqs = n, es.seqs = seq, es.flt_uniq = flt_uniq; + + es.ch = ch = fml_count(n, seq, bfc_opt.k, bfc_opt.q, bfc_opt.l_pre, bfc_opt.n_threads); + mode = bfc_ch_hist(ch, hist, hist_high); + for (i = opt->min_cnt; i < 256; ++i) + sum_k += hist[i], tot_k += i * hist[i]; + kcov = (float)tot_k / sum_k; + bfc_opt.min_cov = (int)(BFC_EC_MIN_COV_COEF * kcov + .499); + bfc_opt.min_cov = bfc_opt.min_cov < opt->max_cnt? bfc_opt.min_cov : opt->max_cnt; + bfc_opt.min_cov = bfc_opt.min_cov > opt->min_cnt? bfc_opt.min_cov : opt->min_cnt; + + es.e = calloc(es.opt->n_threads, sizeof(void*)); + for (i = 0; i < es.opt->n_threads; ++i) + es.e[i] = ec1buf_init(es.opt, ch), es.e[i]->mode = mode; + kt_for(es.opt->n_threads, worker_ec, &es, es.n_seqs); + for (i = 0; i < es.opt->n_threads; ++i) + ec1buf_destroy(es.e[i]); + free(es.e); + bfc_ch_destroy(ch); + return kcov; +} + +float fml_correct(const fml_opt_t *opt, int n, bseq1_t *seq) +{ + return fml_correct_core(opt, 0, n, seq); +} + +float fml_fltuniq(const fml_opt_t *opt, int n, bseq1_t *seq) +{ + return fml_correct_core(opt, 1, n, seq); +} diff --git a/bseq.c b/bseq.c new file mode 100644 index 0000000..719bafb --- /dev/null +++ b/bseq.c @@ -0,0 +1,61 @@ +#include +#include +#include +#include +#include "fml.h" +#include "kseq.h" +KSEQ_INIT(gzFile, gzread) + +bseq1_t *bseq_read(const char *fn, int *n_) +{ + gzFile fp; + bseq1_t *seqs; + kseq_t *ks; + int m, n; + uint64_t size = 0; + + *n_ = 0; + fp = fn && strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); + if (fp == 0) return 0; + ks = kseq_init(fp); + + m = n = 0; seqs = 0; + while (kseq_read(ks) >= 0) { + bseq1_t *s; + if (n >= m) { + m = m? m<<1 : 256; + seqs = realloc(seqs, m * sizeof(bseq1_t)); + } + s = &seqs[n]; + s->seq = strdup(ks->seq.s); + s->qual = ks->qual.l? strdup(ks->qual.s) : 0; + s->l_seq = ks->seq.l; + size += seqs[n++].l_seq; + } + *n_ = n; + + kseq_destroy(ks); + gzclose(fp); + return seqs; +} + +void seq_reverse(int l, unsigned char *s) +{ + int i; + for (i = 0; i < l>>1; ++i) { + int tmp = s[l-1-i]; + s[l-1-i] = s[i]; s[i] = tmp; + } +} + +void seq_revcomp6(int l, unsigned char *s) +{ + int i; + for (i = 0; i < l>>1; ++i) { + int tmp = s[l-1-i]; + tmp = (tmp >= 1 && tmp <= 4)? 5 - tmp : tmp; + s[l-1-i] = (s[i] >= 1 && s[i] <= 4)? 5 - s[i] : s[i]; + s[i] = tmp; + } + if (l&1) s[i] = (s[i] >= 1 && s[i] <= 4)? 5 - s[i] : s[i]; +} diff --git a/bubble.c b/bubble.c new file mode 100644 index 0000000..cbe54be --- /dev/null +++ b/bubble.c @@ -0,0 +1,366 @@ +#include +#include +#include "mag.h" +#include "kvec.h" +#include "ksw.h" +#include "internal.h" +#include "khash.h" +KHASH_DECLARE(64, uint64_t, uint64_t) + +typedef khash_t(64) hash64_t; + +#define MAX_N_DIFF 2.01 // for evaluating alignment after SW +#define MAX_R_DIFF 0.1 +#define L_DIFF_COEF 0.2 // n_diff=|l_0 - l_1|*L_DIFF_COEF + +#define edge_mark_del(_x) ((_x).x = (uint64_t)-2, (_x).y = 0) +#define edge_is_del(_x) ((_x).x == (uint64_t)-2 || (_x).y == 0) + +/****************** + * Closed bubbles * + ******************/ + +typedef struct { + uint64_t id; + int cnt[2]; + int n[2][2], d[2][2]; + uint64_t v[2][2]; +} trinfo_t; + +const trinfo_t g_trinull = {-1, {0, 0}, {{INT_MIN, INT_MIN}, {INT_MIN, INT_MIN}}, {{INT_MIN, INT_MIN}, {INT_MIN, INT_MIN}}, {{-1, -1}, {-1, -1}}}; + +typedef struct { + int n, m; + trinfo_t **buf; +} tipool_t; + +struct mogb_aux { + tipool_t pool; + ku64_v stack; + hash64_t *h; +}; + +mogb_aux_t *mag_b_initaux(void) +{ + mogb_aux_t *aux = calloc(1, sizeof(mogb_aux_t)); + aux->h = kh_init(64); + return aux; +} + +void mag_b_destroyaux(mogb_aux_t *b) +{ + int i; + for (i = 0; i < b->pool.m; ++i) + free(b->pool.buf[i]); + free(b->pool.buf); free(b->stack.a); + kh_destroy(64, b->h); + free(b); +} + +#define tiptr(p) ((trinfo_t*)(p)->ptr) + +static inline trinfo_t *tip_alloc(tipool_t *pool, uint32_t id) +{ // allocate an object from the memory pool + trinfo_t *p; + if (pool->n == pool->m) { + int i, new_m = pool->m? pool->m<<1 : 256; + pool->buf = realloc(pool->buf, new_m * sizeof(void*)); + for (i = pool->m; i < new_m; ++i) + pool->buf[i] = malloc(sizeof(trinfo_t)); + pool->m = new_m; + } + p = pool->buf[pool->n++]; + *p = g_trinull; + p->id = id; + return p; +} + +static void backtrace(mag_t *g, uint64_t end, uint64_t start, hash64_t *h) +{ + while (end>>32 != start) { + int ret; + kh_put(64, h, end>>33, &ret); + end = tiptr(&g->v.a[end>>33])->v[(end>>32^1)&1][end&1]; + } +} + +void mag_vh_simplify_bubble(mag_t *g, uint64_t idd, int max_vtx, int max_dist, mogb_aux_t *a) +{ + int i, n_pending = 0; + magv_t *p, *q; + + p = &g->v.a[idd>>1]; + if (p->len < 0 || p->nei[idd&1].n < 2) return; // stop if p is deleted or it has 0 or 1 neighbor + // reset aux data + a->stack.n = a->pool.n = 0; + if (kh_n_buckets(a->h) >= 64) { + kh_destroy(64, a->h); + a->h = kh_init(64); + } else kh_clear(64, a->h); + // add the initial vertex + p->ptr = tip_alloc(&a->pool, idd>>1); + tiptr(p)->d[(idd&1)^1][0] = -p->len; + tiptr(p)->n[(idd&1)^1][0] = -p->nsr; + kv_push(uint64_t, a->stack, idd^1); + // essentially a topological sorting + while (a->stack.n) { + uint64_t x, y; + ku128_v *r; + if (a->stack.n == 1 && a->stack.a[0] != (idd^1) && n_pending == 0) break; // found the other end of the bubble + x = kv_pop(a->stack); + p = &g->v.a[x>>1]; + //printf("%lld:%lld\n", p->k[0], p->k[1]); + r = &p->nei[(x&1)^1]; // we will look the the neighbors from the other end of the unitig + if (a->pool.n > max_vtx || tiptr(p)->d[x&1][0] > max_dist || tiptr(p)->d[x&1][1] > max_dist || r->n == 0) break; // we failed + // set the distance to p's neighbors + for (i = 0; i < r->n; ++i) { + int nsr, dist, which; + if ((int64_t)r->a[i].x < 0) continue; + y = mag_tid2idd(g->h, r->a[i].x); + if (y == (idd^1)) { // there is a loop involving the initial vertex + a->stack.n = 0; + break; // not a bubble; stop; this will jump out of the while() loop + } + q = &g->v.a[y>>1]; + if (q->ptr == 0) { // has not been attempted + q->ptr = tip_alloc(&a->pool, y>>1), ++n_pending; + mag_v128_clean(&q->nei[y&1]); // make sure there are no deleted edges + } + nsr = tiptr(p)->n[x&1][0] + p->nsr; which = 0; + dist = tiptr(p)->d[x&1][0] + p->len - r->a[i].y; + //printf("01 [%d]\t[%d,%d]\t[%d,%d]\n", i, tiptr(q)->n[y&1][0], tiptr(q)->n[y&1][1], tiptr(q)->d[y&1][0], tiptr(q)->d[y&1][1]); + // test and possibly update the tentative distance + if (nsr > tiptr(q)->n[y&1][0]) { // then move the best to the 2nd best and update the best + tiptr(q)->n[y&1][1] = tiptr(q)->n[y&1][0]; tiptr(q)->n[y&1][0] = nsr; + tiptr(q)->v[y&1][1] = tiptr(q)->v[y&1][0]; tiptr(q)->v[y&1][0] = (x^1)<<32|i<<1|which; + tiptr(q)->d[y&1][1] = tiptr(q)->d[y&1][0]; tiptr(q)->d[y&1][0] = dist; + nsr = tiptr(p)->n[x&1][1] + p->nsr; which = 1; // now nsr is the 2nd best + dist = tiptr(p)->d[x&1][1] + p->len - r->a[i].y; + } + if (nsr > tiptr(q)->n[y&1][1]) // update the 2nd best + tiptr(q)->n[y&1][1] = nsr, tiptr(q)->v[y&1][1] = (x^1)<<32|i<<1|which, tiptr(q)->d[y&1][1] = dist; + if (++tiptr(q)->cnt[y&1] == q->nei[y&1].n) { // all q's predecessors have been processed; then push + kv_push(uint64_t, a->stack, y); + --n_pending; + } + } + } + if (n_pending == 0 && a->stack.n == 1) { // found a bubble + uint64_t x = a->stack.a[0]; + p = &g->v.a[x>>1]; + //printf("(%d,%d)\t(%d,%d)\n", tiptr(p)->n[x&1][0], tiptr(p)->n[x&1][1], tiptr(p)->d[x&1][0], tiptr(p)->d[x&1][1]); + backtrace(g, tiptr(p)->v[x&1][0], idd, a->h); + backtrace(g, tiptr(p)->v[x&1][1], idd, a->h); + } + for (i = 0; i < a->pool.n; ++i) // reset p->ptr + g->v.a[a->pool.buf[i]->id].ptr = 0; + if (kh_size(a->h)) { // bubble detected; then remove verticies not in the top two paths + for (i = 1; i < a->pool.n; ++i) { // i=0 corresponds to the initial vertex which we want to exclude + uint64_t id = a->pool.buf[i]->id; + if (id != a->stack.a[0]>>1 && kh_get(64, a->h, id) == kh_end(a->h)) // not in the top two paths + mag_v_del(g, &g->v.a[id]); + } + } +} + +void mag_g_simplify_bubble(mag_t *g, int max_vtx, int max_dist) +{ + int64_t i; + mogb_aux_t *a; + a = mag_b_initaux(); + for (i = 0; i < g->v.n; ++i) { + mag_vh_simplify_bubble(g, i<<1|0, max_vtx, max_dist, a); + mag_vh_simplify_bubble(g, i<<1|1, max_vtx, max_dist, a); + } + mag_b_destroyaux(a); + mag_g_merge(g, 0, 0); +} + +int mag_vh_pop_simple(mag_t *g, uint64_t idd, float max_cov, float max_frac, int max_bdiff, int aggressive) +{ + magv_t *p = &g->v.a[idd>>1], *q[2]; + ku128_v *r; + int i, j, k, dir[2], l[2], ret = -1; + char *seq[2], *cov[2]; + float n_diff, r_diff, avg[2], max_n_diff = aggressive? MAX_N_DIFF * 2. : MAX_N_DIFF; + + if (p->len < 0 || p->nei[idd&1].n != 2) return ret; // deleted or no bubble + r = &p->nei[idd&1]; + for (j = 0; j < 2; ++j) { + uint64_t x; + if ((int64_t)r->a[j].x < 0) return ret; + x = mag_tid2idd(g->h, r->a[j].x); + dir[j] = x&1; + q[j] = &g->v.a[x>>1]; + if (q[j]->nei[0].n != 1 || q[j]->nei[1].n != 1) return ret; // no bubble + l[j] = q[j]->len - (int)(q[j]->nei[0].a->y + q[j]->nei[1].a->y); // bubble length, excluding overlaps + } + if (q[0]->nei[dir[0]^1].a->x != q[1]->nei[dir[1]^1].a->x) return ret; // no bubble + if (l[0] - l[1] > max_bdiff || l[1] - l[0] > max_bdiff) return 1; // huge bubble differences + for (j = 0; j < 2; ++j) { // set seq[] and cov[], and compute avg[] + if (l[j] > 0) { + seq[j] = malloc(l[j]<<1); + cov[j] = seq[j] + l[j]; + for (i = 0; i < l[j]; ++i) { + seq[j][i] = q[j]->seq[i + q[j]->nei[0].a->y]; + cov[j][i] = q[j]->cov[i + q[j]->nei[0].a->y]; + } + if (dir[j]) { + seq_revcomp6(l[j], (uint8_t*)seq[j]); + seq_reverse(l[j], (uint8_t*)cov[j]); + } + for (i = 0, avg[j] = 0.; i < l[j]; ++i) { + --seq[j][i]; // change DNA6 encoding to DNA4 for SW below + avg[j] += cov[j][i] - 33; + } + avg[j] /= l[j]; + } else { // l[j] <= 0; this may happen around a tandem repeat + int beg, end; + seq[j] = cov[j] = 0; + beg = q[j]->nei[0].a->y; end = q[j]->len - q[j]->nei[1].a->y; + if (beg > end) beg ^= end, end ^= beg, beg ^= end; // swap + if (beg < end) { + for (i = beg, avg[j] = 0.; i < end; ++i) + avg[j] += q[j]->cov[i] - 33; + avg[j] /= end - beg; + } else avg[j] = q[j]->cov[beg] - 33; // FIXME: when q[j] is contained, weird thing may happen + } + } + ret = 1; + if (l[0] > 0 && l[1] > 0) { // then do SW to compute n_diff and r_diff + int8_t mat[16]; + kswr_t aln; + for (i = k = 0; i < 4; ++i) + for (j = 0; j < 4; ++j) + mat[k++] = i == j? 5 : -4; + aln = ksw_align(l[0], (uint8_t*)seq[0], l[1], (uint8_t*)seq[1], 4, mat, 5, 2, 0, 0); + n_diff = ((l[0] < l[1]? l[0] : l[1]) * 5. - aln.score) / (5. + 4.); // 5: matching score; -4: mismatchig score + r_diff = n_diff / ((l[0] + l[1]) / 2.); + //fprintf(stderr, "===> %f %f <===\n", n_diff, r_diff); for (j = 0; j < 2; ++j) { for (i = 0; i < l[j]; ++i) fputc("ACGTN"[(int)seq[j][i]], stderr); fputc('\n', stderr); } + } else { + n_diff = abs(l[0] - l[1]) * L_DIFF_COEF; + r_diff = 1.; + //fprintf(stderr, "---> (%d,%d) <---\n", l[0], l[1]); + } + if (n_diff < max_n_diff || r_diff < MAX_R_DIFF) { + j = avg[0] < avg[1]? 0 : 1; + if (aggressive || (avg[j] < max_cov && avg[j] / (avg[j^1] + avg[j]) < max_frac)) { + mag_v_del(g, q[j]); + ret = 2; + } + } + free(seq[0]); free(seq[1]); + return ret; +} + +void mag_g_pop_simple(mag_t *g, float max_cov, float max_frac, int min_merge_len, int max_bdiff, int aggressive) +{ + int64_t i, n_examined = 0, n_popped = 0; + int ret; + + for (i = 0; i < g->v.n; ++i) { + ret = mag_vh_pop_simple(g, i<<1|0, max_cov, max_frac, max_bdiff, aggressive); + if (ret >= 1) ++n_examined; + if (ret >= 2) ++n_popped; + ret = mag_vh_pop_simple(g, i<<1|1, max_cov, max_frac, max_bdiff, aggressive); + if (ret >= 1) ++n_examined; + if (ret >= 2) ++n_popped; + } + if (fm_verbose >= 3) + fprintf(stderr, "[M::%s] examined %ld bubbles and popped %ld\n", __func__, (long)n_examined, (long)n_popped); + mag_g_merge(g, 0, min_merge_len); +} + +/**************** + * Open bubbles * + ****************/ + +void mag_v_pop_open(mag_t *g, magv_t *p, int min_elen) +{ + int i, j, k, l, dir, max_l, l_qry; + magv_t *q, *t; + ku128_v *r, *s; + uint8_t *seq; + int8_t mat[16]; + + if (p->len < 0 || p->len >= min_elen) return; + //if (p->nei[0].n && p->nei[1].n) return; // FIXME: between this and the next line, which is better? + if (p->nei[0].n + p->nei[1].n != 1) return; + dir = p->nei[0].n? 0 : 1; + // initialize the scoring system + for (i = k = 0; i < 4; ++i) + for (j = 0; j < 4; ++j) + mat[k++] = i == j? 5 : -4; + + s = &p->nei[dir]; + for (l = 0; l < s->n; ++l) { // if we use "if (p->nei[0].n + p->nei[1].n != 1)", s->n == 1 + uint64_t v; + kswq_t *qry; + if ((int64_t)s->a[l].x < 0) continue; + v = mag_tid2idd(g->h, s->a[l].x); + q = &g->v.a[v>>1]; + if (q == p || q->nei[v&1].n == 1) continue; + // get the query ready + max_l = (p->len - s->a[l].y) * 2; + seq = malloc(max_l + 1); + if (dir == 0) { // forward strand + for (j = s->a[l].y, k = 0; j < p->len; ++j) + seq[k++] = p->seq[j] - 1; + } else { // reverse + for (j = p->len - s->a[l].y - 1, k = 0; j >= 0; --j) + seq[k++] = 4 - p->seq[j]; + } + l_qry = k; + qry = ksw_qinit(2, l_qry, seq, 4, mat); + //fprintf(stderr, "===> %lld:%lld:%d[%d], %d, %ld <===\n", p->k[0], p->k[1], s->n, l, p->nsr, q->nei[v&1].n); + //for (j = 0; j < k; ++j) fputc("ACGTN"[(int)seq[j]], stderr); fputc('\n', stderr); + + r = &q->nei[v&1]; + for (i = 0; i < r->n; ++i) { + uint64_t w; + kswr_t aln; + if (r->a[i].x == p->k[dir] || (int64_t)r->a[i].x < 0) continue; + w = mag_tid2idd(g->h, r->a[i].x); + // get the target sequence + t = &g->v.a[w>>1]; + if (w&1) { // reverse strand + for (j = t->len - r->a[i].y - 1, k = 0; j >= 0 && k < max_l; --j) + seq[k++] = 4 - t->seq[j]; + } else { + for (j = r->a[i].y, k = 0; j < t->len && k < max_l; ++j) + seq[k++] = t->seq[j] - 1; + } + aln = ksw_align(0, 0, k, seq, 4, mat, 5, 2, 0, &qry); + //for (j = 0; j < k; ++j) fputc("ACGTN"[(int)seq[j]], stderr); fprintf(stderr, "\t%d\t%f\n", aln.score, (l_qry * 5. - aln.score) / (5. + 4.)); + if (aln.score >= l_qry * 5 / 2) { + double r_diff, n_diff; + n_diff = (l_qry * 5. - aln.score) / (5. + 4.); // 5: matching score; -4: mismatchig score + r_diff = n_diff / l_qry; + if (n_diff < MAX_N_DIFF || r_diff < MAX_R_DIFF) break; + } + } + + if (i != r->n) { + // mark delete in p and delete in q + edge_mark_del(s->a[l]); + for (i = 0; i < r->n; ++i) + if (r->a[i].x == p->k[dir]) + edge_mark_del(r->a[i]); + } + free(seq); free(qry); + } + + for (i = 0; i < s->n; ++i) + if (!edge_is_del(s->a[i])) break; + if (i == s->n) mag_v_del(g, p); // p is not connected to any other vertices +} + +void mag_g_pop_open(mag_t *g, int min_elen) +{ + int64_t i; + for (i = 0; i < g->v.n; ++i) + mag_v_pop_open(g, &g->v.a[i], min_elen); + if (fm_verbose >= 3) + fprintf(stderr, "[M:%s] popped open bubbles\n", __func__); + mag_g_merge(g, 0, 0); +} diff --git a/example.c b/example.c new file mode 100644 index 0000000..5cf65dd --- /dev/null +++ b/example.c @@ -0,0 +1,50 @@ +#include +#include +#include +#include "fml.h" + +int main(int argc, char *argv[]) +{ + fml_opt_t opt; + int c, n_seqs, n_utg, gfa_out = 0; + bseq1_t *seqs; + fml_utg_t *utg; + + fml_opt_init(&opt); + while ((c = getopt(argc, argv, "gOAe:l:r:t:c:d:v:")) >= 0) { + if (c == 'e') opt.ec_k = atoi(optarg); + else if (c == 'l') opt.min_asm_ovlp = atoi(optarg); + else if (c == 'r') opt.mag_opt.min_dratio1 = atof(optarg); + else if (c == 'A') opt.mag_opt.flag |= MAG_F_AGGRESSIVE; + else if (c == 'O') opt.mag_opt.flag &= ~MAG_F_POPOPEN; + else if (c == 'd') opt.mag_opt.max_bdiff = atoi(optarg); + else if (c == 't') opt.n_threads = atoi(optarg); + else if (c == 'g') gfa_out = 1; + else if (c == 'v') fm_verbose = atoi(optarg); + else if (c == 'c') { + char *p; + opt.min_cnt = strtol(optarg, &p, 10); + if (*p == ',') opt.max_cnt = strtol(p + 1, &p, 10); + } + } + if (argc == optind) { + fprintf(stderr, "Usage: fml-asm [options] \n"); + fprintf(stderr, "Options:\n"); + fprintf(stderr, " -e INT k-mer length for error correction (0 for auto; -1 to disable) [%d]\n", opt.ec_k); + fprintf(stderr, " -c INT1[,INT2] range of k-mer & read count thresholds for ec and graph cleaning [%d,%d]\n", opt.min_cnt, opt.max_cnt); + fprintf(stderr, " -l INT min overlap length during initial assembly [%d]\n", opt.min_asm_ovlp); + fprintf(stderr, " -r FLOAT drop an overlap if its length is below maxOvlpLen*FLOAT [%g]\n", opt.mag_opt.min_dratio1); + fprintf(stderr, " -t INT number of threads (don't use multi-threading for small data sets) [%d]\n", opt.n_threads); + fprintf(stderr, " -d INT retain a bubble if one side is longer than the other side by >INT-bp [%d]\n", opt.mag_opt.max_bdiff); + fprintf(stderr, " -A discard heterozygotes (apply this to assemble bacterial genomes; override -O)\n"); + fprintf(stderr, " -O don't apply aggressive tip trimming\n"); + fprintf(stderr, " -g output the assembly graph in the GFA format\n"); + return 1; + } + seqs = bseq_read(argv[optind], &n_seqs); + utg = fml_assemble(&opt, n_seqs, seqs, &n_utg); + if (!gfa_out) fml_utg_print(n_utg, utg); + else fml_utg_print_gfa(n_utg, utg); + fml_utg_destroy(n_utg, utg); + return 0; +} diff --git a/fml.h b/fml.h new file mode 100644 index 0000000..fd7d466 --- /dev/null +++ b/fml.h @@ -0,0 +1,193 @@ +#ifndef FML_H +#define FML_H + +#define FML_VERSION "r55" + +#include + +typedef struct { + int32_t l_seq; + char *seq, *qual; // NULL-terminated strings; length expected to match $l_seq +} bseq1_t; + +#define MAG_F_AGGRESSIVE 0x20 // pop variant bubbles (not default) +#define MAG_F_POPOPEN 0x40 // aggressive tip trimming (default) +#define MAG_F_NO_SIMPL 0x80 // skip bubble simplification (default) + +typedef struct { + int flag, min_ovlp, min_elen, min_ensr, min_insr, max_bdist, max_bdiff, max_bvtx, min_merge_len, trim_len, trim_depth; + float min_dratio1, max_bcov, max_bfrac; +} magopt_t; + +typedef struct { + int n_threads; // number of threads; don't use multi-threading for small data sets + int ec_k; // k-mer length for error correction; 0 for auto estimate + int min_cnt, max_cnt; // both occ threshold in ec and tip threshold in cleaning lie in [min_cnt,max_cnt] + int min_asm_ovlp; // min overlap length during assembly + int min_merge_len; // during assembly, don't explicitly merge an overlap if shorter than this value + magopt_t mag_opt; // graph cleaning options +} fml_opt_t; + +struct rld_t; +struct mag_t; + +typedef struct { + uint32_t len:31, from:1; // $from and $to: 0 meaning overlapping 5'-end; 1 overlapping 3'-end + uint32_t id:31, to:1; // $id: unitig number +} fml_ovlp_t; + +typedef struct { + int32_t len; // length of sequence + int32_t nsr; // number of supporting reads + char *seq; // unitig sequence + char *cov; // cov[i]-33 gives per-base coverage at i + int n_ovlp[2]; // number of 5'-end [0] and 3'-end [1] overlaps + fml_ovlp_t *ovlp; // overlaps, of size n_ovlp[0]+n_ovlp[1] +} fml_utg_t; + +extern int fm_verbose; + +#ifdef __cplusplus +extern "C" { +#endif + +/************************ + * High-level functions * + ************************/ + +/** + * Read all sequences from a FASTA/FASTQ file + * + * @param fn filename; NULL or "-" for stdin + * @param n (out) number of sequences read into RAM + * + * @return array of sequences + */ +bseq1_t *bseq_read(const char *fn, int *n); + +/** + * Initialize default parameters + * + * @param opt (out) pointer to parameters + */ +void fml_opt_init(fml_opt_t *opt); + +/** + * Assemble a list of sequences + * + * @param opt parameters + * @param n_seqs number of input sequences + * @param seqs sequences to assemble; FREED on return + * @param n_utg (out) number of unitigs in return + * + * @return array of unitigs + */ +fml_utg_t *fml_assemble(const fml_opt_t *opt, int n_seqs, bseq1_t *seqs, int *n_utg); + +/** + * Free unitigs + * + * @param n_utg number of unitigs + * @param utg array of unitigs + */ +void fml_utg_destroy(int n_utg, fml_utg_t *utg); + +/************************************************ + * Mid-level functions called by fml_assemble() * + ************************************************/ + +/** + * Adjust parameters based on input sequences + * + * @param opt parameters to update IN PLACE + * @param n_seqs number of sequences + * @param seqs array of sequences + */ +void fml_opt_adjust(fml_opt_t *opt, int n_seqs, const bseq1_t *seqs); + +/** + * Error correction + * + * @param opt parameters + * @param n number of sequences + * @param seq array of sequences; corrected IN PLACE + * + * @return k-mer coverage + */ +float fml_correct(const fml_opt_t *opt, int n, bseq1_t *seq); +float fml_fltuniq(const fml_opt_t *opt, int n, bseq1_t *seq); + +/** + * Construct FMD-index + * + * @param opt parameters + * @param n number of sequences + * @param seq array of sequences; FREED on return + * + * @return FMD-index on success; NULL if all input sequences are zero in length + */ +struct rld_t *fml_seq2fmi(const fml_opt_t *opt, int n, bseq1_t *seq); + +/** + * Generate initial overlap graph + * + * @param opt parameters + * @param e FMD-index; FREED on return + * + * @return overlap graph in the "mag" structure + */ +struct mag_t *fml_fmi2mag(const fml_opt_t *opt, struct rld_t *e); + +/** + * Clean a mag graph + * + * @param opt parameters + * @param g overlap graph; modified IN PLACE + */ +void fml_mag_clean(const fml_opt_t *opt, struct mag_t *g); + +/** + * Convert a graph in mag to fml_utg_t + * + * @param g graph in the "mag" structure; FREED on return + * @param n_utg (out) number of unitigs + * + * @return array of unitigs + */ +fml_utg_t *fml_mag2utg(struct mag_t *g, int *n_utg); + +/** + * Output unitig graph in the mag format + * + * @param n_utg number of unitigs + * @param utg array of unitigs + */ +void fml_utg_print(int n_utgs, const fml_utg_t *utg); + +/** + * Output unitig graph in the GFA format + * + * @param n_utg number of unitigs + * @param utg array of unitigs + */ +void fml_utg_print_gfa(int n, const fml_utg_t *utg); + +/** + * Deallocate an FM-index + * + * @param e pointer to the FM-index + */ +void fml_fmi_destroy(struct rld_t *e); + +/** + * Deallocate a mag graph + * + * @param g pointer to the mag graph + */ +void fml_mag_destroy(struct mag_t *g); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/htab.c b/htab.c new file mode 100644 index 0000000..13b4150 --- /dev/null +++ b/htab.c @@ -0,0 +1,132 @@ +#include +#include +#include +#include "htab.h" +#include "khash.h" + +#define _cnt_eq(a, b) ((a)>>14 == (b)>>14) +#define _cnt_hash(a) ((a)>>14) +KHASH_INIT(cnt, uint64_t, char, 0, _cnt_hash, _cnt_eq) +typedef khash_t(cnt) cnthash_t; + +struct bfc_ch_s { + int k; + cnthash_t **h; + // private + int l_pre; +}; + +bfc_ch_t *bfc_ch_init(int k, int l_pre) +{ + bfc_ch_t *ch; + int i; + assert(k <= 63); + if (k * 2 - l_pre > BFC_CH_KEYBITS) + l_pre = k * 2 - BFC_CH_KEYBITS; + if (l_pre > BFC_CH_MAXPRE) l_pre = BFC_CH_MAXPRE; + assert(k - l_pre < BFC_CH_KEYBITS); + ch = calloc(1, sizeof(bfc_ch_t)); + ch->k = k, ch->l_pre = l_pre; + ch->h = calloc(1<l_pre, sizeof(void*)); + for (i = 0; i < 1<l_pre; ++i) + ch->h[i] = kh_init(cnt); + return ch; +} + +void bfc_ch_destroy(bfc_ch_t *ch) +{ + int i; + if (ch == 0) return; + for (i = 0; i < 1<l_pre; ++i) + kh_destroy(cnt, ch->h[i]); + free(ch->h); free(ch); +} + +static inline cnthash_t *get_subhash(const bfc_ch_t *ch, const uint64_t x[2], uint64_t *key) +{ + if (ch->k <= 32) { + int t = ch->k * 2 - ch->l_pre; + uint64_t z = x[0] << ch->k | x[1]; + *key = (z & ((1ULL<h[z>>t]; + } else { + int t = ch->k - ch->l_pre; + int shift = t + ch->k < BFC_CH_KEYBITS? ch->k : BFC_CH_KEYBITS - t; + *key = ((x[0] & ((1ULL<h[x[0]>>t]; + } +} + +int bfc_ch_insert(bfc_ch_t *ch, const uint64_t x[2], int is_high, int forced) +{ + int absent; + uint64_t key; + cnthash_t *h; + khint_t k; + h = get_subhash(ch, x, &key); + if (__sync_lock_test_and_set(&h->lock, 1)) { + if (forced) // then wait until the hash table is unlocked by the thread using it + while (__sync_lock_test_and_set(&h->lock, 1)) + while (h->lock); // lock + else return -1; + } + k = kh_put(cnt, h, key, &absent); + if (absent) { + if (is_high) kh_key(h, k) |= 1<<8; + } else { + if ((kh_key(h, k) & 0xff) != 0xff) ++kh_key(h, k); + if (is_high && (kh_key(h, k) >> 8 & 0x3f) != 0x3f) kh_key(h, k) += 1<<8; + } + __sync_lock_release(&h->lock); // unlock + return 0; +} + +int bfc_ch_get(const bfc_ch_t *ch, const uint64_t x[2]) +{ + uint64_t key; + cnthash_t *h; + khint_t itr; + h = get_subhash(ch, x, &key); + itr = kh_get(cnt, h, key); + return itr == kh_end(h)? -1 : kh_key(h, itr) & 0x3fff; +} + +int bfc_ch_kmer_occ(const bfc_ch_t *ch, const bfc_kmer_t *z) +{ + uint64_t x[2]; + bfc_kmer_hash(ch->k, z->x, x); + return bfc_ch_get(ch, x); +} + +uint64_t bfc_ch_count(const bfc_ch_t *ch) +{ + int i; + uint64_t cnt = 0; + for (i = 0; i < 1<l_pre; ++i) + cnt += kh_size(ch->h[i]); + return cnt; +} + +int bfc_ch_hist(const bfc_ch_t *ch, uint64_t cnt[256], uint64_t high[64]) +{ + int i, max_i = -1; + uint64_t max; + memset(cnt, 0, 256 * 8); + memset(high, 0, 64 * 8); + for (i = 0; i < 1<l_pre; ++i) { + khint_t k; + cnthash_t *h = ch->h[i]; + for (k = 0; k != kh_end(h); ++k) + if (kh_exist(h, k)) + ++cnt[kh_key(h, k) & 0xff], ++high[kh_key(h, k)>>8 & 0x3f]; + } + for (i = 3, max = 0; i < 256; ++i) + if (cnt[i] > max) + max = cnt[i], max_i = i; + return max_i; +} + +int bfc_ch_get_k(const bfc_ch_t *ch) +{ + return ch->k; +} diff --git a/htab.h b/htab.h new file mode 100644 index 0000000..118244c --- /dev/null +++ b/htab.h @@ -0,0 +1,23 @@ +#ifndef BFC_HTAB_H +#define BFC_HTAB_H + +#include +#include "kmer.h" + +#define BFC_CH_KEYBITS 50 +#define BFC_CH_MAXPRE 20 + +struct bfc_ch_s; +typedef struct bfc_ch_s bfc_ch_t; + +bfc_ch_t *bfc_ch_init(int k, int l_pre); +void bfc_ch_destroy(bfc_ch_t *ch); +int bfc_ch_insert(bfc_ch_t *ch, const uint64_t x[2], int is_high, int forced); +int bfc_ch_get(const bfc_ch_t *ch, const uint64_t x[2]); +uint64_t bfc_ch_count(const bfc_ch_t *ch); +int bfc_ch_hist(const bfc_ch_t *ch, uint64_t cnt[256], uint64_t high[64]); +int bfc_ch_get_k(const bfc_ch_t *ch); + +int bfc_ch_kmer_occ(const bfc_ch_t *ch, const bfc_kmer_t *z); + +#endif diff --git a/internal.h b/internal.h new file mode 100644 index 0000000..37e4d96 --- /dev/null +++ b/internal.h @@ -0,0 +1,21 @@ +#ifndef FML_INTERNAL_H +#define FML_INTERNAL_H + +#include "fml.h" + +extern unsigned char seq_nt6_table[256]; + +#ifdef __cplusplus +extern "C" { +#endif + +void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n); +void seq_reverse(int l, unsigned char *s); +void seq_revcomp6(int l, unsigned char *s); +struct bfc_ch_s *fml_count(int n, const bseq1_t *seq, int k, int q, int l_pre, int n_threads); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/khash.h b/khash.h new file mode 100644 index 0000000..870ae5d --- /dev/null +++ b/khash.h @@ -0,0 +1,614 @@ +/* The MIT License + + Copyright (c) 2008, 2009, 2011 by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* + An example: + +#include "khash.h" +KHASH_MAP_INIT_INT(32, char) +int main() { + int ret, is_missing; + khiter_t k; + khash_t(32) *h = kh_init(32); + k = kh_put(32, h, 5, &ret); + kh_value(h, k) = 10; + k = kh_get(32, h, 10); + is_missing = (k == kh_end(h)); + k = kh_get(32, h, 5); + kh_del(32, h, k); + for (k = kh_begin(h); k != kh_end(h); ++k) + if (kh_exist(h, k)) kh_value(h, k) = 1; + kh_destroy(32, h); + return 0; +} +*/ + +/* + 2013-05-02 (0.2.8): + + * Use quadratic probing. When the capacity is power of 2, stepping function + i*(i+1)/2 guarantees to traverse each bucket. It is better than double + hashing on cache performance and is more robust than linear probing. + + In theory, double hashing should be more robust than quadratic probing. + However, my implementation is probably not for large hash tables, because + the second hash function is closely tied to the first hash function, + which reduce the effectiveness of double hashing. + + Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php + + 2011-12-29 (0.2.7): + + * Minor code clean up; no actual effect. + + 2011-09-16 (0.2.6): + + * The capacity is a power of 2. This seems to dramatically improve the + speed for simple keys. Thank Zilong Tan for the suggestion. Reference: + + - http://code.google.com/p/ulib/ + - http://nothings.org/computer/judy/ + + * Allow to optionally use linear probing which usually has better + performance for random input. Double hashing is still the default as it + is more robust to certain non-random input. + + * Added Wang's integer hash function (not used by default). This hash + function is more robust to certain non-random input. + + 2011-02-14 (0.2.5): + + * Allow to declare global functions. + + 2009-09-26 (0.2.4): + + * Improve portability + + 2008-09-19 (0.2.3): + + * Corrected the example + * Improved interfaces + + 2008-09-11 (0.2.2): + + * Improved speed a little in kh_put() + + 2008-09-10 (0.2.1): + + * Added kh_clear() + * Fixed a compiling error + + 2008-09-02 (0.2.0): + + * Changed to token concatenation which increases flexibility. + + 2008-08-31 (0.1.2): + + * Fixed a bug in kh_get(), which has not been tested previously. + + 2008-08-31 (0.1.1): + + * Added destructor +*/ + + +#ifndef __AC_KHASH_H +#define __AC_KHASH_H + +/*! + @header + + Generic hash table library. + */ + +#define AC_VERSION_KHASH_H "0.2.8" + +#include +#include +#include + +/* compiler specific configuration */ + +#if UINT_MAX == 0xffffffffu +typedef unsigned int khint32_t; +#elif ULONG_MAX == 0xffffffffu +typedef unsigned long khint32_t; +#endif + +#if ULONG_MAX == ULLONG_MAX +typedef unsigned long khint64_t; +#else +typedef unsigned long long khint64_t; +#endif + +#ifdef _MSC_VER +#define kh_inline __inline +#else +#define kh_inline inline +#endif + +typedef khint32_t khint_t; +typedef khint_t khiter_t; + +#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) +#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) +#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) +#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) +#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) +#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) +#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) + +#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4) + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#ifndef kcalloc +#define kcalloc(N,Z) calloc(N,Z) +#endif +#ifndef kmalloc +#define kmalloc(Z) malloc(Z) +#endif +#ifndef krealloc +#define krealloc(P,Z) realloc(P,Z) +#endif +#ifndef kfree +#define kfree(P) free(P) +#endif + +#define __KHASH_TYPE(name, khkey_t, khval_t) \ + typedef struct kh_##name##_s { \ + khint_t n_buckets, size, n_occupied; \ + volatile int lock; \ + khint32_t *flags; \ + khkey_t *keys; \ + khval_t *vals; \ + } kh_##name##_t; + +#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \ + extern kh_##name##_t *kh_init_##name(void); \ + extern void kh_destroy_##name(kh_##name##_t *h); \ + extern void kh_clear_##name(kh_##name##_t *h); \ + extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ + extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \ + extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ + extern void kh_del_##name(kh_##name##_t *h, khint_t x); + +#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + SCOPE kh_##name##_t *kh_init_##name(void) { \ + return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \ + } \ + SCOPE void kh_destroy_##name(kh_##name##_t *h) \ + { \ + if (h) { \ + kfree((void *)h->keys); kfree(h->flags); \ + kfree((void *)h->vals); \ + kfree(h); \ + } \ + } \ + SCOPE void kh_clear_##name(kh_##name##_t *h) \ + { \ + if (h && h->flags) { \ + memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \ + h->size = h->n_occupied = 0; \ + } \ + } \ + SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ + { \ + if (h->n_buckets) { \ + khint_t k, i, last, mask, step = 0; \ + mask = h->n_buckets - 1; \ + k = __hash_func(key); i = k & mask; \ + last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + i = (i + (++step)) & mask; \ + if (i == last) return h->n_buckets; \ + } \ + return __ac_iseither(h->flags, i)? h->n_buckets : i; \ + } else return 0; \ + } \ + SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ + { /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \ + khint32_t *new_flags = 0; \ + khint_t j = 1; \ + { \ + kroundup32(new_n_buckets); \ + if (new_n_buckets < 4) new_n_buckets = 4; \ + if (h->size >= (new_n_buckets>>1) + (new_n_buckets>>2)) j = 0; /* requested size is too small */ \ + else { /* hash table size to be changed (shrink or expand); rehash */ \ + new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + if (!new_flags) return -1; \ + memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + if (h->n_buckets < new_n_buckets) { /* expand */ \ + khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (!new_keys) return -1; \ + h->keys = new_keys; \ + if (kh_is_map) { \ + khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ + if (!new_vals) return -1; \ + h->vals = new_vals; \ + } \ + } /* otherwise shrink */ \ + } \ + } \ + if (j) { /* rehashing is needed */ \ + for (j = 0; j != h->n_buckets; ++j) { \ + if (__ac_iseither(h->flags, j) == 0) { \ + khkey_t key = h->keys[j]; \ + khval_t val; \ + khint_t new_mask; \ + new_mask = new_n_buckets - 1; \ + if (kh_is_map) val = h->vals[j]; \ + __ac_set_isdel_true(h->flags, j); \ + while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ + khint_t k, i, step = 0; \ + k = __hash_func(key); \ + i = k & new_mask; \ + while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \ + __ac_set_isempty_false(new_flags, i); \ + if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \ + { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ + if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ + __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \ + } else { /* write the element and jump out of the loop */ \ + h->keys[i] = key; \ + if (kh_is_map) h->vals[i] = val; \ + break; \ + } \ + } \ + } \ + } \ + if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \ + h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ + } \ + kfree(h->flags); /* free the working space */ \ + h->flags = new_flags; \ + h->n_buckets = new_n_buckets; \ + h->n_occupied = h->size; \ + } \ + return 0; \ + } \ + SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ + { \ + khint_t x; \ + if (h->n_occupied >= (h->n_buckets>>2) + (h->n_buckets>>1)) { /* update the hash table */ \ + if (h->n_buckets > (h->size<<1)) { \ + if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \ + *ret = -1; return h->n_buckets; \ + } \ + } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \ + *ret = -1; return h->n_buckets; \ + } \ + } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ + { \ + khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \ + x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \ + if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \ + else { \ + last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + if (__ac_isdel(h->flags, i)) site = i; \ + i = (i + (++step)) & mask; \ + if (i == last) { x = site; break; } \ + } \ + if (x == h->n_buckets) { \ + if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \ + else x = i; \ + } \ + } \ + } \ + if (__ac_isempty(h->flags, x)) { /* not present at all */ \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; ++h->n_occupied; \ + *ret = 1; \ + } else if (__ac_isdel(h->flags, x)) { /* deleted */ \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; \ + *ret = 2; \ + } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \ + return x; \ + } \ + SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \ + { \ + if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ + __ac_set_isdel_true(h->flags, x); \ + --h->size; \ + } \ + } + +#define KHASH_DECLARE(name, khkey_t, khval_t) \ + __KHASH_TYPE(name, khkey_t, khval_t) \ + __KHASH_PROTOTYPES(name, khkey_t, khval_t) + +#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + __KHASH_TYPE(name, khkey_t, khval_t) \ + __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + +#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + KHASH_INIT2(name, static kh_inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + +/* --- BEGIN OF HASH FUNCTIONS --- */ + +/*! @function + @abstract Integer hash function + @param key The integer [khint32_t] + @return The hash value [khint_t] + */ +#define kh_int_hash_func(key) (khint32_t)(key) +/*! @function + @abstract Integer comparison function + */ +#define kh_int_hash_equal(a, b) ((a) == (b)) +/*! @function + @abstract 64-bit integer hash function + @param key The integer [khint64_t] + @return The hash value [khint_t] + */ +#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11) +/*! @function + @abstract 64-bit integer comparison function + */ +#define kh_int64_hash_equal(a, b) ((a) == (b)) +/*! @function + @abstract const char* hash function + @param s Pointer to a null terminated string + @return The hash value + */ +static kh_inline khint_t __ac_X31_hash_string(const char *s) +{ + khint_t h = (khint_t)*s; + if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s; + return h; +} +/*! @function + @abstract Another interface to const char* hash function + @param key Pointer to a null terminated string [const char*] + @return The hash value [khint_t] + */ +#define kh_str_hash_func(key) __ac_X31_hash_string(key) +/*! @function + @abstract Const char* comparison function + */ +#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) + +static kh_inline khint_t __ac_Wang_hash(khint_t key) +{ + key += ~(key << 15); + key ^= (key >> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); + return key; +} +#define kh_int_hash_func2(k) __ac_Wang_hash((khint_t)key) + +/* --- END OF HASH FUNCTIONS --- */ + +/* Other convenient macros... */ + +/*! + @abstract Type of the hash table. + @param name Name of the hash table [symbol] + */ +#define khash_t(name) kh_##name##_t + +/*! @function + @abstract Initiate a hash table. + @param name Name of the hash table [symbol] + @return Pointer to the hash table [khash_t(name)*] + */ +#define kh_init(name) kh_init_##name() + +/*! @function + @abstract Destroy a hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + */ +#define kh_destroy(name, h) kh_destroy_##name(h) + +/*! @function + @abstract Reset a hash table without deallocating memory. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + */ +#define kh_clear(name, h) kh_clear_##name(h) + +/*! @function + @abstract Resize a hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param s New size [khint_t] + */ +#define kh_resize(name, h, s) kh_resize_##name(h, s) + +/*! @function + @abstract Insert a key to the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @param r Extra return code: 0 if the key is present in the hash table; + 1 if the bucket is empty (never used); 2 if the element in + the bucket has been deleted [int*] + @return Iterator to the inserted element [khint_t] + */ +#define kh_put(name, h, k, r) kh_put_##name(h, k, r) + +/*! @function + @abstract Retrieve a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @return Iterator to the found element, or kh_end(h) if the element is absent [khint_t] + */ +#define kh_get(name, h, k) kh_get_##name(h, k) + +/*! @function + @abstract Remove a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Iterator to the element to be deleted [khint_t] + */ +#define kh_del(name, h, k) kh_del_##name(h, k) + +/*! @function + @abstract Test whether a bucket contains data. + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return 1 if containing data; 0 otherwise [int] + */ +#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x))) + +/*! @function + @abstract Get key given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Key [type of keys] + */ +#define kh_key(h, x) ((h)->keys[x]) + +/*! @function + @abstract Get value given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Value [type of values] + @discussion For hash sets, calling this results in segfault. + */ +#define kh_val(h, x) ((h)->vals[x]) + +/*! @function + @abstract Alias of kh_val() + */ +#define kh_value(h, x) ((h)->vals[x]) + +/*! @function + @abstract Get the start iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The start iterator [khint_t] + */ +#define kh_begin(h) (khint_t)(0) + +/*! @function + @abstract Get the end iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The end iterator [khint_t] + */ +#define kh_end(h) ((h)->n_buckets) + +/*! @function + @abstract Get the number of elements in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of elements in the hash table [khint_t] + */ +#define kh_size(h) ((h)->size) + +/*! @function + @abstract Get the number of buckets in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of buckets in the hash table [khint_t] + */ +#define kh_n_buckets(h) ((h)->n_buckets) + +/*! @function + @abstract Iterate over the entries in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @param kvar Variable to which key will be assigned + @param vvar Variable to which value will be assigned + @param code Block of code to execute + */ +#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \ + for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ + if (!kh_exist(h,__i)) continue; \ + (kvar) = kh_key(h,__i); \ + (vvar) = kh_val(h,__i); \ + code; \ + } } + +/*! @function + @abstract Iterate over the values in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @param vvar Variable to which value will be assigned + @param code Block of code to execute + */ +#define kh_foreach_value(h, vvar, code) { khint_t __i; \ + for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ + if (!kh_exist(h,__i)) continue; \ + (vvar) = kh_val(h,__i); \ + code; \ + } } + +/* More conenient interfaces */ + +/*! @function + @abstract Instantiate a hash set containing integer keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_INT(name) \ + KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT(name, khval_t) \ + KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing 64-bit integer keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_INT64(name) \ + KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing 64-bit integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT64(name, khval_t) \ + KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) + +typedef const char *kh_cstr_t; +/*! @function + @abstract Instantiate a hash map containing const char* keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_STR(name) \ + KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing const char* keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_STR(name, khval_t) \ + KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) + +#endif /* __AC_KHASH_H */ diff --git a/kmer.h b/kmer.h new file mode 100644 index 0000000..b53b7d2 --- /dev/null +++ b/kmer.h @@ -0,0 +1,106 @@ +#ifndef BFC_KMER_H +#define BFC_KMER_H + +#include + +typedef struct { + uint64_t x[4]; +} bfc_kmer_t; + +static inline void bfc_kmer_append(int k, uint64_t x[4], int c) +{ // IMPORTANT: 0 <= c < 4 + uint64_t mask = (1ULL<>1)) & mask; + x[2] = x[2]>>1 | (1ULL^(c&1))<<(k-1); + x[3] = x[3]>>1 | (1ULL^c>>1) <<(k-1); +} + +static inline void bfc_kmer_change(int k, uint64_t x[4], int d, int c) // d-bp from the 3'-end of k-mer; 0<=d>1)<>1)<<(k-1-d) | (x[3]&t); +} + +// Thomas Wang's integer hash functions. See for a snapshot. +static inline uint64_t bfc_hash_64(uint64_t key, uint64_t mask) +{ + key = (~key + (key << 21)) & mask; // key = (key << 21) - key - 1; + key = key ^ key >> 24; + key = ((key + (key << 3)) + (key << 8)) & mask; // key * 265 + key = key ^ key >> 14; + key = ((key + (key << 2)) + (key << 4)) & mask; // key * 21 + key = key ^ key >> 28; + key = (key + (key << 31)) & mask; + return key; +} + +static inline uint64_t bfc_hash_64_inv(uint64_t key, uint64_t mask) +{ + uint64_t tmp; + + // Invert key = key + (key << 31) + tmp = (key - (key << 31)); + key = (key - (tmp << 31)) & mask; + + // Invert key = key ^ (key >> 28) + tmp = key ^ key >> 28; + key = key ^ tmp >> 28; + + // Invert key *= 21 + key = (key * 14933078535860113213ull) & mask; + + // Invert key = key ^ (key >> 14) + tmp = key ^ key >> 14; + tmp = key ^ tmp >> 14; + tmp = key ^ tmp >> 14; + key = key ^ tmp >> 14; + + // Invert key *= 265 + key = (key * 15244667743933553977ull) & mask; + + // Invert key = key ^ (key >> 24) + tmp = key ^ key >> 24; + key = key ^ tmp >> 24; + + // Invert key = (~key) + (key << 21) + tmp = ~key; + tmp = ~(key - (tmp << 21)); + tmp = ~(key - (tmp << 21)); + key = ~(key - (tmp << 21)) & mask; + + return key; +} + +static inline uint64_t bfc_kmer_hash(int k, const uint64_t x[4], uint64_t h[2]) +{ + int t = k>>1, u = ((x[1]>>t&1) > (x[3]>>t&1)); // the middle base is always different + uint64_t mask = (1ULL<>l&1)<<1 | (y[0]>>l&1)]; + buf[k] = 0; + return buf; +} + +#endif diff --git a/kseq.h b/kseq.h new file mode 100644 index 0000000..84c1fa3 --- /dev/null +++ b/kseq.h @@ -0,0 +1,248 @@ +/* The MIT License + + Copyright (c) 2008, 2009, 2011 Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Last Modified: 05MAR2012 */ + +#ifndef AC_KSEQ_H +#define AC_KSEQ_H + +#include +#include +#include + +#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r +#define KS_SEP_TAB 1 // isspace() && !' ' +#define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows) +#define KS_SEP_MAX 2 + +#define __KS_TYPE(type_t) \ + typedef struct __kstream_t { \ + int begin, end; \ + int is_eof:2, bufsize:30; \ + type_t f; \ + unsigned char *buf; \ + } kstream_t; + +#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) +#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) + +#define __KS_BASIC(SCOPE, type_t, __bufsize) \ + SCOPE kstream_t *ks_init(type_t f) \ + { \ + kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ + ks->f = f; ks->bufsize = __bufsize; \ + ks->buf = (unsigned char*)malloc(__bufsize); \ + return ks; \ + } \ + SCOPE void ks_destroy(kstream_t *ks) \ + { \ + if (!ks) return; \ + free(ks->buf); \ + free(ks); \ + } + +#define __KS_INLINED(__read) \ + static inline int ks_getc(kstream_t *ks) \ + { \ + if (ks->is_eof && ks->begin >= ks->end) return -1; \ + if (ks->begin >= ks->end) { \ + ks->begin = 0; \ + ks->end = __read(ks->f, ks->buf, ks->bufsize); \ + if (ks->end < ks->bufsize) ks->is_eof = 1; \ + if (ks->end == 0) return -1; \ + } \ + return (int)ks->buf[ks->begin++]; \ + } \ + static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ + { return ks_getuntil2(ks, delimiter, str, dret, 0); } + +#ifndef KSTRING_T +#define KSTRING_T kstring_t +typedef struct __kstring_t { + size_t l, m; + char *s; +} kstring_t; +#endif + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#define __KS_GETUNTIL(SCOPE, __read) \ + SCOPE int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ + { \ + if (dret) *dret = 0; \ + str->l = append? str->l : 0; \ + if (ks->begin >= ks->end && ks->is_eof) return -1; \ + for (;;) { \ + int i; \ + if (ks->begin >= ks->end) { \ + if (!ks->is_eof) { \ + ks->begin = 0; \ + ks->end = __read(ks->f, ks->buf, ks->bufsize); \ + if (ks->end < ks->bufsize) ks->is_eof = 1; \ + if (ks->end == 0) break; \ + } else break; \ + } \ + if (delimiter == KS_SEP_LINE) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (ks->buf[i] == '\n') break; \ + } else if (delimiter > KS_SEP_MAX) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (ks->buf[i] == delimiter) break; \ + } else if (delimiter == KS_SEP_SPACE) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (isspace(ks->buf[i])) break; \ + } else if (delimiter == KS_SEP_TAB) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ + } else i = 0; /* never come to here! */ \ + if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \ + str->m = str->l + (i - ks->begin) + 1; \ + kroundup32(str->m); \ + str->s = (char*)realloc(str->s, str->m); \ + } \ + memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ + str->l = str->l + (i - ks->begin); \ + ks->begin = i + 1; \ + if (i < ks->end) { \ + if (dret) *dret = ks->buf[i]; \ + break; \ + } \ + } \ + if (str->s == 0) { \ + str->m = 1; \ + str->s = (char*)calloc(1, 1); \ + } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \ + str->s[str->l] = '\0'; \ + return str->l; \ + } + +#define KSTREAM_INIT2(SCOPE, type_t, __read, __bufsize) \ + __KS_TYPE(type_t) \ + __KS_BASIC(SCOPE, type_t, __bufsize) \ + __KS_GETUNTIL(SCOPE, __read) \ + __KS_INLINED(__read) + +#define KSTREAM_INIT(type_t, __read, __bufsize) KSTREAM_INIT2(static, type_t, __read, __bufsize) + +#define KSTREAM_DECLARE(type_t, __read) \ + __KS_TYPE(type_t) \ + extern int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append); \ + extern kstream_t *ks_init(type_t f); \ + extern void ks_destroy(kstream_t *ks); \ + __KS_INLINED(__read) + +/****************** + * FASTA/Q parser * + ******************/ + +#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0) + +#define __KSEQ_BASIC(SCOPE, type_t) \ + SCOPE kseq_t *kseq_init(type_t fd) \ + { \ + kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ + s->f = ks_init(fd); \ + return s; \ + } \ + SCOPE void kseq_destroy(kseq_t *ks) \ + { \ + if (!ks) return; \ + free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ + ks_destroy(ks->f); \ + free(ks); \ + } + +/* Return value: + >=0 length of the sequence (normal) + -1 end-of-file + -2 truncated quality string + */ +#define __KSEQ_READ(SCOPE) \ + SCOPE int kseq_read(kseq_t *seq) \ + { \ + int c; \ + kstream_t *ks = seq->f; \ + if (seq->last_char == 0) { /* then jump to the next header line */ \ + while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ + if (c == -1) return -1; /* end of file */ \ + seq->last_char = c; \ + } /* else: the first header char has been read in the previous call */ \ + seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \ + if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \ + if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \ + if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \ + seq->seq.m = 256; \ + seq->seq.s = (char*)malloc(seq->seq.m); \ + } \ + while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ + if (c == '\n') continue; /* skip empty lines */ \ + seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ + ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \ + } \ + if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ + if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ + seq->seq.m = seq->seq.l + 2; \ + kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \ + seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ + } \ + seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ + if (c != '+') return seq->seq.l; /* FASTA */ \ + if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \ + seq->qual.m = seq->seq.m; \ + seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ + } \ + while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ + if (c == -1) return -2; /* error: no quality string */ \ + while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \ + seq->last_char = 0; /* we have not come to the next header line */ \ + if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \ + return seq->seq.l; \ + } + +#define __KSEQ_TYPE(type_t) \ + typedef struct { \ + kstring_t name, comment, seq, qual; \ + int last_char; \ + kstream_t *f; \ + } kseq_t; + +#define KSEQ_INIT2(SCOPE, type_t, __read) \ + KSTREAM_INIT2(SCOPE, type_t, __read, 16384) \ + __KSEQ_TYPE(type_t) \ + __KSEQ_BASIC(SCOPE, type_t) \ + __KSEQ_READ(SCOPE) + +#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read) + +#define KSEQ_DECLARE(type_t) \ + __KS_TYPE(type_t) \ + __KSEQ_TYPE(type_t) \ + extern kseq_t *kseq_init(type_t fd); \ + void kseq_destroy(kseq_t *ks); \ + int kseq_read(kseq_t *seq); + +#endif diff --git a/ksort.h b/ksort.h new file mode 100644 index 0000000..e659b22 --- /dev/null +++ b/ksort.h @@ -0,0 +1,309 @@ +/* The MIT License + + Copyright (c) 2008, 2011 Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* + 2011-04-10 (0.1.6): + + * Added sample + + 2011-03 (0.1.5): + + * Added shuffle/permutation + + 2008-11-16 (0.1.4): + + * Fixed a bug in introsort() that happens in rare cases. + + 2008-11-05 (0.1.3): + + * Fixed a bug in introsort() for complex comparisons. + + * Fixed a bug in mergesort(). The previous version is not stable. + + 2008-09-15 (0.1.2): + + * Accelerated introsort. On my Mac (not on another Linux machine), + my implementation is as fast as std::sort on random input. + + * Added combsort and in introsort, switch to combsort if the + recursion is too deep. + + 2008-09-13 (0.1.1): + + * Added k-small algorithm + + 2008-09-05 (0.1.0): + + * Initial version + +*/ + +#ifndef AC_KSORT_H +#define AC_KSORT_H + +#include +#include + +typedef struct { + void *left, *right; + int depth; +} ks_isort_stack_t; + +#define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; } + +#define KSORT_INIT(name, type_t, __sort_lt) \ + void ks_mergesort_##name(size_t n, type_t array[], type_t temp[]) \ + { \ + type_t *a2[2], *a, *b; \ + int curr, shift; \ + \ + a2[0] = array; \ + a2[1] = temp? temp : (type_t*)malloc(sizeof(type_t) * n); \ + for (curr = 0, shift = 0; (1ul<> 1; \ + if (__sort_lt(tmp, l[i])) break; \ + l[k] = l[i]; k = i; \ + } \ + l[k] = tmp; \ + } \ + void ks_heapmake_##name(size_t lsize, type_t l[]) \ + { \ + size_t i; \ + for (i = (lsize >> 1) - 1; i != (size_t)(-1); --i) \ + ks_heapdown_##name(i, lsize, l); \ + } \ + void ks_heapsort_##name(size_t lsize, type_t l[]) \ + { \ + size_t i; \ + for (i = lsize - 1; i > 0; --i) { \ + type_t tmp; \ + tmp = *l; *l = l[i]; l[i] = tmp; ks_heapdown_##name(0, i, l); \ + } \ + } \ + static inline void __ks_insertsort_##name(type_t *s, type_t *t) \ + { \ + type_t *i, *j, swap_tmp; \ + for (i = s + 1; i < t; ++i) \ + for (j = i; j > s && __sort_lt(*j, *(j-1)); --j) { \ + swap_tmp = *j; *j = *(j-1); *(j-1) = swap_tmp; \ + } \ + } \ + void ks_combsort_##name(size_t n, type_t a[]) \ + { \ + const double shrink_factor = 1.2473309501039786540366528676643; \ + int do_swap; \ + size_t gap = n; \ + type_t tmp, *i, *j; \ + do { \ + if (gap > 2) { \ + gap = (size_t)(gap / shrink_factor); \ + if (gap == 9 || gap == 10) gap = 11; \ + } \ + do_swap = 0; \ + for (i = a; i < a + n - gap; ++i) { \ + j = i + gap; \ + if (__sort_lt(*j, *i)) { \ + tmp = *i; *i = *j; *j = tmp; \ + do_swap = 1; \ + } \ + } \ + } while (do_swap || gap > 2); \ + if (gap != 1) __ks_insertsort_##name(a, a + n); \ + } \ + void ks_introsort_##name(size_t n, type_t a[]) \ + { \ + int d; \ + ks_isort_stack_t *top, *stack; \ + type_t rp, swap_tmp; \ + type_t *s, *t, *i, *j, *k; \ + \ + if (n < 1) return; \ + else if (n == 2) { \ + if (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = swap_tmp; } \ + return; \ + } \ + for (d = 2; 1ul<>1) + 1; \ + if (__sort_lt(*k, *i)) { \ + if (__sort_lt(*k, *j)) k = j; \ + } else k = __sort_lt(*j, *i)? i : j; \ + rp = *k; \ + if (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; } \ + for (;;) { \ + do ++i; while (__sort_lt(*i, rp)); \ + do --j; while (i <= j && __sort_lt(rp, *j)); \ + if (j <= i) break; \ + swap_tmp = *i; *i = *j; *j = swap_tmp; \ + } \ + swap_tmp = *i; *i = *t; *t = swap_tmp; \ + if (i-s > t-i) { \ + if (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \ + s = t-i > 16? i+1 : t; \ + } else { \ + if (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \ + t = i-s > 16? i-1 : s; \ + } \ + } else { \ + if (top == stack) { \ + free(stack); \ + __ks_insertsort_##name(a, a+n); \ + return; \ + } else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \ + } \ + } \ + } \ + /* This function is adapted from: http://ndevilla.free.fr/median/ */ \ + /* 0 <= kk < n */ \ + type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk) \ + { \ + type_t *low, *high, *k, *ll, *hh, *mid; \ + low = arr; high = arr + n - 1; k = arr + kk; \ + for (;;) { \ + if (high <= low) return *k; \ + if (high == low + 1) { \ + if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ + return *k; \ + } \ + mid = low + (high - low) / 2; \ + if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \ + if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ + if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low); \ + KSORT_SWAP(type_t, *mid, *(low+1)); \ + ll = low + 1; hh = high; \ + for (;;) { \ + do ++ll; while (__sort_lt(*ll, *low)); \ + do --hh; while (__sort_lt(*low, *hh)); \ + if (hh < ll) break; \ + KSORT_SWAP(type_t, *ll, *hh); \ + } \ + KSORT_SWAP(type_t, *low, *hh); \ + if (hh <= k) low = ll; \ + if (hh >= k) high = hh - 1; \ + } \ + } \ + void ks_shuffle_##name(size_t n, type_t a[]) \ + { \ + int i, j; \ + for (i = n; i > 1; --i) { \ + type_t tmp; \ + j = (int)(drand48() * i); \ + tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp; \ + } \ + } \ + void ks_sample_##name(size_t n, size_t r, type_t a[]) /* FIXME: NOT TESTED!!! */ \ + { /* reference: http://code.activestate.com/recipes/272884/ */ \ + int i, k, pop = n; \ + for (i = (int)r, k = 0; i >= 0; --i) { \ + double z = 1., x = drand48(); \ + type_t tmp; \ + while (x < z) z -= z * i / (pop--); \ + if (k != n - pop - 1) tmp = a[k], a[k] = a[n-pop-1], a[n-pop-1] = tmp; \ + ++k; \ + } \ + } + +#define ks_mergesort(name, n, a, t) ks_mergesort_##name(n, a, t) +#define ks_introsort(name, n, a) ks_introsort_##name(n, a) +#define ks_combsort(name, n, a) ks_combsort_##name(n, a) +#define ks_heapsort(name, n, a) ks_heapsort_##name(n, a) +#define ks_heapmake(name, n, a) ks_heapmake_##name(n, a) +#define ks_heapadjust(name, i, n, a) ks_heapadjust_##name(i, n, a) +#define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k) +#define ks_shuffle(name, n, a) ks_shuffle_##name(n, a) + +#define ks_lt_generic(a, b) ((a) < (b)) +#define ks_lt_str(a, b) (strcmp((a), (b)) < 0) + +typedef const char *ksstr_t; + +#define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic) +#define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str) + +#endif diff --git a/kstring.h b/kstring.h new file mode 100644 index 0000000..b179a8c --- /dev/null +++ b/kstring.h @@ -0,0 +1,169 @@ +/* The MIT License + + Copyright (c) by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +#ifndef KSTRING_H +#define KSTRING_H + +#include +#include +#include + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#ifndef KSTRING_T +#define KSTRING_T kstring_t +typedef struct __kstring_t { + size_t l, m; + char *s; +} kstring_t; +#endif + +typedef struct { + uint64_t tab[4]; + int sep, finished; + const char *p; // end of the current token +} ks_tokaux_t; + +#ifdef __cplusplus +extern "C" { +#endif + + int ksprintf(kstring_t *s, const char *fmt, ...); + int ksprintf_fast(kstring_t *s, const char *fmt, ...); + int ksplit_core(char *s, int delimiter, int *_max, int **_offsets); + char *kstrstr(const char *str, const char *pat, int **_prep); + char *kstrnstr(const char *str, const char *pat, int n, int **_prep); + void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep); + + /* kstrtok() is similar to strtok_r() except that str is not + * modified and both str and sep can be NULL. For efficiency, it is + * actually recommended to set both to NULL in the subsequent calls + * if sep is not changed. */ + char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux); + +#ifdef __cplusplus +} +#endif + +static inline void ks_resize(kstring_t *s, size_t size) +{ + if (s->m < size) { + s->m = size; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } +} + +static inline int kputsn(const char *p, int l, kstring_t *s) +{ + if (s->l + l + 1 >= s->m) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + memcpy(s->s + s->l, p, l); + s->l += l; + s->s[s->l] = 0; + return l; +} + +static inline int kputs(const char *p, kstring_t *s) +{ + return kputsn(p, strlen(p), s); +} + +static inline int kputc(int c, kstring_t *s) +{ + if (s->l + 1 >= s->m) { + s->m = s->l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + s->s[s->l++] = c; + s->s[s->l] = 0; + return c; +} + +static inline int kputw(int c, kstring_t *s) +{ + char buf[16]; + int l, x; + if (c == 0) return kputc('0', s); + for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0'; + if (c < 0) buf[l++] = '-'; + if (s->l + l + 1 >= s->m) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x]; + s->s[s->l] = 0; + return 0; +} + +static inline int kputuw(unsigned c, kstring_t *s) +{ + char buf[16]; + int l, i; + unsigned x; + if (c == 0) return kputc('0', s); + for (l = 0, x = c; x > 0; x /= 10) buf[l++] = x%10 + '0'; + if (s->l + l + 1 >= s->m) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; + s->s[s->l] = 0; + return 0; +} + +static inline int kputl(long c, kstring_t *s) +{ + char buf[32]; + long l, x; + if (c == 0) return kputc('0', s); + for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0'; + if (c < 0) buf[l++] = '-'; + if (s->l + l + 1 >= s->m) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x]; + s->s[s->l] = 0; + return 0; +} + +static inline int *ksplit(kstring_t *s, int delimiter, int *n) +{ + int max = 0, *offsets = 0; + *n = ksplit_core(s->s, delimiter, &max, &offsets); + return offsets; +} + +#endif diff --git a/ksw.c b/ksw.c new file mode 100644 index 0000000..71f2635 --- /dev/null +++ b/ksw.c @@ -0,0 +1,352 @@ +/* The MIT License + + Copyright (c) 2011 by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +#include +#include +#include +#include "ksw.h" + +#ifdef __GNUC__ +#define LIKELY(x) __builtin_expect((x),1) +#define UNLIKELY(x) __builtin_expect((x),0) +#else +#define LIKELY(x) (x) +#define UNLIKELY(x) (x) +#endif + +const kswr_t g_defr = { 0, -1, -1, -1, -1, -1, -1 }; + +struct _kswq_t { + int qlen, slen; + uint8_t shift, mdiff, max, size; + __m128i *qp, *H0, *H1, *E, *Hmax; +}; + +/** + * Initialize the query data structure + * + * @param size Number of bytes used to store a score; valid valures are 1 or 2 + * @param qlen Length of the query sequence + * @param query Query sequence + * @param m Size of the alphabet + * @param mat Scoring matrix in a one-dimension array + * + * @return Query data structure + */ +kswq_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat) +{ + kswq_t *q; + int slen, a, tmp, p; + + size = size > 1? 2 : 1; + p = 8 * (3 - size); // # values per __m128i + slen = (qlen + p - 1) / p; // segmented length + q = (kswq_t*)malloc(sizeof(kswq_t) + 256 + 16 * slen * (m + 4)); // a single block of memory + q->qp = (__m128i*)(((size_t)q + sizeof(kswq_t) + 15) >> 4 << 4); // align memory + q->H0 = q->qp + slen * m; + q->H1 = q->H0 + slen; + q->E = q->H1 + slen; + q->Hmax = q->E + slen; + q->slen = slen; q->qlen = qlen; q->size = size; + // compute shift + tmp = m * m; + for (a = 0, q->shift = 127, q->mdiff = 0; a < tmp; ++a) { // find the minimum and maximum score + if (mat[a] < (int8_t)q->shift) q->shift = mat[a]; + if (mat[a] > (int8_t)q->mdiff) q->mdiff = mat[a]; + } + q->max = q->mdiff; + q->shift = 256 - q->shift; // NB: q->shift is uint8_t + q->mdiff += q->shift; // this is the difference between the min and max scores + // An example: p=8, qlen=19, slen=3 and segmentation: + // {{0,3,6,9,12,15,18,-1},{1,4,7,10,13,16,-1,-1},{2,5,8,11,14,17,-1,-1}} + if (size == 1) { + int8_t *t = (int8_t*)q->qp; + for (a = 0; a < m; ++a) { + int i, k, nlen = slen * p; + const int8_t *ma = mat + a * m; + for (i = 0; i < slen; ++i) + for (k = i; k < nlen; k += slen) // p iterations + *t++ = (k >= qlen? 0 : ma[query[k]]) + q->shift; + } + } else { + int16_t *t = (int16_t*)q->qp; + for (a = 0; a < m; ++a) { + int i, k, nlen = slen * p; + const int8_t *ma = mat + a * m; + for (i = 0; i < slen; ++i) + for (k = i; k < nlen; k += slen) // p iterations + *t++ = (k >= qlen? 0 : ma[query[k]]); + } + } + return q; +} + +kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, int xtra) // the first gap costs -(_o+_e) +{ + int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc; + uint64_t *b; + __m128i zero, gapoe, gape, shift, *H0, *H1, *E, *Hmax; + kswr_t r; + +#define __max_16(ret, xx) do { \ + (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 8)); \ + (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 4)); \ + (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 2)); \ + (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 1)); \ + (ret) = _mm_extract_epi16((xx), 0) & 0x00ff; \ + } while (0) + + // initialization + r = g_defr; + minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000; + endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000; + m_b = n_b = 0; b = 0; + zero = _mm_set1_epi32(0); + gapoe = _mm_set1_epi8(_gapo + _gape); + gape = _mm_set1_epi8(_gape); + shift = _mm_set1_epi8(q->shift); + H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax; + slen = q->slen; + for (i = 0; i < slen; ++i) { + _mm_store_si128(E + i, zero); + _mm_store_si128(H0 + i, zero); + _mm_store_si128(Hmax + i, zero); + } + // the core loop + for (i = 0; i < tlen; ++i) { + int j, k, cmp, imax; + __m128i e, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector + h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example + h = _mm_slli_si128(h, 1); // h=H(i-1,-1); << instead of >> because x64 is little-endian + for (j = 0; LIKELY(j < slen); ++j) { + /* SW cells are computed in the following order: + * H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)} + * E(i+1,j) = max{H(i,j)-q, E(i,j)-r} + * F(i,j+1) = max{H(i,j)-q, F(i,j)-r} + */ + // compute H'(i,j); note that at the beginning, h=H'(i-1,j-1) + h = _mm_adds_epu8(h, _mm_load_si128(S + j)); + h = _mm_subs_epu8(h, shift); // h=H'(i-1,j-1)+S(i,j) + e = _mm_load_si128(E + j); // e=E'(i,j) + h = _mm_max_epu8(h, e); + h = _mm_max_epu8(h, f); // h=H'(i,j) + max = _mm_max_epu8(max, h); // set max + _mm_store_si128(H1 + j, h); // save to H'(i,j) + // now compute E'(i+1,j) + h = _mm_subs_epu8(h, gapoe); // h=H'(i,j)-gapo + e = _mm_subs_epu8(e, gape); // e=E'(i,j)-gape + e = _mm_max_epu8(e, h); // e=E'(i+1,j) + _mm_store_si128(E + j, e); // save to E'(i+1,j) + // now compute F'(i,j+1) + f = _mm_subs_epu8(f, gape); + f = _mm_max_epu8(f, h); + // get H'(i-1,j) and prepare for the next j + h = _mm_load_si128(H0 + j); // h=H'(i-1,j) + } + // NB: we do not need to set E(i,j) as we disallow adjecent insertion and then deletion + for (k = 0; LIKELY(k < 16); ++k) { // this block mimics SWPS3; NB: H(i,j) updated in the lazy-F loop cannot exceed max + f = _mm_slli_si128(f, 1); + for (j = 0; LIKELY(j < slen); ++j) { + h = _mm_load_si128(H1 + j); + h = _mm_max_epu8(h, f); // h=H'(i,j) + _mm_store_si128(H1 + j, h); + h = _mm_subs_epu8(h, gapoe); + f = _mm_subs_epu8(f, gape); + cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_subs_epu8(f, h), zero)); + if (UNLIKELY(cmp == 0xffff)) goto end_loop16; + } + } +end_loop16: + //int k;for (k=0;k<16;++k)printf("%d ", ((uint8_t*)&max)[k]);printf("\n"); + __max_16(imax, max); // imax is the maximum number in max + if (imax >= minsc) { // write the b array; this condition adds branching unfornately + if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { // then append + if (n_b == m_b) { + m_b = m_b? m_b<<1 : 8; + b = (uint64_t*)realloc(b, 8 * m_b); + } + b[n_b++] = (uint64_t)imax<<32 | i; + } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last + } + if (imax > gmax) { + gmax = imax; te = i; // te is the end position on the target + for (j = 0; LIKELY(j < slen); ++j) // keep the H1 vector + _mm_store_si128(Hmax + j, _mm_load_si128(H1 + j)); + if (gmax + q->shift >= 255 || gmax >= endsc) break; + } + S = H1; H1 = H0; H0 = S; // swap H0 and H1 + } + r.score = gmax + q->shift < 255? gmax : 255; + r.te = te; + if (r.score != 255) { // get a->qe, the end of query match; find the 2nd best score + int max = -1, low, high, qlen = slen * 16; + uint8_t *t = (uint8_t*)Hmax; + for (i = 0; i < qlen; ++i, ++t) + if ((int)*t > max) max = *t, r.qe = i / 16 + i % 16 * slen; + //printf("%d,%d\n", max, gmax); + if (b) { + i = (r.score + q->max - 1) / q->max; + low = te - i; high = te + i; + for (i = 0; i < n_b; ++i) { + int e = (int32_t)b[i]; + if ((e < low || e > high) && b[i]>>32 > (uint32_t)r.score2) + r.score2 = b[i]>>32, r.te2 = e; + } + } + } + free(b); + return r; +} + +kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, int xtra) // the first gap costs -(_o+_e) +{ + int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc; + uint64_t *b; + __m128i zero, gapoe, gape, *H0, *H1, *E, *Hmax; + kswr_t r; + +#define __max_8(ret, xx) do { \ + (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \ + (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 4)); \ + (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 2)); \ + (ret) = _mm_extract_epi16((xx), 0); \ + } while (0) + + // initialization + r = g_defr; + minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000; + endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000; + m_b = n_b = 0; b = 0; + zero = _mm_set1_epi32(0); + gapoe = _mm_set1_epi16(_gapo + _gape); + gape = _mm_set1_epi16(_gape); + H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax; + slen = q->slen; + for (i = 0; i < slen; ++i) { + _mm_store_si128(E + i, zero); + _mm_store_si128(H0 + i, zero); + _mm_store_si128(Hmax + i, zero); + } + // the core loop + for (i = 0; i < tlen; ++i) { + int j, k, imax; + __m128i e, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector + h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example + h = _mm_slli_si128(h, 2); + for (j = 0; LIKELY(j < slen); ++j) { + h = _mm_adds_epi16(h, *S++); + e = _mm_load_si128(E + j); + h = _mm_max_epi16(h, e); + h = _mm_max_epi16(h, f); + max = _mm_max_epi16(max, h); + _mm_store_si128(H1 + j, h); + h = _mm_subs_epu16(h, gapoe); + e = _mm_subs_epu16(e, gape); + e = _mm_max_epi16(e, h); + _mm_store_si128(E + j, e); + f = _mm_subs_epu16(f, gape); + f = _mm_max_epi16(f, h); + h = _mm_load_si128(H0 + j); + } + for (k = 0; LIKELY(k < 16); ++k) { + f = _mm_slli_si128(f, 2); + for (j = 0; LIKELY(j < slen); ++j) { + h = _mm_load_si128(H1 + j); + h = _mm_max_epi16(h, f); + _mm_store_si128(H1 + j, h); + h = _mm_subs_epu16(h, gapoe); + f = _mm_subs_epu16(f, gape); + if(UNLIKELY(!_mm_movemask_epi8(_mm_cmpgt_epi16(f, h)))) goto end_loop8; + } + } +end_loop8: + __max_8(imax, max); + if (imax >= minsc) { + if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { + if (n_b == m_b) { + m_b = m_b? m_b<<1 : 8; + b = (uint64_t*)realloc(b, 8 * m_b); + } + b[n_b++] = (uint64_t)imax<<32 | i; + } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last + } + if (imax > gmax) { + gmax = imax; te = i; + for (j = 0; LIKELY(j < slen); ++j) + _mm_store_si128(Hmax + j, _mm_load_si128(H1 + j)); + if (gmax >= endsc) break; + } + S = H1; H1 = H0; H0 = S; + } + r.score = gmax; r.te = te; + { + int max = -1, low, high, qlen = slen * 8; + uint16_t *t = (uint16_t*)Hmax; + for (i = 0, r.qe = -1; i < qlen; ++i, ++t) + if ((int)*t > max) max = *t, r.qe = i / 8 + i % 8 * slen; + if (b) { + i = (r.score + q->max - 1) / q->max; + low = te - i; high = te + i; + for (i = 0; i < n_b; ++i) { + int e = (int32_t)b[i]; + if ((e < low || e > high) && b[i]>>32 > (uint32_t)r.score2) + r.score2 = b[i]>>32, r.te2 = e; + } + } + } + free(b); + return r; +} + +static void revseq(int l, uint8_t *s) +{ + int i, t; + for (i = 0; i < l>>1; ++i) + t = s[i], s[i] = s[l - 1 - i], s[l - 1 - i] = t; +} + +kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry) +{ + int size; + kswq_t *q; + kswr_t r, rr; + kswr_t (*func)(kswq_t*, int, const uint8_t*, int, int, int); + + q = (qry && *qry)? *qry : ksw_qinit((xtra&KSW_XBYTE)? 1 : 2, qlen, query, m, mat); + if (qry && *qry == 0) *qry = q; + func = q->size == 2? ksw_i16 : ksw_u8; + size = q->size; + r = func(q, tlen, target, gapo, gape, xtra); + if (qry == 0) free(q); + if ((xtra&KSW_XSTART) == 0 || ((xtra&KSW_XSUBO) && r.score < (xtra&0xffff))) return r; + revseq(r.qe + 1, query); revseq(r.te + 1, target); // +1 because qe/te points to the exact end, not the position after the end + q = ksw_qinit(size, r.qe + 1, query, m, mat); + rr = func(q, tlen, target, gapo, gape, KSW_XSTOP | r.score); + revseq(r.qe + 1, query); revseq(r.te + 1, target); + free(q); + if (r.score == rr.score) + r.tb = r.te - rr.te, r.qb = r.qe - rr.qe; + return r; +} diff --git a/ksw.h b/ksw.h new file mode 100644 index 0000000..160cee8 --- /dev/null +++ b/ksw.h @@ -0,0 +1,71 @@ +#ifndef __AC_KSW_H +#define __AC_KSW_H + +#include + +#define KSW_XBYTE 0x10000 +#define KSW_XSTOP 0x20000 +#define KSW_XSUBO 0x40000 +#define KSW_XSTART 0x80000 + +struct _kswq_t; +typedef struct _kswq_t kswq_t; + +typedef struct { + int score; // best score + int te, qe; // target end and query end + int score2, te2; // second best score and ending position on the target + int tb, qb; // target start and query start +} kswr_t; + +#ifdef __cplusplus +extern "C" { +#endif + + /** + * Aligning two sequences + * + * @param qlen length of the query sequence (typically +#include +#include + +/************ + * kt_for() * + ************/ + +struct kt_for_t; + +typedef struct { + struct kt_for_t *t; + long i; +} ktf_worker_t; + +typedef struct kt_for_t { + int n_threads; + long n; + ktf_worker_t *w; + void (*func)(void*,long,int); + void *data; +} kt_for_t; + +static inline long steal_work(kt_for_t *t) +{ + int i, min_i = -1; + long k, min = LONG_MAX; + for (i = 0; i < t->n_threads; ++i) + if (min > t->w[i].i) min = t->w[i].i, min_i = i; + k = __sync_fetch_and_add(&t->w[min_i].i, t->n_threads); + return k >= t->n? -1 : k; +} + +static void *ktf_worker(void *data) +{ + ktf_worker_t *w = (ktf_worker_t*)data; + long i; + for (;;) { + i = __sync_fetch_and_add(&w->i, w->t->n_threads); + if (i >= w->t->n) break; + w->t->func(w->t->data, i, w - w->t->w); + } + while ((i = steal_work(w->t)) >= 0) + w->t->func(w->t->data, i, w - w->t->w); + pthread_exit(0); +} + +void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n) +{ + if (n_threads > 1) { + int i; + kt_for_t t; + pthread_t *tid; + t.func = func, t.data = data, t.n_threads = n_threads, t.n = n; + t.w = (ktf_worker_t*)alloca(n_threads * sizeof(ktf_worker_t)); + tid = (pthread_t*)alloca(n_threads * sizeof(pthread_t)); + for (i = 0; i < n_threads; ++i) + t.w[i].t = &t, t.w[i].i = i; + for (i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, ktf_worker, &t.w[i]); + for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0); + } else { + long j; + for (j = 0; j < n; ++j) func(data, j, 0); + } +} diff --git a/kvec.h b/kvec.h new file mode 100644 index 0000000..632fce4 --- /dev/null +++ b/kvec.h @@ -0,0 +1,110 @@ +/* The MIT License + + Copyright (c) 2008, by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* + An example: + +#include "kvec.h" +int main() { + kvec_t(int) array; + kv_init(array); + kv_push(int, array, 10); // append + kv_a(int, array, 20) = 5; // dynamic + kv_A(array, 20) = 4; // static + kv_destroy(array); + return 0; +} +*/ + +/* + 2008-09-22 (0.1.0): + + * The initial version. + +*/ + +#ifndef AC_KVEC_H +#define AC_KVEC_H + +#include + +#define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) + +#define kvec_t(type) struct { size_t n, m; type *a; } +#define kv_init(v) ((v).n = (v).m = 0, (v).a = 0) +#define kv_destroy(v) free((v).a) +#define kv_A(v, i) ((v).a[(i)]) +#define kv_pop(v) ((v).a[--(v).n]) +#define kv_size(v) ((v).n) +#define kv_max(v) ((v).m) + +#define kv_resize(type, v, s) do { \ + if ((v).m < (s)) { \ + (v).m = (s); \ + kv_roundup32((v).m); \ + (v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \ + } \ + } while (0) + +#define kv_copy(type, v1, v0) do { \ + if ((v1).m < (v0).n) kv_resize(type, v1, (v0).n); \ + (v1).n = (v0).n; \ + memcpy((v1).a, (v0).a, sizeof(type) * (v0).n); \ + } while (0) \ + +#define kv_push(type, v, x) do { \ + if ((v).n == (v).m) { \ + (v).m = (v).m? (v).m<<1 : 2; \ + (v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \ + } \ + (v).a[(v).n++] = (x); \ + } while (0) + +#define kv_pushp(type, v, p) do { \ + if ((v).n == (v).m) { \ + (v).m = (v).m? (v).m<<1 : 2; \ + (v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \ + } \ + *(p) = &(v).a[(v).n++]; \ + } while (0) + +#define kv_a(type, v, i) ((v).m <= (size_t)(i)? \ + ((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \ + (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ + : (v).n <= (size_t)(i)? (v).n = (i) \ + : 0), (v).a[(i)] + +#define kv_reverse(type, v, start) do { \ + if ((v).m > 0 && (v).n > (start)) { \ + size_t __i, __end = (v).n - (start); \ + type *__a = (v).a + (start); \ + for (__i = 0; __i < __end>>1; ++__i) { \ + type __t = __a[__end - 1 - __i]; \ + __a[__end - 1 - __i] = __a[__i]; __a[__i] = __t; \ + } \ + } \ + } while (0) + +#endif diff --git a/mag.c b/mag.c new file mode 100644 index 0000000..acdca29 --- /dev/null +++ b/mag.c @@ -0,0 +1,620 @@ +/* remaining problems: + + 1. multiedges due to tandem repeats +*/ + +#include +#include +#include +#include +#include "mag.h" +#include "kvec.h" +#include "internal.h" +#include "kseq.h" +KSEQ_DECLARE(gzFile) + +#include "khash.h" +KHASH_INIT2(64,, khint64_t, uint64_t, 1, kh_int64_hash_func, kh_int64_hash_equal) + +typedef khash_t(64) hash64_t; + +#define ku128_xlt(a, b) ((a).x < (b).x || ((a).x == (b).x && (a).y > (b).y)) +#define ku128_ylt(a, b) ((int64_t)(a).y > (int64_t)(b).y) +#include "ksort.h" +KSORT_INIT(128x, ku128_t, ku128_xlt) +KSORT_INIT(128y, ku128_t, ku128_ylt) +KSORT_INIT_GENERIC(uint64_t) + +#define edge_mark_del(_x) ((_x).x = (uint64_t)-2, (_x).y = 0) +#define edge_is_del(_x) ((_x).x == (uint64_t)-2 || (_x).y == 0) + +int fm_verbose = 1; + +/********************* + * Vector operations * + *********************/ + +static inline void v128_clean(ku128_v *r) +{ + int i, j; + for (i = j = 0; i < r->n; ++i) + if (!edge_is_del(r->a[i])) { // keep this arc + if (j != i) r->a[j++] = r->a[i]; + else ++j; + } + r->n = j; +} + +void mag_v128_clean(ku128_v *r) +{ + v128_clean(r); +} + +static inline void v128_rmdup(ku128_v *r) +{ + int l, cnt; + uint64_t x; + if (r->n > 1) ks_introsort(128x, r->n, r->a); + for (l = cnt = 0; l < r->n; ++l) // jump to the first node to be retained + if (edge_is_del(r->a[l])) ++cnt; + else break; + if (l == r->n) { // no good arcs + r->n = 0; + return; + } + x = r->a[l].x; + for (++l; l < r->n; ++l) { // mark duplicated node + if (edge_is_del(r->a[l]) || r->a[l].x == x) + edge_mark_del(r->a[l]), ++cnt; + else x = r->a[l].x; + } + if (cnt) v128_clean(r); +} + +static inline void v128_cap(ku128_v *r, int max) +{ + int i, thres; + if (r->n <= max) return; + ks_introsort(128y, r->n, r->a); + thres = r->a[max].y; + for (i = 0; i < r->n; ++i) + if (r->a[i].y == thres) break; + r->n = i; +} + +/************************************************* + * Mapping between vertex id and interval end id * + *************************************************/ + +void mag_g_build_hash(mag_t *g) +{ + long i; + int j, ret; + hash64_t *h; + h = kh_init(64); + for (i = 0; i < g->v.n; ++i) { + const magv_t *p = &g->v.a[i]; + for (j = 0; j < 2; ++j) { + khint_t k = kh_put(64, h, p->k[j], &ret); + if (ret == 0) { + if (fm_verbose >= 2) + fprintf(stderr, "[W::%s] terminal %ld is duplicated.\n", __func__, (long)p->k[j]); + kh_val(h, k) = (uint64_t)-1; + } else kh_val(h, k) = i<<1|j; + } + } + g->h = h; +} + +static inline uint64_t tid2idd(hash64_t *h, uint64_t tid) +{ + khint_t k = kh_get(64, h, tid); + assert(k != kh_end(h)); + return kh_val(h, k); +} + +uint64_t mag_tid2idd(void *h, uint64_t tid) // exported version +{ + return tid2idd(h, tid); +} + +void mag_g_amend(mag_t *g) +{ + int i, j, l, ll; + for (i = 0; i < g->v.n; ++i) { + magv_t *p = &g->v.a[i]; + ku128_v *r; + for (j = 0; j < 2; ++j) { + for (l = 0; l < p->nei[j].n; ++l) { + khint_t k; + uint64_t z, x = p->nei[j].a[l].x; + k = kh_get(64, g->h, x); + if (k == kh_end((hash64_t*)g->h)) { // neighbor is not in the hash table; likely due to tip removal + edge_mark_del(p->nei[j].a[l]); + continue; + } else z = kh_val((hash64_t*)g->h, k); + r = &g->v.a[z>>1].nei[z&1]; + for (ll = 0, z = p->k[j]; ll < r->n; ++ll) + if (r->a[ll].x == z) break; + if (ll == r->n) // not in neighbor's neighor + edge_mark_del(p->nei[j].a[l]); + } + v128_rmdup(&p->nei[j]); + } + } +} + +/********************************* + * Graph I/O initialization etc. * + *********************************/ + +void mag_v_write(const magv_t *p, kstring_t *out) +{ + int j, k; + if (p->len <= 0) return; + out->l = 0; + kputc('@', out); kputl(p->k[0], out); kputc(':', out); kputl(p->k[1], out); + kputc('\t', out); kputw(p->nsr, out); + for (j = 0; j < 2; ++j) { + const ku128_v *r = &p->nei[j]; + kputc('\t', out); + for (k = 0; k < r->n; ++k) { + if (edge_is_del(r->a[k])) continue; + kputl(r->a[k].x, out); kputc(',', out); kputw((int32_t)r->a[k].y, out); + kputc(';', out); + } + if (p->nei[j].n == 0) kputc('.', out); + } + kputc('\n', out); + ks_resize(out, out->l + 2 * p->len + 5); + for (j = 0; j < p->len; ++j) + out->s[out->l++] = "ACGT"[(int)p->seq[j] - 1]; + out->s[out->l] = 0; + kputsn("\n+\n", 3, out); + kputsn(p->cov, p->len, out); + kputc('\n', out); +} + +void mag_g_print(const mag_t *g) +{ + int i; + kstring_t out; + out.l = out.m = 0; out.s = 0; + for (i = 0; i < g->v.n; ++i) { + if (g->v.a[i].len < 0) continue; + mag_v_write(&g->v.a[i], &out); + fwrite(out.s, 1, out.l, stdout); + } + free(out.s); + fflush(stdout); +} + +/************************** + * Basic graph operations * + **************************/ + +void mag_v_destroy(magv_t *v) +{ + free(v->nei[0].a); free(v->nei[1].a); + free(v->seq); free(v->cov); + memset(v, 0, sizeof(magv_t)); + v->len = -1; +} + +void mag_g_destroy(mag_t *g) +{ + int i; + kh_destroy(64, g->h); + for (i = 0; i < g->v.n; ++i) + mag_v_destroy(&g->v.a[i]); + free(g->v.a); + free(g); +} + +void mag_v_copy_to_empty(magv_t *dst, const magv_t *src) // NB: memory leak if dst is allocated +{ + memcpy(dst, src, sizeof(magv_t)); + dst->max_len = dst->len + 1; + kroundup32(dst->max_len); + dst->seq = calloc(dst->max_len, 1); memcpy(dst->seq, src->seq, src->len); + dst->cov = calloc(dst->max_len, 1); memcpy(dst->cov, src->cov, src->len); + kv_init(dst->nei[0]); kv_copy(ku128_t, dst->nei[0], src->nei[0]); + kv_init(dst->nei[1]); kv_copy(ku128_t, dst->nei[1], src->nei[1]); +} + +void mag_eh_add(mag_t *g, uint64_t u, uint64_t v, int ovlp) // add v to u +{ + ku128_v *r; + ku128_t *q; + uint64_t idd; + int i; + if ((int64_t)u < 0) return; + idd = tid2idd(g->h, u); + r = &g->v.a[idd>>1].nei[idd&1]; + for (i = 0; i < r->n; ++i) // no multi-edges + if (r->a[i].x == v) return; + kv_pushp(ku128_t, *r, &q); + q->x = v; q->y = ovlp; +} + +void mag_eh_markdel(mag_t *g, uint64_t u, uint64_t v) // mark deletion of v from u +{ + int i; + uint64_t idd; + if ((int64_t)u < 0) return; + idd = tid2idd(g->h, u); + ku128_v *r = &g->v.a[idd>>1].nei[idd&1]; + for (i = 0; i < r->n; ++i) + if (r->a[i].x == v) edge_mark_del(r->a[i]); +} + +void mag_v_del(mag_t *g, magv_t *p) +{ + int i, j; + khint_t k; + if (p->len < 0) return; + for (i = 0; i < 2; ++i) { + ku128_v *r = &p->nei[i]; + for (j = 0; j < r->n; ++j) + if (!edge_is_del(r->a[j]) && r->a[j].x != p->k[0] && r->a[j].x != p->k[1]) + mag_eh_markdel(g, r->a[j].x, p->k[i]); + } + for (i = 0; i < 2; ++i) { + k = kh_get(64, g->h, p->k[i]); + kh_del(64, g->h, k); + } + mag_v_destroy(p); +} + +void mag_v_transdel(mag_t *g, magv_t *p, int min_ovlp) +{ + if (p->nei[0].n && p->nei[1].n) { + int i, j, ovlp; + for (i = 0; i < p->nei[0].n; ++i) { + if (edge_is_del(p->nei[0].a[i]) || p->nei[0].a[i].x == p->k[0] || p->nei[0].a[i].x == p->k[1]) continue; // due to p->p loop + for (j = 0; j < p->nei[1].n; ++j) { + if (edge_is_del(p->nei[1].a[j]) || p->nei[1].a[j].x == p->k[0] || p->nei[1].a[j].x == p->k[1]) continue; + ovlp = (int)(p->nei[0].a[i].y + p->nei[1].a[j].y) - p->len; + if (ovlp >= min_ovlp) { + mag_eh_add(g, p->nei[0].a[i].x, p->nei[1].a[j].x, ovlp); + mag_eh_add(g, p->nei[1].a[j].x, p->nei[0].a[i].x, ovlp); + } + } + } + } + mag_v_del(g, p); +} + +void mag_v_flip(mag_t *g, magv_t *p) +{ + ku128_v t; + khint_t k; + hash64_t *h = (hash64_t*)g->h; + + seq_revcomp6(p->len, (uint8_t*)p->seq); + seq_reverse(p->len, (uint8_t*)p->cov); + p->k[0] ^= p->k[1]; p->k[1] ^= p->k[0]; p->k[0] ^= p->k[1]; + t = p->nei[0]; p->nei[0] = p->nei[1]; p->nei[1] = t; + k = kh_get(64, h, p->k[0]); + assert(k != kh_end(h)); + kh_val(h, k) ^= 1; + k = kh_get(64, h, p->k[1]); + assert(k != kh_end(h)); + kh_val(h, k) ^= 1; +} + +/********************* + * Unambiguous merge * + *********************/ + +int mag_vh_merge_try(mag_t *g, magv_t *p, int min_merge_len) // merge p's neighbor to the right-end of p +{ + magv_t *q; + khint_t kp, kq; + int i, j, new_l; + hash64_t *h = (hash64_t*)g->h; + + // check if an unambiguous merge can be performed + if (p->nei[1].n != 1) return -1; // multiple or no neighbor; do not merge + if ((int64_t)p->nei[1].a[0].x < 0) return -2; // deleted neighbor + if ((int)p->nei[1].a[0].y < min_merge_len) return -5; + kq = kh_get(64, g->h, p->nei[1].a[0].x); + assert(kq != kh_end(h)); // otherwise the neighbor is non-existant + q = &g->v.a[kh_val((hash64_t*)g->h, kq)>>1]; + if (p == q) return -3; // we have a loop p->p. We cannot merge in this case + if (q->nei[kh_val(h, kq)&1].n != 1) return -4; // the neighbor q has multiple neighbors. cannot be an unambiguous merge + + // we can perform a merge; do further consistency check (mostly check bugs) + if (kh_val(h, kq)&1) mag_v_flip(g, q); // a "><" bidirectional arc; flip q + kp = kh_get(64, g->h, p->k[1]); assert(kp != kh_end(h)); // get the iterator to p + kh_del(64, g->h, kp); kh_del(64, g->h, kq); // remove the two ends of the arc in the hash table + assert(p->k[1] == q->nei[0].a[0].x && q->k[0] == p->nei[1].a[0].x); // otherwise inconsistent topology + assert(p->nei[1].a[0].y == q->nei[0].a[0].y); // the overlap length must be the same + assert(p->len >= p->nei[1].a[0].y && q->len >= p->nei[1].a[0].y); // and the overlap is shorter than both vertices + + // update the read count and sequence length + p->nsr += q->nsr; + new_l = p->len + q->len - p->nei[1].a[0].y; + if (new_l + 1 > p->max_len) { // then double p->seq and p->cov + p->max_len = new_l + 1; + kroundup32(p->max_len); + p->seq = realloc(p->seq, p->max_len); + p->cov = realloc(p->cov, p->max_len); + } + // merge seq and cov + for (i = p->len - p->nei[1].a[0].y, j = 0; j < q->len; ++i, ++j) { // write seq and cov + p->seq[i] = q->seq[j]; + if (i < p->len) { + if ((int)p->cov[i] + (q->cov[j] - 33) > 126) p->cov[i] = 126; + else p->cov[i] += q->cov[j] - 33; + } else p->cov[i] = q->cov[j]; + } + p->seq[new_l] = p->cov[new_l] = 0; + p->len = new_l; + // merge neighbors + free(p->nei[1].a); + p->nei[1] = q->nei[1]; p->k[1] = q->k[1]; + q->nei[1].a = 0; // to avoid freeing p->nei[1] by mag_v_destroy() below + // update the hash table for the right end of p + kp = kh_get(64, g->h, p->k[1]); + assert(kp != kh_end((hash64_t*)g->h)); + kh_val(h, kp) = (p - g->v.a)<<1 | 1; + // clean up q + mag_v_destroy(q); + return 0; +} + +void mag_g_merge(mag_t *g, int rmdup, int min_merge_len) +{ + int i; + uint64_t n = 0; + for (i = 0; i < g->v.n; ++i) { // remove multiedges; FIXME: should we do that? + if (rmdup) { + v128_rmdup(&g->v.a[i].nei[0]); + v128_rmdup(&g->v.a[i].nei[1]); + } else { + v128_clean(&g->v.a[i].nei[0]); + v128_clean(&g->v.a[i].nei[1]); + } + } + for (i = 0; i < g->v.n; ++i) { + magv_t *p = &g->v.a[i]; + if (p->len < 0) continue; + while (mag_vh_merge_try(g, p, min_merge_len) == 0) ++n; + mag_v_flip(g, p); + while (mag_vh_merge_try(g, p, min_merge_len) == 0) ++n; + } + if (fm_verbose >= 3) + fprintf(stderr, "[M::%s] unambiguously merged %ld pairs of vertices\n", __func__, (long)n); +} + +/***************************** + * Easy graph simplification * + *****************************/ + +typedef magv_t *magv_p; + +#define mag_vlt1(a, b) ((a)->nsr < (b)->nsr || ((a)->nsr == (b)->nsr && (a)->len < (b)->len)) +KSORT_INIT(vlt1, magv_p, mag_vlt1) + +#define mag_vlt2(a, b) ((a)->nei[0].n + (a)->nei[1].n < (b)->nei[0].n + (b)->nei[1].n) +KSORT_INIT(vlt2, magv_p, mag_vlt2) + +int mag_g_rm_vext(mag_t *g, int min_len, int min_nsr) +{ + int i; + kvec_t(magv_p) a = {0,0,0}; + + for (i = 0; i < g->v.n; ++i) { + magv_t *p = &g->v.a[i]; + if (p->len < 0 || (p->nei[0].n > 0 && p->nei[1].n > 0)) continue; + if (p->len >= min_len || p->nsr >= min_nsr) continue; + kv_push(magv_p, a, p); + } + ks_introsort(vlt1, a.n, a.a); + for (i = 0; i < a.n; ++i) mag_v_del(g, a.a[i]); + free(a.a); + if (fm_verbose >= 3) + fprintf(stderr, "[M::%s] removed %ld tips (min_len=%d, min_nsr=%d)\n", __func__, a.n, min_len, min_nsr); + return a.n; +} + +int mag_g_rm_vint(mag_t *g, int min_len, int min_nsr, int min_ovlp) +{ + int i; + kvec_t(magv_p) a = {0,0,0}; + + for (i = 0; i < g->v.n; ++i) { + magv_t *p = &g->v.a[i]; + if (p->len >= 0 && p->len < min_len && p->nsr < min_nsr) + kv_push(magv_p, a, p); + } + ks_introsort(vlt1, a.n, a.a); + for (i = 0; i < a.n; ++i) mag_v_transdel(g, a.a[i], min_ovlp); + free(a.a); + if (fm_verbose >= 3) + fprintf(stderr, "[M::%s] removed %ld internal vertices (min_len=%d, min_nsr=%d)\n", __func__, a.n, min_len, min_nsr); + return a.n; +} + +void mag_g_rm_edge(mag_t *g, int min_ovlp, double min_ratio, int min_len, int min_nsr) +{ + int i, j, k; + kvec_t(magv_p) a = {0,0,0}; + uint64_t n_marked = 0; + + for (i = 0; i < g->v.n; ++i) { + magv_t *p = &g->v.a[i]; + if (p->len < 0) continue; + if ((p->nei[0].n == 0 || p->nei[1].n == 0) && p->len < min_len && p->nsr < min_nsr) + continue; // skip tips + kv_push(magv_p, a, p); + } + ks_introsort(vlt1, a.n, a.a); + + for (i = a.n - 1; i >= 0; --i) { + magv_t *p = a.a[i]; + for (j = 0; j < 2; ++j) { + ku128_v *r = &p->nei[j]; + int max_ovlp = min_ovlp, max_k = -1; + if (r->n == 0) continue; // no overlapping reads + for (k = 0; k < r->n; ++k) // get the max overlap length + if (max_ovlp < r->a[k].y) + max_ovlp = r->a[k].y, max_k = k; + if (max_k >= 0) { // test if max_k is a tip + uint64_t x = tid2idd(g->h, r->a[max_k].x); + magv_t *q = &g->v.a[x>>1]; + if (q->len >= 0 && (q->nei[0].n == 0 || q->nei[1].n == 0) && q->len < min_len && q->nsr < min_nsr) + max_ovlp = min_ovlp; + } + for (k = 0; k < r->n; ++k) { + if (edge_is_del(r->a[k])) continue; + if (r->a[k].y < min_ovlp || (double)r->a[k].y / max_ovlp < min_ratio) { + mag_eh_markdel(g, r->a[k].x, p->k[j]); // FIXME: should we check if r->a[k] is p itself? + edge_mark_del(r->a[k]); + ++n_marked; + } + } + } + } + free(a.a); + if (fm_verbose >= 3) + fprintf(stderr, "[M::%s] removed %ld edges\n", __func__, (long)n_marked); +} + +/********************************************* + * A-statistics and simplistic flow analysis * + *********************************************/ + +#define A_THRES 20. +#define A_MIN_SUPP 5 + +double mag_cal_rdist(mag_t *g) +{ + magv_v *v = &g->v; + int j; + uint64_t *srt; + double rdist = -1.; + int64_t i, sum_n_all, sum_n, sum_l; + + srt = calloc(v->n, 8); + for (i = 0, sum_n_all = 0; i < v->n; ++i) { + srt[i] = (uint64_t)v->a[i].nsr<<32 | i; + sum_n_all += v->a[i].nsr; + } + ks_introsort_uint64_t(v->n, srt); + + for (j = 0; j < 2; ++j) { + sum_n = sum_l = 0; + for (i = v->n - 1; i >= 0; --i) { + const magv_t *p = &v->a[srt[i]<<32>>32]; + int tmp1, tmp2; + tmp1 = tmp2 = 0; + if (p->nei[0].n) ++tmp1, tmp2 += p->nei[0].a[0].y; + if (p->nei[1].n) ++tmp1, tmp2 += p->nei[1].a[0].y; + if (tmp1) tmp2 /= tmp1; + if (rdist > 0.) { + double A = (p->len - tmp1) / rdist - p->nsr * M_LN2; + if (A < A_THRES) continue; + } + sum_n += p->nsr; + sum_l += p->len - tmp1; + if (sum_n >= sum_n_all * 0.5) break; + } + rdist = (double)sum_l / sum_n; + } + if (fm_verbose >= 3) { + fprintf(stderr, "[M::%s] average read distance %.3f.\n", __func__, rdist); + fprintf(stderr, "[M::%s] approximate genome size: %.0f (inaccurate!)\n", __func__, rdist * sum_n_all); + } + + free(srt); + return rdist; +} + +/************** + * Key portal * + **************/ + +void mag_init_opt(magopt_t *o) +{ + memset(o, 0, sizeof(magopt_t)); + o->trim_len = 0; + o->trim_depth = 6; + + o->min_elen = 300; + o->min_ovlp = 0; + o->min_merge_len = 0; + o->min_ensr = 4; + o->min_insr = 3; + o->min_dratio1 = 0.7; + + o->max_bcov = 10.; + o->max_bfrac = 0.15; + o->max_bvtx = 64; + o->max_bdist = 512; + o->max_bdiff = 50; +} + +void mag_g_clean(mag_t *g, const magopt_t *opt) +{ + int j; + + if (g->min_ovlp < opt->min_ovlp) g->min_ovlp = opt->min_ovlp; + for (j = 2; j <= opt->min_ensr; ++j) + mag_g_rm_vext(g, opt->min_elen, j); + mag_g_merge(g, 0, opt->min_merge_len); + mag_g_rm_edge(g, g->min_ovlp, opt->min_dratio1, opt->min_elen, opt->min_ensr); + mag_g_merge(g, 1, opt->min_merge_len); + for (j = 2; j <= opt->min_ensr; ++j) + mag_g_rm_vext(g, opt->min_elen, j); + mag_g_merge(g, 0, opt->min_merge_len); + if ((opt->flag & MAG_F_AGGRESSIVE) || (opt->flag & MAG_F_POPOPEN)) mag_g_pop_open(g, opt->min_elen); + if (!(opt->flag & MAG_F_NO_SIMPL)) mag_g_simplify_bubble(g, opt->max_bvtx, opt->max_bdist); + mag_g_pop_simple(g, opt->max_bcov, opt->max_bfrac, opt->min_merge_len, opt->max_bdiff, opt->flag & MAG_F_AGGRESSIVE); + mag_g_rm_vint(g, opt->min_elen, opt->min_insr, g->min_ovlp); + mag_g_rm_edge(g, g->min_ovlp, opt->min_dratio1, opt->min_elen, opt->min_ensr); + mag_g_merge(g, 1, opt->min_merge_len); + mag_g_rm_vext(g, opt->min_elen, opt->min_ensr); + mag_g_merge(g, 0, opt->min_merge_len); + if ((opt->flag & MAG_F_AGGRESSIVE) || (opt->flag & MAG_F_POPOPEN)) mag_g_pop_open(g, opt->min_elen); + mag_g_rm_vext(g, opt->min_elen, opt->min_ensr); + mag_g_merge(g, 0, opt->min_merge_len); +} + +void mag_v_trim_open(mag_t *g, magv_t *v, int trim_len, int trim_depth) +{ + int i, j, tl[2]; + if (v->nei[0].n > 0 && v->nei[1].n > 0) return; // no open end; do nothing + if (v->nei[0].n == 0 && v->nei[1].n == 0 && v->len < trim_len * 3) { // disconnected short vertex + mag_v_del(g, v); + return; + } + for (j = 0; j < 2; ++j) { + ku128_v *r = &v->nei[!j]; + int max_ovlp = 0; + for (i = 0; i < r->n; ++i) + max_ovlp = max_ovlp > r->a[i].y? max_ovlp : r->a[i].y; + tl[j] = v->len - max_ovlp < trim_len? v->len - max_ovlp : trim_len; + } + if (v->nei[0].n == 0) { + for (i = 0; i < tl[0] && v->cov[i] - 33 < trim_depth; ++i); + tl[0] = i; + v->len -= i; + memmove(v->seq, v->seq + tl[0], v->len); + memmove(v->cov, v->cov + tl[0], v->len); + } + if (v->nei[1].n == 0) { + for (i = v->len - 1; i >= v->len - tl[1] && v->cov[i] - 33 < trim_depth; --i); + tl[1] = v->len - 1 - i; + v->len -= tl[1]; + } +} + +void mag_g_trim_open(mag_t *g, const magopt_t *opt) +{ + int i; + if (opt->trim_len == 0) return; + for (i = 0; i < g->v.n; ++i) + mag_v_trim_open(g, &g->v.a[i], opt->trim_len, opt->trim_depth); +} diff --git a/mag.h b/mag.h new file mode 100644 index 0000000..ca76527 --- /dev/null +++ b/mag.h @@ -0,0 +1,69 @@ +#ifndef FM_MOG_H +#define FM_MOG_H + +#include +#include +#include "kstring.h" +#include "fml.h" + +#ifndef KINT_DEF +#define KINT_DEF +typedef struct { uint64_t x, y; } ku128_t; +typedef struct { size_t n, m; uint64_t *a; } ku64_v; +typedef struct { size_t n, m; ku128_t *a; } ku128_v; +#endif + +typedef struct { + int len, nsr; // length; number supporting reads + uint32_t max_len;// allocated seq/cov size + uint64_t k[2]; // bi-interval + ku128_v nei[2]; // neighbors + char *seq, *cov; // sequence and coverage + void *ptr; // additional information +} magv_t; + +typedef struct { size_t n, m; magv_t *a; } magv_v; + +typedef struct mag_t { + magv_v v; + float rdist; // read distance + int min_ovlp; // minimum overlap seen from the graph + void *h; +} mag_t; + +struct mogb_aux; +typedef struct mogb_aux mogb_aux_t; + +#ifdef __cplusplus +extern "C" { +#endif + + void mag_init_opt(magopt_t *o); + void mag_g_clean(mag_t *g, const magopt_t *opt); + + void mag_g_destroy(mag_t *g); + void mag_g_amend(mag_t *g); + void mag_g_build_hash(mag_t *g); + void mag_g_print(const mag_t *g); + int mag_g_rm_vext(mag_t *g, int min_len, int min_nsr); + void mag_g_rm_edge(mag_t *g, int min_ovlp, double min_ratio, int min_len, int min_nsr); + void mag_g_merge(mag_t *g, int rmdup, int min_merge_len); + void mag_g_simplify_bubble(mag_t *g, int max_vtx, int max_dist); + void mag_g_pop_simple(mag_t *g, float max_cov, float max_frac, int min_merge_len, int max_bdiff, int aggressive); + void mag_g_pop_open(mag_t *g, int min_elen); + void mag_g_trim_open(mag_t *g, const magopt_t *opt); + + void mag_v_copy_to_empty(magv_t *dst, const magv_t *src); // NB: memory leak if dst is allocated + void mag_v_del(mag_t *g, magv_t *p); + void mag_v_write(const magv_t *p, kstring_t *out); + void mag_v_pop_open(mag_t *g, magv_t *p, int min_elen); + + uint64_t mag_tid2idd(void *h, uint64_t tid); + void mag_v128_clean(ku128_v *r); + double mag_cal_rdist(mag_t *g); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/misc.c b/misc.c new file mode 100644 index 0000000..6362000 --- /dev/null +++ b/misc.c @@ -0,0 +1,302 @@ +#include +#include "internal.h" +#include "kstring.h" +#include "rle.h" +#include "mrope.h" +#include "rld0.h" +#include "mag.h" +#include "kvec.h" +#include "fml.h" +#include "htab.h" + +unsigned char seq_nt6_table[256] = { + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 1, 5, 2, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 1, 5, 2, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 +}; + +void fml_opt_init(fml_opt_t *opt) +{ + opt->n_threads = 1; + opt->ec_k = 0; + opt->min_cnt = 4; + opt->max_cnt = 8; + opt->min_asm_ovlp = 33; + opt->min_merge_len = 0; + mag_init_opt(&opt->mag_opt); + opt->mag_opt.flag = MAG_F_NO_SIMPL | MAG_F_POPOPEN; +} + +void fml_opt_adjust(fml_opt_t *opt, int n_seqs, const bseq1_t *seqs) +{ + int i, log_len; + uint64_t tot_len = 0; + if (opt->n_threads < 1) opt->n_threads = 1; + for (i = 0; i < n_seqs; ++i) tot_len += seqs[i].l_seq; // compute total length + for (log_len = 10; log_len < 32; ++log_len) // compute ceil(log2(tot_len)) + if (1ULL< tot_len) break; + if (opt->ec_k == 0) opt->ec_k = (log_len + 12) / 2; + if (opt->ec_k%2 == 0) ++opt->ec_k; + opt->mag_opt.min_elen = (int)((double)tot_len / n_seqs * 2.5 + .499); +} + +static inline int is_rev_same(int l, const char *s) +{ + int i; + if (l&1) return 0; + for (i = 0; i < l>>1; ++i) + if (s[i] + s[l-1-i] != 5) break; + return (i == l>>1); +} + +struct rld_t *fml_fmi_gen(int n, bseq1_t *seq, int is_mt) +{ + mrope_t *mr; + kstring_t str = {0,0,0}; + mritr_t itr; + rlditr_t di; + const uint8_t *block; + rld_t *e = 0; + int k; + + for (k = 0; k < n; ++k) + if (seq[k].l_seq > 0) + break; + if (k == n) return 0; + + mr = mr_init(ROPE_DEF_MAX_NODES, ROPE_DEF_BLOCK_LEN, MR_SO_RCLO); + for (k = 0; k < n; ++k) { + int i; + bseq1_t *s = &seq[k]; + if (s->l_seq == 0) continue; + free(s->qual); + for (i = 0; i < s->l_seq; ++i) + s->seq[i] = seq_nt6_table[(int)s->seq[i]]; + for (i = 0; i < s->l_seq; ++i) + if (s->seq[i] == 5) break; + if (i < s->l_seq) { + free(s->seq); + continue; + } + if (is_rev_same(s->l_seq, s->seq)) + --s->l_seq, s->seq[s->l_seq] = 0; + seq_reverse(s->l_seq, (uint8_t*)s->seq); + kputsn(s->seq, s->l_seq + 1, &str); + seq_revcomp6(s->l_seq, (uint8_t*)s->seq); + kputsn(s->seq, s->l_seq + 1, &str); + free(s->seq); + } + free(seq); + mr_insert_multi(mr, str.l, (uint8_t*)str.s, is_mt); + free(str.s); + + e = rld_init(6, 3); + rld_itr_init(e, &di, 0); + mr_itr_first(mr, &itr, 1); + while ((block = mr_itr_next_block(&itr)) != 0) { + const uint8_t *q = block + 2, *end = block + 2 + *rle_nptr(block); + while (q < end) { + int c = 0; + int64_t l; + rle_dec1(q, c, l); + rld_enc(e, &di, l, c); + } + } + rld_enc_finish(e, &di); + + mr_destroy(mr); + return e; +} + +struct rld_t *fml_seq2fmi(const fml_opt_t *opt, int n, bseq1_t *seq) +{ + return fml_fmi_gen(n, seq, opt->n_threads > 1? 1 : 0); +} + +void fml_fmi_destroy(rld_t *e) +{ + rld_destroy(e); +} + +void fml_mag_clean(const fml_opt_t *opt, struct mag_t *g) +{ + magopt_t o = opt->mag_opt; + o.min_merge_len = opt->min_merge_len; + mag_g_merge(g, 1, opt->min_merge_len); + mag_g_clean(g, &o); + mag_g_trim_open(g, &o); +} + +void fml_mag_destroy(struct mag_t *g) +{ + mag_g_destroy(g); +} + +#include "khash.h" +KHASH_DECLARE(64, uint64_t, uint64_t) + +#define edge_is_del(_x) ((_x).x == (uint64_t)-2 || (_x).y == 0) // from mag.c + +fml_utg_t *fml_mag2utg(struct mag_t *g, int *n) +{ + size_t i, j; + fml_utg_t *utg; + khash_t(64) *h; + khint_t k; + + h = kh_init(64); + for (i = j = 0; i < g->v.n; ++i) { + int absent; + magv_t *p = &g->v.a[i]; + if (p->len < 0) continue; + k = kh_put(64, h, p->k[0], &absent); + kh_val(h, k) = j<<1 | 0; + k = kh_put(64, h, p->k[1], &absent); + kh_val(h, k) = j<<1 | 1; + ++j; + } + *n = j; + kh_destroy(64, g->h); + + utg = (fml_utg_t*)calloc(*n, sizeof(fml_utg_t)); + for (i = j = 0; i < g->v.n; ++i) { + magv_t *p = &g->v.a[i]; + fml_utg_t *q; + int from, a, b; + if (p->len < 0) continue; + q = &utg[j++]; + q->len = p->len, q->nsr = p->nsr; + q->seq = p->seq, q->cov = p->cov; + for (a = 0; a < q->len; ++a) + q->seq[a] = "$ACGTN"[(int)q->seq[a]]; + q->seq[q->len] = q->cov[q->len] = 0; + for (from = 0; from < 2; ++from) { + ku128_v *r = &p->nei[from]; + for (b = q->n_ovlp[from] = 0; b < r->n; ++b) + if (!edge_is_del(r->a[b])) ++q->n_ovlp[from]; + } + q->ovlp = (fml_ovlp_t*)calloc(q->n_ovlp[0] + q->n_ovlp[1], sizeof(fml_ovlp_t)); + for (from = a = 0; from < 2; ++from) { + ku128_v *r = &p->nei[from]; + for (b = 0; b < r->n; ++b) { + ku128_t *s = &r->a[b]; + fml_ovlp_t *t; + if (edge_is_del(*s)) continue; + t = &q->ovlp[a++]; + k = kh_get(64, h, s->x); + assert(k != kh_end(h)); + t->id = kh_val(h, k) >> 1; + t->to = kh_val(h, k) & 1; + t->len = s->y; + t->from = from; + } + free(p->nei[from].a); + } + } + kh_destroy(64, h); + free(g->v.a); + free(g); + return utg; +} + +void fml_utg_print(int n, const fml_utg_t *utg) +{ + int i, j, l; + kstring_t out = {0,0,0}; + for (i = 0; i < n; ++i) { + const fml_utg_t *u = &utg[i]; + out.l = 0; + kputc('@', &out); kputw(i<<1|0, &out); kputc(':', &out); kputw(i<<1|1, &out); + kputc('\t', &out); kputw(u->nsr, &out); + kputc('\t', &out); + for (j = 0; j < u->n_ovlp[0]; ++j) { + kputw(u->ovlp[j].id<<1|u->ovlp[j].to, &out); kputc(',', &out); + kputw(u->ovlp[j].len, &out); kputc(';', &out); + } + if (u->n_ovlp[0] == 0) kputc('.', &out); + kputc('\t', &out); + for (; j < u->n_ovlp[0] + u->n_ovlp[1]; ++j) { + kputw(u->ovlp[j].id<<1|u->ovlp[j].to, &out); kputc(',', &out); + kputw(u->ovlp[j].len, &out); kputc(';', &out); + } + if (u->n_ovlp[1] == 0) kputc('.', &out); + kputc('\n', &out); + l = out.l; + kputsn(u->seq, u->len, &out); + kputsn("\n+\n", 3, &out); + kputsn(u->cov, u->len, &out); + kputc('\n', &out); + fputs(out.s, stdout); + } + free(out.s); +} + +void fml_utg_print_gfa(int n, const fml_utg_t *utg) +{ + int i, j; + FILE *fp = stdout; + fputs("H\tVN:Z:1.0\n", fp); + for (i = 0; i < n; ++i) { + const fml_utg_t *u = &utg[i]; + fprintf(fp, "S\t%d\t", i); + fputs(u->seq, fp); + fprintf(fp, "\tLN:i:%d\tRC:i:%d\tPD:Z:", u->len, u->nsr); + fputs(u->cov, fp); + fputc('\n', fp); + for (j = 0; j < u->n_ovlp[0] + u->n_ovlp[1]; ++j) { + fml_ovlp_t *o = &u->ovlp[j]; + if (i < o->id) + fprintf(fp, "L\t%d\t%c\t%d\t%c\t%dM\n", i, "+-"[!o->from], o->id, "+-"[o->to], o->len); + } + } +} + +void fml_utg_destroy(int n, fml_utg_t *utg) +{ + int i; + for (i = 0; i < n; ++i) { + free(utg[i].seq); + free(utg[i].cov); + free(utg[i].ovlp); + } + free(utg); +} + +#define MAG_MIN_NSR_COEF .1 + +fml_utg_t *fml_assemble(const fml_opt_t *opt0, int n_seqs, bseq1_t *seqs, int *n_utg) +{ + rld_t *e; + mag_t *g; + fml_utg_t *utg; + fml_opt_t opt = *opt0; + float kcov; + + *n_utg = 0; + fml_opt_adjust(&opt, n_seqs, seqs); + if (opt.ec_k >= 0) fml_correct(&opt, n_seqs, seqs); + kcov = fml_fltuniq(&opt, n_seqs, seqs); + e = fml_seq2fmi(&opt, n_seqs, seqs); + if (e == 0) return 0; // this may happen when all sequences are filtered out + g = fml_fmi2mag(&opt, e); + opt.mag_opt.min_ensr = opt.mag_opt.min_ensr > kcov * MAG_MIN_NSR_COEF? opt.mag_opt.min_ensr : (int)(kcov * MAG_MIN_NSR_COEF + .499); + opt.mag_opt.min_ensr = opt.mag_opt.min_ensr < opt0->max_cnt? opt.mag_opt.min_ensr : opt0->max_cnt; + opt.mag_opt.min_ensr = opt.mag_opt.min_ensr > opt0->min_cnt? opt.mag_opt.min_ensr : opt0->min_cnt; + opt.mag_opt.min_insr = opt.mag_opt.min_ensr - 1; + fml_mag_clean(&opt, g); + utg = fml_mag2utg(g, n_utg); + return utg; +} diff --git a/mrope.c b/mrope.c new file mode 100644 index 0000000..5954f06 --- /dev/null +++ b/mrope.c @@ -0,0 +1,307 @@ +#include +#include +#include +#include +#include +#include +#include +#include "mrope.h" + +/******************************* + *** Single-string insertion *** + *******************************/ + +mrope_t *mr_init(int max_nodes, int block_len, int sorting_order) +{ + int a; + mrope_t *r; + assert(sorting_order >= 0 && sorting_order <= 2); + r = calloc(1, sizeof(mrope_t)); + r->so = sorting_order; + r->thr_min = 1000; + for (a = 0; a != 6; ++a) + r->r[a] = rope_init(max_nodes, block_len); + return r; +} + +void mr_destroy(mrope_t *r) +{ + int a; + for (a = 0; a != 6; ++a) + if (r->r[a]) rope_destroy(r->r[a]); + free(r); +} + +int mr_thr_min(mrope_t *r, int thr_min) +{ + if (thr_min > 0) + r->thr_min = thr_min; + return r->thr_min; +} + +int64_t mr_insert1(mrope_t *r, const uint8_t *str) +{ + int64_t tl[6], tu[6], l, u; + const uint8_t *p; + int b, is_srt = (r->so != MR_SO_IO), is_comp = (r->so == MR_SO_RCLO); + for (u = 0, b = 0; b != 6; ++b) u += r->r[b]->c[0]; + l = is_srt? 0 : u; + for (p = str, b = 0; *p; b = *p++) { + int a; + if (l != u) { + int64_t cnt = 0; + rope_rank2a(r->r[b], l, u, tl, tu); + if (is_comp && *p != 5) { + for (a = 4; a > *p; --a) l += tu[a] - tl[a]; + l += tu[0] - tl[0]; + } else for (a = 0; a < *p; ++a) l += tu[a] - tl[a]; + rope_insert_run(r->r[b], l, *p, 1, 0); + while (--b >= 0) cnt += r->r[b]->c[*p]; + l = cnt + tl[*p]; u = cnt + tu[*p]; + } else { + l = rope_insert_run(r->r[b], l, *p, 1, 0); + while (--b >= 0) l += r->r[b]->c[*p]; + u = l; + } + } + return rope_insert_run(r->r[b], l, 0, 1, 0); +} + +void mr_rank2a(const mrope_t *mr, int64_t x, int64_t y, int64_t *cx, int64_t *cy) +{ + int a, b; + int64_t z, c[6], l; + memset(c, 0, 48); + for (a = 0, z = 0; a < 6; ++a) { + const int64_t *ca = mr->r[a]->c; + l = ca[0] + ca[1] + ca[2] + ca[3] + ca[4] + ca[5]; + if (z + l >= x) break; + for (b = 0; b < 6; ++b) c[b] += ca[b]; + z += l; + } + assert(a != 6); + if (y >= 0 && z + l >= y) { // [x,y) is in the same bucket + rope_rank2a(mr->r[a], x - z, y - z, cx, cy); + for (b = 0; b < 6; ++b) + cx[b] += c[b], cy[b] += c[b]; + return; + } + if (x != z) rope_rank1a(mr->r[a], x - z, cx); + else memset(cx, 0, 48); + for (b = 0; b < 6; ++b) + cx[b] += c[b], c[b] += mr->r[a]->c[b]; + if (y < 0) return; + for (++a, z += l; a < 6; ++a) { + const int64_t *ca = mr->r[a]->c; + l = ca[0] + ca[1] + ca[2] + ca[3] + ca[4] + ca[5]; + if (z + l >= y) break; + for (b = 0; b < 6; ++b) c[b] += ca[b]; + z += l; + } + assert(a != 6); + if (y != z + l) rope_rank1a(mr->r[a], y - z, cy); + else for (b = 0; b < 6; ++b) cy[b] = mr->r[a]->c[b]; + for (b = 0; b < 6; ++b) cy[b] += c[b]; +} + +/********************** + *** Mrope iterator *** + **********************/ + +void mr_itr_first(mrope_t *r, mritr_t *i, int to_free) +{ + i->a = 0; i->r = r; i->to_free = to_free; + rope_itr_first(i->r->r[0], &i->i); +} + +const uint8_t *mr_itr_next_block(mritr_t *i) +{ + const uint8_t *s; + if (i->a >= 6) return 0; + while ((s = rope_itr_next_block(&i->i)) == 0) { + if (i->to_free) { + rope_destroy(i->r->r[i->a]); + i->r->r[i->a] = 0; + } + if (++i->a == 6) return 0; + rope_itr_first(i->r->r[i->a], &i->i); + } + return i->a == 6? 0 : s; +} + +/***************************************** + *** Inserting multiple strings in RLO *** + *****************************************/ + +typedef struct { + uint64_t l; + uint64_t u:61, c:3; + const uint8_t *p; +} triple64_t; + +typedef const uint8_t *cstr_t; + +#define rope_comp6(c) ((c) >= 1 && (c) <= 4? 5 - (c) : (c)) + +static void mr_insert_multi_aux(rope_t *rope, int64_t m, triple64_t *a, int is_comp) +{ + int64_t k, beg; + rpcache_t cache; + memset(&cache, 0, sizeof(rpcache_t)); + for (k = 0; k != m; ++k) // set the base to insert + a[k].c = *a[k].p++; + for (k = 1, beg = 0; k <= m; ++k) { + if (k == m || a[k].u != a[k-1].u) { + int64_t x, i, l = a[beg].l, u = a[beg].u, tl[6], tu[6], c[6]; + int start, end, step, b; + if (l == u && k == beg + 1) { // special case; still works without the following block + a[beg].l = a[beg].u = rope_insert_run(rope, l, a[beg].c, 1, &cache); + beg = k; + continue; + } else if (l == u) { + memset(tl, 0, 48); + memset(tu, 0, 48); + } else rope_rank2a(rope, l, u, tl, tu); + memset(c, 0, 48); + for (i = beg; i < k; ++i) ++c[a[i].c]; + // insert sentinel + if (c[0]) rope_insert_run(rope, l, 0, c[0], &cache); + // insert A/C/G/T + x = l + c[0] + (tu[0] - tl[0]); + if (is_comp) start = 4, end = 0, step = -1; + else start = 1, end = 5, step = 1; + for (b = start; b != end; b += step) { + int64_t size = tu[b] - tl[b]; + if (c[b]) { + tl[b] = rope_insert_run(rope, x, b, c[b], &cache); + tu[b] = tl[b] + size; + } + x += c[b] + size; + } + // insert N + if (c[5]) { + tu[5] -= tl[5]; + tl[5] = rope_insert_run(rope, x, 5, c[5], &cache); + tu[5] += tl[5]; + } + // update a[] + for (i = beg; i < k; ++i) { + triple64_t *p = &a[i]; + p->l = tl[p->c], p->u = tu[p->c]; + } + beg = k; + } + } +} + +typedef struct { + volatile int *n_fin_workers; + volatile int to_run; + int to_exit; + mrope_t *mr; + int b, is_comp; + int64_t m; + triple64_t *a; +} worker_t; + +static void *worker(void *data) +{ + worker_t *w = (worker_t*)data; + struct timespec req, rem; + req.tv_sec = 0; req.tv_nsec = 1000000; + do { + while (!__sync_bool_compare_and_swap(&w->to_run, 1, 0)) nanosleep(&req, &rem); // wait for the signal from the master thread + if (w->m) mr_insert_multi_aux(w->mr->r[w->b], w->m, w->a, w->is_comp); + __sync_add_and_fetch(w->n_fin_workers, 1); + } while (!w->to_exit); + return 0; +} + +void mr_insert_multi(mrope_t *mr, int64_t len, const uint8_t *s, int is_thr) +{ + int64_t k, m, n0; + int b, is_srt = (mr->so != MR_SO_IO), is_comp = (mr->so == MR_SO_RCLO), stop_thr = 0; + volatile int n_fin_workers = 0; + triple64_t *a[2], *curr, *prev, *swap; + pthread_t *tid = 0; + worker_t *w = 0; + + if (mr->thr_min < 0) mr->thr_min = 0; + assert(len > 0 && s[len-1] == 0); + { // split into short strings + cstr_t p, q, end = s + len; + for (p = s, m = 0; p != end; ++p) // count #sentinels + if (*p == 0) ++m; + curr = a[0] = malloc(m * sizeof(triple64_t)); + prev = a[1] = malloc(m * sizeof(triple64_t)); + for (p = q = s, k = 0; p != end; ++p) // find the start of each string + if (*p == 0) prev[k++].p = q, q = p + 1; + } + + for (k = n0 = 0; k < 6; ++k) n0 += mr->r[k]->c[0]; + for (k = 0; k != m; ++k) { + if (is_srt) prev[k].l = 0, prev[k].u = n0; + else prev[k].l = prev[k].u = n0 + k; + prev[k].c = 0; + } + mr_insert_multi_aux(mr->r[0], m, prev, is_comp); // insert the first (actually the last) column + + if (is_thr) { + tid = alloca(4 * sizeof(pthread_t)); + w = alloca(4 * sizeof(worker_t)); + memset(w, 0, 4 * sizeof(worker_t)); + for (b = 0; b < 4; ++b) { + w[b].mr = mr, w[b].b = b + 1, w[b].is_comp = is_comp; + w[b].n_fin_workers = &n_fin_workers; + } + for (b = 0; b < 4; ++b) pthread_create(&tid[b], 0, worker, &w[b]); + } + + n0 = 0; // the number of inserted strings + while (m) { + int64_t c[6], ac[6]; + triple64_t *q[6]; + + memset(c, 0, 48); + for (k = n0; k != m; ++k) ++c[prev[k].c]; // counting + for (q[0] = curr + n0, b = 1; b < 6; ++b) q[b] = q[b-1] + c[b-1]; + if (n0 + c[0] < m) { + for (k = n0; k != m; ++k) *q[prev[k].c]++ = prev[k]; // sort + for (b = 0; b < 6; ++b) q[b] -= c[b]; + } + n0 += c[0]; + + if (is_thr && !stop_thr) { + struct timespec req, rem; + req.tv_sec = 0; req.tv_nsec = 1000000; + stop_thr = (m - n0 <= mr->thr_min); + for (b = 0; b < 4; ++b) { + w[b].a = q[b+1], w[b].m = c[b+1]; + if (stop_thr) w[b].to_exit = 1; // signal the workers to exit + while (!__sync_bool_compare_and_swap(&w[b].to_run, 0, 1)); // signal the workers to start + } + if (c[5]) mr_insert_multi_aux(mr->r[5], c[5], q[5], is_comp); // the master thread processes the "N" bucket + while (!__sync_bool_compare_and_swap(&n_fin_workers, 4, 0)) // wait until all 4 workers finish + nanosleep(&req, &rem); + if (stop_thr && n0 < m) + fprintf(stderr, "[M::%s] Turn off parallelization for this batch as too few strings are left.\n", __func__); + } else { + for (b = 1; b < 6; ++b) + if (c[b]) mr_insert_multi_aux(mr->r[b], c[b], q[b], is_comp); + } + if (n0 == m) break; + + memset(ac, 0, 48); + for (b = 1; b < 6; ++b) { // update the intervals to account for buckets ahead + int a; + for (a = 0; a < 6; ++a) ac[a] += mr->r[b-1]->c[a]; + for (k = 0; k < c[b]; ++k) { + triple64_t *p = &q[b][k]; + p->l += ac[p->c]; p->u += ac[p->c]; + } + } + swap = curr, curr = prev, prev = swap; + } + if (is_thr) for (b = 0; b < 4; ++b) pthread_join(tid[b], 0); + free(a[0]); free(a[1]); +} diff --git a/mrope.h b/mrope.h new file mode 100644 index 0000000..240448c --- /dev/null +++ b/mrope.h @@ -0,0 +1,114 @@ +#ifndef MROPE_H_ +#define MROPE_H_ + +#include "rope.h" + +#define MR_SO_IO 0 +#define MR_SO_RLO 1 +#define MR_SO_RCLO 2 + +typedef struct { + uint8_t so; // sorting order + int thr_min; // when there are fewer sequences than this, disable multi-threading + rope_t *r[6]; +} mrope_t; // multi-rope + +typedef struct { + mrope_t *r; + int a, to_free; + rpitr_t i; +} mritr_t; + +#ifdef __cplusplus +extern "C" { +#endif + + /** + * Initiate a multi-rope + * + * @param max_nodes maximum number of nodes in an internal node; use ROPE_DEF_MAX_NODES (64) if unsure + * @param block_len maximum block length in an external node; use ROPE_DEF_BLOCK_LEN (256) if unsure + * @param sorting_order the order in which sequences are added; possible values defined by the MR_SO_* macros + */ + mrope_t *mr_init(int max_nodes, int block_len, int sorting_order); + + void mr_destroy(mrope_t *r); + + int mr_thr_min(mrope_t *r, int thr_min); + + /** + * Insert one string into the index + * + * @param r multi-rope + * @param str the *reverse* of the input string (important: it is reversed!) + */ + int64_t mr_insert1(mrope_t *r, const uint8_t *str); + + /** + * Insert multiple strings + * + * @param mr multi-rope + * @param len total length of $s + * @param s concatenated, NULL delimited, reversed input strings + * @param is_thr true to use 5 threads + */ + void mr_insert_multi(mrope_t *mr, int64_t len, const uint8_t *s, int is_thr); + + void mr_rank2a(const mrope_t *mr, int64_t x, int64_t y, int64_t *cx, int64_t *cy); + #define mr_rank1a(mr, x, cx) mr_rank2a(mr, x, -1, cx, 0) + + /** + * Put the iterator at the start of the index + * + * @param r multi-rope + * @param i iterator to be initialized + * @param to_free if true, free visited buckets + */ + void mr_itr_first(mrope_t *r, mritr_t *i, int to_free); + + /** + * Iterate to the next block + * + * @param i iterator + * + * @return pointer to the start of a block; see rle.h for decoding the block + */ + const uint8_t *mr_itr_next_block(mritr_t *i); + +#ifdef __cplusplus +} +#endif + +static inline int64_t mr_get_c(const mrope_t *mr, int64_t c[6]) +{ + int a, b; + int64_t tot = 0; + for (a = 0; a < 6; ++a) c[a] = 0; + for (a = 0; a < 6; ++a) { + for (b = 0; b < 6; ++b) + c[b] += mr->r[a]->c[b]; + tot += c[b]; + } + return tot; +} + +static inline int64_t mr_get_ac(const mrope_t *mr, int64_t ac[7]) +{ + int a; + int64_t c[6], tot; + tot = mr_get_c(mr, c); + for (a = 1, ac[0] = 0; a <= 6; ++a) ac[a] = ac[a-1] + c[a-1]; + return tot; +} + +static inline int64_t mr_get_tot(const mrope_t *mr) +{ + int a, b; + int64_t tot = 0; + for (a = 0; a < 6; ++a) + for (b = 0; b < 6; ++b) + tot += mr->r[a]->c[b]; + return tot; +} + +#endif diff --git a/rld0.c b/rld0.c new file mode 100644 index 0000000..7f8b5ba --- /dev/null +++ b/rld0.c @@ -0,0 +1,489 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include "rld0.h" + +#define RLD_IBITS_PLUS 4 + +#define rld_file_size(e) ((4 + (e)->asize) * 8 + (e)->n_bytes + 8 * (e)->n_frames * ((e)->asize + 1)) + +#ifndef xcalloc +#define xcalloc(n, s) calloc(n, s) +#endif +#ifndef xmalloc +#define xmalloc(s) malloc(s) +#endif + +/****************** + * Delta encoding * + ******************/ + +static const char LogTable256[256] = { +#define LT(n) n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n + -1, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, + LT(4), LT(5), LT(5), LT(6), LT(6), LT(6), LT(6), + LT(7), LT(7), LT(7), LT(7), LT(7), LT(7), LT(7), LT(7) +}; + +static inline int ilog2_32(uint32_t v) +{ + register uint32_t t, tt; + if ((tt = v>>16)) return (t = tt>>8) ? 24 + LogTable256[t] : 16 + LogTable256[tt]; + return (t = v>>8) ? 8 + LogTable256[t] : LogTable256[v]; +} + +static inline int ilog2(uint64_t v) +{ + return v>>32? 32 + ilog2_32(v>>32) : ilog2_32(v); +} + +static inline int64_t rld_delta_enc1(uint64_t x, int *width) +{ + int y = ilog2(x); + int z = ilog2_32(y + 1); + *width = (z<<1) + 1 + y; + return (x ^ (uint64_t)1<n = 1; + e->z = xmalloc(sizeof(void*)); + e->z[0] = xcalloc(RLD_LSIZE, 8); + e->ssize = 1<cnt = xcalloc(asize + 1, 8); + e->mcnt = xcalloc(asize + 1, 8); + e->abits = ilog2(asize) + 1; + e->asize = asize; + e->sbits = bbits; + e->asize1 = asize + 1; + e->offset0[0] = (e->asize1*16+63)/64; + e->offset0[1] = (e->asize1*32+63)/64; + e->offset0[2] = e->asize1; + return e; +} + +void rld_destroy(rld_t *e) +{ + int i = 0; + if (e == 0) return; + if (e->mem) { + close(e->fd); + munmap(e->mem, rld_file_size(e)); + } else { + for (i = 0; i < e->n; ++i) free(e->z[i]); + free(e->frame); + } + free(e->z); free(e->cnt); free(e->mcnt); free(e); +} + +void rld_itr_init(const rld_t *e, rlditr_t *itr, uint64_t k) +{ + itr->i = e->z + (k >> RLD_LBITS); + itr->shead = *itr->i + k%RLD_LSIZE; + itr->stail = rld_get_stail(e, itr); + itr->p = itr->shead + e->offset0[rld_block_type(*itr->shead)]; + itr->q = (uint8_t*)itr->p; + itr->r = 64; + itr->c = -1; + itr->l = 0; +} + +/************ + * Encoding * + ************/ + +static inline void enc_next_block(rld_t *e, rlditr_t *itr) +{ + int i, type; + if (itr->stail + 2 - *itr->i == RLD_LSIZE) { + ++e->n; + e->z = realloc(e->z, e->n * sizeof(void*)); + itr->i = e->z + e->n - 1; + itr->shead = *itr->i = xcalloc(RLD_LSIZE, 8); + } else itr->shead += e->ssize; + if (e->cnt[0] - e->mcnt[0] < 0x4000) { + uint16_t *p = (uint16_t*)itr->shead; + for (i = 0; i <= e->asize; ++i) p[i] = e->cnt[i] - e->mcnt[i]; + type = 0; + } else if (e->cnt[0] - e->mcnt[0] < 0x40000000) { + uint32_t *p = (uint32_t*)itr->shead; + for (i = 0; i <= e->asize; ++i) p[i] = e->cnt[i] - e->mcnt[i]; + type = 1; + } else { + uint64_t *p = (uint64_t*)itr->shead; + for (i = 0; i <= e->asize; ++i) p[i] = e->cnt[i] - e->mcnt[i]; + type = 2; + } + *itr->shead |= (uint64_t)type<<62; + itr->p = itr->shead + e->offset0[type]; + itr->stail = rld_get_stail(e, itr); + itr->q = (uint8_t*)itr->p; + itr->r = 64; + for (i = 0; i <= e->asize; ++i) e->mcnt[i] = e->cnt[i]; +} + +static int rld_enc1(rld_t *e, rlditr_t *itr, int64_t l, uint8_t c) +{ + int w; + uint64_t x = rld_delta_enc1(l, &w) << e->abits | c; + w += e->abits; + if (w >= itr->r && itr->p == itr->stail) enc_next_block(e, itr); + if (w > itr->r) { + w -= itr->r; + *itr->p++ |= x >> w; + *itr->p = x << (itr->r = 64 - w); + } else itr->r -= w, *itr->p |= x << itr->r; + e->cnt[0] += l; + e->cnt[c + 1] += l; + return 0; +} + +int rld_enc(rld_t *e, rlditr_t *itr, int64_t l, uint8_t c) +{ + if (l == 0) return 0; + if (itr->c != c) { + if (itr->l) rld_enc1(e, itr, itr->l, itr->c); + itr->l = l; itr->c = c; + } else itr->l += l; + return 0; +} + +void rld_rank_index(rld_t *e) +{ + uint64_t last, n_blks, i, k, *cnt; + int j; + + n_blks = e->n_bytes * 8 / 64 / e->ssize + 1; + last = rld_last_blk(e); + cnt = alloca(e->asize * 8); + e->ibits = ilog2(e->mcnt[0] / n_blks) + RLD_IBITS_PLUS; + e->n_frames = ((e->mcnt[0] + (1ll<ibits) - 1) >> e->ibits) + 1; + e->frame = xcalloc(e->n_frames * e->asize1, 8); + e->frame[0] = 0; + for (j = 0; j < e->asize; ++j) cnt[j] = 0; + for (i = e->ssize, k = 1; i <= last; i += e->ssize) { + uint64_t sum, *p = rld_seek_blk(e, i); + int type = rld_block_type(*p); + if (type == 0) { + uint16_t *q = (uint16_t*)p; + for (j = 1; j <= e->asize; ++j) cnt[j-1] += q[j]; + } else if (type == 1) { + uint32_t *q = (uint32_t*)p; + for (j = 1; j <= e->asize; ++j) cnt[j-1] += q[j] & 0x3fffffff; + } else { + uint64_t *q = (uint64_t*)p; + for (j = 1; j <= e->asize; ++j) cnt[j-1] += q[j]; + } + for (j = 0, sum = 0; j < e->asize; ++j) sum += cnt[j]; + while (sum >= k<ibits) ++k; + if (k < e->n_frames) { + uint64_t x = k * e->asize1; + e->frame[x] = i; + for (j = 0; j < e->asize; ++j) e->frame[x + j + 1] = cnt[j]; + } + } + assert(k >= e->n_frames - 1); + for (k = 1; k < e->n_frames; ++k) { // fill zero cells + uint64_t x = k * e->asize1; + if (e->frame[x] == 0) { + for (j = 0; j <= e->asize; ++j) + e->frame[x + j] = e->frame[x - e->asize1 + j]; + } + } +} + +uint64_t rld_enc_finish(rld_t *e, rlditr_t *itr) +{ + int i; + if (itr->l) rld_enc1(e, itr, itr->l, itr->c); + enc_next_block(e, itr); + e->n_bytes = (((uint64_t)(e->n - 1) * RLD_LSIZE) + (itr->p - *itr->i)) * 8; + // recompute e->cnt as the accumulative count; e->mcnt[] keeps the marginal counts + for (e->cnt[0] = 0, i = 1; i <= e->asize; ++i) e->cnt[i] += e->cnt[i - 1]; + rld_rank_index(e); + return e->n_bytes; +} + +/***************** + * Save and load * + *****************/ + +int rld_dump(const rld_t *e, const char *fn) +{ + uint64_t k = 0; + int i; + uint32_t a; + FILE *fp; + fp = strcmp(fn, "-")? fopen(fn, "wb") : fdopen(fileno(stdout), "wb"); + if (fp == 0) return -1; + a = e->asize<<16 | e->sbits; + fwrite("RLD\3", 1, 4, fp); // write magic + fwrite(&a, 4, 1, fp); // write sbits and asize + fwrite(&k, 8, 1, fp); // preserve 8 bytes for future uses + fwrite(&e->n_bytes, 8, 1, fp); // n_bytes can always be divided by 8 + fwrite(&e->n_frames, 8, 1, fp); // number of frames + fwrite(e->mcnt + 1, 8, e->asize, fp); // write the marginal counts + for (i = 0, k = e->n_bytes / 8; i < e->n - 1; ++i, k -= RLD_LSIZE) + fwrite(e->z[i], 8, RLD_LSIZE, fp); + fwrite(e->z[i], 8, k, fp); + fwrite(e->frame, 8 * e->asize1, e->n_frames, fp); + fclose(fp); + return 0; +} + +static rld_t *rld_restore_header(const char *fn, FILE **_fp) +{ + FILE *fp; + rld_t *e; + char magic[4]; + uint64_t a[3]; + int32_t i, x; + + if (strcmp(fn, "-") == 0) *_fp = fp = stdin; + else if ((*_fp = fp = fopen(fn, "rb")) == 0) return 0; + fread(magic, 1, 4, fp); + if (strncmp(magic, "RLD\3", 4)) return 0; + fread(&x, 4, 1, fp); + e = rld_init(x>>16, x&0xffff); + fread(a, 8, 3, fp); + e->n_bytes = a[1]; e->n_frames = a[2]; + fread(e->mcnt + 1, 8, e->asize, fp); + for (i = 0; i <= e->asize; ++i) e->cnt[i] = e->mcnt[i]; + for (i = 1; i <= e->asize; ++i) e->cnt[i] += e->cnt[i - 1]; + e->mcnt[0] = e->cnt[e->asize]; + return e; +} + +rld_t *rld_restore(const char *fn) +{ + FILE *fp; + rld_t *e; + uint64_t k, n_blks; + int32_t i; + + if ((e = rld_restore_header(fn, &fp)) == 0) { // then load as plain DNA rle + uint8_t *buf; + int l; + rlditr_t itr; + buf = malloc(0x10000); + e = rld_init(6, 3); + rld_itr_init(e, &itr, 0); + while ((l = fread(buf, 1, 0x10000, fp)) != 0) + for (i = 0; i < l; ++i) + if (buf[i]>>3) rld_enc(e, &itr, buf[i]>>3, buf[i]&7); + fclose(fp); + free(buf); + rld_enc_finish(e, &itr); + return e; + } + if (e->n_bytes / 8 > RLD_LSIZE) { // allocate enough memory + e->n = (e->n_bytes / 8 + RLD_LSIZE - 1) / RLD_LSIZE; + e->z = realloc(e->z, e->n * sizeof(void*)); + for (i = 1; i < e->n; ++i) + e->z[i] = xcalloc(RLD_LSIZE, 8); + } + for (i = 0, k = e->n_bytes / 8; i < e->n - 1; ++i, k -= RLD_LSIZE) + fread(e->z[i], 8, RLD_LSIZE, fp); + fread(e->z[i], 8, k, fp); + e->frame = xmalloc(e->n_frames * e->asize1 * 8); + fread(e->frame, 8 * e->asize1, e->n_frames, fp); + fclose(fp); + n_blks = e->n_bytes * 8 / 64 / e->ssize + 1; + e->ibits = ilog2(e->mcnt[0] / n_blks) + RLD_IBITS_PLUS; + return e; +} + +rld_t *rld_restore_mmap(const char *fn) +{ + FILE *fp; + rld_t *e; + int i; + int64_t n_blks; + + e = rld_restore_header(fn, &fp); + fclose(fp); + free(e->z[0]); free(e->z); + e->n = (e->n_bytes / 8 + RLD_LSIZE - 1) / RLD_LSIZE; + e->z = xcalloc(e->n, sizeof(void*)); + e->fd = open(fn, O_RDONLY); + e->mem = (uint64_t*)mmap(0, rld_file_size(e), PROT_READ, MAP_PRIVATE, e->fd, 0); + for (i = 0; i < e->n; ++i) e->z[i] = e->mem + (4 + e->asize) + (size_t)i * RLD_LSIZE; + e->frame = e->mem + (4 + e->asize) + e->n_bytes/8; + n_blks = e->n_bytes * 8 / 64 / e->ssize + 1; + e->ibits = ilog2(e->mcnt[0] / n_blks) + RLD_IBITS_PLUS; + return e; +} + +/****************** + * Computing rank * + ******************/ + +#ifdef _DNA_ONLY +static inline int64_t rld_dec0_fast_dna(const rld_t *e, rlditr_t *itr, int *c) +{ // This is NOT a replacement of rld_dec0(). It does not do boundary check. + uint64_t x = itr->r == 64? itr->p[0] : itr->p[0] << (64 - itr->r) | itr->p[1] >> itr->r; + if (x>>63 == 0) { + int64_t y; + int l, w = 0x333333335555779bll>>(x>>59<<2)&0xf; + l = (x >> (64 - w)) - 1; + y = x << w >> (64 - l) | 1u << l; + w += l; + *c = x << w >> 61; + w += 3; + itr->r -= w; + if (itr->r <= 0) ++itr->p, itr->r += 64; + return y; + } else { + *c = x << 1 >> 61; + itr->r -= 4; + if (itr->r <= 0) ++itr->p, itr->r += 64; + return 1; + } +} +#endif + +static inline uint64_t rld_locate_blk(const rld_t *e, rlditr_t *itr, uint64_t k, uint64_t *cnt, uint64_t *sum) +{ + int j; + uint64_t c = 0, *q, *z = e->frame + (k>>e->ibits) * e->asize1; + itr->i = e->z + (*z>>RLD_LBITS); + q = itr->p = *itr->i + (*z&RLD_LMASK); + for (j = 1, *sum = 0; j < e->asize1; ++j) *sum += (cnt[j-1] = z[j]); + while (1) { // seek to the small block + int type; + q += e->ssize; + if (q - *itr->i == RLD_LSIZE) q = *++itr->i; + type = rld_block_type(*q); + c = type == 2? *q&0x3fffffffffffffffULL : type == 1? *(uint32_t*)q : *(uint16_t*)q; + if (*sum + c > k) break; + if (type == 0) { + uint16_t *p = (uint16_t*)q + 1; +#ifdef _DNA_ONLY + cnt[0] += p[0]; cnt[1] += p[1]; cnt[2] += p[2]; cnt[3] += p[3]; cnt[4] += p[4]; cnt[5] += p[5]; +#else + for (j = 0; j < e->asize; ++j) cnt[j] += p[j]; +#endif + } else if (type == 1) { + uint32_t *p = (uint32_t*)q + 1; + for (j = 0; j < e->asize; ++j) cnt[j] += p[j] & 0x3fffffff; + } else { + uint64_t *p = (uint64_t*)q + 1; + for (j = 0; j < e->asize; ++j) cnt[j] += p[j]; + } + *sum += c; + itr->p = q; + } + itr->shead = itr->p; + itr->stail = rld_get_stail(e, itr); + itr->p += e->offset0[rld_block_type(*itr->shead)]; + itr->q = (uint8_t*)itr->p; + itr->r = 64; + return c + *sum; +} + +void rld_rank21(const rld_t *e, uint64_t k, uint64_t l, int c, uint64_t *ok, uint64_t *ol) // FIXME: can be faster +{ + *ok = rld_rank11(e, k, c); + *ol = rld_rank11(e, l, c); +} + +int rld_rank1a(const rld_t *e, uint64_t k, uint64_t *ok) +{ + uint64_t z, l; + int a = -1; + rlditr_t itr; + if (k == 0) { + for (a = 0; a < e->asize; ++a) ok[a] = 0; + return -1; + } + rld_locate_blk(e, &itr, k-1, ok, &z); + while (1) { +#ifdef _DNA_ONLY + l = rld_dec0_fast_dna(e, &itr, &a); +#else + l = rld_dec0(e, &itr, &a); +#endif + if (z + l >= k) break; + z += l; ok[a] += l; + } + ok[a] += k - z; + return a; +} + +uint64_t rld_rank11(const rld_t *e, uint64_t k, int c) +{ + uint64_t *ok; + if (k == (uint64_t)-1) return 0; + ok = alloca(e->asize1 * 8); + rld_rank1a(e, k, ok); + return ok[c]; +} + +void rld_rank2a(const rld_t *e, uint64_t k, uint64_t l, uint64_t *ok, uint64_t *ol) +{ + uint64_t z, y, len; + rlditr_t itr; + int a = -1; + if (k == 0) { + for (a = 0; a < e->asize; ++a) ok[a] = 0; + rld_rank1a(e, l, ol); + return; + } + y = rld_locate_blk(e, &itr, k-1, ok, &z); // locate the block bracketing k + while (1) { // compute ok[] +#ifdef _DNA_ONLY + len = rld_dec0_fast_dna(e, &itr, &a); +#else + len = rld_dec0(e, &itr, &a); +#endif + if (z + len >= k) break; + z += len; ok[a] += len; + } + if (y > l) { // we do not need to decode other blocks + int b; + for (b = 0; b < e->asize; ++b) ol[b] = ok[b]; // copy ok[] to ol[] + ok[a] += k - z; // finalize ok[a] + if (z + len < l) { // we need to decode the next run + z += len; ol[a] += len; + while (1) { + len = rld_dec0(e, &itr, &a); + if (z + len >= l) break; + z += len; ol[a] += len; + } + } + ol[a] += l - z; + } else { // we have to decode other blocks + ok[a] += k - z; + rld_rank1a(e, l, ol); + } +} + +int rld_extend(const rld_t *e, const rldintv_t *ik, rldintv_t ok[6], int is_back) +{ // TODO: this can be accelerated a little by using rld_rank1a() when ik.x[2]==1 + uint64_t tk[6], tl[6]; + int i; + rld_rank2a(e, ik->x[!is_back], ik->x[!is_back] + ik->x[2], tk, tl); + for (i = 0; i < 6; ++i) { + ok[i].x[!is_back] = e->cnt[i] + tk[i]; + ok[i].x[2] = (tl[i] -= tk[i]); + } + ok[0].x[is_back] = ik->x[is_back]; + ok[4].x[is_back] = ok[0].x[is_back] + tl[0]; + ok[3].x[is_back] = ok[4].x[is_back] + tl[4]; + ok[2].x[is_back] = ok[3].x[is_back] + tl[3]; + ok[1].x[is_back] = ok[2].x[is_back] + tl[2]; + ok[5].x[is_back] = ok[1].x[is_back] + tl[1]; + return 0; +} diff --git a/rld0.h b/rld0.h new file mode 100644 index 0000000..c811cad --- /dev/null +++ b/rld0.h @@ -0,0 +1,137 @@ +#ifndef RLDELTA0_H +#define RLDELTA0_H + +#define _DNA_ONLY + +#include +#include +#include +#include + +#define RLD_LBITS 23 +#define RLD_LSIZE (1<n_bytes>>3>>(e)->sbits<<(e)->sbits) +#define rld_seek_blk(e, k) ((e)->z[(k)>>RLD_LBITS] + ((k)&RLD_LMASK)) +#define rld_get_stail(e, itr) ((itr)->shead + (e)->ssize - ((itr)->shead + (e)->ssize - *(itr)->i == RLD_LSIZE? 2 : 1)) + +#define rld_block_type(x) ((uint64_t)(x)>>62) + +static inline int64_t rld_dec0(const rld_t *e, rlditr_t *itr, int *c) +{ + int w; + uint64_t x; + int64_t l, y = 0; + x = itr->p[0] << (64 - itr->r) | (itr->p != itr->stail && itr->r != 64? itr->p[1] >> itr->r : 0); + if (x>>63 == 0) { + if ((w = 0x333333335555779bll>>(x>>59<<2)&0xf) == 0xb && x>>58 == 0) return 0; + l = (x >> (64 - w)) - 1; + y = x << w >> (64 - l) | 1u << l; + w += l; + } else w = y = 1; + *c = x << w >> (64 - e->abits); + w += e->abits; + if (itr->r > w) itr->r -= w; + else ++itr->p, itr->r = 64 + itr->r - w; + return y; +} + +static inline int64_t rld_dec(const rld_t *e, rlditr_t *itr, int *_c, int is_free) +{ + int64_t l = rld_dec0(e, itr, _c); + if (l == 0 || *_c > e->asize) { + uint64_t last = rld_last_blk(e); + if (itr->p - *itr->i > RLD_LSIZE - e->ssize) { + if (is_free) { + free(*itr->i); *itr->i = 0; + } + itr->shead = *++itr->i; + } else itr->shead += e->ssize; + if (itr->shead == rld_seek_blk(e, last)) return -1; + itr->p = itr->shead + e->offset0[rld_block_type(*itr->shead)]; + itr->q = (uint8_t*)itr->p; + itr->stail = rld_get_stail(e, itr); + itr->r = 64; + return rld_dec0(e, itr, _c); + } else return l; +} + +// take k symbols from e0 and write it to e +static inline void rld_dec_enc(rld_t *e, rlditr_t *itr, const rld_t *e0, rlditr_t *itr0, int64_t k) +{ + if (itr0->l >= k) { // there are more pending symbols + rld_enc(e, itr, k, itr0->c); + itr0->l -= k; // l - k symbols remains + } else { // use up all pending symbols + int c = -1; // to please gcc + int64_t l; + rld_enc(e, itr, itr0->l, itr0->c); // write all pending symbols + k -= itr0->l; + for (; k > 0; k -= l) { // we always go into this loop because l0l = -k; itr0->c = c; + } +} + +#endif diff --git a/rle.c b/rle.c new file mode 100644 index 0000000..221e1cd --- /dev/null +++ b/rle.c @@ -0,0 +1,191 @@ +#include +#include +#include +#include +#include "rle.h" + +const uint8_t rle_auxtab[8] = { 0x01, 0x11, 0x21, 0x31, 0x03, 0x13, 0x07, 0x17 }; + +// insert symbol $a after $x symbols in $str; marginal counts added to $cnt; returns the size increase +int rle_insert_cached(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t ec[6], int *beg, int64_t bc[6]) +{ + uint16_t *nptr = (uint16_t*)block; + int diff; + + block += 2; // skip the first 2 counting bytes + if (*nptr == 0) { + memset(cnt, 0, 48); + diff = rle_enc1(block, a, rl); + } else { + uint8_t *p, *end = block + *nptr, *q; + int64_t pre, z, l = 0, tot, beg_l; + int c = -1, n_bytes = 0, n_bytes2, t = 0; + uint8_t tmp[24]; + beg_l = bc[0] + bc[1] + bc[2] + bc[3] + bc[4] + bc[5]; + tot = ec[0] + ec[1] + ec[2] + ec[3] + ec[4] + ec[5]; + if (x < beg_l) { + beg_l = 0, *beg = 0; + memset(bc, 0, 48); + } + if (x == beg_l) { + p = q = block + (*beg); z = beg_l; + memcpy(cnt, bc, 48); + } else if (x - beg_l <= ((tot-beg_l)>>1) + ((tot-beg_l)>>3)) { // forward + z = beg_l; p = block + (*beg); + memcpy(cnt, bc, 48); + while (z < x) { + rle_dec1(p, c, l); + z += l; cnt[c] += l; + } + for (q = p - 1; *q>>6 == 2; --q); + } else { // backward + memcpy(cnt, ec, 48); + z = tot; p = end; + while (z >= x) { + --p; + if (*p>>6 != 2) { + l |= *p>>7? (int64_t)rle_auxtab[*p>>3&7]>>4 << t : *p>>3; + z -= l; cnt[*p&7] -= l; + l = 0; t = 0; + } else { + l |= (*p&0x3fL) << t; + t += 6; + } + } + q = p; + rle_dec1(p, c, l); + z += l; cnt[c] += l; + } + *beg = q - block; + memcpy(bc, cnt, 48); + bc[c] -= l; + n_bytes = p - q; + if (x == z && a != c && p < end) { // then try the next run + int tc; + int64_t tl; + q = p; + rle_dec1(q, tc, tl); + if (a == tc) + c = tc, n_bytes = q - p, l = tl, z += l, p = q, cnt[tc] += tl; + } + if (z != x) cnt[c] -= z - x; + pre = x - (z - l); p -= n_bytes; + if (a == c) { // insert to the same run + n_bytes2 = rle_enc1(tmp, c, l + rl); + } else if (x == z) { // at the end; append to the existing run + p += n_bytes; n_bytes = 0; + n_bytes2 = rle_enc1(tmp, a, rl); + } else { // break the current run + n_bytes2 = rle_enc1(tmp, c, pre); + n_bytes2 += rle_enc1(tmp + n_bytes2, a, rl); + n_bytes2 += rle_enc1(tmp + n_bytes2, c, l - pre); + } + if (n_bytes != n_bytes2 && end != p + n_bytes) // size changed + memmove(p + n_bytes2, p + n_bytes, end - p - n_bytes); + memcpy(p, tmp, n_bytes2); + diff = n_bytes2 - n_bytes; + } + return (*nptr += diff); +} + +int rle_insert(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t ec[6]) +{ + int beg = 0; + int64_t bc[6]; + memset(bc, 0, 48); + return rle_insert_cached(block, x, a, rl, cnt, ec, &beg, bc); +} + +void rle_split(uint8_t *block, uint8_t *new_block) +{ + int n = *(uint16_t*)block; + uint8_t *end = block + 2 + n, *q = block + 2 + (n>>1); + while (*q>>6 == 2) --q; + memcpy(new_block + 2, q, end - q); + *(uint16_t*)new_block = end - q; + *(uint16_t*)block = q - block - 2; +} + +void rle_count(const uint8_t *block, int64_t cnt[6]) +{ + const uint8_t *q = block + 2, *end = q + *(uint16_t*)block; + while (q < end) { + int c; + int64_t l; + rle_dec1(q, c, l); + cnt[c] += l; + } +} + +void rle_print(const uint8_t *block, int expand) +{ + const uint16_t *p = (const uint16_t*)block; + const uint8_t *q = block + 2, *end = block + 2 + *p; + while (q < end) { + int c; + int64_t l, x; + rle_dec1(q, c, l); + if (expand) for (x = 0; x < l; ++x) putchar("$ACGTN"[c]); + else printf("%c%ld", "$ACGTN"[c], (long)l); + } + putchar('\n'); +} + +void rle_rank2a(const uint8_t *block, int64_t x, int64_t y, int64_t *cx, int64_t *cy, const int64_t ec[6]) +{ + int a; + int64_t tot, cnt[6]; + const uint8_t *p; + + y = y >= x? y : x; + tot = ec[0] + ec[1] + ec[2] + ec[3] + ec[4] + ec[5]; + if (tot == 0) return; + if (x <= (tot - y) + (tot>>3)) { + int c = 0; + int64_t l, z = 0; + memset(cnt, 0, 48); + p = block + 2; + while (z < x) { + rle_dec1(p, c, l); + z += l; cnt[c] += l; + } + for (a = 0; a != 6; ++a) cx[a] += cnt[a]; + cx[c] -= z - x; + if (cy) { + while (z < y) { + rle_dec1(p, c, l); + z += l; cnt[c] += l; + } + for (a = 0; a != 6; ++a) cy[a] += cnt[a]; + cy[c] -= z - y; + } + } else { +#define move_backward(_x) \ + while (z >= (_x)) { \ + --p; \ + if (*p>>6 != 2) { \ + l |= *p>>7? (int64_t)rle_auxtab[*p>>3&7]>>4 << t : *p>>3; \ + z -= l; cnt[*p&7] -= l; \ + l = 0; t = 0; \ + } else { \ + l |= (*p&0x3fL) << t; \ + t += 6; \ + } \ + } \ + + int t = 0; + int64_t l = 0, z = tot; + memcpy(cnt, ec, 48); + p = block + 2 + *(const uint16_t*)block; + if (cy) { + move_backward(y) + for (a = 0; a != 6; ++a) cy[a] += cnt[a]; + cy[*p&7] += y - z; + } + move_backward(x) + for (a = 0; a != 6; ++a) cx[a] += cnt[a]; + cx[*p&7] += x - z; + +#undef move_backward + } +} diff --git a/rle.h b/rle.h new file mode 100644 index 0000000..4f8946d --- /dev/null +++ b/rle.h @@ -0,0 +1,77 @@ +#ifndef RLE6_H_ +#define RLE6_H_ + +#include + +#ifdef __GNUC__ +#define LIKELY(x) __builtin_expect((x),1) +#else +#define LIKELY(x) (x) +#endif +#ifdef __cplusplus + +extern "C" { +#endif + + int rle_insert_cached(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t ec[6], int *beg, int64_t bc[6]); + int rle_insert(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t end_cnt[6]); + void rle_split(uint8_t *block, uint8_t *new_block); + void rle_count(const uint8_t *block, int64_t cnt[6]); + void rle_rank2a(const uint8_t *block, int64_t x, int64_t y, int64_t *cx, int64_t *cy, const int64_t ec[6]); + #define rle_rank1a(block, x, cx, ec) rle_rank2a(block, x, -1, cx, 0, ec) + + void rle_print(const uint8_t *block, int expand); + +#ifdef __cplusplus +} +#endif + +/****************** + *** 43+3 codec *** + ******************/ + +extern const uint8_t rle_auxtab[8]; + +#define RLE_MIN_SPACE 18 +#define rle_nptr(block) ((uint16_t*)(block)) + +// decode one run (c,l) and move the pointer p +#define rle_dec1(p, c, l) do { \ + (c) = *(p) & 7; \ + if (LIKELY((*(p)&0x80) == 0)) { \ + (l) = *(p)++ >> 3; \ + } else if (LIKELY(*(p)>>5 == 6)) { \ + (l) = (*(p)&0x18L)<<3L | ((p)[1]&0x3fL); \ + (p) += 2; \ + } else { \ + int n = ((*(p)&0x10) >> 2) + 4; \ + (l) = *(p)++ >> 3 & 1; \ + while (--n) (l) = ((l)<<6) | (*(p)++&0x3fL); \ + } \ + } while (0) + +static inline int rle_enc1(uint8_t *p, int c, int64_t l) +{ + if (l < 1LL<<4) { + *p = l << 3 | c; + return 1; + } else if (l < 1LL<<8) { + *p = 0xC0 | l >> 6 << 3 | c; + p[1] = 0x80 | (l & 0x3f); + return 2; + } else if (l < 1LL<<19) { + *p = 0xE0 | l >> 18 << 3 | c; + p[1] = 0x80 | (l >> 12 & 0x3f); + p[2] = 0x80 | (l >> 6 & 0x3f); + p[3] = 0x80 | (l & 0x3f); + return 4; + } else { + int i, shift = 36; + *p = 0xF0 | l >> 42 << 3 | c; + for (i = 1; i < 8; ++i, shift -= 6) + p[i] = 0x80 | (l>>shift & 0x3f); + return 8; + } +} + +#endif diff --git a/rope.c b/rope.c new file mode 100644 index 0000000..8d21a06 --- /dev/null +++ b/rope.c @@ -0,0 +1,219 @@ +#include +#include +#include +#include +#include +#include "rle.h" +#include "rope.h" + +/******************* + *** Memory Pool *** + *******************/ + +#define MP_CHUNK_SIZE 0x100000 // 1MB per chunk + +typedef struct { // memory pool for fast and compact memory allocation (no free) + int size, i, n_elems; + int64_t top, max; + uint8_t **mem; +} mempool_t; + +static mempool_t *mp_init(int size) +{ + mempool_t *mp; + mp = calloc(1, sizeof(mempool_t)); + mp->size = size; + mp->i = mp->n_elems = MP_CHUNK_SIZE / size; + mp->top = -1; + return mp; +} + +static void mp_destroy(mempool_t *mp) +{ + int64_t i; + for (i = 0; i <= mp->top; ++i) free(mp->mem[i]); + free(mp->mem); free(mp); +} + +static inline void *mp_alloc(mempool_t *mp) +{ + if (mp->i == mp->n_elems) { + if (++mp->top == mp->max) { + mp->max = mp->max? mp->max<<1 : 1; + mp->mem = realloc(mp->mem, sizeof(void*) * mp->max); + } + mp->mem[mp->top] = calloc(mp->n_elems, mp->size); + mp->i = 0; + } + return mp->mem[mp->top] + (mp->i++) * mp->size; +} + +/*************** + *** B+ rope *** + ***************/ + +rope_t *rope_init(int max_nodes, int block_len) +{ + rope_t *rope; + rope = calloc(1, sizeof(rope_t)); + if (block_len < 32) block_len = 32; + rope->max_nodes = (max_nodes+ 1)>>1<<1; + rope->block_len = (block_len + 7) >> 3 << 3; + rope->node = mp_init(sizeof(rpnode_t) * rope->max_nodes); + rope->leaf = mp_init(rope->block_len); + rope->root = mp_alloc(rope->node); + rope->root->n = 1; + rope->root->is_bottom = 1; + rope->root->p = mp_alloc(rope->leaf); + return rope; +} + +void rope_destroy(rope_t *rope) +{ + mp_destroy(rope->node); + mp_destroy(rope->leaf); + free(rope); +} + +static inline rpnode_t *split_node(rope_t *rope, rpnode_t *u, rpnode_t *v) +{ // split $v's child. $u is the first node in the bucket. $v and $u are in the same bucket. IMPORTANT: there is always enough room in $u + int j, i = v - u; + rpnode_t *w; // $w is the sibling of $v + if (u == 0) { // only happens at the root; add a new root + u = v = mp_alloc(rope->node); + v->n = 1; v->p = rope->root; // the new root has the old root as the only child + memcpy(v->c, rope->c, 48); + for (j = 0; j < 6; ++j) v->l += v->c[j]; + rope->root = v; + } + if (i != u->n - 1) // then make room for a new node + memmove(v + 2, v + 1, sizeof(rpnode_t) * (u->n - i - 1)); + ++u->n; w = v + 1; + memset(w, 0, sizeof(rpnode_t)); + w->p = mp_alloc(u->is_bottom? rope->leaf : rope->node); + if (u->is_bottom) { // we are at the bottom level; $v->p is a string instead of a node + uint8_t *p = (uint8_t*)v->p, *q = (uint8_t*)w->p; + rle_split(p, q); + rle_count(q, w->c); + } else { // $v->p is a node, not a string + rpnode_t *p = v->p, *q = w->p; // $v and $w are siblings and thus $p and $q are cousins + p->n -= rope->max_nodes>>1; + memcpy(q, p + p->n, sizeof(rpnode_t) * (rope->max_nodes>>1)); + q->n = rope->max_nodes>>1; // NB: this line must below memcpy() as $q->n and $q->is_bottom are modified by memcpy() + q->is_bottom = p->is_bottom; + for (i = 0; i < q->n; ++i) + for (j = 0; j < 6; ++j) + w->c[j] += q[i].c[j]; + } + for (j = 0; j < 6; ++j) // compute $w->l and update $v->c + w->l += w->c[j], v->c[j] -= w->c[j]; + v->l -= w->l; // update $v->c + return v; +} + +int64_t rope_insert_run(rope_t *rope, int64_t x, int a, int64_t rl, rpcache_t *cache) +{ // insert $a after $x symbols in $rope and the returns rank(a, x) + rpnode_t *u = 0, *v = 0, *p = rope->root; // $v is the parent of $p; $u and $v are at the same level and $u is the first node in the bucket + int64_t y = 0, z = 0, cnt[6]; + int n_runs; + do { // top-down update. Searching and node splitting are done together in one pass. + if (p->n == rope->max_nodes) { // node is full; split + v = split_node(rope, u, v); // $v points to the parent of $p; when a new root is added, $v points to the root + if (y + v->l < x) // if $v is not long enough after the split, we need to move both $p and its parent $v + y += v->l, z += v->c[a], ++v, p = v->p; + } + u = p; + if (v && x - y > v->l>>1) { // then search backwardly for the right node to descend + p += p->n - 1; y += v->l; z += v->c[a]; + for (; y >= x; --p) y -= p->l, z -= p->c[a]; + ++p; + } else for (; y + p->l < x; ++p) y += p->l, z += p->c[a]; // then search forwardly + assert(p - u < u->n); + if (v) v->c[a] += rl, v->l += rl; // we should not change p->c[a] because this may cause troubles when p's child is split + v = p; p = p->p; // descend + } while (!u->is_bottom); + rope->c[a] += rl; // $rope->c should be updated after the loop as adding a new root needs the old $rope->c counts + if (cache) { + if (cache->p != (uint8_t*)p) memset(cache, 0, sizeof(rpcache_t)); + n_runs = rle_insert_cached((uint8_t*)p, x - y, a, rl, cnt, v->c, &cache->beg, cache->bc); + cache->p = (uint8_t*)p; + } else n_runs = rle_insert((uint8_t*)p, x - y, a, rl, cnt, v->c); + z += cnt[a]; + v->c[a] += rl; v->l += rl; // this should be after rle_insert(); otherwise rle_insert() won't work + if (n_runs + RLE_MIN_SPACE > rope->block_len) { + split_node(rope, u, v); + if (cache) memset(cache, 0, sizeof(rpcache_t)); + } + return z; +} + +static rpnode_t *rope_count_to_leaf(const rope_t *rope, int64_t x, int64_t cx[6], int64_t *rest) +{ + rpnode_t *u, *v = 0, *p = rope->root; + int64_t y = 0; + int a; + + memset(cx, 0, 48); + do { + u = p; + if (v && x - y > v->l>>1) { + p += p->n - 1; y += v->l; + for (a = 0; a != 6; ++a) cx[a] += v->c[a]; + for (; y >= x; --p) { + y -= p->l; + for (a = 0; a != 6; ++a) cx[a] -= p->c[a]; + } + ++p; + } else { + for (; y + p->l < x; ++p) { + y += p->l; + for (a = 0; a != 6; ++a) cx[a] += p->c[a]; + } + } + v = p; p = p->p; + } while (!u->is_bottom); + *rest = x - y; + return v; +} + +void rope_rank2a(const rope_t *rope, int64_t x, int64_t y, int64_t *cx, int64_t *cy) +{ + rpnode_t *v; + int64_t rest; + v = rope_count_to_leaf(rope, x, cx, &rest); + if (y < x || cy == 0) { + rle_rank1a((const uint8_t*)v->p, rest, cx, v->c); + } else if (rest + (y - x) <= v->l) { + memcpy(cy, cx, 48); + rle_rank2a((const uint8_t*)v->p, rest, rest + (y - x), cx, cy, v->c); + } else { + rle_rank1a((const uint8_t*)v->p, rest, cx, v->c); + v = rope_count_to_leaf(rope, y, cy, &rest); + rle_rank1a((const uint8_t*)v->p, rest, cy, v->c); + } +} + +/********************* + *** Rope iterator *** + *********************/ + +void rope_itr_first(const rope_t *rope, rpitr_t *i) +{ + memset(i, 0, sizeof(rpitr_t)); + i->rope = rope; + for (i->pa[i->d] = rope->root; !i->pa[i->d]->is_bottom;) // descend to the leftmost leaf + ++i->d, i->pa[i->d] = i->pa[i->d - 1]->p; +} + +const uint8_t *rope_itr_next_block(rpitr_t *i) +{ + const uint8_t *ret; + assert(i->d < ROPE_MAX_DEPTH); // a B+ tree should not be that tall + if (i->d < 0) return 0; + ret = (uint8_t*)i->pa[i->d][i->ia[i->d]].p; + while (i->d >= 0 && ++i->ia[i->d] == i->pa[i->d]->n) i->ia[i->d--] = 0; // backtracking + if (i->d >= 0) + while (!i->pa[i->d]->is_bottom) // descend to the leftmost leaf + ++i->d, i->pa[i->d] = i->pa[i->d - 1][i->ia[i->d - 1]].p; + return ret; +} diff --git a/rope.h b/rope.h new file mode 100644 index 0000000..d114713 --- /dev/null +++ b/rope.h @@ -0,0 +1,54 @@ +#ifndef ROPE_H_ +#define ROPE_H_ + +#include +#include + +#define ROPE_MAX_DEPTH 80 +#define ROPE_DEF_MAX_NODES 64 +#define ROPE_DEF_BLOCK_LEN 512 + +typedef struct rpnode_s { + struct rpnode_s *p; // child; at the bottom level, $p points to a string with the first 2 bytes giving the number of runs (#runs) + uint64_t l:54, n:9, is_bottom:1; // $n and $is_bottom are only set for the first node in a bucket + int64_t c[6]; // marginal counts +} rpnode_t; + +typedef struct { + int32_t max_nodes, block_len; // both MUST BE even numbers + int64_t c[6]; // marginal counts + rpnode_t *root; + void *node, *leaf; // memory pool +} rope_t; + +typedef struct { + const rope_t *rope; // the rope + const rpnode_t *pa[ROPE_MAX_DEPTH]; // parent nodes + int ia[ROPE_MAX_DEPTH]; // index in the parent nodes + int d; // the current depth in the B+-tree +} rpitr_t; + +typedef struct { + int beg; + int64_t bc[6]; + uint8_t *p; +} rpcache_t; + +#ifdef __cplusplus +extern "C" { +#endif + + rope_t *rope_init(int max_nodes, int block_len); + void rope_destroy(rope_t *rope); + int64_t rope_insert_run(rope_t *rope, int64_t x, int a, int64_t rl, rpcache_t *cache); + void rope_rank2a(const rope_t *rope, int64_t x, int64_t y, int64_t *cx, int64_t *cy); + #define rope_rank1a(rope, x, cx) rope_rank2a(rope, x, -1, cx, 0) + + void rope_itr_first(const rope_t *rope, rpitr_t *i); + const uint8_t *rope_itr_next_block(rpitr_t *i); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/test/MT-simu.fq.gz b/test/MT-simu.fq.gz new file mode 100644 index 0000000..8d0558a Binary files /dev/null and b/test/MT-simu.fq.gz differ diff --git a/unitig.c b/unitig.c new file mode 100644 index 0000000..55e2fd9 --- /dev/null +++ b/unitig.c @@ -0,0 +1,455 @@ +#include +#include +#include +#include "kvec.h" +#include "kstring.h" +#include "rld0.h" +#include "mag.h" +#include "internal.h" + +/****************** + *** From fermi *** + ******************/ + +typedef struct { size_t n, m; int32_t *a; } fm32s_v; +typedef struct { size_t n, m; rldintv_t *a; } rldintv_v; + +static uint64_t utg_primes[] = { 123457, 234571, 345679, 456791, 567899, 0 }; + +#define fm6_comp(a) ((a) >= 1 && (a) <= 4? 5 - (a) : (a)) +#define fm6_set_intv(e, c, ik) ((ik).x[0] = (e)->cnt[(int)(c)], (ik).x[2] = (e)->cnt[(int)(c)+1] - (e)->cnt[(int)(c)], (ik).x[1] = (e)->cnt[fm6_comp(c)], (ik).info = 0) + +int rld_extend0(const rld_t *e, const rldintv_t *ik, rldintv_t *ok0, int is_back) +{ // FIXME: this can be accelerated a little by using rld_rank1a() when ik.x[2]==1 + uint64_t tk[6], tl[6]; + rld_rank2a(e, ik->x[!is_back], ik->x[!is_back] + ik->x[2], tk, tl); + ok0->x[!is_back] = tk[0]; + ok0->x[is_back] = ik->x[is_back]; + ok0->x[2] = tl[0] - tk[0]; + return 0; +} + +uint64_t fm6_retrieve(const rld_t *e, uint64_t x, kstring_t *s, rldintv_t *k2, int *contained) +{ + uint64_t k = x, ok[6]; + rldintv_t ok2[6]; + s->l = 0; *contained = 0; + while (1) { + int c = rld_rank1a(e, k + 1, ok); + k = e->cnt[c] + ok[c] - 1; + if (c == 0) break; + if (s->l > 0) { + if (k2->x[2] == 1) k2->x[0] = k; + else { + rld_extend(e, k2, ok2, 1); + *k2 = ok2[c]; + } + } else fm6_set_intv(e, c, *k2); + kputc(c, s); + } + if (k2->x[2] != 1) { + rld_extend(e, k2, ok2, 1); + if (ok2[0].x[2] != k2->x[2]) *contained |= 1; // left contained + *k2 = ok2[0]; + } else k2->x[0] = k; + rld_extend(e, k2, ok2, 0); + if (ok2[0].x[2] != k2->x[2]) *contained |= 2; // right contained + *k2 = ok2[0]; + return k; +} + +/***************** + *** Main body *** + *****************/ + +#define info_lt(a, b) ((a).info < (b).info) + +#include "ksort.h" +KSORT_INIT(infocmp, rldintv_t, info_lt) + +static inline void set_bit(uint64_t *bits, uint64_t x) +{ + uint64_t *p = bits + (x>>6); + uint64_t z = 1LLU<<(x&0x3f); + __sync_fetch_and_or(p, z); +} + +static inline void set_bits(uint64_t *bits, const rldintv_t *p) +{ + uint64_t k; + for (k = 0; k < p->x[2]; ++k) { + set_bit(bits, p->x[0] + k); + set_bit(bits, p->x[1] + k); + } +} + +static rldintv_t overlap_intv(const rld_t *e, int len, const uint8_t *seq, int min, int j, int at5, rldintv_v *p, int inc_sentinel) +{ // requirement: seq[j] matches the end of a read + int c, depth, dir, end; + rldintv_t ik, ok[6]; + p->n = 0; + dir = at5? 1 : -1; // at5 is true iff we start from the 5'-end of a read + end = at5? len : -1; + c = seq[j]; + fm6_set_intv(e, c, ik); + for (depth = 1, j += dir; j != end; j += dir, ++depth) { + c = at5? fm6_comp(seq[j]) : seq[j]; + rld_extend(e, &ik, ok, !at5); + if (!ok[c].x[2]) break; // cannot be extended + if (depth >= min && ok[0].x[2]) { + if (inc_sentinel) { + ok[0].info = j - dir; + kv_push(rldintv_t, *p, ok[0]); + } else { + ik.info = j - dir; + kv_push(rldintv_t, *p, ik); + } + } + ik = ok[c]; + } + kv_reverse(rldintv_t, *p, 0); // reverse the array such that the smallest interval comes first + return ik; +} + +typedef struct { + const rld_t *e; + int min_match, min_merge_len; + rldintv_v a[2], nei; + fm32s_v cat; + uint64_t *used, *bend; + kstring_t str; + uint64_t n, sum, sum2, unpaired; +} aux_t; + +int fm6_is_contained(const rld_t *e, int min_match, const kstring_t *s, rldintv_t *intv, rldintv_v *ovlp) +{ // for s is a sequence in e, test if s is contained in other sequences in e; return intervals right overlapping with s + rldintv_t ik, ok[6]; + int ret = 0; + assert(s->l > min_match); + ovlp->n = 0; + ik = overlap_intv(e, s->l, (uint8_t*)s->s, min_match, s->l - 1, 0, ovlp, 0); + rld_extend(e, &ik, ok, 1); assert(ok[0].x[2]); + if (ik.x[2] != ok[0].x[2]) ret = -1; // the sequence is left contained + ik = ok[0]; + rld_extend(e, &ik, ok, 0); assert(ok[0].x[2]); + if (ik.x[2] != ok[0].x[2]) ret = -1; // the sequence is right contained + *intv = ok[0]; + return ret; +} + +int fm6_get_nei(const rld_t *e, int min_match, int beg, kstring_t *s, rldintv_v *nei, // input and output variables + rldintv_v *prev, rldintv_v *curr, fm32s_v *cat, // temporary arrays + uint64_t *used) // optional info +{ + int ori_l = s->l, j, i, c, rbeg, is_forked = 0; + rldintv_v *swap; + rldintv_t ok[6], ok0; + + curr->n = nei->n = cat->n = 0; + if (prev->n == 0) { // when this routine is called for the seed, prev may filled by fm6_is_contained() + overlap_intv(e, s->l - beg, (uint8_t*)s->s + beg, min_match, s->l - beg - 1, 0, prev, 0); + if (prev->n == 0) return -1; // no overlap + for (j = 0; j < prev->n; ++j) prev->a[j].info += beg; + } + kv_resize(int, *cat, prev->m); + for (j = 0; j < prev->n; ++j) cat->a[j] = 0; // only one interval; all point to 0 + while (prev->n) { + for (j = 0, curr->n = 0; j < prev->n; ++j) { + rldintv_t *p = &prev->a[j]; + if (cat->a[j] < 0) continue; + rld_extend(e, p, ok, 0); // forward extension + if (ok[0].x[2] && ori_l != s->l) { // some (partial) reads end here + rld_extend0(e, &ok[0], &ok0, 1); // backward extension to look for sentinels + if (ok0.x[2]) { // the match is bounded by sentinels - a full-length match + if (ok[0].x[2] == p->x[2] && p->x[2] == ok0.x[2]) { // never consider a read contained in another read + int cat0 = cat->a[j]; // a category approximately corresponds to one neighbor, though not always + assert(j == 0 || cat->a[j] > cat->a[j-1]); // otherwise not irreducible + ok0.info = ori_l - (p->info&0xffffffffU); + for (i = j; i < prev->n && cat->a[i] == cat0; ++i) cat->a[i] = -1; // mask out other intervals of the same cat + kv_push(rldintv_t, *nei, ok0); // keep in the neighbor vector + continue; // no need to go through for(c); do NOT set "used" as this neighbor may be rejected later + } else if (used) set_bits(used, &ok0); // the read is contained in another read; mark it as used + } + } // ~if(ok[0].x[2]) + if (cat->a[j] < 0) continue; // no need to proceed if we have finished this path + for (c = 1; c < 5; ++c) // collect extensible intervals + if (ok[c].x[2]) { + rld_extend0(e, &ok[c], &ok0, 1); + if (ok0.x[2]) { // do not extend intervals whose left end is not bounded by a sentinel + ok[c].info = (p->info&0xfffffff0ffffffffLLU) | (uint64_t)c<<32; + kv_push(rldintv_t, *curr, ok[c]); + } + } + } // ~for(j) + if (curr->n) { // update category + uint32_t last, cat0; + kv_resize(int, *cat, curr->m); + c = curr->a[0].info>>32&0xf; + kputc(fm6_comp(c), s); + ks_introsort(infocmp, curr->n, curr->a); + last = curr->a[0].info >> 32; + cat->a[0] = 0; + curr->a[0].info &= 0xffffffff; + for (j = 1, cat0 = 0; j < curr->n; ++j) { // this loop recalculate cat + if (curr->a[j].info>>32 != last) + last = curr->a[j].info>>32, cat0 = j; + cat->a[j] = cat0; + curr->a[j].info = (curr->a[j].info&0xffffffff) | (uint64_t)cat0<<36; + } + if (cat0 != 0) is_forked = 1; + } + swap = curr; curr = prev; prev = swap; // swap curr and prev + } // ~while(prev->n) + if (nei->n == 0) return -1; // no overlap + rbeg = ori_l - (uint32_t)nei->a[0].info; + if (nei->n == 1 && is_forked) { // this may happen if there are contained reads; fix this + fm6_set_intv(e, 0, ok0); + for (i = rbeg; i < ori_l; ++i) { + rld_extend(e, &ok0, ok, 0); + ok0 = ok[fm6_comp(s->s[i])]; + } + for (i = ori_l; i < s->l; ++i) { + int c0 = -1; + rld_extend(e, &ok0, ok, 0); + for (c = 1, j = 0; c < 5; ++c) + if (ok[c].x[2] && ok[c].x[0] <= nei->a[0].x[0] && ok[c].x[0] + ok[c].x[2] >= nei->a[0].x[0] + nei->a[0].x[2]) + ++j, c0 = c; + if (j == 0 && ok[0].x[2]) break; + assert(j == 1); + s->s[i] = fm6_comp(c0); + ok0 = ok[c0]; + } + s->l = i; s->s[s->l] = 0; + } + if (nei->n > 1) s->l = ori_l, s->s[s->l] = 0; + return rbeg; +} + +static int try_right(aux_t *a, int beg, kstring_t *s) +{ + return fm6_get_nei(a->e, a->min_match, beg, s, &a->nei, &a->a[0], &a->a[1], &a->cat, a->used); +} + +static int check_left_simple(aux_t *a, int beg, int rbeg, const kstring_t *s) +{ + rldintv_t ok[6]; + rldintv_v *prev = &a->a[0], *curr = &a->a[1], *swap; + int i, j; + + overlap_intv(a->e, s->l, (uint8_t*)s->s, a->min_match, rbeg, 1, prev, 1); + for (i = rbeg - 1; i >= beg; --i) { + for (j = 0, curr->n = 0; j < prev->n; ++j) { + rldintv_t *p = &prev->a[j]; + rld_extend(a->e, p, ok, 1); + if (ok[0].x[2]) set_bits(a->used, &ok[0]); // some reads end here; they must be contained in a longer read + if (ok[0].x[2] + ok[(int)s->s[i]].x[2] != p->x[2]) return -1; // potential backward bifurcation + kv_push(rldintv_t, *curr, ok[(int)s->s[i]]); + } + swap = curr; curr = prev; prev = swap; + } // ~for(i) + return 0; +} + +static int check_left(aux_t *a, int beg, int rbeg, const kstring_t *s) +{ + int i, ret; + rldintv_t tmp; + assert(a->nei.n == 1); + ret = check_left_simple(a, beg, rbeg, s); + if (ret == 0) return 0; + // when ret<0, the back fork may be caused by a contained read. we have to do more to confirm this. + tmp = a->nei.a[0]; // backup the neighbour as it will be overwritten by try_right() + a->a[0].n = a->a[1].n = a->nei.n = 0; + ks_resize(&a->str, s->l - rbeg + 1); + for (i = s->l - 1, a->str.l = 0; i >= rbeg; --i) + a->str.s[a->str.l++] = fm6_comp(s->s[i]); + a->str.s[a->str.l] = 0; + try_right(a, 0, &a->str); + assert(a->nei.n >= 1); + ret = a->nei.n > 1? -1 : 0; + a->nei.n = 1; a->nei.a[0] = tmp; // recover the original neighbour + return ret; +} + +static int unitig_unidir(aux_t *a, kstring_t *s, kstring_t *cov, int beg0, uint64_t k0, uint64_t *end, int *is_loop) +{ + int i, beg = beg0, rbeg, ori_l = s->l, n_reads = 0; + *is_loop = 0; + while ((rbeg = try_right(a, beg, s)) >= 0) { // loop if there is at least one overlap + uint64_t k; + if (a->nei.n > 1) { // forward bifurcation + set_bit(a->bend, *end); + break; + } + if ((k = a->nei.a[0].x[0]) == *end) break; // a loop like b>>c>>a>bend[k>>6]>>(k&0x3f)&1) || check_left(a, beg, rbeg, s) < 0)) { // backward bifurcation + set_bit(a->bend, k); + break; + } + if (k == k0) { // a loop like a>>b>>c>>a + *is_loop = 1; + break; + } + if (a->nei.a[0].x[1] == *end) { // a loop like b>>c>>a>>a; cut the last link + a->nei.n = 0; + break; + } + if ((int)a->nei.a[0].info < a->min_merge_len) break; // the overlap is not long enough + *end = a->nei.a[0].x[1]; + set_bits(a->used, &a->nei.a[0]); // successful extension + ++n_reads; + if (cov->m < s->m) ks_resize(cov, s->m); + cov->l = s->l; cov->s[cov->l] = 0; + for (i = rbeg; i < ori_l; ++i) // update the coverage string + if (cov->s[i] != '~') ++cov->s[i]; + for (i = ori_l; i < s->l; ++i) cov->s[i] = '"'; + beg = rbeg; ori_l = s->l; a->a[0].n = a->a[1].n = 0; // prepare for the next round of loop + } + cov->l = s->l = ori_l; s->s[ori_l] = cov->s[ori_l] = 0; + return n_reads; +} + +static void copy_nei(ku128_v *dst, const rldintv_v *src) +{ + int i; + for (i = 0; i < src->n; ++i) { + ku128_t z; + z.x = src->a[i].x[0]; z.y = src->a[i].info; + kv_push(ku128_t, *dst, z); + } +} + +static int unitig1(aux_t *a, int64_t seed, kstring_t *s, kstring_t *cov, uint64_t end[2], ku128_v nei[2], int *n_reads) +{ + rldintv_t intv0; + int seed_len, ret, is_loop, contained; + int64_t k; + size_t i; + + *n_reads = nei[0].n = nei[1].n = 0; + if (a->used[seed>>6]>>(seed&0x3f)&1) return -2; // used + // retrieve the sequence pointed by seed + k = fm6_retrieve(a->e, seed, s, &intv0, &contained); + seq_reverse(s->l, (uint8_t*)s->s); + seed_len = s->l; + // check contained status + if (intv0.x[2] > 1 && k != intv0.x[0]) return -3; // duplicated, but not the first + set_bits(a->used, &intv0); + if (contained) return -3; // contained + // check length, containment and if used before + if (s->l <= a->min_match) return -1; // too short + ret = fm6_is_contained(a->e, a->min_match, s, &intv0, &a->a[0]); + *n_reads = 1; + // initialize the coverage string + if (cov->m < s->m) ks_resize(cov, s->m); + cov->l = s->l; cov->s[cov->l] = 0; + for (i = 0; i < cov->l; ++i) cov->s[i] = '"'; + // left-wards extension + end[0] = intv0.x[1]; end[1] = intv0.x[0]; + if (a->a[0].n) { // no need to extend to the right if there is no overlap + *n_reads += unitig_unidir(a, s, cov, 0, intv0.x[0], &end[0], &is_loop); + copy_nei(&nei[0], &a->nei); + if (is_loop) { + ku128_t z; + z.x = end[0]; z.y = a->nei.a[0].info; + kv_push(ku128_t, nei[1], z); + return 0; + } + } + // right-wards extension + a->a[0].n = a->a[1].n = a->nei.n = 0; + seq_revcomp6(s->l, (uint8_t*)s->s); // reverse complement for extension in the other direction + seq_reverse(cov->l, (uint8_t*)cov->s); // reverse the coverage + *n_reads += unitig_unidir(a, s, cov, s->l - seed_len, intv0.x[1], &end[1], &is_loop); + copy_nei(&nei[1], &a->nei); + return 0; +} + +typedef struct { + long max_l; + aux_t a; + kstring_t str, cov; + magv_t z; + magv_v v; +} thrdat_t; + +typedef struct { + uint64_t prime, *used, *bend, *visited; + const rld_t *e; + thrdat_t *d; +} worker_t; + +static void worker(void *data, long _i, int tid) +{ + worker_t *w = (worker_t*)data; + thrdat_t *d = &w->d[tid]; + uint64_t i = (w->prime * _i) % w->e->mcnt[1]; + if (unitig1(&d->a, i, &d->str, &d->cov, d->z.k, d->z.nei, &d->z.nsr) >= 0) { // then we keep the unitig + uint64_t *p[2], x[2]; + magv_t *q; + p[0] = w->visited + (d->z.k[0]>>6); x[0] = 1LLU<<(d->z.k[0]&0x3f); + p[1] = w->visited + (d->z.k[1]>>6); x[1] = 1LLU<<(d->z.k[1]&0x3f); + if ((__sync_fetch_and_or(p[0], x[0])&x[0]) || (__sync_fetch_and_or(p[1], x[1])&x[1])) return; + d->z.len = d->str.l; + if (d->max_l < d->str.m) { + d->max_l = d->str.m; + d->z.seq = realloc(d->z.seq, d->max_l); + d->z.cov = realloc(d->z.cov, d->max_l); + } + memcpy(d->z.seq, d->str.s, d->z.len); + memcpy(d->z.cov, d->cov.s, d->z.len + 1); + kv_pushp(magv_t, d->v, &q); + mag_v_copy_to_empty(q, &d->z); + } +} + +mag_t *fml_fmi2mag_core(const rld_t *e, int min_match, int min_merge_len, int n_threads) +{ + extern void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n); + worker_t w; + int j; + mag_t *g; + + w.used = (uint64_t*)calloc((e->mcnt[1] + 63)/64, 8); + w.bend = (uint64_t*)calloc((e->mcnt[1] + 63)/64, 8); + w.visited = (uint64_t*)calloc((e->mcnt[1] + 63)/64, 8); + w.e = e; + assert(e->mcnt[1] >= n_threads * 2); + w.d = calloc(n_threads, sizeof(thrdat_t)); + w.prime = 0; + for (j = 0; utg_primes[j] > 0; ++j) + if (e->mcnt[1] % utg_primes[j] != 0) { + w.prime = utg_primes[j]; + break; + } + assert(w.prime); + for (j = 0; j < n_threads; ++j) { + w.d[j].a.e = e; w.d[j].a.min_match = min_match; w.d[j].a.min_merge_len = min_merge_len; + w.d[j].a.used = w.used; w.d[j].a.bend = w.bend; + } + kt_for(n_threads, worker, &w, e->mcnt[1]); + g = (mag_t*)calloc(1, sizeof(mag_t)); + for (j = 0; j < n_threads; ++j) { + kv_resize(magv_t, g->v, g->v.n + w.d[j].v.n); + memcpy(g->v.a + g->v.n, w.d[j].v.a, w.d[j].v.n * sizeof(magv_t)); + g->v.n += w.d[j].v.n; + free(w.d[j].v.a); + free(w.d[j].a.a[0].a); free(w.d[j].a.a[1].a); free(w.d[j].a.nei.a); free(w.d[j].a.cat.a); + free(w.d[j].z.nei[0].a); free(w.d[j].z.nei[1].a); free(w.d[j].z.seq); free(w.d[j].z.cov); + free(w.d[j].a.str.s); free(w.d[j].str.s); free(w.d[j].cov.s); + } + free(w.d); free(w.used); free(w.bend); free(w.visited); + + mag_g_build_hash(g); + mag_g_amend(g); + g->rdist = mag_cal_rdist(g); + return g; +} + +mag_t *fml_fmi2mag(const fml_opt_t *opt, rld_t *e) +{ + mag_t *g; + g = fml_fmi2mag_core(e, opt->min_asm_ovlp, opt->min_merge_len, opt->n_threads); + rld_destroy(e); + return g; +}