diff --git a/layout.c b/layout.c index c90d9e3..0069b44 100644 --- a/layout.c +++ b/layout.c @@ -4,6 +4,8 @@ #include #include #include +#include + #include "kvec.h" #include "khash.h" #include "kseq.h" @@ -24,20 +26,20 @@ typedef struct { float min_aln_ratio; } lo_opt_t; -KHASH_SET_INIT_INT(int) +KHASH_MAP_INIT_INT(int, int) typedef khash_t(int) inthash_t; typedef struct { int id; int contained; char *name; - inthash_t *nei; + inthash_t *nei[2]; } vertex_t; typedef kvec_t(vertex_t) vertex_v; typedef struct { - int type, score, l[2], s[2], e[2]; // length, start and end + int type, score, l[2], s[2], e[2], d[2]; // length, start and end } edgeinfo_t; KHASH_MAP_INIT_INT64(edge, edgeinfo_t) @@ -71,7 +73,7 @@ ograph_t *lo_graph_init() return g; } -int lo_infer_edge_type(const lo_opt_t *opt, int l[2], int s[2], int e[2]) +int lo_infer_edge_type(const lo_opt_t *opt, int l[2], int s[2], int e[2], int d[2]) { int el, x[2], a[2], r[2]; // x: eXtended length, a: Aligned length; r: Remaining length int t[2][2], type; @@ -93,6 +95,7 @@ int lo_infer_edge_type(const lo_opt_t *opt, int l[2], int s[2], int e[2]) if (r[0] < 0) type ^= 3; // reverse the direction } else type = x[0] / l[0] > x[1] / l[1]? LO_T_C1 : LO_T_C2; } else type = LO_T_I; // internal local match; not a suffix-prefix match + d[0] = abs(r[0]); d[1] = abs(r[1]); return type; } @@ -119,8 +122,9 @@ ograph_t *lo_graph_parse(const lo_opt_t *opt, kstream_t *ks) z->id = kh_size(g->n); z->name = strdup(p); z->contained = 0; - z->nei = 0; // don't initialize the hash table right now + z->nei[0] = z->nei[1] = 0; // don't initialize the hash table right now k = kh_put(name, g->n, z->name, &absent); + assert(absent); kh_val(g->n, k) = z->id; } id[(i==4)] = kh_val(g->n, k); @@ -138,7 +142,7 @@ ograph_t *lo_graph_parse(const lo_opt_t *opt, kstream_t *ks) if (*q == 0) break; } if (i < 9) continue; // not enough fields - e.type = lo_infer_edge_type(opt, e.l, e.s, e.e); + e.type = lo_infer_edge_type(opt, e.l, e.s, e.e, e.d); if (e.type == LO_T_C1) { g->v.a[id[0]].contained = 1; } else if (e.type == LO_T_C2) { @@ -152,7 +156,8 @@ ograph_t *lo_graph_parse(const lo_opt_t *opt, kstream_t *ks) // printf("%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n", id[0], e.l[0], e.s[0], e.e[0], id[1], e.l[1], e.s[1], e.e[1]); } free(str.s); - if (lo_verbose >= 3) fprintf(stderr, "[M::%s] read %d edges\n", __func__, kh_size(g->e)); + if (lo_verbose >= 3) + fprintf(stderr, "[M::%s] read %d edges\n", __func__, kh_size(g->e)); return g; } @@ -160,7 +165,8 @@ void lo_graph_destroy(ograph_t *g) { int i; for (i = 0; i < g->v.n; ++i) { - if (g->v.a[i].nei) kh_destroy(int, g->v.a[i].nei); + if (g->v.a[i].nei[0]) kh_destroy(int, g->v.a[i].nei[0]); + if (g->v.a[i].nei[1]) kh_destroy(int, g->v.a[i].nei[1]); free(g->v.a[i].name); } free(g->v.a); @@ -173,18 +179,77 @@ void lo_graph_destroy(ograph_t *g) * Graph routines * ******************/ +#define lo_swap(tmp, a, b) ((tmp) = (a), (a) = (b), (b) = (tmp)) + +static inline void lo_flip_edge(edgeinfo_t *e) +{ + int tmp; + lo_swap(tmp, e->l[0], e->l[1]); + lo_swap(tmp, e->s[0], e->s[1]); + lo_swap(tmp, e->e[0], e->e[1]); + e->type = (e->type|1)<<1 | (e->type|2)>>1; +} + void lo_rm_contained(ograph_t *g) { - khint_t k; + khint_t k, l; + int n_del = 0, n_add = 0, absent; + ehash_t *tmp; + tmp = kh_init(edge); for (k = 0; k != kh_end(g->e); ++k) { int id[2]; if (!kh_exist(g->e, k)) continue; id[0] = kh_key(g->e, k)>>32; id[1] = (uint32_t)kh_key(g->e, k); - if (g->v.a[id[0]].contained || g->v.a[id[1]].contained) - kh_del(edge, g->e, k); + if (g->v.a[id[0]].contained || g->v.a[id[1]].contained) { + ++n_del; + kh_del(edge, g->e, k); // kh_del() will not trigger rehash + } else { + uint64_t key2 = (uint64_t)id[1]<<32|id[0]; + l = kh_get(edge, g->e, key2); + if (l == kh_end(g->e)) { + l = kh_put(edge, tmp, key2, &absent); + kh_val(tmp, l) = kh_val(g->e, k); + lo_flip_edge(&kh_val(tmp, l)); + ++n_add; + } + } + } + for (k = 0; k != kh_end(tmp); ++k) { + if (!kh_exist(tmp, k)) continue; + l = kh_put(edge, g->e, kh_key(tmp, k), &absent); + assert(absent); + kh_val(g->e, l) = kh_val(tmp, k); + } + if (lo_verbose >= 3) + fprintf(stderr, "[M::%s] removed %d and added %d; %d edges remain\n", __func__, n_del, kh_size(tmp), kh_size(g->e)); + kh_destroy(edge, tmp); +} + +void lo_populate_nei(ograph_t *g) +{ + int i, absent; + khint_t k, l; + for (i = 0; i < g->v.n; ++i) { + if (g->v.a[i].contained) continue; + g->v.a[i].nei[0] = kh_init(int); + g->v.a[i].nei[1] = kh_init(int); + } + for (k = 0; k != kh_end(g->e); ++k) { + int id[2]; + edgeinfo_t *e; + vertex_t *v; + if (!kh_exist(g->e, k)) continue; + id[0] = kh_key(g->e, k)>>32; + id[1] = (uint32_t)kh_key(g->e, k); + e = &kh_val(g->e, k); + v = &g->v.a[id[0]]; + l = kh_put(int, v->nei[!(e->type>>1)], id[1]<<1|(e->type&1), &absent); + kh_val(v->nei[!(e->type>>1)], l) = e->d[0]; + v = &g->v.a[id[1]]; + l = kh_put(int, v->nei[e->type&1], id[0]<<1|(e->type>>1^1), &absent); + kh_val(v->nei[e->type&1], l) = e->d[1]; } - if (lo_verbose >= 3) fprintf(stderr, "[M::%s] %d edges remain after removing contained vertices\n", __func__, kh_size(g->e)); } /***************** @@ -210,6 +275,7 @@ int main_layout(int argc, char *argv[]) ks = ks_init(fp); g = lo_graph_parse(&opt, ks); lo_rm_contained(g); + lo_populate_nei(g); lo_graph_destroy(g); ks_destroy(ks); gzclose(fp);