From 6d4348db449752f9ffe64b46559d622c3ba1cb4a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 6 Jun 2017 14:19:50 -0400 Subject: [PATCH] dp chaining mostly works, but fails sometimes which means there are bugs that need to be fixed --- chain.c | 29 +++++++++++++++++------------ index.c | 2 -- main.c | 31 +++++++++++++++++++++++++++---- map.c | 19 +++++++++++++------ minimap.h | 2 ++ mmpriv.h | 2 +- 6 files changed, 60 insertions(+), 25 deletions(-) diff --git a/chain.c b/chain.c index eb7546a..102907e 100644 --- a/chain.c +++ b/chain.c @@ -19,11 +19,13 @@ static inline int ilog2_32(uint32_t v) return (t = v>>8) ? 8 + LogTable256[t] : LogTable256[v]; } -int mm_chain_dp(int match_len, int max_dist, int bw, int max_skip, int min_sc, int n, mm128_t *a, uint64_t **_u, int32_t **_v, void *km) +int mm_chain_dp(int match_len, int max_dist, int bw, int max_skip, int min_sc, int n, mm128_t *a, uint64_t **_u, void *km) { - int32_t st = 0, i, j, k, *p, *f, *t, *idx, *v, n_u, n_v; + int32_t st = 0, i, j, k, *p, *f, *t, *v, n_u, n_v; uint64_t *u; + mm128_t *b; + if (_u) *_u = 0; f = (int32_t*)kmalloc(km, n * 4); p = (int32_t*)kmalloc(km, n * 4); t = (int32_t*)kmalloc(km, n * 4); @@ -54,12 +56,6 @@ int mm_chain_dp(int match_len, int max_dist, int bw, int max_skip, int min_sc, i else f[i] = match_len, p[i] = -1; } - // free the a[] array - for (i = 0; i < n; ++i) t[i] = a[i].y>>32; - kfree(km, a); - idx = (int32_t*)kmalloc(km, n * 4); - for (i = 0; i < n; ++i) idx[i] = t[i]; - // find the ending positions of chains memset(t, 0, n * 4); for (i = 0; i < n; ++i) @@ -67,6 +63,7 @@ int mm_chain_dp(int match_len, int max_dist, int bw, int max_skip, int min_sc, i for (i = n_u = 0; i < n; ++i) if (t[i] == 0 && f[i] >= min_sc) ++n_u; + if (n_u == 0) return 0; u = (uint64_t*)kmalloc(km, n_u * 8); for (i = n_u = 0; i < n; ++i) if (t[i] == 0 && f[i] >= min_sc) @@ -80,22 +77,30 @@ int mm_chain_dp(int match_len, int max_dist, int bw, int max_skip, int min_sc, i int32_t n_v0 = n_v; j = (int32_t)u[i]; do { - v[n_v++] = idx[j]; + v[n_v++] = j; t[j] = 1; j = p[j]; } while (j >= 0 && t[j] == 0); if (j < 0) u[k++] = u[i]>>32<<32 | (n_v - n_v0); else if ((u[i]>>32) - f[j] >= min_sc) u[k++] = ((u[i]>>32) - f[j]) << 32 | (n_v - n_v0); else n_v0 = n_v; - for (j = 0; j < (n_v - n_v0) >> 1; ++j) { + for (j = 0; j < (n_v - n_v0) >> 1; ++j) { // reverse v[] such that the smallest index comes the first int32_t t = v[n_v0 + j]; v[n_v0 + j] = v[n_v - 1 - j]; v[n_v - 1 - j] = t; } } - n_u = k, *_u = u, *_v = v; + n_u = k, *_u = u; // free - kfree(km, f); kfree(km, p); kfree(km, t); kfree(km, idx); + kfree(km, f); kfree(km, p); kfree(km, t); + + // write the result to _a_ + b = kmalloc(km, n_v * sizeof(mm128_t)); + for (i = 0, k = 0; i < n_u; ++i) + for (j = 0; j < (int32_t)u[i]; ++j) + b[k] = a[v[k]], ++k; + memcpy(a, b, n_v * sizeof(mm128_t)); + kfree(km, b); return n_u; } diff --git a/index.c b/index.c index 202e38b..ab9d63d 100644 --- a/index.c +++ b/index.c @@ -299,8 +299,6 @@ mm_idx_t *mm_idx_build(const char *fn, int w, int k, int is_hpc, int n_threads) * index I/O * *************/ -#define MM_IDX_MAGIC "MMI\2" - void mm_idx_dump(FILE *fp, const mm_idx_t *mi) { uint64_t sum_len = 0; diff --git a/main.c b/main.c index 7fdca1a..662eb6a 100644 --- a/main.c +++ b/main.c @@ -4,6 +4,7 @@ #include #include #include +#include #include "bseq.h" #include "minimap.h" #include "mmpriv.h" @@ -20,10 +21,29 @@ void liftrlimit() #endif } +static int test_idx(const char *fn) +{ + int fd, is_idx = 0; + off_t ret; + char magic[4]; + + if (strcmp(fn, "-") == 0) return 0; // read from pipe; not an index + fd = open(fn, O_RDONLY); + if (fd < 0) return -1; // error + if ((ret = lseek(fd, 0, SEEK_END)) >= 4) { + lseek(fd, 0, SEEK_SET); + read(fd, magic, 4); + if (strncmp(magic, MM_IDX_MAGIC, 4) == 0) + is_idx = 1; + } + close(fd); + return is_idx; +} + int main(int argc, char *argv[]) { mm_mapopt_t opt; - int i, c, k = 17, w = -1, b = MM_IDX_DEF_B, n_threads = 3, keep_name = 1, is_idx = 0, is_hpc = 0; + int i, c, k = 17, w = -1, b = MM_IDX_DEF_B, n_threads = 3, keep_name = 1, is_idx, is_hpc = 0; int mini_batch_size = 100000000; uint64_t batch_size = 4000000000ULL; bseq_file_t *fp = 0; @@ -34,12 +54,11 @@ int main(int argc, char *argv[]) mm_realtime0 = realtime(); mm_mapopt_init(&opt); - while ((c = getopt(argc, argv, "w:k:B:b:t:r:f:Vv:NOg:I:d:lST:s:Dx:H")) >= 0) { + while ((c = getopt(argc, argv, "w:k:B:b:t:r:f:Vv:NOg:I:d:ST:s:Dx:H")) >= 0) { if (c == 'w') w = atoi(optarg); else if (c == 'k') k = atoi(optarg); else if (c == 'b') b = atoi(optarg); else if (c == 'H') is_hpc = 1; - else if (c == 'l') is_idx = 1; else if (c == 'd') fnw = optarg; // the above are indexing related options, except -I else if (c == 'r') opt.bw = atoi(optarg); else if (c == 'f') opt.mid_occ_frac = atof(optarg); @@ -83,7 +102,6 @@ int main(int argc, char *argv[]) fprintf(stderr, " -w INT minizer window size [{-k}*2/3]\n"); fprintf(stderr, " -I NUM split index for every ~NUM input bases [4G]\n"); fprintf(stderr, " -d FILE dump index to FILE []\n"); - fprintf(stderr, " -l the 1st argument is a index file (overriding -k, -w and -I)\n"); // fprintf(stderr, " -b INT bucket bits [%d]\n", b); // most users wouldn't care about this fprintf(stderr, " Mapping:\n"); fprintf(stderr, " -f FLOAT filter out top FLOAT fraction of repetitive minimizers [%.3f]\n", opt.mid_occ_frac); @@ -106,6 +124,11 @@ int main(int argc, char *argv[]) return 1; } + is_idx = test_idx(argv[optind]); + if (is_idx < 0) { + fprintf(stderr, "[E::%s] failed to open file '%s'\n", __func__, argv[optind]); + return 1; + } if (is_idx) fpr = fopen(argv[optind], "rb"); else fp = bseq_open(argv[optind]); if (fnw) fpw = fopen(fnw, "wb"); diff --git a/map.c b/map.c index a94d362..8f826ac 100644 --- a/map.c +++ b/map.c @@ -155,8 +155,7 @@ void mm_pair_thin(mm_tbuf_t *b, int radius, mm_match_t *m1, mm_match_t *m2) void mm_map_frag(const mm_mapopt_t *opt, const mm_idx_t *mi, mm_tbuf_t *b, uint32_t m_st, uint32_t m_en, const char *qname, int qlen) { - int i, n = m_en - m_st, last = -1, last2 = -1, j, n_a, n_u; - int32_t *v; + int i, n = m_en - m_st, j, n_a, n_u; uint64_t *u; mm_match_t *m; mm128_t *a; @@ -178,6 +177,7 @@ void mm_map_frag(const mm_mapopt_t *opt, const mm_idx_t *mi, mm_tbuf_t *b, uint3 } if (mm_verbose >= 5) printf("\n"); #if 0 + int last = -1, last2 = -1; // pair k-mer thinning for (i = 0; i < n; ++i) { if (m[i].n >= opt->mid_occ && m[i].n < opt->max_occ) { @@ -214,10 +214,10 @@ void mm_map_frag(const mm_mapopt_t *opt, const mm_idx_t *mi, mm_tbuf_t *b, uint3 p = &a[j++]; if ((r[k]&1) == (q->qpos&1)) { // forward strand p->x = (r[k]&0xffffffff00000000ULL) | (uint32_t)r[k]>>1; - p->y = (uint64_t)i << 32 | q->qpos >> 1; + p->y = (uint64_t)q_span << 32 | q->qpos >> 1; } else { // reverse strand p->x = 1ULL<<63 | (r[k]&0xffffffff00000000ULL) | (uint32_t)r[k]>>1; - p->y = (uint64_t)i << 32 | (qlen - ((q->qpos>>1) + 1 - q_span) - 1); + p->y = (uint64_t)q_span << 32 | (qlen - ((q->qpos>>1) + 1 - q_span) - 1); } } } @@ -227,11 +227,18 @@ void mm_map_frag(const mm_mapopt_t *opt, const mm_idx_t *mi, mm_tbuf_t *b, uint3 kfree(b->km_fixed, m); //for (i = 0; i < n_a; ++i) printf("%c\t%s\t%d\t%d\n", "+-"[a[i].x>>63], mi->seq[a[i].x<<1>>33].name, (uint32_t)a[i].x>>1, (uint32_t)a[i].y); - n_u = mm_chain_dp(mi->k, opt->max_gap, opt->bw, opt->max_skip, opt->min_score, n_a, a, &u, &v, b->km_fixed); + n_u = mm_chain_dp(mi->k, opt->max_gap, opt->bw, opt->max_skip, opt->min_score, n_a, a, &u, b->km_fixed); + printf("%s\t%d", qname, n_u); + for (i = j = 0; i < n_u; ++i) { + int n = (uint32_t)u[i]; + printf("\t%d@%s:%d-%d", (uint32_t)(u[i]>>32), mi->seq[a[j].x<<1>>33].name, (uint32_t)a[j].x, (uint32_t)a[j+n-1].x); + j += n; + } + printf("\n"); // free + kfree(b->km_fixed, a); kfree(b->km_fixed, u); - kfree(b->km_fixed, v); } const mm_reg1_t *mm_map(const mm_idx_t *mi, int l_seq, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname) diff --git a/minimap.h b/minimap.h index 25925f1..fb9d54d 100644 --- a/minimap.h +++ b/minimap.h @@ -12,6 +12,8 @@ #define MM_F_NO_ISO 0x4 #define MM_F_AVA 0x8 +#define MM_IDX_MAGIC "MMI\2" + typedef struct { uint64_t x, y; } mm128_t; diff --git a/mmpriv.h b/mmpriv.h index bb6bbe9..13d64aa 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -14,7 +14,7 @@ void radix_sort_128x(mm128_t *beg, mm128_t *end); void radix_sort_64(uint64_t *beg, uint64_t *end); uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk); -int mm_chain_dp(int match_len, int max_dist, int bw, int max_skip, int min_sc, int n, mm128_t *a, uint64_t **_u, int32_t **_v, void *km); +int mm_chain_dp(int match_len, int max_dist, int bw, int max_skip, int min_sc, int n, mm128_t *a, uint64_t **_u, void *km); #ifdef __cplusplus }