From f5cdd3f72f2b5d462f003e86399783d02bb4b54f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 7 Apr 2017 15:42:33 -0400 Subject: [PATCH] is_hpc is a property of the index --- Makefile | 2 +- index.c | 23 +++++---- main.c | 144 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ minimap.h | 4 +- 4 files changed, 158 insertions(+), 15 deletions(-) create mode 100644 main.c diff --git a/Makefile b/Makefile index a957929..1cb0dcc 100644 --- a/Makefile +++ b/Makefile @@ -16,7 +16,7 @@ all:$(PROG) extra:all $(PROG_EXTRA) -minimap2:libminimap2.a +minimap2:main.o libminimap2.a $(CC) $(CFLAGS) $< -o $@ -L. -lminimap2 $(LIBS) libminimap2.a:$(OBJS) diff --git a/index.c b/index.c index 52af8e4..e1e51ab 100644 --- a/index.c +++ b/index.c @@ -14,13 +14,13 @@ typedef khash_t(idx) idxhash_t; #define kroundup64(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, (x)|=(x)>>32, ++(x)) -mm_idx_t *mm_idx_init(int w, int k, int b) +mm_idx_t *mm_idx_init(int w, int k, int b, int is_hpc) { mm_idx_t *mi; if (k*2 < b) b = k * 2; if (w < 1) w = 1; mi = (mm_idx_t*)calloc(1, sizeof(mm_idx_t)); - mi->w = w, mi->k = k, mi->b = b; + mi->w = w, mi->k = k, mi->b = b, mi->is_hpc = is_hpc; mi->B = (mm_idx_bucket_t*)calloc(1<n_seq; ++i) { bseq1_t *t = &s->seq[i]; - mm_sketch(0, t->seq, t->l_seq, p->mi->w, p->mi->k, t->rid, p->is_hpc, &s->a); + mm_sketch(0, t->seq, t->l_seq, p->mi->w, p->mi->k, t->rid, p->mi->is_hpc, &s->a); free(t->seq); free(t->name); } free(s->seq); s->seq = 0; @@ -244,11 +244,10 @@ mm_idx_t *mm_idx_gen(bseq_file_t *fp, int w, int k, int b, int is_hpc, int mini_ memset(&pl, 0, sizeof(pipeline_t)); pl.mini_batch_size = mini_batch_size; pl.keep_name = keep_name; - pl.is_hpc = is_hpc; pl.batch_size = batch_size; pl.fp = fp; if (pl.fp == 0) return 0; - pl.mi = mm_idx_init(w, k, b); + pl.mi = mm_idx_init(w, k, b, is_hpc); kt_pipeline(n_threads < 3? n_threads : 3, worker_pipeline, &pl, 3); if (mm_verbose >= 3) @@ -280,11 +279,11 @@ mm_idx_t *mm_idx_build(const char *fn, int w, int k, int is_hpc, int n_threads) void mm_idx_dump(FILE *fp, const mm_idx_t *mi) { - uint32_t x[4]; + uint32_t x[5]; int i; - x[0] = mi->w, x[1] = mi->k, x[2] = mi->b, x[3] = mi->n_seq; + x[0] = mi->w, x[1] = mi->k, x[2] = mi->b, x[3] = mi->n_seq, x[4] = mi->is_hpc; fwrite(MM_IDX_MAGIC, 1, 4, fp); - fwrite(x, 4, 4, fp); + fwrite(x, 4, 5, fp); for (i = 0; i < mi->n_seq; ++i) { uint8_t l; l = strlen(mi->seq[i].name); @@ -314,12 +313,12 @@ mm_idx_t *mm_idx_load(FILE *fp) { int i; char magic[4]; - uint32_t x[4]; + uint32_t x[5]; mm_idx_t *mi; if (fread(magic, 1, 4, fp) != 4) return 0; if (strncmp(magic, MM_IDX_MAGIC, 4) != 0) return 0; - if (fread(x, 4, 4, fp) != 6) return 0; - mi = mm_idx_init(x[0], x[1], x[2]); + if (fread(x, 4, 5, fp) != 6) return 0; + mi = mm_idx_init(x[0], x[1], x[2], x[4]); mi->n_seq = x[3]; mi->seq = (mm_idx_seq_t*)calloc(mi->n_seq, sizeof(mm_idx_seq_t)); for (i = 0; i < mi->n_seq; ++i) { diff --git a/main.c b/main.c new file mode 100644 index 0000000..2b0a743 --- /dev/null +++ b/main.c @@ -0,0 +1,144 @@ +#include +#include +#include +#include +#include +#include +#include "bseq.h" +#include "minimap.h" + +#define MM_VERSION "0.2-r124-dirty" + +void liftrlimit() +{ +#ifdef __linux__ + struct rlimit r; + getrlimit(RLIMIT_AS, &r); + r.rlim_cur = r.rlim_max; + setrlimit(RLIMIT_AS, &r); +#endif +} + +int main(int argc, char *argv[]) +{ + mm_mapopt_t opt; + int i, c, k = 15, w = -1, b = MM_IDX_DEF_B, n_threads = 3, keep_name = 1, is_idx = 0, is_hpc = 0; + int mini_batch_size = 100000000; + uint64_t batch_size = 4000000000ULL; + float f = 0.001; + bseq_file_t *fp = 0; + char *fnw = 0; + FILE *fpr = 0, *fpw = 0; + + liftrlimit(); + mm_realtime0 = realtime(); + mm_mapopt_init(&opt); + + while ((c = getopt(argc, argv, "w:k:B:b:t:r:c:f:Vv:NOg:I:d:lRPST:m:L:Dx:H")) >= 0) { + if (c == 'w') w = atoi(optarg); + else if (c == 'k') k = atoi(optarg); + else if (c == 'b') b = atoi(optarg); + else if (c == 'r') opt.radius = atoi(optarg); + else if (c == 'c') opt.min_cnt = atoi(optarg); + else if (c == 'm') opt.merge_frac = atof(optarg); + else if (c == 'f') f = atof(optarg); + else if (c == 't') n_threads = atoi(optarg); + else if (c == 'v') mm_verbose = atoi(optarg); + else if (c == 'g') opt.max_gap = atoi(optarg); + else if (c == 'N') keep_name = 0; + else if (c == 'd') fnw = optarg; + else if (c == 'l') is_idx = 1; + else if (c == 'R') opt.flag |= MM_F_WITH_REP; + else if (c == 'P') opt.flag &= ~MM_F_WITH_REP; + else if (c == 'D') opt.flag |= MM_F_NO_SELF; + else if (c == 'O') opt.flag |= MM_F_NO_ISO; + else if (c == 'S') opt.flag |= MM_F_AVA | MM_F_NO_SELF; + else if (c == 'T') opt.sdust_thres = atoi(optarg); + else if (c == 'L') opt.min_match = atoi(optarg); + else if (c == 'V') { + puts(MM_VERSION); + return 0; + } else if (c == 'B' || c == 'I') { + double x; + char *p; + x = strtod(optarg, &p); + if (*p == 'G' || *p == 'g') x *= 1e9; + else if (*p == 'M' || *p == 'm') x *= 1e6; + else if (*p == 'K' || *p == 'k') x *= 1e3; + if (c == 'B') mini_batch_size = (uint64_t)(x + .499); + else batch_size = (uint64_t)(x + .499); + } else if (c == 'x') { + if (strcmp(optarg, "ava10k") == 0) { + opt.flag |= MM_F_AVA | MM_F_NO_SELF; + opt.min_match = 100; + opt.merge_frac = 0.0; + w = 5; + } + } + } + if (w < 0) w = (int)(.6666667 * k + .499); + + if (argc == optind) { + fprintf(stderr, "Usage: minimap [options] [query.fa] [...]\n"); + fprintf(stderr, "Options:\n"); + fprintf(stderr, " Indexing:\n"); + fprintf(stderr, " -k INT k-mer size [%d]\n", k); + fprintf(stderr, " -w INT minizer window size [{-k}*2/3]\n"); + fprintf(stderr, " -I NUM split index for every ~NUM input bases [4G]\n"); + fprintf(stderr, " -d FILE dump index to FILE []\n"); + fprintf(stderr, " -l the 1st argument is a index file (overriding -k, -w and -I)\n"); +// fprintf(stderr, " -b INT bucket bits [%d]\n", b); // most users would not care about this + fprintf(stderr, " Mapping:\n"); + fprintf(stderr, " -f FLOAT filter out top FLOAT fraction of repetitive minimizers [%.3f]\n", f); + fprintf(stderr, " -r INT bandwidth [%d]\n", opt.radius); + fprintf(stderr, " -m FLOAT merge two chains if FLOAT fraction of minimizers are shared [%.2f]\n", opt.merge_frac); + fprintf(stderr, " -c INT retain a mapping if it consists of >=INT minimizers [%d]\n", opt.min_cnt); + fprintf(stderr, " -L INT min matching length [%d]\n", opt.min_match); + fprintf(stderr, " -g INT split a mapping if there is a gap longer than INT [%d]\n", opt.max_gap); + fprintf(stderr, " -T INT SDUST threshold; 0 to disable SDUST [%d]\n", opt.sdust_thres); +// fprintf(stderr, " -D skip self mappings but keep dual mappings\n"); // too confusing to expose to end users + fprintf(stderr, " -S skip self and dual mappings\n"); + fprintf(stderr, " -O drop isolated hits before chaining (EXPERIMENTAL)\n"); + fprintf(stderr, " -P filtering potential repeats after mapping (EXPERIMENTAL)\n"); +// fprintf(stderr, " -R skip post-mapping repeat filtering\n"); // deprecated option for backward compatibility + fprintf(stderr, " -x STR preset (recommended to be applied before other options) []\n"); + fprintf(stderr, " ava10k: -Sw5 -L100 -m0 (PacBio/ONT all-vs-all read mapping)\n"); + fprintf(stderr, " Input/Output:\n"); + fprintf(stderr, " -t INT number of threads [%d]\n", n_threads); +// fprintf(stderr, " -B NUM process ~NUM bp in each batch [100M]\n"); +// fprintf(stderr, " -v INT verbose level [%d]\n", mm_verbose); +// fprintf(stderr, " -N use integer as target names\n"); + fprintf(stderr, " -V show version number\n"); + fprintf(stderr, "\nSee minimap.1 for detailed description of the command-line options.\n"); + return 1; + } + + if (is_idx) fpr = fopen(argv[optind], "rb"); + else fp = bseq_open(argv[optind]); + if (fnw) fpw = fopen(fnw, "wb"); + for (;;) { + mm_idx_t *mi = 0; + if (fpr) mi = mm_idx_load(fpr); + else if (!bseq_eof(fp)) + mi = mm_idx_gen(fp, w, k, b, is_hpc, mini_batch_size, n_threads, batch_size, keep_name); + if (mi == 0) break; + if (mm_verbose >= 3) + fprintf(stderr, "[M::%s::%.3f*%.2f] loaded/built the index for %d target sequence(s)\n", + __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), mi->n_seq); + mm_idx_set_max_occ(mi, f); + if (fpw) mm_idx_dump(fpw, mi); + for (i = optind + 1; i < argc; ++i) + mm_map_file(mi, argv[i], &opt, n_threads, mini_batch_size); + mm_idx_destroy(mi); + } + if (fpw) fclose(fpw); + if (fpr) fclose(fpr); + if (fp) bseq_close(fp); + + fprintf(stderr, "[M::%s] Version: %s\n", __func__, MM_VERSION); + fprintf(stderr, "[M::%s] CMD:", __func__); + for (i = 0; i < argc; ++i) + fprintf(stderr, " %s", argv[i]); + fprintf(stderr, "\n[M::%s] Real time: %.3f sec; CPU: %.3f sec\n", __func__, realtime() - mm_realtime0, cputime()); + return 0; +} diff --git a/minimap.h b/minimap.h index 5728cf2..41a561a 100644 --- a/minimap.h +++ b/minimap.h @@ -35,7 +35,7 @@ typedef struct { } mm_idx_seq_t; typedef struct { - int32_t b, w, k; + int32_t b, w, k, is_hpc; uint64_t sum_len; // sum of lengths uint32_t n_seq; // number of reference sequences mm_idx_seq_t *seq; // sequence name, length and offset @@ -79,7 +79,7 @@ extern "C" { void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p); // minimizer indexing -mm_idx_t *mm_idx_init(int w, int k, int b); +mm_idx_t *mm_idx_init(int w, int k, int b, int is_hpc); void mm_idx_destroy(mm_idx_t *mi); mm_idx_t *mm_idx_gen(struct bseq_file_s *fp, int w, int k, int b, int is_hpc, int mini_batch_size, int n_threads, uint64_t batch_size, int keep_name); void mm_idx_set_max_occ(mm_idx_t *mi, float f);