is_hpc is a property of the index
This commit is contained in:
parent
b3bc4911ba
commit
f5cdd3f72f
2
Makefile
2
Makefile
|
|
@ -16,7 +16,7 @@ all:$(PROG)
|
|||
|
||||
extra:all $(PROG_EXTRA)
|
||||
|
||||
minimap2:libminimap2.a
|
||||
minimap2:main.o libminimap2.a
|
||||
$(CC) $(CFLAGS) $< -o $@ -L. -lminimap2 $(LIBS)
|
||||
|
||||
libminimap2.a:$(OBJS)
|
||||
|
|
|
|||
23
index.c
23
index.c
|
|
@ -14,13 +14,13 @@ typedef khash_t(idx) idxhash_t;
|
|||
|
||||
#define kroundup64(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, (x)|=(x)>>32, ++(x))
|
||||
|
||||
mm_idx_t *mm_idx_init(int w, int k, int b)
|
||||
mm_idx_t *mm_idx_init(int w, int k, int b, int is_hpc)
|
||||
{
|
||||
mm_idx_t *mi;
|
||||
if (k*2 < b) b = k * 2;
|
||||
if (w < 1) w = 1;
|
||||
mi = (mm_idx_t*)calloc(1, sizeof(mm_idx_t));
|
||||
mi->w = w, mi->k = k, mi->b = b;
|
||||
mi->w = w, mi->k = k, mi->b = b, mi->is_hpc = is_hpc;
|
||||
mi->B = (mm_idx_bucket_t*)calloc(1<<b, sizeof(mm_idx_bucket_t));
|
||||
return mi;
|
||||
}
|
||||
|
|
@ -151,7 +151,7 @@ static void mm_idx_post(mm_idx_t *mi, int n_threads)
|
|||
#include "bseq.h"
|
||||
|
||||
typedef struct {
|
||||
int mini_batch_size, keep_name, is_hpc;
|
||||
int mini_batch_size, keep_name;
|
||||
uint64_t batch_size;
|
||||
bseq_file_t *fp;
|
||||
mm_idx_t *mi;
|
||||
|
|
@ -225,7 +225,7 @@ static void *worker_pipeline(void *shared, int step, void *in)
|
|||
step_t *s = (step_t*)in;
|
||||
for (i = 0; i < s->n_seq; ++i) {
|
||||
bseq1_t *t = &s->seq[i];
|
||||
mm_sketch(0, t->seq, t->l_seq, p->mi->w, p->mi->k, t->rid, p->is_hpc, &s->a);
|
||||
mm_sketch(0, t->seq, t->l_seq, p->mi->w, p->mi->k, t->rid, p->mi->is_hpc, &s->a);
|
||||
free(t->seq); free(t->name);
|
||||
}
|
||||
free(s->seq); s->seq = 0;
|
||||
|
|
@ -244,11 +244,10 @@ mm_idx_t *mm_idx_gen(bseq_file_t *fp, int w, int k, int b, int is_hpc, int mini_
|
|||
memset(&pl, 0, sizeof(pipeline_t));
|
||||
pl.mini_batch_size = mini_batch_size;
|
||||
pl.keep_name = keep_name;
|
||||
pl.is_hpc = is_hpc;
|
||||
pl.batch_size = batch_size;
|
||||
pl.fp = fp;
|
||||
if (pl.fp == 0) return 0;
|
||||
pl.mi = mm_idx_init(w, k, b);
|
||||
pl.mi = mm_idx_init(w, k, b, is_hpc);
|
||||
|
||||
kt_pipeline(n_threads < 3? n_threads : 3, worker_pipeline, &pl, 3);
|
||||
if (mm_verbose >= 3)
|
||||
|
|
@ -280,11 +279,11 @@ mm_idx_t *mm_idx_build(const char *fn, int w, int k, int is_hpc, int n_threads)
|
|||
|
||||
void mm_idx_dump(FILE *fp, const mm_idx_t *mi)
|
||||
{
|
||||
uint32_t x[4];
|
||||
uint32_t x[5];
|
||||
int i;
|
||||
x[0] = mi->w, x[1] = mi->k, x[2] = mi->b, x[3] = mi->n_seq;
|
||||
x[0] = mi->w, x[1] = mi->k, x[2] = mi->b, x[3] = mi->n_seq, x[4] = mi->is_hpc;
|
||||
fwrite(MM_IDX_MAGIC, 1, 4, fp);
|
||||
fwrite(x, 4, 4, fp);
|
||||
fwrite(x, 4, 5, fp);
|
||||
for (i = 0; i < mi->n_seq; ++i) {
|
||||
uint8_t l;
|
||||
l = strlen(mi->seq[i].name);
|
||||
|
|
@ -314,12 +313,12 @@ mm_idx_t *mm_idx_load(FILE *fp)
|
|||
{
|
||||
int i;
|
||||
char magic[4];
|
||||
uint32_t x[4];
|
||||
uint32_t x[5];
|
||||
mm_idx_t *mi;
|
||||
if (fread(magic, 1, 4, fp) != 4) return 0;
|
||||
if (strncmp(magic, MM_IDX_MAGIC, 4) != 0) return 0;
|
||||
if (fread(x, 4, 4, fp) != 6) return 0;
|
||||
mi = mm_idx_init(x[0], x[1], x[2]);
|
||||
if (fread(x, 4, 5, fp) != 6) return 0;
|
||||
mi = mm_idx_init(x[0], x[1], x[2], x[4]);
|
||||
mi->n_seq = x[3];
|
||||
mi->seq = (mm_idx_seq_t*)calloc(mi->n_seq, sizeof(mm_idx_seq_t));
|
||||
for (i = 0; i < mi->n_seq; ++i) {
|
||||
|
|
|
|||
|
|
@ -0,0 +1,144 @@
|
|||
#include <unistd.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <sys/resource.h>
|
||||
#include <sys/time.h>
|
||||
#include "bseq.h"
|
||||
#include "minimap.h"
|
||||
|
||||
#define MM_VERSION "0.2-r124-dirty"
|
||||
|
||||
void liftrlimit()
|
||||
{
|
||||
#ifdef __linux__
|
||||
struct rlimit r;
|
||||
getrlimit(RLIMIT_AS, &r);
|
||||
r.rlim_cur = r.rlim_max;
|
||||
setrlimit(RLIMIT_AS, &r);
|
||||
#endif
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
mm_mapopt_t opt;
|
||||
int i, c, k = 15, w = -1, b = MM_IDX_DEF_B, n_threads = 3, keep_name = 1, is_idx = 0, is_hpc = 0;
|
||||
int mini_batch_size = 100000000;
|
||||
uint64_t batch_size = 4000000000ULL;
|
||||
float f = 0.001;
|
||||
bseq_file_t *fp = 0;
|
||||
char *fnw = 0;
|
||||
FILE *fpr = 0, *fpw = 0;
|
||||
|
||||
liftrlimit();
|
||||
mm_realtime0 = realtime();
|
||||
mm_mapopt_init(&opt);
|
||||
|
||||
while ((c = getopt(argc, argv, "w:k:B:b:t:r:c:f:Vv:NOg:I:d:lRPST:m:L:Dx:H")) >= 0) {
|
||||
if (c == 'w') w = atoi(optarg);
|
||||
else if (c == 'k') k = atoi(optarg);
|
||||
else if (c == 'b') b = atoi(optarg);
|
||||
else if (c == 'r') opt.radius = atoi(optarg);
|
||||
else if (c == 'c') opt.min_cnt = atoi(optarg);
|
||||
else if (c == 'm') opt.merge_frac = atof(optarg);
|
||||
else if (c == 'f') f = atof(optarg);
|
||||
else if (c == 't') n_threads = atoi(optarg);
|
||||
else if (c == 'v') mm_verbose = atoi(optarg);
|
||||
else if (c == 'g') opt.max_gap = atoi(optarg);
|
||||
else if (c == 'N') keep_name = 0;
|
||||
else if (c == 'd') fnw = optarg;
|
||||
else if (c == 'l') is_idx = 1;
|
||||
else if (c == 'R') opt.flag |= MM_F_WITH_REP;
|
||||
else if (c == 'P') opt.flag &= ~MM_F_WITH_REP;
|
||||
else if (c == 'D') opt.flag |= MM_F_NO_SELF;
|
||||
else if (c == 'O') opt.flag |= MM_F_NO_ISO;
|
||||
else if (c == 'S') opt.flag |= MM_F_AVA | MM_F_NO_SELF;
|
||||
else if (c == 'T') opt.sdust_thres = atoi(optarg);
|
||||
else if (c == 'L') opt.min_match = atoi(optarg);
|
||||
else if (c == 'V') {
|
||||
puts(MM_VERSION);
|
||||
return 0;
|
||||
} else if (c == 'B' || c == 'I') {
|
||||
double x;
|
||||
char *p;
|
||||
x = strtod(optarg, &p);
|
||||
if (*p == 'G' || *p == 'g') x *= 1e9;
|
||||
else if (*p == 'M' || *p == 'm') x *= 1e6;
|
||||
else if (*p == 'K' || *p == 'k') x *= 1e3;
|
||||
if (c == 'B') mini_batch_size = (uint64_t)(x + .499);
|
||||
else batch_size = (uint64_t)(x + .499);
|
||||
} else if (c == 'x') {
|
||||
if (strcmp(optarg, "ava10k") == 0) {
|
||||
opt.flag |= MM_F_AVA | MM_F_NO_SELF;
|
||||
opt.min_match = 100;
|
||||
opt.merge_frac = 0.0;
|
||||
w = 5;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (w < 0) w = (int)(.6666667 * k + .499);
|
||||
|
||||
if (argc == optind) {
|
||||
fprintf(stderr, "Usage: minimap [options] <target.fa> [query.fa] [...]\n");
|
||||
fprintf(stderr, "Options:\n");
|
||||
fprintf(stderr, " Indexing:\n");
|
||||
fprintf(stderr, " -k INT k-mer size [%d]\n", k);
|
||||
fprintf(stderr, " -w INT minizer window size [{-k}*2/3]\n");
|
||||
fprintf(stderr, " -I NUM split index for every ~NUM input bases [4G]\n");
|
||||
fprintf(stderr, " -d FILE dump index to FILE []\n");
|
||||
fprintf(stderr, " -l the 1st argument is a index file (overriding -k, -w and -I)\n");
|
||||
// fprintf(stderr, " -b INT bucket bits [%d]\n", b); // most users would not care about this
|
||||
fprintf(stderr, " Mapping:\n");
|
||||
fprintf(stderr, " -f FLOAT filter out top FLOAT fraction of repetitive minimizers [%.3f]\n", f);
|
||||
fprintf(stderr, " -r INT bandwidth [%d]\n", opt.radius);
|
||||
fprintf(stderr, " -m FLOAT merge two chains if FLOAT fraction of minimizers are shared [%.2f]\n", opt.merge_frac);
|
||||
fprintf(stderr, " -c INT retain a mapping if it consists of >=INT minimizers [%d]\n", opt.min_cnt);
|
||||
fprintf(stderr, " -L INT min matching length [%d]\n", opt.min_match);
|
||||
fprintf(stderr, " -g INT split a mapping if there is a gap longer than INT [%d]\n", opt.max_gap);
|
||||
fprintf(stderr, " -T INT SDUST threshold; 0 to disable SDUST [%d]\n", opt.sdust_thres);
|
||||
// fprintf(stderr, " -D skip self mappings but keep dual mappings\n"); // too confusing to expose to end users
|
||||
fprintf(stderr, " -S skip self and dual mappings\n");
|
||||
fprintf(stderr, " -O drop isolated hits before chaining (EXPERIMENTAL)\n");
|
||||
fprintf(stderr, " -P filtering potential repeats after mapping (EXPERIMENTAL)\n");
|
||||
// fprintf(stderr, " -R skip post-mapping repeat filtering\n"); // deprecated option for backward compatibility
|
||||
fprintf(stderr, " -x STR preset (recommended to be applied before other options) []\n");
|
||||
fprintf(stderr, " ava10k: -Sw5 -L100 -m0 (PacBio/ONT all-vs-all read mapping)\n");
|
||||
fprintf(stderr, " Input/Output:\n");
|
||||
fprintf(stderr, " -t INT number of threads [%d]\n", n_threads);
|
||||
// fprintf(stderr, " -B NUM process ~NUM bp in each batch [100M]\n");
|
||||
// fprintf(stderr, " -v INT verbose level [%d]\n", mm_verbose);
|
||||
// fprintf(stderr, " -N use integer as target names\n");
|
||||
fprintf(stderr, " -V show version number\n");
|
||||
fprintf(stderr, "\nSee minimap.1 for detailed description of the command-line options.\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (is_idx) fpr = fopen(argv[optind], "rb");
|
||||
else fp = bseq_open(argv[optind]);
|
||||
if (fnw) fpw = fopen(fnw, "wb");
|
||||
for (;;) {
|
||||
mm_idx_t *mi = 0;
|
||||
if (fpr) mi = mm_idx_load(fpr);
|
||||
else if (!bseq_eof(fp))
|
||||
mi = mm_idx_gen(fp, w, k, b, is_hpc, mini_batch_size, n_threads, batch_size, keep_name);
|
||||
if (mi == 0) break;
|
||||
if (mm_verbose >= 3)
|
||||
fprintf(stderr, "[M::%s::%.3f*%.2f] loaded/built the index for %d target sequence(s)\n",
|
||||
__func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), mi->n_seq);
|
||||
mm_idx_set_max_occ(mi, f);
|
||||
if (fpw) mm_idx_dump(fpw, mi);
|
||||
for (i = optind + 1; i < argc; ++i)
|
||||
mm_map_file(mi, argv[i], &opt, n_threads, mini_batch_size);
|
||||
mm_idx_destroy(mi);
|
||||
}
|
||||
if (fpw) fclose(fpw);
|
||||
if (fpr) fclose(fpr);
|
||||
if (fp) bseq_close(fp);
|
||||
|
||||
fprintf(stderr, "[M::%s] Version: %s\n", __func__, MM_VERSION);
|
||||
fprintf(stderr, "[M::%s] CMD:", __func__);
|
||||
for (i = 0; i < argc; ++i)
|
||||
fprintf(stderr, " %s", argv[i]);
|
||||
fprintf(stderr, "\n[M::%s] Real time: %.3f sec; CPU: %.3f sec\n", __func__, realtime() - mm_realtime0, cputime());
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -35,7 +35,7 @@ typedef struct {
|
|||
} mm_idx_seq_t;
|
||||
|
||||
typedef struct {
|
||||
int32_t b, w, k;
|
||||
int32_t b, w, k, is_hpc;
|
||||
uint64_t sum_len; // sum of lengths
|
||||
uint32_t n_seq; // number of reference sequences
|
||||
mm_idx_seq_t *seq; // sequence name, length and offset
|
||||
|
|
@ -79,7 +79,7 @@ extern "C" {
|
|||
void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p);
|
||||
|
||||
// minimizer indexing
|
||||
mm_idx_t *mm_idx_init(int w, int k, int b);
|
||||
mm_idx_t *mm_idx_init(int w, int k, int b, int is_hpc);
|
||||
void mm_idx_destroy(mm_idx_t *mi);
|
||||
mm_idx_t *mm_idx_gen(struct bseq_file_s *fp, int w, int k, int b, int is_hpc, int mini_batch_size, int n_threads, uint64_t batch_size, int keep_name);
|
||||
void mm_idx_set_max_occ(mm_idx_t *mi, float f);
|
||||
|
|
|
|||
Loading…
Reference in New Issue