is_hpc is a property of the index

This commit is contained in:
Heng Li 2017-04-07 15:42:33 -04:00
parent b3bc4911ba
commit f5cdd3f72f
4 changed files with 158 additions and 15 deletions

View File

@ -16,7 +16,7 @@ all:$(PROG)
extra:all $(PROG_EXTRA)
minimap2:libminimap2.a
minimap2:main.o libminimap2.a
$(CC) $(CFLAGS) $< -o $@ -L. -lminimap2 $(LIBS)
libminimap2.a:$(OBJS)

23
index.c
View File

@ -14,13 +14,13 @@ typedef khash_t(idx) idxhash_t;
#define kroundup64(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, (x)|=(x)>>32, ++(x))
mm_idx_t *mm_idx_init(int w, int k, int b)
mm_idx_t *mm_idx_init(int w, int k, int b, int is_hpc)
{
mm_idx_t *mi;
if (k*2 < b) b = k * 2;
if (w < 1) w = 1;
mi = (mm_idx_t*)calloc(1, sizeof(mm_idx_t));
mi->w = w, mi->k = k, mi->b = b;
mi->w = w, mi->k = k, mi->b = b, mi->is_hpc = is_hpc;
mi->B = (mm_idx_bucket_t*)calloc(1<<b, sizeof(mm_idx_bucket_t));
return mi;
}
@ -151,7 +151,7 @@ static void mm_idx_post(mm_idx_t *mi, int n_threads)
#include "bseq.h"
typedef struct {
int mini_batch_size, keep_name, is_hpc;
int mini_batch_size, keep_name;
uint64_t batch_size;
bseq_file_t *fp;
mm_idx_t *mi;
@ -225,7 +225,7 @@ static void *worker_pipeline(void *shared, int step, void *in)
step_t *s = (step_t*)in;
for (i = 0; i < s->n_seq; ++i) {
bseq1_t *t = &s->seq[i];
mm_sketch(0, t->seq, t->l_seq, p->mi->w, p->mi->k, t->rid, p->is_hpc, &s->a);
mm_sketch(0, t->seq, t->l_seq, p->mi->w, p->mi->k, t->rid, p->mi->is_hpc, &s->a);
free(t->seq); free(t->name);
}
free(s->seq); s->seq = 0;
@ -244,11 +244,10 @@ mm_idx_t *mm_idx_gen(bseq_file_t *fp, int w, int k, int b, int is_hpc, int mini_
memset(&pl, 0, sizeof(pipeline_t));
pl.mini_batch_size = mini_batch_size;
pl.keep_name = keep_name;
pl.is_hpc = is_hpc;
pl.batch_size = batch_size;
pl.fp = fp;
if (pl.fp == 0) return 0;
pl.mi = mm_idx_init(w, k, b);
pl.mi = mm_idx_init(w, k, b, is_hpc);
kt_pipeline(n_threads < 3? n_threads : 3, worker_pipeline, &pl, 3);
if (mm_verbose >= 3)
@ -280,11 +279,11 @@ mm_idx_t *mm_idx_build(const char *fn, int w, int k, int is_hpc, int n_threads)
void mm_idx_dump(FILE *fp, const mm_idx_t *mi)
{
uint32_t x[4];
uint32_t x[5];
int i;
x[0] = mi->w, x[1] = mi->k, x[2] = mi->b, x[3] = mi->n_seq;
x[0] = mi->w, x[1] = mi->k, x[2] = mi->b, x[3] = mi->n_seq, x[4] = mi->is_hpc;
fwrite(MM_IDX_MAGIC, 1, 4, fp);
fwrite(x, 4, 4, fp);
fwrite(x, 4, 5, fp);
for (i = 0; i < mi->n_seq; ++i) {
uint8_t l;
l = strlen(mi->seq[i].name);
@ -314,12 +313,12 @@ mm_idx_t *mm_idx_load(FILE *fp)
{
int i;
char magic[4];
uint32_t x[4];
uint32_t x[5];
mm_idx_t *mi;
if (fread(magic, 1, 4, fp) != 4) return 0;
if (strncmp(magic, MM_IDX_MAGIC, 4) != 0) return 0;
if (fread(x, 4, 4, fp) != 6) return 0;
mi = mm_idx_init(x[0], x[1], x[2]);
if (fread(x, 4, 5, fp) != 6) return 0;
mi = mm_idx_init(x[0], x[1], x[2], x[4]);
mi->n_seq = x[3];
mi->seq = (mm_idx_seq_t*)calloc(mi->n_seq, sizeof(mm_idx_seq_t));
for (i = 0; i < mi->n_seq; ++i) {

144
main.c 100644
View File

@ -0,0 +1,144 @@
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <sys/resource.h>
#include <sys/time.h>
#include "bseq.h"
#include "minimap.h"
#define MM_VERSION "0.2-r124-dirty"
void liftrlimit()
{
#ifdef __linux__
struct rlimit r;
getrlimit(RLIMIT_AS, &r);
r.rlim_cur = r.rlim_max;
setrlimit(RLIMIT_AS, &r);
#endif
}
int main(int argc, char *argv[])
{
mm_mapopt_t opt;
int i, c, k = 15, w = -1, b = MM_IDX_DEF_B, n_threads = 3, keep_name = 1, is_idx = 0, is_hpc = 0;
int mini_batch_size = 100000000;
uint64_t batch_size = 4000000000ULL;
float f = 0.001;
bseq_file_t *fp = 0;
char *fnw = 0;
FILE *fpr = 0, *fpw = 0;
liftrlimit();
mm_realtime0 = realtime();
mm_mapopt_init(&opt);
while ((c = getopt(argc, argv, "w:k:B:b:t:r:c:f:Vv:NOg:I:d:lRPST:m:L:Dx:H")) >= 0) {
if (c == 'w') w = atoi(optarg);
else if (c == 'k') k = atoi(optarg);
else if (c == 'b') b = atoi(optarg);
else if (c == 'r') opt.radius = atoi(optarg);
else if (c == 'c') opt.min_cnt = atoi(optarg);
else if (c == 'm') opt.merge_frac = atof(optarg);
else if (c == 'f') f = atof(optarg);
else if (c == 't') n_threads = atoi(optarg);
else if (c == 'v') mm_verbose = atoi(optarg);
else if (c == 'g') opt.max_gap = atoi(optarg);
else if (c == 'N') keep_name = 0;
else if (c == 'd') fnw = optarg;
else if (c == 'l') is_idx = 1;
else if (c == 'R') opt.flag |= MM_F_WITH_REP;
else if (c == 'P') opt.flag &= ~MM_F_WITH_REP;
else if (c == 'D') opt.flag |= MM_F_NO_SELF;
else if (c == 'O') opt.flag |= MM_F_NO_ISO;
else if (c == 'S') opt.flag |= MM_F_AVA | MM_F_NO_SELF;
else if (c == 'T') opt.sdust_thres = atoi(optarg);
else if (c == 'L') opt.min_match = atoi(optarg);
else if (c == 'V') {
puts(MM_VERSION);
return 0;
} else if (c == 'B' || c == 'I') {
double x;
char *p;
x = strtod(optarg, &p);
if (*p == 'G' || *p == 'g') x *= 1e9;
else if (*p == 'M' || *p == 'm') x *= 1e6;
else if (*p == 'K' || *p == 'k') x *= 1e3;
if (c == 'B') mini_batch_size = (uint64_t)(x + .499);
else batch_size = (uint64_t)(x + .499);
} else if (c == 'x') {
if (strcmp(optarg, "ava10k") == 0) {
opt.flag |= MM_F_AVA | MM_F_NO_SELF;
opt.min_match = 100;
opt.merge_frac = 0.0;
w = 5;
}
}
}
if (w < 0) w = (int)(.6666667 * k + .499);
if (argc == optind) {
fprintf(stderr, "Usage: minimap [options] <target.fa> [query.fa] [...]\n");
fprintf(stderr, "Options:\n");
fprintf(stderr, " Indexing:\n");
fprintf(stderr, " -k INT k-mer size [%d]\n", k);
fprintf(stderr, " -w INT minizer window size [{-k}*2/3]\n");
fprintf(stderr, " -I NUM split index for every ~NUM input bases [4G]\n");
fprintf(stderr, " -d FILE dump index to FILE []\n");
fprintf(stderr, " -l the 1st argument is a index file (overriding -k, -w and -I)\n");
// fprintf(stderr, " -b INT bucket bits [%d]\n", b); // most users would not care about this
fprintf(stderr, " Mapping:\n");
fprintf(stderr, " -f FLOAT filter out top FLOAT fraction of repetitive minimizers [%.3f]\n", f);
fprintf(stderr, " -r INT bandwidth [%d]\n", opt.radius);
fprintf(stderr, " -m FLOAT merge two chains if FLOAT fraction of minimizers are shared [%.2f]\n", opt.merge_frac);
fprintf(stderr, " -c INT retain a mapping if it consists of >=INT minimizers [%d]\n", opt.min_cnt);
fprintf(stderr, " -L INT min matching length [%d]\n", opt.min_match);
fprintf(stderr, " -g INT split a mapping if there is a gap longer than INT [%d]\n", opt.max_gap);
fprintf(stderr, " -T INT SDUST threshold; 0 to disable SDUST [%d]\n", opt.sdust_thres);
// fprintf(stderr, " -D skip self mappings but keep dual mappings\n"); // too confusing to expose to end users
fprintf(stderr, " -S skip self and dual mappings\n");
fprintf(stderr, " -O drop isolated hits before chaining (EXPERIMENTAL)\n");
fprintf(stderr, " -P filtering potential repeats after mapping (EXPERIMENTAL)\n");
// fprintf(stderr, " -R skip post-mapping repeat filtering\n"); // deprecated option for backward compatibility
fprintf(stderr, " -x STR preset (recommended to be applied before other options) []\n");
fprintf(stderr, " ava10k: -Sw5 -L100 -m0 (PacBio/ONT all-vs-all read mapping)\n");
fprintf(stderr, " Input/Output:\n");
fprintf(stderr, " -t INT number of threads [%d]\n", n_threads);
// fprintf(stderr, " -B NUM process ~NUM bp in each batch [100M]\n");
// fprintf(stderr, " -v INT verbose level [%d]\n", mm_verbose);
// fprintf(stderr, " -N use integer as target names\n");
fprintf(stderr, " -V show version number\n");
fprintf(stderr, "\nSee minimap.1 for detailed description of the command-line options.\n");
return 1;
}
if (is_idx) fpr = fopen(argv[optind], "rb");
else fp = bseq_open(argv[optind]);
if (fnw) fpw = fopen(fnw, "wb");
for (;;) {
mm_idx_t *mi = 0;
if (fpr) mi = mm_idx_load(fpr);
else if (!bseq_eof(fp))
mi = mm_idx_gen(fp, w, k, b, is_hpc, mini_batch_size, n_threads, batch_size, keep_name);
if (mi == 0) break;
if (mm_verbose >= 3)
fprintf(stderr, "[M::%s::%.3f*%.2f] loaded/built the index for %d target sequence(s)\n",
__func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), mi->n_seq);
mm_idx_set_max_occ(mi, f);
if (fpw) mm_idx_dump(fpw, mi);
for (i = optind + 1; i < argc; ++i)
mm_map_file(mi, argv[i], &opt, n_threads, mini_batch_size);
mm_idx_destroy(mi);
}
if (fpw) fclose(fpw);
if (fpr) fclose(fpr);
if (fp) bseq_close(fp);
fprintf(stderr, "[M::%s] Version: %s\n", __func__, MM_VERSION);
fprintf(stderr, "[M::%s] CMD:", __func__);
for (i = 0; i < argc; ++i)
fprintf(stderr, " %s", argv[i]);
fprintf(stderr, "\n[M::%s] Real time: %.3f sec; CPU: %.3f sec\n", __func__, realtime() - mm_realtime0, cputime());
return 0;
}

View File

@ -35,7 +35,7 @@ typedef struct {
} mm_idx_seq_t;
typedef struct {
int32_t b, w, k;
int32_t b, w, k, is_hpc;
uint64_t sum_len; // sum of lengths
uint32_t n_seq; // number of reference sequences
mm_idx_seq_t *seq; // sequence name, length and offset
@ -79,7 +79,7 @@ extern "C" {
void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p);
// minimizer indexing
mm_idx_t *mm_idx_init(int w, int k, int b);
mm_idx_t *mm_idx_init(int w, int k, int b, int is_hpc);
void mm_idx_destroy(mm_idx_t *mi);
mm_idx_t *mm_idx_gen(struct bseq_file_s *fp, int w, int k, int b, int is_hpc, int mini_batch_size, int n_threads, uint64_t batch_size, int keep_name);
void mm_idx_set_max_occ(mm_idx_t *mi, float f);