From 56723ad5800c275f5c16e7d52e861c2a9e996cca Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 19 Apr 2017 11:06:24 -0400 Subject: [PATCH] moved `sum_len` out of the index as it can be inferred. --- index.c | 35 +++++++++++++++++++++++------------ main.c | 6 +++++- minimap.h | 1 - 3 files changed, 28 insertions(+), 14 deletions(-) diff --git a/index.c b/index.c index 2308dc2..7e55c4e 100644 --- a/index.c +++ b/index.c @@ -153,7 +153,7 @@ static void mm_idx_post(mm_idx_t *mi, int n_threads) typedef struct { int mini_batch_size, keep_name; - uint64_t batch_size; + uint64_t batch_size, sum_len; bseq_file_t *fp; mm_idx_t *mi; } pipeline_t; @@ -179,7 +179,7 @@ static void *worker_pipeline(void *shared, int step, void *in) pipeline_t *p = (pipeline_t*)shared; if (step == 0) { // step 0: read sequences step_t *s; - if (p->mi->sum_len > p->batch_size) return 0; + if (p->sum_len > p->batch_size) return 0; s = (step_t*)calloc(1, sizeof(step_t)); s->seq = bseq_read(p->fp, p->mini_batch_size, &s->n_seq); // read a mini-batch if (s->seq) { @@ -193,8 +193,8 @@ static void *worker_pipeline(void *shared, int step, void *in) p->mi->seq = (mm_idx_seq_t*)realloc(p->mi->seq, m * sizeof(mm_idx_seq_t)); // make room for p->mi->S for (i = 0, sum_len = 0; i < s->n_seq; ++i) sum_len += s->seq[i].l_seq; - old_max_len = (p->mi->sum_len + 7) / 8; - max_len = (p->mi->sum_len + sum_len + 7) / 8; + old_max_len = (p->sum_len + 7) / 8; + max_len = (p->sum_len + sum_len + 7) / 8; kroundup64(old_max_len); kroundup64(max_len); if (old_max_len != max_len) { p->mi->S = (uint32_t*)realloc(p->mi->S, max_len * 4); @@ -209,15 +209,15 @@ static void *worker_pipeline(void *shared, int step, void *in) seq->name = strdup(s->seq[i].name); } else seq->name = 0; seq->len = s->seq[i].l_seq; - seq->offset = p->mi->sum_len; + seq->offset = p->sum_len; // copy the sequence for (j = 0; j < seq->len; ++j) { // TODO: this is not the fastest way, but let's first see if speed matters here - uint64_t o = p->mi->sum_len + j; + uint64_t o = p->sum_len + j; int c = seq_nt4_table[(uint8_t)s->seq[i].seq[j]]; mm_seq4_set(p->mi->S, o, c); } - // update p->mi->{sum_len,n_seq} - p->mi->sum_len += seq->len; + // update p->sum_len and p->mi->n_seq + p->sum_len += seq->len; s->seq[i].rid = p->mi->n_seq++; } return s; @@ -280,8 +280,10 @@ mm_idx_t *mm_idx_build(const char *fn, int w, int k, int is_hpc, int n_threads) void mm_idx_dump(FILE *fp, const mm_idx_t *mi) { + uint64_t sum_len = 0; uint32_t x[5]; int i; + x[0] = mi->w, x[1] = mi->k, x[2] = mi->b, x[3] = mi->n_seq, x[4] = mi->is_hpc; fwrite(MM_IDX_MAGIC, 1, 4, fp); fwrite(x, 4, 5, fp); @@ -291,6 +293,7 @@ void mm_idx_dump(FILE *fp, const mm_idx_t *mi) fwrite(&l, 1, 1, fp); fwrite(mi->seq[i].name, 1, l, fp); fwrite(&mi->seq[i].len, 4, 1, fp); + sum_len += mi->seq[i].len; } for (i = 0; i < 1<b; ++i) { mm_idx_bucket_t *b = &mi->B[i]; @@ -308,6 +311,8 @@ void mm_idx_dump(FILE *fp, const mm_idx_t *mi) fwrite(x, 8, 2, fp); } } + fwrite(mi->S, 4, (sum_len + 7) / 8, fp); + fflush(fp); } mm_idx_t *mm_idx_load(FILE *fp) @@ -315,7 +320,9 @@ mm_idx_t *mm_idx_load(FILE *fp) int i; char magic[4]; uint32_t x[5]; + uint64_t sum_len = 0; mm_idx_t *mi; + if (fread(magic, 1, 4, fp) != 4) return 0; if (strncmp(magic, MM_IDX_MAGIC, 4) != 0) return 0; if (fread(x, 4, 5, fp) != 6) return 0; @@ -324,11 +331,13 @@ mm_idx_t *mm_idx_load(FILE *fp) mi->seq = (mm_idx_seq_t*)calloc(mi->n_seq, sizeof(mm_idx_seq_t)); for (i = 0; i < mi->n_seq; ++i) { uint8_t l; + mm_idx_seq_t *s = &mi->seq[i]; fread(&l, 1, 1, fp); - mi->seq[i].name = (char*)malloc(l + 1); - fread(mi->seq[i].name, 1, l, fp); - mi->seq[i].name[l] = 0; - fread(&mi->seq[i].len, 4, 1, fp); + s->name = (char*)malloc(l + 1); + fread(s->name, 1, l, fp); + s->name[l] = 0; + fread(&s->len, 4, 1, fp); + sum_len += s->len; } for (i = 0; i < 1<b; ++i) { mm_idx_bucket_t *b = &mi->B[i]; @@ -351,5 +360,7 @@ mm_idx_t *mm_idx_load(FILE *fp) kh_val(h, k) = x[1]; } } + mi->S = (uint32_t*)malloc((sum_len + 7) / 8 * 4); + fread(mi->S, 4, (sum_len + 7) / 8, fp); return mi; } diff --git a/main.c b/main.c index 437ebd2..ec1f94c 100644 --- a/main.c +++ b/main.c @@ -127,7 +127,11 @@ int main(int argc, char *argv[]) if (mm_verbose >= 3) fprintf(stderr, "[M::%s::%.3f*%.2f] loaded/built the index for %d target sequence(s)\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), mi->n_seq); - if (fpw) mm_idx_dump(fpw, mi); + if (fpw) { + mm_idx_dump(fpw, mi); + if (mm_verbose >= 2) + fprintf(stderr, "[M::%s::%.3f*%.2f] dumpped the (partial) index to disk\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0)); + } /* for (i = optind + 1; i < argc; ++i) mm_map_file(mi, argv[i], &opt, n_threads, mini_batch_size); diff --git a/minimap.h b/minimap.h index 41a561a..cd6a57c 100644 --- a/minimap.h +++ b/minimap.h @@ -36,7 +36,6 @@ typedef struct { typedef struct { int32_t b, w, k, is_hpc; - uint64_t sum_len; // sum of lengths uint32_t n_seq; // number of reference sequences mm_idx_seq_t *seq; // sequence name, length and offset uint32_t *S; // 4-bit packed sequence