From e420b17496a8b9ef16c98cf55e13cf16a6e32936 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 18 Dec 2017 22:29:46 -0500 Subject: [PATCH] r629: API to construct index from strings --- index.c | 45 ++++++++++++++++++++++++++++++++++++++++++++- main.c | 2 +- minimap.h | 15 +++++++++++++++ 3 files changed, 60 insertions(+), 2 deletions(-) diff --git a/index.c b/index.c index 4d9a333..e388ad7 100644 --- a/index.c +++ b/index.c @@ -329,7 +329,7 @@ mm_idx_t *mm_idx_gen(mm_bseq_file_t *fp, int w, int k, int b, int flag, int mini return pl.mi; } -mm_idx_t *mm_idx_build(const char *fn, int w, int k, int flag, int n_threads) // a simpler interface +mm_idx_t *mm_idx_build(const char *fn, int w, int k, int flag, int n_threads) // a simpler interface; deprecated { mm_bseq_file_t *fp; mm_idx_t *mi; @@ -340,6 +340,49 @@ mm_idx_t *mm_idx_build(const char *fn, int w, int k, int flag, int n_threads) // return mi; } +mm_idx_t *mm_idx_str(int w, int k, int is_hpc, int bucket_bits, int n, const char **seq, const char **name) +{ + uint64_t sum_len = 0; + mm128_v a = {0,0,0}; + mm_idx_t *mi; + int i, flag = 0; + if (n <= 0) return 0; + for (i = 0; i < n; ++i) // get the total length + sum_len += strlen(seq[i]); + if (is_hpc) flag |= MM_I_HPC; + if (name == 0) flag |= MM_I_NO_NAME; + if (bucket_bits < 0) bucket_bits = 14; + mi = mm_idx_init(w, k, bucket_bits, flag); + mi->n_seq = n; + mi->seq = (mm_idx_seq_t*)kcalloc(mi->km, n, sizeof(mm_idx_seq_t)); // ->seq is allocated from km + mi->S = (uint32_t*)calloc((sum_len + 7) / 8, 4); + for (i = 0, sum_len = 0; i < n; ++i) { + const char *s = seq[i]; + mm_idx_seq_t *p = &mi->seq[i]; + uint32_t j; + if (name && name[i]) { + p->name = (char*)kmalloc(mi->km, strlen(name[i]) + 1); + strcpy(p->name, name[i]); + } + p->offset = sum_len; + p->len = strlen(s); + for (j = 0; j < p->len; ++j) { + int c = seq_nt4_table[(uint8_t)s[j]]; + uint64_t o = sum_len + j; + mm_seq4_set(mi->S, o, c); + } + sum_len += p->len; + if (p->len > 0) { + a.n = 0; + mm_sketch(0, s, p->len, w, k, i, is_hpc, &a); + mm_idx_add(mi, a.n, a.a); + } + } + free(a.a); + mm_idx_post(mi, 1); + return mi; +} + /************* * index I/O * *************/ diff --git a/main.c b/main.c index da8ac7e..1281a3f 100644 --- a/main.c +++ b/main.c @@ -6,7 +6,7 @@ #include "mmpriv.h" #include "getopt.h" -#define MM_VERSION "2.6-r626-dirty" +#define MM_VERSION "2.6-r629-dirty" #ifdef __linux__ #include diff --git a/minimap.h b/minimap.h index 32e1e1b..1cff5ed 100644 --- a/minimap.h +++ b/minimap.h @@ -208,6 +208,21 @@ void mm_idx_reader_close(mm_idx_reader_t *r); int mm_idx_reader_eof(const mm_idx_reader_t *r); +/** + * Create an index from strings in memory + * + * @param w minimizer window size + * @param k minimizer k-mer size + * @param is_hpc use HPC k-mer if true + * @param bucket_bits number of bits for the first level of the hash table + * @param n number of sequences + * @param seq sequences in A/C/G/T + * @param name sequence names; could be NULL + * + * @return minimap2 index + */ +mm_idx_t *mm_idx_str(int w, int k, int is_hpc, int bucket_bits, int n, const char **seq, const char **name); + /** * Print index statistics to stderr *