From b0f39a1a6175ac1c8e8f4a851427f29576273196 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 5 Aug 2018 20:57:05 -0400 Subject: [PATCH] r823: mappy to index a single sequence --- index.c | 6 ++++++ main.c | 2 +- python/cmappy.h | 12 ++++++++++++ python/cmappy.pxd | 1 + python/mappy.pyx | 26 +++++++++++++++++--------- 5 files changed, 37 insertions(+), 10 deletions(-) diff --git a/index.c b/index.c index 655a567..05420d8 100644 --- a/index.c +++ b/index.c @@ -372,7 +372,9 @@ mm_idx_t *mm_idx_str(int w, int k, int is_hpc, int bucket_bits, int n, const cha uint64_t sum_len = 0; mm128_v a = {0,0,0}; mm_idx_t *mi; + khash_t(str) *h; int i, flag = 0; + if (n <= 0) return 0; for (i = 0; i < n; ++i) // get the total length sum_len += strlen(seq[i]); @@ -383,13 +385,17 @@ mm_idx_t *mm_idx_str(int w, int k, int is_hpc, int bucket_bits, int n, const cha mi->n_seq = n; mi->seq = (mm_idx_seq_t*)kcalloc(mi->km, n, sizeof(mm_idx_seq_t)); // ->seq is allocated from km mi->S = (uint32_t*)calloc((sum_len + 7) / 8, 4); + mi->h = h = kh_init(str); for (i = 0, sum_len = 0; i < n; ++i) { const char *s = seq[i]; mm_idx_seq_t *p = &mi->seq[i]; uint32_t j; if (name && name[i]) { + int absent; p->name = (char*)kmalloc(mi->km, strlen(name[i]) + 1); strcpy(p->name, name[i]); + kh_put(str, h, p->name, &absent); + assert(absent); } p->offset = sum_len; p->len = strlen(s); diff --git a/main.c b/main.c index 079155c..c327fa7 100644 --- a/main.c +++ b/main.c @@ -10,7 +10,7 @@ #include "getopt.h" #endif -#define MM_VERSION "2.11-r822-dirty" +#define MM_VERSION "2.11-r823-dirty" #ifdef __linux__ #include diff --git a/python/cmappy.h b/python/cmappy.h index 690c9c2..6bc5635 100644 --- a/python/cmappy.h +++ b/python/cmappy.h @@ -137,4 +137,16 @@ static char *mappy_fetch_seq(const mm_idx_t *mi, const char *name, int st, int e return s; } +static mm_idx_t *mappy_idx_seq(int w, int k, int is_hpc, int bucket_bits, const char *seq, int len) +{ + const char *fake_name = "N/A"; + char *s; + mm_idx_t *mi; + s = (char*)calloc(len + 1, 1); + memcpy(s, seq, len); + mi = mm_idx_str(w, k, is_hpc, bucket_bits, 1, (const char**)&s, (const char**)&fake_name); + free(s); + return mi; +} + #endif diff --git a/python/cmappy.pxd b/python/cmappy.pxd index 871d326..a6c2a3d 100644 --- a/python/cmappy.pxd +++ b/python/cmappy.pxd @@ -110,6 +110,7 @@ cdef extern from "cmappy.h": void mm_free_reg1(mm_reg1_t *r) mm_reg1_t *mm_map_aux(const mm_idx_t *mi, const char *seq1, const char *seq2, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt) char *mappy_fetch_seq(const mm_idx_t *mi, const char *name, int st, int en, int *l) + mm_idx_t *mappy_idx_seq(int w, int k, int is_hpc, int bucket_bits, const char *seq, int l) ctypedef struct kstring_t: unsigned l, m diff --git a/python/mappy.pyx b/python/mappy.pyx index 8fc1543..b1c5067 100644 --- a/python/mappy.pyx +++ b/python/mappy.pyx @@ -112,7 +112,7 @@ cdef class Aligner: cdef cmappy.mm_idxopt_t idx_opt cdef cmappy.mm_mapopt_t map_opt - def __cinit__(self, fn_idx_in, preset=None, k=None, w=None, min_cnt=None, min_chain_score=None, min_dp_score=None, bw=None, best_n=None, n_threads=3, fn_idx_out=None, max_frag_len=None): + def __cinit__(self, fn_idx_in=None, preset=None, k=None, w=None, min_cnt=None, min_chain_score=None, min_dp_score=None, bw=None, best_n=None, n_threads=3, fn_idx_out=None, max_frag_len=None, extra_flags=None, seq=None): cmappy.mm_set_opt(NULL, &self.idx_opt, &self.map_opt) # set the default options if preset is not None: cmappy.mm_set_opt(str.encode(preset), &self.idx_opt, &self.map_opt) # apply preset @@ -126,17 +126,24 @@ cdef class Aligner: if bw is not None: self.map_opt.bw = bw if best_n is not None: self.map_opt.best_n = best_n if max_frag_len is not None: self.map_opt.max_frag_len = max_frag_len + if extra_flags is not None: self.map_opt.flag |= extra_flags cdef cmappy.mm_idx_reader_t *r; - if fn_idx_out is None: - r = cmappy.mm_idx_reader_open(str.encode(fn_idx_in), &self.idx_opt, NULL) + + if seq is None: + if fn_idx_out is None: + r = cmappy.mm_idx_reader_open(str.encode(fn_idx_in), &self.idx_opt, NULL) + else: + r = cmappy.mm_idx_reader_open(str.encode(fn_idx_in), &self.idx_opt, fn_idx_out) + if r is not NULL: + self._idx = cmappy.mm_idx_reader_read(r, n_threads) # NB: ONLY read the first part + cmappy.mm_idx_reader_close(r) + cmappy.mm_mapopt_update(&self.map_opt, self._idx) + cmappy.mm_idx_index_name(self._idx) else: - r = cmappy.mm_idx_reader_open(str.encode(fn_idx_in), &self.idx_opt, fn_idx_out) - if r is not NULL: - self._idx = cmappy.mm_idx_reader_read(r, n_threads) # NB: ONLY read the first part - cmappy.mm_idx_reader_close(r) + self._idx = cmappy.mappy_idx_seq(self.idx_opt.w, self.idx_opt.k, self.idx_opt.flag&1, self.idx_opt.bucket_bits, str.encode(seq), len(seq)) cmappy.mm_mapopt_update(&self.map_opt, self._idx) - cmappy.mm_idx_index_name(self._idx) + self.map_opt.mid_occ = 1000 # don't filter high-occ seeds def __dealloc__(self): if self._idx is not NULL: @@ -145,7 +152,7 @@ cdef class Aligner: def __bool__(self): return (self._idx != NULL) - def map(self, seq, seq2=None, buf=None, cs=False, MD=False, max_frag_len=None): + def map(self, seq, seq2=None, buf=None, cs=False, MD=False, max_frag_len=None, extra_flags=None): cdef cmappy.mm_reg1_t *regs cdef cmappy.mm_hitpy_t h cdef ThreadBuffer b @@ -157,6 +164,7 @@ cdef class Aligner: map_opt = self.map_opt if max_frag_len is not None: map_opt.max_frag_len = max_frag_len + if extra_flags is not None: map_opt.flag |= extra_flags if self._idx is NULL: return None if buf is None: b = ThreadBuffer()