From 24a480882661a8cd3114643f6c41ff5db6e4944e Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 23 Feb 2018 10:18:26 -0500 Subject: [PATCH] r718: retrieve sequence from the index --- index.c | 40 +++++++++++++++++++++++++++++++--------- main.c | 2 +- minimap.h | 7 ++++++- mmpriv.h | 1 - options.c | 9 +++++++++ python/README.rst | 17 ++++++++++++++--- python/cmappy.h | 18 ++++++++++++++++++ python/cmappy.pxd | 5 ++++- python/mappy.pyx | 36 ++++++++++++++++++++++++++++++++---- 9 files changed, 115 insertions(+), 20 deletions(-) diff --git a/index.c b/index.c index 60cdf5b..201870a 100644 --- a/index.c +++ b/index.c @@ -20,6 +20,8 @@ KHASH_INIT(idx, uint64_t, uint64_t, 1, idx_hash, idx_eq) typedef khash_t(idx) idxhash_t; +KHASH_MAP_INIT_STR(str, uint32_t) + #define kroundup64(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, (x)|=(x)>>32, ++(x)) typedef struct mm_idx_bucket_s { @@ -29,15 +31,6 @@ typedef struct mm_idx_bucket_s { void *h; // hash table indexing _p_ and minimizers appearing once } mm_idx_bucket_t; -void mm_idxopt_init(mm_idxopt_t *opt) -{ - memset(opt, 0, sizeof(mm_idxopt_t)); - opt->k = 15, opt->w = 10, opt->flag = 0; - opt->bucket_bits = 14; - opt->mini_batch_size = 50000000; - opt->batch_size = 4000000000ULL; -} - mm_idx_t *mm_idx_init(int w, int k, int b, int flag) { mm_idx_t *mi; @@ -54,6 +47,7 @@ void mm_idx_destroy(mm_idx_t *mi) { int i; if (mi == 0) return; + if (mi->h) kh_destroy(str, (khash_t(str)*)mi->h); for (i = 0; i < 1<b; ++i) { free(mi->B[i].p); free(mi->B[i].a.a); @@ -109,6 +103,34 @@ void mm_idx_stat(const mm_idx_t *mi) __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), n, 100.0*n1/n, (double)sum / n, (double)len / sum); } +int mm_idx_index_name(mm_idx_t *mi) +{ + khash_t(str) *h; + uint32_t i; + int has_dup = 0, absent; + if (mi->h) return 0; + h = kh_init(str); + for (i = 0; i < mi->n_seq; ++i) { + khint_t k; + k = kh_put(str, h, mi->seq[i].name, &absent); + if (absent) kh_val(h, k) = i; + else has_dup = 1; + } + mi->h = h; + if (has_dup && mm_verbose >= 2) + fprintf(stderr, "[WARNING] some database sequences have identical sequence names\n"); + return has_dup; +} + +int mm_idx_name2id(const mm_idx_t *mi, const char *name) +{ + khash_t(str) *h = (khash_t(str)*)mi->h; + khint_t k; + if (h == 0) return -2; + k = kh_get(str, h, name); + return k == kh_end(h)? -1 : kh_val(h, k); +} + int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq) { uint64_t i, st1, en1; diff --git a/main.c b/main.c index 4d1e45a..cbbcb0c 100644 --- a/main.c +++ b/main.c @@ -6,7 +6,7 @@ #include "mmpriv.h" #include "getopt.h" -#define MM_VERSION "2.8-r711-dirty" +#define MM_VERSION "2.8-r718-dirty" #ifdef __linux__ #include diff --git a/minimap.h b/minimap.h index f4bde58..4e3a617 100644 --- a/minimap.h +++ b/minimap.h @@ -59,7 +59,7 @@ typedef struct { mm_idx_seq_t *seq; // sequence name, length and offset uint32_t *S; // 4-bit packed sequence struct mm_idx_bucket_s *B; // index (hidden) - void *km; + void *km, *h; } mm_idx_t; // minimap2 alignment @@ -297,6 +297,11 @@ int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int int mm_map_file_frag(const mm_idx_t *idx, int n_segs, const char **fn, const mm_mapopt_t *opt, int n_threads); +// query sequence name and sequence in the minimap2 index +int mm_idx_index_name(mm_idx_t *mi); +int mm_idx_name2id(const mm_idx_t *mi, const char *name); +int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq); + // deprecated APIs for backward compatibility void mm_mapopt_init(mm_mapopt_t *opt); mm_idx_t *mm_idx_build(const char *fn, int w, int k, int flag, int n_threads); diff --git a/mmpriv.h b/mmpriv.h index 176743a..4a1ae2b 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -63,7 +63,6 @@ void mm_write_sam2(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int se void mm_idxopt_init(mm_idxopt_t *opt); const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n); -int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq); int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f); mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km); mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a); diff --git a/options.c b/options.c index 685fff2..b98e0eb 100644 --- a/options.c +++ b/options.c @@ -1,6 +1,15 @@ #include #include "mmpriv.h" +void mm_idxopt_init(mm_idxopt_t *opt) +{ + memset(opt, 0, sizeof(mm_idxopt_t)); + opt->k = 15, opt->w = 10, opt->flag = 0; + opt->bucket_bits = 14; + opt->mini_batch_size = 50000000; + opt->batch_size = 4000000000ULL; +} + void mm_mapopt_init(mm_mapopt_t *opt) { memset(opt, 0, sizeof(mm_mapopt_t)); diff --git a/python/README.rst b/python/README.rst index ae53b02..ff15bab 100644 --- a/python/README.rst +++ b/python/README.rst @@ -34,6 +34,8 @@ The following Python script demonstrates the key functionality of mappy: import mappy as mp a = mp.Aligner("test/MT-human.fa") # load or build index if not a: raise Exception("ERROR: failed to load/build index") + s = a.seq("MT_human", 100, 200) # retrieve a subsequence from the index + print(mp.revcomp(s)) # reverse complement for name, seq, qual in mp.fastx_read("test/MT-orang.fa"): # read a fasta/q sequence for hit in a.map(seq): # traverse alignments print("{}\t{}\t{}\t{}".format(hit.ctg, hit.r_st, hit.r_en, hit.cigar_str)) @@ -87,7 +89,15 @@ This method aligns :code:`seq` against the index. It is a generator, *yielding* a series of :code:`mappy.Alignment` objects. If :code:`seq2` is present, mappy performs paired-end alignment, assuming the two ends are in the FR orientation. Alignments of the two ends can be distinguished by the :code:`read_num` field -(see below). +(see Class mappy.Alignment below). + +.. code:: python + + mappy.Aligner.seq(name, start=0, end=0x7fffffff) + +This method retrieves a (sub)sequence from the index and returns it as a Python +string. :code:`None` is returned if :code:`name` is not present in the index or +the start/end coordinates are invalid. Class mappy.Alignment ~~~~~~~~~~~~~~~~~~~~~ @@ -144,11 +154,12 @@ Miscellaneous Functions .. code:: python - mappy.fastx_read(fn) + mappy.fastx_read(fn, read_comment=False) This generator function opens a FASTA/FASTQ file and *yields* a :code:`(name,seq,qual)` tuple for each sequence entry. The input file may be -optionally gzip'd. +optionally gzip'd. If :code:`read_comment` is True, this generator yields +a :code:`(name,seq,qual,comment)` tuple instead. .. code:: python diff --git a/python/cmappy.h b/python/cmappy.h index 313a9dd..ae2428e 100644 --- a/python/cmappy.h +++ b/python/cmappy.h @@ -112,4 +112,22 @@ static inline char *mappy_revcomp(int len, const uint8_t *seq) return rev; } +static char *mappy_fetch_seq(const mm_idx_t *mi, const char *name, int st, int en, int *len) +{ + int i, rid; + char *s; + *len = 0; + rid = mm_idx_name2id(mi, name); + if (rid < 0) return 0; + if (st >= mi->seq[i].len || st >= en) return 0; + if (en < 0 || en > mi->seq[i].len) + en = mi->seq[i].len; + s = (char*)malloc(en - st + 1); + *len = mm_idx_getseq(mi, rid, st, en, s); + for (i = 0; i < *len; ++i) + s[i] = "ACGTN"[(uint8_t)s[i]]; + s[*len] = 0; + return s; +} + #endif diff --git a/python/cmappy.pxd b/python/cmappy.pxd index a40adf0..532f6f5 100644 --- a/python/cmappy.pxd +++ b/python/cmappy.pxd @@ -58,7 +58,7 @@ cdef extern from "minimap.h": mm_idx_seq_t *seq uint32_t *S mm_idx_bucket_t *B - void *km + void *km, *h ctypedef struct mm_idx_reader_t: pass @@ -69,6 +69,8 @@ cdef extern from "minimap.h": void mm_idx_destroy(mm_idx_t *mi) void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi) + int mm_idx_index_name(mm_idx_t *mi) + # # Mapping (key struct defined in cmappy.h below) # @@ -99,6 +101,7 @@ cdef extern from "cmappy.h": void mm_reg2hitpy(const mm_idx_t *mi, mm_reg1_t *r, mm_hitpy_t *h) void mm_free_reg1(mm_reg1_t *r) mm_reg1_t *mm_map_aux(const mm_idx_t *mi, const char *seq1, const char *seq2, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt) + char *mappy_fetch_seq(const mm_idx_t *mi, const char *name, int st, int en, int *l) ctypedef struct kstring_t: unsigned l, m diff --git a/python/mappy.pyx b/python/mappy.pyx index aeeab5e..5ab92c4 100644 --- a/python/mappy.pyx +++ b/python/mappy.pyx @@ -123,6 +123,7 @@ cdef class Aligner: self._idx = cmappy.mm_idx_reader_read(r, n_threads) # NB: ONLY read the first part cmappy.mm_idx_reader_close(r) cmappy.mm_mapopt_update(&self.map_opt, self._idx) + cmappy.mm_idx_index_name(self._idx) def __dealloc__(self): if self._idx is not NULL: @@ -140,8 +141,13 @@ cdef class Aligner: if self._idx is NULL: return None if buf is None: b = ThreadBuffer() else: b = buf - if seq2 is None: regs = cmappy.mm_map_aux(self._idx, str.encode(seq), NULL, &n_regs, b._b, &self.map_opt) - else: regs = cmappy.mm_map_aux(self._idx, str.encode(seq), str.encode(seq2), &n_regs, b._b, &self.map_opt) + + _seq = seq if isinstance(seq, bytes) else seq.encode() + if seq2 is None: + regs = cmappy.mm_map_aux(self._idx, _seq, NULL, &n_regs, b._b, &self.map_opt) + else: + _seq2 = seq2 if isinstance(seq2, bytes) else seq2.encode() + regs = cmappy.mm_map_aux(self._idx, _seq, _seq2, &n_regs, b._b, &self.map_opt) for i in range(n_regs): cmappy.mm_reg2hitpy(self._idx, ®s[i], &h) @@ -153,7 +159,24 @@ cdef class Aligner: cmappy.mm_free_reg1(®s[i]) free(regs) -def fastx_read(fn): + def seq(self, str name, int start=0, int end=0x7fffffff): + cdef int l + cdef char *s = cmappy.mappy_fetch_seq(self._idx, name.encode(), start, end, &l) + if l == 0: return None + r = s[:l] if isinstance(s, str) else s[:l].decode() + free(s) + return r + + @property + def k(self): return self._idx.k + + @property + def w(self): return self._idx.w + + @property + def n_seq(self): return self._idx.n_seq + +def fastx_read(fn, read_comment=False): cdef cmappy.kseq_t *ks ks = cmappy.mm_fastx_open(str.encode(fn)) if ks is NULL: return None @@ -162,7 +185,12 @@ def fastx_read(fn): else: qual = None name = ks.name.s if isinstance(ks.name.s, str) else ks.name.s.decode() seq = ks.seq.s if isinstance(ks.seq.s, str) else ks.seq.s.decode() - yield name, seq, qual + if read_comment: + if ks.comment.l > 0: comment = ks.comment.s if isinstance(ks.comment.s, str) else ks.comment.s.decode() + else: comment = None + yield name, seq, qual, comment + else: + yield name, seq, qual cmappy.mm_fastx_close(ks) def revcomp(seq):