From 446bde214db0588cddbeb6f3ab4f845e0fc4e91e Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 16 Sep 2017 08:44:47 -0400 Subject: [PATCH] first python version --- python/cminimap2.h | 40 ++++++++++++++++ python/cminimap2.pxd | 87 ++++++++++++++++++++++++++++++++++ python/minimap2.pyx | 110 +++++++++++++++++++++++++++++++++++++++++++ setup.py | 21 +++++++++ 4 files changed, 258 insertions(+) create mode 100644 python/cminimap2.h create mode 100644 python/cminimap2.pxd create mode 100644 python/minimap2.pyx create mode 100644 setup.py diff --git a/python/cminimap2.h b/python/cminimap2.h new file mode 100644 index 0000000..b7e90b5 --- /dev/null +++ b/python/cminimap2.h @@ -0,0 +1,40 @@ +#ifndef CMINIMAP2_H +#define CMINIMAP2_H + +#include +#include "minimap.h" + +typedef struct { + const char *ctg; + int32_t ctg_start, ctg_end; + int32_t qry_start, qry_end; + int32_t n_cigar32; + uint8_t strand, mapq, is_primary; + uint32_t *cigar32; +} mm_hitpy_t; + +static inline mm_hitpy_t *mm_reg2hitpy(const mm_idx_t *mi, int n_regs, mm_reg1_t *regs) +{ + mm_hitpy_t *h; + int i; + h = (mm_hitpy_t*)calloc(n_regs, sizeof(mm_hitpy_t)); + for (i = 0; i < n_regs; ++i) { + mm_reg1_t *r = ®s[i]; + h[i].ctg = mi->seq[r->rid].name; + h[i].ctg_start = r->rs, h[i].ctg_end = r->re; + h[i].qry_start = r->qs, h[i].qry_end = r->qe; + h[i].strand = r->rev; + h[i].mapq = r->mapq; + h[i].is_primary = (r->id == r->parent); + h[i].n_cigar32 = r->p->n_cigar; + h[i].cigar32 = r->p->cigar; + } + return h; +} + +static inline void mm_free_reg1(mm_reg1_t *r) +{ + free(r->p); +} + +#endif diff --git a/python/cminimap2.pxd b/python/cminimap2.pxd new file mode 100644 index 0000000..703e63f --- /dev/null +++ b/python/cminimap2.pxd @@ -0,0 +1,87 @@ +from libc.stdint cimport uint8_t, int32_t, int64_t, uint32_t, uint64_t + +cdef extern from "minimap.h": + # + # Options + # + ctypedef struct mm_idxopt_t: + short k, w, is_hpc, bucket_bits + int mini_batch_size + uint64_t batch_size + + ctypedef struct mm_mapopt_t: + int sdust_thres + int flag + int bw + int max_gap, max_gap_ref + int max_chain_skip + int min_cnt + int min_chain_score + float mask_level + float pri_ratio + int best_n + int max_join_long, max_join_short + int min_join_flank_sc + int a, b, q, e, q2, e2 + int noncan + int zdrop + int min_dp_max + int min_ksw_len + float mid_occ_frac + int32_t mid_occ + int mini_batch_size + + int mm_set_opt(char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) + + # + # Indexing + # + ctypedef struct mm_idx_seq_t: + char *name + uint64_t offset + uint32_t len + + ctypedef struct mm_idx_bucket_t: + pass + + ctypedef struct mm_idx_t: + int32_t b, w, k, is_hpc + uint32_t n_seq + mm_idx_seq_t *seq + uint32_t *S + mm_idx_bucket_t *B + void *km + + ctypedef struct mm_idx_reader_t: + pass + + mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt, const char *fn_out) + mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads) + void mm_idx_reader_close(mm_idx_reader_t *r) + void mm_idx_destroy(mm_idx_t *mi) + void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi) + + # + # Mapping (key struct defined in cminimap2.h below) + # + ctypedef struct mm_reg1_t: + pass + + ctypedef struct mm_tbuf_t: + pass + + mm_tbuf_t *mm_tbuf_init() + void mm_tbuf_destroy(mm_tbuf_t *b) + mm_reg1_t *mm_map(const mm_idx_t *mi, int l_seq, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *name) + +cdef extern from "cminimap2.h": + ctypedef struct mm_hitpy_t: + const char *ctg + int32_t ctg_start, ctg_end + int32_t qry_start, qry_end + int32_t n_cigar32 + uint8_t strand, mapq, is_primary + uint32_t *cigar32 + + mm_hitpy_t *mm_reg2hitpy(const mm_idx_t *mi, int n_regs, mm_reg1_t *regs) + void mm_free_reg1(mm_reg1_t *r) diff --git a/python/minimap2.pyx b/python/minimap2.pyx new file mode 100644 index 0000000..0137742 --- /dev/null +++ b/python/minimap2.pyx @@ -0,0 +1,110 @@ +from libc.stdint cimport uint8_t, int8_t +from libc.stdlib cimport free +cimport cminimap2 + +cdef class Alignment: + cdef const char *_ctg + cdef int _r_st, _r_en + cdef int _q_st, _q_en + cdef int8_t _is_rev, _is_primary + cdef uint8_t _mapq + cdef _cigar + + def __cinit__(self, ctg, cs, ce, strand, qs, qe, mapq, cigar, is_primary): + self._ctg, self._r_st, self._r_en = ctg, cs, ce + self._is_rev, self._q_st, self._q_en = strand, qs, qe + self._mapq = mapq + self._cigar = cigar + self._is_primary = is_primary + + @property + def ctg(self): + return self._ctg + + @property + def r_st(self): + return self._r_st + + @property + def r_en(self): + return self._r_en + + @property + def is_rev(self): + return self._is_rev + + @property + def is_primary(self): + return self._is_primary + + @property + def q_st(self): + return self._q_st + + @property + def q_en(self): + return self._q_en + + @property + def mapq(self): + return self._mapq + + @property + def cigar(self): + return self._cigar + +cdef class ThreadBuffer: + cdef cminimap2.mm_tbuf_t *_b + + def __cinit__(self): + self._b = cminimap2.mm_tbuf_init() + + def __dealloc__(self): + cminimap2.mm_tbuf_destroy(self._b) + +cdef class Aligner: + cdef cminimap2.mm_idx_t *_idx + cdef public cminimap2.mm_idxopt_t idx_opt + cdef public cminimap2.mm_mapopt_t map_opt + + def __cinit__(self, fn, preset=None): + self.config(preset) + cdef cminimap2.mm_idx_reader_t *r; + r = cminimap2.mm_idx_reader_open(fn, &self.idx_opt, NULL) + self._idx = cminimap2.mm_idx_reader_read(r, 3) # NB: ONLY read the first part + cminimap2.mm_idx_reader_close(r) + cminimap2.mm_mapopt_update(&self.map_opt, self._idx) + + def __dealloc__(self): + if self._idx is not NULL: + cminimap2.mm_idx_destroy(self._idx) + + def config(self, preset=None): + cminimap2.mm_set_opt(NULL, &self.idx_opt, &self.map_opt) + if preset is not None: + cminimap2.mm_set_opt(preset, &self.idx_opt, &self.map_opt) + self.map_opt.flag |= 4 # always perform alignment + + def map(self, seq, buf=None): + cdef cminimap2.mm_reg1_t *regs + cdef ThreadBuffer b + cdef int n_regs + + if buf is None: b = ThreadBuffer() + else: b = buf + regs = cminimap2.mm_map(self._idx, len(seq), seq, &n_regs, b._b, &self.map_opt, NULL) + hits = cminimap2.mm_reg2hitpy(self._idx, n_regs, regs) + + arr = [] + for i in range(n_regs): + h = hits[i] + cigar = [] + for k in range(h.n_cigar32): + c = h.cigar32[k] + cigar.append([c>>4, c&0xf]) + arr.append(Alignment(h.ctg, h.ctg_start, h.ctg_end, h.strand, h.qry_start, h.qry_end, h.mapq, cigar, h.is_primary)) + cminimap2.mm_free_reg1(®s[i]) + free(regs) + free(hits) + + return arr diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..a5afbd9 --- /dev/null +++ b/setup.py @@ -0,0 +1,21 @@ +from setuptools import setup, Extension +from Cython.Build import cythonize +import sys + +sys.path.append('python') + +setup( + name = 'minimap2', + version = '2.1.1', + url = 'https://github.com/lh3/minimap2', + description = 'Minimap2 python binding', + author = 'Heng Li', + author_email = 'lh3@me.com', + keywords = ['bioinformatics', 'sequence-alignment'], + ext_modules = cythonize([Extension('minimap2', + ['python/minimap2.pyx', 'align.c', 'bseq.c', 'chain.c', 'format.c', 'hit.c', 'index.c', 'kalloc.c', + 'ksw2_extd2_sse.c', 'ksw2_exts2_sse.c', 'ksw2_extz2_sse.c', 'ksw2_ll_sse.c', 'kthread.c', 'map.c', + 'misc.c', 'sdust.c', 'sketch.c'], + extra_compile_args = ['-msse4'], + include_dirs = ['.'])]) +)