minimap2/python/mappy.pyx

213 lines
6.3 KiB
Cython
Raw Normal View History

2017-09-16 20:44:47 +08:00
from libc.stdint cimport uint8_t, int8_t
from libc.stdlib cimport free
cimport cmappy
2018-02-21 00:05:54 +08:00
import sys
2017-09-16 20:44:47 +08:00
2018-06-21 12:04:08 +08:00
__version__ = '2.11'
cmappy.mm_reset_timer()
2017-09-16 20:44:47 +08:00
cdef class Alignment:
2017-09-16 23:14:01 +08:00
cdef int _ctg_len, _r_st, _r_en
2017-09-16 20:44:47 +08:00
cdef int _q_st, _q_en
2017-10-16 23:15:07 +08:00
cdef int _NM, _mlen, _blen
2017-09-16 23:14:01 +08:00
cdef int8_t _strand, _trans_strand
cdef uint8_t _mapq, _is_primary
cdef int _seg_id
2017-09-16 23:14:01 +08:00
cdef _ctg, _cigar # these are python objects
def __cinit__(self, ctg, cl, cs, ce, strand, qs, qe, mapq, cigar, is_primary, mlen, blen, NM, trans_strand, seg_id):
2017-11-09 21:57:15 +08:00
self._ctg = ctg if isinstance(ctg, str) else ctg.decode()
self._ctg_len, self._r_st, self._r_en = cl, cs, ce
2017-09-16 23:14:01 +08:00
self._strand, self._q_st, self._q_en = strand, qs, qe
2017-10-16 23:15:07 +08:00
self._NM, self._mlen, self._blen = NM, mlen, blen
2017-09-16 20:44:47 +08:00
self._mapq = mapq
self._cigar = cigar
self._is_primary = is_primary
2017-09-16 23:14:01 +08:00
self._trans_strand = trans_strand
self._seg_id = seg_id
2017-09-16 23:14:01 +08:00
@property
def ctg(self): return self._ctg
@property
def ctg_len(self): return self._ctg_len
@property
def r_st(self): return self._r_st
2017-09-16 20:44:47 +08:00
@property
2017-09-16 23:14:01 +08:00
def r_en(self): return self._r_en
2017-09-16 20:44:47 +08:00
@property
2017-11-11 09:18:44 +08:00
def strand(self): return self._strand
2017-09-16 20:44:47 +08:00
@property
2017-09-17 07:55:33 +08:00
def trans_strand(self): return self._trans_strand
2017-09-16 20:44:47 +08:00
2017-10-16 23:15:07 +08:00
@property
def blen(self): return self._blen
@property
def mlen(self): return self._mlen
2017-09-16 20:44:47 +08:00
@property
2017-09-16 23:14:01 +08:00
def NM(self): return self._NM
2017-09-16 20:44:47 +08:00
@property
2017-09-16 23:14:01 +08:00
def is_primary(self): return (self._is_primary != 0)
2017-09-16 20:44:47 +08:00
@property
2017-09-16 23:14:01 +08:00
def q_st(self): return self._q_st
2017-09-16 20:44:47 +08:00
@property
2017-09-16 23:14:01 +08:00
def q_en(self): return self._q_en
2017-09-16 20:44:47 +08:00
@property
2017-09-16 23:14:01 +08:00
def mapq(self): return self._mapq
2017-09-16 20:44:47 +08:00
@property
2017-09-16 23:14:01 +08:00
def cigar(self): return self._cigar
@property
def read_num(self): return self._seg_id + 1
2017-09-16 23:14:01 +08:00
@property
def cigar_str(self):
return "".join(map(lambda x: str(x[0]) + 'MIDNSH'[x[1]], self._cigar))
def __str__(self):
if self._strand > 0: strand = '+'
elif self._strand < 0: strand = '-'
else: strand = '?'
if self._is_primary != 0: tp = 'tp:A:P'
else: tp = 'tp:A:S'
if self._trans_strand > 0: ts = 'ts:A:+'
elif self._trans_strand < 0: ts = 'ts:A:-'
else: ts = 'ts:A:.'
2017-09-16 23:14:01 +08:00
return "\t".join([str(self._q_st), str(self._q_en), strand, self._ctg, str(self._ctg_len), str(self._r_st), str(self._r_en),
2017-10-16 23:15:07 +08:00
str(self._mlen), str(self._blen), str(self._mapq), tp, ts, "cg:Z:" + self.cigar_str])
2017-09-16 20:44:47 +08:00
cdef class ThreadBuffer:
cdef cmappy.mm_tbuf_t *_b
2017-09-16 20:44:47 +08:00
def __cinit__(self):
self._b = cmappy.mm_tbuf_init()
2017-09-16 20:44:47 +08:00
def __dealloc__(self):
cmappy.mm_tbuf_destroy(self._b)
2017-09-16 20:44:47 +08:00
cdef class Aligner:
cdef cmappy.mm_idx_t *_idx
cdef cmappy.mm_idxopt_t idx_opt
cdef cmappy.mm_mapopt_t map_opt
2017-09-16 20:44:47 +08:00
def __cinit__(self, fn_idx_in, preset=None, k=None, w=None, min_cnt=None, min_chain_score=None, min_dp_score=None, bw=None, best_n=None, n_threads=3, fn_idx_out=None, max_frag_len=None):
cmappy.mm_set_opt(NULL, &self.idx_opt, &self.map_opt) # set the default options
2017-09-17 06:11:43 +08:00
if preset is not None:
cmappy.mm_set_opt(str.encode(preset), &self.idx_opt, &self.map_opt) # apply preset
2017-09-17 06:11:43 +08:00
self.map_opt.flag |= 4 # always perform alignment
2017-09-16 23:14:01 +08:00
self.idx_opt.batch_size = 0x7fffffffffffffffL # always build a uni-part index
if k is not None: self.idx_opt.k = k
if w is not None: self.idx_opt.w = w
if min_cnt is not None: self.map_opt.min_cnt = min_cnt
if min_chain_score is not None: self.map_opt.min_chain_score = min_chain_score
if min_dp_score is not None: self.map_opt.min_dp_max = min_dp_score
if bw is not None: self.map_opt.bw = bw
2018-02-17 10:10:57 +08:00
if best_n is not None: self.map_opt.best_n = best_n
if max_frag_len is not None: self.map_opt.max_frag_len = max_frag_len
2017-09-16 23:14:01 +08:00
cdef cmappy.mm_idx_reader_t *r;
2017-09-16 23:14:01 +08:00
if fn_idx_out is None:
r = cmappy.mm_idx_reader_open(str.encode(fn_idx_in), &self.idx_opt, NULL)
2017-09-16 23:14:01 +08:00
else:
r = cmappy.mm_idx_reader_open(str.encode(fn_idx_in), &self.idx_opt, fn_idx_out)
2017-09-16 23:14:01 +08:00
if r is not NULL:
self._idx = cmappy.mm_idx_reader_read(r, n_threads) # NB: ONLY read the first part
cmappy.mm_idx_reader_close(r)
cmappy.mm_mapopt_update(&self.map_opt, self._idx)
2018-02-23 23:18:26 +08:00
cmappy.mm_idx_index_name(self._idx)
2017-09-17 10:43:52 +08:00
2017-09-16 20:44:47 +08:00
def __dealloc__(self):
if self._idx is not NULL:
cmappy.mm_idx_destroy(self._idx)
2017-09-16 20:44:47 +08:00
2017-09-17 08:09:17 +08:00
def __bool__(self):
return (self._idx != NULL)
def map(self, seq, seq2=None, buf=None, max_frag_len=None):
cdef cmappy.mm_reg1_t *regs
cdef cmappy.mm_hitpy_t h
2017-09-16 20:44:47 +08:00
cdef ThreadBuffer b
cdef int n_regs
cdef cmappy.mm_mapopt_t map_opt
map_opt = self.map_opt
if max_frag_len is not None: map_opt.max_frag_len = max_frag_len
2017-09-16 20:44:47 +08:00
2017-09-16 23:14:01 +08:00
if self._idx is NULL: return None
2017-09-16 20:44:47 +08:00
if buf is None: b = ThreadBuffer()
else: b = buf
2018-02-23 23:18:26 +08:00
_seq = seq if isinstance(seq, bytes) else seq.encode()
if seq2 is None:
regs = cmappy.mm_map_aux(self._idx, _seq, NULL, &n_regs, b._b, &map_opt)
2018-02-23 23:18:26 +08:00
else:
_seq2 = seq2 if isinstance(seq2, bytes) else seq2.encode()
regs = cmappy.mm_map_aux(self._idx, _seq, _seq2, &n_regs, b._b, &map_opt)
2017-09-16 20:44:47 +08:00
for i in range(n_regs):
cmappy.mm_reg2hitpy(self._idx, &regs[i], &h)
2017-09-16 20:44:47 +08:00
cigar = []
for k in range(h.n_cigar32):
c = h.cigar32[k]
cigar.append([c>>4, c&0xf])
yield Alignment(h.ctg, h.ctg_len, h.ctg_start, h.ctg_end, h.strand, h.qry_start, h.qry_end, h.mapq, cigar, h.is_primary, h.mlen, h.blen, h.NM, h.trans_strand, h.seg_id)
cmappy.mm_free_reg1(&regs[i])
2017-09-16 20:44:47 +08:00
free(regs)
2017-09-18 02:41:59 +08:00
2018-02-23 23:18:26 +08:00
def seq(self, str name, int start=0, int end=0x7fffffff):
cdef int l
cdef char *s = cmappy.mappy_fetch_seq(self._idx, name.encode(), start, end, &l)
if l == 0: return None
r = s[:l] if isinstance(s, str) else s[:l].decode()
free(s)
return r
@property
def k(self): return self._idx.k
@property
def w(self): return self._idx.w
@property
def n_seq(self): return self._idx.n_seq
def fastx_read(fn, read_comment=False):
2017-09-18 02:41:59 +08:00
cdef cmappy.kseq_t *ks
ks = cmappy.mm_fastx_open(str.encode(fn))
if ks is NULL: return None
while cmappy.kseq_read(ks) >= 0:
2017-11-09 21:57:15 +08:00
if ks.qual.l > 0: qual = ks.qual.s if isinstance(ks.qual.s, str) else ks.qual.s.decode()
2017-09-18 04:02:06 +08:00
else: qual = None
2017-11-09 21:57:15 +08:00
name = ks.name.s if isinstance(ks.name.s, str) else ks.name.s.decode()
seq = ks.seq.s if isinstance(ks.seq.s, str) else ks.seq.s.decode()
2018-02-23 23:18:26 +08:00
if read_comment:
if ks.comment.l > 0: comment = ks.comment.s if isinstance(ks.comment.s, str) else ks.comment.s.decode()
else: comment = None
yield name, seq, qual, comment
else:
yield name, seq, qual
2017-09-18 02:41:59 +08:00
cmappy.mm_fastx_close(ks)
2018-02-20 22:41:25 +08:00
def revcomp(seq):
2018-02-21 00:05:54 +08:00
l = len(seq)
bseq = seq if isinstance(seq, bytes) else seq.encode()
cdef char *s = cmappy.mappy_revcomp(l, bseq)
r = s[:l] if isinstance(s, str) else s[:l].decode()
free(s)
return r
2018-02-20 22:41:25 +08:00
def verbose(v=None):
if v is None: v = -1
return cmappy.mm_verbose_level(v)