2017-09-16 20:44:47 +08:00
|
|
|
from libc.stdint cimport uint8_t, int8_t
|
|
|
|
|
from libc.stdlib cimport free
|
2017-09-17 12:05:30 +08:00
|
|
|
cimport cmappy
|
2018-02-21 00:05:54 +08:00
|
|
|
import sys
|
2017-09-16 20:44:47 +08:00
|
|
|
|
2018-11-06 13:03:16 +08:00
|
|
|
__version__ = '2.14'
|
2018-06-20 03:40:26 +08:00
|
|
|
|
2017-09-18 03:21:36 +08:00
|
|
|
cmappy.mm_reset_timer()
|
|
|
|
|
|
2017-09-16 20:44:47 +08:00
|
|
|
cdef class Alignment:
|
2017-09-16 23:14:01 +08:00
|
|
|
cdef int _ctg_len, _r_st, _r_en
|
2017-09-16 20:44:47 +08:00
|
|
|
cdef int _q_st, _q_en
|
2017-10-16 23:15:07 +08:00
|
|
|
cdef int _NM, _mlen, _blen
|
2017-09-16 23:14:01 +08:00
|
|
|
cdef int8_t _strand, _trans_strand
|
|
|
|
|
cdef uint8_t _mapq, _is_primary
|
2018-02-01 00:33:08 +08:00
|
|
|
cdef int _seg_id
|
2018-07-25 11:29:55 +08:00
|
|
|
cdef _ctg, _cigar, _cs, _MD # these are python objects
|
2017-09-16 23:14:01 +08:00
|
|
|
|
2018-07-25 11:29:55 +08:00
|
|
|
def __cinit__(self, ctg, cl, cs, ce, strand, qs, qe, mapq, cigar, is_primary, mlen, blen, NM, trans_strand, seg_id, cs_str, MD_str):
|
2017-11-09 21:57:15 +08:00
|
|
|
self._ctg = ctg if isinstance(ctg, str) else ctg.decode()
|
|
|
|
|
self._ctg_len, self._r_st, self._r_en = cl, cs, ce
|
2017-09-16 23:14:01 +08:00
|
|
|
self._strand, self._q_st, self._q_en = strand, qs, qe
|
2017-10-16 23:15:07 +08:00
|
|
|
self._NM, self._mlen, self._blen = NM, mlen, blen
|
2017-09-16 20:44:47 +08:00
|
|
|
self._mapq = mapq
|
|
|
|
|
self._cigar = cigar
|
|
|
|
|
self._is_primary = is_primary
|
2017-09-16 23:14:01 +08:00
|
|
|
self._trans_strand = trans_strand
|
2018-02-01 00:33:08 +08:00
|
|
|
self._seg_id = seg_id
|
2018-07-25 11:29:55 +08:00
|
|
|
self._cs = cs_str
|
|
|
|
|
self._MD = MD_str
|
2017-09-16 23:14:01 +08:00
|
|
|
|
|
|
|
|
@property
|
|
|
|
|
def ctg(self): return self._ctg
|
|
|
|
|
|
|
|
|
|
@property
|
|
|
|
|
def ctg_len(self): return self._ctg_len
|
|
|
|
|
|
|
|
|
|
@property
|
|
|
|
|
def r_st(self): return self._r_st
|
2017-09-16 20:44:47 +08:00
|
|
|
|
|
|
|
|
@property
|
2017-09-16 23:14:01 +08:00
|
|
|
def r_en(self): return self._r_en
|
2017-09-16 20:44:47 +08:00
|
|
|
|
|
|
|
|
@property
|
2017-11-11 09:18:44 +08:00
|
|
|
def strand(self): return self._strand
|
2017-09-16 20:44:47 +08:00
|
|
|
|
|
|
|
|
@property
|
2017-09-17 07:55:33 +08:00
|
|
|
def trans_strand(self): return self._trans_strand
|
2017-09-16 20:44:47 +08:00
|
|
|
|
2017-10-16 23:15:07 +08:00
|
|
|
@property
|
|
|
|
|
def blen(self): return self._blen
|
|
|
|
|
|
|
|
|
|
@property
|
|
|
|
|
def mlen(self): return self._mlen
|
|
|
|
|
|
2017-09-16 20:44:47 +08:00
|
|
|
@property
|
2017-09-16 23:14:01 +08:00
|
|
|
def NM(self): return self._NM
|
2017-09-16 20:44:47 +08:00
|
|
|
|
|
|
|
|
@property
|
2017-09-16 23:14:01 +08:00
|
|
|
def is_primary(self): return (self._is_primary != 0)
|
2017-09-16 20:44:47 +08:00
|
|
|
|
|
|
|
|
@property
|
2017-09-16 23:14:01 +08:00
|
|
|
def q_st(self): return self._q_st
|
2017-09-16 20:44:47 +08:00
|
|
|
|
|
|
|
|
@property
|
2017-09-16 23:14:01 +08:00
|
|
|
def q_en(self): return self._q_en
|
2017-09-16 20:44:47 +08:00
|
|
|
|
|
|
|
|
@property
|
2017-09-16 23:14:01 +08:00
|
|
|
def mapq(self): return self._mapq
|
2017-09-16 20:44:47 +08:00
|
|
|
|
|
|
|
|
@property
|
2017-09-16 23:14:01 +08:00
|
|
|
def cigar(self): return self._cigar
|
|
|
|
|
|
2018-02-01 00:33:08 +08:00
|
|
|
@property
|
|
|
|
|
def read_num(self): return self._seg_id + 1
|
|
|
|
|
|
2018-07-25 11:29:55 +08:00
|
|
|
@property
|
|
|
|
|
def cs(self): return self._cs
|
|
|
|
|
|
|
|
|
|
@property
|
|
|
|
|
def MD(self): return self._MD
|
|
|
|
|
|
2017-09-16 23:14:01 +08:00
|
|
|
@property
|
|
|
|
|
def cigar_str(self):
|
|
|
|
|
return "".join(map(lambda x: str(x[0]) + 'MIDNSH'[x[1]], self._cigar))
|
|
|
|
|
|
|
|
|
|
def __str__(self):
|
|
|
|
|
if self._strand > 0: strand = '+'
|
|
|
|
|
elif self._strand < 0: strand = '-'
|
|
|
|
|
else: strand = '?'
|
|
|
|
|
if self._is_primary != 0: tp = 'tp:A:P'
|
|
|
|
|
else: tp = 'tp:A:S'
|
2017-09-18 05:06:39 +08:00
|
|
|
if self._trans_strand > 0: ts = 'ts:A:+'
|
|
|
|
|
elif self._trans_strand < 0: ts = 'ts:A:-'
|
|
|
|
|
else: ts = 'ts:A:.'
|
2018-07-25 11:29:55 +08:00
|
|
|
a = [str(self._q_st), str(self._q_en), strand, self._ctg, str(self._ctg_len), str(self._r_st), str(self._r_en),
|
|
|
|
|
str(self._mlen), str(self._blen), str(self._mapq), tp, ts, "cg:Z:" + self.cigar_str]
|
|
|
|
|
if self._cs != "": a.append("cs:Z:" + self._cs)
|
|
|
|
|
return "\t".join(a)
|
2017-09-16 20:44:47 +08:00
|
|
|
|
|
|
|
|
cdef class ThreadBuffer:
|
2017-09-17 12:05:30 +08:00
|
|
|
cdef cmappy.mm_tbuf_t *_b
|
2017-09-16 20:44:47 +08:00
|
|
|
|
|
|
|
|
def __cinit__(self):
|
2017-09-17 12:05:30 +08:00
|
|
|
self._b = cmappy.mm_tbuf_init()
|
2017-09-16 20:44:47 +08:00
|
|
|
|
|
|
|
|
def __dealloc__(self):
|
2017-09-17 12:05:30 +08:00
|
|
|
cmappy.mm_tbuf_destroy(self._b)
|
2017-09-16 20:44:47 +08:00
|
|
|
|
|
|
|
|
cdef class Aligner:
|
2017-09-17 12:05:30 +08:00
|
|
|
cdef cmappy.mm_idx_t *_idx
|
|
|
|
|
cdef cmappy.mm_idxopt_t idx_opt
|
|
|
|
|
cdef cmappy.mm_mapopt_t map_opt
|
2017-09-16 20:44:47 +08:00
|
|
|
|
2018-08-06 21:52:13 +08:00
|
|
|
def __cinit__(self, fn_idx_in=None, preset=None, k=None, w=None, min_cnt=None, min_chain_score=None, min_dp_score=None, bw=None, best_n=None, n_threads=3, fn_idx_out=None, max_frag_len=None, extra_flags=None, seq=None, scoring=None):
|
2017-09-17 12:05:30 +08:00
|
|
|
cmappy.mm_set_opt(NULL, &self.idx_opt, &self.map_opt) # set the default options
|
2017-09-17 06:11:43 +08:00
|
|
|
if preset is not None:
|
2017-09-17 12:05:30 +08:00
|
|
|
cmappy.mm_set_opt(str.encode(preset), &self.idx_opt, &self.map_opt) # apply preset
|
2017-09-17 06:11:43 +08:00
|
|
|
self.map_opt.flag |= 4 # always perform alignment
|
2017-09-16 23:14:01 +08:00
|
|
|
self.idx_opt.batch_size = 0x7fffffffffffffffL # always build a uni-part index
|
|
|
|
|
if k is not None: self.idx_opt.k = k
|
|
|
|
|
if w is not None: self.idx_opt.w = w
|
|
|
|
|
if min_cnt is not None: self.map_opt.min_cnt = min_cnt
|
|
|
|
|
if min_chain_score is not None: self.map_opt.min_chain_score = min_chain_score
|
|
|
|
|
if min_dp_score is not None: self.map_opt.min_dp_max = min_dp_score
|
|
|
|
|
if bw is not None: self.map_opt.bw = bw
|
2018-02-17 10:10:57 +08:00
|
|
|
if best_n is not None: self.map_opt.best_n = best_n
|
2018-06-04 21:42:14 +08:00
|
|
|
if max_frag_len is not None: self.map_opt.max_frag_len = max_frag_len
|
2018-08-06 08:57:05 +08:00
|
|
|
if extra_flags is not None: self.map_opt.flag |= extra_flags
|
2018-08-06 21:52:13 +08:00
|
|
|
if scoring is not None and len(scoring) >= 4:
|
|
|
|
|
self.map_opt.a, self.map_opt.b = scoring[0], scoring[1]
|
|
|
|
|
self.map_opt.q, self.map_opt.e = scoring[2], scoring[3]
|
|
|
|
|
self.map_opt.q2, self.map_opt.e2 = self.map_opt.q, self.map_opt.e
|
|
|
|
|
if len(scoring) >= 6:
|
|
|
|
|
self.map_opt.q2, self.map_opt.e2 = scoring[4], scoring[5]
|
|
|
|
|
if len(scoring) >= 7:
|
|
|
|
|
self.map_opt.sc_ambi = scoring[6]
|
2017-09-16 23:14:01 +08:00
|
|
|
|
2017-09-17 12:05:30 +08:00
|
|
|
cdef cmappy.mm_idx_reader_t *r;
|
2018-08-06 08:57:05 +08:00
|
|
|
|
|
|
|
|
if seq is None:
|
|
|
|
|
if fn_idx_out is None:
|
|
|
|
|
r = cmappy.mm_idx_reader_open(str.encode(fn_idx_in), &self.idx_opt, NULL)
|
|
|
|
|
else:
|
|
|
|
|
r = cmappy.mm_idx_reader_open(str.encode(fn_idx_in), &self.idx_opt, fn_idx_out)
|
|
|
|
|
if r is not NULL:
|
|
|
|
|
self._idx = cmappy.mm_idx_reader_read(r, n_threads) # NB: ONLY read the first part
|
|
|
|
|
cmappy.mm_idx_reader_close(r)
|
|
|
|
|
cmappy.mm_mapopt_update(&self.map_opt, self._idx)
|
|
|
|
|
cmappy.mm_idx_index_name(self._idx)
|
2017-09-16 23:14:01 +08:00
|
|
|
else:
|
2018-08-06 08:57:05 +08:00
|
|
|
self._idx = cmappy.mappy_idx_seq(self.idx_opt.w, self.idx_opt.k, self.idx_opt.flag&1, self.idx_opt.bucket_bits, str.encode(seq), len(seq))
|
2017-09-17 12:05:30 +08:00
|
|
|
cmappy.mm_mapopt_update(&self.map_opt, self._idx)
|
2018-08-06 08:57:05 +08:00
|
|
|
self.map_opt.mid_occ = 1000 # don't filter high-occ seeds
|
2017-09-17 10:43:52 +08:00
|
|
|
|
2017-09-16 20:44:47 +08:00
|
|
|
def __dealloc__(self):
|
|
|
|
|
if self._idx is not NULL:
|
2017-09-17 12:05:30 +08:00
|
|
|
cmappy.mm_idx_destroy(self._idx)
|
2017-09-16 20:44:47 +08:00
|
|
|
|
2017-09-17 08:09:17 +08:00
|
|
|
def __bool__(self):
|
|
|
|
|
return (self._idx != NULL)
|
|
|
|
|
|
2018-08-06 08:57:05 +08:00
|
|
|
def map(self, seq, seq2=None, buf=None, cs=False, MD=False, max_frag_len=None, extra_flags=None):
|
2017-09-17 12:05:30 +08:00
|
|
|
cdef cmappy.mm_reg1_t *regs
|
|
|
|
|
cdef cmappy.mm_hitpy_t h
|
2017-09-16 20:44:47 +08:00
|
|
|
cdef ThreadBuffer b
|
|
|
|
|
cdef int n_regs
|
2018-07-25 11:29:55 +08:00
|
|
|
cdef char *cs_str = NULL
|
|
|
|
|
cdef int l_cs_str, m_cs_str = 0
|
|
|
|
|
cdef void *km
|
2018-06-04 21:48:54 +08:00
|
|
|
cdef cmappy.mm_mapopt_t map_opt
|
2018-07-25 11:29:55 +08:00
|
|
|
|
2018-06-04 21:48:54 +08:00
|
|
|
map_opt = self.map_opt
|
|
|
|
|
if max_frag_len is not None: map_opt.max_frag_len = max_frag_len
|
2018-08-06 08:57:05 +08:00
|
|
|
if extra_flags is not None: map_opt.flag |= extra_flags
|
2017-09-16 20:44:47 +08:00
|
|
|
|
2017-09-16 23:14:01 +08:00
|
|
|
if self._idx is NULL: return None
|
2017-09-16 20:44:47 +08:00
|
|
|
if buf is None: b = ThreadBuffer()
|
|
|
|
|
else: b = buf
|
2018-07-25 11:29:55 +08:00
|
|
|
km = cmappy.mm_tbuf_get_km(b._b)
|
2018-02-23 23:18:26 +08:00
|
|
|
|
|
|
|
|
_seq = seq if isinstance(seq, bytes) else seq.encode()
|
|
|
|
|
if seq2 is None:
|
2018-06-04 21:48:54 +08:00
|
|
|
regs = cmappy.mm_map_aux(self._idx, _seq, NULL, &n_regs, b._b, &map_opt)
|
2018-02-23 23:18:26 +08:00
|
|
|
else:
|
|
|
|
|
_seq2 = seq2 if isinstance(seq2, bytes) else seq2.encode()
|
2018-06-04 21:48:54 +08:00
|
|
|
regs = cmappy.mm_map_aux(self._idx, _seq, _seq2, &n_regs, b._b, &map_opt)
|
2017-09-16 20:44:47 +08:00
|
|
|
|
|
|
|
|
for i in range(n_regs):
|
2017-09-17 12:05:30 +08:00
|
|
|
cmappy.mm_reg2hitpy(self._idx, ®s[i], &h)
|
2018-07-25 11:29:55 +08:00
|
|
|
cigar, _cs, _MD = [], '', ''
|
|
|
|
|
for k in range(h.n_cigar32): # convert the 32-bit CIGAR encoding to Python array
|
2017-09-16 20:44:47 +08:00
|
|
|
c = h.cigar32[k]
|
|
|
|
|
cigar.append([c>>4, c&0xf])
|
2018-07-25 11:29:55 +08:00
|
|
|
if cs or MD: # generate the cs and/or the MD tag, if requested
|
|
|
|
|
if cs:
|
|
|
|
|
l_cs_str = cmappy.mm_gen_cs(km, &cs_str, &m_cs_str, self._idx, ®s[i], _seq, 1)
|
|
|
|
|
_cs = cs_str[:l_cs_str] if isinstance(cs_str, str) else cs_str[:l_cs_str].decode()
|
|
|
|
|
if MD:
|
|
|
|
|
l_cs_str = cmappy.mm_gen_MD(km, &cs_str, &m_cs_str, self._idx, ®s[i], _seq)
|
|
|
|
|
_MD = cs_str[:l_cs_str] if isinstance(cs_str, str) else cs_str[:l_cs_str].decode()
|
|
|
|
|
yield Alignment(h.ctg, h.ctg_len, h.ctg_start, h.ctg_end, h.strand, h.qry_start, h.qry_end, h.mapq, cigar, h.is_primary, h.mlen, h.blen, h.NM, h.trans_strand, h.seg_id, _cs, _MD)
|
2017-09-17 12:05:30 +08:00
|
|
|
cmappy.mm_free_reg1(®s[i])
|
2017-09-16 20:44:47 +08:00
|
|
|
free(regs)
|
2018-07-25 11:29:55 +08:00
|
|
|
free(cs_str)
|
2017-09-18 02:41:59 +08:00
|
|
|
|
2018-02-23 23:18:26 +08:00
|
|
|
def seq(self, str name, int start=0, int end=0x7fffffff):
|
|
|
|
|
cdef int l
|
|
|
|
|
cdef char *s = cmappy.mappy_fetch_seq(self._idx, name.encode(), start, end, &l)
|
|
|
|
|
if l == 0: return None
|
|
|
|
|
r = s[:l] if isinstance(s, str) else s[:l].decode()
|
|
|
|
|
free(s)
|
|
|
|
|
return r
|
|
|
|
|
|
|
|
|
|
@property
|
|
|
|
|
def k(self): return self._idx.k
|
|
|
|
|
|
|
|
|
|
@property
|
|
|
|
|
def w(self): return self._idx.w
|
|
|
|
|
|
|
|
|
|
@property
|
|
|
|
|
def n_seq(self): return self._idx.n_seq
|
|
|
|
|
|
|
|
|
|
def fastx_read(fn, read_comment=False):
|
2017-09-18 02:41:59 +08:00
|
|
|
cdef cmappy.kseq_t *ks
|
|
|
|
|
ks = cmappy.mm_fastx_open(str.encode(fn))
|
|
|
|
|
if ks is NULL: return None
|
|
|
|
|
while cmappy.kseq_read(ks) >= 0:
|
2017-11-09 21:57:15 +08:00
|
|
|
if ks.qual.l > 0: qual = ks.qual.s if isinstance(ks.qual.s, str) else ks.qual.s.decode()
|
2017-09-18 04:02:06 +08:00
|
|
|
else: qual = None
|
2017-11-09 21:57:15 +08:00
|
|
|
name = ks.name.s if isinstance(ks.name.s, str) else ks.name.s.decode()
|
|
|
|
|
seq = ks.seq.s if isinstance(ks.seq.s, str) else ks.seq.s.decode()
|
2018-02-23 23:18:26 +08:00
|
|
|
if read_comment:
|
|
|
|
|
if ks.comment.l > 0: comment = ks.comment.s if isinstance(ks.comment.s, str) else ks.comment.s.decode()
|
|
|
|
|
else: comment = None
|
|
|
|
|
yield name, seq, qual, comment
|
|
|
|
|
else:
|
|
|
|
|
yield name, seq, qual
|
2017-09-18 02:41:59 +08:00
|
|
|
cmappy.mm_fastx_close(ks)
|
2017-09-18 03:21:36 +08:00
|
|
|
|
2018-02-20 22:41:25 +08:00
|
|
|
def revcomp(seq):
|
2018-02-21 00:05:54 +08:00
|
|
|
l = len(seq)
|
|
|
|
|
bseq = seq if isinstance(seq, bytes) else seq.encode()
|
|
|
|
|
cdef char *s = cmappy.mappy_revcomp(l, bseq)
|
|
|
|
|
r = s[:l] if isinstance(s, str) else s[:l].decode()
|
|
|
|
|
free(s)
|
|
|
|
|
return r
|
2018-02-20 22:41:25 +08:00
|
|
|
|
2017-09-18 03:21:36 +08:00
|
|
|
def verbose(v=None):
|
|
|
|
|
if v is None: v = -1
|
|
|
|
|
return cmappy.mm_verbose_level(v)
|