exposed fasta/q reader to mappy
This commit is contained in:
parent
e9c57f6d8b
commit
c8a019fae8
|
|
@ -2,14 +2,21 @@
|
||||||
Mappy: Minimap2 Python Binding
|
Mappy: Minimap2 Python Binding
|
||||||
==============================
|
==============================
|
||||||
|
|
||||||
`Minimap2 <https://github.com/lh3/minimap2>`_ is a fast and accurate pairwise
|
Mappy provides a convenient interface to `minimap2
|
||||||
aligner for genomic and transcribed nucleotide sequences. This Python extension
|
<https://github.com/lh3/minimap2>`_, a fast and accurate C program to align
|
||||||
provides a convenient interface to calling minimap2 in Python.
|
genomic and transcribe nucleotide sequences.
|
||||||
|
|
||||||
Installation
|
Installation
|
||||||
------------
|
------------
|
||||||
|
|
||||||
The mappy module can be installed directly with:
|
Mappy depends on `zlib <http://zlib.net>`_. It can be installed with `pip
|
||||||
|
<https://en.wikipedia.org/wiki/Pip_(package_manager)>`_:
|
||||||
|
|
||||||
|
.. code:: shell
|
||||||
|
|
||||||
|
pip install --user mappy
|
||||||
|
|
||||||
|
or from the minimap2 github repo:
|
||||||
|
|
||||||
.. code:: shell
|
.. code:: shell
|
||||||
|
|
||||||
|
|
@ -17,40 +24,33 @@ The mappy module can be installed directly with:
|
||||||
cd minimap2
|
cd minimap2
|
||||||
python setup.py install
|
python setup.py install
|
||||||
|
|
||||||
or with `pip <https://en.wikipedia.org/wiki/Pip_(package_manager)>`_:
|
|
||||||
|
|
||||||
.. code:: shell
|
|
||||||
|
|
||||||
pip install --user mappy
|
|
||||||
|
|
||||||
Usage
|
Usage
|
||||||
-----
|
-----
|
||||||
|
|
||||||
The following Python program shows the key functionality of this module:
|
The following Python program shows the key functionality of mappy:
|
||||||
|
|
||||||
.. code:: python
|
.. code:: python
|
||||||
|
|
||||||
import mappy as mp
|
import mappy as mp
|
||||||
a = mp.Aligner("test/MT-human.fa") # load or build index
|
a = mp.Aligner("test/MT-human.fa") # load or build index
|
||||||
if not a: raise Exception("ERROR: failed to load/build index")
|
if not a: raise Exception("ERROR: failed to load/build index")
|
||||||
for hit in a.map("GGTTAAATACAGACCAAGAGCCTTCAAAGCCCTCAGTAAGTTGCAATACTTAATTTCTGT"):
|
for name, seq, qual in mp.fastx_read("test/MT-orang.fa"): # read a fasta/q sequence
|
||||||
|
for hit in a.map(seq): # traverse alignments
|
||||||
print("{}\t{}\t{}\t{}".format(hit.ctg, hit.r_st, hit.r_en, hit.cigar_str))
|
print("{}\t{}\t{}\t{}".format(hit.ctg, hit.r_st, hit.r_en, hit.cigar_str))
|
||||||
|
|
||||||
It builds an index from the specified sequence file (or loads an index if a
|
|
||||||
pre-built index is specified), aligns a sequence against it, traverses each hit
|
|
||||||
and prints them out.
|
|
||||||
|
|
||||||
APIs
|
APIs
|
||||||
----
|
----
|
||||||
|
|
||||||
|
Mappy implements two classes and one global function.
|
||||||
|
|
||||||
Class mappy.Aligner
|
Class mappy.Aligner
|
||||||
~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. code:: python
|
.. code:: python
|
||||||
|
|
||||||
Aligner(fn_idx_in, preset=None, ...)
|
mappy.Aligner(fn_idx_in, preset=None, ...)
|
||||||
|
|
||||||
Arguments:
|
This constructor accepts the following arguments:
|
||||||
|
|
||||||
* **fn_idx_in**: index or sequence file name. Minimap2 automatically tests the
|
* **fn_idx_in**: index or sequence file name. Minimap2 automatically tests the
|
||||||
file type. If a sequence file is provided, minimap2 builds an index. The
|
file type. If a sequence file is provided, minimap2 builds an index. The
|
||||||
|
|
@ -81,15 +81,16 @@ Arguments:
|
||||||
|
|
||||||
.. code:: python
|
.. code:: python
|
||||||
|
|
||||||
map(seq)
|
mappy.Aligner.map(seq)
|
||||||
|
|
||||||
This method maps :code:`seq` against the index. It *yields* a generator,
|
This method aligns :code:`seq` against the index. It is a generator, *yielding*
|
||||||
generating a series of :code:`Alignment` objects.
|
a series of :code:`mappy.Alignment` objects.
|
||||||
|
|
||||||
Class mappy.Alignment
|
Class mappy.Alignment
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
This class has the following properties:
|
This class describes an alignment. An object of this class has the following
|
||||||
|
properties:
|
||||||
|
|
||||||
* **ctg**: name of the reference sequence the query is mapped to
|
* **ctg**: name of the reference sequence the query is mapped to
|
||||||
|
|
||||||
|
|
@ -118,8 +119,23 @@ This class has the following properties:
|
||||||
* **cigar**: CIGAR returned as an array of shape :code:`(n_cigar,2)`. The two
|
* **cigar**: CIGAR returned as an array of shape :code:`(n_cigar,2)`. The two
|
||||||
numbers give the length and the operator of each CIGAR operation.
|
numbers give the length and the operator of each CIGAR operation.
|
||||||
|
|
||||||
An :code:`Alignment` object can be converted to a string in the following format:
|
An :code:`Alignment` object can be converted to a string with :code:`str()` in
|
||||||
|
the following format:
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
q_st q_en strand ctg ctg_len r_st r_en blen-NM blen mapq cg:Z:cigar_str
|
q_st q_en strand ctg ctg_len r_st r_en blen-NM blen mapq cg:Z:cigar_str
|
||||||
|
|
||||||
|
It is effectively the PAF format without the QueryName and QueryLength columns
|
||||||
|
(the first two columns in PAF).
|
||||||
|
|
||||||
|
Function mappy.fastx_read
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. code:: python
|
||||||
|
|
||||||
|
mappy.fastx_read(fn)
|
||||||
|
|
||||||
|
This generator function opens a FASTA/FASTQ file and *yields* a
|
||||||
|
:code:`(name,seq,qual)` tuple for each sequence entry. The input file may be
|
||||||
|
optionally gzip'd.
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,11 @@
|
||||||
#define CMAPPY_H
|
#define CMAPPY_H
|
||||||
|
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <zlib.h>
|
||||||
#include "minimap.h"
|
#include "minimap.h"
|
||||||
|
#include "kseq.h"
|
||||||
|
KSEQ_DECLARE(gzFile)
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
const char *ctg;
|
const char *ctg;
|
||||||
|
|
@ -36,4 +40,19 @@ static inline void mm_free_reg1(mm_reg1_t *r)
|
||||||
free(r->p);
|
free(r->p);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline kseq_t *mm_fastx_open(const char *fn)
|
||||||
|
{
|
||||||
|
gzFile fp;
|
||||||
|
fp = fn && strcmp(fn, "-") != 0? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
|
||||||
|
return kseq_init(fp);
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void mm_fastx_close(kseq_t *ks)
|
||||||
|
{
|
||||||
|
gzFile fp;
|
||||||
|
fp = ks->f->f;
|
||||||
|
kseq_destroy(ks);
|
||||||
|
gzclose(fp);
|
||||||
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
||||||
|
|
@ -75,7 +75,7 @@ cdef extern from "minimap.h":
|
||||||
mm_reg1_t *mm_map(const mm_idx_t *mi, int l_seq, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *name)
|
mm_reg1_t *mm_map(const mm_idx_t *mi, int l_seq, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *name)
|
||||||
|
|
||||||
#
|
#
|
||||||
# Helper header (because it is hard to expose mm_reg1_t with Cython
|
# Helper header (because it is hard to expose mm_reg1_t with Cython)
|
||||||
#
|
#
|
||||||
cdef extern from "cmappy.h":
|
cdef extern from "cmappy.h":
|
||||||
ctypedef struct mm_hitpy_t:
|
ctypedef struct mm_hitpy_t:
|
||||||
|
|
@ -90,3 +90,19 @@ cdef extern from "cmappy.h":
|
||||||
|
|
||||||
void mm_reg2hitpy(const mm_idx_t *mi, mm_reg1_t *r, mm_hitpy_t *h)
|
void mm_reg2hitpy(const mm_idx_t *mi, mm_reg1_t *r, mm_hitpy_t *h)
|
||||||
void mm_free_reg1(mm_reg1_t *r)
|
void mm_free_reg1(mm_reg1_t *r)
|
||||||
|
|
||||||
|
ctypedef struct kstring_t:
|
||||||
|
unsigned l, m
|
||||||
|
char *s
|
||||||
|
|
||||||
|
ctypedef struct kstream_t:
|
||||||
|
pass
|
||||||
|
|
||||||
|
ctypedef struct kseq_t:
|
||||||
|
kstring_t name, comment, seq, qual
|
||||||
|
int last_char
|
||||||
|
kstream_t *f
|
||||||
|
|
||||||
|
kseq_t *mm_fastx_open(const char *fn)
|
||||||
|
void mm_fastx_close(kseq_t *ks)
|
||||||
|
int kseq_read(kseq_t *seq)
|
||||||
|
|
|
||||||
|
|
@ -11,7 +11,7 @@ cdef class Alignment:
|
||||||
cdef _ctg, _cigar # these are python objects
|
cdef _ctg, _cigar # these are python objects
|
||||||
|
|
||||||
def __cinit__(self, ctg, cl, cs, ce, strand, qs, qe, mapq, cigar, is_primary, blen, NM, trans_strand):
|
def __cinit__(self, ctg, cl, cs, ce, strand, qs, qe, mapq, cigar, is_primary, blen, NM, trans_strand):
|
||||||
self._ctg, self._ctg_len, self._r_st, self._r_en = ctg.decode("ascii"), cl, cs, ce
|
self._ctg, self._ctg_len, self._r_st, self._r_en = str(ctg), cl, cs, ce
|
||||||
self._strand, self._q_st, self._q_en = strand, qs, qe
|
self._strand, self._q_st, self._q_en = strand, qs, qe
|
||||||
self._NM, self._blen = NM, blen
|
self._NM, self._blen = NM, blen
|
||||||
self._mapq = mapq
|
self._mapq = mapq
|
||||||
|
|
@ -133,3 +133,13 @@ cdef class Aligner:
|
||||||
yield Alignment(h.ctg, h.ctg_len, h.ctg_start, h.ctg_end, h.strand, h.qry_start, h.qry_end, h.mapq, cigar, h.is_primary, h.blen, h.NM, h.trans_strand)
|
yield Alignment(h.ctg, h.ctg_len, h.ctg_start, h.ctg_end, h.strand, h.qry_start, h.qry_end, h.mapq, cigar, h.is_primary, h.blen, h.NM, h.trans_strand)
|
||||||
cmappy.mm_free_reg1(®s[i])
|
cmappy.mm_free_reg1(®s[i])
|
||||||
free(regs)
|
free(regs)
|
||||||
|
|
||||||
|
def fastx_read(fn):
|
||||||
|
cdef cmappy.kseq_t *ks
|
||||||
|
ks = cmappy.mm_fastx_open(str.encode(fn))
|
||||||
|
if ks is NULL: return None
|
||||||
|
while cmappy.kseq_read(ks) >= 0:
|
||||||
|
qual = None
|
||||||
|
if ks.qual.l > 0: qual = str(ks.qual.s)
|
||||||
|
yield str(ks.name.s), str(ks.seq.s), qual
|
||||||
|
cmappy.mm_fastx_close(ks)
|
||||||
|
|
|
||||||
|
|
@ -3,47 +3,14 @@
|
||||||
import sys, getopt
|
import sys, getopt
|
||||||
import mappy as mp
|
import mappy as mp
|
||||||
|
|
||||||
def readfq(fp): # multi-line fasta/fastq parser
|
|
||||||
last = None
|
|
||||||
while True:
|
|
||||||
if not last:
|
|
||||||
for l in fp:
|
|
||||||
if l[0] in '>@':
|
|
||||||
last = l[:-1]
|
|
||||||
break
|
|
||||||
if not last: break
|
|
||||||
name, seqs, last = last[1:].split()[0], [], None
|
|
||||||
for l in fp:
|
|
||||||
if l[0] in '@+>':
|
|
||||||
last = l[:-1]
|
|
||||||
break
|
|
||||||
seqs.append(l[:-1])
|
|
||||||
if not last or last[0] != '+':
|
|
||||||
yield name, ''.join(seqs), None
|
|
||||||
if not last: break
|
|
||||||
else:
|
|
||||||
seq, leng, seqs = ''.join(seqs), 0, []
|
|
||||||
for l in fp:
|
|
||||||
seqs.append(l[:-1])
|
|
||||||
leng += len(l) - 1
|
|
||||||
if leng >= len(seq):
|
|
||||||
last = None
|
|
||||||
yield name, seq, ''.join(seqs);
|
|
||||||
break
|
|
||||||
if last:
|
|
||||||
yield name, seq, None
|
|
||||||
break
|
|
||||||
|
|
||||||
def main(argv):
|
def main(argv):
|
||||||
opts, args = getopt.getopt(argv[1:], "")
|
opts, args = getopt.getopt(argv[1:], "")
|
||||||
if len(args) < 2:
|
if len(args) < 2:
|
||||||
print("Usage: minimap2.py <ref.fa>|<ref.mmi> <query.fq>")
|
print("Usage: minimap2.py <ref.fa>|<ref.mmi> <query.fq>")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
a = mp.Aligner(args[0]) # load/build index
|
a = mp.Aligner(args[0]) # load/build index
|
||||||
if not a:
|
if not a: raise Exception("ERROR: failed to load/build index file '{}'".format(args[0]))
|
||||||
print("ERROR: failed to load/build index")
|
for name, seq, qual in mp.fastx_read(args[1]): # read one sequence
|
||||||
return
|
|
||||||
for name, seq, qual in readfq(open(args[1])): # read one sequence
|
|
||||||
for h in a.map(seq): # traverse hits
|
for h in a.map(seq): # traverse hits
|
||||||
print('{}\t{}\t{}'.format(name, len(seq), h))
|
print('{}\t{}\t{}'.format(name, len(seq), h))
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue