exposed fasta/q reader to mappy
This commit is contained in:
parent
e9c57f6d8b
commit
c8a019fae8
|
|
@ -2,14 +2,21 @@
|
|||
Mappy: Minimap2 Python Binding
|
||||
==============================
|
||||
|
||||
`Minimap2 <https://github.com/lh3/minimap2>`_ is a fast and accurate pairwise
|
||||
aligner for genomic and transcribed nucleotide sequences. This Python extension
|
||||
provides a convenient interface to calling minimap2 in Python.
|
||||
Mappy provides a convenient interface to `minimap2
|
||||
<https://github.com/lh3/minimap2>`_, a fast and accurate C program to align
|
||||
genomic and transcribe nucleotide sequences.
|
||||
|
||||
Installation
|
||||
------------
|
||||
|
||||
The mappy module can be installed directly with:
|
||||
Mappy depends on `zlib <http://zlib.net>`_. It can be installed with `pip
|
||||
<https://en.wikipedia.org/wiki/Pip_(package_manager)>`_:
|
||||
|
||||
.. code:: shell
|
||||
|
||||
pip install --user mappy
|
||||
|
||||
or from the minimap2 github repo:
|
||||
|
||||
.. code:: shell
|
||||
|
||||
|
|
@ -17,40 +24,33 @@ The mappy module can be installed directly with:
|
|||
cd minimap2
|
||||
python setup.py install
|
||||
|
||||
or with `pip <https://en.wikipedia.org/wiki/Pip_(package_manager)>`_:
|
||||
|
||||
.. code:: shell
|
||||
|
||||
pip install --user mappy
|
||||
|
||||
Usage
|
||||
-----
|
||||
|
||||
The following Python program shows the key functionality of this module:
|
||||
The following Python program shows the key functionality of mappy:
|
||||
|
||||
.. code:: python
|
||||
|
||||
import mappy as mp
|
||||
a = mp.Aligner("test/MT-human.fa") # load or build index
|
||||
if not a: raise Exception("ERROR: failed to load/build index")
|
||||
for hit in a.map("GGTTAAATACAGACCAAGAGCCTTCAAAGCCCTCAGTAAGTTGCAATACTTAATTTCTGT"):
|
||||
print("{}\t{}\t{}\t{}".format(hit.ctg, hit.r_st, hit.r_en, hit.cigar_str))
|
||||
|
||||
It builds an index from the specified sequence file (or loads an index if a
|
||||
pre-built index is specified), aligns a sequence against it, traverses each hit
|
||||
and prints them out.
|
||||
for name, seq, qual in mp.fastx_read("test/MT-orang.fa"): # read a fasta/q sequence
|
||||
for hit in a.map(seq): # traverse alignments
|
||||
print("{}\t{}\t{}\t{}".format(hit.ctg, hit.r_st, hit.r_en, hit.cigar_str))
|
||||
|
||||
APIs
|
||||
----
|
||||
|
||||
Mappy implements two classes and one global function.
|
||||
|
||||
Class mappy.Aligner
|
||||
~~~~~~~~~~~~~~~~~~~~~~
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. code:: python
|
||||
|
||||
Aligner(fn_idx_in, preset=None, ...)
|
||||
mappy.Aligner(fn_idx_in, preset=None, ...)
|
||||
|
||||
Arguments:
|
||||
This constructor accepts the following arguments:
|
||||
|
||||
* **fn_idx_in**: index or sequence file name. Minimap2 automatically tests the
|
||||
file type. If a sequence file is provided, minimap2 builds an index. The
|
||||
|
|
@ -81,15 +81,16 @@ Arguments:
|
|||
|
||||
.. code:: python
|
||||
|
||||
map(seq)
|
||||
mappy.Aligner.map(seq)
|
||||
|
||||
This method maps :code:`seq` against the index. It *yields* a generator,
|
||||
generating a series of :code:`Alignment` objects.
|
||||
This method aligns :code:`seq` against the index. It is a generator, *yielding*
|
||||
a series of :code:`mappy.Alignment` objects.
|
||||
|
||||
Class mappy.Alignment
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
This class has the following properties:
|
||||
This class describes an alignment. An object of this class has the following
|
||||
properties:
|
||||
|
||||
* **ctg**: name of the reference sequence the query is mapped to
|
||||
|
||||
|
|
@ -118,8 +119,23 @@ This class has the following properties:
|
|||
* **cigar**: CIGAR returned as an array of shape :code:`(n_cigar,2)`. The two
|
||||
numbers give the length and the operator of each CIGAR operation.
|
||||
|
||||
An :code:`Alignment` object can be converted to a string in the following format:
|
||||
An :code:`Alignment` object can be converted to a string with :code:`str()` in
|
||||
the following format:
|
||||
|
||||
::
|
||||
|
||||
q_st q_en strand ctg ctg_len r_st r_en blen-NM blen mapq cg:Z:cigar_str
|
||||
|
||||
It is effectively the PAF format without the QueryName and QueryLength columns
|
||||
(the first two columns in PAF).
|
||||
|
||||
Function mappy.fastx_read
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. code:: python
|
||||
|
||||
mappy.fastx_read(fn)
|
||||
|
||||
This generator function opens a FASTA/FASTQ file and *yields* a
|
||||
:code:`(name,seq,qual)` tuple for each sequence entry. The input file may be
|
||||
optionally gzip'd.
|
||||
|
|
|
|||
|
|
@ -2,7 +2,11 @@
|
|||
#define CMAPPY_H
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <zlib.h>
|
||||
#include "minimap.h"
|
||||
#include "kseq.h"
|
||||
KSEQ_DECLARE(gzFile)
|
||||
|
||||
typedef struct {
|
||||
const char *ctg;
|
||||
|
|
@ -36,4 +40,19 @@ static inline void mm_free_reg1(mm_reg1_t *r)
|
|||
free(r->p);
|
||||
}
|
||||
|
||||
static inline kseq_t *mm_fastx_open(const char *fn)
|
||||
{
|
||||
gzFile fp;
|
||||
fp = fn && strcmp(fn, "-") != 0? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
|
||||
return kseq_init(fp);
|
||||
}
|
||||
|
||||
static inline void mm_fastx_close(kseq_t *ks)
|
||||
{
|
||||
gzFile fp;
|
||||
fp = ks->f->f;
|
||||
kseq_destroy(ks);
|
||||
gzclose(fp);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -75,7 +75,7 @@ cdef extern from "minimap.h":
|
|||
mm_reg1_t *mm_map(const mm_idx_t *mi, int l_seq, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *name)
|
||||
|
||||
#
|
||||
# Helper header (because it is hard to expose mm_reg1_t with Cython
|
||||
# Helper header (because it is hard to expose mm_reg1_t with Cython)
|
||||
#
|
||||
cdef extern from "cmappy.h":
|
||||
ctypedef struct mm_hitpy_t:
|
||||
|
|
@ -90,3 +90,19 @@ cdef extern from "cmappy.h":
|
|||
|
||||
void mm_reg2hitpy(const mm_idx_t *mi, mm_reg1_t *r, mm_hitpy_t *h)
|
||||
void mm_free_reg1(mm_reg1_t *r)
|
||||
|
||||
ctypedef struct kstring_t:
|
||||
unsigned l, m
|
||||
char *s
|
||||
|
||||
ctypedef struct kstream_t:
|
||||
pass
|
||||
|
||||
ctypedef struct kseq_t:
|
||||
kstring_t name, comment, seq, qual
|
||||
int last_char
|
||||
kstream_t *f
|
||||
|
||||
kseq_t *mm_fastx_open(const char *fn)
|
||||
void mm_fastx_close(kseq_t *ks)
|
||||
int kseq_read(kseq_t *seq)
|
||||
|
|
|
|||
|
|
@ -11,7 +11,7 @@ cdef class Alignment:
|
|||
cdef _ctg, _cigar # these are python objects
|
||||
|
||||
def __cinit__(self, ctg, cl, cs, ce, strand, qs, qe, mapq, cigar, is_primary, blen, NM, trans_strand):
|
||||
self._ctg, self._ctg_len, self._r_st, self._r_en = ctg.decode("ascii"), cl, cs, ce
|
||||
self._ctg, self._ctg_len, self._r_st, self._r_en = str(ctg), cl, cs, ce
|
||||
self._strand, self._q_st, self._q_en = strand, qs, qe
|
||||
self._NM, self._blen = NM, blen
|
||||
self._mapq = mapq
|
||||
|
|
@ -133,3 +133,13 @@ cdef class Aligner:
|
|||
yield Alignment(h.ctg, h.ctg_len, h.ctg_start, h.ctg_end, h.strand, h.qry_start, h.qry_end, h.mapq, cigar, h.is_primary, h.blen, h.NM, h.trans_strand)
|
||||
cmappy.mm_free_reg1(®s[i])
|
||||
free(regs)
|
||||
|
||||
def fastx_read(fn):
|
||||
cdef cmappy.kseq_t *ks
|
||||
ks = cmappy.mm_fastx_open(str.encode(fn))
|
||||
if ks is NULL: return None
|
||||
while cmappy.kseq_read(ks) >= 0:
|
||||
qual = None
|
||||
if ks.qual.l > 0: qual = str(ks.qual.s)
|
||||
yield str(ks.name.s), str(ks.seq.s), qual
|
||||
cmappy.mm_fastx_close(ks)
|
||||
|
|
|
|||
|
|
@ -3,47 +3,14 @@
|
|||
import sys, getopt
|
||||
import mappy as mp
|
||||
|
||||
def readfq(fp): # multi-line fasta/fastq parser
|
||||
last = None
|
||||
while True:
|
||||
if not last:
|
||||
for l in fp:
|
||||
if l[0] in '>@':
|
||||
last = l[:-1]
|
||||
break
|
||||
if not last: break
|
||||
name, seqs, last = last[1:].split()[0], [], None
|
||||
for l in fp:
|
||||
if l[0] in '@+>':
|
||||
last = l[:-1]
|
||||
break
|
||||
seqs.append(l[:-1])
|
||||
if not last or last[0] != '+':
|
||||
yield name, ''.join(seqs), None
|
||||
if not last: break
|
||||
else:
|
||||
seq, leng, seqs = ''.join(seqs), 0, []
|
||||
for l in fp:
|
||||
seqs.append(l[:-1])
|
||||
leng += len(l) - 1
|
||||
if leng >= len(seq):
|
||||
last = None
|
||||
yield name, seq, ''.join(seqs);
|
||||
break
|
||||
if last:
|
||||
yield name, seq, None
|
||||
break
|
||||
|
||||
def main(argv):
|
||||
opts, args = getopt.getopt(argv[1:], "")
|
||||
if len(args) < 2:
|
||||
print("Usage: minimap2.py <ref.fa>|<ref.mmi> <query.fq>")
|
||||
sys.exit(1)
|
||||
a = mp.Aligner(args[0]) # load/build index
|
||||
if not a:
|
||||
print("ERROR: failed to load/build index")
|
||||
return
|
||||
for name, seq, qual in readfq(open(args[1])): # read one sequence
|
||||
if not a: raise Exception("ERROR: failed to load/build index file '{}'".format(args[0]))
|
||||
for name, seq, qual in mp.fastx_read(args[1]): # read one sequence
|
||||
for h in a.map(seq): # traverse hits
|
||||
print('{}\t{}\t{}'.format(name, len(seq), h))
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue