exposed fasta/q reader to mappy

This commit is contained in:
Heng Li 2017-09-17 14:41:59 -04:00
parent e9c57f6d8b
commit c8a019fae8
5 changed files with 91 additions and 63 deletions

View File

@ -2,14 +2,21 @@
Mappy: Minimap2 Python Binding Mappy: Minimap2 Python Binding
============================== ==============================
`Minimap2 <https://github.com/lh3/minimap2>`_ is a fast and accurate pairwise Mappy provides a convenient interface to `minimap2
aligner for genomic and transcribed nucleotide sequences. This Python extension <https://github.com/lh3/minimap2>`_, a fast and accurate C program to align
provides a convenient interface to calling minimap2 in Python. genomic and transcribe nucleotide sequences.
Installation Installation
------------ ------------
The mappy module can be installed directly with: Mappy depends on `zlib <http://zlib.net>`_. It can be installed with `pip
<https://en.wikipedia.org/wiki/Pip_(package_manager)>`_:
.. code:: shell
pip install --user mappy
or from the minimap2 github repo:
.. code:: shell .. code:: shell
@ -17,40 +24,33 @@ The mappy module can be installed directly with:
cd minimap2 cd minimap2
python setup.py install python setup.py install
or with `pip <https://en.wikipedia.org/wiki/Pip_(package_manager)>`_:
.. code:: shell
pip install --user mappy
Usage Usage
----- -----
The following Python program shows the key functionality of this module: The following Python program shows the key functionality of mappy:
.. code:: python .. code:: python
import mappy as mp import mappy as mp
a = mp.Aligner("test/MT-human.fa") # load or build index a = mp.Aligner("test/MT-human.fa") # load or build index
if not a: raise Exception("ERROR: failed to load/build index") if not a: raise Exception("ERROR: failed to load/build index")
for hit in a.map("GGTTAAATACAGACCAAGAGCCTTCAAAGCCCTCAGTAAGTTGCAATACTTAATTTCTGT"): for name, seq, qual in mp.fastx_read("test/MT-orang.fa"): # read a fasta/q sequence
for hit in a.map(seq): # traverse alignments
print("{}\t{}\t{}\t{}".format(hit.ctg, hit.r_st, hit.r_en, hit.cigar_str)) print("{}\t{}\t{}\t{}".format(hit.ctg, hit.r_st, hit.r_en, hit.cigar_str))
It builds an index from the specified sequence file (or loads an index if a
pre-built index is specified), aligns a sequence against it, traverses each hit
and prints them out.
APIs APIs
---- ----
Mappy implements two classes and one global function.
Class mappy.Aligner Class mappy.Aligner
~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~
.. code:: python .. code:: python
Aligner(fn_idx_in, preset=None, ...) mappy.Aligner(fn_idx_in, preset=None, ...)
Arguments: This constructor accepts the following arguments:
* **fn_idx_in**: index or sequence file name. Minimap2 automatically tests the * **fn_idx_in**: index or sequence file name. Minimap2 automatically tests the
file type. If a sequence file is provided, minimap2 builds an index. The file type. If a sequence file is provided, minimap2 builds an index. The
@ -81,15 +81,16 @@ Arguments:
.. code:: python .. code:: python
map(seq) mappy.Aligner.map(seq)
This method maps :code:`seq` against the index. It *yields* a generator, This method aligns :code:`seq` against the index. It is a generator, *yielding*
generating a series of :code:`Alignment` objects. a series of :code:`mappy.Alignment` objects.
Class mappy.Alignment Class mappy.Alignment
~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~
This class has the following properties: This class describes an alignment. An object of this class has the following
properties:
* **ctg**: name of the reference sequence the query is mapped to * **ctg**: name of the reference sequence the query is mapped to
@ -118,8 +119,23 @@ This class has the following properties:
* **cigar**: CIGAR returned as an array of shape :code:`(n_cigar,2)`. The two * **cigar**: CIGAR returned as an array of shape :code:`(n_cigar,2)`. The two
numbers give the length and the operator of each CIGAR operation. numbers give the length and the operator of each CIGAR operation.
An :code:`Alignment` object can be converted to a string in the following format: An :code:`Alignment` object can be converted to a string with :code:`str()` in
the following format:
:: ::
q_st q_en strand ctg ctg_len r_st r_en blen-NM blen mapq cg:Z:cigar_str q_st q_en strand ctg ctg_len r_st r_en blen-NM blen mapq cg:Z:cigar_str
It is effectively the PAF format without the QueryName and QueryLength columns
(the first two columns in PAF).
Function mappy.fastx_read
~~~~~~~~~~~~~~~~~~~~~~~~~
.. code:: python
mappy.fastx_read(fn)
This generator function opens a FASTA/FASTQ file and *yields* a
:code:`(name,seq,qual)` tuple for each sequence entry. The input file may be
optionally gzip'd.

View File

@ -2,7 +2,11 @@
#define CMAPPY_H #define CMAPPY_H
#include <stdlib.h> #include <stdlib.h>
#include <string.h>
#include <zlib.h>
#include "minimap.h" #include "minimap.h"
#include "kseq.h"
KSEQ_DECLARE(gzFile)
typedef struct { typedef struct {
const char *ctg; const char *ctg;
@ -36,4 +40,19 @@ static inline void mm_free_reg1(mm_reg1_t *r)
free(r->p); free(r->p);
} }
static inline kseq_t *mm_fastx_open(const char *fn)
{
gzFile fp;
fp = fn && strcmp(fn, "-") != 0? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
return kseq_init(fp);
}
static inline void mm_fastx_close(kseq_t *ks)
{
gzFile fp;
fp = ks->f->f;
kseq_destroy(ks);
gzclose(fp);
}
#endif #endif

View File

@ -75,7 +75,7 @@ cdef extern from "minimap.h":
mm_reg1_t *mm_map(const mm_idx_t *mi, int l_seq, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *name) mm_reg1_t *mm_map(const mm_idx_t *mi, int l_seq, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *name)
# #
# Helper header (because it is hard to expose mm_reg1_t with Cython # Helper header (because it is hard to expose mm_reg1_t with Cython)
# #
cdef extern from "cmappy.h": cdef extern from "cmappy.h":
ctypedef struct mm_hitpy_t: ctypedef struct mm_hitpy_t:
@ -90,3 +90,19 @@ cdef extern from "cmappy.h":
void mm_reg2hitpy(const mm_idx_t *mi, mm_reg1_t *r, mm_hitpy_t *h) void mm_reg2hitpy(const mm_idx_t *mi, mm_reg1_t *r, mm_hitpy_t *h)
void mm_free_reg1(mm_reg1_t *r) void mm_free_reg1(mm_reg1_t *r)
ctypedef struct kstring_t:
unsigned l, m
char *s
ctypedef struct kstream_t:
pass
ctypedef struct kseq_t:
kstring_t name, comment, seq, qual
int last_char
kstream_t *f
kseq_t *mm_fastx_open(const char *fn)
void mm_fastx_close(kseq_t *ks)
int kseq_read(kseq_t *seq)

View File

@ -11,7 +11,7 @@ cdef class Alignment:
cdef _ctg, _cigar # these are python objects cdef _ctg, _cigar # these are python objects
def __cinit__(self, ctg, cl, cs, ce, strand, qs, qe, mapq, cigar, is_primary, blen, NM, trans_strand): def __cinit__(self, ctg, cl, cs, ce, strand, qs, qe, mapq, cigar, is_primary, blen, NM, trans_strand):
self._ctg, self._ctg_len, self._r_st, self._r_en = ctg.decode("ascii"), cl, cs, ce self._ctg, self._ctg_len, self._r_st, self._r_en = str(ctg), cl, cs, ce
self._strand, self._q_st, self._q_en = strand, qs, qe self._strand, self._q_st, self._q_en = strand, qs, qe
self._NM, self._blen = NM, blen self._NM, self._blen = NM, blen
self._mapq = mapq self._mapq = mapq
@ -133,3 +133,13 @@ cdef class Aligner:
yield Alignment(h.ctg, h.ctg_len, h.ctg_start, h.ctg_end, h.strand, h.qry_start, h.qry_end, h.mapq, cigar, h.is_primary, h.blen, h.NM, h.trans_strand) yield Alignment(h.ctg, h.ctg_len, h.ctg_start, h.ctg_end, h.strand, h.qry_start, h.qry_end, h.mapq, cigar, h.is_primary, h.blen, h.NM, h.trans_strand)
cmappy.mm_free_reg1(&regs[i]) cmappy.mm_free_reg1(&regs[i])
free(regs) free(regs)
def fastx_read(fn):
cdef cmappy.kseq_t *ks
ks = cmappy.mm_fastx_open(str.encode(fn))
if ks is NULL: return None
while cmappy.kseq_read(ks) >= 0:
qual = None
if ks.qual.l > 0: qual = str(ks.qual.s)
yield str(ks.name.s), str(ks.seq.s), qual
cmappy.mm_fastx_close(ks)

View File

@ -3,47 +3,14 @@
import sys, getopt import sys, getopt
import mappy as mp import mappy as mp
def readfq(fp): # multi-line fasta/fastq parser
last = None
while True:
if not last:
for l in fp:
if l[0] in '>@':
last = l[:-1]
break
if not last: break
name, seqs, last = last[1:].split()[0], [], None
for l in fp:
if l[0] in '@+>':
last = l[:-1]
break
seqs.append(l[:-1])
if not last or last[0] != '+':
yield name, ''.join(seqs), None
if not last: break
else:
seq, leng, seqs = ''.join(seqs), 0, []
for l in fp:
seqs.append(l[:-1])
leng += len(l) - 1
if leng >= len(seq):
last = None
yield name, seq, ''.join(seqs);
break
if last:
yield name, seq, None
break
def main(argv): def main(argv):
opts, args = getopt.getopt(argv[1:], "") opts, args = getopt.getopt(argv[1:], "")
if len(args) < 2: if len(args) < 2:
print("Usage: minimap2.py <ref.fa>|<ref.mmi> <query.fq>") print("Usage: minimap2.py <ref.fa>|<ref.mmi> <query.fq>")
sys.exit(1) sys.exit(1)
a = mp.Aligner(args[0]) # load/build index a = mp.Aligner(args[0]) # load/build index
if not a: if not a: raise Exception("ERROR: failed to load/build index file '{}'".format(args[0]))
print("ERROR: failed to load/build index") for name, seq, qual in mp.fastx_read(args[1]): # read one sequence
return
for name, seq, qual in readfq(open(args[1])): # read one sequence
for h in a.map(seq): # traverse hits for h in a.map(seq): # traverse hits
print('{}\t{}\t{}'.format(name, len(seq), h)) print('{}\t{}\t{}'.format(name, len(seq), h))