exposed fasta/q reader to mappy

This commit is contained in:
Heng Li 2017-09-17 14:41:59 -04:00
parent e9c57f6d8b
commit c8a019fae8
5 changed files with 91 additions and 63 deletions

View File

@ -2,14 +2,21 @@
Mappy: Minimap2 Python Binding
==============================
`Minimap2 <https://github.com/lh3/minimap2>`_ is a fast and accurate pairwise
aligner for genomic and transcribed nucleotide sequences. This Python extension
provides a convenient interface to calling minimap2 in Python.
Mappy provides a convenient interface to `minimap2
<https://github.com/lh3/minimap2>`_, a fast and accurate C program to align
genomic and transcribe nucleotide sequences.
Installation
------------
The mappy module can be installed directly with:
Mappy depends on `zlib <http://zlib.net>`_. It can be installed with `pip
<https://en.wikipedia.org/wiki/Pip_(package_manager)>`_:
.. code:: shell
pip install --user mappy
or from the minimap2 github repo:
.. code:: shell
@ -17,40 +24,33 @@ The mappy module can be installed directly with:
cd minimap2
python setup.py install
or with `pip <https://en.wikipedia.org/wiki/Pip_(package_manager)>`_:
.. code:: shell
pip install --user mappy
Usage
-----
The following Python program shows the key functionality of this module:
The following Python program shows the key functionality of mappy:
.. code:: python
import mappy as mp
a = mp.Aligner("test/MT-human.fa") # load or build index
if not a: raise Exception("ERROR: failed to load/build index")
for hit in a.map("GGTTAAATACAGACCAAGAGCCTTCAAAGCCCTCAGTAAGTTGCAATACTTAATTTCTGT"):
print("{}\t{}\t{}\t{}".format(hit.ctg, hit.r_st, hit.r_en, hit.cigar_str))
It builds an index from the specified sequence file (or loads an index if a
pre-built index is specified), aligns a sequence against it, traverses each hit
and prints them out.
for name, seq, qual in mp.fastx_read("test/MT-orang.fa"): # read a fasta/q sequence
for hit in a.map(seq): # traverse alignments
print("{}\t{}\t{}\t{}".format(hit.ctg, hit.r_st, hit.r_en, hit.cigar_str))
APIs
----
Mappy implements two classes and one global function.
Class mappy.Aligner
~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~
.. code:: python
Aligner(fn_idx_in, preset=None, ...)
mappy.Aligner(fn_idx_in, preset=None, ...)
Arguments:
This constructor accepts the following arguments:
* **fn_idx_in**: index or sequence file name. Minimap2 automatically tests the
file type. If a sequence file is provided, minimap2 builds an index. The
@ -81,15 +81,16 @@ Arguments:
.. code:: python
map(seq)
mappy.Aligner.map(seq)
This method maps :code:`seq` against the index. It *yields* a generator,
generating a series of :code:`Alignment` objects.
This method aligns :code:`seq` against the index. It is a generator, *yielding*
a series of :code:`mappy.Alignment` objects.
Class mappy.Alignment
~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~
This class has the following properties:
This class describes an alignment. An object of this class has the following
properties:
* **ctg**: name of the reference sequence the query is mapped to
@ -118,8 +119,23 @@ This class has the following properties:
* **cigar**: CIGAR returned as an array of shape :code:`(n_cigar,2)`. The two
numbers give the length and the operator of each CIGAR operation.
An :code:`Alignment` object can be converted to a string in the following format:
An :code:`Alignment` object can be converted to a string with :code:`str()` in
the following format:
::
q_st q_en strand ctg ctg_len r_st r_en blen-NM blen mapq cg:Z:cigar_str
It is effectively the PAF format without the QueryName and QueryLength columns
(the first two columns in PAF).
Function mappy.fastx_read
~~~~~~~~~~~~~~~~~~~~~~~~~
.. code:: python
mappy.fastx_read(fn)
This generator function opens a FASTA/FASTQ file and *yields* a
:code:`(name,seq,qual)` tuple for each sequence entry. The input file may be
optionally gzip'd.

View File

@ -2,7 +2,11 @@
#define CMAPPY_H
#include <stdlib.h>
#include <string.h>
#include <zlib.h>
#include "minimap.h"
#include "kseq.h"
KSEQ_DECLARE(gzFile)
typedef struct {
const char *ctg;
@ -36,4 +40,19 @@ static inline void mm_free_reg1(mm_reg1_t *r)
free(r->p);
}
static inline kseq_t *mm_fastx_open(const char *fn)
{
gzFile fp;
fp = fn && strcmp(fn, "-") != 0? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
return kseq_init(fp);
}
static inline void mm_fastx_close(kseq_t *ks)
{
gzFile fp;
fp = ks->f->f;
kseq_destroy(ks);
gzclose(fp);
}
#endif

View File

@ -75,7 +75,7 @@ cdef extern from "minimap.h":
mm_reg1_t *mm_map(const mm_idx_t *mi, int l_seq, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *name)
#
# Helper header (because it is hard to expose mm_reg1_t with Cython
# Helper header (because it is hard to expose mm_reg1_t with Cython)
#
cdef extern from "cmappy.h":
ctypedef struct mm_hitpy_t:
@ -90,3 +90,19 @@ cdef extern from "cmappy.h":
void mm_reg2hitpy(const mm_idx_t *mi, mm_reg1_t *r, mm_hitpy_t *h)
void mm_free_reg1(mm_reg1_t *r)
ctypedef struct kstring_t:
unsigned l, m
char *s
ctypedef struct kstream_t:
pass
ctypedef struct kseq_t:
kstring_t name, comment, seq, qual
int last_char
kstream_t *f
kseq_t *mm_fastx_open(const char *fn)
void mm_fastx_close(kseq_t *ks)
int kseq_read(kseq_t *seq)

View File

@ -11,7 +11,7 @@ cdef class Alignment:
cdef _ctg, _cigar # these are python objects
def __cinit__(self, ctg, cl, cs, ce, strand, qs, qe, mapq, cigar, is_primary, blen, NM, trans_strand):
self._ctg, self._ctg_len, self._r_st, self._r_en = ctg.decode("ascii"), cl, cs, ce
self._ctg, self._ctg_len, self._r_st, self._r_en = str(ctg), cl, cs, ce
self._strand, self._q_st, self._q_en = strand, qs, qe
self._NM, self._blen = NM, blen
self._mapq = mapq
@ -133,3 +133,13 @@ cdef class Aligner:
yield Alignment(h.ctg, h.ctg_len, h.ctg_start, h.ctg_end, h.strand, h.qry_start, h.qry_end, h.mapq, cigar, h.is_primary, h.blen, h.NM, h.trans_strand)
cmappy.mm_free_reg1(&regs[i])
free(regs)
def fastx_read(fn):
cdef cmappy.kseq_t *ks
ks = cmappy.mm_fastx_open(str.encode(fn))
if ks is NULL: return None
while cmappy.kseq_read(ks) >= 0:
qual = None
if ks.qual.l > 0: qual = str(ks.qual.s)
yield str(ks.name.s), str(ks.seq.s), qual
cmappy.mm_fastx_close(ks)

View File

@ -3,47 +3,14 @@
import sys, getopt
import mappy as mp
def readfq(fp): # multi-line fasta/fastq parser
last = None
while True:
if not last:
for l in fp:
if l[0] in '>@':
last = l[:-1]
break
if not last: break
name, seqs, last = last[1:].split()[0], [], None
for l in fp:
if l[0] in '@+>':
last = l[:-1]
break
seqs.append(l[:-1])
if not last or last[0] != '+':
yield name, ''.join(seqs), None
if not last: break
else:
seq, leng, seqs = ''.join(seqs), 0, []
for l in fp:
seqs.append(l[:-1])
leng += len(l) - 1
if leng >= len(seq):
last = None
yield name, seq, ''.join(seqs);
break
if last:
yield name, seq, None
break
def main(argv):
opts, args = getopt.getopt(argv[1:], "")
if len(args) < 2:
print("Usage: minimap2.py <ref.fa>|<ref.mmi> <query.fq>")
sys.exit(1)
a = mp.Aligner(args[0]) # load/build index
if not a:
print("ERROR: failed to load/build index")
return
for name, seq, qual in readfq(open(args[1])): # read one sequence
if not a: raise Exception("ERROR: failed to load/build index file '{}'".format(args[0]))
for name, seq, qual in mp.fastx_read(args[1]): # read one sequence
for h in a.map(seq): # traverse hits
print('{}\t{}\t{}'.format(name, len(seq), h))