diff --git a/python/README.rst b/python/README.rst index 1ccd4eb..1d8d17f 100644 --- a/python/README.rst +++ b/python/README.rst @@ -2,14 +2,21 @@ Mappy: Minimap2 Python Binding ============================== -`Minimap2 `_ is a fast and accurate pairwise -aligner for genomic and transcribed nucleotide sequences. This Python extension -provides a convenient interface to calling minimap2 in Python. +Mappy provides a convenient interface to `minimap2 +`_, a fast and accurate C program to align +genomic and transcribe nucleotide sequences. Installation ------------ -The mappy module can be installed directly with: +Mappy depends on `zlib `_. It can be installed with `pip +`_: + +.. code:: shell + + pip install --user mappy + +or from the minimap2 github repo: .. code:: shell @@ -17,40 +24,33 @@ The mappy module can be installed directly with: cd minimap2 python setup.py install -or with `pip `_: - -.. code:: shell - - pip install --user mappy - Usage ----- -The following Python program shows the key functionality of this module: +The following Python program shows the key functionality of mappy: .. code:: python import mappy as mp a = mp.Aligner("test/MT-human.fa") # load or build index if not a: raise Exception("ERROR: failed to load/build index") - for hit in a.map("GGTTAAATACAGACCAAGAGCCTTCAAAGCCCTCAGTAAGTTGCAATACTTAATTTCTGT"): - print("{}\t{}\t{}\t{}".format(hit.ctg, hit.r_st, hit.r_en, hit.cigar_str)) - -It builds an index from the specified sequence file (or loads an index if a -pre-built index is specified), aligns a sequence against it, traverses each hit -and prints them out. + for name, seq, qual in mp.fastx_read("test/MT-orang.fa"): # read a fasta/q sequence + for hit in a.map(seq): # traverse alignments + print("{}\t{}\t{}\t{}".format(hit.ctg, hit.r_st, hit.r_en, hit.cigar_str)) APIs ---- +Mappy implements two classes and one global function. + Class mappy.Aligner -~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~ .. code:: python - Aligner(fn_idx_in, preset=None, ...) + mappy.Aligner(fn_idx_in, preset=None, ...) -Arguments: +This constructor accepts the following arguments: * **fn_idx_in**: index or sequence file name. Minimap2 automatically tests the file type. If a sequence file is provided, minimap2 builds an index. The @@ -81,15 +81,16 @@ Arguments: .. code:: python - map(seq) + mappy.Aligner.map(seq) -This method maps :code:`seq` against the index. It *yields* a generator, -generating a series of :code:`Alignment` objects. +This method aligns :code:`seq` against the index. It is a generator, *yielding* +a series of :code:`mappy.Alignment` objects. Class mappy.Alignment -~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~ -This class has the following properties: +This class describes an alignment. An object of this class has the following +properties: * **ctg**: name of the reference sequence the query is mapped to @@ -118,8 +119,23 @@ This class has the following properties: * **cigar**: CIGAR returned as an array of shape :code:`(n_cigar,2)`. The two numbers give the length and the operator of each CIGAR operation. -An :code:`Alignment` object can be converted to a string in the following format: +An :code:`Alignment` object can be converted to a string with :code:`str()` in +the following format: :: q_st q_en strand ctg ctg_len r_st r_en blen-NM blen mapq cg:Z:cigar_str + +It is effectively the PAF format without the QueryName and QueryLength columns +(the first two columns in PAF). + +Function mappy.fastx_read +~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code:: python + + mappy.fastx_read(fn) + +This generator function opens a FASTA/FASTQ file and *yields* a +:code:`(name,seq,qual)` tuple for each sequence entry. The input file may be +optionally gzip'd. diff --git a/python/cmappy.h b/python/cmappy.h index 386b7fc..127fc68 100644 --- a/python/cmappy.h +++ b/python/cmappy.h @@ -2,7 +2,11 @@ #define CMAPPY_H #include +#include +#include #include "minimap.h" +#include "kseq.h" +KSEQ_DECLARE(gzFile) typedef struct { const char *ctg; @@ -36,4 +40,19 @@ static inline void mm_free_reg1(mm_reg1_t *r) free(r->p); } +static inline kseq_t *mm_fastx_open(const char *fn) +{ + gzFile fp; + fp = fn && strcmp(fn, "-") != 0? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); + return kseq_init(fp); +} + +static inline void mm_fastx_close(kseq_t *ks) +{ + gzFile fp; + fp = ks->f->f; + kseq_destroy(ks); + gzclose(fp); +} + #endif diff --git a/python/cmappy.pxd b/python/cmappy.pxd index c78777a..b4f821d 100644 --- a/python/cmappy.pxd +++ b/python/cmappy.pxd @@ -75,7 +75,7 @@ cdef extern from "minimap.h": mm_reg1_t *mm_map(const mm_idx_t *mi, int l_seq, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *name) # -# Helper header (because it is hard to expose mm_reg1_t with Cython +# Helper header (because it is hard to expose mm_reg1_t with Cython) # cdef extern from "cmappy.h": ctypedef struct mm_hitpy_t: @@ -90,3 +90,19 @@ cdef extern from "cmappy.h": void mm_reg2hitpy(const mm_idx_t *mi, mm_reg1_t *r, mm_hitpy_t *h) void mm_free_reg1(mm_reg1_t *r) + + ctypedef struct kstring_t: + unsigned l, m + char *s + + ctypedef struct kstream_t: + pass + + ctypedef struct kseq_t: + kstring_t name, comment, seq, qual + int last_char + kstream_t *f + + kseq_t *mm_fastx_open(const char *fn) + void mm_fastx_close(kseq_t *ks) + int kseq_read(kseq_t *seq) diff --git a/python/mappy.pyx b/python/mappy.pyx index dec76ff..0b56673 100644 --- a/python/mappy.pyx +++ b/python/mappy.pyx @@ -11,7 +11,7 @@ cdef class Alignment: cdef _ctg, _cigar # these are python objects def __cinit__(self, ctg, cl, cs, ce, strand, qs, qe, mapq, cigar, is_primary, blen, NM, trans_strand): - self._ctg, self._ctg_len, self._r_st, self._r_en = ctg.decode("ascii"), cl, cs, ce + self._ctg, self._ctg_len, self._r_st, self._r_en = str(ctg), cl, cs, ce self._strand, self._q_st, self._q_en = strand, qs, qe self._NM, self._blen = NM, blen self._mapq = mapq @@ -133,3 +133,13 @@ cdef class Aligner: yield Alignment(h.ctg, h.ctg_len, h.ctg_start, h.ctg_end, h.strand, h.qry_start, h.qry_end, h.mapq, cigar, h.is_primary, h.blen, h.NM, h.trans_strand) cmappy.mm_free_reg1(®s[i]) free(regs) + +def fastx_read(fn): + cdef cmappy.kseq_t *ks + ks = cmappy.mm_fastx_open(str.encode(fn)) + if ks is NULL: return None + while cmappy.kseq_read(ks) >= 0: + qual = None + if ks.qual.l > 0: qual = str(ks.qual.s) + yield str(ks.name.s), str(ks.seq.s), qual + cmappy.mm_fastx_close(ks) diff --git a/python/minimap2.py b/python/minimap2.py index 415a79b..ba83354 100755 --- a/python/minimap2.py +++ b/python/minimap2.py @@ -3,47 +3,14 @@ import sys, getopt import mappy as mp -def readfq(fp): # multi-line fasta/fastq parser - last = None - while True: - if not last: - for l in fp: - if l[0] in '>@': - last = l[:-1] - break - if not last: break - name, seqs, last = last[1:].split()[0], [], None - for l in fp: - if l[0] in '@+>': - last = l[:-1] - break - seqs.append(l[:-1]) - if not last or last[0] != '+': - yield name, ''.join(seqs), None - if not last: break - else: - seq, leng, seqs = ''.join(seqs), 0, [] - for l in fp: - seqs.append(l[:-1]) - leng += len(l) - 1 - if leng >= len(seq): - last = None - yield name, seq, ''.join(seqs); - break - if last: - yield name, seq, None - break - def main(argv): opts, args = getopt.getopt(argv[1:], "") if len(args) < 2: print("Usage: minimap2.py | ") sys.exit(1) a = mp.Aligner(args[0]) # load/build index - if not a: - print("ERROR: failed to load/build index") - return - for name, seq, qual in readfq(open(args[1])): # read one sequence + if not a: raise Exception("ERROR: failed to load/build index file '{}'".format(args[0])) + for name, seq, qual in mp.fastx_read(args[1]): # read one sequence for h in a.map(seq): # traverse hits print('{}\t{}\t{}'.format(name, len(seq), h))