r819: mappy to support cs/MD
This commit is contained in:
parent
8c064a5f29
commit
ff9917a1c4
41
format.c
41
format.c
|
|
@ -133,10 +133,10 @@ void mm_write_sam_hdr(const mm_idx_t *idx, const char *rg, const char *ver, int
|
||||||
free(str.s);
|
free(str.s);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void write_cs_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq, const mm_reg1_t *r, char *tmp, int no_iden)
|
static void write_cs_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq, const mm_reg1_t *r, char *tmp, int no_iden, int write_tag)
|
||||||
{
|
{
|
||||||
int i, q_off, t_off;
|
int i, q_off, t_off;
|
||||||
mm_sprintf_lite(s, "\tcs:Z:");
|
if (write_tag) mm_sprintf_lite(s, "\tcs:Z:");
|
||||||
for (i = q_off = t_off = 0; i < (int)r->p->n_cigar; ++i) {
|
for (i = q_off = t_off = 0; i < (int)r->p->n_cigar; ++i) {
|
||||||
int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4;
|
int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4;
|
||||||
assert(op >= 0 && op <= 3);
|
assert(op >= 0 && op <= 3);
|
||||||
|
|
@ -181,10 +181,10 @@ static void write_cs_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq
|
||||||
assert(t_off == r->re - r->rs && q_off == r->qe - r->qs);
|
assert(t_off == r->re - r->rs && q_off == r->qe - r->qs);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void write_MD_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq, const mm_reg1_t *r, char *tmp)
|
static void write_MD_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq, const mm_reg1_t *r, char *tmp, int write_tag)
|
||||||
{
|
{
|
||||||
int i, q_off, t_off, l_MD = 0;
|
int i, q_off, t_off, l_MD = 0;
|
||||||
mm_sprintf_lite(s, "\tMD:Z:");
|
if (write_tag) mm_sprintf_lite(s, "\tMD:Z:");
|
||||||
for (i = q_off = t_off = 0; i < (int)r->p->n_cigar; ++i) {
|
for (i = q_off = t_off = 0; i < (int)r->p->n_cigar; ++i) {
|
||||||
int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4;
|
int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4;
|
||||||
assert(op >= 0 && op <= 2); // introns (aka reference skips) are not supported
|
assert(op >= 0 && op <= 2); // introns (aka reference skips) are not supported
|
||||||
|
|
@ -210,7 +210,7 @@ static void write_MD_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq
|
||||||
assert(t_off == r->re - r->rs && q_off == r->qe - r->qs);
|
assert(t_off == r->re - r->rs && q_off == r->qe - r->qs);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void write_cs_or_MD(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int no_iden, int is_MD)
|
static void write_cs_or_MD(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int no_iden, int is_MD, int write_tag)
|
||||||
{
|
{
|
||||||
extern unsigned char seq_nt4_table[256];
|
extern unsigned char seq_nt4_table[256];
|
||||||
int i;
|
int i;
|
||||||
|
|
@ -230,11 +230,34 @@ static void write_cs_or_MD(void *km, kstring_t *s, const mm_idx_t *mi, const mm_
|
||||||
qseq[r->qe - i - 1] = c >= 4? 4 : 3 - c;
|
qseq[r->qe - i - 1] = c >= 4? 4 : 3 - c;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (is_MD) write_MD_core(s, tseq, qseq, r, tmp);
|
if (is_MD) write_MD_core(s, tseq, qseq, r, tmp, write_tag);
|
||||||
else write_cs_core(s, tseq, qseq, r, tmp, no_iden);
|
else write_cs_core(s, tseq, qseq, r, tmp, no_iden, write_tag);
|
||||||
kfree(km, qseq); kfree(km, tseq); kfree(km, tmp);
|
kfree(km, qseq); kfree(km, tseq); kfree(km, tmp);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int mm_gen_cs_or_MD(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq, int is_MD, int no_iden)
|
||||||
|
{
|
||||||
|
mm_bseq1_t t;
|
||||||
|
kstring_t str;
|
||||||
|
str.s = *buf, str.l = 0, str.m = *max_len;
|
||||||
|
t.l_seq = strlen(seq);
|
||||||
|
t.seq = (char*)seq;
|
||||||
|
write_cs_or_MD(km, &str, mi, &t, r, no_iden, is_MD, 0);
|
||||||
|
*max_len = str.m;
|
||||||
|
*buf = str.s;
|
||||||
|
return str.l;
|
||||||
|
}
|
||||||
|
|
||||||
|
int mm_gen_cs(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq, int no_iden)
|
||||||
|
{
|
||||||
|
return mm_gen_cs_or_MD(km, buf, max_len, mi, r, seq, 0, no_iden);
|
||||||
|
}
|
||||||
|
|
||||||
|
int mm_gen_MD(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq)
|
||||||
|
{
|
||||||
|
return mm_gen_cs_or_MD(km, buf, max_len, mi, r, seq, 1, 0);
|
||||||
|
}
|
||||||
|
|
||||||
static inline void write_tags(kstring_t *s, const mm_reg1_t *r)
|
static inline void write_tags(kstring_t *s, const mm_reg1_t *r)
|
||||||
{
|
{
|
||||||
int type;
|
int type;
|
||||||
|
|
@ -277,7 +300,7 @@ void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m
|
||||||
mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MIDNSHP=XB"[r->p->cigar[k]&0xf]);
|
mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MIDNSHP=XB"[r->p->cigar[k]&0xf]);
|
||||||
}
|
}
|
||||||
if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_MD)))
|
if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_MD)))
|
||||||
write_cs_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD);
|
write_cs_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD, 1);
|
||||||
if ((opt_flag & MM_F_COPY_COMMENT) && t->comment)
|
if ((opt_flag & MM_F_COPY_COMMENT) && t->comment)
|
||||||
mm_sprintf_lite(s, "\t%s", t->comment);
|
mm_sprintf_lite(s, "\t%s", t->comment);
|
||||||
}
|
}
|
||||||
|
|
@ -476,7 +499,7 @@ void mm_write_sam2(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int se
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_MD)))
|
if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_MD)))
|
||||||
write_cs_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD);
|
write_cs_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD, 1);
|
||||||
if (cigar_in_tag)
|
if (cigar_in_tag)
|
||||||
write_sam_cigar(s, flag, 1, t->l_seq, r, opt_flag);
|
write_sam_cigar(s, flag, 1, t->l_seq, r, opt_flag);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
2
main.c
2
main.c
|
|
@ -10,7 +10,7 @@
|
||||||
#include "getopt.h"
|
#include "getopt.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define MM_VERSION "2.11-r815-dirty"
|
#define MM_VERSION "2.11-r819-dirty"
|
||||||
|
|
||||||
#ifdef __linux__
|
#ifdef __linux__
|
||||||
#include <sys/resource.h>
|
#include <sys/resource.h>
|
||||||
|
|
|
||||||
7
map.c
7
map.c
|
|
@ -29,6 +29,11 @@ void mm_tbuf_destroy(mm_tbuf_t *b)
|
||||||
free(b);
|
free(b);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void *mm_tbuf_get_km(mm_tbuf_t *b)
|
||||||
|
{
|
||||||
|
return b->km;
|
||||||
|
}
|
||||||
|
|
||||||
static int mm_dust_minier(void *km, int n, mm128_t *a, int l_seq, const char *seq, int sdust_thres)
|
static int mm_dust_minier(void *km, int n, mm128_t *a, int l_seq, const char *seq, int sdust_thres)
|
||||||
{
|
{
|
||||||
int n_dreg, j, k, u = 0;
|
int n_dreg, j, k, u = 0;
|
||||||
|
|
@ -682,7 +687,7 @@ int mm_split_merge(int n_segs, const char **fn, const mm_mapopt_t *opt, int n_sp
|
||||||
for (pl.rid_shift[0] = 0, i = 1; i < n_split_idx; ++i)
|
for (pl.rid_shift[0] = 0, i = 1; i < n_split_idx; ++i)
|
||||||
pl.rid_shift[i] += pl.rid_shift[i - 1];
|
pl.rid_shift[i] += pl.rid_shift[i - 1];
|
||||||
if (opt->flag & MM_F_OUT_SAM)
|
if (opt->flag & MM_F_OUT_SAM)
|
||||||
for (i = 0; i < pl.mi->n_seq; ++i)
|
for (i = 0; i < (int32_t)pl.mi->n_seq; ++i)
|
||||||
printf("@SQ\tSN:%s\tLN:%d\n", pl.mi->seq[i].name, pl.mi->seq[i].len);
|
printf("@SQ\tSN:%s\tLN:%d\n", pl.mi->seq[i].name, pl.mi->seq[i].len);
|
||||||
|
|
||||||
kt_pipeline(2, worker_pipeline, &pl, 3);
|
kt_pipeline(2, worker_pipeline, &pl, 3);
|
||||||
|
|
|
||||||
18
minimap.h
18
minimap.h
|
|
@ -301,6 +301,8 @@ mm_tbuf_t *mm_tbuf_init(void);
|
||||||
*/
|
*/
|
||||||
void mm_tbuf_destroy(mm_tbuf_t *b);
|
void mm_tbuf_destroy(mm_tbuf_t *b);
|
||||||
|
|
||||||
|
void *mm_tbuf_get_km(mm_tbuf_t *b);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Align a query sequence against an index
|
* Align a query sequence against an index
|
||||||
*
|
*
|
||||||
|
|
@ -337,6 +339,22 @@ int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int
|
||||||
|
|
||||||
int mm_map_file_frag(const mm_idx_t *idx, int n_segs, const char **fn, const mm_mapopt_t *opt, int n_threads);
|
int mm_map_file_frag(const mm_idx_t *idx, int n_segs, const char **fn, const mm_mapopt_t *opt, int n_threads);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generate the cs tag (new in 2.12)
|
||||||
|
*
|
||||||
|
* @param km memory blocks; set to NULL if unsure
|
||||||
|
* @param buf buffer to write the cs/MD tag; typicall NULL on the first call
|
||||||
|
* @param max_len max length of the buffer; typically set to 0 on the first call
|
||||||
|
* @param mi index
|
||||||
|
* @param r alignment
|
||||||
|
* @param seq query sequence
|
||||||
|
* @param no_iden true to use : instead of =
|
||||||
|
*
|
||||||
|
* @return the length of cs
|
||||||
|
*/
|
||||||
|
int mm_gen_cs(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq, int no_iden);
|
||||||
|
int mm_gen_MD(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq);
|
||||||
|
|
||||||
// query sequence name and sequence in the minimap2 index
|
// query sequence name and sequence in the minimap2 index
|
||||||
int mm_idx_index_name(mm_idx_t *mi);
|
int mm_idx_index_name(mm_idx_t *mi);
|
||||||
int mm_idx_name2id(const mm_idx_t *mi, const char *name);
|
int mm_idx_name2id(const mm_idx_t *mi, const char *name);
|
||||||
|
|
|
||||||
|
|
@ -132,6 +132,11 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
|
||||||
|
|
||||||
int mm_check_opt(const mm_idxopt_t *io, const mm_mapopt_t *mo)
|
int mm_check_opt(const mm_idxopt_t *io, const mm_mapopt_t *mo)
|
||||||
{
|
{
|
||||||
|
if (mo->split_prefix && (mo->flag & (MM_F_OUT_CS|MM_F_OUT_MD))) {
|
||||||
|
if (mm_verbose >= 1)
|
||||||
|
fprintf(stderr, "[ERROR]\033[1;31m --cs or --MD doesn't work with --split-prefix\033[0m\n");
|
||||||
|
return -6;
|
||||||
|
}
|
||||||
if (io->k <= 0 || io->w <= 0) {
|
if (io->k <= 0 || io->w <= 0) {
|
||||||
if (mm_verbose >= 1)
|
if (mm_verbose >= 1)
|
||||||
fprintf(stderr, "[ERROR]\033[1;31m -k and -w must be positive\033[0m\n");
|
fprintf(stderr, "[ERROR]\033[1;31m -k and -w must be positive\033[0m\n");
|
||||||
|
|
|
||||||
|
|
@ -43,7 +43,7 @@ The following Python script demonstrates the key functionality of mappy:
|
||||||
APIs
|
APIs
|
||||||
----
|
----
|
||||||
|
|
||||||
Mappy implements two classes and one global function.
|
Mappy implements two classes and two global function.
|
||||||
|
|
||||||
Class mappy.Aligner
|
Class mappy.Aligner
|
||||||
~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
@ -83,13 +83,15 @@ This constructor accepts the following arguments:
|
||||||
|
|
||||||
.. code:: python
|
.. code:: python
|
||||||
|
|
||||||
mappy.Aligner.map(seq, seq2=None)
|
mappy.Aligner.map(seq, seq2=None, cs=False, MD=False)
|
||||||
|
|
||||||
This method aligns :code:`seq` against the index. It is a generator, *yielding*
|
This method aligns :code:`seq` against the index. It is a generator, *yielding*
|
||||||
a series of :code:`mappy.Alignment` objects. If :code:`seq2` is present, mappy
|
a series of :code:`mappy.Alignment` objects. If :code:`seq2` is present, mappy
|
||||||
performs paired-end alignment, assuming the two ends are in the FR orientation.
|
performs paired-end alignment, assuming the two ends are in the FR orientation.
|
||||||
Alignments of the two ends can be distinguished by the :code:`read_num` field
|
Alignments of the two ends can be distinguished by the :code:`read_num` field
|
||||||
(see Class mappy.Alignment below).
|
(see Class mappy.Alignment below). Argument :code:`cs` asks mappy to generate
|
||||||
|
the :code:`cs` tag; :code:`MD` is similar. These two arguments might slightly
|
||||||
|
degrade performance and are not enabled by default.
|
||||||
|
|
||||||
.. code:: python
|
.. code:: python
|
||||||
|
|
||||||
|
|
@ -139,6 +141,11 @@ properties:
|
||||||
* **cigar**: CIGAR returned as an array of shape :code:`(n_cigar,2)`. The two
|
* **cigar**: CIGAR returned as an array of shape :code:`(n_cigar,2)`. The two
|
||||||
numbers give the length and the operator of each CIGAR operation.
|
numbers give the length and the operator of each CIGAR operation.
|
||||||
|
|
||||||
|
* **MD**: the :code:`MD` tag as in the SAM format. It is an empty string unless
|
||||||
|
the :code:`MD` argument is applied when calling :code:`mappy.Aligner.map()`.
|
||||||
|
|
||||||
|
* **cs**: the :code:`cs` tag.
|
||||||
|
|
||||||
An :code:`Alignment` object can be converted to a string with :code:`str()` in
|
An :code:`Alignment` object can be converted to a string with :code:`str()` in
|
||||||
the following format:
|
the following format:
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -126,8 +126,8 @@ static char *mappy_fetch_seq(const mm_idx_t *mi, const char *name, int st, int e
|
||||||
*len = 0;
|
*len = 0;
|
||||||
rid = mm_idx_name2id(mi, name);
|
rid = mm_idx_name2id(mi, name);
|
||||||
if (rid < 0) return 0;
|
if (rid < 0) return 0;
|
||||||
if (st >= mi->seq[rid].len || st >= en) return 0;
|
if ((uint32_t)st >= mi->seq[rid].len || st >= en) return 0;
|
||||||
if (en < 0 || en > mi->seq[rid].len)
|
if (en < 0 || (uint32_t)en > mi->seq[rid].len)
|
||||||
en = mi->seq[rid].len;
|
en = mi->seq[rid].len;
|
||||||
s = (char*)malloc(en - st + 1);
|
s = (char*)malloc(en - st + 1);
|
||||||
*len = mm_idx_getseq(mi, rid, st, en, (uint8_t*)s);
|
*len = mm_idx_getseq(mi, rid, st, en, (uint8_t*)s);
|
||||||
|
|
|
||||||
|
|
@ -40,6 +40,7 @@ cdef extern from "minimap.h":
|
||||||
int32_t mid_occ
|
int32_t mid_occ
|
||||||
int32_t max_occ
|
int32_t max_occ
|
||||||
int mini_batch_size
|
int mini_batch_size
|
||||||
|
const char *split_prefix
|
||||||
|
|
||||||
int mm_set_opt(char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
|
int mm_set_opt(char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
|
||||||
int mm_verbose
|
int mm_verbose
|
||||||
|
|
@ -86,6 +87,9 @@ cdef extern from "minimap.h":
|
||||||
|
|
||||||
mm_tbuf_t *mm_tbuf_init()
|
mm_tbuf_t *mm_tbuf_init()
|
||||||
void mm_tbuf_destroy(mm_tbuf_t *b)
|
void mm_tbuf_destroy(mm_tbuf_t *b)
|
||||||
|
void *mm_tbuf_get_km(mm_tbuf_t *b)
|
||||||
|
int mm_gen_cs(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq, int no_iden)
|
||||||
|
int mm_gen_MD(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq)
|
||||||
|
|
||||||
#
|
#
|
||||||
# Helper header (because it is hard to expose mm_reg1_t with Cython)
|
# Helper header (because it is hard to expose mm_reg1_t with Cython)
|
||||||
|
|
|
||||||
|
|
@ -14,9 +14,9 @@ cdef class Alignment:
|
||||||
cdef int8_t _strand, _trans_strand
|
cdef int8_t _strand, _trans_strand
|
||||||
cdef uint8_t _mapq, _is_primary
|
cdef uint8_t _mapq, _is_primary
|
||||||
cdef int _seg_id
|
cdef int _seg_id
|
||||||
cdef _ctg, _cigar # these are python objects
|
cdef _ctg, _cigar, _cs, _MD # these are python objects
|
||||||
|
|
||||||
def __cinit__(self, ctg, cl, cs, ce, strand, qs, qe, mapq, cigar, is_primary, mlen, blen, NM, trans_strand, seg_id):
|
def __cinit__(self, ctg, cl, cs, ce, strand, qs, qe, mapq, cigar, is_primary, mlen, blen, NM, trans_strand, seg_id, cs_str, MD_str):
|
||||||
self._ctg = ctg if isinstance(ctg, str) else ctg.decode()
|
self._ctg = ctg if isinstance(ctg, str) else ctg.decode()
|
||||||
self._ctg_len, self._r_st, self._r_en = cl, cs, ce
|
self._ctg_len, self._r_st, self._r_en = cl, cs, ce
|
||||||
self._strand, self._q_st, self._q_en = strand, qs, qe
|
self._strand, self._q_st, self._q_en = strand, qs, qe
|
||||||
|
|
@ -26,6 +26,8 @@ cdef class Alignment:
|
||||||
self._is_primary = is_primary
|
self._is_primary = is_primary
|
||||||
self._trans_strand = trans_strand
|
self._trans_strand = trans_strand
|
||||||
self._seg_id = seg_id
|
self._seg_id = seg_id
|
||||||
|
self._cs = cs_str
|
||||||
|
self._MD = MD_str
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def ctg(self): return self._ctg
|
def ctg(self): return self._ctg
|
||||||
|
|
@ -72,6 +74,12 @@ cdef class Alignment:
|
||||||
@property
|
@property
|
||||||
def read_num(self): return self._seg_id + 1
|
def read_num(self): return self._seg_id + 1
|
||||||
|
|
||||||
|
@property
|
||||||
|
def cs(self): return self._cs
|
||||||
|
|
||||||
|
@property
|
||||||
|
def MD(self): return self._MD
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def cigar_str(self):
|
def cigar_str(self):
|
||||||
return "".join(map(lambda x: str(x[0]) + 'MIDNSH'[x[1]], self._cigar))
|
return "".join(map(lambda x: str(x[0]) + 'MIDNSH'[x[1]], self._cigar))
|
||||||
|
|
@ -85,8 +93,10 @@ cdef class Alignment:
|
||||||
if self._trans_strand > 0: ts = 'ts:A:+'
|
if self._trans_strand > 0: ts = 'ts:A:+'
|
||||||
elif self._trans_strand < 0: ts = 'ts:A:-'
|
elif self._trans_strand < 0: ts = 'ts:A:-'
|
||||||
else: ts = 'ts:A:.'
|
else: ts = 'ts:A:.'
|
||||||
return "\t".join([str(self._q_st), str(self._q_en), strand, self._ctg, str(self._ctg_len), str(self._r_st), str(self._r_en),
|
a = [str(self._q_st), str(self._q_en), strand, self._ctg, str(self._ctg_len), str(self._r_st), str(self._r_en),
|
||||||
str(self._mlen), str(self._blen), str(self._mapq), tp, ts, "cg:Z:" + self.cigar_str])
|
str(self._mlen), str(self._blen), str(self._mapq), tp, ts, "cg:Z:" + self.cigar_str]
|
||||||
|
if self._cs != "": a.append("cs:Z:" + self._cs)
|
||||||
|
return "\t".join(a)
|
||||||
|
|
||||||
cdef class ThreadBuffer:
|
cdef class ThreadBuffer:
|
||||||
cdef cmappy.mm_tbuf_t *_b
|
cdef cmappy.mm_tbuf_t *_b
|
||||||
|
|
@ -135,18 +145,23 @@ cdef class Aligner:
|
||||||
def __bool__(self):
|
def __bool__(self):
|
||||||
return (self._idx != NULL)
|
return (self._idx != NULL)
|
||||||
|
|
||||||
def map(self, seq, seq2=None, buf=None, max_frag_len=None):
|
def map(self, seq, seq2=None, buf=None, cs=False, MD=False, max_frag_len=None):
|
||||||
cdef cmappy.mm_reg1_t *regs
|
cdef cmappy.mm_reg1_t *regs
|
||||||
cdef cmappy.mm_hitpy_t h
|
cdef cmappy.mm_hitpy_t h
|
||||||
cdef ThreadBuffer b
|
cdef ThreadBuffer b
|
||||||
cdef int n_regs
|
cdef int n_regs
|
||||||
|
cdef char *cs_str = NULL
|
||||||
|
cdef int l_cs_str, m_cs_str = 0
|
||||||
|
cdef void *km
|
||||||
cdef cmappy.mm_mapopt_t map_opt
|
cdef cmappy.mm_mapopt_t map_opt
|
||||||
|
|
||||||
map_opt = self.map_opt
|
map_opt = self.map_opt
|
||||||
if max_frag_len is not None: map_opt.max_frag_len = max_frag_len
|
if max_frag_len is not None: map_opt.max_frag_len = max_frag_len
|
||||||
|
|
||||||
if self._idx is NULL: return None
|
if self._idx is NULL: return None
|
||||||
if buf is None: b = ThreadBuffer()
|
if buf is None: b = ThreadBuffer()
|
||||||
else: b = buf
|
else: b = buf
|
||||||
|
km = cmappy.mm_tbuf_get_km(b._b)
|
||||||
|
|
||||||
_seq = seq if isinstance(seq, bytes) else seq.encode()
|
_seq = seq if isinstance(seq, bytes) else seq.encode()
|
||||||
if seq2 is None:
|
if seq2 is None:
|
||||||
|
|
@ -157,13 +172,21 @@ cdef class Aligner:
|
||||||
|
|
||||||
for i in range(n_regs):
|
for i in range(n_regs):
|
||||||
cmappy.mm_reg2hitpy(self._idx, ®s[i], &h)
|
cmappy.mm_reg2hitpy(self._idx, ®s[i], &h)
|
||||||
cigar = []
|
cigar, _cs, _MD = [], '', ''
|
||||||
for k in range(h.n_cigar32):
|
for k in range(h.n_cigar32): # convert the 32-bit CIGAR encoding to Python array
|
||||||
c = h.cigar32[k]
|
c = h.cigar32[k]
|
||||||
cigar.append([c>>4, c&0xf])
|
cigar.append([c>>4, c&0xf])
|
||||||
yield Alignment(h.ctg, h.ctg_len, h.ctg_start, h.ctg_end, h.strand, h.qry_start, h.qry_end, h.mapq, cigar, h.is_primary, h.mlen, h.blen, h.NM, h.trans_strand, h.seg_id)
|
if cs or MD: # generate the cs and/or the MD tag, if requested
|
||||||
|
if cs:
|
||||||
|
l_cs_str = cmappy.mm_gen_cs(km, &cs_str, &m_cs_str, self._idx, ®s[i], _seq, 1)
|
||||||
|
_cs = cs_str[:l_cs_str] if isinstance(cs_str, str) else cs_str[:l_cs_str].decode()
|
||||||
|
if MD:
|
||||||
|
l_cs_str = cmappy.mm_gen_MD(km, &cs_str, &m_cs_str, self._idx, ®s[i], _seq)
|
||||||
|
_MD = cs_str[:l_cs_str] if isinstance(cs_str, str) else cs_str[:l_cs_str].decode()
|
||||||
|
yield Alignment(h.ctg, h.ctg_len, h.ctg_start, h.ctg_end, h.strand, h.qry_start, h.qry_end, h.mapq, cigar, h.is_primary, h.mlen, h.blen, h.NM, h.trans_strand, h.seg_id, _cs, _MD)
|
||||||
cmappy.mm_free_reg1(®s[i])
|
cmappy.mm_free_reg1(®s[i])
|
||||||
free(regs)
|
free(regs)
|
||||||
|
free(cs_str)
|
||||||
|
|
||||||
def seq(self, str name, int start=0, int end=0x7fffffff):
|
def seq(self, str name, int start=0, int end=0x7fffffff):
|
||||||
cdef int l
|
cdef int l
|
||||||
|
|
|
||||||
|
|
@ -4,7 +4,7 @@ import sys, getopt
|
||||||
import mappy as mp
|
import mappy as mp
|
||||||
|
|
||||||
def main(argv):
|
def main(argv):
|
||||||
opts, args = getopt.getopt(argv[1:], "x:n:m:k:w:r:")
|
opts, args = getopt.getopt(argv[1:], "x:n:m:k:w:r:c")
|
||||||
if len(args) < 2:
|
if len(args) < 2:
|
||||||
print("Usage: minimap2.py [options] <ref.fa>|<ref.mmi> <query.fq>")
|
print("Usage: minimap2.py [options] <ref.fa>|<ref.mmi> <query.fq>")
|
||||||
print("Options:")
|
print("Options:")
|
||||||
|
|
@ -14,9 +14,10 @@ def main(argv):
|
||||||
print(" -k INT k-mer length")
|
print(" -k INT k-mer length")
|
||||||
print(" -w INT minimizer window length")
|
print(" -w INT minimizer window length")
|
||||||
print(" -r INT band width")
|
print(" -r INT band width")
|
||||||
|
print(" -c output the cs tag")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
preset, min_cnt, min_sc, k, w, bw = None, None, None, None, None, None
|
preset, min_cnt, min_sc, k, w, bw, out_cs = None, None, None, None, None, None, False
|
||||||
for opt, arg in opts:
|
for opt, arg in opts:
|
||||||
if opt == '-x': preset = arg
|
if opt == '-x': preset = arg
|
||||||
elif opt == '-n': min_cnt = int(arg)
|
elif opt == '-n': min_cnt = int(arg)
|
||||||
|
|
@ -24,11 +25,12 @@ def main(argv):
|
||||||
elif opt == '-r': bw = int(arg)
|
elif opt == '-r': bw = int(arg)
|
||||||
elif opt == '-k': k = int(arg)
|
elif opt == '-k': k = int(arg)
|
||||||
elif opt == '-w': w = int(arg)
|
elif opt == '-w': w = int(arg)
|
||||||
|
elif opt == '-c': out_cs = True
|
||||||
|
|
||||||
a = mp.Aligner(args[0], preset=preset, min_cnt=min_cnt, min_chain_score=min_sc, k=k, w=w, bw=bw)
|
a = mp.Aligner(args[0], preset=preset, min_cnt=min_cnt, min_chain_score=min_sc, k=k, w=w, bw=bw)
|
||||||
if not a: raise Exception("ERROR: failed to load/build index file '{}'".format(args[0]))
|
if not a: raise Exception("ERROR: failed to load/build index file '{}'".format(args[0]))
|
||||||
for name, seq, qual in mp.fastx_read(args[1]): # read one sequence
|
for name, seq, qual in mp.fastx_read(args[1]): # read one sequence
|
||||||
for h in a.map(seq): # traverse hits
|
for h in a.map(seq, cs=out_cs): # traverse hits
|
||||||
print('{}\t{}\t{}'.format(name, len(seq), h))
|
print('{}\t{}\t{}'.format(name, len(seq), h))
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
||||||
2
setup.py
2
setup.py
|
|
@ -35,7 +35,7 @@ setup(
|
||||||
ext_modules = [Extension('mappy',
|
ext_modules = [Extension('mappy',
|
||||||
sources = [module_src, 'align.c', 'bseq.c', 'chain.c', 'format.c', 'hit.c', 'index.c', 'pe.c', 'options.c',
|
sources = [module_src, 'align.c', 'bseq.c', 'chain.c', 'format.c', 'hit.c', 'index.c', 'pe.c', 'options.c',
|
||||||
'ksw2_extd2_sse.c', 'ksw2_exts2_sse.c', 'ksw2_extz2_sse.c', 'ksw2_ll_sse.c',
|
'ksw2_extd2_sse.c', 'ksw2_exts2_sse.c', 'ksw2_extz2_sse.c', 'ksw2_ll_sse.c',
|
||||||
'kalloc.c', 'kthread.c', 'map.c', 'misc.c', 'sdust.c', 'sketch.c', 'esterr.c'],
|
'kalloc.c', 'kthread.c', 'map.c', 'misc.c', 'sdust.c', 'sketch.c', 'esterr.c', 'splitidx.c'],
|
||||||
depends = ['minimap.h', 'bseq.h', 'kalloc.h', 'kdq.h', 'khash.h', 'kseq.h', 'ksort.h',
|
depends = ['minimap.h', 'bseq.h', 'kalloc.h', 'kdq.h', 'khash.h', 'kseq.h', 'ksort.h',
|
||||||
'ksw2.h', 'kthread.h', 'kvec.h', 'mmpriv.h', 'sdust.h',
|
'ksw2.h', 'kthread.h', 'kvec.h', 'mmpriv.h', 'sdust.h',
|
||||||
'python/cmappy.h', 'python/cmappy.pxd'],
|
'python/cmappy.h', 'python/cmappy.pxd'],
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue