diff --git a/format.c b/format.c index a4b8234..eabfd1a 100644 --- a/format.c +++ b/format.c @@ -133,10 +133,10 @@ void mm_write_sam_hdr(const mm_idx_t *idx, const char *rg, const char *ver, int free(str.s); } -static void write_cs_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq, const mm_reg1_t *r, char *tmp, int no_iden) +static void write_cs_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq, const mm_reg1_t *r, char *tmp, int no_iden, int write_tag) { int i, q_off, t_off; - mm_sprintf_lite(s, "\tcs:Z:"); + if (write_tag) mm_sprintf_lite(s, "\tcs:Z:"); for (i = q_off = t_off = 0; i < (int)r->p->n_cigar; ++i) { int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4; assert(op >= 0 && op <= 3); @@ -181,10 +181,10 @@ static void write_cs_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq assert(t_off == r->re - r->rs && q_off == r->qe - r->qs); } -static void write_MD_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq, const mm_reg1_t *r, char *tmp) +static void write_MD_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq, const mm_reg1_t *r, char *tmp, int write_tag) { int i, q_off, t_off, l_MD = 0; - mm_sprintf_lite(s, "\tMD:Z:"); + if (write_tag) mm_sprintf_lite(s, "\tMD:Z:"); for (i = q_off = t_off = 0; i < (int)r->p->n_cigar; ++i) { int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4; assert(op >= 0 && op <= 2); // introns (aka reference skips) are not supported @@ -210,7 +210,7 @@ static void write_MD_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq assert(t_off == r->re - r->rs && q_off == r->qe - r->qs); } -static void write_cs_or_MD(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int no_iden, int is_MD) +static void write_cs_or_MD(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int no_iden, int is_MD, int write_tag) { extern unsigned char seq_nt4_table[256]; int i; @@ -230,11 +230,34 @@ static void write_cs_or_MD(void *km, kstring_t *s, const mm_idx_t *mi, const mm_ qseq[r->qe - i - 1] = c >= 4? 4 : 3 - c; } } - if (is_MD) write_MD_core(s, tseq, qseq, r, tmp); - else write_cs_core(s, tseq, qseq, r, tmp, no_iden); + if (is_MD) write_MD_core(s, tseq, qseq, r, tmp, write_tag); + else write_cs_core(s, tseq, qseq, r, tmp, no_iden, write_tag); kfree(km, qseq); kfree(km, tseq); kfree(km, tmp); } +int mm_gen_cs_or_MD(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq, int is_MD, int no_iden) +{ + mm_bseq1_t t; + kstring_t str; + str.s = *buf, str.l = 0, str.m = *max_len; + t.l_seq = strlen(seq); + t.seq = (char*)seq; + write_cs_or_MD(km, &str, mi, &t, r, no_iden, is_MD, 0); + *max_len = str.m; + *buf = str.s; + return str.l; +} + +int mm_gen_cs(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq, int no_iden) +{ + return mm_gen_cs_or_MD(km, buf, max_len, mi, r, seq, 0, no_iden); +} + +int mm_gen_MD(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq) +{ + return mm_gen_cs_or_MD(km, buf, max_len, mi, r, seq, 1, 0); +} + static inline void write_tags(kstring_t *s, const mm_reg1_t *r) { int type; @@ -277,7 +300,7 @@ void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MIDNSHP=XB"[r->p->cigar[k]&0xf]); } if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_MD))) - write_cs_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD); + write_cs_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD, 1); if ((opt_flag & MM_F_COPY_COMMENT) && t->comment) mm_sprintf_lite(s, "\t%s", t->comment); } @@ -476,7 +499,7 @@ void mm_write_sam2(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int se } } if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_MD))) - write_cs_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD); + write_cs_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD, 1); if (cigar_in_tag) write_sam_cigar(s, flag, 1, t->l_seq, r, opt_flag); } diff --git a/main.c b/main.c index 00324ed..138edca 100644 --- a/main.c +++ b/main.c @@ -10,7 +10,7 @@ #include "getopt.h" #endif -#define MM_VERSION "2.11-r815-dirty" +#define MM_VERSION "2.11-r819-dirty" #ifdef __linux__ #include diff --git a/map.c b/map.c index ba04d32..636344f 100644 --- a/map.c +++ b/map.c @@ -29,6 +29,11 @@ void mm_tbuf_destroy(mm_tbuf_t *b) free(b); } +void *mm_tbuf_get_km(mm_tbuf_t *b) +{ + return b->km; +} + static int mm_dust_minier(void *km, int n, mm128_t *a, int l_seq, const char *seq, int sdust_thres) { int n_dreg, j, k, u = 0; @@ -682,7 +687,7 @@ int mm_split_merge(int n_segs, const char **fn, const mm_mapopt_t *opt, int n_sp for (pl.rid_shift[0] = 0, i = 1; i < n_split_idx; ++i) pl.rid_shift[i] += pl.rid_shift[i - 1]; if (opt->flag & MM_F_OUT_SAM) - for (i = 0; i < pl.mi->n_seq; ++i) + for (i = 0; i < (int32_t)pl.mi->n_seq; ++i) printf("@SQ\tSN:%s\tLN:%d\n", pl.mi->seq[i].name, pl.mi->seq[i].len); kt_pipeline(2, worker_pipeline, &pl, 3); diff --git a/minimap.h b/minimap.h index 8d708db..40a5d25 100644 --- a/minimap.h +++ b/minimap.h @@ -301,6 +301,8 @@ mm_tbuf_t *mm_tbuf_init(void); */ void mm_tbuf_destroy(mm_tbuf_t *b); +void *mm_tbuf_get_km(mm_tbuf_t *b); + /** * Align a query sequence against an index * @@ -337,6 +339,22 @@ int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int int mm_map_file_frag(const mm_idx_t *idx, int n_segs, const char **fn, const mm_mapopt_t *opt, int n_threads); +/** + * Generate the cs tag (new in 2.12) + * + * @param km memory blocks; set to NULL if unsure + * @param buf buffer to write the cs/MD tag; typicall NULL on the first call + * @param max_len max length of the buffer; typically set to 0 on the first call + * @param mi index + * @param r alignment + * @param seq query sequence + * @param no_iden true to use : instead of = + * + * @return the length of cs + */ +int mm_gen_cs(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq, int no_iden); +int mm_gen_MD(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq); + // query sequence name and sequence in the minimap2 index int mm_idx_index_name(mm_idx_t *mi); int mm_idx_name2id(const mm_idx_t *mi, const char *name); diff --git a/options.c b/options.c index 2c816f8..d43ebec 100644 --- a/options.c +++ b/options.c @@ -132,6 +132,11 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) int mm_check_opt(const mm_idxopt_t *io, const mm_mapopt_t *mo) { + if (mo->split_prefix && (mo->flag & (MM_F_OUT_CS|MM_F_OUT_MD))) { + if (mm_verbose >= 1) + fprintf(stderr, "[ERROR]\033[1;31m --cs or --MD doesn't work with --split-prefix\033[0m\n"); + return -6; + } if (io->k <= 0 || io->w <= 0) { if (mm_verbose >= 1) fprintf(stderr, "[ERROR]\033[1;31m -k and -w must be positive\033[0m\n"); diff --git a/python/README.rst b/python/README.rst index ff15bab..a1e346f 100644 --- a/python/README.rst +++ b/python/README.rst @@ -43,7 +43,7 @@ The following Python script demonstrates the key functionality of mappy: APIs ---- -Mappy implements two classes and one global function. +Mappy implements two classes and two global function. Class mappy.Aligner ~~~~~~~~~~~~~~~~~~~ @@ -83,13 +83,15 @@ This constructor accepts the following arguments: .. code:: python - mappy.Aligner.map(seq, seq2=None) + mappy.Aligner.map(seq, seq2=None, cs=False, MD=False) This method aligns :code:`seq` against the index. It is a generator, *yielding* a series of :code:`mappy.Alignment` objects. If :code:`seq2` is present, mappy performs paired-end alignment, assuming the two ends are in the FR orientation. Alignments of the two ends can be distinguished by the :code:`read_num` field -(see Class mappy.Alignment below). +(see Class mappy.Alignment below). Argument :code:`cs` asks mappy to generate +the :code:`cs` tag; :code:`MD` is similar. These two arguments might slightly +degrade performance and are not enabled by default. .. code:: python @@ -139,6 +141,11 @@ properties: * **cigar**: CIGAR returned as an array of shape :code:`(n_cigar,2)`. The two numbers give the length and the operator of each CIGAR operation. +* **MD**: the :code:`MD` tag as in the SAM format. It is an empty string unless + the :code:`MD` argument is applied when calling :code:`mappy.Aligner.map()`. + +* **cs**: the :code:`cs` tag. + An :code:`Alignment` object can be converted to a string with :code:`str()` in the following format: diff --git a/python/cmappy.h b/python/cmappy.h index 1faa540..690c9c2 100644 --- a/python/cmappy.h +++ b/python/cmappy.h @@ -126,8 +126,8 @@ static char *mappy_fetch_seq(const mm_idx_t *mi, const char *name, int st, int e *len = 0; rid = mm_idx_name2id(mi, name); if (rid < 0) return 0; - if (st >= mi->seq[rid].len || st >= en) return 0; - if (en < 0 || en > mi->seq[rid].len) + if ((uint32_t)st >= mi->seq[rid].len || st >= en) return 0; + if (en < 0 || (uint32_t)en > mi->seq[rid].len) en = mi->seq[rid].len; s = (char*)malloc(en - st + 1); *len = mm_idx_getseq(mi, rid, st, en, (uint8_t*)s); diff --git a/python/cmappy.pxd b/python/cmappy.pxd index 39be8a4..871d326 100644 --- a/python/cmappy.pxd +++ b/python/cmappy.pxd @@ -40,6 +40,7 @@ cdef extern from "minimap.h": int32_t mid_occ int32_t max_occ int mini_batch_size + const char *split_prefix int mm_set_opt(char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) int mm_verbose @@ -86,6 +87,9 @@ cdef extern from "minimap.h": mm_tbuf_t *mm_tbuf_init() void mm_tbuf_destroy(mm_tbuf_t *b) + void *mm_tbuf_get_km(mm_tbuf_t *b) + int mm_gen_cs(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq, int no_iden) + int mm_gen_MD(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq) # # Helper header (because it is hard to expose mm_reg1_t with Cython) diff --git a/python/mappy.pyx b/python/mappy.pyx index 4ad0b8f..8fc1543 100644 --- a/python/mappy.pyx +++ b/python/mappy.pyx @@ -14,9 +14,9 @@ cdef class Alignment: cdef int8_t _strand, _trans_strand cdef uint8_t _mapq, _is_primary cdef int _seg_id - cdef _ctg, _cigar # these are python objects + cdef _ctg, _cigar, _cs, _MD # these are python objects - def __cinit__(self, ctg, cl, cs, ce, strand, qs, qe, mapq, cigar, is_primary, mlen, blen, NM, trans_strand, seg_id): + def __cinit__(self, ctg, cl, cs, ce, strand, qs, qe, mapq, cigar, is_primary, mlen, blen, NM, trans_strand, seg_id, cs_str, MD_str): self._ctg = ctg if isinstance(ctg, str) else ctg.decode() self._ctg_len, self._r_st, self._r_en = cl, cs, ce self._strand, self._q_st, self._q_en = strand, qs, qe @@ -26,6 +26,8 @@ cdef class Alignment: self._is_primary = is_primary self._trans_strand = trans_strand self._seg_id = seg_id + self._cs = cs_str + self._MD = MD_str @property def ctg(self): return self._ctg @@ -72,6 +74,12 @@ cdef class Alignment: @property def read_num(self): return self._seg_id + 1 + @property + def cs(self): return self._cs + + @property + def MD(self): return self._MD + @property def cigar_str(self): return "".join(map(lambda x: str(x[0]) + 'MIDNSH'[x[1]], self._cigar)) @@ -85,8 +93,10 @@ cdef class Alignment: if self._trans_strand > 0: ts = 'ts:A:+' elif self._trans_strand < 0: ts = 'ts:A:-' else: ts = 'ts:A:.' - return "\t".join([str(self._q_st), str(self._q_en), strand, self._ctg, str(self._ctg_len), str(self._r_st), str(self._r_en), - str(self._mlen), str(self._blen), str(self._mapq), tp, ts, "cg:Z:" + self.cigar_str]) + a = [str(self._q_st), str(self._q_en), strand, self._ctg, str(self._ctg_len), str(self._r_st), str(self._r_en), + str(self._mlen), str(self._blen), str(self._mapq), tp, ts, "cg:Z:" + self.cigar_str] + if self._cs != "": a.append("cs:Z:" + self._cs) + return "\t".join(a) cdef class ThreadBuffer: cdef cmappy.mm_tbuf_t *_b @@ -135,18 +145,23 @@ cdef class Aligner: def __bool__(self): return (self._idx != NULL) - def map(self, seq, seq2=None, buf=None, max_frag_len=None): + def map(self, seq, seq2=None, buf=None, cs=False, MD=False, max_frag_len=None): cdef cmappy.mm_reg1_t *regs cdef cmappy.mm_hitpy_t h cdef ThreadBuffer b cdef int n_regs + cdef char *cs_str = NULL + cdef int l_cs_str, m_cs_str = 0 + cdef void *km cdef cmappy.mm_mapopt_t map_opt + map_opt = self.map_opt if max_frag_len is not None: map_opt.max_frag_len = max_frag_len if self._idx is NULL: return None if buf is None: b = ThreadBuffer() else: b = buf + km = cmappy.mm_tbuf_get_km(b._b) _seq = seq if isinstance(seq, bytes) else seq.encode() if seq2 is None: @@ -157,13 +172,21 @@ cdef class Aligner: for i in range(n_regs): cmappy.mm_reg2hitpy(self._idx, ®s[i], &h) - cigar = [] - for k in range(h.n_cigar32): + cigar, _cs, _MD = [], '', '' + for k in range(h.n_cigar32): # convert the 32-bit CIGAR encoding to Python array c = h.cigar32[k] cigar.append([c>>4, c&0xf]) - yield Alignment(h.ctg, h.ctg_len, h.ctg_start, h.ctg_end, h.strand, h.qry_start, h.qry_end, h.mapq, cigar, h.is_primary, h.mlen, h.blen, h.NM, h.trans_strand, h.seg_id) + if cs or MD: # generate the cs and/or the MD tag, if requested + if cs: + l_cs_str = cmappy.mm_gen_cs(km, &cs_str, &m_cs_str, self._idx, ®s[i], _seq, 1) + _cs = cs_str[:l_cs_str] if isinstance(cs_str, str) else cs_str[:l_cs_str].decode() + if MD: + l_cs_str = cmappy.mm_gen_MD(km, &cs_str, &m_cs_str, self._idx, ®s[i], _seq) + _MD = cs_str[:l_cs_str] if isinstance(cs_str, str) else cs_str[:l_cs_str].decode() + yield Alignment(h.ctg, h.ctg_len, h.ctg_start, h.ctg_end, h.strand, h.qry_start, h.qry_end, h.mapq, cigar, h.is_primary, h.mlen, h.blen, h.NM, h.trans_strand, h.seg_id, _cs, _MD) cmappy.mm_free_reg1(®s[i]) free(regs) + free(cs_str) def seq(self, str name, int start=0, int end=0x7fffffff): cdef int l diff --git a/python/minimap2.py b/python/minimap2.py index 28bbfe5..2f89fcd 100755 --- a/python/minimap2.py +++ b/python/minimap2.py @@ -4,7 +4,7 @@ import sys, getopt import mappy as mp def main(argv): - opts, args = getopt.getopt(argv[1:], "x:n:m:k:w:r:") + opts, args = getopt.getopt(argv[1:], "x:n:m:k:w:r:c") if len(args) < 2: print("Usage: minimap2.py [options] | ") print("Options:") @@ -14,9 +14,10 @@ def main(argv): print(" -k INT k-mer length") print(" -w INT minimizer window length") print(" -r INT band width") + print(" -c output the cs tag") sys.exit(1) - preset, min_cnt, min_sc, k, w, bw = None, None, None, None, None, None + preset, min_cnt, min_sc, k, w, bw, out_cs = None, None, None, None, None, None, False for opt, arg in opts: if opt == '-x': preset = arg elif opt == '-n': min_cnt = int(arg) @@ -24,11 +25,12 @@ def main(argv): elif opt == '-r': bw = int(arg) elif opt == '-k': k = int(arg) elif opt == '-w': w = int(arg) + elif opt == '-c': out_cs = True a = mp.Aligner(args[0], preset=preset, min_cnt=min_cnt, min_chain_score=min_sc, k=k, w=w, bw=bw) if not a: raise Exception("ERROR: failed to load/build index file '{}'".format(args[0])) for name, seq, qual in mp.fastx_read(args[1]): # read one sequence - for h in a.map(seq): # traverse hits + for h in a.map(seq, cs=out_cs): # traverse hits print('{}\t{}\t{}'.format(name, len(seq), h)) if __name__ == "__main__": diff --git a/setup.py b/setup.py index 259c1ae..7f52909 100644 --- a/setup.py +++ b/setup.py @@ -35,7 +35,7 @@ setup( ext_modules = [Extension('mappy', sources = [module_src, 'align.c', 'bseq.c', 'chain.c', 'format.c', 'hit.c', 'index.c', 'pe.c', 'options.c', 'ksw2_extd2_sse.c', 'ksw2_exts2_sse.c', 'ksw2_extz2_sse.c', 'ksw2_ll_sse.c', - 'kalloc.c', 'kthread.c', 'map.c', 'misc.c', 'sdust.c', 'sketch.c', 'esterr.c'], + 'kalloc.c', 'kthread.c', 'map.c', 'misc.c', 'sdust.c', 'sketch.c', 'esterr.c', 'splitidx.c'], depends = ['minimap.h', 'bseq.h', 'kalloc.h', 'kdq.h', 'khash.h', 'kseq.h', 'ksort.h', 'ksw2.h', 'kthread.h', 'kvec.h', 'mmpriv.h', 'sdust.h', 'python/cmappy.h', 'python/cmappy.pxd'],