diff --git a/example.c b/example.c index 3eac4c2..7b03d83 100644 --- a/example.c +++ b/example.c @@ -43,8 +43,7 @@ int main(int argc, char *argv[]) mm_reg1_t *r = ®[j]; assert(r->p); // with MM_F_CIGAR, this should not be NULL printf("%s\t%d\t%d\t%d\t%c\t", ks->name.s, ks->seq.l, r->qs, r->qe, "+-"[r->rev]); - printf("%s\t%d\t%d\t%d\t%d\t%d\t%d\tcg:Z:", mi->seq[r->rid].name, mi->seq[r->rid].len, r->rs, r->re, - r->p->blen - r->p->n_ambi - r->p->n_diff, r->p->blen, r->mapq); + printf("%s\t%d\t%d\t%d\t%d\t%d\t%d\tcg:Z:", mi->seq[r->rid].name, mi->seq[r->rid].len, r->rs, r->re, r->mlen, r->blen, r->mapq); for (i = 0; i < r->p->n_cigar; ++i) // IMPORTANT: this gives the CIGAR in the aligned regions. NO soft/hard clippings! printf("%d%c", r->p->cigar[i]>>4, "MIDSHN"[r->p->cigar[i]&0xf]); putchar('\n'); diff --git a/python/README.rst b/python/README.rst index 8629c22..bfee564 100644 --- a/python/README.rst +++ b/python/README.rst @@ -104,9 +104,13 @@ properties: * **mapq**: mapping quality -* **NM**: number of mismatches and gaps in the alignment - * **blen**: length of the alignment, including both alignment matches and gaps + but excluding ambiguous bases. + +* **mlen**: length of the matching bases in the alignment, excluding ambiguous + base matches. + +* **NM**: number of mismatches, gaps and ambiguous poistions in the alignment * **trans_strand**: transcript strand. +1 if on the forward strand; -1 if on the reverse strand; 0 if unknown @@ -124,7 +128,7 @@ the following format: :: - q_st q_en strand ctg ctg_len r_st r_en blen-NM blen mapq cg:Z:cigar_str + q_st q_en strand ctg ctg_len r_st r_en mlen blen mapq cg:Z:cigar_str It is effectively the PAF format without the QueryName and QueryLength columns (the first two columns in PAF). diff --git a/python/cmappy.h b/python/cmappy.h index c696a80..f691035 100644 --- a/python/cmappy.h +++ b/python/cmappy.h @@ -12,7 +12,7 @@ typedef struct { const char *ctg; int32_t ctg_start, ctg_end; int32_t qry_start, qry_end; - int32_t blen, NM, ctg_len; + int32_t blen, mlen, NM, ctg_len; uint8_t mapq, is_primary; int8_t strand, trans_strand; int32_t n_cigar32; @@ -27,8 +27,9 @@ static inline void mm_reg2hitpy(const mm_idx_t *mi, mm_reg1_t *r, mm_hitpy_t *h) h->qry_start = r->qs, h->qry_end = r->qe; h->strand = r->rev? -1 : 1; h->mapq = r->mapq; - h->blen = r->p->blen; - h->NM = r->p->n_diff; + h->mlen = r->mlen; + h->blen = r->blen; + h->NM = r->blen - r->mlen + r->p->n_ambi; h->trans_strand = r->p->trans_strand == 1? 1 : r->p->trans_strand == 2? -1 : 0; h->is_primary = (r->id == r->parent); h->n_cigar32 = r->p->n_cigar; diff --git a/python/cmappy.pxd b/python/cmappy.pxd index 60a8367..4600428 100644 --- a/python/cmappy.pxd +++ b/python/cmappy.pxd @@ -15,17 +15,20 @@ cdef extern from "minimap.h": int flag int bw int max_gap, max_gap_ref + int max_frag_len int max_chain_skip int min_cnt int min_chain_score float mask_level float pri_ratio int best_n + float min_iden int max_join_long, max_join_short int min_join_flank_sc int a, b, q, e, q2, e2 int noncan int zdrop + int end_bonus int min_dp_max int min_ksw_len int pe_ori, pe_bonus @@ -86,7 +89,7 @@ cdef extern from "cmappy.h": const char *ctg int32_t ctg_start, ctg_end int32_t qry_start, qry_end - int32_t blen, NM, ctg_len + int32_t blen, mlen, NM, ctg_len uint8_t mapq, is_primary int8_t strand, trans_strand int32_t n_cigar32 diff --git a/python/mappy.pyx b/python/mappy.pyx index cc1e083..0f52bad 100644 --- a/python/mappy.pyx +++ b/python/mappy.pyx @@ -7,15 +7,15 @@ cmappy.mm_reset_timer() cdef class Alignment: cdef int _ctg_len, _r_st, _r_en cdef int _q_st, _q_en - cdef int _NM, _blen + cdef int _NM, _mlen, _blen cdef int8_t _strand, _trans_strand cdef uint8_t _mapq, _is_primary cdef _ctg, _cigar # these are python objects - def __cinit__(self, ctg, cl, cs, ce, strand, qs, qe, mapq, cigar, is_primary, blen, NM, trans_strand): + def __cinit__(self, ctg, cl, cs, ce, strand, qs, qe, mapq, cigar, is_primary, mlen, blen, NM, trans_strand): self._ctg, self._ctg_len, self._r_st, self._r_en = str(ctg), cl, cs, ce self._strand, self._q_st, self._q_en = strand, qs, qe - self._NM, self._blen = NM, blen + self._NM, self._mlen, self._blen = NM, mlen, blen self._mapq = mapq self._cigar = cigar self._is_primary = is_primary @@ -39,6 +39,12 @@ cdef class Alignment: @property def trans_strand(self): return self._trans_strand + @property + def blen(self): return self._blen + + @property + def mlen(self): return self._mlen + @property def NM(self): return self._NM @@ -71,7 +77,7 @@ cdef class Alignment: elif self._trans_strand < 0: ts = 'ts:A:-' else: ts = 'ts:A:.' return "\t".join([str(self._q_st), str(self._q_en), strand, self._ctg, str(self._ctg_len), str(self._r_st), str(self._r_en), - str(self._blen - self._NM), str(self._blen), str(self._mapq), tp, ts, "cg:Z:" + self.cigar_str]) + str(self._mlen), str(self._blen), str(self._mapq), tp, ts, "cg:Z:" + self.cigar_str]) cdef class ThreadBuffer: cdef cmappy.mm_tbuf_t *_b @@ -135,7 +141,7 @@ cdef class Aligner: for k in range(h.n_cigar32): c = h.cigar32[k] cigar.append([c>>4, c&0xf]) - yield Alignment(h.ctg, h.ctg_len, h.ctg_start, h.ctg_end, h.strand, h.qry_start, h.qry_end, h.mapq, cigar, h.is_primary, h.blen, h.NM, h.trans_strand) + yield Alignment(h.ctg, h.ctg_len, h.ctg_start, h.ctg_end, h.strand, h.qry_start, h.qry_end, h.mapq, cigar, h.is_primary, h.mlen, h.blen, h.NM, h.trans_strand) cmappy.mm_free_reg1(®s[i]) free(regs) diff --git a/setup.py b/setup.py index 69ec231..ab72226 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ setup( keywords = 'sequence-alignment', scripts = ['python/minimap2.py'], ext_modules = [Extension('mappy', - sources = [module_src, 'align.c', 'bseq.c', 'chain.c', 'format.c', 'hit.c', 'index.c', + sources = [module_src, 'align.c', 'bseq.c', 'chain.c', 'format.c', 'hit.c', 'index.c', 'pe.c', 'ksw2_extd2_sse.c', 'ksw2_exts2_sse.c', 'ksw2_extz2_sse.c', 'ksw2_ll_sse.c', 'kalloc.c', 'kthread.c', 'map.c', 'misc.c', 'sdust.c', 'sketch.c'], depends = ['minimap.h', 'bseq.h', 'kalloc.h', 'kdq.h', 'khash.h', 'kseq.h', 'ksort.h',