diff --git a/align.c b/align.c index 2e0defb..716d2b8 100644 --- a/align.c +++ b/align.c @@ -355,23 +355,53 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int kfree(km, tseq); } -static void mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, uint8_t *qseq0[2], const mm_reg1_t *r1, const mm_reg1_t *r2, mm_reg1_t *r_inv, ksw_extz_t *ez) +static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, uint8_t *qseq0[2], const mm_reg1_t *r1, const mm_reg1_t *r2, mm_reg1_t *r_inv, ksw_extz_t *ez) { - int tlen, qlen; + int tl, ql, ret = 0; uint8_t *tseq, *qseq; int8_t mat[25]; memset(r_inv, 0, sizeof(mm_reg1_t)); - if (r1->rid != r2->rid || r1->rev != r2->rev) return; - qlen = r2->qs - r1->qe; - tlen = r2->rs - r1->re; - if (qlen < 0 || qlen > opt->max_gap) return; - if (tlen < 0 || tlen > opt->max_gap) return; + if (!(r1->split&1) || !(r2->split&2)) return 0; + if (r1->rid != r2->rid || r1->rev != r2->rev) return 0; + ql = r2->qs - r1->qe; + tl = r2->rs - r1->re; + if (ql < 0 || ql > opt->max_gap) return 0; + if (tl < 0 || tl > opt->max_gap) return 0; ksw_gen_simple_mat(5, mat, opt->a, opt->b); - tseq = (uint8_t*)kmalloc(km, tlen); + tseq = (uint8_t*)kmalloc(km, tl); mm_idx_getseq(mi, r1->rid, r1->re, r2->rs, tseq); - qseq = &qseq0[!r1->rev][r1->qe]; - mm_align_pair(km, opt, qlen, qseq, tlen, tseq, mat, -1, KSW_EZ_APPROX_MAX, ez); - fprintf(stderr, "%d; (%d,%d); (%d,%d)\n", ez->score, r1->re, r2->rs, r1->qe, r2->qs); + qseq = &qseq0[!r1->rev][qlen - r2->qs]; + mm_align_pair(km, opt, ql, qseq, tl, tseq, mat, (int)(opt->bw * 1.5), KSW_EZ_APPROX_MAX, ez); + if (ez->n_cigar > 0 && ez->score > 0) { + mm_append_cigar(r_inv, ez->n_cigar, ez->cigar); + r_inv->p->dp_score = ez->score; + mm_update_extra(r_inv->p, qseq, tseq, mat, opt->q, opt->e); + if (r_inv->p->dp_max < opt->min_dp_max) { // discard this hit + free(r_inv->p); + r_inv->p = 0; + } else { + r_inv->id = -1; + r_inv->parent = MM_PARENT_UNSET; + r_inv->inv = 1; + r_inv->rev = !r1->rev; + r_inv->qs = r1->qe, r_inv->qe = r2->qs; + r_inv->rs = r1->re, r_inv->re = r2->rs; + ret = 1; + fprintf(stderr, "here!\n"); + } + } + kfree(km, tseq); + return ret; +} + +static inline mm_reg1_t *mm_insert_reg(const mm_reg1_t *r, int i, int *n_regs, mm_reg1_t *regs) +{ + regs = (mm_reg1_t*)realloc(regs, (*n_regs + 1) * sizeof(mm_reg1_t)); + if (i + 1 != *n_regs) + memmove(®s[i + 2], ®s[i + 1], sizeof(mm_reg1_t) * (*n_regs - i - 1)); + regs[i + 1] = *r; + ++*n_regs; + return regs; } mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a) @@ -394,15 +424,10 @@ mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *m for (i = 0; i < n_regs; ++i) { mm_reg1_t r2; mm_align1(km, opt, mi, qlen, qseq0, ®s[i], &r2, a, &ez); - if (r2.cnt > 0) { - regs = (mm_reg1_t*)realloc(regs, (n_regs + 1) * sizeof(mm_reg1_t)); // this should be very rare - if (i + 1 != n_regs) - memmove(®s[i + 2], ®s[i + 1], sizeof(mm_reg1_t) * (n_regs - i - 1)); - regs[i + 1] = r2; - ++n_regs; - } - if (i > 0 && (regs[i].split&2) && (regs[i-1].split&1)) { - mm_align1_inv(km, opt, mi, qseq0, ®s[i-1], ®s[i], &r2, &ez); + if (r2.cnt > 0) regs = mm_insert_reg(&r2, i, &n_regs, regs); + if (i > 0 && mm_align1_inv(km, opt, mi, qlen, qseq0, ®s[i-1], ®s[i], &r2, &ez)) { + regs = mm_insert_reg(&r2, i, &n_regs, regs); + ++i; // skip the inserted INV alignment } } *n_regs_ = n_regs; diff --git a/format.c b/format.c index 5098aac..a8d681e 100644 --- a/format.c +++ b/format.c @@ -56,7 +56,8 @@ static void mm_sprintf_lite(kstring_t *s, const char *fmt, ...) static inline void write_tags(kstring_t *s, const mm_reg1_t *r) { - mm_sprintf_lite(s, "\tcm:i:%d\ts1:i:%d", r->cnt, r->score); + int type = r->inv? 'I' : r->id == r->parent? 'P' : 'S'; + mm_sprintf_lite(s, "\ttp:A:%c\tcm:i:%d\ts1:i:%d", type, r->cnt, r->score); if (r->parent == r->id) mm_sprintf_lite(s, "\ts2:i:%d", r->subsc); if (r->split) mm_sprintf_lite(s, "\tzd:i:%d", r->split); if (r->p) mm_sprintf_lite(s, "\tNM:i:%d\tms:i:%d\tAS:i:%d\tnn:i:%d", r->p->n_diff, r->p->dp_max, r->p->dp_score, r->p->n_ambi); diff --git a/hit.c b/hit.c index e452d18..41c3b8e 100644 --- a/hit.c +++ b/hit.c @@ -125,11 +125,11 @@ void mm_hit_sort_by_dp(void *km, int *n_regs, mm_reg1_t *r) aux = (uint64_t*)kmalloc(km, n * 8); t = (mm_reg1_t*)kmalloc(km, n * sizeof(mm_reg1_t)); for (i = n_aux = 0; i < n; ++i) { - if (r[i].cnt > 0) { // squeeze out elements with cnt==0 (soft deleted) + if (r[i].inv || r[i].cnt > 0) { // squeeze out elements with cnt==0 (soft deleted) assert(r[i].p); aux[n_aux++] = (uint64_t)r[i].p->dp_max << 32 | i; } else if (r[i].p) { - kfree(km, r[i].p); + free(r[i].p); r[i].p = 0; } } @@ -197,7 +197,7 @@ void mm_filter_regs(void *km, const mm_mapopt_t *opt, int *n_regs, mm_reg1_t *re for (i = k = 0; i < *n_regs; ++i) { mm_reg1_t *r = ®s[i]; int flt = 0; - if (r->cnt < opt->min_cnt) flt = 1; + if (!r->inv && r->cnt < opt->min_cnt) flt = 1; if (r->p) { if (r->p->blen - r->p->n_ambi - r->p->n_diff < opt->min_chain_score) flt = 1; else if (r->p->dp_max < opt->min_dp_max) flt = 1; @@ -294,7 +294,9 @@ void mm_set_mapq(int n_regs, mm_reg1_t *regs, int min_chain_sc) int i; for (i = 0; i < n_regs; ++i) { mm_reg1_t *r = ®s[i]; - if (r->parent == r->id) { + if (r->inv) { + r->mapq = 0; + } else if (r->parent == r->id) { int mapq, subsc; float pen_cm = r->cnt >= 10? 1.0f : 0.1f * r->cnt; subsc = r->subsc > min_chain_sc? r->subsc : min_chain_sc; diff --git a/main.c b/main.c index 5cc3690..3338766 100644 --- a/main.c +++ b/main.c @@ -8,7 +8,7 @@ #include "minimap.h" #include "mmpriv.h" -#define MM_VERSION "2.0-r222-dirty" +#define MM_VERSION "2.0-r224-dirty" void liftrlimit() { diff --git a/map.c b/map.c index e9d894e..bfb5d3d 100644 --- a/map.c +++ b/map.c @@ -31,7 +31,7 @@ void mm_mapopt_init(mm_mapopt_t *opt) opt->a = 2, opt->b = 4, opt->q = 4, opt->e = 2, opt->q2 = 24, opt->e2 = 1; opt->zdrop = 400; - opt->min_dp_max = opt->min_chain_score; + opt->min_dp_max = opt->min_chain_score * opt->a; opt->min_ksw_len = 200; } diff --git a/minimap.h b/minimap.h index 6e47223..27eb3b4 100644 --- a/minimap.h +++ b/minimap.h @@ -61,7 +61,7 @@ typedef struct { typedef struct { int32_t id; uint32_t cnt:31, rev:1; - uint32_t rid:31, rep:1; + uint32_t rid:31, inv:1; int32_t score; int32_t qs, qe, rs, re; int32_t parent, subsc; diff --git a/minimap2.1 b/minimap2.1 index a4d4ba6..f9b84e5 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -1,4 +1,4 @@ -.TH minimap2 1 "28 July 2017" "minimap2-2.0-r219-dirty" "Bioinformatics tools" +.TH minimap2 1 "29 July 2017" "minimap2-2.0-r224-dirty" "Bioinformatics tools" .SH NAME .PP minimap2 - mapping and alignment between collections of DNA sequences @@ -329,6 +329,7 @@ cb | cb | cb r | c | l . Tag Type Description _ +tp A Type of aln: P/primary, S/secondary and I/inversion cm i Number of minimizers on the chain s1 i Chaining score s2 i Chaining score of the best secondary chain @@ -343,7 +344,8 @@ cg Z CIGAR string (only in PAF) .TP 2 * Minimap2 may produce suboptimal alignments through long low-complexity regions -where seed positions may be inaccurate. +where seed positions may be suboptimal. This should not be a big concern +because even the optimal alignment may be wrong in such regions. .TP * Minimap2 may produce poor alignments that may need post-filtering. We are still