From 993a2bb5210fca4e21a5a87d7d282c395d5b0cc1 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 18 Aug 2017 15:31:15 +0800 Subject: [PATCH] r301: separate introns from deletions When an intron is adjacent to a deletion, the old code count both as introns, which lead to an inaccurate exon boundary. --- align.c | 41 +++++++++++++++++++---------------------- format.c | 20 ++++++-------------- ksw2.h | 6 ++++-- ksw2_extd2_sse.c | 4 ++-- ksw2_exts2_sse.c | 4 ++-- ksw2_extz2_sse.c | 4 ++-- main.c | 2 +- map.c | 9 +++------ mmpriv.h | 10 ++-------- 9 files changed, 41 insertions(+), 59 deletions(-) diff --git a/align.c b/align.c index 25744e9..ea7bd29 100644 --- a/align.c +++ b/align.c @@ -53,7 +53,7 @@ static int mm_check_zdrop(const uint8_t *qseq, const uint8_t *tseq, uint32_t n_c } else if (op == 1) { score -= q + e * len, j += len; if (test_zdrop_aux(score, i, j, &max, &max_i, &max_j, e, zdrop)) return 1; - } else if (op == 2) { + } else if (op == 2 || op == 3) { score -= q + e * len, i += len; if (test_zdrop_aux(score, i, j, &max, &max_i, &max_j, e, zdrop)) return 1; } @@ -64,12 +64,11 @@ static int mm_check_zdrop(const uint8_t *qseq, const uint8_t *tseq, uint32_t n_c static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e, int q_intron) { uint32_t k, l, toff = 0, qoff = 0; - int32_t s = 0, max = 0, min_intron_len, n_gtag = 0, n_ctac = 0; - min_intron_len = mm_min_intron_len(q, e, q_intron); + int32_t s = 0, max = 0, n_gtag = 0, n_ctac = 0; if (p == 0) return; for (k = 0; k < p->n_cigar; ++k) { uint32_t op = p->cigar[k]&0xf, len = p->cigar[k]>>4; - if (op == 0) { + if (op == 0) { // match/mismatch for (l = 0; l < len; ++l) { int cq = qseq[qoff + l], ct = tseq[toff + l]; if (ct > 3 || cq > 3) ++p->n_ambi; @@ -79,7 +78,7 @@ static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *t else max = max > s? max : s; } toff += len, qoff += len, p->blen += len; - } else if (op == 1) { + } else if (op == 1) { // insertion int n_ambi = 0; for (l = 0; l < len; ++l) if (qseq[qoff + l] > 3) ++n_ambi; @@ -87,23 +86,21 @@ static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *t p->n_ambi += n_ambi, p->n_diff += len - n_ambi; s -= q + e * len; if (s < 0) s = 0; - } else if (op == 2) { + } else if (op == 2) { // deletion int n_ambi = 0; - if (len < min_intron_len) { - for (l = 0; l < len; ++l) - if (tseq[toff + l] > 3) ++n_ambi; - toff += len, p->blen += len; - p->n_ambi += n_ambi, p->n_diff += len - n_ambi; - s -= q + e * len; - if (s < 0) s = 0; - } else { // intron - uint8_t b[4]; - b[0] = tseq[toff], b[1] = tseq[toff+1]; - b[2] = tseq[toff+len-2], b[3] = tseq[toff+len-1]; - if (memcmp(b, "\2\3\0\2", 4) == 0) ++n_gtag; - else if (memcmp(b, "\1\3\0\1", 4) == 0) ++n_ctac; - toff += len, p->blen += len; - } + for (l = 0; l < len; ++l) + if (tseq[toff + l] > 3) ++n_ambi; + toff += len, p->blen += len; + p->n_ambi += n_ambi, p->n_diff += len - n_ambi; + s -= q + e * len; + if (s < 0) s = 0; + } else if (op == 3) { // intron + uint8_t b[4]; + b[0] = tseq[toff], b[1] = tseq[toff+1]; + b[2] = tseq[toff+len-2], b[3] = tseq[toff+len-1]; + if (memcmp(b, "\2\3\0\2", 4) == 0) ++n_gtag; + else if (memcmp(b, "\1\3\0\1", 4) == 0) ++n_ctac; + toff += len, p->blen += len; } } p->dp_max = max; @@ -337,7 +334,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int mm_idx_getseq(mi, rid, rs, re, tseq); mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, extra_flag|KSW_EZ_APPROX_MAX, ez); if (mm_check_zdrop(qseq, tseq, ez->n_cigar, ez->cigar, mat, opt->q, opt->e, opt->zdrop)) - mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, 0, ez); + mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, extra_flag, ez); if (ez->n_cigar > 0) mm_append_cigar(r, ez->n_cigar, ez->cigar); if (ez->zdropped) { // truncated by Z-drop; TODO: sometimes Z-drop kicks in because the next seed placement is wrong. This can be fixed in principle. diff --git a/format.c b/format.c index 2d3af37..cf81291 100644 --- a/format.c +++ b/format.c @@ -182,7 +182,7 @@ static inline void write_tags(kstring_t *s, const mm_reg1_t *r) } } -void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag, int intron_thres) +void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag) { s->l = 0; mm_sprintf_lite(s, "%s\t%d\t%d\t%d\t%c\t", t->name, t->l_seq, r->qs, r->qe, "+-"[r->rev]); @@ -196,12 +196,8 @@ void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m if (r->p && (opt_flag & MM_F_OUT_CG)) { uint32_t k; mm_sprintf_lite(s, "\tcg:Z:"); - for (k = 0; k < r->p->n_cigar; ++k) { - int op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4; - if (intron_thres > 0 && op == 2 && len >= intron_thres) - mm_sprintf_lite(s, "%dN", len); - else mm_sprintf_lite(s, "%d%c", len, "MID"[op]); - } + for (k = 0; k < r->p->n_cigar; ++k) + mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MIDN"[r->p->cigar[k]&0xf]); } if (r->p && (opt_flag & MM_F_OUT_CS)) write_cs(km, s, mi, t, r); @@ -239,7 +235,7 @@ static void sam_write_sq(kstring_t *s, char *seq, int l, int rev, int comp) } else str_copy(s, seq, seq + l); } -void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs, int intron_thres) +void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs) { int flag = 0; s->l = 0; @@ -258,12 +254,8 @@ void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m uint32_t k, clip_len = r->rev? t->l_seq - r->qe : r->qs; int clip_char = (flag&0x800)? 'H' : 'S'; if (clip_len) mm_sprintf_lite(s, "%d%c", clip_len, clip_char); - for (k = 0; k < r->p->n_cigar; ++k) { - int op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4; - if (intron_thres > 0 && op == 2 && len >= intron_thres) - mm_sprintf_lite(s, "%dN", len); - else mm_sprintf_lite(s, "%d%c", len, "MID"[op]); - } + for (k = 0; k < r->p->n_cigar; ++k) + mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MIDN"[r->p->cigar[k]&0xf]); clip_len = r->rev? r->qs : t->l_seq - r->qe; if (clip_len) mm_sprintf_lite(s, "%d%c", clip_len, clip_char); } else mm_sprintf_lite(s, "*"); diff --git a/ksw2.h b/ksw2.h index 3811628..5e43970 100644 --- a/ksw2.h +++ b/ksw2.h @@ -111,7 +111,8 @@ static inline uint32_t *ksw_push_cigar(void *km, int *n_cigar, int *m_cigar, uin // bit 0-2: which type gets the max - 0 for H, 1 for E, 2 for F, 3 for \tilde{E} and 4 for \tilde{F} // bit 3/0x08: 1 if a continuation on the E state (bit 5/0x20 for a continuation on \tilde{E}) // bit 4/0x10: 1 if a continuation on the F state (bit 6/0x40 for a continuation on \tilde{F}) -static inline void ksw_backtrack(void *km, int is_rot, int is_rev, const uint8_t *p, const int *off, const int *off_end, int n_col, int i0, int j0, int *m_cigar_, int *n_cigar_, uint32_t **cigar_) +static inline void ksw_backtrack(void *km, int is_rot, int is_rev, int with_N, const uint8_t *p, const int *off, const int *off_end, int n_col, int i0, int j0, + int *m_cigar_, int *n_cigar_, uint32_t **cigar_) { // p[] - lower 3 bits: which type gets the max; bit int n_cigar = 0, m_cigar = *m_cigar_, i = i0, j = j0, r, state = 0; uint32_t *cigar = *cigar_, tmp; @@ -132,7 +133,8 @@ static inline void ksw_backtrack(void *km, int is_rot, int is_rev, const uint8_t if (state == 0) state = tmp & 7; // TODO: probably this line can be merged into the "else if" line right above; not 100% sure if (force_state >= 0) state = force_state; if (state == 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 0, 1), --i, --j; // match - else if (state == 1 || state == 3) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 2, 1), --i; // deletion + else if (state == 1 || (state == 3 && !with_N)) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 2, 1), --i; // deletion + else if (state == 3 && with_N) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 3, 1), --i; // intron else cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 1, 1), --j; // insertion } if (i >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 2, i + 1); // first deletion diff --git a/ksw2_extd2_sse.c b/ksw2_extd2_sse.c index c7f5843..56cd8cd 100644 --- a/ksw2_extd2_sse.c +++ b/ksw2_extd2_sse.c @@ -365,9 +365,9 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin if (with_cigar) { // backtrack int rev_cigar = !!(flag & KSW_EZ_REV_CIGAR); if (!ez->zdropped && !(flag&KSW_EZ_EXTZ_ONLY)) - ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar); + ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar); else if (ez->max_t >= 0 && ez->max_q >= 0) - ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar); + ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar); kfree(km, mem2); kfree(km, off); } } diff --git a/ksw2_exts2_sse.c b/ksw2_exts2_sse.c index 0694fb3..6decd2d 100644 --- a/ksw2_exts2_sse.c +++ b/ksw2_exts2_sse.c @@ -348,9 +348,9 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin if (with_cigar) { // backtrack int rev_cigar = !!(flag & KSW_EZ_REV_CIGAR); if (!ez->zdropped && !(flag&KSW_EZ_EXTZ_ONLY)) - ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar); + ksw_backtrack(km, 1, rev_cigar, 1, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar); else if (ez->max_t >= 0 && ez->max_q >= 0) - ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar); + ksw_backtrack(km, 1, rev_cigar, 1, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar); kfree(km, mem2); kfree(km, off); } } diff --git a/ksw2_extz2_sse.c b/ksw2_extz2_sse.c index 04669b9..f21f184 100644 --- a/ksw2_extz2_sse.c +++ b/ksw2_extz2_sse.c @@ -278,9 +278,9 @@ void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin if (with_cigar) { // backtrack int rev_cigar = !!(flag & KSW_EZ_REV_CIGAR); if (!ez->zdropped && !(flag&KSW_EZ_EXTZ_ONLY)) - ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar); + ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar); else if (ez->max_t >= 0 && ez->max_q >= 0) - ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar); + ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar); kfree(km, mem2); kfree(km, off); } } diff --git a/main.c b/main.c index d526a44..0735b91 100644 --- a/main.c +++ b/main.c @@ -8,7 +8,7 @@ #include "minimap.h" #include "mmpriv.h" -#define MM_VERSION "2.0-r297-dirty" +#define MM_VERSION "2.0-r301-dirty" void liftrlimit() { diff --git a/map.c b/map.c index 630c78a..5af4936 100644 --- a/map.c +++ b/map.c @@ -346,24 +346,21 @@ static void *worker_pipeline(void *shared, int step, void *in) void *km = 0; step_t *s = (step_t*)in; const mm_idx_t *mi = p->mi; - int intron_thres = -1; for (i = 0; i < p->n_threads; ++i) mm_tbuf_destroy(s->buf[i]); free(s->buf); - if (p->opt->flag & MM_F_SPLICE) - intron_thres = mm_min_intron_len(p->opt->q, p->opt->e, p->opt->q2); if ((p->opt->flag & MM_F_OUT_CS) && !(mm_dbg_flag & MM_DBG_NO_KALLOC)) km = km_init(); for (i = 0; i < s->n_seq; ++i) { mm_bseq1_t *t = &s->seq[i]; for (j = 0; j < s->n_reg[i]; ++j) { mm_reg1_t *r = &s->reg[i][j]; if (p->opt->flag & MM_F_OUT_SAM) - mm_write_sam(&p->str, mi, t, r, s->n_reg[i], s->reg[i], intron_thres); + mm_write_sam(&p->str, mi, t, r, s->n_reg[i], s->reg[i]); else - mm_write_paf(&p->str, mi, t, r, km, p->opt->flag, intron_thres); + mm_write_paf(&p->str, mi, t, r, km, p->opt->flag); puts(p->str.s); } if (s->n_reg[i] == 0 && (p->opt->flag & MM_F_OUT_SAM)) { - mm_write_sam(&p->str, 0, t, 0, 0, 0, 0); + mm_write_sam(&p->str, 0, t, 0, 0, 0); puts(p->str.s); } for (j = 0; j < s->n_reg[i]; ++j) free(s->reg[i][j].p); diff --git a/mmpriv.h b/mmpriv.h index becf5e4..874f9e2 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -42,8 +42,8 @@ uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk); void mm_set_rg(const char *s); void sam_write_sam_header(const mm_idx_t *idx); -void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag, int intron_thres); -void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs, int intron_thres); +void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag); +void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs); int mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int64_t n, mm128_t *a, uint64_t **_u, void *km); mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a); @@ -62,10 +62,4 @@ void mm_set_mapq(int n_regs, mm_reg1_t *regs, int min_chain_sc); } #endif -static inline int32_t mm_min_intron_len(int32_t q, int32_t e, int32_t q_intron) -{ - int32_t x = q_intron > q? (int)((float)(q_intron - q) / e + .999) : INT32_MAX; - return x > 4? x : 4; -} - #endif