From 260a68d23264af55d78b39d55601e99b20d20c76 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Sat, 10 Apr 2021 12:49:39 +0100 Subject: [PATCH] Use #defines for CIGAR operators in C code Give the CIGAR constants names to clarify the code. So that ksw2.h remains self-contained, define KSW_* versions of the CIGAR operators it needs for use within ksw2.h. Other code should in general use the full set of MM_CIGAR_* constants in minimap.h. --- align.c | 60 +++++++++++++++++++++++++++---------------------------- format.c | 20 +++++++++---------- ksw2.h | 19 ++++++++++++------ minimap.h | 10 ++++++++++ 4 files changed, 63 insertions(+), 46 deletions(-) diff --git a/align.c b/align.c index 3b5ff9a..9d00bb4 100644 --- a/align.c +++ b/align.c @@ -53,16 +53,16 @@ static int mm_test_zdrop(void *km, const mm_mapopt_t *opt, const uint8_t *qseq, // find the score and the region where score drops most along diagonal for (k = 0, score = 0; k < n_cigar; ++k) { uint32_t l, op = cigar[k]&0xf, len = cigar[k]>>4; - if (op == 0) { + if (op == MM_CIGAR_MATCH) { for (l = 0; l < len; ++l) { score += mat[tseq[i + l] * 5 + qseq[j + l]]; update_max_zdrop(score, i+l, j+l, &max, &max_i, &max_j, opt->e, &max_zdrop, pos); } i += len, j += len; - } else if (op == 1 || op == 2 || op == 3) { + } else if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL || op == MM_CIGAR_N_SKIP) { score -= opt->q + opt->e * len; - if (op == 1) j += len; // insertion - else i += len; // deletion + if (op == MM_CIGAR_INS) j += len; + else i += len; update_max_zdrop(score, i, j, &max, &max_i, &max_j, opt->e, &max_zdrop, pos); } } @@ -98,12 +98,12 @@ static void mm_fix_cigar(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq, for (k = 0; k < p->n_cigar; ++k) { // indel left alignment uint32_t op = p->cigar[k]&0xf, len = p->cigar[k]>>4; if (len == 0) to_shrink = 1; - if (op == 0) { + if (op == MM_CIGAR_MATCH) { toff += len, qoff += len; - } else if (op == 1 || op == 2) { // insertion or deletion + } else if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL) { if (k > 0 && k < p->n_cigar - 1 && (p->cigar[k-1]&0xf) == 0 && (p->cigar[k+1]&0xf) == 0) { int l, prev_len = p->cigar[k-1] >> 4; - if (op == 1) { + if (op == MM_CIGAR_INS) { for (l = 0; l < prev_len; ++l) if (qseq[qoff - 1 - l] != qseq[qoff + len - 1 - l]) break; @@ -116,9 +116,9 @@ static void mm_fix_cigar(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq, p->cigar[k-1] -= l<<4, p->cigar[k+1] += l<<4, qoff -= l, toff -= l; if (l == prev_len) to_shrink = 1; } - if (op == 1) qoff += len; + if (op == MM_CIGAR_INS) qoff += len; else toff += len; - } else if (op == 3) { + } else if (op == MM_CIGAR_N_SKIP) { toff += len; } } @@ -128,13 +128,13 @@ static void mm_fix_cigar(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq, uint32_t l, s[3] = {0,0,0}; for (l = k; l < p->n_cigar; ++l) { // count number of adjacent I and D uint32_t op = p->cigar[l]&0xf; - if (op == 1 || op == 2 || p->cigar[l]>>4 == 0) + if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL || p->cigar[l]>>4 == 0) s[op] += p->cigar[l] >> 4; else break; } if (s[1] > 0 && s[2] > 0 && l - k > 2) { // turn to a single I and a single D - p->cigar[k] = s[1]<<4|1; - p->cigar[k+1] = s[2]<<4|2; + p->cigar[k] = s[1]<<4|MM_CIGAR_INS; + p->cigar[k+1] = s[2]<<4|MM_CIGAR_DEL; for (k += 2; k < l; ++k) p->cigar[k] &= 0xf; to_shrink = 1; @@ -154,9 +154,9 @@ static void mm_fix_cigar(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq, else p->cigar[k+1] += p->cigar[k]>>4<<4; // add length to the next CIGAR operator p->n_cigar = l; } - if ((p->cigar[0]&0xf) == 1 || (p->cigar[0]&0xf) == 2) { // get rid of leading I or D + if ((p->cigar[0]&0xf) == MM_CIGAR_INS || (p->cigar[0]&0xf) == MM_CIGAR_DEL) { // get rid of leading I or D int32_t l = p->cigar[0] >> 4; - if ((p->cigar[0]&0xf) == 1) { + if ((p->cigar[0]&0xf) == MM_CIGAR_INS) { if (r->rev) r->qe -= l; else r->qs += l; *qshift = l; @@ -174,7 +174,7 @@ static void mm_update_cigar_eqx(mm_reg1_t *r, const uint8_t *qseq, const uint8_t if (r->p == 0) return; for (k = 0; k < r->p->n_cigar; ++k) { uint32_t op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4; - if (op == 0) { + if (op == MM_CIGAR_MATCH) { while (len > 0) { for (l = 0; l < len && qseq[qoff + l] == tseq[toff + l]; ++l) {} // run of "="; TODO: N<=>N is converted to "=" if (l > 0) { ++n_EQX; len -= l; toff += l; qoff += l; } @@ -183,11 +183,11 @@ static void mm_update_cigar_eqx(mm_reg1_t *r, const uint8_t *qseq, const uint8_t if (l > 0) { ++n_EQX; len -= l; toff += l; qoff += l; } } ++n_M; - } else if (op == 1) { // insertion + } else if (op == MM_CIGAR_INS) { qoff += len; - } else if (op == 2) { // deletion + } else if (op == MM_CIGAR_DEL) { toff += len; - } else if (op == 3) { // intron + } else if (op == MM_CIGAR_N_SKIP) { toff += len; } } @@ -195,7 +195,7 @@ static void mm_update_cigar_eqx(mm_reg1_t *r, const uint8_t *qseq, const uint8_t if (n_EQX == n_M) { for (k = 0; k < r->p->n_cigar; ++k) { uint32_t op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4; - if (op == 0) r->p->cigar[k] = len << 4 | 7; + if (op == MM_CIGAR_MATCH) r->p->cigar[k] = len << 4 | MM_CIGAR_EQ_MATCH; } return; } @@ -209,25 +209,25 @@ static void mm_update_cigar_eqx(mm_reg1_t *r, const uint8_t *qseq, const uint8_t toff = qoff = m = 0; for (k = 0; k < r->p->n_cigar; ++k) { uint32_t op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4; - if (op == 0) { // match/mismatch + if (op == MM_CIGAR_MATCH) { while (len > 0) { // match for (l = 0; l < len && qseq[qoff + l] == tseq[toff + l]; ++l) {} - if (l > 0) p->cigar[m++] = l << 4 | 7; + if (l > 0) p->cigar[m++] = l << 4 | MM_CIGAR_EQ_MATCH; len -= l; toff += l, qoff += l; // mismatch for (l = 0; l < len && qseq[qoff + l] != tseq[toff + l]; ++l) {} - if (l > 0) p->cigar[m++] = l << 4 | 8; + if (l > 0) p->cigar[m++] = l << 4 | MM_CIGAR_X_MISMATCH; len -= l; toff += l, qoff += l; } continue; - } else if (op == 1) { // insertion + } else if (op == MM_CIGAR_INS) { qoff += len; - } else if (op == 2) { // deletion + } else if (op == MM_CIGAR_DEL) { toff += len; - } else if (op == 3) { // intron + } else if (op == MM_CIGAR_N_SKIP) { toff += len; } p->cigar[m++] = r->p->cigar[k]; @@ -248,7 +248,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts r->blen = r->mlen = 0; for (k = 0; k < p->n_cigar; ++k) { uint32_t op = p->cigar[k]&0xf, len = p->cigar[k]>>4; - if (op == 0) { // match/mismatch + if (op == MM_CIGAR_MATCH) { int n_ambi = 0, n_diff = 0; for (l = 0; l < len; ++l) { int cq = qseq[qoff + l], ct = tseq[toff + l]; @@ -260,7 +260,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts } r->blen += len - n_ambi, r->mlen += len - (n_ambi + n_diff), p->n_ambi += n_ambi; toff += len, qoff += len; - } else if (op == 1) { // insertion + } else if (op == MM_CIGAR_INS) { int n_ambi = 0; for (l = 0; l < len; ++l) if (qseq[qoff + l] > 3) ++n_ambi; @@ -268,7 +268,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts s -= q + e * len; if (s < 0) s = 0; qoff += len; - } else if (op == 2) { // deletion + } else if (op == MM_CIGAR_DEL) { int n_ambi = 0; for (l = 0; l < len; ++l) if (tseq[toff + l] > 3) ++n_ambi; @@ -276,7 +276,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts s -= q + e * len; if (s < 0) s = 0; toff += len; - } else if (op == 3) { // intron + } else if (op == MM_CIGAR_N_SKIP) { toff += len; } } @@ -730,7 +730,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int if (qseq[j] >= 4 || tseq[j] >= 4) ez->score += opt->e2; else ez->score += qseq[j] == tseq[j]? opt->a : -opt->b; } - ez->cigar = ksw_push_cigar(km, &ez->n_cigar, &ez->m_cigar, ez->cigar, 0, qe - qs); + ez->cigar = ksw_push_cigar(km, &ez->n_cigar, &ez->m_cigar, ez->cigar, MM_CIGAR_MATCH, qe - qs); } else { // perform normal gapped alignment mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, junc, mat, bw1, -1, opt->zdrop, extra_flag|KSW_EZ_APPROX_MAX, ez); // first pass: with approximate Z-drop } diff --git a/format.c b/format.c index a33c3b6..f94290f 100644 --- a/format.c +++ b/format.c @@ -144,8 +144,8 @@ static void write_cs_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq if (write_tag) mm_sprintf_lite(s, "\tcs:Z:"); for (i = q_off = t_off = 0; i < (int)r->p->n_cigar; ++i) { int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4; - assert((op >= 0 && op <= 3) || op == 7 || op == 8); - if (op == 0 || op == 7 || op == 8) { // match + assert((op >= MM_CIGAR_MATCH && op <= MM_CIGAR_N_SKIP) || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH); + if (op == MM_CIGAR_MATCH || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH) { int l_tmp = 0; for (j = 0; j < len; ++j) { if (qseq[q_off + j] != tseq[t_off + j]) { @@ -166,12 +166,12 @@ static void write_cs_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq } else mm_sprintf_lite(s, ":%d", l_tmp); } q_off += len, t_off += len; - } else if (op == 1) { // insertion to ref + } else if (op == MM_CIGAR_INS) { for (j = 0, tmp[len] = 0; j < len; ++j) tmp[j] = "acgtn"[qseq[q_off + j]]; mm_sprintf_lite(s, "+%s", tmp); q_off += len; - } else if (op == 2) { // deletion from ref + } else if (op == MM_CIGAR_DEL) { for (j = 0, tmp[len] = 0; j < len; ++j) tmp[j] = "acgtn"[tseq[t_off + j]]; mm_sprintf_lite(s, "-%s", tmp); @@ -192,8 +192,8 @@ static void write_MD_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq if (write_tag) mm_sprintf_lite(s, "\tMD:Z:"); for (i = q_off = t_off = 0; i < (int)r->p->n_cigar; ++i) { int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4; - assert((op >= 0 && op <= 3) || op == 7 || op == 8); - if (op == 0 || op == 7 || op == 8) { // match + assert((op >= MM_CIGAR_MATCH && op <= MM_CIGAR_N_SKIP) || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH); + if (op == MM_CIGAR_MATCH || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH) { for (j = 0; j < len; ++j) { if (qseq[q_off + j] != tseq[t_off + j]) { mm_sprintf_lite(s, "%d%c", l_MD, "ACGTN"[tseq[t_off + j]]); @@ -201,15 +201,15 @@ static void write_MD_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq } else ++l_MD; } q_off += len, t_off += len; - } else if (op == 1) { // insertion to ref + } else if (op == MM_CIGAR_INS) { q_off += len; - } else if (op == 2) { // deletion from ref + } else if (op == MM_CIGAR_DEL) { for (j = 0, tmp[len] = 0; j < len; ++j) tmp[j] = "ACGTN"[tseq[t_off + j]]; mm_sprintf_lite(s, "%d^%s", l_MD, tmp); l_MD = 0; t_off += len; - } else if (op == 3) { // reference skip + } else if (op == MM_CIGAR_N_SKIP) { t_off += len; } } @@ -271,7 +271,7 @@ double mm_event_identity(const mm_reg1_t *r) if (r->p == 0) return -1.0f; for (i = 0; i < r->p->n_cigar; ++i) { int32_t op = r->p->cigar[i] & 0xf, len = r->p->cigar[i] >> 4; - if (op == 1 || op == 2) + if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL) ++n_gapo, n_gap += len; } return (double)r->mlen / (r->blen + r->p->n_ambi - n_gap + n_gapo); diff --git a/ksw2.h b/ksw2.h index c700136..cbd1ddc 100644 --- a/ksw2.h +++ b/ksw2.h @@ -16,6 +16,13 @@ #define KSW_EZ_SPLICE_REV 0x200 #define KSW_EZ_SPLICE_FLANK 0x400 +// The subset of CIGAR operators used by ksw code. +// Use MM_CIGAR_* from minimap.h if you need the full list. +#define KSW_CIGAR_MATCH 0 +#define KSW_CIGAR_INS 1 +#define KSW_CIGAR_DEL 2 +#define KSW_CIGAR_N_SKIP 3 + #ifdef __cplusplus extern "C" { #endif @@ -137,13 +144,13 @@ static inline void ksw_backtrack(void *km, int is_rot, int is_rev, int min_intro else if (!(tmp >> (state + 2) & 1)) state = 0; // if requesting other states, _state_ stays the same if it is a continuation; otherwise, set to H if (state == 0) state = tmp & 7; // TODO: probably this line can be merged into the "else if" line right above; not 100% sure if (force_state >= 0) state = force_state; - if (state == 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 0, 1), --i, --j; // match - else if (state == 1 || (state == 3 && min_intron_len <= 0)) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 2, 1), --i; // deletion - else if (state == 3 && min_intron_len > 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 3, 1), --i; // intron - else cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 1, 1), --j; // insertion + if (state == 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_MATCH, 1), --i, --j; + else if (state == 1 || (state == 3 && min_intron_len <= 0)) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_DEL, 1), --i; + else if (state == 3 && min_intron_len > 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_N_SKIP, 1), --i; + else cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_INS, 1), --j; } - if (i >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, min_intron_len > 0 && i >= min_intron_len? 3 : 2, i + 1); // first deletion - if (j >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 1, j + 1); // first insertion + if (i >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, min_intron_len > 0 && i >= min_intron_len? KSW_CIGAR_N_SKIP : KSW_CIGAR_DEL, i + 1); // first deletion + if (j >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_INS, j + 1); // first insertion if (!is_rev) for (i = 0; i < n_cigar>>1; ++i) // reverse CIGAR tmp = cigar[i], cigar[i] = cigar[n_cigar-1-i], cigar[n_cigar-1-i] = tmp; diff --git a/minimap.h b/minimap.h index d095f46..fecd775 100644 --- a/minimap.h +++ b/minimap.h @@ -46,6 +46,16 @@ #define MM_MAX_SEG 255 +#define MM_CIGAR_MATCH 0 +#define MM_CIGAR_INS 1 +#define MM_CIGAR_DEL 2 +#define MM_CIGAR_N_SKIP 3 +#define MM_CIGAR_SOFTCLIP 4 +#define MM_CIGAR_HARDCLIP 5 +#define MM_CIGAR_PADDING 6 +#define MM_CIGAR_EQ_MATCH 7 +#define MM_CIGAR_X_MISMATCH 8 + #define MM_CIGAR_STR "MIDNSHP=XB" #ifdef __cplusplus