Use #defines for CIGAR operators in C code

Give the CIGAR constants names to clarify the code. So that ksw2.h
remains self-contained, define KSW_* versions of the CIGAR operators
it needs for use within ksw2.h. Other code should in general use the
full set of MM_CIGAR_* constants in minimap.h.
This commit is contained in:
John Marshall 2021-04-10 12:49:39 +01:00 committed by Heng Li
parent 177eef259d
commit 260a68d232
4 changed files with 63 additions and 46 deletions

60
align.c
View File

@ -53,16 +53,16 @@ static int mm_test_zdrop(void *km, const mm_mapopt_t *opt, const uint8_t *qseq,
// find the score and the region where score drops most along diagonal
for (k = 0, score = 0; k < n_cigar; ++k) {
uint32_t l, op = cigar[k]&0xf, len = cigar[k]>>4;
if (op == 0) {
if (op == MM_CIGAR_MATCH) {
for (l = 0; l < len; ++l) {
score += mat[tseq[i + l] * 5 + qseq[j + l]];
update_max_zdrop(score, i+l, j+l, &max, &max_i, &max_j, opt->e, &max_zdrop, pos);
}
i += len, j += len;
} else if (op == 1 || op == 2 || op == 3) {
} else if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL || op == MM_CIGAR_N_SKIP) {
score -= opt->q + opt->e * len;
if (op == 1) j += len; // insertion
else i += len; // deletion
if (op == MM_CIGAR_INS) j += len;
else i += len;
update_max_zdrop(score, i, j, &max, &max_i, &max_j, opt->e, &max_zdrop, pos);
}
}
@ -98,12 +98,12 @@ static void mm_fix_cigar(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq,
for (k = 0; k < p->n_cigar; ++k) { // indel left alignment
uint32_t op = p->cigar[k]&0xf, len = p->cigar[k]>>4;
if (len == 0) to_shrink = 1;
if (op == 0) {
if (op == MM_CIGAR_MATCH) {
toff += len, qoff += len;
} else if (op == 1 || op == 2) { // insertion or deletion
} else if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL) {
if (k > 0 && k < p->n_cigar - 1 && (p->cigar[k-1]&0xf) == 0 && (p->cigar[k+1]&0xf) == 0) {
int l, prev_len = p->cigar[k-1] >> 4;
if (op == 1) {
if (op == MM_CIGAR_INS) {
for (l = 0; l < prev_len; ++l)
if (qseq[qoff - 1 - l] != qseq[qoff + len - 1 - l])
break;
@ -116,9 +116,9 @@ static void mm_fix_cigar(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq,
p->cigar[k-1] -= l<<4, p->cigar[k+1] += l<<4, qoff -= l, toff -= l;
if (l == prev_len) to_shrink = 1;
}
if (op == 1) qoff += len;
if (op == MM_CIGAR_INS) qoff += len;
else toff += len;
} else if (op == 3) {
} else if (op == MM_CIGAR_N_SKIP) {
toff += len;
}
}
@ -128,13 +128,13 @@ static void mm_fix_cigar(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq,
uint32_t l, s[3] = {0,0,0};
for (l = k; l < p->n_cigar; ++l) { // count number of adjacent I and D
uint32_t op = p->cigar[l]&0xf;
if (op == 1 || op == 2 || p->cigar[l]>>4 == 0)
if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL || p->cigar[l]>>4 == 0)
s[op] += p->cigar[l] >> 4;
else break;
}
if (s[1] > 0 && s[2] > 0 && l - k > 2) { // turn to a single I and a single D
p->cigar[k] = s[1]<<4|1;
p->cigar[k+1] = s[2]<<4|2;
p->cigar[k] = s[1]<<4|MM_CIGAR_INS;
p->cigar[k+1] = s[2]<<4|MM_CIGAR_DEL;
for (k += 2; k < l; ++k)
p->cigar[k] &= 0xf;
to_shrink = 1;
@ -154,9 +154,9 @@ static void mm_fix_cigar(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq,
else p->cigar[k+1] += p->cigar[k]>>4<<4; // add length to the next CIGAR operator
p->n_cigar = l;
}
if ((p->cigar[0]&0xf) == 1 || (p->cigar[0]&0xf) == 2) { // get rid of leading I or D
if ((p->cigar[0]&0xf) == MM_CIGAR_INS || (p->cigar[0]&0xf) == MM_CIGAR_DEL) { // get rid of leading I or D
int32_t l = p->cigar[0] >> 4;
if ((p->cigar[0]&0xf) == 1) {
if ((p->cigar[0]&0xf) == MM_CIGAR_INS) {
if (r->rev) r->qe -= l;
else r->qs += l;
*qshift = l;
@ -174,7 +174,7 @@ static void mm_update_cigar_eqx(mm_reg1_t *r, const uint8_t *qseq, const uint8_t
if (r->p == 0) return;
for (k = 0; k < r->p->n_cigar; ++k) {
uint32_t op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4;
if (op == 0) {
if (op == MM_CIGAR_MATCH) {
while (len > 0) {
for (l = 0; l < len && qseq[qoff + l] == tseq[toff + l]; ++l) {} // run of "="; TODO: N<=>N is converted to "="
if (l > 0) { ++n_EQX; len -= l; toff += l; qoff += l; }
@ -183,11 +183,11 @@ static void mm_update_cigar_eqx(mm_reg1_t *r, const uint8_t *qseq, const uint8_t
if (l > 0) { ++n_EQX; len -= l; toff += l; qoff += l; }
}
++n_M;
} else if (op == 1) { // insertion
} else if (op == MM_CIGAR_INS) {
qoff += len;
} else if (op == 2) { // deletion
} else if (op == MM_CIGAR_DEL) {
toff += len;
} else if (op == 3) { // intron
} else if (op == MM_CIGAR_N_SKIP) {
toff += len;
}
}
@ -195,7 +195,7 @@ static void mm_update_cigar_eqx(mm_reg1_t *r, const uint8_t *qseq, const uint8_t
if (n_EQX == n_M) {
for (k = 0; k < r->p->n_cigar; ++k) {
uint32_t op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4;
if (op == 0) r->p->cigar[k] = len << 4 | 7;
if (op == MM_CIGAR_MATCH) r->p->cigar[k] = len << 4 | MM_CIGAR_EQ_MATCH;
}
return;
}
@ -209,25 +209,25 @@ static void mm_update_cigar_eqx(mm_reg1_t *r, const uint8_t *qseq, const uint8_t
toff = qoff = m = 0;
for (k = 0; k < r->p->n_cigar; ++k) {
uint32_t op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4;
if (op == 0) { // match/mismatch
if (op == MM_CIGAR_MATCH) {
while (len > 0) {
// match
for (l = 0; l < len && qseq[qoff + l] == tseq[toff + l]; ++l) {}
if (l > 0) p->cigar[m++] = l << 4 | 7;
if (l > 0) p->cigar[m++] = l << 4 | MM_CIGAR_EQ_MATCH;
len -= l;
toff += l, qoff += l;
// mismatch
for (l = 0; l < len && qseq[qoff + l] != tseq[toff + l]; ++l) {}
if (l > 0) p->cigar[m++] = l << 4 | 8;
if (l > 0) p->cigar[m++] = l << 4 | MM_CIGAR_X_MISMATCH;
len -= l;
toff += l, qoff += l;
}
continue;
} else if (op == 1) { // insertion
} else if (op == MM_CIGAR_INS) {
qoff += len;
} else if (op == 2) { // deletion
} else if (op == MM_CIGAR_DEL) {
toff += len;
} else if (op == 3) { // intron
} else if (op == MM_CIGAR_N_SKIP) {
toff += len;
}
p->cigar[m++] = r->p->cigar[k];
@ -248,7 +248,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts
r->blen = r->mlen = 0;
for (k = 0; k < p->n_cigar; ++k) {
uint32_t op = p->cigar[k]&0xf, len = p->cigar[k]>>4;
if (op == 0) { // match/mismatch
if (op == MM_CIGAR_MATCH) {
int n_ambi = 0, n_diff = 0;
for (l = 0; l < len; ++l) {
int cq = qseq[qoff + l], ct = tseq[toff + l];
@ -260,7 +260,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts
}
r->blen += len - n_ambi, r->mlen += len - (n_ambi + n_diff), p->n_ambi += n_ambi;
toff += len, qoff += len;
} else if (op == 1) { // insertion
} else if (op == MM_CIGAR_INS) {
int n_ambi = 0;
for (l = 0; l < len; ++l)
if (qseq[qoff + l] > 3) ++n_ambi;
@ -268,7 +268,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts
s -= q + e * len;
if (s < 0) s = 0;
qoff += len;
} else if (op == 2) { // deletion
} else if (op == MM_CIGAR_DEL) {
int n_ambi = 0;
for (l = 0; l < len; ++l)
if (tseq[toff + l] > 3) ++n_ambi;
@ -276,7 +276,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts
s -= q + e * len;
if (s < 0) s = 0;
toff += len;
} else if (op == 3) { // intron
} else if (op == MM_CIGAR_N_SKIP) {
toff += len;
}
}
@ -730,7 +730,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
if (qseq[j] >= 4 || tseq[j] >= 4) ez->score += opt->e2;
else ez->score += qseq[j] == tseq[j]? opt->a : -opt->b;
}
ez->cigar = ksw_push_cigar(km, &ez->n_cigar, &ez->m_cigar, ez->cigar, 0, qe - qs);
ez->cigar = ksw_push_cigar(km, &ez->n_cigar, &ez->m_cigar, ez->cigar, MM_CIGAR_MATCH, qe - qs);
} else { // perform normal gapped alignment
mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, junc, mat, bw1, -1, opt->zdrop, extra_flag|KSW_EZ_APPROX_MAX, ez); // first pass: with approximate Z-drop
}

View File

@ -144,8 +144,8 @@ static void write_cs_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq
if (write_tag) mm_sprintf_lite(s, "\tcs:Z:");
for (i = q_off = t_off = 0; i < (int)r->p->n_cigar; ++i) {
int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4;
assert((op >= 0 && op <= 3) || op == 7 || op == 8);
if (op == 0 || op == 7 || op == 8) { // match
assert((op >= MM_CIGAR_MATCH && op <= MM_CIGAR_N_SKIP) || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH);
if (op == MM_CIGAR_MATCH || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH) {
int l_tmp = 0;
for (j = 0; j < len; ++j) {
if (qseq[q_off + j] != tseq[t_off + j]) {
@ -166,12 +166,12 @@ static void write_cs_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq
} else mm_sprintf_lite(s, ":%d", l_tmp);
}
q_off += len, t_off += len;
} else if (op == 1) { // insertion to ref
} else if (op == MM_CIGAR_INS) {
for (j = 0, tmp[len] = 0; j < len; ++j)
tmp[j] = "acgtn"[qseq[q_off + j]];
mm_sprintf_lite(s, "+%s", tmp);
q_off += len;
} else if (op == 2) { // deletion from ref
} else if (op == MM_CIGAR_DEL) {
for (j = 0, tmp[len] = 0; j < len; ++j)
tmp[j] = "acgtn"[tseq[t_off + j]];
mm_sprintf_lite(s, "-%s", tmp);
@ -192,8 +192,8 @@ static void write_MD_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq
if (write_tag) mm_sprintf_lite(s, "\tMD:Z:");
for (i = q_off = t_off = 0; i < (int)r->p->n_cigar; ++i) {
int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4;
assert((op >= 0 && op <= 3) || op == 7 || op == 8);
if (op == 0 || op == 7 || op == 8) { // match
assert((op >= MM_CIGAR_MATCH && op <= MM_CIGAR_N_SKIP) || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH);
if (op == MM_CIGAR_MATCH || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH) {
for (j = 0; j < len; ++j) {
if (qseq[q_off + j] != tseq[t_off + j]) {
mm_sprintf_lite(s, "%d%c", l_MD, "ACGTN"[tseq[t_off + j]]);
@ -201,15 +201,15 @@ static void write_MD_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq
} else ++l_MD;
}
q_off += len, t_off += len;
} else if (op == 1) { // insertion to ref
} else if (op == MM_CIGAR_INS) {
q_off += len;
} else if (op == 2) { // deletion from ref
} else if (op == MM_CIGAR_DEL) {
for (j = 0, tmp[len] = 0; j < len; ++j)
tmp[j] = "ACGTN"[tseq[t_off + j]];
mm_sprintf_lite(s, "%d^%s", l_MD, tmp);
l_MD = 0;
t_off += len;
} else if (op == 3) { // reference skip
} else if (op == MM_CIGAR_N_SKIP) {
t_off += len;
}
}
@ -271,7 +271,7 @@ double mm_event_identity(const mm_reg1_t *r)
if (r->p == 0) return -1.0f;
for (i = 0; i < r->p->n_cigar; ++i) {
int32_t op = r->p->cigar[i] & 0xf, len = r->p->cigar[i] >> 4;
if (op == 1 || op == 2)
if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL)
++n_gapo, n_gap += len;
}
return (double)r->mlen / (r->blen + r->p->n_ambi - n_gap + n_gapo);

19
ksw2.h
View File

@ -16,6 +16,13 @@
#define KSW_EZ_SPLICE_REV 0x200
#define KSW_EZ_SPLICE_FLANK 0x400
// The subset of CIGAR operators used by ksw code.
// Use MM_CIGAR_* from minimap.h if you need the full list.
#define KSW_CIGAR_MATCH 0
#define KSW_CIGAR_INS 1
#define KSW_CIGAR_DEL 2
#define KSW_CIGAR_N_SKIP 3
#ifdef __cplusplus
extern "C" {
#endif
@ -137,13 +144,13 @@ static inline void ksw_backtrack(void *km, int is_rot, int is_rev, int min_intro
else if (!(tmp >> (state + 2) & 1)) state = 0; // if requesting other states, _state_ stays the same if it is a continuation; otherwise, set to H
if (state == 0) state = tmp & 7; // TODO: probably this line can be merged into the "else if" line right above; not 100% sure
if (force_state >= 0) state = force_state;
if (state == 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 0, 1), --i, --j; // match
else if (state == 1 || (state == 3 && min_intron_len <= 0)) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 2, 1), --i; // deletion
else if (state == 3 && min_intron_len > 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 3, 1), --i; // intron
else cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 1, 1), --j; // insertion
if (state == 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_MATCH, 1), --i, --j;
else if (state == 1 || (state == 3 && min_intron_len <= 0)) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_DEL, 1), --i;
else if (state == 3 && min_intron_len > 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_N_SKIP, 1), --i;
else cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_INS, 1), --j;
}
if (i >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, min_intron_len > 0 && i >= min_intron_len? 3 : 2, i + 1); // first deletion
if (j >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 1, j + 1); // first insertion
if (i >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, min_intron_len > 0 && i >= min_intron_len? KSW_CIGAR_N_SKIP : KSW_CIGAR_DEL, i + 1); // first deletion
if (j >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_INS, j + 1); // first insertion
if (!is_rev)
for (i = 0; i < n_cigar>>1; ++i) // reverse CIGAR
tmp = cigar[i], cigar[i] = cigar[n_cigar-1-i], cigar[n_cigar-1-i] = tmp;

View File

@ -46,6 +46,16 @@
#define MM_MAX_SEG 255
#define MM_CIGAR_MATCH 0
#define MM_CIGAR_INS 1
#define MM_CIGAR_DEL 2
#define MM_CIGAR_N_SKIP 3
#define MM_CIGAR_SOFTCLIP 4
#define MM_CIGAR_HARDCLIP 5
#define MM_CIGAR_PADDING 6
#define MM_CIGAR_EQ_MATCH 7
#define MM_CIGAR_X_MISMATCH 8
#define MM_CIGAR_STR "MIDNSHP=XB"
#ifdef __cplusplus