From 13b66aad4db30f2924ad3622cb19e3dced746822 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 10 Oct 2017 11:59:44 -0400 Subject: [PATCH] r495: fix impropriate CIGAR 1. Not left aligned 2. In one case, 50M24D50M becomes 24D100M. The leading D needs to be removed. 3. Avoid identical hits after DP --- align.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++---- hit.c | 14 ++++++++----- main.c | 2 +- 3 files changed, 71 insertions(+), 10 deletions(-) diff --git a/align.c b/align.c index f3313ee..dc1c8a1 100644 --- a/align.c +++ b/align.c @@ -62,11 +62,68 @@ static int mm_check_zdrop(const uint8_t *qseq, const uint8_t *tseq, uint32_t n_c return 0; } -static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e) +static void mm_fix_cigar(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq, int *qshift, int *tshift) +{ + mm_extra_t *p = r->p; + int32_t k, toff = 0, qoff = 0, to_shrink = 0; + *qshift = *tshift = 0; + if (p->n_cigar <= 1) return; + for (k = 0; k < p->n_cigar; ++k) { // indel left alignment + uint32_t op = p->cigar[k]&0xf, len = p->cigar[k]>>4; + if (len == 0) to_shrink = 1; + if (op == 0) { + toff += len, qoff += len; + } else if (op == 1 || op == 2) { // insertion or deletion + if (k > 0 && k < p->n_cigar - 1 && (p->cigar[k-1]&0xf) == 0 && (p->cigar[k+1]&0xf) == 0) { + int l, prev_len = p->cigar[k-1] >> 4; + if (op == 1) { + for (l = 0; l < prev_len; ++l) + if (qseq[qoff - 1 - l] != qseq[qoff + len - 1 - l]) + break; + } else { + for (l = 0; l < prev_len; ++l) + if (tseq[toff - 1 - l] != tseq[toff + len - 1 - l]) + break; + } + if (l > 0) + p->cigar[k-1] -= l<<4, p->cigar[k+1] += l<<4, qoff -= l, toff -= l; + if (l == prev_len) to_shrink = 1; + } + if (op == 1) qoff += len; + else toff += len; + } else if (op == 3) { + toff += len; + } + } + assert(qoff == r->qe - r->qs && toff == r->re - r->rs); + if (to_shrink) { // squeeze out zero-length operations + int32_t l = 0; + for (k = 0; k < p->n_cigar; ++k) // squeeze out zero-length operations + if (p->cigar[k]>>4 != 0) + p->cigar[l++] = p->cigar[k]; + p->n_cigar = l; + for (k = l = 0; k < p->n_cigar; ++k) // merge two adjacent operations if they are the same + if (k == p->n_cigar - 1 || (p->cigar[k]&0xf) != (p->cigar[k+1]&0xf)) + p->cigar[l++] = p->cigar[k]; + p->n_cigar = l; + } + if ((p->cigar[0]&0xf) == 1 || (p->cigar[0]&0xf) == 2) { // get rid of leading I or D + int32_t l = p->cigar[0] >> 4; + if ((p->cigar[0]&0xf) == 1) r->qs += l, *qshift = l; + else r->rs += l, *tshift = l; + --p->n_cigar; + memmove(p->cigar, p->cigar + 1, p->n_cigar * 4); + } +} + +static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e) { uint32_t k, l, toff = 0, qoff = 0; - int32_t s = 0, max = 0; + int32_t s = 0, max = 0, qshift, tshift; + mm_extra_t *p = r->p; if (p == 0) return; + mm_fix_cigar(r, qseq, tseq, &qshift, &tshift); + qseq += qshift, tseq += tshift; // qseq and tseq may be shifted due to the removal of leading I/D for (k = 0; k < p->n_cigar; ++k) { uint32_t op = p->cigar[k]&0xf, len = p->cigar[k]>>4; if (op == 0) { // match/mismatch @@ -424,7 +481,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int assert(re1 - rs1 <= re0 - rs0); if (r->p) { mm_idx_getseq(mi, rid, rs1, re1, tseq); - mm_update_extra(r->p, &qseq0[r->rev][qs1], tseq, mat, opt->q, opt->e); + mm_update_extra(r, &qseq0[r->rev][qs1], tseq, mat, opt->q, opt->e); if (rev && r->p->trans_strand) r->p->trans_strand ^= 3; // flip to the read strand } @@ -467,7 +524,7 @@ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, i if (ez->n_cigar == 0) goto end_align1_inv; // should never be here mm_append_cigar(r_inv, ez->n_cigar, ez->cigar); r_inv->p->dp_score = ez->max; - mm_update_extra(r_inv->p, qseq + q_off, tseq + t_off, mat, opt->q, opt->e); + mm_update_extra(r_inv, qseq + q_off, tseq + t_off, mat, opt->q, opt->e); r_inv->id = -1; r_inv->parent = MM_PARENT_UNSET; r_inv->inv = 1; diff --git a/hit.c b/hit.c index 5a608b4..33abc06 100644 --- a/hit.c +++ b/hit.c @@ -124,7 +124,7 @@ void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r, int sub_diff ri->parent = rp->parent; rp->subsc = rp->subsc > ri->score? rp->subsc : ri->score; if (ri->cnt >= rp->cnt) cnt_sub = 1; - if (rp->p && ri->p) { + if (rp->p && ri->p && (rp->rs != ri->rs || rp->re != ri->re || ol != min)) { // the last condition excludes identical hits after DP rp->p->dp_max2 = rp->p->dp_max2 > ri->p->dp_max? rp->p->dp_max2 : ri->p->dp_max; if (rp->p->dp_max - ri->p->dp_max <= sub_diff) cnt_sub = 1; } @@ -204,11 +204,15 @@ void mm_select_sub(void *km, float pri_ratio, int min_diff, int best_n, int *n_, { if (pri_ratio > 0.0f && *n_ > 0) { int i, k, n = *n_, n_2nd = 0; - for (i = k = 0; i < n; ++i) - if (r[i].parent == i) r[k++] = r[i]; - else if ((r[i].score >= r[r[i].parent].score * pri_ratio || r[i].score + min_diff >= r[r[i].parent].score) && n_2nd++ < best_n) + for (i = k = 0; i < n; ++i) { + int p = r[i].parent; + if (p == i || r[i].inv) { // primary or inversion r[k++] = r[i]; - else if (r[i].p) free(r[i].p); + } else if ((r[i].score >= r[p].score * pri_ratio || r[i].score + min_diff >= r[p].score) && n_2nd < best_n) { + if (!(r[i].qs == r[p].qs && r[i].qe == r[p].qe && r[i].rs == r[p].rs && r[i].re == r[p].re)) // not identical hits + r[k++] = r[i], ++n_2nd; + } else if (r[i].p) free(r[i].p); + } if (k != n) mm_sync_regs(km, k, r); // removing hits requires sync() *n_ = k; } diff --git a/main.c b/main.c index 8248f44..829884d 100644 --- a/main.c +++ b/main.c @@ -6,7 +6,7 @@ #include "mmpriv.h" #include "getopt.h" -#define MM_VERSION "2.2-r494-dirty" +#define MM_VERSION "2.2-r495-dirty" #ifdef __linux__ #include