r495: fix impropriate CIGAR
1. Not left aligned 2. In one case, 50M24D50M becomes 24D100M. The leading D needs to be removed. 3. Avoid identical hits after DP
This commit is contained in:
parent
46fa520db9
commit
13b66aad4d
65
align.c
65
align.c
|
|
@ -62,11 +62,68 @@ static int mm_check_zdrop(const uint8_t *qseq, const uint8_t *tseq, uint32_t n_c
|
|||
return 0;
|
||||
}
|
||||
|
||||
static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e)
|
||||
static void mm_fix_cigar(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq, int *qshift, int *tshift)
|
||||
{
|
||||
mm_extra_t *p = r->p;
|
||||
int32_t k, toff = 0, qoff = 0, to_shrink = 0;
|
||||
*qshift = *tshift = 0;
|
||||
if (p->n_cigar <= 1) return;
|
||||
for (k = 0; k < p->n_cigar; ++k) { // indel left alignment
|
||||
uint32_t op = p->cigar[k]&0xf, len = p->cigar[k]>>4;
|
||||
if (len == 0) to_shrink = 1;
|
||||
if (op == 0) {
|
||||
toff += len, qoff += len;
|
||||
} else if (op == 1 || op == 2) { // insertion or deletion
|
||||
if (k > 0 && k < p->n_cigar - 1 && (p->cigar[k-1]&0xf) == 0 && (p->cigar[k+1]&0xf) == 0) {
|
||||
int l, prev_len = p->cigar[k-1] >> 4;
|
||||
if (op == 1) {
|
||||
for (l = 0; l < prev_len; ++l)
|
||||
if (qseq[qoff - 1 - l] != qseq[qoff + len - 1 - l])
|
||||
break;
|
||||
} else {
|
||||
for (l = 0; l < prev_len; ++l)
|
||||
if (tseq[toff - 1 - l] != tseq[toff + len - 1 - l])
|
||||
break;
|
||||
}
|
||||
if (l > 0)
|
||||
p->cigar[k-1] -= l<<4, p->cigar[k+1] += l<<4, qoff -= l, toff -= l;
|
||||
if (l == prev_len) to_shrink = 1;
|
||||
}
|
||||
if (op == 1) qoff += len;
|
||||
else toff += len;
|
||||
} else if (op == 3) {
|
||||
toff += len;
|
||||
}
|
||||
}
|
||||
assert(qoff == r->qe - r->qs && toff == r->re - r->rs);
|
||||
if (to_shrink) { // squeeze out zero-length operations
|
||||
int32_t l = 0;
|
||||
for (k = 0; k < p->n_cigar; ++k) // squeeze out zero-length operations
|
||||
if (p->cigar[k]>>4 != 0)
|
||||
p->cigar[l++] = p->cigar[k];
|
||||
p->n_cigar = l;
|
||||
for (k = l = 0; k < p->n_cigar; ++k) // merge two adjacent operations if they are the same
|
||||
if (k == p->n_cigar - 1 || (p->cigar[k]&0xf) != (p->cigar[k+1]&0xf))
|
||||
p->cigar[l++] = p->cigar[k];
|
||||
p->n_cigar = l;
|
||||
}
|
||||
if ((p->cigar[0]&0xf) == 1 || (p->cigar[0]&0xf) == 2) { // get rid of leading I or D
|
||||
int32_t l = p->cigar[0] >> 4;
|
||||
if ((p->cigar[0]&0xf) == 1) r->qs += l, *qshift = l;
|
||||
else r->rs += l, *tshift = l;
|
||||
--p->n_cigar;
|
||||
memmove(p->cigar, p->cigar + 1, p->n_cigar * 4);
|
||||
}
|
||||
}
|
||||
|
||||
static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e)
|
||||
{
|
||||
uint32_t k, l, toff = 0, qoff = 0;
|
||||
int32_t s = 0, max = 0;
|
||||
int32_t s = 0, max = 0, qshift, tshift;
|
||||
mm_extra_t *p = r->p;
|
||||
if (p == 0) return;
|
||||
mm_fix_cigar(r, qseq, tseq, &qshift, &tshift);
|
||||
qseq += qshift, tseq += tshift; // qseq and tseq may be shifted due to the removal of leading I/D
|
||||
for (k = 0; k < p->n_cigar; ++k) {
|
||||
uint32_t op = p->cigar[k]&0xf, len = p->cigar[k]>>4;
|
||||
if (op == 0) { // match/mismatch
|
||||
|
|
@ -424,7 +481,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
|
|||
assert(re1 - rs1 <= re0 - rs0);
|
||||
if (r->p) {
|
||||
mm_idx_getseq(mi, rid, rs1, re1, tseq);
|
||||
mm_update_extra(r->p, &qseq0[r->rev][qs1], tseq, mat, opt->q, opt->e);
|
||||
mm_update_extra(r, &qseq0[r->rev][qs1], tseq, mat, opt->q, opt->e);
|
||||
if (rev && r->p->trans_strand)
|
||||
r->p->trans_strand ^= 3; // flip to the read strand
|
||||
}
|
||||
|
|
@ -467,7 +524,7 @@ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, i
|
|||
if (ez->n_cigar == 0) goto end_align1_inv; // should never be here
|
||||
mm_append_cigar(r_inv, ez->n_cigar, ez->cigar);
|
||||
r_inv->p->dp_score = ez->max;
|
||||
mm_update_extra(r_inv->p, qseq + q_off, tseq + t_off, mat, opt->q, opt->e);
|
||||
mm_update_extra(r_inv, qseq + q_off, tseq + t_off, mat, opt->q, opt->e);
|
||||
r_inv->id = -1;
|
||||
r_inv->parent = MM_PARENT_UNSET;
|
||||
r_inv->inv = 1;
|
||||
|
|
|
|||
14
hit.c
14
hit.c
|
|
@ -124,7 +124,7 @@ void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r, int sub_diff
|
|||
ri->parent = rp->parent;
|
||||
rp->subsc = rp->subsc > ri->score? rp->subsc : ri->score;
|
||||
if (ri->cnt >= rp->cnt) cnt_sub = 1;
|
||||
if (rp->p && ri->p) {
|
||||
if (rp->p && ri->p && (rp->rs != ri->rs || rp->re != ri->re || ol != min)) { // the last condition excludes identical hits after DP
|
||||
rp->p->dp_max2 = rp->p->dp_max2 > ri->p->dp_max? rp->p->dp_max2 : ri->p->dp_max;
|
||||
if (rp->p->dp_max - ri->p->dp_max <= sub_diff) cnt_sub = 1;
|
||||
}
|
||||
|
|
@ -204,11 +204,15 @@ void mm_select_sub(void *km, float pri_ratio, int min_diff, int best_n, int *n_,
|
|||
{
|
||||
if (pri_ratio > 0.0f && *n_ > 0) {
|
||||
int i, k, n = *n_, n_2nd = 0;
|
||||
for (i = k = 0; i < n; ++i)
|
||||
if (r[i].parent == i) r[k++] = r[i];
|
||||
else if ((r[i].score >= r[r[i].parent].score * pri_ratio || r[i].score + min_diff >= r[r[i].parent].score) && n_2nd++ < best_n)
|
||||
for (i = k = 0; i < n; ++i) {
|
||||
int p = r[i].parent;
|
||||
if (p == i || r[i].inv) { // primary or inversion
|
||||
r[k++] = r[i];
|
||||
else if (r[i].p) free(r[i].p);
|
||||
} else if ((r[i].score >= r[p].score * pri_ratio || r[i].score + min_diff >= r[p].score) && n_2nd < best_n) {
|
||||
if (!(r[i].qs == r[p].qs && r[i].qe == r[p].qe && r[i].rs == r[p].rs && r[i].re == r[p].re)) // not identical hits
|
||||
r[k++] = r[i], ++n_2nd;
|
||||
} else if (r[i].p) free(r[i].p);
|
||||
}
|
||||
if (k != n) mm_sync_regs(km, k, r); // removing hits requires sync()
|
||||
*n_ = k;
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue