From 6476343a8310ea4886ac2e35702818955bde1c81 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Mar 2013 19:56:37 -0500 Subject: [PATCH] r331: rewrote CIGAR generation for bwa-short When backtracking, bwa-short does not keep the detailed alignment or the exact start and end positions. To find the boundary and the CIGAR, the old code does a global alignment with a small end-gap penalty. It then deals with a lot of special cases to derive the right position and CIGAR, which are actually not always right. It is a mess. As the new ksw.{c,h} does not support a different end-gap penalty, the old strategy does not work. But we get something better. The new code finds the boundaries with ksw_extend(). It is cleaner and gives more accurate CIGAR in most cases. --- bwase.c | 89 ++++++++++++++++++++++++++++----------------------------- main.c | 2 +- 2 files changed, 44 insertions(+), 47 deletions(-) diff --git a/bwase.c b/bwase.c index eebe22b..83e4baa 100644 --- a/bwase.c +++ b/bwase.c @@ -156,59 +156,58 @@ void bwa_cal_pac_pos(const bntseq_t *bns, const char *prefix, int n_seqs, bwa_se bwt_destroy(bwt); } -/* is_end_correct == 1 if (*pos+len) gives the correct coordinate on - * forward strand. This happens when p->pos is calculated by - * bwa_cal_pac_pos(). is_end_correct==0 if (*pos) gives the correct - * coordinate. This happens only for color-converted alignment. */ -bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, bwtint_t *_pos, - int ext, int *n_cigar, int is_end_correct) +#define SW_BW 50 + +bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, ubyte_t *seq, bwtint_t *_pos, int *n_cigar, int is_rev) { bwa_cigar_t *cigar = 0; uint32_t *cigar32 = 0; - ubyte_t *ref_seq; - int l = 0, ref_len; - int64_t k, __pos = *_pos; + ubyte_t *rseq; + int tle, qle, gtle, gscore, lscore; + int64_t k, rb, re, rlen; int8_t mat[25]; bwa_fill_scmat(1, 3, mat); - ref_len = len + abs(ext); - if (ext > 0) { - ref_seq = (ubyte_t*)calloc(ref_len, 1); - for (k = __pos; k < __pos + ref_len && k < l_pac; ++k) - ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3; - } else { - int64_t x = __pos + (is_end_correct? len : ref_len); - ref_seq = (ubyte_t*)calloc(ref_len, 1); - for (l = 0, k = x - ref_len > 0? x - ref_len : 0; k < x && k < l_pac; ++k) - ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3; + if (!is_rev) { // forward strand, the end position is correct + re = *_pos + len; + if (re > l_pac) re = l_pac; + rb = re - (len + SW_BW); + if (rb < 0) rb = 0; + rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen); + seq_reverse(len, seq, 0); // as we need to do left extension, we have to reverse both query and reference sequences + seq_reverse(rlen, rseq, 0); + lscore = ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, -1, len<<1, &qle, &tle, >le, &gscore, 0); + if (gscore > 0) tle = gtle, qle = len; + rb = re - tle; rlen = tle; + seq_reverse(len, seq, 0); + seq_reverse(rlen, rseq, 0); + ksw_global(qle, &seq[len-qle], rlen, rseq, 5, mat, 5, 1, SW_BW, n_cigar, &cigar32); + if (qle < len) { // write soft clip + cigar = realloc(cigar, (*n_cigar + 1) * 4); + memmove(cigar + 1, cigar, *n_cigar * 4); + cigar[0] = (len - qle)<<4 | FROM_S; + ++(*n_cigar); + } + } else { // reverse strand, the start position is correct + rb = *_pos; re = rb + len + SW_BW; + if (re > l_pac) re = l_pac; + rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen); + lscore = ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, -1, len<<1, &qle, &tle, >le, &gscore, 0); + if (gscore > 0) tle = gtle, qle = len; + re = rb + tle; rlen = tle; + ksw_global(qle, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, n_cigar, &cigar32); // right extension + if (qle < len) { + cigar = realloc(cigar, (*n_cigar + 1) * 4); + cigar[*n_cigar - 1] = (len - qle)<<4 | FROM_S; + ++(*n_cigar); + } } + *_pos = rb; - ksw_global(len, seq, l, ref_seq, 5, mat, 5, 1, 50, n_cigar, &cigar32); cigar = (bwa_cigar_t*)cigar32; for (k = 0; k < *n_cigar; ++k) cigar[k] = __cigar_create((cigar32[k]&0xf), (cigar32[k]>>4)); - - if (ext < 0 && is_end_correct) { // fix coordinate for reads mapped to the forward strand - for (l = k = 0; k < *n_cigar; ++k) { - if (__cigar_op(cigar[k]) == FROM_D) l -= __cigar_len(cigar[k]); - else if (__cigar_op(cigar[k]) == FROM_I) l += __cigar_len(cigar[k]); - } - __pos += l; - } - - if (__cigar_op(cigar[0]) == FROM_D) { // deletion at the 5'-end - __pos += __cigar_len(cigar[0]); - for (k = 0; k < *n_cigar - 1; ++k) cigar[k] = cigar[k+1]; - --(*n_cigar); - } - if (__cigar_op(cigar[*n_cigar-1]) == FROM_D) --(*n_cigar); // deletion at the 3'-end - - // change "I" at either end of the read to S. just in case. This should rarely happen... - if (__cigar_op(cigar[*n_cigar-1]) == FROM_I) cigar[*n_cigar-1] = __cigar_create(FROM_S, (__cigar_len(cigar[*n_cigar-1]))); - if (__cigar_op(cigar[0]) == FROM_I) cigar[0] = __cigar_create(FROM_S, (__cigar_len(cigar[0]))); - - *_pos = (bwtint_t)__pos; - free(ref_seq); + free(rseq); return cigar; } @@ -316,13 +315,11 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t bwt_multi1_t *q = s->multi + j; int n_cigar; if (q->gap == 0) continue; - q->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, q->strand? s->rseq : s->seq, &q->pos, - (q->strand? 1 : -1) * q->gap, &n_cigar, 1); + q->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, q->strand? s->rseq : s->seq, &q->pos, &n_cigar, q->strand); q->n_cigar = n_cigar; } if (s->type == BWA_TYPE_NO_MATCH || s->type == BWA_TYPE_MATESW || s->n_gapo == 0) continue; - s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, &s->pos, - (s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 1); + s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, &s->pos, &s->n_cigar, s->strand); } // generate MD tag str = (kstring_t*)calloc(1, sizeof(kstring_t)); diff --git a/main.c b/main.c index 00a21b9..0954e01 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r329-beta" +#define PACKAGE_VERSION "0.7.0-r331-beta" #endif static int usage()