From f3cff1c609903b71d56a0d1fe94361e95502b140 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 27 Feb 2013 23:59:50 -0500 Subject: [PATCH] r311: even tighter bw for CIGAR --- bwa.c | 33 +++++++++++++++++++++------------ bwamem.c | 16 ++++++++++++---- main.c | 2 +- 3 files changed, 34 insertions(+), 17 deletions(-) diff --git a/bwa.c b/bwa.c index 3e2f30e..beea6d1 100644 --- a/bwa.c +++ b/bwa.c @@ -74,7 +74,7 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa { uint32_t *cigar = 0; uint8_t tmp, *rseq; - int i, w, max_gap, min_w; + int i; int64_t rlen; *n_cigar = 0; *NM = -1; if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand @@ -86,17 +86,26 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa for (i = 0; i < rlen>>1; ++i) tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], rseq[rlen - 1 - i] = tmp; } - //printf("[Q] "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n'); - //printf("[R] "); for (i = 0; i < re - rb; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n'); - // set the band-width - max_gap = (int)((double)(((l_query+1)>>1) * mat[0] - q) / r + 1.); - max_gap = max_gap > 1? max_gap : 1; - w = (max_gap + abs(rlen - l_query) + 1) >> 1; - w = w < w_? w : w_; - min_w = abs(rlen - l_query) + 3; - w = w > min_w? w : min_w; - // NW alignment - *score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar); + if (l_query == re - rb && w_ == 0) { // no gap; no need to do DP + cigar = malloc(4); + cigar[0] = l_query<<4 | 0; + *n_cigar = 1; + for (i = 0, *score = 0; i < l_query; ++i) + *score += mat[rseq[i]*5 + query[i]]; + } else { + int w, max_gap, min_w; + //printf("[Q] "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n'); + //printf("[R] "); for (i = 0; i < re - rb; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n'); + // set the band-width + max_gap = (int)((double)(((l_query+1)>>1) * mat[0] - q) / r + 1.); + max_gap = max_gap > 1? max_gap : 1; + w = (max_gap + abs(rlen - l_query) + 1) >> 1; + w = w < w_? w : w_; + min_w = abs(rlen - l_query) + 3; + w = w > min_w? w : min_w; + // NW alignment + *score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar); + } {// compute NM int k, x, y, n_mm = 0, n_gap = 0; for (k = 0, x = y = 0; k < *n_cigar; ++k) { diff --git a/bwamem.c b/bwamem.c index 37e65e2..52dc7fb 100644 --- a/bwamem.c +++ b/bwamem.c @@ -526,6 +526,15 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int * Basic hit->SAM conversion * *****************************/ +static inline int infer_bw(int l1, int l2, int score, int a, int q, int r) +{ + int w; + if (l1 == l2 && l1 * a - score < (q + r)<<1) return 0; // to get equal alignment length, we need at least two gaps + w = ((double)((l1 < l2? l1 : l2) * a - score - q) / r + 1.); + if (w < abs(l1 - l2)) w = abs(l1 - l2); + return w; +} + void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, const bwahit_t *p_, int is_hard, const bwahit_t *m) { #define is_mapped(x) ((x)->rb >= 0 && (x)->rb < (x)->re && (x)->re <= bns->l_pac<<1) @@ -552,8 +561,8 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons int sam_flag = p->flag&0xff; // the flag that will be outputed to SAM; it is not always the same as p->flag if (p->flag&0x10000) sam_flag |= 0x100; if (!copy_mate) { - int w2 = (int)((double)((p->qe - p->qb < p->re - p->rb? p->qe - p->qb : p->re - p->rb) * mat[0] - p->score - q) / r + 1.499); - w2 = w2 > 1? w2 : 1; + int w2; + w2 = infer_bw(p->qe - p->qb, p->re - p->rb, p->score, mat[0], q, r); w2 = w2 < w? w2 : w; cigar = bwa_gen_cigar(mat, q, r, w2, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar, &NM); p->flag |= n_cigar == 0? 4 : 0; // FIXME: check why this may happen (this has already happened) @@ -723,8 +732,7 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * memset(&a, 0, sizeof(mem_aln_t)); a.mapq = mem_approx_mapq_se(opt, ar); bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re); - w2 = (int)((double)((qe - qb < re - rb? qe - qb : re - rb) * opt->a - ar->score - opt->q) / opt->r + 1.499); - w2 = w2 > 1? w2 : 1; + w2 = infer_bw(qe - qb, re - rb, ar->score, opt->a, opt->q, opt->r); w2 = w2 < opt->w? w2 : opt->w; a.cigar = bwa_gen_cigar(opt->mat, opt->q, opt->r, w2, bns->l_pac, pac, qe - qb, (uint8_t*)&query[qb], rb, re, &score, &a.n_cigar, &NM); a.NM = NM; diff --git a/main.c b/main.c index d301bb4..bc40374 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r309-beta" +#define PACKAGE_VERSION "0.6.2-r311-beta" #endif static int usage()