r311: even tighter bw for CIGAR

This commit is contained in:
Heng Li 2013-02-27 23:59:50 -05:00
parent a33b9c0633
commit f3cff1c609
3 changed files with 34 additions and 17 deletions

33
bwa.c
View File

@ -74,7 +74,7 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa
{ {
uint32_t *cigar = 0; uint32_t *cigar = 0;
uint8_t tmp, *rseq; uint8_t tmp, *rseq;
int i, w, max_gap, min_w; int i;
int64_t rlen; int64_t rlen;
*n_cigar = 0; *NM = -1; *n_cigar = 0; *NM = -1;
if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand
@ -86,17 +86,26 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa
for (i = 0; i < rlen>>1; ++i) for (i = 0; i < rlen>>1; ++i)
tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], rseq[rlen - 1 - i] = tmp; tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], rseq[rlen - 1 - i] = tmp;
} }
//printf("[Q] "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n'); if (l_query == re - rb && w_ == 0) { // no gap; no need to do DP
//printf("[R] "); for (i = 0; i < re - rb; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n'); cigar = malloc(4);
// set the band-width cigar[0] = l_query<<4 | 0;
max_gap = (int)((double)(((l_query+1)>>1) * mat[0] - q) / r + 1.); *n_cigar = 1;
max_gap = max_gap > 1? max_gap : 1; for (i = 0, *score = 0; i < l_query; ++i)
w = (max_gap + abs(rlen - l_query) + 1) >> 1; *score += mat[rseq[i]*5 + query[i]];
w = w < w_? w : w_; } else {
min_w = abs(rlen - l_query) + 3; int w, max_gap, min_w;
w = w > min_w? w : min_w; //printf("[Q] "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n');
// NW alignment //printf("[R] "); for (i = 0; i < re - rb; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n');
*score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar); // set the band-width
max_gap = (int)((double)(((l_query+1)>>1) * mat[0] - q) / r + 1.);
max_gap = max_gap > 1? max_gap : 1;
w = (max_gap + abs(rlen - l_query) + 1) >> 1;
w = w < w_? w : w_;
min_w = abs(rlen - l_query) + 3;
w = w > min_w? w : min_w;
// NW alignment
*score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar);
}
{// compute NM {// compute NM
int k, x, y, n_mm = 0, n_gap = 0; int k, x, y, n_mm = 0, n_gap = 0;
for (k = 0, x = y = 0; k < *n_cigar; ++k) { for (k = 0, x = y = 0; k < *n_cigar; ++k) {

View File

@ -526,6 +526,15 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int
* Basic hit->SAM conversion * * Basic hit->SAM conversion *
*****************************/ *****************************/
static inline int infer_bw(int l1, int l2, int score, int a, int q, int r)
{
int w;
if (l1 == l2 && l1 * a - score < (q + r)<<1) return 0; // to get equal alignment length, we need at least two gaps
w = ((double)((l1 < l2? l1 : l2) * a - score - q) / r + 1.);
if (w < abs(l1 - l2)) w = abs(l1 - l2);
return w;
}
void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, const bwahit_t *p_, int is_hard, const bwahit_t *m) void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, const bwahit_t *p_, int is_hard, const bwahit_t *m)
{ {
#define is_mapped(x) ((x)->rb >= 0 && (x)->rb < (x)->re && (x)->re <= bns->l_pac<<1) #define is_mapped(x) ((x)->rb >= 0 && (x)->rb < (x)->re && (x)->re <= bns->l_pac<<1)
@ -552,8 +561,8 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons
int sam_flag = p->flag&0xff; // the flag that will be outputed to SAM; it is not always the same as p->flag int sam_flag = p->flag&0xff; // the flag that will be outputed to SAM; it is not always the same as p->flag
if (p->flag&0x10000) sam_flag |= 0x100; if (p->flag&0x10000) sam_flag |= 0x100;
if (!copy_mate) { if (!copy_mate) {
int w2 = (int)((double)((p->qe - p->qb < p->re - p->rb? p->qe - p->qb : p->re - p->rb) * mat[0] - p->score - q) / r + 1.499); int w2;
w2 = w2 > 1? w2 : 1; w2 = infer_bw(p->qe - p->qb, p->re - p->rb, p->score, mat[0], q, r);
w2 = w2 < w? w2 : w; w2 = w2 < w? w2 : w;
cigar = bwa_gen_cigar(mat, q, r, w2, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar, &NM); cigar = bwa_gen_cigar(mat, q, r, w2, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar, &NM);
p->flag |= n_cigar == 0? 4 : 0; // FIXME: check why this may happen (this has already happened) p->flag |= n_cigar == 0? 4 : 0; // FIXME: check why this may happen (this has already happened)
@ -723,8 +732,7 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *
memset(&a, 0, sizeof(mem_aln_t)); memset(&a, 0, sizeof(mem_aln_t));
a.mapq = mem_approx_mapq_se(opt, ar); a.mapq = mem_approx_mapq_se(opt, ar);
bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re); bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re);
w2 = (int)((double)((qe - qb < re - rb? qe - qb : re - rb) * opt->a - ar->score - opt->q) / opt->r + 1.499); w2 = infer_bw(qe - qb, re - rb, ar->score, opt->a, opt->q, opt->r);
w2 = w2 > 1? w2 : 1;
w2 = w2 < opt->w? w2 : opt->w; w2 = w2 < opt->w? w2 : opt->w;
a.cigar = bwa_gen_cigar(opt->mat, opt->q, opt->r, w2, bns->l_pac, pac, qe - qb, (uint8_t*)&query[qb], rb, re, &score, &a.n_cigar, &NM); a.cigar = bwa_gen_cigar(opt->mat, opt->q, opt->r, w2, bns->l_pac, pac, qe - qb, (uint8_t*)&query[qb], rb, re, &score, &a.n_cigar, &NM);
a.NM = NM; a.NM = NM;

2
main.c
View File

@ -4,7 +4,7 @@
#include "utils.h" #include "utils.h"
#ifndef PACKAGE_VERSION #ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "0.6.2-r309-beta" #define PACKAGE_VERSION "0.6.2-r311-beta"
#endif #endif
static int usage() static int usage()