From efd9769b07114d2d8539e6115277c7e24ce2930f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Mar 2013 00:57:16 -0500 Subject: [PATCH] r324: a little code cleanup The changes after r317 aim to improve the performance and accuracy for very long query alignment. The short-read alignment should not be affected. The changes include: 1) Z-dropoff. This is a variant of blast's X-dropoff. I orginally thought this heuristic only improves speed, but now I realize it also reduces poor alignment with long good flanking alignments. The difference from blast's X-dropoff is that Z-dropoff allows big gaps, but X-dropoff does not. 2) Band width doubling. When band width is too small, we will get a poor alignment in the middle. Sometimes such alignments cannot be fully excluded with Z-dropoff. Band width doubling is an alternative heuristic. It is based on the observation that the existing of close-to-boundary high score possibly implies inadequate band width. When we see such a signal, we double the band width. --- bwamem.c | 26 +++++++++++++++----------- bwamem.h | 1 - fastmap.c | 4 +--- main.c | 2 +- 4 files changed, 17 insertions(+), 16 deletions(-) diff --git a/bwamem.c b/bwamem.c index 20d8bdb..a6897ba 100644 --- a/bwamem.c +++ b/bwamem.c @@ -43,7 +43,7 @@ mem_opt_t *mem_opt_init() mem_opt_t *o; o = calloc(1, sizeof(mem_opt_t)); o->flag = 0; - o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 60; o->max_w = 500; + o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 100; o->zdrop = 100; o->pen_unpaired = 9; o->pen_clip = 5; @@ -434,6 +434,7 @@ void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) // IMPORT #define MEM_SHORT_EXT 50 #define MEM_SHORT_LEN 200 +#define MAX_BAND_TRY 2 int mem_chain2aln_short(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av) { @@ -551,20 +552,22 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int a = kv_pushp(mem_alnreg_t, *av); memset(a, 0, sizeof(mem_alnreg_t)); a->w = aw[0] = aw[1] = opt->w; + a->score = -1; if (s->qbeg) { // left extension uint8_t *rs, *qs; - int qle, tle, gtle, gscore, tmps = -1; + int qle, tle, gtle, gscore; qs = malloc(s->qbeg); for (i = 0; i < s->qbeg; ++i) qs[i] = query[s->qbeg - 1 - i]; tmp = s->rbeg - rmax[0]; rs = malloc(tmp); for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i]; - for (aw[0] = opt->w;; aw[0] <<= 1) { + for (i = 0; i < MAX_BAND_TRY; ++i) { + int prev = a->score; + aw[0] = opt->w << i; a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, aw[0], opt->zdrop, s->len * opt->a, &qle, &tle, >le, &gscore, &max_off[0]); - if (bwa_verbose >= 4) printf("L\t%d < %d; w=%d; max_off=%d\n", tmps, a->score, aw[0], max_off[0]); fflush(stdout); - if (a->score == tmps || aw[0]<<1 > opt->max_w || max_off[0] < (aw[0]>>1) + (aw[0]>>2)) break; - tmps = a->score; + if (bwa_verbose >= 4) printf("L\t%d < %d; w=%d; max_off=%d\n", prev, a->score, aw[0], max_off[0]); fflush(stdout); + if (a->score == prev || max_off[0] < (aw[0]>>1) + (aw[0]>>2)) break; } // check whether we prefer to reach the end of the query if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qb = s->qbeg - qle, a->rb = s->rbeg - tle; // local hits @@ -573,15 +576,16 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int } else a->score = s->len * opt->a, a->qb = 0, a->rb = s->rbeg; if (s->qbeg + s->len != l_query) { // right extension - int qle, tle, qe, re, gtle, gscore, tmps = -1, sc0 = a->score; + int qle, tle, qe, re, gtle, gscore, sc0 = a->score; qe = s->qbeg + s->len; re = s->rbeg + s->len - rmax[0]; assert(re >= 0); - for (aw[1] = opt->w;; aw[1] <<= 1) { + for (i = 0; i < MAX_BAND_TRY; ++i) { + int prev = a->score; + aw[1] = opt->w << i; a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, aw[1], opt->zdrop, sc0, &qle, &tle, >le, &gscore, &max_off[1]); - if (bwa_verbose >= 4) printf("R\t%d < %d; w=%d; max_off=%d\n", tmps, a->score, aw[1], max_off[1]); fflush(stdout); - if (a->score == tmps || aw[1]<<1 > opt->max_w || max_off[1] < (aw[1]>>1) + (aw[1]>>2)) break; - tmps = a->score; + if (bwa_verbose >= 4) printf("R\t%d < %d; w=%d; max_off=%d\n", prev, a->score, aw[1], max_off[1]); fflush(stdout); + if (a->score == prev || max_off[1] < (aw[1]>>1) + (aw[1]>>2)) break; } // similar to the above if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qe = qe + qle, a->re = rmax[0] + re + tle; diff --git a/bwamem.h b/bwamem.h index 7d6e821..96a3308 100644 --- a/bwamem.h +++ b/bwamem.h @@ -22,7 +22,6 @@ typedef struct { int pen_unpaired; // phred-scaled penalty for unpaired reads int pen_clip; // clipping penalty. This score is not deducted from the DP score. int w; // band width - int max_w; // max band width int zdrop; // Z-dropoff int flag; // see MEM_F_* macros diff --git a/fastmap.c b/fastmap.c index eb93994..40c3a02 100644 --- a/fastmap.c +++ b/fastmap.c @@ -26,10 +26,9 @@ int main_mem(int argc, char *argv[]) void *ko = 0, *ko2 = 0; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:W:")) >= 0) { + while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 'w') opt->w = atoi(optarg); - else if (c == 'W') opt->max_w = atoi(optarg); else if (c == 'A') opt->a = atoi(optarg); else if (c == 'B') opt->b = atoi(optarg); else if (c == 'O') opt->q = atoi(optarg); @@ -59,7 +58,6 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); fprintf(stderr, " -k INT minimum seed length [%d]\n", opt->min_seed_len); fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w); - fprintf(stderr, " -W INT max band width [%d]\n", opt->max_w); fprintf(stderr, " -d INT off-diagnal X-dropoff [%d]\n", opt->zdrop); fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); // fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); diff --git a/main.c b/main.c index dd1a481..8b49c8b 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r323-beta" +#define PACKAGE_VERSION "0.7.0-r324-beta" #endif static int usage()