r324: a little code cleanup
The changes after r317 aim to improve the performance and accuracy for very long query alignment. The short-read alignment should not be affected. The changes include: 1) Z-dropoff. This is a variant of blast's X-dropoff. I orginally thought this heuristic only improves speed, but now I realize it also reduces poor alignment with long good flanking alignments. The difference from blast's X-dropoff is that Z-dropoff allows big gaps, but X-dropoff does not. 2) Band width doubling. When band width is too small, we will get a poor alignment in the middle. Sometimes such alignments cannot be fully excluded with Z-dropoff. Band width doubling is an alternative heuristic. It is based on the observation that the existing of close-to-boundary high score possibly implies inadequate band width. When we see such a signal, we double the band width.
This commit is contained in:
parent
e0991d6a45
commit
efd9769b07
26
bwamem.c
26
bwamem.c
|
|
@ -43,7 +43,7 @@ mem_opt_t *mem_opt_init()
|
|||
mem_opt_t *o;
|
||||
o = calloc(1, sizeof(mem_opt_t));
|
||||
o->flag = 0;
|
||||
o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 60; o->max_w = 500;
|
||||
o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 100;
|
||||
o->zdrop = 100;
|
||||
o->pen_unpaired = 9;
|
||||
o->pen_clip = 5;
|
||||
|
|
@ -434,6 +434,7 @@ void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) // IMPORT
|
|||
|
||||
#define MEM_SHORT_EXT 50
|
||||
#define MEM_SHORT_LEN 200
|
||||
#define MAX_BAND_TRY 2
|
||||
|
||||
int mem_chain2aln_short(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av)
|
||||
{
|
||||
|
|
@ -551,20 +552,22 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int
|
|||
a = kv_pushp(mem_alnreg_t, *av);
|
||||
memset(a, 0, sizeof(mem_alnreg_t));
|
||||
a->w = aw[0] = aw[1] = opt->w;
|
||||
a->score = -1;
|
||||
|
||||
if (s->qbeg) { // left extension
|
||||
uint8_t *rs, *qs;
|
||||
int qle, tle, gtle, gscore, tmps = -1;
|
||||
int qle, tle, gtle, gscore;
|
||||
qs = malloc(s->qbeg);
|
||||
for (i = 0; i < s->qbeg; ++i) qs[i] = query[s->qbeg - 1 - i];
|
||||
tmp = s->rbeg - rmax[0];
|
||||
rs = malloc(tmp);
|
||||
for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i];
|
||||
for (aw[0] = opt->w;; aw[0] <<= 1) {
|
||||
for (i = 0; i < MAX_BAND_TRY; ++i) {
|
||||
int prev = a->score;
|
||||
aw[0] = opt->w << i;
|
||||
a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, aw[0], opt->zdrop, s->len * opt->a, &qle, &tle, >le, &gscore, &max_off[0]);
|
||||
if (bwa_verbose >= 4) printf("L\t%d < %d; w=%d; max_off=%d\n", tmps, a->score, aw[0], max_off[0]); fflush(stdout);
|
||||
if (a->score == tmps || aw[0]<<1 > opt->max_w || max_off[0] < (aw[0]>>1) + (aw[0]>>2)) break;
|
||||
tmps = a->score;
|
||||
if (bwa_verbose >= 4) printf("L\t%d < %d; w=%d; max_off=%d\n", prev, a->score, aw[0], max_off[0]); fflush(stdout);
|
||||
if (a->score == prev || max_off[0] < (aw[0]>>1) + (aw[0]>>2)) break;
|
||||
}
|
||||
// check whether we prefer to reach the end of the query
|
||||
if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qb = s->qbeg - qle, a->rb = s->rbeg - tle; // local hits
|
||||
|
|
@ -573,15 +576,16 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int
|
|||
} else a->score = s->len * opt->a, a->qb = 0, a->rb = s->rbeg;
|
||||
|
||||
if (s->qbeg + s->len != l_query) { // right extension
|
||||
int qle, tle, qe, re, gtle, gscore, tmps = -1, sc0 = a->score;
|
||||
int qle, tle, qe, re, gtle, gscore, sc0 = a->score;
|
||||
qe = s->qbeg + s->len;
|
||||
re = s->rbeg + s->len - rmax[0];
|
||||
assert(re >= 0);
|
||||
for (aw[1] = opt->w;; aw[1] <<= 1) {
|
||||
for (i = 0; i < MAX_BAND_TRY; ++i) {
|
||||
int prev = a->score;
|
||||
aw[1] = opt->w << i;
|
||||
a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, aw[1], opt->zdrop, sc0, &qle, &tle, >le, &gscore, &max_off[1]);
|
||||
if (bwa_verbose >= 4) printf("R\t%d < %d; w=%d; max_off=%d\n", tmps, a->score, aw[1], max_off[1]); fflush(stdout);
|
||||
if (a->score == tmps || aw[1]<<1 > opt->max_w || max_off[1] < (aw[1]>>1) + (aw[1]>>2)) break;
|
||||
tmps = a->score;
|
||||
if (bwa_verbose >= 4) printf("R\t%d < %d; w=%d; max_off=%d\n", prev, a->score, aw[1], max_off[1]); fflush(stdout);
|
||||
if (a->score == prev || max_off[1] < (aw[1]>>1) + (aw[1]>>2)) break;
|
||||
}
|
||||
// similar to the above
|
||||
if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qe = qe + qle, a->re = rmax[0] + re + tle;
|
||||
|
|
|
|||
1
bwamem.h
1
bwamem.h
|
|
@ -22,7 +22,6 @@ typedef struct {
|
|||
int pen_unpaired; // phred-scaled penalty for unpaired reads
|
||||
int pen_clip; // clipping penalty. This score is not deducted from the DP score.
|
||||
int w; // band width
|
||||
int max_w; // max band width
|
||||
int zdrop; // Z-dropoff
|
||||
|
||||
int flag; // see MEM_F_* macros
|
||||
|
|
|
|||
|
|
@ -26,10 +26,9 @@ int main_mem(int argc, char *argv[])
|
|||
void *ko = 0, *ko2 = 0;
|
||||
|
||||
opt = mem_opt_init();
|
||||
while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:W:")) >= 0) {
|
||||
while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:")) >= 0) {
|
||||
if (c == 'k') opt->min_seed_len = atoi(optarg);
|
||||
else if (c == 'w') opt->w = atoi(optarg);
|
||||
else if (c == 'W') opt->max_w = atoi(optarg);
|
||||
else if (c == 'A') opt->a = atoi(optarg);
|
||||
else if (c == 'B') opt->b = atoi(optarg);
|
||||
else if (c == 'O') opt->q = atoi(optarg);
|
||||
|
|
@ -59,7 +58,6 @@ int main_mem(int argc, char *argv[])
|
|||
fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads);
|
||||
fprintf(stderr, " -k INT minimum seed length [%d]\n", opt->min_seed_len);
|
||||
fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w);
|
||||
fprintf(stderr, " -W INT max band width [%d]\n", opt->max_w);
|
||||
fprintf(stderr, " -d INT off-diagnal X-dropoff [%d]\n", opt->zdrop);
|
||||
fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor);
|
||||
// fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width);
|
||||
|
|
|
|||
Loading…
Reference in New Issue