r306: introduce clipping penalty
More clipping leads to more severe reference bias. We should not clip the alignment unless necessary.
This commit is contained in:
parent
b7791105bc
commit
4bb0bdddca
9
bwa.1
9
bwa.1
|
|
@ -93,6 +93,8 @@ genome.
|
|||
.IR gapOpenPen ]
|
||||
.RB [ -E
|
||||
.IR gapExtPen ]
|
||||
.RB [ -L
|
||||
.IR clipPen ]
|
||||
.RB [ -U
|
||||
.IR unpairPen ]
|
||||
.RB [ -R
|
||||
|
|
@ -190,6 +192,13 @@ Gap extension penalty. A gap of length k costs O + k*E (i.e.
|
|||
.B -O
|
||||
is for opening a zero-length gap). [1]
|
||||
.TP
|
||||
.BI -L \ INT
|
||||
Clipping penalty. When performing SW extension, BWA-MEM keeps track of the best
|
||||
score reaching the end of the query. If this score is larger than the best SW score
|
||||
minus the clipping penalty, clipping will not be applied. Note that in this
|
||||
case, the SAM AS tag reports the best SW score; the clipping penalty is not
|
||||
deducted. [5]
|
||||
.TP
|
||||
.BI -U \ INT
|
||||
Penalty for an unpaired read pair. BWA-MEM scores an unpaired read pair as
|
||||
.RI scoreRead1+scoreRead2- INT
|
||||
|
|
|
|||
21
bwamem.c
21
bwamem.c
|
|
@ -42,8 +42,10 @@ mem_opt_t *mem_opt_init()
|
|||
{
|
||||
mem_opt_t *o;
|
||||
o = calloc(1, sizeof(mem_opt_t));
|
||||
o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 100;
|
||||
o->flag = 0;
|
||||
o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 100;
|
||||
o->pen_unpaired = 9;
|
||||
o->pen_clip = 5;
|
||||
o->min_seed_len = 19;
|
||||
o->split_width = 10;
|
||||
o->max_occ = 10000;
|
||||
|
|
@ -54,7 +56,6 @@ mem_opt_t *mem_opt_init()
|
|||
o->split_factor = 1.5;
|
||||
o->chunk_size = 10000000;
|
||||
o->n_threads = 1;
|
||||
o->pen_unpaired = 9;
|
||||
o->max_matesw = 100;
|
||||
mem_fill_scmat(o->a, o->b, o->mat);
|
||||
return o;
|
||||
|
|
@ -487,23 +488,27 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int
|
|||
|
||||
if (s->qbeg) { // left extension
|
||||
uint8_t *rs, *qs;
|
||||
int qle, tle;
|
||||
int qle, tle, gtle, gscore;
|
||||
qs = malloc(s->qbeg);
|
||||
for (i = 0; i < s->qbeg; ++i) qs[i] = query[s->qbeg - 1 - i];
|
||||
tmp = s->rbeg - rmax[0];
|
||||
rs = malloc(tmp);
|
||||
for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i];
|
||||
a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, s->len * opt->a, &qle, &tle);
|
||||
a->qb = s->qbeg - qle; a->rb = s->rbeg - tle;
|
||||
a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, s->len * opt->a, &qle, &tle, &gtle, &gscore);
|
||||
// check whether we prefer to reach the end of the query
|
||||
if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qb = s->qbeg - qle, a->rb = s->rbeg - tle; // local hits
|
||||
else a->qb = 0, a->rb = s->rbeg - gtle; // reach the end
|
||||
free(qs); free(rs);
|
||||
} else a->score = s->len * opt->a, a->qb = 0, a->rb = s->rbeg;
|
||||
|
||||
if (s->qbeg + s->len != l_query) { // right extension
|
||||
int qle, tle, qe, re;
|
||||
int qle, tle, qe, re, gtle, gscore;
|
||||
qe = s->qbeg + s->len;
|
||||
re = s->rbeg + s->len - rmax[0];
|
||||
a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, &qle, &tle);
|
||||
a->qe = qe + qle; a->re = rmax[0] + re + tle;
|
||||
a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, &qle, &tle, &gtle, &gscore);
|
||||
// similar to the above
|
||||
if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qe = qe + qle, a->re = rmax[0] + re + tle;
|
||||
else a->qe = l_query, a->re = rmax[0] + re + gtle;
|
||||
} else a->qe = l_query, a->re = s->rbeg + s->len;
|
||||
if (bwa_verbose >= 4) printf("[%d] score=%d\t[%d,%d) <=> [%ld,%ld)\n", k, a->score, a->qb, a->qe, (long)a->rb, (long)a->re);
|
||||
|
||||
|
|
|
|||
4
bwamem.h
4
bwamem.h
|
|
@ -19,7 +19,10 @@ typedef struct __smem_i smem_i;
|
|||
|
||||
typedef struct {
|
||||
int a, b, q, r; // match score, mismatch penalty and gap open/extension penalty. A gap of size k costs q+k*r
|
||||
int pen_unpaired; // phred-scaled penalty for unpaired reads
|
||||
int pen_clip; // clipping penalty. This score is not deducted from the DP score.
|
||||
int w; // band width
|
||||
|
||||
int flag; // see MEM_F_* macros
|
||||
int min_seed_len; // minimum seed length
|
||||
float split_factor; // split into a seed if MEM is longer than min_seed_len*split_factor
|
||||
|
|
@ -30,7 +33,6 @@ typedef struct {
|
|||
int chunk_size; // process chunk_size-bp sequences in a batch
|
||||
float mask_level; // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits
|
||||
float chain_drop_ratio; // drop a chain if its seed coverage is below chain_drop_ratio times the seed coverage of a better chain overlapping with the small chain
|
||||
int pen_unpaired; // phred-scaled penalty for unpaired reads
|
||||
int max_ins; // when estimating insert size distribution, skip pairs with insert longer than this value
|
||||
int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end
|
||||
int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset
|
||||
|
|
|
|||
|
|
@ -26,13 +26,14 @@ int main_mem(int argc, char *argv[])
|
|||
void *ko = 0, *ko2 = 0;
|
||||
|
||||
opt = mem_opt_init();
|
||||
while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:")) >= 0) {
|
||||
while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:")) >= 0) {
|
||||
if (c == 'k') opt->min_seed_len = atoi(optarg);
|
||||
else if (c == 'w') opt->w = atoi(optarg);
|
||||
else if (c == 'A') opt->a = atoi(optarg);
|
||||
else if (c == 'B') opt->b = atoi(optarg);
|
||||
else if (c == 'O') opt->q = atoi(optarg);
|
||||
else if (c == 'E') opt->r = atoi(optarg);
|
||||
else if (c == 'L') opt->pen_clip = atoi(optarg);
|
||||
else if (c == 'U') opt->pen_unpaired = atoi(optarg);
|
||||
else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1;
|
||||
else if (c == 'P') opt->flag |= MEM_F_NOPAIRING;
|
||||
|
|
@ -64,6 +65,7 @@ int main_mem(int argc, char *argv[])
|
|||
fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b);
|
||||
fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->q);
|
||||
fprintf(stderr, " -E INT gap extension penalty; a gap of size k cost {-O} + {-E}*k [%d]\n", opt->r);
|
||||
fprintf(stderr, " -L INT penalty for clipping [%d]\n", opt->pen_clip);
|
||||
fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n", opt->pen_unpaired);
|
||||
fprintf(stderr, "\nInput/output options:\n\n");
|
||||
fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n");
|
||||
|
|
|
|||
12
ksw.c
12
ksw.c
|
|
@ -359,11 +359,11 @@ typedef struct {
|
|||
int32_t h, e;
|
||||
} eh_t;
|
||||
|
||||
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle)
|
||||
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore)
|
||||
{
|
||||
eh_t *eh; // score array
|
||||
int8_t *qp; // query profile
|
||||
int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap;
|
||||
int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap, max_ie, gscore;
|
||||
if (h0 < 0) h0 = 0;
|
||||
// allocate memory
|
||||
qp = malloc(qlen * m);
|
||||
|
|
@ -385,7 +385,7 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
|
|||
max_gap = max_gap > 1? max_gap : 1;
|
||||
w = w < max_gap? w : max_gap;
|
||||
// DP loop
|
||||
max = h0, max_i = max_j = -1;
|
||||
max = h0, max_i = max_j = -1; max_ie = -1, gscore = -1;
|
||||
beg = 0, end = qlen;
|
||||
for (i = 0; LIKELY(i < tlen); ++i) {
|
||||
int f = 0, h1, m = 0, mj = -1;
|
||||
|
|
@ -421,6 +421,10 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
|
|||
f = f > h? f : h; // computed F(i,j+1)
|
||||
}
|
||||
eh[end].h = h1; eh[end].e = 0;
|
||||
if (j == qlen) {
|
||||
max_ie = gscore > h1? max_ie : i;
|
||||
gscore = gscore > h1? gscore : h1;
|
||||
}
|
||||
if (m == 0) break;
|
||||
if (m > max) max = m, max_i = i, max_j = mj;
|
||||
// update beg and end for the next round
|
||||
|
|
@ -433,6 +437,8 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
|
|||
free(eh); free(qp);
|
||||
if (_qle) *_qle = max_j + 1;
|
||||
if (_tle) *_tle = max_i + 1;
|
||||
if (_gtle) *_gtle = max_ie + 1;
|
||||
if (_gscore) *_gscore = gscore;
|
||||
return max;
|
||||
}
|
||||
|
||||
|
|
|
|||
2
ksw.h
2
ksw.h
|
|
@ -62,7 +62,7 @@ extern "C" {
|
|||
*/
|
||||
kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry);
|
||||
|
||||
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle);
|
||||
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore);
|
||||
int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *_n_cigar, uint32_t **_cigar);
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
|
|
|||
Loading…
Reference in New Issue