r306: introduce clipping penalty

More clipping leads to more severe reference bias. We should not clip the
alignment unless necessary.
This commit is contained in:
Heng Li 2013-02-27 21:13:39 -05:00
parent b7791105bc
commit 4bb0bdddca
7 changed files with 39 additions and 15 deletions

9
bwa.1
View File

@ -93,6 +93,8 @@ genome.
.IR gapOpenPen ]
.RB [ -E
.IR gapExtPen ]
.RB [ -L
.IR clipPen ]
.RB [ -U
.IR unpairPen ]
.RB [ -R
@ -190,6 +192,13 @@ Gap extension penalty. A gap of length k costs O + k*E (i.e.
.B -O
is for opening a zero-length gap). [1]
.TP
.BI -L \ INT
Clipping penalty. When performing SW extension, BWA-MEM keeps track of the best
score reaching the end of query. If this score is larger than the best SW score
minus the clipping penalty, clipping will not be applied. Note that in this
case, the SAM AS tag reports the best SW score; clipping penalty is not
deducted. [5]
.TP
.BI -U \ INT
Penalty for an unpaired read pair. BWA-MEM scores an unpaired read pair as
.RI scoreRead1+scoreRead2- INT

View File

@ -42,8 +42,10 @@ mem_opt_t *mem_opt_init()
{
mem_opt_t *o;
o = calloc(1, sizeof(mem_opt_t));
o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 100;
o->flag = 0;
o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 100;
o->pen_unpaired = 9;
o->pen_clip = 5;
o->min_seed_len = 19;
o->split_width = 10;
o->max_occ = 10000;
@ -54,7 +56,6 @@ mem_opt_t *mem_opt_init()
o->split_factor = 1.5;
o->chunk_size = 10000000;
o->n_threads = 1;
o->pen_unpaired = 9;
o->max_matesw = 100;
mem_fill_scmat(o->a, o->b, o->mat);
return o;
@ -487,23 +488,27 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int
if (s->qbeg) { // left extension
uint8_t *rs, *qs;
int qle, tle;
int qle, tle, gtle, gscore;
qs = malloc(s->qbeg);
for (i = 0; i < s->qbeg; ++i) qs[i] = query[s->qbeg - 1 - i];
tmp = s->rbeg - rmax[0];
rs = malloc(tmp);
for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i];
a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, s->len * opt->a, &qle, &tle);
a->qb = s->qbeg - qle; a->rb = s->rbeg - tle;
a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, s->len * opt->a, &qle, &tle, &gtle, &gscore);
// check whether we prefer to reach the end of the query
if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qb = s->qbeg - qle, a->rb = s->rbeg - tle; // local hits
else a->qb = 0, a->rb = s->rbeg - gtle; // reach the end
free(qs); free(rs);
} else a->score = s->len * opt->a, a->qb = 0, a->rb = s->rbeg;
if (s->qbeg + s->len != l_query) { // right extension
int qle, tle, qe, re;
int qle, tle, qe, re, gtle, gscore;
qe = s->qbeg + s->len;
re = s->rbeg + s->len - rmax[0];
a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, &qle, &tle);
a->qe = qe + qle; a->re = rmax[0] + re + tle;
a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, &qle, &tle, &gtle, &gscore);
// similar to the above
if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qe = qe + qle, a->re = rmax[0] + re + tle;
else a->qe = l_query, a->re = rmax[0] + re + gtle;
} else a->qe = l_query, a->re = s->rbeg + s->len;
if (bwa_verbose >= 4) printf("[%d] score=%d\t[%d,%d) <=> [%ld,%ld)\n", k, a->score, a->qb, a->qe, (long)a->rb, (long)a->re);

View File

@ -19,7 +19,10 @@ typedef struct __smem_i smem_i;
typedef struct {
int a, b, q, r; // match score, mismatch penalty and gap open/extension penalty. A gap of size k costs q+k*r
int pen_unpaired; // phred-scaled penalty for unpaired reads
int pen_clip; // clipping penalty. This score is not deducted from the DP score.
int w; // band width
int flag; // see MEM_F_* macros
int min_seed_len; // minimum seed length
float split_factor; // split into a seed if MEM is longer than min_seed_len*split_factor
@ -30,7 +33,6 @@ typedef struct {
int chunk_size; // process chunk_size-bp sequences in a batch
float mask_level; // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits
float chain_drop_ratio; // drop a chain if its seed coverage is below chain_drop_ratio times the seed coverage of a better chain overlapping with the small chain
int pen_unpaired; // phred-scaled penalty for unpaired reads
int max_ins; // when estimating insert size distribution, skip pairs with insert longer than this value
int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end
int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset

View File

@ -26,13 +26,14 @@ int main_mem(int argc, char *argv[])
void *ko = 0, *ko2 = 0;
opt = mem_opt_init();
while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:")) >= 0) {
while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:")) >= 0) {
if (c == 'k') opt->min_seed_len = atoi(optarg);
else if (c == 'w') opt->w = atoi(optarg);
else if (c == 'A') opt->a = atoi(optarg);
else if (c == 'B') opt->b = atoi(optarg);
else if (c == 'O') opt->q = atoi(optarg);
else if (c == 'E') opt->r = atoi(optarg);
else if (c == 'L') opt->pen_clip = atoi(optarg);
else if (c == 'U') opt->pen_unpaired = atoi(optarg);
else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1;
else if (c == 'P') opt->flag |= MEM_F_NOPAIRING;
@ -64,6 +65,7 @@ int main_mem(int argc, char *argv[])
fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b);
fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->q);
fprintf(stderr, " -E INT gap extension penalty; a gap of size k cost {-O} + {-E}*k [%d]\n", opt->r);
fprintf(stderr, " -L INT penalty for clipping [%d]\n", opt->pen_clip);
fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n", opt->pen_unpaired);
fprintf(stderr, "\nInput/output options:\n\n");
fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n");

12
ksw.c
View File

@ -359,11 +359,11 @@ typedef struct {
int32_t h, e;
} eh_t;
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle)
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore)
{
eh_t *eh; // score array
int8_t *qp; // query profile
int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap;
int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap, max_ie, gscore;
if (h0 < 0) h0 = 0;
// allocate memory
qp = malloc(qlen * m);
@ -385,7 +385,7 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
max_gap = max_gap > 1? max_gap : 1;
w = w < max_gap? w : max_gap;
// DP loop
max = h0, max_i = max_j = -1;
max = h0, max_i = max_j = -1; max_ie = -1, gscore = -1;
beg = 0, end = qlen;
for (i = 0; LIKELY(i < tlen); ++i) {
int f = 0, h1, m = 0, mj = -1;
@ -421,6 +421,10 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
f = f > h? f : h; // computed F(i,j+1)
}
eh[end].h = h1; eh[end].e = 0;
if (j == qlen) {
max_ie = gscore > h1? max_ie : i;
gscore = gscore > h1? gscore : h1;
}
if (m == 0) break;
if (m > max) max = m, max_i = i, max_j = mj;
// update beg and end for the next round
@ -433,6 +437,8 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
free(eh); free(qp);
if (_qle) *_qle = max_j + 1;
if (_tle) *_tle = max_i + 1;
if (_gtle) *_gtle = max_ie + 1;
if (_gscore) *_gscore = gscore;
return max;
}

2
ksw.h
View File

@ -62,7 +62,7 @@ extern "C" {
*/
kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry);
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle);
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore);
int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *_n_cigar, uint32_t **_cigar);
#ifdef __cplusplus

2
main.c
View File

@ -4,7 +4,7 @@
#include "utils.h"
#ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "0.6.2-r303-beta"
#define PACKAGE_VERSION "0.6.2-r306-beta"
#endif
static int usage()