diff --git a/bwa.1 b/bwa.1 index 198c2ab..45b9921 100644 --- a/bwa.1 +++ b/bwa.1 @@ -93,6 +93,8 @@ genome. .IR gapOpenPen ] .RB [ -E .IR gapExtPen ] +.RB [ -L +.IR clipPen ] .RB [ -U .IR unpairPen ] .RB [ -R @@ -190,6 +192,13 @@ Gap extension penalty. A gap of length k costs O + k*E (i.e. .B -O is for opening a zero-length gap). [1] .TP +.BI -L \ INT +Clipping penalty. When performing SW extension, BWA-MEM keeps track of the best +score reaching the end of query. If this score is larger than the best SW score +minus the clipping penalty, clipping will not be applied. Note that in this +case, the SAM AS tag reports the best SW score; clipping penalty is not +deducted. [5] +.TP .BI -U \ INT Penalty for an unpaired read pair. BWA-MEM scores an unpaired read pair as .RI scoreRead1+scoreRead2- INT diff --git a/bwamem.c b/bwamem.c index 86b3e7a..f5173d3 100644 --- a/bwamem.c +++ b/bwamem.c @@ -42,8 +42,10 @@ mem_opt_t *mem_opt_init() { mem_opt_t *o; o = calloc(1, sizeof(mem_opt_t)); - o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 100; o->flag = 0; + o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 100; + o->pen_unpaired = 9; + o->pen_clip = 5; o->min_seed_len = 19; o->split_width = 10; o->max_occ = 10000; @@ -54,7 +56,6 @@ mem_opt_t *mem_opt_init() o->split_factor = 1.5; o->chunk_size = 10000000; o->n_threads = 1; - o->pen_unpaired = 9; o->max_matesw = 100; mem_fill_scmat(o->a, o->b, o->mat); return o; @@ -487,23 +488,27 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int if (s->qbeg) { // left extension uint8_t *rs, *qs; - int qle, tle; + int qle, tle, gtle, gscore; qs = malloc(s->qbeg); for (i = 0; i < s->qbeg; ++i) qs[i] = query[s->qbeg - 1 - i]; tmp = s->rbeg - rmax[0]; rs = malloc(tmp); for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i]; - a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, s->len * opt->a, &qle, &tle); - a->qb = s->qbeg - qle; a->rb = s->rbeg - tle; + a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, s->len * opt->a, &qle, &tle, >le, &gscore); + // check whether we prefer to reach the end of the query + if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qb = s->qbeg - qle, a->rb = s->rbeg - tle; // local hits + else a->qb = 0, a->rb = s->rbeg - gtle; // reach the end free(qs); free(rs); } else a->score = s->len * opt->a, a->qb = 0, a->rb = s->rbeg; if (s->qbeg + s->len != l_query) { // right extension - int qle, tle, qe, re; + int qle, tle, qe, re, gtle, gscore; qe = s->qbeg + s->len; re = s->rbeg + s->len - rmax[0]; - a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, &qle, &tle); - a->qe = qe + qle; a->re = rmax[0] + re + tle; + a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, &qle, &tle, >le, &gscore); + // similar to the above + if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qe = qe + qle, a->re = rmax[0] + re + tle; + else a->qe = l_query, a->re = rmax[0] + re + gtle; } else a->qe = l_query, a->re = s->rbeg + s->len; if (bwa_verbose >= 4) printf("[%d] score=%d\t[%d,%d) <=> [%ld,%ld)\n", k, a->score, a->qb, a->qe, (long)a->rb, (long)a->re); diff --git a/bwamem.h b/bwamem.h index 8a7c7b8..5c63402 100644 --- a/bwamem.h +++ b/bwamem.h @@ -19,7 +19,10 @@ typedef struct __smem_i smem_i; typedef struct { int a, b, q, r; // match score, mismatch penalty and gap open/extension penalty. A gap of size k costs q+k*r + int pen_unpaired; // phred-scaled penalty for unpaired reads + int pen_clip; // clipping penalty. This score is not deducted from the DP score. int w; // band width + int flag; // see MEM_F_* macros int min_seed_len; // minimum seed length float split_factor; // split into a seed if MEM is longer than min_seed_len*split_factor @@ -30,7 +33,6 @@ typedef struct { int chunk_size; // process chunk_size-bp sequences in a batch float mask_level; // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits float chain_drop_ratio; // drop a chain if its seed coverage is below chain_drop_ratio times the seed coverage of a better chain overlapping with the small chain - int pen_unpaired; // phred-scaled penalty for unpaired reads int max_ins; // when estimating insert size distribution, skip pairs with insert longer than this value int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset diff --git a/fastmap.c b/fastmap.c index b2d5f39..56cfb01 100644 --- a/fastmap.c +++ b/fastmap.c @@ -26,13 +26,14 @@ int main_mem(int argc, char *argv[]) void *ko = 0, *ko2 = 0; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:")) >= 0) { + while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 'w') opt->w = atoi(optarg); else if (c == 'A') opt->a = atoi(optarg); else if (c == 'B') opt->b = atoi(optarg); else if (c == 'O') opt->q = atoi(optarg); else if (c == 'E') opt->r = atoi(optarg); + else if (c == 'L') opt->pen_clip = atoi(optarg); else if (c == 'U') opt->pen_unpaired = atoi(optarg); else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1; else if (c == 'P') opt->flag |= MEM_F_NOPAIRING; @@ -64,6 +65,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b); fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->q); fprintf(stderr, " -E INT gap extension penalty; a gap of size k cost {-O} + {-E}*k [%d]\n", opt->r); + fprintf(stderr, " -L INT penalty for clipping [%d]\n", opt->pen_clip); fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n", opt->pen_unpaired); fprintf(stderr, "\nInput/output options:\n\n"); fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n"); diff --git a/ksw.c b/ksw.c index 4cbcb32..b97fed5 100644 --- a/ksw.c +++ b/ksw.c @@ -359,11 +359,11 @@ typedef struct { int32_t h, e; } eh_t; -int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle) +int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore) { eh_t *eh; // score array int8_t *qp; // query profile - int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap; + int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap, max_ie, gscore; if (h0 < 0) h0 = 0; // allocate memory qp = malloc(qlen * m); @@ -385,7 +385,7 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, max_gap = max_gap > 1? max_gap : 1; w = w < max_gap? w : max_gap; // DP loop - max = h0, max_i = max_j = -1; + max = h0, max_i = max_j = -1; max_ie = -1, gscore = -1; beg = 0, end = qlen; for (i = 0; LIKELY(i < tlen); ++i) { int f = 0, h1, m = 0, mj = -1; @@ -421,6 +421,10 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, f = f > h? f : h; // computed F(i,j+1) } eh[end].h = h1; eh[end].e = 0; + if (j == qlen) { + max_ie = gscore > h1? max_ie : i; + gscore = gscore > h1? gscore : h1; + } if (m == 0) break; if (m > max) max = m, max_i = i, max_j = mj; // update beg and end for the next round @@ -433,6 +437,8 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, free(eh); free(qp); if (_qle) *_qle = max_j + 1; if (_tle) *_tle = max_i + 1; + if (_gtle) *_gtle = max_ie + 1; + if (_gscore) *_gscore = gscore; return max; } diff --git a/ksw.h b/ksw.h index 5162dc0..40216d9 100644 --- a/ksw.h +++ b/ksw.h @@ -62,7 +62,7 @@ extern "C" { */ kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry); - int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle); + int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore); int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *_n_cigar, uint32_t **_cigar); #ifdef __cplusplus diff --git a/main.c b/main.c index da1d5dc..0590e63 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r303-beta" +#define PACKAGE_VERSION "0.6.2-r306-beta" #endif static int usage()