From 1e118e0823823dce3e576e7f47200a5a7e3c5844 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 3 Apr 2013 23:57:19 -0400 Subject: [PATCH 1/6] r370: suppress "D" at the end of a cigar This is caused by seeds in tandem repeats, in which case, bwa-mem may not extend the true seed. The change in this commit is only a temporary cure. --- bwamem.c | 7 +++++++ main.c | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/bwamem.c b/bwamem.c index d19848d..3da9775 100644 --- a/bwamem.c +++ b/bwamem.c @@ -863,6 +863,13 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * a.NM = NM; pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev); a.is_rev = is_rev; + if (a.n_cigar > 0) { + if ((a.cigar[0]&0xf) == 2) { + pos += a.cigar[0]>>4; + --a.n_cigar; + memmove(a.cigar, a.cigar + 1, a.n_cigar * 4); + } else if ((a.cigar[a.n_cigar-1]&0xf) == 2) --a.n_cigar; + } if (qb != 0 || qe != l_query) { // add clipping to CIGAR int clip5, clip3; clip5 = is_rev? l_query - qe : qb; diff --git a/main.c b/main.c index 54c3093..16f6f27 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.3-r369-beta" +#define PACKAGE_VERSION "0.7.3-r370-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From d7ca0885eb686128fe4c1b5e2cb1290faacac849 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 4 Apr 2013 00:43:43 -0400 Subject: [PATCH 2/6] r371: extend overlapping seeds to avoid misalignment in tandem repeats --- bwamem.c | 15 ++++++++++++++- main.c | 2 +- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/bwamem.c b/bwamem.c index 3da9775..a221d28 100644 --- a/bwamem.c +++ b/bwamem.c @@ -537,7 +537,20 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int w = max_gap < opt->w? max_gap : opt->w; if (qd - rd < w && rd - qd < w) break; } - if (i < av->n) continue; + if (i < av->n) { // the seed is (almost) contained in an existing alignment + for (i = k + 1; i < c->n; ++i) { // check overlapping seeds in the same chain + const mem_seed_t *t; + if (srt[i] == 0) continue; + t = &c->seeds[(uint32_t)srt[i]]; + if (t->len < s->len * .95) continue; // only check overlapping if t is long enough; TODO: more efficient by early stopping + if (s->qbeg <= t->qbeg && s->qbeg + s->len >= t->qbeg && t->qbeg - s->qbeg != t->rbeg - s->rbeg) break; + if (t->qbeg <= s->qbeg && t->qbeg + t->len >= s->qbeg && s->qbeg - t->qbeg != s->rbeg - t->rbeg) break; + } + if (i == c->n) { // no overlapping seeds; then skip extension + srt[k] = 0; // mark that seed extension has not been performed + continue; + } + } a = kv_pushp(mem_alnreg_t, *av); memset(a, 0, sizeof(mem_alnreg_t)); diff --git a/main.c b/main.c index 16f6f27..0d490bc 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.3-r370-beta" +#define PACKAGE_VERSION "0.7.3-r371-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From d64eaa851d97d34d0255d2e03e4ec61a195efe0c Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 9 Apr 2013 15:17:04 -0400 Subject: [PATCH 3/6] fixed an issue caused by a Mac/Darwin bug On Mac/Darwin, it is not possible to read >2GB data with one fread(). --- bwt.c | 16 ++++++++++++++-- main.c | 2 +- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/bwt.c b/bwt.c index 4ee9ea8..8656b0e 100644 --- a/bwt.c +++ b/bwt.c @@ -372,6 +372,18 @@ void bwt_dump_sa(const char *fn, const bwt_t *bwt) fclose(fp); } +static bwtint_t fread_fix(FILE *fp, bwtint_t size, void *a) +{ // Mac/Darwin has a bug when reading data longer than 2GB. This function fixes this issue by reading data in small chunks + const int bufsize = 0x1000000; // 16M block + bwtint_t offset = 0; + while (size) { + int x = bufsize < size? bufsize : size; + if ((x = fread(a + offset, 1, x, fp)) == 0) break; + size -= x; offset += x; + } + return offset; +} + void bwt_restore_sa(const char *fn, bwt_t *bwt) { char skipped[256]; @@ -390,7 +402,7 @@ void bwt_restore_sa(const char *fn, bwt_t *bwt) bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t)); bwt->sa[0] = -1; - fread(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp); + fread_fix(fp, sizeof(bwtint_t) * (bwt->n_sa - 1), bwt->sa + 1); fclose(fp); } @@ -407,7 +419,7 @@ bwt_t *bwt_restore_bwt(const char *fn) fseek(fp, 0, SEEK_SET); fread(&bwt->primary, sizeof(bwtint_t), 1, fp); fread(bwt->L2+1, sizeof(bwtint_t), 4, fp); - fread(bwt->bwt, 4, bwt->bwt_size, fp); + fread_fix(fp, bwt->bwt_size<<2, bwt->bwt); bwt->seq_len = bwt->L2[4]; fclose(fp); bwt_gen_cnt_table(bwt); diff --git a/main.c b/main.c index 0d490bc..1104838 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.3-r371-beta" +#define PACKAGE_VERSION "0.7.3-r372-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From 53bb846407b3847b90b4b1d946ab1c3a880cdb3b Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 9 Apr 2013 16:13:55 -0400 Subject: [PATCH 4/6] r373: optionally distable mate rescue --- bwamem.h | 1 + bwamem_pair.c | 25 +++++++++++++------------ fastmap.c | 6 ++++-- main.c | 2 +- 4 files changed, 19 insertions(+), 15 deletions(-) diff --git a/bwamem.h b/bwamem.h index b7a4e79..40f47f8 100644 --- a/bwamem.h +++ b/bwamem.h @@ -16,6 +16,7 @@ typedef struct __smem_i smem_i; #define MEM_F_NOPAIRING 0x4 #define MEM_F_ALL 0x8 #define MEM_F_NO_MULTI 0x10 +#define MEM_F_NO_RESCUE 0x20 typedef struct { int a, b, q, r; // match score, mismatch penalty and gap open/extension penalty. A gap of size k costs q+k*r diff --git a/bwamem_pair.c b/bwamem_pair.c index 23c99ef..0f6ff08 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -235,20 +235,21 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co int n = 0, i, j, z[2], o, subo, n_sub, extra_flag = 1; kstring_t str; - mem_alnreg_v b[2]; mem_aln_t h[2]; str.l = str.m = 0; str.s = 0; - // perform SW for the best alignment - kv_init(b[0]); kv_init(b[1]); - for (i = 0; i < 2; ++i) - for (j = 0; j < a[i].n; ++j) - if (a[i].a[j].score >= a[i].a[0].score - opt->pen_unpaired) - kv_push(mem_alnreg_t, b[i], a[i].a[j]); - for (i = 0; i < 2; ++i) - for (j = 0; j < b[i].n && j < opt->max_matesw; ++j) - n += mem_matesw(opt, bns->l_pac, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]); - free(b[0].a); free(b[1].a); + if (!(opt->flag & MEM_F_NO_RESCUE)) { // then perform SW for the best alignment + mem_alnreg_v b[2]; + kv_init(b[0]); kv_init(b[1]); + for (i = 0; i < 2; ++i) + for (j = 0; j < a[i].n; ++j) + if (a[i].a[j].score >= a[i].a[0].score - opt->pen_unpaired) + kv_push(mem_alnreg_t, b[i], a[i].a[j]); + for (i = 0; i < 2; ++i) + for (j = 0; j < b[i].n && j < opt->max_matesw; ++j) + n += mem_matesw(opt, bns->l_pac, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]); + free(b[0].a); free(b[1].a); + } mem_mark_primary_se(opt, a[0].n, a[0].a); mem_mark_primary_se(opt, a[1].n, a[1].a); if (opt->flag&MEM_F_NOPAIRING) goto no_pairing; @@ -305,7 +306,7 @@ no_pairing: h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[0]); else h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, 0); } - if (h[0].rid == h[1].rid && h[0].rid >= 0) { // if the top hits from the two ends constitute a proper pair, flag it. + if (!(opt->flag & MEM_F_NOPAIRING) && h[0].rid == h[1].rid && h[0].rid >= 0) { // if the top hits from the two ends constitute a proper pair, flag it. int64_t dist; int d; d = mem_infer_dir(bns->l_pac, a[0].a[0].rb, a[1].a[0].rb, &dist); diff --git a/fastmap.c b/fastmap.c index eda06bb..98963c0 100644 --- a/fastmap.c +++ b/fastmap.c @@ -26,7 +26,7 @@ int main_mem(int argc, char *argv[]) void *ko = 0, *ko2 = 0; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:")) >= 0) { + while ((c = getopt(argc, argv, "paMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 'w') opt->w = atoi(optarg); else if (c == 'A') opt->a = atoi(optarg); @@ -42,6 +42,7 @@ int main_mem(int argc, char *argv[]) else if (c == 'a') opt->flag |= MEM_F_ALL; else if (c == 'p') opt->flag |= MEM_F_PE; else if (c == 'M') opt->flag |= MEM_F_NO_MULTI; + else if (c == 'S') opt->flag |= MEM_F_NO_RESCUE; else if (c == 'c') opt->max_occ = atoi(optarg); else if (c == 'd') opt->zdrop = atoi(optarg); else if (c == 'v') bwa_verbose = atoi(optarg); @@ -63,7 +64,8 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); // fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); - fprintf(stderr, " -P skip pairing; perform mate SW only\n"); + fprintf(stderr, " -S skip mate rescue\n"); + fprintf(stderr, " -P skip pairing; mate rescue performed unless -S also in use\n"); fprintf(stderr, " -A INT score for a sequence match [%d]\n", opt->a); fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b); fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->q); diff --git a/main.c b/main.c index 1104838..c9acc72 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.3-r372-beta" +#define PACKAGE_VERSION "0.7.3-r373-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From 3d8a8c1e373249b799006fb6e571982b4175daaa Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 10 Apr 2013 01:09:37 -0400 Subject: [PATCH 5/6] r374: fix - clipping penalty not always working This only happens to gaps where mem underestimates the bandwidth without considering the clipping penalty. --- bwamem.c | 4 ++-- ksw.c | 4 ++-- ksw.h | 2 +- main.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/bwamem.c b/bwamem.c index a221d28..6fc2e5a 100644 --- a/bwamem.c +++ b/bwamem.c @@ -568,7 +568,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int for (i = 0; i < MAX_BAND_TRY; ++i) { int prev = a->score; aw[0] = opt->w << i; - a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, aw[0], opt->zdrop, s->len * opt->a, &qle, &tle, >le, &gscore, &max_off[0]); + a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, aw[0], opt->pen_clip, opt->zdrop, s->len * opt->a, &qle, &tle, >le, &gscore, &max_off[0]); if (bwa_verbose >= 4) { printf("L\t%d < %d; w=%d; max_off=%d\n", prev, a->score, aw[0], max_off[0]); fflush(stdout); } if (a->score == prev || max_off[0] < (aw[0]>>1) + (aw[0]>>2)) break; } @@ -591,7 +591,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int for (i = 0; i < MAX_BAND_TRY; ++i) { int prev = a->score; aw[1] = opt->w << i; - a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, aw[1], opt->zdrop, sc0, &qle, &tle, >le, &gscore, &max_off[1]); + a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, aw[1], opt->pen_clip, opt->zdrop, sc0, &qle, &tle, >le, &gscore, &max_off[1]); if (bwa_verbose >= 4) { printf("R\t%d < %d; w=%d; max_off=%d\n", prev, a->score, aw[1], max_off[1]); fflush(stdout); } if (a->score == prev || max_off[1] < (aw[1]>>1) + (aw[1]>>2)) break; } diff --git a/ksw.c b/ksw.c index e331390..2daf809 100644 --- a/ksw.c +++ b/ksw.c @@ -359,7 +359,7 @@ typedef struct { int32_t h, e; } eh_t; -int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off) +int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off) { eh_t *eh; // score array int8_t *qp; // query profile @@ -381,7 +381,7 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, k = m * m; for (i = 0, max = 0; i < k; ++i) // get the max score max = max > mat[i]? max : mat[i]; - max_gap = (int)((double)(qlen * max - gapo) / gape + 1.); + max_gap = (int)((double)(qlen * max + end_bonus - gapo) / gape + 1.); max_gap = max_gap > 1? max_gap : 1; w = w < max_gap? w : max_gap; // DP loop diff --git a/ksw.h b/ksw.h index 2dd6499..97559fd 100644 --- a/ksw.h +++ b/ksw.h @@ -102,7 +102,7 @@ extern "C" { * * @return best semi-local alignment score */ - int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off); + int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off); #ifdef __cplusplus } diff --git a/main.c b/main.c index c9acc72..b47c897 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.3-r373-beta" +#define PACKAGE_VERSION "0.7.3-r374-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From 47520134e784a9c3d6661a5de7e74e21a54fdbfb Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 10 Apr 2013 11:04:32 -0400 Subject: [PATCH 6/6] r375: fixed compiling errors by the last change --- bwase.c | 4 ++-- bwtsw2_aux.c | 4 ++-- main.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/bwase.c b/bwase.c index fd06c7d..0d49739 100644 --- a/bwase.c +++ b/bwase.c @@ -176,7 +176,7 @@ bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int l rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen); seq_reverse(len, seq, 0); // as we need to do left extension, we have to reverse both query and reference sequences seq_reverse(rlen, rseq, 0); - ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, -1, len<<1, &qle, &tle, >le, &gscore, 0); + ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, 0, -1, len<<1, &qle, &tle, >le, &gscore, 0); if (gscore > 0) tle = gtle, qle = len; rb = re - tle; rlen = tle; seq_reverse(len, seq, 0); @@ -192,7 +192,7 @@ bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int l rb = *_pos; re = rb + len + SW_BW; if (re > l_pac) re = l_pac; rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen); - ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, -1, len<<1, &qle, &tle, >le, &gscore, 0); + ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, 0, -1, len<<1, &qle, &tle, >le, &gscore, 0); if (gscore > 0) tle = gtle, qle = len; re = rb + tle; rlen = tle; ksw_global(qle, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, n_cigar, &cigar32); // right extension diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index a84d7e0..d26d3fa 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -125,7 +125,7 @@ void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq for (k = p->k - 1, j = 0; k > 0 && j < lt; --k) // FIXME: k=0 not considered! target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3; lt = j; - score = ksw_extend(p->beg, &query[lq - p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, -1, p->G, &qle, &tle, 0, 0, 0); + score = ksw_extend(p->beg, &query[lq - p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, 0, -1, p->G, &qle, &tle, 0, 0, 0); if (score > p->G) { // extensible p->G = score; p->k -= tle; @@ -153,7 +153,7 @@ void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq, for (k = p->k, j = 0; k < p->k + lt && k < l_pac; ++k) target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3; lt = j; - score = ksw_extend(lq - p->beg, &query[p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, -1, 1, &qle, &tle, 0, 0, 0) - 1; + score = ksw_extend(lq - p->beg, &query[p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, 0, -1, 1, &qle, &tle, 0, 0, 0) - 1; // if (score < p->G) fprintf(stderr, "[bsw2_extend_hits] %d < %d\n", score, p->G); if (score >= p->G) { p->G = score; diff --git a/main.c b/main.c index b47c897..8a0542c 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.3-r374-beta" +#define PACKAGE_VERSION "0.7.3-r375-beta" #endif int bwa_fa2pac(int argc, char *argv[]);