diff --git a/bwamem.c b/bwamem.c index eb24e67..fc34fca 100644 --- a/bwamem.c +++ b/bwamem.c @@ -537,7 +537,20 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int w = max_gap < opt->w? max_gap : opt->w; if (qd - rd < w && rd - qd < w) break; } - if (i < av->n) continue; + if (i < av->n) { // the seed is (almost) contained in an existing alignment + for (i = k + 1; i < c->n; ++i) { // check overlapping seeds in the same chain + const mem_seed_t *t; + if (srt[i] == 0) continue; + t = &c->seeds[(uint32_t)srt[i]]; + if (t->len < s->len * .95) continue; // only check overlapping if t is long enough; TODO: more efficient by early stopping + if (s->qbeg <= t->qbeg && s->qbeg + s->len >= t->qbeg && t->qbeg - s->qbeg != t->rbeg - s->rbeg) break; + if (t->qbeg <= s->qbeg && t->qbeg + t->len >= s->qbeg && s->qbeg - t->qbeg != s->rbeg - t->rbeg) break; + } + if (i == c->n) { // no overlapping seeds; then skip extension + srt[k] = 0; // mark that seed extension has not been performed + continue; + } + } a = kv_pushp(mem_alnreg_t, *av); memset(a, 0, sizeof(mem_alnreg_t)); @@ -555,7 +568,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int for (i = 0; i < MAX_BAND_TRY; ++i) { int prev = a->score; aw[0] = opt->w << i; - a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, aw[0], opt->zdrop, s->len * opt->a, &qle, &tle, >le, &gscore, &max_off[0]); + a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, aw[0], opt->pen_clip, opt->zdrop, s->len * opt->a, &qle, &tle, >le, &gscore, &max_off[0]); if (bwa_verbose >= 4) { printf("L\t%d < %d; w=%d; max_off=%d\n", prev, a->score, aw[0], max_off[0]); fflush(stdout); } if (a->score == prev || max_off[0] < (aw[0]>>1) + (aw[0]>>2)) break; } @@ -578,7 +591,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int for (i = 0; i < MAX_BAND_TRY; ++i) { int prev = a->score; aw[1] = opt->w << i; - a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, aw[1], opt->zdrop, sc0, &qle, &tle, >le, &gscore, &max_off[1]); + a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, aw[1], opt->pen_clip, opt->zdrop, sc0, &qle, &tle, >le, &gscore, &max_off[1]); if (bwa_verbose >= 4) { printf("R\t%d < %d; w=%d; max_off=%d\n", prev, a->score, aw[1], max_off[1]); fflush(stdout); } if (a->score == prev || max_off[1] < (aw[1]>>1) + (aw[1]>>2)) break; } @@ -863,6 +876,13 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * a.NM = NM; pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev); a.is_rev = is_rev; + if (a.n_cigar > 0) { + if ((a.cigar[0]&0xf) == 2) { + pos += a.cigar[0]>>4; + --a.n_cigar; + memmove(a.cigar, a.cigar + 1, a.n_cigar * 4); + } else if ((a.cigar[a.n_cigar-1]&0xf) == 2) --a.n_cigar; + } if (qb != 0 || qe != l_query) { // add clipping to CIGAR int clip5, clip3; clip5 = is_rev? l_query - qe : qb; diff --git a/bwamem.h b/bwamem.h index b7a4e79..40f47f8 100644 --- a/bwamem.h +++ b/bwamem.h @@ -16,6 +16,7 @@ typedef struct __smem_i smem_i; #define MEM_F_NOPAIRING 0x4 #define MEM_F_ALL 0x8 #define MEM_F_NO_MULTI 0x10 +#define MEM_F_NO_RESCUE 0x20 typedef struct { int a, b, q, r; // match score, mismatch penalty and gap open/extension penalty. A gap of size k costs q+k*r diff --git a/bwamem_pair.c b/bwamem_pair.c index 8329ebe..19fc83b 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -235,20 +235,21 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co int n = 0, i, j, z[2], o, subo, n_sub, extra_flag = 1; kstring_t str; - mem_alnreg_v b[2]; mem_aln_t h[2]; str.l = str.m = 0; str.s = 0; - // perform SW for the best alignment - kv_init(b[0]); kv_init(b[1]); - for (i = 0; i < 2; ++i) - for (j = 0; j < a[i].n; ++j) - if (a[i].a[j].score >= a[i].a[0].score - opt->pen_unpaired) - kv_push(mem_alnreg_t, b[i], a[i].a[j]); - for (i = 0; i < 2; ++i) - for (j = 0; j < b[i].n && j < opt->max_matesw; ++j) - n += mem_matesw(opt, bns->l_pac, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]); - free(b[0].a); free(b[1].a); + if (!(opt->flag & MEM_F_NO_RESCUE)) { // then perform SW for the best alignment + mem_alnreg_v b[2]; + kv_init(b[0]); kv_init(b[1]); + for (i = 0; i < 2; ++i) + for (j = 0; j < a[i].n; ++j) + if (a[i].a[j].score >= a[i].a[0].score - opt->pen_unpaired) + kv_push(mem_alnreg_t, b[i], a[i].a[j]); + for (i = 0; i < 2; ++i) + for (j = 0; j < b[i].n && j < opt->max_matesw; ++j) + n += mem_matesw(opt, bns->l_pac, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]); + free(b[0].a); free(b[1].a); + } mem_mark_primary_se(opt, a[0].n, a[0].a); mem_mark_primary_se(opt, a[1].n, a[1].a); if (opt->flag&MEM_F_NOPAIRING) goto no_pairing; @@ -305,7 +306,7 @@ no_pairing: h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[0]); else h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, 0); } - if (h[0].rid == h[1].rid && h[0].rid >= 0) { // if the top hits from the two ends constitute a proper pair, flag it. + if (!(opt->flag & MEM_F_NOPAIRING) && h[0].rid == h[1].rid && h[0].rid >= 0) { // if the top hits from the two ends constitute a proper pair, flag it. int64_t dist; int d; d = mem_infer_dir(bns->l_pac, a[0].a[0].rb, a[1].a[0].rb, &dist); diff --git a/bwase.c b/bwase.c index 718dd5d..85efcbd 100644 --- a/bwase.c +++ b/bwase.c @@ -176,7 +176,7 @@ bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int l rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen); seq_reverse(len, seq, 0); // as we need to do left extension, we have to reverse both query and reference sequences seq_reverse(rlen, rseq, 0); - ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, -1, len<<1, &qle, &tle, >le, &gscore, 0); + ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, 0, -1, len<<1, &qle, &tle, >le, &gscore, 0); if (gscore > 0) tle = gtle, qle = len; rb = re - tle; rlen = tle; seq_reverse(len, seq, 0); @@ -192,7 +192,7 @@ bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int l rb = *_pos; re = rb + len + SW_BW; if (re > l_pac) re = l_pac; rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen); - ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, -1, len<<1, &qle, &tle, >le, &gscore, 0); + ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, 0, -1, len<<1, &qle, &tle, >le, &gscore, 0); if (gscore > 0) tle = gtle, qle = len; re = rb + tle; rlen = tle; ksw_global(qle, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, n_cigar, &cigar32); // right extension diff --git a/bwt.c b/bwt.c index 412dce5..edd0afc 100644 --- a/bwt.c +++ b/bwt.c @@ -370,6 +370,18 @@ void bwt_dump_sa(const char *fn, const bwt_t *bwt) err_fclose(fp); } +static bwtint_t fread_fix(FILE *fp, bwtint_t size, void *a) +{ // Mac/Darwin has a bug when reading data longer than 2GB. This function fixes this issue by reading data in small chunks + const int bufsize = 0x1000000; // 16M block + bwtint_t offset = 0; + while (size) { + int x = bufsize < size? bufsize : size; + if ((x = err_fread_noeof(a + offset, 1, x, fp)) == 0) break; + size -= x; offset += x; + } + return offset; +} + void bwt_restore_sa(const char *fn, bwt_t *bwt) { char skipped[256]; @@ -388,7 +400,7 @@ void bwt_restore_sa(const char *fn, bwt_t *bwt) bwt->sa = (bwtint_t*)xcalloc(bwt->n_sa, sizeof(bwtint_t)); bwt->sa[0] = -1; - err_fread_noeof(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp); + fread_fix(fp, sizeof(bwtint_t) * (bwt->n_sa - 1), bwt->sa + 1); err_fclose(fp); } @@ -405,7 +417,7 @@ bwt_t *bwt_restore_bwt(const char *fn) err_fseek(fp, 0, SEEK_SET); err_fread_noeof(&bwt->primary, sizeof(bwtint_t), 1, fp); err_fread_noeof(bwt->L2+1, sizeof(bwtint_t), 4, fp); - err_fread_noeof(bwt->bwt, 4, bwt->bwt_size, fp); + fread_fix(fp, bwt->bwt_size<<2, bwt->bwt); bwt->seq_len = bwt->L2[4]; err_fclose(fp); bwt_gen_cnt_table(bwt); diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index 88c13c7..074af10 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -125,7 +125,7 @@ void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq for (k = p->k - 1, j = 0; k > 0 && j < lt; --k) // FIXME: k=0 not considered! target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3; lt = j; - score = ksw_extend(p->beg, &query[lq - p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, -1, p->G, &qle, &tle, 0, 0, 0); + score = ksw_extend(p->beg, &query[lq - p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, 0, -1, p->G, &qle, &tle, 0, 0, 0); if (score > p->G) { // extensible p->G = score; p->k -= tle; @@ -153,7 +153,7 @@ void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq, for (k = p->k, j = 0; k < p->k + lt && k < l_pac; ++k) target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3; lt = j; - score = ksw_extend(lq - p->beg, &query[p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, -1, 1, &qle, &tle, 0, 0, 0) - 1; + score = ksw_extend(lq - p->beg, &query[p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, 0, -1, 1, &qle, &tle, 0, 0, 0) - 1; // if (score < p->G) fprintf(stderr, "[bsw2_extend_hits] %d < %d\n", score, p->G); if (score >= p->G) { p->G = score; diff --git a/fastmap.c b/fastmap.c index 76e3c03..7ff73cb 100644 --- a/fastmap.c +++ b/fastmap.c @@ -27,7 +27,7 @@ int main_mem(int argc, char *argv[]) void *ko = 0, *ko2 = 0; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:")) >= 0) { + while ((c = getopt(argc, argv, "paMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 'w') opt->w = atoi(optarg); else if (c == 'A') opt->a = atoi(optarg); @@ -43,6 +43,7 @@ int main_mem(int argc, char *argv[]) else if (c == 'a') opt->flag |= MEM_F_ALL; else if (c == 'p') opt->flag |= MEM_F_PE; else if (c == 'M') opt->flag |= MEM_F_NO_MULTI; + else if (c == 'S') opt->flag |= MEM_F_NO_RESCUE; else if (c == 'c') opt->max_occ = atoi(optarg); else if (c == 'd') opt->zdrop = atoi(optarg); else if (c == 'v') bwa_verbose = atoi(optarg); @@ -64,7 +65,8 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); // fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); - fprintf(stderr, " -P skip pairing; perform mate SW only\n"); + fprintf(stderr, " -S skip mate rescue\n"); + fprintf(stderr, " -P skip pairing; mate rescue performed unless -S also in use\n"); fprintf(stderr, " -A INT score for a sequence match [%d]\n", opt->a); fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b); fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->q); diff --git a/ksw.c b/ksw.c index 700afd0..f5b24dc 100644 --- a/ksw.c +++ b/ksw.c @@ -360,7 +360,7 @@ typedef struct { int32_t h, e; } eh_t; -int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off) +int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off) { eh_t *eh; // score array int8_t *qp; // query profile @@ -382,7 +382,7 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, k = m * m; for (i = 0, max = 0; i < k; ++i) // get the max score max = max > mat[i]? max : mat[i]; - max_gap = (int)((double)(qlen * max - gapo) / gape + 1.); + max_gap = (int)((double)(qlen * max + end_bonus - gapo) / gape + 1.); max_gap = max_gap > 1? max_gap : 1; w = w < max_gap? w : max_gap; // DP loop diff --git a/ksw.h b/ksw.h index 2dd6499..97559fd 100644 --- a/ksw.h +++ b/ksw.h @@ -102,7 +102,7 @@ extern "C" { * * @return best semi-local alignment score */ - int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off); + int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off); #ifdef __cplusplus } diff --git a/main.c b/main.c index ca110a7..b1aa9cf 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.3-r369-beta" +#define PACKAGE_VERSION "0.7.3-r375-beta" #endif int bwa_fa2pac(int argc, char *argv[]);