diff --git a/bwamem.c b/bwamem.c index 02f9591..b9e7f68 100644 --- a/bwamem.c +++ b/bwamem.c @@ -1,9 +1,22 @@ #include #include #include +#include #include "bwamem.h" #include "kvec.h" #include "bntseq.h" +#include "ksw.h" + +void mem_fill_scmat(int a, int b, int8_t mat[25]) +{ + int i, j, k; + for (i = k = 0; i < 5; ++i) { + for (j = 0; j < 4; ++j) + mat[k++] = i == j? a : -b; + mat[k++] = 0; // ambiguous base + } + for (j = 0; j < 5; ++j) mat[k++] = 0; +} mem_opt_t *mem_opt_init() { @@ -13,6 +26,7 @@ mem_opt_t *mem_opt_init() o->min_seed_len = 17; o->max_occ = 10; o->max_chain_gap = 10000; + mem_fill_scmat(o->a, o->b, o->mat); return o; } @@ -176,19 +190,52 @@ mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uin return chain; } +static inline int cal_max_gap(const mem_opt_t *opt, int qlen) +{ + int l = (int)((double)(qlen * opt->a - opt->q) / opt->r + 1.); + return l > 1? l : 1; +} + mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c) { mem_aln_t a; - int i, j, max, max_i; - int64_t len; - for (i = 0; i < c->n; ++i) { - mem_seed_t *s = &c->seeds[i]; - uint8_t *seq = bns_get_seq(l_pac, pac, s->rbeg, s->rbeg + s->len, &len); - for (j = 0; j < len; ++j) putchar("ACGTN"[seq[j]]); putchar('\n'); - for (j = 0; j < s->len; ++j) putchar("ACGTN"[query[j+s->qbeg]]); putchar('\n'); - free(seq); + int i, j, qbeg, qend, score; + int64_t k, rlen, rbeg, rend, rmax[2], tmp; + mem_seed_t *s; + uint8_t *rseq = 0; + // get the start and end of the seeded region + rbeg = c->seeds[0].rbeg; qbeg = c->seeds[0].qbeg; + s = &c->seeds[c->n-1]; + rend = s->rbeg + s->len; qend = s->qbeg + s->len; + // get the max possible span + rmax[0] = rbeg - (qbeg + cal_max_gap(opt, qbeg)); + rmax[1] = rend + ((l_query - qend) + cal_max_gap(opt, l_query - qend)); + if (rmax[0] < 0) rmax[0] = 0; + if (rmax[1] > l_pac<<1) rmax[1] = l_pac<<1; + // retrieve the reference sequence + rseq = bns_get_seq(l_pac, pac, rmax[0], rmax[1], &rlen); + + if (qbeg) { // left extension of the first seed + uint8_t *rs, *qs; + int qle, tle; + qs = malloc(qbeg); + for (i = 0; i < qbeg; ++i) qs[i] = query[qbeg - 1 - i]; + tmp = rbeg - rmax[0]; + rs = malloc(tmp); + for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i]; + score = ksw_extend(qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, c->seeds[0].len * opt->a, 0, &qle, &tle); + free(qs); free(rs); + } else score = c->seeds[0].len * opt->a; + + if (c->seeds[0].qbeg + c->seeds[0].len != l_query) { // right extension of the first seed + int qle, tle, qe, re; + s = &c->seeds[0]; + qe = s->qbeg + s->len; re = s->rbeg + s->len - rmax[0]; + for (j = 0; j < l_query - qe; ++j) putchar("ACGTN"[(int)query[j+qe]]); putchar('\n'); + for (j = 0; j < rmax[1] - rmax[0] - re; ++j) putchar("ACGTN"[(int)rseq[j+re]]); putchar('\n'); + score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, score, 0, &qle, &tle); + printf("[%d] score=%d\tqle=%d\trle=%d\n", c->n, score, qle, tle); } - for (i = max = 0, max_i = -1; i < c->n; ++i) // find the longest seed - if (max < c->seeds[i].len) max = c->seeds[i].len, max_i = i; + free(rseq); return a; } diff --git a/bwamem.h b/bwamem.h index 214d780..b026de4 100644 --- a/bwamem.h +++ b/bwamem.h @@ -14,6 +14,7 @@ typedef struct { typedef struct { int a, b, q, r, w; int min_seed_len, max_occ, max_chain_gap; + int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset } mem_opt_t; typedef struct { @@ -43,6 +44,7 @@ void smem_set_query(smem_i *itr, int len, const uint8_t *query); const bwtintv_v *smem_next(smem_i *itr, int split_len); mem_opt_t *mem_opt_init(void); +void mem_fill_scmat(int a, int b, int8_t mat[25]); mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq); mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c); diff --git a/fastmap.c b/fastmap.c index c92311e..4a677c3 100644 --- a/fastmap.c +++ b/fastmap.c @@ -31,6 +31,7 @@ int main_mem(int argc, char *argv[]) free(opt); return 1; } + mem_fill_scmat(opt->a, opt->b, opt->mat); fp = gzopen(argv[optind + 1], "r"); seq = kseq_init(fp); { // load the packed sequences, BWT and SA diff --git a/ksw.c b/ksw.c index 437f563..763c774 100644 --- a/ksw.c +++ b/ksw.c @@ -315,11 +315,11 @@ typedef struct { int32_t h, e; } eh_t; -int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int *qw, int *_qpos, int *_tpos) +int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int *qw, int *_qle, int *_tle) { eh_t *eh; int8_t *qp; - int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j; + int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap; if (h0 < 0) h0 = 0; // allocate memory eh = calloc(qlen + 1, 8); @@ -333,8 +333,15 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, eh[0].h = h0; eh[1].h = h0 > gapoe? h0 - gapoe : 0; for (j = 2; j <= qlen && eh[j-1].h > gape; ++j) eh[j].h = eh[j-1].h - gape; + // adjust $w if it is too large + k = m * m; + for (i = 0, max = 0; i < k; ++i) // get the max score + max = max > mat[i]? max : mat[i]; + max_gap = (int)((double)(qlen * max - gapo) / gape + 1.); + max_gap = max_gap > 1? max_gap : 1; + w = w < max_gap? w : max_gap; // DP loop - max = h0, max_i = max_j = 0; + max = h0, max_i = max_j = -1; beg = 0, end = qlen; for (i = 0; LIKELY(i < tlen); ++i) { int f = 0, h1, m = 0, mj = -1, t; @@ -381,8 +388,8 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, //beg = 0; end = qlen; // uncomment this line for debugging } free(eh); free(qp); - if (_qpos) *_qpos = max_i + 1; - if (_tpos) *_tpos = max_j + 1; + if (_qle) *_qle = max_i + 1; + if (_tle) *_tle = max_j + 1; return max; } diff --git a/ksw.h b/ksw.h index b7b9c40..3c9b959 100644 --- a/ksw.h +++ b/ksw.h @@ -49,7 +49,7 @@ extern "C" { /** Unified interface for ksw_sse2_8() and ksw_sse2_16() */ int ksw_sse2(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a); - int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int *qw, int *_qpos, int *_tpos); + int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int *qw, int *_qle, int *_tle); #ifdef __cplusplus }