sw extension works for the simplest case
This commit is contained in:
parent
f83dea36d8
commit
ba18db1a9f
67
bwamem.c
67
bwamem.c
|
|
@ -1,9 +1,22 @@
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
#include <assert.h>
|
||||||
#include "bwamem.h"
|
#include "bwamem.h"
|
||||||
#include "kvec.h"
|
#include "kvec.h"
|
||||||
#include "bntseq.h"
|
#include "bntseq.h"
|
||||||
|
#include "ksw.h"
|
||||||
|
|
||||||
|
void mem_fill_scmat(int a, int b, int8_t mat[25])
|
||||||
|
{
|
||||||
|
int i, j, k;
|
||||||
|
for (i = k = 0; i < 5; ++i) {
|
||||||
|
for (j = 0; j < 4; ++j)
|
||||||
|
mat[k++] = i == j? a : -b;
|
||||||
|
mat[k++] = 0; // ambiguous base
|
||||||
|
}
|
||||||
|
for (j = 0; j < 5; ++j) mat[k++] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
mem_opt_t *mem_opt_init()
|
mem_opt_t *mem_opt_init()
|
||||||
{
|
{
|
||||||
|
|
@ -13,6 +26,7 @@ mem_opt_t *mem_opt_init()
|
||||||
o->min_seed_len = 17;
|
o->min_seed_len = 17;
|
||||||
o->max_occ = 10;
|
o->max_occ = 10;
|
||||||
o->max_chain_gap = 10000;
|
o->max_chain_gap = 10000;
|
||||||
|
mem_fill_scmat(o->a, o->b, o->mat);
|
||||||
return o;
|
return o;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -176,19 +190,52 @@ mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uin
|
||||||
return chain;
|
return chain;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline int cal_max_gap(const mem_opt_t *opt, int qlen)
|
||||||
|
{
|
||||||
|
int l = (int)((double)(qlen * opt->a - opt->q) / opt->r + 1.);
|
||||||
|
return l > 1? l : 1;
|
||||||
|
}
|
||||||
|
|
||||||
mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c)
|
mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c)
|
||||||
{
|
{
|
||||||
mem_aln_t a;
|
mem_aln_t a;
|
||||||
int i, j, max, max_i;
|
int i, j, qbeg, qend, score;
|
||||||
int64_t len;
|
int64_t k, rlen, rbeg, rend, rmax[2], tmp;
|
||||||
for (i = 0; i < c->n; ++i) {
|
mem_seed_t *s;
|
||||||
mem_seed_t *s = &c->seeds[i];
|
uint8_t *rseq = 0;
|
||||||
uint8_t *seq = bns_get_seq(l_pac, pac, s->rbeg, s->rbeg + s->len, &len);
|
// get the start and end of the seeded region
|
||||||
for (j = 0; j < len; ++j) putchar("ACGTN"[seq[j]]); putchar('\n');
|
rbeg = c->seeds[0].rbeg; qbeg = c->seeds[0].qbeg;
|
||||||
for (j = 0; j < s->len; ++j) putchar("ACGTN"[query[j+s->qbeg]]); putchar('\n');
|
s = &c->seeds[c->n-1];
|
||||||
free(seq);
|
rend = s->rbeg + s->len; qend = s->qbeg + s->len;
|
||||||
|
// get the max possible span
|
||||||
|
rmax[0] = rbeg - (qbeg + cal_max_gap(opt, qbeg));
|
||||||
|
rmax[1] = rend + ((l_query - qend) + cal_max_gap(opt, l_query - qend));
|
||||||
|
if (rmax[0] < 0) rmax[0] = 0;
|
||||||
|
if (rmax[1] > l_pac<<1) rmax[1] = l_pac<<1;
|
||||||
|
// retrieve the reference sequence
|
||||||
|
rseq = bns_get_seq(l_pac, pac, rmax[0], rmax[1], &rlen);
|
||||||
|
|
||||||
|
if (qbeg) { // left extension of the first seed
|
||||||
|
uint8_t *rs, *qs;
|
||||||
|
int qle, tle;
|
||||||
|
qs = malloc(qbeg);
|
||||||
|
for (i = 0; i < qbeg; ++i) qs[i] = query[qbeg - 1 - i];
|
||||||
|
tmp = rbeg - rmax[0];
|
||||||
|
rs = malloc(tmp);
|
||||||
|
for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i];
|
||||||
|
score = ksw_extend(qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, c->seeds[0].len * opt->a, 0, &qle, &tle);
|
||||||
|
free(qs); free(rs);
|
||||||
|
} else score = c->seeds[0].len * opt->a;
|
||||||
|
|
||||||
|
if (c->seeds[0].qbeg + c->seeds[0].len != l_query) { // right extension of the first seed
|
||||||
|
int qle, tle, qe, re;
|
||||||
|
s = &c->seeds[0];
|
||||||
|
qe = s->qbeg + s->len; re = s->rbeg + s->len - rmax[0];
|
||||||
|
for (j = 0; j < l_query - qe; ++j) putchar("ACGTN"[(int)query[j+qe]]); putchar('\n');
|
||||||
|
for (j = 0; j < rmax[1] - rmax[0] - re; ++j) putchar("ACGTN"[(int)rseq[j+re]]); putchar('\n');
|
||||||
|
score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, score, 0, &qle, &tle);
|
||||||
|
printf("[%d] score=%d\tqle=%d\trle=%d\n", c->n, score, qle, tle);
|
||||||
}
|
}
|
||||||
for (i = max = 0, max_i = -1; i < c->n; ++i) // find the longest seed
|
free(rseq);
|
||||||
if (max < c->seeds[i].len) max = c->seeds[i].len, max_i = i;
|
|
||||||
return a;
|
return a;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
2
bwamem.h
2
bwamem.h
|
|
@ -14,6 +14,7 @@ typedef struct {
|
||||||
typedef struct {
|
typedef struct {
|
||||||
int a, b, q, r, w;
|
int a, b, q, r, w;
|
||||||
int min_seed_len, max_occ, max_chain_gap;
|
int min_seed_len, max_occ, max_chain_gap;
|
||||||
|
int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset
|
||||||
} mem_opt_t;
|
} mem_opt_t;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
|
|
@ -43,6 +44,7 @@ void smem_set_query(smem_i *itr, int len, const uint8_t *query);
|
||||||
const bwtintv_v *smem_next(smem_i *itr, int split_len);
|
const bwtintv_v *smem_next(smem_i *itr, int split_len);
|
||||||
|
|
||||||
mem_opt_t *mem_opt_init(void);
|
mem_opt_t *mem_opt_init(void);
|
||||||
|
void mem_fill_scmat(int a, int b, int8_t mat[25]);
|
||||||
|
|
||||||
mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq);
|
mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq);
|
||||||
mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c);
|
mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c);
|
||||||
|
|
|
||||||
|
|
@ -31,6 +31,7 @@ int main_mem(int argc, char *argv[])
|
||||||
free(opt);
|
free(opt);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
mem_fill_scmat(opt->a, opt->b, opt->mat);
|
||||||
fp = gzopen(argv[optind + 1], "r");
|
fp = gzopen(argv[optind + 1], "r");
|
||||||
seq = kseq_init(fp);
|
seq = kseq_init(fp);
|
||||||
{ // load the packed sequences, BWT and SA
|
{ // load the packed sequences, BWT and SA
|
||||||
|
|
|
||||||
17
ksw.c
17
ksw.c
|
|
@ -315,11 +315,11 @@ typedef struct {
|
||||||
int32_t h, e;
|
int32_t h, e;
|
||||||
} eh_t;
|
} eh_t;
|
||||||
|
|
||||||
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int *qw, int *_qpos, int *_tpos)
|
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int *qw, int *_qle, int *_tle)
|
||||||
{
|
{
|
||||||
eh_t *eh;
|
eh_t *eh;
|
||||||
int8_t *qp;
|
int8_t *qp;
|
||||||
int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j;
|
int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap;
|
||||||
if (h0 < 0) h0 = 0;
|
if (h0 < 0) h0 = 0;
|
||||||
// allocate memory
|
// allocate memory
|
||||||
eh = calloc(qlen + 1, 8);
|
eh = calloc(qlen + 1, 8);
|
||||||
|
|
@ -333,8 +333,15 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
|
||||||
eh[0].h = h0; eh[1].h = h0 > gapoe? h0 - gapoe : 0;
|
eh[0].h = h0; eh[1].h = h0 > gapoe? h0 - gapoe : 0;
|
||||||
for (j = 2; j <= qlen && eh[j-1].h > gape; ++j)
|
for (j = 2; j <= qlen && eh[j-1].h > gape; ++j)
|
||||||
eh[j].h = eh[j-1].h - gape;
|
eh[j].h = eh[j-1].h - gape;
|
||||||
|
// adjust $w if it is too large
|
||||||
|
k = m * m;
|
||||||
|
for (i = 0, max = 0; i < k; ++i) // get the max score
|
||||||
|
max = max > mat[i]? max : mat[i];
|
||||||
|
max_gap = (int)((double)(qlen * max - gapo) / gape + 1.);
|
||||||
|
max_gap = max_gap > 1? max_gap : 1;
|
||||||
|
w = w < max_gap? w : max_gap;
|
||||||
// DP loop
|
// DP loop
|
||||||
max = h0, max_i = max_j = 0;
|
max = h0, max_i = max_j = -1;
|
||||||
beg = 0, end = qlen;
|
beg = 0, end = qlen;
|
||||||
for (i = 0; LIKELY(i < tlen); ++i) {
|
for (i = 0; LIKELY(i < tlen); ++i) {
|
||||||
int f = 0, h1, m = 0, mj = -1, t;
|
int f = 0, h1, m = 0, mj = -1, t;
|
||||||
|
|
@ -381,8 +388,8 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
|
||||||
//beg = 0; end = qlen; // uncomment this line for debugging
|
//beg = 0; end = qlen; // uncomment this line for debugging
|
||||||
}
|
}
|
||||||
free(eh); free(qp);
|
free(eh); free(qp);
|
||||||
if (_qpos) *_qpos = max_i + 1;
|
if (_qle) *_qle = max_i + 1;
|
||||||
if (_tpos) *_tpos = max_j + 1;
|
if (_tle) *_tle = max_j + 1;
|
||||||
return max;
|
return max;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
2
ksw.h
2
ksw.h
|
|
@ -49,7 +49,7 @@ extern "C" {
|
||||||
/** Unified interface for ksw_sse2_8() and ksw_sse2_16() */
|
/** Unified interface for ksw_sse2_8() and ksw_sse2_16() */
|
||||||
int ksw_sse2(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a);
|
int ksw_sse2(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a);
|
||||||
|
|
||||||
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int *qw, int *_qpos, int *_tpos);
|
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int *qw, int *_qle, int *_tle);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue