sw extension works for the simplest case
This commit is contained in:
parent
f83dea36d8
commit
ba18db1a9f
67
bwamem.c
67
bwamem.c
|
|
@ -1,9 +1,22 @@
|
|||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <assert.h>
|
||||
#include "bwamem.h"
|
||||
#include "kvec.h"
|
||||
#include "bntseq.h"
|
||||
#include "ksw.h"
|
||||
|
||||
void mem_fill_scmat(int a, int b, int8_t mat[25])
|
||||
{
|
||||
int i, j, k;
|
||||
for (i = k = 0; i < 5; ++i) {
|
||||
for (j = 0; j < 4; ++j)
|
||||
mat[k++] = i == j? a : -b;
|
||||
mat[k++] = 0; // ambiguous base
|
||||
}
|
||||
for (j = 0; j < 5; ++j) mat[k++] = 0;
|
||||
}
|
||||
|
||||
mem_opt_t *mem_opt_init()
|
||||
{
|
||||
|
|
@ -13,6 +26,7 @@ mem_opt_t *mem_opt_init()
|
|||
o->min_seed_len = 17;
|
||||
o->max_occ = 10;
|
||||
o->max_chain_gap = 10000;
|
||||
mem_fill_scmat(o->a, o->b, o->mat);
|
||||
return o;
|
||||
}
|
||||
|
||||
|
|
@ -176,19 +190,52 @@ mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uin
|
|||
return chain;
|
||||
}
|
||||
|
||||
static inline int cal_max_gap(const mem_opt_t *opt, int qlen)
|
||||
{
|
||||
int l = (int)((double)(qlen * opt->a - opt->q) / opt->r + 1.);
|
||||
return l > 1? l : 1;
|
||||
}
|
||||
|
||||
mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c)
|
||||
{
|
||||
mem_aln_t a;
|
||||
int i, j, max, max_i;
|
||||
int64_t len;
|
||||
for (i = 0; i < c->n; ++i) {
|
||||
mem_seed_t *s = &c->seeds[i];
|
||||
uint8_t *seq = bns_get_seq(l_pac, pac, s->rbeg, s->rbeg + s->len, &len);
|
||||
for (j = 0; j < len; ++j) putchar("ACGTN"[seq[j]]); putchar('\n');
|
||||
for (j = 0; j < s->len; ++j) putchar("ACGTN"[query[j+s->qbeg]]); putchar('\n');
|
||||
free(seq);
|
||||
int i, j, qbeg, qend, score;
|
||||
int64_t k, rlen, rbeg, rend, rmax[2], tmp;
|
||||
mem_seed_t *s;
|
||||
uint8_t *rseq = 0;
|
||||
// get the start and end of the seeded region
|
||||
rbeg = c->seeds[0].rbeg; qbeg = c->seeds[0].qbeg;
|
||||
s = &c->seeds[c->n-1];
|
||||
rend = s->rbeg + s->len; qend = s->qbeg + s->len;
|
||||
// get the max possible span
|
||||
rmax[0] = rbeg - (qbeg + cal_max_gap(opt, qbeg));
|
||||
rmax[1] = rend + ((l_query - qend) + cal_max_gap(opt, l_query - qend));
|
||||
if (rmax[0] < 0) rmax[0] = 0;
|
||||
if (rmax[1] > l_pac<<1) rmax[1] = l_pac<<1;
|
||||
// retrieve the reference sequence
|
||||
rseq = bns_get_seq(l_pac, pac, rmax[0], rmax[1], &rlen);
|
||||
|
||||
if (qbeg) { // left extension of the first seed
|
||||
uint8_t *rs, *qs;
|
||||
int qle, tle;
|
||||
qs = malloc(qbeg);
|
||||
for (i = 0; i < qbeg; ++i) qs[i] = query[qbeg - 1 - i];
|
||||
tmp = rbeg - rmax[0];
|
||||
rs = malloc(tmp);
|
||||
for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i];
|
||||
score = ksw_extend(qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, c->seeds[0].len * opt->a, 0, &qle, &tle);
|
||||
free(qs); free(rs);
|
||||
} else score = c->seeds[0].len * opt->a;
|
||||
|
||||
if (c->seeds[0].qbeg + c->seeds[0].len != l_query) { // right extension of the first seed
|
||||
int qle, tle, qe, re;
|
||||
s = &c->seeds[0];
|
||||
qe = s->qbeg + s->len; re = s->rbeg + s->len - rmax[0];
|
||||
for (j = 0; j < l_query - qe; ++j) putchar("ACGTN"[(int)query[j+qe]]); putchar('\n');
|
||||
for (j = 0; j < rmax[1] - rmax[0] - re; ++j) putchar("ACGTN"[(int)rseq[j+re]]); putchar('\n');
|
||||
score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, score, 0, &qle, &tle);
|
||||
printf("[%d] score=%d\tqle=%d\trle=%d\n", c->n, score, qle, tle);
|
||||
}
|
||||
for (i = max = 0, max_i = -1; i < c->n; ++i) // find the longest seed
|
||||
if (max < c->seeds[i].len) max = c->seeds[i].len, max_i = i;
|
||||
free(rseq);
|
||||
return a;
|
||||
}
|
||||
|
|
|
|||
2
bwamem.h
2
bwamem.h
|
|
@ -14,6 +14,7 @@ typedef struct {
|
|||
typedef struct {
|
||||
int a, b, q, r, w;
|
||||
int min_seed_len, max_occ, max_chain_gap;
|
||||
int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset
|
||||
} mem_opt_t;
|
||||
|
||||
typedef struct {
|
||||
|
|
@ -43,6 +44,7 @@ void smem_set_query(smem_i *itr, int len, const uint8_t *query);
|
|||
const bwtintv_v *smem_next(smem_i *itr, int split_len);
|
||||
|
||||
mem_opt_t *mem_opt_init(void);
|
||||
void mem_fill_scmat(int a, int b, int8_t mat[25]);
|
||||
|
||||
mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq);
|
||||
mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c);
|
||||
|
|
|
|||
|
|
@ -31,6 +31,7 @@ int main_mem(int argc, char *argv[])
|
|||
free(opt);
|
||||
return 1;
|
||||
}
|
||||
mem_fill_scmat(opt->a, opt->b, opt->mat);
|
||||
fp = gzopen(argv[optind + 1], "r");
|
||||
seq = kseq_init(fp);
|
||||
{ // load the packed sequences, BWT and SA
|
||||
|
|
|
|||
17
ksw.c
17
ksw.c
|
|
@ -315,11 +315,11 @@ typedef struct {
|
|||
int32_t h, e;
|
||||
} eh_t;
|
||||
|
||||
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int *qw, int *_qpos, int *_tpos)
|
||||
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int *qw, int *_qle, int *_tle)
|
||||
{
|
||||
eh_t *eh;
|
||||
int8_t *qp;
|
||||
int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j;
|
||||
int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap;
|
||||
if (h0 < 0) h0 = 0;
|
||||
// allocate memory
|
||||
eh = calloc(qlen + 1, 8);
|
||||
|
|
@ -333,8 +333,15 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
|
|||
eh[0].h = h0; eh[1].h = h0 > gapoe? h0 - gapoe : 0;
|
||||
for (j = 2; j <= qlen && eh[j-1].h > gape; ++j)
|
||||
eh[j].h = eh[j-1].h - gape;
|
||||
// adjust $w if it is too large
|
||||
k = m * m;
|
||||
for (i = 0, max = 0; i < k; ++i) // get the max score
|
||||
max = max > mat[i]? max : mat[i];
|
||||
max_gap = (int)((double)(qlen * max - gapo) / gape + 1.);
|
||||
max_gap = max_gap > 1? max_gap : 1;
|
||||
w = w < max_gap? w : max_gap;
|
||||
// DP loop
|
||||
max = h0, max_i = max_j = 0;
|
||||
max = h0, max_i = max_j = -1;
|
||||
beg = 0, end = qlen;
|
||||
for (i = 0; LIKELY(i < tlen); ++i) {
|
||||
int f = 0, h1, m = 0, mj = -1, t;
|
||||
|
|
@ -381,8 +388,8 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
|
|||
//beg = 0; end = qlen; // uncomment this line for debugging
|
||||
}
|
||||
free(eh); free(qp);
|
||||
if (_qpos) *_qpos = max_i + 1;
|
||||
if (_tpos) *_tpos = max_j + 1;
|
||||
if (_qle) *_qle = max_i + 1;
|
||||
if (_tle) *_tle = max_j + 1;
|
||||
return max;
|
||||
}
|
||||
|
||||
|
|
|
|||
2
ksw.h
2
ksw.h
|
|
@ -49,7 +49,7 @@ extern "C" {
|
|||
/** Unified interface for ksw_sse2_8() and ksw_sse2_16() */
|
||||
int ksw_sse2(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a);
|
||||
|
||||
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int *qw, int *_qpos, int *_tpos);
|
||||
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int *qw, int *_qle, int *_tle);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue