sw extension works for the simplest case

This commit is contained in:
Heng Li 2013-02-04 12:37:38 -05:00
parent f83dea36d8
commit ba18db1a9f
5 changed files with 73 additions and 16 deletions

View File

@ -1,9 +1,22 @@
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <assert.h>
#include "bwamem.h"
#include "kvec.h"
#include "bntseq.h"
#include "ksw.h"
void mem_fill_scmat(int a, int b, int8_t mat[25])
{
int i, j, k;
for (i = k = 0; i < 5; ++i) {
for (j = 0; j < 4; ++j)
mat[k++] = i == j? a : -b;
mat[k++] = 0; // ambiguous base
}
for (j = 0; j < 5; ++j) mat[k++] = 0;
}
mem_opt_t *mem_opt_init()
{
@ -13,6 +26,7 @@ mem_opt_t *mem_opt_init()
o->min_seed_len = 17;
o->max_occ = 10;
o->max_chain_gap = 10000;
mem_fill_scmat(o->a, o->b, o->mat);
return o;
}
@ -176,19 +190,52 @@ mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uin
return chain;
}
static inline int cal_max_gap(const mem_opt_t *opt, int qlen)
{
int l = (int)((double)(qlen * opt->a - opt->q) / opt->r + 1.);
return l > 1? l : 1;
}
mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c)
{
mem_aln_t a;
int i, j, max, max_i;
int64_t len;
for (i = 0; i < c->n; ++i) {
mem_seed_t *s = &c->seeds[i];
uint8_t *seq = bns_get_seq(l_pac, pac, s->rbeg, s->rbeg + s->len, &len);
for (j = 0; j < len; ++j) putchar("ACGTN"[seq[j]]); putchar('\n');
for (j = 0; j < s->len; ++j) putchar("ACGTN"[query[j+s->qbeg]]); putchar('\n');
free(seq);
int i, j, qbeg, qend, score;
int64_t k, rlen, rbeg, rend, rmax[2], tmp;
mem_seed_t *s;
uint8_t *rseq = 0;
// get the start and end of the seeded region
rbeg = c->seeds[0].rbeg; qbeg = c->seeds[0].qbeg;
s = &c->seeds[c->n-1];
rend = s->rbeg + s->len; qend = s->qbeg + s->len;
// get the max possible span
rmax[0] = rbeg - (qbeg + cal_max_gap(opt, qbeg));
rmax[1] = rend + ((l_query - qend) + cal_max_gap(opt, l_query - qend));
if (rmax[0] < 0) rmax[0] = 0;
if (rmax[1] > l_pac<<1) rmax[1] = l_pac<<1;
// retrieve the reference sequence
rseq = bns_get_seq(l_pac, pac, rmax[0], rmax[1], &rlen);
if (qbeg) { // left extension of the first seed
uint8_t *rs, *qs;
int qle, tle;
qs = malloc(qbeg);
for (i = 0; i < qbeg; ++i) qs[i] = query[qbeg - 1 - i];
tmp = rbeg - rmax[0];
rs = malloc(tmp);
for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i];
score = ksw_extend(qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, c->seeds[0].len * opt->a, 0, &qle, &tle);
free(qs); free(rs);
} else score = c->seeds[0].len * opt->a;
if (c->seeds[0].qbeg + c->seeds[0].len != l_query) { // right extension of the first seed
int qle, tle, qe, re;
s = &c->seeds[0];
qe = s->qbeg + s->len; re = s->rbeg + s->len - rmax[0];
for (j = 0; j < l_query - qe; ++j) putchar("ACGTN"[(int)query[j+qe]]); putchar('\n');
for (j = 0; j < rmax[1] - rmax[0] - re; ++j) putchar("ACGTN"[(int)rseq[j+re]]); putchar('\n');
score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, score, 0, &qle, &tle);
printf("[%d] score=%d\tqle=%d\trle=%d\n", c->n, score, qle, tle);
}
for (i = max = 0, max_i = -1; i < c->n; ++i) // find the longest seed
if (max < c->seeds[i].len) max = c->seeds[i].len, max_i = i;
free(rseq);
return a;
}

View File

@ -14,6 +14,7 @@ typedef struct {
typedef struct {
int a, b, q, r, w;
int min_seed_len, max_occ, max_chain_gap;
int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset
} mem_opt_t;
typedef struct {
@ -43,6 +44,7 @@ void smem_set_query(smem_i *itr, int len, const uint8_t *query);
const bwtintv_v *smem_next(smem_i *itr, int split_len);
mem_opt_t *mem_opt_init(void);
void mem_fill_scmat(int a, int b, int8_t mat[25]);
mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq);
mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c);

View File

@ -31,6 +31,7 @@ int main_mem(int argc, char *argv[])
free(opt);
return 1;
}
mem_fill_scmat(opt->a, opt->b, opt->mat);
fp = gzopen(argv[optind + 1], "r");
seq = kseq_init(fp);
{ // load the packed sequences, BWT and SA

17
ksw.c
View File

@ -315,11 +315,11 @@ typedef struct {
int32_t h, e;
} eh_t;
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int *qw, int *_qpos, int *_tpos)
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int *qw, int *_qle, int *_tle)
{
eh_t *eh;
int8_t *qp;
int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j;
int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap;
if (h0 < 0) h0 = 0;
// allocate memory
eh = calloc(qlen + 1, 8);
@ -333,8 +333,15 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
eh[0].h = h0; eh[1].h = h0 > gapoe? h0 - gapoe : 0;
for (j = 2; j <= qlen && eh[j-1].h > gape; ++j)
eh[j].h = eh[j-1].h - gape;
// adjust $w if it is too large
k = m * m;
for (i = 0, max = 0; i < k; ++i) // get the max score
max = max > mat[i]? max : mat[i];
max_gap = (int)((double)(qlen * max - gapo) / gape + 1.);
max_gap = max_gap > 1? max_gap : 1;
w = w < max_gap? w : max_gap;
// DP loop
max = h0, max_i = max_j = 0;
max = h0, max_i = max_j = -1;
beg = 0, end = qlen;
for (i = 0; LIKELY(i < tlen); ++i) {
int f = 0, h1, m = 0, mj = -1, t;
@ -381,8 +388,8 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
//beg = 0; end = qlen; // uncomment this line for debugging
}
free(eh); free(qp);
if (_qpos) *_qpos = max_i + 1;
if (_tpos) *_tpos = max_j + 1;
if (_qle) *_qle = max_i + 1;
if (_tle) *_tle = max_j + 1;
return max;
}

2
ksw.h
View File

@ -49,7 +49,7 @@ extern "C" {
/** Unified interface for ksw_sse2_8() and ksw_sse2_16() */
int ksw_sse2(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a);
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int *qw, int *_qpos, int *_tpos);
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int *qw, int *_qle, int *_tle);
#ifdef __cplusplus
}