Merge branch 'master' into master_fixes
Conflicts: bntseq.c bwamem.c
This commit is contained in:
commit
8a078cc16d
4
Makefile
4
Makefile
|
|
@ -3,9 +3,9 @@ CFLAGS= -g -Wall -O2 -msse2
|
||||||
CXXFLAGS= $(CFLAGS)
|
CXXFLAGS= $(CFLAGS)
|
||||||
AR= ar
|
AR= ar
|
||||||
DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64
|
DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64
|
||||||
LOBJS= utils.o kstring.o ksw.o kopen.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o
|
LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o
|
||||||
AOBJS= QSufSort.o bwt_gen.o stdaln.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
|
AOBJS= QSufSort.o bwt_gen.o stdaln.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
|
||||||
is.o bwtindex.o bwape.o \
|
is.o bwtindex.o bwape.o kopen.o \
|
||||||
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
|
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
|
||||||
bwtsw2_chain.o fastmap.o bwtsw2_pair.o
|
bwtsw2_chain.o fastmap.o bwtsw2_pair.o
|
||||||
PROG= bwa bwamem-lite
|
PROG= bwa bwamem-lite
|
||||||
|
|
|
||||||
2
bntseq.c
2
bntseq.c
|
|
@ -138,7 +138,7 @@ bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, c
|
||||||
if (scanres != 3) goto badread;
|
if (scanres != 3) goto badread;
|
||||||
l_pac = xx;
|
l_pac = xx;
|
||||||
xassert(l_pac == bns->l_pac && n_seqs == bns->n_seqs, "inconsistent .ann and .amb files.");
|
xassert(l_pac == bns->l_pac && n_seqs == bns->n_seqs, "inconsistent .ann and .amb files.");
|
||||||
bns->ambs = (bntamb1_t*)xcalloc(bns->n_holes, sizeof(bntamb1_t));
|
bns->ambs = bns->n_holes? (bntamb1_t*)xcalloc(bns->n_holes, sizeof(bntamb1_t)) : 0;
|
||||||
for (i = 0; i < bns->n_holes; ++i) {
|
for (i = 0; i < bns->n_holes; ++i) {
|
||||||
bntamb1_t *p = bns->ambs + i;
|
bntamb1_t *p = bns->ambs + i;
|
||||||
scanres = fscanf(fp, "%lld%d%s", &xx, &p->len, str);
|
scanres = fscanf(fp, "%lld%d%s", &xx, &p->len, str);
|
||||||
|
|
|
||||||
136
bwamem.c
136
bwamem.c
|
|
@ -44,6 +44,7 @@ mem_opt_t *mem_opt_init()
|
||||||
o = xcalloc(1, sizeof(mem_opt_t));
|
o = xcalloc(1, sizeof(mem_opt_t));
|
||||||
o->flag = 0;
|
o->flag = 0;
|
||||||
o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 100;
|
o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 100;
|
||||||
|
o->zdrop = 100;
|
||||||
o->pen_unpaired = 9;
|
o->pen_unpaired = 9;
|
||||||
o->pen_clip = 5;
|
o->pen_clip = 5;
|
||||||
o->min_seed_len = 19;
|
o->min_seed_len = 19;
|
||||||
|
|
@ -176,7 +177,7 @@ typedef struct { size_t n, m; mem_chain_t *a; } mem_chain_v;
|
||||||
#define chain_cmp(a, b) (((b).pos < (a).pos) - ((a).pos < (b).pos))
|
#define chain_cmp(a, b) (((b).pos < (a).pos) - ((a).pos < (b).pos))
|
||||||
KBTREE_INIT(chn, mem_chain_t, chain_cmp)
|
KBTREE_INIT(chn, mem_chain_t, chain_cmp)
|
||||||
|
|
||||||
static int test_and_merge(const mem_opt_t *opt, mem_chain_t *c, const mem_seed_t *p)
|
static int test_and_merge(const mem_opt_t *opt, int64_t l_pac, mem_chain_t *c, const mem_seed_t *p)
|
||||||
{
|
{
|
||||||
int64_t qend, rend, x, y;
|
int64_t qend, rend, x, y;
|
||||||
const mem_seed_t *last = &c->seeds[c->n-1];
|
const mem_seed_t *last = &c->seeds[c->n-1];
|
||||||
|
|
@ -184,6 +185,7 @@ static int test_and_merge(const mem_opt_t *opt, mem_chain_t *c, const mem_seed_t
|
||||||
rend = last->rbeg + last->len;
|
rend = last->rbeg + last->len;
|
||||||
if (p->qbeg >= c->seeds[0].qbeg && p->qbeg + p->len <= qend && p->rbeg >= c->seeds[0].rbeg && p->rbeg + p->len <= rend)
|
if (p->qbeg >= c->seeds[0].qbeg && p->qbeg + p->len <= qend && p->rbeg >= c->seeds[0].rbeg && p->rbeg + p->len <= rend)
|
||||||
return 1; // contained seed; do nothing
|
return 1; // contained seed; do nothing
|
||||||
|
if ((last->rbeg < l_pac || c->seeds[0].rbeg < l_pac) && p->rbeg >= l_pac) return 0; // don't chain if on different strand
|
||||||
x = p->qbeg - last->qbeg; // always non-negtive
|
x = p->qbeg - last->qbeg; // always non-negtive
|
||||||
y = p->rbeg - last->rbeg;
|
y = p->rbeg - last->rbeg;
|
||||||
if (y >= 0 && x - y <= opt->w && y - x <= opt->w && x - last->len < opt->max_chain_gap && y - last->len < opt->max_chain_gap) { // grow the chain
|
if (y >= 0 && x - y <= opt->w && y - x <= opt->w && x - last->len < opt->max_chain_gap && y - last->len < opt->max_chain_gap) { // grow the chain
|
||||||
|
|
@ -197,7 +199,7 @@ static int test_and_merge(const mem_opt_t *opt, mem_chain_t *c, const mem_seed_t
|
||||||
return 0; // request to add a new chain
|
return 0; // request to add a new chain
|
||||||
}
|
}
|
||||||
|
|
||||||
static void mem_insert_seed(const mem_opt_t *opt, kbtree_t(chn) *tree, smem_i *itr)
|
static void mem_insert_seed(const mem_opt_t *opt, int64_t l_pac, kbtree_t(chn) *tree, smem_i *itr)
|
||||||
{
|
{
|
||||||
const bwtintv_v *a;
|
const bwtintv_v *a;
|
||||||
int split_len = (int)(opt->min_seed_len * opt->split_factor + .499);
|
int split_len = (int)(opt->min_seed_len * opt->split_factor + .499);
|
||||||
|
|
@ -216,9 +218,10 @@ static void mem_insert_seed(const mem_opt_t *opt, kbtree_t(chn) *tree, smem_i *i
|
||||||
s.rbeg = tmp.pos = bwt_sa(itr->bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference
|
s.rbeg = tmp.pos = bwt_sa(itr->bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference
|
||||||
s.qbeg = p->info>>32;
|
s.qbeg = p->info>>32;
|
||||||
s.len = slen;
|
s.len = slen;
|
||||||
|
if (s.rbeg < l_pac && l_pac < s.rbeg + s.len) continue; // bridging forward-reverse boundary; skip
|
||||||
if (kb_size(tree)) {
|
if (kb_size(tree)) {
|
||||||
kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain
|
kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain
|
||||||
if (!lower || !test_and_merge(opt, lower, &s)) to_add = 1;
|
if (!lower || !test_and_merge(opt, l_pac, lower, &s)) to_add = 1;
|
||||||
} else to_add = 1;
|
} else to_add = 1;
|
||||||
if (to_add) { // add the seed as a new chain
|
if (to_add) { // add the seed as a new chain
|
||||||
tmp.n = 1; tmp.m = 4;
|
tmp.n = 1; tmp.m = 4;
|
||||||
|
|
@ -249,7 +252,7 @@ void mem_print_chain(const bntseq_t *bns, mem_chain_v *chn)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq)
|
mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int64_t l_pac, int len, const uint8_t *seq)
|
||||||
{
|
{
|
||||||
mem_chain_v chain;
|
mem_chain_v chain;
|
||||||
smem_i *itr;
|
smem_i *itr;
|
||||||
|
|
@ -260,7 +263,7 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uin
|
||||||
tree = kb_init(chn, KB_DEFAULT_SIZE);
|
tree = kb_init(chn, KB_DEFAULT_SIZE);
|
||||||
itr = smem_itr_init(bwt);
|
itr = smem_itr_init(bwt);
|
||||||
smem_set_query(itr, len, seq);
|
smem_set_query(itr, len, seq);
|
||||||
mem_insert_seed(opt, tree, itr);
|
mem_insert_seed(opt, l_pac, tree, itr);
|
||||||
|
|
||||||
kv_resize(mem_chain_t, chain, kb_size(tree));
|
kv_resize(mem_chain_t, chain, kb_size(tree));
|
||||||
|
|
||||||
|
|
@ -419,6 +422,69 @@ void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) // IMPORT
|
||||||
* Construct the alignment from a chain *
|
* Construct the alignment from a chain *
|
||||||
****************************************/
|
****************************************/
|
||||||
|
|
||||||
|
/* mem_chain2aln() vs mem_chain2aln_short()
|
||||||
|
*
|
||||||
|
* mem_chain2aln() covers all the functionality of mem_chain2aln_short().
|
||||||
|
* However, it may waste time on extracting the reference sequences given a
|
||||||
|
* very long query. mem_chain2aln_short() is faster for very short chains in a
|
||||||
|
* long query. It may fail when the matches are long or reach the end of the
|
||||||
|
* query. In this case, mem_chain2aln() will be called again.
|
||||||
|
* mem_chain2aln_short() is almost never used for short-read alignment.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#define MEM_SHORT_EXT 50
|
||||||
|
#define MEM_SHORT_LEN 200
|
||||||
|
#define MAX_BAND_TRY 2
|
||||||
|
|
||||||
|
int mem_chain2aln_short(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av)
|
||||||
|
{
|
||||||
|
int i, qb, qe, xtra;
|
||||||
|
int64_t rb, re, rlen;
|
||||||
|
uint8_t *rseq = 0;
|
||||||
|
mem_alnreg_t a;
|
||||||
|
kswr_t x;
|
||||||
|
|
||||||
|
if (c->n == 0) return -1;
|
||||||
|
qb = l_query; qe = 0;
|
||||||
|
rb = l_pac<<1; re = 0;
|
||||||
|
memset(&a, 0, sizeof(mem_alnreg_t));
|
||||||
|
for (i = 0; i < c->n; ++i) {
|
||||||
|
const mem_seed_t *s = &c->seeds[i];
|
||||||
|
qb = qb < s->qbeg? qb : s->qbeg;
|
||||||
|
qe = qe > s->qbeg + s->len? qe : s->qbeg + s->len;
|
||||||
|
rb = rb < s->rbeg? rb : s->rbeg;
|
||||||
|
re = re > s->rbeg + s->len? re : s->rbeg + s->len;
|
||||||
|
a.seedcov += s->len;
|
||||||
|
}
|
||||||
|
qb -= MEM_SHORT_EXT; qe += MEM_SHORT_EXT;
|
||||||
|
if (qb <= 10 || qe >= l_query - 10) return 1; // because ksw_align() does not support end-to-end alignment
|
||||||
|
rb -= MEM_SHORT_EXT; re += MEM_SHORT_EXT;
|
||||||
|
rb = rb > 0? rb : 0;
|
||||||
|
re = re < l_pac<<1? re : l_pac<<1;
|
||||||
|
if (rb < l_pac && l_pac < re) {
|
||||||
|
if (c->seeds[0].rbeg < l_pac) re = l_pac;
|
||||||
|
else rb = l_pac;
|
||||||
|
}
|
||||||
|
if ((re - rb) - (qe - qb) > MEM_SHORT_EXT || (qe - qb) - (re - rb) > MEM_SHORT_EXT) return 1;
|
||||||
|
if (qe - qb >= opt->w * 4 || re - rb >= opt->w * 4) return 1;
|
||||||
|
if (qe - qb >= MEM_SHORT_LEN || re - rb >= MEM_SHORT_LEN) return 1;
|
||||||
|
|
||||||
|
rseq = bns_get_seq(l_pac, pac, rb, re, &rlen);
|
||||||
|
assert(rlen == re - rb);
|
||||||
|
xtra = KSW_XSUBO | KSW_XSTART | ((qe - qb) * opt->a < 250? KSW_XBYTE : 0) | (opt->min_seed_len * opt->a);
|
||||||
|
x = ksw_align(qe - qb, (uint8_t*)query + qb, re - rb, rseq, 5, opt->mat, opt->q, opt->r, xtra, 0);
|
||||||
|
free(rseq);
|
||||||
|
if (x.tb < MEM_SHORT_EXT>>1 || x.te > re - rb - (MEM_SHORT_EXT>>1)) return 1;
|
||||||
|
|
||||||
|
a.rb = rb + x.tb; a.re = rb + x.te + 1;
|
||||||
|
a.qb = qb + x.qb; a.qe = qb + x.qe + 1;
|
||||||
|
a.score = x.score;
|
||||||
|
a.csub = x.score2;
|
||||||
|
kv_push(mem_alnreg_t, *av, a);
|
||||||
|
if (bwa_verbose >= 4) printf("SHORT: [%d,%d) <=> [%ld,%ld)\n", a.qb, a.qe, (long)a.rb, (long)a.re);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
static inline int cal_max_gap(const mem_opt_t *opt, int qlen)
|
static inline int cal_max_gap(const mem_opt_t *opt, int qlen)
|
||||||
{
|
{
|
||||||
int l = (int)((double)(qlen * opt->a - opt->q) / opt->r + 1.);
|
int l = (int)((double)(qlen * opt->a - opt->q) / opt->r + 1.);
|
||||||
|
|
@ -427,8 +493,8 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen)
|
||||||
}
|
}
|
||||||
|
|
||||||
void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av)
|
void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av)
|
||||||
{ // FIXME: in general, we SHOULD check funny seed patterns such as contained seeds. When that happens, we should use a SW or extend more seeds
|
{
|
||||||
int i, k;
|
int i, k, max_off[2], aw[2]; // aw: actual bandwidth used in extension
|
||||||
int64_t rlen, rmax[2], tmp, max = 0;
|
int64_t rlen, rmax[2], tmp, max = 0;
|
||||||
const mem_seed_t *s;
|
const mem_seed_t *s;
|
||||||
uint8_t *rseq = 0;
|
uint8_t *rseq = 0;
|
||||||
|
|
@ -449,12 +515,12 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int
|
||||||
rmax[0] = rmax[0] > 0? rmax[0] : 0;
|
rmax[0] = rmax[0] > 0? rmax[0] : 0;
|
||||||
rmax[1] = rmax[1] < l_pac<<1? rmax[1] : l_pac<<1;
|
rmax[1] = rmax[1] < l_pac<<1? rmax[1] : l_pac<<1;
|
||||||
if (rmax[0] < l_pac && l_pac < rmax[1]) { // crossing the forward-reverse boundary; then choose one side
|
if (rmax[0] < l_pac && l_pac < rmax[1]) { // crossing the forward-reverse boundary; then choose one side
|
||||||
if (l_pac - rmax[0] > rmax[1] - l_pac) rmax[1] = l_pac;
|
if (c->seeds[0].rbeg < l_pac) rmax[1] = l_pac; // this works because all seeds are guaranteed to be on the same strand
|
||||||
else rmax[0] = l_pac;
|
else rmax[0] = l_pac;
|
||||||
}
|
}
|
||||||
// retrieve the reference sequence
|
// retrieve the reference sequence
|
||||||
rseq = bns_get_seq(l_pac, pac, rmax[0], rmax[1], &rlen);
|
rseq = bns_get_seq(l_pac, pac, rmax[0], rmax[1], &rlen);
|
||||||
if (rlen != rmax[1] - rmax[0]) return;
|
assert(rlen == rmax[1] - rmax[0]);
|
||||||
|
|
||||||
srt = xmalloc(c->n * 8);
|
srt = xmalloc(c->n * 8);
|
||||||
for (i = 0; i < c->n; ++i)
|
for (i = 0; i < c->n; ++i)
|
||||||
|
|
@ -485,6 +551,8 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int
|
||||||
|
|
||||||
a = kv_pushp(mem_alnreg_t, *av);
|
a = kv_pushp(mem_alnreg_t, *av);
|
||||||
memset(a, 0, sizeof(mem_alnreg_t));
|
memset(a, 0, sizeof(mem_alnreg_t));
|
||||||
|
a->w = aw[0] = aw[1] = opt->w;
|
||||||
|
a->score = -1;
|
||||||
|
|
||||||
if (s->qbeg) { // left extension
|
if (s->qbeg) { // left extension
|
||||||
uint8_t *rs, *qs;
|
uint8_t *rs, *qs;
|
||||||
|
|
@ -494,7 +562,13 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int
|
||||||
tmp = s->rbeg - rmax[0];
|
tmp = s->rbeg - rmax[0];
|
||||||
rs = xmalloc(tmp);
|
rs = xmalloc(tmp);
|
||||||
for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i];
|
for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i];
|
||||||
a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, s->len * opt->a, &qle, &tle, >le, &gscore);
|
for (i = 0; i < MAX_BAND_TRY; ++i) {
|
||||||
|
int prev = a->score;
|
||||||
|
aw[0] = opt->w << i;
|
||||||
|
a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, aw[0], opt->zdrop, s->len * opt->a, &qle, &tle, >le, &gscore, &max_off[0]);
|
||||||
|
if (bwa_verbose >= 4) printf("L\t%d < %d; w=%d; max_off=%d\n", prev, a->score, aw[0], max_off[0]); fflush(stdout);
|
||||||
|
if (a->score == prev || max_off[0] < (aw[0]>>1) + (aw[0]>>2)) break;
|
||||||
|
}
|
||||||
// check whether we prefer to reach the end of the query
|
// check whether we prefer to reach the end of the query
|
||||||
if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qb = s->qbeg - qle, a->rb = s->rbeg - tle; // local hits
|
if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qb = s->qbeg - qle, a->rb = s->rbeg - tle; // local hits
|
||||||
else a->qb = 0, a->rb = s->rbeg - gtle; // reach the end
|
else a->qb = 0, a->rb = s->rbeg - gtle; // reach the end
|
||||||
|
|
@ -502,15 +576,22 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int
|
||||||
} else a->score = s->len * opt->a, a->qb = 0, a->rb = s->rbeg;
|
} else a->score = s->len * opt->a, a->qb = 0, a->rb = s->rbeg;
|
||||||
|
|
||||||
if (s->qbeg + s->len != l_query) { // right extension
|
if (s->qbeg + s->len != l_query) { // right extension
|
||||||
int qle, tle, qe, re, gtle, gscore;
|
int qle, tle, qe, re, gtle, gscore, sc0 = a->score;
|
||||||
qe = s->qbeg + s->len;
|
qe = s->qbeg + s->len;
|
||||||
re = s->rbeg + s->len - rmax[0];
|
re = s->rbeg + s->len - rmax[0];
|
||||||
a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, &qle, &tle, >le, &gscore);
|
assert(re >= 0);
|
||||||
|
for (i = 0; i < MAX_BAND_TRY; ++i) {
|
||||||
|
int prev = a->score;
|
||||||
|
aw[1] = opt->w << i;
|
||||||
|
a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, aw[1], opt->zdrop, sc0, &qle, &tle, >le, &gscore, &max_off[1]);
|
||||||
|
if (bwa_verbose >= 4) printf("R\t%d < %d; w=%d; max_off=%d\n", prev, a->score, aw[1], max_off[1]); fflush(stdout);
|
||||||
|
if (a->score == prev || max_off[1] < (aw[1]>>1) + (aw[1]>>2)) break;
|
||||||
|
}
|
||||||
// similar to the above
|
// similar to the above
|
||||||
if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qe = qe + qle, a->re = rmax[0] + re + tle;
|
if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qe = qe + qle, a->re = rmax[0] + re + tle;
|
||||||
else a->qe = l_query, a->re = rmax[0] + re + gtle;
|
else a->qe = l_query, a->re = rmax[0] + re + gtle;
|
||||||
} else a->qe = l_query, a->re = s->rbeg + s->len;
|
} else a->qe = l_query, a->re = s->rbeg + s->len;
|
||||||
if (bwa_verbose >= 4) err_printf("[%d] score=%d\t[%d,%d) <=> [%ld,%ld)\n", k, a->score, a->qb, a->qe, (long)a->rb, (long)a->re);
|
if (bwa_verbose >= 4) { printf("[%d]\taw={%d,%d}\tscore=%d\t[%d,%d) <=> [%ld,%ld)\n", k, aw[0], aw[1], a->score, a->qb, a->qe, (long)a->rb, (long)a->re); fflush(stdout); }
|
||||||
|
|
||||||
// compute seedcov
|
// compute seedcov
|
||||||
for (i = 0, a->seedcov = 0; i < c->n; ++i) {
|
for (i = 0, a->seedcov = 0; i < c->n; ++i) {
|
||||||
|
|
@ -518,6 +599,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int
|
||||||
if (t->qbeg >= a->qb && t->qbeg + t->len <= a->qe && t->rbeg >= a->rb && t->rbeg + t->len <= a->re) // seed fully contained
|
if (t->qbeg >= a->qb && t->qbeg + t->len <= a->qe && t->rbeg >= a->rb && t->rbeg + t->len <= a->re) // seed fully contained
|
||||||
a->seedcov += t->len; // this is not very accurate, but for approx. mapQ, this is good enough
|
a->seedcov += t->len; // this is not very accurate, but for approx. mapQ, this is good enough
|
||||||
}
|
}
|
||||||
|
a->w = aw[0] > aw[1]? aw[0] : aw[1];
|
||||||
}
|
}
|
||||||
free(srt); free(rseq);
|
free(srt); free(rseq);
|
||||||
}
|
}
|
||||||
|
|
@ -685,7 +767,7 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b
|
||||||
h.qual = p->secondary >= 0? 0 : mem_approx_mapq_se(opt, p);
|
h.qual = p->secondary >= 0? 0 : mem_approx_mapq_se(opt, p);
|
||||||
if (k == 0) mapq0 = h.qual;
|
if (k == 0) mapq0 = h.qual;
|
||||||
else if (h.qual > mapq0) h.qual = mapq0;
|
else if (h.qual > mapq0) h.qual = mapq0;
|
||||||
bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, &h, opt->flag&MEM_F_HARDCLIP, m);
|
bwa_hit2sam(&str, opt->mat, opt->q, opt->r, p->w, bns, pac, s, &h, opt->flag&MEM_F_HARDCLIP, m);
|
||||||
}
|
}
|
||||||
} else bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, 0, opt->flag&MEM_F_HARDCLIP, m);
|
} else bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, 0, opt->flag&MEM_F_HARDCLIP, m);
|
||||||
s->sam = str.s;
|
s->sam = str.s;
|
||||||
|
|
@ -700,14 +782,16 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse
|
||||||
for (i = 0; i < l_seq; ++i) // convert to 2-bit encoding if we have not done so
|
for (i = 0; i < l_seq; ++i) // convert to 2-bit encoding if we have not done so
|
||||||
seq[i] = seq[i] < 4? seq[i] : nst_nt4_table[(int)seq[i]];
|
seq[i] = seq[i] < 4? seq[i] : nst_nt4_table[(int)seq[i]];
|
||||||
|
|
||||||
chn = mem_chain(opt, bwt, l_seq, (uint8_t*)seq);
|
chn = mem_chain(opt, bwt, bns->l_pac, l_seq, (uint8_t*)seq);
|
||||||
chn.n = mem_chain_flt(opt, chn.n, chn.a);
|
chn.n = mem_chain_flt(opt, chn.n, chn.a);
|
||||||
if (bwa_verbose >= 4) mem_print_chain(bns, &chn);
|
if (bwa_verbose >= 4) mem_print_chain(bns, &chn);
|
||||||
|
|
||||||
kv_init(regs);
|
kv_init(regs);
|
||||||
for (i = 0; i < chn.n; ++i) {
|
for (i = 0; i < chn.n; ++i) {
|
||||||
mem_chain_t *p = &chn.a[i];
|
mem_chain_t *p = &chn.a[i];
|
||||||
mem_chain2aln(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, ®s);
|
int ret;
|
||||||
|
ret = mem_chain2aln_short(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, ®s);
|
||||||
|
if (ret > 0) mem_chain2aln(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, ®s);
|
||||||
free(chn.a[i].seeds);
|
free(chn.a[i].seeds);
|
||||||
}
|
}
|
||||||
free(chn.a);
|
free(chn.a);
|
||||||
|
|
@ -715,20 +799,29 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse
|
||||||
return regs;
|
return regs;
|
||||||
}
|
}
|
||||||
|
|
||||||
mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq)
|
mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq_)
|
||||||
{ // the difference from mem_align1_core() lies in that this routine calls mem_mark_primary_se()
|
{ // the difference from mem_align1_core() is that this routine: 1) calls mem_mark_primary_se(); 2) does not modify the input sequence
|
||||||
mem_alnreg_v ar;
|
mem_alnreg_v ar;
|
||||||
|
char *seq;
|
||||||
|
seq = xmalloc(l_seq);
|
||||||
|
memcpy(seq, seq_, l_seq); // makes a copy of seq_
|
||||||
ar = mem_align1_core(opt, bwt, bns, pac, l_seq, seq);
|
ar = mem_align1_core(opt, bwt, bns, pac, l_seq, seq);
|
||||||
mem_mark_primary_se(opt, ar.n, ar.a);
|
mem_mark_primary_se(opt, ar.n, ar.a);
|
||||||
|
free(seq);
|
||||||
return ar;
|
return ar;
|
||||||
}
|
}
|
||||||
|
|
||||||
// This routine is only used for the API purpose
|
// This routine is only used for the API purpose
|
||||||
mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, uint8_t *query, const mem_alnreg_t *ar)
|
mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar)
|
||||||
{
|
{
|
||||||
mem_aln_t a;
|
mem_aln_t a;
|
||||||
int w2, qb = ar->qb, qe = ar->qe, NM, score, is_rev;
|
int i, w2, qb = ar->qb, qe = ar->qe, NM, score, is_rev;
|
||||||
int64_t pos, rb = ar->rb, re = ar->re;
|
int64_t pos, rb = ar->rb, re = ar->re;
|
||||||
|
uint8_t *query;
|
||||||
|
|
||||||
|
query = xmalloc(l_query);
|
||||||
|
for (i = 0; i < l_query; ++i) // convert to the nt4 encoding
|
||||||
|
query[i] = query_[i] < 5? query_[i] : nst_nt4_table[(int)query_[i]];
|
||||||
memset(&a, 0, sizeof(mem_aln_t));
|
memset(&a, 0, sizeof(mem_aln_t));
|
||||||
a.mapq = mem_approx_mapq_se(opt, ar);
|
a.mapq = mem_approx_mapq_se(opt, ar);
|
||||||
bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re);
|
bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re);
|
||||||
|
|
@ -742,7 +835,7 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *
|
||||||
int clip5, clip3;
|
int clip5, clip3;
|
||||||
clip5 = is_rev? l_query - qe : qb;
|
clip5 = is_rev? l_query - qe : qb;
|
||||||
clip3 = is_rev? qb : l_query - qe;
|
clip3 = is_rev? qb : l_query - qe;
|
||||||
a.cigar = realloc(a.cigar, 4 * (a.n_cigar + 2));
|
a.cigar = xrealloc(a.cigar, 4 * (a.n_cigar + 2));
|
||||||
if (clip5) {
|
if (clip5) {
|
||||||
memmove(a.cigar+1, a.cigar, a.n_cigar * 4);
|
memmove(a.cigar+1, a.cigar, a.n_cigar * 4);
|
||||||
a.cigar[0] = clip5<<4|3;
|
a.cigar[0] = clip5<<4|3;
|
||||||
|
|
@ -752,6 +845,7 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *
|
||||||
}
|
}
|
||||||
a.rid = bns_pos2rid(bns, pos);
|
a.rid = bns_pos2rid(bns, pos);
|
||||||
a.pos = pos - bns->anns[a.rid].offset;
|
a.pos = pos - bns->anns[a.rid].offset;
|
||||||
|
free(query);
|
||||||
return a;
|
return a;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
32
bwamem.h
32
bwamem.h
|
|
@ -22,6 +22,7 @@ typedef struct {
|
||||||
int pen_unpaired; // phred-scaled penalty for unpaired reads
|
int pen_unpaired; // phred-scaled penalty for unpaired reads
|
||||||
int pen_clip; // clipping penalty. This score is not deducted from the DP score.
|
int pen_clip; // clipping penalty. This score is not deducted from the DP score.
|
||||||
int w; // band width
|
int w; // band width
|
||||||
|
int zdrop; // Z-dropoff
|
||||||
|
|
||||||
int flag; // see MEM_F_* macros
|
int flag; // see MEM_F_* macros
|
||||||
int min_seed_len; // minimum seed length
|
int min_seed_len; // minimum seed length
|
||||||
|
|
@ -45,6 +46,7 @@ typedef struct {
|
||||||
int sub; // 2nd best SW score
|
int sub; // 2nd best SW score
|
||||||
int csub; // SW score of a tandem hit
|
int csub; // SW score of a tandem hit
|
||||||
int sub_n; // approximate number of suboptimal hits
|
int sub_n; // approximate number of suboptimal hits
|
||||||
|
int w; // actual band width used in extension
|
||||||
int seedcov; // length of regions coverged by seeds
|
int seedcov; // length of regions coverged by seeds
|
||||||
int secondary; // index of the parent hit shadowing the current hit; <0 if primary
|
int secondary; // index of the parent hit shadowing the current hit; <0 if primary
|
||||||
} mem_alnreg_t;
|
} mem_alnreg_t;
|
||||||
|
|
@ -64,11 +66,11 @@ typedef struct { // TODO: This is an intermediate struct only. Better get rid of
|
||||||
} bwahit_t;
|
} bwahit_t;
|
||||||
|
|
||||||
typedef struct { // This struct is only used for the convenience of API.
|
typedef struct { // This struct is only used for the convenience of API.
|
||||||
int rid;
|
int rid; // reference sequence index in bntseq_t
|
||||||
int pos;
|
int pos; // forward strand 5'-end mapping position
|
||||||
uint32_t is_rev:1, mapq:8, NM:23;
|
uint32_t is_rev:1, mapq:8, NM:23; // is_rev: whether on the reverse strand; mapq: mapping quality; NM: edit distance
|
||||||
int n_cigar;
|
int n_cigar; // number of CIGAR operations
|
||||||
uint32_t *cigar;
|
uint32_t *cigar; // CIGAR in the BAM encoding: opLen<<4|op; op to integer mapping: MIDSH=>01234
|
||||||
} mem_aln_t;
|
} mem_aln_t;
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
|
|
@ -109,20 +111,32 @@ extern "C" {
|
||||||
* Find the aligned regions for one query sequence
|
* Find the aligned regions for one query sequence
|
||||||
*
|
*
|
||||||
* Note that this routine does not generate CIGAR. CIGAR should be
|
* Note that this routine does not generate CIGAR. CIGAR should be
|
||||||
* generated later by bwa_gen_cigar() defined in bwa.c.
|
* generated later by mem_reg2aln() below.
|
||||||
*
|
*
|
||||||
* @param opt alignment parameters
|
* @param opt alignment parameters
|
||||||
* @param bwt FM-index of the reference sequence
|
* @param bwt FM-index of the reference sequence
|
||||||
* @param bns Information of the reference
|
* @param bns Information of the reference
|
||||||
* @param pac 2-bit encoded reference
|
* @param pac 2-bit encoded reference
|
||||||
* @param l_seq length of query sequence
|
* @param l_seq length of query sequence
|
||||||
* @param seq query sequence; conversion ACGTN/acgtn=>01234 to be applied
|
* @param seq query sequence
|
||||||
*
|
*
|
||||||
* @return list of aligned regions.
|
* @return list of aligned regions.
|
||||||
*/
|
*/
|
||||||
mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq);
|
mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq);
|
||||||
|
|
||||||
mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, uint8_t *query, const mem_alnreg_t *ar);
|
/**
|
||||||
|
* Generate CIGAR and forward-strand position from alignment region
|
||||||
|
*
|
||||||
|
* @param opt alignment parameters
|
||||||
|
* @param bns Information of the reference
|
||||||
|
* @param pac 2-bit encoded reference
|
||||||
|
* @param l_seq length of query sequence
|
||||||
|
* @param seq query sequence
|
||||||
|
* @param ar one alignment region
|
||||||
|
*
|
||||||
|
* @return CIGAR, strand, mapping quality and forward-strand position
|
||||||
|
*/
|
||||||
|
mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq, const mem_alnreg_t *ar);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Infer the insert size distribution from interleaved alignment regions
|
* Infer the insert size distribution from interleaved alignment regions
|
||||||
|
|
|
||||||
|
|
@ -8,11 +8,7 @@
|
||||||
#include "bwtsw2.h"
|
#include "bwtsw2.h"
|
||||||
#include "kstring.h"
|
#include "kstring.h"
|
||||||
#include "utils.h"
|
#include "utils.h"
|
||||||
#ifndef _NO_SSE2
|
|
||||||
#include "ksw.h"
|
#include "ksw.h"
|
||||||
#else
|
|
||||||
#include "stdaln.h"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#define MIN_RATIO 0.8
|
#define MIN_RATIO 0.8
|
||||||
#define OUTLIER_BOUND 2.0
|
#define OUTLIER_BOUND 2.0
|
||||||
|
|
@ -127,8 +123,7 @@ void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const b
|
||||||
for (i = 0; i < l_mseq; ++i) // on the forward strand
|
for (i = 0; i < l_mseq; ++i) // on the forward strand
|
||||||
seq[i] = nst_nt4_table[(int)mseq[i]];
|
seq[i] = nst_nt4_table[(int)mseq[i]];
|
||||||
}
|
}
|
||||||
#ifndef _NO_SSE2
|
{
|
||||||
{ // FIXME!!! The following block has not been tested since the update of the ksw library
|
|
||||||
int flag = KSW_XSUBO | KSW_XSTART | (l_mseq * g_mat[0] < 250? KSW_XBYTE : 0) | opt->t;
|
int flag = KSW_XSUBO | KSW_XSTART | (l_mseq * g_mat[0] < 250? KSW_XBYTE : 0) | opt->t;
|
||||||
kswr_t aln;
|
kswr_t aln;
|
||||||
aln = ksw_align(l_mseq, seq, end - beg, ref, 5, g_mat, opt->q, opt->r, flag, 0);
|
aln = ksw_align(l_mseq, seq, end - beg, ref, 5, g_mat, opt->q, opt->r, flag, 0);
|
||||||
|
|
@ -147,24 +142,6 @@ void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const b
|
||||||
printf("G=%d,G2=%d,beg=%d,end=%d,k=%lld,len=%d\n", a->G, a->G2, a->beg, a->end, a->k, a->len);
|
printf("G=%d,G2=%d,beg=%d,end=%d,k=%lld,len=%d\n", a->G, a->G2, a->beg, a->end, a->k, a->len);
|
||||||
*/
|
*/
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
{
|
|
||||||
AlnParam ap;
|
|
||||||
path_t path[2];
|
|
||||||
int matrix[25];
|
|
||||||
for (i = 0; i < 25; ++i) matrix[i] = g_mat[i];
|
|
||||||
ap.gap_open = opt->q; ap.gap_ext = opt->r; ap.gap_end = opt->r;
|
|
||||||
ap.matrix = matrix; ap.row = 5; ap.band_width = 50;
|
|
||||||
a->G = aln_local_core(ref, end - beg, seq, l_mseq, &ap, path, 0, opt->t, &a->G2);
|
|
||||||
if (a->G < opt->t) a->G = 0;
|
|
||||||
if (a->G2 < opt->t) a->G2 = 0;
|
|
||||||
if (a->G2) a->flag |= BSW2_FLAG_TANDEM;
|
|
||||||
a->k = beg + path[0].i - 1;
|
|
||||||
a->len = path[1].i - path[0].i + 1;
|
|
||||||
a->beg = path[0].j - 1;
|
|
||||||
a->end = path[1].j;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
if (a->is_rev) i = a->beg, a->beg = l_mseq - a->end, a->end = l_mseq - i;
|
if (a->is_rev) i = a->beg, a->beg = l_mseq - a->end, a->end = l_mseq - i;
|
||||||
free(seq);
|
free(seq);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -41,7 +41,7 @@ int main(int argc, char *argv[])
|
||||||
for (i = 0; i < ar.n; ++i) { // traverse each hit
|
for (i = 0; i < ar.n; ++i) { // traverse each hit
|
||||||
mem_aln_t a;
|
mem_aln_t a;
|
||||||
if (ar.a[i].secondary >= 0) continue; // skip secondary alignments
|
if (ar.a[i].secondary >= 0) continue; // skip secondary alignments
|
||||||
a = mem_reg2aln(opt, idx->bns, idx->pac, ks->seq.l, (uint8_t*)ks->seq.s, &ar.a[i]); // get forward-strand position and CIGAR
|
a = mem_reg2aln(opt, idx->bns, idx->pac, ks->seq.l, ks->seq.s, &ar.a[i]); // get forward-strand position and CIGAR
|
||||||
// print alignment
|
// print alignment
|
||||||
err_printf("%s\t%c\t%s\t%d\t%d\t", ks->name.s, "+-"[a.is_rev], idx->bns->anns[a.rid].name, a.pos, a.mapq);
|
err_printf("%s\t%c\t%s\t%d\t%d\t", ks->name.s, "+-"[a.is_rev], idx->bns->anns[a.rid].name, a.pos, a.mapq);
|
||||||
for (k = 0; k < a.n_cigar; ++k) // print CIGAR
|
for (k = 0; k < a.n_cigar; ++k) // print CIGAR
|
||||||
|
|
|
||||||
|
|
@ -27,7 +27,7 @@ int main_mem(int argc, char *argv[])
|
||||||
void *ko = 0, *ko2 = 0;
|
void *ko = 0, *ko2 = 0;
|
||||||
|
|
||||||
opt = mem_opt_init();
|
opt = mem_opt_init();
|
||||||
while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:")) >= 0) {
|
while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:")) >= 0) {
|
||||||
if (c == 'k') opt->min_seed_len = atoi(optarg);
|
if (c == 'k') opt->min_seed_len = atoi(optarg);
|
||||||
else if (c == 'w') opt->w = atoi(optarg);
|
else if (c == 'w') opt->w = atoi(optarg);
|
||||||
else if (c == 'A') opt->a = atoi(optarg);
|
else if (c == 'A') opt->a = atoi(optarg);
|
||||||
|
|
@ -43,6 +43,7 @@ int main_mem(int argc, char *argv[])
|
||||||
else if (c == 'p') opt->flag |= MEM_F_PE;
|
else if (c == 'p') opt->flag |= MEM_F_PE;
|
||||||
else if (c == 'M') opt->flag |= MEM_F_NO_MULTI;
|
else if (c == 'M') opt->flag |= MEM_F_NO_MULTI;
|
||||||
else if (c == 'c') opt->max_occ = atoi(optarg);
|
else if (c == 'c') opt->max_occ = atoi(optarg);
|
||||||
|
else if (c == 'd') opt->zdrop = atoi(optarg);
|
||||||
else if (c == 'v') bwa_verbose = atoi(optarg);
|
else if (c == 'v') bwa_verbose = atoi(optarg);
|
||||||
else if (c == 'r') opt->split_factor = atof(optarg);
|
else if (c == 'r') opt->split_factor = atof(optarg);
|
||||||
else if (c == 'C') copy_comment = 1;
|
else if (c == 'C') copy_comment = 1;
|
||||||
|
|
@ -58,6 +59,7 @@ int main_mem(int argc, char *argv[])
|
||||||
fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads);
|
fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads);
|
||||||
fprintf(stderr, " -k INT minimum seed length [%d]\n", opt->min_seed_len);
|
fprintf(stderr, " -k INT minimum seed length [%d]\n", opt->min_seed_len);
|
||||||
fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w);
|
fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w);
|
||||||
|
fprintf(stderr, " -d INT off-diagnal X-dropoff [%d]\n", opt->zdrop);
|
||||||
fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor);
|
fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor);
|
||||||
// fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width);
|
// fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width);
|
||||||
fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ);
|
fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ);
|
||||||
|
|
|
||||||
15
ksw.c
15
ksw.c
|
|
@ -360,11 +360,11 @@ typedef struct {
|
||||||
int32_t h, e;
|
int32_t h, e;
|
||||||
} eh_t;
|
} eh_t;
|
||||||
|
|
||||||
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore)
|
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off)
|
||||||
{
|
{
|
||||||
eh_t *eh; // score array
|
eh_t *eh; // score array
|
||||||
int8_t *qp; // query profile
|
int8_t *qp; // query profile
|
||||||
int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap, max_ie, gscore;
|
int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap, max_ie, gscore, max_off;
|
||||||
if (h0 < 0) h0 = 0;
|
if (h0 < 0) h0 = 0;
|
||||||
// allocate memory
|
// allocate memory
|
||||||
qp = xmalloc(qlen * m);
|
qp = xmalloc(qlen * m);
|
||||||
|
|
@ -387,6 +387,7 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
|
||||||
w = w < max_gap? w : max_gap;
|
w = w < max_gap? w : max_gap;
|
||||||
// DP loop
|
// DP loop
|
||||||
max = h0, max_i = max_j = -1; max_ie = -1, gscore = -1;
|
max = h0, max_i = max_j = -1; max_ie = -1, gscore = -1;
|
||||||
|
max_off = 0;
|
||||||
beg = 0, end = qlen;
|
beg = 0, end = qlen;
|
||||||
for (i = 0; LIKELY(i < tlen); ++i) {
|
for (i = 0; LIKELY(i < tlen); ++i) {
|
||||||
int f = 0, h1, m = 0, mj = -1;
|
int f = 0, h1, m = 0, mj = -1;
|
||||||
|
|
@ -411,7 +412,7 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
|
||||||
h = h > e? h : e;
|
h = h > e? h : e;
|
||||||
h = h > f? h : f;
|
h = h > f? h : f;
|
||||||
h1 = h; // save H(i,j) to h1 for the next column
|
h1 = h; // save H(i,j) to h1 for the next column
|
||||||
mj = m > h? mj : j;
|
mj = m > h? mj : j; // record the position where max score is achieved
|
||||||
m = m > h? m : h; // m is stored at eh[mj+1]
|
m = m > h? m : h; // m is stored at eh[mj+1]
|
||||||
h -= gapoe;
|
h -= gapoe;
|
||||||
h = h > 0? h : 0;
|
h = h > 0? h : 0;
|
||||||
|
|
@ -426,8 +427,11 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
|
||||||
max_ie = gscore > h1? max_ie : i;
|
max_ie = gscore > h1? max_ie : i;
|
||||||
gscore = gscore > h1? gscore : h1;
|
gscore = gscore > h1? gscore : h1;
|
||||||
}
|
}
|
||||||
if (m == 0) break;
|
if (m == 0 || max - m - abs((i - max_i) - (j - max_j)) * gape > zdrop) break; // drop to zero, or below Z-dropoff
|
||||||
if (m > max) max = m, max_i = i, max_j = mj;
|
if (m > max) {
|
||||||
|
max = m, max_i = i, max_j = mj;
|
||||||
|
max_off = max_off > abs(mj - i)? max_off : abs(mj - i);
|
||||||
|
}
|
||||||
// update beg and end for the next round
|
// update beg and end for the next round
|
||||||
for (j = mj; j >= beg && eh[j].h; --j);
|
for (j = mj; j >= beg && eh[j].h; --j);
|
||||||
beg = j + 1;
|
beg = j + 1;
|
||||||
|
|
@ -440,6 +444,7 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
|
||||||
if (_tle) *_tle = max_i + 1;
|
if (_tle) *_tle = max_i + 1;
|
||||||
if (_gtle) *_gtle = max_ie + 1;
|
if (_gtle) *_gtle = max_ie + 1;
|
||||||
if (_gscore) *_gscore = gscore;
|
if (_gscore) *_gscore = gscore;
|
||||||
|
if (_max_off) *_max_off = max_off;
|
||||||
return max;
|
return max;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
2
ksw.h
2
ksw.h
|
|
@ -102,7 +102,7 @@ extern "C" {
|
||||||
*
|
*
|
||||||
* @return best semi-local alignment score
|
* @return best semi-local alignment score
|
||||||
*/
|
*/
|
||||||
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *qle, int *tle, int *gtle, int *gscore);
|
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue