r929: added simplified LAST-like seeding
This commit is contained in:
parent
25d45512d9
commit
038af2a551
21
bwamem.c
21
bwamem.c
|
|
@ -121,7 +121,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co
|
||||||
// first pass: find all SMEMs
|
// first pass: find all SMEMs
|
||||||
while (x < len) {
|
while (x < len) {
|
||||||
if (seq[x] < 4) {
|
if (seq[x] < 4) {
|
||||||
x = bwt_smem1a(bwt, len, seq, x, start_width, split_len, opt->max_mem_intv, &a->mem1, a->tmpv);
|
x = bwt_smem1(bwt, len, seq, x, start_width, &a->mem1, a->tmpv);
|
||||||
for (i = 0; i < a->mem1.n; ++i) {
|
for (i = 0; i < a->mem1.n; ++i) {
|
||||||
bwtintv_t *p = &a->mem1.a[i];
|
bwtintv_t *p = &a->mem1.a[i];
|
||||||
int slen = (uint32_t)p->info - (p->info>>32); // seed length
|
int slen = (uint32_t)p->info - (p->info>>32); // seed length
|
||||||
|
|
@ -131,7 +131,6 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co
|
||||||
} else ++x;
|
} else ++x;
|
||||||
}
|
}
|
||||||
// second pass: find MEMs inside a long SMEM
|
// second pass: find MEMs inside a long SMEM
|
||||||
if (opt->max_mem_intv > 0) goto sort_intv;
|
|
||||||
old_n = a->mem.n;
|
old_n = a->mem.n;
|
||||||
for (k = 0; k < old_n; ++k) {
|
for (k = 0; k < old_n; ++k) {
|
||||||
bwtintv_t *p = &a->mem.a[k];
|
bwtintv_t *p = &a->mem.a[k];
|
||||||
|
|
@ -142,8 +141,24 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co
|
||||||
if ((uint32_t)a->mem1.a[i].info - (a->mem1.a[i].info>>32) >= opt->min_seed_len)
|
if ((uint32_t)a->mem1.a[i].info - (a->mem1.a[i].info>>32) >= opt->min_seed_len)
|
||||||
kv_push(bwtintv_t, a->mem, a->mem1.a[i]);
|
kv_push(bwtintv_t, a->mem, a->mem1.a[i]);
|
||||||
}
|
}
|
||||||
|
// third pass: LAST-like
|
||||||
|
if (opt->max_mem_intv > 0) {
|
||||||
|
x = 0;
|
||||||
|
while (x < len) {
|
||||||
|
if (seq[x] < 4) {
|
||||||
|
if (1) {
|
||||||
|
bwtintv_t m;
|
||||||
|
x = bwt_seed_strategy1(bwt, len, seq, x, opt->max_mem_intv, &m);
|
||||||
|
if (m.x[2] > 0) kv_push(bwtintv_t, a->mem, m);
|
||||||
|
} else { // for now, we never come to this block which is slower
|
||||||
|
x = bwt_smem1a(bwt, len, seq, x, start_width, opt->max_mem_intv, &a->mem1, a->tmpv);
|
||||||
|
for (i = 0; i < a->mem1.n; ++i)
|
||||||
|
kv_push(bwtintv_t, a->mem, a->mem1.a[i]);
|
||||||
|
}
|
||||||
|
} else ++x;
|
||||||
|
}
|
||||||
|
}
|
||||||
// sort
|
// sort
|
||||||
sort_intv:
|
|
||||||
ks_introsort(mem_intv, a->mem.n, a->mem.a);
|
ks_introsort(mem_intv, a->mem.n, a->mem.a);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -65,7 +65,7 @@ const bwtintv_v *smem_next(smem_i *itr)
|
||||||
while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases
|
while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases
|
||||||
if (itr->start == itr->len) return 0;
|
if (itr->start == itr->len) return 0;
|
||||||
ori_start = itr->start;
|
ori_start = itr->start;
|
||||||
itr->start = bwt_smem1a(itr->bwt, itr->len, itr->query, ori_start, itr->min_intv, itr->max_len, itr->max_intv, itr->matches, itr->tmpvec); // search for SMEM
|
itr->start = bwt_smem1a(itr->bwt, itr->len, itr->query, ori_start, itr->min_intv, itr->max_intv, itr->matches, itr->tmpvec); // search for SMEM
|
||||||
return itr->matches;
|
return itr->matches;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
44
bwt.c
44
bwt.c
|
|
@ -285,8 +285,8 @@ static void bwt_reverse_intvs(bwtintv_v *p)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// NOTE: $max_intv is not currently used in BWA-MEM
|
||||||
int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, int max_len, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
|
int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
|
||||||
{
|
{
|
||||||
int i, j, c, ret;
|
int i, j, c, ret;
|
||||||
bwtintv_t ik, ok[4];
|
bwtintv_t ik, ok[4];
|
||||||
|
|
@ -302,10 +302,12 @@ int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv,
|
||||||
ik.info = x + 1;
|
ik.info = x + 1;
|
||||||
|
|
||||||
for (i = x + 1, curr->n = 0; i < len; ++i) { // forward search
|
for (i = x + 1, curr->n = 0; i < len; ++i) { // forward search
|
||||||
if (q[i] < 4) { // an A/C/G/T base
|
if (ik.x[2] < max_intv) { // an interval small enough
|
||||||
|
kv_push(bwtintv_t, *curr, ik);
|
||||||
|
break;
|
||||||
|
} else if (q[i] < 4) { // an A/C/G/T base
|
||||||
c = 3 - q[i]; // complement of q[i]
|
c = 3 - q[i]; // complement of q[i]
|
||||||
bwt_extend(bwt, &ik, ok, 0);
|
bwt_extend(bwt, &ik, ok, 0);
|
||||||
if (ik.x[2] < max_intv && (i - x > max_len || ik.x[2] == 1)) break;
|
|
||||||
if (ok[c].x[2] != ik.x[2]) { // change of the interval size
|
if (ok[c].x[2] != ik.x[2]) { // change of the interval size
|
||||||
kv_push(bwtintv_t, *curr, ik);
|
kv_push(bwtintv_t, *curr, ik);
|
||||||
if (ok[c].x[2] < min_intv) break; // the interval size is too small to be extended further
|
if (ok[c].x[2] < min_intv) break; // the interval size is too small to be extended further
|
||||||
|
|
@ -325,13 +327,8 @@ int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv,
|
||||||
c = i < 0? -1 : q[i] < 4? q[i] : -1; // c==-1 if i<0 or q[i] is an ambiguous base
|
c = i < 0? -1 : q[i] < 4? q[i] : -1; // c==-1 if i<0 or q[i] is an ambiguous base
|
||||||
for (j = 0, curr->n = 0; j < prev->n; ++j) {
|
for (j = 0, curr->n = 0; j < prev->n; ++j) {
|
||||||
bwtintv_t *p = &prev->a[j];
|
bwtintv_t *p = &prev->a[j];
|
||||||
int max_stop = 0;
|
if (c >= 0 && ik.x[2] >= max_intv) bwt_extend(bwt, p, ok, 1);
|
||||||
bwt_extend(bwt, p, ok, 1);
|
if (c < 0 || ik.x[2] < max_intv || ok[c].x[2] < min_intv) { // keep the hit if reaching the beginning or an ambiguous base or the intv is small enough
|
||||||
if (ok[c].x[2] != p->x[2]) { // change of interval size
|
|
||||||
if (p->info - i - 1 > max_len && p->x[2] < max_intv)
|
|
||||||
max_stop = 1;
|
|
||||||
}
|
|
||||||
if (c < 0 || ok[c].x[2] < min_intv || max_stop) { // keep the hit if reaching the beginning or an ambiguous base or the intv is small enough
|
|
||||||
if (curr->n == 0) { // test curr->n>0 to make sure there are no longer matches
|
if (curr->n == 0) { // test curr->n>0 to make sure there are no longer matches
|
||||||
if (mem->n == 0 || i + 1 < mem->a[mem->n-1].info>>32) { // skip contained matches
|
if (mem->n == 0 || i + 1 < mem->a[mem->n-1].info>>32) { // skip contained matches
|
||||||
ik = *p; ik.info |= (uint64_t)(i + 1)<<32;
|
ik = *p; ik.info |= (uint64_t)(i + 1)<<32;
|
||||||
|
|
@ -355,7 +352,30 @@ int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv,
|
||||||
|
|
||||||
int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
|
int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
|
||||||
{
|
{
|
||||||
return bwt_smem1a(bwt, len, q, x, min_intv, INT_MAX, 0, mem, tmpvec);
|
return bwt_smem1a(bwt, len, q, x, min_intv, 0, mem, tmpvec);
|
||||||
|
}
|
||||||
|
|
||||||
|
int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int max_intv, bwtintv_t *mem)
|
||||||
|
{
|
||||||
|
int i, c;
|
||||||
|
bwtintv_t ik, ok[4];
|
||||||
|
|
||||||
|
memset(mem, 0, sizeof(bwtintv_t));
|
||||||
|
if (q[x] > 3) return x + 1;
|
||||||
|
bwt_set_intv(bwt, q[x], ik); // the initial interval of a single base
|
||||||
|
for (i = x + 1; i < len; ++i) { // forward search
|
||||||
|
if (q[i] < 4) { // an A/C/G/T base
|
||||||
|
c = 3 - q[i]; // complement of q[i]
|
||||||
|
bwt_extend(bwt, &ik, ok, 0);
|
||||||
|
if (ok[c].x[2] < max_intv) {
|
||||||
|
*mem = ok[c];
|
||||||
|
mem->info = (uint64_t)x<<32 | (i + 1);
|
||||||
|
return i + 1;
|
||||||
|
}
|
||||||
|
ik = ok[c];
|
||||||
|
} else return i + 1;
|
||||||
|
}
|
||||||
|
return len;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*************************
|
/*************************
|
||||||
|
|
|
||||||
4
bwt.h
4
bwt.h
|
|
@ -119,9 +119,9 @@ extern "C" {
|
||||||
* Return the end of the longest exact match starting from _x_.
|
* Return the end of the longest exact match starting from _x_.
|
||||||
*/
|
*/
|
||||||
int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
|
int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
|
||||||
int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, int max_len, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
|
int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
|
||||||
|
|
||||||
// SMEM iterator interface
|
int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int max_intv, bwtintv_t *mem);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
|
|
||||||
2
main.c
2
main.c
|
|
@ -4,7 +4,7 @@
|
||||||
#include "utils.h"
|
#include "utils.h"
|
||||||
|
|
||||||
#ifndef PACKAGE_VERSION
|
#ifndef PACKAGE_VERSION
|
||||||
#define PACKAGE_VERSION "0.7.10-r926-dirty"
|
#define PACKAGE_VERSION "0.7.10-r929-dirty"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
int bwa_fa2pac(int argc, char *argv[]);
|
int bwa_fa2pac(int argc, char *argv[]);
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue