preparation for further changes

This commit is contained in:
Heng Li 2013-01-31 11:42:31 -05:00
parent 292f9061ab
commit 6641788d38
3 changed files with 24 additions and 21 deletions

27
bwt.c
View File

@ -277,7 +277,7 @@ static void bwt_reverse_intvs(bwtintv_v *p)
}
}
int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem, bwtintv_v *tmpvec[2])
int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
{
int i, j, c, ret;
bwtintv_t ik, ok[4];
@ -285,37 +285,38 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem
mem->n = 0;
if (q[x] > 3) return x + 1;
if (min_intv < 1) min_intv = 1; // the interval size should be at least 1
kv_init(a[0]); kv_init(a[1]);
prev = tmpvec[0]? tmpvec[0] : &a[0];
curr = tmpvec[1]? tmpvec[1] : &a[1];
bwt_set_intv(bwt, q[x], ik);
prev = tmpvec && tmpvec[0]? tmpvec[0] : &a[0]; // use the temporary vector if provided
curr = tmpvec && tmpvec[1]? tmpvec[1] : &a[1];
bwt_set_intv(bwt, q[x], ik); // the initial interval of a single base
ik.info = x + 1;
for (i = x + 1, curr->n = 0; i < len; ++i) { // forward search
if (q[i] < 4) {
c = 3 - q[i];
if (q[i] < 4) { // an A/C/G/T base
c = 3 - q[i]; // complement of q[i]
bwt_extend(bwt, &ik, ok, 0);
if (ok[c].x[2] != ik.x[2]) // change of the interval size
kv_push(bwtintv_t, *curr, ik);
if (ok[c].x[2] == 0) break; // cannot be extended
if (ok[c].x[2] < min_intv) break; // the interval size is too small to be extended further
ik = ok[c]; ik.info = i + 1;
} else { // an ambiguous base
kv_push(bwtintv_t, *curr, ik);
break; // cannot be extended; in this case, i<len always stands
break; // always terminate extension at an ambiguous base; in this case, i<len always stands
}
}
if (i == len) kv_push(bwtintv_t, *curr, ik); // push the last interval if we reach the end
bwt_reverse_intvs(curr); // s.t. smaller intervals visited first
bwt_reverse_intvs(curr); // s.t. smaller intervals (i.e. longer matches) visited first
ret = curr->a[0].info; // this will be the returned value
swap = curr; curr = prev; prev = swap;
for (i = x - 1; i >= -1; --i) { // backward search for MEMs
if (q[i] > 3) break;
if (i >= 0 && q[i] > 3) break; // always stop at an ambiguous base as the FM-index does not have any.
c = i < 0? 0 : q[i];
for (j = 0, curr->n = 0; j < prev->n; ++j) {
bwtintv_t *p = &prev->a[j];
bwt_extend(bwt, p, ok, 1);
if (ok[c].x[2] == 0 || i == -1) { // keep the hit if reaching the beginning or not extended further
if (ok[c].x[2] < min_intv || i == -1) { // keep the hit if reaching the beginning or not extended further
if (curr->n == 0) { // curr->n to make sure there is no longer matches
if (mem->n == 0 || i + 1 < mem->a[mem->n-1].info>>32) { // skip contained matches
ik = *p; ik.info |= (uint64_t)(i + 1)<<32;
@ -333,7 +334,7 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem
}
bwt_reverse_intvs(mem); // s.t. sorted by the start coordinate
if (tmpvec[0] == 0) free(a[0].a);
if (tmpvec[1] == 0) free(a[1].a);
if (tmpvec == 0 || tmpvec[0] == 0) free(a[0].a);
if (tmpvec == 0 || tmpvec[1] == 0) free(a[1].a);
return ret;
}

2
bwt.h
View File

@ -121,7 +121,7 @@ extern "C" {
* Given a query _q_, collect potential SMEMs covering position _x_ and store them in _mem_.
* Return the end of the longest exact match starting from _x_.
*/
int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
#ifdef __cplusplus
}

View File

@ -13,11 +13,11 @@ extern unsigned char nst_nt4_table[256];
typedef struct {
const bwt_t *bwt;
const uint8_t *query;
int start, len;
int start, len, min_intv;
bwtintv_v *tmpvec[2], *matches;
} smem_i;
smem_i *smem_iter_init(const bwt_t *bwt)
smem_i *smem_iter_init(const bwt_t *bwt, int min_intv)
{
smem_i *iter;
iter = calloc(1, sizeof(smem_i));
@ -25,6 +25,7 @@ smem_i *smem_iter_init(const bwt_t *bwt)
iter->tmpvec[0] = calloc(1, sizeof(bwtintv_v));
iter->tmpvec[1] = calloc(1, sizeof(bwtintv_v));
iter->matches = calloc(1, sizeof(bwtintv_v));
iter->min_intv = min_intv > 0? min_intv : 1;
return iter;
}
@ -49,13 +50,13 @@ int smem_next(smem_i *iter)
if (iter->start >= iter->len || iter->start < 0) return -1;
while (iter->start < iter->len && iter->query[iter->start] > 3) ++iter->start; // skip ambiguous bases
if (iter->start == iter->len) return -1;
iter->start = bwt_smem1(iter->bwt, iter->len, iter->query, iter->start, iter->matches, iter->tmpvec);
iter->start = bwt_smem1(iter->bwt, iter->len, iter->query, iter->start, iter->min_intv, iter->matches, iter->tmpvec);
return iter->start;
}
int main_fastmap(int argc, char *argv[])
{
int c, i, min_iwidth = 20, min_len = 17, print_seq = 0;
int c, i, min_iwidth = 20, min_len = 17, print_seq = 0, min_intv = 1;
kseq_t *seq;
bwtint_t k;
gzFile fp;
@ -63,15 +64,16 @@ int main_fastmap(int argc, char *argv[])
bntseq_t *bns;
smem_i *iter;
while ((c = getopt(argc, argv, "w:l:s")) >= 0) {
while ((c = getopt(argc, argv, "w:l:sm:")) >= 0) {
switch (c) {
case 's': print_seq = 1; break;
case 'w': min_iwidth = atoi(optarg); break;
case 'l': min_len = atoi(optarg); break;
case 'm': min_intv = atoi(optarg); break;
}
}
if (optind + 1 >= argc) {
fprintf(stderr, "Usage: bwa fastmap [-s] [-l minLen=%d] [-w maxSaSize=%d] <idxbase> <in.fq>\n", min_len, min_iwidth);
fprintf(stderr, "Usage: bwa fastmap [-s] [-l minLen=%d] [-w maxSaSize=%d] [-m minIntv=%d] <idxbase> <in.fq>\n", min_len, min_iwidth, min_intv);
return 1;
}
@ -86,7 +88,7 @@ int main_fastmap(int argc, char *argv[])
free(tmp);
bns = bns_restore(argv[optind]);
}
iter = smem_iter_init(bwt);
iter = smem_iter_init(bwt, min_intv);
while (kseq_read(seq) >= 0) {
printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l);
if (print_seq) {