reseed long SMEMs
This commit is contained in:
parent
abc675f278
commit
620ad6e5b9
40
bwamem.c
40
bwamem.c
|
|
@ -2,6 +2,7 @@
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include "bwamem.h"
|
#include "bwamem.h"
|
||||||
|
#include "kvec.h"
|
||||||
|
|
||||||
memopt_t *mem_opt_init()
|
memopt_t *mem_opt_init()
|
||||||
{
|
{
|
||||||
|
|
@ -26,6 +27,7 @@ smem_i *smem_itr_init(const bwt_t *bwt)
|
||||||
itr->tmpvec[0] = calloc(1, sizeof(bwtintv_v));
|
itr->tmpvec[0] = calloc(1, sizeof(bwtintv_v));
|
||||||
itr->tmpvec[1] = calloc(1, sizeof(bwtintv_v));
|
itr->tmpvec[1] = calloc(1, sizeof(bwtintv_v));
|
||||||
itr->matches = calloc(1, sizeof(bwtintv_v));
|
itr->matches = calloc(1, sizeof(bwtintv_v));
|
||||||
|
itr->sub = calloc(1, sizeof(bwtintv_v));
|
||||||
return itr;
|
return itr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -34,24 +36,50 @@ void smem_itr_destroy(smem_i *itr)
|
||||||
free(itr->tmpvec[0]->a);
|
free(itr->tmpvec[0]->a);
|
||||||
free(itr->tmpvec[1]->a);
|
free(itr->tmpvec[1]->a);
|
||||||
free(itr->matches->a);
|
free(itr->matches->a);
|
||||||
|
free(itr->sub->a);
|
||||||
free(itr);
|
free(itr);
|
||||||
}
|
}
|
||||||
|
|
||||||
void smem_set_query(smem_i *itr, int min_intv, int len, const uint8_t *query)
|
void smem_set_query(smem_i *itr, int len, const uint8_t *query)
|
||||||
{
|
{
|
||||||
itr->query = query;
|
itr->query = query;
|
||||||
itr->start = 0;
|
itr->start = 0;
|
||||||
itr->len = len;
|
itr->len = len;
|
||||||
itr->min_intv = min_intv;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int smem_next(smem_i *itr)
|
int smem_next(smem_i *itr, int split_len)
|
||||||
{
|
{
|
||||||
|
int i, max, max_i;
|
||||||
itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = 0;
|
itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = 0;
|
||||||
if (itr->start >= itr->len || itr->start < 0) return -1;
|
if (itr->start >= itr->len || itr->start < 0) return -1;
|
||||||
while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases
|
while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases
|
||||||
if (itr->start == itr->len) return -1;
|
if (itr->start == itr->len) return -1;
|
||||||
itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, itr->start, itr->min_intv, itr->matches, itr->tmpvec);
|
itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, itr->start, 1, itr->matches, itr->tmpvec);
|
||||||
|
if (itr->matches->n == 0) return itr->start;
|
||||||
|
for (i = max = 0, max_i = 0; i < itr->matches->n; ++i) {
|
||||||
|
bwtintv_t *p = &itr->matches->a[i];
|
||||||
|
int len = (uint32_t)p->info - (p->info>>32);
|
||||||
|
if (max < len) max = len, max_i = i;
|
||||||
|
}
|
||||||
|
if (split_len > 0 && max >= split_len && itr->matches->a[max_i].x[2] == 1) {
|
||||||
|
int j;
|
||||||
|
bwtintv_v *a = itr->tmpvec[0];
|
||||||
|
bwtintv_t *p = &itr->matches->a[max_i];
|
||||||
|
bwt_smem1(itr->bwt, itr->len, itr->query, ((uint32_t)p->info + (p->info>>32))>>1, 2, itr->sub, itr->tmpvec); // starting from the middle of the longest match
|
||||||
|
i = j = 0; a->n = 0;
|
||||||
|
while (i < itr->matches->n && j < itr->sub->n) { // ordered merge
|
||||||
|
if (itr->matches->a[i].info < itr->sub->a[j].info) {
|
||||||
|
kv_push(bwtintv_t, *a, itr->matches->a[i]);
|
||||||
|
++i;
|
||||||
|
} else {
|
||||||
|
kv_push(bwtintv_t, *a, itr->sub->a[j]);
|
||||||
|
++j;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (; i < itr->matches->n; ++i) kv_push(bwtintv_t, *a, itr->matches->a[i]);
|
||||||
|
for (; j < itr->sub->n; ++j) kv_push(bwtintv_t, *a, itr->sub->a[j]);
|
||||||
|
kv_copy(bwtintv_t, *itr->matches, *a);
|
||||||
|
}
|
||||||
return itr->start;
|
return itr->start;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -83,7 +111,7 @@ static int test_and_merge(const memopt_t *opt, memchain1_t *c, const memseed_t *
|
||||||
|
|
||||||
static void mem_insert_seed(const memopt_t *opt, kbtree_t(chn) *tree, smem_i *itr)
|
static void mem_insert_seed(const memopt_t *opt, kbtree_t(chn) *tree, smem_i *itr)
|
||||||
{
|
{
|
||||||
while (smem_next(itr) > 0) {
|
while (smem_next(itr, opt->min_seed_len<<1) > 0) {
|
||||||
int i;
|
int i;
|
||||||
for (i = 0; i < itr->matches->n; ++i) {
|
for (i = 0; i < itr->matches->n; ++i) {
|
||||||
bwtintv_t *p = &itr->matches->a[i];
|
bwtintv_t *p = &itr->matches->a[i];
|
||||||
|
|
@ -122,7 +150,7 @@ memchain_t mem_chain(const memopt_t *opt, const bwt_t *bwt, int len, const uint8
|
||||||
if (len < opt->min_seed_len) return chain; // if the query is shorter than the seed length, no match
|
if (len < opt->min_seed_len) return chain; // if the query is shorter than the seed length, no match
|
||||||
tree = kb_init(chn, KB_DEFAULT_SIZE);
|
tree = kb_init(chn, KB_DEFAULT_SIZE);
|
||||||
itr = smem_itr_init(bwt);
|
itr = smem_itr_init(bwt);
|
||||||
smem_set_query(itr, 1, len, seq);
|
smem_set_query(itr, len, seq);
|
||||||
mem_insert_seed(opt, tree, itr);
|
mem_insert_seed(opt, tree, itr);
|
||||||
|
|
||||||
chain.m = kb_size(tree); chain.n = 0;
|
chain.m = kb_size(tree); chain.n = 0;
|
||||||
|
|
|
||||||
9
bwamem.h
9
bwamem.h
|
|
@ -6,8 +6,9 @@
|
||||||
typedef struct {
|
typedef struct {
|
||||||
const bwt_t *bwt;
|
const bwt_t *bwt;
|
||||||
const uint8_t *query;
|
const uint8_t *query;
|
||||||
int start, len, min_intv;
|
int start, len;
|
||||||
bwtintv_v *tmpvec[2], *matches;
|
bwtintv_v *matches; // matches
|
||||||
|
bwtintv_v *tmpvec[2], *sub; // these are temporary arrays
|
||||||
} smem_i;
|
} smem_i;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
|
|
@ -37,8 +38,8 @@ extern "C" {
|
||||||
|
|
||||||
smem_i *smem_itr_init(const bwt_t *bwt);
|
smem_i *smem_itr_init(const bwt_t *bwt);
|
||||||
void smem_itr_destroy(smem_i *itr);
|
void smem_itr_destroy(smem_i *itr);
|
||||||
void smem_set_query(smem_i *itr, int min_intv, int len, const uint8_t *query);
|
void smem_set_query(smem_i *itr, int len, const uint8_t *query);
|
||||||
int smem_next(smem_i *itr);
|
int smem_next(smem_i *itr, int split_len);
|
||||||
|
|
||||||
memopt_t *mem_opt_init(void);
|
memopt_t *mem_opt_init(void);
|
||||||
|
|
||||||
|
|
|
||||||
14
fastmap.c
14
fastmap.c
|
|
@ -69,7 +69,7 @@ int main_mem(int argc, char *argv[])
|
||||||
|
|
||||||
int main_fastmap(int argc, char *argv[])
|
int main_fastmap(int argc, char *argv[])
|
||||||
{
|
{
|
||||||
int c, i, min_iwidth = 20, min_len = 17, print_seq = 0, min_intv = 1;
|
int c, i, min_iwidth = 20, min_len = 17, print_seq = 0, split_long = 0;
|
||||||
kseq_t *seq;
|
kseq_t *seq;
|
||||||
bwtint_t k;
|
bwtint_t k;
|
||||||
gzFile fp;
|
gzFile fp;
|
||||||
|
|
@ -77,16 +77,16 @@ int main_fastmap(int argc, char *argv[])
|
||||||
bntseq_t *bns;
|
bntseq_t *bns;
|
||||||
smem_i *itr;
|
smem_i *itr;
|
||||||
|
|
||||||
while ((c = getopt(argc, argv, "w:l:sm:")) >= 0) {
|
while ((c = getopt(argc, argv, "w:l:ps")) >= 0) {
|
||||||
switch (c) {
|
switch (c) {
|
||||||
case 's': print_seq = 1; break;
|
case 's': split_long = 1; break;
|
||||||
|
case 'p': print_seq = 1; break;
|
||||||
case 'w': min_iwidth = atoi(optarg); break;
|
case 'w': min_iwidth = atoi(optarg); break;
|
||||||
case 'l': min_len = atoi(optarg); break;
|
case 'l': min_len = atoi(optarg); break;
|
||||||
case 'm': min_intv = atoi(optarg); break;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (optind + 1 >= argc) {
|
if (optind + 1 >= argc) {
|
||||||
fprintf(stderr, "Usage: bwa fastmap [-s] [-l minLen=%d] [-w maxSaSize=%d] [-m minIntv=%d] <idxbase> <in.fq>\n", min_len, min_iwidth, min_intv);
|
fprintf(stderr, "Usage: bwa fastmap [-ps] [-l minLen=%d] [-w maxSaSize=%d] <idxbase> <in.fq>\n", min_len, min_iwidth);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -110,8 +110,8 @@ int main_fastmap(int argc, char *argv[])
|
||||||
} else putchar('\n');
|
} else putchar('\n');
|
||||||
for (i = 0; i < seq->seq.l; ++i)
|
for (i = 0; i < seq->seq.l; ++i)
|
||||||
seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]];
|
seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]];
|
||||||
smem_set_query(itr, min_intv, seq->seq.l, (uint8_t*)seq->seq.s);
|
smem_set_query(itr, seq->seq.l, (uint8_t*)seq->seq.s);
|
||||||
while (smem_next(itr) > 0) {
|
while (smem_next(itr, split_long? min_len<<1 : 0) > 0) {
|
||||||
for (i = 0; i < itr->matches->n; ++i) {
|
for (i = 0; i < itr->matches->n; ++i) {
|
||||||
bwtintv_t *p = &itr->matches->a[i];
|
bwtintv_t *p = &itr->matches->a[i];
|
||||||
if ((uint32_t)p->info - (p->info>>32) < min_len) continue;
|
if ((uint32_t)p->info - (p->info>>32) < min_len) continue;
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue