allow more seeds to be seen (thus slower..)

This commit is contained in:
Heng Li 2013-02-08 16:56:28 -05:00
parent 2848d3045a
commit 39607065e0
3 changed files with 20 additions and 16 deletions

View File

@ -33,7 +33,8 @@ mem_opt_t *mem_opt_init()
o = calloc(1, sizeof(mem_opt_t));
o->a = 1; o->b = 5; o->q = 8; o->r = 1; o->w = 100;
o->min_seed_len = 19;
o->max_occ = 50;
o->split_width = 10;
o->max_occ = 10000;
o->max_chain_gap = 10000;
o->mask_level = 0.50;
o->chain_drop_ratio = 0.50;
@ -87,25 +88,26 @@ void smem_set_query(smem_i *itr, int len, const uint8_t *query)
}
const bwtintv_v *smem_next(smem_i *itr, int split_len)
const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width)
{
int i, max, max_i;
int i, max, max_i, ori_start;
itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = itr->sub->n = 0;
if (itr->start >= itr->len || itr->start < 0) return 0;
while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases
if (itr->start == itr->len) return 0;
itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, itr->start, 1, itr->matches, itr->tmpvec); // search for SMEM
ori_start = itr->start;
itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, ori_start, 1, itr->matches, itr->tmpvec); // search for SMEM
if (itr->matches->n == 0) return itr->matches; // well, in theory, we should never come here
for (i = max = 0, max_i = 0; i < itr->matches->n; ++i) { // look for the longest match
bwtintv_t *p = &itr->matches->a[i];
int len = (uint32_t)p->info - (p->info>>32);
if (max < len) max = len, max_i = i;
}
if (split_len > 0 && max >= split_len && itr->matches->a[max_i].x[2] == 1) { // if the longest SMEM is unique and long
if (split_len > 0 && max >= split_len && itr->matches->a[max_i].x[2] <= split_width) { // if the longest SMEM is unique and long
int j;
bwtintv_v *a = itr->tmpvec[0]; // reuse tmpvec[0] for merging
bwtintv_t *p = &itr->matches->a[max_i];
bwt_smem1(itr->bwt, itr->len, itr->query, ((uint32_t)p->info + (p->info>>32))>>1, 2, itr->sub, itr->tmpvec); // starting from the middle of the longest MEM
bwt_smem1(itr->bwt, itr->len, itr->query, ((uint32_t)p->info + (p->info>>32))>>1, itr->matches->a[max_i].x[2]+1, itr->sub, itr->tmpvec); // starting from the middle of the longest MEM
i = j = 0; a->n = 0;
while (i < itr->matches->n && j < itr->sub->n) { // ordered merge
int64_t xi = itr->matches->a[i].info>>32<<32 | (itr->len - (uint32_t)itr->matches->a[i].info);
@ -113,14 +115,14 @@ const bwtintv_v *smem_next(smem_i *itr, int split_len)
if (xi < xj) {
kv_push(bwtintv_t, *a, itr->matches->a[i]);
++i;
} else if ((uint32_t)itr->sub->a[j].info - (itr->sub->a[j].info>>32) >= max>>1) {
} else if ((uint32_t)itr->sub->a[j].info - (itr->sub->a[j].info>>32) >= max>>1 && (uint32_t)itr->sub->a[j].info > ori_start) {
kv_push(bwtintv_t, *a, itr->sub->a[j]);
++j;
} else ++j;
}
for (; i < itr->matches->n; ++i) kv_push(bwtintv_t, *a, itr->matches->a[i]);
for (; j < itr->sub->n; ++j)
if ((uint32_t)itr->sub->a[j].info - (itr->sub->a[j].info>>32) >= max>>1)
if ((uint32_t)itr->sub->a[j].info - (itr->sub->a[j].info>>32) >= max>>1 && (uint32_t)itr->sub->a[j].info > ori_start)
kv_push(bwtintv_t, *a, itr->sub->a[j]);
kv_copy(bwtintv_t, *itr->matches, *a);
}
@ -160,7 +162,7 @@ static int test_and_merge(const mem_opt_t *opt, mem_chain_t *c, const mem_seed_t
static void mem_insert_seed(const mem_opt_t *opt, kbtree_t(chn) *tree, smem_i *itr)
{
const bwtintv_v *a;
while ((a = smem_next(itr, opt->min_seed_len<<1)) != 0) { // to find all SMEM and some internal MEM
while ((a = smem_next(itr, opt->min_seed_len<<1, opt->split_width)) != 0) { // to find all SMEM and some internal MEM
int i;
for (i = 0; i < a->n; ++i) { // go through each SMEM/MEM up to itr->start
bwtintv_t *p = &a->a[i];

View File

@ -16,6 +16,7 @@ typedef struct {
typedef struct {
int a, b, q, r, w;
int split_width;
int min_seed_len, max_occ, max_chain_gap;
int n_threads, chunk_size;
int pe_dir, is_pe;
@ -44,7 +45,7 @@ extern "C" {
smem_i *smem_itr_init(const bwt_t *bwt);
void smem_itr_destroy(smem_i *itr);
void smem_set_query(smem_i *itr, int len, const uint8_t *query);
const bwtintv_v *smem_next(smem_i *itr, int split_len);
const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width);
mem_opt_t *mem_opt_init(void);
void mem_fill_scmat(int a, int b, int8_t mat[25]);

View File

@ -25,10 +25,11 @@ int main_mem(int argc, char *argv[])
bseq1_t *seqs;
opt = mem_opt_init();
while ((c = getopt(argc, argv, "k:c:D:")) >= 0) {
while ((c = getopt(argc, argv, "k:c:D:s:")) >= 0) {
if (c == 'k') opt->min_seed_len = atoi(optarg);
else if (c == 'c') opt->max_occ = atoi(optarg);
else if (c == 'D') mem_debug = atoi(optarg);
else if (c == 's') opt->split_width = atoi(optarg);
}
if (optind + 1 >= argc) {
fprintf(stderr, "\n");
@ -76,7 +77,7 @@ int main_mem(int argc, char *argv[])
int main_fastmap(int argc, char *argv[])
{
int c, i, min_iwidth = 20, min_len = 17, print_seq = 0, split_long = 0;
int c, i, min_iwidth = 20, min_len = 17, print_seq = 0, split_width = 0;
kseq_t *seq;
bwtint_t k;
gzFile fp;
@ -85,16 +86,16 @@ int main_fastmap(int argc, char *argv[])
smem_i *itr;
const bwtintv_v *a;
while ((c = getopt(argc, argv, "w:l:ps")) >= 0) {
while ((c = getopt(argc, argv, "w:l:ps:")) >= 0) {
switch (c) {
case 's': split_long = 1; break;
case 's': split_width = atoi(optarg); break;
case 'p': print_seq = 1; break;
case 'w': min_iwidth = atoi(optarg); break;
case 'l': min_len = atoi(optarg); break;
}
}
if (optind + 1 >= argc) {
fprintf(stderr, "Usage: bwa fastmap [-ps] [-l minLen=%d] [-w maxSaSize=%d] <idxbase> <in.fq>\n", min_len, min_iwidth);
fprintf(stderr, "Usage: bwa fastmap [-p] [-s splitWidth=%d] [-l minLen=%d] [-w maxSaSize=%d] <idxbase> <in.fq>\n", split_width, min_len, min_iwidth);
return 1;
}
@ -119,7 +120,7 @@ int main_fastmap(int argc, char *argv[])
for (i = 0; i < seq->seq.l; ++i)
seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]];
smem_set_query(itr, seq->seq.l, (uint8_t*)seq->seq.s);
while ((a = smem_next(itr, split_long? min_len<<1 : 0)) != 0) {
while ((a = smem_next(itr, min_len<<1, split_width)) != 0) {
for (i = 0; i < a->n; ++i) {
bwtintv_t *p = &a->a[i];
if ((uint32_t)p->info - (p->info>>32) < min_len) continue;