diff --git a/bwamem.c b/bwamem.c index 4643ea7..e1ef4e7 100644 --- a/bwamem.c +++ b/bwamem.c @@ -33,7 +33,8 @@ mem_opt_t *mem_opt_init() o = calloc(1, sizeof(mem_opt_t)); o->a = 1; o->b = 5; o->q = 8; o->r = 1; o->w = 100; o->min_seed_len = 19; - o->max_occ = 50; + o->split_width = 10; + o->max_occ = 10000; o->max_chain_gap = 10000; o->mask_level = 0.50; o->chain_drop_ratio = 0.50; @@ -87,25 +88,26 @@ void smem_set_query(smem_i *itr, int len, const uint8_t *query) } -const bwtintv_v *smem_next(smem_i *itr, int split_len) +const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width) { - int i, max, max_i; + int i, max, max_i, ori_start; itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = itr->sub->n = 0; if (itr->start >= itr->len || itr->start < 0) return 0; while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases if (itr->start == itr->len) return 0; - itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, itr->start, 1, itr->matches, itr->tmpvec); // search for SMEM + ori_start = itr->start; + itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, ori_start, 1, itr->matches, itr->tmpvec); // search for SMEM if (itr->matches->n == 0) return itr->matches; // well, in theory, we should never come here for (i = max = 0, max_i = 0; i < itr->matches->n; ++i) { // look for the longest match bwtintv_t *p = &itr->matches->a[i]; int len = (uint32_t)p->info - (p->info>>32); if (max < len) max = len, max_i = i; } - if (split_len > 0 && max >= split_len && itr->matches->a[max_i].x[2] == 1) { // if the longest SMEM is unique and long + if (split_len > 0 && max >= split_len && itr->matches->a[max_i].x[2] <= split_width) { // if the longest SMEM is unique and long int j; bwtintv_v *a = itr->tmpvec[0]; // reuse tmpvec[0] for merging bwtintv_t *p = &itr->matches->a[max_i]; - bwt_smem1(itr->bwt, itr->len, itr->query, ((uint32_t)p->info + (p->info>>32))>>1, 2, itr->sub, itr->tmpvec); // starting from the middle of the longest MEM + bwt_smem1(itr->bwt, itr->len, itr->query, ((uint32_t)p->info + (p->info>>32))>>1, itr->matches->a[max_i].x[2]+1, itr->sub, itr->tmpvec); // starting from the middle of the longest MEM i = j = 0; a->n = 0; while (i < itr->matches->n && j < itr->sub->n) { // ordered merge int64_t xi = itr->matches->a[i].info>>32<<32 | (itr->len - (uint32_t)itr->matches->a[i].info); @@ -113,14 +115,14 @@ const bwtintv_v *smem_next(smem_i *itr, int split_len) if (xi < xj) { kv_push(bwtintv_t, *a, itr->matches->a[i]); ++i; - } else if ((uint32_t)itr->sub->a[j].info - (itr->sub->a[j].info>>32) >= max>>1) { + } else if ((uint32_t)itr->sub->a[j].info - (itr->sub->a[j].info>>32) >= max>>1 && (uint32_t)itr->sub->a[j].info > ori_start) { kv_push(bwtintv_t, *a, itr->sub->a[j]); ++j; } else ++j; } for (; i < itr->matches->n; ++i) kv_push(bwtintv_t, *a, itr->matches->a[i]); for (; j < itr->sub->n; ++j) - if ((uint32_t)itr->sub->a[j].info - (itr->sub->a[j].info>>32) >= max>>1) + if ((uint32_t)itr->sub->a[j].info - (itr->sub->a[j].info>>32) >= max>>1 && (uint32_t)itr->sub->a[j].info > ori_start) kv_push(bwtintv_t, *a, itr->sub->a[j]); kv_copy(bwtintv_t, *itr->matches, *a); } @@ -160,7 +162,7 @@ static int test_and_merge(const mem_opt_t *opt, mem_chain_t *c, const mem_seed_t static void mem_insert_seed(const mem_opt_t *opt, kbtree_t(chn) *tree, smem_i *itr) { const bwtintv_v *a; - while ((a = smem_next(itr, opt->min_seed_len<<1)) != 0) { // to find all SMEM and some internal MEM + while ((a = smem_next(itr, opt->min_seed_len<<1, opt->split_width)) != 0) { // to find all SMEM and some internal MEM int i; for (i = 0; i < a->n; ++i) { // go through each SMEM/MEM up to itr->start bwtintv_t *p = &a->a[i]; diff --git a/bwamem.h b/bwamem.h index 4b30daf..d415ccf 100644 --- a/bwamem.h +++ b/bwamem.h @@ -16,6 +16,7 @@ typedef struct { typedef struct { int a, b, q, r, w; + int split_width; int min_seed_len, max_occ, max_chain_gap; int n_threads, chunk_size; int pe_dir, is_pe; @@ -44,7 +45,7 @@ extern "C" { smem_i *smem_itr_init(const bwt_t *bwt); void smem_itr_destroy(smem_i *itr); void smem_set_query(smem_i *itr, int len, const uint8_t *query); -const bwtintv_v *smem_next(smem_i *itr, int split_len); +const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width); mem_opt_t *mem_opt_init(void); void mem_fill_scmat(int a, int b, int8_t mat[25]); diff --git a/fastmap.c b/fastmap.c index 0d2354a..e31b0a5 100644 --- a/fastmap.c +++ b/fastmap.c @@ -25,10 +25,11 @@ int main_mem(int argc, char *argv[]) bseq1_t *seqs; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "k:c:D:")) >= 0) { + while ((c = getopt(argc, argv, "k:c:D:s:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 'c') opt->max_occ = atoi(optarg); else if (c == 'D') mem_debug = atoi(optarg); + else if (c == 's') opt->split_width = atoi(optarg); } if (optind + 1 >= argc) { fprintf(stderr, "\n"); @@ -76,7 +77,7 @@ int main_mem(int argc, char *argv[]) int main_fastmap(int argc, char *argv[]) { - int c, i, min_iwidth = 20, min_len = 17, print_seq = 0, split_long = 0; + int c, i, min_iwidth = 20, min_len = 17, print_seq = 0, split_width = 0; kseq_t *seq; bwtint_t k; gzFile fp; @@ -85,16 +86,16 @@ int main_fastmap(int argc, char *argv[]) smem_i *itr; const bwtintv_v *a; - while ((c = getopt(argc, argv, "w:l:ps")) >= 0) { + while ((c = getopt(argc, argv, "w:l:ps:")) >= 0) { switch (c) { - case 's': split_long = 1; break; + case 's': split_width = atoi(optarg); break; case 'p': print_seq = 1; break; case 'w': min_iwidth = atoi(optarg); break; case 'l': min_len = atoi(optarg); break; } } if (optind + 1 >= argc) { - fprintf(stderr, "Usage: bwa fastmap [-ps] [-l minLen=%d] [-w maxSaSize=%d] \n", min_len, min_iwidth); + fprintf(stderr, "Usage: bwa fastmap [-p] [-s splitWidth=%d] [-l minLen=%d] [-w maxSaSize=%d] \n", split_width, min_len, min_iwidth); return 1; } @@ -119,7 +120,7 @@ int main_fastmap(int argc, char *argv[]) for (i = 0; i < seq->seq.l; ++i) seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; smem_set_query(itr, seq->seq.l, (uint8_t*)seq->seq.s); - while ((a = smem_next(itr, split_long? min_len<<1 : 0)) != 0) { + while ((a = smem_next(itr, min_len<<1, split_width)) != 0) { for (i = 0; i < a->n; ++i) { bwtintv_t *p = &a->a[i]; if ((uint32_t)p->info - (p->info>>32) < min_len) continue;