diff --git a/bwamem.c b/bwamem.c index 19ca561..4a08fed 100644 --- a/bwamem.c +++ b/bwamem.c @@ -111,7 +111,7 @@ void smem_set_query(smem_i *itr, int len, const uint8_t *query) itr->len = len; } -const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width) +const bwtintv_v *smem_next2(smem_i *itr, int split_len, int split_width, int start_width) { int i, max, max_i, ori_start; itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = itr->sub->n = 0; @@ -119,7 +119,7 @@ const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width) while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases if (itr->start == itr->len) return 0; ori_start = itr->start; - itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, ori_start, 1, itr->matches, itr->tmpvec); // search for SMEM + itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, ori_start, start_width, itr->matches, itr->tmpvec); // search for SMEM if (itr->matches->n == 0) return itr->matches; // well, in theory, we should never come here for (i = max = 0, max_i = 0; i < itr->matches->n; ++i) { // look for the longest match bwtintv_t *p = &itr->matches->a[i]; @@ -152,6 +152,11 @@ const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width) return itr->matches; } +const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width) +{ + return smem_next2(itr, split_len, split_width, 1); +} + /******************************** * Chaining while finding SMEMs * ********************************/ @@ -200,8 +205,9 @@ static void mem_insert_seed(const mem_opt_t *opt, int64_t l_pac, kbtree_t(chn) * { const bwtintv_v *a; int split_len = (int)(opt->min_seed_len * opt->split_factor + .499); + int start_width = (opt->flag & MEM_F_NO_EXACT)? 2 : 1; split_len = split_len < itr->len? split_len : itr->len; - while ((a = smem_next(itr, split_len, opt->split_width)) != 0) { // to find all SMEM and some internal MEM + while ((a = smem_next2(itr, split_len, opt->split_width, start_width)) != 0) { // to find all SMEM and some internal MEM int i; for (i = 0; i < a->n; ++i) { // go through each SMEM/MEM up to itr->start bwtintv_t *p = &a->a[i]; @@ -425,6 +431,13 @@ int mem_sort_and_dedup(int n, mem_alnreg_t *a, float mask_level_redun) return m; } +int mem_test_and_remove_exact(const mem_opt_t *opt, int n, mem_alnreg_t *a, int qlen) +{ + if (!(opt->flag & MEM_F_NO_EXACT) || n == 0 || a->truesc != qlen * opt->a) return n; + memmove(a, a + 1, (n - 1) * sizeof(mem_alnreg_t)); + return n - 1; +} + void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id) // IMPORTANT: must run mem_sort_and_dedup() before calling this function { // similar to the loop in mem_chain_flt() int i, k, tmp; @@ -894,6 +907,8 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse } free(chn.a); regs.n = mem_sort_and_dedup(regs.n, regs.a, opt->mask_level_redun); + if (opt->flag & MEM_F_NO_EXACT) + regs.n = mem_test_and_remove_exact(opt, regs.n, regs.a, l_seq); return regs; } diff --git a/bwamem.h b/bwamem.h index 8b24c51..9686d27 100644 --- a/bwamem.h +++ b/bwamem.h @@ -16,6 +16,7 @@ typedef struct __smem_i smem_i; #define MEM_F_ALL 0x8 #define MEM_F_NO_MULTI 0x10 #define MEM_F_NO_RESCUE 0x20 +#define MEM_F_NO_EXACT 0x40 typedef struct { int a, b, q, r; // match score, mismatch penalty and gap open/extension penalty. A gap of size k costs q+k*r diff --git a/fastmap.c b/fastmap.c index 72d850c..2b2d3da 100644 --- a/fastmap.c +++ b/fastmap.c @@ -30,7 +30,7 @@ int main_mem(int argc, char *argv[]) int64_t n_processed = 0; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "paMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:")) >= 0) { + while ((c = getopt(argc, argv, "epaMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 'w') opt->w = atoi(optarg); else if (c == 'A') opt->a = atoi(optarg); @@ -45,6 +45,7 @@ int main_mem(int argc, char *argv[]) else if (c == 'p') opt->flag |= MEM_F_PE; else if (c == 'M') opt->flag |= MEM_F_NO_MULTI; else if (c == 'S') opt->flag |= MEM_F_NO_RESCUE; + else if (c == 'e') opt->flag |= MEM_F_NO_EXACT; else if (c == 'c') opt->max_occ = atoi(optarg); else if (c == 'd') opt->zdrop = atoi(optarg); else if (c == 'v') bwa_verbose = atoi(optarg); @@ -81,6 +82,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -m INT perform at most INT rounds of mate rescues for each read [%d]\n", opt->max_matesw); fprintf(stderr, " -S skip mate rescue\n"); fprintf(stderr, " -P skip pairing; mate rescue performed unless -S also in use\n"); + fprintf(stderr, " -e discard full-length exact matches\n"); fprintf(stderr, " -A INT score for a sequence match [%d]\n", opt->a); fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b); fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->q); diff --git a/main.c b/main.c index a8df9c0..4edf9bd 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.7-r441" +#define PACKAGE_VERSION "0.7.7+dev-r442" #endif int bwa_fa2pac(int argc, char *argv[]);