dev-r442: suppress exact full-length matches

This commit is contained in:
Heng Li 2014-02-26 22:04:19 -05:00
parent 1c19bc630f
commit 2e9463ebf1
4 changed files with 23 additions and 5 deletions

View File

@ -111,7 +111,7 @@ void smem_set_query(smem_i *itr, int len, const uint8_t *query)
itr->len = len;
}
const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width)
const bwtintv_v *smem_next2(smem_i *itr, int split_len, int split_width, int start_width)
{
int i, max, max_i, ori_start;
itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = itr->sub->n = 0;
@ -119,7 +119,7 @@ const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width)
while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases
if (itr->start == itr->len) return 0;
ori_start = itr->start;
itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, ori_start, 1, itr->matches, itr->tmpvec); // search for SMEM
itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, ori_start, start_width, itr->matches, itr->tmpvec); // search for SMEM
if (itr->matches->n == 0) return itr->matches; // well, in theory, we should never come here
for (i = max = 0, max_i = 0; i < itr->matches->n; ++i) { // look for the longest match
bwtintv_t *p = &itr->matches->a[i];
@ -152,6 +152,11 @@ const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width)
return itr->matches;
}
const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width)
{
return smem_next2(itr, split_len, split_width, 1);
}
/********************************
* Chaining while finding SMEMs *
********************************/
@ -200,8 +205,9 @@ static void mem_insert_seed(const mem_opt_t *opt, int64_t l_pac, kbtree_t(chn) *
{
const bwtintv_v *a;
int split_len = (int)(opt->min_seed_len * opt->split_factor + .499);
int start_width = (opt->flag & MEM_F_NO_EXACT)? 2 : 1;
split_len = split_len < itr->len? split_len : itr->len;
while ((a = smem_next(itr, split_len, opt->split_width)) != 0) { // to find all SMEM and some internal MEM
while ((a = smem_next2(itr, split_len, opt->split_width, start_width)) != 0) { // to find all SMEM and some internal MEM
int i;
for (i = 0; i < a->n; ++i) { // go through each SMEM/MEM up to itr->start
bwtintv_t *p = &a->a[i];
@ -425,6 +431,13 @@ int mem_sort_and_dedup(int n, mem_alnreg_t *a, float mask_level_redun)
return m;
}
int mem_test_and_remove_exact(const mem_opt_t *opt, int n, mem_alnreg_t *a, int qlen)
{
if (!(opt->flag & MEM_F_NO_EXACT) || n == 0 || a->truesc != qlen * opt->a) return n;
memmove(a, a + 1, (n - 1) * sizeof(mem_alnreg_t));
return n - 1;
}
void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id) // IMPORTANT: must run mem_sort_and_dedup() before calling this function
{ // similar to the loop in mem_chain_flt()
int i, k, tmp;
@ -894,6 +907,8 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse
}
free(chn.a);
regs.n = mem_sort_and_dedup(regs.n, regs.a, opt->mask_level_redun);
if (opt->flag & MEM_F_NO_EXACT)
regs.n = mem_test_and_remove_exact(opt, regs.n, regs.a, l_seq);
return regs;
}

View File

@ -16,6 +16,7 @@ typedef struct __smem_i smem_i;
#define MEM_F_ALL 0x8
#define MEM_F_NO_MULTI 0x10
#define MEM_F_NO_RESCUE 0x20
#define MEM_F_NO_EXACT 0x40
typedef struct {
int a, b, q, r; // match score, mismatch penalty and gap open/extension penalty. A gap of size k costs q+k*r

View File

@ -30,7 +30,7 @@ int main_mem(int argc, char *argv[])
int64_t n_processed = 0;
opt = mem_opt_init();
while ((c = getopt(argc, argv, "paMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:")) >= 0) {
while ((c = getopt(argc, argv, "epaMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:")) >= 0) {
if (c == 'k') opt->min_seed_len = atoi(optarg);
else if (c == 'w') opt->w = atoi(optarg);
else if (c == 'A') opt->a = atoi(optarg);
@ -45,6 +45,7 @@ int main_mem(int argc, char *argv[])
else if (c == 'p') opt->flag |= MEM_F_PE;
else if (c == 'M') opt->flag |= MEM_F_NO_MULTI;
else if (c == 'S') opt->flag |= MEM_F_NO_RESCUE;
else if (c == 'e') opt->flag |= MEM_F_NO_EXACT;
else if (c == 'c') opt->max_occ = atoi(optarg);
else if (c == 'd') opt->zdrop = atoi(optarg);
else if (c == 'v') bwa_verbose = atoi(optarg);
@ -81,6 +82,7 @@ int main_mem(int argc, char *argv[])
fprintf(stderr, " -m INT perform at most INT rounds of mate rescues for each read [%d]\n", opt->max_matesw);
fprintf(stderr, " -S skip mate rescue\n");
fprintf(stderr, " -P skip pairing; mate rescue performed unless -S also in use\n");
fprintf(stderr, " -e discard full-length exact matches\n");
fprintf(stderr, " -A INT score for a sequence match [%d]\n", opt->a);
fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b);
fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->q);

2
main.c
View File

@ -4,7 +4,7 @@
#include "utils.h"
#ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "0.7.7-r441"
#define PACKAGE_VERSION "0.7.7+dev-r442"
#endif
int bwa_fa2pac(int argc, char *argv[]);