diff --git a/bwamem.c b/bwamem.c index 346715c..27e341b 100644 --- a/bwamem.c +++ b/bwamem.c @@ -358,7 +358,7 @@ KSORT_INIT(mem_ars, mem_alnreg_t, alnreg_slt) #define alnreg_hlt(a, b) ((a).score > (b).score || ((a).score == (b).score && (a).hash < (b).hash)) KSORT_INIT(mem_ars_hash, mem_alnreg_t, alnreg_hlt) -int mem_sort_and_dedup(int n, mem_alnreg_t *a, float mask_level_redun) +int mem_sort_and_dedup2(int n, mem_alnreg_t *a, float mask_level_redun, int merge_bw) { int m, i, j; if (n <= 1) return n; @@ -366,19 +366,39 @@ int mem_sort_and_dedup(int n, mem_alnreg_t *a, float mask_level_redun) for (i = 1; i < n; ++i) { mem_alnreg_t *p = &a[i]; if (p->rb >= a[i-1].re) continue; - for (j = i - 1; j >= 0 && p->rb < a[j].re; --j) { + for (j = i - 1; j >= 0 && p->rid == a[j].rid && p->rb < a[j].re; --j) { mem_alnreg_t *q = &a[j]; int64_t or, oq, mr, mq; + int is_merged = 0; if (q->qe == q->qb) continue; // a[j] has been excluded - or = q->re - p->rb; // overlap length on the reference - oq = q->qb < p->qb? q->qe - p->qb : p->qe - q->qb; // overlap length on the query - mr = q->re - q->rb < p->re - p->rb? q->re - q->rb : p->re - p->rb; // min ref len in alignment - mq = q->qe - q->qb < p->qe - p->qb? q->qe - q->qb : p->qe - p->qb; // min qry len in alignment - if (or > mask_level_redun * mr && oq > mask_level_redun * mq) { // one of the hits is redundant - if (p->score < q->score) { - p->qe = p->qb; - break; - } else q->qe = q->qb; + if (merge_bw > 0 && q->qb < p->qb && q->qe < p->qe && q->re < p->re) { + int flag = 0; + int64_t l1, l2; + l1 = p->qb - q->qb, l2 = p->rb - q->rb; + if (l1 - l2 < merge_bw && l2 - l1 < merge_bw) ++flag; + l1 = p->qe - q->qe, l2 = p->re - q->re; + if (l1 - l2 < merge_bw && l2 - l1 < merge_bw) ++flag; + if (flag == 2) { // merge q into p + mem_alnreg_t t = *p; + *p = *q; + p->qe = t.qe, p->re = t.re; + p->score = p->score > t.score? p->score : t.score; + p->w = p->w > t.w? p->w : t.w; + q->qb = q->qe; + is_merged = 1; + } + } + if (is_merged == 0) { + or = q->re - p->rb; // overlap length on the reference + oq = q->qb < p->qb? q->qe - p->qb : p->qe - q->qb; // overlap length on the query + mr = q->re - q->rb < p->re - p->rb? q->re - q->rb : p->re - p->rb; // min ref len in alignment + mq = q->qe - q->qb < p->qe - p->qb? q->qe - q->qb : p->qe - p->qb; // min qry len in alignment + if (or > mask_level_redun * mr && oq > mask_level_redun * mq) { // one of the hits is redundant + if (p->score < q->score) { + p->qe = p->qb; + break; + } else q->qe = q->qb; + } } } } @@ -401,6 +421,11 @@ int mem_sort_and_dedup(int n, mem_alnreg_t *a, float mask_level_redun) return m; } +int mem_sort_and_dedup(int n, mem_alnreg_t *a, float mask_level_redun) +{ + return mem_sort_and_dedup2(n, a, mask_level_redun, -1); +} + int mem_test_and_remove_exact(const mem_opt_t *opt, int n, mem_alnreg_t *a, int qlen) { if (!(opt->flag & MEM_F_SELF_OVLP) || n == 0 || a->truesc != qlen * opt->a) return n; @@ -960,7 +985,8 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse free(chn.a[i].seeds); } free(chn.a); - regs.n = mem_sort_and_dedup(regs.n, regs.a, opt->mask_level_redun); + if (opt->flag & MEM_F_MERGE_REG) regs.n = mem_sort_and_dedup2(regs.n, regs.a, opt->mask_level, opt->w); + else regs.n = mem_sort_and_dedup(regs.n, regs.a, opt->mask_level_redun); if (opt->flag & MEM_F_SELF_OVLP) regs.n = mem_test_and_remove_exact(opt, regs.n, regs.a, l_seq); if (bwa_verbose >= 4) { diff --git a/bwamem.h b/bwamem.h index 53472fe..488d1cf 100644 --- a/bwamem.h +++ b/bwamem.h @@ -18,6 +18,7 @@ typedef struct __smem_i smem_i; #define MEM_F_NO_RESCUE 0x20 #define MEM_F_SELF_OVLP 0x40 #define MEM_F_ALN_REG 0x80 +#define MEM_F_MERGE_REG 0x100 typedef struct { int a, b; // match score and mismatch penalty diff --git a/fastmap.c b/fastmap.c index 81c0f20..b898adb 100644 --- a/fastmap.c +++ b/fastmap.c @@ -53,7 +53,7 @@ int main_mem(int argc, char *argv[]) opt = mem_opt_init(); memset(&opt0, 0, sizeof(mem_opt_t)); - while ((c = getopt(argc, argv, "epaFMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:")) >= 0) { + while ((c = getopt(argc, argv, "epaFMCSPHVk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1; else if (c == 'x') mode = optarg; else if (c == 'w') opt->w = atoi(optarg), opt0.w = 1; @@ -69,6 +69,7 @@ int main_mem(int argc, char *argv[]) else if (c == 'S') opt->flag |= MEM_F_NO_RESCUE; else if (c == 'e') opt->flag |= MEM_F_SELF_OVLP; else if (c == 'F') opt->flag |= MEM_F_ALN_REG; + else if (c == 'V') opt->flag |= MEM_F_MERGE_REG; else if (c == 'c') opt->max_occ = atoi(optarg), opt0.max_occ = 1; else if (c == 'd') opt->zdrop = atoi(optarg), opt0.zdrop = 1; else if (c == 'v') bwa_verbose = atoi(optarg); @@ -147,7 +148,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n", opt->pen_unpaired); fprintf(stderr, " -x STR read type. Setting -x changes multiple parameters unless overriden [null]\n"); fprintf(stderr, " pacbio: -k17 -W40 -w200 -c1000 -r10 -A2 -B5 -O2 -E1 -L0\n"); - fprintf(stderr, " pbread: -k13 -W40 -w200 -c1000 -r10 -A2 -B5 -O2 -E1 -N25 -FeaD.001\n"); + fprintf(stderr, " pbread: -k13 -W40 -w200 -c1000 -r10 -A2 -B5 -O2 -E1 -N25 -FVeaD.001\n"); fprintf(stderr, "\nInput/output options:\n\n"); fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n"); fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n"); @@ -182,11 +183,12 @@ int main_mem(int argc, char *argv[]) if (opt0.split_factor == 0.) opt->split_factor = 10.; if (!opt0.min_chain_weight) opt->min_chain_weight = 40; if (strcmp(mode, "pbread1") == 0 || strcmp(mode, "pbread") == 0) { - opt->flag |= MEM_F_ALL | MEM_F_SELF_OVLP | MEM_F_ALN_REG; + opt->flag |= MEM_F_ALL | MEM_F_SELF_OVLP | MEM_F_ALN_REG | MEM_F_MERGE_REG; if (!opt0.min_seed_len) opt->min_seed_len = 13; if (!opt0.max_chain_extend) opt->max_chain_extend = 25; if (opt0.drop_ratio == 0.) opt->drop_ratio = .001; } else { + opt->flag |= MEM_F_MERGE_REG; if (!opt0.min_seed_len) opt->min_seed_len = 17; if (!opt0.pen_clip5) opt->pen_clip5 = 0; if (!opt0.pen_clip3) opt->pen_clip3 = 0; diff --git a/main.c b/main.c index e2f9266..38cfa1a 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.8-r716-dirty" +#define PACKAGE_VERSION "0.7.8-r718-dirty" #endif int bwa_fa2pac(int argc, char *argv[]);