From 88f89be60e8f005553869404ad6651561ad2831b Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 30 Apr 2014 14:14:20 -0400 Subject: [PATCH] r736: improved in low-complexity regions Example: GGAGGGGAAGGGTGGGCTGGAGGGGACGGGTGGGCTGGAGGGGAAGGGTGTGCTGGAGGGAAAAGGTGGACTGGAGGGGAAGGGTGGGCTGGAGGGGAAGG This read has 5 chains, two of which are: weight=80 26;26;0,4591439948(10:-3095894) 23;23;27,4591439957(10:-3095888) 31;31;70,4591439964(10:-3095873) weight=50 45;45;51,4591440017(10:-3095806) 50;50;51,4591440017(10:-3095801) 31;31;70,4591440090(10:-3095747) Extension from the 26bp seed in the 1st chain gives an alignment [0,101) <=> [4591439948,4591440067), which contains the 50bp seed in the second chain. However, if we extend the 50bp seed, it yields a better alignment [0,101) <=> [4591439966,4591440067) with a different starting position. The 26bp seed is wrong. This commit adds a heuristic to fix this issue. --- bwamem.c | 7 ++++++- bwamem.h | 1 + main.c | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/bwamem.c b/bwamem.c index 1b94fb9..660c51b 100644 --- a/bwamem.c +++ b/bwamem.c @@ -417,6 +417,8 @@ int mem_sort_dedup_patch(const mem_opt_t *opt, const bntseq_t *bns, const uint8_ } else if (q->rb < p->rb && (score = mem_patch_reg(opt, bns, pac, query, q, p, &w)) > 0) { // then merge q into p p->n_comp += q->n_comp + 1; p->seedcov = p->seedcov > q->seedcov? p->seedcov : q->seedcov; + p->sub = p->sub > q->sub? p->sub : q->sub; + p->csub = p->csub > q->csub? p->csub : q->csub; p->qb = q->qb, p->rb = q->rb; p->truesc = p->score = score; p->w = w; @@ -650,6 +652,7 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac int64_t rd; int qd, w, max_gap; if (s->rbeg < p->rb || s->rbeg + s->len > p->re || s->qbeg < p->qb || s->qbeg + s->len > p->qe) continue; // not fully contained + if (s->len - p->seedlen0 > .1 * l_query) continue; // this seed may give a better alignment // qd: distance ahead of the seed on query; rd: on reference qd = s->qbeg - p->qb; rd = s->rbeg - p->rb; max_gap = cal_max_gap(opt, qd < rd? qd : rd); // the maximal gap allowed in regions ahead of the seed @@ -663,7 +666,8 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac } if (i < av->n) { // the seed is (almost) contained in an existing alignment; further testing is needed to confirm it is not leading to a different aln if (bwa_verbose >= 4) - printf("** Seed(%d) [%ld;%ld,%ld] is almost contained in an existing alignment. Confirming whether extension is needed...\n", k, (long)s->len, (long)s->qbeg, (long)s->rbeg); + printf("** Seed(%d) [%ld;%ld,%ld] is almost contained in an existing alignment [%d,%d) <=> [%ld,%ld)\n", + k, (long)s->len, (long)s->qbeg, (long)s->rbeg, av->a[i].qb, av->a[i].qe, (long)av->a[i].rb, (long)av->a[i].re); for (i = k + 1; i < c->n; ++i) { // check overlapping seeds in the same chain const mem_seed_t *t; if (srt[i] == 0) continue; @@ -753,6 +757,7 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac a->seedcov += t->len; // this is not very accurate, but for approx. mapQ, this is good enough } a->w = aw[0] > aw[1]? aw[0] : aw[1]; + a->seedlen0 = s->len; } free(srt); free(rseq); } diff --git a/bwamem.h b/bwamem.h index fa071d8..ba630aa 100644 --- a/bwamem.h +++ b/bwamem.h @@ -62,6 +62,7 @@ typedef struct { int w; // actual band width used in extension int seedcov; // length of regions coverged by seeds int secondary; // index of the parent hit shadowing the current hit; <0 if primary + int seedlen0; // length of the starting seed int n_comp; // number of sub-alignments chained together uint64_t hash; } mem_alnreg_t; diff --git a/main.c b/main.c index 48fcc36..7394f5f 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.8-r735-dirty" +#define PACKAGE_VERSION "0.7.8-r736-dirty" #endif int bwa_fa2pac(int argc, char *argv[]);