r723: merge adjacent hits
This commit is contained in:
parent
48847af2fc
commit
b93fca2b2e
5
bwa.c
5
bwa.c
|
|
@ -95,7 +95,8 @@ uint32_t *bwa_gen_cigar2(const int8_t mat[25], int o_del, int e_del, int o_ins,
|
|||
kstring_t str;
|
||||
const char *int2base;
|
||||
|
||||
*n_cigar = 0; *NM = -1;
|
||||
if (n_cigar) *n_cigar = 0;
|
||||
if (NM) *NM = -1;
|
||||
if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand
|
||||
rseq = bns_get_seq(l_pac, pac, rb, re, &rlen);
|
||||
if (re - rb != rlen) goto ret_gen_cigar; // possible if out of range
|
||||
|
|
@ -131,7 +132,7 @@ uint32_t *bwa_gen_cigar2(const int8_t mat[25], int o_del, int e_del, int o_ins,
|
|||
}
|
||||
*score = ksw_global2(l_query, query, rlen, rseq, 5, mat, o_del, e_del, o_ins, e_ins, w, n_cigar, &cigar);
|
||||
}
|
||||
{// compute NM and MD
|
||||
if (NM && n_cigar) {// compute NM and MD
|
||||
int k, x, y, u, n_mm = 0, n_gap = 0;
|
||||
str.l = str.m = *n_cigar * 4; str.s = (char*)cigar; // append MD to CIGAR
|
||||
int2base = rb < l_pac? "ACGTN" : "TGCAN";
|
||||
|
|
|
|||
78
bwamem.c
78
bwamem.c
|
|
@ -211,11 +211,10 @@ void mem_print_chain(const bntseq_t *bns, mem_chain_v *chn)
|
|||
err_printf("* Found CHAIN(%d): n=%d; weight=%d", i, p->n, mem_chain_weight(p));
|
||||
for (j = 0; j < p->n; ++j) {
|
||||
bwtint_t pos;
|
||||
int is_rev, ref_id;
|
||||
int is_rev;
|
||||
pos = bns_depos(bns, p->seeds[j].rbeg, &is_rev);
|
||||
if (is_rev) pos -= p->seeds[j].len - 1;
|
||||
bns_cnt_ambi(bns, pos, p->seeds[j].len, &ref_id);
|
||||
err_printf("\t%d;%d,%ld(%s:%c%ld)", p->seeds[j].len, p->seeds[j].qbeg, (long)p->seeds[j].rbeg, bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1);
|
||||
err_printf("\t%d;%d,%ld(%s:%c%ld)", p->seeds[j].len, p->seeds[j].qbeg, (long)p->seeds[j].rbeg, bns->anns[p->rid].name, "+-"[is_rev], (long)(pos - bns->anns[p->rid].offset) + 1);
|
||||
}
|
||||
err_putchar('\n');
|
||||
}
|
||||
|
|
@ -349,7 +348,7 @@ int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *a)
|
|||
* De-overlap single-end hits *
|
||||
******************************/
|
||||
|
||||
#define alnreg_slt2(a, b) ((a).re < (b).re)
|
||||
#define alnreg_slt2(a, b) ((a).rb < (b).rb)
|
||||
KSORT_INIT(mem_ars2, mem_alnreg_t, alnreg_slt2)
|
||||
|
||||
#define alnreg_slt(a, b) ((a).score > (b).score || ((a).score == (b).score && ((a).rb < (b).rb || ((a).rb == (b).rb && (a).qb < (b).qb))))
|
||||
|
|
@ -358,52 +357,69 @@ KSORT_INIT(mem_ars, mem_alnreg_t, alnreg_slt)
|
|||
#define alnreg_hlt(a, b) ((a).score > (b).score || ((a).score == (b).score && (a).hash < (b).hash))
|
||||
KSORT_INIT(mem_ars_hash, mem_alnreg_t, alnreg_hlt)
|
||||
|
||||
int mem_chain_reg(const bntseq_t *bns, const uint8_t *pac, mem_alnreg_t *a, mem_alnreg_t *b, int merge_bw, int max_gap)
|
||||
#define PATCH_MAX_R_BW 0.05f
|
||||
#define PATCH_MIN_SC_RATIO 0.90f
|
||||
|
||||
int mem_patch_reg(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, const mem_alnreg_t *a, const mem_alnreg_t *b, int *_w)
|
||||
{
|
||||
return 0;
|
||||
int w, score, q_s, r_s;
|
||||
double r;
|
||||
if (bns == 0 || pac == 0 || query == 0) return 0;
|
||||
assert(a->rid == b->rid && a->rb <= b->rb);
|
||||
if (a->qb >= b->qb || a->qe >= b->qe || a->re >= b->re) return 0; // not colinear
|
||||
w = (a->re - b->rb) - (a->qe - b->qb); // required bandwidth
|
||||
w = w > 0? w : -w; // l = abs(l)
|
||||
r = (double)(a->re - b->rb) / (b->re - a->rb) - (double)(a->qe - b->qb) / (b->qe - a->qb); // relative bandwidth
|
||||
r = r > 0.? r : -r; // r = fabs(r)
|
||||
if (bwa_verbose >= 4)
|
||||
printf("* potential hit merge between [%d,%d)<=>[%ld,%ld) and [%d,%d)<=>[%ld,%ld), @ %s; w=%d, r=%.4g\n",
|
||||
a->qb, a->qe, (long)a->rb, (long)a->re, b->qb, b->qe, (long)b->rb, (long)b->re, bns->anns[a->rid].name, w, r);
|
||||
if (w > opt->w) return 0; // the bandwidth is too large
|
||||
if (r >= PATCH_MAX_R_BW) return 0; // relative bandwidth is too large
|
||||
// global alignment
|
||||
w += a->w + b->w;
|
||||
if (bwa_verbose >= 4) printf("* test potential hit merge with global alignment; w=%d\n", w);
|
||||
bwa_gen_cigar2(opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, w, bns->l_pac, pac, b->qe - a->qb, query + a->qb, a->rb, b->re, &score, 0, 0);
|
||||
q_s = (int)((double)(b->qe - a->qb) / ((b->qe - b->qb) + (a->qe - a->qb)) * (b->score + a->score) + .499); // predicted score from query
|
||||
r_s = (int)((double)(b->re - a->rb) / ((b->re - b->rb) + (a->re - a->rb)) * (b->score + a->score) + .499); // predicted score from ref
|
||||
if (2. * score / (q_s + r_s) < PATCH_MIN_SC_RATIO) return 0;
|
||||
if (bwa_verbose >= 4) printf("* score=%d;(%d,%d)\n", score, q_s, r_s);
|
||||
*_w = w;
|
||||
return score;
|
||||
}
|
||||
|
||||
int mem_sort_and_dedup(int n, mem_alnreg_t *a, float mask_level_redun, int merge_bw)
|
||||
int mem_sort_dedup_patch(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int n, mem_alnreg_t *a)
|
||||
{
|
||||
int m, i, j;
|
||||
if (n <= 1) return n;
|
||||
ks_introsort(mem_ars2, n, a);
|
||||
for (i = 0; i < n; ++i) a[i].n_comp = 1;
|
||||
for (i = 1; i < n; ++i) {
|
||||
mem_alnreg_t *p = &a[i];
|
||||
if (p->rb >= a[i-1].re) continue;
|
||||
for (j = i - 1; j >= 0 && p->rid == a[j].rid && p->rb < a[j].re; --j) {
|
||||
mem_alnreg_t *q = &a[j];
|
||||
int64_t or, oq, mr, mq;
|
||||
int score, w;
|
||||
if (q->qe == q->qb) continue; // a[j] has been excluded
|
||||
or = q->re - p->rb; // overlap length on the reference
|
||||
oq = q->qb < p->qb? q->qe - p->qb : p->qe - q->qb; // overlap length on the query
|
||||
mr = q->re - q->rb < p->re - p->rb? q->re - q->rb : p->re - p->rb; // min ref len in alignment
|
||||
mq = q->qe - q->qb < p->qe - p->qb? q->qe - q->qb : p->qe - p->qb; // min qry len in alignment
|
||||
if (or > mask_level_redun * mr && oq > mask_level_redun * mq) { // one of the hits is redundant
|
||||
if (or > opt->mask_level_redun * mr && oq > opt->mask_level_redun * mq) { // one of the hits is redundant
|
||||
if (p->score < q->score) {
|
||||
p->qe = p->qb;
|
||||
break;
|
||||
} else q->qe = q->qb;
|
||||
} else if (merge_bw > 0 && q->qb < p->qb && q->qe < p->qe && q->re < p->re && fabs((float)oq / (p->qe - q->qb) - (float)or / (p->re - q->rb)) < .05) {
|
||||
int flag = 0, q_s, r_s;
|
||||
int64_t l1, l2;
|
||||
l1 = p->qb - q->qb, l2 = p->rb - q->rb;
|
||||
if (l1 - l2 < merge_bw && l2 - l1 < merge_bw) ++flag;
|
||||
l1 = p->qe - q->qe, l2 = p->re - q->re;
|
||||
if (l1 - l2 < merge_bw && l2 - l1 < merge_bw) ++flag;
|
||||
l1 = p->qe - q->qb, l2 = p->re - q->rb;
|
||||
q_s = (int)((double)(p->qe - q->qb) / ((p->qe - p->qb) + (q->qe - q->qb)) * (p->score + q->score) + .499);
|
||||
r_s = (int)((double)(p->re - q->rb) / ((p->re - p->rb) + (q->re - q->rb)) * (p->score + q->score) + .499);
|
||||
if (flag == 2) { // merge p into q; don't merge q into p as this breaks sorting
|
||||
mem_alnreg_t t = *q;
|
||||
*q = *p;
|
||||
q->qe = t.qe, q->re = t.re;
|
||||
q->truesc = q->score = (q_s + r_s) >> 1;
|
||||
q->w = q->w > t.w? q->w : t.w;
|
||||
q->w = q->w > abs(l1 - l2) + 50? q->w : abs(l1 - l2) + 50;
|
||||
p->qb = p->qe;
|
||||
break;
|
||||
}
|
||||
} else if ((score = mem_patch_reg(opt, bns, pac, query, q, p, &w)) > 0) {
|
||||
mem_alnreg_t t = *q;
|
||||
*q = *p;
|
||||
q->n_comp = t.n_comp + 1;
|
||||
q->qb = t.qb, q->rb = t.rb;
|
||||
q->truesc = q->score = score;
|
||||
q->w = w;
|
||||
p->qb = p->qe;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -433,7 +449,7 @@ int mem_test_and_remove_exact(const mem_opt_t *opt, int n, mem_alnreg_t *a, int
|
|||
return n - 1;
|
||||
}
|
||||
|
||||
void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id) // IMPORTANT: must run mem_sort_and_dedup() before calling this function
|
||||
void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id)
|
||||
{ // similar to the loop in mem_chain_flt()
|
||||
int i, k, tmp;
|
||||
kvec_t(int) z;
|
||||
|
|
@ -681,7 +697,7 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac
|
|||
a->score = a->truesc = -1;
|
||||
a->rid = c->rid;
|
||||
|
||||
if (bwa_verbose >= 4) err_printf("** ---> Extending from seed(%d) [%ld;%ld,%ld] <---\n", k, (long)s->len, (long)s->qbeg, (long)s->rbeg);
|
||||
if (bwa_verbose >= 4) err_printf("** ---> Extending from seed(%d) [%ld;%ld,%ld] @ %s <---\n", k, (long)s->len, (long)s->qbeg, (long)s->rbeg, bns->anns[c->rid].name);
|
||||
if (s->qbeg) { // left extension
|
||||
uint8_t *rs, *qs;
|
||||
int qle, tle, gtle, gscore;
|
||||
|
|
@ -985,7 +1001,7 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse
|
|||
free(chn.a[i].seeds);
|
||||
}
|
||||
free(chn.a);
|
||||
regs.n = mem_sort_and_dedup(regs.n, regs.a, opt->mask_level_redun, opt->w);
|
||||
regs.n = mem_sort_dedup_patch(opt, bns, pac, (uint8_t*)seq, regs.n, regs.a);
|
||||
if (opt->flag & MEM_F_SELF_OVLP)
|
||||
regs.n = mem_test_and_remove_exact(opt, regs.n, regs.a, l_seq);
|
||||
if (bwa_verbose >= 4) {
|
||||
|
|
|
|||
1
bwamem.h
1
bwamem.h
|
|
@ -61,6 +61,7 @@ typedef struct {
|
|||
int w; // actual band width used in extension
|
||||
int seedcov; // length of regions coverged by seeds
|
||||
int secondary; // index of the parent hit shadowing the current hit; <0 if primary
|
||||
int n_comp; // number of sub-alignments chained together
|
||||
uint64_t hash;
|
||||
} mem_alnreg_t;
|
||||
|
||||
|
|
|
|||
|
|
@ -100,6 +100,7 @@ void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac,
|
|||
kputw(bns->anns[rid].len, &str); kputc('\t', &str);
|
||||
kputw(pos, &str); kputc('\t', &str); kputw(pos + (re - rb), &str); kputc('\t', &str);
|
||||
ksprintf(&str, "%.3f", (double)p->truesc / opt->a / (qe - qb > re - rb? qe - qb : re - rb));
|
||||
kputc('\t', &str); kputw(p->n_comp, &str);
|
||||
kputc('\n', &str);
|
||||
}
|
||||
s->sam = str.s;
|
||||
|
|
|
|||
|
|
@ -109,7 +109,7 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *
|
|||
|
||||
int mem_matesw(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], const mem_alnreg_t *a, int l_ms, const uint8_t *ms, mem_alnreg_v *ma)
|
||||
{
|
||||
extern int mem_sort_and_dedup(int n, mem_alnreg_t *a, float mask_level_redun, int merge_bw);
|
||||
extern int mem_sort_dedup_patch(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int n, mem_alnreg_t *a);
|
||||
int64_t l_pac = bns->l_pac;
|
||||
int i, r, skip[4], n = 0, rid;
|
||||
for (r = 0; r < 4; ++r)
|
||||
|
|
@ -170,7 +170,7 @@ int mem_matesw(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
|
|||
}
|
||||
++n;
|
||||
}
|
||||
if (n) ma->n = mem_sort_and_dedup(ma->n, ma->a, opt->mask_level_redun, -1);
|
||||
if (n) ma->n = mem_sort_dedup_patch(opt, 0, 0, 0, ma->n, ma->a);
|
||||
if (rev) free(rev);
|
||||
free(ref);
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue