From e70c7c2a71744f5a316c84dad8016d054020d425 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 26 Feb 2013 00:03:49 -0500 Subject: [PATCH] r284: amend cross-reference hit I really hate this: complex and twisted logic for a nasty scenario that almost never happens to short reads - but it may become serious when the reference genome consists of many contigs. On toy examples, the code seems to work. Don't know if it really works... --- bwa.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++ bwa.h | 1 + bwamem.c | 1 + bwamem_pair.c | 8 ++++++-- main.c | 2 +- 5 files changed, 65 insertions(+), 3 deletions(-) diff --git a/bwa.c b/bwa.c index f34bd12..e8221ca 100644 --- a/bwa.c +++ b/bwa.c @@ -1,6 +1,7 @@ #include #include #include +#include #include "bntseq.h" #include "bwa.h" #include "ksw.h" @@ -103,6 +104,61 @@ ret_gen_cigar: return cigar; } +int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re) +{ + int ib, ie, is_rev; + int64_t fb, fe, mid = -1; + if (*rb < bns->l_pac && *re > bns->l_pac) { // cross the for-rev boundary + *qb = *qe = *rb = *re = -1; + return -1; // unable to fix + } else { + fb = bns_depos(bns, *rb < bns->l_pac? *rb : *re - 1, &is_rev); + ib = bns_pos2rid(bns, fb); + if (fb - bns->anns[ib].offset + (*re - *rb) <= bns->anns[ib].len) return 0; // no need to fix + fe = bns_depos(bns, *re - 1 < bns->l_pac? *re - 1 : *rb, &is_rev); + ie = bns_pos2rid(bns, fe); + if (ie - ib > 1) { // bridge three or more references + *qb = *qe = *rb = *re = -1; + return -2; // unable to fix + } else { + int l = bns->anns[ib].offset + bns->anns[ib].len - fb; + mid = is_rev? *re - l : *rb + l; + } + } + if (mid >= 0) { + int i, score, n_cigar, y; + uint32_t *cigar; + int64_t x; + cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, *qe - *qb, query + *qb, *rb, *re, &score, &n_cigar); + for (i = 0, x = *rb, y = *qb; i < n_cigar; ++i) { + int op = cigar[i]&0xf, len = cigar[i]>>4; + if (op == 0) { + if (x <= mid && mid < x + len) { + if (mid - *rb > *re - mid) { // the first part is longer + if (x == mid) { // need to check the previous operation + assert(i); // mid != *rb should always stand + if ((cigar[i-1]&0xf) == 1) *qe = y - (cigar[i-1]>>4), *re = x; + else if ((cigar[i-1]&0xf) == 2) *qe = y, *re = x - (cigar[i-1]>>4); + else abort(); // should not be here + } else *qe = y + (mid - x), *re = mid; + } else *qb = y + (mid - x), *rb = mid; + break; + } else x += len, y += len; + } else if (op == 1) { // insertion + y += len; + } else if (op == 2) { // deletion + if (x <= mid && mid < x + len) { + if (mid - *rb > *re - mid) *qe = y, *re = x; + else *qb = y, *rb = x + len; + break; + } else x += len; + } else abort(); // should not be here + } + free(cigar); + } + return 1; +} + /********************* * Full index reader * *********************/ diff --git a/bwa.h b/bwa.h index d4ca807..2d6c7bf 100644 --- a/bwa.h +++ b/bwa.h @@ -31,6 +31,7 @@ extern "C" { bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_); uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar); + int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re); char *bwa_idx_infer_prefix(const char *hint); bwt_t *bwa_idx_load_bwt(const char *hint); diff --git a/bwamem.c b/bwamem.c index 2c24ba5..7c837bf 100644 --- a/bwamem.c +++ b/bwamem.c @@ -633,6 +633,7 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b if (p->secondary >= 0 && !(opt->flag&MEM_F_ALL)) continue; if (p->secondary >= 0 && p->score < a->a[p->secondary].score * .5) continue; mem_alnreg2hit(p, &h); + bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s->seq, &h.qb, &h.qe, &h.rb, &h.re); h.flag |= extra_flag; if ((opt->flag&MEM_F_NO_MULTI) && k && p->secondary < 0) h.flag |= 0x10000; // print the sequence, but flag as secondary (for Picard) h.qual = p->secondary >= 0? 0 : mem_approx_mapq_se(opt, p); diff --git a/bwamem_pair.c b/bwamem_pair.c index 3fbdec7..9ff12b3 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -293,7 +293,9 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co q_se[1] = mem_approx_mapq_se(opt, &a[1].a[0]); } mem_alnreg2hit(&a[0].a[z[0]], &h[0]); h[0].qual = q_se[0]; h[0].flag |= 0x40 | extra_flag; + bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[0].seq, &h[0].qb, &h[0].qe, &h[0].rb, &h[0].re); mem_alnreg2hit(&a[1].a[z[1]], &h[1]); h[1].qual = q_se[1]; h[1].flag |= 0x80 | extra_flag; + bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[1].seq, &h[1].qb, &h[1].qe, &h[1].rb, &h[1].re); bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[0], &h[0], opt->flag&MEM_F_HARDCLIP, &h[1]); s[0].sam = strdup(str.s); str.l = 0; bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[1], &h[1], opt->flag&MEM_F_HARDCLIP, &h[0]); s[1].sam = str.s; } else goto no_pairing; @@ -301,8 +303,10 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co no_pairing: for (i = 0; i < 2; ++i) { - if (a[i].n) mem_alnreg2hit(&a[i].a[0], &h[i]); - else h[i].rb = h[i].re = -1; + if (a[i].n) { + mem_alnreg2hit(&a[i].a[0], &h[i]); + bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[i].seq, &h[i].qb, &h[i].qe, &h[i].rb, &h[i].re); + } else h[i].rb = h[i].re = -1; } mem_sam_se(opt, bns, pac, &s[0], &a[0], 0x41, &h[1]); mem_sam_se(opt, bns, pac, &s[1], &a[1], 0x81, &h[0]); diff --git a/main.c b/main.c index 4bad9ee..c1c232a 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r283-beta" +#define PACKAGE_VERSION "0.6.2-r284-beta" #endif static int usage()