From 9a6abe51b67e7bdb9eaf429d5a4ed2d74ad45099 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 22 May 2013 18:57:51 -0400 Subject: [PATCH] r391: better method to resolve xref alignment The old method does not work when the alignment bridges three chr. This may actually happen often. The new method does not work all the time, either, but should be better than the old one. It is also simpler, arguably. --- Makefile | 6 +++--- bwa.c | 56 +++++++++++++++++++++++++------------------------------- bwamem.c | 4 ---- main.c | 2 +- 4 files changed, 29 insertions(+), 39 deletions(-) diff --git a/Makefile b/Makefile index d39a787..85bb185 100644 --- a/Makefile +++ b/Makefile @@ -1,13 +1,13 @@ CC= gcc CFLAGS= -g -Wall -O2 -WRAP_MALLOC= -DUSE_MALLOC_WRAPPERS +WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS AR= ar DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) -LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o +LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o malloc_wrap.o AOBJS= QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ is.o bwtindex.o bwape.o kopen.o pemerge.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ - bwtsw2_chain.o fastmap.o bwtsw2_pair.o malloc_wrap.o + bwtsw2_chain.o fastmap.o bwtsw2_pair.o PROG= bwa INCLUDES= LIBS= -lm -lz -lpthread diff --git a/bwa.c b/bwa.c index a20c027..35d3e95 100644 --- a/bwa.c +++ b/bwa.c @@ -146,57 +146,51 @@ ret_gen_cigar: int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re) { - int ib, ie, is_rev; - int64_t fb, fe, mid = -1; + int is_rev; + int64_t cb, ce, fm; + bntann1_t *ra; if (*rb < bns->l_pac && *re > bns->l_pac) { // cross the for-rev boundary; actually with BWA-MEM, we should never come to here *qb = *qe = *rb = *re = -1; return -1; // unable to fix - } else { - fb = bns_depos(bns, *rb < bns->l_pac? *rb : *re - 1, &is_rev); - ib = bns_pos2rid(bns, fb); - if (fb - bns->anns[ib].offset + (*re - *rb) <= bns->anns[ib].len) return 0; // no need to fix - fe = bns_depos(bns, *re - 1 < bns->l_pac? *re - 1 : *rb, &is_rev); - ie = bns_pos2rid(bns, fe); - if (ie - ib > 1) { // bridge three or more references - *qb = *qe = *rb = *re = -1; - return -2; // unable to fix - } else { - int l = bns->anns[ib].offset + bns->anns[ib].len - fb; - mid = is_rev? *re - l : *rb + l; - } } - if (mid >= 0) { + fm = bns_depos(bns, (*rb + *re) >> 1, &is_rev); // coordinate of the middle point on the forward strand + ra = &bns->anns[bns_pos2rid(bns, fm)]; // annotation of chr corresponding to the middle point + cb = is_rev? (bns->l_pac<<1) - (ra->offset + ra->len) : ra->offset; // chr start on the mapping strand + ce = cb + ra->len; // chr end + if (cb > *rb || ce < *re) { // fix is needed int i, score, n_cigar, y, NM; uint32_t *cigar; int64_t x; + cb = cb > *rb? cb : *rb; + ce = ce < *re? ce : *re; cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, *qe - *qb, query + *qb, *rb, *re, &score, &n_cigar, &NM); for (i = 0, x = *rb, y = *qb; i < n_cigar; ++i) { int op = cigar[i]&0xf, len = cigar[i]>>4; if (op == 0) { - if (x <= mid && mid < x + len) { - if (mid - *rb > *re - mid) { // the first part is longer - if (x == mid) { // need to check the previous operation - assert(i); // mid != *rb should always stand - if ((cigar[i-1]&0xf) == 1) *qe = y - (cigar[i-1]>>4), *re = x; - else if ((cigar[i-1]&0xf) == 2) *qe = y, *re = x - (cigar[i-1]>>4); - else abort(); // should not be here - } else *qe = y + (mid - x), *re = mid; - } else *qb = y + (mid - x), *rb = mid; + if (x <= cb && cb < x + len) + *qb = y + (cb - x), *rb = cb; + if (x < ce && ce <= x + len) { + *qe = y + (ce - x), *re = ce; break; } else x += len, y += len; - } else if (op == 1) { // insertion + } else if (op == 1) { y += len; - } else if (op == 2) { // deletion - if (x <= mid && mid < x + len) { - if (mid - *rb > *re - mid) *qe = y, *re = x; - else *qb = y, *rb = x + len; + } else if (op == 2) { + if (x <= cb && cb < x + len) + *qb = y, *rb = x + len; + if (x < ce && ce <= x + len) { + *qe = y, *re = x; break; } else x += len; } else abort(); // should not be here } free(cigar); + if (*qb == *qe || *rb == *re) { // TODO: this may happen in theory, but should be very very rare... + fprintf(stderr, "[E::%s] If you see this message, please let the developer know. Sorry.\n", __func__); + exit(1); + } } - return 1; + return 0; } /********************* diff --git a/bwamem.c b/bwamem.c index 779a221..5ee833b 100644 --- a/bwamem.c +++ b/bwamem.c @@ -789,10 +789,6 @@ void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pa if (p->secondary >= 0 && p->score < a->a[p->secondary].score * .5) continue; q = kv_pushp(mem_aln_t, aa); *q = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, p); - if (q->rid < 0) { // unfixable cross-reference alignment - --aa.n; - continue; - } q->flag |= extra_flag | (p->secondary >= 0? 0x100 : 0); // flag secondary if (p->secondary >= 0) q->sub = -1; // don't output sub-optimal score if ((opt->flag&MEM_F_NO_MULTI) && k && p->secondary < 0) q->flag |= 0x10000; diff --git a/main.c b/main.c index cc1a4b4..96d8064 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.4-r389-beta" +#define PACKAGE_VERSION "0.7.4-r391-beta" #endif int bwa_fa2pac(int argc, char *argv[]);