r391: better method to resolve xref alignment

The old method does not work when the alignment bridges three chr. This may actually happen often. The new method does not work all the time, either, but should be better than the old one. It is also simpler, arguably.
2013-05-22 18:57:51 -04:00 · 2013-05-22 18:57:51 -04:00 · 9a6abe51b6
parent 338ebfaae0
commit 9a6abe51b6
4 changed files with 29 additions and 39 deletions
--- a/6
+++ b/6
@ -1,13 +1,13 @@
 CC=			gcc
 CFLAGS=		-g -Wall -O2
-WRAP_MALLOC= -DUSE_MALLOC_WRAPPERS
+WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
 AR=			ar
 DFLAGS=		-DHAVE_PTHREAD $(WRAP_MALLOC)
-LOBJS=		utils.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o
+LOBJS=		utils.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o malloc_wrap.o
 AOBJS=		QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
 			is.o bwtindex.o bwape.o kopen.o pemerge.o \
 			bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
-			bwtsw2_chain.o fastmap.o bwtsw2_pair.o malloc_wrap.o
+			bwtsw2_chain.o fastmap.o bwtsw2_pair.o
 PROG=		bwa
 INCLUDES=	
 LIBS=		-lm -lz -lpthread
--- a/bwa.c
+++ b/bwa.c
@ -146,57 +146,51 @@ ret_gen_cigar:

 int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re)
 {
-	int ib, ie, is_rev;
-	int64_t fb, fe, mid = -1;
+	int is_rev;
+	int64_t cb, ce, fm;
+	bntann1_t *ra;
 	if (*rb < bns->l_pac && *re > bns->l_pac) { // cross the for-rev boundary; actually with BWA-MEM, we should never come to here
 		*qb = *qe = *rb = *re = -1;
 		return -1; // unable to fix
-	} else {
-		fb = bns_depos(bns, *rb < bns->l_pac? *rb : *re - 1, &is_rev);
-		ib = bns_pos2rid(bns, fb);
-		if (fb - bns->anns[ib].offset + (*re - *rb) <= bns->anns[ib].len) return 0; // no need to fix
-		fe = bns_depos(bns, *re - 1 < bns->l_pac? *re - 1 : *rb, &is_rev);
-		ie = bns_pos2rid(bns, fe);
-		if (ie - ib > 1) { // bridge three or more references
-			*qb = *qe = *rb = *re = -1;
-			return -2; // unable to fix
-		} else {
-			int l = bns->anns[ib].offset + bns->anns[ib].len - fb;
-			mid = is_rev? *re - l : *rb + l;
-		}
 	}
-	if (mid >= 0) {
+	fm = bns_depos(bns, (*rb + *re) >> 1, &is_rev); // coordinate of the middle point on the forward strand
+	ra = &bns->anns[bns_pos2rid(bns, fm)]; // annotation of chr corresponding to the middle point
+	cb = is_rev? (bns->l_pac<<1) - (ra->offset + ra->len) : ra->offset; // chr start on the mapping strand
+	ce = cb + ra->len; // chr end
+	if (cb > *rb || ce < *re) { // fix is needed
 		int i, score, n_cigar, y, NM;
 		uint32_t *cigar;
 		int64_t x;
+		cb = cb > *rb? cb : *rb;
+		ce = ce < *re? ce : *re;
 		cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, *qe - *qb, query + *qb, *rb, *re, &score, &n_cigar, &NM);
 		for (i = 0, x = *rb, y = *qb; i < n_cigar; ++i) {
 			int op = cigar[i]&0xf, len = cigar[i]>>4;
 			if (op == 0) {
-				if (x <= mid && mid < x + len) {
-					if (mid - *rb > *re - mid) { // the first part is longer
-						if (x == mid) { // need to check the previous operation
-							assert(i); // mid != *rb should always stand
-							if ((cigar[i-1]&0xf) == 1) *qe = y - (cigar[i-1]>>4), *re = x;
-							else if ((cigar[i-1]&0xf) == 2) *qe = y, *re = x - (cigar[i-1]>>4);
-							else abort(); // should not be here
-						} else *qe = y + (mid - x), *re = mid;
-					} else *qb = y + (mid - x), *rb = mid;
+				if (x <= cb && cb < x + len)
+					*qb = y + (cb - x), *rb = cb;
+				if (x < ce && ce <= x + len) {
+					*qe = y + (ce - x), *re = ce;
 					break;
 				} else x += len, y += len;
-			} else if (op == 1) { // insertion
+			} else if (op == 1) {
 				y += len;
-			} else if (op == 2) { // deletion
-				if (x <= mid && mid < x + len) {
-					if (mid - *rb > *re - mid) *qe = y, *re = x;
-					else *qb = y, *rb = x + len;
+			} else if (op == 2) {
+				if (x <= cb && cb < x + len)
+					*qb = y, *rb = x + len;
+				if (x < ce && ce <= x + len) {
+					*qe = y, *re = x;
 					break;
 				} else x += len;
 			} else abort(); // should not be here
 		}
 		free(cigar);
+		if (*qb == *qe || *rb == *re) { // TODO: this may happen in theory, but should be very very rare...
+			fprintf(stderr, "[E::%s] If you see this message, please let the developer know. Sorry.\n", __func__);
+			exit(1);
+		}
 	}
-	return 1;
+	return 0;
 }

 /*********************
--- a/bwamem.c
+++ b/bwamem.c
@ -789,10 +789,6 @@ void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pa
 		if (p->secondary >= 0 && p->score < a->a[p->secondary].score * .5) continue;
 		q = kv_pushp(mem_aln_t, aa);
 		*q = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, p);
-		if (q->rid < 0) { // unfixable cross-reference alignment
-			--aa.n;
-			continue;
-		}
 		q->flag |= extra_flag | (p->secondary >= 0? 0x100 : 0); // flag secondary
 		if (p->secondary >= 0) q->sub = -1; // don't output sub-optimal score
 		if ((opt->flag&MEM_F_NO_MULTI) && k && p->secondary < 0) q->flag |= 0x10000;
--- a/main.c
+++ b/main.c
@ -3,7 +3,7 @@
 #include "utils.h"

 #ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.7.4-r389-beta"
+#define PACKAGE_VERSION "0.7.4-r391-beta"
 #endif

 int bwa_fa2pac(int argc, char *argv[]);