r391: better method to resolve xref alignment

The old method does not work when the alignment bridges three chr. This may
actually happen often. The new method does not work all the time, either, but
should be better than the old one. It is also simpler, arguably.
This commit is contained in:
Heng Li 2013-05-22 18:57:51 -04:00
parent 338ebfaae0
commit 9a6abe51b6
4 changed files with 29 additions and 39 deletions

View File

@ -1,13 +1,13 @@
CC= gcc
CFLAGS= -g -Wall -O2
WRAP_MALLOC= -DUSE_MALLOC_WRAPPERS
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
AR= ar
DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC)
LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o
LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o malloc_wrap.o
AOBJS= QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
is.o bwtindex.o bwape.o kopen.o pemerge.o \
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
bwtsw2_chain.o fastmap.o bwtsw2_pair.o malloc_wrap.o
bwtsw2_chain.o fastmap.o bwtsw2_pair.o
PROG= bwa
INCLUDES=
LIBS= -lm -lz -lpthread

56
bwa.c
View File

@ -146,57 +146,51 @@ ret_gen_cigar:
int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re)
{
int ib, ie, is_rev;
int64_t fb, fe, mid = -1;
int is_rev;
int64_t cb, ce, fm;
bntann1_t *ra;
if (*rb < bns->l_pac && *re > bns->l_pac) { // cross the for-rev boundary; actually with BWA-MEM, we should never come to here
*qb = *qe = *rb = *re = -1;
return -1; // unable to fix
} else {
fb = bns_depos(bns, *rb < bns->l_pac? *rb : *re - 1, &is_rev);
ib = bns_pos2rid(bns, fb);
if (fb - bns->anns[ib].offset + (*re - *rb) <= bns->anns[ib].len) return 0; // no need to fix
fe = bns_depos(bns, *re - 1 < bns->l_pac? *re - 1 : *rb, &is_rev);
ie = bns_pos2rid(bns, fe);
if (ie - ib > 1) { // bridge three or more references
*qb = *qe = *rb = *re = -1;
return -2; // unable to fix
} else {
int l = bns->anns[ib].offset + bns->anns[ib].len - fb;
mid = is_rev? *re - l : *rb + l;
}
}
if (mid >= 0) {
fm = bns_depos(bns, (*rb + *re) >> 1, &is_rev); // coordinate of the middle point on the forward strand
ra = &bns->anns[bns_pos2rid(bns, fm)]; // annotation of chr corresponding to the middle point
cb = is_rev? (bns->l_pac<<1) - (ra->offset + ra->len) : ra->offset; // chr start on the mapping strand
ce = cb + ra->len; // chr end
if (cb > *rb || ce < *re) { // fix is needed
int i, score, n_cigar, y, NM;
uint32_t *cigar;
int64_t x;
cb = cb > *rb? cb : *rb;
ce = ce < *re? ce : *re;
cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, *qe - *qb, query + *qb, *rb, *re, &score, &n_cigar, &NM);
for (i = 0, x = *rb, y = *qb; i < n_cigar; ++i) {
int op = cigar[i]&0xf, len = cigar[i]>>4;
if (op == 0) {
if (x <= mid && mid < x + len) {
if (mid - *rb > *re - mid) { // the first part is longer
if (x == mid) { // need to check the previous operation
assert(i); // mid != *rb should always stand
if ((cigar[i-1]&0xf) == 1) *qe = y - (cigar[i-1]>>4), *re = x;
else if ((cigar[i-1]&0xf) == 2) *qe = y, *re = x - (cigar[i-1]>>4);
else abort(); // should not be here
} else *qe = y + (mid - x), *re = mid;
} else *qb = y + (mid - x), *rb = mid;
if (x <= cb && cb < x + len)
*qb = y + (cb - x), *rb = cb;
if (x < ce && ce <= x + len) {
*qe = y + (ce - x), *re = ce;
break;
} else x += len, y += len;
} else if (op == 1) { // insertion
} else if (op == 1) {
y += len;
} else if (op == 2) { // deletion
if (x <= mid && mid < x + len) {
if (mid - *rb > *re - mid) *qe = y, *re = x;
else *qb = y, *rb = x + len;
} else if (op == 2) {
if (x <= cb && cb < x + len)
*qb = y, *rb = x + len;
if (x < ce && ce <= x + len) {
*qe = y, *re = x;
break;
} else x += len;
} else abort(); // should not be here
}
free(cigar);
if (*qb == *qe || *rb == *re) { // TODO: this may happen in theory, but should be very very rare...
fprintf(stderr, "[E::%s] If you see this message, please let the developer know. Sorry.\n", __func__);
exit(1);
}
}
return 1;
return 0;
}
/*********************

View File

@ -789,10 +789,6 @@ void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pa
if (p->secondary >= 0 && p->score < a->a[p->secondary].score * .5) continue;
q = kv_pushp(mem_aln_t, aa);
*q = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, p);
if (q->rid < 0) { // unfixable cross-reference alignment
--aa.n;
continue;
}
q->flag |= extra_flag | (p->secondary >= 0? 0x100 : 0); // flag secondary
if (p->secondary >= 0) q->sub = -1; // don't output sub-optimal score
if ((opt->flag&MEM_F_NO_MULTI) && k && p->secondary < 0) q->flag |= 0x10000;

2
main.c
View File

@ -3,7 +3,7 @@
#include "utils.h"
#ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "0.7.4-r389-beta"
#define PACKAGE_VERSION "0.7.4-r391-beta"
#endif
int bwa_fa2pac(int argc, char *argv[]);