r284: amend cross-reference hit

I really hate this: complex and twisted logic for a nasty scenario that almost
never happens to short reads - but it may become serious when the reference
genome consists of many contigs.

On toy examples, the code seems to work. Don't know if it really works...
This commit is contained in:
Heng Li 2013-02-26 00:03:49 -05:00
parent 61dd3bf13a
commit e70c7c2a71
5 changed files with 65 additions and 3 deletions

56
bwa.c
View File

@ -1,6 +1,7 @@
#include <string.h>
#include <stdio.h>
#include <zlib.h>
#include <assert.h>
#include "bntseq.h"
#include "bwa.h"
#include "ksw.h"
@ -103,6 +104,61 @@ ret_gen_cigar:
return cigar;
}
int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re)
{
int ib, ie, is_rev;
int64_t fb, fe, mid = -1;
if (*rb < bns->l_pac && *re > bns->l_pac) { // cross the for-rev boundary
*qb = *qe = *rb = *re = -1;
return -1; // unable to fix
} else {
fb = bns_depos(bns, *rb < bns->l_pac? *rb : *re - 1, &is_rev);
ib = bns_pos2rid(bns, fb);
if (fb - bns->anns[ib].offset + (*re - *rb) <= bns->anns[ib].len) return 0; // no need to fix
fe = bns_depos(bns, *re - 1 < bns->l_pac? *re - 1 : *rb, &is_rev);
ie = bns_pos2rid(bns, fe);
if (ie - ib > 1) { // bridge three or more references
*qb = *qe = *rb = *re = -1;
return -2; // unable to fix
} else {
int l = bns->anns[ib].offset + bns->anns[ib].len - fb;
mid = is_rev? *re - l : *rb + l;
}
}
if (mid >= 0) {
int i, score, n_cigar, y;
uint32_t *cigar;
int64_t x;
cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, *qe - *qb, query + *qb, *rb, *re, &score, &n_cigar);
for (i = 0, x = *rb, y = *qb; i < n_cigar; ++i) {
int op = cigar[i]&0xf, len = cigar[i]>>4;
if (op == 0) {
if (x <= mid && mid < x + len) {
if (mid - *rb > *re - mid) { // the first part is longer
if (x == mid) { // need to check the previous operation
assert(i); // mid != *rb should always stand
if ((cigar[i-1]&0xf) == 1) *qe = y - (cigar[i-1]>>4), *re = x;
else if ((cigar[i-1]&0xf) == 2) *qe = y, *re = x - (cigar[i-1]>>4);
else abort(); // should not be here
} else *qe = y + (mid - x), *re = mid;
} else *qb = y + (mid - x), *rb = mid;
break;
} else x += len, y += len;
} else if (op == 1) { // insertion
y += len;
} else if (op == 2) { // deletion
if (x <= mid && mid < x + len) {
if (mid - *rb > *re - mid) *qe = y, *re = x;
else *qb = y, *rb = x + len;
break;
} else x += len;
} else abort(); // should not be here
}
free(cigar);
}
return 1;
}
/*********************
* Full index reader *
*********************/

1
bwa.h
View File

@ -31,6 +31,7 @@ extern "C" {
bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_);
uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar);
int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re);
char *bwa_idx_infer_prefix(const char *hint);
bwt_t *bwa_idx_load_bwt(const char *hint);

View File

@ -633,6 +633,7 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b
if (p->secondary >= 0 && !(opt->flag&MEM_F_ALL)) continue;
if (p->secondary >= 0 && p->score < a->a[p->secondary].score * .5) continue;
mem_alnreg2hit(p, &h);
bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s->seq, &h.qb, &h.qe, &h.rb, &h.re);
h.flag |= extra_flag;
if ((opt->flag&MEM_F_NO_MULTI) && k && p->secondary < 0) h.flag |= 0x10000; // print the sequence, but flag as secondary (for Picard)
h.qual = p->secondary >= 0? 0 : mem_approx_mapq_se(opt, p);

View File

@ -293,7 +293,9 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
q_se[1] = mem_approx_mapq_se(opt, &a[1].a[0]);
}
mem_alnreg2hit(&a[0].a[z[0]], &h[0]); h[0].qual = q_se[0]; h[0].flag |= 0x40 | extra_flag;
bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[0].seq, &h[0].qb, &h[0].qe, &h[0].rb, &h[0].re);
mem_alnreg2hit(&a[1].a[z[1]], &h[1]); h[1].qual = q_se[1]; h[1].flag |= 0x80 | extra_flag;
bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[1].seq, &h[1].qb, &h[1].qe, &h[1].rb, &h[1].re);
bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[0], &h[0], opt->flag&MEM_F_HARDCLIP, &h[1]); s[0].sam = strdup(str.s); str.l = 0;
bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[1], &h[1], opt->flag&MEM_F_HARDCLIP, &h[0]); s[1].sam = str.s;
} else goto no_pairing;
@ -301,8 +303,10 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
no_pairing:
for (i = 0; i < 2; ++i) {
if (a[i].n) mem_alnreg2hit(&a[i].a[0], &h[i]);
else h[i].rb = h[i].re = -1;
if (a[i].n) {
mem_alnreg2hit(&a[i].a[0], &h[i]);
bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[i].seq, &h[i].qb, &h[i].qe, &h[i].rb, &h[i].re);
} else h[i].rb = h[i].re = -1;
}
mem_sam_se(opt, bns, pac, &s[0], &a[0], 0x41, &h[1]);
mem_sam_se(opt, bns, pac, &s[1], &a[1], 0x81, &h[0]);

2
main.c
View File

@ -4,7 +4,7 @@
#include "utils.h"
#ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "0.6.2-r283-beta"
#define PACKAGE_VERSION "0.6.2-r284-beta"
#endif
static int usage()