r284: amend cross-reference hit
I really hate this: complex and twisted logic for a nasty scenario that almost never happens to short reads - but it may become serious when the reference genome consists of many contigs. On toy examples, the code seems to work. Don't know if it really works...
This commit is contained in:
parent
61dd3bf13a
commit
e70c7c2a71
56
bwa.c
56
bwa.c
|
|
@ -1,6 +1,7 @@
|
|||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <zlib.h>
|
||||
#include <assert.h>
|
||||
#include "bntseq.h"
|
||||
#include "bwa.h"
|
||||
#include "ksw.h"
|
||||
|
|
@ -103,6 +104,61 @@ ret_gen_cigar:
|
|||
return cigar;
|
||||
}
|
||||
|
||||
int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re)
|
||||
{
|
||||
int ib, ie, is_rev;
|
||||
int64_t fb, fe, mid = -1;
|
||||
if (*rb < bns->l_pac && *re > bns->l_pac) { // cross the for-rev boundary
|
||||
*qb = *qe = *rb = *re = -1;
|
||||
return -1; // unable to fix
|
||||
} else {
|
||||
fb = bns_depos(bns, *rb < bns->l_pac? *rb : *re - 1, &is_rev);
|
||||
ib = bns_pos2rid(bns, fb);
|
||||
if (fb - bns->anns[ib].offset + (*re - *rb) <= bns->anns[ib].len) return 0; // no need to fix
|
||||
fe = bns_depos(bns, *re - 1 < bns->l_pac? *re - 1 : *rb, &is_rev);
|
||||
ie = bns_pos2rid(bns, fe);
|
||||
if (ie - ib > 1) { // bridge three or more references
|
||||
*qb = *qe = *rb = *re = -1;
|
||||
return -2; // unable to fix
|
||||
} else {
|
||||
int l = bns->anns[ib].offset + bns->anns[ib].len - fb;
|
||||
mid = is_rev? *re - l : *rb + l;
|
||||
}
|
||||
}
|
||||
if (mid >= 0) {
|
||||
int i, score, n_cigar, y;
|
||||
uint32_t *cigar;
|
||||
int64_t x;
|
||||
cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, *qe - *qb, query + *qb, *rb, *re, &score, &n_cigar);
|
||||
for (i = 0, x = *rb, y = *qb; i < n_cigar; ++i) {
|
||||
int op = cigar[i]&0xf, len = cigar[i]>>4;
|
||||
if (op == 0) {
|
||||
if (x <= mid && mid < x + len) {
|
||||
if (mid - *rb > *re - mid) { // the first part is longer
|
||||
if (x == mid) { // need to check the previous operation
|
||||
assert(i); // mid != *rb should always stand
|
||||
if ((cigar[i-1]&0xf) == 1) *qe = y - (cigar[i-1]>>4), *re = x;
|
||||
else if ((cigar[i-1]&0xf) == 2) *qe = y, *re = x - (cigar[i-1]>>4);
|
||||
else abort(); // should not be here
|
||||
} else *qe = y + (mid - x), *re = mid;
|
||||
} else *qb = y + (mid - x), *rb = mid;
|
||||
break;
|
||||
} else x += len, y += len;
|
||||
} else if (op == 1) { // insertion
|
||||
y += len;
|
||||
} else if (op == 2) { // deletion
|
||||
if (x <= mid && mid < x + len) {
|
||||
if (mid - *rb > *re - mid) *qe = y, *re = x;
|
||||
else *qb = y, *rb = x + len;
|
||||
break;
|
||||
} else x += len;
|
||||
} else abort(); // should not be here
|
||||
}
|
||||
free(cigar);
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*********************
|
||||
* Full index reader *
|
||||
*********************/
|
||||
|
|
|
|||
1
bwa.h
1
bwa.h
|
|
@ -31,6 +31,7 @@ extern "C" {
|
|||
bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_);
|
||||
|
||||
uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar);
|
||||
int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re);
|
||||
|
||||
char *bwa_idx_infer_prefix(const char *hint);
|
||||
bwt_t *bwa_idx_load_bwt(const char *hint);
|
||||
|
|
|
|||
1
bwamem.c
1
bwamem.c
|
|
@ -633,6 +633,7 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b
|
|||
if (p->secondary >= 0 && !(opt->flag&MEM_F_ALL)) continue;
|
||||
if (p->secondary >= 0 && p->score < a->a[p->secondary].score * .5) continue;
|
||||
mem_alnreg2hit(p, &h);
|
||||
bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s->seq, &h.qb, &h.qe, &h.rb, &h.re);
|
||||
h.flag |= extra_flag;
|
||||
if ((opt->flag&MEM_F_NO_MULTI) && k && p->secondary < 0) h.flag |= 0x10000; // print the sequence, but flag as secondary (for Picard)
|
||||
h.qual = p->secondary >= 0? 0 : mem_approx_mapq_se(opt, p);
|
||||
|
|
|
|||
|
|
@ -293,7 +293,9 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
|
|||
q_se[1] = mem_approx_mapq_se(opt, &a[1].a[0]);
|
||||
}
|
||||
mem_alnreg2hit(&a[0].a[z[0]], &h[0]); h[0].qual = q_se[0]; h[0].flag |= 0x40 | extra_flag;
|
||||
bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[0].seq, &h[0].qb, &h[0].qe, &h[0].rb, &h[0].re);
|
||||
mem_alnreg2hit(&a[1].a[z[1]], &h[1]); h[1].qual = q_se[1]; h[1].flag |= 0x80 | extra_flag;
|
||||
bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[1].seq, &h[1].qb, &h[1].qe, &h[1].rb, &h[1].re);
|
||||
bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[0], &h[0], opt->flag&MEM_F_HARDCLIP, &h[1]); s[0].sam = strdup(str.s); str.l = 0;
|
||||
bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[1], &h[1], opt->flag&MEM_F_HARDCLIP, &h[0]); s[1].sam = str.s;
|
||||
} else goto no_pairing;
|
||||
|
|
@ -301,8 +303,10 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
|
|||
|
||||
no_pairing:
|
||||
for (i = 0; i < 2; ++i) {
|
||||
if (a[i].n) mem_alnreg2hit(&a[i].a[0], &h[i]);
|
||||
else h[i].rb = h[i].re = -1;
|
||||
if (a[i].n) {
|
||||
mem_alnreg2hit(&a[i].a[0], &h[i]);
|
||||
bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[i].seq, &h[i].qb, &h[i].qe, &h[i].rb, &h[i].re);
|
||||
} else h[i].rb = h[i].re = -1;
|
||||
}
|
||||
mem_sam_se(opt, bns, pac, &s[0], &a[0], 0x41, &h[1]);
|
||||
mem_sam_se(opt, bns, pac, &s[1], &a[1], 0x81, &h[0]);
|
||||
|
|
|
|||
Loading…
Reference in New Issue