dev-470: don't stop on bwa_fix_xref2() failures

Peter Field has sent me an example caused by an alignment bridging three
adjacent chromosomes/contigs. Bwa-mem always aligns the query to the contig
covering the middle point of the alignment. In this example, it chooses the
middle contig, which should not be aligned. This leads to weird things failing
bwa_fix_xref2(), which cannot be fixed unless we build the contig boundaries
into the FM-index.

In the old code, bwa-mem halts when bwa_fix_xref2() fails. With this commit,
bwa-mem will give a warning instead of halting.
This commit is contained in:
Heng Li 2014-04-10 11:43:17 -04:00
parent db58392e9b
commit ccbbe48c4f
5 changed files with 30 additions and 12 deletions

7
bwa.c
View File

@ -178,9 +178,10 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa
int bwa_fix_xref2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re)
{
int is_rev;
int64_t cb, ce, fm;
int is_rev, ori_ql = *qe - *qb;
int64_t cb, ce, fm, ori_rl = *re - *rb;
bntann1_t *ra;
assert(ori_ql > 0 && ori_rl > 0);
if (*rb < bns->l_pac && *re > bns->l_pac) { // cross the for-rev boundary; actually with BWA-MEM, we should never come to here
*qb = *qe = *rb = *re = -1;
return -1; // unable to fix
@ -218,7 +219,7 @@ int bwa_fix_xref2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_i
}
free(cigar);
}
return (*qb == *qe || *rb == *re)? -2 : 0;
return (*qe - *qb < .33 * ori_ql || *re - *rb < .33 * ori_rl)? -2 : 0;
}
int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re)

View File

@ -905,7 +905,11 @@ void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pa
if (p->secondary >= 0 && !(opt->flag&MEM_F_ALL)) continue;
if (p->secondary >= 0 && p->score < a->a[p->secondary].score * opt->drop_ratio) continue;
q = kv_pushp(mem_aln_t, aa);
*q = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, p);
*q = mem_reg2aln2(opt, bns, pac, s->l_seq, s->seq, p, s->name);
if (q->rid < 0) {
--aa.n;
continue;
}
q->flag |= extra_flag; // flag secondary
if (p->secondary >= 0) q->sub = -1; // don't output sub-optimal score
if (k && p->secondary < 0) // if supplementary
@ -982,11 +986,10 @@ mem_aln_t mem_reg2aln2(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t
query[i] = query_[i] < 5? query_[i] : nst_nt4_table[(int)query_[i]];
a.mapq = ar->secondary < 0? mem_approx_mapq_se(opt, ar) : 0;
if (ar->secondary >= 0) a.flag |= 0x100; // secondary alignment
if (bwa_fix_xref2(opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re) < 0) {
if (name) fprintf(stderr, "[E::%s] Internal code inconsistency for read '%s'. Please contact the developer. Sorry.\n", __func__, name);
else fprintf(stderr, "[E::%s] Internal code inconsistency. Please contact the developer. Sorry.\n", __func__);
a.rid = -1; a.pos = -1; a.flag |= 0x4;
return a;
if ((ret = bwa_fix_xref2(opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re)) < 0) {
if (bwa_verbose >= 2 && name)
fprintf(stderr, "[W::%s] A cross-chr hit of read '%s' has been dropped.\n", __func__, name);
goto err_reg2aln;
}
tmp = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->o_del, opt->e_del);
w2 = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->o_ins, opt->e_ins);
@ -1002,6 +1005,11 @@ mem_aln_t mem_reg2aln2(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t
last_sc = score;
w2 <<= 1;
} while (++i < 3 && score < ar->truesc - opt->a);
if (score < 0) {
if (bwa_verbose >= 2 && name)
fprintf(stderr, "[W::%s] A hit to read '%s' has been dropped.\n", __func__, name);
goto err_reg2aln;
}
l_MD = strlen((char*)(a.cigar + a.n_cigar)) + 1;
a.NM = NM;
pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev);
@ -1036,6 +1044,13 @@ mem_aln_t mem_reg2aln2(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t
a.score = ar->score; a.sub = ar->sub > ar->csub? ar->sub : ar->csub;
free(query);
return a;
err_reg2aln:
free(a.cigar);
memset(&a, 0, sizeof(mem_aln_t));
a.rid = -1; a.pos = -1; a.flag |= 0x4;
free(query);
return a;
}
mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar)

View File

@ -148,6 +148,7 @@ extern "C" {
* @return CIGAR, strand, mapping quality and forward-strand position
*/
mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq, const mem_alnreg_t *ar);
mem_aln_t mem_reg2aln2(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq, const mem_alnreg_t *ar, const char *name);
/**
* Infer the insert size distribution from interleaved alignment regions

View File

@ -89,8 +89,9 @@ void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac,
int is_rev, rid, qb = p->qb, qe = p->qe;
int64_t pos, rb = p->rb, re = p->re;
if (bwa_fix_xref2(opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->w, bns, pac, (uint8_t*)s->seq, &qb, &qe, &rb, &re) < 0) {
fprintf(stderr, "[E::%s] Internal errors when processing read '%s'. Please let the developer know. Abort. Sorry.\n", __func__, s->name);
exit(1);
if (bwa_verbose >= 2)
fprintf(stderr, "[W::%s] A cross-chr hit of read '%s' has been dropped.\n", __func__, s->name);
continue;
}
pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev);
rid = bns_pos2rid(bns, pos);

2
main.c
View File

@ -4,7 +4,7 @@
#include "utils.h"
#ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "0.7.8+dev-r469"
#define PACKAGE_VERSION "0.7.8+dev-r470"
#endif
int bwa_fa2pac(int argc, char *argv[]);