From ccbbe48c4f4124623551e1c72ad257a7040b8ad3 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 10 Apr 2014 11:43:17 -0400 Subject: [PATCH] dev-470: don't stop on bwa_fix_xref2() failures Peter Field has sent me an example caused by an alignment bridging three adjacent chromosomes/contigs. Bwa-mem always aligns the query to the contig covering the middle point of the alignment. In this example, it chooses the middle contig, which should not be aligned. This leads to weird things failing bwa_fix_xref2(), which cannot be fixed unless we build the contig boundaries into the FM-index. In the old code, bwa-mem halts when bwa_fix_xref2() fails. With this commit, bwa-mem will give a warning instead of halting. --- bwa.c | 7 ++++--- bwamem.c | 27 +++++++++++++++++++++------ bwamem.h | 1 + bwamem_extra.c | 5 +++-- main.c | 2 +- 5 files changed, 30 insertions(+), 12 deletions(-) diff --git a/bwa.c b/bwa.c index 0e9e606..08881c0 100644 --- a/bwa.c +++ b/bwa.c @@ -178,9 +178,10 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa int bwa_fix_xref2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re) { - int is_rev; - int64_t cb, ce, fm; + int is_rev, ori_ql = *qe - *qb; + int64_t cb, ce, fm, ori_rl = *re - *rb; bntann1_t *ra; + assert(ori_ql > 0 && ori_rl > 0); if (*rb < bns->l_pac && *re > bns->l_pac) { // cross the for-rev boundary; actually with BWA-MEM, we should never come to here *qb = *qe = *rb = *re = -1; return -1; // unable to fix @@ -218,7 +219,7 @@ int bwa_fix_xref2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_i } free(cigar); } - return (*qb == *qe || *rb == *re)? -2 : 0; + return (*qe - *qb < .33 * ori_ql || *re - *rb < .33 * ori_rl)? -2 : 0; } int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re) diff --git a/bwamem.c b/bwamem.c index bfb45d9..9a58968 100644 --- a/bwamem.c +++ b/bwamem.c @@ -905,7 +905,11 @@ void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pa if (p->secondary >= 0 && !(opt->flag&MEM_F_ALL)) continue; if (p->secondary >= 0 && p->score < a->a[p->secondary].score * opt->drop_ratio) continue; q = kv_pushp(mem_aln_t, aa); - *q = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, p); + *q = mem_reg2aln2(opt, bns, pac, s->l_seq, s->seq, p, s->name); + if (q->rid < 0) { + --aa.n; + continue; + } q->flag |= extra_flag; // flag secondary if (p->secondary >= 0) q->sub = -1; // don't output sub-optimal score if (k && p->secondary < 0) // if supplementary @@ -982,11 +986,10 @@ mem_aln_t mem_reg2aln2(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t query[i] = query_[i] < 5? query_[i] : nst_nt4_table[(int)query_[i]]; a.mapq = ar->secondary < 0? mem_approx_mapq_se(opt, ar) : 0; if (ar->secondary >= 0) a.flag |= 0x100; // secondary alignment - if (bwa_fix_xref2(opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re) < 0) { - if (name) fprintf(stderr, "[E::%s] Internal code inconsistency for read '%s'. Please contact the developer. Sorry.\n", __func__, name); - else fprintf(stderr, "[E::%s] Internal code inconsistency. Please contact the developer. Sorry.\n", __func__); - a.rid = -1; a.pos = -1; a.flag |= 0x4; - return a; + if ((ret = bwa_fix_xref2(opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re)) < 0) { + if (bwa_verbose >= 2 && name) + fprintf(stderr, "[W::%s] A cross-chr hit of read '%s' has been dropped.\n", __func__, name); + goto err_reg2aln; } tmp = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->o_del, opt->e_del); w2 = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->o_ins, opt->e_ins); @@ -1002,6 +1005,11 @@ mem_aln_t mem_reg2aln2(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t last_sc = score; w2 <<= 1; } while (++i < 3 && score < ar->truesc - opt->a); + if (score < 0) { + if (bwa_verbose >= 2 && name) + fprintf(stderr, "[W::%s] A hit to read '%s' has been dropped.\n", __func__, name); + goto err_reg2aln; + } l_MD = strlen((char*)(a.cigar + a.n_cigar)) + 1; a.NM = NM; pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev); @@ -1036,6 +1044,13 @@ mem_aln_t mem_reg2aln2(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t a.score = ar->score; a.sub = ar->sub > ar->csub? ar->sub : ar->csub; free(query); return a; + +err_reg2aln: + free(a.cigar); + memset(&a, 0, sizeof(mem_aln_t)); + a.rid = -1; a.pos = -1; a.flag |= 0x4; + free(query); + return a; } mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar) diff --git a/bwamem.h b/bwamem.h index a6d6aa9..7cfe8b8 100644 --- a/bwamem.h +++ b/bwamem.h @@ -148,6 +148,7 @@ extern "C" { * @return CIGAR, strand, mapping quality and forward-strand position */ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq, const mem_alnreg_t *ar); + mem_aln_t mem_reg2aln2(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq, const mem_alnreg_t *ar, const char *name); /** * Infer the insert size distribution from interleaved alignment regions diff --git a/bwamem_extra.c b/bwamem_extra.c index 58e9f67..157026c 100644 --- a/bwamem_extra.c +++ b/bwamem_extra.c @@ -89,8 +89,9 @@ void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int is_rev, rid, qb = p->qb, qe = p->qe; int64_t pos, rb = p->rb, re = p->re; if (bwa_fix_xref2(opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->w, bns, pac, (uint8_t*)s->seq, &qb, &qe, &rb, &re) < 0) { - fprintf(stderr, "[E::%s] Internal errors when processing read '%s'. Please let the developer know. Abort. Sorry.\n", __func__, s->name); - exit(1); + if (bwa_verbose >= 2) + fprintf(stderr, "[W::%s] A cross-chr hit of read '%s' has been dropped.\n", __func__, s->name); + continue; } pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev); rid = bns_pos2rid(bns, pos); diff --git a/main.c b/main.c index 91f28bc..dec0b04 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.8+dev-r469" +#define PACKAGE_VERSION "0.7.8+dev-r470" #endif int bwa_fa2pac(int argc, char *argv[]);