From 8638cfadc875a44f7db5ee822e71f7cddb5221fa Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 10 Apr 2014 20:54:27 -0400 Subject: [PATCH] dev-472: get rid of bwa_fix_xref() This function causes all kinds of problems when the reference genome consists of many short reads/contigs/chromsomes. Some of the problems are nearly unfixable at the point where bwa_fix_xref() gets called. This commit attempts to fix the problem at the root. It disallows chains spanning multiple contigs and never retrieves sequences bridging two adjacent contigs. Thus all the chaining, extension, SW and global alignments are confined to on contig only. This commit brings many changes. I have tested it on a couple examples including Peter Field's PacBio example. It works well so far. --- bntseq.c | 32 +++++++++++++++++++++ bntseq.h | 3 ++ bwa.c | 51 --------------------------------- bwa.h | 2 -- bwamem.c | 77 ++++++++++++++++++++++---------------------------- bwamem.h | 1 + bwamem_extra.c | 5 ---- bwamem_pair.c | 16 ++++++----- main.c | 2 +- 9 files changed, 79 insertions(+), 110 deletions(-) diff --git a/bntseq.c b/bntseq.c index e1cd323..b63dff4 100644 --- a/bntseq.c +++ b/bntseq.c @@ -329,6 +329,15 @@ int bns_pos2rid(const bntseq_t *bns, int64_t pos_f) return mid; } +int bns_intv2rid(const bntseq_t *bns, int64_t rb, int64_t re) +{ + int is_rev, rid_b, rid_e; + if (rb < bns->l_pac && re > bns->l_pac) return -2; + rid_b = bns_pos2rid(bns, bns_depos(bns, rb, &is_rev)); + rid_e = bns_pos2rid(bns, bns_depos(bns, re, &is_rev) - 1); + return rid_b == rid_e? rid_b : -1; +} + int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id) { int left, mid, right, nn; @@ -374,3 +383,26 @@ uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end } else *len = 0; // if bridging the forward-reverse boundary, return nothing return seq; } + +uint8_t *bns_fetch_seq(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, int64_t mid, int64_t *end, int *rid) +{ + int64_t far_beg, far_end, len; + int is_rev; + uint8_t *seq; + + if (*end < *beg) *end ^= *beg, *beg ^= *end, *end ^= *beg; // if end is smaller, swap + assert(*beg <= mid && mid < *end); + *rid = bns_pos2rid(bns, bns_depos(bns, mid, &is_rev)); + far_beg = bns->anns[*rid].offset; + far_end = far_beg + bns->anns[*rid].len; + if (is_rev) { // flip to the reverse strand + int64_t tmp = far_beg; + far_beg = (bns->l_pac<<1) - 1 - far_end; + far_end = (bns->l_pac<<1) - 1 - tmp; + } + *beg = *beg > far_beg? *beg : far_beg; + *end = *end < far_end? *end : far_end; + seq = bns_get_seq(bns->l_pac, pac, *beg, *end, &len); + assert(seq && *end - *beg == len); // assertion failure should never happen + return seq; +} diff --git a/bntseq.h b/bntseq.h index 4061438..6437cf6 100644 --- a/bntseq.h +++ b/bntseq.h @@ -28,6 +28,7 @@ #ifndef BWT_BNTSEQ_H #define BWT_BNTSEQ_H +#include #include #include #include @@ -75,6 +76,8 @@ extern "C" { int bns_pos2rid(const bntseq_t *bns, int64_t pos_f); int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id); uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len); + uint8_t *bns_fetch_seq(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, int64_t mid, int64_t *end, int *rid); + int bns_intv2rid(const bntseq_t *bns, int64_t rb, int64_t re); #ifdef __cplusplus } diff --git a/bwa.c b/bwa.c index 08881c0..db3b947 100644 --- a/bwa.c +++ b/bwa.c @@ -176,57 +176,6 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa return bwa_gen_cigar2(mat, q, r, q, r, w_, l_pac, pac, l_query, query, rb, re, score, n_cigar, NM); } -int bwa_fix_xref2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re) -{ - int is_rev, ori_ql = *qe - *qb; - int64_t cb, ce, fm, ori_rl = *re - *rb; - bntann1_t *ra; - assert(ori_ql > 0 && ori_rl > 0); - if (*rb < bns->l_pac && *re > bns->l_pac) { // cross the for-rev boundary; actually with BWA-MEM, we should never come to here - *qb = *qe = *rb = *re = -1; - return -1; // unable to fix - } - fm = bns_depos(bns, (*rb + *re) >> 1, &is_rev); // coordinate of the middle point on the forward strand - ra = &bns->anns[bns_pos2rid(bns, fm)]; // annotation of chr corresponding to the middle point - cb = is_rev? (bns->l_pac<<1) - (ra->offset + ra->len) : ra->offset; // chr start on the mapping strand - ce = cb + ra->len; // chr end - if (cb > *rb || ce < *re) { // fix is needed - int i, score, n_cigar, y, NM; - uint32_t *cigar; - int64_t x; - cb = cb > *rb? cb : *rb; - ce = ce < *re? ce : *re; - cigar = bwa_gen_cigar2(mat, o_del, e_del, o_ins, e_ins, w, bns->l_pac, pac, *qe - *qb, query + *qb, *rb, *re, &score, &n_cigar, &NM); - for (i = 0, x = *rb, y = *qb; i < n_cigar; ++i) { - int op = cigar[i]&0xf, len = cigar[i]>>4; - if (op == 0) { - if (x <= cb && cb < x + len) - *qb = y + (cb - x), *rb = cb; - if (x < ce && ce <= x + len) { - *qe = y + (ce - x), *re = ce; - break; - } else x += len, y += len; - } else if (op == 1) { - y += len; - } else if (op == 2) { - if (x <= cb && cb < x + len) - *qb = y, *rb = x + len; - if (x < ce && ce <= x + len) { - *qe = y, *re = x; - break; - } else x += len; - } else abort(); // should not be here - } - free(cigar); - } - return (*qe - *qb < .33 * ori_ql || *re - *rb < .33 * ori_rl)? -2 : 0; -} - -int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re) -{ - return bwa_fix_xref2(mat, q, r, q, r, w, bns, pac, query, qb, qe, rb, re); -} - /********************* * Full index reader * *********************/ diff --git a/bwa.h b/bwa.h index 8d46e58..bbc2525 100644 --- a/bwa.h +++ b/bwa.h @@ -33,8 +33,6 @@ extern "C" { void bwa_fill_scmat(int a, int b, int8_t mat[25]); uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM); uint32_t *bwa_gen_cigar2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM); - int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re); - int bwa_fix_xref2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re); char *bwa_idx_infer_prefix(const char *hint); bwt_t *bwa_idx_load_bwt(const char *hint); diff --git a/bwamem.c b/bwamem.c index 189e58b..fe83f93 100644 --- a/bwamem.c +++ b/bwamem.c @@ -143,11 +143,11 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co typedef struct { int64_t rbeg; int32_t qbeg, len; -} mem_seed_t; +} mem_seed_t; // unaligned memory typedef struct { - int n, m, first; - uint32_t w:30, kept:2; + int n, m, first, rid; + int w, kept; int64_t pos; mem_seed_t *seeds; } mem_chain_t; @@ -160,12 +160,13 @@ typedef struct { size_t n, m; mem_chain_t *a; } mem_chain_v; KBTREE_INIT(chn, mem_chain_t, chain_cmp) // return 1 if the seed is merged into the chain -static int test_and_merge(const mem_opt_t *opt, int64_t l_pac, mem_chain_t *c, const mem_seed_t *p) +static int test_and_merge(const mem_opt_t *opt, int64_t l_pac, mem_chain_t *c, const mem_seed_t *p, int seed_rid) { int64_t qend, rend, x, y; const mem_seed_t *last = &c->seeds[c->n-1]; qend = last->qbeg + last->len; rend = last->rbeg + last->len; + if (seed_rid != c->rid) return 0; // different chr; request a new chain if (p->qbeg >= c->seeds[0].qbeg && p->qbeg + p->len <= qend && p->rbeg >= c->seeds[0].rbeg && p->rbeg + p->len <= rend) return 1; // contained seed; do nothing if ((last->rbeg < l_pac || c->seeds[0].rbeg < l_pac) && p->rbeg >= l_pac) return 0; // don't chain if on different strand @@ -220,9 +221,10 @@ void mem_print_chain(const bntseq_t *bns, mem_chain_v *chn) } } -mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int64_t l_pac, int len, const uint8_t *seq) +mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, int len, const uint8_t *seq) { int i; + int64_t l_pac = bns->l_pac; mem_chain_v chain; kbtree_t(chn) *tree; smem_aux_t *aux; @@ -241,19 +243,21 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int64_t l_pac, int for (k = 0; k < p->x[2]; ++k) { mem_chain_t tmp, *lower, *upper; mem_seed_t s; - int to_add = 0; + int rid, to_add = 0; s.rbeg = tmp.pos = bwt_sa(bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference s.qbeg = p->info>>32; s.len = slen; - if (s.rbeg < l_pac && l_pac < s.rbeg + s.len) continue; // bridging forward-reverse boundary; skip + rid = bns_intv2rid(bns, s.rbeg, s.rbeg + s.len); + if (rid < 0) continue; // bridging multiple reference sequences or the forward-reverse boundary if (kb_size(tree)) { kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain - if (!lower || !test_and_merge(opt, l_pac, lower, &s)) to_add = 1; + if (!lower || !test_and_merge(opt, l_pac, lower, &s, rid)) to_add = 1; } else to_add = 1; if (to_add) { // add the seed as a new chain tmp.n = 1; tmp.m = 4; tmp.seeds = calloc(tmp.m, sizeof(mem_seed_t)); tmp.seeds[0] = s; + tmp.rid = rid; kb_putp(chn, tree, &tmp); } } @@ -473,11 +477,11 @@ void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t i * low-divergence sequences, more testing is needed. For now, I only recommend * to use mem_test_chain_sw() for PacBio data. It is disabled by default. */ -int mem_test_chain_sw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c) +int mem_test_chain_sw(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c) { - int i, qb, qe; + int i, qb, qe, rid; int min_HSP_score = (int)(opt->min_chain_weight * opt->a * MEM_HSP_COEF + .499); - int64_t rb, re, rlen; + int64_t rb, re, l_pac = bns->l_pac; uint8_t *rseq = 0; kswr_t x; @@ -505,8 +509,8 @@ int mem_test_chain_sw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, i if (qe - qb >= opt->w * 4 || re - rb >= opt->w * 4) return 1; if (qe - qb >= MEM_SHORT_LEN || re - rb >= MEM_SHORT_LEN) return 1; - rseq = bns_get_seq(l_pac, pac, rb, re, &rlen); - assert(rlen == re - rb); + rseq = bns_fetch_seq(bns, pac, &rb, c->seeds[0].rbeg, &re, &rid); + assert(c->rid == rid); x = ksw_align2(qe - qb, (uint8_t*)query + qb, re - rb, rseq, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, KSW_XSTART, 0); free(rseq); if (x.score >= min_HSP_score) return 1; @@ -514,10 +518,10 @@ int mem_test_chain_sw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, i return 0; } -int mem_chain2aln_short(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av) +int mem_chain2aln_short(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av) { - int i, qb, qe, xtra; - int64_t rb, re, rlen; + int i, qb, qe, xtra, rid; + int64_t rb, re, l_pac = bns->l_pac; uint8_t *rseq = 0; mem_alnreg_t a; kswr_t x; @@ -547,8 +551,8 @@ int mem_chain2aln_short(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, if (qe - qb >= opt->w * 4 || re - rb >= opt->w * 4) return 1; if (qe - qb >= MEM_SHORT_LEN || re - rb >= MEM_SHORT_LEN) return 1; - rseq = bns_get_seq(l_pac, pac, rb, re, &rlen); - assert(rlen == re - rb); + rseq = bns_fetch_seq(bns, pac, &rb, c->seeds[0].rbeg, &re, &rid); + assert(c->rid == rid); xtra = KSW_XSUBO | KSW_XSTART | ((qe - qb) * opt->a < 250? KSW_XBYTE : 0) | (opt->min_seed_len * opt->a); x = ksw_align2(qe - qb, (uint8_t*)query + qb, re - rb, rseq, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, xtra, 0); free(rseq); @@ -556,6 +560,7 @@ int mem_chain2aln_short(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, a.qb = qb + x.qb; a.qe = qb + x.qe + 1; a.score = x.score; a.csub = x.score2; + a.rid = c->rid; if (bwa_verbose >= 4) printf("** Attempted alignment via mem_chain2aln_short(): [%d,%d) <=> [%ld,%ld); score=%d; %d/%d\n", a.qb, a.qe, (long)a.rb, (long)a.re, x.score, a.qe-a.qb, qe-qb); if (x.tb < MEM_SHORT_EXT>>1 || x.te > re - rb - (MEM_SHORT_EXT>>1)) return 1; kv_push(mem_alnreg_t, *av, a); @@ -571,10 +576,10 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen) return l < opt->w<<1? l : opt->w<<1; } -void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av) +void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av) { - int i, k, max_off[2], aw[2]; // aw: actual bandwidth used in extension - int64_t rlen, rmax[2], tmp, max = 0; + int i, k, rid, max_off[2], aw[2]; // aw: actual bandwidth used in extension + int64_t l_pac = bns->l_pac, rmax[2], tmp, max = 0; const mem_seed_t *s; uint8_t *rseq = 0; uint64_t *srt; @@ -598,8 +603,8 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int else rmax[0] = l_pac; } // retrieve the reference sequence - rseq = bns_get_seq(l_pac, pac, rmax[0], rmax[1], &rlen); - assert(rlen == rmax[1] - rmax[0]); + rseq = bns_fetch_seq(bns, pac, &rmax[0], c->seeds[0].rbeg, &rmax[1], &rid); + assert(c->rid == rid); srt = malloc(c->n * 8); for (i = 0; i < c->n; ++i) @@ -649,6 +654,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int memset(a, 0, sizeof(mem_alnreg_t)); a->w = aw[0] = aw[1] = opt->w; a->score = a->truesc = -1; + a->rid = c->rid; if (bwa_verbose >= 4) err_printf("** ---> Extending from seed(%d) [%ld;%ld,%ld] <---\n", k, (long)s->len, (long)s->qbeg, (long)s->rbeg); if (s->qbeg) { // left extension @@ -939,7 +945,7 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse for (i = 0; i < l_seq; ++i) // convert to 2-bit encoding if we have not done so seq[i] = seq[i] < 4? seq[i] : nst_nt4_table[(int)seq[i]]; - chn = mem_chain(opt, bwt, bns->l_pac, l_seq, (uint8_t*)seq); + chn = mem_chain(opt, bwt, bns, l_seq, (uint8_t*)seq); chn.n = mem_chain_flt(opt, chn.n, chn.a); if (bwa_verbose >= 4) mem_print_chain(bns, &chn); @@ -948,9 +954,9 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse mem_chain_t *p = &chn.a[i]; int ret; if (bwa_verbose >= 4) err_printf("* ---> Processing chain(%d) <---\n", i); - if (opt->min_chain_weight > 0) ret = mem_test_chain_sw(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p); - else ret = mem_chain2aln_short(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, ®s); - if (ret > 0) mem_chain2aln(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, ®s); + if (opt->min_chain_weight > 0) ret = mem_test_chain_sw(opt, bns, pac, l_seq, (uint8_t*)seq, p); + else ret = mem_chain2aln_short(opt, bns, pac, l_seq, (uint8_t*)seq, p, ®s); + if (ret > 0) mem_chain2aln(opt, bns, pac, l_seq, (uint8_t*)seq, p, ®s); free(chn.a[i].seeds); } free(chn.a); @@ -986,11 +992,6 @@ mem_aln_t mem_reg2aln2(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t query[i] = query_[i] < 5? query_[i] : nst_nt4_table[(int)query_[i]]; a.mapq = ar->secondary < 0? mem_approx_mapq_se(opt, ar) : 0; if (ar->secondary >= 0) a.flag |= 0x100; // secondary alignment - if (bwa_fix_xref2(opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re) < 0) { - if (bwa_verbose >= 2 && name) - fprintf(stderr, "[W::%s] A cross-chr hit of read '%s' has been dropped.\n", __func__, name); - goto err_reg2aln; - } tmp = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->o_del, opt->e_del); w2 = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->o_ins, opt->e_ins); w2 = w2 > tmp? w2 : tmp; @@ -1005,11 +1006,6 @@ mem_aln_t mem_reg2aln2(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t last_sc = score; w2 <<= 1; } while (++i < 3 && score < ar->truesc - opt->a); - if (score < 0) { - if (bwa_verbose >= 2 && name) - fprintf(stderr, "[W::%s] A hit to read '%s' has been dropped.\n", __func__, name); - goto err_reg2aln; - } l_MD = strlen((char*)(a.cigar + a.n_cigar)) + 1; a.NM = NM; pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev); @@ -1044,13 +1040,6 @@ mem_aln_t mem_reg2aln2(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t a.score = ar->score; a.sub = ar->sub > ar->csub? ar->sub : ar->csub; free(query); return a; - -err_reg2aln: - free(a.cigar); - memset(&a, 0, sizeof(mem_aln_t)); - a.rid = -1; a.pos = -1; a.flag |= 0x4; - free(query); - return a; } mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar) diff --git a/bwamem.h b/bwamem.h index 7cfe8b8..53472fe 100644 --- a/bwamem.h +++ b/bwamem.h @@ -52,6 +52,7 @@ typedef struct { typedef struct { int64_t rb, re; // [rb,re): reference sequence in the alignment int qb, qe; // [qb,qe): query sequence in the alignment + int rid; // reference seq ID int score; // best local SW score int truesc; // actual score corresponding to the aligned region; possibly smaller than $score int sub; // 2nd best SW score diff --git a/bwamem_extra.c b/bwamem_extra.c index 157026c..aee1eb4 100644 --- a/bwamem_extra.c +++ b/bwamem_extra.c @@ -88,11 +88,6 @@ void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_t *p = &a->a[i]; int is_rev, rid, qb = p->qb, qe = p->qe; int64_t pos, rb = p->rb, re = p->re; - if (bwa_fix_xref2(opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->w, bns, pac, (uint8_t*)s->seq, &qb, &qe, &rb, &re) < 0) { - if (bwa_verbose >= 2) - fprintf(stderr, "[W::%s] A cross-chr hit of read '%s' has been dropped.\n", __func__, s->name); - continue; - } pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev); rid = bns_pos2rid(bns, pos); pos -= bns->anns[rid].offset; diff --git a/bwamem_pair.c b/bwamem_pair.c index 4a7cdf3..cec25da 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -106,10 +106,11 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v * } } -int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], const mem_alnreg_t *a, int l_ms, const uint8_t *ms, mem_alnreg_v *ma) +int mem_matesw(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], const mem_alnreg_t *a, int l_ms, const uint8_t *ms, mem_alnreg_v *ma) { extern int mem_sort_and_dedup(int n, mem_alnreg_t *a, float mask_level_redun); - int i, r, skip[4], n = 0; + int64_t l_pac = bns->l_pac; + int i, r, skip[4], n = 0, rid; for (r = 0; r < 4; ++r) skip[r] = pes[r].failed? 1 : 0; for (i = 0; i < ma->n; ++i) { // check which orinentation has been found @@ -122,7 +123,7 @@ int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const me for (r = 0; r < 4; ++r) { int is_rev, is_larger; uint8_t *seq, *rev = 0, *ref; - int64_t rb, re, len; + int64_t rb, re; if (skip[r]) continue; is_rev = (r>>1 != (r&1)); // whether to reverse complement the mate is_larger = !(r>>1); // whether the mate has larger coordinate @@ -140,14 +141,15 @@ int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const me } if (rb < 0) rb = 0; if (re > l_pac<<1) re = l_pac<<1; - ref = bns_get_seq(l_pac, pac, rb, re, &len); - if (len == re - rb) { // no funny things happening + ref = bns_fetch_seq(bns, pac, &rb, (rb+re)>>1, &re, &rid); + if (a->rid == rid) { // no funny things happening kswr_t aln; mem_alnreg_t b; int tmp, xtra = KSW_XSUBO | KSW_XSTART | (l_ms * opt->a < 250? KSW_XBYTE : 0) | (opt->min_seed_len * opt->a); - aln = ksw_align2(l_ms, seq, len, ref, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, xtra, 0); + aln = ksw_align2(l_ms, seq, re - rb, ref, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, xtra, 0); memset(&b, 0, sizeof(mem_alnreg_t)); if (aln.score >= opt->min_seed_len && aln.qb >= 0) { // something goes wrong if aln.qb < 0 + b.rid = a->rid; b.qb = is_rev? l_ms - (aln.qe + 1) : aln.qb; b.qe = is_rev? l_ms - aln.qb : aln.qe + 1; b.rb = is_rev? (l_pac<<1) - (rb + aln.te + 1) : rb + aln.tb; @@ -258,7 +260,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co kv_push(mem_alnreg_t, b[i], a[i].a[j]); for (i = 0; i < 2; ++i) for (j = 0; j < b[i].n && j < opt->max_matesw; ++j) - n += mem_matesw(opt, bns->l_pac, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]); + n += mem_matesw(opt, bns, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]); free(b[0].a); free(b[1].a); } mem_mark_primary_se(opt, a[0].n, a[0].a, id<<1|0); diff --git a/main.c b/main.c index 6a43b5f..49cc9ae 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.8+dev-r471" +#define PACKAGE_VERSION "0.7.8+dev-r472" #endif int bwa_fa2pac(int argc, char *argv[]);