dev-472: get rid of bwa_fix_xref()
This function causes all kinds of problems when the reference genome consists of many short reads/contigs/chromsomes. Some of the problems are nearly unfixable at the point where bwa_fix_xref() gets called. This commit attempts to fix the problem at the root. It disallows chains spanning multiple contigs and never retrieves sequences bridging two adjacent contigs. Thus all the chaining, extension, SW and global alignments are confined to on contig only. This commit brings many changes. I have tested it on a couple examples including Peter Field's PacBio example. It works well so far.
This commit is contained in:
parent
23e0e99ec0
commit
8638cfadc8
32
bntseq.c
32
bntseq.c
|
|
@ -329,6 +329,15 @@ int bns_pos2rid(const bntseq_t *bns, int64_t pos_f)
|
|||
return mid;
|
||||
}
|
||||
|
||||
int bns_intv2rid(const bntseq_t *bns, int64_t rb, int64_t re)
|
||||
{
|
||||
int is_rev, rid_b, rid_e;
|
||||
if (rb < bns->l_pac && re > bns->l_pac) return -2;
|
||||
rid_b = bns_pos2rid(bns, bns_depos(bns, rb, &is_rev));
|
||||
rid_e = bns_pos2rid(bns, bns_depos(bns, re, &is_rev) - 1);
|
||||
return rid_b == rid_e? rid_b : -1;
|
||||
}
|
||||
|
||||
int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id)
|
||||
{
|
||||
int left, mid, right, nn;
|
||||
|
|
@ -374,3 +383,26 @@ uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end
|
|||
} else *len = 0; // if bridging the forward-reverse boundary, return nothing
|
||||
return seq;
|
||||
}
|
||||
|
||||
uint8_t *bns_fetch_seq(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, int64_t mid, int64_t *end, int *rid)
|
||||
{
|
||||
int64_t far_beg, far_end, len;
|
||||
int is_rev;
|
||||
uint8_t *seq;
|
||||
|
||||
if (*end < *beg) *end ^= *beg, *beg ^= *end, *end ^= *beg; // if end is smaller, swap
|
||||
assert(*beg <= mid && mid < *end);
|
||||
*rid = bns_pos2rid(bns, bns_depos(bns, mid, &is_rev));
|
||||
far_beg = bns->anns[*rid].offset;
|
||||
far_end = far_beg + bns->anns[*rid].len;
|
||||
if (is_rev) { // flip to the reverse strand
|
||||
int64_t tmp = far_beg;
|
||||
far_beg = (bns->l_pac<<1) - 1 - far_end;
|
||||
far_end = (bns->l_pac<<1) - 1 - tmp;
|
||||
}
|
||||
*beg = *beg > far_beg? *beg : far_beg;
|
||||
*end = *end < far_end? *end : far_end;
|
||||
seq = bns_get_seq(bns->l_pac, pac, *beg, *end, &len);
|
||||
assert(seq && *end - *beg == len); // assertion failure should never happen
|
||||
return seq;
|
||||
}
|
||||
|
|
|
|||
3
bntseq.h
3
bntseq.h
|
|
@ -28,6 +28,7 @@
|
|||
#ifndef BWT_BNTSEQ_H
|
||||
#define BWT_BNTSEQ_H
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <zlib.h>
|
||||
|
|
@ -75,6 +76,8 @@ extern "C" {
|
|||
int bns_pos2rid(const bntseq_t *bns, int64_t pos_f);
|
||||
int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id);
|
||||
uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len);
|
||||
uint8_t *bns_fetch_seq(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, int64_t mid, int64_t *end, int *rid);
|
||||
int bns_intv2rid(const bntseq_t *bns, int64_t rb, int64_t re);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
|||
51
bwa.c
51
bwa.c
|
|
@ -176,57 +176,6 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa
|
|||
return bwa_gen_cigar2(mat, q, r, q, r, w_, l_pac, pac, l_query, query, rb, re, score, n_cigar, NM);
|
||||
}
|
||||
|
||||
int bwa_fix_xref2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re)
|
||||
{
|
||||
int is_rev, ori_ql = *qe - *qb;
|
||||
int64_t cb, ce, fm, ori_rl = *re - *rb;
|
||||
bntann1_t *ra;
|
||||
assert(ori_ql > 0 && ori_rl > 0);
|
||||
if (*rb < bns->l_pac && *re > bns->l_pac) { // cross the for-rev boundary; actually with BWA-MEM, we should never come to here
|
||||
*qb = *qe = *rb = *re = -1;
|
||||
return -1; // unable to fix
|
||||
}
|
||||
fm = bns_depos(bns, (*rb + *re) >> 1, &is_rev); // coordinate of the middle point on the forward strand
|
||||
ra = &bns->anns[bns_pos2rid(bns, fm)]; // annotation of chr corresponding to the middle point
|
||||
cb = is_rev? (bns->l_pac<<1) - (ra->offset + ra->len) : ra->offset; // chr start on the mapping strand
|
||||
ce = cb + ra->len; // chr end
|
||||
if (cb > *rb || ce < *re) { // fix is needed
|
||||
int i, score, n_cigar, y, NM;
|
||||
uint32_t *cigar;
|
||||
int64_t x;
|
||||
cb = cb > *rb? cb : *rb;
|
||||
ce = ce < *re? ce : *re;
|
||||
cigar = bwa_gen_cigar2(mat, o_del, e_del, o_ins, e_ins, w, bns->l_pac, pac, *qe - *qb, query + *qb, *rb, *re, &score, &n_cigar, &NM);
|
||||
for (i = 0, x = *rb, y = *qb; i < n_cigar; ++i) {
|
||||
int op = cigar[i]&0xf, len = cigar[i]>>4;
|
||||
if (op == 0) {
|
||||
if (x <= cb && cb < x + len)
|
||||
*qb = y + (cb - x), *rb = cb;
|
||||
if (x < ce && ce <= x + len) {
|
||||
*qe = y + (ce - x), *re = ce;
|
||||
break;
|
||||
} else x += len, y += len;
|
||||
} else if (op == 1) {
|
||||
y += len;
|
||||
} else if (op == 2) {
|
||||
if (x <= cb && cb < x + len)
|
||||
*qb = y, *rb = x + len;
|
||||
if (x < ce && ce <= x + len) {
|
||||
*qe = y, *re = x;
|
||||
break;
|
||||
} else x += len;
|
||||
} else abort(); // should not be here
|
||||
}
|
||||
free(cigar);
|
||||
}
|
||||
return (*qe - *qb < .33 * ori_ql || *re - *rb < .33 * ori_rl)? -2 : 0;
|
||||
}
|
||||
|
||||
int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re)
|
||||
{
|
||||
return bwa_fix_xref2(mat, q, r, q, r, w, bns, pac, query, qb, qe, rb, re);
|
||||
}
|
||||
|
||||
/*********************
|
||||
* Full index reader *
|
||||
*********************/
|
||||
|
|
|
|||
2
bwa.h
2
bwa.h
|
|
@ -33,8 +33,6 @@ extern "C" {
|
|||
void bwa_fill_scmat(int a, int b, int8_t mat[25]);
|
||||
uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM);
|
||||
uint32_t *bwa_gen_cigar2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM);
|
||||
int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re);
|
||||
int bwa_fix_xref2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re);
|
||||
|
||||
char *bwa_idx_infer_prefix(const char *hint);
|
||||
bwt_t *bwa_idx_load_bwt(const char *hint);
|
||||
|
|
|
|||
77
bwamem.c
77
bwamem.c
|
|
@ -143,11 +143,11 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co
|
|||
typedef struct {
|
||||
int64_t rbeg;
|
||||
int32_t qbeg, len;
|
||||
} mem_seed_t;
|
||||
} mem_seed_t; // unaligned memory
|
||||
|
||||
typedef struct {
|
||||
int n, m, first;
|
||||
uint32_t w:30, kept:2;
|
||||
int n, m, first, rid;
|
||||
int w, kept;
|
||||
int64_t pos;
|
||||
mem_seed_t *seeds;
|
||||
} mem_chain_t;
|
||||
|
|
@ -160,12 +160,13 @@ typedef struct { size_t n, m; mem_chain_t *a; } mem_chain_v;
|
|||
KBTREE_INIT(chn, mem_chain_t, chain_cmp)
|
||||
|
||||
// return 1 if the seed is merged into the chain
|
||||
static int test_and_merge(const mem_opt_t *opt, int64_t l_pac, mem_chain_t *c, const mem_seed_t *p)
|
||||
static int test_and_merge(const mem_opt_t *opt, int64_t l_pac, mem_chain_t *c, const mem_seed_t *p, int seed_rid)
|
||||
{
|
||||
int64_t qend, rend, x, y;
|
||||
const mem_seed_t *last = &c->seeds[c->n-1];
|
||||
qend = last->qbeg + last->len;
|
||||
rend = last->rbeg + last->len;
|
||||
if (seed_rid != c->rid) return 0; // different chr; request a new chain
|
||||
if (p->qbeg >= c->seeds[0].qbeg && p->qbeg + p->len <= qend && p->rbeg >= c->seeds[0].rbeg && p->rbeg + p->len <= rend)
|
||||
return 1; // contained seed; do nothing
|
||||
if ((last->rbeg < l_pac || c->seeds[0].rbeg < l_pac) && p->rbeg >= l_pac) return 0; // don't chain if on different strand
|
||||
|
|
@ -220,9 +221,10 @@ void mem_print_chain(const bntseq_t *bns, mem_chain_v *chn)
|
|||
}
|
||||
}
|
||||
|
||||
mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int64_t l_pac, int len, const uint8_t *seq)
|
||||
mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, int len, const uint8_t *seq)
|
||||
{
|
||||
int i;
|
||||
int64_t l_pac = bns->l_pac;
|
||||
mem_chain_v chain;
|
||||
kbtree_t(chn) *tree;
|
||||
smem_aux_t *aux;
|
||||
|
|
@ -241,19 +243,21 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int64_t l_pac, int
|
|||
for (k = 0; k < p->x[2]; ++k) {
|
||||
mem_chain_t tmp, *lower, *upper;
|
||||
mem_seed_t s;
|
||||
int to_add = 0;
|
||||
int rid, to_add = 0;
|
||||
s.rbeg = tmp.pos = bwt_sa(bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference
|
||||
s.qbeg = p->info>>32;
|
||||
s.len = slen;
|
||||
if (s.rbeg < l_pac && l_pac < s.rbeg + s.len) continue; // bridging forward-reverse boundary; skip
|
||||
rid = bns_intv2rid(bns, s.rbeg, s.rbeg + s.len);
|
||||
if (rid < 0) continue; // bridging multiple reference sequences or the forward-reverse boundary
|
||||
if (kb_size(tree)) {
|
||||
kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain
|
||||
if (!lower || !test_and_merge(opt, l_pac, lower, &s)) to_add = 1;
|
||||
if (!lower || !test_and_merge(opt, l_pac, lower, &s, rid)) to_add = 1;
|
||||
} else to_add = 1;
|
||||
if (to_add) { // add the seed as a new chain
|
||||
tmp.n = 1; tmp.m = 4;
|
||||
tmp.seeds = calloc(tmp.m, sizeof(mem_seed_t));
|
||||
tmp.seeds[0] = s;
|
||||
tmp.rid = rid;
|
||||
kb_putp(chn, tree, &tmp);
|
||||
}
|
||||
}
|
||||
|
|
@ -473,11 +477,11 @@ void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t i
|
|||
* low-divergence sequences, more testing is needed. For now, I only recommend
|
||||
* to use mem_test_chain_sw() for PacBio data. It is disabled by default.
|
||||
*/
|
||||
int mem_test_chain_sw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c)
|
||||
int mem_test_chain_sw(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c)
|
||||
{
|
||||
int i, qb, qe;
|
||||
int i, qb, qe, rid;
|
||||
int min_HSP_score = (int)(opt->min_chain_weight * opt->a * MEM_HSP_COEF + .499);
|
||||
int64_t rb, re, rlen;
|
||||
int64_t rb, re, l_pac = bns->l_pac;
|
||||
uint8_t *rseq = 0;
|
||||
kswr_t x;
|
||||
|
||||
|
|
@ -505,8 +509,8 @@ int mem_test_chain_sw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, i
|
|||
if (qe - qb >= opt->w * 4 || re - rb >= opt->w * 4) return 1;
|
||||
if (qe - qb >= MEM_SHORT_LEN || re - rb >= MEM_SHORT_LEN) return 1;
|
||||
|
||||
rseq = bns_get_seq(l_pac, pac, rb, re, &rlen);
|
||||
assert(rlen == re - rb);
|
||||
rseq = bns_fetch_seq(bns, pac, &rb, c->seeds[0].rbeg, &re, &rid);
|
||||
assert(c->rid == rid);
|
||||
x = ksw_align2(qe - qb, (uint8_t*)query + qb, re - rb, rseq, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, KSW_XSTART, 0);
|
||||
free(rseq);
|
||||
if (x.score >= min_HSP_score) return 1;
|
||||
|
|
@ -514,10 +518,10 @@ int mem_test_chain_sw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, i
|
|||
return 0;
|
||||
}
|
||||
|
||||
int mem_chain2aln_short(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av)
|
||||
int mem_chain2aln_short(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av)
|
||||
{
|
||||
int i, qb, qe, xtra;
|
||||
int64_t rb, re, rlen;
|
||||
int i, qb, qe, xtra, rid;
|
||||
int64_t rb, re, l_pac = bns->l_pac;
|
||||
uint8_t *rseq = 0;
|
||||
mem_alnreg_t a;
|
||||
kswr_t x;
|
||||
|
|
@ -547,8 +551,8 @@ int mem_chain2aln_short(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac,
|
|||
if (qe - qb >= opt->w * 4 || re - rb >= opt->w * 4) return 1;
|
||||
if (qe - qb >= MEM_SHORT_LEN || re - rb >= MEM_SHORT_LEN) return 1;
|
||||
|
||||
rseq = bns_get_seq(l_pac, pac, rb, re, &rlen);
|
||||
assert(rlen == re - rb);
|
||||
rseq = bns_fetch_seq(bns, pac, &rb, c->seeds[0].rbeg, &re, &rid);
|
||||
assert(c->rid == rid);
|
||||
xtra = KSW_XSUBO | KSW_XSTART | ((qe - qb) * opt->a < 250? KSW_XBYTE : 0) | (opt->min_seed_len * opt->a);
|
||||
x = ksw_align2(qe - qb, (uint8_t*)query + qb, re - rb, rseq, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, xtra, 0);
|
||||
free(rseq);
|
||||
|
|
@ -556,6 +560,7 @@ int mem_chain2aln_short(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac,
|
|||
a.qb = qb + x.qb; a.qe = qb + x.qe + 1;
|
||||
a.score = x.score;
|
||||
a.csub = x.score2;
|
||||
a.rid = c->rid;
|
||||
if (bwa_verbose >= 4) printf("** Attempted alignment via mem_chain2aln_short(): [%d,%d) <=> [%ld,%ld); score=%d; %d/%d\n", a.qb, a.qe, (long)a.rb, (long)a.re, x.score, a.qe-a.qb, qe-qb);
|
||||
if (x.tb < MEM_SHORT_EXT>>1 || x.te > re - rb - (MEM_SHORT_EXT>>1)) return 1;
|
||||
kv_push(mem_alnreg_t, *av, a);
|
||||
|
|
@ -571,10 +576,10 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen)
|
|||
return l < opt->w<<1? l : opt->w<<1;
|
||||
}
|
||||
|
||||
void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av)
|
||||
void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av)
|
||||
{
|
||||
int i, k, max_off[2], aw[2]; // aw: actual bandwidth used in extension
|
||||
int64_t rlen, rmax[2], tmp, max = 0;
|
||||
int i, k, rid, max_off[2], aw[2]; // aw: actual bandwidth used in extension
|
||||
int64_t l_pac = bns->l_pac, rmax[2], tmp, max = 0;
|
||||
const mem_seed_t *s;
|
||||
uint8_t *rseq = 0;
|
||||
uint64_t *srt;
|
||||
|
|
@ -598,8 +603,8 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int
|
|||
else rmax[0] = l_pac;
|
||||
}
|
||||
// retrieve the reference sequence
|
||||
rseq = bns_get_seq(l_pac, pac, rmax[0], rmax[1], &rlen);
|
||||
assert(rlen == rmax[1] - rmax[0]);
|
||||
rseq = bns_fetch_seq(bns, pac, &rmax[0], c->seeds[0].rbeg, &rmax[1], &rid);
|
||||
assert(c->rid == rid);
|
||||
|
||||
srt = malloc(c->n * 8);
|
||||
for (i = 0; i < c->n; ++i)
|
||||
|
|
@ -649,6 +654,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int
|
|||
memset(a, 0, sizeof(mem_alnreg_t));
|
||||
a->w = aw[0] = aw[1] = opt->w;
|
||||
a->score = a->truesc = -1;
|
||||
a->rid = c->rid;
|
||||
|
||||
if (bwa_verbose >= 4) err_printf("** ---> Extending from seed(%d) [%ld;%ld,%ld] <---\n", k, (long)s->len, (long)s->qbeg, (long)s->rbeg);
|
||||
if (s->qbeg) { // left extension
|
||||
|
|
@ -939,7 +945,7 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse
|
|||
for (i = 0; i < l_seq; ++i) // convert to 2-bit encoding if we have not done so
|
||||
seq[i] = seq[i] < 4? seq[i] : nst_nt4_table[(int)seq[i]];
|
||||
|
||||
chn = mem_chain(opt, bwt, bns->l_pac, l_seq, (uint8_t*)seq);
|
||||
chn = mem_chain(opt, bwt, bns, l_seq, (uint8_t*)seq);
|
||||
chn.n = mem_chain_flt(opt, chn.n, chn.a);
|
||||
if (bwa_verbose >= 4) mem_print_chain(bns, &chn);
|
||||
|
||||
|
|
@ -948,9 +954,9 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse
|
|||
mem_chain_t *p = &chn.a[i];
|
||||
int ret;
|
||||
if (bwa_verbose >= 4) err_printf("* ---> Processing chain(%d) <---\n", i);
|
||||
if (opt->min_chain_weight > 0) ret = mem_test_chain_sw(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p);
|
||||
else ret = mem_chain2aln_short(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, ®s);
|
||||
if (ret > 0) mem_chain2aln(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, ®s);
|
||||
if (opt->min_chain_weight > 0) ret = mem_test_chain_sw(opt, bns, pac, l_seq, (uint8_t*)seq, p);
|
||||
else ret = mem_chain2aln_short(opt, bns, pac, l_seq, (uint8_t*)seq, p, ®s);
|
||||
if (ret > 0) mem_chain2aln(opt, bns, pac, l_seq, (uint8_t*)seq, p, ®s);
|
||||
free(chn.a[i].seeds);
|
||||
}
|
||||
free(chn.a);
|
||||
|
|
@ -986,11 +992,6 @@ mem_aln_t mem_reg2aln2(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t
|
|||
query[i] = query_[i] < 5? query_[i] : nst_nt4_table[(int)query_[i]];
|
||||
a.mapq = ar->secondary < 0? mem_approx_mapq_se(opt, ar) : 0;
|
||||
if (ar->secondary >= 0) a.flag |= 0x100; // secondary alignment
|
||||
if (bwa_fix_xref2(opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re) < 0) {
|
||||
if (bwa_verbose >= 2 && name)
|
||||
fprintf(stderr, "[W::%s] A cross-chr hit of read '%s' has been dropped.\n", __func__, name);
|
||||
goto err_reg2aln;
|
||||
}
|
||||
tmp = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->o_del, opt->e_del);
|
||||
w2 = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->o_ins, opt->e_ins);
|
||||
w2 = w2 > tmp? w2 : tmp;
|
||||
|
|
@ -1005,11 +1006,6 @@ mem_aln_t mem_reg2aln2(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t
|
|||
last_sc = score;
|
||||
w2 <<= 1;
|
||||
} while (++i < 3 && score < ar->truesc - opt->a);
|
||||
if (score < 0) {
|
||||
if (bwa_verbose >= 2 && name)
|
||||
fprintf(stderr, "[W::%s] A hit to read '%s' has been dropped.\n", __func__, name);
|
||||
goto err_reg2aln;
|
||||
}
|
||||
l_MD = strlen((char*)(a.cigar + a.n_cigar)) + 1;
|
||||
a.NM = NM;
|
||||
pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev);
|
||||
|
|
@ -1044,13 +1040,6 @@ mem_aln_t mem_reg2aln2(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t
|
|||
a.score = ar->score; a.sub = ar->sub > ar->csub? ar->sub : ar->csub;
|
||||
free(query);
|
||||
return a;
|
||||
|
||||
err_reg2aln:
|
||||
free(a.cigar);
|
||||
memset(&a, 0, sizeof(mem_aln_t));
|
||||
a.rid = -1; a.pos = -1; a.flag |= 0x4;
|
||||
free(query);
|
||||
return a;
|
||||
}
|
||||
|
||||
mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar)
|
||||
|
|
|
|||
1
bwamem.h
1
bwamem.h
|
|
@ -52,6 +52,7 @@ typedef struct {
|
|||
typedef struct {
|
||||
int64_t rb, re; // [rb,re): reference sequence in the alignment
|
||||
int qb, qe; // [qb,qe): query sequence in the alignment
|
||||
int rid; // reference seq ID
|
||||
int score; // best local SW score
|
||||
int truesc; // actual score corresponding to the aligned region; possibly smaller than $score
|
||||
int sub; // 2nd best SW score
|
||||
|
|
|
|||
|
|
@ -88,11 +88,6 @@ void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac,
|
|||
const mem_alnreg_t *p = &a->a[i];
|
||||
int is_rev, rid, qb = p->qb, qe = p->qe;
|
||||
int64_t pos, rb = p->rb, re = p->re;
|
||||
if (bwa_fix_xref2(opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->w, bns, pac, (uint8_t*)s->seq, &qb, &qe, &rb, &re) < 0) {
|
||||
if (bwa_verbose >= 2)
|
||||
fprintf(stderr, "[W::%s] A cross-chr hit of read '%s' has been dropped.\n", __func__, s->name);
|
||||
continue;
|
||||
}
|
||||
pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev);
|
||||
rid = bns_pos2rid(bns, pos);
|
||||
pos -= bns->anns[rid].offset;
|
||||
|
|
|
|||
|
|
@ -106,10 +106,11 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *
|
|||
}
|
||||
}
|
||||
|
||||
int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], const mem_alnreg_t *a, int l_ms, const uint8_t *ms, mem_alnreg_v *ma)
|
||||
int mem_matesw(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], const mem_alnreg_t *a, int l_ms, const uint8_t *ms, mem_alnreg_v *ma)
|
||||
{
|
||||
extern int mem_sort_and_dedup(int n, mem_alnreg_t *a, float mask_level_redun);
|
||||
int i, r, skip[4], n = 0;
|
||||
int64_t l_pac = bns->l_pac;
|
||||
int i, r, skip[4], n = 0, rid;
|
||||
for (r = 0; r < 4; ++r)
|
||||
skip[r] = pes[r].failed? 1 : 0;
|
||||
for (i = 0; i < ma->n; ++i) { // check which orinentation has been found
|
||||
|
|
@ -122,7 +123,7 @@ int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const me
|
|||
for (r = 0; r < 4; ++r) {
|
||||
int is_rev, is_larger;
|
||||
uint8_t *seq, *rev = 0, *ref;
|
||||
int64_t rb, re, len;
|
||||
int64_t rb, re;
|
||||
if (skip[r]) continue;
|
||||
is_rev = (r>>1 != (r&1)); // whether to reverse complement the mate
|
||||
is_larger = !(r>>1); // whether the mate has larger coordinate
|
||||
|
|
@ -140,14 +141,15 @@ int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const me
|
|||
}
|
||||
if (rb < 0) rb = 0;
|
||||
if (re > l_pac<<1) re = l_pac<<1;
|
||||
ref = bns_get_seq(l_pac, pac, rb, re, &len);
|
||||
if (len == re - rb) { // no funny things happening
|
||||
ref = bns_fetch_seq(bns, pac, &rb, (rb+re)>>1, &re, &rid);
|
||||
if (a->rid == rid) { // no funny things happening
|
||||
kswr_t aln;
|
||||
mem_alnreg_t b;
|
||||
int tmp, xtra = KSW_XSUBO | KSW_XSTART | (l_ms * opt->a < 250? KSW_XBYTE : 0) | (opt->min_seed_len * opt->a);
|
||||
aln = ksw_align2(l_ms, seq, len, ref, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, xtra, 0);
|
||||
aln = ksw_align2(l_ms, seq, re - rb, ref, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, xtra, 0);
|
||||
memset(&b, 0, sizeof(mem_alnreg_t));
|
||||
if (aln.score >= opt->min_seed_len && aln.qb >= 0) { // something goes wrong if aln.qb < 0
|
||||
b.rid = a->rid;
|
||||
b.qb = is_rev? l_ms - (aln.qe + 1) : aln.qb;
|
||||
b.qe = is_rev? l_ms - aln.qb : aln.qe + 1;
|
||||
b.rb = is_rev? (l_pac<<1) - (rb + aln.te + 1) : rb + aln.tb;
|
||||
|
|
@ -258,7 +260,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
|
|||
kv_push(mem_alnreg_t, b[i], a[i].a[j]);
|
||||
for (i = 0; i < 2; ++i)
|
||||
for (j = 0; j < b[i].n && j < opt->max_matesw; ++j)
|
||||
n += mem_matesw(opt, bns->l_pac, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]);
|
||||
n += mem_matesw(opt, bns, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]);
|
||||
free(b[0].a); free(b[1].a);
|
||||
}
|
||||
mem_mark_primary_se(opt, a[0].n, a[0].a, id<<1|0);
|
||||
|
|
|
|||
Loading…
Reference in New Issue