Merge branch 'dev' into layout

Conflicts:
	main.c
This commit is contained in:
Heng Li 2014-04-10 21:07:16 -04:00
commit 7d25fe2de3
9 changed files with 81 additions and 110 deletions

View File

@ -329,6 +329,15 @@ int bns_pos2rid(const bntseq_t *bns, int64_t pos_f)
return mid; return mid;
} }
int bns_intv2rid(const bntseq_t *bns, int64_t rb, int64_t re)
{
int is_rev, rid_b, rid_e;
if (rb < bns->l_pac && re > bns->l_pac) return -2;
rid_b = bns_pos2rid(bns, bns_depos(bns, rb, &is_rev));
rid_e = bns_pos2rid(bns, bns_depos(bns, re, &is_rev) - 1);
return rid_b == rid_e? rid_b : -1;
}
int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id) int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id)
{ {
int left, mid, right, nn; int left, mid, right, nn;
@ -374,3 +383,26 @@ uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end
} else *len = 0; // if bridging the forward-reverse boundary, return nothing } else *len = 0; // if bridging the forward-reverse boundary, return nothing
return seq; return seq;
} }
uint8_t *bns_fetch_seq(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, int64_t mid, int64_t *end, int *rid)
{
int64_t far_beg, far_end, len;
int is_rev;
uint8_t *seq;
if (*end < *beg) *end ^= *beg, *beg ^= *end, *end ^= *beg; // if end is smaller, swap
assert(*beg <= mid && mid < *end);
*rid = bns_pos2rid(bns, bns_depos(bns, mid, &is_rev));
far_beg = bns->anns[*rid].offset;
far_end = far_beg + bns->anns[*rid].len;
if (is_rev) { // flip to the reverse strand
int64_t tmp = far_beg;
far_beg = (bns->l_pac<<1) - 1 - far_end;
far_end = (bns->l_pac<<1) - 1 - tmp;
}
*beg = *beg > far_beg? *beg : far_beg;
*end = *end < far_end? *end : far_end;
seq = bns_get_seq(bns->l_pac, pac, *beg, *end, &len);
assert(seq && *end - *beg == len); // assertion failure should never happen
return seq;
}

View File

@ -28,6 +28,7 @@
#ifndef BWT_BNTSEQ_H #ifndef BWT_BNTSEQ_H
#define BWT_BNTSEQ_H #define BWT_BNTSEQ_H
#include <assert.h>
#include <stdint.h> #include <stdint.h>
#include <stdio.h> #include <stdio.h>
#include <zlib.h> #include <zlib.h>
@ -75,6 +76,8 @@ extern "C" {
int bns_pos2rid(const bntseq_t *bns, int64_t pos_f); int bns_pos2rid(const bntseq_t *bns, int64_t pos_f);
int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id); int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id);
uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len); uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len);
uint8_t *bns_fetch_seq(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, int64_t mid, int64_t *end, int *rid);
int bns_intv2rid(const bntseq_t *bns, int64_t rb, int64_t re);
#ifdef __cplusplus #ifdef __cplusplus
} }

51
bwa.c
View File

@ -176,57 +176,6 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa
return bwa_gen_cigar2(mat, q, r, q, r, w_, l_pac, pac, l_query, query, rb, re, score, n_cigar, NM); return bwa_gen_cigar2(mat, q, r, q, r, w_, l_pac, pac, l_query, query, rb, re, score, n_cigar, NM);
} }
int bwa_fix_xref2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re)
{
int is_rev, ori_ql = *qe - *qb;
int64_t cb, ce, fm, ori_rl = *re - *rb;
bntann1_t *ra;
assert(ori_ql > 0 && ori_rl > 0);
if (*rb < bns->l_pac && *re > bns->l_pac) { // cross the for-rev boundary; actually with BWA-MEM, we should never come to here
*qb = *qe = *rb = *re = -1;
return -1; // unable to fix
}
fm = bns_depos(bns, (*rb + *re) >> 1, &is_rev); // coordinate of the middle point on the forward strand
ra = &bns->anns[bns_pos2rid(bns, fm)]; // annotation of chr corresponding to the middle point
cb = is_rev? (bns->l_pac<<1) - (ra->offset + ra->len) : ra->offset; // chr start on the mapping strand
ce = cb + ra->len; // chr end
if (cb > *rb || ce < *re) { // fix is needed
int i, score, n_cigar, y, NM;
uint32_t *cigar;
int64_t x;
cb = cb > *rb? cb : *rb;
ce = ce < *re? ce : *re;
cigar = bwa_gen_cigar2(mat, o_del, e_del, o_ins, e_ins, w, bns->l_pac, pac, *qe - *qb, query + *qb, *rb, *re, &score, &n_cigar, &NM);
for (i = 0, x = *rb, y = *qb; i < n_cigar; ++i) {
int op = cigar[i]&0xf, len = cigar[i]>>4;
if (op == 0) {
if (x <= cb && cb < x + len)
*qb = y + (cb - x), *rb = cb;
if (x < ce && ce <= x + len) {
*qe = y + (ce - x), *re = ce;
break;
} else x += len, y += len;
} else if (op == 1) {
y += len;
} else if (op == 2) {
if (x <= cb && cb < x + len)
*qb = y, *rb = x + len;
if (x < ce && ce <= x + len) {
*qe = y, *re = x;
break;
} else x += len;
} else abort(); // should not be here
}
free(cigar);
}
return (*qe - *qb < .33 * ori_ql || *re - *rb < .33 * ori_rl)? -2 : 0;
}
int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re)
{
return bwa_fix_xref2(mat, q, r, q, r, w, bns, pac, query, qb, qe, rb, re);
}
/********************* /*********************
* Full index reader * * Full index reader *
*********************/ *********************/

2
bwa.h
View File

@ -33,8 +33,6 @@ extern "C" {
void bwa_fill_scmat(int a, int b, int8_t mat[25]); void bwa_fill_scmat(int a, int b, int8_t mat[25]);
uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM); uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM);
uint32_t *bwa_gen_cigar2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM); uint32_t *bwa_gen_cigar2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM);
int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re);
int bwa_fix_xref2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re);
char *bwa_idx_infer_prefix(const char *hint); char *bwa_idx_infer_prefix(const char *hint);
bwt_t *bwa_idx_load_bwt(const char *hint); bwt_t *bwa_idx_load_bwt(const char *hint);

View File

@ -143,11 +143,11 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co
typedef struct { typedef struct {
int64_t rbeg; int64_t rbeg;
int32_t qbeg, len; int32_t qbeg, len;
} mem_seed_t; } mem_seed_t; // unaligned memory
typedef struct { typedef struct {
int n, m, first; int n, m, first, rid;
uint32_t w:30, kept:2; int w, kept;
int64_t pos; int64_t pos;
mem_seed_t *seeds; mem_seed_t *seeds;
} mem_chain_t; } mem_chain_t;
@ -160,12 +160,13 @@ typedef struct { size_t n, m; mem_chain_t *a; } mem_chain_v;
KBTREE_INIT(chn, mem_chain_t, chain_cmp) KBTREE_INIT(chn, mem_chain_t, chain_cmp)
// return 1 if the seed is merged into the chain // return 1 if the seed is merged into the chain
static int test_and_merge(const mem_opt_t *opt, int64_t l_pac, mem_chain_t *c, const mem_seed_t *p) static int test_and_merge(const mem_opt_t *opt, int64_t l_pac, mem_chain_t *c, const mem_seed_t *p, int seed_rid)
{ {
int64_t qend, rend, x, y; int64_t qend, rend, x, y;
const mem_seed_t *last = &c->seeds[c->n-1]; const mem_seed_t *last = &c->seeds[c->n-1];
qend = last->qbeg + last->len; qend = last->qbeg + last->len;
rend = last->rbeg + last->len; rend = last->rbeg + last->len;
if (seed_rid != c->rid) return 0; // different chr; request a new chain
if (p->qbeg >= c->seeds[0].qbeg && p->qbeg + p->len <= qend && p->rbeg >= c->seeds[0].rbeg && p->rbeg + p->len <= rend) if (p->qbeg >= c->seeds[0].qbeg && p->qbeg + p->len <= qend && p->rbeg >= c->seeds[0].rbeg && p->rbeg + p->len <= rend)
return 1; // contained seed; do nothing return 1; // contained seed; do nothing
if ((last->rbeg < l_pac || c->seeds[0].rbeg < l_pac) && p->rbeg >= l_pac) return 0; // don't chain if on different strand if ((last->rbeg < l_pac || c->seeds[0].rbeg < l_pac) && p->rbeg >= l_pac) return 0; // don't chain if on different strand
@ -220,9 +221,10 @@ void mem_print_chain(const bntseq_t *bns, mem_chain_v *chn)
} }
} }
mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int64_t l_pac, int len, const uint8_t *seq) mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, int len, const uint8_t *seq)
{ {
int i; int i;
int64_t l_pac = bns->l_pac;
mem_chain_v chain; mem_chain_v chain;
kbtree_t(chn) *tree; kbtree_t(chn) *tree;
smem_aux_t *aux; smem_aux_t *aux;
@ -241,19 +243,21 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int64_t l_pac, int
for (k = 0; k < p->x[2]; ++k) { for (k = 0; k < p->x[2]; ++k) {
mem_chain_t tmp, *lower, *upper; mem_chain_t tmp, *lower, *upper;
mem_seed_t s; mem_seed_t s;
int to_add = 0; int rid, to_add = 0;
s.rbeg = tmp.pos = bwt_sa(bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference s.rbeg = tmp.pos = bwt_sa(bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference
s.qbeg = p->info>>32; s.qbeg = p->info>>32;
s.len = slen; s.len = slen;
if (s.rbeg < l_pac && l_pac < s.rbeg + s.len) continue; // bridging forward-reverse boundary; skip rid = bns_intv2rid(bns, s.rbeg, s.rbeg + s.len);
if (rid < 0) continue; // bridging multiple reference sequences or the forward-reverse boundary
if (kb_size(tree)) { if (kb_size(tree)) {
kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain
if (!lower || !test_and_merge(opt, l_pac, lower, &s)) to_add = 1; if (!lower || !test_and_merge(opt, l_pac, lower, &s, rid)) to_add = 1;
} else to_add = 1; } else to_add = 1;
if (to_add) { // add the seed as a new chain if (to_add) { // add the seed as a new chain
tmp.n = 1; tmp.m = 4; tmp.n = 1; tmp.m = 4;
tmp.seeds = calloc(tmp.m, sizeof(mem_seed_t)); tmp.seeds = calloc(tmp.m, sizeof(mem_seed_t));
tmp.seeds[0] = s; tmp.seeds[0] = s;
tmp.rid = rid;
kb_putp(chn, tree, &tmp); kb_putp(chn, tree, &tmp);
} }
} }
@ -473,11 +477,11 @@ void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t i
* low-divergence sequences, more testing is needed. For now, I only recommend * low-divergence sequences, more testing is needed. For now, I only recommend
* to use mem_test_chain_sw() for PacBio data. It is disabled by default. * to use mem_test_chain_sw() for PacBio data. It is disabled by default.
*/ */
int mem_test_chain_sw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c) int mem_test_chain_sw(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c)
{ {
int i, qb, qe; int i, qb, qe, rid;
int min_HSP_score = (int)(opt->min_chain_weight * opt->a * MEM_HSP_COEF + .499); int min_HSP_score = (int)(opt->min_chain_weight * opt->a * MEM_HSP_COEF + .499);
int64_t rb, re, rlen; int64_t rb, re, l_pac = bns->l_pac;
uint8_t *rseq = 0; uint8_t *rseq = 0;
kswr_t x; kswr_t x;
@ -505,8 +509,8 @@ int mem_test_chain_sw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, i
if (qe - qb >= opt->w * 4 || re - rb >= opt->w * 4) return 1; if (qe - qb >= opt->w * 4 || re - rb >= opt->w * 4) return 1;
if (qe - qb >= MEM_SHORT_LEN || re - rb >= MEM_SHORT_LEN) return 1; if (qe - qb >= MEM_SHORT_LEN || re - rb >= MEM_SHORT_LEN) return 1;
rseq = bns_get_seq(l_pac, pac, rb, re, &rlen); rseq = bns_fetch_seq(bns, pac, &rb, c->seeds[0].rbeg, &re, &rid);
assert(rlen == re - rb); assert(c->rid == rid);
x = ksw_align2(qe - qb, (uint8_t*)query + qb, re - rb, rseq, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, KSW_XSTART, 0); x = ksw_align2(qe - qb, (uint8_t*)query + qb, re - rb, rseq, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, KSW_XSTART, 0);
free(rseq); free(rseq);
if (x.score >= min_HSP_score) return 1; if (x.score >= min_HSP_score) return 1;
@ -514,10 +518,10 @@ int mem_test_chain_sw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, i
return 0; return 0;
} }
int mem_chain2aln_short(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av) int mem_chain2aln_short(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av)
{ {
int i, qb, qe, xtra; int i, qb, qe, xtra, rid;
int64_t rb, re, rlen; int64_t rb, re, l_pac = bns->l_pac;
uint8_t *rseq = 0; uint8_t *rseq = 0;
mem_alnreg_t a; mem_alnreg_t a;
kswr_t x; kswr_t x;
@ -547,8 +551,8 @@ int mem_chain2aln_short(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac,
if (qe - qb >= opt->w * 4 || re - rb >= opt->w * 4) return 1; if (qe - qb >= opt->w * 4 || re - rb >= opt->w * 4) return 1;
if (qe - qb >= MEM_SHORT_LEN || re - rb >= MEM_SHORT_LEN) return 1; if (qe - qb >= MEM_SHORT_LEN || re - rb >= MEM_SHORT_LEN) return 1;
rseq = bns_get_seq(l_pac, pac, rb, re, &rlen); rseq = bns_fetch_seq(bns, pac, &rb, c->seeds[0].rbeg, &re, &rid);
assert(rlen == re - rb); assert(c->rid == rid);
xtra = KSW_XSUBO | KSW_XSTART | ((qe - qb) * opt->a < 250? KSW_XBYTE : 0) | (opt->min_seed_len * opt->a); xtra = KSW_XSUBO | KSW_XSTART | ((qe - qb) * opt->a < 250? KSW_XBYTE : 0) | (opt->min_seed_len * opt->a);
x = ksw_align2(qe - qb, (uint8_t*)query + qb, re - rb, rseq, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, xtra, 0); x = ksw_align2(qe - qb, (uint8_t*)query + qb, re - rb, rseq, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, xtra, 0);
free(rseq); free(rseq);
@ -556,6 +560,7 @@ int mem_chain2aln_short(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac,
a.qb = qb + x.qb; a.qe = qb + x.qe + 1; a.qb = qb + x.qb; a.qe = qb + x.qe + 1;
a.score = x.score; a.score = x.score;
a.csub = x.score2; a.csub = x.score2;
a.rid = c->rid;
if (bwa_verbose >= 4) printf("** Attempted alignment via mem_chain2aln_short(): [%d,%d) <=> [%ld,%ld); score=%d; %d/%d\n", a.qb, a.qe, (long)a.rb, (long)a.re, x.score, a.qe-a.qb, qe-qb); if (bwa_verbose >= 4) printf("** Attempted alignment via mem_chain2aln_short(): [%d,%d) <=> [%ld,%ld); score=%d; %d/%d\n", a.qb, a.qe, (long)a.rb, (long)a.re, x.score, a.qe-a.qb, qe-qb);
if (x.tb < MEM_SHORT_EXT>>1 || x.te > re - rb - (MEM_SHORT_EXT>>1)) return 1; if (x.tb < MEM_SHORT_EXT>>1 || x.te > re - rb - (MEM_SHORT_EXT>>1)) return 1;
kv_push(mem_alnreg_t, *av, a); kv_push(mem_alnreg_t, *av, a);
@ -571,10 +576,10 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen)
return l < opt->w<<1? l : opt->w<<1; return l < opt->w<<1? l : opt->w<<1;
} }
void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av) void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av)
{ {
int i, k, max_off[2], aw[2]; // aw: actual bandwidth used in extension int i, k, rid, max_off[2], aw[2]; // aw: actual bandwidth used in extension
int64_t rlen, rmax[2], tmp, max = 0; int64_t l_pac = bns->l_pac, rmax[2], tmp, max = 0;
const mem_seed_t *s; const mem_seed_t *s;
uint8_t *rseq = 0; uint8_t *rseq = 0;
uint64_t *srt; uint64_t *srt;
@ -598,8 +603,8 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int
else rmax[0] = l_pac; else rmax[0] = l_pac;
} }
// retrieve the reference sequence // retrieve the reference sequence
rseq = bns_get_seq(l_pac, pac, rmax[0], rmax[1], &rlen); rseq = bns_fetch_seq(bns, pac, &rmax[0], c->seeds[0].rbeg, &rmax[1], &rid);
assert(rlen == rmax[1] - rmax[0]); assert(c->rid == rid);
srt = malloc(c->n * 8); srt = malloc(c->n * 8);
for (i = 0; i < c->n; ++i) for (i = 0; i < c->n; ++i)
@ -649,6 +654,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int
memset(a, 0, sizeof(mem_alnreg_t)); memset(a, 0, sizeof(mem_alnreg_t));
a->w = aw[0] = aw[1] = opt->w; a->w = aw[0] = aw[1] = opt->w;
a->score = a->truesc = -1; a->score = a->truesc = -1;
a->rid = c->rid;
if (bwa_verbose >= 4) err_printf("** ---> Extending from seed(%d) [%ld;%ld,%ld] <---\n", k, (long)s->len, (long)s->qbeg, (long)s->rbeg); if (bwa_verbose >= 4) err_printf("** ---> Extending from seed(%d) [%ld;%ld,%ld] <---\n", k, (long)s->len, (long)s->qbeg, (long)s->rbeg);
if (s->qbeg) { // left extension if (s->qbeg) { // left extension
@ -939,7 +945,7 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse
for (i = 0; i < l_seq; ++i) // convert to 2-bit encoding if we have not done so for (i = 0; i < l_seq; ++i) // convert to 2-bit encoding if we have not done so
seq[i] = seq[i] < 4? seq[i] : nst_nt4_table[(int)seq[i]]; seq[i] = seq[i] < 4? seq[i] : nst_nt4_table[(int)seq[i]];
chn = mem_chain(opt, bwt, bns->l_pac, l_seq, (uint8_t*)seq); chn = mem_chain(opt, bwt, bns, l_seq, (uint8_t*)seq);
chn.n = mem_chain_flt(opt, chn.n, chn.a); chn.n = mem_chain_flt(opt, chn.n, chn.a);
if (bwa_verbose >= 4) mem_print_chain(bns, &chn); if (bwa_verbose >= 4) mem_print_chain(bns, &chn);
@ -948,9 +954,9 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse
mem_chain_t *p = &chn.a[i]; mem_chain_t *p = &chn.a[i];
int ret; int ret;
if (bwa_verbose >= 4) err_printf("* ---> Processing chain(%d) <---\n", i); if (bwa_verbose >= 4) err_printf("* ---> Processing chain(%d) <---\n", i);
if (opt->min_chain_weight > 0) ret = mem_test_chain_sw(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p); if (opt->min_chain_weight > 0) ret = mem_test_chain_sw(opt, bns, pac, l_seq, (uint8_t*)seq, p);
else ret = mem_chain2aln_short(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, &regs); else ret = mem_chain2aln_short(opt, bns, pac, l_seq, (uint8_t*)seq, p, &regs);
if (ret > 0) mem_chain2aln(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, &regs); if (ret > 0) mem_chain2aln(opt, bns, pac, l_seq, (uint8_t*)seq, p, &regs);
free(chn.a[i].seeds); free(chn.a[i].seeds);
} }
free(chn.a); free(chn.a);
@ -986,11 +992,6 @@ mem_aln_t mem_reg2aln2(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t
query[i] = query_[i] < 5? query_[i] : nst_nt4_table[(int)query_[i]]; query[i] = query_[i] < 5? query_[i] : nst_nt4_table[(int)query_[i]];
a.mapq = ar->secondary < 0? mem_approx_mapq_se(opt, ar) : 0; a.mapq = ar->secondary < 0? mem_approx_mapq_se(opt, ar) : 0;
if (ar->secondary >= 0) a.flag |= 0x100; // secondary alignment if (ar->secondary >= 0) a.flag |= 0x100; // secondary alignment
if (bwa_fix_xref2(opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re) < 0) {
if (bwa_verbose >= 2 && name)
fprintf(stderr, "[W::%s] A cross-chr hit of read '%s' has been dropped.\n", __func__, name);
goto err_reg2aln;
}
tmp = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->o_del, opt->e_del); tmp = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->o_del, opt->e_del);
w2 = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->o_ins, opt->e_ins); w2 = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->o_ins, opt->e_ins);
w2 = w2 > tmp? w2 : tmp; w2 = w2 > tmp? w2 : tmp;
@ -1005,11 +1006,6 @@ mem_aln_t mem_reg2aln2(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t
last_sc = score; last_sc = score;
w2 <<= 1; w2 <<= 1;
} while (++i < 3 && score < ar->truesc - opt->a); } while (++i < 3 && score < ar->truesc - opt->a);
if (score < 0) {
if (bwa_verbose >= 2 && name)
fprintf(stderr, "[W::%s] A hit to read '%s' has been dropped.\n", __func__, name);
goto err_reg2aln;
}
l_MD = strlen((char*)(a.cigar + a.n_cigar)) + 1; l_MD = strlen((char*)(a.cigar + a.n_cigar)) + 1;
a.NM = NM; a.NM = NM;
pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev); pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev);
@ -1040,17 +1036,11 @@ mem_aln_t mem_reg2aln2(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t
} }
} }
a.rid = bns_pos2rid(bns, pos); a.rid = bns_pos2rid(bns, pos);
assert(a.rid == ar->rid);
a.pos = pos - bns->anns[a.rid].offset; a.pos = pos - bns->anns[a.rid].offset;
a.score = ar->score; a.sub = ar->sub > ar->csub? ar->sub : ar->csub; a.score = ar->score; a.sub = ar->sub > ar->csub? ar->sub : ar->csub;
free(query); free(query);
return a; return a;
err_reg2aln:
free(a.cigar);
memset(&a, 0, sizeof(mem_aln_t));
a.rid = -1; a.pos = -1; a.flag |= 0x4;
free(query);
return a;
} }
mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar) mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar)

View File

@ -52,6 +52,7 @@ typedef struct {
typedef struct { typedef struct {
int64_t rb, re; // [rb,re): reference sequence in the alignment int64_t rb, re; // [rb,re): reference sequence in the alignment
int qb, qe; // [qb,qe): query sequence in the alignment int qb, qe; // [qb,qe): query sequence in the alignment
int rid; // reference seq ID
int score; // best local SW score int score; // best local SW score
int truesc; // actual score corresponding to the aligned region; possibly smaller than $score int truesc; // actual score corresponding to the aligned region; possibly smaller than $score
int sub; // 2nd best SW score int sub; // 2nd best SW score

View File

@ -88,13 +88,9 @@ void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac,
const mem_alnreg_t *p = &a->a[i]; const mem_alnreg_t *p = &a->a[i];
int is_rev, rid, qb = p->qb, qe = p->qe; int is_rev, rid, qb = p->qb, qe = p->qe;
int64_t pos, rb = p->rb, re = p->re; int64_t pos, rb = p->rb, re = p->re;
if (bwa_fix_xref2(opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->w, bns, pac, (uint8_t*)s->seq, &qb, &qe, &rb, &re) < 0) {
if (bwa_verbose >= 2)
fprintf(stderr, "[W::%s] A cross-chr hit of read '%s' has been dropped.\n", __func__, s->name);
continue;
}
pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev); pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev);
rid = bns_pos2rid(bns, pos); rid = bns_pos2rid(bns, pos);
assert(rid == p->rid);
pos -= bns->anns[rid].offset; pos -= bns->anns[rid].offset;
kputs(s->name, &str); kputc('\t', &str); kputs(s->name, &str); kputc('\t', &str);
kputw(s->l_seq, &str); kputc('\t', &str); kputw(s->l_seq, &str); kputc('\t', &str);

View File

@ -106,10 +106,11 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *
} }
} }
int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], const mem_alnreg_t *a, int l_ms, const uint8_t *ms, mem_alnreg_v *ma) int mem_matesw(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], const mem_alnreg_t *a, int l_ms, const uint8_t *ms, mem_alnreg_v *ma)
{ {
extern int mem_sort_and_dedup(int n, mem_alnreg_t *a, float mask_level_redun); extern int mem_sort_and_dedup(int n, mem_alnreg_t *a, float mask_level_redun);
int i, r, skip[4], n = 0; int64_t l_pac = bns->l_pac;
int i, r, skip[4], n = 0, rid;
for (r = 0; r < 4; ++r) for (r = 0; r < 4; ++r)
skip[r] = pes[r].failed? 1 : 0; skip[r] = pes[r].failed? 1 : 0;
for (i = 0; i < ma->n; ++i) { // check which orinentation has been found for (i = 0; i < ma->n; ++i) { // check which orinentation has been found
@ -122,7 +123,7 @@ int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const me
for (r = 0; r < 4; ++r) { for (r = 0; r < 4; ++r) {
int is_rev, is_larger; int is_rev, is_larger;
uint8_t *seq, *rev = 0, *ref; uint8_t *seq, *rev = 0, *ref;
int64_t rb, re, len; int64_t rb, re;
if (skip[r]) continue; if (skip[r]) continue;
is_rev = (r>>1 != (r&1)); // whether to reverse complement the mate is_rev = (r>>1 != (r&1)); // whether to reverse complement the mate
is_larger = !(r>>1); // whether the mate has larger coordinate is_larger = !(r>>1); // whether the mate has larger coordinate
@ -140,14 +141,15 @@ int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const me
} }
if (rb < 0) rb = 0; if (rb < 0) rb = 0;
if (re > l_pac<<1) re = l_pac<<1; if (re > l_pac<<1) re = l_pac<<1;
ref = bns_get_seq(l_pac, pac, rb, re, &len); ref = bns_fetch_seq(bns, pac, &rb, (rb+re)>>1, &re, &rid);
if (len == re - rb) { // no funny things happening if (a->rid == rid) { // no funny things happening
kswr_t aln; kswr_t aln;
mem_alnreg_t b; mem_alnreg_t b;
int tmp, xtra = KSW_XSUBO | KSW_XSTART | (l_ms * opt->a < 250? KSW_XBYTE : 0) | (opt->min_seed_len * opt->a); int tmp, xtra = KSW_XSUBO | KSW_XSTART | (l_ms * opt->a < 250? KSW_XBYTE : 0) | (opt->min_seed_len * opt->a);
aln = ksw_align2(l_ms, seq, len, ref, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, xtra, 0); aln = ksw_align2(l_ms, seq, re - rb, ref, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, xtra, 0);
memset(&b, 0, sizeof(mem_alnreg_t)); memset(&b, 0, sizeof(mem_alnreg_t));
if (aln.score >= opt->min_seed_len && aln.qb >= 0) { // something goes wrong if aln.qb < 0 if (aln.score >= opt->min_seed_len && aln.qb >= 0) { // something goes wrong if aln.qb < 0
b.rid = a->rid;
b.qb = is_rev? l_ms - (aln.qe + 1) : aln.qb; b.qb = is_rev? l_ms - (aln.qe + 1) : aln.qb;
b.qe = is_rev? l_ms - aln.qb : aln.qe + 1; b.qe = is_rev? l_ms - aln.qb : aln.qe + 1;
b.rb = is_rev? (l_pac<<1) - (rb + aln.te + 1) : rb + aln.tb; b.rb = is_rev? (l_pac<<1) - (rb + aln.te + 1) : rb + aln.tb;
@ -258,7 +260,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
kv_push(mem_alnreg_t, b[i], a[i].a[j]); kv_push(mem_alnreg_t, b[i], a[i].a[j]);
for (i = 0; i < 2; ++i) for (i = 0; i < 2; ++i)
for (j = 0; j < b[i].n && j < opt->max_matesw; ++j) for (j = 0; j < b[i].n && j < opt->max_matesw; ++j)
n += mem_matesw(opt, bns->l_pac, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]); n += mem_matesw(opt, bns, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]);
free(b[0].a); free(b[1].a); free(b[0].a); free(b[1].a);
} }
mem_mark_primary_se(opt, a[0].n, a[0].a, id<<1|0); mem_mark_primary_se(opt, a[0].n, a[0].a, id<<1|0);

2
main.c
View File

@ -4,7 +4,7 @@
#include "utils.h" #include "utils.h"
#ifndef PACKAGE_VERSION #ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "0.7.8+layout-r477" #define PACKAGE_VERSION "0.7.8+layout-r478"
#endif #endif
int bwa_fa2pac(int argc, char *argv[]); int bwa_fa2pac(int argc, char *argv[]);