Merge branch 'master' into master_fixes. Merged up to master r375.

Conflicts:
	bwt.c
This commit is contained in:
Rob Davies 2013-04-11 11:15:39 +01:00
commit 90ecd344ba
10 changed files with 63 additions and 27 deletions

View File

@ -537,7 +537,20 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int
w = max_gap < opt->w? max_gap : opt->w;
if (qd - rd < w && rd - qd < w) break;
}
if (i < av->n) continue;
if (i < av->n) { // the seed is (almost) contained in an existing alignment
for (i = k + 1; i < c->n; ++i) { // check overlapping seeds in the same chain
const mem_seed_t *t;
if (srt[i] == 0) continue;
t = &c->seeds[(uint32_t)srt[i]];
if (t->len < s->len * .95) continue; // only check overlapping if t is long enough; TODO: more efficient by early stopping
if (s->qbeg <= t->qbeg && s->qbeg + s->len >= t->qbeg && t->qbeg - s->qbeg != t->rbeg - s->rbeg) break;
if (t->qbeg <= s->qbeg && t->qbeg + t->len >= s->qbeg && s->qbeg - t->qbeg != s->rbeg - t->rbeg) break;
}
if (i == c->n) { // no overlapping seeds; then skip extension
srt[k] = 0; // mark that seed extension has not been performed
continue;
}
}
a = kv_pushp(mem_alnreg_t, *av);
memset(a, 0, sizeof(mem_alnreg_t));
@ -555,7 +568,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int
for (i = 0; i < MAX_BAND_TRY; ++i) {
int prev = a->score;
aw[0] = opt->w << i;
a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, aw[0], opt->zdrop, s->len * opt->a, &qle, &tle, &gtle, &gscore, &max_off[0]);
a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, aw[0], opt->pen_clip, opt->zdrop, s->len * opt->a, &qle, &tle, &gtle, &gscore, &max_off[0]);
if (bwa_verbose >= 4) { printf("L\t%d < %d; w=%d; max_off=%d\n", prev, a->score, aw[0], max_off[0]); fflush(stdout); }
if (a->score == prev || max_off[0] < (aw[0]>>1) + (aw[0]>>2)) break;
}
@ -578,7 +591,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int
for (i = 0; i < MAX_BAND_TRY; ++i) {
int prev = a->score;
aw[1] = opt->w << i;
a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, aw[1], opt->zdrop, sc0, &qle, &tle, &gtle, &gscore, &max_off[1]);
a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, aw[1], opt->pen_clip, opt->zdrop, sc0, &qle, &tle, &gtle, &gscore, &max_off[1]);
if (bwa_verbose >= 4) { printf("R\t%d < %d; w=%d; max_off=%d\n", prev, a->score, aw[1], max_off[1]); fflush(stdout); }
if (a->score == prev || max_off[1] < (aw[1]>>1) + (aw[1]>>2)) break;
}
@ -863,6 +876,13 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *
a.NM = NM;
pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev);
a.is_rev = is_rev;
if (a.n_cigar > 0) {
if ((a.cigar[0]&0xf) == 2) {
pos += a.cigar[0]>>4;
--a.n_cigar;
memmove(a.cigar, a.cigar + 1, a.n_cigar * 4);
} else if ((a.cigar[a.n_cigar-1]&0xf) == 2) --a.n_cigar;
}
if (qb != 0 || qe != l_query) { // add clipping to CIGAR
int clip5, clip3;
clip5 = is_rev? l_query - qe : qb;

View File

@ -16,6 +16,7 @@ typedef struct __smem_i smem_i;
#define MEM_F_NOPAIRING 0x4
#define MEM_F_ALL 0x8
#define MEM_F_NO_MULTI 0x10
#define MEM_F_NO_RESCUE 0x20
typedef struct {
int a, b, q, r; // match score, mismatch penalty and gap open/extension penalty. A gap of size k costs q+k*r

View File

@ -235,20 +235,21 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
int n = 0, i, j, z[2], o, subo, n_sub, extra_flag = 1;
kstring_t str;
mem_alnreg_v b[2];
mem_aln_t h[2];
str.l = str.m = 0; str.s = 0;
// perform SW for the best alignment
kv_init(b[0]); kv_init(b[1]);
for (i = 0; i < 2; ++i)
for (j = 0; j < a[i].n; ++j)
if (a[i].a[j].score >= a[i].a[0].score - opt->pen_unpaired)
kv_push(mem_alnreg_t, b[i], a[i].a[j]);
for (i = 0; i < 2; ++i)
for (j = 0; j < b[i].n && j < opt->max_matesw; ++j)
n += mem_matesw(opt, bns->l_pac, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]);
free(b[0].a); free(b[1].a);
if (!(opt->flag & MEM_F_NO_RESCUE)) { // then perform SW for the best alignment
mem_alnreg_v b[2];
kv_init(b[0]); kv_init(b[1]);
for (i = 0; i < 2; ++i)
for (j = 0; j < a[i].n; ++j)
if (a[i].a[j].score >= a[i].a[0].score - opt->pen_unpaired)
kv_push(mem_alnreg_t, b[i], a[i].a[j]);
for (i = 0; i < 2; ++i)
for (j = 0; j < b[i].n && j < opt->max_matesw; ++j)
n += mem_matesw(opt, bns->l_pac, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]);
free(b[0].a); free(b[1].a);
}
mem_mark_primary_se(opt, a[0].n, a[0].a);
mem_mark_primary_se(opt, a[1].n, a[1].a);
if (opt->flag&MEM_F_NOPAIRING) goto no_pairing;
@ -305,7 +306,7 @@ no_pairing:
h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[0]);
else h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, 0);
}
if (h[0].rid == h[1].rid && h[0].rid >= 0) { // if the top hits from the two ends constitute a proper pair, flag it.
if (!(opt->flag & MEM_F_NOPAIRING) && h[0].rid == h[1].rid && h[0].rid >= 0) { // if the top hits from the two ends constitute a proper pair, flag it.
int64_t dist;
int d;
d = mem_infer_dir(bns->l_pac, a[0].a[0].rb, a[1].a[0].rb, &dist);

View File

@ -176,7 +176,7 @@ bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int l
rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen);
seq_reverse(len, seq, 0); // as we need to do left extension, we have to reverse both query and reference sequences
seq_reverse(rlen, rseq, 0);
ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, -1, len<<1, &qle, &tle, &gtle, &gscore, 0);
ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, 0, -1, len<<1, &qle, &tle, &gtle, &gscore, 0);
if (gscore > 0) tle = gtle, qle = len;
rb = re - tle; rlen = tle;
seq_reverse(len, seq, 0);
@ -192,7 +192,7 @@ bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int l
rb = *_pos; re = rb + len + SW_BW;
if (re > l_pac) re = l_pac;
rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen);
ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, -1, len<<1, &qle, &tle, &gtle, &gscore, 0);
ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, 0, -1, len<<1, &qle, &tle, &gtle, &gscore, 0);
if (gscore > 0) tle = gtle, qle = len;
re = rb + tle; rlen = tle;
ksw_global(qle, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, n_cigar, &cigar32); // right extension

16
bwt.c
View File

@ -370,6 +370,18 @@ void bwt_dump_sa(const char *fn, const bwt_t *bwt)
err_fclose(fp);
}
static bwtint_t fread_fix(FILE *fp, bwtint_t size, void *a)
{ // Mac/Darwin has a bug when reading data longer than 2GB. This function fixes this issue by reading data in small chunks
const int bufsize = 0x1000000; // 16M block
bwtint_t offset = 0;
while (size) {
int x = bufsize < size? bufsize : size;
if ((x = err_fread_noeof(a + offset, 1, x, fp)) == 0) break;
size -= x; offset += x;
}
return offset;
}
void bwt_restore_sa(const char *fn, bwt_t *bwt)
{
char skipped[256];
@ -388,7 +400,7 @@ void bwt_restore_sa(const char *fn, bwt_t *bwt)
bwt->sa = (bwtint_t*)xcalloc(bwt->n_sa, sizeof(bwtint_t));
bwt->sa[0] = -1;
err_fread_noeof(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp);
fread_fix(fp, sizeof(bwtint_t) * (bwt->n_sa - 1), bwt->sa + 1);
err_fclose(fp);
}
@ -405,7 +417,7 @@ bwt_t *bwt_restore_bwt(const char *fn)
err_fseek(fp, 0, SEEK_SET);
err_fread_noeof(&bwt->primary, sizeof(bwtint_t), 1, fp);
err_fread_noeof(bwt->L2+1, sizeof(bwtint_t), 4, fp);
err_fread_noeof(bwt->bwt, 4, bwt->bwt_size, fp);
fread_fix(fp, bwt->bwt_size<<2, bwt->bwt);
bwt->seq_len = bwt->L2[4];
err_fclose(fp);
bwt_gen_cnt_table(bwt);

View File

@ -125,7 +125,7 @@ void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq
for (k = p->k - 1, j = 0; k > 0 && j < lt; --k) // FIXME: k=0 not considered!
target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3;
lt = j;
score = ksw_extend(p->beg, &query[lq - p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, -1, p->G, &qle, &tle, 0, 0, 0);
score = ksw_extend(p->beg, &query[lq - p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, 0, -1, p->G, &qle, &tle, 0, 0, 0);
if (score > p->G) { // extensible
p->G = score;
p->k -= tle;
@ -153,7 +153,7 @@ void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq,
for (k = p->k, j = 0; k < p->k + lt && k < l_pac; ++k)
target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3;
lt = j;
score = ksw_extend(lq - p->beg, &query[p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, -1, 1, &qle, &tle, 0, 0, 0) - 1;
score = ksw_extend(lq - p->beg, &query[p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, 0, -1, 1, &qle, &tle, 0, 0, 0) - 1;
// if (score < p->G) fprintf(stderr, "[bsw2_extend_hits] %d < %d\n", score, p->G);
if (score >= p->G) {
p->G = score;

View File

@ -27,7 +27,7 @@ int main_mem(int argc, char *argv[])
void *ko = 0, *ko2 = 0;
opt = mem_opt_init();
while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:")) >= 0) {
while ((c = getopt(argc, argv, "paMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:")) >= 0) {
if (c == 'k') opt->min_seed_len = atoi(optarg);
else if (c == 'w') opt->w = atoi(optarg);
else if (c == 'A') opt->a = atoi(optarg);
@ -43,6 +43,7 @@ int main_mem(int argc, char *argv[])
else if (c == 'a') opt->flag |= MEM_F_ALL;
else if (c == 'p') opt->flag |= MEM_F_PE;
else if (c == 'M') opt->flag |= MEM_F_NO_MULTI;
else if (c == 'S') opt->flag |= MEM_F_NO_RESCUE;
else if (c == 'c') opt->max_occ = atoi(optarg);
else if (c == 'd') opt->zdrop = atoi(optarg);
else if (c == 'v') bwa_verbose = atoi(optarg);
@ -64,7 +65,8 @@ int main_mem(int argc, char *argv[])
fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor);
// fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width);
fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ);
fprintf(stderr, " -P skip pairing; perform mate SW only\n");
fprintf(stderr, " -S skip mate rescue\n");
fprintf(stderr, " -P skip pairing; mate rescue performed unless -S also in use\n");
fprintf(stderr, " -A INT score for a sequence match [%d]\n", opt->a);
fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b);
fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->q);

4
ksw.c
View File

@ -360,7 +360,7 @@ typedef struct {
int32_t h, e;
} eh_t;
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off)
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off)
{
eh_t *eh; // score array
int8_t *qp; // query profile
@ -382,7 +382,7 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
k = m * m;
for (i = 0, max = 0; i < k; ++i) // get the max score
max = max > mat[i]? max : mat[i];
max_gap = (int)((double)(qlen * max - gapo) / gape + 1.);
max_gap = (int)((double)(qlen * max + end_bonus - gapo) / gape + 1.);
max_gap = max_gap > 1? max_gap : 1;
w = w < max_gap? w : max_gap;
// DP loop

2
ksw.h
View File

@ -102,7 +102,7 @@ extern "C" {
*
* @return best semi-local alignment score
*/
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off);
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off);
#ifdef __cplusplus
}

2
main.c
View File

@ -3,7 +3,7 @@
#include "utils.h"
#ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "0.7.3-r369-beta"
#define PACKAGE_VERSION "0.7.3-r375-beta"
#endif
int bwa_fa2pac(int argc, char *argv[]);