Merge branch 'master' into master_fixes. Merged up to master r375.
Conflicts: bwt.c
This commit is contained in:
commit
90ecd344ba
26
bwamem.c
26
bwamem.c
|
|
@ -537,7 +537,20 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int
|
|||
w = max_gap < opt->w? max_gap : opt->w;
|
||||
if (qd - rd < w && rd - qd < w) break;
|
||||
}
|
||||
if (i < av->n) continue;
|
||||
if (i < av->n) { // the seed is (almost) contained in an existing alignment
|
||||
for (i = k + 1; i < c->n; ++i) { // check overlapping seeds in the same chain
|
||||
const mem_seed_t *t;
|
||||
if (srt[i] == 0) continue;
|
||||
t = &c->seeds[(uint32_t)srt[i]];
|
||||
if (t->len < s->len * .95) continue; // only check overlapping if t is long enough; TODO: more efficient by early stopping
|
||||
if (s->qbeg <= t->qbeg && s->qbeg + s->len >= t->qbeg && t->qbeg - s->qbeg != t->rbeg - s->rbeg) break;
|
||||
if (t->qbeg <= s->qbeg && t->qbeg + t->len >= s->qbeg && s->qbeg - t->qbeg != s->rbeg - t->rbeg) break;
|
||||
}
|
||||
if (i == c->n) { // no overlapping seeds; then skip extension
|
||||
srt[k] = 0; // mark that seed extension has not been performed
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
a = kv_pushp(mem_alnreg_t, *av);
|
||||
memset(a, 0, sizeof(mem_alnreg_t));
|
||||
|
|
@ -555,7 +568,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int
|
|||
for (i = 0; i < MAX_BAND_TRY; ++i) {
|
||||
int prev = a->score;
|
||||
aw[0] = opt->w << i;
|
||||
a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, aw[0], opt->zdrop, s->len * opt->a, &qle, &tle, &gtle, &gscore, &max_off[0]);
|
||||
a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, aw[0], opt->pen_clip, opt->zdrop, s->len * opt->a, &qle, &tle, &gtle, &gscore, &max_off[0]);
|
||||
if (bwa_verbose >= 4) { printf("L\t%d < %d; w=%d; max_off=%d\n", prev, a->score, aw[0], max_off[0]); fflush(stdout); }
|
||||
if (a->score == prev || max_off[0] < (aw[0]>>1) + (aw[0]>>2)) break;
|
||||
}
|
||||
|
|
@ -578,7 +591,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int
|
|||
for (i = 0; i < MAX_BAND_TRY; ++i) {
|
||||
int prev = a->score;
|
||||
aw[1] = opt->w << i;
|
||||
a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, aw[1], opt->zdrop, sc0, &qle, &tle, &gtle, &gscore, &max_off[1]);
|
||||
a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, aw[1], opt->pen_clip, opt->zdrop, sc0, &qle, &tle, &gtle, &gscore, &max_off[1]);
|
||||
if (bwa_verbose >= 4) { printf("R\t%d < %d; w=%d; max_off=%d\n", prev, a->score, aw[1], max_off[1]); fflush(stdout); }
|
||||
if (a->score == prev || max_off[1] < (aw[1]>>1) + (aw[1]>>2)) break;
|
||||
}
|
||||
|
|
@ -863,6 +876,13 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *
|
|||
a.NM = NM;
|
||||
pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev);
|
||||
a.is_rev = is_rev;
|
||||
if (a.n_cigar > 0) {
|
||||
if ((a.cigar[0]&0xf) == 2) {
|
||||
pos += a.cigar[0]>>4;
|
||||
--a.n_cigar;
|
||||
memmove(a.cigar, a.cigar + 1, a.n_cigar * 4);
|
||||
} else if ((a.cigar[a.n_cigar-1]&0xf) == 2) --a.n_cigar;
|
||||
}
|
||||
if (qb != 0 || qe != l_query) { // add clipping to CIGAR
|
||||
int clip5, clip3;
|
||||
clip5 = is_rev? l_query - qe : qb;
|
||||
|
|
|
|||
1
bwamem.h
1
bwamem.h
|
|
@ -16,6 +16,7 @@ typedef struct __smem_i smem_i;
|
|||
#define MEM_F_NOPAIRING 0x4
|
||||
#define MEM_F_ALL 0x8
|
||||
#define MEM_F_NO_MULTI 0x10
|
||||
#define MEM_F_NO_RESCUE 0x20
|
||||
|
||||
typedef struct {
|
||||
int a, b, q, r; // match score, mismatch penalty and gap open/extension penalty. A gap of size k costs q+k*r
|
||||
|
|
|
|||
|
|
@ -235,20 +235,21 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
|
|||
|
||||
int n = 0, i, j, z[2], o, subo, n_sub, extra_flag = 1;
|
||||
kstring_t str;
|
||||
mem_alnreg_v b[2];
|
||||
mem_aln_t h[2];
|
||||
|
||||
str.l = str.m = 0; str.s = 0;
|
||||
// perform SW for the best alignment
|
||||
kv_init(b[0]); kv_init(b[1]);
|
||||
for (i = 0; i < 2; ++i)
|
||||
for (j = 0; j < a[i].n; ++j)
|
||||
if (a[i].a[j].score >= a[i].a[0].score - opt->pen_unpaired)
|
||||
kv_push(mem_alnreg_t, b[i], a[i].a[j]);
|
||||
for (i = 0; i < 2; ++i)
|
||||
for (j = 0; j < b[i].n && j < opt->max_matesw; ++j)
|
||||
n += mem_matesw(opt, bns->l_pac, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]);
|
||||
free(b[0].a); free(b[1].a);
|
||||
if (!(opt->flag & MEM_F_NO_RESCUE)) { // then perform SW for the best alignment
|
||||
mem_alnreg_v b[2];
|
||||
kv_init(b[0]); kv_init(b[1]);
|
||||
for (i = 0; i < 2; ++i)
|
||||
for (j = 0; j < a[i].n; ++j)
|
||||
if (a[i].a[j].score >= a[i].a[0].score - opt->pen_unpaired)
|
||||
kv_push(mem_alnreg_t, b[i], a[i].a[j]);
|
||||
for (i = 0; i < 2; ++i)
|
||||
for (j = 0; j < b[i].n && j < opt->max_matesw; ++j)
|
||||
n += mem_matesw(opt, bns->l_pac, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]);
|
||||
free(b[0].a); free(b[1].a);
|
||||
}
|
||||
mem_mark_primary_se(opt, a[0].n, a[0].a);
|
||||
mem_mark_primary_se(opt, a[1].n, a[1].a);
|
||||
if (opt->flag&MEM_F_NOPAIRING) goto no_pairing;
|
||||
|
|
@ -305,7 +306,7 @@ no_pairing:
|
|||
h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[0]);
|
||||
else h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, 0);
|
||||
}
|
||||
if (h[0].rid == h[1].rid && h[0].rid >= 0) { // if the top hits from the two ends constitute a proper pair, flag it.
|
||||
if (!(opt->flag & MEM_F_NOPAIRING) && h[0].rid == h[1].rid && h[0].rid >= 0) { // if the top hits from the two ends constitute a proper pair, flag it.
|
||||
int64_t dist;
|
||||
int d;
|
||||
d = mem_infer_dir(bns->l_pac, a[0].a[0].rb, a[1].a[0].rb, &dist);
|
||||
|
|
|
|||
4
bwase.c
4
bwase.c
|
|
@ -176,7 +176,7 @@ bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int l
|
|||
rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen);
|
||||
seq_reverse(len, seq, 0); // as we need to do left extension, we have to reverse both query and reference sequences
|
||||
seq_reverse(rlen, rseq, 0);
|
||||
ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, -1, len<<1, &qle, &tle, &gtle, &gscore, 0);
|
||||
ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, 0, -1, len<<1, &qle, &tle, &gtle, &gscore, 0);
|
||||
if (gscore > 0) tle = gtle, qle = len;
|
||||
rb = re - tle; rlen = tle;
|
||||
seq_reverse(len, seq, 0);
|
||||
|
|
@ -192,7 +192,7 @@ bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int l
|
|||
rb = *_pos; re = rb + len + SW_BW;
|
||||
if (re > l_pac) re = l_pac;
|
||||
rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen);
|
||||
ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, -1, len<<1, &qle, &tle, &gtle, &gscore, 0);
|
||||
ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, 0, -1, len<<1, &qle, &tle, &gtle, &gscore, 0);
|
||||
if (gscore > 0) tle = gtle, qle = len;
|
||||
re = rb + tle; rlen = tle;
|
||||
ksw_global(qle, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, n_cigar, &cigar32); // right extension
|
||||
|
|
|
|||
16
bwt.c
16
bwt.c
|
|
@ -370,6 +370,18 @@ void bwt_dump_sa(const char *fn, const bwt_t *bwt)
|
|||
err_fclose(fp);
|
||||
}
|
||||
|
||||
/**
 * Read `size` bytes from `fp` into the buffer `a`, in chunks of at most 16M.
 *
 * Mac/Darwin has a bug when reading data longer than 2GB; issuing many
 * bounded reads instead of one large read works around it.
 *
 * @param fp    stream to read from
 * @param size  total number of bytes requested
 * @param a     destination buffer; must have room for at least `size` bytes
 *
 * @return number of bytes stored in `a`; less than `size` only if
 *         err_fread_noeof() reported a zero-length read
 */
static bwtint_t fread_fix(FILE *fp, bwtint_t size, void *a)
{
	const int bufsize = 0x1000000; // 16M block
	// Arithmetic on `void *` is a GNU extension, so step through a byte pointer instead.
	unsigned char *dst = (unsigned char *)a;
	bwtint_t offset = 0;
	while (size) {
		// Never request more than one block, nor more than what is still missing.
		int x = bufsize < size? bufsize : size;
		if ((x = err_fread_noeof(dst + offset, 1, x, fp)) == 0) break; // nothing read: stop instead of looping forever
		size -= x; offset += x;
	}
	return offset;
}
|
||||
|
||||
void bwt_restore_sa(const char *fn, bwt_t *bwt)
|
||||
{
|
||||
char skipped[256];
|
||||
|
|
@ -388,7 +400,7 @@ void bwt_restore_sa(const char *fn, bwt_t *bwt)
|
|||
bwt->sa = (bwtint_t*)xcalloc(bwt->n_sa, sizeof(bwtint_t));
|
||||
bwt->sa[0] = -1;
|
||||
|
||||
err_fread_noeof(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp);
|
||||
fread_fix(fp, sizeof(bwtint_t) * (bwt->n_sa - 1), bwt->sa + 1);
|
||||
err_fclose(fp);
|
||||
}
|
||||
|
||||
|
|
@ -405,7 +417,7 @@ bwt_t *bwt_restore_bwt(const char *fn)
|
|||
err_fseek(fp, 0, SEEK_SET);
|
||||
err_fread_noeof(&bwt->primary, sizeof(bwtint_t), 1, fp);
|
||||
err_fread_noeof(bwt->L2+1, sizeof(bwtint_t), 4, fp);
|
||||
err_fread_noeof(bwt->bwt, 4, bwt->bwt_size, fp);
|
||||
fread_fix(fp, bwt->bwt_size<<2, bwt->bwt);
|
||||
bwt->seq_len = bwt->L2[4];
|
||||
err_fclose(fp);
|
||||
bwt_gen_cnt_table(bwt);
|
||||
|
|
|
|||
|
|
@ -125,7 +125,7 @@ void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq
|
|||
for (k = p->k - 1, j = 0; k > 0 && j < lt; --k) // FIXME: k=0 not considered!
|
||||
target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3;
|
||||
lt = j;
|
||||
score = ksw_extend(p->beg, &query[lq - p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, -1, p->G, &qle, &tle, 0, 0, 0);
|
||||
score = ksw_extend(p->beg, &query[lq - p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, 0, -1, p->G, &qle, &tle, 0, 0, 0);
|
||||
if (score > p->G) { // extensible
|
||||
p->G = score;
|
||||
p->k -= tle;
|
||||
|
|
@ -153,7 +153,7 @@ void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq,
|
|||
for (k = p->k, j = 0; k < p->k + lt && k < l_pac; ++k)
|
||||
target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3;
|
||||
lt = j;
|
||||
score = ksw_extend(lq - p->beg, &query[p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, -1, 1, &qle, &tle, 0, 0, 0) - 1;
|
||||
score = ksw_extend(lq - p->beg, &query[p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, 0, -1, 1, &qle, &tle, 0, 0, 0) - 1;
|
||||
// if (score < p->G) fprintf(stderr, "[bsw2_extend_hits] %d < %d\n", score, p->G);
|
||||
if (score >= p->G) {
|
||||
p->G = score;
|
||||
|
|
|
|||
|
|
@ -27,7 +27,7 @@ int main_mem(int argc, char *argv[])
|
|||
void *ko = 0, *ko2 = 0;
|
||||
|
||||
opt = mem_opt_init();
|
||||
while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:")) >= 0) {
|
||||
while ((c = getopt(argc, argv, "paMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:")) >= 0) {
|
||||
if (c == 'k') opt->min_seed_len = atoi(optarg);
|
||||
else if (c == 'w') opt->w = atoi(optarg);
|
||||
else if (c == 'A') opt->a = atoi(optarg);
|
||||
|
|
@ -43,6 +43,7 @@ int main_mem(int argc, char *argv[])
|
|||
else if (c == 'a') opt->flag |= MEM_F_ALL;
|
||||
else if (c == 'p') opt->flag |= MEM_F_PE;
|
||||
else if (c == 'M') opt->flag |= MEM_F_NO_MULTI;
|
||||
else if (c == 'S') opt->flag |= MEM_F_NO_RESCUE;
|
||||
else if (c == 'c') opt->max_occ = atoi(optarg);
|
||||
else if (c == 'd') opt->zdrop = atoi(optarg);
|
||||
else if (c == 'v') bwa_verbose = atoi(optarg);
|
||||
|
|
@ -64,7 +65,8 @@ int main_mem(int argc, char *argv[])
|
|||
fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor);
|
||||
// fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width);
|
||||
fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ);
|
||||
fprintf(stderr, " -P skip pairing; perform mate SW only\n");
|
||||
fprintf(stderr, " -S skip mate rescue\n");
|
||||
fprintf(stderr, " -P skip pairing; mate rescue performed unless -S also in use\n");
|
||||
fprintf(stderr, " -A INT score for a sequence match [%d]\n", opt->a);
|
||||
fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b);
|
||||
fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->q);
|
||||
|
|
|
|||
4
ksw.c
4
ksw.c
|
|
@ -360,7 +360,7 @@ typedef struct {
|
|||
int32_t h, e;
|
||||
} eh_t;
|
||||
|
||||
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off)
|
||||
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off)
|
||||
{
|
||||
eh_t *eh; // score array
|
||||
int8_t *qp; // query profile
|
||||
|
|
@ -382,7 +382,7 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
|
|||
k = m * m;
|
||||
for (i = 0, max = 0; i < k; ++i) // get the max score
|
||||
max = max > mat[i]? max : mat[i];
|
||||
max_gap = (int)((double)(qlen * max - gapo) / gape + 1.);
|
||||
max_gap = (int)((double)(qlen * max + end_bonus - gapo) / gape + 1.);
|
||||
max_gap = max_gap > 1? max_gap : 1;
|
||||
w = w < max_gap? w : max_gap;
|
||||
// DP loop
|
||||
|
|
|
|||
2
ksw.h
2
ksw.h
|
|
@ -102,7 +102,7 @@ extern "C" {
|
|||
*
|
||||
* @return best semi-local alignment score
|
||||
*/
|
||||
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off);
|
||||
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue