dev-467: limit the max #chains to extend
This commit is contained in:
parent
c0a308a8b6
commit
99f6f9a0d1
47
bwamem.c
47
bwamem.c
|
|
@ -63,15 +63,15 @@ mem_opt_t *mem_opt_init()
|
|||
o->max_chain_gap = 10000;
|
||||
o->max_ins = 10000;
|
||||
o->mask_level = 0.50;
|
||||
o->chain_drop_ratio = 0.50;
|
||||
o->drop_ratio = 0.50;
|
||||
o->split_factor = 1.5;
|
||||
o->chunk_size = 10000000;
|
||||
o->n_threads = 1;
|
||||
o->max_matesw = 100;
|
||||
o->mask_level_redun = 0.95;
|
||||
o->min_chain_weight = 0;
|
||||
o->max_chain_extend = 1<<30;
|
||||
o->mapQ_coef_len = 50; o->mapQ_coef_fac = log(o->mapQ_coef_len);
|
||||
// o->mapQ_coef_len = o->mapQ_coef_fac = 0;
|
||||
bwa_fill_scmat(o->a, o->b, o->mat);
|
||||
return o;
|
||||
}
|
||||
|
|
@ -98,7 +98,8 @@ static smem_aux_t *smem_aux_init()
|
|||
|
||||
static void smem_aux_destroy(smem_aux_t *a)
|
||||
{
|
||||
free(a->tmpv[0]->a); free(a->tmpv[1]->a);
|
||||
free(a->tmpv[0]->a); free(a->tmpv[0]);
|
||||
free(a->tmpv[1]->a); free(a->tmpv[1]);
|
||||
free(a->mem.a); free(a->mem1.a);
|
||||
free(a);
|
||||
}
|
||||
|
|
@ -135,9 +136,9 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co
|
|||
ks_introsort(mem_intv, a->mem.n, a->mem.a);
|
||||
}
|
||||
|
||||
/********************************
|
||||
* Chaining while finding SMEMs *
|
||||
********************************/
|
||||
/************
|
||||
* Chaining *
|
||||
************/
|
||||
|
||||
typedef struct {
|
||||
int64_t rbeg;
|
||||
|
|
@ -295,8 +296,10 @@ int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *a)
|
|||
n_chn = k;
|
||||
ks_introsort(mem_flt, n_chn, a);
|
||||
// pairwise chain comparisons
|
||||
a[0].kept = 3;
|
||||
kv_push(int, chains, 0);
|
||||
for (i = 1; i < n_chn; ++i) {
|
||||
int large_ovlp = 0;
|
||||
for (k = 0; k < chains.n; ++k) {
|
||||
int j = chains.a[k];
|
||||
int b_max = chn_beg(a[j]) > chn_beg(a[i])? chn_beg(a[j]) : chn_beg(a[i]);
|
||||
|
|
@ -306,25 +309,35 @@ int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *a)
|
|||
int lj = chn_end(a[j]) - chn_beg(a[j]);
|
||||
int min_l = li < lj? li : lj;
|
||||
if (e_min - b_max >= min_l * opt->mask_level && min_l < opt->max_chain_gap) { // significant overlap
|
||||
large_ovlp = 1;
|
||||
if (a[j].first < 0) a[j].first = i; // keep the first shadowed hit s.t. mapq can be more accurate
|
||||
if (a[i].w < a[j].w * opt->chain_drop_ratio && a[j].w - a[i].w >= opt->min_seed_len<<1)
|
||||
if (a[i].w < a[j].w * opt->drop_ratio && a[j].w - a[i].w >= opt->min_seed_len<<1)
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (k == chains.n) kv_push(int, chains, i);
|
||||
if (k == chains.n) {
|
||||
kv_push(int, chains, i);
|
||||
a[i].kept = large_ovlp? 2 : 3;
|
||||
}
|
||||
}
|
||||
for (i = 0; i < chains.n; ++i) {
|
||||
mem_chain_t *c = &a[chains.a[i]];
|
||||
c->kept = 2;
|
||||
if (c->first >= 0) a[c->first].kept = 1;
|
||||
}
|
||||
free(chains.a);
|
||||
for (i = k = 0; i < n_chn; ++i) { // don't extend more than opt->max_chain_extend .kept=1/2 chains
|
||||
if (a[i].kept == 0 || a[i].kept == 3) continue;
|
||||
if (++k >= opt->max_chain_extend) break;
|
||||
}
|
||||
for (; i < n_chn; ++i)
|
||||
if (a[i].kept < 3) a[i].kept = 0;
|
||||
for (i = k = 0; i < n_chn; ++i) { // free discarded chains
|
||||
mem_chain_t *c = &a[i];
|
||||
if (c->kept == 0) free(c->seeds);
|
||||
else a[k++] = a[i];
|
||||
}
|
||||
n_chn = k;
|
||||
return k;
|
||||
}
|
||||
|
||||
|
|
@ -890,7 +903,7 @@ void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pa
|
|||
mem_aln_t *q;
|
||||
if (p->score < opt->T) continue;
|
||||
if (p->secondary >= 0 && !(opt->flag&MEM_F_ALL)) continue;
|
||||
if (p->secondary >= 0 && p->score < a->a[p->secondary].score * opt->chain_drop_ratio) continue;
|
||||
if (p->secondary >= 0 && p->score < a->a[p->secondary].score * opt->drop_ratio) continue;
|
||||
q = kv_pushp(mem_aln_t, aa);
|
||||
*q = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, p);
|
||||
q->flag |= extra_flag; // flag secondary
|
||||
|
|
@ -950,8 +963,7 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse
|
|||
return regs;
|
||||
}
|
||||
|
||||
|
||||
mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar)
|
||||
mem_aln_t mem_reg2aln2(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar, const char *name)
|
||||
{
|
||||
mem_aln_t a;
|
||||
int i, w2, tmp, qb, qe, NM, score, is_rev, last_sc = -(1<<30), l_MD;
|
||||
|
|
@ -971,8 +983,10 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *
|
|||
a.mapq = ar->secondary < 0? mem_approx_mapq_se(opt, ar) : 0;
|
||||
if (ar->secondary >= 0) a.flag |= 0x100; // secondary alignment
|
||||
if (bwa_fix_xref2(opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re) < 0) {
|
||||
fprintf(stderr, "[E::%s] If you see this message, please let the developer know. Abort. Sorry.\n", __func__);
|
||||
exit(1);
|
||||
if (name) fprintf(stderr, "[E::%s] Internal code inconsistency for read '%s'. Please contact the developer. Sorry.\n", __func__, name);
|
||||
else fprintf(stderr, "[E::%s] Internal code inconsistency. Please contact the developer. Sorry.\n", __func__);
|
||||
a.rid = -1; a.pos = -1; a.flag |= 0x4;
|
||||
return a;
|
||||
}
|
||||
tmp = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->o_del, opt->e_del);
|
||||
w2 = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->o_ins, opt->e_ins);
|
||||
|
|
@ -1024,6 +1038,11 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *
|
|||
return a;
|
||||
}
|
||||
|
||||
mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar)
|
||||
{
|
||||
return mem_reg2aln2(opt, bns, pac, l_query, query_, ar, 0);
|
||||
}
|
||||
|
||||
typedef struct {
|
||||
const mem_opt_t *opt;
|
||||
const bwt_t *bwt;
|
||||
|
|
|
|||
3
bwamem.h
3
bwamem.h
|
|
@ -32,6 +32,7 @@ typedef struct {
|
|||
int flag; // see MEM_F_* macros
|
||||
int min_seed_len; // minimum seed length
|
||||
int min_chain_weight;
|
||||
int max_chain_extend;
|
||||
float split_factor; // split into a seed if MEM is longer than min_seed_len*split_factor
|
||||
int split_width; // split into a seed if its occurence is smaller than this value
|
||||
int max_occ; // skip a seed if its occurence is larger than this value
|
||||
|
|
@ -39,7 +40,7 @@ typedef struct {
|
|||
int n_threads; // number of threads
|
||||
int chunk_size; // process chunk_size-bp sequences in a batch
|
||||
float mask_level; // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits
|
||||
float chain_drop_ratio; // drop a chain if its seed coverage is below chain_drop_ratio times the seed coverage of a better chain overlapping with the small chain
|
||||
float drop_ratio; // drop a chain if its seed coverage is below drop_ratio times the seed coverage of a better chain overlapping with the small chain
|
||||
float mask_level_redun;
|
||||
float mapQ_coef_len;
|
||||
int mapQ_coef_fac;
|
||||
|
|
|
|||
16
fastmap.c
16
fastmap.c
|
|
@ -53,7 +53,7 @@ int main_mem(int argc, char *argv[])
|
|||
|
||||
opt = mem_opt_init();
|
||||
memset(&opt0, 0, sizeof(mem_opt_t));
|
||||
while ((c = getopt(argc, argv, "epaFMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:")) >= 0) {
|
||||
while ((c = getopt(argc, argv, "epaFMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:")) >= 0) {
|
||||
if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1;
|
||||
else if (c == 'x') mode = optarg;
|
||||
else if (c == 'w') opt->w = atoi(optarg), opt0.w = 1;
|
||||
|
|
@ -73,10 +73,11 @@ int main_mem(int argc, char *argv[])
|
|||
else if (c == 'd') opt->zdrop = atoi(optarg), opt0.zdrop = 1;
|
||||
else if (c == 'v') bwa_verbose = atoi(optarg);
|
||||
else if (c == 'r') opt->split_factor = atof(optarg), opt0.split_factor = 1.;
|
||||
else if (c == 'D') opt->chain_drop_ratio = atof(optarg), opt0.chain_drop_ratio = 1.;
|
||||
else if (c == 'D') opt->drop_ratio = atof(optarg), opt0.drop_ratio = 1.;
|
||||
else if (c == 'm') opt->max_matesw = atoi(optarg), opt0.max_matesw = 1;
|
||||
else if (c == 's') opt->split_width = atoi(optarg), opt0.split_width = 1;
|
||||
else if (c == 'N') opt->max_chain_gap = atoi(optarg), opt0.max_chain_gap = 1;
|
||||
else if (c == 'G') opt->max_chain_gap = atoi(optarg), opt0.max_chain_gap = 1;
|
||||
else if (c == 'N') opt->max_chain_extend = atoi(optarg), opt0.max_chain_extend = 1;
|
||||
else if (c == 'W') opt->min_chain_weight = atoi(optarg), opt0.min_chain_weight = 1;
|
||||
else if (c == 'C') copy_comment = 1;
|
||||
else if (c == 'Q') {
|
||||
|
|
@ -132,7 +133,7 @@ int main_mem(int argc, char *argv[])
|
|||
fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor);
|
||||
// fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width);
|
||||
fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ);
|
||||
fprintf(stderr, " -D FLOAT drop chains shorter than FLOAT fraction of the longest overlapping chain [%.2f]\n", opt->chain_drop_ratio);
|
||||
fprintf(stderr, " -D FLOAT drop chains shorter than FLOAT fraction of the longest overlapping chain [%.2f]\n", opt->drop_ratio);
|
||||
fprintf(stderr, " -W INT discard a chain if seeded bases shorter than INT [0]\n");
|
||||
fprintf(stderr, " -m INT perform at most INT rounds of mate rescues for each read [%d]\n", opt->max_matesw);
|
||||
fprintf(stderr, " -S skip mate rescue\n");
|
||||
|
|
@ -168,7 +169,7 @@ int main_mem(int argc, char *argv[])
|
|||
}
|
||||
|
||||
if (mode) {
|
||||
if (strcmp(mode, "pacbio") == 0 || strcmp(mode, "pbref") == 0 || strcmp(mode, "pbread") == 0) {
|
||||
if (strcmp(mode, "pacbio") == 0 || strcmp(mode, "pbref") == 0 || strcmp(mode, "pbread1") == 0) {
|
||||
if (!opt0.a) opt->a = 2, opt0.a = 1;
|
||||
update_a(opt, &opt0);
|
||||
if (!opt0.o_del) opt->o_del = 2;
|
||||
|
|
@ -177,13 +178,14 @@ int main_mem(int argc, char *argv[])
|
|||
if (!opt0.e_ins) opt->e_ins = 1;
|
||||
if (!opt0.max_occ) opt->max_occ = 1000;
|
||||
if (opt0.split_factor == 0.) opt->split_factor = 10.;
|
||||
if (strcmp(mode, "pbread") == 0) {
|
||||
if (strcmp(mode, "pbread1") == 0) {
|
||||
opt->flag |= MEM_F_ALL | MEM_F_SELF_OVLP | MEM_F_ALN_REG;
|
||||
if (!opt0.b) opt->b = 5;
|
||||
if (!opt0.w) opt->w = 100;
|
||||
if (!opt0.min_seed_len) opt->min_seed_len = 13;
|
||||
if (!opt0.min_chain_weight) opt->min_chain_weight = 30;
|
||||
if (opt0.chain_drop_ratio == 0.) opt->chain_drop_ratio = .02;
|
||||
if (!opt0.max_chain_extend) opt->max_chain_extend = 20;
|
||||
if (opt0.drop_ratio == 0.) opt->drop_ratio = .01;
|
||||
} else {
|
||||
if (!opt0.b) opt->b = 7;
|
||||
if (!opt0.w) opt->w = 200;
|
||||
|
|
|
|||
Loading…
Reference in New Issue