r854: improved the calculation of pa
and build pa filtering into BWA-MEM
This commit is contained in:
parent
f5a0e576e6
commit
c982443210
27
bwamem.c
27
bwamem.c
|
|
@ -78,6 +78,7 @@ mem_opt_t *mem_opt_init()
|
|||
o->min_chain_weight = 0;
|
||||
o->max_chain_extend = 1<<30;
|
||||
o->mapQ_coef_len = 50; o->mapQ_coef_fac = log(o->mapQ_coef_len);
|
||||
o->min_pa_ratio = 0.8;
|
||||
bwa_fill_scmat(o->a, o->b, o->mat);
|
||||
return o;
|
||||
}
|
||||
|
|
@ -514,14 +515,18 @@ int mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id
|
|||
int_v z = {0,0,0};
|
||||
if (n == 0) return 0;
|
||||
for (i = n_pri = 0; i < n; ++i) {
|
||||
a[i].sub = 0, a[i].secondary = -1, a[i].hash = hash_64(id+i);
|
||||
a[i].sub = 0, a[i].alt_sc = 0, a[i].secondary = -1, a[i].hash = hash_64(id+i);
|
||||
if (!a[i].is_alt) ++n_pri;
|
||||
}
|
||||
ks_introsort(mem_ars_hash, n, a);
|
||||
mem_mark_primary_se_core(opt, n, a, &z);
|
||||
for (i = 0; i < n; ++i) // don't track the "parent" hit of ALT secondary hits
|
||||
if (a[i].is_alt && a[i].secondary >= 0)
|
||||
a[i].secondary = INT_MAX;
|
||||
for (i = 0; i < n; ++i) {
|
||||
mem_alnreg_t *p = &a[i];
|
||||
if (p->is_alt && p->secondary >= 0) // don't track the "parent" hit of ALT secondary hits
|
||||
p->secondary = INT_MAX;
|
||||
if (!p->is_alt && p->secondary >= 0 && a[p->secondary].is_alt)
|
||||
p->alt_sc = a[p->secondary].score;
|
||||
}
|
||||
if (n_pri > 0 || n_pri != n) {
|
||||
ks_introsort(mem_ars_hash2, n, a);
|
||||
for (i = 0; i < n_pri; ++i) a[i].sub = 0, a[i].secondary = -1;
|
||||
|
|
@ -808,7 +813,7 @@ void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq
|
|||
if (p->rid >= 0) { // with coordinate
|
||||
kputs(bns->anns[p->rid].name, str); kputc('\t', str); // RNAME
|
||||
kputl(p->pos + 1, str); kputc('\t', str); // POS
|
||||
kputw(p->mapq, str); kputc('\t', str); // MAPQ
|
||||
kputw(p->score < opt->min_pa_ratio * p->alt_sc? 0 : p->mapq, str); kputc('\t', str); // MAPQ
|
||||
if (p->n_cigar) { // aligned
|
||||
for (i = 0; i < p->n_cigar; ++i) {
|
||||
int c = p->cigar[i]&0xf;
|
||||
|
|
@ -880,13 +885,11 @@ void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq
|
|||
for (i = 0; i < n; ++i)
|
||||
if (i != which && !(list[i].flag&0x100)) break;
|
||||
if (i < n) { // there are other primary hits; output them
|
||||
int pri_alt_sc = -1;
|
||||
kputsn("\tSA:Z:", 6, str);
|
||||
for (i = 0; i < n; ++i) {
|
||||
const mem_aln_t *r = &list[i];
|
||||
int k;
|
||||
if (i == which || (list[i].flag&0x100)) continue; // proceed if: 1) different from the current; 2) not shadowed multi hit
|
||||
if (list[i].is_alt) pri_alt_sc = pri_alt_sc > r->score? pri_alt_sc : r->score;
|
||||
if (i == which || (r->flag&0x100)) continue; // proceed if: 1) different from the current; 2) not shadowed multi hit
|
||||
kputs(bns->anns[r->rid].name, str); kputc(',', str);
|
||||
kputl(r->pos+1, str); kputc(',', str);
|
||||
kputc("+-"[r->is_rev], str); kputc(',', str);
|
||||
|
|
@ -897,9 +900,11 @@ void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq
|
|||
kputc(',', str); kputw(r->NM, str);
|
||||
kputc(';', str);
|
||||
}
|
||||
if (pri_alt_sc > 0)
|
||||
ksprintf(str, "\tpa:f:%.3f", (double)p->score / pri_alt_sc);
|
||||
}
|
||||
if (p->alt_sc > 0)
|
||||
ksprintf(str, "\tpa:f:%.3f", (double)p->score / p->alt_sc);
|
||||
if (p->score < opt->min_pa_ratio * p->alt_sc)
|
||||
ksprintf(str, "\tom:i:%d", p->mapq);
|
||||
}
|
||||
if (p->XA) { kputsn("\tXA:Z:", 6, str); kputs(p->XA, str); }
|
||||
if (s->comment) { kputc('\t', str); kputs(s->comment, str); }
|
||||
|
|
@ -1098,7 +1103,7 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *
|
|||
assert(a.rid == ar->rid);
|
||||
a.pos = pos - bns->anns[a.rid].offset;
|
||||
a.score = ar->score; a.sub = ar->sub > ar->csub? ar->sub : ar->csub;
|
||||
a.is_alt = ar->is_alt;
|
||||
a.is_alt = ar->is_alt; a.alt_sc = ar->alt_sc;
|
||||
free(query);
|
||||
return a;
|
||||
}
|
||||
|
|
|
|||
4
bwamem.h
4
bwamem.h
|
|
@ -48,6 +48,7 @@ typedef struct {
|
|||
float XA_drop_ratio; // when counting hits for the XA tag, ignore alignments with score < XA_drop_ratio * max_score; only effective for the XA tag
|
||||
float mask_level_redun;
|
||||
float mapQ_coef_len;
|
||||
float min_pa_ratio;
|
||||
int mapQ_coef_fac;
|
||||
int max_ins; // when estimating insert size distribution, skip pairs with insert longer than this value
|
||||
int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end
|
||||
|
|
@ -62,6 +63,7 @@ typedef struct {
|
|||
int score; // best local SW score
|
||||
int truesc; // actual score corresponding to the aligned region; possibly smaller than $score
|
||||
int sub; // 2nd best SW score
|
||||
int alt_sc;
|
||||
int csub; // SW score of a tandem hit
|
||||
int sub_n; // approximate number of suboptimal hits
|
||||
int w; // actual band width used in extension
|
||||
|
|
@ -90,7 +92,7 @@ typedef struct { // This struct is only used for the convenience of API.
|
|||
uint32_t *cigar; // CIGAR in the BAM encoding: opLen<<4|op; op to integer mapping: MIDSH=>01234
|
||||
char *XA; // alternative mappings
|
||||
|
||||
int score, sub;
|
||||
int score, sub, alt_sc;
|
||||
} mem_aln_t;
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
|
|
|||
|
|
@ -55,7 +55,7 @@ int main_mem(int argc, char *argv[])
|
|||
|
||||
opt = mem_opt_init();
|
||||
memset(&opt0, 0, sizeof(mem_opt_t));
|
||||
while ((c = getopt(argc, argv, "epaFMCSPHVYk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:y:K:")) >= 0) {
|
||||
while ((c = getopt(argc, argv, "epaFMCSPHVYk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:y:K:g:")) >= 0) {
|
||||
if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1;
|
||||
else if (c == 'x') mode = optarg;
|
||||
else if (c == 'w') opt->w = atoi(optarg), opt0.w = 1;
|
||||
|
|
@ -85,6 +85,7 @@ int main_mem(int argc, char *argv[])
|
|||
else if (c == 'N') opt->max_chain_extend = atoi(optarg), opt0.max_chain_extend = 1;
|
||||
else if (c == 'W') opt->min_chain_weight = atoi(optarg), opt0.min_chain_weight = 1;
|
||||
else if (c == 'y') opt->max_mem_intv = atol(optarg), opt0.max_mem_intv = 1;
|
||||
else if (c == 'g') opt->min_pa_ratio = atof(optarg), opt0.min_pa_ratio = 1;
|
||||
else if (c == 'C') copy_comment = 1;
|
||||
else if (c == 'K') fixed_chunk_size = atoi(optarg);
|
||||
else if (c == 'Q') {
|
||||
|
|
@ -138,7 +139,7 @@ int main_mem(int argc, char *argv[])
|
|||
fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w);
|
||||
fprintf(stderr, " -d INT off-diagonal X-dropoff [%d]\n", opt->zdrop);
|
||||
fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor);
|
||||
fprintf(stderr, " -y INT find MEMs longer than {-k} * {-r} with size less than INT [%ld]\n", (long)opt->max_mem_intv);
|
||||
fprintf(stderr, " -y INT find MEMs longer than {-k} * {-r} with size less than INT (EXPERIMENTAL) [%ld]\n", (long)opt->max_mem_intv);
|
||||
// fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width);
|
||||
fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ);
|
||||
fprintf(stderr, " -D FLOAT drop chains shorter than FLOAT fraction of the longest overlapping chain [%.2f]\n", opt->drop_ratio);
|
||||
|
|
@ -164,6 +165,7 @@ int main_mem(int argc, char *argv[])
|
|||
fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n");
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, " -v INT verbose level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose);
|
||||
fprintf(stderr, " -g FLOAT set mapQ to zero if the ratio of the primary-to-alt scores below FLOAT [%.3f]\n", opt->min_pa_ratio);
|
||||
fprintf(stderr, " -T INT minimum score to output [%d]\n", opt->T);
|
||||
fprintf(stderr, " -h INT if there are <INT hits with score >80%% of the max score, output all in XA [%d]\n", opt->max_hits);
|
||||
fprintf(stderr, " -a output all alignments for SE or unpaired PE\n");
|
||||
|
|
|
|||
Loading…
Reference in New Issue