Merge branch 'dev'
This commit is contained in:
commit
e3adc4275b
|
|
@ -1,3 +1,5 @@
|
|||
[](https://travis-ci.org/lh3/bwa)
|
||||
[](https://drone.io/github.com/lh3/bwa/latest)
|
||||
##Getting started
|
||||
|
||||
git clone https://github.com/lh3/bwa.git
|
||||
|
|
|
|||
|
|
@ -205,15 +205,17 @@ function parse_hit(s, opt)
|
|||
|
||||
function bwa_postalt(args)
|
||||
{
|
||||
var c, opt = { a:1, b:4, o:6, e:1, verbose:3, show_pri:false, recover_mapq:true, min_mapq:10, min_sc:90, max_nm_sc:10, show_ev:false };
|
||||
var c, opt = { a:1, b:4, o:6, e:1, verbose:3, show_pri:false, update_mapq:true, min_mapq:10, min_sc:90, max_nm_sc:10, show_ev:false, min_pa_ratio:1 };
|
||||
|
||||
while ((c = getopt(args, 'Pqev:p:')) != null) {
|
||||
while ((c = getopt(args, 'Pqev:p:r:')) != null) {
|
||||
if (c == 'v') opt.verbose = parseInt(getopt.arg);
|
||||
else if (c == 'p') opt.pre = getopt.arg;
|
||||
else if (c == 'P') opt.show_pri = true;
|
||||
else if (c == 'q') opt.recover_maq = false;
|
||||
else if (c == 'e') opt.show_ev = true;
|
||||
else if (c == 'r') opt.min_pa_ratio = parseFloat(getopt.arg);
|
||||
}
|
||||
if (opt.min_pa_ratio > 1.) opt.min_pa_ratio = 1.;
|
||||
|
||||
if (opt.show_ev && opt.pre == null) {
|
||||
warn("ERROR: option '-p' must be specified if '-e' is applied.");
|
||||
|
|
@ -223,13 +225,14 @@ function bwa_postalt(args)
|
|||
if (args.length == getopt.ind) {
|
||||
print("");
|
||||
print("Usage: k8 bwa-postalt.js [options] <alt.sam> [aln.sam]\n");
|
||||
print("Options: -p STR prefix of file(s) for additional information [null]");
|
||||
print(" PREFIX.ctw - weight of each ALT contig");
|
||||
print(" PREFIX.evi - reads supporting ALT contigs (effective with -e)");
|
||||
print(" -q don't modify mapQ for non-ALTs hit overlapping lifted ALT");
|
||||
print(" -e show reads supporting ALT contigs into file PREFIX.evi");
|
||||
print("Options: -p STR prefix of file(s) for additional information [null]");
|
||||
print(" PREFIX.ctw - weight of each ALT contig");
|
||||
print(" PREFIX.evi - reads supporting ALT contigs (effective with -e)");
|
||||
print(" -e show reads supporting ALT contigs into file PREFIX.evi");
|
||||
print(" -r FLOAT reduce mapQ to 0 if not overlapping lifted best and pa<FLOAT ["+opt.min_pa_ratio+"]");
|
||||
print(" -q don't modify mapQ for non-ALTs hit overlapping lifted ALT");
|
||||
print("");
|
||||
print("Note: This script inspects the XA tag, lifts the mapping positions of ALT hits to");
|
||||
print("Note: This script extracts the XA tag, lifts the mapping positions of ALT hits to");
|
||||
print(" the primary assembly, groups them and then estimates mapQ across groups. If");
|
||||
print(" a non-ALT hit overlaps a lifted ALT hit, its mapping quality is set to the");
|
||||
print(" smaller between its original mapQ and the adjusted mapQ of the ALT hit. If");
|
||||
|
|
@ -480,23 +483,31 @@ function bwa_postalt(args)
|
|||
}
|
||||
|
||||
// check if the reported hit overlaps a hit to the primary assembly; if so, don't reduce mapping quality
|
||||
if (opt.recover_mapq && n_rpt_lifted == 1 && mapQ > 0) {
|
||||
var l = rpt_lifted;
|
||||
if (opt.update_mapq && mapQ > 0 && n_rpt_lifted <= 1) {
|
||||
var l = n_rpt_lifted == 1? rpt_lifted : null;
|
||||
for (var i = 0; i < buf2.length; ++i) {
|
||||
var s = buf2[i];
|
||||
if (l[0] != s[2]) continue; // different chr
|
||||
if (((s[1]&16) != 0) != l[1]) continue; // different strand
|
||||
var start = s[3] - 1, end = start;
|
||||
while ((m = re_cigar.exec(t[5])) != null)
|
||||
if (m[2] == 'M' || m[2] == 'D' || m[2] == 'N')
|
||||
end += parseInt(m[1]);
|
||||
if (start < l[3] && l[2] < end) {
|
||||
var om = -1;
|
||||
for (var j = 11; j < s.length; ++j)
|
||||
if ((m = /^om:i:(\d+)/.exec(s[j])) != null)
|
||||
om = parseInt(m[1]);
|
||||
var s = buf2[i], is_ovlp = true;
|
||||
if (l != null) {
|
||||
if (l[0] != s[2]) is_ovlp = false; // different chr
|
||||
if (((s[1]&16) != 0) != l[1]) is_ovlp = false; // different strand
|
||||
var start = s[3] - 1, end = start;
|
||||
while ((m = re_cigar.exec(t[5])) != null)
|
||||
if (m[2] == 'M' || m[2] == 'D' || m[2] == 'N')
|
||||
end += parseInt(m[1]);
|
||||
if (!(start < l[3] && l[2] < end)) is_ovlp = false; // no overlap
|
||||
} else is_ovlp = false;
|
||||
var om = -1, pa = 10.;
|
||||
for (var j = 11; j < s.length; ++j)
|
||||
if ((m = /^om:i:(\d+)/.exec(s[j])) != null)
|
||||
om = parseInt(m[1]);
|
||||
else if ((m = /^pa:f:(\S+)/.exec(s[j])) != null)
|
||||
pa = parseFloat(m[1]);
|
||||
if (is_ovlp) { // overlapping the lifted hit
|
||||
if (om > 0) s[4] = om;
|
||||
s[4] = s[4] < mapQ? s[4] : mapQ;
|
||||
} else if (pa < opt.min_pa_ratio) { // not overlapping; has a small pa
|
||||
if (om < 0) s.push("om:i:" + s[4]);
|
||||
s[4] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
28
bwamem.c
28
bwamem.c
|
|
@ -59,7 +59,7 @@ mem_opt_t *mem_opt_init()
|
|||
o->pen_unpaired = 17;
|
||||
o->pen_clip5 = o->pen_clip3 = 5;
|
||||
|
||||
o->max_mem_intv = 0;
|
||||
o->max_mem_intv = 10;
|
||||
|
||||
o->min_seed_len = 19;
|
||||
o->split_width = 10;
|
||||
|
|
@ -79,7 +79,6 @@ mem_opt_t *mem_opt_init()
|
|||
o->min_chain_weight = 0;
|
||||
o->max_chain_extend = 1<<30;
|
||||
o->mapQ_coef_len = 50; o->mapQ_coef_fac = log(o->mapQ_coef_len);
|
||||
o->min_pa_ratio = 0.8;
|
||||
bwa_fill_scmat(o->a, o->b, o->mat);
|
||||
return o;
|
||||
}
|
||||
|
|
@ -121,7 +120,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co
|
|||
// first pass: find all SMEMs
|
||||
while (x < len) {
|
||||
if (seq[x] < 4) {
|
||||
x = bwt_smem1a(bwt, len, seq, x, start_width, split_len, opt->max_mem_intv, &a->mem1, a->tmpv);
|
||||
x = bwt_smem1(bwt, len, seq, x, start_width, &a->mem1, a->tmpv);
|
||||
for (i = 0; i < a->mem1.n; ++i) {
|
||||
bwtintv_t *p = &a->mem1.a[i];
|
||||
int slen = (uint32_t)p->info - (p->info>>32); // seed length
|
||||
|
|
@ -131,7 +130,6 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co
|
|||
} else ++x;
|
||||
}
|
||||
// second pass: find MEMs inside a long SMEM
|
||||
if (opt->max_mem_intv > 0) goto sort_intv;
|
||||
old_n = a->mem.n;
|
||||
for (k = 0; k < old_n; ++k) {
|
||||
bwtintv_t *p = &a->mem.a[k];
|
||||
|
|
@ -142,8 +140,24 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co
|
|||
if ((uint32_t)a->mem1.a[i].info - (a->mem1.a[i].info>>32) >= opt->min_seed_len)
|
||||
kv_push(bwtintv_t, a->mem, a->mem1.a[i]);
|
||||
}
|
||||
// third pass: LAST-like
|
||||
if (opt->max_mem_intv > 0) {
|
||||
x = 0;
|
||||
while (x < len) {
|
||||
if (seq[x] < 4) {
|
||||
if (1) {
|
||||
bwtintv_t m;
|
||||
x = bwt_seed_strategy1(bwt, len, seq, x, opt->max_mem_intv, &m);
|
||||
if (m.x[2] > 0) kv_push(bwtintv_t, a->mem, m);
|
||||
} else { // for now, we never come to this block which is slower
|
||||
x = bwt_smem1a(bwt, len, seq, x, start_width, opt->max_mem_intv, &a->mem1, a->tmpv);
|
||||
for (i = 0; i < a->mem1.n; ++i)
|
||||
kv_push(bwtintv_t, a->mem, a->mem1.a[i]);
|
||||
}
|
||||
} else ++x;
|
||||
}
|
||||
}
|
||||
// sort
|
||||
sort_intv:
|
||||
ks_introsort(mem_intv, a->mem.n, a->mem.a);
|
||||
}
|
||||
|
||||
|
|
@ -826,7 +840,7 @@ void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq
|
|||
if (p->rid >= 0) { // with coordinate
|
||||
kputs(bns->anns[p->rid].name, str); kputc('\t', str); // RNAME
|
||||
kputl(p->pos + 1, str); kputc('\t', str); // POS
|
||||
kputw(p->score < opt->min_pa_ratio * p->alt_sc? 0 : p->mapq, str); kputc('\t', str); // MAPQ
|
||||
kputw(p->mapq, str); kputc('\t', str); // MAPQ
|
||||
if (p->n_cigar) { // aligned
|
||||
for (i = 0; i < p->n_cigar; ++i) {
|
||||
int c = p->cigar[i]&0xf;
|
||||
|
|
@ -916,8 +930,6 @@ void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq
|
|||
}
|
||||
if (p->alt_sc > 0)
|
||||
ksprintf(str, "\tpa:f:%.3f", (double)p->score / p->alt_sc);
|
||||
if (p->score < opt->min_pa_ratio * p->alt_sc)
|
||||
ksprintf(str, "\tom:i:%d", p->mapq);
|
||||
}
|
||||
if (p->XA) { kputsn("\tXA:Z:", 6, str); kputs(p->XA, str); }
|
||||
if (s->comment) { kputc('\t', str); kputs(s->comment, str); }
|
||||
|
|
|
|||
1
bwamem.h
1
bwamem.h
|
|
@ -48,7 +48,6 @@ typedef struct {
|
|||
float XA_drop_ratio; // when counting hits for the XA tag, ignore alignments with score < XA_drop_ratio * max_score; only effective for the XA tag
|
||||
float mask_level_redun;
|
||||
float mapQ_coef_len;
|
||||
float min_pa_ratio;
|
||||
int mapQ_coef_fac;
|
||||
int max_ins; // when estimating insert size distribution, skip pairs with insert longer than this value
|
||||
int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end
|
||||
|
|
|
|||
|
|
@ -65,7 +65,7 @@ const bwtintv_v *smem_next(smem_i *itr)
|
|||
while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases
|
||||
if (itr->start == itr->len) return 0;
|
||||
ori_start = itr->start;
|
||||
itr->start = bwt_smem1a(itr->bwt, itr->len, itr->query, ori_start, itr->min_intv, itr->max_len, itr->max_intv, itr->matches, itr->tmpvec); // search for SMEM
|
||||
itr->start = bwt_smem1a(itr->bwt, itr->len, itr->query, ori_start, itr->min_intv, itr->max_intv, itr->matches, itr->tmpvec); // search for SMEM
|
||||
return itr->matches;
|
||||
}
|
||||
|
||||
|
|
|
|||
44
bwt.c
44
bwt.c
|
|
@ -285,8 +285,8 @@ static void bwt_reverse_intvs(bwtintv_v *p)
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, int max_len, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
|
||||
// NOTE: $max_intv is not currently used in BWA-MEM
|
||||
int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
|
||||
{
|
||||
int i, j, c, ret;
|
||||
bwtintv_t ik, ok[4];
|
||||
|
|
@ -302,13 +302,15 @@ int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv,
|
|||
ik.info = x + 1;
|
||||
|
||||
for (i = x + 1, curr->n = 0; i < len; ++i) { // forward search
|
||||
if (q[i] < 4) { // an A/C/G/T base
|
||||
if (ik.x[2] < max_intv) { // an interval small enough
|
||||
kv_push(bwtintv_t, *curr, ik);
|
||||
break;
|
||||
} else if (q[i] < 4) { // an A/C/G/T base
|
||||
c = 3 - q[i]; // complement of q[i]
|
||||
bwt_extend(bwt, &ik, ok, 0);
|
||||
if (ok[c].x[2] != ik.x[2]) { // change of the interval size
|
||||
kv_push(bwtintv_t, *curr, ik);
|
||||
if (ok[c].x[2] < min_intv) break; // the interval size is too small to be extended further
|
||||
if (i - x > max_len && ik.x[2] < max_intv) break;
|
||||
}
|
||||
ik = ok[c]; ik.info = i + 1;
|
||||
} else { // an ambiguous base
|
||||
|
|
@ -325,13 +327,8 @@ int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv,
|
|||
c = i < 0? -1 : q[i] < 4? q[i] : -1; // c==-1 if i<0 or q[i] is an ambiguous base
|
||||
for (j = 0, curr->n = 0; j < prev->n; ++j) {
|
||||
bwtintv_t *p = &prev->a[j];
|
||||
int max_stop = 0;
|
||||
bwt_extend(bwt, p, ok, 1);
|
||||
if (ok[c].x[2] != p->x[2]) { // change of interval size
|
||||
if (p->info - i - 1 > max_len && p->x[2] < max_intv)
|
||||
max_stop = 1;
|
||||
}
|
||||
if (c < 0 || ok[c].x[2] < min_intv || max_stop) { // keep the hit if reaching the beginning or an ambiguous base or the intv is small enough
|
||||
if (c >= 0 && ik.x[2] >= max_intv) bwt_extend(bwt, p, ok, 1);
|
||||
if (c < 0 || ik.x[2] < max_intv || ok[c].x[2] < min_intv) { // keep the hit if reaching the beginning or an ambiguous base or the intv is small enough
|
||||
if (curr->n == 0) { // test curr->n>0 to make sure there are no longer matches
|
||||
if (mem->n == 0 || i + 1 < mem->a[mem->n-1].info>>32) { // skip contained matches
|
||||
ik = *p; ik.info |= (uint64_t)(i + 1)<<32;
|
||||
|
|
@ -355,7 +352,30 @@ int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv,
|
|||
|
||||
int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
|
||||
{
|
||||
return bwt_smem1a(bwt, len, q, x, min_intv, INT_MAX, 0, mem, tmpvec);
|
||||
return bwt_smem1a(bwt, len, q, x, min_intv, 0, mem, tmpvec);
|
||||
}
|
||||
|
||||
int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int max_intv, bwtintv_t *mem)
|
||||
{
|
||||
int i, c;
|
||||
bwtintv_t ik, ok[4];
|
||||
|
||||
memset(mem, 0, sizeof(bwtintv_t));
|
||||
if (q[x] > 3) return x + 1;
|
||||
bwt_set_intv(bwt, q[x], ik); // the initial interval of a single base
|
||||
for (i = x + 1; i < len; ++i) { // forward search
|
||||
if (q[i] < 4) { // an A/C/G/T base
|
||||
c = 3 - q[i]; // complement of q[i]
|
||||
bwt_extend(bwt, &ik, ok, 0);
|
||||
if (ok[c].x[2] < max_intv) {
|
||||
*mem = ok[c];
|
||||
mem->info = (uint64_t)x<<32 | (i + 1);
|
||||
return i + 1;
|
||||
}
|
||||
ik = ok[c];
|
||||
} else return i + 1;
|
||||
}
|
||||
return len;
|
||||
}
|
||||
|
||||
/*************************
|
||||
|
|
|
|||
4
bwt.h
4
bwt.h
|
|
@ -119,9 +119,9 @@ extern "C" {
|
|||
* Return the end of the longest exact match starting from _x_.
|
||||
*/
|
||||
int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
|
||||
int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, int max_len, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
|
||||
int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
|
||||
|
||||
// SMEM iterator interface
|
||||
int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int max_intv, bwtintv_t *mem);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
|||
|
|
@ -55,7 +55,7 @@ int main_mem(int argc, char *argv[])
|
|||
|
||||
opt = mem_opt_init();
|
||||
memset(&opt0, 0, sizeof(mem_opt_t));
|
||||
while ((c = getopt(argc, argv, "epaFMCSPHVYjk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:y:K:g:")) >= 0) {
|
||||
while ((c = getopt(argc, argv, "epaFMCSPHVYjk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:y:K:")) >= 0) {
|
||||
if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1;
|
||||
else if (c == 'x') mode = optarg;
|
||||
else if (c == 'w') opt->w = atoi(optarg), opt0.w = 1;
|
||||
|
|
@ -85,7 +85,6 @@ int main_mem(int argc, char *argv[])
|
|||
else if (c == 'N') opt->max_chain_extend = atoi(optarg), opt0.max_chain_extend = 1;
|
||||
else if (c == 'W') opt->min_chain_weight = atoi(optarg), opt0.min_chain_weight = 1;
|
||||
else if (c == 'y') opt->max_mem_intv = atol(optarg), opt0.max_mem_intv = 1;
|
||||
else if (c == 'g') opt->min_pa_ratio = atof(optarg), opt0.min_pa_ratio = 1;
|
||||
else if (c == 'C') copy_comment = 1;
|
||||
else if (c == 'K') fixed_chunk_size = atoi(optarg);
|
||||
else if (c == 'h') {
|
||||
|
|
@ -145,7 +144,7 @@ int main_mem(int argc, char *argv[])
|
|||
fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w);
|
||||
fprintf(stderr, " -d INT off-diagonal X-dropoff [%d]\n", opt->zdrop);
|
||||
fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor);
|
||||
fprintf(stderr, " -y INT find MEMs longer than {-k} * {-r} with size less than INT (EXPERIMENTAL) [%ld]\n", (long)opt->max_mem_intv);
|
||||
fprintf(stderr, " -y INT seed occurrence for the 3rd round seeding [%ld]\n", (long)opt->max_mem_intv);
|
||||
// fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width);
|
||||
fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ);
|
||||
fprintf(stderr, " -D FLOAT drop chains shorter than FLOAT fraction of the longest overlapping chain [%.2f]\n", opt->drop_ratio);
|
||||
|
|
@ -172,7 +171,6 @@ int main_mem(int argc, char *argv[])
|
|||
fprintf(stderr, " -j ignore ALT contigs\n");
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, " -v INT verbose level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose);
|
||||
fprintf(stderr, " -g FLOAT set mapQ to zero if the ratio of the primary-to-alt scores below FLOAT [%.3f]\n", opt->min_pa_ratio);
|
||||
fprintf(stderr, " -T INT minimum score to output [%d]\n", opt->T);
|
||||
fprintf(stderr, " -h INT[,INT] if there are <INT hits with score >80%% of the max score, output all in XA [%d,%d]\n", opt->max_XA_hits, opt->max_XA_hits_alt);
|
||||
fprintf(stderr, " -a output all alignments for SE or unpaired PE\n");
|
||||
|
|
|
|||
Loading…
Reference in New Issue