Merge branch 'dev'

This commit is contained in:
Heng Li 2014-10-21 00:23:39 -04:00
commit e3adc4275b
9 changed files with 93 additions and 51 deletions

View File

@ -1,3 +1,5 @@
[![Build Status](https://travis-ci.org/lh3/bwa.svg?branch=dev)](https://travis-ci.org/lh3/bwa)
[![Build Status](https://drone.io/github.com/lh3/bwa/status.png)](https://drone.io/github.com/lh3/bwa/latest)
##Getting started
git clone https://github.com/lh3/bwa.git

View File

@ -205,15 +205,17 @@ function parse_hit(s, opt)
function bwa_postalt(args)
{
var c, opt = { a:1, b:4, o:6, e:1, verbose:3, show_pri:false, recover_mapq:true, min_mapq:10, min_sc:90, max_nm_sc:10, show_ev:false };
var c, opt = { a:1, b:4, o:6, e:1, verbose:3, show_pri:false, update_mapq:true, min_mapq:10, min_sc:90, max_nm_sc:10, show_ev:false, min_pa_ratio:1 };
while ((c = getopt(args, 'Pqev:p:')) != null) {
while ((c = getopt(args, 'Pqev:p:r:')) != null) {
if (c == 'v') opt.verbose = parseInt(getopt.arg);
else if (c == 'p') opt.pre = getopt.arg;
else if (c == 'P') opt.show_pri = true;
else if (c == 'q') opt.recover_maq = false;
else if (c == 'e') opt.show_ev = true;
else if (c == 'r') opt.min_pa_ratio = parseFloat(getopt.arg);
}
if (opt.min_pa_ratio > 1.) opt.min_pa_ratio = 1.;
if (opt.show_ev && opt.pre == null) {
warn("ERROR: option '-p' must be specified if '-e' is applied.");
@ -223,13 +225,14 @@ function bwa_postalt(args)
if (args.length == getopt.ind) {
print("");
print("Usage: k8 bwa-postalt.js [options] <alt.sam> [aln.sam]\n");
print("Options: -p STR prefix of file(s) for additional information [null]");
print(" PREFIX.ctw - weight of each ALT contig");
print(" PREFIX.evi - reads supporting ALT contigs (effective with -e)");
print(" -q don't modify mapQ for non-ALTs hit overlapping lifted ALT");
print(" -e show reads supporting ALT contigs into file PREFIX.evi");
print("Options: -p STR prefix of file(s) for additional information [null]");
print(" PREFIX.ctw - weight of each ALT contig");
print(" PREFIX.evi - reads supporting ALT contigs (effective with -e)");
print(" -e show reads supporting ALT contigs into file PREFIX.evi");
print(" -r FLOAT reduce mapQ to 0 if not overlapping lifted best and pa<FLOAT ["+opt.min_pa_ratio+"]");
print(" -q don't modify mapQ for non-ALTs hit overlapping lifted ALT");
print("");
print("Note: This script inspects the XA tag, lifts the mapping positions of ALT hits to");
print("Note: This script extracts the XA tag, lifts the mapping positions of ALT hits to");
print(" the primary assembly, groups them and then estimates mapQ across groups. If");
print(" a non-ALT hit overlaps a lifted ALT hit, its mapping quality is set to the");
print(" smaller between its original mapQ and the adjusted mapQ of the ALT hit. If");
@ -480,23 +483,31 @@ function bwa_postalt(args)
}
// check if the reported hit overlaps a hit to the primary assembly; if so, don't reduce mapping quality
if (opt.recover_mapq && n_rpt_lifted == 1 && mapQ > 0) {
var l = rpt_lifted;
if (opt.update_mapq && mapQ > 0 && n_rpt_lifted <= 1) {
var l = n_rpt_lifted == 1? rpt_lifted : null;
for (var i = 0; i < buf2.length; ++i) {
var s = buf2[i];
if (l[0] != s[2]) continue; // different chr
if (((s[1]&16) != 0) != l[1]) continue; // different strand
var start = s[3] - 1, end = start;
while ((m = re_cigar.exec(t[5])) != null)
if (m[2] == 'M' || m[2] == 'D' || m[2] == 'N')
end += parseInt(m[1]);
if (start < l[3] && l[2] < end) {
var om = -1;
for (var j = 11; j < s.length; ++j)
if ((m = /^om:i:(\d+)/.exec(s[j])) != null)
om = parseInt(m[1]);
var s = buf2[i], is_ovlp = true;
if (l != null) {
if (l[0] != s[2]) is_ovlp = false; // different chr
if (((s[1]&16) != 0) != l[1]) is_ovlp = false; // different strand
var start = s[3] - 1, end = start;
while ((m = re_cigar.exec(t[5])) != null)
if (m[2] == 'M' || m[2] == 'D' || m[2] == 'N')
end += parseInt(m[1]);
if (!(start < l[3] && l[2] < end)) is_ovlp = false; // no overlap
} else is_ovlp = false;
var om = -1, pa = 10.;
for (var j = 11; j < s.length; ++j)
if ((m = /^om:i:(\d+)/.exec(s[j])) != null)
om = parseInt(m[1]);
else if ((m = /^pa:f:(\S+)/.exec(s[j])) != null)
pa = parseFloat(m[1]);
if (is_ovlp) { // overlapping the lifted hit
if (om > 0) s[4] = om;
s[4] = s[4] < mapQ? s[4] : mapQ;
} else if (pa < opt.min_pa_ratio) { // not overlapping; has a small pa
if (om < 0) s.push("om:i:" + s[4]);
s[4] = 0;
}
}
}

View File

@ -59,7 +59,7 @@ mem_opt_t *mem_opt_init()
o->pen_unpaired = 17;
o->pen_clip5 = o->pen_clip3 = 5;
o->max_mem_intv = 0;
o->max_mem_intv = 10;
o->min_seed_len = 19;
o->split_width = 10;
@ -79,7 +79,6 @@ mem_opt_t *mem_opt_init()
o->min_chain_weight = 0;
o->max_chain_extend = 1<<30;
o->mapQ_coef_len = 50; o->mapQ_coef_fac = log(o->mapQ_coef_len);
o->min_pa_ratio = 0.8;
bwa_fill_scmat(o->a, o->b, o->mat);
return o;
}
@ -121,7 +120,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co
// first pass: find all SMEMs
while (x < len) {
if (seq[x] < 4) {
x = bwt_smem1a(bwt, len, seq, x, start_width, split_len, opt->max_mem_intv, &a->mem1, a->tmpv);
x = bwt_smem1(bwt, len, seq, x, start_width, &a->mem1, a->tmpv);
for (i = 0; i < a->mem1.n; ++i) {
bwtintv_t *p = &a->mem1.a[i];
int slen = (uint32_t)p->info - (p->info>>32); // seed length
@ -131,7 +130,6 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co
} else ++x;
}
// second pass: find MEMs inside a long SMEM
if (opt->max_mem_intv > 0) goto sort_intv;
old_n = a->mem.n;
for (k = 0; k < old_n; ++k) {
bwtintv_t *p = &a->mem.a[k];
@ -142,8 +140,24 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co
if ((uint32_t)a->mem1.a[i].info - (a->mem1.a[i].info>>32) >= opt->min_seed_len)
kv_push(bwtintv_t, a->mem, a->mem1.a[i]);
}
// third pass: LAST-like
if (opt->max_mem_intv > 0) {
x = 0;
while (x < len) {
if (seq[x] < 4) {
if (1) {
bwtintv_t m;
x = bwt_seed_strategy1(bwt, len, seq, x, opt->max_mem_intv, &m);
if (m.x[2] > 0) kv_push(bwtintv_t, a->mem, m);
} else { // for now, we never come to this block which is slower
x = bwt_smem1a(bwt, len, seq, x, start_width, opt->max_mem_intv, &a->mem1, a->tmpv);
for (i = 0; i < a->mem1.n; ++i)
kv_push(bwtintv_t, a->mem, a->mem1.a[i]);
}
} else ++x;
}
}
// sort
sort_intv:
ks_introsort(mem_intv, a->mem.n, a->mem.a);
}
@ -826,7 +840,7 @@ void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq
if (p->rid >= 0) { // with coordinate
kputs(bns->anns[p->rid].name, str); kputc('\t', str); // RNAME
kputl(p->pos + 1, str); kputc('\t', str); // POS
kputw(p->score < opt->min_pa_ratio * p->alt_sc? 0 : p->mapq, str); kputc('\t', str); // MAPQ
kputw(p->mapq, str); kputc('\t', str); // MAPQ
if (p->n_cigar) { // aligned
for (i = 0; i < p->n_cigar; ++i) {
int c = p->cigar[i]&0xf;
@ -916,8 +930,6 @@ void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq
}
if (p->alt_sc > 0)
ksprintf(str, "\tpa:f:%.3f", (double)p->score / p->alt_sc);
if (p->score < opt->min_pa_ratio * p->alt_sc)
ksprintf(str, "\tom:i:%d", p->mapq);
}
if (p->XA) { kputsn("\tXA:Z:", 6, str); kputs(p->XA, str); }
if (s->comment) { kputc('\t', str); kputs(s->comment, str); }

View File

@ -48,7 +48,6 @@ typedef struct {
float XA_drop_ratio; // when counting hits for the XA tag, ignore alignments with score < XA_drop_ratio * max_score; only effective for the XA tag
float mask_level_redun;
float mapQ_coef_len;
float min_pa_ratio;
int mapQ_coef_fac;
int max_ins; // when estimating insert size distribution, skip pairs with insert longer than this value
int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end

View File

@ -65,7 +65,7 @@ const bwtintv_v *smem_next(smem_i *itr)
while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases
if (itr->start == itr->len) return 0;
ori_start = itr->start;
itr->start = bwt_smem1a(itr->bwt, itr->len, itr->query, ori_start, itr->min_intv, itr->max_len, itr->max_intv, itr->matches, itr->tmpvec); // search for SMEM
itr->start = bwt_smem1a(itr->bwt, itr->len, itr->query, ori_start, itr->min_intv, itr->max_intv, itr->matches, itr->tmpvec); // search for SMEM
return itr->matches;
}

44
bwt.c
View File

@ -285,8 +285,8 @@ static void bwt_reverse_intvs(bwtintv_v *p)
}
}
}
int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, int max_len, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
// NOTE: $max_intv is not currently used in BWA-MEM
int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
{
int i, j, c, ret;
bwtintv_t ik, ok[4];
@ -302,13 +302,15 @@ int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv,
ik.info = x + 1;
for (i = x + 1, curr->n = 0; i < len; ++i) { // forward search
if (q[i] < 4) { // an A/C/G/T base
if (ik.x[2] < max_intv) { // an interval small enough
kv_push(bwtintv_t, *curr, ik);
break;
} else if (q[i] < 4) { // an A/C/G/T base
c = 3 - q[i]; // complement of q[i]
bwt_extend(bwt, &ik, ok, 0);
if (ok[c].x[2] != ik.x[2]) { // change of the interval size
kv_push(bwtintv_t, *curr, ik);
if (ok[c].x[2] < min_intv) break; // the interval size is too small to be extended further
if (i - x > max_len && ik.x[2] < max_intv) break;
}
ik = ok[c]; ik.info = i + 1;
} else { // an ambiguous base
@ -325,13 +327,8 @@ int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv,
c = i < 0? -1 : q[i] < 4? q[i] : -1; // c==-1 if i<0 or q[i] is an ambiguous base
for (j = 0, curr->n = 0; j < prev->n; ++j) {
bwtintv_t *p = &prev->a[j];
int max_stop = 0;
bwt_extend(bwt, p, ok, 1);
if (ok[c].x[2] != p->x[2]) { // change of interval size
if (p->info - i - 1 > max_len && p->x[2] < max_intv)
max_stop = 1;
}
if (c < 0 || ok[c].x[2] < min_intv || max_stop) { // keep the hit if reaching the beginning or an ambiguous base or the intv is small enough
if (c >= 0 && ik.x[2] >= max_intv) bwt_extend(bwt, p, ok, 1);
if (c < 0 || ik.x[2] < max_intv || ok[c].x[2] < min_intv) { // keep the hit if reaching the beginning or an ambiguous base or the intv is small enough
if (curr->n == 0) { // test curr->n>0 to make sure there are no longer matches
if (mem->n == 0 || i + 1 < mem->a[mem->n-1].info>>32) { // skip contained matches
ik = *p; ik.info |= (uint64_t)(i + 1)<<32;
@ -355,7 +352,30 @@ int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv,
int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
{
return bwt_smem1a(bwt, len, q, x, min_intv, INT_MAX, 0, mem, tmpvec);
return bwt_smem1a(bwt, len, q, x, min_intv, 0, mem, tmpvec);
}
int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int max_intv, bwtintv_t *mem)
{
int i, c;
bwtintv_t ik, ok[4];
memset(mem, 0, sizeof(bwtintv_t));
if (q[x] > 3) return x + 1;
bwt_set_intv(bwt, q[x], ik); // the initial interval of a single base
for (i = x + 1; i < len; ++i) { // forward search
if (q[i] < 4) { // an A/C/G/T base
c = 3 - q[i]; // complement of q[i]
bwt_extend(bwt, &ik, ok, 0);
if (ok[c].x[2] < max_intv) {
*mem = ok[c];
mem->info = (uint64_t)x<<32 | (i + 1);
return i + 1;
}
ik = ok[c];
} else return i + 1;
}
return len;
}
/*************************

4
bwt.h
View File

@ -119,9 +119,9 @@ extern "C" {
* Return the end of the longest exact match starting from _x_.
*/
int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, int max_len, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
// SMEM iterator interface
int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int max_intv, bwtintv_t *mem);
#ifdef __cplusplus
}

View File

@ -55,7 +55,7 @@ int main_mem(int argc, char *argv[])
opt = mem_opt_init();
memset(&opt0, 0, sizeof(mem_opt_t));
while ((c = getopt(argc, argv, "epaFMCSPHVYjk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:y:K:g:")) >= 0) {
while ((c = getopt(argc, argv, "epaFMCSPHVYjk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:y:K:")) >= 0) {
if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1;
else if (c == 'x') mode = optarg;
else if (c == 'w') opt->w = atoi(optarg), opt0.w = 1;
@ -85,7 +85,6 @@ int main_mem(int argc, char *argv[])
else if (c == 'N') opt->max_chain_extend = atoi(optarg), opt0.max_chain_extend = 1;
else if (c == 'W') opt->min_chain_weight = atoi(optarg), opt0.min_chain_weight = 1;
else if (c == 'y') opt->max_mem_intv = atol(optarg), opt0.max_mem_intv = 1;
else if (c == 'g') opt->min_pa_ratio = atof(optarg), opt0.min_pa_ratio = 1;
else if (c == 'C') copy_comment = 1;
else if (c == 'K') fixed_chunk_size = atoi(optarg);
else if (c == 'h') {
@ -145,7 +144,7 @@ int main_mem(int argc, char *argv[])
fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w);
fprintf(stderr, " -d INT off-diagonal X-dropoff [%d]\n", opt->zdrop);
fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor);
fprintf(stderr, " -y INT find MEMs longer than {-k} * {-r} with size less than INT (EXPERIMENTAL) [%ld]\n", (long)opt->max_mem_intv);
fprintf(stderr, " -y INT seed occurrence for the 3rd round seeding [%ld]\n", (long)opt->max_mem_intv);
// fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width);
fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ);
fprintf(stderr, " -D FLOAT drop chains shorter than FLOAT fraction of the longest overlapping chain [%.2f]\n", opt->drop_ratio);
@ -172,7 +171,6 @@ int main_mem(int argc, char *argv[])
fprintf(stderr, " -j ignore ALT contigs\n");
fprintf(stderr, "\n");
fprintf(stderr, " -v INT verbose level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose);
fprintf(stderr, " -g FLOAT set mapQ to zero if the ratio of the primary-to-alt scores below FLOAT [%.3f]\n", opt->min_pa_ratio);
fprintf(stderr, " -T INT minimum score to output [%d]\n", opt->T);
fprintf(stderr, " -h INT[,INT] if there are <INT hits with score >80%% of the max score, output all in XA [%d,%d]\n", opt->max_XA_hits, opt->max_XA_hits_alt);
fprintf(stderr, " -a output all alignments for SE or unpaired PE\n");

2
main.c
View File

@ -4,7 +4,7 @@
#include "utils.h"
#ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "0.7.10-r915-dirty"
#define PACKAGE_VERSION "0.7.10-r933-dirty"
#endif
int bwa_fa2pac(int argc, char *argv[]);