diff --git a/README.md b/README.md index cfc84c6..f81715c 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ +[![Build Status](https://travis-ci.org/lh3/bwa.svg?branch=dev)](https://travis-ci.org/lh3/bwa) +[![Build Status](https://drone.io/github.com/lh3/bwa/status.png)](https://drone.io/github.com/lh3/bwa/latest) ##Getting started git clone https://github.com/lh3/bwa.git diff --git a/bwa-postalt.js b/bwa-postalt.js index 411cfdd..a02b3e0 100644 --- a/bwa-postalt.js +++ b/bwa-postalt.js @@ -205,15 +205,17 @@ function parse_hit(s, opt) function bwa_postalt(args) { - var c, opt = { a:1, b:4, o:6, e:1, verbose:3, show_pri:false, recover_mapq:true, min_mapq:10, min_sc:90, max_nm_sc:10, show_ev:false }; + var c, opt = { a:1, b:4, o:6, e:1, verbose:3, show_pri:false, update_mapq:true, min_mapq:10, min_sc:90, max_nm_sc:10, show_ev:false, min_pa_ratio:1 }; - while ((c = getopt(args, 'Pqev:p:')) != null) { + while ((c = getopt(args, 'Pqev:p:r:')) != null) { if (c == 'v') opt.verbose = parseInt(getopt.arg); else if (c == 'p') opt.pre = getopt.arg; else if (c == 'P') opt.show_pri = true; else if (c == 'q') opt.recover_maq = false; else if (c == 'e') opt.show_ev = true; + else if (c == 'r') opt.min_pa_ratio = parseFloat(getopt.arg); } + if (opt.min_pa_ratio > 1.) opt.min_pa_ratio = 1.; if (opt.show_ev && opt.pre == null) { warn("ERROR: option '-p' must be specified if '-e' is applied."); @@ -223,13 +225,14 @@ function bwa_postalt(args) if (args.length == getopt.ind) { print(""); print("Usage: k8 bwa-postalt.js [options] [aln.sam]\n"); - print("Options: -p STR prefix of file(s) for additional information [null]"); - print(" PREFIX.ctw - weight of each ALT contig"); - print(" PREFIX.evi - reads supporting ALT contigs (effective with -e)"); - print(" -q don't modify mapQ for non-ALTs hit overlapping lifted ALT"); - print(" -e show reads supporting ALT contigs into file PREFIX.evi"); + print("Options: -p STR prefix of file(s) for additional information [null]"); + print(" PREFIX.ctw - weight of each ALT contig"); + print(" PREFIX.evi - reads supporting ALT contigs (effective with -e)"); + print(" -e show reads supporting ALT contigs into file PREFIX.evi"); + print(" -r FLOAT reduce mapQ to 0 if not overlapping lifted best and pa 0) { - var l = rpt_lifted; + if (opt.update_mapq && mapQ > 0 && n_rpt_lifted <= 1) { + var l = n_rpt_lifted == 1? rpt_lifted : null; for (var i = 0; i < buf2.length; ++i) { - var s = buf2[i]; - if (l[0] != s[2]) continue; // different chr - if (((s[1]&16) != 0) != l[1]) continue; // different strand - var start = s[3] - 1, end = start; - while ((m = re_cigar.exec(t[5])) != null) - if (m[2] == 'M' || m[2] == 'D' || m[2] == 'N') - end += parseInt(m[1]); - if (start < l[3] && l[2] < end) { - var om = -1; - for (var j = 11; j < s.length; ++j) - if ((m = /^om:i:(\d+)/.exec(s[j])) != null) - om = parseInt(m[1]); + var s = buf2[i], is_ovlp = true; + if (l != null) { + if (l[0] != s[2]) is_ovlp = false; // different chr + if (((s[1]&16) != 0) != l[1]) is_ovlp = false; // different strand + var start = s[3] - 1, end = start; + while ((m = re_cigar.exec(t[5])) != null) + if (m[2] == 'M' || m[2] == 'D' || m[2] == 'N') + end += parseInt(m[1]); + if (!(start < l[3] && l[2] < end)) is_ovlp = false; // no overlap + } else is_ovlp = false; + var om = -1, pa = 10.; + for (var j = 11; j < s.length; ++j) + if ((m = /^om:i:(\d+)/.exec(s[j])) != null) + om = parseInt(m[1]); + else if ((m = /^pa:f:(\S+)/.exec(s[j])) != null) + pa = parseFloat(m[1]); + if (is_ovlp) { // overlapping the lifted hit if (om > 0) s[4] = om; s[4] = s[4] < mapQ? s[4] : mapQ; + } else if (pa < opt.min_pa_ratio) { // not overlapping; has a small pa + if (om < 0) s.push("om:i:" + s[4]); + s[4] = 0; } } } diff --git a/bwamem.c b/bwamem.c index 79785f4..3fd008f 100644 --- a/bwamem.c +++ b/bwamem.c @@ -59,7 +59,7 @@ mem_opt_t *mem_opt_init() o->pen_unpaired = 17; o->pen_clip5 = o->pen_clip3 = 5; - o->max_mem_intv = 0; + o->max_mem_intv = 10; o->min_seed_len = 19; o->split_width = 10; @@ -79,7 +79,6 @@ mem_opt_t *mem_opt_init() o->min_chain_weight = 0; o->max_chain_extend = 1<<30; o->mapQ_coef_len = 50; o->mapQ_coef_fac = log(o->mapQ_coef_len); - o->min_pa_ratio = 0.8; bwa_fill_scmat(o->a, o->b, o->mat); return o; } @@ -121,7 +120,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co // first pass: find all SMEMs while (x < len) { if (seq[x] < 4) { - x = bwt_smem1a(bwt, len, seq, x, start_width, split_len, opt->max_mem_intv, &a->mem1, a->tmpv); + x = bwt_smem1(bwt, len, seq, x, start_width, &a->mem1, a->tmpv); for (i = 0; i < a->mem1.n; ++i) { bwtintv_t *p = &a->mem1.a[i]; int slen = (uint32_t)p->info - (p->info>>32); // seed length @@ -131,7 +130,6 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co } else ++x; } // second pass: find MEMs inside a long SMEM - if (opt->max_mem_intv > 0) goto sort_intv; old_n = a->mem.n; for (k = 0; k < old_n; ++k) { bwtintv_t *p = &a->mem.a[k]; @@ -142,8 +140,24 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co if ((uint32_t)a->mem1.a[i].info - (a->mem1.a[i].info>>32) >= opt->min_seed_len) kv_push(bwtintv_t, a->mem, a->mem1.a[i]); } + // third pass: LAST-like + if (opt->max_mem_intv > 0) { + x = 0; + while (x < len) { + if (seq[x] < 4) { + if (1) { + bwtintv_t m; + x = bwt_seed_strategy1(bwt, len, seq, x, opt->max_mem_intv, &m); + if (m.x[2] > 0) kv_push(bwtintv_t, a->mem, m); + } else { // for now, we never come to this block which is slower + x = bwt_smem1a(bwt, len, seq, x, start_width, opt->max_mem_intv, &a->mem1, a->tmpv); + for (i = 0; i < a->mem1.n; ++i) + kv_push(bwtintv_t, a->mem, a->mem1.a[i]); + } + } else ++x; + } + } // sort -sort_intv: ks_introsort(mem_intv, a->mem.n, a->mem.a); } @@ -826,7 +840,7 @@ void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq if (p->rid >= 0) { // with coordinate kputs(bns->anns[p->rid].name, str); kputc('\t', str); // RNAME kputl(p->pos + 1, str); kputc('\t', str); // POS - kputw(p->score < opt->min_pa_ratio * p->alt_sc? 0 : p->mapq, str); kputc('\t', str); // MAPQ + kputw(p->mapq, str); kputc('\t', str); // MAPQ if (p->n_cigar) { // aligned for (i = 0; i < p->n_cigar; ++i) { int c = p->cigar[i]&0xf; @@ -916,8 +930,6 @@ void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq } if (p->alt_sc > 0) ksprintf(str, "\tpa:f:%.3f", (double)p->score / p->alt_sc); - if (p->score < opt->min_pa_ratio * p->alt_sc) - ksprintf(str, "\tom:i:%d", p->mapq); } if (p->XA) { kputsn("\tXA:Z:", 6, str); kputs(p->XA, str); } if (s->comment) { kputc('\t', str); kputs(s->comment, str); } diff --git a/bwamem.h b/bwamem.h index ae53601..1746ab5 100644 --- a/bwamem.h +++ b/bwamem.h @@ -48,7 +48,6 @@ typedef struct { float XA_drop_ratio; // when counting hits for the XA tag, ignore alignments with score < XA_drop_ratio * max_score; only effective for the XA tag float mask_level_redun; float mapQ_coef_len; - float min_pa_ratio; int mapQ_coef_fac; int max_ins; // when estimating insert size distribution, skip pairs with insert longer than this value int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end diff --git a/bwamem_extra.c b/bwamem_extra.c index 09faaa2..02e817a 100644 --- a/bwamem_extra.c +++ b/bwamem_extra.c @@ -65,7 +65,7 @@ const bwtintv_v *smem_next(smem_i *itr) while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases if (itr->start == itr->len) return 0; ori_start = itr->start; - itr->start = bwt_smem1a(itr->bwt, itr->len, itr->query, ori_start, itr->min_intv, itr->max_len, itr->max_intv, itr->matches, itr->tmpvec); // search for SMEM + itr->start = bwt_smem1a(itr->bwt, itr->len, itr->query, ori_start, itr->min_intv, itr->max_intv, itr->matches, itr->tmpvec); // search for SMEM return itr->matches; } diff --git a/bwt.c b/bwt.c index a719f53..a97b60c 100644 --- a/bwt.c +++ b/bwt.c @@ -285,8 +285,8 @@ static void bwt_reverse_intvs(bwtintv_v *p) } } } - -int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, int max_len, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]) +// NOTE: $max_intv is not currently used in BWA-MEM +int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]) { int i, j, c, ret; bwtintv_t ik, ok[4]; @@ -302,13 +302,15 @@ int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, ik.info = x + 1; for (i = x + 1, curr->n = 0; i < len; ++i) { // forward search - if (q[i] < 4) { // an A/C/G/T base + if (ik.x[2] < max_intv) { // an interval small enough + kv_push(bwtintv_t, *curr, ik); + break; + } else if (q[i] < 4) { // an A/C/G/T base c = 3 - q[i]; // complement of q[i] bwt_extend(bwt, &ik, ok, 0); if (ok[c].x[2] != ik.x[2]) { // change of the interval size kv_push(bwtintv_t, *curr, ik); if (ok[c].x[2] < min_intv) break; // the interval size is too small to be extended further - if (i - x > max_len && ik.x[2] < max_intv) break; } ik = ok[c]; ik.info = i + 1; } else { // an ambiguous base @@ -325,13 +327,8 @@ int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, c = i < 0? -1 : q[i] < 4? q[i] : -1; // c==-1 if i<0 or q[i] is an ambiguous base for (j = 0, curr->n = 0; j < prev->n; ++j) { bwtintv_t *p = &prev->a[j]; - int max_stop = 0; - bwt_extend(bwt, p, ok, 1); - if (ok[c].x[2] != p->x[2]) { // change of interval size - if (p->info - i - 1 > max_len && p->x[2] < max_intv) - max_stop = 1; - } - if (c < 0 || ok[c].x[2] < min_intv || max_stop) { // keep the hit if reaching the beginning or an ambiguous base or the intv is small enough + if (c >= 0 && ik.x[2] >= max_intv) bwt_extend(bwt, p, ok, 1); + if (c < 0 || ik.x[2] < max_intv || ok[c].x[2] < min_intv) { // keep the hit if reaching the beginning or an ambiguous base or the intv is small enough if (curr->n == 0) { // test curr->n>0 to make sure there are no longer matches if (mem->n == 0 || i + 1 < mem->a[mem->n-1].info>>32) { // skip contained matches ik = *p; ik.info |= (uint64_t)(i + 1)<<32; @@ -355,7 +352,30 @@ int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]) { - return bwt_smem1a(bwt, len, q, x, min_intv, INT_MAX, 0, mem, tmpvec); + return bwt_smem1a(bwt, len, q, x, min_intv, 0, mem, tmpvec); +} + +int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int max_intv, bwtintv_t *mem) +{ + int i, c; + bwtintv_t ik, ok[4]; + + memset(mem, 0, sizeof(bwtintv_t)); + if (q[x] > 3) return x + 1; + bwt_set_intv(bwt, q[x], ik); // the initial interval of a single base + for (i = x + 1; i < len; ++i) { // forward search + if (q[i] < 4) { // an A/C/G/T base + c = 3 - q[i]; // complement of q[i] + bwt_extend(bwt, &ik, ok, 0); + if (ok[c].x[2] < max_intv) { + *mem = ok[c]; + mem->info = (uint64_t)x<<32 | (i + 1); + return i + 1; + } + ik = ok[c]; + } else return i + 1; + } + return len; } /************************* diff --git a/bwt.h b/bwt.h index 7ab49bc..afb84d2 100644 --- a/bwt.h +++ b/bwt.h @@ -119,9 +119,9 @@ extern "C" { * Return the end of the longest exact match starting from _x_. */ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]); - int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, int max_len, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]); + int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]); - // SMEM iterator interface + int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int max_intv, bwtintv_t *mem); #ifdef __cplusplus } diff --git a/fastmap.c b/fastmap.c index e949da3..18f3b0e 100644 --- a/fastmap.c +++ b/fastmap.c @@ -55,7 +55,7 @@ int main_mem(int argc, char *argv[]) opt = mem_opt_init(); memset(&opt0, 0, sizeof(mem_opt_t)); - while ((c = getopt(argc, argv, "epaFMCSPHVYjk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:y:K:g:")) >= 0) { + while ((c = getopt(argc, argv, "epaFMCSPHVYjk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:y:K:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1; else if (c == 'x') mode = optarg; else if (c == 'w') opt->w = atoi(optarg), opt0.w = 1; @@ -85,7 +85,6 @@ int main_mem(int argc, char *argv[]) else if (c == 'N') opt->max_chain_extend = atoi(optarg), opt0.max_chain_extend = 1; else if (c == 'W') opt->min_chain_weight = atoi(optarg), opt0.min_chain_weight = 1; else if (c == 'y') opt->max_mem_intv = atol(optarg), opt0.max_mem_intv = 1; - else if (c == 'g') opt->min_pa_ratio = atof(optarg), opt0.min_pa_ratio = 1; else if (c == 'C') copy_comment = 1; else if (c == 'K') fixed_chunk_size = atoi(optarg); else if (c == 'h') { @@ -145,7 +144,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w); fprintf(stderr, " -d INT off-diagonal X-dropoff [%d]\n", opt->zdrop); fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); - fprintf(stderr, " -y INT find MEMs longer than {-k} * {-r} with size less than INT (EXPERIMENTAL) [%ld]\n", (long)opt->max_mem_intv); + fprintf(stderr, " -y INT seed occurrence for the 3rd round seeding [%ld]\n", (long)opt->max_mem_intv); // fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); fprintf(stderr, " -D FLOAT drop chains shorter than FLOAT fraction of the longest overlapping chain [%.2f]\n", opt->drop_ratio); @@ -172,7 +171,6 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -j ignore ALT contigs\n"); fprintf(stderr, "\n"); fprintf(stderr, " -v INT verbose level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose); - fprintf(stderr, " -g FLOAT set mapQ to zero if the ratio of the primary-to-alt scores below FLOAT [%.3f]\n", opt->min_pa_ratio); fprintf(stderr, " -T INT minimum score to output [%d]\n", opt->T); fprintf(stderr, " -h INT[,INT] if there are 80%% of the max score, output all in XA [%d,%d]\n", opt->max_XA_hits, opt->max_XA_hits_alt); fprintf(stderr, " -a output all alignments for SE or unpaired PE\n"); diff --git a/main.c b/main.c index 1a18559..12b0930 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r915-dirty" +#define PACKAGE_VERSION "0.7.10-r933-dirty" #endif int bwa_fa2pac(int argc, char *argv[]);