From 574098d034e2f596b99efbf3e0b1cb587bb9fa8c Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 21 Oct 2014 00:57:06 -0400 Subject: [PATCH 1/6] updated directory name --- README-alt.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README-alt.md b/README-alt.md index f4d3a45..d46b38f 100644 --- a/README-alt.md +++ b/README-alt.md @@ -14,13 +14,13 @@ wget ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/H gzip -d GCA_000001405.15_GRCh38_full_analysis_set.fna.gz mv GCA_000001405.15_GRCh38_full_analysis_set.fna hs38a.fa bwa index hs38a.fa -cp bwa-hs38-res/hs38d4.fa.alt hs38a.fa.alt +cp bwa-hs38-bundle/hs38d4.fa.alt hs38a.fa.alt ``` Perform mapping: ```sh bwa mem hs38a.fa read1.fq read2.fq \ - | bwa-hs38-res/k8-linux bwa-postalt.js hs38a.fa.alt \ + | bwa-hs38-bundle/k8-linux bwa-postalt.js hs38a.fa.alt \ | samtools view -bS - > aln.unsrt.bam ``` For short reads, the postprocessing script `bwa-postalt.js` runs at about the @@ -30,14 +30,14 @@ same speed as BAM compression. Construct the index: ```sh -cat hs38a.fa bwa-hs38-res/hs38d4-extra.fa > hs38d4.fa +cat hs38a.fa bwa-hs38-bundle/hs38d4-extra.fa > hs38d4.fa bwa index hs38d4.fa -cp bwa-hs38-res/hs38d4.fa.alt . +cp bwa-hs38-bundle/hs38d4.fa.alt . ``` Perform mapping: ```sh bwa mem hs38d4.fa read1.fq read2.fq \ - | bwa-hs38-res/k8-linux bwa-postalt.js -p postinfo hs38d4.fa.alt \ + | bwa-hs38-bundle/k8-linux bwa-postalt.js -p postinfo hs38d4.fa.alt \ | samtools view -bS - > aln.unsrt.bam ``` This command line generates `postinfo.ctw` which loosely evaluates the presence From 5e00d083462b0eca2be9c4e6eea0fe2d11ffa494 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 21 Oct 2014 09:26:19 -0400 Subject: [PATCH 2/6] r939: fixed a memory leak (issue #35) --- bwamem_pair.c | 1 + main.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index 3882784..5d4c5c2 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -70,6 +70,7 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v * if (q->n < MIN_DIR_CNT) { fprintf(stderr, "[M::%s] skip orientation %c%c as there are not enough pairs\n", __func__, "FR"[d>>1&1], "FR"[d&1]); r->failed = 1; + free(q->a); continue; } else fprintf(stderr, "[M::%s] analyzing insert size distribution for orientation %c%c...\n", __func__, "FR"[d>>1&1], "FR"[d&1]); ks_introsort_64(q->n, q->a); diff --git a/main.c b/main.c index 12b0930..07a9d00 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r933-dirty" +#define PACKAGE_VERSION "0.7.10-r939-dirty" #endif int bwa_fa2pac(int argc, char *argv[]); From 282130a64e990de68b246c75442507b42fc995d1 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 21 Oct 2014 12:57:49 -0400 Subject: [PATCH 3/6] r940: fixed a bug - missing primary hit --- README-alt.md | 6 ++++-- bwamem.c | 9 +++++---- main.c | 2 +- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/README-alt.md b/README-alt.md index d46b38f..e57313f 100644 --- a/README-alt.md +++ b/README-alt.md @@ -88,7 +88,9 @@ alignments and assigns mapQ following these two rules: In theory, non-ALT alignments from step 1 should be identical to alignments against a reference genome with ALT contigs. In practice, the two types of -alignments may differ in rare cases due to seeding heuristics. +alignments may differ in rare cases due to seeding heuristics. When an ALT hit +is significantly better than non-ALT hits, BWA-MEM may miss seeds on the +non-ALT hits. This happens more often for contig mapping. If we don't care about ALT hits, we may skip postprocessing (step 2). Nonetheless, postprocessing is recommended as it improves mapQ and gives more @@ -134,7 +136,7 @@ not to high resolution for now. ### Evaluating ALT Mapping -(To come later...) +(Coming soon...) ## Problems and Future Development diff --git a/bwamem.c b/bwamem.c index 3fd008f..4f585c6 100644 --- a/bwamem.c +++ b/bwamem.c @@ -980,14 +980,14 @@ void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, mem_alnreg_v *a, int l_query, const char *query); kstring_t str; kvec_t(mem_aln_t) aa; - int k; + int k, l; char **XA = 0; if (!(opt->flag & MEM_F_ALL)) XA = mem_gen_alt(opt, bns, pac, a, s->l_seq, s->seq); kv_init(aa); str.l = str.m = 0; str.s = 0; - for (k = 0; k < a->n; ++k) { + for (k = l = 0; k < a->n; ++k) { mem_alnreg_t *p = &a->a[k]; mem_aln_t *q; if (p->score < opt->T) continue; @@ -999,9 +999,10 @@ void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, q->XA = XA? XA[k] : 0; q->flag |= extra_flag; // flag secondary if (p->secondary >= 0) q->sub = -1; // don't output sub-optimal score - if (k && p->secondary < 0) // if supplementary + if (l && p->secondary < 0) // if supplementary q->flag |= (opt->flag&MEM_F_NO_MULTI)? 0x10000 : 0x800; - if (k && !p->is_alt && q->mapq > aa.a[0].mapq) q->mapq = aa.a[0].mapq; + if (l && !p->is_alt && q->mapq > aa.a[0].mapq) q->mapq = aa.a[0].mapq; + ++l; } if (aa.n == 0) { // no alignments good enough; then write an unaligned record mem_aln_t t; diff --git a/main.c b/main.c index 07a9d00..65b9817 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r939-dirty" +#define PACKAGE_VERSION "0.7.10-r940-dirty" #endif int bwa_fa2pac(int argc, char *argv[]); From 60b728487a2a19d8ca728e317495cbbd958b4aeb Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 21 Oct 2014 13:15:42 -0400 Subject: [PATCH 4/6] r941: set a min length for 3rd-round seeding --- bwamem.c | 4 ++-- bwt.c | 4 ++-- bwt.h | 2 +- main.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/bwamem.c b/bwamem.c index 4f585c6..6f36128 100644 --- a/bwamem.c +++ b/bwamem.c @@ -147,7 +147,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co if (seq[x] < 4) { if (1) { bwtintv_t m; - x = bwt_seed_strategy1(bwt, len, seq, x, opt->max_mem_intv, &m); + x = bwt_seed_strategy1(bwt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m); if (m.x[2] > 0) kv_push(bwtintv_t, a->mem, m); } else { // for now, we never come to this block which is slower x = bwt_smem1a(bwt, len, seq, x, start_width, opt->max_mem_intv, &a->mem1, a->tmpv); @@ -274,7 +274,7 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bn bwtintv_t *p = &aux->mem.a[i]; int step, count, slen = (uint32_t)p->info - (p->info>>32); // seed length int64_t k; - if (slen < opt->min_seed_len) continue; // ignore if too short or too repetitive + // if (slen < opt->min_seed_len) continue; // ignore if too short or too repetitive step = p->x[2] > opt->max_occ? p->x[2] / opt->max_occ : 1; for (k = count = 0; k < p->x[2] && count < opt->max_occ; k += step, ++count) { mem_chain_t tmp, *lower, *upper; diff --git a/bwt.c b/bwt.c index a97b60c..9083654 100644 --- a/bwt.c +++ b/bwt.c @@ -355,7 +355,7 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, return bwt_smem1a(bwt, len, q, x, min_intv, 0, mem, tmpvec); } -int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int max_intv, bwtintv_t *mem) +int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem) { int i, c; bwtintv_t ik, ok[4]; @@ -367,7 +367,7 @@ int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int m if (q[i] < 4) { // an A/C/G/T base c = 3 - q[i]; // complement of q[i] bwt_extend(bwt, &ik, ok, 0); - if (ok[c].x[2] < max_intv) { + if (ok[c].x[2] < max_intv && i - x >= min_len) { *mem = ok[c]; mem->info = (uint64_t)x<<32 | (i + 1); return i + 1; diff --git a/bwt.h b/bwt.h index afb84d2..c71d6b5 100644 --- a/bwt.h +++ b/bwt.h @@ -121,7 +121,7 @@ extern "C" { int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]); int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]); - int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int max_intv, bwtintv_t *mem); + int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem); #ifdef __cplusplus } diff --git a/main.c b/main.c index 65b9817..59c659e 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r940-dirty" +#define PACKAGE_VERSION "0.7.10-r941-dirty" #endif int bwa_fa2pac(int argc, char *argv[]); From 4177d6c2c744bb6a688b1d301833612c53e3f06b Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 22 Oct 2014 10:24:16 -0400 Subject: [PATCH 5/6] r942: ignore ALT hits when counting n_sub for ... non-ALT hits. Counting leads to underestimated mapQ. --- bwamem.c | 3 ++- main.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/bwamem.c b/bwamem.c index 6f36128..a7e7150 100644 --- a/bwamem.c +++ b/bwamem.c @@ -514,7 +514,8 @@ static void mem_mark_primary_se_core(const mem_opt_t *opt, int n, mem_alnreg_t * int min_l = a[i].qe - a[i].qb < a[j].qe - a[j].qb? a[i].qe - a[i].qb : a[j].qe - a[j].qb; if (e_min - b_max >= min_l * opt->mask_level) { // significant overlap if (a[j].sub == 0) a[j].sub = a[i].score; - if (a[j].score - a[i].score <= tmp) ++a[j].sub_n; + if (a[j].score - a[i].score <= tmp && (a[j].is_alt || !a[i].is_alt)) + ++a[j].sub_n; break; } } diff --git a/main.c b/main.c index 59c659e..a7ad841 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r941-dirty" +#define PACKAGE_VERSION "0.7.10-r942-dirty" #endif int bwa_fa2pac(int argc, char *argv[]); From 3d129be642d5d99b03fecc115e506f9c4bab15c4 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 22 Oct 2014 12:42:58 -0400 Subject: [PATCH 6/6] r943: change the default -y to 20, but ... for GRCh38 ALT, this is not enough. We need -y at least 40 to get high accuracy because a locus at chr19 has 35 copies. --- bwamem.c | 2 +- main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bwamem.c b/bwamem.c index a7e7150..b32c166 100644 --- a/bwamem.c +++ b/bwamem.c @@ -59,7 +59,7 @@ mem_opt_t *mem_opt_init() o->pen_unpaired = 17; o->pen_clip5 = o->pen_clip3 = 5; - o->max_mem_intv = 10; + o->max_mem_intv = 20; o->min_seed_len = 19; o->split_width = 10; diff --git a/main.c b/main.c index a7ad841..7591875 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r942-dirty" +#define PACKAGE_VERSION "0.7.10-r943-dirty" #endif int bwa_fa2pac(int argc, char *argv[]);