diff --git a/README-alt.md b/README-alt.md index f4d3a45..e57313f 100644 --- a/README-alt.md +++ b/README-alt.md @@ -14,13 +14,13 @@ wget ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/H gzip -d GCA_000001405.15_GRCh38_full_analysis_set.fna.gz mv GCA_000001405.15_GRCh38_full_analysis_set.fna hs38a.fa bwa index hs38a.fa -cp bwa-hs38-res/hs38d4.fa.alt hs38a.fa.alt +cp bwa-hs38-bundle/hs38d4.fa.alt hs38a.fa.alt ``` Perform mapping: ```sh bwa mem hs38a.fa read1.fq read2.fq \ - | bwa-hs38-res/k8-linux bwa-postalt.js hs38a.fa.alt \ + | bwa-hs38-bundle/k8-linux bwa-postalt.js hs38a.fa.alt \ | samtools view -bS - > aln.unsrt.bam ``` For short reads, the postprocessing script `bwa-postalt.js` runs at about the @@ -30,14 +30,14 @@ same speed as BAM compression. Construct the index: ```sh -cat hs38a.fa bwa-hs38-res/hs38d4-extra.fa > hs38d4.fa +cat hs38a.fa bwa-hs38-bundle/hs38d4-extra.fa > hs38d4.fa bwa index hs38d4.fa -cp bwa-hs38-res/hs38d4.fa.alt . +cp bwa-hs38-bundle/hs38d4.fa.alt . ``` Perform mapping: ```sh bwa mem hs38d4.fa read1.fq read2.fq \ - | bwa-hs38-res/k8-linux bwa-postalt.js -p postinfo hs38d4.fa.alt \ + | bwa-hs38-bundle/k8-linux bwa-postalt.js -p postinfo hs38d4.fa.alt \ | samtools view -bS - > aln.unsrt.bam ``` This command line generates `postinfo.ctw` which loosely evaluates the presence @@ -88,7 +88,9 @@ alignments and assigns mapQ following these two rules: In theory, non-ALT alignments from step 1 should be identical to alignments against a reference genome with ALT contigs. In practice, the two types of -alignments may differ in rare cases due to seeding heuristics. +alignments may differ in rare cases due to seeding heuristics. When an ALT hit +is significantly better than non-ALT hits, BWA-MEM may miss seeds on the +non-ALT hits. This happens more often for contig mapping. If we don't care about ALT hits, we may skip postprocessing (step 2). Nonetheless, postprocessing is recommended as it improves mapQ and gives more @@ -134,7 +136,7 @@ not to high resolution for now. ### Evaluating ALT Mapping -(To come later...) +(Coming soon...) ## Problems and Future Development diff --git a/bwamem.c b/bwamem.c index 3fd008f..b32c166 100644 --- a/bwamem.c +++ b/bwamem.c @@ -59,7 +59,7 @@ mem_opt_t *mem_opt_init() o->pen_unpaired = 17; o->pen_clip5 = o->pen_clip3 = 5; - o->max_mem_intv = 10; + o->max_mem_intv = 20; o->min_seed_len = 19; o->split_width = 10; @@ -147,7 +147,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co if (seq[x] < 4) { if (1) { bwtintv_t m; - x = bwt_seed_strategy1(bwt, len, seq, x, opt->max_mem_intv, &m); + x = bwt_seed_strategy1(bwt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m); if (m.x[2] > 0) kv_push(bwtintv_t, a->mem, m); } else { // for now, we never come to this block which is slower x = bwt_smem1a(bwt, len, seq, x, start_width, opt->max_mem_intv, &a->mem1, a->tmpv); @@ -274,7 +274,7 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bn bwtintv_t *p = &aux->mem.a[i]; int step, count, slen = (uint32_t)p->info - (p->info>>32); // seed length int64_t k; - if (slen < opt->min_seed_len) continue; // ignore if too short or too repetitive + // if (slen < opt->min_seed_len) continue; // ignore if too short or too repetitive step = p->x[2] > opt->max_occ? p->x[2] / opt->max_occ : 1; for (k = count = 0; k < p->x[2] && count < opt->max_occ; k += step, ++count) { mem_chain_t tmp, *lower, *upper; @@ -514,7 +514,8 @@ static void mem_mark_primary_se_core(const mem_opt_t *opt, int n, mem_alnreg_t * int min_l = a[i].qe - a[i].qb < a[j].qe - a[j].qb? a[i].qe - a[i].qb : a[j].qe - a[j].qb; if (e_min - b_max >= min_l * opt->mask_level) { // significant overlap if (a[j].sub == 0) a[j].sub = a[i].score; - if (a[j].score - a[i].score <= tmp) ++a[j].sub_n; + if (a[j].score - a[i].score <= tmp && (a[j].is_alt || !a[i].is_alt)) + ++a[j].sub_n; break; } } @@ -980,14 +981,14 @@ void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, mem_alnreg_v *a, int l_query, const char *query); kstring_t str; kvec_t(mem_aln_t) aa; - int k; + int k, l; char **XA = 0; if (!(opt->flag & MEM_F_ALL)) XA = mem_gen_alt(opt, bns, pac, a, s->l_seq, s->seq); kv_init(aa); str.l = str.m = 0; str.s = 0; - for (k = 0; k < a->n; ++k) { + for (k = l = 0; k < a->n; ++k) { mem_alnreg_t *p = &a->a[k]; mem_aln_t *q; if (p->score < opt->T) continue; @@ -999,9 +1000,10 @@ void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, q->XA = XA? XA[k] : 0; q->flag |= extra_flag; // flag secondary if (p->secondary >= 0) q->sub = -1; // don't output sub-optimal score - if (k && p->secondary < 0) // if supplementary + if (l && p->secondary < 0) // if supplementary q->flag |= (opt->flag&MEM_F_NO_MULTI)? 0x10000 : 0x800; - if (k && !p->is_alt && q->mapq > aa.a[0].mapq) q->mapq = aa.a[0].mapq; + if (l && !p->is_alt && q->mapq > aa.a[0].mapq) q->mapq = aa.a[0].mapq; + ++l; } if (aa.n == 0) { // no alignments good enough; then write an unaligned record mem_aln_t t; diff --git a/bwamem_pair.c b/bwamem_pair.c index 3882784..5d4c5c2 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -70,6 +70,7 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v * if (q->n < MIN_DIR_CNT) { fprintf(stderr, "[M::%s] skip orientation %c%c as there are not enough pairs\n", __func__, "FR"[d>>1&1], "FR"[d&1]); r->failed = 1; + free(q->a); continue; } else fprintf(stderr, "[M::%s] analyzing insert size distribution for orientation %c%c...\n", __func__, "FR"[d>>1&1], "FR"[d&1]); ks_introsort_64(q->n, q->a); diff --git a/bwt.c b/bwt.c index a97b60c..9083654 100644 --- a/bwt.c +++ b/bwt.c @@ -355,7 +355,7 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, return bwt_smem1a(bwt, len, q, x, min_intv, 0, mem, tmpvec); } -int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int max_intv, bwtintv_t *mem) +int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem) { int i, c; bwtintv_t ik, ok[4]; @@ -367,7 +367,7 @@ int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int m if (q[i] < 4) { // an A/C/G/T base c = 3 - q[i]; // complement of q[i] bwt_extend(bwt, &ik, ok, 0); - if (ok[c].x[2] < max_intv) { + if (ok[c].x[2] < max_intv && i - x >= min_len) { *mem = ok[c]; mem->info = (uint64_t)x<<32 | (i + 1); return i + 1; diff --git a/bwt.h b/bwt.h index afb84d2..c71d6b5 100644 --- a/bwt.h +++ b/bwt.h @@ -121,7 +121,7 @@ extern "C" { int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]); int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]); - int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int max_intv, bwtintv_t *mem); + int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem); #ifdef __cplusplus } diff --git a/main.c b/main.c index 12b0930..7591875 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r933-dirty" +#define PACKAGE_VERSION "0.7.10-r943-dirty" #endif int bwa_fa2pac(int argc, char *argv[]);