Merge branch 'dev'

This commit is contained in:
Heng Li 2014-10-22 16:17:36 -04:00
commit af606fc31a
6 changed files with 24 additions and 19 deletions

View File

@ -14,13 +14,13 @@ wget ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/H
gzip -d GCA_000001405.15_GRCh38_full_analysis_set.fna.gz gzip -d GCA_000001405.15_GRCh38_full_analysis_set.fna.gz
mv GCA_000001405.15_GRCh38_full_analysis_set.fna hs38a.fa mv GCA_000001405.15_GRCh38_full_analysis_set.fna hs38a.fa
bwa index hs38a.fa bwa index hs38a.fa
cp bwa-hs38-res/hs38d4.fa.alt hs38a.fa.alt cp bwa-hs38-bundle/hs38d4.fa.alt hs38a.fa.alt
``` ```
Perform mapping: Perform mapping:
```sh ```sh
bwa mem hs38a.fa read1.fq read2.fq \ bwa mem hs38a.fa read1.fq read2.fq \
| bwa-hs38-res/k8-linux bwa-postalt.js hs38a.fa.alt \ | bwa-hs38-bundle/k8-linux bwa-postalt.js hs38a.fa.alt \
| samtools view -bS - > aln.unsrt.bam | samtools view -bS - > aln.unsrt.bam
``` ```
For short reads, the postprocessing script `bwa-postalt.js` runs at about the For short reads, the postprocessing script `bwa-postalt.js` runs at about the
@ -30,14 +30,14 @@ same speed as BAM compression.
Construct the index: Construct the index:
```sh ```sh
cat hs38a.fa bwa-hs38-res/hs38d4-extra.fa > hs38d4.fa cat hs38a.fa bwa-hs38-bundle/hs38d4-extra.fa > hs38d4.fa
bwa index hs38d4.fa bwa index hs38d4.fa
cp bwa-hs38-res/hs38d4.fa.alt . cp bwa-hs38-bundle/hs38d4.fa.alt .
``` ```
Perform mapping: Perform mapping:
```sh ```sh
bwa mem hs38d4.fa read1.fq read2.fq \ bwa mem hs38d4.fa read1.fq read2.fq \
| bwa-hs38-res/k8-linux bwa-postalt.js -p postinfo hs38d4.fa.alt \ | bwa-hs38-bundle/k8-linux bwa-postalt.js -p postinfo hs38d4.fa.alt \
| samtools view -bS - > aln.unsrt.bam | samtools view -bS - > aln.unsrt.bam
``` ```
This command line generates `postinfo.ctw` which loosely evaluates the presence This command line generates `postinfo.ctw` which loosely evaluates the presence
@ -88,7 +88,9 @@ alignments and assigns mapQ following these two rules:
In theory, non-ALT alignments from step 1 should be identical to alignments In theory, non-ALT alignments from step 1 should be identical to alignments
against a reference genome with ALT contigs. In practice, the two types of against a reference genome with ALT contigs. In practice, the two types of
alignments may differ in rare cases due to seeding heuristics. alignments may differ in rare cases due to seeding heuristics. When an ALT hit
is significantly better than non-ALT hits, BWA-MEM may miss seeds on the
non-ALT hits. This happens more often for contig mapping.
If we don't care about ALT hits, we may skip postprocessing (step 2). If we don't care about ALT hits, we may skip postprocessing (step 2).
Nonetheless, postprocessing is recommended as it improves mapQ and gives more Nonetheless, postprocessing is recommended as it improves mapQ and gives more
@ -134,7 +136,7 @@ not to high resolution for now.
### Evaluating ALT Mapping ### Evaluating ALT Mapping
(To come later...) (Coming soon...)
## Problems and Future Development ## Problems and Future Development

View File

@ -59,7 +59,7 @@ mem_opt_t *mem_opt_init()
o->pen_unpaired = 17; o->pen_unpaired = 17;
o->pen_clip5 = o->pen_clip3 = 5; o->pen_clip5 = o->pen_clip3 = 5;
o->max_mem_intv = 10; o->max_mem_intv = 20;
o->min_seed_len = 19; o->min_seed_len = 19;
o->split_width = 10; o->split_width = 10;
@ -147,7 +147,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co
if (seq[x] < 4) { if (seq[x] < 4) {
if (1) { if (1) {
bwtintv_t m; bwtintv_t m;
x = bwt_seed_strategy1(bwt, len, seq, x, opt->max_mem_intv, &m); x = bwt_seed_strategy1(bwt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m);
if (m.x[2] > 0) kv_push(bwtintv_t, a->mem, m); if (m.x[2] > 0) kv_push(bwtintv_t, a->mem, m);
} else { // for now, we never come to this block which is slower } else { // for now, we never come to this block which is slower
x = bwt_smem1a(bwt, len, seq, x, start_width, opt->max_mem_intv, &a->mem1, a->tmpv); x = bwt_smem1a(bwt, len, seq, x, start_width, opt->max_mem_intv, &a->mem1, a->tmpv);
@ -274,7 +274,7 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bn
bwtintv_t *p = &aux->mem.a[i]; bwtintv_t *p = &aux->mem.a[i];
int step, count, slen = (uint32_t)p->info - (p->info>>32); // seed length int step, count, slen = (uint32_t)p->info - (p->info>>32); // seed length
int64_t k; int64_t k;
if (slen < opt->min_seed_len) continue; // ignore if too short or too repetitive // if (slen < opt->min_seed_len) continue; // ignore if too short or too repetitive
step = p->x[2] > opt->max_occ? p->x[2] / opt->max_occ : 1; step = p->x[2] > opt->max_occ? p->x[2] / opt->max_occ : 1;
for (k = count = 0; k < p->x[2] && count < opt->max_occ; k += step, ++count) { for (k = count = 0; k < p->x[2] && count < opt->max_occ; k += step, ++count) {
mem_chain_t tmp, *lower, *upper; mem_chain_t tmp, *lower, *upper;
@ -514,7 +514,8 @@ static void mem_mark_primary_se_core(const mem_opt_t *opt, int n, mem_alnreg_t *
int min_l = a[i].qe - a[i].qb < a[j].qe - a[j].qb? a[i].qe - a[i].qb : a[j].qe - a[j].qb; int min_l = a[i].qe - a[i].qb < a[j].qe - a[j].qb? a[i].qe - a[i].qb : a[j].qe - a[j].qb;
if (e_min - b_max >= min_l * opt->mask_level) { // significant overlap if (e_min - b_max >= min_l * opt->mask_level) { // significant overlap
if (a[j].sub == 0) a[j].sub = a[i].score; if (a[j].sub == 0) a[j].sub = a[i].score;
if (a[j].score - a[i].score <= tmp) ++a[j].sub_n; if (a[j].score - a[i].score <= tmp && (a[j].is_alt || !a[i].is_alt))
++a[j].sub_n;
break; break;
} }
} }
@ -980,14 +981,14 @@ void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac,
extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, mem_alnreg_v *a, int l_query, const char *query); extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, mem_alnreg_v *a, int l_query, const char *query);
kstring_t str; kstring_t str;
kvec_t(mem_aln_t) aa; kvec_t(mem_aln_t) aa;
int k; int k, l;
char **XA = 0; char **XA = 0;
if (!(opt->flag & MEM_F_ALL)) if (!(opt->flag & MEM_F_ALL))
XA = mem_gen_alt(opt, bns, pac, a, s->l_seq, s->seq); XA = mem_gen_alt(opt, bns, pac, a, s->l_seq, s->seq);
kv_init(aa); kv_init(aa);
str.l = str.m = 0; str.s = 0; str.l = str.m = 0; str.s = 0;
for (k = 0; k < a->n; ++k) { for (k = l = 0; k < a->n; ++k) {
mem_alnreg_t *p = &a->a[k]; mem_alnreg_t *p = &a->a[k];
mem_aln_t *q; mem_aln_t *q;
if (p->score < opt->T) continue; if (p->score < opt->T) continue;
@ -999,9 +1000,10 @@ void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac,
q->XA = XA? XA[k] : 0; q->XA = XA? XA[k] : 0;
q->flag |= extra_flag; // flag secondary q->flag |= extra_flag; // flag secondary
if (p->secondary >= 0) q->sub = -1; // don't output sub-optimal score if (p->secondary >= 0) q->sub = -1; // don't output sub-optimal score
if (k && p->secondary < 0) // if supplementary if (l && p->secondary < 0) // if supplementary
q->flag |= (opt->flag&MEM_F_NO_MULTI)? 0x10000 : 0x800; q->flag |= (opt->flag&MEM_F_NO_MULTI)? 0x10000 : 0x800;
if (k && !p->is_alt && q->mapq > aa.a[0].mapq) q->mapq = aa.a[0].mapq; if (l && !p->is_alt && q->mapq > aa.a[0].mapq) q->mapq = aa.a[0].mapq;
++l;
} }
if (aa.n == 0) { // no alignments good enough; then write an unaligned record if (aa.n == 0) { // no alignments good enough; then write an unaligned record
mem_aln_t t; mem_aln_t t;

View File

@ -70,6 +70,7 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *
if (q->n < MIN_DIR_CNT) { if (q->n < MIN_DIR_CNT) {
fprintf(stderr, "[M::%s] skip orientation %c%c as there are not enough pairs\n", __func__, "FR"[d>>1&1], "FR"[d&1]); fprintf(stderr, "[M::%s] skip orientation %c%c as there are not enough pairs\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
r->failed = 1; r->failed = 1;
free(q->a);
continue; continue;
} else fprintf(stderr, "[M::%s] analyzing insert size distribution for orientation %c%c...\n", __func__, "FR"[d>>1&1], "FR"[d&1]); } else fprintf(stderr, "[M::%s] analyzing insert size distribution for orientation %c%c...\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
ks_introsort_64(q->n, q->a); ks_introsort_64(q->n, q->a);

4
bwt.c
View File

@ -355,7 +355,7 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv,
return bwt_smem1a(bwt, len, q, x, min_intv, 0, mem, tmpvec); return bwt_smem1a(bwt, len, q, x, min_intv, 0, mem, tmpvec);
} }
int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int max_intv, bwtintv_t *mem) int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem)
{ {
int i, c; int i, c;
bwtintv_t ik, ok[4]; bwtintv_t ik, ok[4];
@ -367,7 +367,7 @@ int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int m
if (q[i] < 4) { // an A/C/G/T base if (q[i] < 4) { // an A/C/G/T base
c = 3 - q[i]; // complement of q[i] c = 3 - q[i]; // complement of q[i]
bwt_extend(bwt, &ik, ok, 0); bwt_extend(bwt, &ik, ok, 0);
if (ok[c].x[2] < max_intv) { if (ok[c].x[2] < max_intv && i - x >= min_len) {
*mem = ok[c]; *mem = ok[c];
mem->info = (uint64_t)x<<32 | (i + 1); mem->info = (uint64_t)x<<32 | (i + 1);
return i + 1; return i + 1;

2
bwt.h
View File

@ -121,7 +121,7 @@ extern "C" {
int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]); int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]); int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int max_intv, bwtintv_t *mem); int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem);
#ifdef __cplusplus #ifdef __cplusplus
} }

2
main.c
View File

@ -4,7 +4,7 @@
#include "utils.h" #include "utils.h"
#ifndef PACKAGE_VERSION #ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "0.7.10-r933-dirty" #define PACKAGE_VERSION "0.7.10-r943-dirty"
#endif #endif
int bwa_fa2pac(int argc, char *argv[]); int bwa_fa2pac(int argc, char *argv[]);