Merge branch 'dev'

This commit is contained in:
Heng Li 2014-10-22 16:17:36 -04:00
commit af606fc31a
6 changed files with 24 additions and 19 deletions

View File

@ -14,13 +14,13 @@ wget ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/H
gzip -d GCA_000001405.15_GRCh38_full_analysis_set.fna.gz
mv GCA_000001405.15_GRCh38_full_analysis_set.fna hs38a.fa
bwa index hs38a.fa
cp bwa-hs38-res/hs38d4.fa.alt hs38a.fa.alt
cp bwa-hs38-bundle/hs38d4.fa.alt hs38a.fa.alt
```
Perform mapping:
```sh
bwa mem hs38a.fa read1.fq read2.fq \
| bwa-hs38-res/k8-linux bwa-postalt.js hs38a.fa.alt \
| bwa-hs38-bundle/k8-linux bwa-postalt.js hs38a.fa.alt \
| samtools view -bS - > aln.unsrt.bam
```
For short reads, the postprocessing script `bwa-postalt.js` runs at about the
@ -30,14 +30,14 @@ same speed as BAM compression.
Construct the index:
```sh
cat hs38a.fa bwa-hs38-res/hs38d4-extra.fa > hs38d4.fa
cat hs38a.fa bwa-hs38-bundle/hs38d4-extra.fa > hs38d4.fa
bwa index hs38d4.fa
cp bwa-hs38-res/hs38d4.fa.alt .
cp bwa-hs38-bundle/hs38d4.fa.alt .
```
Perform mapping:
```sh
bwa mem hs38d4.fa read1.fq read2.fq \
| bwa-hs38-res/k8-linux bwa-postalt.js -p postinfo hs38d4.fa.alt \
| bwa-hs38-bundle/k8-linux bwa-postalt.js -p postinfo hs38d4.fa.alt \
| samtools view -bS - > aln.unsrt.bam
```
This command line generates `postinfo.ctw` which loosely evaluates the presence
@ -88,7 +88,9 @@ alignments and assigns mapQ following these two rules:
In theory, non-ALT alignments from step 1 should be identical to alignments
against a reference genome with ALT contigs. In practice, the two types of
alignments may differ in rare cases due to seeding heuristics.
alignments may differ in rare cases due to seeding heuristics. When an ALT hit
is significantly better than non-ALT hits, BWA-MEM may miss seeds on the
non-ALT hits. This happens more often for contig mapping.
If we don't care about ALT hits, we may skip postprocessing (step 2).
Nonetheless, postprocessing is recommended as it improves mapQ and gives more
@ -134,7 +136,7 @@ not to high resolution for now.
### Evaluating ALT Mapping
(To come later...)
(Coming soon...)
## Problems and Future Development

View File

@ -59,7 +59,7 @@ mem_opt_t *mem_opt_init()
o->pen_unpaired = 17;
o->pen_clip5 = o->pen_clip3 = 5;
o->max_mem_intv = 10;
o->max_mem_intv = 20;
o->min_seed_len = 19;
o->split_width = 10;
@ -147,7 +147,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co
if (seq[x] < 4) {
if (1) {
bwtintv_t m;
x = bwt_seed_strategy1(bwt, len, seq, x, opt->max_mem_intv, &m);
x = bwt_seed_strategy1(bwt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m);
if (m.x[2] > 0) kv_push(bwtintv_t, a->mem, m);
} else { // for now, we never come to this block which is slower
x = bwt_smem1a(bwt, len, seq, x, start_width, opt->max_mem_intv, &a->mem1, a->tmpv);
@ -274,7 +274,7 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bn
bwtintv_t *p = &aux->mem.a[i];
int step, count, slen = (uint32_t)p->info - (p->info>>32); // seed length
int64_t k;
if (slen < opt->min_seed_len) continue; // ignore if too short or too repetitive
// if (slen < opt->min_seed_len) continue; // ignore if too short or too repetitive
step = p->x[2] > opt->max_occ? p->x[2] / opt->max_occ : 1;
for (k = count = 0; k < p->x[2] && count < opt->max_occ; k += step, ++count) {
mem_chain_t tmp, *lower, *upper;
@ -514,7 +514,8 @@ static void mem_mark_primary_se_core(const mem_opt_t *opt, int n, mem_alnreg_t *
int min_l = a[i].qe - a[i].qb < a[j].qe - a[j].qb? a[i].qe - a[i].qb : a[j].qe - a[j].qb;
if (e_min - b_max >= min_l * opt->mask_level) { // significant overlap
if (a[j].sub == 0) a[j].sub = a[i].score;
if (a[j].score - a[i].score <= tmp) ++a[j].sub_n;
if (a[j].score - a[i].score <= tmp && (a[j].is_alt || !a[i].is_alt))
++a[j].sub_n;
break;
}
}
@ -980,14 +981,14 @@ void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac,
extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, mem_alnreg_v *a, int l_query, const char *query);
kstring_t str;
kvec_t(mem_aln_t) aa;
int k;
int k, l;
char **XA = 0;
if (!(opt->flag & MEM_F_ALL))
XA = mem_gen_alt(opt, bns, pac, a, s->l_seq, s->seq);
kv_init(aa);
str.l = str.m = 0; str.s = 0;
for (k = 0; k < a->n; ++k) {
for (k = l = 0; k < a->n; ++k) {
mem_alnreg_t *p = &a->a[k];
mem_aln_t *q;
if (p->score < opt->T) continue;
@ -999,9 +1000,10 @@ void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac,
q->XA = XA? XA[k] : 0;
q->flag |= extra_flag; // flag secondary
if (p->secondary >= 0) q->sub = -1; // don't output sub-optimal score
if (k && p->secondary < 0) // if supplementary
if (l && p->secondary < 0) // if supplementary
q->flag |= (opt->flag&MEM_F_NO_MULTI)? 0x10000 : 0x800;
if (k && !p->is_alt && q->mapq > aa.a[0].mapq) q->mapq = aa.a[0].mapq;
if (l && !p->is_alt && q->mapq > aa.a[0].mapq) q->mapq = aa.a[0].mapq;
++l;
}
if (aa.n == 0) { // no alignments good enough; then write an unaligned record
mem_aln_t t;

View File

@ -70,6 +70,7 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *
if (q->n < MIN_DIR_CNT) {
fprintf(stderr, "[M::%s] skip orientation %c%c as there are not enough pairs\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
r->failed = 1;
free(q->a);
continue;
} else fprintf(stderr, "[M::%s] analyzing insert size distribution for orientation %c%c...\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
ks_introsort_64(q->n, q->a);

4
bwt.c
View File

@ -355,7 +355,7 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv,
return bwt_smem1a(bwt, len, q, x, min_intv, 0, mem, tmpvec);
}
int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int max_intv, bwtintv_t *mem)
int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem)
{
int i, c;
bwtintv_t ik, ok[4];
@ -367,7 +367,7 @@ int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int m
if (q[i] < 4) { // an A/C/G/T base
c = 3 - q[i]; // complement of q[i]
bwt_extend(bwt, &ik, ok, 0);
if (ok[c].x[2] < max_intv) {
if (ok[c].x[2] < max_intv && i - x >= min_len) {
*mem = ok[c];
mem->info = (uint64_t)x<<32 | (i + 1);
return i + 1;

2
bwt.h
View File

@ -121,7 +121,7 @@ extern "C" {
int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int max_intv, bwtintv_t *mem);
int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem);
#ifdef __cplusplus
}

2
main.c
View File

@ -4,7 +4,7 @@
#include "utils.h"
#ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "0.7.10-r933-dirty"
#define PACKAGE_VERSION "0.7.10-r943-dirty"
#endif
int bwa_fa2pac(int argc, char *argv[]);