Merge branch 'dev'
This commit is contained in:
commit
af606fc31a
|
|
@ -14,13 +14,13 @@ wget ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/H
|
|||
gzip -d GCA_000001405.15_GRCh38_full_analysis_set.fna.gz
|
||||
mv GCA_000001405.15_GRCh38_full_analysis_set.fna hs38a.fa
|
||||
bwa index hs38a.fa
|
||||
cp bwa-hs38-res/hs38d4.fa.alt hs38a.fa.alt
|
||||
cp bwa-hs38-bundle/hs38d4.fa.alt hs38a.fa.alt
|
||||
```
|
||||
|
||||
Perform mapping:
|
||||
```sh
|
||||
bwa mem hs38a.fa read1.fq read2.fq \
|
||||
| bwa-hs38-res/k8-linux bwa-postalt.js hs38a.fa.alt \
|
||||
| bwa-hs38-bundle/k8-linux bwa-postalt.js hs38a.fa.alt \
|
||||
| samtools view -bS - > aln.unsrt.bam
|
||||
```
|
||||
For short reads, the postprocessing script `bwa-postalt.js` runs at about the
|
||||
|
|
@ -30,14 +30,14 @@ same speed as BAM compression.
|
|||
|
||||
Construct the index:
|
||||
```sh
|
||||
cat hs38a.fa bwa-hs38-res/hs38d4-extra.fa > hs38d4.fa
|
||||
cat hs38a.fa bwa-hs38-bundle/hs38d4-extra.fa > hs38d4.fa
|
||||
bwa index hs38d4.fa
|
||||
cp bwa-hs38-res/hs38d4.fa.alt .
|
||||
cp bwa-hs38-bundle/hs38d4.fa.alt .
|
||||
```
|
||||
Perform mapping:
|
||||
```sh
|
||||
bwa mem hs38d4.fa read1.fq read2.fq \
|
||||
| bwa-hs38-res/k8-linux bwa-postalt.js -p postinfo hs38d4.fa.alt \
|
||||
| bwa-hs38-bundle/k8-linux bwa-postalt.js -p postinfo hs38d4.fa.alt \
|
||||
| samtools view -bS - > aln.unsrt.bam
|
||||
```
|
||||
This command line generates `postinfo.ctw` which loosely evaluates the presence
|
||||
|
|
@ -88,7 +88,9 @@ alignments and assigns mapQ following these two rules:
|
|||
|
||||
In theory, non-ALT alignments from step 1 should be identical to alignments
|
||||
against a reference genome with ALT contigs. In practice, the two types of
|
||||
alignments may differ in rare cases due to seeding heuristics.
|
||||
alignments may differ in rare cases due to seeding heuristics. When an ALT hit
|
||||
is significantly better than non-ALT hits, BWA-MEM may miss seeds on the
|
||||
non-ALT hits. This happens more often for contig mapping.
|
||||
|
||||
If we don't care about ALT hits, we may skip postprocessing (step 2).
|
||||
Nonetheless, postprocessing is recommended as it improves mapQ and gives more
|
||||
|
|
@ -134,7 +136,7 @@ not to high resolution for now.
|
|||
|
||||
### Evaluating ALT Mapping
|
||||
|
||||
(To come later...)
|
||||
(Coming soon...)
|
||||
|
||||
## Problems and Future Development
|
||||
|
||||
|
|
|
|||
18
bwamem.c
18
bwamem.c
|
|
@ -59,7 +59,7 @@ mem_opt_t *mem_opt_init()
|
|||
o->pen_unpaired = 17;
|
||||
o->pen_clip5 = o->pen_clip3 = 5;
|
||||
|
||||
o->max_mem_intv = 10;
|
||||
o->max_mem_intv = 20;
|
||||
|
||||
o->min_seed_len = 19;
|
||||
o->split_width = 10;
|
||||
|
|
@ -147,7 +147,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co
|
|||
if (seq[x] < 4) {
|
||||
if (1) {
|
||||
bwtintv_t m;
|
||||
x = bwt_seed_strategy1(bwt, len, seq, x, opt->max_mem_intv, &m);
|
||||
x = bwt_seed_strategy1(bwt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m);
|
||||
if (m.x[2] > 0) kv_push(bwtintv_t, a->mem, m);
|
||||
} else { // for now, we never come to this block which is slower
|
||||
x = bwt_smem1a(bwt, len, seq, x, start_width, opt->max_mem_intv, &a->mem1, a->tmpv);
|
||||
|
|
@ -274,7 +274,7 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bn
|
|||
bwtintv_t *p = &aux->mem.a[i];
|
||||
int step, count, slen = (uint32_t)p->info - (p->info>>32); // seed length
|
||||
int64_t k;
|
||||
if (slen < opt->min_seed_len) continue; // ignore if too short or too repetitive
|
||||
// if (slen < opt->min_seed_len) continue; // ignore if too short or too repetitive
|
||||
step = p->x[2] > opt->max_occ? p->x[2] / opt->max_occ : 1;
|
||||
for (k = count = 0; k < p->x[2] && count < opt->max_occ; k += step, ++count) {
|
||||
mem_chain_t tmp, *lower, *upper;
|
||||
|
|
@ -514,7 +514,8 @@ static void mem_mark_primary_se_core(const mem_opt_t *opt, int n, mem_alnreg_t *
|
|||
int min_l = a[i].qe - a[i].qb < a[j].qe - a[j].qb? a[i].qe - a[i].qb : a[j].qe - a[j].qb;
|
||||
if (e_min - b_max >= min_l * opt->mask_level) { // significant overlap
|
||||
if (a[j].sub == 0) a[j].sub = a[i].score;
|
||||
if (a[j].score - a[i].score <= tmp) ++a[j].sub_n;
|
||||
if (a[j].score - a[i].score <= tmp && (a[j].is_alt || !a[i].is_alt))
|
||||
++a[j].sub_n;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
|
@ -980,14 +981,14 @@ void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac,
|
|||
extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, mem_alnreg_v *a, int l_query, const char *query);
|
||||
kstring_t str;
|
||||
kvec_t(mem_aln_t) aa;
|
||||
int k;
|
||||
int k, l;
|
||||
char **XA = 0;
|
||||
|
||||
if (!(opt->flag & MEM_F_ALL))
|
||||
XA = mem_gen_alt(opt, bns, pac, a, s->l_seq, s->seq);
|
||||
kv_init(aa);
|
||||
str.l = str.m = 0; str.s = 0;
|
||||
for (k = 0; k < a->n; ++k) {
|
||||
for (k = l = 0; k < a->n; ++k) {
|
||||
mem_alnreg_t *p = &a->a[k];
|
||||
mem_aln_t *q;
|
||||
if (p->score < opt->T) continue;
|
||||
|
|
@ -999,9 +1000,10 @@ void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac,
|
|||
q->XA = XA? XA[k] : 0;
|
||||
q->flag |= extra_flag; // flag secondary
|
||||
if (p->secondary >= 0) q->sub = -1; // don't output sub-optimal score
|
||||
if (k && p->secondary < 0) // if supplementary
|
||||
if (l && p->secondary < 0) // if supplementary
|
||||
q->flag |= (opt->flag&MEM_F_NO_MULTI)? 0x10000 : 0x800;
|
||||
if (k && !p->is_alt && q->mapq > aa.a[0].mapq) q->mapq = aa.a[0].mapq;
|
||||
if (l && !p->is_alt && q->mapq > aa.a[0].mapq) q->mapq = aa.a[0].mapq;
|
||||
++l;
|
||||
}
|
||||
if (aa.n == 0) { // no alignments good enough; then write an unaligned record
|
||||
mem_aln_t t;
|
||||
|
|
|
|||
|
|
@ -70,6 +70,7 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *
|
|||
if (q->n < MIN_DIR_CNT) {
|
||||
fprintf(stderr, "[M::%s] skip orientation %c%c as there are not enough pairs\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
|
||||
r->failed = 1;
|
||||
free(q->a);
|
||||
continue;
|
||||
} else fprintf(stderr, "[M::%s] analyzing insert size distribution for orientation %c%c...\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
|
||||
ks_introsort_64(q->n, q->a);
|
||||
|
|
|
|||
4
bwt.c
4
bwt.c
|
|
@ -355,7 +355,7 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv,
|
|||
return bwt_smem1a(bwt, len, q, x, min_intv, 0, mem, tmpvec);
|
||||
}
|
||||
|
||||
int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int max_intv, bwtintv_t *mem)
|
||||
int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem)
|
||||
{
|
||||
int i, c;
|
||||
bwtintv_t ik, ok[4];
|
||||
|
|
@ -367,7 +367,7 @@ int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int m
|
|||
if (q[i] < 4) { // an A/C/G/T base
|
||||
c = 3 - q[i]; // complement of q[i]
|
||||
bwt_extend(bwt, &ik, ok, 0);
|
||||
if (ok[c].x[2] < max_intv) {
|
||||
if (ok[c].x[2] < max_intv && i - x >= min_len) {
|
||||
*mem = ok[c];
|
||||
mem->info = (uint64_t)x<<32 | (i + 1);
|
||||
return i + 1;
|
||||
|
|
|
|||
2
bwt.h
2
bwt.h
|
|
@ -121,7 +121,7 @@ extern "C" {
|
|||
int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
|
||||
int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
|
||||
|
||||
int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int max_intv, bwtintv_t *mem);
|
||||
int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue