Merge branch 'dev'
This commit is contained in:
commit
fa50c1b77f
|
|
@ -0,0 +1,5 @@
|
||||||
|
language: c
|
||||||
|
compiler:
|
||||||
|
- gcc
|
||||||
|
- clang
|
||||||
|
script: make
|
||||||
4
Makefile
4
Makefile
|
|
@ -4,8 +4,8 @@ CFLAGS= -g -Wall -Wno-unused-function -O2
|
||||||
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
|
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
|
||||||
AR= ar
|
AR= ar
|
||||||
DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC)
|
DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC)
|
||||||
LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwashm.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o
|
LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o
|
||||||
AOBJS= QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
|
AOBJS= QSufSort.o bwt_gen.o bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
|
||||||
is.o bwtindex.o bwape.o kopen.o pemerge.o \
|
is.o bwtindex.o bwape.o kopen.o pemerge.o \
|
||||||
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
|
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
|
||||||
bwtsw2_chain.o fastmap.o bwtsw2_pair.o
|
bwtsw2_chain.o fastmap.o bwtsw2_pair.o
|
||||||
|
|
|
||||||
|
|
@ -3,8 +3,8 @@
|
||||||
Since version 0.7.11, BWA-MEM supports read mapping against a reference genome
|
Since version 0.7.11, BWA-MEM supports read mapping against a reference genome
|
||||||
with long alternative haplotypes present in separate ALT contigs. To use the
|
with long alternative haplotypes present in separate ALT contigs. To use the
|
||||||
ALT-aware mode, users need to provide pairwise ALT-to-reference alignment in the
|
ALT-aware mode, users need to provide pairwise ALT-to-reference alignment in the
|
||||||
SAM format and rename the file to ""*idxbase*.alt". For GRCh38, this alignment is
|
SAM format and rename the file to ""*idxbase*.alt". For GRCh38, this alignment
|
||||||
available from the [BWA resource bundle for GRCh38][res].
|
is available from the [BWA resource bundle for GRCh38][res].
|
||||||
|
|
||||||
#### Option 1: Mapping to the official GRCh38 with ALT contigs
|
#### Option 1: Mapping to the official GRCh38 with ALT contigs
|
||||||
|
|
||||||
|
|
@ -37,7 +37,7 @@ cp bwa-hs38-res/hs38d4.fa.alt .
|
||||||
```
|
```
|
||||||
Perform mapping:
|
Perform mapping:
|
||||||
```sh
|
```sh
|
||||||
bwa mem -g.8 hs38d4.fa read1.fq read2.fq \
|
bwa mem hs38d4.fa read1.fq read2.fq \
|
||||||
| samblaster \
|
| samblaster \
|
||||||
| bwa-hs38-res/k8-linux bwa-postalt.js -p postinfo hs38d4.fa.alt \
|
| bwa-hs38-res/k8-linux bwa-postalt.js -p postinfo hs38d4.fa.alt \
|
||||||
| samtools view -bS - > aln.unsrt.bam
|
| samtools view -bS - > aln.unsrt.bam
|
||||||
|
|
@ -45,6 +45,9 @@ bwa mem -g.8 hs38d4.fa read1.fq read2.fq \
|
||||||
This command line generates `postinfo.ctw` which loosely evaluates the presence
|
This command line generates `postinfo.ctw` which loosely evaluates the presence
|
||||||
of an ALT contig with an empirical score at the last column.
|
of an ALT contig with an empirical score at the last column.
|
||||||
|
|
||||||
|
**If you are not interested in the way BWA-MEM performs ALT mapping, you can
|
||||||
|
skip the rest of this documentation.**
|
||||||
|
|
||||||
## Background
|
## Background
|
||||||
|
|
||||||
GRCh38 ALT contigs are totaled 109Mb in length, spanning 60Mbp genomic regions.
|
GRCh38 ALT contigs are totaled 109Mb in length, spanning 60Mbp genomic regions.
|
||||||
|
|
@ -65,6 +68,8 @@ implementation encourages researchers to use ALT contigs soon and often.
|
||||||
|
|
||||||
## Methods
|
## Methods
|
||||||
|
|
||||||
|
### Sequence alignment
|
||||||
|
|
||||||
As of now, ALT mapping is done in two separate steps: BWA-MEM mapping and
|
As of now, ALT mapping is done in two separate steps: BWA-MEM mapping and
|
||||||
postprocessing.
|
postprocessing.
|
||||||
|
|
||||||
|
|
@ -108,6 +113,27 @@ pow(4,s_i)}` is the posterior of c_k given a read r mapped to it with a
|
||||||
Smith-Waterman score s_k. This weight is reported in `postinfo.ctw` in the
|
Smith-Waterman score s_k. This weight is reported in `postinfo.ctw` in the
|
||||||
option 2 above.
|
option 2 above.
|
||||||
|
|
||||||
|
### On the Completeness of GRCh38+ALT
|
||||||
|
|
||||||
|
While GRCh38 is much more complete than GRCh37, it is still missing some true
|
||||||
|
human sequences. To make sure every piece of sequence in the reference assembly
|
||||||
|
is correct, the [Genome Reference Consortium][grc] (GRC) require each ALT contig
|
||||||
|
to have enough support from multiple sources before considering to add it to the
|
||||||
|
reference assembly. This careful procedure has left out some sequences, one of
|
||||||
|
which is [this example][novel], a 10kb contig assembled from CHM1 short
|
||||||
|
reads and present also in NA12878. You can try [BLAT][blat] or [BLAST][blast] to
|
||||||
|
see where it maps.
|
||||||
|
|
||||||
|
For a more complete reference genome, we compiled a new set of decoy sequences
|
||||||
|
from GenBank clones and the de novo assembly of 254 public [SGDP][sgdp] samples.
|
||||||
|
The sequences are included in `hs38d4-extra.fa` from the [BWA resource bundle
|
||||||
|
for GRCh38][res].
|
||||||
|
|
||||||
|
In addition to decoy, we also put multiple alleles of HLA genes in
|
||||||
|
`hs38d4-extra.fa`. These genomic sequences were acquired from [IMGT/HLA][hladb],
|
||||||
|
version 3.18.0. Script `bwa-postalt.js` also helps to genotype HLA genes, though
|
||||||
|
not to high resolution for now.
|
||||||
|
|
||||||
## Problems and Future Development
|
## Problems and Future Development
|
||||||
|
|
||||||
There are some uncertainties about ALT mappings - we are not sure whether they
|
There are some uncertainties about ALT mappings - we are not sure whether they
|
||||||
|
|
@ -119,5 +145,12 @@ for performance; if not, we will try new designs. It is also possible that we
|
||||||
may make breakthrough on the representation of multiple genomes, in which case,
|
may make breakthrough on the representation of multiple genomes, in which case,
|
||||||
we can even get rid of ALT contigs once for all.
|
we can even get rid of ALT contigs once for all.
|
||||||
|
|
||||||
|
|
||||||
[res]: https://sourceforge.net/projects/bio-bwa/files/
|
[res]: https://sourceforge.net/projects/bio-bwa/files/
|
||||||
[sb]: https://github.com/GregoryFaust/samblaster
|
[sb]: https://github.com/GregoryFaust/samblaster
|
||||||
|
[grc]: http://www.ncbi.nlm.nih.gov/projects/genome/assembly/grc/
|
||||||
|
[novel]: https://gist.github.com/lh3/9935148b71f04ba1a8cc
|
||||||
|
[blat]: https://genome.ucsc.edu/cgi-bin/hgBlat
|
||||||
|
[blast]: http://blast.st-va.ncbi.nlm.nih.gov/Blast.cgi?PROGRAM=blastn&PAGE_TYPE=BlastSearch&LINK_LOC=blasthome
|
||||||
|
[sgdp]: http://www.simonsfoundation.org/life-sciences/simons-genome-diversity-project/
|
||||||
|
[hladb]: http://www.ebi.ac.uk/ipd/imgt/hla/
|
||||||
|
|
|
||||||
3
bwa.c
3
bwa.c
|
|
@ -14,6 +14,7 @@
|
||||||
|
|
||||||
int bwa_verbose = 3;
|
int bwa_verbose = 3;
|
||||||
char bwa_rg_id[256];
|
char bwa_rg_id[256];
|
||||||
|
char *bwa_pg;
|
||||||
|
|
||||||
/************************
|
/************************
|
||||||
* Batch FASTA/Q reader *
|
* Batch FASTA/Q reader *
|
||||||
|
|
@ -352,7 +353,7 @@ void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line)
|
||||||
for (i = 0; i < bns->n_seqs; ++i)
|
for (i = 0; i < bns->n_seqs; ++i)
|
||||||
err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len);
|
err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len);
|
||||||
if (rg_line) err_printf("%s\n", rg_line);
|
if (rg_line) err_printf("%s\n", rg_line);
|
||||||
err_printf("%s\n", bwa_pg);
|
if (bwa_pg) err_printf("%s\n", bwa_pg);
|
||||||
}
|
}
|
||||||
|
|
||||||
static char *bwa_escape(char *s)
|
static char *bwa_escape(char *s)
|
||||||
|
|
|
||||||
12
example.c
12
example.c
|
|
@ -7,10 +7,6 @@
|
||||||
#include "kseq.h" // for the FASTA/Q parser
|
#include "kseq.h" // for the FASTA/Q parser
|
||||||
KSEQ_DECLARE(gzFile)
|
KSEQ_DECLARE(gzFile)
|
||||||
|
|
||||||
#ifdef USE_MALLOC_WRAPPERS
|
|
||||||
# include "malloc_wrap.h"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
int main(int argc, char *argv[])
|
int main(int argc, char *argv[])
|
||||||
{
|
{
|
||||||
bwaidx_t *idx;
|
bwaidx_t *idx;
|
||||||
|
|
@ -47,10 +43,10 @@ int main(int argc, char *argv[])
|
||||||
if (ar.a[i].secondary >= 0) continue; // skip secondary alignments
|
if (ar.a[i].secondary >= 0) continue; // skip secondary alignments
|
||||||
a = mem_reg2aln(opt, idx->bns, idx->pac, ks->seq.l, ks->seq.s, &ar.a[i]); // get forward-strand position and CIGAR
|
a = mem_reg2aln(opt, idx->bns, idx->pac, ks->seq.l, ks->seq.s, &ar.a[i]); // get forward-strand position and CIGAR
|
||||||
// print alignment
|
// print alignment
|
||||||
err_printf("%s\t%c\t%s\t%ld\t%d\t", ks->name.s, "+-"[a.is_rev], idx->bns->anns[a.rid].name, (long)a.pos, a.mapq);
|
printf("%s\t%c\t%s\t%ld\t%d\t", ks->name.s, "+-"[a.is_rev], idx->bns->anns[a.rid].name, (long)a.pos, a.mapq);
|
||||||
for (k = 0; k < a.n_cigar; ++k) // print CIGAR
|
for (k = 0; k < a.n_cigar; ++k) // print CIGAR
|
||||||
err_printf("%d%c", a.cigar[k]>>4, "MIDSH"[a.cigar[k]&0xf]);
|
printf("%d%c", a.cigar[k]>>4, "MIDSH"[a.cigar[k]&0xf]);
|
||||||
err_printf("\t%d\n", a.NM); // print edit distance
|
printf("\t%d\n", a.NM); // print edit distance
|
||||||
free(a.cigar); // don't forget to deallocate CIGAR
|
free(a.cigar); // don't forget to deallocate CIGAR
|
||||||
}
|
}
|
||||||
free(ar.a); // and deallocate the hit list
|
free(ar.a); // and deallocate the hit list
|
||||||
|
|
@ -58,7 +54,7 @@ int main(int argc, char *argv[])
|
||||||
|
|
||||||
free(opt);
|
free(opt);
|
||||||
kseq_destroy(ks);
|
kseq_destroy(ks);
|
||||||
err_gzclose(fp);
|
gzclose(fp);
|
||||||
bwa_idx_destroy(idx);
|
bwa_idx_destroy(idx);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
5
main.c
5
main.c
|
|
@ -4,7 +4,7 @@
|
||||||
#include "utils.h"
|
#include "utils.h"
|
||||||
|
|
||||||
#ifndef PACKAGE_VERSION
|
#ifndef PACKAGE_VERSION
|
||||||
#define PACKAGE_VERSION "0.7.10-r912-dirty"
|
#define PACKAGE_VERSION "0.7.10-r915-dirty"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
int bwa_fa2pac(int argc, char *argv[]);
|
int bwa_fa2pac(int argc, char *argv[]);
|
||||||
|
|
@ -26,8 +26,6 @@ int main_shm(int argc, char *argv[]);
|
||||||
|
|
||||||
int main_pemerge(int argc, char *argv[]);
|
int main_pemerge(int argc, char *argv[]);
|
||||||
|
|
||||||
char *bwa_pg;
|
|
||||||
|
|
||||||
static int usage()
|
static int usage()
|
||||||
{
|
{
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
|
|
@ -61,6 +59,7 @@ static int usage()
|
||||||
|
|
||||||
int main(int argc, char *argv[])
|
int main(int argc, char *argv[])
|
||||||
{
|
{
|
||||||
|
extern char *bwa_pg;
|
||||||
int i, ret;
|
int i, ret;
|
||||||
double t_real;
|
double t_real;
|
||||||
kstring_t pg = {0,0,0};
|
kstring_t pg = {0,0,0};
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue