Merge branch 'dev'

This commit is contained in:
Heng Li 2014-10-17 16:39:24 -04:00
commit fa50c1b77f
6 changed files with 51 additions and 17 deletions

5
.travis.yml 100644
View File

@ -0,0 +1,5 @@
language: c
compiler:
- gcc
- clang
script: make

View File

@ -4,8 +4,8 @@ CFLAGS= -g -Wall -Wno-unused-function -O2
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
AR= ar
DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC)
LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwashm.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o
AOBJS= QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o
AOBJS= QSufSort.o bwt_gen.o bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
is.o bwtindex.o bwape.o kopen.o pemerge.o \
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
bwtsw2_chain.o fastmap.o bwtsw2_pair.o

View File

@ -3,8 +3,8 @@
Since version 0.7.11, BWA-MEM supports read mapping against a reference genome
with long alternative haplotypes present in separate ALT contigs. To use the
ALT-aware mode, users need to provide pairwise ALT-to-reference alignment in the
SAM format and rename the file to ""*idxbase*.alt". For GRCh38, this alignment is
available from the [BWA resource bundle for GRCh38][res].
SAM format and rename the file to ""*idxbase*.alt". For GRCh38, this alignment
is available from the [BWA resource bundle for GRCh38][res].
#### Option 1: Mapping to the official GRCh38 with ALT contigs
@ -37,7 +37,7 @@ cp bwa-hs38-res/hs38d4.fa.alt .
```
Perform mapping:
```sh
bwa mem -g.8 hs38d4.fa read1.fq read2.fq \
bwa mem hs38d4.fa read1.fq read2.fq \
| samblaster \
| bwa-hs38-res/k8-linux bwa-postalt.js -p postinfo hs38d4.fa.alt \
| samtools view -bS - > aln.unsrt.bam
@ -45,6 +45,9 @@ bwa mem -g.8 hs38d4.fa read1.fq read2.fq \
This command line generates `postinfo.ctw` which loosely evaluates the presence
of an ALT contig with an empirical score at the last column.
**If you are not interested in the way BWA-MEM performs ALT mapping, you can
skip the rest of this documentation.**
## Background
GRCh38 ALT contigs are totaled 109Mb in length, spanning 60Mbp genomic regions.
@ -65,6 +68,8 @@ implementation encourages researchers to use ALT contigs soon and often.
## Methods
### Sequence alignment
As of now, ALT mapping is done in two separate steps: BWA-MEM mapping and
postprocessing.
@ -108,6 +113,27 @@ pow(4,s_i)}` is the posterior of c_k given a read r mapped to it with a
Smith-Waterman score s_k. This weight is reported in `postinfo.ctw` in the
option 2 above.
### On the Completeness of GRCh38+ALT
While GRCh38 is much more complete than GRCh37, it is still missing some true
human sequences. To make sure every piece of sequence in the reference assembly
is correct, the [Genome Reference Consortium][grc] (GRC) require each ALT contig
to have enough support from multiple sources before considering to add it to the
reference assembly. This careful procedure has left out some sequences, one of
which is [this example][novel], a 10kb contig assembled from CHM1 short
reads and present also in NA12878. You can try [BLAT][blat] or [BLAST][blast] to
see where it maps.
For a more complete reference genome, we compiled a new set of decoy sequences
from GenBank clones and the de novo assembly of 254 public [SGDP][sgdp] samples.
The sequences are included in `hs38d4-extra.fa` from the [BWA resource bundle
for GRCh38][res].
In addition to decoy, we also put multiple alleles of HLA genes in
`hs38d4-extra.fa`. These genomic sequences were acquired from [IMGT/HLA][hladb],
version 3.18.0. Script `bwa-postalt.js` also helps to genotype HLA genes, though
not to high resolution for now.
## Problems and Future Development
There are some uncertainties about ALT mappings - we are not sure whether they
@ -119,5 +145,12 @@ for performance; if not, we will try new designs. It is also possible that we
may make breakthrough on the representation of multiple genomes, in which case,
we can even get rid of ALT contigs once for all.
[res]: https://sourceforge.net/projects/bio-bwa/files/
[sb]: https://github.com/GregoryFaust/samblaster
[grc]: http://www.ncbi.nlm.nih.gov/projects/genome/assembly/grc/
[novel]: https://gist.github.com/lh3/9935148b71f04ba1a8cc
[blat]: https://genome.ucsc.edu/cgi-bin/hgBlat
[blast]: http://blast.st-va.ncbi.nlm.nih.gov/Blast.cgi?PROGRAM=blastn&PAGE_TYPE=BlastSearch&LINK_LOC=blasthome
[sgdp]: http://www.simonsfoundation.org/life-sciences/simons-genome-diversity-project/
[hladb]: http://www.ebi.ac.uk/ipd/imgt/hla/

3
bwa.c
View File

@ -14,6 +14,7 @@
int bwa_verbose = 3;
char bwa_rg_id[256];
char *bwa_pg;
/************************
* Batch FASTA/Q reader *
@ -352,7 +353,7 @@ void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line)
for (i = 0; i < bns->n_seqs; ++i)
err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len);
if (rg_line) err_printf("%s\n", rg_line);
err_printf("%s\n", bwa_pg);
if (bwa_pg) err_printf("%s\n", bwa_pg);
}
static char *bwa_escape(char *s)

View File

@ -7,10 +7,6 @@
#include "kseq.h" // for the FASTA/Q parser
KSEQ_DECLARE(gzFile)
#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
#endif
int main(int argc, char *argv[])
{
bwaidx_t *idx;
@ -47,10 +43,10 @@ int main(int argc, char *argv[])
if (ar.a[i].secondary >= 0) continue; // skip secondary alignments
a = mem_reg2aln(opt, idx->bns, idx->pac, ks->seq.l, ks->seq.s, &ar.a[i]); // get forward-strand position and CIGAR
// print alignment
err_printf("%s\t%c\t%s\t%ld\t%d\t", ks->name.s, "+-"[a.is_rev], idx->bns->anns[a.rid].name, (long)a.pos, a.mapq);
printf("%s\t%c\t%s\t%ld\t%d\t", ks->name.s, "+-"[a.is_rev], idx->bns->anns[a.rid].name, (long)a.pos, a.mapq);
for (k = 0; k < a.n_cigar; ++k) // print CIGAR
err_printf("%d%c", a.cigar[k]>>4, "MIDSH"[a.cigar[k]&0xf]);
err_printf("\t%d\n", a.NM); // print edit distance
printf("%d%c", a.cigar[k]>>4, "MIDSH"[a.cigar[k]&0xf]);
printf("\t%d\n", a.NM); // print edit distance
free(a.cigar); // don't forget to deallocate CIGAR
}
free(ar.a); // and deallocate the hit list
@ -58,7 +54,7 @@ int main(int argc, char *argv[])
free(opt);
kseq_destroy(ks);
err_gzclose(fp);
gzclose(fp);
bwa_idx_destroy(idx);
return 0;
}

5
main.c
View File

@ -4,7 +4,7 @@
#include "utils.h"
#ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "0.7.10-r912-dirty"
#define PACKAGE_VERSION "0.7.10-r915-dirty"
#endif
int bwa_fa2pac(int argc, char *argv[]);
@ -26,8 +26,6 @@ int main_shm(int argc, char *argv[]);
int main_pemerge(int argc, char *argv[]);
char *bwa_pg;
static int usage()
{
fprintf(stderr, "\n");
@ -61,6 +59,7 @@ static int usage()
int main(int argc, char *argv[])
{
extern char *bwa_pg;
int i, ret;
double t_real;
kstring_t pg = {0,0,0};