diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..e386b27 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,5 @@ +language: c +compiler: + - gcc + - clang +script: make diff --git a/Makefile b/Makefile index d0d50cf..1d6ae44 100644 --- a/Makefile +++ b/Makefile @@ -4,8 +4,8 @@ CFLAGS= -g -Wall -Wno-unused-function -O2 WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS AR= ar DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) -LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwashm.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o -AOBJS= QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ +LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o +AOBJS= QSufSort.o bwt_gen.o bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ is.o bwtindex.o bwape.o kopen.o pemerge.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ bwtsw2_chain.o fastmap.o bwtsw2_pair.o diff --git a/README-alt.md b/README-alt.md index 8dac185..024890b 100644 --- a/README-alt.md +++ b/README-alt.md @@ -3,8 +3,8 @@ Since version 0.7.11, BWA-MEM supports read mapping against a reference genome with long alternative haplotypes present in separate ALT contigs. To use the ALT-aware mode, users need to provide pairwise ALT-to-reference alignment in the -SAM format and rename the file to ""*idxbase*.alt". For GRCh38, this alignment is -available from the [BWA resource bundle for GRCh38][res]. +SAM format and rename the file to ""*idxbase*.alt". For GRCh38, this alignment +is available from the [BWA resource bundle for GRCh38][res]. #### Option 1: Mapping to the official GRCh38 with ALT contigs @@ -37,7 +37,7 @@ cp bwa-hs38-res/hs38d4.fa.alt . ``` Perform mapping: ```sh -bwa mem -g.8 hs38d4.fa read1.fq read2.fq \ +bwa mem hs38d4.fa read1.fq read2.fq \ | samblaster \ | bwa-hs38-res/k8-linux bwa-postalt.js -p postinfo hs38d4.fa.alt \ | samtools view -bS - > aln.unsrt.bam @@ -45,6 +45,9 @@ bwa mem -g.8 hs38d4.fa read1.fq read2.fq \ This command line generates `postinfo.ctw` which loosely evaluates the presence of an ALT contig with an empirical score at the last column. +**If you are not interested in the way BWA-MEM performs ALT mapping, you can +skip the rest of this documentation.** + ## Background GRCh38 ALT contigs are totaled 109Mb in length, spanning 60Mbp genomic regions. @@ -65,6 +68,8 @@ implementation encourages researchers to use ALT contigs soon and often. ## Methods +### Sequence alignment + As of now, ALT mapping is done in two separate steps: BWA-MEM mapping and postprocessing. @@ -108,6 +113,27 @@ pow(4,s_i)}` is the posterior of c_k given a read r mapped to it with a Smith-Waterman score s_k. This weight is reported in `postinfo.ctw` in the option 2 above. +### On the Completeness of GRCh38+ALT + +While GRCh38 is much more complete than GRCh37, it is still missing some true +human sequences. To make sure every piece of sequence in the reference assembly +is correct, the [Genome Reference Consortium][grc] (GRC) require each ALT contig +to have enough support from multiple sources before considering to add it to the +reference assembly. This careful procedure has left out some sequences, one of +which is [this example][novel], a 10kb contig assembled from CHM1 short +reads and present also in NA12878. You can try [BLAT][blat] or [BLAST][blast] to +see where it maps. + +For a more complete reference genome, we compiled a new set of decoy sequences +from GenBank clones and the de novo assembly of 254 public [SGDP][sgdp] samples. +The sequences are included in `hs38d4-extra.fa` from the [BWA resource bundle +for GRCh38][res]. + +In addition to decoy, we also put multiple alleles of HLA genes in +`hs38d4-extra.fa`. These genomic sequences were acquired from [IMGT/HLA][hladb], +version 3.18.0. Script `bwa-postalt.js` also helps to genotype HLA genes, though +not to high resolution for now. + ## Problems and Future Development There are some uncertainties about ALT mappings - we are not sure whether they @@ -119,5 +145,12 @@ for performance; if not, we will try new designs. It is also possible that we may make breakthrough on the representation of multiple genomes, in which case, we can even get rid of ALT contigs once for all. + [res]: https://sourceforge.net/projects/bio-bwa/files/ [sb]: https://github.com/GregoryFaust/samblaster +[grc]: http://www.ncbi.nlm.nih.gov/projects/genome/assembly/grc/ +[novel]: https://gist.github.com/lh3/9935148b71f04ba1a8cc +[blat]: https://genome.ucsc.edu/cgi-bin/hgBlat +[blast]: http://blast.st-va.ncbi.nlm.nih.gov/Blast.cgi?PROGRAM=blastn&PAGE_TYPE=BlastSearch&LINK_LOC=blasthome +[sgdp]: http://www.simonsfoundation.org/life-sciences/simons-genome-diversity-project/ +[hladb]: http://www.ebi.ac.uk/ipd/imgt/hla/ diff --git a/bwa.c b/bwa.c index 30e5284..80a4b8f 100644 --- a/bwa.c +++ b/bwa.c @@ -14,6 +14,7 @@ int bwa_verbose = 3; char bwa_rg_id[256]; +char *bwa_pg; /************************ * Batch FASTA/Q reader * @@ -352,7 +353,7 @@ void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line) for (i = 0; i < bns->n_seqs; ++i) err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len); if (rg_line) err_printf("%s\n", rg_line); - err_printf("%s\n", bwa_pg); + if (bwa_pg) err_printf("%s\n", bwa_pg); } static char *bwa_escape(char *s) diff --git a/example.c b/example.c index a6c9bdd..4e8494d 100644 --- a/example.c +++ b/example.c @@ -7,10 +7,6 @@ #include "kseq.h" // for the FASTA/Q parser KSEQ_DECLARE(gzFile) -#ifdef USE_MALLOC_WRAPPERS -# include "malloc_wrap.h" -#endif - int main(int argc, char *argv[]) { bwaidx_t *idx; @@ -47,10 +43,10 @@ int main(int argc, char *argv[]) if (ar.a[i].secondary >= 0) continue; // skip secondary alignments a = mem_reg2aln(opt, idx->bns, idx->pac, ks->seq.l, ks->seq.s, &ar.a[i]); // get forward-strand position and CIGAR // print alignment - err_printf("%s\t%c\t%s\t%ld\t%d\t", ks->name.s, "+-"[a.is_rev], idx->bns->anns[a.rid].name, (long)a.pos, a.mapq); + printf("%s\t%c\t%s\t%ld\t%d\t", ks->name.s, "+-"[a.is_rev], idx->bns->anns[a.rid].name, (long)a.pos, a.mapq); for (k = 0; k < a.n_cigar; ++k) // print CIGAR - err_printf("%d%c", a.cigar[k]>>4, "MIDSH"[a.cigar[k]&0xf]); - err_printf("\t%d\n", a.NM); // print edit distance + printf("%d%c", a.cigar[k]>>4, "MIDSH"[a.cigar[k]&0xf]); + printf("\t%d\n", a.NM); // print edit distance free(a.cigar); // don't forget to deallocate CIGAR } free(ar.a); // and deallocate the hit list @@ -58,7 +54,7 @@ int main(int argc, char *argv[]) free(opt); kseq_destroy(ks); - err_gzclose(fp); + gzclose(fp); bwa_idx_destroy(idx); return 0; } diff --git a/main.c b/main.c index e0f0edb..1a18559 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r912-dirty" +#define PACKAGE_VERSION "0.7.10-r915-dirty" #endif int bwa_fa2pac(int argc, char *argv[]); @@ -26,8 +26,6 @@ int main_shm(int argc, char *argv[]); int main_pemerge(int argc, char *argv[]); -char *bwa_pg; - static int usage() { fprintf(stderr, "\n"); @@ -61,6 +59,7 @@ static int usage() int main(int argc, char *argv[]) { + extern char *bwa_pg; int i, ret; double t_real; kstring_t pg = {0,0,0};