diff --git a/README-alt.md b/README-alt.md index 7d3a3c7..3460cdd 100644 --- a/README-alt.md +++ b/README-alt.md @@ -1,7 +1,7 @@ -## Getting Started +## For the Impatient ```sh -# Download the bwa-0.7.11 binary package +# Download the bwa-0.7.11 binary package (download link may change) wget -O- http://sourceforge.net/projects/bio-bwa/files/bwakit-0.7.11_x64-linux.tar.bz2/download \ | gzip -dc | tar xf - # Generate the GRCh38+ALT+decoy+HLA and create the BWA index @@ -11,22 +11,9 @@ bwa.kit/bwa index hs38d6.fa # create BWA index bwa.kit/run-bwamem -o out hs38d6.fa read1.fq read2.fq | sh # skip "|sh" to show command lines ``` -This will generate the following files: - -* `out.aln.bam`: unsorted alignments with ALT-aware mapping quality. In this - file, one read may be placed on multiple overlapping ALT contigs at the same - time even if the read is mapped better to some contigs than others. This makes - it possible to analyze each contig independent of others. - -* `out.hla.top`: best genotypes for HLA-A, -B, -C, -DQA1, -DQB1 and -DRB1 genes. - -* `out.hla.all`: other possible genotypes on the six HLA genes. - -* `out.log.*`: bwa-mem, samblaster and HLA typing log files. - -Note that `run-bwamem` only prints command lines but doesn't execute them. It -is advised to have a look at the command lines before passing them to `sh` for -actual execution. +This generates `out.aln.bam` as the final alignment, `out.hla.top` for best HLA +genotypes on each gene and `out.hla.all` for other possible HLA genotypes. +Please check out [bwa/bwakit/README.md][kithelp] for details. ## Background @@ -57,7 +44,7 @@ postprocessing. The `bwa.kit/run-bwamem` script performs the two steps when ALT contigs are present. The following picture shows an example about how BWA-MEM infers mapping quality and reports alignment after step 2: -![](https://raw.githubusercontent.com/lh3/bwa/dev/extras/alt-demo.png) +![](http://lh3lh3.users.sourceforge.net/images/alt-demo.png) #### Step 1: BWA-MEM mapping @@ -189,3 +176,4 @@ can even get rid of ALT contigs for good. [hla2]: http://nar.oxfordjournals.org/content/41/14/e142.full.pdf+html [hla3]: http://www.biomedcentral.com/1471-2164/15/325 [hla4]: http://genomemedicine.com/content/4/12/95 +[kithelp]: https://github.com/lh3/bwa/tree/master/bwakit diff --git a/bwa.c b/bwa.c index 053025e..2411b4a 100644 --- a/bwa.c +++ b/bwa.c @@ -367,13 +367,13 @@ int bwa_idx2mem(bwaidx_t *idx) * SAM header routines * ***********************/ -void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line) +void bwa_print_sam_hdr(const bntseq_t *bns, const char *hdr_line) { int i; extern char *bwa_pg; for (i = 0; i < bns->n_seqs; ++i) err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len); - if (rg_line) err_printf("%s\n", rg_line); + if (hdr_line) err_printf("%s\n", hdr_line); if (bwa_pg) err_printf("%s\n", bwa_pg); } @@ -422,3 +422,16 @@ err_set_rg: return 0; } +char *bwa_insert_header(const char *s, char *hdr) +{ + int len = 0; + if (s == 0 || s[0] != '@') return hdr; + if (hdr) { + len = strlen(hdr); + hdr = realloc(hdr, len + strlen(s) + 2); + hdr[len++] = '\n'; + strcpy(hdr + len, s); + } else hdr = strdup(s); + bwa_escape(hdr + len); + return hdr; +} diff --git a/bwa.h b/bwa.h index 830bc3c..1541c1c 100644 --- a/bwa.h +++ b/bwa.h @@ -51,8 +51,9 @@ extern "C" { int bwa_idx2mem(bwaidx_t *idx); int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx); - void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line); + void bwa_print_sam_hdr(const bntseq_t *bns, const char *hdr_line); char *bwa_set_rg(const char *s); + char *bwa_insert_header(const char *s, char *hdr); #ifdef __cplusplus } diff --git a/bwakit/README.md b/bwakit/README.md new file mode 100644 index 0000000..4430ffb --- /dev/null +++ b/bwakit/README.md @@ -0,0 +1,50 @@ +Bwakit is a self-consistent installation-free package of scripts and precompiled +binaries which provide an end-to-end solution to read mapping. In addition to +the basic mapping functionality implemented in bwa, bwakit is able to generate +proper human reference genome and to take advantage of ALT contigs, if present, +to improve read mapping and to perform HLA typing for high-coverage human data. +It can remap name- or coordinate-sorted BAM with read group and barcode +information retained. Bwakit also *optionally* trims adapters (via +[trimadap][ta]), marks duplicates (via [samblaster][sb]) and sorts the final +alignment (via [samtools][smtl]). + +Bwakit has two entry scripts: `run-gen-ref` which downloads and generates human +reference genomes, and `run-bwamem` which prints mapping command lines on the +standard output that can be piped to `sh` to execute. The two scripts will call +other programs or use data in `bwa.kit`. The following shows an example about +how to use bwakit: + +```sh +# Download the bwa-0.7.11 binary package (download link may change) +wget -O- http://sourceforge.net/projects/bio-bwa/files/bwakit/bwakit-0.7.11_x64-linux.tar.bz2/download \ + | gzip -dc | tar xf - +# Generate the GRCh38+ALT+decoy+HLA and create the BWA index +bwa.kit/run-gen-ref hs38d6 # download GRCh38 and write hs38d6.fa +bwa.kit/bwa index hs38d6.fa # create BWA index +# mapping +bwa.kit/run-bwamem -o out hs38d6.fa read1.fq read2.fq | sh +``` + +The last mapping command line will generate the following files: + +* `out.aln.bam`: unsorted alignments with ALT-aware mapping quality. In this + file, one read may be placed on multiple overlapping ALT contigs at the same + time even if the read is mapped better to some contigs than others. This makes + it possible to analyze each contig independent of others. + +* `out.hla.top`: best genotypes for HLA-A, -B, -C, -DQA1, -DQB1 and -DRB1 genes. + +* `out.hla.all`: other possible genotypes on the six HLA genes. + +* `out.log.*`: bwa-mem, samblaster and HLA typing log files. + +Bwakit can be [downloaded here][res]. It is only available to x86_64-linux. The +scripts in the package are available in the [bwa/bwakit][kit] directory. +Packaging is done manually for now. + + +[res]: https://sourceforge.net/projects/bio-bwa/files/ +[sb]: https://github.com/GregoryFaust/samblaster +[ta]: https://github.com/lh3/seqtk/blob/master/trimadap.c +[smtl]: http://www.htslib.org +[kit]: https://github.com/lh3/bwa/tree/master/bwakit diff --git a/extras/run-HLA b/bwakit/run-HLA similarity index 100% rename from extras/run-HLA rename to bwakit/run-HLA diff --git a/extras/run-bwamem b/bwakit/run-bwamem similarity index 82% rename from extras/run-bwamem rename to bwakit/run-bwamem index 9d5fa19..4c75de9 100755 --- a/extras/run-bwamem +++ b/bwakit/run-bwamem @@ -31,16 +31,16 @@ Examples: run-bwamem -o prefix -t8 -R"@RG\tID:foo\tSM:bar" hs38d6.fa read1.fq.gz read2.fq.gz - * Remap coordinate-sorted BAM, trim Illumina PE adapters and sort the output. The BAM - may contain single-end or paired-end reads, or a mixture of the two types. Read groups - are not transferred to the output BAM, though. + * Remap coordinate-sorted BAM, transfer read groups tags, trim Illumina PE adapters and + sort the output. The BAM may contain single-end or paired-end reads, or a mixture of + the two types. Specifying -R stops read group transfer. run-bwamem -sao prefix hs38d6.fa old-srt.bam * Remap name-grouped BAM and mark duplicates. Note that in this case, all reads from a single library should be aligned at the same time. Paired-end only. - run-bwamem -Sdo prefix hs38d6.fa old-unsrt.bam + run-bwamem -Sdo prefix hs38d6.fa old-unsrt.bam Output files: @@ -51,8 +51,6 @@ Output files: ') if @ARGV < 2; -warn("WARNING: many programs require read groups. Please specify with -R if you can.\n") unless defined($opts{R}); - my $idx = $ARGV[0]; my $exepath = $0 =~/^\S+\/[^\/\s]+/? $0 : &which($0); @@ -82,7 +80,7 @@ if (defined $opts{o}) { last if substr($ARGV[1], $i, 1) ne substr($ARGV[2], $i, 1) } $prefix = substr($ARGV[1], 0, $i) if $i > 0; -} elsif ($ARGV[1] =~ /^(\S+)\.(fastq|fq|fasta|fa|mag|sam|sam\.gz|mag\.gz|fasta\.gz|fa\.gz|fastq\.gz|fq\.gz|bam)$/) { +} elsif ($ARGV[1] =~ /^(\S+)\.(fastq|fq|fasta|fa|mag|mag\.gz|fasta\.gz|fa\.gz|fastq\.gz|fq\.gz|bam)$/) { $prefix = $1; } die("ERROR: failed to identify the prefix for output. Please specify -p.\n") unless defined($prefix); @@ -98,19 +96,36 @@ for my $f (@ARGV[1..$#ARGV]) { } my $is_pe = (defined($opts{p}) || @ARGV >= 3)? 1 : 0; -my $is_sam = $ARGV[1] =~ /\.(sam|sam\.gz)$/? 1 : 0; my $is_bam = $ARGV[1] =~ /\.bam$/? 1 : 0; if (defined($opts{x})) { delete($opts{d}); delete($opts{a}); delete $opts{p}; } +# for BAM input, find @RG header lines +my @RG_lines = (); +if ($is_bam && !defined($opts{R})) { + my $fh; + open($fh, "$root/samtools view -H $ARGV[1] |") || die; + while (<$fh>) { + chomp; + if (/^\@RG\t/) { + s/\t/\\t/g; + push(@RG_lines, "-H'$_'"); + } + } + close($fh); +} + +warn("WARNING: many programs require read groups. Please specify with -R if you can.\n") if !defined($opts{R}) && @RG_lines == 0; + my $cmd = ''; -if ($is_sam || $is_bam) { - my $cmd_sam2bam = $is_sam? "$root/htsbox samview -uS $ARGV[1] \\\n" : "cat $ARGV[1] \\\n"; +if ($is_bam) { + my $cmd_sam2bam = "cat $ARGV[1] \\\n"; my $ntmps = int($size / 4e9) + 1; - my $cmd_shuf = ($is_bam || $is_sam) && !defined($opts{S})? " | $root/htsbox bamshuf -uOn$ntmps - $prefix.shuf \\\n" : ""; - my $cmd_bam2fq = " | $root/htsbox bam2fq -O - \\\n"; + my $cmd_shuf = !defined($opts{S})? " | $root/htsbox bamshuf -uOn$ntmps - $prefix.shuf \\\n" : ""; + my $bam2fq_opt = @RG_lines > 0? " -t" : ""; + my $cmd_bam2fq = " | $root/htsbox bam2fq -O$bam2fq_opt - \\\n"; $cmd = $cmd_sam2bam . $cmd_shuf . $cmd_bam2fq; } elsif (@ARGV >= 3) { $cmd = "$root/seqtk mergepe $ARGV[1] $ARGV[2] \\\n"; @@ -119,6 +134,7 @@ if ($is_sam || $is_bam) { } my $bwa_opts = "-p " . ($opts{t} > 1? "-t$opts{t} " : "") . (defined($opts{x})? "-x $opts{x} " : "") . (defined($opts{R})? "-R'$opts{R}' " : ""); +$bwa_opts .= join(" ", @RG_lines) . " -C " if @RG_lines > 0; $cmd .= " | $root/trimadap 2> $prefix.log.trim \\\n" if defined($opts{a}); $cmd .= " | $root/bwa mem $bwa_opts$ARGV[0] - 2> $prefix.log.bwamem \\\n"; diff --git a/extras/run-gen-ref b/bwakit/run-gen-ref similarity index 100% rename from extras/run-gen-ref rename to bwakit/run-gen-ref diff --git a/extras/typeHLA-selctg.js b/bwakit/typeHLA-selctg.js similarity index 100% rename from extras/typeHLA-selctg.js rename to bwakit/typeHLA-selctg.js diff --git a/extras/typeHLA.js b/bwakit/typeHLA.js similarity index 100% rename from extras/typeHLA.js rename to bwakit/typeHLA.js diff --git a/extras/typeHLA.sh b/bwakit/typeHLA.sh similarity index 100% rename from extras/typeHLA.sh rename to bwakit/typeHLA.sh diff --git a/extras/alt-demo.graffle b/extras/alt-demo.graffle deleted file mode 100644 index 32a8f5f..0000000 --- a/extras/alt-demo.graffle +++ /dev/null @@ -1,330 +0,0 @@ - - - - - ActiveLayerIndex - 0 - ApplicationVersion - - com.omnigroup.OmniGraffle - 139.18.0.187838 - - AutoAdjust - - BackgroundGraphic - - Bounds - {{0, 0}, {576, 733}} - Class - SolidGraphic - ID - 2 - Style - - shadow - - Draws - NO - - stroke - - Draws - NO - - - - BaseZoom - 0 - CanvasOrigin - {0, 0} - ColumnAlign - 1 - ColumnSpacing - 36 - CreationDate - 2014-11-17 16:51:42 +0000 - Creator - Heng Li - DisplayScale - 1 0/72 in = 1 0/72 in - GraphDocumentVersion - 8 - GraphicsList - - - Bounds - {{35.699992179870605, 151.89999580383301}, {476, 224}} - Class - ShapedGraphic - FitText - YES - Flow - Resize - FontInfo - - Font - AndaleMono - Size - 12 - - ID - 28 - Shape - Rectangle - Style - - fill - - Draws - NO - - shadow - - Draws - NO - - stroke - - Draws - NO - - - Text - - Align - 0 - Pad - 0 - Text - {\rtf1\ansi\ansicpg1252\cocoartf1265\cocoasubrtf210 -\cocoascreenfonts1{\fonttbl\f0\fnil\fcharset0 Consolas;\f1\fnil\fcharset0 Consolas-Bold;} -{\colortbl;\red255\green255\blue255;\red0\green0\blue0;\red127\green127\blue127;\red255\green0\blue0; -\red204\green204\blue204;\red0\green0\blue255;\red0\green128\blue0;\red255\green128\blue0;} -\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural - -\f0\fs24 \cf2 Read: A\cf0 TCAGCATC\ -\cf2 \ - ALT ctg 1: \cf3 TGA\cf3 AA---CGAATGCAAATGGTCA -\f1\b \cf4 ATCAGCATC -\f0\b0 \cf3 GAACTAGTCACAT\cf2 \ - \cf3 |||||\cf5 (high div) \cf3 ||||||\cf5 (novel ins)\cf3 ||||||||||\cf2 \ -Chromosome:\cf3 GCGTACATGATACGA -\f1\b \cf6 ATCgGCATC -\f0\b0 \cf3 ATGGTC-------------CTAGTCACATCGTAATC\ -\cf2 \cf3 |||||||||||| ||||||||||\cf5 (novel ins) \cf3 ||||||||||\ -\cf2 ALT ctg 2:\cf3 TGATACGA -\f1\b \cf7 ATCgcCATC -\f0\b0 \cf3 ATGGTCA -\f1\b \cf8 ATCgcCAgC -\f0\b0 \cf3 GAACTAGTCACAT\ -\ -\cf2 4 potential hits: -\f1\b \cf4 ATCAGCATC -\f0\b0 \cf0 > -\f1\b \cf6 ATCgGCATC -\f0\b0 \cf0 > -\f1\b \cf7 ATCgcCATC -\f0\b0 \cf2 > -\f1\b \cf8 ATCgcCAgC\ -\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural - -\f0\b0 \cf0 2 hit groups: \{ -\f1\b \cf4 ATCAGCATC -\f0\b0 \cf0 , -\f1\b \cf8 ATCgcCAgC -\f0\b0 \cf2 \} and\cf0 \{ -\f1\b \cf6 ATCgGCATC -\f0\b0 \cf2 , -\f1\b \cf7 ATCgcCATC -\f0\b0 \cf2 \}\ -\cf0 Hits considered in mapQ: -\f1\b \cf4 ATCAGCATC -\f0\b0 \cf0 and -\f1\b \cf6 ATCgGCATC -\f0\b0 \cf2 (best from each group) -\f1\b \cf6 \ -\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural - -\f0\b0 \cf3 \ -\cf2 In the output SAM: -\f1\b \cf6 ATCgGCATC -\f0\b0 \cf2 as the primary SAM line with mapQ=0\ - -\f1\b \cf4 ATCAGCATC -\f0\b0 \cf2 as a supplementary line with mapQ>0\ - -\f1\b \cf8 ATCgcCAgC -\f0\b0 \cf2 as a supplementary line with mapQ>0\ - -\f1\b \cf7 ATCgcCATC -\f0\b0 \cf2 in an XA tag, not as a separate line} - VerticalPad - 0 - - Wrap - NO - - - GridInfo - - GridSpacing - 7.1999998092651367 - MajorGridSpacing - 10 - SnapsToGrid - YES - - GuidesLocked - NO - GuidesVisible - YES - HPages - 1 - ImageCounter - 1 - KeepToScale - - Layers - - - Lock - NO - Name - Layer 1 - Print - YES - View - YES - - - LayoutInfo - - Animate - NO - circoMinDist - 18 - circoSeparation - 0.0 - layoutEngine - dot - neatoSeparation - 0.0 - twopiSeparation - 0.0 - - LinksVisible - NO - MagnetsVisible - NO - MasterSheets - - ModificationDate - 2014-11-17 18:28:10 +0000 - Modifier - Heng Li - NotesVisible - NO - Orientation - 2 - OriginVisible - NO - PageBreaks - YES - PrintInfo - - NSBottomMargin - - float - 41 - - NSHorizonalPagination - - coded - BAtzdHJlYW10eXBlZIHoA4QBQISEhAhOU051bWJlcgCEhAdOU1ZhbHVlAISECE5TT2JqZWN0AIWEASqEhAFxlwCG - - NSLeftMargin - - float - 18 - - NSPaperSize - - size - {612, 792} - - NSPrintReverseOrientation - - int - 0 - - NSRightMargin - - float - 18 - - NSTopMargin - - float - 18 - - - PrintOnePage - - ReadOnly - NO - RowAlign - 1 - RowSpacing - 36 - SheetTitle - Canvas 1 - SmartAlignmentGuidesActive - YES - SmartDistanceGuidesActive - YES - UniqueID - 1 - UseEntirePage - - VPages - 1 - WindowInfo - - CurrentSheet - 0 - ExpandedCanvases - - - name - Canvas 1 - - - Frame - {{367, 6}, {710, 872}} - ListView - - OutlineWidth - 142 - RightSidebar - - ShowRuler - - Sidebar - - SidebarWidth - 120 - VisibleRegion - {{0, 0}, {575, 733}} - Zoom - 1 - ZoomValues - - - Canvas 1 - 1 - 1 - - - - - diff --git a/extras/alt-demo.png b/extras/alt-demo.png deleted file mode 100644 index 71f4976..0000000 Binary files a/extras/alt-demo.png and /dev/null differ diff --git a/fastmap.c b/fastmap.c index 988b6ed..312b1bc 100644 --- a/fastmap.c +++ b/fastmap.c @@ -44,7 +44,7 @@ int main_mem(int argc, char *argv[]) kseq_t *ks, *ks2 = 0; bseq1_t *seqs; bwaidx_t *idx; - char *p, *rg_line = 0; + char *p, *rg_line = 0, *hdr_line = 0; const char *mode = 0; void *ko = 0, *ko2 = 0; int64_t n_processed = 0; @@ -55,7 +55,7 @@ int main_mem(int argc, char *argv[]) opt = mem_opt_init(); memset(&opt0, 0, sizeof(mem_opt_t)); - while ((c = getopt(argc, argv, "epaFMCSPHVYjk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:y:K:X:")) >= 0) { + while ((c = getopt(argc, argv, "epaFMCSPVYjk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:y:K:X:H:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1; else if (c == 'x') mode = optarg; else if (c == 'w') opt->w = atoi(optarg), opt0.w = 1; @@ -115,6 +115,8 @@ int main_mem(int argc, char *argv[]) opt->pen_clip3 = strtol(p+1, &p, 10); } else if (c == 'R') { if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; // FIXME: memory leak + } else if (c == 'H') { + hdr_line = bwa_insert_header(optarg, hdr_line); } else if (c == 'I') { // specify the insert size distribution pes0 = pes; pes[1].failed = 0; @@ -135,6 +137,12 @@ int main_mem(int argc, char *argv[]) } else return 1; } + + if (rg_line) { + hdr_line = bwa_insert_header(rg_line, hdr_line); + free(rg_line); + } + if (opt->n_threads < 1) opt->n_threads = 1; if (optind + 1 >= argc || optind + 3 < argc) { fprintf(stderr, "\n"); @@ -169,6 +177,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, "\nInput/output options:\n\n"); fprintf(stderr, " -p smart pairing (ignoring in2.fq)\n"); fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n"); + fprintf(stderr, " -H STR insert an arbitrary header line [null]\n"); fprintf(stderr, " -j ignore ALT contigs\n"); fprintf(stderr, "\n"); fprintf(stderr, " -v INT verbose level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose); @@ -261,7 +270,7 @@ int main_mem(int argc, char *argv[]) } } if (!(opt->flag & MEM_F_ALN_REG)) - bwa_print_sam_hdr(idx->bns, rg_line); + bwa_print_sam_hdr(idx->bns, hdr_line); actual_chunk_size = fixed_chunk_size > 0? fixed_chunk_size : opt->chunk_size * opt->n_threads; while ((seqs = bseq_read(actual_chunk_size, &n, ks, ks2)) != 0) { int64_t size = 0; @@ -301,6 +310,7 @@ int main_mem(int argc, char *argv[]) free(seqs); } + free(hdr_line); free(opt); bwa_idx_destroy(idx); kseq_destroy(ks); diff --git a/main.c b/main.c index 5a2de61..d7ffb13 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r998-dirty" +#define PACKAGE_VERSION "0.7.10-r1005-dirty" #endif int bwa_fa2pac(int argc, char *argv[]);