From 32c4e3fe5a9530978070045932233d83b6d16b5d Mon Sep 17 00:00:00 2001
From: Heng Li <lh3@me.com>
Date: Sat, 20 Dec 2014 22:24:13 -0500
Subject: [PATCH 1/6] updated decoy sequences and names

---
 bwakit/README.md   | 10 ++++------
 bwakit/run-gen-ref | 10 +++++-----
 2 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/bwakit/README.md b/bwakit/README.md
index d56a9e4..8bf9b03 100644
--- a/bwakit/README.md
+++ b/bwakit/README.md
@@ -17,7 +17,7 @@ other programs or use data in `bwa.kit`. The following shows an example about
 how to use bwakit:
 
 ```sh
-# Download bwakit (or from <http://sourceforge.net/projects/bio-bwa/files/bwakit/> manually)
+# Download the bwa-0.7.11 binary package (download link may change)
 wget -O- http://sourceforge.net/projects/bio-bwa/files/bwakit/bwakit-0.7.11_x64-linux.tar.bz2/download \
   | gzip -dc | tar xf -
 # Generate the GRCh38+ALT+decoy+HLA and create the BWA index
@@ -50,7 +50,7 @@ bwa.kit
 |-- README.md                  This README file.
 |-- run-bwamem                 *Entry script* for the entire mapping pipeline.
 |-- bwa                        *BWA binary*
-|-- k8                         Interpreter for *.js scripts.
+|-- k8                         Interpretor for *.js scripts.
 |-- bwa-postalt.js             Post-process alignments to ALT contigs/decoys/HLA genes.
 |-- htsbox                     Used by run-bwamem for shuffling BAMs and BAM=>FASTQ.
 |-- samblaster                 MarkDuplicates for reads from the same library. v0.1.20
@@ -60,10 +60,8 @@ bwa.kit
 |
 |-- run-gen-ref                *Entry script* for generating human reference genomes.
 |-- resource-GRCh38            Resources for generating GRCh38
-|   |-- hs38d6-decoy.nt.anno   Top decoy-to-nt hits. Not used by any scripts.
-|   |-- hs38d6-decoy.rm.out    RepeatMasker report. Not used.
-|   |-- hs38d6-extra.fa        Decoy and HLA gene sequences. Used by run-gen-ref.
-|   `-- hs38d6.fa.alt          ALT-to-GRCh38 alignment. Used by run-gen-ref.
+|   |-- hs38D1-extra.fa        Decoy and HLA gene sequences. Used by run-gen-ref.
+|   `-- hs38D1.fa.alt          ALT-to-GRCh38 alignment. Used by run-gen-ref.
 |
 |-- run-HLA                    HLA typing for sequences extracted by bwa-postalt.js.
 |-- typeHLA.sh                 Type one HLA-gene. Called by run-HLA.
diff --git a/bwakit/run-gen-ref b/bwakit/run-gen-ref
index 3be3325..8a6baa0 100755
--- a/bwakit/run-gen-ref
+++ b/bwakit/run-gen-ref
@@ -6,16 +6,16 @@ url38="ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals
 url37d5="ftp://ftp.ncbi.nlm.nih.gov/1000genomes/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz"
 
 if [ $# -eq 0 ]; then
-	echo "Usage: $0 <hs38|hs38a|hs38d6|hs37|hs37d5>"
+	echo "Usage: $0 <hs38|hs38a|hs38D1|hs37|hs37d5>"
 	exit 1;
 fi
 
-if [ $1 == "hs38d6" ]; then
-	(wget -O- $url38 | gzip -dc; cat $root/resource-GRCh38/hs38d6-extra.fa) > $1.fa
-	[ ! -f $1.fa.alt ] && cp $root/resource-GRCh38/hs38d6.fa.alt $1.fa.alt
+if [ $1 == "hs38D1" ]; then
+	(wget -O- $url38 | gzip -dc; cat $root/resource-GRCh38/hs38D1-extra.fa) > $1.fa
+	[ ! -f $1.fa.alt ] && cp $root/resource-GRCh38/hs38D1.fa.alt $1.fa.alt
 elif [ $1 == "hs38a" ]; then
 	wget -O- $url38 | gzip -dc > $1.fa
-	[ ! -f $1.fa.alt ] && grep _alt $root/resource-GRCh38/hs38d6.fa.alt > $1.fa.alt
+	[ ! -f $1.fa.alt ] && grep _alt $root/resource-GRCh38/hs38D1.fa.alt > $1.fa.alt
 elif [ $1 == "hs38" ]; then
 	wget -O- $url38 | gzip -dc | awk '/^>/{f=/_alt/?0:1}f' > $1.fa
 elif [ $1 == "hs37d5" ]; then

From 5aba18896929d6d4fcd06a8a9babf66cf33a0092 Mon Sep 17 00:00:00 2001
From: Heng Li <lh3@me.com>
Date: Sat, 20 Dec 2014 22:27:27 -0500
Subject: [PATCH 2/6] write empty .gt file as a placeholder

to suppress error messages from run-bwamem
---
 bwakit/typeHLA.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bwakit/typeHLA.sh b/bwakit/typeHLA.sh
index b5a2b4e..b73100d 100755
--- a/bwakit/typeHLA.sh
+++ b/bwakit/typeHLA.sh
@@ -15,6 +15,7 @@ fi
 preres="resource-human-HLA"
 root=`dirname $0`
 pre=$1.$2
+touch $pre.gt
 
 if [ ! -s $pre.fq ]; then
 	echo '** Empty input file. Abort!' >&2

From c05a721f2872d145dda3919edcbe760f9a40653a Mon Sep 17 00:00:00 2001
From: Heng Li <lh3@me.com>
Date: Sun, 21 Dec 2014 00:12:11 -0500
Subject: [PATCH 3/6] improved doc and don't type HLA by default

---
 NEWS.md           | 11 ++++++-----
 README-alt.md     | 20 ++++++++++----------
 bwakit/README.md  | 32 ++++++++++++++++++++++++++++----
 bwakit/run-bwamem | 27 ++++++++++++++++++---------
 4 files changed, 62 insertions(+), 28 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 4219e94..4a3fbd3 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,17 +1,18 @@
-Release 0.7.11 (XX November, 2014)
------------------------------------
+Release 0.7.11 (XX December, 2014)
+----------------------------------
 
 A major change to BWA-MEM is the support of mapping to ALT contigs in addition
 to the primary assembly. Part of the ALT mapping strategy is implemented in
 BWA-MEM and the rest in a postprocessing script for now. Due to the extra
 layer of complexity on generating the reference genome and on the two-step
 mapping, we start to provide a wrapper script and precompiled binaries since
-this release. Please check README-alt.md for details.
+this release. The package may be more convenient to some specific use cases.
+For general uses, the single BWA binary still works like the old way.
 
 Another major addition to BWA-MEM is HLA typing, which made possible with the
 new ALT mapping strategy. Necessary data and programs are included in the
 binary release. The wrapper script also performs HLA typing when HLA genes are
-also included in the reference genome as additional ALT contigs.
+included in the reference genome as additional ALT contigs.
 
 Other notable changes to BWA-MEM:
 
@@ -44,7 +45,7 @@ Other notable changes to BWA-MEM:
    writing SAM. This saves significant wall-clock time when reading from
    or writing to a slow Unix pipe.
 
-(0.7.11: XX November 2014, rXXX)
+(0.7.11: XX December 2014, r10XX)
 
 
 
diff --git a/README-alt.md b/README-alt.md
index 0d97ffd..08da754 100644
--- a/README-alt.md
+++ b/README-alt.md
@@ -5,10 +5,10 @@
 wget -O- http://sourceforge.net/projects/bio-bwa/files/bwakit/bwakit-0.7.11_x64-linux.tar.bz2/download \
   | gzip -dc | tar xf -
 # Generate the GRCh38+ALT+decoy+HLA and create the BWA index
-bwa.kit/run-gen-ref hs38d6   # download GRCh38 and write hs38d6.fa
-bwa.kit/bwa index hs38d6.fa  # create BWA index
+bwa.kit/run-gen-ref hs38D1   # download GRCh38 and write hs38D1.fa
+bwa.kit/bwa index hs38D1.fa  # create BWA index
 # mapping
-bwa.kit/run-bwamem -o out hs38d6.fa read1.fq read2.fq | sh  # skip "|sh" to show command lines
+bwa.kit/run-bwamem -o out -H hs38D1.fa read1.fq read2.fq | sh  # skip "|sh" to show command lines
 ```
 
 This generates `out.aln.bam` as the final alignment, `out.hla.top` for best HLA
@@ -94,11 +94,11 @@ CHM1 short reads and present also in NA12878. You can try [BLAT][blat] or
 
 For a more complete reference genome, we compiled a new set of decoy sequences
 from GenBank clones and the de novo assembly of 254 public [SGDP][sgdp] samples.
-The sequences are included in `hs38d6-extra.fa` from the [BWA binary
+The sequences are included in `hs38D1-extra.fa` from the [BWA binary
 package][res].
 
 In addition to decoy, we also put multiple alleles of HLA genes in
-`hs38d6-extra.fa`. These genomic sequences were acquired from [IMGT/HLA][hladb],
+`hs38D1-extra.fa`. These genomic sequences were acquired from [IMGT/HLA][hladb],
 version 3.18.0 and are used to collect reads sequenced from these genes.
 
 ### HLA typing
@@ -125,26 +125,26 @@ most of them are distributed under restrictive licenses.
 
 To check whether GRCh38 is better than GRCh37, we mapped the CHM1 and NA12878
 unitigs to GRCh37 primary (hs37), GRCh38 primary (hs38) and GRCh38+ALT+decoy
-(hs38d6), and called small variants from the alignment. CHM1 is haploid.
+(hs38D1), and called small variants from the alignment. CHM1 is haploid.
 Ideally, heterozygous calls are false positives (FP). NA12878 is diploid. The
 true positive (TP) heterozygous calls from NA12878 are approximately equal
 to the difference between NA12878 and CHM1 heterozygous calls. A better assembly
 should yield higher TP and lower FP. The following table shows the numbers for
 these assemblies:
 
-|Assembly|hs37   |hs38   |hs38d6|CHM1_1.1|  huref|
+|Assembly|hs37   |hs38   |hs38D1|CHM1_1.1|  huref|
 |:------:|------:|------:|------:|------:|------:|
 |FP      | 255706| 168068| 142516|307172 | 575634|
 |TP      |2142260|2163113|2150844|2167235|2137053|
 
-With this measurement, hs38 is clearly better than hs37. Genome hs38d6 reduces
+With this measurement, hs38 is clearly better than hs37. Genome hs38D1 reduces
 FP by ~25k but also reduces TP by ~12k. We manually inspected variants called
 from hs38 only and found the majority of them are associated with excessive read
 depth, clustered variants or weak alignment. We believe most hs38-only calls are
 problematic. In addition, if we compare two NA12878 replicates from HiSeq X10
 with nearly identical library construction, the difference is ~140k, an order
-of magnitude higher than the difference between hs38 and hs38d6. ALT contigs,
-decoy and HLA genes in hs38d6 improve variant calling and enable the analyses of
+of magnitude higher than the difference between hs38 and hs38D1. ALT contigs,
+decoy and HLA genes in hs38D1 improve variant calling and enable the analyses of
 ALT contigs and HLA typing at little cost.
 
 ## Problems and Future Development
diff --git a/bwakit/README.md b/bwakit/README.md
index 8bf9b03..5c1490a 100644
--- a/bwakit/README.md
+++ b/bwakit/README.md
@@ -21,10 +21,10 @@ how to use bwakit:
 wget -O- http://sourceforge.net/projects/bio-bwa/files/bwakit/bwakit-0.7.11_x64-linux.tar.bz2/download \
   | gzip -dc | tar xf -
 # Generate the GRCh38+ALT+decoy+HLA and create the BWA index
-bwa.kit/run-gen-ref hs38d6   # download GRCh38 and write hs38d6.fa
-bwa.kit/bwa index hs38d6.fa  # create BWA index
+bwa.kit/run-gen-ref hs38D1   # download GRCh38 and write hs38D1.fa
+bwa.kit/bwa index hs38D1.fa  # create BWA index
 # mapping
-bwa.kit/run-bwamem -o out hs38d6.fa read1.fq read2.fq | sh
+bwa.kit/run-bwamem -o out -H hs38D1.fa read1.fq read2.fq | sh
 ```
 
 The last mapping command line will generate the following files:
@@ -44,7 +44,31 @@ Bwakit can be [downloaded here][res]. It is only available to x86_64-linux. The
 scripts in the package are available in the [bwa/bwakit][kit] directory.
 Packaging is done manually for now.
 
-## Contents
+## Limitations
+
+* HLA typing only works for high-coverage human data. The typing accuracy can
+  still be improved. We encourage researchers to develop better HLA typing tools
+  based on the intermediate output of bwakit (for each HLA gene included in the
+  index, bwakit writes all reads matching it in a separate file).
+
+* Duplicate marking only works when all reads from a single paired-end library
+  are provided as the input. This limitation is the necessary tradeoff of fast
+  MarkDuplicate provided by samblaster.
+
+* The adapter trimmer is chosen as it is fast, pipe friendly and does not
+  discard reads. However, it is conservative and suboptimal. If this is a
+  concern, it is recommended to preprocess input reads with a more sophisticated
+  adapter trimmer. We also hope existing trimmers can be modified to operate on
+  an interleaved FASTQ stream. We will replace trimadap once a better trimmer
+  meets our needs.
+
+* Bwakit can be memory demanding depends on the functionality invoked. For 30X
+  human data, bwa-mem takes about 6GB RAM, samblaster uses close to 10GB and BAM
+  shuffling (if the input is sorted BAM) uses several GB. In the current
+  setting, sorting uses about 10GB.
+
+
+## Package Contents  
 ```
 bwa.kit
 |-- README.md                  This README file.
diff --git a/bwakit/run-bwamem b/bwakit/run-bwamem
index 1b40be2..d931067 100755
--- a/bwakit/run-bwamem
+++ b/bwakit/run-bwamem
@@ -18,29 +18,38 @@ Options: -o STR    prefix for output files                       [inferred from
                    ont2d:    Oxford Nanopore reads (~10kb query, higher error rate)
          -t INT    number of threads                                               [1]
 
+         -H        apply HLA typing
          -a        trim HiSeq2000/2500 PE resequencing adapters (via trimadap)
          -d        mark duplicate (via samblaster)
-         -S        for SAM/BAM input, don\'t shuffle
-         -s        sort the output alignment (requring more RAM)
-         -H        skip HLA typing
+         -S        for BAM input, don\'t shuffle
+         -s        sort the output alignment (via samtools; requring more RAM)
          -k        keep temporary files generated by typeHLA
 
 Examples:
 
  * Map paired-end reads to GRCh38+ALT+decoy+HLA and perform HLA typing:
 
-     run-bwamem -o prefix -t8 -R"@RG\tID:foo\tSM:bar" hs38d6.fa read1.fq.gz read2.fq.gz
+     run-bwamem -o prefix -t8 -HR"@RG\tID:foo\tSM:bar" hs38D1.fa read1.fq.gz read2.fq.gz
+
+   Note: HLA typing is only effective for high-coverage data. The typing accuracy varies
+   with the quality of input. It is only intended for research purpose, not for diagnostic.
 
  * Remap coordinate-sorted BAM, transfer read groups tags, trim Illumina PE adapters and
    sort the output. The BAM may contain single-end or paired-end reads, or a mixture of
    the two types. Specifying -R stops read group transfer.
 
-     run-bwamem -sao prefix hs38d6.fa old-srt.bam
+     run-bwamem -sao prefix hs38D1.fa old-srt.bam
 
- * Remap name-grouped BAM and mark duplicates. Note that in this case, all reads from
-   a single library should be aligned at the same time. Paired-end only.
+   Note: the adaptor trimmer included in bwa.kit is chosen because it fits the current
+   mapping pipeline better. It is conservative and suboptimal. A more sophisticated
+   trimmer is recommended if this becomes a concern.
 
-     run-bwamem -Sdo prefix hs38d6.fa old-unsrt.bam
+ * Remap name-grouped BAM and mark duplicates:
+ 
+     run-bwamem -Sdo prefix hs38D1.fa old-unsrt.bam
+
+   Note: streamed duplicate marking requires all reads from a single paired-end library
+   to be aligned at the same time.
 
 Output files:
 
@@ -156,7 +165,7 @@ if (-f "$ARGV[0].alt") {
 my $t_sort = $opts{t} < 4? $opts{t} : 4;
 $cmd .= defined($opts{s})? "  | $root/samtools sort -@ $t_sort -m1G - $prefix.aln;\n" : "  | $root/samtools view -1 - > $prefix.aln.bam;\n";
 
-if ($has_hla && !defined($opts{H}) && (!defined($opts{x}) || $opts{x} eq 'intractg')) {
+if ($has_hla && defined($opts{H}) && (!defined($opts{x}) || $opts{x} eq 'intractg')) {
 	$cmd .= "$root/run-HLA ". (defined($opts{x}) && $opts{x} eq 'intractg'? "-A " : "") . "$prefix.hla > $prefix.hla.top 2> $prefix.log.hla;\n";
 	$cmd .= "cat $prefix.hla.HLA*.gt | grep ^GT | cut -f2- > $prefix.hla.all;\n";
 	$cmd .= "rm -f $prefix.hla.HLA*;\n" unless defined($opts{k});

From ed95769a3393cc63fa9adaa4ffe3f8642fed76a6 Mon Sep 17 00:00:00 2001
From: Heng Li <lh3@me.com>
Date: Sun, 21 Dec 2014 16:30:29 -0500
Subject: [PATCH 4/6] updated documentations

(prepare for release)
---
 NEWS.md   | 20 +++++++++++---------
 bwa.1     | 50 ++++++++++++++++++++++++++++++++++++++++++++++----
 fastmap.c |  2 +-
 3 files changed, 58 insertions(+), 14 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 4a3fbd3..4be1dfc 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -11,25 +11,27 @@ For general uses, the single BWA binary still works like the old way.
 
 Another major addition to BWA-MEM is HLA typing, which made possible with the
 new ALT mapping strategy. Necessary data and programs are included in the
-binary release. The wrapper script also performs HLA typing when HLA genes are
-included in the reference genome as additional ALT contigs.
+binary release. The wrapper script also optionally performs HLA typing when HLA
+genes are included in the reference genome as additional ALT contigs.
 
 Other notable changes to BWA-MEM:
 
  * Added option `-b` to `bwa index`. This option tunes the batch size used in
    the construction of BWT. It is advised to use large `-b` for huge reference
-   sequences such as the *nt* database.
+   sequences such as the BLAST *nt* database.
 
- * Optimized for PacBio data. This includes a change to the scoring based on a
-   mini-study done by Aaron Quinlan and a heuristic speedup. Further speedup is
+ * Optimized for PacBio data. This includes a change to scoring based on a
+   study done by Aaron Quinlan and a heuristic speedup. Further speedup is
    possible, but needs more careful investigation.
 
- * Dropped PacBio read-to-read alignment for now. BWA-MEM is only good at
-   finding the best hit, not all hits. Option `-x pbread` is still available,
-   but hidden on the command line.
+ * Dropped PacBio read-to-read alignment for now. BWA-MEM is good for finding
+   the best hit, but is not very sensitive to suboptimal hits. Option `-x pbread`
+   is still available, but hidden on the command line. This may be removed in
+   future releases.
 
  * Added a new pre-setting for Oxford Nanopore 2D reads. LAST is still a little
-   more sensitive on bacterial data, but bwa-mem is times faster on human data.
+   more sensitive on older bacterial data, but bwa-mem is as good on more
+   recent data and is times faster for mapping against mammalian genomes.
 
  * Added LAST-like seeding. This improves the accuracy for longer reads.
 
diff --git a/bwa.1 b/bwa.1
index 0d556be..6e95c0b 100644
--- a/bwa.1
+++ b/bwa.1
@@ -1,4 +1,4 @@
-.TH bwa 1 "18 November 2014" "bwa-0.7.11-r999" "Bioinformatics tools"
+.TH bwa 1 "21 December 2014" "bwa-0.7.11-r1032" "Bioinformatics tools"
 .SH NAME
 .PP
 bwa - Burrows-Wheeler Alignment Tool
@@ -75,7 +75,7 @@ appropriate algorithm will be chosen automatically.
 .TP
 .B mem
 .B bwa mem
-.RB [ -aCHMpP ]
+.RB [ -aCHjMpP ]
 .RB [ -t
 .IR nThreads ]
 .RB [ -k
@@ -88,6 +88,12 @@ appropriate algorithm will be chosen automatically.
 .IR seedSplitRatio ]
 .RB [ -c
 .IR maxOcc ]
+.RB [ -D
+.IR chainShadow ]
+.RB [ -m
+.IR maxMateSW ]
+.RB [ -W
+.IR minSeedMatch ]
 .RB [ -A
 .IR matchScore ]
 .RB [ -B
@@ -102,6 +108,8 @@ appropriate algorithm will be chosen automatically.
 .IR unpairPen ]
 .RB [ -R
 .IR RGline ]
+.RB [ -H
+.IR HDlines ]
 .RB [ -v
 .IR verboseLevel ]
 .I db.prefix
@@ -193,9 +201,28 @@ Discard a MEM if it has more than
 .I INT
 occurence in the genome. This is an insensitive parameter. [500]
 .TP
+.BI -D \ INT
+Drop chains shorter than
+.I FLOAT
+fraction of the longest overlapping chain [0.5]
+.TP
+.BI -m \ INT
+Perform at most
+.I INT
+rounds of mate-SW [50]
+.TP
+.BI -W \ INT
+Drop a chain if the number of bases in seeds is smaller than
+.IR INT .
+This option is primarily used for longer contigs/reads. When positive, it also
+affects seed filtering. [0]
+.TP
 .B -P
 In the paired-end mode, perform SW to rescue missing hits only but do not try to find
 hits that fit a proper pair.
+
+.TP
+.B SCORING OPTIONS:
 .TP
 .BI -A \ INT
 Matching score. [1]
@@ -244,15 +271,30 @@ and will be converted to a TAB in the output SAM. The read group ID will be
 attached to every read in the output. An example is '@RG\\tID:foo\\tSM:bar'.
 [null]
 .TP
+.BI -H \ ARG
+If ARG starts with @, it is interpreted as a string and gets inserted into the
+output SAM header; otherwise, ARG is interpreted as a file with all lines
+starting with @ in the file inserted into the SAM header. [null]
+.TP
 .BI -T \ INT
 Don't output alignment with score lower than
 .IR INT .
 This option affects output and occasionally SAM flag 2. [30]
 .TP
-.BI -h \ INT
+.BI -j
+Treat ALT contigs as part of the primary assembly (i.e. ignore the
+.I db.prefix.alt
+file).
+.TP
+.BI -h \ INT[,INT2]
 If a query has not more than
 .I INT
-hits with score higher than 80% of the best hit, output them all in the XA tag [5]
+hits with score higher than 80% of the best hit, output them all in the XA tag.
+If
+.I INT2
+is specified, BWA-MEM outputs up to
+.I INT2
+hits if the list contains a hit to an ALT contig. [5,200]
 .TP
 .B -a
 Output all found alignments for single-end or unpaired paired-end reads. These
diff --git a/fastmap.c b/fastmap.c
index 79ebd39..3cca4de 100644
--- a/fastmap.c
+++ b/fastmap.c
@@ -268,7 +268,7 @@ int main_mem(int argc, char *argv[])
 		fprintf(stderr, "       -p            smart pairing (ignoring in2.fq)\n");
 		fprintf(stderr, "       -R STR        read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n");
 		fprintf(stderr, "       -H STR/FILE   insert STR to header if it starts with @; or insert lines in FILE [null]\n");
-		fprintf(stderr, "       -j            ignore ALT contigs\n");
+		fprintf(stderr, "       -j            treat ALT contigs as part of the primary assembly (i.e. ignore <idxbase>.alt file)\n");
 		fprintf(stderr, "\n");
 		fprintf(stderr, "       -v INT        verbose level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose);
 		fprintf(stderr, "       -T INT        minimum score to output [%d]\n", opt->T);

From d7ce7e139575f35ec45d897331701de4f3698691 Mon Sep 17 00:00:00 2001
From: Heng Li <lh3@me.com>
Date: Tue, 23 Dec 2014 11:48:59 -0500
Subject: [PATCH 5/6] hs38D1=>hs38DH; cmd help for run-gen-ref

---
 README-alt.md      | 20 ++++++++++----------
 bwakit/README.md   | 16 ++++++++--------
 bwakit/run-bwamem  | 10 +++++-----
 bwakit/run-gen-ref | 19 ++++++++++++++-----
 4 files changed, 37 insertions(+), 28 deletions(-)

diff --git a/README-alt.md b/README-alt.md
index 08da754..4f72042 100644
--- a/README-alt.md
+++ b/README-alt.md
@@ -5,10 +5,10 @@
 wget -O- http://sourceforge.net/projects/bio-bwa/files/bwakit/bwakit-0.7.11_x64-linux.tar.bz2/download \
   | gzip -dc | tar xf -
 # Generate the GRCh38+ALT+decoy+HLA and create the BWA index
-bwa.kit/run-gen-ref hs38D1   # download GRCh38 and write hs38D1.fa
-bwa.kit/bwa index hs38D1.fa  # create BWA index
+bwa.kit/run-gen-ref hs38DH   # download GRCh38 and write hs38DH.fa
+bwa.kit/bwa index hs38DH.fa  # create BWA index
 # mapping
-bwa.kit/run-bwamem -o out -H hs38D1.fa read1.fq read2.fq | sh  # skip "|sh" to show command lines
+bwa.kit/run-bwamem -o out -H hs38DH.fa read1.fq read2.fq | sh  # skip "|sh" to show command lines
 ```
 
 This generates `out.aln.bam` as the final alignment, `out.hla.top` for best HLA
@@ -94,11 +94,11 @@ CHM1 short reads and present also in NA12878. You can try [BLAT][blat] or
 
 For a more complete reference genome, we compiled a new set of decoy sequences
 from GenBank clones and the de novo assembly of 254 public [SGDP][sgdp] samples.
-The sequences are included in `hs38D1-extra.fa` from the [BWA binary
+The sequences are included in `hs38DH-extra.fa` from the [BWA binary
 package][res].
 
 In addition to decoy, we also put multiple alleles of HLA genes in
-`hs38D1-extra.fa`. These genomic sequences were acquired from [IMGT/HLA][hladb],
+`hs38DH-extra.fa`. These genomic sequences were acquired from [IMGT/HLA][hladb],
 version 3.18.0 and are used to collect reads sequenced from these genes.
 
 ### HLA typing
@@ -125,26 +125,26 @@ most of them are distributed under restrictive licenses.
 
 To check whether GRCh38 is better than GRCh37, we mapped the CHM1 and NA12878
 unitigs to GRCh37 primary (hs37), GRCh38 primary (hs38) and GRCh38+ALT+decoy
-(hs38D1), and called small variants from the alignment. CHM1 is haploid.
+(hs38DH), and called small variants from the alignment. CHM1 is haploid.
 Ideally, heterozygous calls are false positives (FP). NA12878 is diploid. The
 true positive (TP) heterozygous calls from NA12878 are approximately equal
 to the difference between NA12878 and CHM1 heterozygous calls. A better assembly
 should yield higher TP and lower FP. The following table shows the numbers for
 these assemblies:
 
-|Assembly|hs37   |hs38   |hs38D1|CHM1_1.1|  huref|
+|Assembly|hs37   |hs38   |hs38DH|CHM1_1.1|  huref|
 |:------:|------:|------:|------:|------:|------:|
 |FP      | 255706| 168068| 142516|307172 | 575634|
 |TP      |2142260|2163113|2150844|2167235|2137053|
 
-With this measurement, hs38 is clearly better than hs37. Genome hs38D1 reduces
+With this measurement, hs38 is clearly better than hs37. Genome hs38DH reduces
 FP by ~25k but also reduces TP by ~12k. We manually inspected variants called
 from hs38 only and found the majority of them are associated with excessive read
 depth, clustered variants or weak alignment. We believe most hs38-only calls are
 problematic. In addition, if we compare two NA12878 replicates from HiSeq X10
 with nearly identical library construction, the difference is ~140k, an order
-of magnitude higher than the difference between hs38 and hs38D1. ALT contigs,
-decoy and HLA genes in hs38D1 improve variant calling and enable the analyses of
+of magnitude higher than the difference between hs38 and hs38DH. ALT contigs,
+decoy and HLA genes in hs38DH improve variant calling and enable the analyses of
 ALT contigs and HLA typing at little cost.
 
 ## Problems and Future Development
diff --git a/bwakit/README.md b/bwakit/README.md
index 5c1490a..b3a4033 100644
--- a/bwakit/README.md
+++ b/bwakit/README.md
@@ -21,10 +21,10 @@ how to use bwakit:
 wget -O- http://sourceforge.net/projects/bio-bwa/files/bwakit/bwakit-0.7.11_x64-linux.tar.bz2/download \
   | gzip -dc | tar xf -
 # Generate the GRCh38+ALT+decoy+HLA and create the BWA index
-bwa.kit/run-gen-ref hs38D1   # download GRCh38 and write hs38D1.fa
-bwa.kit/bwa index hs38D1.fa  # create BWA index
+bwa.kit/run-gen-ref hs38DH   # download GRCh38 and write hs38DH.fa
+bwa.kit/bwa index hs38DH.fa  # create BWA index
 # mapping
-bwa.kit/run-bwamem -o out -H hs38D1.fa read1.fq read2.fq | sh
+bwa.kit/run-bwamem -o out -H hs38DH.fa read1.fq read2.fq | sh
 ```
 
 The last mapping command line will generate the following files:
@@ -63,9 +63,9 @@ Packaging is done manually for now.
   meets our needs.
 
 * Bwakit can be memory demanding depends on the functionality invoked. For 30X
-  human data, bwa-mem takes about 6GB RAM, samblaster uses close to 10GB and BAM
-  shuffling (if the input is sorted BAM) uses several GB. In the current
-  setting, sorting uses about 10GB.
+  human data, bwa-mem takes about 11GB RAM with 32 threads, samblaster uses
+  close to 10GB and BAM shuffling (if the input is sorted BAM) uses several GB.
+  In the current setting, sorting uses about 10GB.
 
 
 ## Package Contents  
@@ -84,8 +84,8 @@ bwa.kit
 |
 |-- run-gen-ref                *Entry script* for generating human reference genomes.
 |-- resource-GRCh38            Resources for generating GRCh38
-|   |-- hs38D1-extra.fa        Decoy and HLA gene sequences. Used by run-gen-ref.
-|   `-- hs38D1.fa.alt          ALT-to-GRCh38 alignment. Used by run-gen-ref.
+|   |-- hs38DH-extra.fa        Decoy and HLA gene sequences. Used by run-gen-ref.
+|   `-- hs38DH.fa.alt          ALT-to-GRCh38 alignment. Used by run-gen-ref.
 |
 |-- run-HLA                    HLA typing for sequences extracted by bwa-postalt.js.
 |-- typeHLA.sh                 Type one HLA-gene. Called by run-HLA.
diff --git a/bwakit/run-bwamem b/bwakit/run-bwamem
index d931067..165f93e 100755
--- a/bwakit/run-bwamem
+++ b/bwakit/run-bwamem
@@ -29,7 +29,7 @@ Examples:
 
  * Map paired-end reads to GRCh38+ALT+decoy+HLA and perform HLA typing:
 
-     run-bwamem -o prefix -t8 -HR"@RG\tID:foo\tSM:bar" hs38D1.fa read1.fq.gz read2.fq.gz
+     run-bwamem -o prefix -t8 -HR"@RG\tID:foo\tSM:bar" hs38DH.fa read1.fq.gz read2.fq.gz
 
    Note: HLA typing is only effective for high-coverage data. The typing accuracy varies
    with the quality of input. It is only intended for research purpose, not for diagnostic.
@@ -38,7 +38,7 @@ Examples:
    sort the output. The BAM may contain single-end or paired-end reads, or a mixture of
    the two types. Specifying -R stops read group transfer.
 
-     run-bwamem -sao prefix hs38D1.fa old-srt.bam
+     run-bwamem -sao prefix hs38DH.fa old-srt.bam
 
    Note: the adaptor trimmer included in bwa.kit is chosen because it fits the current
    mapping pipeline better. It is conservative and suboptimal. A more sophisticated
@@ -46,7 +46,7 @@ Examples:
 
  * Remap name-grouped BAM and mark duplicates:
  
-     run-bwamem -Sdo prefix hs38D1.fa old-unsrt.bam
+     run-bwamem -Sdo prefix hs38DH.fa old-unsrt.bam
 
    Note: streamed duplicate marking requires all reads from a single paired-end library
    to be aligned at the same time.
@@ -93,7 +93,7 @@ if (defined $opts{o}) {
 } elsif ($ARGV[1] =~ /^(\S+)\.(fastq|fq|fasta|fa|mag|mag\.gz|fasta\.gz|fa\.gz|fastq\.gz|fq\.gz|bam)$/) {
 	$prefix = $1;
 }
-die("ERROR: failed to identify the prefix for output. Please specify -p.\n") unless defined($prefix);
+die("ERROR: failed to identify the prefix for output. Please specify -o.\n") unless defined($prefix);
 
 my $size = 0;
 my $comp_ratio = 3.;
@@ -167,7 +167,7 @@ $cmd .= defined($opts{s})? "  | $root/samtools sort -@ $t_sort -m1G - $prefix.al
 
 if ($has_hla && defined($opts{H}) && (!defined($opts{x}) || $opts{x} eq 'intractg')) {
 	$cmd .= "$root/run-HLA ". (defined($opts{x}) && $opts{x} eq 'intractg'? "-A " : "") . "$prefix.hla > $prefix.hla.top 2> $prefix.log.hla;\n";
-	$cmd .= "cat $prefix.hla.HLA*.gt | grep ^GT | cut -f2- > $prefix.hla.all;\n";
+	$cmd .= "touch $prefix.hla.HLA-dummy.gt; cat $prefix.hla.HLA*.gt | grep ^GT | cut -f2- > $prefix.hla.all;\n";
 	$cmd .= "rm -f $prefix.hla.HLA*;\n" unless defined($opts{k});
 }
 
diff --git a/bwakit/run-gen-ref b/bwakit/run-gen-ref
index 8a6baa0..e129b62 100755
--- a/bwakit/run-gen-ref
+++ b/bwakit/run-gen-ref
@@ -6,16 +6,25 @@ url38="ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals
 url37d5="ftp://ftp.ncbi.nlm.nih.gov/1000genomes/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz"
 
 if [ $# -eq 0 ]; then
-	echo "Usage: $0 <hs38|hs38a|hs38D1|hs37|hs37d5>"
+	echo "Usage: $0 <hs38|hs38a|hs38DH|hs37|hs37d5>"
+	echo "Analysis sets:"
+	echo "  hs38     primary assembly of GRCh38 (incl. chromosomes, unplaced and unlocalized contigs) and EBV"
+	echo "  hs38a    hs38 plus ALT contigs"
+	echo "  hs38DH   hs38a plus decoy contigs and HLA genes (recommended for GRCh38 mapping)"
+	echo "  hs37     primary assembly of GRCh37 (used by 1000g phase 1) plus the EBV genome"
+	echo "  hs37d5   hs37 plus decoy contigs (used by 1000g phase 3)"
+	echo ""
+	echo "Note: This script downloads human reference genomes. For hs38a and hs38DH, it needs additional"
+	echo "      sequences and ALT-to-REF mapping included in the bwa.kit package."
 	exit 1;
 fi
 
-if [ $1 == "hs38D1" ]; then
-	(wget -O- $url38 | gzip -dc; cat $root/resource-GRCh38/hs38D1-extra.fa) > $1.fa
-	[ ! -f $1.fa.alt ] && cp $root/resource-GRCh38/hs38D1.fa.alt $1.fa.alt
+if [ $1 == "hs38DH" ]; then
+	(wget -O- $url38 | gzip -dc; cat $root/resource-GRCh38/hs38DH-extra.fa) > $1.fa
+	[ ! -f $1.fa.alt ] && cp $root/resource-GRCh38/hs38DH.fa.alt $1.fa.alt
 elif [ $1 == "hs38a" ]; then
 	wget -O- $url38 | gzip -dc > $1.fa
-	[ ! -f $1.fa.alt ] && grep _alt $root/resource-GRCh38/hs38D1.fa.alt > $1.fa.alt
+	[ ! -f $1.fa.alt ] && grep _alt $root/resource-GRCh38/hs38DH.fa.alt > $1.fa.alt
 elif [ $1 == "hs38" ]; then
 	wget -O- $url38 | gzip -dc | awk '/^>/{f=/_alt/?0:1}f' > $1.fa
 elif [ $1 == "hs37d5" ]; then

From 408fd5e0727b0970ce362bb072ad44aa746456cb Mon Sep 17 00:00:00 2001
From: Heng Li <lh3@me.com>
Date: Tue, 23 Dec 2014 15:29:18 -0500
Subject: [PATCH 6/6] Released bwa-0.7.11-r1034

---
 NEWS.md | 13 +++++++++++--
 bwa.1   |  5 ++---
 main.c  |  2 +-
 3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 4be1dfc..fd42fa3 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,4 +1,4 @@
-Release 0.7.11 (XX December, 2014)
+Release 0.7.11 (23 December, 2014)
 ----------------------------------
 
 A major change to BWA-MEM is the support of mapping to ALT contigs in addition
@@ -47,7 +47,16 @@ Other notable changes to BWA-MEM:
    writing SAM. This saves significant wall-clock time when reading from
    or writing to a slow Unix pipe.
 
-(0.7.11: XX December 2014, r10XX)
+With the new release, the recommended way to map Illumina reads to GRCh38 is to
+use the bwakit binary package:
+
+    bwa.kit/run-gen-ref hs38DH
+    bwa.kit/bwa index hs38DH.fa
+    bwa.kit/run-bwamem -t8 -H -o out-prefix hs38DH.fa read1.fq.gz read2.fq.gz | sh
+
+Please check bwa.kit/README.md for details and command line options.
+
+(0.7.11: 23 December 2014, r1034)
 
 
 
diff --git a/bwa.1 b/bwa.1
index 6e95c0b..9b681b7 100644
--- a/bwa.1
+++ b/bwa.1
@@ -1,4 +1,4 @@
-.TH bwa 1 "21 December 2014" "bwa-0.7.11-r1032" "Bioinformatics tools"
+.TH bwa 1 "23 December 2014" "bwa-0.7.11-r1034" "Bioinformatics tools"
 .SH NAME
 .PP
 bwa - Burrows-Wheeler Alignment Tool
@@ -614,6 +614,7 @@ R	0x0020	strand of the mate
 s	0x0100	the alignment is not primary
 f	0x0200	QC failure
 d	0x0400	optical or PCR duplicate
+S	0x0800	supplementary alignment
 .TE
 
 .PP
@@ -647,8 +648,6 @@ _
 XS	Suboptimal alignment score
 XF	Support from forward/reverse alignment
 XE	Number of supporting seeds
-_
-XP	Alt primary hits; format: /(chr,pos,CIGAR,mapQ,NM;)+/
 .TE
 
 .PP
diff --git a/main.c b/main.c
index 887ea28..e157df6 100644
--- a/main.c
+++ b/main.c
@@ -4,7 +4,7 @@
 #include "utils.h"
 
 #ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.7.10-r1027-dirty"
+#define PACKAGE_VERSION "0.7.11-r1034"
 #endif
 
 int bwa_fa2pac(int argc, char *argv[]);