From 39a6cd5bb09add7cc09c5bd1ddf064ab749bbd35 Mon Sep 17 00:00:00 2001
From: Heng Li <lh3@me.com>
Date: Sun, 11 May 2014 15:15:44 -0400
Subject: [PATCH] r762: cleanup for the new release; unfinished

It will take to make the documentation ready.
---
 Makefile       | 40 ++++++++++++++++++++++++++++++++++++++++
 NEWS           | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 README.md      | 36 +++++++++++++++++++++++++++++++++++-
 bwa.1          | 12 +++++++++++-
 bwamem.c       |  1 +
 bwamem.h       |  1 +
 bwamem_extra.c |  4 ++--
 fastmap.c      |  1 +
 main.c         |  2 +-
 9 files changed, 141 insertions(+), 5 deletions(-)

diff --git a/Makefile b/Makefile
index 339ac0a..60c2104 100644
--- a/Makefile
+++ b/Makefile
@@ -37,3 +37,43 @@ depend:
 	( LC_ALL=C ; export LC_ALL; makedepend -Y -- $(CFLAGS) $(DFLAGS) -- *.c )
 
 # DO NOT DELETE THIS LINE -- make depend depends on it.
+
+QSufSort.o: QSufSort.h
+bamlite.o: bamlite.h malloc_wrap.h
+bntseq.o: bntseq.h utils.h kseq.h malloc_wrap.h
+bwa.o: bntseq.h bwa.h bwt.h ksw.h utils.h kstring.h malloc_wrap.h kseq.h
+bwamem.o: kstring.h malloc_wrap.h bwamem.h bwt.h bntseq.h bwa.h ksw.h kvec.h
+bwamem.o: ksort.h utils.h kbtree.h
+bwamem_extra.o: bwa.h bntseq.h bwt.h bwamem.h kstring.h malloc_wrap.h
+bwamem_pair.o: kstring.h malloc_wrap.h bwamem.h bwt.h bntseq.h bwa.h kvec.h
+bwamem_pair.o: utils.h ksw.h
+bwape.o: bwtaln.h bwt.h kvec.h malloc_wrap.h bntseq.h utils.h bwase.h bwa.h
+bwape.o: ksw.h khash.h
+bwase.o: bwase.h bntseq.h bwt.h bwtaln.h utils.h kstring.h malloc_wrap.h
+bwase.o: bwa.h ksw.h
+bwaseqio.o: bwtaln.h bwt.h utils.h bamlite.h malloc_wrap.h kseq.h
+bwt.o: utils.h bwt.h kvec.h malloc_wrap.h
+bwt_gen.o: QSufSort.h malloc_wrap.h
+bwt_lite.o: bwt_lite.h malloc_wrap.h
+bwtaln.o: bwtaln.h bwt.h bwtgap.h utils.h bwa.h bntseq.h malloc_wrap.h
+bwtgap.o: bwtgap.h bwt.h bwtaln.h malloc_wrap.h
+bwtindex.o: bntseq.h bwt.h utils.h malloc_wrap.h
+bwtsw2_aux.o: bntseq.h bwt_lite.h utils.h bwtsw2.h bwt.h kstring.h
+bwtsw2_aux.o: malloc_wrap.h bwa.h ksw.h kseq.h ksort.h
+bwtsw2_chain.o: bwtsw2.h bntseq.h bwt_lite.h bwt.h malloc_wrap.h ksort.h
+bwtsw2_core.o: bwt_lite.h bwtsw2.h bntseq.h bwt.h kvec.h malloc_wrap.h
+bwtsw2_core.o: khash.h ksort.h
+bwtsw2_main.o: bwt.h bwtsw2.h bntseq.h bwt_lite.h utils.h bwa.h
+bwtsw2_pair.o: utils.h bwt.h bntseq.h bwtsw2.h bwt_lite.h kstring.h
+bwtsw2_pair.o: malloc_wrap.h ksw.h
+example.o: bwamem.h bwt.h bntseq.h bwa.h kseq.h malloc_wrap.h
+fastmap.o: bwa.h bntseq.h bwt.h bwamem.h kvec.h malloc_wrap.h utils.h kseq.h
+is.o: malloc_wrap.h
+kopen.o: malloc_wrap.h
+kstring.o: kstring.h malloc_wrap.h
+ksw.o: ksw.h malloc_wrap.h
+main.o: kstring.h malloc_wrap.h utils.h
+malloc_wrap.o: malloc_wrap.h
+pemerge.o: ksw.h kseq.h malloc_wrap.h kstring.h bwa.h bntseq.h bwt.h utils.h
+test-extend.o: ksw.h
+utils.o: utils.h ksort.h malloc_wrap.h kseq.h
diff --git a/NEWS b/NEWS
index 40f4433..d09c322 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,52 @@
+Release 0.7.9 (11 May, 2014)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This release brings several major changes to BWA-MEM. Notably, BWA-MEM
+now formally supports PacBio reads. It generally runs faster at a minor cost of
+accuracy. The speedup is more significant when GRCh38 is in use. More
+specifically:
+
+ * Support PacBio subreads to reference alignment. Although older BWA-MEM works
+   with PacBio data in principle, the resultant alignments are frequently
+   fragmented. In this release, we fine tuned existing methods and introduced
+   new heuristics to improve PacBio alignment. These changes are not used by
+   default. Users need to add option "-x pacbio" to enable the feature.
+
+ * Support PacBio subread-to-subread alignment. The feature is enabled with
+   option "-x pbread". In this mode, the output only gives the overlapping
+   region between a pair of reads without detailed alignment.
+
+ * Output alternative hits in the XA tag if there are not so many such hits.
+   This feature is originally implemented in BWA-backtrack.
+
+ * EXPERIMENTAL feature to support mapping to ALT contigs in GRCh38. We prodive
+   a script to postprocess hits in the XA tag to adjust the mapping quality and
+   generate new primary alignments to all overlapping ALT contigs. We would NOT
+   recommended this feature for production uses.
+
+ * Improved alignments to many short reference sequences. Older BWA-MEM may
+   generate an alignment bridging two or more adjacent reference sequences.
+   Such alignments are split at a later step as postprocessing. This approach
+   is complex and does not always work. This release forbids these alignments
+   from the very beginning. BWA-MEM should not produce an alignment bridging
+   two or more reference sequences any more.
+
+ * Reduced the maximum seed occurrence from 10000 to 500. Reduced the maximum
+   number of Smith-Waterman mate rescue from 100 to 50. Added a heuristic to
+   lower the mapping quality if a read contains seeds with excessive
+   occurrences. These changes make BWA-MEM XXX faster for mapping short reads
+   to GRCh37 and XXX faster to GRCh38.
+
+ * Added an option "-Y" to use soft clipping for supplementary alignments.
+
+ * Bugfix: incomplete alignment extension in corner cases.
+
+ * Bugfix: integer overflow when aligning low query sequences.
+
+(0.7.9: 11 May 2014, r777)
+
+
+
 Release 0.7.8 (31 March, 2014)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/README.md b/README.md
index 3db893d..40dd079 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@ BWA-backtrack, BWA-SW and BWA-MEM. The first algorithm is designed for Illumina
 sequence reads up to 100bp, while the rest two for longer sequences ranged from
 70bp to 1Mbp. BWA-MEM and BWA-SW share similar features such as the support of
 long reads and chimeric alignment, but BWA-MEM, which is the latest, is
-generally recommended for high-quality queries as it is faster and more
+generally recommended as it is faster and more
 accurate. BWA-MEM also has better performance than BWA-backtrack for 70-100bp
 Illumina reads.
 
@@ -59,6 +59,38 @@ do not have plan to submit it to a peer-reviewed journal in the near future.
 
 ###Frequently asked questions (FAQs)
 
+####What types of data does BWA work with?
+
+BWA works with a variety types of DNA sequence data, though the optimal
+algorithm and setting may vary. The following list gives the recommended
+settings:
+
+* Illumina/454/IonTorrent single-end reads longer than ~70bp or assembly
+  contigs up to a few megabases:
+
+		bwa mem ref.fa reads.fq > aln.sam
+
+* Illumina/454/IonTorrent paired-end reads longer than ~70bp:
+
+		bwa mem ref.fa read1.fq read2.fq > aln-pe.sam
+
+* PacBio subreads to a reference genome:
+
+		bwa mem -x pacbio ref.fa reads.fq > aln.sam
+
+* PacBio subreads to themselves (the output is not SAM):
+
+		bwa mem -x pbread reads.fq reads.fq > overlap.pas
+
+* Illumina single-end reads no longer than ~70bp:
+
+		bwa aln ref.fa reads.fq > reads.sai; bwa samse ref.fa reads.sai reads.fq > aln-se.sam
+
+* Illumina paired-end reads no longer than ~70bp:
+
+		bwa aln ref.fa read1.fq > read1.sai; bwa aln ref.fa read2.fq > read2.sai
+		bwa samse ref.fa reads.sai reads.fq > aln-pe.sam
+
 ####How to map sequences to GRCh38 with ALT contigs?
 
 BWA-backtrack and BWA-MEM partially support mapping to a reference containing
@@ -91,6 +123,8 @@ highly repetitive regions as these hits will not be reported with option
 `-h50`. `bwa-helper.js` is a prototype implementation not recommended for
 production uses.
 
+
+
 [1]: http://en.wikipedia.org/wiki/GNU_General_Public_License
 [2]: https://github.com/lh3/bwa
 [3]: http://sourceforge.net/projects/bio-bwa/files/
diff --git a/bwa.1 b/bwa.1
index b6354e5..1a70a52 100644
--- a/bwa.1
+++ b/bwa.1
@@ -1,4 +1,4 @@
-.TH bwa 1 "31 March 2014" "bwa-0.7.8" "Bioinformatics tools"
+.TH bwa 1 "11 May 2014" "bwa-0.7.9" "Bioinformatics tools"
 .SH NAME
 .PP
 bwa - Burrows-Wheeler Alignment Tool
@@ -248,6 +248,11 @@ Don't output alignment with score lower than
 .IR INT .
 This option affects output and occasionally SAM flag 2. [30]
 .TP
+.BI -h \ INT
+If a query has not more than
+.I INT
+hits with score higher than 80% of the best hit, output them all in the XA tag [5]
+.TP
 .B -a
 Output all found alignments for single-end or unpaired paired-end reads. These
 alignments will be flagged as secondary alignments.
@@ -258,6 +263,11 @@ transfer read meta information (e.g. barcode) to the SAM output. Note that the
 FASTA/Q comment (the string after a space in the header line) must conform the SAM
 spec (e.g. BC:Z:CGTAC). Malformated comments lead to incorrect SAM output.
 .TP
+.B -Y
+Use soft clipping CIGAR operation for supplementary alignments. By default, BWA-MEM
+uses soft clipping for the primary alignment and hard clipping for
+supplementary alignments.
+.TP
 .B -M
 Mark shorter split hits as secondary (for Picard compatibility).
 .TP
diff --git a/bwamem.c b/bwamem.c
index 6fd9acf..582b809 100644
--- a/bwamem.c
+++ b/bwamem.c
@@ -64,6 +64,7 @@ mem_opt_t *mem_opt_init()
 	o->max_ins = 10000;
 	o->mask_level = 0.50;
 	o->drop_ratio = 0.50;
+	o->XA_drop_ratio = 0.80;
 	o->split_factor = 1.5;
 	o->chunk_size = 10000000;
 	o->n_threads = 1;
diff --git a/bwamem.h b/bwamem.h
index 98e135d..7b14ca8 100644
--- a/bwamem.h
+++ b/bwamem.h
@@ -42,6 +42,7 @@ typedef struct {
 	int chunk_size;         // process chunk_size-bp sequences in a batch
 	float mask_level;       // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits
 	float drop_ratio;       // drop a chain if its seed coverage is below drop_ratio times the seed coverage of a better chain overlapping with the small chain
+	float XA_drop_ratio;    // when counting hits for the XA tag, ignore alignments with score < XA_drop_ratio * max_score; only effective for the XA tag
 	float mask_level_redun;
 	float mapQ_coef_len;
 	int mapQ_coef_fac;
diff --git a/bwamem_extra.c b/bwamem_extra.c
index 6b600ae..7403910 100644
--- a/bwamem_extra.c
+++ b/bwamem_extra.c
@@ -115,7 +115,7 @@ char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac
 	cnt = calloc(a->n, sizeof(int));
 	for (i = 0, tot = 0; i < a->n; ++i) {
 		int j = a->a[i].secondary;
-		if (j >= 0 && a->a[i].score >= a->a[j].score * opt->drop_ratio)
+		if (j >= 0 && a->a[i].score >= a->a[j].score * opt->XA_drop_ratio)
 			++cnt[j], ++tot;
 	}
 	if (tot == 0) goto end_gen_alt;
@@ -123,7 +123,7 @@ char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac
 	for (i = 0; i < a->n; ++i) {
 		mem_aln_t t;
 		int j = a->a[i].secondary;
-		if (j < 0 || a->a[i].score < a->a[j].score * opt->drop_ratio) continue; // we don't process the primary alignments as they will be converted to SAM later
+		if (j < 0 || a->a[i].score < a->a[j].score * opt->XA_drop_ratio) continue; // we don't process the primary alignments as they will be converted to SAM later
 		if (cnt[j] > opt->max_hits) continue;
 		t = mem_reg2aln(opt, bns, pac, l_query, query, &a->a[i]);
 		kputs(bns->anns[t.rid].name, &aln[j]);
diff --git a/fastmap.c b/fastmap.c
index 19800de..3b782bf 100644
--- a/fastmap.c
+++ b/fastmap.c
@@ -159,6 +159,7 @@ int main_mem(int argc, char *argv[])
 		fprintf(stderr, "       -h INT        if #hits < INT, output all in the XA tag [%d]\n", opt->max_hits);
 		fprintf(stderr, "       -a            output all alignments for SE or unpaired PE\n");
 		fprintf(stderr, "       -C            append FASTA/FASTQ comment to SAM output\n");
+		fprintf(stderr, "       -Y            use soft clipping for supplementary alignments\n");
 		fprintf(stderr, "       -M            mark shorter split hits as secondary\n\n");
 		fprintf(stderr, "       -I FLOAT[,FLOAT[,INT[,INT]]]\n");
 		fprintf(stderr, "                     specify the mean, standard deviation (10%% of the mean if absent), max\n");
diff --git a/main.c b/main.c
index f6eab65..e006542 100644
--- a/main.c
+++ b/main.c
@@ -4,7 +4,7 @@
 #include "utils.h"
 
 #ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.7.8-r760-dirty"
+#define PACKAGE_VERSION "0.7.8-r762-dirty"
 #endif
 
 int bwa_fa2pac(int argc, char *argv[]);