From e620f0ff4ed29b93758128ad1141ec5248b5e591 Mon Sep 17 00:00:00 2001
From: Heng Li <lh3@me.com>
Date: Wed, 27 Feb 2013 13:16:22 -0500
Subject: [PATCH] r302: updated the manpage

---
 bwa.1     | 264 ++++++++++++++++++++++++++++++++++++++++++------------
 fastmap.c |   7 +-
 main.c    |   2 +-
 3 files changed, 215 insertions(+), 58 deletions(-)

diff --git a/bwa.1 b/bwa.1
index 66bc9a2..442d069 100644
--- a/bwa.1
+++ b/bwa.1
@@ -1,47 +1,45 @@
-.TH bwa 1 "19 June 2012" "bwa-0.6.2" "Bioinformatics tools"
+.TH bwa 1 "27 Feburary 2013" "bwa-0.7.0" "Bioinformatics tools"
 .SH NAME
 .PP
 bwa - Burrows-Wheeler Alignment Tool
 .SH SYNOPSIS
 .PP
-bwa index -a bwtsw database.fasta
+bwa index ref.fa
 .PP
-bwa aln database.fasta short_read.fastq > aln_sa.sai
+bwa mem ref.fa reads.fq > aln-se.sam
 .PP
-bwa samse database.fasta aln_sa.sai short_read.fastq > aln.sam
+bwa mem ref.fa read1.fq read2.fq > aln-pe.sam
 .PP
-bwa sampe database.fasta aln_sa1.sai aln_sa2.sai read1.fq read2.fq > aln.sam
+bwa aln ref.fa short_read.fq > aln_sa.sai
 .PP
-bwa bwasw database.fasta long_read.fastq > aln.sam
+bwa samse ref.fa aln_sa.sai short_read.fq > aln-se.sam
+.PP
+bwa sampe ref.fa aln_sa1.sai aln_sa2.sai read1.fq read2.fq > aln-pe.sam
+.PP
+bwa bwasw ref.fa long_read.fq > aln.sam
 
 .SH DESCRIPTION
 .PP
-BWA is a fast light-weighted tool that aligns relatively short sequences
-(queries) to a sequence database (targe), such as the human reference
-genome. It implements two different algorithms, both based on
-Burrows-Wheeler Transform (BWT). The first algorithm is designed for
-short queries up to ~150bp with low error rate (<3%). It does gapped
-global alignment w.r.t. queries, supports paired-end reads, and is one
-of the fastest short read alignment algorithms to date while also
-visiting suboptimal hits. The second algorithm, BWA-SW, is designed for
-reads longer than 100bp with more errors. It performs a heuristic Smith-Waterman-like
-alignment to find high-scoring local hits and split hits. On
-low-error short queries, BWA-SW is a little slower and less accurate than the
-first algorithm, but on long queries, it is better.
-.PP
-For both algorithms, the database file in the FASTA format must be
-first indexed with the
-.B `index'
-command, which typically takes a few hours for a 3GB genome. The first algorithm is
-implemented via the
-.B `aln'
-command, which finds the suffix array (SA) coordinates of good hits of
-each individual read, and the
-.B `samse/sampe'
-command, which converts SA coordinates to chromosomal coordinate and
-pairs reads (for `sampe'). The second algorithm is invoked by the
-.B `bwasw'
-command. It works for single-end reads only.
+BWA is a software package for mapping low-divergent sequences against a large
+reference genome, such as the human genome. It consists of three algorithms:
+BWA-backtrack, BWA-SW and BWA-MEM. The first algorithm is designed for Illumina
+sequence reads up to 100bp, while the rest two for longer sequences ranged from
+70bp to 1Mbp. BWA-MEM and BWA-SW share similar features such as long-read
+support and split alignment, but BWA-MEM, which is the latest, is generally
+recommended for high-quality queries as it is faster and more accurate.
+BWA-MEM also has better performance than BWA-backtrack for 70-100bp Illumina
+reads.
+
+For all the algorithms, BWA first needs to construct the FM-index for
+the reference genome (the
+.B index
+command). Alignment algorithms are invoked with different sub-commands:
+.BR aln / samse / sampe
+for BWA-backtrack,
+.B bwasw
+for BWA-SW and
+.B mem
+for the BWA-MEM algorithm.
 
 .SH COMMANDS AND OPTIONS
 .TP
@@ -53,9 +51,6 @@ Index database sequences in the FASTA format.
 .B OPTIONS:
 .RS
 .TP 10
-.B -c
-Build color-space index. The input fast should be in nucleotide space. (Disabled since 0.6.x)
-.TP
 .BI -p \ STR
 Prefix of the output database [same as db filename]
 .TP
@@ -76,6 +71,168 @@ genome.
 .RE
 .RE
 
+.TP
+.B mem
+.B bwa mem
+.RB [ -aCHMpP ]
+.RB [ -t
+.IR nThreads ]
+.RB [ -k
+.IR minSeedLen ]
+.RB [ -w
+.IR bandWidth ]
+.RB [ -r
+.IR seedSplitRatio ]
+.RB [ -c
+.IR maxOcc ]
+.RB [ -A
+.IR matchScore ]
+.RB [ -B
+.IR mmPenalty ]
+.RB [ -O
+.IR gapOpenPen ]
+.RB [ -E
+.IR gapExtPen ]
+.RB [ -U
+.IR unpairPen ]
+.RB [ -R
+.IR RGline ]
+.RB [ -v
+.IR verboseLevel ]
+.I db.prefix
+.I reads.fq
+.RI [ mates.fq ]
+
+Align 70bp-1Mbp query sequences with the BWA-MEM algorithm. Briefly, the
+algorithm works by seeding alignments with maximal exact matches (MEMs) and
+then extending seeds with the affine-gap Smith-Waterman algorithm (SW).
+
+If
+.I mates.fq
+file is absent and option
+.B -p
+is not set, this command regards input reads are single-end. If
+.I mates.fq
+is present, this command assumes the
+.IR i -th
+read in
+.I reads.fq
+and the
+.IR i -th
+read in
+.I mates.fq
+constitute a read pair. If
+.B -p
+is used, the command assumes the
+.RI 2 i -th
+and the
+.RI (2 i +1)-th
+read in
+.I reads.fq
+constitute a read pair (such input file is said to be interleaved). In this case,
+.I mates.fq
+is ignored. In the paired-end mode, the
+.B mem
+command will infer the read orientation and the insert size distribution from a
+batch of reads.
+
+The BWA-MEM algorithm performs local alignment. It may produce multiple primary
+alignments for different part of a query sequence. This is a crucial feature
+for long sequences. However, some tools such as Picard's markDuplicates does
+not work with split alignments. One may consider to use option
+.B -M
+to flag shorter split hits as secondary.
+
+.B OPTIONS:
+.RS
+.TP 10
+.BI -t \ INT
+Number of threads [1]
+.TP
+.BI -k \ INT
+Minimum seed length. Matches shorter than
+.I INT
+will be missed. The alignment speed is usually insensitive to this value unless
+it significantly deviates 20. [19]
+.TP
+.BI -w \ INT
+Band width. Essentially, gaps longer than
+.I INT
+will not be found. Note that the maximum gap length is also affected by the
+scoring matrix and the hit length, not solely determined by this option. [100]
+.TP
+.BI -r \ FLOAT
+Trigger re-seeding for a MEM longer than
+.IR minSeedLen * FLOAT .
+This is a key heuristic parameter for tuning the performance. Larger value
+yields fewer seeds, which leads to faster alignment speed but lower accuracy. [1.5]
+.TP
+.BI -c \ INT
+Discard a MEM if it has more than
+.I INT
+occurence in the genome. This is an insensitive parameter. [10000]
+.TP
+.B -P
+In the paired-end mode, perform SW to rescue missing hits only but do not try to find
+hits that fit a proper pair.
+.TP
+.BI -A \ INT
+Matching score. [1]
+.TP
+.BI -B \ INT
+Mismatch penalty. The sequence error rate is approximately: {.75 * exp[-log(4) * B/A]}. [4]
+.TP
+.BI -O \ INT
+Gap open penalty. [6]
+.TP
+.BI -E \ INT
+Gap extension penalty. A gap of length k costs O + k*E (i.e.
+.B -O
+is for opening a zero-length gap). [1]
+.TP
+.BI -U \ INT
+Penalty for an unpaired read pair. BWA-MEM scores an unpaired read pair as
+.RI scoreRead1+scoreRead2- INT
+and scores a paired as scoreRead1+scoreRead2-insertPenalty. It compares these
+two scores to determine whether we should force pairing. [9]
+.TP
+.B -p
+Assume the first input query file is interleaved paired-end FASTA/Q. See the command description for details.
+.TP
+.BI -R \ STR
+Complete read group header line. '\\t' can be used in
+.I STR
+and will be converted to a TAB in the output SAM. The read group ID will be
+attached to every read in the output. An example is '@RG\\tID:foo\\tSM:bar'.
+[null]
+.TP
+.B -a
+Output all found alignments for single-end or unpaired paired-end reads. These
+alignments will be flagged as secondary alignments.
+.TP
+.B -C
+Append append FASTA/Q comment to SAM output. This option can be used to
+transfer read meta information (e.g. barcode) to the SAM output. Note that the
+FASTA/Q comment (the string after a space in the header line) must conform the SAM
+spec (e.g. BC:Z:CGTAC). Malformated comments lead to incorrect SAM output.
+.TP
+.B -H
+Use hard clipping 'H' in the SAM output. This option may dramatically reduce
+the redundancy of output when mapping long contig or BAC sequences.
+.TP
+.B -M
+Mark shorter split hits as secondary (for Picard compatibility).
+.TP
+.BI -v \ INT
+Control the verbose level of the output. This option has not been fully
+supported throughout BWA. Ideally, a value 0 for disabling all the output to
+stderr; 1 for outputting errors only; 2 for warnings and errors; 3 for
+all normal messages; 4 or higher for debugging. When this option takes value
+4, the output is not SAM. [3]
+
+.RE
+.RE
+
 .TP
 .B aln
 bwa aln [-n maxDiff] [-o maxGapO] [-e maxGapE] [-d nDelTail] [-i
@@ -482,24 +639,6 @@ Pairing is slower for shorter reads. This is mainly because shorter
 reads have more spurious hits and converting SA coordinates to
 chromosomal coordinates are very costly.
 
-.SH NOTES ON LONG-READ ALIGNMENT
-.PP
-Command
-.B bwasw
-is designed for long-read alignment. BWA-SW essentially aligns the trie
-of the reference genome against the directed acyclic word graph (DAWG) of a
-read to find seeds not highly repetitive in the genome, and then performs a
-standard Smith-Waterman algorithm to extend the seeds. A key heuristic, called
-the Z-best heuristic, is that at each vertex in the DAWG, BWA-SW only keeps the
-top Z reference suffix intervals that match the vertex. BWA-SW is more accurate
-if the resultant alignment is supported by more seeds, and therefore BWA-SW
-usually performs better on long queries or queries with low divergence to the
-reference genome.
-
-BWA-SW is perhaps a better choice than BWA-short for 100bp single-end HiSeq reads
-mainly because it gives better gapped alignment. For paired-end reads, it is yet
-to know whether BWA-short or BWA-SW yield overall better results.
-
 .SH CHANGES IN BWA-0.6
 .PP
 Since version 0.6, BWA has been able to work with a reference genome longer than 4GB.
@@ -534,16 +673,23 @@ The full BWA package is distributed under GPLv3 as it uses source codes
 from BWT-SW which is covered by GPL. Sorting, hash table, BWT and IS
 libraries are distributed under the MIT license.
 .PP
-If you use the short-read alignment component, please cite the following
+If you use the BWA-backtrack algorithm, please cite the following
 paper:
 .PP
 Li H. and Durbin R. (2009) Fast and accurate short read alignment with
 Burrows-Wheeler transform. Bioinformatics, 25, 1754-1760. [PMID: 19451168]
 .PP
-If you use the long-read component (BWA-SW), please cite:
+If you use the BWA-SW algorithm, please cite:
 .PP
 Li H. and Durbin R. (2010) Fast and accurate long-read alignment with
 Burrows-Wheeler transform. Bioinformatics, 26, 589-595. [PMID: 20080505]
+.PP
+If you use the fastmap component of BWA, please cite:
+.PP
+Li H. (2012) Exploring single-sample SNP and INDEL calling with whole-genome de
+novo assembly. Bioinformatics, 28, 1838-1844. [PMID: 22569178]
+.PP
+The BWA-MEM algorithm has not been published yet.
 
 .SH HISTORY
 BWA is largely influenced by BWT-SW. It uses source codes from BWT-SW
@@ -569,3 +715,11 @@ short-read aligners are being implemented.
 
 The BWA-SW algorithm is a new component of BWA. It was conceived in
 November 2008 and implemented ten months later.
+
+The BWA-MEM algorithm is based on an algorithm finding super-maximal exact
+matches (SMEMs), which was first published with the fermi assembler paper
+in 2012. I first implemented the basic SMEM algorithm in the
+.B fastmap
+command for an experiment and then extended the basic algorithm and added the
+extension part in Feburary 2013 to make BWA-MEM a fully featured mapper.
+
diff --git a/fastmap.c b/fastmap.c
index 81ce665..b2d5f39 100644
--- a/fastmap.c
+++ b/fastmap.c
@@ -26,13 +26,14 @@ int main_mem(int argc, char *argv[])
 	void *ko = 0, *ko2 = 0;
 
 	opt = mem_opt_init();
-	while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:w:")) >= 0) {
+	while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:")) >= 0) {
 		if (c == 'k') opt->min_seed_len = atoi(optarg);
 		else if (c == 'w') opt->w = atoi(optarg);
 		else if (c == 'A') opt->a = atoi(optarg);
 		else if (c == 'B') opt->b = atoi(optarg);
 		else if (c == 'O') opt->q = atoi(optarg);
 		else if (c == 'E') opt->r = atoi(optarg);
+		else if (c == 'U') opt->pen_unpaired = atoi(optarg);
 		else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1;
 		else if (c == 'P') opt->flag |= MEM_F_NOPAIRING;
 		else if (c == 'H') opt->flag |= MEM_F_HARDCLIP;
@@ -56,13 +57,14 @@ int main_mem(int argc, char *argv[])
 		fprintf(stderr, "       -k INT     minimum seed length [%d]\n", opt->min_seed_len);
 		fprintf(stderr, "       -w INT     band width for banded alignment [%d]\n", opt->w);
 		fprintf(stderr, "       -r FLOAT   look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor);
-		fprintf(stderr, "       -s INT     look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width);
+//		fprintf(stderr, "       -s INT     look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width);
 		fprintf(stderr, "       -c INT     skip seeds with more than INT occurrences [%d]\n", opt->max_occ);
 		fprintf(stderr, "       -P         skip pairing; perform mate SW only\n");
 		fprintf(stderr, "       -A INT     score for a sequence match [%d]\n", opt->a);
 		fprintf(stderr, "       -B INT     penalty for a mismatch [%d]\n", opt->b);
 		fprintf(stderr, "       -O INT     gap open penalty [%d]\n", opt->q);
 		fprintf(stderr, "       -E INT     gap extension penalty; a gap of size k cost {-O} + {-E}*k [%d]\n", opt->r);
+		fprintf(stderr, "       -U INT     penalty for an unpaired read pair [%d]\n", opt->pen_unpaired);
 		fprintf(stderr, "\nInput/output options:\n\n");
 		fprintf(stderr, "       -p         first query file consists of interleaved paired-end sequences\n");
 		fprintf(stderr, "       -R STR     read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n");
@@ -72,6 +74,7 @@ int main_mem(int argc, char *argv[])
 		fprintf(stderr, "       -C         append FASTA/FASTQ comment to SAM output\n");
 		fprintf(stderr, "       -H         hard clipping\n");
 		fprintf(stderr, "       -M         mark shorter split hits as secondary (for Picard/GATK compatibility)\n");
+		fprintf(stderr, "\nNote: Please read the man page for detailed description of the command line and options.\n");
 		fprintf(stderr, "\n");
 		free(opt);
 		return 1;
diff --git a/main.c b/main.c
index 7648310..041a83e 100644
--- a/main.c
+++ b/main.c
@@ -4,7 +4,7 @@
 #include "utils.h"
 
 #ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.6.2-r301-beta"
+#define PACKAGE_VERSION "0.6.2-r302-beta"
 #endif
 
 static int usage()