From e620f0ff4ed29b93758128ad1141ec5248b5e591 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 27 Feb 2013 13:16:22 -0500 Subject: [PATCH] r302: updated the manpage --- bwa.1 | 264 ++++++++++++++++++++++++++++++++++++++++++------------ fastmap.c | 7 +- main.c | 2 +- 3 files changed, 215 insertions(+), 58 deletions(-) diff --git a/bwa.1 b/bwa.1 index 66bc9a2..442d069 100644 --- a/bwa.1 +++ b/bwa.1 @@ -1,47 +1,45 @@ -.TH bwa 1 "19 June 2012" "bwa-0.6.2" "Bioinformatics tools" +.TH bwa 1 "27 Feburary 2013" "bwa-0.7.0" "Bioinformatics tools" .SH NAME .PP bwa - Burrows-Wheeler Alignment Tool .SH SYNOPSIS .PP -bwa index -a bwtsw database.fasta +bwa index ref.fa .PP -bwa aln database.fasta short_read.fastq > aln_sa.sai +bwa mem ref.fa reads.fq > aln-se.sam .PP -bwa samse database.fasta aln_sa.sai short_read.fastq > aln.sam +bwa mem ref.fa read1.fq read2.fq > aln-pe.sam .PP -bwa sampe database.fasta aln_sa1.sai aln_sa2.sai read1.fq read2.fq > aln.sam +bwa aln ref.fa short_read.fq > aln_sa.sai .PP -bwa bwasw database.fasta long_read.fastq > aln.sam +bwa samse ref.fa aln_sa.sai short_read.fq > aln-se.sam +.PP +bwa sampe ref.fa aln_sa1.sai aln_sa2.sai read1.fq read2.fq > aln-pe.sam +.PP +bwa bwasw ref.fa long_read.fq > aln.sam .SH DESCRIPTION .PP -BWA is a fast light-weighted tool that aligns relatively short sequences -(queries) to a sequence database (targe), such as the human reference -genome. It implements two different algorithms, both based on -Burrows-Wheeler Transform (BWT). The first algorithm is designed for -short queries up to ~150bp with low error rate (<3%). It does gapped -global alignment w.r.t. queries, supports paired-end reads, and is one -of the fastest short read alignment algorithms to date while also -visiting suboptimal hits. The second algorithm, BWA-SW, is designed for -reads longer than 100bp with more errors. It performs a heuristic Smith-Waterman-like -alignment to find high-scoring local hits and split hits. On -low-error short queries, BWA-SW is a little slower and less accurate than the -first algorithm, but on long queries, it is better. -.PP -For both algorithms, the database file in the FASTA format must be -first indexed with the -.B `index' -command, which typically takes a few hours for a 3GB genome. The first algorithm is -implemented via the -.B `aln' -command, which finds the suffix array (SA) coordinates of good hits of -each individual read, and the -.B `samse/sampe' -command, which converts SA coordinates to chromosomal coordinate and -pairs reads (for `sampe'). The second algorithm is invoked by the -.B `bwasw' -command. It works for single-end reads only. +BWA is a software package for mapping low-divergent sequences against a large +reference genome, such as the human genome. It consists of three algorithms: +BWA-backtrack, BWA-SW and BWA-MEM. The first algorithm is designed for Illumina +sequence reads up to 100bp, while the rest two for longer sequences ranged from +70bp to 1Mbp. BWA-MEM and BWA-SW share similar features such as long-read +support and split alignment, but BWA-MEM, which is the latest, is generally +recommended for high-quality queries as it is faster and more accurate. +BWA-MEM also has better performance than BWA-backtrack for 70-100bp Illumina +reads. + +For all the algorithms, BWA first needs to construct the FM-index for +the reference genome (the +.B index +command). Alignment algorithms are invoked with different sub-commands: +.BR aln / samse / sampe +for BWA-backtrack, +.B bwasw +for BWA-SW and +.B mem +for the BWA-MEM algorithm. .SH COMMANDS AND OPTIONS .TP @@ -53,9 +51,6 @@ Index database sequences in the FASTA format. .B OPTIONS: .RS .TP 10 -.B -c -Build color-space index. The input fast should be in nucleotide space. (Disabled since 0.6.x) -.TP .BI -p \ STR Prefix of the output database [same as db filename] .TP @@ -76,6 +71,168 @@ genome. .RE .RE +.TP +.B mem +.B bwa mem +.RB [ -aCHMpP ] +.RB [ -t +.IR nThreads ] +.RB [ -k +.IR minSeedLen ] +.RB [ -w +.IR bandWidth ] +.RB [ -r +.IR seedSplitRatio ] +.RB [ -c +.IR maxOcc ] +.RB [ -A +.IR matchScore ] +.RB [ -B +.IR mmPenalty ] +.RB [ -O +.IR gapOpenPen ] +.RB [ -E +.IR gapExtPen ] +.RB [ -U +.IR unpairPen ] +.RB [ -R +.IR RGline ] +.RB [ -v +.IR verboseLevel ] +.I db.prefix +.I reads.fq +.RI [ mates.fq ] + +Align 70bp-1Mbp query sequences with the BWA-MEM algorithm. Briefly, the +algorithm works by seeding alignments with maximal exact matches (MEMs) and +then extending seeds with the affine-gap Smith-Waterman algorithm (SW). + +If +.I mates.fq +file is absent and option +.B -p +is not set, this command regards input reads are single-end. If +.I mates.fq +is present, this command assumes the +.IR i -th +read in +.I reads.fq +and the +.IR i -th +read in +.I mates.fq +constitute a read pair. If +.B -p +is used, the command assumes the +.RI 2 i -th +and the +.RI (2 i +1)-th +read in +.I reads.fq +constitute a read pair (such input file is said to be interleaved). In this case, +.I mates.fq +is ignored. In the paired-end mode, the +.B mem +command will infer the read orientation and the insert size distribution from a +batch of reads. + +The BWA-MEM algorithm performs local alignment. It may produce multiple primary +alignments for different part of a query sequence. This is a crucial feature +for long sequences. However, some tools such as Picard's markDuplicates does +not work with split alignments. One may consider to use option +.B -M +to flag shorter split hits as secondary. + +.B OPTIONS: +.RS +.TP 10 +.BI -t \ INT +Number of threads [1] +.TP +.BI -k \ INT +Minimum seed length. Matches shorter than +.I INT +will be missed. The alignment speed is usually insensitive to this value unless +it significantly deviates 20. [19] +.TP +.BI -w \ INT +Band width. Essentially, gaps longer than +.I INT +will not be found. Note that the maximum gap length is also affected by the +scoring matrix and the hit length, not solely determined by this option. [100] +.TP +.BI -r \ FLOAT +Trigger re-seeding for a MEM longer than +.IR minSeedLen * FLOAT . +This is a key heuristic parameter for tuning the performance. Larger value +yields fewer seeds, which leads to faster alignment speed but lower accuracy. [1.5] +.TP +.BI -c \ INT +Discard a MEM if it has more than +.I INT +occurence in the genome. This is an insensitive parameter. [10000] +.TP +.B -P +In the paired-end mode, perform SW to rescue missing hits only but do not try to find +hits that fit a proper pair. +.TP +.BI -A \ INT +Matching score. [1] +.TP +.BI -B \ INT +Mismatch penalty. The sequence error rate is approximately: {.75 * exp[-log(4) * B/A]}. [4] +.TP +.BI -O \ INT +Gap open penalty. [6] +.TP +.BI -E \ INT +Gap extension penalty. A gap of length k costs O + k*E (i.e. +.B -O +is for opening a zero-length gap). [1] +.TP +.BI -U \ INT +Penalty for an unpaired read pair. BWA-MEM scores an unpaired read pair as +.RI scoreRead1+scoreRead2- INT +and scores a paired as scoreRead1+scoreRead2-insertPenalty. It compares these +two scores to determine whether we should force pairing. [9] +.TP +.B -p +Assume the first input query file is interleaved paired-end FASTA/Q. See the command description for details. +.TP +.BI -R \ STR +Complete read group header line. '\\t' can be used in +.I STR +and will be converted to a TAB in the output SAM. The read group ID will be +attached to every read in the output. An example is '@RG\\tID:foo\\tSM:bar'. +[null] +.TP +.B -a +Output all found alignments for single-end or unpaired paired-end reads. These +alignments will be flagged as secondary alignments. +.TP +.B -C +Append append FASTA/Q comment to SAM output. This option can be used to +transfer read meta information (e.g. barcode) to the SAM output. Note that the +FASTA/Q comment (the string after a space in the header line) must conform the SAM +spec (e.g. BC:Z:CGTAC). Malformated comments lead to incorrect SAM output. +.TP +.B -H +Use hard clipping 'H' in the SAM output. This option may dramatically reduce +the redundancy of output when mapping long contig or BAC sequences. +.TP +.B -M +Mark shorter split hits as secondary (for Picard compatibility). +.TP +.BI -v \ INT +Control the verbose level of the output. This option has not been fully +supported throughout BWA. Ideally, a value 0 for disabling all the output to +stderr; 1 for outputting errors only; 2 for warnings and errors; 3 for +all normal messages; 4 or higher for debugging. When this option takes value +4, the output is not SAM. [3] + +.RE +.RE + .TP .B aln bwa aln [-n maxDiff] [-o maxGapO] [-e maxGapE] [-d nDelTail] [-i @@ -482,24 +639,6 @@ Pairing is slower for shorter reads. This is mainly because shorter reads have more spurious hits and converting SA coordinates to chromosomal coordinates are very costly. -.SH NOTES ON LONG-READ ALIGNMENT -.PP -Command -.B bwasw -is designed for long-read alignment. BWA-SW essentially aligns the trie -of the reference genome against the directed acyclic word graph (DAWG) of a -read to find seeds not highly repetitive in the genome, and then performs a -standard Smith-Waterman algorithm to extend the seeds. A key heuristic, called -the Z-best heuristic, is that at each vertex in the DAWG, BWA-SW only keeps the -top Z reference suffix intervals that match the vertex. BWA-SW is more accurate -if the resultant alignment is supported by more seeds, and therefore BWA-SW -usually performs better on long queries or queries with low divergence to the -reference genome. - -BWA-SW is perhaps a better choice than BWA-short for 100bp single-end HiSeq reads -mainly because it gives better gapped alignment. For paired-end reads, it is yet -to know whether BWA-short or BWA-SW yield overall better results. - .SH CHANGES IN BWA-0.6 .PP Since version 0.6, BWA has been able to work with a reference genome longer than 4GB. @@ -534,16 +673,23 @@ The full BWA package is distributed under GPLv3 as it uses source codes from BWT-SW which is covered by GPL. Sorting, hash table, BWT and IS libraries are distributed under the MIT license. .PP -If you use the short-read alignment component, please cite the following +If you use the BWA-backtrack algorithm, please cite the following paper: .PP Li H. and Durbin R. (2009) Fast and accurate short read alignment with Burrows-Wheeler transform. Bioinformatics, 25, 1754-1760. [PMID: 19451168] .PP -If you use the long-read component (BWA-SW), please cite: +If you use the BWA-SW algorithm, please cite: .PP Li H. and Durbin R. (2010) Fast and accurate long-read alignment with Burrows-Wheeler transform. Bioinformatics, 26, 589-595. [PMID: 20080505] +.PP +If you use the fastmap component of BWA, please cite: +.PP +Li H. (2012) Exploring single-sample SNP and INDEL calling with whole-genome de +novo assembly. Bioinformatics, 28, 1838-1844. [PMID: 22569178] +.PP +The BWA-MEM algorithm has not been published yet. .SH HISTORY BWA is largely influenced by BWT-SW. It uses source codes from BWT-SW @@ -569,3 +715,11 @@ short-read aligners are being implemented. The BWA-SW algorithm is a new component of BWA. It was conceived in November 2008 and implemented ten months later. + +The BWA-MEM algorithm is based on an algorithm finding super-maximal exact +matches (SMEMs), which was first published with the fermi assembler paper +in 2012. I first implemented the basic SMEM algorithm in the +.B fastmap +command for an experiment and then extended the basic algorithm and added the +extension part in Feburary 2013 to make BWA-MEM a fully featured mapper. + diff --git a/fastmap.c b/fastmap.c index 81ce665..b2d5f39 100644 --- a/fastmap.c +++ b/fastmap.c @@ -26,13 +26,14 @@ int main_mem(int argc, char *argv[]) void *ko = 0, *ko2 = 0; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:w:")) >= 0) { + while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 'w') opt->w = atoi(optarg); else if (c == 'A') opt->a = atoi(optarg); else if (c == 'B') opt->b = atoi(optarg); else if (c == 'O') opt->q = atoi(optarg); else if (c == 'E') opt->r = atoi(optarg); + else if (c == 'U') opt->pen_unpaired = atoi(optarg); else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1; else if (c == 'P') opt->flag |= MEM_F_NOPAIRING; else if (c == 'H') opt->flag |= MEM_F_HARDCLIP; @@ -56,13 +57,14 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -k INT minimum seed length [%d]\n", opt->min_seed_len); fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w); fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); - fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); +// fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); fprintf(stderr, " -P skip pairing; perform mate SW only\n"); fprintf(stderr, " -A INT score for a sequence match [%d]\n", opt->a); fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b); fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->q); fprintf(stderr, " -E INT gap extension penalty; a gap of size k cost {-O} + {-E}*k [%d]\n", opt->r); + fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n", opt->pen_unpaired); fprintf(stderr, "\nInput/output options:\n\n"); fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n"); fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n"); @@ -72,6 +74,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n"); fprintf(stderr, " -H hard clipping\n"); fprintf(stderr, " -M mark shorter split hits as secondary (for Picard/GATK compatibility)\n"); + fprintf(stderr, "\nNote: Please read the man page for detailed description of the command line and options.\n"); fprintf(stderr, "\n"); free(opt); return 1; diff --git a/main.c b/main.c index 7648310..041a83e 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r301-beta" +#define PACKAGE_VERSION "0.6.2-r302-beta" #endif static int usage()