Merge branch 'master' into master_fixes
Merge changes to commit c5434ac (0.7.0 release)
Conflicts:
Makefile
bwamem.c
This commit is contained in:
commit
6beab5f765
5
Makefile
5
Makefile
|
|
@ -8,7 +8,7 @@ AOBJS= QSufSort.o bwt_gen.o stdaln.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamli
|
||||||
is.o bwtindex.o bwape.o \
|
is.o bwtindex.o bwape.o \
|
||||||
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
|
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
|
||||||
bwtsw2_chain.o fastmap.o bwtsw2_pair.o
|
bwtsw2_chain.o fastmap.o bwtsw2_pair.o
|
||||||
PROG= bwa
|
PROG= bwa bwamem-lite
|
||||||
INCLUDES=
|
INCLUDES=
|
||||||
LIBS= -lm -lz -lpthread
|
LIBS= -lm -lz -lpthread
|
||||||
SUBDIRS= .
|
SUBDIRS= .
|
||||||
|
|
@ -25,6 +25,9 @@ all:$(PROG)
|
||||||
bwa:libbwa.a $(AOBJS) main.o
|
bwa:libbwa.a $(AOBJS) main.o
|
||||||
$(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ $(LIBS) -L. -lbwa
|
$(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ $(LIBS) -L. -lbwa
|
||||||
|
|
||||||
|
bwamem-lite:libbwa.a example.o
|
||||||
|
$(CC) $(CFLAGS) $(DFLAGS) example.o -o $@ $(LIBS) -L. -lbwa
|
||||||
|
|
||||||
libbwa.a:$(LOBJS)
|
libbwa.a:$(LOBJS)
|
||||||
$(AR) -csru $@ $(LOBJS)
|
$(AR) -csru $@ $(LOBJS)
|
||||||
|
|
||||||
|
|
|
||||||
53
NEWS
53
NEWS
|
|
@ -1,3 +1,56 @@
|
||||||
|
Beta Release 0.7.0 (28 Feburary, 2013)
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
This release comes with a new alignment algorithm, BWA-MEM, for 70bp-1Mbp query
|
||||||
|
sequences. BWA-MEM essentially seeds alignments with a variant of the fastmap
|
||||||
|
algorithm and extends seeds with banded affine-gap-penalty dynamic programming
|
||||||
|
(i.e. the Smith-Waterman-Gotoh algorithm). For typical Illumina 100bp reads or
|
||||||
|
longer low-divergence query sequences, BWA-MEM is about twice as fast as BWA
|
||||||
|
and BWA-SW and is more accurate. It also supports split alignments like BWA-SW
|
||||||
|
and may optionally output multiple hits like BWA. BWA-MEM does not guarantee
|
||||||
|
to find hits within a certain edit distance, but BWA is not efficient for such
|
||||||
|
task given longer reads anyway, and the edit-distance criterion is arguably
|
||||||
|
not as important in long-read alignment.
|
||||||
|
|
||||||
|
In addition to the algorithmic improvements, BWA-MEM also implements a few
|
||||||
|
handy features in practical aspects:
|
||||||
|
|
||||||
|
1. BWA-MEM automatically switches between local and glocal (global wrt reads;
|
||||||
|
local wrt reference) alignment. It reports the end-to-end glocal alignment
|
||||||
|
if the glocal alignment is not much worse than the optimal local alignment.
|
||||||
|
Glocal alignment reduces reference bias.
|
||||||
|
|
||||||
|
2. BWA-MEM automatically infers pair orientation from a batch of single-end
|
||||||
|
alignments. It allows more than one orientations if there are sufficient
|
||||||
|
supporting reads. This feature has not been tested on reads from Illumina
|
||||||
|
jumping library yet. (EXPERIMENTAL)
|
||||||
|
|
||||||
|
3. BWA-MEM optionally takes one interleaved fastq for paired-end mapping. It
|
||||||
|
is possible to convert a name-sorted BAM to an interleaved fastq on the fly
|
||||||
|
and feed the data stream to BWA-MEM for mapping.
|
||||||
|
|
||||||
|
4. BWA-MEM optionally copies FASTA/Q comments to the final SAM output, which
|
||||||
|
helps to transfer individual read annotations to the output.
|
||||||
|
|
||||||
|
5. BWA-MEM supports more advanced piping. Users can now run:
|
||||||
|
(bwa mem ref.fa '<bzcat r1.fq.bz2' '<bzcat r2.fq.bz2') to map bzip'd read
|
||||||
|
files without replying on bash features.
|
||||||
|
|
||||||
|
6. BWA-MEM provides a few basic APIs for single-end mapping. The `example.c'
|
||||||
|
program in the source code directory implements a full single-end mapper in
|
||||||
|
50 lines of code.
|
||||||
|
|
||||||
|
The BWA-MEM algorithm is in the beta phase. It is not advised to use BWA-MEM
|
||||||
|
for production use yet. However, when the implementation becomes stable after a
|
||||||
|
few release cycles, existing BWA users are recommended to migrate to BWA-MEM
|
||||||
|
for 76bp or longer Illumina reads and long query sequences. The original BWA
|
||||||
|
short-read algorithm will not deliver satisfactory results for 150bp+ Illumina
|
||||||
|
reads. Change of mappers will be necessary sooner or later.
|
||||||
|
|
||||||
|
(0.7.0 beta: 28 Feburary 2013, r313)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Release 0.6.2 (19 June, 2012)
|
Release 0.6.2 (19 June, 2012)
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
|
|
||||||
271
bwa.1
271
bwa.1
|
|
@ -1,47 +1,45 @@
|
||||||
.TH bwa 1 "19 June 2012" "bwa-0.6.2" "Bioinformatics tools"
|
.TH bwa 1 "27 Feburary 2013" "bwa-0.7.0" "Bioinformatics tools"
|
||||||
.SH NAME
|
.SH NAME
|
||||||
.PP
|
.PP
|
||||||
bwa - Burrows-Wheeler Alignment Tool
|
bwa - Burrows-Wheeler Alignment Tool
|
||||||
.SH SYNOPSIS
|
.SH SYNOPSIS
|
||||||
.PP
|
.PP
|
||||||
bwa index -a bwtsw database.fasta
|
bwa index ref.fa
|
||||||
.PP
|
.PP
|
||||||
bwa aln database.fasta short_read.fastq > aln_sa.sai
|
bwa mem ref.fa reads.fq > aln-se.sam
|
||||||
.PP
|
.PP
|
||||||
bwa samse database.fasta aln_sa.sai short_read.fastq > aln.sam
|
bwa mem ref.fa read1.fq read2.fq > aln-pe.sam
|
||||||
.PP
|
.PP
|
||||||
bwa sampe database.fasta aln_sa1.sai aln_sa2.sai read1.fq read2.fq > aln.sam
|
bwa aln ref.fa short_read.fq > aln_sa.sai
|
||||||
.PP
|
.PP
|
||||||
bwa bwasw database.fasta long_read.fastq > aln.sam
|
bwa samse ref.fa aln_sa.sai short_read.fq > aln-se.sam
|
||||||
|
.PP
|
||||||
|
bwa sampe ref.fa aln_sa1.sai aln_sa2.sai read1.fq read2.fq > aln-pe.sam
|
||||||
|
.PP
|
||||||
|
bwa bwasw ref.fa long_read.fq > aln.sam
|
||||||
|
|
||||||
.SH DESCRIPTION
|
.SH DESCRIPTION
|
||||||
.PP
|
.PP
|
||||||
BWA is a fast light-weighted tool that aligns relatively short sequences
|
BWA is a software package for mapping low-divergent sequences against a large
|
||||||
(queries) to a sequence database (targe), such as the human reference
|
reference genome, such as the human genome. It consists of three algorithms:
|
||||||
genome. It implements two different algorithms, both based on
|
BWA-backtrack, BWA-SW and BWA-MEM. The first algorithm is designed for Illumina
|
||||||
Burrows-Wheeler Transform (BWT). The first algorithm is designed for
|
sequence reads up to 100bp, while the rest two for longer sequences ranged from
|
||||||
short queries up to ~150bp with low error rate (<3%). It does gapped
|
70bp to 1Mbp. BWA-MEM and BWA-SW share similar features such as long-read
|
||||||
global alignment w.r.t. queries, supports paired-end reads, and is one
|
support and split alignment, but BWA-MEM, which is the latest, is generally
|
||||||
of the fastest short read alignment algorithms to date while also
|
recommended for high-quality queries as it is faster and more accurate.
|
||||||
visiting suboptimal hits. The second algorithm, BWA-SW, is designed for
|
BWA-MEM also has better performance than BWA-backtrack for 70-100bp Illumina
|
||||||
reads longer than 100bp with more errors. It performs a heuristic Smith-Waterman-like
|
reads.
|
||||||
alignment to find high-scoring local hits and split hits. On
|
|
||||||
low-error short queries, BWA-SW is a little slower and less accurate than the
|
For all the algorithms, BWA first needs to construct the FM-index for
|
||||||
first algorithm, but on long queries, it is better.
|
the reference genome (the
|
||||||
.PP
|
.B index
|
||||||
For both algorithms, the database file in the FASTA format must be
|
command). Alignment algorithms are invoked with different sub-commands:
|
||||||
first indexed with the
|
.BR aln / samse / sampe
|
||||||
.B `index'
|
for BWA-backtrack,
|
||||||
command, which typically takes a few hours for a 3GB genome. The first algorithm is
|
.B bwasw
|
||||||
implemented via the
|
for BWA-SW and
|
||||||
.B `aln'
|
.B mem
|
||||||
command, which finds the suffix array (SA) coordinates of good hits of
|
for the BWA-MEM algorithm.
|
||||||
each individual read, and the
|
|
||||||
.B `samse/sampe'
|
|
||||||
command, which converts SA coordinates to chromosomal coordinate and
|
|
||||||
pairs reads (for `sampe'). The second algorithm is invoked by the
|
|
||||||
.B `bwasw'
|
|
||||||
command. It works for single-end reads only.
|
|
||||||
|
|
||||||
.SH COMMANDS AND OPTIONS
|
.SH COMMANDS AND OPTIONS
|
||||||
.TP
|
.TP
|
||||||
|
|
@ -53,9 +51,6 @@ Index database sequences in the FASTA format.
|
||||||
.B OPTIONS:
|
.B OPTIONS:
|
||||||
.RS
|
.RS
|
||||||
.TP 10
|
.TP 10
|
||||||
.B -c
|
|
||||||
Build color-space index. The input fast should be in nucleotide space. (Disabled since 0.6.x)
|
|
||||||
.TP
|
|
||||||
.BI -p \ STR
|
.BI -p \ STR
|
||||||
Prefix of the output database [same as db filename]
|
Prefix of the output database [same as db filename]
|
||||||
.TP
|
.TP
|
||||||
|
|
@ -76,6 +71,175 @@ genome.
|
||||||
.RE
|
.RE
|
||||||
.RE
|
.RE
|
||||||
|
|
||||||
|
.TP
|
||||||
|
.B mem
|
||||||
|
.B bwa mem
|
||||||
|
.RB [ -aCHMpP ]
|
||||||
|
.RB [ -t
|
||||||
|
.IR nThreads ]
|
||||||
|
.RB [ -k
|
||||||
|
.IR minSeedLen ]
|
||||||
|
.RB [ -w
|
||||||
|
.IR bandWidth ]
|
||||||
|
.RB [ -r
|
||||||
|
.IR seedSplitRatio ]
|
||||||
|
.RB [ -c
|
||||||
|
.IR maxOcc ]
|
||||||
|
.RB [ -A
|
||||||
|
.IR matchScore ]
|
||||||
|
.RB [ -B
|
||||||
|
.IR mmPenalty ]
|
||||||
|
.RB [ -O
|
||||||
|
.IR gapOpenPen ]
|
||||||
|
.RB [ -E
|
||||||
|
.IR gapExtPen ]
|
||||||
|
.RB [ -L
|
||||||
|
.IR clipPen ]
|
||||||
|
.RB [ -U
|
||||||
|
.IR unpairPen ]
|
||||||
|
.RB [ -R
|
||||||
|
.IR RGline ]
|
||||||
|
.RB [ -v
|
||||||
|
.IR verboseLevel ]
|
||||||
|
.I db.prefix
|
||||||
|
.I reads.fq
|
||||||
|
.RI [ mates.fq ]
|
||||||
|
|
||||||
|
Align 70bp-1Mbp query sequences with the BWA-MEM algorithm. Briefly, the
|
||||||
|
algorithm works by seeding alignments with maximal exact matches (MEMs) and
|
||||||
|
then extending seeds with the affine-gap Smith-Waterman algorithm (SW).
|
||||||
|
|
||||||
|
If
|
||||||
|
.I mates.fq
|
||||||
|
file is absent and option
|
||||||
|
.B -p
|
||||||
|
is not set, this command regards input reads are single-end. If
|
||||||
|
.I mates.fq
|
||||||
|
is present, this command assumes the
|
||||||
|
.IR i -th
|
||||||
|
read in
|
||||||
|
.I reads.fq
|
||||||
|
and the
|
||||||
|
.IR i -th
|
||||||
|
read in
|
||||||
|
.I mates.fq
|
||||||
|
constitute a read pair. If
|
||||||
|
.B -p
|
||||||
|
is used, the command assumes the
|
||||||
|
.RI 2 i -th
|
||||||
|
and the
|
||||||
|
.RI (2 i +1)-th
|
||||||
|
read in
|
||||||
|
.I reads.fq
|
||||||
|
constitute a read pair (such input file is said to be interleaved). In this case,
|
||||||
|
.I mates.fq
|
||||||
|
is ignored. In the paired-end mode, the
|
||||||
|
.B mem
|
||||||
|
command will infer the read orientation and the insert size distribution from a
|
||||||
|
batch of reads.
|
||||||
|
|
||||||
|
The BWA-MEM algorithm performs local alignment. It may produce multiple primary
|
||||||
|
alignments for different part of a query sequence. This is a crucial feature
|
||||||
|
for long sequences. However, some tools such as Picard's markDuplicates does
|
||||||
|
not work with split alignments. One may consider to use option
|
||||||
|
.B -M
|
||||||
|
to flag shorter split hits as secondary.
|
||||||
|
|
||||||
|
.B OPTIONS:
|
||||||
|
.RS
|
||||||
|
.TP 10
|
||||||
|
.BI -t \ INT
|
||||||
|
Number of threads [1]
|
||||||
|
.TP
|
||||||
|
.BI -k \ INT
|
||||||
|
Minimum seed length. Matches shorter than
|
||||||
|
.I INT
|
||||||
|
will be missed. The alignment speed is usually insensitive to this value unless
|
||||||
|
it significantly deviates 20. [19]
|
||||||
|
.TP
|
||||||
|
.BI -w \ INT
|
||||||
|
Band width. Essentially, gaps longer than
|
||||||
|
.I INT
|
||||||
|
will not be found. Note that the maximum gap length is also affected by the
|
||||||
|
scoring matrix and the hit length, not solely determined by this option. [100]
|
||||||
|
.TP
|
||||||
|
.BI -r \ FLOAT
|
||||||
|
Trigger re-seeding for a MEM longer than
|
||||||
|
.IR minSeedLen * FLOAT .
|
||||||
|
This is a key heuristic parameter for tuning the performance. Larger value
|
||||||
|
yields fewer seeds, which leads to faster alignment speed but lower accuracy. [1.5]
|
||||||
|
.TP
|
||||||
|
.BI -c \ INT
|
||||||
|
Discard a MEM if it has more than
|
||||||
|
.I INT
|
||||||
|
occurence in the genome. This is an insensitive parameter. [10000]
|
||||||
|
.TP
|
||||||
|
.B -P
|
||||||
|
In the paired-end mode, perform SW to rescue missing hits only but do not try to find
|
||||||
|
hits that fit a proper pair.
|
||||||
|
.TP
|
||||||
|
.BI -A \ INT
|
||||||
|
Matching score. [1]
|
||||||
|
.TP
|
||||||
|
.BI -B \ INT
|
||||||
|
Mismatch penalty. The sequence error rate is approximately: {.75 * exp[-log(4) * B/A]}. [4]
|
||||||
|
.TP
|
||||||
|
.BI -O \ INT
|
||||||
|
Gap open penalty. [6]
|
||||||
|
.TP
|
||||||
|
.BI -E \ INT
|
||||||
|
Gap extension penalty. A gap of length k costs O + k*E (i.e.
|
||||||
|
.B -O
|
||||||
|
is for opening a zero-length gap). [1]
|
||||||
|
.TP
|
||||||
|
.BI -L \ INT
|
||||||
|
Clipping penalty. When performing SW extension, BWA-MEM keeps track of the best
|
||||||
|
score reaching the end of query. If this score is larger than the best SW score
|
||||||
|
minus the clipping penalty, clipping will not be applied. Note that in this
|
||||||
|
case, the SAM AS tag reports the best SW score; clipping penalty is not
|
||||||
|
deducted. [5]
|
||||||
|
.TP
|
||||||
|
.BI -U \ INT
|
||||||
|
Penalty for an unpaired read pair. BWA-MEM scores an unpaired read pair as
|
||||||
|
.RI scoreRead1+scoreRead2- INT
|
||||||
|
and scores a paired as scoreRead1+scoreRead2-insertPenalty. It compares these
|
||||||
|
two scores to determine whether we should force pairing. [9]
|
||||||
|
.TP
|
||||||
|
.B -p
|
||||||
|
Assume the first input query file is interleaved paired-end FASTA/Q. See the command description for details.
|
||||||
|
.TP
|
||||||
|
.BI -R \ STR
|
||||||
|
Complete read group header line. '\\t' can be used in
|
||||||
|
.I STR
|
||||||
|
and will be converted to a TAB in the output SAM. The read group ID will be
|
||||||
|
attached to every read in the output. An example is '@RG\\tID:foo\\tSM:bar'.
|
||||||
|
[null]
|
||||||
|
.TP
|
||||||
|
.B -a
|
||||||
|
Output all found alignments for single-end or unpaired paired-end reads. These
|
||||||
|
alignments will be flagged as secondary alignments.
|
||||||
|
.TP
|
||||||
|
.B -C
|
||||||
|
Append append FASTA/Q comment to SAM output. This option can be used to
|
||||||
|
transfer read meta information (e.g. barcode) to the SAM output. Note that the
|
||||||
|
FASTA/Q comment (the string after a space in the header line) must conform the SAM
|
||||||
|
spec (e.g. BC:Z:CGTAC). Malformated comments lead to incorrect SAM output.
|
||||||
|
.TP
|
||||||
|
.B -H
|
||||||
|
Use hard clipping 'H' in the SAM output. This option may dramatically reduce
|
||||||
|
the redundancy of output when mapping long contig or BAC sequences.
|
||||||
|
.TP
|
||||||
|
.B -M
|
||||||
|
Mark shorter split hits as secondary (for Picard compatibility).
|
||||||
|
.TP
|
||||||
|
.BI -v \ INT
|
||||||
|
Control the verbose level of the output. This option has not been fully
|
||||||
|
supported throughout BWA. Ideally, a value 0 for disabling all the output to
|
||||||
|
stderr; 1 for outputting errors only; 2 for warnings and errors; 3 for
|
||||||
|
all normal messages; 4 or higher for debugging. When this option takes value
|
||||||
|
4, the output is not SAM. [3]
|
||||||
|
.RE
|
||||||
|
|
||||||
.TP
|
.TP
|
||||||
.B aln
|
.B aln
|
||||||
bwa aln [-n maxDiff] [-o maxGapO] [-e maxGapE] [-d nDelTail] [-i
|
bwa aln [-n maxDiff] [-o maxGapO] [-e maxGapE] [-d nDelTail] [-i
|
||||||
|
|
@ -482,24 +646,6 @@ Pairing is slower for shorter reads. This is mainly because shorter
|
||||||
reads have more spurious hits and converting SA coordinates to
|
reads have more spurious hits and converting SA coordinates to
|
||||||
chromosomal coordinates are very costly.
|
chromosomal coordinates are very costly.
|
||||||
|
|
||||||
.SH NOTES ON LONG-READ ALIGNMENT
|
|
||||||
.PP
|
|
||||||
Command
|
|
||||||
.B bwasw
|
|
||||||
is designed for long-read alignment. BWA-SW essentially aligns the trie
|
|
||||||
of the reference genome against the directed acyclic word graph (DAWG) of a
|
|
||||||
read to find seeds not highly repetitive in the genome, and then performs a
|
|
||||||
standard Smith-Waterman algorithm to extend the seeds. A key heuristic, called
|
|
||||||
the Z-best heuristic, is that at each vertex in the DAWG, BWA-SW only keeps the
|
|
||||||
top Z reference suffix intervals that match the vertex. BWA-SW is more accurate
|
|
||||||
if the resultant alignment is supported by more seeds, and therefore BWA-SW
|
|
||||||
usually performs better on long queries or queries with low divergence to the
|
|
||||||
reference genome.
|
|
||||||
|
|
||||||
BWA-SW is perhaps a better choice than BWA-short for 100bp single-end HiSeq reads
|
|
||||||
mainly because it gives better gapped alignment. For paired-end reads, it is yet
|
|
||||||
to know whether BWA-short or BWA-SW yield overall better results.
|
|
||||||
|
|
||||||
.SH CHANGES IN BWA-0.6
|
.SH CHANGES IN BWA-0.6
|
||||||
.PP
|
.PP
|
||||||
Since version 0.6, BWA has been able to work with a reference genome longer than 4GB.
|
Since version 0.6, BWA has been able to work with a reference genome longer than 4GB.
|
||||||
|
|
@ -534,16 +680,23 @@ The full BWA package is distributed under GPLv3 as it uses source codes
|
||||||
from BWT-SW which is covered by GPL. Sorting, hash table, BWT and IS
|
from BWT-SW which is covered by GPL. Sorting, hash table, BWT and IS
|
||||||
libraries are distributed under the MIT license.
|
libraries are distributed under the MIT license.
|
||||||
.PP
|
.PP
|
||||||
If you use the short-read alignment component, please cite the following
|
If you use the BWA-backtrack algorithm, please cite the following
|
||||||
paper:
|
paper:
|
||||||
.PP
|
.PP
|
||||||
Li H. and Durbin R. (2009) Fast and accurate short read alignment with
|
Li H. and Durbin R. (2009) Fast and accurate short read alignment with
|
||||||
Burrows-Wheeler transform. Bioinformatics, 25, 1754-1760. [PMID: 19451168]
|
Burrows-Wheeler transform. Bioinformatics, 25, 1754-1760. [PMID: 19451168]
|
||||||
.PP
|
.PP
|
||||||
If you use the long-read component (BWA-SW), please cite:
|
If you use the BWA-SW algorithm, please cite:
|
||||||
.PP
|
.PP
|
||||||
Li H. and Durbin R. (2010) Fast and accurate long-read alignment with
|
Li H. and Durbin R. (2010) Fast and accurate long-read alignment with
|
||||||
Burrows-Wheeler transform. Bioinformatics, 26, 589-595. [PMID: 20080505]
|
Burrows-Wheeler transform. Bioinformatics, 26, 589-595. [PMID: 20080505]
|
||||||
|
.PP
|
||||||
|
If you use the fastmap component of BWA, please cite:
|
||||||
|
.PP
|
||||||
|
Li H. (2012) Exploring single-sample SNP and INDEL calling with whole-genome de
|
||||||
|
novo assembly. Bioinformatics, 28, 1838-1844. [PMID: 22569178]
|
||||||
|
.PP
|
||||||
|
The BWA-MEM algorithm has not been published yet.
|
||||||
|
|
||||||
.SH HISTORY
|
.SH HISTORY
|
||||||
BWA is largely influenced by BWT-SW. It uses source codes from BWT-SW
|
BWA is largely influenced by BWT-SW. It uses source codes from BWT-SW
|
||||||
|
|
@ -569,3 +722,11 @@ short-read aligners are being implemented.
|
||||||
|
|
||||||
The BWA-SW algorithm is a new component of BWA. It was conceived in
|
The BWA-SW algorithm is a new component of BWA. It was conceived in
|
||||||
November 2008 and implemented ten months later.
|
November 2008 and implemented ten months later.
|
||||||
|
|
||||||
|
The BWA-MEM algorithm is based on an algorithm finding super-maximal exact
|
||||||
|
matches (SMEMs), which was first published with the fermi assembler paper
|
||||||
|
in 2012. I first implemented the basic SMEM algorithm in the
|
||||||
|
.B fastmap
|
||||||
|
command for an experiment and then extended the basic algorithm and added the
|
||||||
|
extension part in Feburary 2013 to make BWA-MEM a fully featured mapper.
|
||||||
|
|
||||||
|
|
|
||||||
31
bwa.c
31
bwa.c
|
|
@ -74,7 +74,7 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa
|
||||||
{
|
{
|
||||||
uint32_t *cigar = 0;
|
uint32_t *cigar = 0;
|
||||||
uint8_t tmp, *rseq;
|
uint8_t tmp, *rseq;
|
||||||
int i, w;
|
int i;
|
||||||
int64_t rlen;
|
int64_t rlen;
|
||||||
*n_cigar = 0; *NM = -1;
|
*n_cigar = 0; *NM = -1;
|
||||||
if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand
|
if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand
|
||||||
|
|
@ -86,15 +86,26 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa
|
||||||
for (i = 0; i < rlen>>1; ++i)
|
for (i = 0; i < rlen>>1; ++i)
|
||||||
tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], rseq[rlen - 1 - i] = tmp;
|
tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], rseq[rlen - 1 - i] = tmp;
|
||||||
}
|
}
|
||||||
//printf("[Q] "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n');
|
if (l_query == re - rb && w_ == 0) { // no gap; no need to do DP
|
||||||
//printf("[R] "); for (i = 0; i < re - rb; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n');
|
cigar = xmalloc(4);
|
||||||
// set the band-width
|
cigar[0] = l_query<<4 | 0;
|
||||||
w = (int)((double)(l_query * mat[0] - q) / r + 1.);
|
*n_cigar = 1;
|
||||||
w = w < 1? w : 1;
|
for (i = 0, *score = 0; i < l_query; ++i)
|
||||||
w = w < w_? w : w_;
|
*score += mat[rseq[i]*5 + query[i]];
|
||||||
w += abs(rlen - l_query);
|
} else {
|
||||||
// NW alignment
|
int w, max_gap, min_w;
|
||||||
*score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar);
|
//printf("[Q] "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n');
|
||||||
|
//printf("[R] "); for (i = 0; i < re - rb; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n');
|
||||||
|
// set the band-width
|
||||||
|
max_gap = (int)((double)(((l_query+1)>>1) * mat[0] - q) / r + 1.);
|
||||||
|
max_gap = max_gap > 1? max_gap : 1;
|
||||||
|
w = (max_gap + abs(rlen - l_query) + 1) >> 1;
|
||||||
|
w = w < w_? w : w_;
|
||||||
|
min_w = abs(rlen - l_query) + 3;
|
||||||
|
w = w > min_w? w : min_w;
|
||||||
|
// NW alignment
|
||||||
|
*score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar);
|
||||||
|
}
|
||||||
{// compute NM
|
{// compute NM
|
||||||
int k, x, y, n_mm = 0, n_gap = 0;
|
int k, x, y, n_mm = 0, n_gap = 0;
|
||||||
for (k = 0, x = y = 0; k < *n_cigar; ++k) {
|
for (k = 0, x = y = 0; k < *n_cigar; ++k) {
|
||||||
|
|
|
||||||
83
bwamem.c
83
bwamem.c
|
|
@ -42,8 +42,10 @@ mem_opt_t *mem_opt_init()
|
||||||
{
|
{
|
||||||
mem_opt_t *o;
|
mem_opt_t *o;
|
||||||
o = xcalloc(1, sizeof(mem_opt_t));
|
o = xcalloc(1, sizeof(mem_opt_t));
|
||||||
o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 100;
|
|
||||||
o->flag = 0;
|
o->flag = 0;
|
||||||
|
o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 100;
|
||||||
|
o->pen_unpaired = 9;
|
||||||
|
o->pen_clip = 5;
|
||||||
o->min_seed_len = 19;
|
o->min_seed_len = 19;
|
||||||
o->split_width = 10;
|
o->split_width = 10;
|
||||||
o->max_occ = 10000;
|
o->max_occ = 10000;
|
||||||
|
|
@ -54,7 +56,6 @@ mem_opt_t *mem_opt_init()
|
||||||
o->split_factor = 1.5;
|
o->split_factor = 1.5;
|
||||||
o->chunk_size = 10000000;
|
o->chunk_size = 10000000;
|
||||||
o->n_threads = 1;
|
o->n_threads = 1;
|
||||||
o->pen_unpaired = 9;
|
|
||||||
o->max_matesw = 100;
|
o->max_matesw = 100;
|
||||||
mem_fill_scmat(o->a, o->b, o->mat);
|
mem_fill_scmat(o->a, o->b, o->mat);
|
||||||
return o;
|
return o;
|
||||||
|
|
@ -487,23 +488,27 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int
|
||||||
|
|
||||||
if (s->qbeg) { // left extension
|
if (s->qbeg) { // left extension
|
||||||
uint8_t *rs, *qs;
|
uint8_t *rs, *qs;
|
||||||
int qle, tle;
|
int qle, tle, gtle, gscore;
|
||||||
qs = xmalloc(s->qbeg);
|
qs = xmalloc(s->qbeg);
|
||||||
for (i = 0; i < s->qbeg; ++i) qs[i] = query[s->qbeg - 1 - i];
|
for (i = 0; i < s->qbeg; ++i) qs[i] = query[s->qbeg - 1 - i];
|
||||||
tmp = s->rbeg - rmax[0];
|
tmp = s->rbeg - rmax[0];
|
||||||
rs = xmalloc(tmp);
|
rs = xmalloc(tmp);
|
||||||
for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i];
|
for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i];
|
||||||
a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, s->len * opt->a, &qle, &tle);
|
a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, s->len * opt->a, &qle, &tle, >le, &gscore);
|
||||||
a->qb = s->qbeg - qle; a->rb = s->rbeg - tle;
|
// check whether we prefer to reach the end of the query
|
||||||
|
if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qb = s->qbeg - qle, a->rb = s->rbeg - tle; // local hits
|
||||||
|
else a->qb = 0, a->rb = s->rbeg - gtle; // reach the end
|
||||||
free(qs); free(rs);
|
free(qs); free(rs);
|
||||||
} else a->score = s->len * opt->a, a->qb = 0, a->rb = s->rbeg;
|
} else a->score = s->len * opt->a, a->qb = 0, a->rb = s->rbeg;
|
||||||
|
|
||||||
if (s->qbeg + s->len != l_query) { // right extension
|
if (s->qbeg + s->len != l_query) { // right extension
|
||||||
int qle, tle, qe, re;
|
int qle, tle, qe, re, gtle, gscore;
|
||||||
qe = s->qbeg + s->len;
|
qe = s->qbeg + s->len;
|
||||||
re = s->rbeg + s->len - rmax[0];
|
re = s->rbeg + s->len - rmax[0];
|
||||||
a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, &qle, &tle);
|
a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, &qle, &tle, >le, &gscore);
|
||||||
a->qe = qe + qle; a->re = rmax[0] + re + tle;
|
// similar to the above
|
||||||
|
if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qe = qe + qle, a->re = rmax[0] + re + tle;
|
||||||
|
else a->qe = l_query, a->re = rmax[0] + re + gtle;
|
||||||
} else a->qe = l_query, a->re = s->rbeg + s->len;
|
} else a->qe = l_query, a->re = s->rbeg + s->len;
|
||||||
if (bwa_verbose >= 4) err_printf("[%d] score=%d\t[%d,%d) <=> [%ld,%ld)\n", k, a->score, a->qb, a->qe, (long)a->rb, (long)a->re);
|
if (bwa_verbose >= 4) err_printf("[%d] score=%d\t[%d,%d) <=> [%ld,%ld)\n", k, a->score, a->qb, a->qe, (long)a->rb, (long)a->re);
|
||||||
|
|
||||||
|
|
@ -521,6 +526,15 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int
|
||||||
* Basic hit->SAM conversion *
|
* Basic hit->SAM conversion *
|
||||||
*****************************/
|
*****************************/
|
||||||
|
|
||||||
|
static inline int infer_bw(int l1, int l2, int score, int a, int q, int r)
|
||||||
|
{
|
||||||
|
int w;
|
||||||
|
if (l1 == l2 && l1 * a - score < (q + r)<<1) return 0; // to get equal alignment length, we need at least two gaps
|
||||||
|
w = ((double)((l1 < l2? l1 : l2) * a - score - q) / r + 1.);
|
||||||
|
if (w < abs(l1 - l2)) w = abs(l1 - l2);
|
||||||
|
return w;
|
||||||
|
}
|
||||||
|
|
||||||
void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, const bwahit_t *p_, int is_hard, const bwahit_t *m)
|
void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, const bwahit_t *p_, int is_hard, const bwahit_t *m)
|
||||||
{
|
{
|
||||||
#define is_mapped(x) ((x)->rb >= 0 && (x)->rb < (x)->re && (x)->re <= bns->l_pac<<1)
|
#define is_mapped(x) ((x)->rb >= 0 && (x)->rb < (x)->re && (x)->re <= bns->l_pac<<1)
|
||||||
|
|
@ -547,7 +561,10 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons
|
||||||
int sam_flag = p->flag&0xff; // the flag that will be outputed to SAM; it is not always the same as p->flag
|
int sam_flag = p->flag&0xff; // the flag that will be outputed to SAM; it is not always the same as p->flag
|
||||||
if (p->flag&0x10000) sam_flag |= 0x100;
|
if (p->flag&0x10000) sam_flag |= 0x100;
|
||||||
if (!copy_mate) {
|
if (!copy_mate) {
|
||||||
cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar, &NM);
|
int w2;
|
||||||
|
w2 = infer_bw(p->qe - p->qb, p->re - p->rb, p->score, mat[0], q, r);
|
||||||
|
w2 = w2 < w? w2 : w;
|
||||||
|
cigar = bwa_gen_cigar(mat, q, r, w2, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar, &NM);
|
||||||
p->flag |= n_cigar == 0? 4 : 0; // FIXME: check why this may happen (this has already happened)
|
p->flag |= n_cigar == 0? 4 : 0; // FIXME: check why this may happen (this has already happened)
|
||||||
} else n_cigar = 0, cigar = 0;
|
} else n_cigar = 0, cigar = 0;
|
||||||
pos = bns_depos(bns, p->rb < bns->l_pac? p->rb : p->re - 1, &is_rev);
|
pos = bns_depos(bns, p->rb < bns->l_pac? p->rb : p->re - 1, &is_rev);
|
||||||
|
|
@ -674,7 +691,7 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b
|
||||||
s->sam = str.s;
|
s->sam = str.s;
|
||||||
}
|
}
|
||||||
|
|
||||||
mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq)
|
mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq)
|
||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
mem_chain_v chn;
|
mem_chain_v chn;
|
||||||
|
|
@ -698,6 +715,46 @@ mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *
|
||||||
return regs;
|
return regs;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq)
|
||||||
|
{ // the difference from mem_align1_core() lies in that this routine calls mem_mark_primary_se()
|
||||||
|
mem_alnreg_v ar;
|
||||||
|
ar = mem_align1_core(opt, bwt, bns, pac, l_seq, seq);
|
||||||
|
mem_mark_primary_se(opt, ar.n, ar.a);
|
||||||
|
return ar;
|
||||||
|
}
|
||||||
|
|
||||||
|
// This routine is only used for the API purpose
|
||||||
|
mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, uint8_t *query, const mem_alnreg_t *ar)
|
||||||
|
{
|
||||||
|
mem_aln_t a;
|
||||||
|
int w2, qb = ar->qb, qe = ar->qe, NM, score, is_rev;
|
||||||
|
int64_t pos, rb = ar->rb, re = ar->re;
|
||||||
|
memset(&a, 0, sizeof(mem_aln_t));
|
||||||
|
a.mapq = mem_approx_mapq_se(opt, ar);
|
||||||
|
bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re);
|
||||||
|
w2 = infer_bw(qe - qb, re - rb, ar->score, opt->a, opt->q, opt->r);
|
||||||
|
w2 = w2 < opt->w? w2 : opt->w;
|
||||||
|
a.cigar = bwa_gen_cigar(opt->mat, opt->q, opt->r, w2, bns->l_pac, pac, qe - qb, (uint8_t*)&query[qb], rb, re, &score, &a.n_cigar, &NM);
|
||||||
|
a.NM = NM;
|
||||||
|
pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev);
|
||||||
|
a.is_rev = is_rev;
|
||||||
|
if (qb != 0 || qe != l_query) { // add clipping to CIGAR
|
||||||
|
int clip5, clip3;
|
||||||
|
clip5 = is_rev? l_query - qe : qb;
|
||||||
|
clip3 = is_rev? qb : l_query - qe;
|
||||||
|
a.cigar = realloc(a.cigar, 4 * (a.n_cigar + 2));
|
||||||
|
if (clip5) {
|
||||||
|
memmove(a.cigar+1, a.cigar, a.n_cigar * 4);
|
||||||
|
a.cigar[0] = clip5<<4|3;
|
||||||
|
++a.n_cigar;
|
||||||
|
}
|
||||||
|
if (clip3) a.cigar[a.n_cigar++] = clip3<<4|3;
|
||||||
|
}
|
||||||
|
a.rid = bns_pos2rid(bns, pos);
|
||||||
|
a.pos = pos - bns->anns[a.rid].offset;
|
||||||
|
return a;
|
||||||
|
}
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
int start, step, n;
|
int start, step, n;
|
||||||
const mem_opt_t *opt;
|
const mem_opt_t *opt;
|
||||||
|
|
@ -715,11 +772,11 @@ static void *worker1(void *data)
|
||||||
int i;
|
int i;
|
||||||
if (!(w->opt->flag&MEM_F_PE)) {
|
if (!(w->opt->flag&MEM_F_PE)) {
|
||||||
for (i = w->start; i < w->n; i += w->step)
|
for (i = w->start; i < w->n; i += w->step)
|
||||||
w->regs[i] = mem_align1(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq);
|
w->regs[i] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq);
|
||||||
} else { // for PE we align the two ends in the same thread in case the 2nd read is of worse quality, in which case some threads may be faster/slower
|
} else { // for PE we align the two ends in the same thread in case the 2nd read is of worse quality, in which case some threads may be faster/slower
|
||||||
for (i = w->start; i < w->n>>1; i += w->step) {
|
for (i = w->start; i < w->n>>1; i += w->step) {
|
||||||
w->regs[i<<1|0] = mem_align1(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq);
|
w->regs[i<<1|0] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq);
|
||||||
w->regs[i<<1|1] = mem_align1(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq);
|
w->regs[i<<1|1] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
|
|
|
||||||
18
bwamem.h
18
bwamem.h
|
|
@ -19,7 +19,10 @@ typedef struct __smem_i smem_i;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
int a, b, q, r; // match score, mismatch penalty and gap open/extension penalty. A gap of size k costs q+k*r
|
int a, b, q, r; // match score, mismatch penalty and gap open/extension penalty. A gap of size k costs q+k*r
|
||||||
|
int pen_unpaired; // phred-scaled penalty for unpaired reads
|
||||||
|
int pen_clip; // clipping penalty. This score is not deducted from the DP score.
|
||||||
int w; // band width
|
int w; // band width
|
||||||
|
|
||||||
int flag; // see MEM_F_* macros
|
int flag; // see MEM_F_* macros
|
||||||
int min_seed_len; // minimum seed length
|
int min_seed_len; // minimum seed length
|
||||||
float split_factor; // split into a seed if MEM is longer than min_seed_len*split_factor
|
float split_factor; // split into a seed if MEM is longer than min_seed_len*split_factor
|
||||||
|
|
@ -30,7 +33,6 @@ typedef struct {
|
||||||
int chunk_size; // process chunk_size-bp sequences in a batch
|
int chunk_size; // process chunk_size-bp sequences in a batch
|
||||||
float mask_level; // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits
|
float mask_level; // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits
|
||||||
float chain_drop_ratio; // drop a chain if its seed coverage is below chain_drop_ratio times the seed coverage of a better chain overlapping with the small chain
|
float chain_drop_ratio; // drop a chain if its seed coverage is below chain_drop_ratio times the seed coverage of a better chain overlapping with the small chain
|
||||||
int pen_unpaired; // phred-scaled penalty for unpaired reads
|
|
||||||
int max_ins; // when estimating insert size distribution, skip pairs with insert longer than this value
|
int max_ins; // when estimating insert size distribution, skip pairs with insert longer than this value
|
||||||
int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end
|
int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end
|
||||||
int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset
|
int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset
|
||||||
|
|
@ -47,19 +49,27 @@ typedef struct {
|
||||||
int secondary; // index of the parent hit shadowing the current hit; <0 if primary
|
int secondary; // index of the parent hit shadowing the current hit; <0 if primary
|
||||||
} mem_alnreg_t;
|
} mem_alnreg_t;
|
||||||
|
|
||||||
|
typedef struct { size_t n, m; mem_alnreg_t *a; } mem_alnreg_v;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
int low, high, failed;
|
int low, high, failed;
|
||||||
double avg, std;
|
double avg, std;
|
||||||
} mem_pestat_t;
|
} mem_pestat_t;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct { // TODO: This is an intermediate struct only. Better get rid of it.
|
||||||
int64_t rb, re;
|
int64_t rb, re;
|
||||||
int qb, qe, flag, qual;
|
int qb, qe, flag, qual;
|
||||||
// optional info
|
// optional info
|
||||||
int score, sub;
|
int score, sub;
|
||||||
} bwahit_t;
|
} bwahit_t;
|
||||||
|
|
||||||
typedef struct { size_t n, m; mem_alnreg_t *a; } mem_alnreg_v;
|
typedef struct { // This struct is only used for the convenience of API.
|
||||||
|
int rid;
|
||||||
|
int pos;
|
||||||
|
uint32_t is_rev:1, mapq:8, NM:23;
|
||||||
|
int n_cigar;
|
||||||
|
uint32_t *cigar;
|
||||||
|
} mem_aln_t;
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
|
|
@ -112,6 +122,8 @@ extern "C" {
|
||||||
*/
|
*/
|
||||||
mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq);
|
mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq);
|
||||||
|
|
||||||
|
mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, uint8_t *query, const mem_alnreg_t *ar);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Infer the insert size distribution from interleaved alignment regions
|
* Infer the insert size distribution from interleaved alignment regions
|
||||||
*
|
*
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,60 @@
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <zlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <errno.h>
|
||||||
|
#include <assert.h>
|
||||||
|
#include "bwamem.h"
|
||||||
|
#include "kseq.h" // for the FASTA/Q parser
|
||||||
|
KSEQ_DECLARE(gzFile)
|
||||||
|
|
||||||
|
int main(int argc, char *argv[])
|
||||||
|
{
|
||||||
|
bwaidx_t *idx;
|
||||||
|
gzFile fp;
|
||||||
|
kseq_t *ks;
|
||||||
|
mem_opt_t *opt;
|
||||||
|
|
||||||
|
if (argc < 3) {
|
||||||
|
fprintf(stderr, "Usage: bwamem-lite <idx.base> <reads.fq>\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
idx = bwa_idx_load(argv[1], BWA_IDX_ALL); // load the BWA index
|
||||||
|
if (NULL == idx) {
|
||||||
|
fprintf(stderr, "Index load failed.\n");
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
fp = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r");
|
||||||
|
if (NULL == fp) {
|
||||||
|
fprintf(stderr, "Couldn't open %s : %s\n",
|
||||||
|
strcmp(argv[2], "-") ? argv[2] : "stdin",
|
||||||
|
errno ? strerror(errno) : "Out of memory");
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
ks = kseq_init(fp); // initialize the FASTA/Q parser
|
||||||
|
opt = mem_opt_init(); // initialize the BWA-MEM parameters to the default values
|
||||||
|
|
||||||
|
while (kseq_read(ks) >= 0) { // read one sequence
|
||||||
|
mem_alnreg_v ar;
|
||||||
|
int i, k;
|
||||||
|
ar = mem_align1(opt, idx->bwt, idx->bns, idx->pac, ks->seq.l, ks->seq.s); // get all the hits
|
||||||
|
for (i = 0; i < ar.n; ++i) { // traverse each hit
|
||||||
|
mem_aln_t a;
|
||||||
|
if (ar.a[i].secondary >= 0) continue; // skip secondary alignments
|
||||||
|
a = mem_reg2aln(opt, idx->bns, idx->pac, ks->seq.l, (uint8_t*)ks->seq.s, &ar.a[i]); // get forward-strand position and CIGAR
|
||||||
|
// print alignment
|
||||||
|
err_printf("%s\t%c\t%s\t%d\t%d\t", ks->name.s, "+-"[a.is_rev], idx->bns->anns[a.rid].name, a.pos, a.mapq);
|
||||||
|
for (k = 0; k < a.n_cigar; ++k) // print CIGAR
|
||||||
|
err_printf("%d%c", a.cigar[k]>>4, "MIDSH"[a.cigar[k]&0xf]);
|
||||||
|
err_printf("\t%d\n", a.NM); // print edit distance
|
||||||
|
free(a.cigar); // don't forget to deallocate CIGAR
|
||||||
|
}
|
||||||
|
free(ar.a); // and deallocate the hit list
|
||||||
|
}
|
||||||
|
|
||||||
|
free(opt);
|
||||||
|
kseq_destroy(ks);
|
||||||
|
gzclose(fp);
|
||||||
|
bwa_idx_destroy(idx);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
@ -27,13 +27,15 @@ int main_mem(int argc, char *argv[])
|
||||||
void *ko = 0, *ko2 = 0;
|
void *ko = 0, *ko2 = 0;
|
||||||
|
|
||||||
opt = mem_opt_init();
|
opt = mem_opt_init();
|
||||||
while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:w:")) >= 0) {
|
while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:")) >= 0) {
|
||||||
if (c == 'k') opt->min_seed_len = atoi(optarg);
|
if (c == 'k') opt->min_seed_len = atoi(optarg);
|
||||||
else if (c == 'w') opt->w = atoi(optarg);
|
else if (c == 'w') opt->w = atoi(optarg);
|
||||||
else if (c == 'A') opt->a = atoi(optarg);
|
else if (c == 'A') opt->a = atoi(optarg);
|
||||||
else if (c == 'B') opt->b = atoi(optarg);
|
else if (c == 'B') opt->b = atoi(optarg);
|
||||||
else if (c == 'O') opt->q = atoi(optarg);
|
else if (c == 'O') opt->q = atoi(optarg);
|
||||||
else if (c == 'E') opt->r = atoi(optarg);
|
else if (c == 'E') opt->r = atoi(optarg);
|
||||||
|
else if (c == 'L') opt->pen_clip = atoi(optarg);
|
||||||
|
else if (c == 'U') opt->pen_unpaired = atoi(optarg);
|
||||||
else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1;
|
else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1;
|
||||||
else if (c == 'P') opt->flag |= MEM_F_NOPAIRING;
|
else if (c == 'P') opt->flag |= MEM_F_NOPAIRING;
|
||||||
else if (c == 'H') opt->flag |= MEM_F_HARDCLIP;
|
else if (c == 'H') opt->flag |= MEM_F_HARDCLIP;
|
||||||
|
|
@ -57,13 +59,15 @@ int main_mem(int argc, char *argv[])
|
||||||
fprintf(stderr, " -k INT minimum seed length [%d]\n", opt->min_seed_len);
|
fprintf(stderr, " -k INT minimum seed length [%d]\n", opt->min_seed_len);
|
||||||
fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w);
|
fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w);
|
||||||
fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor);
|
fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor);
|
||||||
fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width);
|
// fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width);
|
||||||
fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ);
|
fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ);
|
||||||
fprintf(stderr, " -P skip pairing; perform mate SW only\n");
|
fprintf(stderr, " -P skip pairing; perform mate SW only\n");
|
||||||
fprintf(stderr, " -A INT score for a sequence match [%d]\n", opt->a);
|
fprintf(stderr, " -A INT score for a sequence match [%d]\n", opt->a);
|
||||||
fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b);
|
fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b);
|
||||||
fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->q);
|
fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->q);
|
||||||
fprintf(stderr, " -E INT gap extension penalty; a gap of size k cost {-O} + {-E}*k [%d]\n", opt->r);
|
fprintf(stderr, " -E INT gap extension penalty; a gap of size k cost {-O} + {-E}*k [%d]\n", opt->r);
|
||||||
|
fprintf(stderr, " -L INT penalty for clipping [%d]\n", opt->pen_clip);
|
||||||
|
fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n", opt->pen_unpaired);
|
||||||
fprintf(stderr, "\nInput/output options:\n\n");
|
fprintf(stderr, "\nInput/output options:\n\n");
|
||||||
fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n");
|
fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n");
|
||||||
fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n");
|
fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n");
|
||||||
|
|
@ -73,6 +77,7 @@ int main_mem(int argc, char *argv[])
|
||||||
fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n");
|
fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n");
|
||||||
fprintf(stderr, " -H hard clipping\n");
|
fprintf(stderr, " -H hard clipping\n");
|
||||||
fprintf(stderr, " -M mark shorter split hits as secondary (for Picard/GATK compatibility)\n");
|
fprintf(stderr, " -M mark shorter split hits as secondary (for Picard/GATK compatibility)\n");
|
||||||
|
fprintf(stderr, "\nNote: Please read the man page for detailed description of the command line and options.\n");
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
free(opt);
|
free(opt);
|
||||||
return 1;
|
return 1;
|
||||||
|
|
|
||||||
12
ksw.c
12
ksw.c
|
|
@ -360,11 +360,11 @@ typedef struct {
|
||||||
int32_t h, e;
|
int32_t h, e;
|
||||||
} eh_t;
|
} eh_t;
|
||||||
|
|
||||||
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle)
|
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore)
|
||||||
{
|
{
|
||||||
eh_t *eh; // score array
|
eh_t *eh; // score array
|
||||||
int8_t *qp; // query profile
|
int8_t *qp; // query profile
|
||||||
int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap;
|
int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap, max_ie, gscore;
|
||||||
if (h0 < 0) h0 = 0;
|
if (h0 < 0) h0 = 0;
|
||||||
// allocate memory
|
// allocate memory
|
||||||
qp = xmalloc(qlen * m);
|
qp = xmalloc(qlen * m);
|
||||||
|
|
@ -386,7 +386,7 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
|
||||||
max_gap = max_gap > 1? max_gap : 1;
|
max_gap = max_gap > 1? max_gap : 1;
|
||||||
w = w < max_gap? w : max_gap;
|
w = w < max_gap? w : max_gap;
|
||||||
// DP loop
|
// DP loop
|
||||||
max = h0, max_i = max_j = -1;
|
max = h0, max_i = max_j = -1; max_ie = -1, gscore = -1;
|
||||||
beg = 0, end = qlen;
|
beg = 0, end = qlen;
|
||||||
for (i = 0; LIKELY(i < tlen); ++i) {
|
for (i = 0; LIKELY(i < tlen); ++i) {
|
||||||
int f = 0, h1, m = 0, mj = -1;
|
int f = 0, h1, m = 0, mj = -1;
|
||||||
|
|
@ -422,6 +422,10 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
|
||||||
f = f > h? f : h; // computed F(i,j+1)
|
f = f > h? f : h; // computed F(i,j+1)
|
||||||
}
|
}
|
||||||
eh[end].h = h1; eh[end].e = 0;
|
eh[end].h = h1; eh[end].e = 0;
|
||||||
|
if (j == qlen) {
|
||||||
|
max_ie = gscore > h1? max_ie : i;
|
||||||
|
gscore = gscore > h1? gscore : h1;
|
||||||
|
}
|
||||||
if (m == 0) break;
|
if (m == 0) break;
|
||||||
if (m > max) max = m, max_i = i, max_j = mj;
|
if (m > max) max = m, max_i = i, max_j = mj;
|
||||||
// update beg and end for the next round
|
// update beg and end for the next round
|
||||||
|
|
@ -434,6 +438,8 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
|
||||||
free(eh); free(qp);
|
free(eh); free(qp);
|
||||||
if (_qle) *_qle = max_j + 1;
|
if (_qle) *_qle = max_j + 1;
|
||||||
if (_tle) *_tle = max_i + 1;
|
if (_tle) *_tle = max_i + 1;
|
||||||
|
if (_gtle) *_gtle = max_ie + 1;
|
||||||
|
if (_gscore) *_gscore = gscore;
|
||||||
return max;
|
return max;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
45
ksw.h
45
ksw.h
|
|
@ -30,7 +30,7 @@ extern "C" {
|
||||||
* @param tlen length of the target sequence
|
* @param tlen length of the target sequence
|
||||||
* @param target target sequence
|
* @param target target sequence
|
||||||
* @param m number of residue types
|
* @param m number of residue types
|
||||||
* @param mat m*m scoring matrix in one-dimention array
|
* @param mat m*m scoring matrix in one-dimension array
|
||||||
* @param gapo gap open penalty; a gap of length l cost "-(gapo+l*gape)"
|
* @param gapo gap open penalty; a gap of length l cost "-(gapo+l*gape)"
|
||||||
* @param gape gap extension penalty
|
* @param gape gap extension penalty
|
||||||
* @param xtra extra information (see below)
|
* @param xtra extra information (see below)
|
||||||
|
|
@ -62,8 +62,47 @@ extern "C" {
|
||||||
*/
|
*/
|
||||||
kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry);
|
kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry);
|
||||||
|
|
||||||
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle);
|
/**
|
||||||
int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *_n_cigar, uint32_t **_cigar);
|
* Banded global alignment
|
||||||
|
*
|
||||||
|
* @param qlen query length
|
||||||
|
* @param query query sequence with 0 <= query[i] < m
|
||||||
|
* @param tlen target length
|
||||||
|
* @param target target sequence with 0 <= target[i] < m
|
||||||
|
* @param m number of residue types
|
||||||
|
* @param mat m*m scoring mattrix in one-dimension array
|
||||||
|
* @param gapo gap open penalty; a gap of length l cost "-(gapo+l*gape)"
|
||||||
|
* @param gape gap extension penalty
|
||||||
|
* @param w band width
|
||||||
|
* @param n_cigar (out) number of CIGAR elements
|
||||||
|
* @param cigar (out) BAM-encoded CIGAR; caller need to deallocate with free()
|
||||||
|
*
|
||||||
|
* @return score of the alignment
|
||||||
|
*/
|
||||||
|
int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar, uint32_t **cigar);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extend alignment
|
||||||
|
*
|
||||||
|
* The routine aligns $query and $target, assuming their upstream sequences,
|
||||||
|
* which are not provided, have been aligned with score $h0. In return,
|
||||||
|
* region [0,*qle) on the query and [0,*tle) on the target sequences are
|
||||||
|
* aligned together. If *gscore>=0, *gscore keeps the best score such that
|
||||||
|
* the entire query sequence is aligned; *gtle keeps the position on the
|
||||||
|
* target where *gscore is achieved. Returning *gscore and *gtle helps the
|
||||||
|
* caller to decide whether an end-to-end hit or a partial hit is preferred.
|
||||||
|
*
|
||||||
|
* The first 9 parameters are identical to those in ksw_global()
|
||||||
|
*
|
||||||
|
* @param h0 alignment score of upstream sequences
|
||||||
|
* @param _qle (out) length of the query in the alignment
|
||||||
|
* @param _tle (out) length of the target in the alignment
|
||||||
|
* @param _gtle (out) length of the target if query is fully aligned
|
||||||
|
* @param _gscore (out) score of the best end-to-end alignment; negative if not found
|
||||||
|
*
|
||||||
|
* @return best semi-local alignment score
|
||||||
|
*/
|
||||||
|
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *qle, int *tle, int *gtle, int *gscore);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue