Merge branch 'master' into master_fixes
Merge changes to commit c5434ac (0.7.0 release)
Conflicts:
Makefile
bwamem.c
This commit is contained in:
commit
6beab5f765
5
Makefile
5
Makefile
|
|
@ -8,7 +8,7 @@ AOBJS= QSufSort.o bwt_gen.o stdaln.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamli
|
|||
is.o bwtindex.o bwape.o \
|
||||
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
|
||||
bwtsw2_chain.o fastmap.o bwtsw2_pair.o
|
||||
PROG= bwa
|
||||
PROG= bwa bwamem-lite
|
||||
INCLUDES=
|
||||
LIBS= -lm -lz -lpthread
|
||||
SUBDIRS= .
|
||||
|
|
@ -25,6 +25,9 @@ all:$(PROG)
|
|||
bwa:libbwa.a $(AOBJS) main.o
|
||||
$(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ $(LIBS) -L. -lbwa
|
||||
|
||||
bwamem-lite:libbwa.a example.o
|
||||
$(CC) $(CFLAGS) $(DFLAGS) example.o -o $@ $(LIBS) -L. -lbwa
|
||||
|
||||
libbwa.a:$(LOBJS)
|
||||
$(AR) -csru $@ $(LOBJS)
|
||||
|
||||
|
|
|
|||
53
NEWS
53
NEWS
|
|
@ -1,3 +1,56 @@
|
|||
Beta Release 0.7.0 (28 Feburary, 2013)
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
This release comes with a new alignment algorithm, BWA-MEM, for 70bp-1Mbp query
|
||||
sequences. BWA-MEM essentially seeds alignments with a variant of the fastmap
|
||||
algorithm and extends seeds with banded affine-gap-penalty dynamic programming
|
||||
(i.e. the Smith-Waterman-Gotoh algorithm). For typical Illumina 100bp reads or
|
||||
longer low-divergence query sequences, BWA-MEM is about twice as fast as BWA
|
||||
and BWA-SW and is more accurate. It also supports split alignments like BWA-SW
|
||||
and may optionally output multiple hits like BWA. BWA-MEM does not guarantee
|
||||
to find hits within a certain edit distance, but BWA is not efficient for such
|
||||
task given longer reads anyway, and the edit-distance criterion is arguably
|
||||
not as important in long-read alignment.
|
||||
|
||||
In addition to the algorithmic improvements, BWA-MEM also implements a few
|
||||
handy features in practical aspects:
|
||||
|
||||
1. BWA-MEM automatically switches between local and glocal (global wrt reads;
|
||||
local wrt reference) alignment. It reports the end-to-end glocal alignment
|
||||
if the glocal alignment is not much worse than the optimal local alignment.
|
||||
Glocal alignment reduces reference bias.
|
||||
|
||||
2. BWA-MEM automatically infers pair orientation from a batch of single-end
|
||||
alignments. It allows more than one orientations if there are sufficient
|
||||
supporting reads. This feature has not been tested on reads from Illumina
|
||||
jumping library yet. (EXPERIMENTAL)
|
||||
|
||||
3. BWA-MEM optionally takes one interleaved fastq for paired-end mapping. It
|
||||
is possible to convert a name-sorted BAM to an interleaved fastq on the fly
|
||||
and feed the data stream to BWA-MEM for mapping.
|
||||
|
||||
4. BWA-MEM optionally copies FASTA/Q comments to the final SAM output, which
|
||||
helps to transfer individual read annotations to the output.
|
||||
|
||||
5. BWA-MEM supports more advanced piping. Users can now run:
|
||||
(bwa mem ref.fa '<bzcat r1.fq.bz2' '<bzcat r2.fq.bz2') to map bzip'd read
|
||||
files without replying on bash features.
|
||||
|
||||
6. BWA-MEM provides a few basic APIs for single-end mapping. The `example.c'
|
||||
program in the source code directory implements a full single-end mapper in
|
||||
50 lines of code.
|
||||
|
||||
The BWA-MEM algorithm is in the beta phase. It is not advised to use BWA-MEM
|
||||
for production use yet. However, when the implementation becomes stable after a
|
||||
few release cycles, existing BWA users are recommended to migrate to BWA-MEM
|
||||
for 76bp or longer Illumina reads and long query sequences. The original BWA
|
||||
short-read algorithm will not deliver satisfactory results for 150bp+ Illumina
|
||||
reads. Change of mappers will be necessary sooner or later.
|
||||
|
||||
(0.7.0 beta: 28 Feburary 2013, r313)
|
||||
|
||||
|
||||
|
||||
Release 0.6.2 (19 June, 2012)
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
|
|
|
|||
271
bwa.1
271
bwa.1
|
|
@ -1,47 +1,45 @@
|
|||
.TH bwa 1 "19 June 2012" "bwa-0.6.2" "Bioinformatics tools"
|
||||
.TH bwa 1 "27 Feburary 2013" "bwa-0.7.0" "Bioinformatics tools"
|
||||
.SH NAME
|
||||
.PP
|
||||
bwa - Burrows-Wheeler Alignment Tool
|
||||
.SH SYNOPSIS
|
||||
.PP
|
||||
bwa index -a bwtsw database.fasta
|
||||
bwa index ref.fa
|
||||
.PP
|
||||
bwa aln database.fasta short_read.fastq > aln_sa.sai
|
||||
bwa mem ref.fa reads.fq > aln-se.sam
|
||||
.PP
|
||||
bwa samse database.fasta aln_sa.sai short_read.fastq > aln.sam
|
||||
bwa mem ref.fa read1.fq read2.fq > aln-pe.sam
|
||||
.PP
|
||||
bwa sampe database.fasta aln_sa1.sai aln_sa2.sai read1.fq read2.fq > aln.sam
|
||||
bwa aln ref.fa short_read.fq > aln_sa.sai
|
||||
.PP
|
||||
bwa bwasw database.fasta long_read.fastq > aln.sam
|
||||
bwa samse ref.fa aln_sa.sai short_read.fq > aln-se.sam
|
||||
.PP
|
||||
bwa sampe ref.fa aln_sa1.sai aln_sa2.sai read1.fq read2.fq > aln-pe.sam
|
||||
.PP
|
||||
bwa bwasw ref.fa long_read.fq > aln.sam
|
||||
|
||||
.SH DESCRIPTION
|
||||
.PP
|
||||
BWA is a fast light-weighted tool that aligns relatively short sequences
|
||||
(queries) to a sequence database (targe), such as the human reference
|
||||
genome. It implements two different algorithms, both based on
|
||||
Burrows-Wheeler Transform (BWT). The first algorithm is designed for
|
||||
short queries up to ~150bp with low error rate (<3%). It does gapped
|
||||
global alignment w.r.t. queries, supports paired-end reads, and is one
|
||||
of the fastest short read alignment algorithms to date while also
|
||||
visiting suboptimal hits. The second algorithm, BWA-SW, is designed for
|
||||
reads longer than 100bp with more errors. It performs a heuristic Smith-Waterman-like
|
||||
alignment to find high-scoring local hits and split hits. On
|
||||
low-error short queries, BWA-SW is a little slower and less accurate than the
|
||||
first algorithm, but on long queries, it is better.
|
||||
.PP
|
||||
For both algorithms, the database file in the FASTA format must be
|
||||
first indexed with the
|
||||
.B `index'
|
||||
command, which typically takes a few hours for a 3GB genome. The first algorithm is
|
||||
implemented via the
|
||||
.B `aln'
|
||||
command, which finds the suffix array (SA) coordinates of good hits of
|
||||
each individual read, and the
|
||||
.B `samse/sampe'
|
||||
command, which converts SA coordinates to chromosomal coordinate and
|
||||
pairs reads (for `sampe'). The second algorithm is invoked by the
|
||||
.B `bwasw'
|
||||
command. It works for single-end reads only.
|
||||
BWA is a software package for mapping low-divergent sequences against a large
|
||||
reference genome, such as the human genome. It consists of three algorithms:
|
||||
BWA-backtrack, BWA-SW and BWA-MEM. The first algorithm is designed for Illumina
|
||||
sequence reads up to 100bp, while the rest two for longer sequences ranged from
|
||||
70bp to 1Mbp. BWA-MEM and BWA-SW share similar features such as long-read
|
||||
support and split alignment, but BWA-MEM, which is the latest, is generally
|
||||
recommended for high-quality queries as it is faster and more accurate.
|
||||
BWA-MEM also has better performance than BWA-backtrack for 70-100bp Illumina
|
||||
reads.
|
||||
|
||||
For all the algorithms, BWA first needs to construct the FM-index for
|
||||
the reference genome (the
|
||||
.B index
|
||||
command). Alignment algorithms are invoked with different sub-commands:
|
||||
.BR aln / samse / sampe
|
||||
for BWA-backtrack,
|
||||
.B bwasw
|
||||
for BWA-SW and
|
||||
.B mem
|
||||
for the BWA-MEM algorithm.
|
||||
|
||||
.SH COMMANDS AND OPTIONS
|
||||
.TP
|
||||
|
|
@ -53,9 +51,6 @@ Index database sequences in the FASTA format.
|
|||
.B OPTIONS:
|
||||
.RS
|
||||
.TP 10
|
||||
.B -c
|
||||
Build color-space index. The input fast should be in nucleotide space. (Disabled since 0.6.x)
|
||||
.TP
|
||||
.BI -p \ STR
|
||||
Prefix of the output database [same as db filename]
|
||||
.TP
|
||||
|
|
@ -76,6 +71,175 @@ genome.
|
|||
.RE
|
||||
.RE
|
||||
|
||||
.TP
|
||||
.B mem
|
||||
.B bwa mem
|
||||
.RB [ -aCHMpP ]
|
||||
.RB [ -t
|
||||
.IR nThreads ]
|
||||
.RB [ -k
|
||||
.IR minSeedLen ]
|
||||
.RB [ -w
|
||||
.IR bandWidth ]
|
||||
.RB [ -r
|
||||
.IR seedSplitRatio ]
|
||||
.RB [ -c
|
||||
.IR maxOcc ]
|
||||
.RB [ -A
|
||||
.IR matchScore ]
|
||||
.RB [ -B
|
||||
.IR mmPenalty ]
|
||||
.RB [ -O
|
||||
.IR gapOpenPen ]
|
||||
.RB [ -E
|
||||
.IR gapExtPen ]
|
||||
.RB [ -L
|
||||
.IR clipPen ]
|
||||
.RB [ -U
|
||||
.IR unpairPen ]
|
||||
.RB [ -R
|
||||
.IR RGline ]
|
||||
.RB [ -v
|
||||
.IR verboseLevel ]
|
||||
.I db.prefix
|
||||
.I reads.fq
|
||||
.RI [ mates.fq ]
|
||||
|
||||
Align 70bp-1Mbp query sequences with the BWA-MEM algorithm. Briefly, the
|
||||
algorithm works by seeding alignments with maximal exact matches (MEMs) and
|
||||
then extending seeds with the affine-gap Smith-Waterman algorithm (SW).
|
||||
|
||||
If
|
||||
.I mates.fq
|
||||
file is absent and option
|
||||
.B -p
|
||||
is not set, this command regards input reads are single-end. If
|
||||
.I mates.fq
|
||||
is present, this command assumes the
|
||||
.IR i -th
|
||||
read in
|
||||
.I reads.fq
|
||||
and the
|
||||
.IR i -th
|
||||
read in
|
||||
.I mates.fq
|
||||
constitute a read pair. If
|
||||
.B -p
|
||||
is used, the command assumes the
|
||||
.RI 2 i -th
|
||||
and the
|
||||
.RI (2 i +1)-th
|
||||
read in
|
||||
.I reads.fq
|
||||
constitute a read pair (such input file is said to be interleaved). In this case,
|
||||
.I mates.fq
|
||||
is ignored. In the paired-end mode, the
|
||||
.B mem
|
||||
command will infer the read orientation and the insert size distribution from a
|
||||
batch of reads.
|
||||
|
||||
The BWA-MEM algorithm performs local alignment. It may produce multiple primary
|
||||
alignments for different part of a query sequence. This is a crucial feature
|
||||
for long sequences. However, some tools such as Picard's markDuplicates does
|
||||
not work with split alignments. One may consider to use option
|
||||
.B -M
|
||||
to flag shorter split hits as secondary.
|
||||
|
||||
.B OPTIONS:
|
||||
.RS
|
||||
.TP 10
|
||||
.BI -t \ INT
|
||||
Number of threads [1]
|
||||
.TP
|
||||
.BI -k \ INT
|
||||
Minimum seed length. Matches shorter than
|
||||
.I INT
|
||||
will be missed. The alignment speed is usually insensitive to this value unless
|
||||
it significantly deviates 20. [19]
|
||||
.TP
|
||||
.BI -w \ INT
|
||||
Band width. Essentially, gaps longer than
|
||||
.I INT
|
||||
will not be found. Note that the maximum gap length is also affected by the
|
||||
scoring matrix and the hit length, not solely determined by this option. [100]
|
||||
.TP
|
||||
.BI -r \ FLOAT
|
||||
Trigger re-seeding for a MEM longer than
|
||||
.IR minSeedLen * FLOAT .
|
||||
This is a key heuristic parameter for tuning the performance. Larger value
|
||||
yields fewer seeds, which leads to faster alignment speed but lower accuracy. [1.5]
|
||||
.TP
|
||||
.BI -c \ INT
|
||||
Discard a MEM if it has more than
|
||||
.I INT
|
||||
occurence in the genome. This is an insensitive parameter. [10000]
|
||||
.TP
|
||||
.B -P
|
||||
In the paired-end mode, perform SW to rescue missing hits only but do not try to find
|
||||
hits that fit a proper pair.
|
||||
.TP
|
||||
.BI -A \ INT
|
||||
Matching score. [1]
|
||||
.TP
|
||||
.BI -B \ INT
|
||||
Mismatch penalty. The sequence error rate is approximately: {.75 * exp[-log(4) * B/A]}. [4]
|
||||
.TP
|
||||
.BI -O \ INT
|
||||
Gap open penalty. [6]
|
||||
.TP
|
||||
.BI -E \ INT
|
||||
Gap extension penalty. A gap of length k costs O + k*E (i.e.
|
||||
.B -O
|
||||
is for opening a zero-length gap). [1]
|
||||
.TP
|
||||
.BI -L \ INT
|
||||
Clipping penalty. When performing SW extension, BWA-MEM keeps track of the best
|
||||
score reaching the end of query. If this score is larger than the best SW score
|
||||
minus the clipping penalty, clipping will not be applied. Note that in this
|
||||
case, the SAM AS tag reports the best SW score; clipping penalty is not
|
||||
deducted. [5]
|
||||
.TP
|
||||
.BI -U \ INT
|
||||
Penalty for an unpaired read pair. BWA-MEM scores an unpaired read pair as
|
||||
.RI scoreRead1+scoreRead2- INT
|
||||
and scores a paired as scoreRead1+scoreRead2-insertPenalty. It compares these
|
||||
two scores to determine whether we should force pairing. [9]
|
||||
.TP
|
||||
.B -p
|
||||
Assume the first input query file is interleaved paired-end FASTA/Q. See the command description for details.
|
||||
.TP
|
||||
.BI -R \ STR
|
||||
Complete read group header line. '\\t' can be used in
|
||||
.I STR
|
||||
and will be converted to a TAB in the output SAM. The read group ID will be
|
||||
attached to every read in the output. An example is '@RG\\tID:foo\\tSM:bar'.
|
||||
[null]
|
||||
.TP
|
||||
.B -a
|
||||
Output all found alignments for single-end or unpaired paired-end reads. These
|
||||
alignments will be flagged as secondary alignments.
|
||||
.TP
|
||||
.B -C
|
||||
Append append FASTA/Q comment to SAM output. This option can be used to
|
||||
transfer read meta information (e.g. barcode) to the SAM output. Note that the
|
||||
FASTA/Q comment (the string after a space in the header line) must conform the SAM
|
||||
spec (e.g. BC:Z:CGTAC). Malformated comments lead to incorrect SAM output.
|
||||
.TP
|
||||
.B -H
|
||||
Use hard clipping 'H' in the SAM output. This option may dramatically reduce
|
||||
the redundancy of output when mapping long contig or BAC sequences.
|
||||
.TP
|
||||
.B -M
|
||||
Mark shorter split hits as secondary (for Picard compatibility).
|
||||
.TP
|
||||
.BI -v \ INT
|
||||
Control the verbose level of the output. This option has not been fully
|
||||
supported throughout BWA. Ideally, a value 0 for disabling all the output to
|
||||
stderr; 1 for outputting errors only; 2 for warnings and errors; 3 for
|
||||
all normal messages; 4 or higher for debugging. When this option takes value
|
||||
4, the output is not SAM. [3]
|
||||
.RE
|
||||
|
||||
.TP
|
||||
.B aln
|
||||
bwa aln [-n maxDiff] [-o maxGapO] [-e maxGapE] [-d nDelTail] [-i
|
||||
|
|
@ -482,24 +646,6 @@ Pairing is slower for shorter reads. This is mainly because shorter
|
|||
reads have more spurious hits and converting SA coordinates to
|
||||
chromosomal coordinates are very costly.
|
||||
|
||||
.SH NOTES ON LONG-READ ALIGNMENT
|
||||
.PP
|
||||
Command
|
||||
.B bwasw
|
||||
is designed for long-read alignment. BWA-SW essentially aligns the trie
|
||||
of the reference genome against the directed acyclic word graph (DAWG) of a
|
||||
read to find seeds not highly repetitive in the genome, and then performs a
|
||||
standard Smith-Waterman algorithm to extend the seeds. A key heuristic, called
|
||||
the Z-best heuristic, is that at each vertex in the DAWG, BWA-SW only keeps the
|
||||
top Z reference suffix intervals that match the vertex. BWA-SW is more accurate
|
||||
if the resultant alignment is supported by more seeds, and therefore BWA-SW
|
||||
usually performs better on long queries or queries with low divergence to the
|
||||
reference genome.
|
||||
|
||||
BWA-SW is perhaps a better choice than BWA-short for 100bp single-end HiSeq reads
|
||||
mainly because it gives better gapped alignment. For paired-end reads, it is yet
|
||||
to know whether BWA-short or BWA-SW yield overall better results.
|
||||
|
||||
.SH CHANGES IN BWA-0.6
|
||||
.PP
|
||||
Since version 0.6, BWA has been able to work with a reference genome longer than 4GB.
|
||||
|
|
@ -534,16 +680,23 @@ The full BWA package is distributed under GPLv3 as it uses source codes
|
|||
from BWT-SW which is covered by GPL. Sorting, hash table, BWT and IS
|
||||
libraries are distributed under the MIT license.
|
||||
.PP
|
||||
If you use the short-read alignment component, please cite the following
|
||||
If you use the BWA-backtrack algorithm, please cite the following
|
||||
paper:
|
||||
.PP
|
||||
Li H. and Durbin R. (2009) Fast and accurate short read alignment with
|
||||
Burrows-Wheeler transform. Bioinformatics, 25, 1754-1760. [PMID: 19451168]
|
||||
.PP
|
||||
If you use the long-read component (BWA-SW), please cite:
|
||||
If you use the BWA-SW algorithm, please cite:
|
||||
.PP
|
||||
Li H. and Durbin R. (2010) Fast and accurate long-read alignment with
|
||||
Burrows-Wheeler transform. Bioinformatics, 26, 589-595. [PMID: 20080505]
|
||||
.PP
|
||||
If you use the fastmap component of BWA, please cite:
|
||||
.PP
|
||||
Li H. (2012) Exploring single-sample SNP and INDEL calling with whole-genome de
|
||||
novo assembly. Bioinformatics, 28, 1838-1844. [PMID: 22569178]
|
||||
.PP
|
||||
The BWA-MEM algorithm has not been published yet.
|
||||
|
||||
.SH HISTORY
|
||||
BWA is largely influenced by BWT-SW. It uses source codes from BWT-SW
|
||||
|
|
@ -569,3 +722,11 @@ short-read aligners are being implemented.
|
|||
|
||||
The BWA-SW algorithm is a new component of BWA. It was conceived in
|
||||
November 2008 and implemented ten months later.
|
||||
|
||||
The BWA-MEM algorithm is based on an algorithm finding super-maximal exact
|
||||
matches (SMEMs), which was first published with the fermi assembler paper
|
||||
in 2012. I first implemented the basic SMEM algorithm in the
|
||||
.B fastmap
|
||||
command for an experiment and then extended the basic algorithm and added the
|
||||
extension part in Feburary 2013 to make BWA-MEM a fully featured mapper.
|
||||
|
||||
|
|
|
|||
31
bwa.c
31
bwa.c
|
|
@ -74,7 +74,7 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa
|
|||
{
|
||||
uint32_t *cigar = 0;
|
||||
uint8_t tmp, *rseq;
|
||||
int i, w;
|
||||
int i;
|
||||
int64_t rlen;
|
||||
*n_cigar = 0; *NM = -1;
|
||||
if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand
|
||||
|
|
@ -86,15 +86,26 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa
|
|||
for (i = 0; i < rlen>>1; ++i)
|
||||
tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], rseq[rlen - 1 - i] = tmp;
|
||||
}
|
||||
//printf("[Q] "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n');
|
||||
//printf("[R] "); for (i = 0; i < re - rb; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n');
|
||||
// set the band-width
|
||||
w = (int)((double)(l_query * mat[0] - q) / r + 1.);
|
||||
w = w < 1? w : 1;
|
||||
w = w < w_? w : w_;
|
||||
w += abs(rlen - l_query);
|
||||
// NW alignment
|
||||
*score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar);
|
||||
if (l_query == re - rb && w_ == 0) { // no gap; no need to do DP
|
||||
cigar = xmalloc(4);
|
||||
cigar[0] = l_query<<4 | 0;
|
||||
*n_cigar = 1;
|
||||
for (i = 0, *score = 0; i < l_query; ++i)
|
||||
*score += mat[rseq[i]*5 + query[i]];
|
||||
} else {
|
||||
int w, max_gap, min_w;
|
||||
//printf("[Q] "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n');
|
||||
//printf("[R] "); for (i = 0; i < re - rb; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n');
|
||||
// set the band-width
|
||||
max_gap = (int)((double)(((l_query+1)>>1) * mat[0] - q) / r + 1.);
|
||||
max_gap = max_gap > 1? max_gap : 1;
|
||||
w = (max_gap + abs(rlen - l_query) + 1) >> 1;
|
||||
w = w < w_? w : w_;
|
||||
min_w = abs(rlen - l_query) + 3;
|
||||
w = w > min_w? w : min_w;
|
||||
// NW alignment
|
||||
*score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar);
|
||||
}
|
||||
{// compute NM
|
||||
int k, x, y, n_mm = 0, n_gap = 0;
|
||||
for (k = 0, x = y = 0; k < *n_cigar; ++k) {
|
||||
|
|
|
|||
83
bwamem.c
83
bwamem.c
|
|
@ -42,8 +42,10 @@ mem_opt_t *mem_opt_init()
|
|||
{
|
||||
mem_opt_t *o;
|
||||
o = xcalloc(1, sizeof(mem_opt_t));
|
||||
o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 100;
|
||||
o->flag = 0;
|
||||
o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 100;
|
||||
o->pen_unpaired = 9;
|
||||
o->pen_clip = 5;
|
||||
o->min_seed_len = 19;
|
||||
o->split_width = 10;
|
||||
o->max_occ = 10000;
|
||||
|
|
@ -54,7 +56,6 @@ mem_opt_t *mem_opt_init()
|
|||
o->split_factor = 1.5;
|
||||
o->chunk_size = 10000000;
|
||||
o->n_threads = 1;
|
||||
o->pen_unpaired = 9;
|
||||
o->max_matesw = 100;
|
||||
mem_fill_scmat(o->a, o->b, o->mat);
|
||||
return o;
|
||||
|
|
@ -487,23 +488,27 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int
|
|||
|
||||
if (s->qbeg) { // left extension
|
||||
uint8_t *rs, *qs;
|
||||
int qle, tle;
|
||||
int qle, tle, gtle, gscore;
|
||||
qs = xmalloc(s->qbeg);
|
||||
for (i = 0; i < s->qbeg; ++i) qs[i] = query[s->qbeg - 1 - i];
|
||||
tmp = s->rbeg - rmax[0];
|
||||
rs = xmalloc(tmp);
|
||||
for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i];
|
||||
a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, s->len * opt->a, &qle, &tle);
|
||||
a->qb = s->qbeg - qle; a->rb = s->rbeg - tle;
|
||||
a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, s->len * opt->a, &qle, &tle, >le, &gscore);
|
||||
// check whether we prefer to reach the end of the query
|
||||
if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qb = s->qbeg - qle, a->rb = s->rbeg - tle; // local hits
|
||||
else a->qb = 0, a->rb = s->rbeg - gtle; // reach the end
|
||||
free(qs); free(rs);
|
||||
} else a->score = s->len * opt->a, a->qb = 0, a->rb = s->rbeg;
|
||||
|
||||
if (s->qbeg + s->len != l_query) { // right extension
|
||||
int qle, tle, qe, re;
|
||||
int qle, tle, qe, re, gtle, gscore;
|
||||
qe = s->qbeg + s->len;
|
||||
re = s->rbeg + s->len - rmax[0];
|
||||
a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, &qle, &tle);
|
||||
a->qe = qe + qle; a->re = rmax[0] + re + tle;
|
||||
a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, &qle, &tle, >le, &gscore);
|
||||
// similar to the above
|
||||
if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qe = qe + qle, a->re = rmax[0] + re + tle;
|
||||
else a->qe = l_query, a->re = rmax[0] + re + gtle;
|
||||
} else a->qe = l_query, a->re = s->rbeg + s->len;
|
||||
if (bwa_verbose >= 4) err_printf("[%d] score=%d\t[%d,%d) <=> [%ld,%ld)\n", k, a->score, a->qb, a->qe, (long)a->rb, (long)a->re);
|
||||
|
||||
|
|
@ -521,6 +526,15 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int
|
|||
* Basic hit->SAM conversion *
|
||||
*****************************/
|
||||
|
||||
static inline int infer_bw(int l1, int l2, int score, int a, int q, int r)
|
||||
{
|
||||
int w;
|
||||
if (l1 == l2 && l1 * a - score < (q + r)<<1) return 0; // to get equal alignment length, we need at least two gaps
|
||||
w = ((double)((l1 < l2? l1 : l2) * a - score - q) / r + 1.);
|
||||
if (w < abs(l1 - l2)) w = abs(l1 - l2);
|
||||
return w;
|
||||
}
|
||||
|
||||
void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, const bwahit_t *p_, int is_hard, const bwahit_t *m)
|
||||
{
|
||||
#define is_mapped(x) ((x)->rb >= 0 && (x)->rb < (x)->re && (x)->re <= bns->l_pac<<1)
|
||||
|
|
@ -547,7 +561,10 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons
|
|||
int sam_flag = p->flag&0xff; // the flag that will be outputed to SAM; it is not always the same as p->flag
|
||||
if (p->flag&0x10000) sam_flag |= 0x100;
|
||||
if (!copy_mate) {
|
||||
cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar, &NM);
|
||||
int w2;
|
||||
w2 = infer_bw(p->qe - p->qb, p->re - p->rb, p->score, mat[0], q, r);
|
||||
w2 = w2 < w? w2 : w;
|
||||
cigar = bwa_gen_cigar(mat, q, r, w2, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar, &NM);
|
||||
p->flag |= n_cigar == 0? 4 : 0; // FIXME: check why this may happen (this has already happened)
|
||||
} else n_cigar = 0, cigar = 0;
|
||||
pos = bns_depos(bns, p->rb < bns->l_pac? p->rb : p->re - 1, &is_rev);
|
||||
|
|
@ -674,7 +691,7 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b
|
|||
s->sam = str.s;
|
||||
}
|
||||
|
||||
mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq)
|
||||
mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq)
|
||||
{
|
||||
int i;
|
||||
mem_chain_v chn;
|
||||
|
|
@ -698,6 +715,46 @@ mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *
|
|||
return regs;
|
||||
}
|
||||
|
||||
mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq)
|
||||
{ // the difference from mem_align1_core() lies in that this routine calls mem_mark_primary_se()
|
||||
mem_alnreg_v ar;
|
||||
ar = mem_align1_core(opt, bwt, bns, pac, l_seq, seq);
|
||||
mem_mark_primary_se(opt, ar.n, ar.a);
|
||||
return ar;
|
||||
}
|
||||
|
||||
// This routine is only used for the API purpose
|
||||
mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, uint8_t *query, const mem_alnreg_t *ar)
|
||||
{
|
||||
mem_aln_t a;
|
||||
int w2, qb = ar->qb, qe = ar->qe, NM, score, is_rev;
|
||||
int64_t pos, rb = ar->rb, re = ar->re;
|
||||
memset(&a, 0, sizeof(mem_aln_t));
|
||||
a.mapq = mem_approx_mapq_se(opt, ar);
|
||||
bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re);
|
||||
w2 = infer_bw(qe - qb, re - rb, ar->score, opt->a, opt->q, opt->r);
|
||||
w2 = w2 < opt->w? w2 : opt->w;
|
||||
a.cigar = bwa_gen_cigar(opt->mat, opt->q, opt->r, w2, bns->l_pac, pac, qe - qb, (uint8_t*)&query[qb], rb, re, &score, &a.n_cigar, &NM);
|
||||
a.NM = NM;
|
||||
pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev);
|
||||
a.is_rev = is_rev;
|
||||
if (qb != 0 || qe != l_query) { // add clipping to CIGAR
|
||||
int clip5, clip3;
|
||||
clip5 = is_rev? l_query - qe : qb;
|
||||
clip3 = is_rev? qb : l_query - qe;
|
||||
a.cigar = realloc(a.cigar, 4 * (a.n_cigar + 2));
|
||||
if (clip5) {
|
||||
memmove(a.cigar+1, a.cigar, a.n_cigar * 4);
|
||||
a.cigar[0] = clip5<<4|3;
|
||||
++a.n_cigar;
|
||||
}
|
||||
if (clip3) a.cigar[a.n_cigar++] = clip3<<4|3;
|
||||
}
|
||||
a.rid = bns_pos2rid(bns, pos);
|
||||
a.pos = pos - bns->anns[a.rid].offset;
|
||||
return a;
|
||||
}
|
||||
|
||||
typedef struct {
|
||||
int start, step, n;
|
||||
const mem_opt_t *opt;
|
||||
|
|
@ -715,11 +772,11 @@ static void *worker1(void *data)
|
|||
int i;
|
||||
if (!(w->opt->flag&MEM_F_PE)) {
|
||||
for (i = w->start; i < w->n; i += w->step)
|
||||
w->regs[i] = mem_align1(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq);
|
||||
w->regs[i] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq);
|
||||
} else { // for PE we align the two ends in the same thread in case the 2nd read is of worse quality, in which case some threads may be faster/slower
|
||||
for (i = w->start; i < w->n>>1; i += w->step) {
|
||||
w->regs[i<<1|0] = mem_align1(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq);
|
||||
w->regs[i<<1|1] = mem_align1(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq);
|
||||
w->regs[i<<1|0] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq);
|
||||
w->regs[i<<1|1] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq);
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
|
|
|
|||
18
bwamem.h
18
bwamem.h
|
|
@ -19,7 +19,10 @@ typedef struct __smem_i smem_i;
|
|||
|
||||
typedef struct {
|
||||
int a, b, q, r; // match score, mismatch penalty and gap open/extension penalty. A gap of size k costs q+k*r
|
||||
int pen_unpaired; // phred-scaled penalty for unpaired reads
|
||||
int pen_clip; // clipping penalty. This score is not deducted from the DP score.
|
||||
int w; // band width
|
||||
|
||||
int flag; // see MEM_F_* macros
|
||||
int min_seed_len; // minimum seed length
|
||||
float split_factor; // split into a seed if MEM is longer than min_seed_len*split_factor
|
||||
|
|
@ -30,7 +33,6 @@ typedef struct {
|
|||
int chunk_size; // process chunk_size-bp sequences in a batch
|
||||
float mask_level; // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits
|
||||
float chain_drop_ratio; // drop a chain if its seed coverage is below chain_drop_ratio times the seed coverage of a better chain overlapping with the small chain
|
||||
int pen_unpaired; // phred-scaled penalty for unpaired reads
|
||||
int max_ins; // when estimating insert size distribution, skip pairs with insert longer than this value
|
||||
int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end
|
||||
int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset
|
||||
|
|
@ -47,19 +49,27 @@ typedef struct {
|
|||
int secondary; // index of the parent hit shadowing the current hit; <0 if primary
|
||||
} mem_alnreg_t;
|
||||
|
||||
typedef struct { size_t n, m; mem_alnreg_t *a; } mem_alnreg_v;
|
||||
|
||||
typedef struct {
|
||||
int low, high, failed;
|
||||
double avg, std;
|
||||
} mem_pestat_t;
|
||||
|
||||
typedef struct {
|
||||
typedef struct { // TODO: This is an intermediate struct only. Better get rid of it.
|
||||
int64_t rb, re;
|
||||
int qb, qe, flag, qual;
|
||||
// optional info
|
||||
int score, sub;
|
||||
} bwahit_t;
|
||||
|
||||
typedef struct { size_t n, m; mem_alnreg_t *a; } mem_alnreg_v;
|
||||
typedef struct { // This struct is only used for the convenience of API.
|
||||
int rid;
|
||||
int pos;
|
||||
uint32_t is_rev:1, mapq:8, NM:23;
|
||||
int n_cigar;
|
||||
uint32_t *cigar;
|
||||
} mem_aln_t;
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
|
|
@ -112,6 +122,8 @@ extern "C" {
|
|||
*/
|
||||
mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq);
|
||||
|
||||
mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, uint8_t *query, const mem_alnreg_t *ar);
|
||||
|
||||
/**
|
||||
* Infer the insert size distribution from interleaved alignment regions
|
||||
*
|
||||
|
|
|
|||
|
|
@ -0,0 +1,60 @@
|
|||
#include <stdio.h>
|
||||
#include <zlib.h>
|
||||
#include <string.h>
|
||||
#include <errno.h>
|
||||
#include <assert.h>
|
||||
#include "bwamem.h"
|
||||
#include "kseq.h" // for the FASTA/Q parser
|
||||
KSEQ_DECLARE(gzFile)
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
bwaidx_t *idx;
|
||||
gzFile fp;
|
||||
kseq_t *ks;
|
||||
mem_opt_t *opt;
|
||||
|
||||
if (argc < 3) {
|
||||
fprintf(stderr, "Usage: bwamem-lite <idx.base> <reads.fq>\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
idx = bwa_idx_load(argv[1], BWA_IDX_ALL); // load the BWA index
|
||||
if (NULL == idx) {
|
||||
fprintf(stderr, "Index load failed.\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
fp = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r");
|
||||
if (NULL == fp) {
|
||||
fprintf(stderr, "Couldn't open %s : %s\n",
|
||||
strcmp(argv[2], "-") ? argv[2] : "stdin",
|
||||
errno ? strerror(errno) : "Out of memory");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
ks = kseq_init(fp); // initialize the FASTA/Q parser
|
||||
opt = mem_opt_init(); // initialize the BWA-MEM parameters to the default values
|
||||
|
||||
while (kseq_read(ks) >= 0) { // read one sequence
|
||||
mem_alnreg_v ar;
|
||||
int i, k;
|
||||
ar = mem_align1(opt, idx->bwt, idx->bns, idx->pac, ks->seq.l, ks->seq.s); // get all the hits
|
||||
for (i = 0; i < ar.n; ++i) { // traverse each hit
|
||||
mem_aln_t a;
|
||||
if (ar.a[i].secondary >= 0) continue; // skip secondary alignments
|
||||
a = mem_reg2aln(opt, idx->bns, idx->pac, ks->seq.l, (uint8_t*)ks->seq.s, &ar.a[i]); // get forward-strand position and CIGAR
|
||||
// print alignment
|
||||
err_printf("%s\t%c\t%s\t%d\t%d\t", ks->name.s, "+-"[a.is_rev], idx->bns->anns[a.rid].name, a.pos, a.mapq);
|
||||
for (k = 0; k < a.n_cigar; ++k) // print CIGAR
|
||||
err_printf("%d%c", a.cigar[k]>>4, "MIDSH"[a.cigar[k]&0xf]);
|
||||
err_printf("\t%d\n", a.NM); // print edit distance
|
||||
free(a.cigar); // don't forget to deallocate CIGAR
|
||||
}
|
||||
free(ar.a); // and deallocate the hit list
|
||||
}
|
||||
|
||||
free(opt);
|
||||
kseq_destroy(ks);
|
||||
gzclose(fp);
|
||||
bwa_idx_destroy(idx);
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -27,13 +27,15 @@ int main_mem(int argc, char *argv[])
|
|||
void *ko = 0, *ko2 = 0;
|
||||
|
||||
opt = mem_opt_init();
|
||||
while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:w:")) >= 0) {
|
||||
while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:")) >= 0) {
|
||||
if (c == 'k') opt->min_seed_len = atoi(optarg);
|
||||
else if (c == 'w') opt->w = atoi(optarg);
|
||||
else if (c == 'A') opt->a = atoi(optarg);
|
||||
else if (c == 'B') opt->b = atoi(optarg);
|
||||
else if (c == 'O') opt->q = atoi(optarg);
|
||||
else if (c == 'E') opt->r = atoi(optarg);
|
||||
else if (c == 'L') opt->pen_clip = atoi(optarg);
|
||||
else if (c == 'U') opt->pen_unpaired = atoi(optarg);
|
||||
else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1;
|
||||
else if (c == 'P') opt->flag |= MEM_F_NOPAIRING;
|
||||
else if (c == 'H') opt->flag |= MEM_F_HARDCLIP;
|
||||
|
|
@ -57,13 +59,15 @@ int main_mem(int argc, char *argv[])
|
|||
fprintf(stderr, " -k INT minimum seed length [%d]\n", opt->min_seed_len);
|
||||
fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w);
|
||||
fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor);
|
||||
fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width);
|
||||
// fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width);
|
||||
fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ);
|
||||
fprintf(stderr, " -P skip pairing; perform mate SW only\n");
|
||||
fprintf(stderr, " -A INT score for a sequence match [%d]\n", opt->a);
|
||||
fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b);
|
||||
fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->q);
|
||||
fprintf(stderr, " -E INT gap extension penalty; a gap of size k cost {-O} + {-E}*k [%d]\n", opt->r);
|
||||
fprintf(stderr, " -L INT penalty for clipping [%d]\n", opt->pen_clip);
|
||||
fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n", opt->pen_unpaired);
|
||||
fprintf(stderr, "\nInput/output options:\n\n");
|
||||
fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n");
|
||||
fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n");
|
||||
|
|
@ -73,6 +77,7 @@ int main_mem(int argc, char *argv[])
|
|||
fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n");
|
||||
fprintf(stderr, " -H hard clipping\n");
|
||||
fprintf(stderr, " -M mark shorter split hits as secondary (for Picard/GATK compatibility)\n");
|
||||
fprintf(stderr, "\nNote: Please read the man page for detailed description of the command line and options.\n");
|
||||
fprintf(stderr, "\n");
|
||||
free(opt);
|
||||
return 1;
|
||||
|
|
|
|||
12
ksw.c
12
ksw.c
|
|
@ -360,11 +360,11 @@ typedef struct {
|
|||
int32_t h, e;
|
||||
} eh_t;
|
||||
|
||||
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle)
|
||||
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore)
|
||||
{
|
||||
eh_t *eh; // score array
|
||||
int8_t *qp; // query profile
|
||||
int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap;
|
||||
int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap, max_ie, gscore;
|
||||
if (h0 < 0) h0 = 0;
|
||||
// allocate memory
|
||||
qp = xmalloc(qlen * m);
|
||||
|
|
@ -386,7 +386,7 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
|
|||
max_gap = max_gap > 1? max_gap : 1;
|
||||
w = w < max_gap? w : max_gap;
|
||||
// DP loop
|
||||
max = h0, max_i = max_j = -1;
|
||||
max = h0, max_i = max_j = -1; max_ie = -1, gscore = -1;
|
||||
beg = 0, end = qlen;
|
||||
for (i = 0; LIKELY(i < tlen); ++i) {
|
||||
int f = 0, h1, m = 0, mj = -1;
|
||||
|
|
@ -422,6 +422,10 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
|
|||
f = f > h? f : h; // computed F(i,j+1)
|
||||
}
|
||||
eh[end].h = h1; eh[end].e = 0;
|
||||
if (j == qlen) {
|
||||
max_ie = gscore > h1? max_ie : i;
|
||||
gscore = gscore > h1? gscore : h1;
|
||||
}
|
||||
if (m == 0) break;
|
||||
if (m > max) max = m, max_i = i, max_j = mj;
|
||||
// update beg and end for the next round
|
||||
|
|
@ -434,6 +438,8 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
|
|||
free(eh); free(qp);
|
||||
if (_qle) *_qle = max_j + 1;
|
||||
if (_tle) *_tle = max_i + 1;
|
||||
if (_gtle) *_gtle = max_ie + 1;
|
||||
if (_gscore) *_gscore = gscore;
|
||||
return max;
|
||||
}
|
||||
|
||||
|
|
|
|||
45
ksw.h
45
ksw.h
|
|
@ -30,7 +30,7 @@ extern "C" {
|
|||
* @param tlen length of the target sequence
|
||||
* @param target target sequence
|
||||
* @param m number of residue types
|
||||
* @param mat m*m scoring matrix in one-dimention array
|
||||
* @param mat m*m scoring matrix in one-dimension array
|
||||
* @param gapo gap open penalty; a gap of length l cost "-(gapo+l*gape)"
|
||||
* @param gape gap extension penalty
|
||||
* @param xtra extra information (see below)
|
||||
|
|
@ -62,8 +62,47 @@ extern "C" {
|
|||
*/
|
||||
kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry);
|
||||
|
||||
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle);
|
||||
int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *_n_cigar, uint32_t **_cigar);
|
||||
/**
|
||||
* Banded global alignment
|
||||
*
|
||||
* @param qlen query length
|
||||
* @param query query sequence with 0 <= query[i] < m
|
||||
* @param tlen target length
|
||||
* @param target target sequence with 0 <= target[i] < m
|
||||
* @param m number of residue types
|
||||
* @param mat m*m scoring mattrix in one-dimension array
|
||||
* @param gapo gap open penalty; a gap of length l cost "-(gapo+l*gape)"
|
||||
* @param gape gap extension penalty
|
||||
* @param w band width
|
||||
* @param n_cigar (out) number of CIGAR elements
|
||||
* @param cigar (out) BAM-encoded CIGAR; caller need to deallocate with free()
|
||||
*
|
||||
* @return score of the alignment
|
||||
*/
|
||||
int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar, uint32_t **cigar);
|
||||
|
||||
/**
|
||||
* Extend alignment
|
||||
*
|
||||
* The routine aligns $query and $target, assuming their upstream sequences,
|
||||
* which are not provided, have been aligned with score $h0. In return,
|
||||
* region [0,*qle) on the query and [0,*tle) on the target sequences are
|
||||
* aligned together. If *gscore>=0, *gscore keeps the best score such that
|
||||
* the entire query sequence is aligned; *gtle keeps the position on the
|
||||
* target where *gscore is achieved. Returning *gscore and *gtle helps the
|
||||
* caller to decide whether an end-to-end hit or a partial hit is preferred.
|
||||
*
|
||||
* The first 9 parameters are identical to those in ksw_global()
|
||||
*
|
||||
* @param h0 alignment score of upstream sequences
|
||||
* @param _qle (out) length of the query in the alignment
|
||||
* @param _tle (out) length of the target in the alignment
|
||||
* @param _gtle (out) length of the target if query is fully aligned
|
||||
* @param _gscore (out) score of the best end-to-end alignment; negative if not found
|
||||
*
|
||||
* @return best semi-local alignment score
|
||||
*/
|
||||
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *qle, int *tle, int *gtle, int *gscore);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue