diff --git a/Makefile b/Makefile index 70b89b0..390819f 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ AOBJS= QSufSort.o bwt_gen.o stdaln.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamli is.o bwtindex.o bwape.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ bwtsw2_chain.o fastmap.o bwtsw2_pair.o -PROG= bwa +PROG= bwa bwamem-lite INCLUDES= LIBS= -lm -lz -lpthread SUBDIRS= . @@ -25,6 +25,9 @@ all:$(PROG) bwa:libbwa.a $(AOBJS) main.o $(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ $(LIBS) -L. -lbwa +bwamem-lite:libbwa.a example.o + $(CC) $(CFLAGS) $(DFLAGS) example.o -o $@ $(LIBS) -L. -lbwa + libbwa.a:$(LOBJS) $(AR) -csru $@ $(LOBJS) diff --git a/NEWS b/NEWS index d68c693..35202f1 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,56 @@ +Beta Release 0.7.0 (28 Feburary, 2013) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This release comes with a new alignment algorithm, BWA-MEM, for 70bp-1Mbp query +sequences. BWA-MEM essentially seeds alignments with a variant of the fastmap +algorithm and extends seeds with banded affine-gap-penalty dynamic programming +(i.e. the Smith-Waterman-Gotoh algorithm). For typical Illumina 100bp reads or +longer low-divergence query sequences, BWA-MEM is about twice as fast as BWA +and BWA-SW and is more accurate. It also supports split alignments like BWA-SW +and may optionally output multiple hits like BWA. BWA-MEM does not guarantee +to find hits within a certain edit distance, but BWA is not efficient for such +task given longer reads anyway, and the edit-distance criterion is arguably +not as important in long-read alignment. + +In addition to the algorithmic improvements, BWA-MEM also implements a few +handy features in practical aspects: + + 1. BWA-MEM automatically switches between local and glocal (global wrt reads; + local wrt reference) alignment. It reports the end-to-end glocal alignment + if the glocal alignment is not much worse than the optimal local alignment. + Glocal alignment reduces reference bias. + + 2. BWA-MEM automatically infers pair orientation from a batch of single-end + alignments. It allows more than one orientations if there are sufficient + supporting reads. This feature has not been tested on reads from Illumina + jumping library yet. (EXPERIMENTAL) + + 3. BWA-MEM optionally takes one interleaved fastq for paired-end mapping. It + is possible to convert a name-sorted BAM to an interleaved fastq on the fly + and feed the data stream to BWA-MEM for mapping. + + 4. BWA-MEM optionally copies FASTA/Q comments to the final SAM output, which + helps to transfer individual read annotations to the output. + + 5. BWA-MEM supports more advanced piping. Users can now run: + (bwa mem ref.fa ' aln_sa.sai +bwa mem ref.fa reads.fq > aln-se.sam .PP -bwa samse database.fasta aln_sa.sai short_read.fastq > aln.sam +bwa mem ref.fa read1.fq read2.fq > aln-pe.sam .PP -bwa sampe database.fasta aln_sa1.sai aln_sa2.sai read1.fq read2.fq > aln.sam +bwa aln ref.fa short_read.fq > aln_sa.sai .PP -bwa bwasw database.fasta long_read.fastq > aln.sam +bwa samse ref.fa aln_sa.sai short_read.fq > aln-se.sam +.PP +bwa sampe ref.fa aln_sa1.sai aln_sa2.sai read1.fq read2.fq > aln-pe.sam +.PP +bwa bwasw ref.fa long_read.fq > aln.sam .SH DESCRIPTION .PP -BWA is a fast light-weighted tool that aligns relatively short sequences -(queries) to a sequence database (targe), such as the human reference -genome. It implements two different algorithms, both based on -Burrows-Wheeler Transform (BWT). The first algorithm is designed for -short queries up to ~150bp with low error rate (<3%). It does gapped -global alignment w.r.t. queries, supports paired-end reads, and is one -of the fastest short read alignment algorithms to date while also -visiting suboptimal hits. The second algorithm, BWA-SW, is designed for -reads longer than 100bp with more errors. It performs a heuristic Smith-Waterman-like -alignment to find high-scoring local hits and split hits. On -low-error short queries, BWA-SW is a little slower and less accurate than the -first algorithm, but on long queries, it is better. -.PP -For both algorithms, the database file in the FASTA format must be -first indexed with the -.B `index' -command, which typically takes a few hours for a 3GB genome. The first algorithm is -implemented via the -.B `aln' -command, which finds the suffix array (SA) coordinates of good hits of -each individual read, and the -.B `samse/sampe' -command, which converts SA coordinates to chromosomal coordinate and -pairs reads (for `sampe'). The second algorithm is invoked by the -.B `bwasw' -command. It works for single-end reads only. +BWA is a software package for mapping low-divergent sequences against a large +reference genome, such as the human genome. It consists of three algorithms: +BWA-backtrack, BWA-SW and BWA-MEM. The first algorithm is designed for Illumina +sequence reads up to 100bp, while the rest two for longer sequences ranged from +70bp to 1Mbp. BWA-MEM and BWA-SW share similar features such as long-read +support and split alignment, but BWA-MEM, which is the latest, is generally +recommended for high-quality queries as it is faster and more accurate. +BWA-MEM also has better performance than BWA-backtrack for 70-100bp Illumina +reads. + +For all the algorithms, BWA first needs to construct the FM-index for +the reference genome (the +.B index +command). Alignment algorithms are invoked with different sub-commands: +.BR aln / samse / sampe +for BWA-backtrack, +.B bwasw +for BWA-SW and +.B mem +for the BWA-MEM algorithm. .SH COMMANDS AND OPTIONS .TP @@ -53,9 +51,6 @@ Index database sequences in the FASTA format. .B OPTIONS: .RS .TP 10 -.B -c -Build color-space index. The input fast should be in nucleotide space. (Disabled since 0.6.x) -.TP .BI -p \ STR Prefix of the output database [same as db filename] .TP @@ -76,6 +71,175 @@ genome. .RE .RE +.TP +.B mem +.B bwa mem +.RB [ -aCHMpP ] +.RB [ -t +.IR nThreads ] +.RB [ -k +.IR minSeedLen ] +.RB [ -w +.IR bandWidth ] +.RB [ -r +.IR seedSplitRatio ] +.RB [ -c +.IR maxOcc ] +.RB [ -A +.IR matchScore ] +.RB [ -B +.IR mmPenalty ] +.RB [ -O +.IR gapOpenPen ] +.RB [ -E +.IR gapExtPen ] +.RB [ -L +.IR clipPen ] +.RB [ -U +.IR unpairPen ] +.RB [ -R +.IR RGline ] +.RB [ -v +.IR verboseLevel ] +.I db.prefix +.I reads.fq +.RI [ mates.fq ] + +Align 70bp-1Mbp query sequences with the BWA-MEM algorithm. Briefly, the +algorithm works by seeding alignments with maximal exact matches (MEMs) and +then extending seeds with the affine-gap Smith-Waterman algorithm (SW). + +If +.I mates.fq +file is absent and option +.B -p +is not set, this command regards input reads are single-end. If +.I mates.fq +is present, this command assumes the +.IR i -th +read in +.I reads.fq +and the +.IR i -th +read in +.I mates.fq +constitute a read pair. If +.B -p +is used, the command assumes the +.RI 2 i -th +and the +.RI (2 i +1)-th +read in +.I reads.fq +constitute a read pair (such input file is said to be interleaved). In this case, +.I mates.fq +is ignored. In the paired-end mode, the +.B mem +command will infer the read orientation and the insert size distribution from a +batch of reads. + +The BWA-MEM algorithm performs local alignment. It may produce multiple primary +alignments for different part of a query sequence. This is a crucial feature +for long sequences. However, some tools such as Picard's markDuplicates does +not work with split alignments. One may consider to use option +.B -M +to flag shorter split hits as secondary. + +.B OPTIONS: +.RS +.TP 10 +.BI -t \ INT +Number of threads [1] +.TP +.BI -k \ INT +Minimum seed length. Matches shorter than +.I INT +will be missed. The alignment speed is usually insensitive to this value unless +it significantly deviates 20. [19] +.TP +.BI -w \ INT +Band width. Essentially, gaps longer than +.I INT +will not be found. Note that the maximum gap length is also affected by the +scoring matrix and the hit length, not solely determined by this option. [100] +.TP +.BI -r \ FLOAT +Trigger re-seeding for a MEM longer than +.IR minSeedLen * FLOAT . +This is a key heuristic parameter for tuning the performance. Larger value +yields fewer seeds, which leads to faster alignment speed but lower accuracy. [1.5] +.TP +.BI -c \ INT +Discard a MEM if it has more than +.I INT +occurence in the genome. This is an insensitive parameter. [10000] +.TP +.B -P +In the paired-end mode, perform SW to rescue missing hits only but do not try to find +hits that fit a proper pair. +.TP +.BI -A \ INT +Matching score. [1] +.TP +.BI -B \ INT +Mismatch penalty. The sequence error rate is approximately: {.75 * exp[-log(4) * B/A]}. [4] +.TP +.BI -O \ INT +Gap open penalty. [6] +.TP +.BI -E \ INT +Gap extension penalty. A gap of length k costs O + k*E (i.e. +.B -O +is for opening a zero-length gap). [1] +.TP +.BI -L \ INT +Clipping penalty. When performing SW extension, BWA-MEM keeps track of the best +score reaching the end of query. If this score is larger than the best SW score +minus the clipping penalty, clipping will not be applied. Note that in this +case, the SAM AS tag reports the best SW score; clipping penalty is not +deducted. [5] +.TP +.BI -U \ INT +Penalty for an unpaired read pair. BWA-MEM scores an unpaired read pair as +.RI scoreRead1+scoreRead2- INT +and scores a paired as scoreRead1+scoreRead2-insertPenalty. It compares these +two scores to determine whether we should force pairing. [9] +.TP +.B -p +Assume the first input query file is interleaved paired-end FASTA/Q. See the command description for details. +.TP +.BI -R \ STR +Complete read group header line. '\\t' can be used in +.I STR +and will be converted to a TAB in the output SAM. The read group ID will be +attached to every read in the output. An example is '@RG\\tID:foo\\tSM:bar'. +[null] +.TP +.B -a +Output all found alignments for single-end or unpaired paired-end reads. These +alignments will be flagged as secondary alignments. +.TP +.B -C +Append append FASTA/Q comment to SAM output. This option can be used to +transfer read meta information (e.g. barcode) to the SAM output. Note that the +FASTA/Q comment (the string after a space in the header line) must conform the SAM +spec (e.g. BC:Z:CGTAC). Malformated comments lead to incorrect SAM output. +.TP +.B -H +Use hard clipping 'H' in the SAM output. This option may dramatically reduce +the redundancy of output when mapping long contig or BAC sequences. +.TP +.B -M +Mark shorter split hits as secondary (for Picard compatibility). +.TP +.BI -v \ INT +Control the verbose level of the output. This option has not been fully +supported throughout BWA. Ideally, a value 0 for disabling all the output to +stderr; 1 for outputting errors only; 2 for warnings and errors; 3 for +all normal messages; 4 or higher for debugging. When this option takes value +4, the output is not SAM. [3] +.RE + .TP .B aln bwa aln [-n maxDiff] [-o maxGapO] [-e maxGapE] [-d nDelTail] [-i @@ -482,24 +646,6 @@ Pairing is slower for shorter reads. This is mainly because shorter reads have more spurious hits and converting SA coordinates to chromosomal coordinates are very costly. -.SH NOTES ON LONG-READ ALIGNMENT -.PP -Command -.B bwasw -is designed for long-read alignment. BWA-SW essentially aligns the trie -of the reference genome against the directed acyclic word graph (DAWG) of a -read to find seeds not highly repetitive in the genome, and then performs a -standard Smith-Waterman algorithm to extend the seeds. A key heuristic, called -the Z-best heuristic, is that at each vertex in the DAWG, BWA-SW only keeps the -top Z reference suffix intervals that match the vertex. BWA-SW is more accurate -if the resultant alignment is supported by more seeds, and therefore BWA-SW -usually performs better on long queries or queries with low divergence to the -reference genome. - -BWA-SW is perhaps a better choice than BWA-short for 100bp single-end HiSeq reads -mainly because it gives better gapped alignment. For paired-end reads, it is yet -to know whether BWA-short or BWA-SW yield overall better results. - .SH CHANGES IN BWA-0.6 .PP Since version 0.6, BWA has been able to work with a reference genome longer than 4GB. @@ -534,16 +680,23 @@ The full BWA package is distributed under GPLv3 as it uses source codes from BWT-SW which is covered by GPL. Sorting, hash table, BWT and IS libraries are distributed under the MIT license. .PP -If you use the short-read alignment component, please cite the following +If you use the BWA-backtrack algorithm, please cite the following paper: .PP Li H. and Durbin R. (2009) Fast and accurate short read alignment with Burrows-Wheeler transform. Bioinformatics, 25, 1754-1760. [PMID: 19451168] .PP -If you use the long-read component (BWA-SW), please cite: +If you use the BWA-SW algorithm, please cite: .PP Li H. and Durbin R. (2010) Fast and accurate long-read alignment with Burrows-Wheeler transform. Bioinformatics, 26, 589-595. [PMID: 20080505] +.PP +If you use the fastmap component of BWA, please cite: +.PP +Li H. (2012) Exploring single-sample SNP and INDEL calling with whole-genome de +novo assembly. Bioinformatics, 28, 1838-1844. [PMID: 22569178] +.PP +The BWA-MEM algorithm has not been published yet. .SH HISTORY BWA is largely influenced by BWT-SW. It uses source codes from BWT-SW @@ -569,3 +722,11 @@ short-read aligners are being implemented. The BWA-SW algorithm is a new component of BWA. It was conceived in November 2008 and implemented ten months later. + +The BWA-MEM algorithm is based on an algorithm finding super-maximal exact +matches (SMEMs), which was first published with the fermi assembler paper +in 2012. I first implemented the basic SMEM algorithm in the +.B fastmap +command for an experiment and then extended the basic algorithm and added the +extension part in Feburary 2013 to make BWA-MEM a fully featured mapper. + diff --git a/bwa.c b/bwa.c index 55a0a9b..b3c9191 100644 --- a/bwa.c +++ b/bwa.c @@ -74,7 +74,7 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa { uint32_t *cigar = 0; uint8_t tmp, *rseq; - int i, w; + int i; int64_t rlen; *n_cigar = 0; *NM = -1; if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand @@ -86,15 +86,26 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa for (i = 0; i < rlen>>1; ++i) tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], rseq[rlen - 1 - i] = tmp; } - //printf("[Q] "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n'); - //printf("[R] "); for (i = 0; i < re - rb; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n'); - // set the band-width - w = (int)((double)(l_query * mat[0] - q) / r + 1.); - w = w < 1? w : 1; - w = w < w_? w : w_; - w += abs(rlen - l_query); - // NW alignment - *score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar); + if (l_query == re - rb && w_ == 0) { // no gap; no need to do DP + cigar = xmalloc(4); + cigar[0] = l_query<<4 | 0; + *n_cigar = 1; + for (i = 0, *score = 0; i < l_query; ++i) + *score += mat[rseq[i]*5 + query[i]]; + } else { + int w, max_gap, min_w; + //printf("[Q] "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n'); + //printf("[R] "); for (i = 0; i < re - rb; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n'); + // set the band-width + max_gap = (int)((double)(((l_query+1)>>1) * mat[0] - q) / r + 1.); + max_gap = max_gap > 1? max_gap : 1; + w = (max_gap + abs(rlen - l_query) + 1) >> 1; + w = w < w_? w : w_; + min_w = abs(rlen - l_query) + 3; + w = w > min_w? w : min_w; + // NW alignment + *score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar); + } {// compute NM int k, x, y, n_mm = 0, n_gap = 0; for (k = 0, x = y = 0; k < *n_cigar; ++k) { diff --git a/bwamem.c b/bwamem.c index 5a8a4dc..b11a790 100644 --- a/bwamem.c +++ b/bwamem.c @@ -42,8 +42,10 @@ mem_opt_t *mem_opt_init() { mem_opt_t *o; o = xcalloc(1, sizeof(mem_opt_t)); - o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 100; o->flag = 0; + o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 100; + o->pen_unpaired = 9; + o->pen_clip = 5; o->min_seed_len = 19; o->split_width = 10; o->max_occ = 10000; @@ -54,7 +56,6 @@ mem_opt_t *mem_opt_init() o->split_factor = 1.5; o->chunk_size = 10000000; o->n_threads = 1; - o->pen_unpaired = 9; o->max_matesw = 100; mem_fill_scmat(o->a, o->b, o->mat); return o; @@ -487,23 +488,27 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int if (s->qbeg) { // left extension uint8_t *rs, *qs; - int qle, tle; + int qle, tle, gtle, gscore; qs = xmalloc(s->qbeg); for (i = 0; i < s->qbeg; ++i) qs[i] = query[s->qbeg - 1 - i]; tmp = s->rbeg - rmax[0]; rs = xmalloc(tmp); for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i]; - a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, s->len * opt->a, &qle, &tle); - a->qb = s->qbeg - qle; a->rb = s->rbeg - tle; + a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, s->len * opt->a, &qle, &tle, >le, &gscore); + // check whether we prefer to reach the end of the query + if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qb = s->qbeg - qle, a->rb = s->rbeg - tle; // local hits + else a->qb = 0, a->rb = s->rbeg - gtle; // reach the end free(qs); free(rs); } else a->score = s->len * opt->a, a->qb = 0, a->rb = s->rbeg; if (s->qbeg + s->len != l_query) { // right extension - int qle, tle, qe, re; + int qle, tle, qe, re, gtle, gscore; qe = s->qbeg + s->len; re = s->rbeg + s->len - rmax[0]; - a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, &qle, &tle); - a->qe = qe + qle; a->re = rmax[0] + re + tle; + a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, &qle, &tle, >le, &gscore); + // similar to the above + if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qe = qe + qle, a->re = rmax[0] + re + tle; + else a->qe = l_query, a->re = rmax[0] + re + gtle; } else a->qe = l_query, a->re = s->rbeg + s->len; if (bwa_verbose >= 4) err_printf("[%d] score=%d\t[%d,%d) <=> [%ld,%ld)\n", k, a->score, a->qb, a->qe, (long)a->rb, (long)a->re); @@ -521,6 +526,15 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int * Basic hit->SAM conversion * *****************************/ +static inline int infer_bw(int l1, int l2, int score, int a, int q, int r) +{ + int w; + if (l1 == l2 && l1 * a - score < (q + r)<<1) return 0; // to get equal alignment length, we need at least two gaps + w = ((double)((l1 < l2? l1 : l2) * a - score - q) / r + 1.); + if (w < abs(l1 - l2)) w = abs(l1 - l2); + return w; +} + void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, const bwahit_t *p_, int is_hard, const bwahit_t *m) { #define is_mapped(x) ((x)->rb >= 0 && (x)->rb < (x)->re && (x)->re <= bns->l_pac<<1) @@ -547,7 +561,10 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons int sam_flag = p->flag&0xff; // the flag that will be outputed to SAM; it is not always the same as p->flag if (p->flag&0x10000) sam_flag |= 0x100; if (!copy_mate) { - cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar, &NM); + int w2; + w2 = infer_bw(p->qe - p->qb, p->re - p->rb, p->score, mat[0], q, r); + w2 = w2 < w? w2 : w; + cigar = bwa_gen_cigar(mat, q, r, w2, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar, &NM); p->flag |= n_cigar == 0? 4 : 0; // FIXME: check why this may happen (this has already happened) } else n_cigar = 0, cigar = 0; pos = bns_depos(bns, p->rb < bns->l_pac? p->rb : p->re - 1, &is_rev); @@ -674,7 +691,7 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b s->sam = str.s; } -mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq) +mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq) { int i; mem_chain_v chn; @@ -698,6 +715,46 @@ mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t * return regs; } +mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq) +{ // the difference from mem_align1_core() lies in that this routine calls mem_mark_primary_se() + mem_alnreg_v ar; + ar = mem_align1_core(opt, bwt, bns, pac, l_seq, seq); + mem_mark_primary_se(opt, ar.n, ar.a); + return ar; +} + +// This routine is only used for the API purpose +mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, uint8_t *query, const mem_alnreg_t *ar) +{ + mem_aln_t a; + int w2, qb = ar->qb, qe = ar->qe, NM, score, is_rev; + int64_t pos, rb = ar->rb, re = ar->re; + memset(&a, 0, sizeof(mem_aln_t)); + a.mapq = mem_approx_mapq_se(opt, ar); + bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re); + w2 = infer_bw(qe - qb, re - rb, ar->score, opt->a, opt->q, opt->r); + w2 = w2 < opt->w? w2 : opt->w; + a.cigar = bwa_gen_cigar(opt->mat, opt->q, opt->r, w2, bns->l_pac, pac, qe - qb, (uint8_t*)&query[qb], rb, re, &score, &a.n_cigar, &NM); + a.NM = NM; + pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev); + a.is_rev = is_rev; + if (qb != 0 || qe != l_query) { // add clipping to CIGAR + int clip5, clip3; + clip5 = is_rev? l_query - qe : qb; + clip3 = is_rev? qb : l_query - qe; + a.cigar = realloc(a.cigar, 4 * (a.n_cigar + 2)); + if (clip5) { + memmove(a.cigar+1, a.cigar, a.n_cigar * 4); + a.cigar[0] = clip5<<4|3; + ++a.n_cigar; + } + if (clip3) a.cigar[a.n_cigar++] = clip3<<4|3; + } + a.rid = bns_pos2rid(bns, pos); + a.pos = pos - bns->anns[a.rid].offset; + return a; +} + typedef struct { int start, step, n; const mem_opt_t *opt; @@ -715,11 +772,11 @@ static void *worker1(void *data) int i; if (!(w->opt->flag&MEM_F_PE)) { for (i = w->start; i < w->n; i += w->step) - w->regs[i] = mem_align1(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq); + w->regs[i] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq); } else { // for PE we align the two ends in the same thread in case the 2nd read is of worse quality, in which case some threads may be faster/slower for (i = w->start; i < w->n>>1; i += w->step) { - w->regs[i<<1|0] = mem_align1(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq); - w->regs[i<<1|1] = mem_align1(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq); + w->regs[i<<1|0] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq); + w->regs[i<<1|1] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq); } } return 0; diff --git a/bwamem.h b/bwamem.h index 8a7c7b8..c2f124c 100644 --- a/bwamem.h +++ b/bwamem.h @@ -19,7 +19,10 @@ typedef struct __smem_i smem_i; typedef struct { int a, b, q, r; // match score, mismatch penalty and gap open/extension penalty. A gap of size k costs q+k*r + int pen_unpaired; // phred-scaled penalty for unpaired reads + int pen_clip; // clipping penalty. This score is not deducted from the DP score. int w; // band width + int flag; // see MEM_F_* macros int min_seed_len; // minimum seed length float split_factor; // split into a seed if MEM is longer than min_seed_len*split_factor @@ -30,7 +33,6 @@ typedef struct { int chunk_size; // process chunk_size-bp sequences in a batch float mask_level; // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits float chain_drop_ratio; // drop a chain if its seed coverage is below chain_drop_ratio times the seed coverage of a better chain overlapping with the small chain - int pen_unpaired; // phred-scaled penalty for unpaired reads int max_ins; // when estimating insert size distribution, skip pairs with insert longer than this value int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset @@ -47,19 +49,27 @@ typedef struct { int secondary; // index of the parent hit shadowing the current hit; <0 if primary } mem_alnreg_t; +typedef struct { size_t n, m; mem_alnreg_t *a; } mem_alnreg_v; + typedef struct { int low, high, failed; double avg, std; } mem_pestat_t; -typedef struct { +typedef struct { // TODO: This is an intermediate struct only. Better get rid of it. int64_t rb, re; int qb, qe, flag, qual; // optional info int score, sub; } bwahit_t; -typedef struct { size_t n, m; mem_alnreg_t *a; } mem_alnreg_v; +typedef struct { // This struct is only used for the convenience of API. + int rid; + int pos; + uint32_t is_rev:1, mapq:8, NM:23; + int n_cigar; + uint32_t *cigar; +} mem_aln_t; #ifdef __cplusplus extern "C" { @@ -112,6 +122,8 @@ extern "C" { */ mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq); + mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, uint8_t *query, const mem_alnreg_t *ar); + /** * Infer the insert size distribution from interleaved alignment regions * diff --git a/example.c b/example.c new file mode 100644 index 0000000..31bb231 --- /dev/null +++ b/example.c @@ -0,0 +1,60 @@ +#include +#include +#include +#include +#include +#include "bwamem.h" +#include "kseq.h" // for the FASTA/Q parser +KSEQ_DECLARE(gzFile) + +int main(int argc, char *argv[]) +{ + bwaidx_t *idx; + gzFile fp; + kseq_t *ks; + mem_opt_t *opt; + + if (argc < 3) { + fprintf(stderr, "Usage: bwamem-lite \n"); + return 1; + } + + idx = bwa_idx_load(argv[1], BWA_IDX_ALL); // load the BWA index + if (NULL == idx) { + fprintf(stderr, "Index load failed.\n"); + exit(EXIT_FAILURE); + } + fp = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r"); + if (NULL == fp) { + fprintf(stderr, "Couldn't open %s : %s\n", + strcmp(argv[2], "-") ? argv[2] : "stdin", + errno ? strerror(errno) : "Out of memory"); + exit(EXIT_FAILURE); + } + ks = kseq_init(fp); // initialize the FASTA/Q parser + opt = mem_opt_init(); // initialize the BWA-MEM parameters to the default values + + while (kseq_read(ks) >= 0) { // read one sequence + mem_alnreg_v ar; + int i, k; + ar = mem_align1(opt, idx->bwt, idx->bns, idx->pac, ks->seq.l, ks->seq.s); // get all the hits + for (i = 0; i < ar.n; ++i) { // traverse each hit + mem_aln_t a; + if (ar.a[i].secondary >= 0) continue; // skip secondary alignments + a = mem_reg2aln(opt, idx->bns, idx->pac, ks->seq.l, (uint8_t*)ks->seq.s, &ar.a[i]); // get forward-strand position and CIGAR + // print alignment + err_printf("%s\t%c\t%s\t%d\t%d\t", ks->name.s, "+-"[a.is_rev], idx->bns->anns[a.rid].name, a.pos, a.mapq); + for (k = 0; k < a.n_cigar; ++k) // print CIGAR + err_printf("%d%c", a.cigar[k]>>4, "MIDSH"[a.cigar[k]&0xf]); + err_printf("\t%d\n", a.NM); // print edit distance + free(a.cigar); // don't forget to deallocate CIGAR + } + free(ar.a); // and deallocate the hit list + } + + free(opt); + kseq_destroy(ks); + gzclose(fp); + bwa_idx_destroy(idx); + return 0; +} diff --git a/fastmap.c b/fastmap.c index ec517e2..0d87007 100644 --- a/fastmap.c +++ b/fastmap.c @@ -27,13 +27,15 @@ int main_mem(int argc, char *argv[]) void *ko = 0, *ko2 = 0; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:w:")) >= 0) { + while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 'w') opt->w = atoi(optarg); else if (c == 'A') opt->a = atoi(optarg); else if (c == 'B') opt->b = atoi(optarg); else if (c == 'O') opt->q = atoi(optarg); else if (c == 'E') opt->r = atoi(optarg); + else if (c == 'L') opt->pen_clip = atoi(optarg); + else if (c == 'U') opt->pen_unpaired = atoi(optarg); else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1; else if (c == 'P') opt->flag |= MEM_F_NOPAIRING; else if (c == 'H') opt->flag |= MEM_F_HARDCLIP; @@ -57,13 +59,15 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -k INT minimum seed length [%d]\n", opt->min_seed_len); fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w); fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); - fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); +// fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); fprintf(stderr, " -P skip pairing; perform mate SW only\n"); fprintf(stderr, " -A INT score for a sequence match [%d]\n", opt->a); fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b); fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->q); fprintf(stderr, " -E INT gap extension penalty; a gap of size k cost {-O} + {-E}*k [%d]\n", opt->r); + fprintf(stderr, " -L INT penalty for clipping [%d]\n", opt->pen_clip); + fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n", opt->pen_unpaired); fprintf(stderr, "\nInput/output options:\n\n"); fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n"); fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n"); @@ -73,6 +77,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n"); fprintf(stderr, " -H hard clipping\n"); fprintf(stderr, " -M mark shorter split hits as secondary (for Picard/GATK compatibility)\n"); + fprintf(stderr, "\nNote: Please read the man page for detailed description of the command line and options.\n"); fprintf(stderr, "\n"); free(opt); return 1; diff --git a/ksw.c b/ksw.c index 32b8a3a..3c279d7 100644 --- a/ksw.c +++ b/ksw.c @@ -360,11 +360,11 @@ typedef struct { int32_t h, e; } eh_t; -int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle) +int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore) { eh_t *eh; // score array int8_t *qp; // query profile - int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap; + int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap, max_ie, gscore; if (h0 < 0) h0 = 0; // allocate memory qp = xmalloc(qlen * m); @@ -386,7 +386,7 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, max_gap = max_gap > 1? max_gap : 1; w = w < max_gap? w : max_gap; // DP loop - max = h0, max_i = max_j = -1; + max = h0, max_i = max_j = -1; max_ie = -1, gscore = -1; beg = 0, end = qlen; for (i = 0; LIKELY(i < tlen); ++i) { int f = 0, h1, m = 0, mj = -1; @@ -422,6 +422,10 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, f = f > h? f : h; // computed F(i,j+1) } eh[end].h = h1; eh[end].e = 0; + if (j == qlen) { + max_ie = gscore > h1? max_ie : i; + gscore = gscore > h1? gscore : h1; + } if (m == 0) break; if (m > max) max = m, max_i = i, max_j = mj; // update beg and end for the next round @@ -434,6 +438,8 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, free(eh); free(qp); if (_qle) *_qle = max_j + 1; if (_tle) *_tle = max_i + 1; + if (_gtle) *_gtle = max_ie + 1; + if (_gscore) *_gscore = gscore; return max; } diff --git a/ksw.h b/ksw.h index 5162dc0..d2975de 100644 --- a/ksw.h +++ b/ksw.h @@ -30,7 +30,7 @@ extern "C" { * @param tlen length of the target sequence * @param target target sequence * @param m number of residue types - * @param mat m*m scoring matrix in one-dimention array + * @param mat m*m scoring matrix in one-dimension array * @param gapo gap open penalty; a gap of length l cost "-(gapo+l*gape)" * @param gape gap extension penalty * @param xtra extra information (see below) @@ -62,8 +62,47 @@ extern "C" { */ kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry); - int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle); - int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *_n_cigar, uint32_t **_cigar); + /** + * Banded global alignment + * + * @param qlen query length + * @param query query sequence with 0 <= query[i] < m + * @param tlen target length + * @param target target sequence with 0 <= target[i] < m + * @param m number of residue types + * @param mat m*m scoring mattrix in one-dimension array + * @param gapo gap open penalty; a gap of length l cost "-(gapo+l*gape)" + * @param gape gap extension penalty + * @param w band width + * @param n_cigar (out) number of CIGAR elements + * @param cigar (out) BAM-encoded CIGAR; caller need to deallocate with free() + * + * @return score of the alignment + */ + int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar, uint32_t **cigar); + + /** + * Extend alignment + * + * The routine aligns $query and $target, assuming their upstream sequences, + * which are not provided, have been aligned with score $h0. In return, + * region [0,*qle) on the query and [0,*tle) on the target sequences are + * aligned together. If *gscore>=0, *gscore keeps the best score such that + * the entire query sequence is aligned; *gtle keeps the position on the + * target where *gscore is achieved. Returning *gscore and *gtle helps the + * caller to decide whether an end-to-end hit or a partial hit is preferred. + * + * The first 9 parameters are identical to those in ksw_global() + * + * @param h0 alignment score of upstream sequences + * @param _qle (out) length of the query in the alignment + * @param _tle (out) length of the target in the alignment + * @param _gtle (out) length of the target if query is fully aligned + * @param _gscore (out) score of the best end-to-end alignment; negative if not found + * + * @return best semi-local alignment score + */ + int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *qle, int *tle, int *gtle, int *gscore); #ifdef __cplusplus } diff --git a/main.c b/main.c index 764c0d2..db25f82 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r301-beta" +#define PACKAGE_VERSION "0.7.0-r313" #endif static int usage()