diff --git a/README b/README deleted file mode 100644 index dd1d335..0000000 --- a/README +++ /dev/null @@ -1,36 +0,0 @@ -Released packages can be downloaded from SourceForge.net: - - http://sourceforge.net/projects/bio-bwa/files/ - -Introduction and FAQ are available at: - - http://bio-bwa.sourceforge.net - -Manual page at: - - http://bio-bwa.sourceforge.net/bwa.shtml - -Mailing list: - - bio-bwa-help@lists.sourceforge.net - -To sign up: - - http://sourceforge.net/mail/?group_id=276243 - -Publications (Open Access): - - http://www.ncbi.nlm.nih.gov/pubmed/20080505 - http://www.ncbi.nlm.nih.gov/pubmed/19451168 - -Incomplete list of citations (via HubMed.org): - - http://www.hubmed.org/references.cgi?uids=20080505 - http://www.hubmed.org/references.cgi?uids=19451168 - -Related projects: - - http://pbwa.sourceforge.net/ - http://www.many-core.group.cam.ac.uk/projects/lam.shtml - http://biodoop-seal.sourceforge.net/ - http://gitorious.org/bwa-cuda diff --git a/README.md b/README.md new file mode 100644 index 0000000..a46a691 --- /dev/null +++ b/README.md @@ -0,0 +1,73 @@ +### Getting started + + git clone https://github.com/lh3/bwa.git + cd bwa; make + ./bwa index ref.fa + ./bwa mem ref.fa read-se.fq.gz | gzip -3 > aln-se.sam.gz + ./bwa mem ref.fa read1.fq read2.fq | gzip -3 > aln-pe.sam.gz + +### Introduction + +BWA is a software package for mapping low-divergent sequences against a large +reference genome, such as the human genome. It consists of three algorithms: +BWA-backtrack, BWA-SW and BWA-MEM. The first algorithm is designed for Illumina +sequence reads up to 100bp, while the other two are for longer sequences ranging from +70bp to 1Mbp. BWA-MEM and BWA-SW share similar features such as the support of +long reads and chimeric alignment, but BWA-MEM, which is the latest, is +generally recommended for high-quality queries as it is faster and more +accurate. BWA-MEM also has better performance than BWA-backtrack for 70-100bp +Illumina reads. 
+ +For all the algorithms, BWA first needs to construct the FM-index for the +reference genome (the **index** command). Alignment algorithms are invoked with +different sub-commands: **aln**/**samse**/**sampe** for BWA-backtrack, +**bwasw** for BWA-SW and **mem** for the BWA-MEM algorithm. + +### Availability + +BWA is released under [GPLv3][1]. The latest source code is [freely +available][2] at GitHub. Released packages can [be downloaded][3] at +SourceForge. After you acquire the source code, simply use `make` to compile +and copy the single executable `bwa` to the destination you want. + +### Seeking help + +The detailed usage is described in the man page available together with the +source code. You can use `man ./bwa.1` to view the man page in a terminal. The +[HTML version][4] of the man page can be found at the [BWA website][5]. If you +have questions about BWA, you may [sign up for the mailing list][6] and then send +the questions to [bio-bwa-help@sourceforge.net][7]. You may also ask questions +in forums such as [BioStar][8] and [SEQanswers][9]. + +### Citing BWA + +* Li H. and Durbin R. (2009) Fast and accurate short read alignment with + Burrows-Wheeler transform. *Bioinformatics*, **25**, 1754-1760. [PMID: + [19451168][10]]. (if you use the BWA-backtrack algorithm) + +* Li H. and Durbin R. (2010) Fast and accurate long-read alignment with + Burrows-Wheeler transform. *Bioinformatics*, **26**, 589-595. [PMID: + [20080505][11]]. (if you use the BWA-SW algorithm) + +* Li H. (2013) Aligning sequence reads, clone sequences and assembly contigs + with BWA-MEM. [arXiv:1303.3997v1][12] [q-bio.GN]. (if you use the BWA-MEM + algorithm or the **fastmap** command) + +Please note that the last reference is a preprint hosted at [arXiv.org][13]. I +do not plan to submit it to a peer-reviewed journal in the near future. 
+ + + +[1]: http://en.wikipedia.org/wiki/GNU_General_Public_License +[2]: https://github.com/lh3/bwa +[3]: http://sourceforge.net/projects/bio-bwa/files/ +[4]: http://bio-bwa.sourceforge.net/bwa.shtml +[5]: http://bio-bwa.sourceforge.net/ +[6]: https://lists.sourceforge.net/lists/listinfo/bio-bwa-help +[7]: mailto:bio-bwa-help@sourceforge.net +[8]: http://biostars.org +[9]: http://seqanswers.com/ +[10]: http://www.ncbi.nlm.nih.gov/pubmed/19451168 +[11]: http://www.ncbi.nlm.nih.gov/pubmed/20080505 +[12]: http://arxiv.org/abs/1303.3997 +[13]: http://arxiv.org/ diff --git a/bwa.1 b/bwa.1 index d25ba4a..e63fe8d 100644 --- a/bwa.1 +++ b/bwa.1 @@ -1,4 +1,4 @@ -.TH bwa 1 "23 April 2013" "bwa-0.7.4" "Bioinformatics tools" +.TH bwa 1 "24 May 2013" "bwa-0.7.5" "Bioinformatics tools" .SH NAME .PP bwa - Burrows-Wheeler Alignment Tool @@ -718,12 +718,13 @@ If you use the BWA-SW algorithm, please cite: Li H. and Durbin R. (2010) Fast and accurate long-read alignment with Burrows-Wheeler transform. Bioinformatics, 26, 589-595. [PMID: 20080505] .PP -If you use the fastmap component of BWA, please cite: +If you use BWA-MEM or the fastmap component of BWA, please cite: .PP -Li H. (2012) Exploring single-sample SNP and INDEL calling with whole-genome de -novo assembly. Bioinformatics, 28, 1838-1844. [PMID: 22569178] +Li H. (2013) Aligning sequence reads, clone sequences and assembly contigs with +BWA-MEM. arXiv:1303.3997v1 [q-bio.GN]. .PP -The BWA-MEM algorithm has not been published yet. +It is likely that the BWA-MEM manuscript will not appear in a peer-reviewed +journal. .SH HISTORY BWA is largely influenced by BWT-SW. 
It uses source codes from BWT-SW diff --git a/bwase.c b/bwase.c index dcf29bf..5bb8116 100644 --- a/bwase.c +++ b/bwase.c @@ -167,20 +167,29 @@ void bwa_cal_pac_pos(const bntseq_t *bns, const char *prefix, int n_seqs, bwa_se #define SW_BW 50 -bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, ubyte_t *seq, int ref_shift, bwtint_t rb, int *n_cigar) +bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, ubyte_t *seq, int ref_shift, bwtint_t *_rb, int *n_cigar) { bwa_cigar_t *cigar = 0; uint32_t *cigar32 = 0; ubyte_t *rseq; - int64_t k, re, rlen; + int64_t k, rb, re, rlen; int8_t mat[25]; bwa_fill_scmat(1, 3, mat); - re = rb + len + ref_shift; + rb = *_rb; re = rb + len + ref_shift; assert(re <= l_pac); rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen); assert(re - rb == rlen); - ksw_global(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, n_cigar, &cigar32); // right extension + ksw_global(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, n_cigar, &cigar32); + assert(*n_cigar > 0); + if ((cigar32[*n_cigar - 1]&0xf) == 1) cigar32[*n_cigar - 1] = (cigar32[*n_cigar - 1]>>4<<4) | 4; // change endding ins to soft clipping + if ((cigar32[0]&0xf) == 1) cigar32[0] = (cigar32[0]>>4<<4) | 4; // change beginning ins to soft clipping + if ((cigar32[*n_cigar - 1]&0xf) == 2) --*n_cigar; // delete endding del + if ((cigar32[0]&0xf) == 2) { // delete beginning del + *_rb += cigar32[0]>>4; + --*n_cigar; + memmove(cigar32, cigar32+1, (*n_cigar) * 4); + } cigar = (bwa_cigar_t*)cigar32; for (k = 0; k < *n_cigar; ++k) cigar[k] = __cigar_create((cigar32[k]&0xf), (cigar32[k]>>4)); @@ -292,14 +301,14 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t bwt_multi1_t *q = s->multi + j; int n_cigar; if (q->gap) { // gapped alignment - q->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, q->strand? 
s->rseq : s->seq, q->ref_shift, q->pos, &n_cigar); + q->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, q->strand? s->rseq : s->seq, q->ref_shift, &q->pos, &n_cigar); q->n_cigar = n_cigar; if (q->cigar) s->multi[k++] = *q; } else s->multi[k++] = *q; } s->n_multi = k; // this squeezes out gapped alignments which failed the CIGAR generation if (s->type == BWA_TYPE_NO_MATCH || s->type == BWA_TYPE_MATESW || s->n_gapo == 0) continue; - s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, s->ref_shift, s->pos, &s->n_cigar); + s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, s->ref_shift, &s->pos, &s->n_cigar); if (s->cigar == 0) s->type = BWA_TYPE_NO_MATCH; } // generate MD tag