From 39a6cd5bb09add7cc09c5bd1ddf064ab749bbd35 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 11 May 2014 15:15:44 -0400 Subject: [PATCH] r762: cleanup for the new release; unfinished It will take to make the documentation ready. --- Makefile | 40 ++++++++++++++++++++++++++++++++++++++++ NEWS | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ README.md | 36 +++++++++++++++++++++++++++++++++++- bwa.1 | 12 +++++++++++- bwamem.c | 1 + bwamem.h | 1 + bwamem_extra.c | 4 ++-- fastmap.c | 1 + main.c | 2 +- 9 files changed, 141 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 339ac0a..60c2104 100644 --- a/Makefile +++ b/Makefile @@ -37,3 +37,43 @@ depend: ( LC_ALL=C ; export LC_ALL; makedepend -Y -- $(CFLAGS) $(DFLAGS) -- *.c ) # DO NOT DELETE THIS LINE -- make depend depends on it. + +QSufSort.o: QSufSort.h +bamlite.o: bamlite.h malloc_wrap.h +bntseq.o: bntseq.h utils.h kseq.h malloc_wrap.h +bwa.o: bntseq.h bwa.h bwt.h ksw.h utils.h kstring.h malloc_wrap.h kseq.h +bwamem.o: kstring.h malloc_wrap.h bwamem.h bwt.h bntseq.h bwa.h ksw.h kvec.h +bwamem.o: ksort.h utils.h kbtree.h +bwamem_extra.o: bwa.h bntseq.h bwt.h bwamem.h kstring.h malloc_wrap.h +bwamem_pair.o: kstring.h malloc_wrap.h bwamem.h bwt.h bntseq.h bwa.h kvec.h +bwamem_pair.o: utils.h ksw.h +bwape.o: bwtaln.h bwt.h kvec.h malloc_wrap.h bntseq.h utils.h bwase.h bwa.h +bwape.o: ksw.h khash.h +bwase.o: bwase.h bntseq.h bwt.h bwtaln.h utils.h kstring.h malloc_wrap.h +bwase.o: bwa.h ksw.h +bwaseqio.o: bwtaln.h bwt.h utils.h bamlite.h malloc_wrap.h kseq.h +bwt.o: utils.h bwt.h kvec.h malloc_wrap.h +bwt_gen.o: QSufSort.h malloc_wrap.h +bwt_lite.o: bwt_lite.h malloc_wrap.h +bwtaln.o: bwtaln.h bwt.h bwtgap.h utils.h bwa.h bntseq.h malloc_wrap.h +bwtgap.o: bwtgap.h bwt.h bwtaln.h malloc_wrap.h +bwtindex.o: bntseq.h bwt.h utils.h malloc_wrap.h +bwtsw2_aux.o: bntseq.h bwt_lite.h utils.h bwtsw2.h bwt.h kstring.h +bwtsw2_aux.o: malloc_wrap.h bwa.h ksw.h kseq.h ksort.h +bwtsw2_chain.o: bwtsw2.h bntseq.h bwt_lite.h bwt.h malloc_wrap.h ksort.h +bwtsw2_core.o: bwt_lite.h bwtsw2.h bntseq.h bwt.h kvec.h malloc_wrap.h +bwtsw2_core.o: khash.h ksort.h +bwtsw2_main.o: bwt.h bwtsw2.h bntseq.h bwt_lite.h utils.h bwa.h +bwtsw2_pair.o: utils.h bwt.h bntseq.h bwtsw2.h bwt_lite.h kstring.h +bwtsw2_pair.o: malloc_wrap.h ksw.h +example.o: bwamem.h bwt.h bntseq.h bwa.h kseq.h malloc_wrap.h +fastmap.o: bwa.h bntseq.h bwt.h bwamem.h kvec.h malloc_wrap.h utils.h kseq.h +is.o: malloc_wrap.h +kopen.o: malloc_wrap.h +kstring.o: kstring.h malloc_wrap.h +ksw.o: ksw.h malloc_wrap.h +main.o: kstring.h malloc_wrap.h utils.h +malloc_wrap.o: malloc_wrap.h +pemerge.o: ksw.h kseq.h malloc_wrap.h kstring.h bwa.h bntseq.h bwt.h utils.h +test-extend.o: ksw.h +utils.o: utils.h ksort.h malloc_wrap.h kseq.h diff --git a/NEWS b/NEWS index 40f4433..d09c322 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,52 @@ +Release 0.7.9 (11 May, 2014) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This release brings several major changes to BWA-MEM. Notably, BWA-MEM +now formally supports PacBio reads. It generally runs faster at a minor cost of +accuracy. The speedup is more significant when GRCh38 is in use. More +specifically: + + * Support PacBio subreads to reference alignment. Although older BWA-MEM works + with PacBio data in principle, the resultant alignments are frequently + fragmented. In this release, we fine tuned existing methods and introduced + new heuristics to improve PacBio alignment. These changes are not used by + default. Users need to add option "-x pacbio" to enable the feature. + + * Support PacBio subread-to-subread alignment. The feature is enabled with + option "-x pbread". In this mode, the output only gives the overlapping + region between a pair of reads without detailed alignment. + + * Output alternative hits in the XA tag if there are not so many such hits. + This feature is originally implemented in BWA-backtrack. + + * EXPERIMENTAL feature to support mapping to ALT contigs in GRCh38. We prodive + a script to postprocess hits in the XA tag to adjust the mapping quality and + generate new primary alignments to all overlapping ALT contigs. We would NOT + recommended this feature for production uses. + + * Improved alignments to many short reference sequences. Older BWA-MEM may + generate an alignment bridging two or more adjacent reference sequences. + Such alignments are split at a later step as postprocessing. This approach + is complex and does not always work. This release forbids these alignments + from the very beginning. BWA-MEM should not produce an alignment bridging + two or more reference sequences any more. + + * Reduced the maximum seed occurrence from 10000 to 500. Reduced the maximum + number of Smith-Waterman mate rescue from 100 to 50. Added a heuristic to + lower the mapping quality if a read contains seeds with excessive + occurrences. These changes make BWA-MEM XXX faster for mapping short reads + to GRCh37 and XXX faster to GRCh38. + + * Added an option "-Y" to use soft clipping for supplementary alignments. + + * Bugfix: incomplete alignment extension in corner cases. + + * Bugfix: integer overflow when aligning low query sequences. + +(0.7.9: 11 May 2014, r777) + + + Release 0.7.8 (31 March, 2014) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/README.md b/README.md index 3db893d..40dd079 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ BWA-backtrack, BWA-SW and BWA-MEM. The first algorithm is designed for Illumina sequence reads up to 100bp, while the rest two for longer sequences ranged from 70bp to 1Mbp. BWA-MEM and BWA-SW share similar features such as the support of long reads and chimeric alignment, but BWA-MEM, which is the latest, is -generally recommended for high-quality queries as it is faster and more +generally recommended as it is faster and more accurate. BWA-MEM also has better performance than BWA-backtrack for 70-100bp Illumina reads. @@ -59,6 +59,38 @@ do not have plan to submit it to a peer-reviewed journal in the near future. ###Frequently asked questions (FAQs) +####What types of data does BWA work with? + +BWA works with a variety types of DNA sequence data, though the optimal +algorithm and setting may vary. The following list gives the recommended +settings: + +* Illumina/454/IonTorrent single-end reads longer than ~70bp or assembly + contigs up to a few megabases: + + bwa mem ref.fa reads.fq > aln.sam + +* Illumina/454/IonTorrent paired-end reads longer than ~70bp: + + bwa mem ref.fa read1.fq read2.fq > aln-pe.sam + +* PacBio subreads to a reference genome: + + bwa mem -x pacbio ref.fa reads.fq > aln.sam + +* PacBio subreads to themselves (the output is not SAM): + + bwa mem -x pbread reads.fq reads.fq > overlap.pas + +* Illumina single-end reads no longer than ~70bp: + + bwa aln ref.fa reads.fq > reads.sai; bwa samse ref.fa reads.sai reads.fq > aln-se.sam + +* Illumina paired-end reads no longer than ~70bp: + + bwa aln ref.fa read1.fq > read1.sai; bwa aln ref.fa read2.fq > read2.sai + bwa samse ref.fa reads.sai reads.fq > aln-pe.sam + ####How to map sequences to GRCh38 with ALT contigs? BWA-backtrack and BWA-MEM partially support mapping to a reference containing @@ -91,6 +123,8 @@ highly repetitive regions as these hits will not be reported with option `-h50`. `bwa-helper.js` is a prototype implementation not recommended for production uses. + + [1]: http://en.wikipedia.org/wiki/GNU_General_Public_License [2]: https://github.com/lh3/bwa [3]: http://sourceforge.net/projects/bio-bwa/files/ diff --git a/bwa.1 b/bwa.1 index b6354e5..1a70a52 100644 --- a/bwa.1 +++ b/bwa.1 @@ -1,4 +1,4 @@ -.TH bwa 1 "31 March 2014" "bwa-0.7.8" "Bioinformatics tools" +.TH bwa 1 "11 May 2014" "bwa-0.7.9" "Bioinformatics tools" .SH NAME .PP bwa - Burrows-Wheeler Alignment Tool @@ -248,6 +248,11 @@ Don't output alignment with score lower than .IR INT . This option affects output and occasionally SAM flag 2. [30] .TP +.BI -h \ INT +If a query has not more than +.I INT +hits with score higher than 80% of the best hit, output them all in the XA tag [5] +.TP .B -a Output all found alignments for single-end or unpaired paired-end reads. These alignments will be flagged as secondary alignments. @@ -258,6 +263,11 @@ transfer read meta information (e.g. barcode) to the SAM output. Note that the FASTA/Q comment (the string after a space in the header line) must conform the SAM spec (e.g. BC:Z:CGTAC). Malformated comments lead to incorrect SAM output. .TP +.B -Y +Use soft clipping CIGAR operation for supplementary alignments. By default, BWA-MEM +uses soft clipping for the primary alignment and hard clipping for +supplementary alignments. +.TP .B -M Mark shorter split hits as secondary (for Picard compatibility). .TP diff --git a/bwamem.c b/bwamem.c index 6fd9acf..582b809 100644 --- a/bwamem.c +++ b/bwamem.c @@ -64,6 +64,7 @@ mem_opt_t *mem_opt_init() o->max_ins = 10000; o->mask_level = 0.50; o->drop_ratio = 0.50; + o->XA_drop_ratio = 0.80; o->split_factor = 1.5; o->chunk_size = 10000000; o->n_threads = 1; diff --git a/bwamem.h b/bwamem.h index 98e135d..7b14ca8 100644 --- a/bwamem.h +++ b/bwamem.h @@ -42,6 +42,7 @@ typedef struct { int chunk_size; // process chunk_size-bp sequences in a batch float mask_level; // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits float drop_ratio; // drop a chain if its seed coverage is below drop_ratio times the seed coverage of a better chain overlapping with the small chain + float XA_drop_ratio; // when counting hits for the XA tag, ignore alignments with score < XA_drop_ratio * max_score; only effective for the XA tag float mask_level_redun; float mapQ_coef_len; int mapQ_coef_fac; diff --git a/bwamem_extra.c b/bwamem_extra.c index 6b600ae..7403910 100644 --- a/bwamem_extra.c +++ b/bwamem_extra.c @@ -115,7 +115,7 @@ char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac cnt = calloc(a->n, sizeof(int)); for (i = 0, tot = 0; i < a->n; ++i) { int j = a->a[i].secondary; - if (j >= 0 && a->a[i].score >= a->a[j].score * opt->drop_ratio) + if (j >= 0 && a->a[i].score >= a->a[j].score * opt->XA_drop_ratio) ++cnt[j], ++tot; } if (tot == 0) goto end_gen_alt; @@ -123,7 +123,7 @@ char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac for (i = 0; i < a->n; ++i) { mem_aln_t t; int j = a->a[i].secondary; - if (j < 0 || a->a[i].score < a->a[j].score * opt->drop_ratio) continue; // we don't process the primary alignments as they will be converted to SAM later + if (j < 0 || a->a[i].score < a->a[j].score * opt->XA_drop_ratio) continue; // we don't process the primary alignments as they will be converted to SAM later if (cnt[j] > opt->max_hits) continue; t = mem_reg2aln(opt, bns, pac, l_query, query, &a->a[i]); kputs(bns->anns[t.rid].name, &aln[j]); diff --git a/fastmap.c b/fastmap.c index 19800de..3b782bf 100644 --- a/fastmap.c +++ b/fastmap.c @@ -159,6 +159,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -h INT if #hits < INT, output all in the XA tag [%d]\n", opt->max_hits); fprintf(stderr, " -a output all alignments for SE or unpaired PE\n"); fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n"); + fprintf(stderr, " -Y use soft clipping for supplementary alignments\n"); fprintf(stderr, " -M mark shorter split hits as secondary\n\n"); fprintf(stderr, " -I FLOAT[,FLOAT[,INT[,INT]]]\n"); fprintf(stderr, " specify the mean, standard deviation (10%% of the mean if absent), max\n"); diff --git a/main.c b/main.c index f6eab65..e006542 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.8-r760-dirty" +#define PACKAGE_VERSION "0.7.8-r762-dirty" #endif int bwa_fa2pac(int argc, char *argv[]);