r762: cleanup for the new release; unfinished
It will take to make the documentation ready.
This commit is contained in:
parent
cfe6996173
commit
39a6cd5bb0
40
Makefile
40
Makefile
|
|
@ -37,3 +37,43 @@ depend:
|
|||
( LC_ALL=C ; export LC_ALL; makedepend -Y -- $(CFLAGS) $(DFLAGS) -- *.c )
|
||||
|
||||
# DO NOT DELETE THIS LINE -- make depend depends on it.
|
||||
|
||||
QSufSort.o: QSufSort.h
|
||||
bamlite.o: bamlite.h malloc_wrap.h
|
||||
bntseq.o: bntseq.h utils.h kseq.h malloc_wrap.h
|
||||
bwa.o: bntseq.h bwa.h bwt.h ksw.h utils.h kstring.h malloc_wrap.h kseq.h
|
||||
bwamem.o: kstring.h malloc_wrap.h bwamem.h bwt.h bntseq.h bwa.h ksw.h kvec.h
|
||||
bwamem.o: ksort.h utils.h kbtree.h
|
||||
bwamem_extra.o: bwa.h bntseq.h bwt.h bwamem.h kstring.h malloc_wrap.h
|
||||
bwamem_pair.o: kstring.h malloc_wrap.h bwamem.h bwt.h bntseq.h bwa.h kvec.h
|
||||
bwamem_pair.o: utils.h ksw.h
|
||||
bwape.o: bwtaln.h bwt.h kvec.h malloc_wrap.h bntseq.h utils.h bwase.h bwa.h
|
||||
bwape.o: ksw.h khash.h
|
||||
bwase.o: bwase.h bntseq.h bwt.h bwtaln.h utils.h kstring.h malloc_wrap.h
|
||||
bwase.o: bwa.h ksw.h
|
||||
bwaseqio.o: bwtaln.h bwt.h utils.h bamlite.h malloc_wrap.h kseq.h
|
||||
bwt.o: utils.h bwt.h kvec.h malloc_wrap.h
|
||||
bwt_gen.o: QSufSort.h malloc_wrap.h
|
||||
bwt_lite.o: bwt_lite.h malloc_wrap.h
|
||||
bwtaln.o: bwtaln.h bwt.h bwtgap.h utils.h bwa.h bntseq.h malloc_wrap.h
|
||||
bwtgap.o: bwtgap.h bwt.h bwtaln.h malloc_wrap.h
|
||||
bwtindex.o: bntseq.h bwt.h utils.h malloc_wrap.h
|
||||
bwtsw2_aux.o: bntseq.h bwt_lite.h utils.h bwtsw2.h bwt.h kstring.h
|
||||
bwtsw2_aux.o: malloc_wrap.h bwa.h ksw.h kseq.h ksort.h
|
||||
bwtsw2_chain.o: bwtsw2.h bntseq.h bwt_lite.h bwt.h malloc_wrap.h ksort.h
|
||||
bwtsw2_core.o: bwt_lite.h bwtsw2.h bntseq.h bwt.h kvec.h malloc_wrap.h
|
||||
bwtsw2_core.o: khash.h ksort.h
|
||||
bwtsw2_main.o: bwt.h bwtsw2.h bntseq.h bwt_lite.h utils.h bwa.h
|
||||
bwtsw2_pair.o: utils.h bwt.h bntseq.h bwtsw2.h bwt_lite.h kstring.h
|
||||
bwtsw2_pair.o: malloc_wrap.h ksw.h
|
||||
example.o: bwamem.h bwt.h bntseq.h bwa.h kseq.h malloc_wrap.h
|
||||
fastmap.o: bwa.h bntseq.h bwt.h bwamem.h kvec.h malloc_wrap.h utils.h kseq.h
|
||||
is.o: malloc_wrap.h
|
||||
kopen.o: malloc_wrap.h
|
||||
kstring.o: kstring.h malloc_wrap.h
|
||||
ksw.o: ksw.h malloc_wrap.h
|
||||
main.o: kstring.h malloc_wrap.h utils.h
|
||||
malloc_wrap.o: malloc_wrap.h
|
||||
pemerge.o: ksw.h kseq.h malloc_wrap.h kstring.h bwa.h bntseq.h bwt.h utils.h
|
||||
test-extend.o: ksw.h
|
||||
utils.o: utils.h ksort.h malloc_wrap.h kseq.h
|
||||
|
|
|
|||
49
NEWS
49
NEWS
|
|
@ -1,3 +1,52 @@
|
|||
Release 0.7.9 (11 May, 2014)
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
This release brings several major changes to BWA-MEM. Notably, BWA-MEM
|
||||
now formally supports PacBio reads. It generally runs faster at a minor cost of
|
||||
accuracy. The speedup is more significant when GRCh38 is in use. More
|
||||
specifically:
|
||||
|
||||
* Support PacBio subreads to reference alignment. Although older BWA-MEM works
|
||||
with PacBio data in principle, the resultant alignments are frequently
|
||||
fragmented. In this release, we fine tuned existing methods and introduced
|
||||
new heuristics to improve PacBio alignment. These changes are not used by
|
||||
default. Users need to add option "-x pacbio" to enable the feature.
|
||||
|
||||
* Support PacBio subread-to-subread alignment. The feature is enabled with
|
||||
option "-x pbread". In this mode, the output only gives the overlapping
|
||||
region between a pair of reads without detailed alignment.
|
||||
|
||||
* Output alternative hits in the XA tag if there are not so many such hits.
|
||||
This feature is originally implemented in BWA-backtrack.
|
||||
|
||||
* EXPERIMENTAL feature to support mapping to ALT contigs in GRCh38. We prodive
|
||||
a script to postprocess hits in the XA tag to adjust the mapping quality and
|
||||
generate new primary alignments to all overlapping ALT contigs. We would NOT
|
||||
recommended this feature for production uses.
|
||||
|
||||
* Improved alignments to many short reference sequences. Older BWA-MEM may
|
||||
generate an alignment bridging two or more adjacent reference sequences.
|
||||
Such alignments are split at a later step as postprocessing. This approach
|
||||
is complex and does not always work. This release forbids these alignments
|
||||
from the very beginning. BWA-MEM should not produce an alignment bridging
|
||||
two or more reference sequences any more.
|
||||
|
||||
* Reduced the maximum seed occurrence from 10000 to 500. Reduced the maximum
|
||||
number of Smith-Waterman mate rescue from 100 to 50. Added a heuristic to
|
||||
lower the mapping quality if a read contains seeds with excessive
|
||||
occurrences. These changes make BWA-MEM XXX faster for mapping short reads
|
||||
to GRCh37 and XXX faster to GRCh38.
|
||||
|
||||
* Added an option "-Y" to use soft clipping for supplementary alignments.
|
||||
|
||||
* Bugfix: incomplete alignment extension in corner cases.
|
||||
|
||||
* Bugfix: integer overflow when aligning low query sequences.
|
||||
|
||||
(0.7.9: 11 May 2014, r777)
|
||||
|
||||
|
||||
|
||||
Release 0.7.8 (31 March, 2014)
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
|
|
|
|||
36
README.md
36
README.md
|
|
@ -14,7 +14,7 @@ BWA-backtrack, BWA-SW and BWA-MEM. The first algorithm is designed for Illumina
|
|||
sequence reads up to 100bp, while the rest two for longer sequences ranged from
|
||||
70bp to 1Mbp. BWA-MEM and BWA-SW share similar features such as the support of
|
||||
long reads and chimeric alignment, but BWA-MEM, which is the latest, is
|
||||
generally recommended for high-quality queries as it is faster and more
|
||||
generally recommended as it is faster and more
|
||||
accurate. BWA-MEM also has better performance than BWA-backtrack for 70-100bp
|
||||
Illumina reads.
|
||||
|
||||
|
|
@ -59,6 +59,38 @@ do not have plan to submit it to a peer-reviewed journal in the near future.
|
|||
|
||||
###Frequently asked questions (FAQs)
|
||||
|
||||
####What types of data does BWA work with?
|
||||
|
||||
BWA works with a variety types of DNA sequence data, though the optimal
|
||||
algorithm and setting may vary. The following list gives the recommended
|
||||
settings:
|
||||
|
||||
* Illumina/454/IonTorrent single-end reads longer than ~70bp or assembly
|
||||
contigs up to a few megabases:
|
||||
|
||||
bwa mem ref.fa reads.fq > aln.sam
|
||||
|
||||
* Illumina/454/IonTorrent paired-end reads longer than ~70bp:
|
||||
|
||||
bwa mem ref.fa read1.fq read2.fq > aln-pe.sam
|
||||
|
||||
* PacBio subreads to a reference genome:
|
||||
|
||||
bwa mem -x pacbio ref.fa reads.fq > aln.sam
|
||||
|
||||
* PacBio subreads to themselves (the output is not SAM):
|
||||
|
||||
bwa mem -x pbread reads.fq reads.fq > overlap.pas
|
||||
|
||||
* Illumina single-end reads no longer than ~70bp:
|
||||
|
||||
bwa aln ref.fa reads.fq > reads.sai; bwa samse ref.fa reads.sai reads.fq > aln-se.sam
|
||||
|
||||
* Illumina paired-end reads no longer than ~70bp:
|
||||
|
||||
bwa aln ref.fa read1.fq > read1.sai; bwa aln ref.fa read2.fq > read2.sai
|
||||
bwa samse ref.fa reads.sai reads.fq > aln-pe.sam
|
||||
|
||||
####How to map sequences to GRCh38 with ALT contigs?
|
||||
|
||||
BWA-backtrack and BWA-MEM partially support mapping to a reference containing
|
||||
|
|
@ -91,6 +123,8 @@ highly repetitive regions as these hits will not be reported with option
|
|||
`-h50`. `bwa-helper.js` is a prototype implementation not recommended for
|
||||
production uses.
|
||||
|
||||
|
||||
|
||||
[1]: http://en.wikipedia.org/wiki/GNU_General_Public_License
|
||||
[2]: https://github.com/lh3/bwa
|
||||
[3]: http://sourceforge.net/projects/bio-bwa/files/
|
||||
|
|
|
|||
12
bwa.1
12
bwa.1
|
|
@ -1,4 +1,4 @@
|
|||
.TH bwa 1 "31 March 2014" "bwa-0.7.8" "Bioinformatics tools"
|
||||
.TH bwa 1 "11 May 2014" "bwa-0.7.9" "Bioinformatics tools"
|
||||
.SH NAME
|
||||
.PP
|
||||
bwa - Burrows-Wheeler Alignment Tool
|
||||
|
|
@ -248,6 +248,11 @@ Don't output alignment with score lower than
|
|||
.IR INT .
|
||||
This option affects output and occasionally SAM flag 2. [30]
|
||||
.TP
|
||||
.BI -h \ INT
|
||||
If a query has not more than
|
||||
.I INT
|
||||
hits with score higher than 80% of the best hit, output them all in the XA tag [5]
|
||||
.TP
|
||||
.B -a
|
||||
Output all found alignments for single-end or unpaired paired-end reads. These
|
||||
alignments will be flagged as secondary alignments.
|
||||
|
|
@ -258,6 +263,11 @@ transfer read meta information (e.g. barcode) to the SAM output. Note that the
|
|||
FASTA/Q comment (the string after a space in the header line) must conform the SAM
|
||||
spec (e.g. BC:Z:CGTAC). Malformated comments lead to incorrect SAM output.
|
||||
.TP
|
||||
.B -Y
|
||||
Use soft clipping CIGAR operation for supplementary alignments. By default, BWA-MEM
|
||||
uses soft clipping for the primary alignment and hard clipping for
|
||||
supplementary alignments.
|
||||
.TP
|
||||
.B -M
|
||||
Mark shorter split hits as secondary (for Picard compatibility).
|
||||
.TP
|
||||
|
|
|
|||
1
bwamem.c
1
bwamem.c
|
|
@ -64,6 +64,7 @@ mem_opt_t *mem_opt_init()
|
|||
o->max_ins = 10000;
|
||||
o->mask_level = 0.50;
|
||||
o->drop_ratio = 0.50;
|
||||
o->XA_drop_ratio = 0.80;
|
||||
o->split_factor = 1.5;
|
||||
o->chunk_size = 10000000;
|
||||
o->n_threads = 1;
|
||||
|
|
|
|||
1
bwamem.h
1
bwamem.h
|
|
@ -42,6 +42,7 @@ typedef struct {
|
|||
int chunk_size; // process chunk_size-bp sequences in a batch
|
||||
float mask_level; // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits
|
||||
float drop_ratio; // drop a chain if its seed coverage is below drop_ratio times the seed coverage of a better chain overlapping with the small chain
|
||||
float XA_drop_ratio; // when counting hits for the XA tag, ignore alignments with score < XA_drop_ratio * max_score; only effective for the XA tag
|
||||
float mask_level_redun;
|
||||
float mapQ_coef_len;
|
||||
int mapQ_coef_fac;
|
||||
|
|
|
|||
|
|
@ -115,7 +115,7 @@ char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac
|
|||
cnt = calloc(a->n, sizeof(int));
|
||||
for (i = 0, tot = 0; i < a->n; ++i) {
|
||||
int j = a->a[i].secondary;
|
||||
if (j >= 0 && a->a[i].score >= a->a[j].score * opt->drop_ratio)
|
||||
if (j >= 0 && a->a[i].score >= a->a[j].score * opt->XA_drop_ratio)
|
||||
++cnt[j], ++tot;
|
||||
}
|
||||
if (tot == 0) goto end_gen_alt;
|
||||
|
|
@ -123,7 +123,7 @@ char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac
|
|||
for (i = 0; i < a->n; ++i) {
|
||||
mem_aln_t t;
|
||||
int j = a->a[i].secondary;
|
||||
if (j < 0 || a->a[i].score < a->a[j].score * opt->drop_ratio) continue; // we don't process the primary alignments as they will be converted to SAM later
|
||||
if (j < 0 || a->a[i].score < a->a[j].score * opt->XA_drop_ratio) continue; // we don't process the primary alignments as they will be converted to SAM later
|
||||
if (cnt[j] > opt->max_hits) continue;
|
||||
t = mem_reg2aln(opt, bns, pac, l_query, query, &a->a[i]);
|
||||
kputs(bns->anns[t.rid].name, &aln[j]);
|
||||
|
|
|
|||
|
|
@ -159,6 +159,7 @@ int main_mem(int argc, char *argv[])
|
|||
fprintf(stderr, " -h INT if #hits < INT, output all in the XA tag [%d]\n", opt->max_hits);
|
||||
fprintf(stderr, " -a output all alignments for SE or unpaired PE\n");
|
||||
fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n");
|
||||
fprintf(stderr, " -Y use soft clipping for supplementary alignments\n");
|
||||
fprintf(stderr, " -M mark shorter split hits as secondary\n\n");
|
||||
fprintf(stderr, " -I FLOAT[,FLOAT[,INT[,INT]]]\n");
|
||||
fprintf(stderr, " specify the mean, standard deviation (10%% of the mean if absent), max\n");
|
||||
|
|
|
|||
Loading…
Reference in New Issue