diff --git a/NEWS.md b/NEWS.md index 3d8dec9..c624481 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,29 +1,17 @@ -Release 0.7.11 (XX September, 2014) +Release 0.7.11 (XX November, 2014) ----------------------------------- -A major change to BWA-MEM is the support of mapping to ALT contigs. To use this -feature, users need to manually create a file `indexbase.alt` with each line -giving the name of an ALT contig. During alignment, BWA-MEM will be able to -classify potential hits to ALT and non-ALT hits. It reports alignments and -assigns mapping quality (mapQ) loosely following these rules: +A major change to BWA-MEM is the support of mapping to ALT contigs in addition +to the primary assembly. Part of the ALT mapping strategy is implemented in +BWA-MEM and the rest in a postprocessing script for now. Due to the extra +layer of complexity on generating the reference genome and on the two-step +mapping, we start to provide a wrapper script and precompiled binaries since +this release. Please check README-alt.md for details. - 1. The original mapQ of a non-ALT hit is computed across non-ALT hits only. - The reported mapQ of an ALT hit is always computed across all hits. - - 2. An ALT hit is only reported if its score is strictly better than all - overlapping non-ALT hits. A reported ALT hit is flagged with 0x800 - (supplementary) unless there are no non-ALT hits. - - 3. The mapQ of a non-ALT hit is reduced to zero if its score is less than 80% - (controlled by option `-g`) of the score of an overlapping ALT hit. In this - case, the original mapQ is moved to the `om` tag. - -This way, non-ALT alignments are only affected by ALT contigs if there are -significantly better ALT alignments. BWA-MEM is carefully engineered such that -ALT contigs do not interfere with the alignments to the primary assembly. - -Users may consider to use ALT contigs from GRCh38. I am also constructing a -non-redundant and more complete set of sequences missing from GRCh38. +Another major addition to BWA-MEM is HLA typing, which made possible with the +new ALT mapping strategy. Necessary data and programs are also included in +the binary release. The wrapper script also performs HLA typing when HLA genes +are also included in the reference genome as additional ALT contigs. Other notable changes to BWA-MEM: @@ -42,7 +30,9 @@ Other notable changes to BWA-MEM: * Added new pre-setting for Oxford Nanopore 2D reads. For small genomes, though, LAST is still more sensitive. -(0.7.11: XX September 2014, rXXX) + * Added LAST-like seeding. This improves the accuracy for longer reads. + +(0.7.11: XX November 2014, rXXX) diff --git a/bwa.c b/bwa.c index 80a4b8f..053025e 100644 --- a/bwa.c +++ b/bwa.c @@ -7,6 +7,7 @@ #include "ksw.h" #include "utils.h" #include "kstring.h" +#include "kvec.h" #ifdef USE_MALLOC_WRAPPERS # include "malloc_wrap.h" @@ -55,10 +56,12 @@ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) } trim_readno(&ks->name); kseq2bseq1(ks, &seqs[n]); + seqs[n].id = n; size += seqs[n++].l_seq; if (ks2) { trim_readno(&ks2->name); kseq2bseq1(ks2, &seqs[n]); + seqs[n].id = n; size += seqs[n++].l_seq; } if (size >= chunk_size && (n&1) == 0) break; @@ -71,6 +74,24 @@ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) return seqs; } +void bseq_classify(int n, bseq1_t *seqs, int m[2], bseq1_t *sep[2]) +{ + int i, has_last; + kvec_t(bseq1_t) a[2] = {{0,0,0}, {0,0,0}}; + for (i = 1, has_last = 1; i < n; ++i) { + if (has_last) { + if (strcmp(seqs[i].name, seqs[i-1].name) == 0) { + kv_push(bseq1_t, a[1], seqs[i-1]); + kv_push(bseq1_t, a[1], seqs[i]); + has_last = 0; + } else kv_push(bseq1_t, a[0], seqs[i-1]); + } else has_last = 1; + } + if (has_last) kv_push(bseq1_t, a[0], seqs[i-1]); + sep[0] = a[0].a, m[0] = a[0].n; + sep[1] = a[1].a, m[1] = a[1].n; +} + /***************** * CIGAR related * *****************/ diff --git a/bwa.h b/bwa.h index 6fcc82e..830bc3c 100644 --- a/bwa.h +++ b/bwa.h @@ -23,7 +23,7 @@ typedef struct { } bwaidx_t; typedef struct { - int l_seq; + int l_seq, id; char *name, *comment, *seq, *qual, *sam; } bseq1_t; @@ -35,6 +35,7 @@ extern "C" { #endif bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_); + void bseq_classify(int n, bseq1_t *seqs, int m[2], bseq1_t *sep[2]); void bwa_fill_scmat(int a, int b, int8_t mat[25]); uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM); diff --git a/bwamem.h b/bwamem.h index 1746ab5..8ffe729 100644 --- a/bwamem.h +++ b/bwamem.h @@ -20,6 +20,7 @@ typedef struct __smem_i smem_i; #define MEM_F_ALN_REG 0x80 #define MEM_F_REF_HDR 0x100 #define MEM_F_SOFTCLIP 0x200 +#define MEM_F_SMARTPE 0x400 typedef struct { int a, b; // match score and mismatch penalty diff --git a/extras/run-bwamem b/extras/run-bwamem index 40724bb..dba8f72 100755 --- a/extras/run-bwamem +++ b/extras/run-bwamem @@ -5,7 +5,7 @@ use warnings; use Getopt::Std; my %opts = (t=>1, n=>64); -getopts("Spadsko:R:x:t:", \%opts); +getopts("Sadsko:R:x:t:", \%opts); die(' Usage: run-bwamem [options] [file2] @@ -17,30 +17,36 @@ Options: -o STR prefix for output files [inferred from pacbio: pacbio subreads (~10kb query, high error rate) ont2d: Oxford Nanopore reads (~10kb query, higher error rate) -t INT number of threads [1] - -p input are paired-end reads if file2 absent -a trim HiSeq2000/2500 PE resequencing adapters (via trimadap) -d mark duplicate (via samblaster) -S for SAM/BAM input, don\'t shuffle - -s sort the output alignment (higher RAM footprint) + -s sort the output alignment (requring more RAM) -k keep temporary files generated by typeHLA -Note: File type is determined by the file extension of the first input sequence file. - Recognized file extensions: fasta, fa, fastq, fq, fasta.gz, fa.gz, fastq.gz, - fq.gz, sam, sam.gz and bam. SAM/BAM input will be converted to FASTQ. +Examples: - When option -d is in use, all the input reads are required to come from the same - library and all the reads from the library shall be included in the input. In - addition, samblaster does not distinguish optical and PCR duplicates. + * Map paired-end reads to GRCh38+ALT+decoy+HLA and perform HLA typing: - The output may consist of the following files: + run-bwamem -o prefix -t8 -R"@RG\tID:foo\tSM:bar" hs38d6.fa read1.fq.gz read2.fq.gz - {-o}.aln.sam.gz - unsorted alignment (unless -s is specified) - {-o}.aln.bam - sorted alignment (if -s is specified) - {-o}.oph.sam.gz - orphan single-end reads mapping (if input is paired SAM/BAM) - {-o}.hla.top - best HLA typing for the six classical HLA genes - {-o}.hla.all - additional HLA typing - {-o}.log.* - log files + * Remap coordinate-sorted BAM, trim Illumina PE adapters and sort the output. The BAM + may contain single-end or paired-end reads, or a mixture of the two types. The read + groups are not transferred to the output BAM. + + run-bwamem -sao prefix hs38d6.fa old-srt.bam + + * Remap name-grouped BAM and mark duplicates. Note that in this case, all reads from + a single library should be aligned at the same time. Paired-end only. + + run-bwamem -Sdo prefix hs38d6.fa old-unsrt.bam + +Output files: + + {-o}.aln.bam - final alignment + {-o}.hla.top - best genotypes for the 6 classical HLA genes (if there are HLA-* contigs) + {-o}.hla.all - additional HLA genotypes consistent with data + {-o}.log.* - log files ') if @ARGV < 2; @@ -103,8 +109,7 @@ if ($is_sam || $is_bam) { my $cmd_sam2bam = $is_sam? "$root/htsbox samview -uS $ARGV[1] \\\n" : "cat $ARGV[1] \\\n"; my $ntmps = int($size / 4e9) + 1; my $cmd_shuf = ($is_bam || $is_sam) && !defined($opts{S})? " | $root/htsbox bamshuf -uOn$ntmps - $prefix.shuf \\\n" : ""; - my $cmd_bam2fq = ""; - $cmd_bam2fq = " | $root/htsbox bam2fq -O " . (defined($opts{p})? "-s $prefix.oph.fq.gz " : "") . "- \\\n"; + my $cmd_bam2fq = " | $root/htsbox bam2fq -O - \\\n"; $cmd = $cmd_sam2bam . $cmd_shuf . $cmd_bam2fq; } elsif (@ARGV >= 3) { $cmd = "$root/seqtk mergepe $ARGV[1] $ARGV[2] \\\n"; @@ -112,10 +117,10 @@ if ($is_sam || $is_bam) { $cmd = "cat $ARGV[1] \\\n"; } -my $bwa_opts = ($opts{t} > 1? "-t$opts{t} " : "") . (defined($opts{x})? "-x $opts{x} " : "") . (defined($opts{R})? "-R'$opts{R}' " : ""); +my $bwa_opts = "-p " . ($opts{t} > 1? "-t$opts{t} " : "") . (defined($opts{x})? "-x $opts{x} " : "") . (defined($opts{R})? "-R'$opts{R}' " : ""); $cmd .= " | $root/trimadap \\\n" if defined($opts{a}); -$cmd .= " | $root/bwa mem $bwa_opts" . ($is_pe? "-p " : "") . "$ARGV[0] - 2> $prefix.log.bwamem \\\n"; +$cmd .= " | $root/bwa mem $bwa_opts$ARGV[0] - 2> $prefix.log.bwamem \\\n"; $cmd .= " | $root/samblaster 2> $prefix.log.dedup \\\n" if defined($opts{d}); my $has_hla = 0; @@ -131,15 +136,7 @@ if (-f "$ARGV[0].alt") { } my $t_sort = $opts{t} < 4? $opts{t} : 4; -$cmd .= defined($opts{s})? " | $root/samtools sort -@ $t_sort -m1G - $prefix.aln;\n" : " | gzip -1 > $prefix.aln.sam.gz;\n"; - -# map single end reads -if ($cmd =~ /$prefix.oph.fq.gz/) { - $cmd .= "[ -s $prefix.oph.fq.gz ] && zcat $prefix.oph.fq.gz \\\n"; - $cmd .= " | $root/trimadap \\\n" if defined($opts{a}); - $cmd .= " | $root/bwa mem $bwa_opts$ARGV[0] - 2>> $prefix.log.bwamem | gzip -1 > $prefix.oph.sam.gz;\n"; - $cmd .= "rm -f $prefix.oph.fq.gz;\n"; -} +$cmd .= defined($opts{s})? " | $root/samtools sort -@ $t_sort -m1G - $prefix.aln;\n" : " | $root/samtools view -1 - > $prefix.aln.bam;\n"; if ($has_hla && (!defined($opts{x}) || $opts{x} eq 'intractg')) { $cmd .= "$root/run-HLA ". ($opts{x} eq 'intractg'? "-A " : "") . "$prefix.hla > $prefix.hla.top 2> $prefix.log.hla;\n"; diff --git a/fastmap.c b/fastmap.c index 25bf822..988b6ed 100644 --- a/fastmap.c +++ b/fastmap.c @@ -66,7 +66,7 @@ int main_mem(int argc, char *argv[]) else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1; else if (c == 'P') opt->flag |= MEM_F_NOPAIRING; else if (c == 'a') opt->flag |= MEM_F_ALL; - else if (c == 'p') opt->flag |= MEM_F_PE; + else if (c == 'p') opt->flag |= MEM_F_PE | MEM_F_SMARTPE; else if (c == 'M') opt->flag |= MEM_F_NO_MULTI; else if (c == 'S') opt->flag |= MEM_F_NO_RESCUE; else if (c == 'e') opt->flag |= MEM_F_SELF_OVLP; @@ -167,7 +167,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " intractg: -B9 -O16 -L5 (intra-species contigs to ref)\n"); // fprintf(stderr, " pbread: -k13 -W40 -c1000 -r10 -A1 -B1 -O1 -E1 -N25 -FeaD.001\n"); fprintf(stderr, "\nInput/output options:\n\n"); - fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n"); + fprintf(stderr, " -p smart pairing (ignoring in2.fq)\n"); fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n"); fprintf(stderr, " -j ignore ALT contigs\n"); fprintf(stderr, "\n"); @@ -248,7 +248,7 @@ int main_mem(int argc, char *argv[]) if (optind + 2 < argc) { if (opt->flag&MEM_F_PE) { if (bwa_verbose >= 2) - fprintf(stderr, "[W::%s] when '-p' is in use, the second query file will be ignored.\n", __func__); + fprintf(stderr, "[W::%s] when '-p' is in use, the second query file is ignored.\n", __func__); } else { ko2 = kopen(argv[optind + 2], &fd2); if (ko2 == 0) { @@ -265,11 +265,6 @@ int main_mem(int argc, char *argv[]) actual_chunk_size = fixed_chunk_size > 0? fixed_chunk_size : opt->chunk_size * opt->n_threads; while ((seqs = bseq_read(actual_chunk_size, &n, ks, ks2)) != 0) { int64_t size = 0; - if ((opt->flag & MEM_F_PE) && (n&1) == 1) { - if (bwa_verbose >= 2) - fprintf(stderr, "[W::%s] odd number of reads in the PE mode; last read dropped\n", __func__); - n = n>>1<<1; - } if (!copy_comment) for (i = 0; i < n; ++i) { free(seqs[i].comment); seqs[i].comment = 0; @@ -277,7 +272,27 @@ int main_mem(int argc, char *argv[]) for (i = 0; i < n; ++i) size += seqs[i].l_seq; if (bwa_verbose >= 3) fprintf(stderr, "[M::%s] read %d sequences (%ld bp)...\n", __func__, n, (long)size); - mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n_processed, n, seqs, pes0); + if (opt->flag & MEM_F_SMARTPE) { + bseq1_t *sep[2]; + int i, n_sep[2]; + mem_opt_t tmp_opt = *opt; + bseq_classify(n, seqs, n_sep, sep); + if (bwa_verbose >= 3) + fprintf(stderr, "[M::%s] %d single-end sequences; %d paired-end sequences\n", __func__, n_sep[0], n_sep[1]); + if (n_sep[0]) { + tmp_opt.flag &= ~MEM_F_PE; + mem_process_seqs(&tmp_opt, idx->bwt, idx->bns, idx->pac, n_processed, n_sep[0], sep[0], 0); + for (i = 0; i < n_sep[0]; ++i) + seqs[sep[0][i].id].sam = sep[0][i].sam; + } + if (n_sep[1]) { + tmp_opt.flag |= MEM_F_PE; + mem_process_seqs(&tmp_opt, idx->bwt, idx->bns, idx->pac, n_processed + n_sep[0], n_sep[1], sep[1], pes0); + for (i = 0; i < n_sep[1]; ++i) + seqs[sep[1][i].id].sam = sep[1][i].sam; + } + free(sep[0]); free(sep[1]); + } else mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n_processed, n, seqs, pes0); n_processed += n; for (i = 0; i < n; ++i) { if (seqs[i].sam) err_fputs(seqs[i].sam, stdout); diff --git a/main.c b/main.c index 33e27c2..5a2de61 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r984-dirty" +#define PACKAGE_VERSION "0.7.10-r998-dirty" #endif int bwa_fa2pac(int argc, char *argv[]);