r998: smart pairing; allow mixture of SE/PE reads
This commit is contained in:
parent
6f9bf9a4ae
commit
80e4ecfa79
38
NEWS.md
38
NEWS.md
|
|
@ -1,29 +1,17 @@
|
|||
Release 0.7.11 (XX September, 2014)
|
||||
Release 0.7.11 (XX November, 2014)
|
||||
-----------------------------------
|
||||
|
||||
A major change to BWA-MEM is the support of mapping to ALT contigs. To use this
|
||||
feature, users need to manually create a file `indexbase.alt` with each line
|
||||
giving the name of an ALT contig. During alignment, BWA-MEM will be able to
|
||||
classify potential hits to ALT and non-ALT hits. It reports alignments and
|
||||
assigns mapping quality (mapQ) loosely following these rules:
|
||||
A major change to BWA-MEM is the support of mapping to ALT contigs in addition
|
||||
to the primary assembly. Part of the ALT mapping strategy is implemented in
|
||||
BWA-MEM and the rest in a postprocessing script for now. Due to the extra
|
||||
layer of complexity on generating the reference genome and on the two-step
|
||||
mapping, we start to provide a wrapper script and precompiled binaries since
|
||||
this release. Please check README-alt.md for details.
|
||||
|
||||
1. The original mapQ of a non-ALT hit is computed across non-ALT hits only.
|
||||
The reported mapQ of an ALT hit is always computed across all hits.
|
||||
|
||||
2. An ALT hit is only reported if its score is strictly better than all
|
||||
overlapping non-ALT hits. A reported ALT hit is flagged with 0x800
|
||||
(supplementary) unless there are no non-ALT hits.
|
||||
|
||||
3. The mapQ of a non-ALT hit is reduced to zero if its score is less than 80%
|
||||
(controlled by option `-g`) of the score of an overlapping ALT hit. In this
|
||||
case, the original mapQ is moved to the `om` tag.
|
||||
|
||||
This way, non-ALT alignments are only affected by ALT contigs if there are
|
||||
significantly better ALT alignments. BWA-MEM is carefully engineered such that
|
||||
ALT contigs do not interfere with the alignments to the primary assembly.
|
||||
|
||||
Users may consider to use ALT contigs from GRCh38. I am also constructing a
|
||||
non-redundant and more complete set of sequences missing from GRCh38.
|
||||
Another major addition to BWA-MEM is HLA typing, which made possible with the
|
||||
new ALT mapping strategy. Necessary data and programs are also included in
|
||||
the binary release. The wrapper script also performs HLA typing when HLA genes
|
||||
are also included in the reference genome as additional ALT contigs.
|
||||
|
||||
Other notable changes to BWA-MEM:
|
||||
|
||||
|
|
@ -42,7 +30,9 @@ Other notable changes to BWA-MEM:
|
|||
* Added new pre-setting for Oxford Nanopore 2D reads. For small genomes,
|
||||
though, LAST is still more sensitive.
|
||||
|
||||
(0.7.11: XX September 2014, rXXX)
|
||||
* Added LAST-like seeding. This improves the accuracy for longer reads.
|
||||
|
||||
(0.7.11: XX November 2014, rXXX)
|
||||
|
||||
|
||||
|
||||
|
|
|
|||
21
bwa.c
21
bwa.c
|
|
@ -7,6 +7,7 @@
|
|||
#include "ksw.h"
|
||||
#include "utils.h"
|
||||
#include "kstring.h"
|
||||
#include "kvec.h"
|
||||
|
||||
#ifdef USE_MALLOC_WRAPPERS
|
||||
# include "malloc_wrap.h"
|
||||
|
|
@ -55,10 +56,12 @@ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_)
|
|||
}
|
||||
trim_readno(&ks->name);
|
||||
kseq2bseq1(ks, &seqs[n]);
|
||||
seqs[n].id = n;
|
||||
size += seqs[n++].l_seq;
|
||||
if (ks2) {
|
||||
trim_readno(&ks2->name);
|
||||
kseq2bseq1(ks2, &seqs[n]);
|
||||
seqs[n].id = n;
|
||||
size += seqs[n++].l_seq;
|
||||
}
|
||||
if (size >= chunk_size && (n&1) == 0) break;
|
||||
|
|
@ -71,6 +74,24 @@ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_)
|
|||
return seqs;
|
||||
}
|
||||
|
||||
void bseq_classify(int n, bseq1_t *seqs, int m[2], bseq1_t *sep[2])
|
||||
{
|
||||
int i, has_last;
|
||||
kvec_t(bseq1_t) a[2] = {{0,0,0}, {0,0,0}};
|
||||
for (i = 1, has_last = 1; i < n; ++i) {
|
||||
if (has_last) {
|
||||
if (strcmp(seqs[i].name, seqs[i-1].name) == 0) {
|
||||
kv_push(bseq1_t, a[1], seqs[i-1]);
|
||||
kv_push(bseq1_t, a[1], seqs[i]);
|
||||
has_last = 0;
|
||||
} else kv_push(bseq1_t, a[0], seqs[i-1]);
|
||||
} else has_last = 1;
|
||||
}
|
||||
if (has_last) kv_push(bseq1_t, a[0], seqs[i-1]);
|
||||
sep[0] = a[0].a, m[0] = a[0].n;
|
||||
sep[1] = a[1].a, m[1] = a[1].n;
|
||||
}
|
||||
|
||||
/*****************
|
||||
* CIGAR related *
|
||||
*****************/
|
||||
|
|
|
|||
3
bwa.h
3
bwa.h
|
|
@ -23,7 +23,7 @@ typedef struct {
|
|||
} bwaidx_t;
|
||||
|
||||
typedef struct {
|
||||
int l_seq;
|
||||
int l_seq, id;
|
||||
char *name, *comment, *seq, *qual, *sam;
|
||||
} bseq1_t;
|
||||
|
||||
|
|
@ -35,6 +35,7 @@ extern "C" {
|
|||
#endif
|
||||
|
||||
bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_);
|
||||
void bseq_classify(int n, bseq1_t *seqs, int m[2], bseq1_t *sep[2]);
|
||||
|
||||
void bwa_fill_scmat(int a, int b, int8_t mat[25]);
|
||||
uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM);
|
||||
|
|
|
|||
1
bwamem.h
1
bwamem.h
|
|
@ -20,6 +20,7 @@ typedef struct __smem_i smem_i;
|
|||
#define MEM_F_ALN_REG 0x80
|
||||
#define MEM_F_REF_HDR 0x100
|
||||
#define MEM_F_SOFTCLIP 0x200
|
||||
#define MEM_F_SMARTPE 0x400
|
||||
|
||||
typedef struct {
|
||||
int a, b; // match score and mismatch penalty
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ use warnings;
|
|||
use Getopt::Std;
|
||||
|
||||
my %opts = (t=>1, n=>64);
|
||||
getopts("Spadsko:R:x:t:", \%opts);
|
||||
getopts("Sadsko:R:x:t:", \%opts);
|
||||
|
||||
die('
|
||||
Usage: run-bwamem [options] <idxbase> <file1> [file2]
|
||||
|
|
@ -17,30 +17,36 @@ Options: -o STR prefix for output files [inferred from
|
|||
pacbio: pacbio subreads (~10kb query, high error rate)
|
||||
ont2d: Oxford Nanopore reads (~10kb query, higher error rate)
|
||||
-t INT number of threads [1]
|
||||
-p input are paired-end reads if file2 absent
|
||||
|
||||
-a trim HiSeq2000/2500 PE resequencing adapters (via trimadap)
|
||||
-d mark duplicate (via samblaster)
|
||||
-S for SAM/BAM input, don\'t shuffle
|
||||
-s sort the output alignment (higher RAM footprint)
|
||||
-s sort the output alignment (requring more RAM)
|
||||
-k keep temporary files generated by typeHLA
|
||||
|
||||
Note: File type is determined by the file extension of the first input sequence file.
|
||||
Recognized file extensions: fasta, fa, fastq, fq, fasta.gz, fa.gz, fastq.gz,
|
||||
fq.gz, sam, sam.gz and bam. SAM/BAM input will be converted to FASTQ.
|
||||
Examples:
|
||||
|
||||
When option -d is in use, all the input reads are required to come from the same
|
||||
library and all the reads from the library shall be included in the input. In
|
||||
addition, samblaster does not distinguish optical and PCR duplicates.
|
||||
* Map paired-end reads to GRCh38+ALT+decoy+HLA and perform HLA typing:
|
||||
|
||||
The output may consist of the following files:
|
||||
run-bwamem -o prefix -t8 -R"@RG\tID:foo\tSM:bar" hs38d6.fa read1.fq.gz read2.fq.gz
|
||||
|
||||
{-o}.aln.sam.gz - unsorted alignment (unless -s is specified)
|
||||
{-o}.aln.bam - sorted alignment (if -s is specified)
|
||||
{-o}.oph.sam.gz - orphan single-end reads mapping (if input is paired SAM/BAM)
|
||||
{-o}.hla.top - best HLA typing for the six classical HLA genes
|
||||
{-o}.hla.all - additional HLA typing
|
||||
{-o}.log.* - log files
|
||||
* Remap coordinate-sorted BAM, trim Illumina PE adapters and sort the output. The BAM
|
||||
may contain single-end or paired-end reads, or a mixture of the two types. The read
|
||||
groups are not transferred to the output BAM.
|
||||
|
||||
run-bwamem -sao prefix hs38d6.fa old-srt.bam
|
||||
|
||||
* Remap name-grouped BAM and mark duplicates. Note that in this case, all reads from
|
||||
a single library should be aligned at the same time. Paired-end only.
|
||||
|
||||
run-bwamem -Sdo prefix hs38d6.fa old-unsrt.bam
|
||||
|
||||
Output files:
|
||||
|
||||
{-o}.aln.bam - final alignment
|
||||
{-o}.hla.top - best genotypes for the 6 classical HLA genes (if there are HLA-* contigs)
|
||||
{-o}.hla.all - additional HLA genotypes consistent with data
|
||||
{-o}.log.* - log files
|
||||
|
||||
') if @ARGV < 2;
|
||||
|
||||
|
|
@ -103,8 +109,7 @@ if ($is_sam || $is_bam) {
|
|||
my $cmd_sam2bam = $is_sam? "$root/htsbox samview -uS $ARGV[1] \\\n" : "cat $ARGV[1] \\\n";
|
||||
my $ntmps = int($size / 4e9) + 1;
|
||||
my $cmd_shuf = ($is_bam || $is_sam) && !defined($opts{S})? " | $root/htsbox bamshuf -uOn$ntmps - $prefix.shuf \\\n" : "";
|
||||
my $cmd_bam2fq = "";
|
||||
$cmd_bam2fq = " | $root/htsbox bam2fq -O " . (defined($opts{p})? "-s $prefix.oph.fq.gz " : "") . "- \\\n";
|
||||
my $cmd_bam2fq = " | $root/htsbox bam2fq -O - \\\n";
|
||||
$cmd = $cmd_sam2bam . $cmd_shuf . $cmd_bam2fq;
|
||||
} elsif (@ARGV >= 3) {
|
||||
$cmd = "$root/seqtk mergepe $ARGV[1] $ARGV[2] \\\n";
|
||||
|
|
@ -112,10 +117,10 @@ if ($is_sam || $is_bam) {
|
|||
$cmd = "cat $ARGV[1] \\\n";
|
||||
}
|
||||
|
||||
my $bwa_opts = ($opts{t} > 1? "-t$opts{t} " : "") . (defined($opts{x})? "-x $opts{x} " : "") . (defined($opts{R})? "-R'$opts{R}' " : "");
|
||||
my $bwa_opts = "-p " . ($opts{t} > 1? "-t$opts{t} " : "") . (defined($opts{x})? "-x $opts{x} " : "") . (defined($opts{R})? "-R'$opts{R}' " : "");
|
||||
|
||||
$cmd .= " | $root/trimadap \\\n" if defined($opts{a});
|
||||
$cmd .= " | $root/bwa mem $bwa_opts" . ($is_pe? "-p " : "") . "$ARGV[0] - 2> $prefix.log.bwamem \\\n";
|
||||
$cmd .= " | $root/bwa mem $bwa_opts$ARGV[0] - 2> $prefix.log.bwamem \\\n";
|
||||
$cmd .= " | $root/samblaster 2> $prefix.log.dedup \\\n" if defined($opts{d});
|
||||
|
||||
my $has_hla = 0;
|
||||
|
|
@ -131,15 +136,7 @@ if (-f "$ARGV[0].alt") {
|
|||
}
|
||||
|
||||
my $t_sort = $opts{t} < 4? $opts{t} : 4;
|
||||
$cmd .= defined($opts{s})? " | $root/samtools sort -@ $t_sort -m1G - $prefix.aln;\n" : " | gzip -1 > $prefix.aln.sam.gz;\n";
|
||||
|
||||
# map single end reads
|
||||
if ($cmd =~ /$prefix.oph.fq.gz/) {
|
||||
$cmd .= "[ -s $prefix.oph.fq.gz ] && zcat $prefix.oph.fq.gz \\\n";
|
||||
$cmd .= " | $root/trimadap \\\n" if defined($opts{a});
|
||||
$cmd .= " | $root/bwa mem $bwa_opts$ARGV[0] - 2>> $prefix.log.bwamem | gzip -1 > $prefix.oph.sam.gz;\n";
|
||||
$cmd .= "rm -f $prefix.oph.fq.gz;\n";
|
||||
}
|
||||
$cmd .= defined($opts{s})? " | $root/samtools sort -@ $t_sort -m1G - $prefix.aln;\n" : " | $root/samtools view -1 - > $prefix.aln.bam;\n";
|
||||
|
||||
if ($has_hla && (!defined($opts{x}) || $opts{x} eq 'intractg')) {
|
||||
$cmd .= "$root/run-HLA ". ($opts{x} eq 'intractg'? "-A " : "") . "$prefix.hla > $prefix.hla.top 2> $prefix.log.hla;\n";
|
||||
|
|
|
|||
33
fastmap.c
33
fastmap.c
|
|
@ -66,7 +66,7 @@ int main_mem(int argc, char *argv[])
|
|||
else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1;
|
||||
else if (c == 'P') opt->flag |= MEM_F_NOPAIRING;
|
||||
else if (c == 'a') opt->flag |= MEM_F_ALL;
|
||||
else if (c == 'p') opt->flag |= MEM_F_PE;
|
||||
else if (c == 'p') opt->flag |= MEM_F_PE | MEM_F_SMARTPE;
|
||||
else if (c == 'M') opt->flag |= MEM_F_NO_MULTI;
|
||||
else if (c == 'S') opt->flag |= MEM_F_NO_RESCUE;
|
||||
else if (c == 'e') opt->flag |= MEM_F_SELF_OVLP;
|
||||
|
|
@ -167,7 +167,7 @@ int main_mem(int argc, char *argv[])
|
|||
fprintf(stderr, " intractg: -B9 -O16 -L5 (intra-species contigs to ref)\n");
|
||||
// fprintf(stderr, " pbread: -k13 -W40 -c1000 -r10 -A1 -B1 -O1 -E1 -N25 -FeaD.001\n");
|
||||
fprintf(stderr, "\nInput/output options:\n\n");
|
||||
fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n");
|
||||
fprintf(stderr, " -p smart pairing (ignoring in2.fq)\n");
|
||||
fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n");
|
||||
fprintf(stderr, " -j ignore ALT contigs\n");
|
||||
fprintf(stderr, "\n");
|
||||
|
|
@ -248,7 +248,7 @@ int main_mem(int argc, char *argv[])
|
|||
if (optind + 2 < argc) {
|
||||
if (opt->flag&MEM_F_PE) {
|
||||
if (bwa_verbose >= 2)
|
||||
fprintf(stderr, "[W::%s] when '-p' is in use, the second query file will be ignored.\n", __func__);
|
||||
fprintf(stderr, "[W::%s] when '-p' is in use, the second query file is ignored.\n", __func__);
|
||||
} else {
|
||||
ko2 = kopen(argv[optind + 2], &fd2);
|
||||
if (ko2 == 0) {
|
||||
|
|
@ -265,11 +265,6 @@ int main_mem(int argc, char *argv[])
|
|||
actual_chunk_size = fixed_chunk_size > 0? fixed_chunk_size : opt->chunk_size * opt->n_threads;
|
||||
while ((seqs = bseq_read(actual_chunk_size, &n, ks, ks2)) != 0) {
|
||||
int64_t size = 0;
|
||||
if ((opt->flag & MEM_F_PE) && (n&1) == 1) {
|
||||
if (bwa_verbose >= 2)
|
||||
fprintf(stderr, "[W::%s] odd number of reads in the PE mode; last read dropped\n", __func__);
|
||||
n = n>>1<<1;
|
||||
}
|
||||
if (!copy_comment)
|
||||
for (i = 0; i < n; ++i) {
|
||||
free(seqs[i].comment); seqs[i].comment = 0;
|
||||
|
|
@ -277,7 +272,27 @@ int main_mem(int argc, char *argv[])
|
|||
for (i = 0; i < n; ++i) size += seqs[i].l_seq;
|
||||
if (bwa_verbose >= 3)
|
||||
fprintf(stderr, "[M::%s] read %d sequences (%ld bp)...\n", __func__, n, (long)size);
|
||||
mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n_processed, n, seqs, pes0);
|
||||
if (opt->flag & MEM_F_SMARTPE) {
|
||||
bseq1_t *sep[2];
|
||||
int i, n_sep[2];
|
||||
mem_opt_t tmp_opt = *opt;
|
||||
bseq_classify(n, seqs, n_sep, sep);
|
||||
if (bwa_verbose >= 3)
|
||||
fprintf(stderr, "[M::%s] %d single-end sequences; %d paired-end sequences\n", __func__, n_sep[0], n_sep[1]);
|
||||
if (n_sep[0]) {
|
||||
tmp_opt.flag &= ~MEM_F_PE;
|
||||
mem_process_seqs(&tmp_opt, idx->bwt, idx->bns, idx->pac, n_processed, n_sep[0], sep[0], 0);
|
||||
for (i = 0; i < n_sep[0]; ++i)
|
||||
seqs[sep[0][i].id].sam = sep[0][i].sam;
|
||||
}
|
||||
if (n_sep[1]) {
|
||||
tmp_opt.flag |= MEM_F_PE;
|
||||
mem_process_seqs(&tmp_opt, idx->bwt, idx->bns, idx->pac, n_processed + n_sep[0], n_sep[1], sep[1], pes0);
|
||||
for (i = 0; i < n_sep[1]; ++i)
|
||||
seqs[sep[1][i].id].sam = sep[1][i].sam;
|
||||
}
|
||||
free(sep[0]); free(sep[1]);
|
||||
} else mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n_processed, n, seqs, pes0);
|
||||
n_processed += n;
|
||||
for (i = 0; i < n; ++i) {
|
||||
if (seqs[i].sam) err_fputs(seqs[i].sam, stdout);
|
||||
|
|
|
|||
Loading…
Reference in New Issue