From 0071c4ab22f3a95c04386df7cf03079dd06f1478 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 17 Nov 2014 10:44:54 -0500 Subject: [PATCH] polish the run script --- extras/run-bwamem | 43 ++++++++++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 11 deletions(-) diff --git a/extras/run-bwamem b/extras/run-bwamem index 30e682d..3231a6e 100755 --- a/extras/run-bwamem +++ b/extras/run-bwamem @@ -5,7 +5,7 @@ use warnings; use Getopt::Std; my %opts = (t=>1, n=>64); -getopts("SpADsko:R:x:t:", \%opts); +getopts("SpAdsko:R:x:t:", \%opts); die(' Usage: run-bwamem [options] [file2] @@ -20,7 +20,7 @@ Options: -o STR prefix for output files [inferred from -p input are paired-end reads if file2 absent -A skip HiSeq2000/2500 PE resequencing adapter trimming (via trimadap) - -D skip duplicate marking (via samblaster) + -d mark duplicate (via samblaster) -S for SAM/BAM input, don\'t shuffle -s sort the output alignment (more RAM) -k keep temporary files generated by typeHLA @@ -29,6 +29,19 @@ Note: File type is determined by the file extension of the first input sequence Recognized file extensions: fasta, fa, fastq, fq, fasta.gz, fa.gz, fastq.gz, fq.gz, sam, sam.gz and bam. SAM/BAM input will be converted to FASTQ. + When option -d is in use, all the input reads are required to come from the same + library and all the reads from the library shall be included in the input. In + addition, samblaster does not distinguish optical and PCR duplicates. + + The output may consist of the following files: + + {-o}.aln.sam.gz - unsorted alignment (unless -s is specified) + {-o}.aln.bam - sorted alignment (if -s is specified) + {-o}.oph.sam.gz - orphan single-end reads mapping (if input is paired SAM/BAM) + {-o}.hla.top - best HLA typing for the six classical HLA genes + {-o}.hla.all - additional HLA typing + {-o}.log.* - log files + ') if @ARGV < 2; warn("WARNING: many programs require read groups. Please specify with -R if you can.\n") unless defined($opts{R}); @@ -82,7 +95,7 @@ my $is_sam = $ARGV[1] =~ /\.(sam|sam\.gz)$/? 1 : 0; my $is_bam = $ARGV[1] =~ /\.bam$/? 1 : 0; if (defined($opts{x})) { - $opts{A} = $opts{D} = 1; + $opts{A} = 1; delete($opts{d}); delete $opts{p}; } @@ -92,7 +105,7 @@ if ($is_sam || $is_bam) { my $ntmps = int($size / 4e9) + 1; my $cmd_shuf = ($is_bam || $is_sam) && !defined($opts{S})? " | $root/htsbox bamshuf -uOn$ntmps - $prefix.shuf \\\n" : ""; my $cmd_bam2fq = ""; - $cmd_bam2fq = " | $root/htsbox bam2fq -O " . (defined($opts{p})? "-s $prefix.out.se.fq.gz " : "") . "- \\\n"; + $cmd_bam2fq = " | $root/htsbox bam2fq -O " . (defined($opts{p})? "-s $prefix.oph.fq.gz " : "") . "- \\\n"; $cmd = $cmd_sam2bam . $cmd_shuf . $cmd_bam2fq; } elsif (@ARGV >= 3) { $cmd = "$root/seqtk mergepe $ARGV[1] $ARGV[2] \\\n"; @@ -100,9 +113,11 @@ if ($is_sam || $is_bam) { $cmd = "cat $ARGV[1] \\\n"; } +my $bwa_opts = ($opts{t} > 1? "-t$opts{t} " : "") . (defined($opts{x})? "-x $opts{x} " : "") . (defined($opts{R})? "-R'$opts{R}' " : ""); + $cmd .= " | $root/trimadap \\\n" unless defined($opts{A}); -$cmd .= " | $root/bwa mem -t$opts{t} " . (defined($opts{x})? "-x $opts{x} " : "") . ($is_pe? "-p " : "") . (defined($opts{R})? "-R'$opts{R}' " : "") . "$ARGV[0] - 2> $prefix.log.bwamem \\\n"; -$cmd .= " | $root/samblaster 2> $prefix.log.dedup \\\n" unless defined($opts{D}); +$cmd .= " | $root/bwa mem $bwa_opts" . ($is_pe? "-p " : "") . "$ARGV[0] - 2> $prefix.log.bwamem \\\n"; +$cmd .= " | $root/samblaster 2> $prefix.log.dedup \\\n" if defined($opts{d}); my $has_hla = 0; if (-f "$ARGV[0].alt") { @@ -112,17 +127,23 @@ if (-f "$ARGV[0].alt") { $has_hla = 1 if /^HLA-[^\s\*]+\*\d+/; } close($fh); - my $hla_pre = $has_hla? "-p $prefix.out " : ""; + my $hla_pre = $has_hla? "-p $prefix.hla " : ""; $cmd .= " | $root/k8 $root/bwa-postalt.js $hla_pre$ARGV[0].alt \\\n"; } my $t_sort = $opts{t} < 4? $opts{t} : 4; -$cmd .= defined($opts{s})? " | $root/samtools sort -@ $t_sort -m1G - $prefix.out;\n" : " | gzip -1 > $prefix.out.sam.gz;\n"; +$cmd .= defined($opts{s})? " | $root/samtools sort -@ $t_sort -m1G - $prefix.aln;\n" : " | gzip -1 > $prefix.aln.sam.gz;\n"; + +# map single end reads +if ($cmd =~ /$prefix.oph.fq.gz/) { + $cmd .= "[ -s $prefix.oph.fq.gz ] && "; + $cmd .= "$root/bwa mem $bwa_opts$ARGV[0] $prefix.oph.fq.gz 2>> $prefix.log.bwamem | gzip -1 > $prefix.oph.sam.gz; rm -f $prefix.oph.fq.gz;\n"; +} if ($has_hla && (!defined($opts{x}) || $opts{x} eq 'intractg')) { - $cmd .= "$root/run-HLA ". ($opts{x} eq 'intractg'? "-A " : "") . "$prefix.out > $prefix.hla.top 2> $prefix.log.hla;\n"; - $cmd .= "cat $prefix.out.HLA*.gt | grep ^GT | cut -f2- > $prefix.hla.all;\n"; - $cmd .= "rm -f $prefix.out.HLA*;\n" unless defined($opts{k}); + $cmd .= "$root/run-HLA ". ($opts{x} eq 'intractg'? "-A " : "") . "$prefix.hla > $prefix.hla.top 2> $prefix.log.hla;\n"; + $cmd .= "cat $prefix.hla.HLA*.gt | grep ^GT | cut -f2- > $prefix.hla.all;\n"; + $cmd .= "rm -f $prefix.hla.HLA*;\n" unless defined($opts{k}); } print $cmd;