polish the run script

This commit is contained in:
Heng Li 2014-11-17 10:44:54 -05:00
parent 792be78a80
commit 0071c4ab22
1 changed files with 32 additions and 11 deletions

View File

@ -5,7 +5,7 @@ use warnings;
use Getopt::Std;
my %opts = (t=>1, n=>64);
getopts("SpADsko:R:x:t:", \%opts);
getopts("SpAdsko:R:x:t:", \%opts);
die('
Usage: run-bwamem [options] <idxbase> <file1> [file2]
@ -20,7 +20,7 @@ Options: -o STR prefix for output files [inferred from
-p input are paired-end reads if file2 absent
-A skip HiSeq2000/2500 PE resequencing adapter trimming (via trimadap)
-D skip duplicate marking (via samblaster)
-d mark duplicate (via samblaster)
-S for SAM/BAM input, don\'t shuffle
-s sort the output alignment (more RAM)
-k keep temporary files generated by typeHLA
@ -29,6 +29,19 @@ Note: File type is determined by the file extension of the first input sequence
Recognized file extensions: fasta, fa, fastq, fq, fasta.gz, fa.gz, fastq.gz,
fq.gz, sam, sam.gz and bam. SAM/BAM input will be converted to FASTQ.
When option -d is in use, all the input reads are required to come from the same
library and all the reads from the library shall be included in the input. In
addition, samblaster does not distinguish optical and PCR duplicates.
The output may consist of the following files:
{-o}.aln.sam.gz - unsorted alignment (unless -s is specified)
{-o}.aln.bam - sorted alignment (if -s is specified)
{-o}.oph.sam.gz - orphan single-end reads mapping (if input is paired SAM/BAM)
{-o}.hla.top - best HLA typing for the six classical HLA genes
{-o}.hla.all - additional HLA typing
{-o}.log.* - log files
') if @ARGV < 2;
warn("WARNING: many programs require read groups. Please specify with -R if you can.\n") unless defined($opts{R});
@ -82,7 +95,7 @@ my $is_sam = $ARGV[1] =~ /\.(sam|sam\.gz)$/? 1 : 0;
my $is_bam = $ARGV[1] =~ /\.bam$/? 1 : 0;
if (defined($opts{x})) {
$opts{A} = $opts{D} = 1;
$opts{A} = 1; delete($opts{d});
delete $opts{p};
}
@ -92,7 +105,7 @@ if ($is_sam || $is_bam) {
my $ntmps = int($size / 4e9) + 1;
my $cmd_shuf = ($is_bam || $is_sam) && !defined($opts{S})? " | $root/htsbox bamshuf -uOn$ntmps - $prefix.shuf \\\n" : "";
my $cmd_bam2fq = "";
$cmd_bam2fq = " | $root/htsbox bam2fq -O " . (defined($opts{p})? "-s $prefix.out.se.fq.gz " : "") . "- \\\n";
$cmd_bam2fq = " | $root/htsbox bam2fq -O " . (defined($opts{p})? "-s $prefix.oph.fq.gz " : "") . "- \\\n";
$cmd = $cmd_sam2bam . $cmd_shuf . $cmd_bam2fq;
} elsif (@ARGV >= 3) {
$cmd = "$root/seqtk mergepe $ARGV[1] $ARGV[2] \\\n";
@ -100,9 +113,11 @@ if ($is_sam || $is_bam) {
$cmd = "cat $ARGV[1] \\\n";
}
my $bwa_opts = ($opts{t} > 1? "-t$opts{t} " : "") . (defined($opts{x})? "-x $opts{x} " : "") . (defined($opts{R})? "-R'$opts{R}' " : "");
$cmd .= " | $root/trimadap \\\n" unless defined($opts{A});
$cmd .= " | $root/bwa mem -t$opts{t} " . (defined($opts{x})? "-x $opts{x} " : "") . ($is_pe? "-p " : "") . (defined($opts{R})? "-R'$opts{R}' " : "") . "$ARGV[0] - 2> $prefix.log.bwamem \\\n";
$cmd .= " | $root/samblaster 2> $prefix.log.dedup \\\n" unless defined($opts{D});
$cmd .= " | $root/bwa mem $bwa_opts" . ($is_pe? "-p " : "") . "$ARGV[0] - 2> $prefix.log.bwamem \\\n";
$cmd .= " | $root/samblaster 2> $prefix.log.dedup \\\n" if defined($opts{d});
my $has_hla = 0;
if (-f "$ARGV[0].alt") {
@ -112,17 +127,23 @@ if (-f "$ARGV[0].alt") {
$has_hla = 1 if /^HLA-[^\s\*]+\*\d+/;
}
close($fh);
my $hla_pre = $has_hla? "-p $prefix.out " : "";
my $hla_pre = $has_hla? "-p $prefix.hla " : "";
$cmd .= " | $root/k8 $root/bwa-postalt.js $hla_pre$ARGV[0].alt \\\n";
}
my $t_sort = $opts{t} < 4? $opts{t} : 4;
$cmd .= defined($opts{s})? " | $root/samtools sort -@ $t_sort -m1G - $prefix.out;\n" : " | gzip -1 > $prefix.out.sam.gz;\n";
$cmd .= defined($opts{s})? " | $root/samtools sort -@ $t_sort -m1G - $prefix.aln;\n" : " | gzip -1 > $prefix.aln.sam.gz;\n";
# map single end reads
if ($cmd =~ /$prefix.oph.fq.gz/) {
$cmd .= "[ -s $prefix.oph.fq.gz ] && ";
$cmd .= "$root/bwa mem $bwa_opts$ARGV[0] $prefix.oph.fq.gz 2>> $prefix.log.bwamem | gzip -1 > $prefix.oph.sam.gz; rm -f $prefix.oph.fq.gz;\n";
}
if ($has_hla && (!defined($opts{x}) || $opts{x} eq 'intractg')) {
$cmd .= "$root/run-HLA ". ($opts{x} eq 'intractg'? "-A " : "") . "$prefix.out > $prefix.hla.top 2> $prefix.log.hla;\n";
$cmd .= "cat $prefix.out.HLA*.gt | grep ^GT | cut -f2- > $prefix.hla.all;\n";
$cmd .= "rm -f $prefix.out.HLA*;\n" unless defined($opts{k});
$cmd .= "$root/run-HLA ". ($opts{x} eq 'intractg'? "-A " : "") . "$prefix.hla > $prefix.hla.top 2> $prefix.log.hla;\n";
$cmd .= "cat $prefix.hla.HLA*.gt | grep ^GT | cut -f2- > $prefix.hla.all;\n";
$cmd .= "rm -f $prefix.hla.HLA*;\n" unless defined($opts{k});
}
print $cmd;