gatk-3.8/perl/splitAndEnqueueGATKjobs.pl

152 lines
3.5 KiB
Perl
Executable File

#!/usr/bin/perl -w
# Runs a given GATK walker genome-wide, but splits up the jobs
# and then merges the results together. One should really use
# scatter-gather here, but I wanted to test against that framework
# (to ensure consistency) so I wrote this temporary script.
# Also, it allowed me to add in farm commands for wait ids
# (which is currently unavailable in scatter-gather).
# If intervals file is left blank, it splits by chromosome.
use strict;
use Getopt::Long;
sub usage {
print "Usage: perl enqueueGATKjobsByChromosome.pl\n\t-cmd <GATK command without output or -L args>\n\t-o <output file>\n\t[-oarg GATK output argument; default:o]\n\t[-i intervals file]\n\t[-n number of splits to make]\n\t[-q farm queue; default:gsa]\n\t[-wait farm wait id]\n\t[-job farm job name]\n\t[-dry]\n\t[-bam is output format bam?; default:no]\n";
exit(1);
}
my $cmd = undef;
my $output = undef;
my $outputArg = "o";
my $jobName = undef;
my $wait = undef;
my $queue = "gsa";
my $breaks = 24;
my $intervalsFile = undef;
my $dry;
my $isBam;
GetOptions( "cmd=s" => \$cmd,
"o=s" => \$output,
"oarg:s" => \$outputArg,
"job:s" => \$jobName,
"wait:s" => \$wait,
"n:i" => \$breaks,
"i:s" => \$intervalsFile,
"dry!" => \$dry,
"bam!" => \$isBam,
"q:s" => \$queue);
usage() if ( !$cmd || !$output );
my @intervals;
if (!$intervalsFile) {
my $num = 1;
while ($num < 23) {
push(@intervals, $num);
$num++;
}
push(@intervals, "X");
push(@intervals, "Y");
} else {
open(FILE, "< $intervalsFile") or die "can't open $intervalsFile: $!";
my @lines = <FILE>;
chomp(@lines);
my $linecount = scalar(@lines);
if ($linecount < $breaks) {
$breaks = $linecount;
}
my $linesPerJob = $linecount / $breaks;
my $index = 0;
for (my $i = 1; $i < $breaks; $i++) {
my $interval = "";
for (my $j = 0; $j < $linesPerJob; $j++) {
$interval .= "$lines[$index];";
$index++;
}
push(@intervals, $interval);
}
my $interval = "";
while ($index < $linecount) {
$interval .= "$lines[$index];";
$index++;
}
push(@intervals, $interval);
}
my $intervalcount = scalar(@intervals);
for (my $i = 0; $i < $intervalcount; $i++) {
enqueue($intervals[$i], $cmd, $output, $outputArg, $wait, $queue, $dry, $isBam, ($i+1));
}
mergeResults($output, $queue, $dry, $isBam, $jobName, $intervalcount);
sub enqueue {
my $interval = $_[0];
my $cmd = $_[1];
my $outArg = $_[3];
my $waitid = $_[4];
my $queue = $_[5];
my $dry = $_[6];
my $bam = $_[7];
my $index = $_[8];
my $output = "$_[2].$index";
if ($bam) {
$output .= ".bam";
}
my $bsub = "bsub -q $queue -o $output.sdout -J $_[2]";
if ($waitid) {
$bsub .= " -w \"ended($waitid)\"";
}
my $command = "$bsub $cmd -$outputArg $output -L $interval";
execute($command, $dry);
}
sub mergeResults {
my $output = $_[0];
my $queue = $_[1];
my $dry = $_[2];
my $bam = $_[3];
my $jobName = $_[4];
my $intervalcount = $_[5];
my $cmd = "bsub -q $queue -o $output.sdout -w \"ended($output)\"";
if ($jobName) {
$cmd .= " -J $jobName";
}
if ($bam) {
$cmd .= " samtools merge $output ";
for (my $i = 1; $i <= $intervalcount; $i++) {
$cmd .= "$output.$i.bam ";
}
} else {
$cmd .= " cat ";
for (my $i = 1; $i <= $intervalcount; $i++) {
$cmd .= "$output.$i ";
}
$cmd .= "> $output";
}
execute($cmd, $dry);
}
sub execute {
my $cmd = $_[0];
my $dry = $_[1];
if ($dry) {
print "$cmd\n";
} else {
system($cmd);
}
}