2nd stab at creating a pipeline [not quite finished yet].

Modularized the system to emulate what it will look like in firehose.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1605 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-09-13 05:53:18 +00:00
parent 794bd26b20
commit 84c8374e68
6 changed files with 391 additions and 111 deletions

View File

@ -0,0 +1,91 @@
#!/usr/bin/perl -w
use strict;
use Getopt::Long;
sub usage {
print "Usage: perl runCleaningPipeline.pl\n\t-i <input bam>\n\t-odir <output directory>\n\t-obam <output bam name>\n\t[-sting Sting dir]\n\t[-inject]\n\t[-q farm queue; default:gsa]\n\t[-wait farm wait id]\n\t[-job final farm job name]\n\t[-dry]\n";
exit(1);
}
my $inputBam = undef;
my $outputDir = undef;
my $outputBam = undef;
my $jobName = undef;
my $wait = undef;
my $dry;
my $inject;
my $queue = "gsa";
my $sting = "/humgen/gsa-scr1/ebanks/Sting";
GetOptions( "i=s" => \$inputBam,
"odir=s" => \$outputDir,
"obam=s" => \$outputBam,
"q:s" => \$queue,
"dry!" => \$dry,
"inject!" => \$inject,
"job:s" => \$jobName,
"wait:s" => \$wait,
"sting:s" => \$sting );
usage() if ( !$inputBam || !$outputDir || !$outputBam );
my $indelIntervals = "$outputDir/$outputBam.indels.intervals";
my $command = "perl $sting/perl/splitAndEnqueueGATKjobs.pl -cmd \"java -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T IndelIntervals -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam\" -o $indelIntervals -oarg o -j $outputDir/$outputBam.intervals -q $queue";
if ($dry) {
$command .= " -dry";
}
if ($wait) {
$command .= " -wait $wait";
}
system($command);
my $mismatchIntervals = "$outputDir/$outputBam.mismatches.intervals";
$command = "perl $sting/perl/splitAndEnqueueGATKjobs.pl -cmd \"java -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T MismatchIntervals -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam\" -o $mismatchIntervals -oarg o -j $outputDir/$outputBam.intervals -q $queue";
if ($dry) {
$command .= " -dry";
}
if ($wait) {
$command .= " -wait $wait";
}
system($command);
my $mergedIntervals = "$outputDir/$outputBam.merged.intervals";
$command = "perl $sting/perl/splitAndEnqueueGATKjobs.pl -cmd \"java -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T IntervalMerger -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam -intervals $indelIntervals -intervals $mismatchIntervals\" -o $mergedIntervals -oarg o -wait $outputDir/$outputBam.intervals -j $outputDir/$outputBam.merged -q $queue";
if ($dry) {
$command .= " -dry";
}
system($command);
my $cleanedBam;
if ($inject) {
$cleanedBam = "$outputDir/$outputBam.cleaned.bam";
} else {
$cleanedBam = "$outputDir/$outputBam";
}
$command = "perl $sting/perl/splitAndEnqueueGATKjobs.pl -cmd \"java -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T IntervalCleaner -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam -compress 1";
if ($inject) {
$command .= " -cleanedOnly\" -j $outputDir/$outputBam.cleaned";
} else {
$command .= "\" -j $jobName";
}
$command .= "\" -o $cleanedBam -oarg O -wait $outputDir/$outputBam.merged -q $queue -bam -i $mergedIntervals -n 50";
if ($dry) {
$command .= " -dry";
}
system($command);
if ($inject) {
my $bam = "$outputDir/$outputBam";
$command = "bsub -q $queue -o $bam.sdout";
if ($jobName) {
$command .= " -J $jobName";
}
$command .= " -w \"ended($outputDir/$outputBam.cleaned)\" java -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T CleanedReadInjector -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam --output_bam $bam --cleaned_reads $cleanedBam -compress 1";
if ($dry) {
print "$command\n";
} else {
system($command);
}
}

View File

@ -0,0 +1,64 @@
#!/usr/bin/perl -w
use strict;
use Getopt::Long;
sub usage {
print "Usage: perl runPilot1Pipeline.pl\n\t-i <input dir>\n\t-o <output directory>\n\t[-sting Sting dir]\n\t[-q farm queue; default:gsa]\n\t[-dry]\n";
exit(1);
}
my $inputDir = undef;
my $outputDir = undef;
my $dry;
my $queue = "gsa";
my $sting = "/humgen/gsa-scr1/ebanks/Sting";
GetOptions( "i=s" => \$inputDir,
"odir=s" => \$outputDir,
"q:s" => \$queue,
"dry!" => \$dry,
"sting:s" => \$sting );
usage() if ( !$inputDir || !$outputDir );
my @samples = ("ceu","yri","chb_jpt");
foreach my $sample (@samples) {
$inputBam = "$inputDir/low_coverage_$sample.bam";
$outputHead = "$outputDir/$sample";
clean($inputBam, $outputDir, "$outputHead.bam", $queue, $sting, $dry);
# call($outputBam, $outputHead, $queue, $sting, $dry);
}
sub clean {
my $inputBam = $_[0];
my $outputDir = $_[1];
my $outputBam = $_[2];
my $queue = $_[3];
my $sting = $_[4];
my $dry = $_[5];
my $cmd = "perl $sting/perl/1kgScripts/runCleaningPipeline.pl -i $inputBam -odir $outputDir -obam $outputBam -q $queue -j $outputBam.cleaningpipeline -sting $sting";
if ($dry) {
$cmd .= " -dry";
}
system($cmd);
}
sub call {
my $inputBam = $_[0];
my $outputHead = $_[1];
my $queue = $_[2];
my $sting = $_[3];
my $dry = $_[4];
$cmd = "perl $sting/perl/1kgScripts/runCallingPipeline.pl -i $inputBam -o $outputHead -q $queue -wait $inputBam.cleaningpipeline -sting $sting";
if ($dry) {
$cmd .= " -dry";
}
system($cmd);
}

View File

@ -0,0 +1,83 @@
#!/usr/bin/perl -w
use strict;
use Getopt::Long;
sub usage {
print "Usage: perl runPilot2Pipeline.pl\n\t-i <input dir>\n\t-o <output directory>\n\t[-sting Sting dir]\n\t[-q farm queue; default:gsa]\n\t[-dry]\n";
exit(1);
}
my $inputDir = undef;
my $outputDir = undef;
my $dry;
my $queue = "gsa";
my $sting = "/humgen/gsa-scr1/ebanks/Sting";
GetOptions( "i=s" => \$inputDir,
"odir=s" => \$outputDir,
"q:s" => \$queue,
"dry!" => \$dry,
"sting:s" => \$sting );
usage() if ( !$inputDir || !$outputDir );
my @samples = ("NA19238","NA19239","NA19240","NA12878","NA12891","NA12892");
foreach my $sample (@samples) {
my $inputBamSLX = "$inputDir/$sample.pilot2.SLX.bam";
my $outputHead = "$outputDir/$sample.SLX";
clean($inputBamSLX, $outputDir, "$outputHead.bam", $queue, $sting, $dry);
# call($outputBam, $outputHead, $queue, $sting, $dry);
if ($sample eq "NA12878" || $sample eq "NA19240") {
my $inputBamSOLID = "$inputDir/$sample.pilot2.SOLID.bam";
$outputHead = "$outputDir/$sample.SOLID";
clean($inputBamSOLID, $outputDir, "$outputHead.bam", $queue, $sting, $dry);
# call($outputBam, $outputHead, $queue, $sting, $dry);
my $inputBam454 = "$inputDir/$sample.pilot2.454.bam";
$outputHead = "$outputDir/$sample.454";
# call($inputBam454, $outputHead, $queue, $sting, $dry);
my @inputBams = ($inputBamSOLID, $inputBam454);
$outputHead = "$outputDir/$sample.SOLID_454";
# multicall($inputBams, $outputHead, $queue, $sting, $dry);
@inputBams = ($inputBamSLX, $inputBamSOLID, $inputBam454);
$outputHead = "$outputDir/$sample.allTechs";
# multicall($inputBams, $outputHead, $queue, $sting, $dry);
}
}
sub clean {
my $inputBam = $_[0];
my $outputDir = $_[1];
my $outputBam = $_[2];
my $queue = $_[3];
my $sting = $_[4];
my $dry = $_[5];
my $cmd = "perl $sting/perl/1kgScripts/runCleaningPipeline.pl -i $inputBam -odir $outputDir -obam $outputBam -q $queue -inject -j $outputBam.cleaningpipeline -sting $sting";
if ($dry) {
$cmd .= " -dry";
}
system($cmd);
}
sub call {
my $inputBam = $_[0];
my $outputHead = $_[1];
my $queue = $_[2];
my $sting = $_[3];
my $dry = $_[4];
my $cmd = "perl $sting/perl/1kgScripts/runCallingPipeline.pl -i $inputBam -o $outputHead -q $queue -snps -wait $inputBam.cleaningpipeline -sting $sting";
if ($dry) {
$cmd .= " -dry";
}
system($cmd);
}

View File

@ -0,0 +1,2 @@
#perl /humgen/gsa-scr1/ebanks/Sting/perl/1kgScripts/runPilot1Pipeline.pl -i /humgen/gsa-hphome1/projects/1kg_pilot1/mergedBamsByPopulation -o /broad/hptmp/ebanks/1kg_pilot1/cleaned -dry
perl /humgen/gsa-scr1/ebanks/Sting/perl/1kgScripts/runPilot2Pipeline.pl -i /broad/1KG/DCC_merged/freeze5.2 -o /broad/hptmp/ebanks/1kg_pilot2/cleaned -dry

View File

@ -1,111 +0,0 @@
#!/usr/bin/perl -w
use strict;
use Getopt::Long;
my $walker = undef;
my $pilot = "pilot2";
my $queue = "gsa";
my $tech = "SLX";
my $jar = "/humgen/gsa-scr1/ebanks/Sting/dist/GenomeAnalysisTK.jar";
GetOptions( "T=s" => \$walker,
"p:s" => \$pilot,
"q:s" => \$queue,
"tech:s" => \$tech,
"j:s" => \$jar );
exit(1) if ( !$walker );
my @samples;
if ($pilot eq "pilot1") {
@samples = ("CEU","YRI","CHB-JPT");
} elsif ($pilot eq "pilot2") {
@samples = ("NA19238","NA19239","NA19240","NA12878","NA12891","NA12892");
}
foreach my $sample (@samples) {
my $num = 1;
while ($num < 23) {
enqueue($sample, $num, $pilot, $queue, $jar, $walker, $tech);
$num++;
}
enqueue($sample, "X", $pilot, $queue, $jar, $walker, $tech);
enqueue($sample, "Y", $pilot, $queue, $jar, $walker, $tech);
}
sub enqueue {
my $sample = $_[0];
my $chr = $_[1];
my $pilot = $_[2];
my $queue = $_[3];
my $jar = $_[4];
my $walker = $_[5];
my $tech = $_[6];
my $inputBam;
if ($pilot eq "pilot2") {
$inputBam = "/broad/1KG/DCC/ftp/pilot_data/$sample/alignment/$sample.chrom$chr.$tech.SRP000032.2009_07.bam";
} else {
$inputBam = "/broad/hptmp/ebanks/1kg_pilot1/".$sample."_BAMS.list";
}
my $outputDir;
if ($pilot eq "pilot1") {
$outputDir = "/broad/hptmp/ebanks/1kg_pilot1/cleaned";
} else {
$outputDir = "/broad/hptmp/ebanks/1kg_pilot2/cleaned";
}
my $cmd;
my $outputFile;
SWITCH: {
$walker eq "IndelIntervals" && do {
$outputFile = "$outputDir/intervals/$sample.chr$chr.$tech.indels.intervals";
$cmd = "bsub -q $queue -o $outputFile.sdout java -Xmx4096m -jar $jar -S SILENT -T IndelIntervals -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam -o $outputFile -L $chr";
last SWITCH;
};
$walker eq "MismatchIntervals" && do {
$outputFile = "$outputDir/intervals/$sample.chr$chr.$tech.mismatches.intervals";
$cmd = "bsub -q $queue -o $outputFile.sdout java -Xmx4096m -jar $jar -S SILENT -T MismatchIntervals -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam -o $outputFile -L $chr";
last SWITCH;
};
$walker eq "SNPClusters" && do {
$outputFile = "$outputDir/intervals/$sample.chr$chr.$tech.clusters.intervals";
$cmd = "bsub -q $queue -o $outputFile.sdout java -Xmx4096m -jar $jar -S SILENT -T SNPClusters -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam -o $outputFile -L $chr";
last SWITCH;
};
$walker eq "IntervalMerger" && do {
$outputFile = "$outputDir/intervals/$sample.chr$chr.$tech.merged.intervals";
$cmd = "bsub -q $queue -o $outputFile.sdout java -Xmx4096m -jar $jar -S SILENT -T IntervalMerger -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam -o $outputFile -L $chr -intervals $outputDir/intervals/$sample.chr$chr.$tech.indels.intervals -intervals $outputDir/intervals/$sample.chr$chr.$tech.mismatches.intervals";
last SWITCH;
};
$walker eq "IntervalCleaner" && do {
if ($pilot eq "pilot2") {
$outputFile = "$outputDir/cleaner/$sample.chr$chr.$tech.bam";
} else {
$outputFile = "$outputDir/bams/$sample.chr$chr.$tech.bam";
}
$cmd = "bsub -q $queue -o $outputFile.sdout java -Xmx4096m -jar $jar -S SILENT -T IntervalCleaner -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam -O $outputFile -L $chr -intervals $outputDir/intervals/$sample.chr$chr.$tech.merged.intervals -compress 1";
if ($pilot eq "pilot2") {
$cmd .= " -cleanedOnly";
}
last SWITCH;
};
$walker eq "CleanedReadInjector" && do {
$outputFile = "$outputDir/bams/$sample.chr$chr.$tech.bam";
$cmd = "bsub -q $queue -o $outputFile.sdout java -Xmx4096m -jar $jar -S SILENT -T CleanedReadInjector -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam -o $outputFile -L $chr";
last SWITCH;
};
print "$walker is not a supported class\n";
exit(1);
}
# print "$cmd\n";
system($cmd);
}

View File

@ -0,0 +1,151 @@
#!/usr/bin/perl -w
# Runs a given GATK walker genome-wide, but splits up the jobs
# and then merges the results together. One should really use
# scatter-gather here, but I wanted to test against that framework
# (to ensure consistency) so I wrote this temporary script.
# Also, it allowed me to add in farm commands for wait ids
# (which is currently unavailable in scatter-gather).
# If intervals file is left blank, it splits by chromosome.
use strict;
use Getopt::Long;
sub usage {
print "Usage: perl enqueueGATKjobsByChromosome.pl\n\t-cmd <GATK command without output or -L args>\n\t-o <output file>\n\t[-oarg GATK output argument; default:o]\n\t[-i intervals file]\n\t[-n number of splits to make]\n\t[-q farm queue; default:gsa]\n\t[-wait farm wait id]\n\t[-job farm job name]\n\t[-dry]\n\t[-bam is output format bam?; default:no]\n";
exit(1);
}
my $cmd = undef;
my $output = undef;
my $outputArg = "o";
my $jobName = undef;
my $wait = undef;
my $queue = "gsa";
my $breaks = 24;
my $intervalsFile = undef;
my $dry;
my $isBam;
GetOptions( "cmd=s" => \$cmd,
"o=s" => \$output,
"oarg:s" => \$outputArg,
"job:s" => \$jobName,
"wait:s" => \$wait,
"n:i" => \$breaks,
"i:s" => \$intervalsFile,
"dry!" => \$dry,
"bam!" => \$isBam,
"q:s" => \$queue);
usage() if ( !$cmd || !$output );
my @intervals;
if (!$intervalsFile) {
my $num = 1;
while ($num < 23) {
push(@intervals, $num);
$num++;
}
push(@intervals, "X");
push(@intervals, "Y");
} else {
open(FILE, "< $intervalsFile") or die "can't open $intervalsFile: $!";
my @lines = <FILE>;
chomp(@lines);
my $linecount = scalar(@lines);
if ($linecount < $breaks) {
$breaks = $linecount;
}
my $linesPerJob = $linecount / $breaks;
my $index = 0;
for (my $i = 1; $i < $breaks; $i++) {
my $interval = "";
for (my $j = 0; $j < $linesPerJob; $j++) {
$interval .= "$lines[$index];";
$index++;
}
push(@intervals, $interval);
}
my $interval = "";
while ($index < $linecount) {
$interval .= "$lines[$index];";
$index++;
}
push(@intervals, $interval);
}
my $intervalcount = scalar(@intervals);
for (my $i = 0; $i < $intervalcount; $i++) {
enqueue($intervals[$i], $cmd, $output, $outputArg, $wait, $queue, $dry, $isBam, ($i+1));
}
mergeResults($output, $queue, $dry, $isBam, $jobName, $intervalcount);
sub enqueue {
my $interval = $_[0];
my $cmd = $_[1];
my $outArg = $_[3];
my $waitid = $_[4];
my $queue = $_[5];
my $dry = $_[6];
my $bam = $_[7];
my $index = $_[8];
my $output = "$_[2].$index";
if ($bam) {
$output .= ".bam";
}
my $bsub = "bsub -q $queue -o $output.sdout -J $_[2]";
if ($waitid) {
$bsub .= " -w \"ended($waitid)\"";
}
my $command = "$bsub $cmd -$outputArg $output -L $interval";
execute($command, $dry);
}
sub mergeResults {
my $output = $_[0];
my $queue = $_[1];
my $dry = $_[2];
my $bam = $_[3];
my $jobName = $_[4];
my $intervalcount = $_[5];
my $cmd = "bsub -q $queue -o $output.sdout -w \"ended($output)\"";
if ($jobName) {
$cmd .= " -J $jobName";
}
if ($bam) {
$cmd .= " samtools merge $output ";
for (my $i = 1; $i <= $intervalcount; $i++) {
$cmd .= "$output.$i.bam ";
}
} else {
$cmd .= " cat ";
for (my $i = 1; $i <= $intervalcount; $i++) {
$cmd .= "$output.$i ";
}
$cmd .= "> $output";
}
execute($cmd, $dry);
}
sub execute {
my $cmd = $_[0];
my $dry = $_[1];
if ($dry) {
print "$cmd\n";
} else {
system($cmd);
}
}