From 84c8374e68296356cca1d3175eaaffc1323f9ba0 Mon Sep 17 00:00:00 2001 From: ebanks Date: Sun, 13 Sep 2009 05:53:18 +0000 Subject: [PATCH] 2nd stab at creating a pipeline [not quite finished yet]. Modularized the system to emulate what it will look like in firehose. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1605 348d0f76-0448-11de-a6fe-93d51630548a --- perl/1kgScripts/runCleaningPipeline.pl | 91 +++++++++++++++ perl/1kgScripts/runPilot1Pipeline.pl | 64 +++++++++++ perl/1kgScripts/runPilot2Pipeline.pl | 83 ++++++++++++++ perl/1kgScripts/runPilotPipelines.sh | 2 + perl/enqueueGATKjobsByChromosome.pl | 111 ------------------ perl/splitAndEnqueueGATKjobs.pl | 151 +++++++++++++++++++++++++ 6 files changed, 391 insertions(+), 111 deletions(-) create mode 100755 perl/1kgScripts/runCleaningPipeline.pl create mode 100755 perl/1kgScripts/runPilot1Pipeline.pl create mode 100755 perl/1kgScripts/runPilot2Pipeline.pl create mode 100755 perl/1kgScripts/runPilotPipelines.sh delete mode 100755 perl/enqueueGATKjobsByChromosome.pl create mode 100755 perl/splitAndEnqueueGATKjobs.pl diff --git a/perl/1kgScripts/runCleaningPipeline.pl b/perl/1kgScripts/runCleaningPipeline.pl new file mode 100755 index 000000000..f38faf6a2 --- /dev/null +++ b/perl/1kgScripts/runCleaningPipeline.pl @@ -0,0 +1,91 @@ +#!/usr/bin/perl -w + +use strict; +use Getopt::Long; + +sub usage { + print "Usage: perl runCleaningPipeline.pl\n\t-i \n\t-odir \n\t-obam \n\t[-sting Sting dir]\n\t[-inject]\n\t[-q farm queue; default:gsa]\n\t[-wait farm wait id]\n\t[-job final farm job name]\n\t[-dry]\n"; + exit(1); +} + +my $inputBam = undef; +my $outputDir = undef; +my $outputBam = undef; +my $jobName = undef; +my $wait = undef; +my $dry; +my $inject; +my $queue = "gsa"; +my $sting = "/humgen/gsa-scr1/ebanks/Sting"; + +GetOptions( "i=s" => \$inputBam, + "odir=s" => \$outputDir, + "obam=s" => \$outputBam, + "q:s" => \$queue, + "dry!" => \$dry, + "inject!" => \$inject, + "job:s" => \$jobName, + "wait:s" => \$wait, + "sting:s" => \$sting ); + +usage() if ( !$inputBam || !$outputDir || !$outputBam ); + +my $indelIntervals = "$outputDir/$outputBam.indels.intervals"; +my $command = "perl $sting/perl/splitAndEnqueueGATKjobs.pl -cmd \"java -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T IndelIntervals -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam\" -o $indelIntervals -oarg o -j $outputDir/$outputBam.intervals -q $queue"; +if ($dry) { + $command .= " -dry"; +} +if ($wait) { + $command .= " -wait $wait"; +} +system($command); + +my $mismatchIntervals = "$outputDir/$outputBam.mismatches.intervals"; +$command = "perl $sting/perl/splitAndEnqueueGATKjobs.pl -cmd \"java -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T MismatchIntervals -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam\" -o $mismatchIntervals -oarg o -j $outputDir/$outputBam.intervals -q $queue"; +if ($dry) { + $command .= " -dry"; +} +if ($wait) { + $command .= " -wait $wait"; +} +system($command); + +my $mergedIntervals = "$outputDir/$outputBam.merged.intervals"; +$command = "perl $sting/perl/splitAndEnqueueGATKjobs.pl -cmd \"java -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T IntervalMerger -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam -intervals $indelIntervals -intervals $mismatchIntervals\" -o $mergedIntervals -oarg o -wait $outputDir/$outputBam.intervals -j $outputDir/$outputBam.merged -q $queue"; +if ($dry) { + $command .= " -dry"; +} +system($command); + +my $cleanedBam; +if ($inject) { + $cleanedBam = "$outputDir/$outputBam.cleaned.bam"; +} else { + $cleanedBam = "$outputDir/$outputBam"; +} +$command = "perl $sting/perl/splitAndEnqueueGATKjobs.pl -cmd \"java -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T IntervalCleaner -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam -compress 1"; +if ($inject) { + $command .= " -cleanedOnly\" -j $outputDir/$outputBam.cleaned"; +} else { + $command .= "\" -j $jobName"; +} +$command .= "\" -o $cleanedBam -oarg O -wait $outputDir/$outputBam.merged -q $queue -bam -i $mergedIntervals -n 50"; +if ($dry) { + $command .= " -dry"; +} +system($command); + + +if ($inject) { + my $bam = "$outputDir/$outputBam"; + $command = "bsub -q $queue -o $bam.sdout"; + if ($jobName) { + $command .= " -J $jobName"; + } + $command .= " -w \"ended($outputDir/$outputBam.cleaned)\" java -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T CleanedReadInjector -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam --output_bam $bam --cleaned_reads $cleanedBam -compress 1"; + if ($dry) { + print "$command\n"; + } else { + system($command); + } +} diff --git a/perl/1kgScripts/runPilot1Pipeline.pl b/perl/1kgScripts/runPilot1Pipeline.pl new file mode 100755 index 000000000..39ece6fb0 --- /dev/null +++ b/perl/1kgScripts/runPilot1Pipeline.pl @@ -0,0 +1,64 @@ +#!/usr/bin/perl -w + +use strict; +use Getopt::Long; + +sub usage { + print "Usage: perl runPilot1Pipeline.pl\n\t-i \n\t-o \n\t[-sting Sting dir]\n\t[-q farm queue; default:gsa]\n\t[-dry]\n"; + exit(1); +} + +my $inputDir = undef; +my $outputDir = undef; +my $dry; +my $queue = "gsa"; +my $sting = "/humgen/gsa-scr1/ebanks/Sting"; + +GetOptions( "i=s" => \$inputDir, + "odir=s" => \$outputDir, + "q:s" => \$queue, + "dry!" => \$dry, + "sting:s" => \$sting ); + +usage() if ( !$inputDir || !$outputDir ); + +my @samples = ("ceu","yri","chb_jpt"); + +foreach my $sample (@samples) { + + $inputBam = "$inputDir/low_coverage_$sample.bam"; + $outputHead = "$outputDir/$sample"; + clean($inputBam, $outputDir, "$outputHead.bam", $queue, $sting, $dry); +# call($outputBam, $outputHead, $queue, $sting, $dry); +} + +sub clean { + + my $inputBam = $_[0]; + my $outputDir = $_[1]; + my $outputBam = $_[2]; + my $queue = $_[3]; + my $sting = $_[4]; + my $dry = $_[5]; + + my $cmd = "perl $sting/perl/1kgScripts/runCleaningPipeline.pl -i $inputBam -odir $outputDir -obam $outputBam -q $queue -j $outputBam.cleaningpipeline -sting $sting"; + if ($dry) { + $cmd .= " -dry"; + } + system($cmd); +} + +sub call { + + my $inputBam = $_[0]; + my $outputHead = $_[1]; + my $queue = $_[2]; + my $sting = $_[3]; + my $dry = $_[4]; + + $cmd = "perl $sting/perl/1kgScripts/runCallingPipeline.pl -i $inputBam -o $outputHead -q $queue -wait $inputBam.cleaningpipeline -sting $sting"; + if ($dry) { + $cmd .= " -dry"; + } + system($cmd); +} diff --git a/perl/1kgScripts/runPilot2Pipeline.pl b/perl/1kgScripts/runPilot2Pipeline.pl new file mode 100755 index 000000000..194ac293c --- /dev/null +++ b/perl/1kgScripts/runPilot2Pipeline.pl @@ -0,0 +1,83 @@ +#!/usr/bin/perl -w + +use strict; +use Getopt::Long; + +sub usage { + print "Usage: perl runPilot2Pipeline.pl\n\t-i \n\t-o \n\t[-sting Sting dir]\n\t[-q farm queue; default:gsa]\n\t[-dry]\n"; + exit(1); +} + +my $inputDir = undef; +my $outputDir = undef; +my $dry; +my $queue = "gsa"; +my $sting = "/humgen/gsa-scr1/ebanks/Sting"; + +GetOptions( "i=s" => \$inputDir, + "odir=s" => \$outputDir, + "q:s" => \$queue, + "dry!" => \$dry, + "sting:s" => \$sting ); + +usage() if ( !$inputDir || !$outputDir ); + +my @samples = ("NA19238","NA19239","NA19240","NA12878","NA12891","NA12892"); + +foreach my $sample (@samples) { + + my $inputBamSLX = "$inputDir/$sample.pilot2.SLX.bam"; + my $outputHead = "$outputDir/$sample.SLX"; + clean($inputBamSLX, $outputDir, "$outputHead.bam", $queue, $sting, $dry); +# call($outputBam, $outputHead, $queue, $sting, $dry); + + if ($sample eq "NA12878" || $sample eq "NA19240") { + my $inputBamSOLID = "$inputDir/$sample.pilot2.SOLID.bam"; + $outputHead = "$outputDir/$sample.SOLID"; + clean($inputBamSOLID, $outputDir, "$outputHead.bam", $queue, $sting, $dry); +# call($outputBam, $outputHead, $queue, $sting, $dry); + + my $inputBam454 = "$inputDir/$sample.pilot2.454.bam"; + $outputHead = "$outputDir/$sample.454"; +# call($inputBam454, $outputHead, $queue, $sting, $dry); + + my @inputBams = ($inputBamSOLID, $inputBam454); + $outputHead = "$outputDir/$sample.SOLID_454"; +# multicall($inputBams, $outputHead, $queue, $sting, $dry); + + @inputBams = ($inputBamSLX, $inputBamSOLID, $inputBam454); + $outputHead = "$outputDir/$sample.allTechs"; +# multicall($inputBams, $outputHead, $queue, $sting, $dry); + } +} + +sub clean { + + my $inputBam = $_[0]; + my $outputDir = $_[1]; + my $outputBam = $_[2]; + my $queue = $_[3]; + my $sting = $_[4]; + my $dry = $_[5]; + + my $cmd = "perl $sting/perl/1kgScripts/runCleaningPipeline.pl -i $inputBam -odir $outputDir -obam $outputBam -q $queue -inject -j $outputBam.cleaningpipeline -sting $sting"; + if ($dry) { + $cmd .= " -dry"; + } + system($cmd); +} + +sub call { + + my $inputBam = $_[0]; + my $outputHead = $_[1]; + my $queue = $_[2]; + my $sting = $_[3]; + my $dry = $_[4]; + + my $cmd = "perl $sting/perl/1kgScripts/runCallingPipeline.pl -i $inputBam -o $outputHead -q $queue -snps -wait $inputBam.cleaningpipeline -sting $sting"; + if ($dry) { + $cmd .= " -dry"; + } + system($cmd); +} diff --git a/perl/1kgScripts/runPilotPipelines.sh b/perl/1kgScripts/runPilotPipelines.sh new file mode 100755 index 000000000..3c3230815 --- /dev/null +++ b/perl/1kgScripts/runPilotPipelines.sh @@ -0,0 +1,2 @@ +#perl /humgen/gsa-scr1/ebanks/Sting/perl/1kgScripts/runPilot1Pipeline.pl -i /humgen/gsa-hphome1/projects/1kg_pilot1/mergedBamsByPopulation -o /broad/hptmp/ebanks/1kg_pilot1/cleaned -dry +perl /humgen/gsa-scr1/ebanks/Sting/perl/1kgScripts/runPilot2Pipeline.pl -i /broad/1KG/DCC_merged/freeze5.2 -o /broad/hptmp/ebanks/1kg_pilot2/cleaned -dry diff --git a/perl/enqueueGATKjobsByChromosome.pl b/perl/enqueueGATKjobsByChromosome.pl deleted file mode 100755 index 3685f76ba..000000000 --- a/perl/enqueueGATKjobsByChromosome.pl +++ /dev/null @@ -1,111 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use Getopt::Long; - -my $walker = undef; -my $pilot = "pilot2"; -my $queue = "gsa"; -my $tech = "SLX"; -my $jar = "/humgen/gsa-scr1/ebanks/Sting/dist/GenomeAnalysisTK.jar"; - -GetOptions( "T=s" => \$walker, - "p:s" => \$pilot, - "q:s" => \$queue, - "tech:s" => \$tech, - "j:s" => \$jar ); - -exit(1) if ( !$walker ); - -my @samples; -if ($pilot eq "pilot1") { - @samples = ("CEU","YRI","CHB-JPT"); -} elsif ($pilot eq "pilot2") { - @samples = ("NA19238","NA19239","NA19240","NA12878","NA12891","NA12892"); -} - -foreach my $sample (@samples) { - - my $num = 1; - while ($num < 23) { - enqueue($sample, $num, $pilot, $queue, $jar, $walker, $tech); - $num++; - } - - enqueue($sample, "X", $pilot, $queue, $jar, $walker, $tech); - enqueue($sample, "Y", $pilot, $queue, $jar, $walker, $tech); -} - -sub enqueue { - - my $sample = $_[0]; - my $chr = $_[1]; - my $pilot = $_[2]; - my $queue = $_[3]; - my $jar = $_[4]; - my $walker = $_[5]; - my $tech = $_[6]; - - my $inputBam; - if ($pilot eq "pilot2") { - $inputBam = "/broad/1KG/DCC/ftp/pilot_data/$sample/alignment/$sample.chrom$chr.$tech.SRP000032.2009_07.bam"; - } else { - $inputBam = "/broad/hptmp/ebanks/1kg_pilot1/".$sample."_BAMS.list"; - } - - my $outputDir; - if ($pilot eq "pilot1") { - $outputDir = "/broad/hptmp/ebanks/1kg_pilot1/cleaned"; - } else { - $outputDir = "/broad/hptmp/ebanks/1kg_pilot2/cleaned"; - } - - my $cmd; - my $outputFile; - - SWITCH: { - $walker eq "IndelIntervals" && do { - $outputFile = "$outputDir/intervals/$sample.chr$chr.$tech.indels.intervals"; - $cmd = "bsub -q $queue -o $outputFile.sdout java -Xmx4096m -jar $jar -S SILENT -T IndelIntervals -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam -o $outputFile -L $chr"; - last SWITCH; - }; - $walker eq "MismatchIntervals" && do { - $outputFile = "$outputDir/intervals/$sample.chr$chr.$tech.mismatches.intervals"; - $cmd = "bsub -q $queue -o $outputFile.sdout java -Xmx4096m -jar $jar -S SILENT -T MismatchIntervals -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam -o $outputFile -L $chr"; - last SWITCH; - }; - $walker eq "SNPClusters" && do { - $outputFile = "$outputDir/intervals/$sample.chr$chr.$tech.clusters.intervals"; - $cmd = "bsub -q $queue -o $outputFile.sdout java -Xmx4096m -jar $jar -S SILENT -T SNPClusters -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam -o $outputFile -L $chr"; - last SWITCH; - }; - $walker eq "IntervalMerger" && do { - $outputFile = "$outputDir/intervals/$sample.chr$chr.$tech.merged.intervals"; - $cmd = "bsub -q $queue -o $outputFile.sdout java -Xmx4096m -jar $jar -S SILENT -T IntervalMerger -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam -o $outputFile -L $chr -intervals $outputDir/intervals/$sample.chr$chr.$tech.indels.intervals -intervals $outputDir/intervals/$sample.chr$chr.$tech.mismatches.intervals"; - last SWITCH; - }; - $walker eq "IntervalCleaner" && do { - if ($pilot eq "pilot2") { - $outputFile = "$outputDir/cleaner/$sample.chr$chr.$tech.bam"; - } else { - $outputFile = "$outputDir/bams/$sample.chr$chr.$tech.bam"; - } - $cmd = "bsub -q $queue -o $outputFile.sdout java -Xmx4096m -jar $jar -S SILENT -T IntervalCleaner -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam -O $outputFile -L $chr -intervals $outputDir/intervals/$sample.chr$chr.$tech.merged.intervals -compress 1"; - if ($pilot eq "pilot2") { - $cmd .= " -cleanedOnly"; - } - last SWITCH; - }; - $walker eq "CleanedReadInjector" && do { - $outputFile = "$outputDir/bams/$sample.chr$chr.$tech.bam"; - $cmd = "bsub -q $queue -o $outputFile.sdout java -Xmx4096m -jar $jar -S SILENT -T CleanedReadInjector -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam -o $outputFile -L $chr"; - last SWITCH; - }; - - print "$walker is not a supported class\n"; - exit(1); - } - -# print "$cmd\n"; - system($cmd); -} diff --git a/perl/splitAndEnqueueGATKjobs.pl b/perl/splitAndEnqueueGATKjobs.pl new file mode 100755 index 000000000..388c6fed7 --- /dev/null +++ b/perl/splitAndEnqueueGATKjobs.pl @@ -0,0 +1,151 @@ +#!/usr/bin/perl -w + +# Runs a given GATK walker genome-wide, but splits up the jobs +# and then merges the results together. One should really use +# scatter-gather here, but I wanted to test against that framework +# (to ensure consistency) so I wrote this temporary script. +# Also, it allowed me to add in farm commands for wait ids +# (which is currently unavailable in scatter-gather). +# If intervals file is left blank, it splits by chromosome. + +use strict; +use Getopt::Long; + +sub usage { + print "Usage: perl enqueueGATKjobsByChromosome.pl\n\t-cmd \n\t-o \n\t[-oarg GATK output argument; default:o]\n\t[-i intervals file]\n\t[-n number of splits to make]\n\t[-q farm queue; default:gsa]\n\t[-wait farm wait id]\n\t[-job farm job name]\n\t[-dry]\n\t[-bam is output format bam?; default:no]\n"; + exit(1); +} + +my $cmd = undef; +my $output = undef; +my $outputArg = "o"; +my $jobName = undef; +my $wait = undef; +my $queue = "gsa"; +my $breaks = 24; +my $intervalsFile = undef; +my $dry; +my $isBam; + +GetOptions( "cmd=s" => \$cmd, + "o=s" => \$output, + "oarg:s" => \$outputArg, + "job:s" => \$jobName, + "wait:s" => \$wait, + "n:i" => \$breaks, + "i:s" => \$intervalsFile, + "dry!" => \$dry, + "bam!" => \$isBam, + "q:s" => \$queue); + +usage() if ( !$cmd || !$output ); + +my @intervals; +if (!$intervalsFile) { + my $num = 1; + while ($num < 23) { + push(@intervals, $num); + $num++; + } + push(@intervals, "X"); + push(@intervals, "Y"); +} else { + open(FILE, "< $intervalsFile") or die "can't open $intervalsFile: $!"; + my @lines = ; + chomp(@lines); + my $linecount = scalar(@lines); + if ($linecount < $breaks) { + $breaks = $linecount; + } + + my $linesPerJob = $linecount / $breaks; + my $index = 0; + for (my $i = 1; $i < $breaks; $i++) { + my $interval = ""; + for (my $j = 0; $j < $linesPerJob; $j++) { + $interval .= "$lines[$index];"; + $index++; + } + push(@intervals, $interval); + } + my $interval = ""; + while ($index < $linecount) { + $interval .= "$lines[$index];"; + $index++; + } + push(@intervals, $interval); +} + +my $intervalcount = scalar(@intervals); +for (my $i = 0; $i < $intervalcount; $i++) { + enqueue($intervals[$i], $cmd, $output, $outputArg, $wait, $queue, $dry, $isBam, ($i+1)); +} + +mergeResults($output, $queue, $dry, $isBam, $jobName, $intervalcount); + +sub enqueue { + + my $interval = $_[0]; + my $cmd = $_[1]; + my $outArg = $_[3]; + my $waitid = $_[4]; + my $queue = $_[5]; + my $dry = $_[6]; + my $bam = $_[7]; + my $index = $_[8]; + + my $output = "$_[2].$index"; + if ($bam) { + $output .= ".bam"; + } + + my $bsub = "bsub -q $queue -o $output.sdout -J $_[2]"; + if ($waitid) { + $bsub .= " -w \"ended($waitid)\""; + } + + my $command = "$bsub $cmd -$outputArg $output -L $interval"; + execute($command, $dry); +} + +sub mergeResults { + + my $output = $_[0]; + my $queue = $_[1]; + my $dry = $_[2]; + my $bam = $_[3]; + my $jobName = $_[4]; + my $intervalcount = $_[5]; + + my $cmd = "bsub -q $queue -o $output.sdout -w \"ended($output)\""; + if ($jobName) { + $cmd .= " -J $jobName"; + } + + if ($bam) { + $cmd .= " samtools merge $output "; + for (my $i = 1; $i <= $intervalcount; $i++) { + $cmd .= "$output.$i.bam "; + } + } else { + $cmd .= " cat "; + for (my $i = 1; $i <= $intervalcount; $i++) { + $cmd .= "$output.$i "; + } + $cmd .= "> $output"; + } + + execute($cmd, $dry); +} + +sub execute { + + my $cmd = $_[0]; + my $dry = $_[1]; + + if ($dry) { + print "$cmd\n"; + } else { + system($cmd); + } +}