diff --git a/perl/1kgScripts/runCallingPipeline.pl b/perl/1kgScripts/runCallingPipeline.pl new file mode 100755 index 000000000..780c4993d --- /dev/null +++ b/perl/1kgScripts/runCallingPipeline.pl @@ -0,0 +1,85 @@ +#!/usr/bin/perl -w + +use strict; +use Getopt::Long; + +sub usage { + print "Usage: perl runCallingPipeline.pl\n\t-i \n\t-o \n\t[-sting Sting dir]\n\t[-frac multiplier for indel fractions]\n\t[-snps Should we call snps?]\n\t[-sample for writing vcf with -snps]\n\t[-badsnps bad snps file from cleaning with -snps]\n\t[-doc DepthOfCoverage for filtering with -snps]\n\t[-mq mapping quality zero for filtering with -snps ]\n\t[-q farm queue; default:gsa]\n\t[-wait farm wait id]\n\t[-dry]\n"; + exit(1); +} + +my $inputBamStr = undef; +my $outputHead = undef; +my $wait = undef; +my $dry; +my $snps; +my $badsnps = undef; +my $sample; +my $doc = 100; +my $mq = 100; +my $indelFractionMultiplier = ""; +my $queue = "gsa"; +my $sting = "/humgen/gsa-scr1/ebanks/Sting"; + +GetOptions( "i=s" => \$inputBamStr, + "o=s" => \$outputHead, + "q:s" => \$queue, + "dry!" => \$dry, + "snps!" => \$snps, + "sample:s" => \$sample, + "doc:s" => \$doc, + "mq:s" => \$mq, + "frac:s" => \$indelFractionMultiplier, + "badsnps:s" => \$badsnps, + "wait:s" => \$wait, + "sting:s" => \$sting ); + +usage() if ( !$inputBamStr || !$outputHead ); + +my $indelsHigh = "$outputHead.indels.high.calls"; +my $bsub = "bsub -q $queue -o $indelsHigh.sdout"; +if ($wait) { + $bsub .= " -w \"ended($wait)\""; +} +my $command = "java -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T IndelGenotyper -R /broad/1KG/reference/human_b36_both.fasta $inputBamStr -minConsensusFraction 0.5 -minCnt 2 -1kg -minFraction 0.".$indelFractionMultiplier."3 -o $indelsHigh"; +execute("$bsub $command", $dry); + +my $indelsLow = "$outputHead.indels.low.calls"; +$bsub = "bsub -q $queue -o $indelsLow.sdout -J $outputHead"; +if ($wait) { + $bsub .= " -w \"ended($wait)\""; +} +$command = "java -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T IndelGenotyper -R /broad/1KG/reference/human_b36_both.fasta $inputBamStr -minConsensusFraction 0.5 -minCnt 2 -1kg -minFraction 0.".$indelFractionMultiplier."1 -o $indelsLow"; +execute("$bsub $command", $dry); + +if ($snps) { + my $snpsFile = "$outputHead.snps.unfiltered.calls"; + $bsub = "bsub -q $queue -o $snpsFile.sdout -J $outputHead"; + if ($wait) { + $bsub .= " -w \"ended($wait)\""; + } + $command = "java -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T SingleSampleGenotyper -R /broad/1KG/reference/human_b36_both.fasta $inputBamStr -varout $snpsFile -lod 5.0"; + execute("$bsub $command", $dry); + + my $filterFile = "$outputHead.snps.filtered.calls"; + my $vcfFile = "$outputHead.snps.vcf"; + $bsub = "bsub -q $queue -o $filterFile.sdout -w \"ended($outputHead)\""; + $command = "java -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T VariantFiltration -R /broad/1KG/reference/human_b36_both.fasta $inputBamStr -vcf $vcfFile -included $filterFile -sample $sample -B dbsnp,dbsnp,/humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod,variant,Variants,$snpsFile,"; + if ($badsnps) { + $command .= "cleaned,CleanedOutSNP,$badsnps,"; + } + $command .= "indels,SimpleIndel,$indelsLow -X DepthOfCoverage:max=$doc -X AlleleBalance:low=0.25,high=0.75 -X FisherStrand:pvalue=0.00001 -X LodThreshold:lod=5 -X MappingQualityZero:max=$mq -X IndelArtifact -X ClusteredSnps:window=7,snps=3"; + execute("$bsub $command", $dry); +} + +sub execute { + + my $cmd = $_[0]; + my $dry = $_[1]; + + if ($dry) { + print "$cmd\n"; + } else { + system($cmd); + } +} diff --git a/perl/1kgScripts/runCleaningPipeline.pl b/perl/1kgScripts/runCleaningPipeline.pl index f38faf6a2..4b8b5f80d 100755 --- a/perl/1kgScripts/runCleaningPipeline.pl +++ b/perl/1kgScripts/runCleaningPipeline.pl @@ -4,34 +4,34 @@ use strict; use Getopt::Long; sub usage { - print "Usage: perl runCleaningPipeline.pl\n\t-i \n\t-odir \n\t-obam \n\t[-sting Sting dir]\n\t[-inject]\n\t[-q farm queue; default:gsa]\n\t[-wait farm wait id]\n\t[-job final farm job name]\n\t[-dry]\n"; + print "Usage: perl runCleaningPipeline.pl\n\t-i \n\t-obam \n\t[-sting Sting dir]\n\t[-badsnps badsnps file name]\n\t[-inject]\n\t[-q farm queue; default:gsa]\n\t[-wait farm wait id]\n\t[-job final farm job name]\n\t[-dry]\n"; exit(1); } my $inputBam = undef; -my $outputDir = undef; my $outputBam = undef; my $jobName = undef; my $wait = undef; +my $badsnps = undef; my $dry; my $inject; my $queue = "gsa"; my $sting = "/humgen/gsa-scr1/ebanks/Sting"; GetOptions( "i=s" => \$inputBam, - "odir=s" => \$outputDir, "obam=s" => \$outputBam, "q:s" => \$queue, "dry!" => \$dry, "inject!" => \$inject, "job:s" => \$jobName, "wait:s" => \$wait, + "badsnps:s" => \$badsnps, "sting:s" => \$sting ); -usage() if ( !$inputBam || !$outputDir || !$outputBam ); +usage() if ( !$inputBam || !$outputBam ); -my $indelIntervals = "$outputDir/$outputBam.indels.intervals"; -my $command = "perl $sting/perl/splitAndEnqueueGATKjobs.pl -cmd \"java -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T IndelIntervals -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam\" -o $indelIntervals -oarg o -j $outputDir/$outputBam.intervals -q $queue"; +my $indelIntervals = "$outputBam.indels.intervals"; +my $command = "perl $sting/perl/splitAndEnqueueGATKjobs.pl -cmd \"java -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T IndelIntervals -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam\" -o $indelIntervals -oarg o -j $outputBam.intervals -q $queue"; if ($dry) { $command .= " -dry"; } @@ -40,8 +40,8 @@ if ($wait) { } system($command); -my $mismatchIntervals = "$outputDir/$outputBam.mismatches.intervals"; -$command = "perl $sting/perl/splitAndEnqueueGATKjobs.pl -cmd \"java -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T MismatchIntervals -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam\" -o $mismatchIntervals -oarg o -j $outputDir/$outputBam.intervals -q $queue"; +my $mismatchIntervals = "$outputBam.mismatches.intervals"; +$command = "perl $sting/perl/splitAndEnqueueGATKjobs.pl -cmd \"java -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T MismatchIntervals -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam\" -o $mismatchIntervals -oarg o -j $outputBam.intervals -q $queue"; if ($dry) { $command .= " -dry"; } @@ -50,8 +50,8 @@ if ($wait) { } system($command); -my $mergedIntervals = "$outputDir/$outputBam.merged.intervals"; -$command = "perl $sting/perl/splitAndEnqueueGATKjobs.pl -cmd \"java -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T IntervalMerger -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam -intervals $indelIntervals -intervals $mismatchIntervals\" -o $mergedIntervals -oarg o -wait $outputDir/$outputBam.intervals -j $outputDir/$outputBam.merged -q $queue"; +my $mergedIntervals = "$outputBam.merged.intervals"; +$command = "perl $sting/perl/splitAndEnqueueGATKjobs.pl -cmd \"java -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T IntervalMerger -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam -intervals $indelIntervals -intervals $mismatchIntervals\" -o $mergedIntervals -oarg o -wait $outputBam.intervals -j $outputBam.merged -q $queue"; if ($dry) { $command .= " -dry"; } @@ -59,30 +59,45 @@ system($command); my $cleanedBam; if ($inject) { - $cleanedBam = "$outputDir/$outputBam.cleaned.bam"; + $cleanedBam = "$outputBam.cleaned.bam"; } else { - $cleanedBam = "$outputDir/$outputBam"; + $cleanedBam = "$outputBam"; } $command = "perl $sting/perl/splitAndEnqueueGATKjobs.pl -cmd \"java -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T IntervalCleaner -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam -compress 1"; if ($inject) { - $command .= " -cleanedOnly\" -j $outputDir/$outputBam.cleaned"; + $command .= " -cleanedOnly\" -j $outputBam.cleaned"; } else { $command .= "\" -j $jobName"; } -$command .= "\" -o $cleanedBam -oarg O -wait $outputDir/$outputBam.merged -q $queue -bam -i $mergedIntervals -n 50"; +$command .= " -o $cleanedBam -oarg O -wait $outputBam.merged -q $queue -bam -i $mergedIntervals -n 50"; if ($dry) { $command .= " -dry"; } system($command); +my $snpsFile = $badsnps; +if (!$snpsFile) { + $snpsFile = "$outputBam.badsnps"; +} +$command = "perl $sting/perl/splitAndEnqueueGATKjobs.pl -cmd \"java -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T IntervalCleaner -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam\""; +if ($inject) { + $command .= " -j $outputBam.cleaned"; +} else { + $command .= " -j $jobName"; +} +$command .= "\" -o $snpsFile -oarg snps -wait $outputBam.merged -q $queue -i $mergedIntervals -n 50"; +if ($dry) { + $command .= " -dry"; +} +system($command); if ($inject) { - my $bam = "$outputDir/$outputBam"; + my $bam = "$outputBam"; $command = "bsub -q $queue -o $bam.sdout"; if ($jobName) { $command .= " -J $jobName"; } - $command .= " -w \"ended($outputDir/$outputBam.cleaned)\" java -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T CleanedReadInjector -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam --output_bam $bam --cleaned_reads $cleanedBam -compress 1"; + $command .= " -w \"ended($outputBam.cleaned)\" java -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T CleanedReadInjector -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam --output_bam $bam --cleaned_reads $cleanedBam -compress 1"; if ($dry) { print "$command\n"; } else { diff --git a/perl/1kgScripts/runPilot1Pipeline.pl b/perl/1kgScripts/runPilot1Pipeline.pl index 39ece6fb0..a73dc6a1e 100755 --- a/perl/1kgScripts/runPilot1Pipeline.pl +++ b/perl/1kgScripts/runPilot1Pipeline.pl @@ -26,22 +26,24 @@ my @samples = ("ceu","yri","chb_jpt"); foreach my $sample (@samples) { - $inputBam = "$inputDir/low_coverage_$sample.bam"; - $outputHead = "$outputDir/$sample"; - clean($inputBam, $outputDir, "$outputHead.bam", $queue, $sting, $dry); -# call($outputBam, $outputHead, $queue, $sting, $dry); + my $inputBam = "$inputDir/low_coverage_$sample.bam"; + my $outputHead = "$outputDir/$sample"; + my $outputBam = "$outputHead.bam"; + my $badsnps = "$outputBam.badsnps"; + clean($inputBam, $outputBam, $queue, $sting, $dry, $badsnps); + call("-I $outputBam", $outputHead, $queue, $sting, $dry, "$inputBam.cleaningpipeline", $sample, $badsnps); } sub clean { my $inputBam = $_[0]; - my $outputDir = $_[1]; - my $outputBam = $_[2]; - my $queue = $_[3]; - my $sting = $_[4]; - my $dry = $_[5]; + my $outputBam = $_[1]; + my $queue = $_[2]; + my $sting = $_[3]; + my $dry = $_[4]; + my $badsnps = $_[5]; - my $cmd = "perl $sting/perl/1kgScripts/runCleaningPipeline.pl -i $inputBam -odir $outputDir -obam $outputBam -q $queue -j $outputBam.cleaningpipeline -sting $sting"; + my $cmd = "perl $sting/perl/1kgScripts/runCleaningPipeline.pl -i $inputBam -obam $outputBam -q $queue -j $outputBam.cleaningpipeline -sting $sting -badsnps $badsnps"; if ($dry) { $cmd .= " -dry"; } @@ -50,15 +52,21 @@ sub clean { sub call { - my $inputBam = $_[0]; + my $inputBams = $_[0]; my $outputHead = $_[1]; my $queue = $_[2]; my $sting = $_[3]; my $dry = $_[4]; + my $wait = $_[5]; + my $sample = $_[6]; + my $badsnps = $_[7]; - $cmd = "perl $sting/perl/1kgScripts/runCallingPipeline.pl -i $inputBam -o $outputHead -q $queue -wait $inputBam.cleaningpipeline -sting $sting"; + my $cmd = "perl $sting/perl/1kgScripts/runCallingPipeline.pl -i $inputBams -o $outputHead -q $queue -sting $sting -frac 0 -sample $sample -badsnps $badsnps"; if ($dry) { $cmd .= " -dry"; } + if ($wait) { + $cmd .= " -wait $wait"; + } system($cmd); } diff --git a/perl/1kgScripts/runPilot2Pipeline.pl b/perl/1kgScripts/runPilot2Pipeline.pl index 194ac293c..92f5da232 100755 --- a/perl/1kgScripts/runPilot2Pipeline.pl +++ b/perl/1kgScripts/runPilot2Pipeline.pl @@ -24,43 +24,60 @@ usage() if ( !$inputDir || !$outputDir ); my @samples = ("NA19238","NA19239","NA19240","NA12878","NA12891","NA12892"); +# Official genome-wide Depth of Coverage tables for pilot 2, freeze 5: +# NA12878 NA12891 NA12892 NA19238 NA19239 NA19240 +# 454: 36 18 +# SLX: 82 91 70 56 68 86 +# SOLID: 37 64 +# 454+SLD: 64 77 +# ALL: 138 150 + +my %DoC_454 = ( "NA12878" => 36, "NA19240" => 18 ); +my %DoC_slx = ( "NA12878" => 82, "NA12891" => 91, "NA12892" => 70,"NA19238" => 56, "NA19239" => 68, "NA19240" => 86 ); +my %DoC_solid = ( "NA12878" => 37, "NA19240" => 64 ); +my %DoC_454solid = ( "NA12878" => 64, "NA19240" => 77 ); +my %DoC_all = ( "NA12878" => 138, "NA19240" => 150 ); +my %MQ_hash = ( "SLX" => 100, "SOLID" => 5, "454" => 5, "454SOLID" => 10, "ALL" => 110 ); + foreach my $sample (@samples) { my $inputBamSLX = "$inputDir/$sample.pilot2.SLX.bam"; - my $outputHead = "$outputDir/$sample.SLX"; - clean($inputBamSLX, $outputDir, "$outputHead.bam", $queue, $sting, $dry); -# call($outputBam, $outputHead, $queue, $sting, $dry); + my $outputHeadSLX = "$outputDir/$sample.SLX"; + my $outputBamSLX = "$outputHeadSLX.bam"; + my $badsnpsSLX = "$outputDir/$outputBamSLX.badsnps"; + clean($inputBamSLX, $outputBamSLX, $queue, $sting, $dry, $badsnpsSLX); + call("-I $outputBamSLX", $outputHeadSLX, $queue, $sting, $dry, "$inputBamSLX.cleaningpipeline", $sample, $badsnpsSLX, $DoC_slx{$sample}, $MQ_hash{"SLX"}); if ($sample eq "NA12878" || $sample eq "NA19240") { my $inputBamSOLID = "$inputDir/$sample.pilot2.SOLID.bam"; - $outputHead = "$outputDir/$sample.SOLID"; - clean($inputBamSOLID, $outputDir, "$outputHead.bam", $queue, $sting, $dry); -# call($outputBam, $outputHead, $queue, $sting, $dry); + my $outputHeadSOLID = "$outputDir/$sample.SOLID"; + my $outputBamSOLID = "$outputHeadSOLID.bam"; + my $badsnpsSOLID = "$outputDir/$outputBamSOLID.badsnps"; + clean($inputBamSOLID, $outputBamSOLID, $queue, $sting, $dry, $badsnpsSOLID); + call("-I $outputBamSOLID", $outputHeadSOLID, $queue, $sting, $dry, "$inputBamSOLID.cleaningpipeline", $sample, $badsnpsSOLID, $DoC_solid{$sample}, $MQ_hash{"SOLID"}); my $inputBam454 = "$inputDir/$sample.pilot2.454.bam"; - $outputHead = "$outputDir/$sample.454"; -# call($inputBam454, $outputHead, $queue, $sting, $dry); + my $outputHead454 = "$outputDir/$sample.454"; + call("-I $inputBam454", $outputHead454, $queue, $sting, $dry, undef, $sample, $badsnpsSLX, $DoC_454{$sample}, $MQ_hash{"454"}); - my @inputBams = ($inputBamSOLID, $inputBam454); - $outputHead = "$outputDir/$sample.SOLID_454"; -# multicall($inputBams, $outputHead, $queue, $sting, $dry); + my $outputHead = "$outputDir/$sample.SOLID_454"; + call("-I $outputBamSOLID -I $inputBam454", $outputHead, $queue, $sting, $dry, "$inputBamSOLID.cleaningpipeline", $sample, $badsnpsSOLID, $DoC_454solid{$sample}, $MQ_hash{"454SOLID"}); - @inputBams = ($inputBamSLX, $inputBamSOLID, $inputBam454); $outputHead = "$outputDir/$sample.allTechs"; -# multicall($inputBams, $outputHead, $queue, $sting, $dry); + call("-I $outputBamSLX -I $outputBamSOLID -I $inputBam454", $outputHead, $queue, $sting, $dry, "*.cleaningpipeline", $sample, $badsnpsSLX, $DoC_all{$sample}, $MQ_hash{"ALL"}); } } sub clean { my $inputBam = $_[0]; - my $outputDir = $_[1]; - my $outputBam = $_[2]; - my $queue = $_[3]; - my $sting = $_[4]; - my $dry = $_[5]; + my $outputBam = $_[1]; + my $queue = $_[2]; + my $sting = $_[3]; + my $dry = $_[4]; + my $badsnps = $_[5]; - my $cmd = "perl $sting/perl/1kgScripts/runCleaningPipeline.pl -i $inputBam -odir $outputDir -obam $outputBam -q $queue -inject -j $outputBam.cleaningpipeline -sting $sting"; + my $cmd = "perl $sting/perl/1kgScripts/runCleaningPipeline.pl -i $inputBam -obam $outputBam -q $queue -inject -j $outputBam.cleaningpipeline -sting $sting -badsnps $badsnps"; if ($dry) { $cmd .= " -dry"; } @@ -69,15 +86,23 @@ sub clean { sub call { - my $inputBam = $_[0]; + my $inputBams = $_[0]; my $outputHead = $_[1]; my $queue = $_[2]; my $sting = $_[3]; my $dry = $_[4]; + my $wait = $_[5]; + my $sample = $_[6]; + my $badsnps = $_[7]; + my $doc = $_[8]; + my $mq = $_[9]; - my $cmd = "perl $sting/perl/1kgScripts/runCallingPipeline.pl -i $inputBam -o $outputHead -q $queue -snps -wait $inputBam.cleaningpipeline -sting $sting"; + my $cmd = "perl $sting/perl/1kgScripts/runCallingPipeline.pl -i $inputBams -o $outputHead -q $queue -snps -sting $sting -sample $sample -badsnps $badsnps -doc $doc -mq $mq"; if ($dry) { $cmd .= " -dry"; } + if ($wait) { + $cmd .= " -wait $wait"; + } system($cmd); }