diff --git a/perl/1kgScripts/runCallingPipeline.pl b/perl/1kgScripts/runCallingPipeline.pl deleted file mode 100755 index 1e5bb550c..000000000 --- a/perl/1kgScripts/runCallingPipeline.pl +++ /dev/null @@ -1,85 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use Getopt::Long; - -sub usage { - print "Usage: perl runCallingPipeline.pl\n\t-i \n\t-o \n\t[-sting Sting dir]\n\t[-frac multiplier for indel fractions]\n\t[-snps Should we call snps?]\n\t[-sample for writing vcf with -snps]\n\t[-badsnps bad snps file from cleaning with -snps]\n\t[-doc DepthOfCoverage for filtering with -snps]\n\t[-mq mapping quality zero for filtering with -snps ]\n\t[-q farm queue; default:gsa]\n\t[-wait farm wait id]\n\t[-dry]\n"; - exit(1); -} - -my $inputBamStr = undef; -my $outputHead = undef; -my $wait = undef; -my $dry; -my $snps; -my $badsnps = undef; -my $sample; -my $doc = 100; -my $mq = 100; -my $indelFractionMultiplier = ""; -my $queue = "gsa"; -my $sting = "/humgen/gsa-scr1/ebanks/Sting"; - -GetOptions( "i=s" => \$inputBamStr, - "o=s" => \$outputHead, - "q:s" => \$queue, - "dry!" => \$dry, - "snps!" => \$snps, - "sample:s" => \$sample, - "doc:s" => \$doc, - "mq:s" => \$mq, - "frac:s" => \$indelFractionMultiplier, - "badsnps:s" => \$badsnps, - "wait:s" => \$wait, - "sting:s" => \$sting ); - -usage() if ( !$inputBamStr || !$outputHead ); - -my $indelsHigh = "$outputHead.indels.high.calls"; -my $bsub = "bsub -q $queue -o $indelsHigh.sdout"; -if ($wait) { - $bsub .= " -w \"ended($wait)\""; -} -my $command = "java -Djava.io.tmpdir=/broad/hptmp/ -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T IndelGenotyper -R /broad/1KG/reference/human_b36_both.fasta $inputBamStr -minConsensusFraction 0.5 -minCnt 2 -1kg -minFraction 0.".$indelFractionMultiplier."3 -O $indelsHigh"; -execute("$bsub $command", $dry); - -my $indelsLow = "$outputHead.indels.low.calls"; -$bsub = "bsub -q $queue -o $indelsLow.sdout -J $outputHead"; -if ($wait) { - $bsub .= " -w \"ended($wait)\""; -} -$command = "java -Djava.io.tmpdir=/broad/hptmp/ -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T IndelGenotyper -R /broad/1KG/reference/human_b36_both.fasta $inputBamStr -minConsensusFraction 0.5 -minCnt 2 -1kg -minFraction 0.".$indelFractionMultiplier."1 -O $indelsLow"; -execute("$bsub $command", $dry); - -if ($snps) { - my $snpsFile = "$outputHead.snps.unfiltered.calls"; - $bsub = "bsub -q $queue -o $snpsFile.sdout -J $outputHead"; - if ($wait) { - $bsub .= " -w \"ended($wait)\""; - } - $command = "java -Djava.io.tmpdir=/broad/hptmp/ -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta $inputBamStr -varout $snpsFile -lod 0.0"; - execute("$bsub $command", $dry); - - my $filterFile = "$outputHead.snps.filtered.calls"; - my $vcfFile = "$outputHead.snps.vcf"; - $bsub = "bsub -q $queue -o $filterFile.sdout -w \"ended($outputHead)\""; - $command = "java -Djava.io.tmpdir=/broad/hptmp/ -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T VariantFiltration -R /broad/1KG/reference/human_b36_both.fasta $inputBamStr -vcf $vcfFile -included $filterFile -sample $sample -B dbsnp,dbsnp,/humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod,variant,Variants,$snpsFile,"; - if ($badsnps) { - $command .= "cleaned,CleanedOutSNP,$badsnps,"; - } - $command .= "indels,SimpleIndel,$indelsLow -X DepthOfCoverage:max=$doc -X AlleleBalance:low=0.25,high=0.75 -X FisherStrand:pvalue=0.00001 -X LodThreshold:lod=5.0 -X MappingQualityZero:max=$mq -X IndelArtifact -X ClusteredSnps:window=7,snps=3"; - execute("$bsub $command", $dry); -} - -sub execute { - - my $cmd = $_[0]; - my $dry = $_[1]; - - if ($dry) { - print "$cmd\n"; - } else { - system($cmd); - } -} diff --git a/perl/1kgScripts/runCleaningPipeline.pl b/perl/1kgScripts/runCleaningPipeline.pl deleted file mode 100755 index 5b5a34dd6..000000000 --- a/perl/1kgScripts/runCleaningPipeline.pl +++ /dev/null @@ -1,110 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use Getopt::Long; - -sub usage { - print "Usage: perl runCleaningPipeline.pl\n\t-i \n\t-obam \n\t[-sting Sting dir]\n\t[-badsnps badsnps file name]\n\t[-inject]\n\t[-q farm queue; default:gsa]\n\t[-wait farm wait id]\n\t[-job final farm job name]\n\t[-dry]\n"; - exit(1); -} - -my $inputBam = undef; -my $outputBam = undef; -my $jobName = undef; -my $wait = undef; -my $badsnps = undef; -my $dry; -my $inject; -my $queue = "gsa"; -my $sting = "/humgen/gsa-scr1/ebanks/Sting"; - -GetOptions( "i=s" => \$inputBam, - "obam=s" => \$outputBam, - "q:s" => \$queue, - "dry!" => \$dry, - "inject!" => \$inject, - "job:s" => \$jobName, - "wait:s" => \$wait, - "badsnps:s" => \$badsnps, - "sting:s" => \$sting ); - -usage() if ( !$inputBam || !$outputBam ); - -my $indelIntervals = "$outputBam.indels.intervals"; -my $command = "perl $sting/perl/splitAndEnqueueGATKjobs.pl -cmd \"java -Djava.io.tmpdir=/broad/hptmp/ -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T IndelIntervals -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam\" -o $indelIntervals -oarg o -j $outputBam.intervals -q $queue"; -if ($dry) { - $command .= " -dry"; -} -if ($wait) { - $command .= " -wait $wait"; -} -system($command); - -my $mismatchIntervals = "$outputBam.mismatches.intervals"; -$command = "perl $sting/perl/splitAndEnqueueGATKjobs.pl -cmd \"java -Djava.io.tmpdir=/broad/hptmp/ -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T MismatchIntervals -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam\" -o $mismatchIntervals -oarg o -j $outputBam.intervals -q $queue"; -if ($dry) { - $command .= " -dry"; -} -if ($wait) { - $command .= " -wait $wait"; -} -system($command); - -my $mergedIntervals = "$outputBam.merged.intervals"; -$command = "perl $sting/perl/splitAndEnqueueGATKjobs.pl -cmd \"java -Djava.io.tmpdir=/broad/hptmp/ -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T IntervalMerger -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam -intervals $indelIntervals -intervals $mismatchIntervals\" -o $mergedIntervals -oarg o -wait $outputBam.intervals -j $outputBam.merged -q $queue"; -if ($dry) { - $command .= " -dry"; -} -system($command); - -my $cleanedBam; -if ($inject) { - $cleanedBam = "$outputBam.cleaned.bam"; -} else { - $cleanedBam = "$outputBam"; -} -$command = "bsub -q $queue -o $outputBam.cleaner.script1 -w \"ended($outputBam.merged)\" -J $outputBam.cleaner.script perl $sting/perl/splitAndEnqueueGATKjobs.pl -cmd \"java -Djava.io.tmpdir=/broad/hptmp/ -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T IntervalCleaner -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam -compress 1"; -if ($inject) { - $command .= " -cleanedOnly\" -j $outputBam.cleaner.clean"; -} else { - $command .= "\" -j $jobName"; -} -$command .= " -o $cleanedBam -oarg O -q $queue -bam -i $mergedIntervals -n 50"; -if ($dry) { - $command .= " -dry"; - print "$command\n"; -} else { - system($command); -} - -my $snpsFile = $badsnps; -if (!$snpsFile) { - $snpsFile = "$outputBam.badsnps"; -} -$command = "bsub -q $queue -o $outputBam.cleaner.script2 -w \"ended($outputBam.merged)\" -J $outputBam.cleaner.script perl $sting/perl/splitAndEnqueueGATKjobs.pl -cmd \"java -Djava.io.tmpdir=/broad/hptmp/ -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T IntervalCleaner -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam\""; -if ($inject) { - $command .= " -j $outputBam.cleaner.badsnps"; -} else { - $command .= " -j $jobName"; -} -$command .= " -o $snpsFile -oarg snps -q $queue -i $mergedIntervals -n 50"; -if ($dry) { - $command .= " -dry"; - print "$command\n"; -} else { - system($command); -} - -if ($inject) { - my $bam = "$outputBam"; - $command = "bsub -q $queue -o $bam.sdout"; - if ($jobName) { - $command .= " -J $jobName"; - } - $command .= " -w \"ended($outputBam.cleaner.*)\" java -Djava.io.tmpdir=/broad/hptmp/ -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T CleanedReadInjector -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam --output_bam $bam --cleaned_reads $cleanedBam -compress 1"; - if ($dry) { - print "$command\n"; - } else { - system($command); - } -} diff --git a/perl/1kgScripts/runPilot1Pipeline.pl b/perl/1kgScripts/runPilot1Pipeline.pl deleted file mode 100755 index 4a3b0ab81..000000000 --- a/perl/1kgScripts/runPilot1Pipeline.pl +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use Getopt::Long; - -sub usage { - print "Usage: perl runPilot1Pipeline.pl\n\t-i \n\t-o \n\t[-sting Sting dir]\n\t[-q farm queue; default:gsa]\n\t[-dry]\n"; - exit(1); -} - -my $inputDir = undef; -my $outputDir = undef; -my $dry; -my $queue = "gsa"; -my $sting = "/humgen/gsa-scr1/ebanks/Sting"; - -GetOptions( "i=s" => \$inputDir, - "odir=s" => \$outputDir, - "q:s" => \$queue, - "dry!" => \$dry, - "sting:s" => \$sting ); - -usage() if ( !$inputDir || !$outputDir ); - -my @samples = ("ceu","yri","chb_jpt"); - -foreach my $sample (@samples) { - - my $inputBam = "$inputDir/low_coverage_$sample.bam"; - my $outputHead = "$outputDir/$sample"; - my $outputBam = "$outputHead.bam"; - my $badsnps = "$outputBam.badsnps"; - clean($inputBam, $outputBam, $queue, $sting, $dry, $badsnps); - call("-I $outputBam", $outputHead, $queue, $sting, $dry, "$inputBam.cleaner.*", $sample, $badsnps); -} - -sub clean { - - my $inputBam = $_[0]; - my $outputBam = $_[1]; - my $queue = $_[2]; - my $sting = $_[3]; - my $dry = $_[4]; - my $badsnps = $_[5]; - - my $cmd = "perl $sting/perl/1kgScripts/runCleaningPipeline.pl -i $inputBam -obam $outputBam -q $queue -j $outputBam.cleaner.pipeline -sting $sting -badsnps $badsnps"; - if ($dry) { - $cmd .= " -dry"; - } - system($cmd); -} - -sub call { - - my $inputBams = $_[0]; - my $outputHead = $_[1]; - my $queue = $_[2]; - my $sting = $_[3]; - my $dry = $_[4]; - my $wait = $_[5]; - my $sample = $_[6]; - my $badsnps = $_[7]; - - my $cmd = "perl $sting/perl/1kgScripts/runCallingPipeline.pl -i $inputBams -o $outputHead -q $queue -sting $sting -frac 0 -sample $sample -badsnps $badsnps"; - if ($dry) { - $cmd .= " -dry"; - } - if ($wait) { - $cmd .= " -wait $wait"; - } - system($cmd); -} diff --git a/perl/1kgScripts/runPilot2Pipeline.pl b/perl/1kgScripts/runPilot2Pipeline.pl deleted file mode 100755 index 58c634859..000000000 --- a/perl/1kgScripts/runPilot2Pipeline.pl +++ /dev/null @@ -1,108 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use Getopt::Long; - -sub usage { - print "Usage: perl runPilot2Pipeline.pl\n\t-i \n\t-o \n\t[-sting Sting dir]\n\t[-q farm queue; default:gsa]\n\t[-dry]\n"; - exit(1); -} - -my $inputDir = undef; -my $outputDir = undef; -my $dry; -my $queue = "gsa"; -my $sting = "/humgen/gsa-scr1/ebanks/Sting"; - -GetOptions( "i=s" => \$inputDir, - "o=s" => \$outputDir, - "q:s" => \$queue, - "dry!" => \$dry, - "sting:s" => \$sting ); - -usage() if ( !$inputDir || !$outputDir ); - -my @samples = ("NA19238","NA19239","NA19240","NA12878","NA12891","NA12892"); - -# Official genome-wide Depth of Coverage tables for pilot 2, freeze 5: -# NA12878 NA12891 NA12892 NA19238 NA19239 NA19240 -# 454: 36 18 -# SLX: 82 91 70 56 68 86 -# SOLID: 37 64 -# 454+SLD: 64 77 -# ALL: 138 150 - -my %DoC_454 = ( "NA12878" => 36, "NA19240" => 18 ); -my %DoC_slx = ( "NA12878" => 82, "NA12891" => 91, "NA12892" => 70,"NA19238" => 56, "NA19239" => 68, "NA19240" => 86 ); -my %DoC_solid = ( "NA12878" => 37, "NA19240" => 64 ); -my %DoC_454solid = ( "NA12878" => 64, "NA19240" => 77 ); -my %DoC_all = ( "NA12878" => 138, "NA19240" => 150 ); -my %MQ_hash = ( "SLX" => 100, "SOLID" => 5, "454" => 5, "454SOLID" => 10, "ALL" => 110 ); - -foreach my $sample (@samples) { - - my $inputBamSLX = "$inputDir/$sample.pilot2.SLX.bam"; - my $outputHeadSLX = "$outputDir/$sample.SLX"; - my $outputBamSLX = "$outputHeadSLX.bam"; - my $badsnpsSLX = "$outputBamSLX.badsnps"; - clean($inputBamSLX, $outputBamSLX, $queue, $sting, $dry, $badsnpsSLX); - call("-I $outputBamSLX", $outputHeadSLX, $queue, $sting, $dry, "$outputBamSLX.cleaner.*", $sample, $badsnpsSLX, $DoC_slx{$sample}, $MQ_hash{"SLX"}); - - if ($sample eq "NA12878" || $sample eq "NA19240") { - my $inputBamSOLID = "$inputDir/$sample.pilot2.SOLID.bam"; - my $outputHeadSOLID = "$outputDir/$sample.SOLID"; - my $outputBamSOLID = "$outputHeadSOLID.bam"; - my $badsnpsSOLID = "$outputBamSOLID.badsnps"; - clean($inputBamSOLID, $outputBamSOLID, $queue, $sting, $dry, $badsnpsSOLID); - call("-I $outputBamSOLID", $outputHeadSOLID, $queue, $sting, $dry, "$outputBamSOLID.cleaner.*", $sample, $badsnpsSOLID, $DoC_solid{$sample}, $MQ_hash{"SOLID"}); - - my $inputBam454 = "$inputDir/$sample.pilot2.454.bam"; - my $outputHead454 = "$outputDir/$sample.454"; - call("-I $inputBam454", $outputHead454, $queue, $sting, $dry, "$outputBamSLX.cleaner.*", $sample, $badsnpsSLX, $DoC_454{$sample}, $MQ_hash{"454"}); - - my $outputHead = "$outputDir/$sample.SOLID_454"; - call("-I $outputBamSOLID -I $inputBam454", $outputHead, $queue, $sting, $dry, "$outputBamSOLID.cleaner.*", $sample, $badsnpsSOLID, $DoC_454solid{$sample}, $MQ_hash{"454SOLID"}); - - $outputHead = "$outputDir/$sample.allTechs"; - call("-I $outputBamSLX -I $outputBamSOLID -I $inputBam454", $outputHead, $queue, $sting, $dry, "*.cleaner.*", $sample, $badsnpsSLX, $DoC_all{$sample}, $MQ_hash{"ALL"}); - } -} - -sub clean { - - my $inputBam = $_[0]; - my $outputBam = $_[1]; - my $queue = $_[2]; - my $sting = $_[3]; - my $dry = $_[4]; - my $badsnps = $_[5]; - - my $cmd = "perl $sting/perl/1kgScripts/runCleaningPipeline.pl -i $inputBam -obam $outputBam -q $queue -inject -j $outputBam.cleaner.pipeline -sting $sting -badsnps $badsnps"; - if ($dry) { - $cmd .= " -dry"; - } - system($cmd); -} - -sub call { - - my $inputBams = $_[0]; - my $outputHead = $_[1]; - my $queue = $_[2]; - my $sting = $_[3]; - my $dry = $_[4]; - my $wait = $_[5]; - my $sample = $_[6]; - my $badsnps = $_[7]; - my $doc = $_[8]; - my $mq = $_[9]; - - my $cmd = "perl $sting/perl/1kgScripts/runCallingPipeline.pl -i \"$inputBams\" -o $outputHead -q $queue -snps -sting $sting -sample $sample -badsnps $badsnps -doc $doc -mq $mq"; - if ($dry) { - $cmd .= " -dry"; - } - if ($wait) { - $cmd .= " -wait $wait"; - } - system($cmd); -} diff --git a/perl/1kgScripts/runPilotPipelines.sh b/perl/1kgScripts/runPilotPipelines.sh deleted file mode 100755 index d95c9634b..000000000 --- a/perl/1kgScripts/runPilotPipelines.sh +++ /dev/null @@ -1,2 +0,0 @@ -#perl /humgen/gsa-scr1/ebanks/Sting_STABLE/perl/1kgScripts/runPilot1Pipeline.pl -i /humgen/gsa-hphome1/projects/1kg_pilot1/mergedBamsByPopulation -o /broad/hptmp/ebanks/1kg_pilot1/cleaned -sting /humgen/gsa-scr1/ebanks/Sting_STABLE -dry -perl /humgen/gsa-scr1/ebanks/Sting_STABLE/perl/1kgScripts/runPilot2Pipeline.pl -i /broad/1KG/DCC_merged/freeze5.2 -o /broad/hptmp/ebanks/1kg_pilot2/cleaned -sting /humgen/gsa-scr1/ebanks/Sting_STABLE diff --git a/perl/sync1000Genomes/README.sync b/perl/sync1000Genomes/README.sync new file mode 100644 index 000000000..d8492750a --- /dev/null +++ b/perl/sync1000Genomes/README.sync @@ -0,0 +1,9 @@ +Before sync'ing 1000 Genomes, you need to be logged in as gsa-dev: +% sudo -s -u gsa-dev +[Note that this step needs to be done before subsequent steps for stability] + +To use Aspera, you'll need to ssh into one of the appropriate machines: +% ssh vbigtube or mirror + +[The NCBI Aspera source is: anonftp@ftp-private.ncbi.nih.gov:/1000genomes/ftp/] + diff --git a/perl/sync1000Genomes/runAspera.pl b/perl/sync1000Genomes/runAspera.pl new file mode 100755 index 000000000..cf4c6554d --- /dev/null +++ b/perl/sync1000Genomes/runAspera.pl @@ -0,0 +1,19 @@ +#!/usr/bin/perl -w + +# Runs Aspera to pull down files + +use strict; +use Getopt::Long; + +my $source = undef; +my $dest = "."; +GetOptions( "source=s" => \$source, + "dest=s" => \$dest); + +if ( !$source) { + print "Usage: runAspera.pl\n\t-source \t\n\t-dest \t\t\n"; + exit(1); +} + +my $cmd = "ascp -i /opt/aspera/etc/asperaweb_id_dsa.putty -k2 -QTr -l2G -d -v $source $dest"; +system($cmd); diff --git a/perl/sync1000Genomes/runWget.pl b/perl/sync1000Genomes/runWget.pl new file mode 100755 index 000000000..c2bdac748 --- /dev/null +++ b/perl/sync1000Genomes/runWget.pl @@ -0,0 +1,19 @@ +#!/usr/bin/perl -w + +# Runs Wget to pull down a file + +use strict; +use Getopt::Long; + +my $file = undef; +GetOptions( "file=s" => \$file); + +if ( !$file) { + print "Usage: runWget.pl\n\t-file \t\n"; + exit(1); +} + +chomp($file); +my $cmd = "wget -O /humgen/1kg/DCC/ftp/$file ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/$file"; +print "$cmd\n"; +system($cmd); diff --git a/perl/sync1000Genomes/syncFilesInList.pl b/perl/sync1000Genomes/syncFilesInList.pl new file mode 100755 index 000000000..6a7ac83c5 --- /dev/null +++ b/perl/sync1000Genomes/syncFilesInList.pl @@ -0,0 +1,46 @@ +#!/usr/bin/perl -w + +use Getopt::Long; + +sub usage { + print "Usage: perl syncFilesInList.pl\n\t-files \n\t-protocol [defaults to 'aspera'; can also use 'wget']\n\t[-dry]\n"; + exit(1); +} + +my $files = undef; +my $dry; +my $protocol = "aspera"; + +GetOptions( "files=s" => \$files, + "dry!" => \$dry, + "protocol=s" => \$protocol); + +usage() if ( !$files ); + +open(LIST, "< $files") or die "can't open $files: $!"; +while ( ) { + chomp($_); + if ( $protocol eq "aspera" ) { + $_ =~ m/data\/(.*)\/alignment.*/; + $cmd = "./runAspera.pl -source anonftp\@ftp-private.ncbi.nih.gov:/1000genomes/ftp/$_ -dest /humgen/1kg/DCC/ftp/data/$1/alignment/"; + execute($cmd, $dry); + } elsif ( $protocol eq "wget" ) { + $cmd = "./runWget.pl -file $_"; + execute($cmd, $dry); + } else { + usage(); + } +} +close(LIST); + +sub execute { + + my $cmd = $_[0]; + my $dry = $_[1]; + + if ($dry) { + print "$cmd\n"; + } else { + system($cmd); + } +}