finished up modularized pipeline; needs some testing but is generally done.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1610 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-09-14 02:41:57 +00:00
parent 5b41ef5f70
commit 0feee9cdfd
4 changed files with 182 additions and 49 deletions

View File

@ -0,0 +1,85 @@
#!/usr/bin/perl -w
use strict;
use Getopt::Long;
sub usage {
print "Usage: perl runCallingPipeline.pl\n\t-i <GATK input bam command>\n\t-o <output file head>\n\t[-sting Sting dir]\n\t[-frac multiplier for indel fractions]\n\t[-snps Should we call snps?]\n\t[-sample for writing vcf with -snps]\n\t[-badsnps bad snps file from cleaning with -snps]\n\t[-doc DepthOfCoverage for filtering with -snps]\n\t[-mq mapping quality zero for filtering with -snps ]\n\t[-q farm queue; default:gsa]\n\t[-wait farm wait id]\n\t[-dry]\n";
exit(1);
}
my $inputBamStr = undef;
my $outputHead = undef;
my $wait = undef;
my $dry;
my $snps;
my $badsnps = undef;
my $sample;
my $doc = 100;
my $mq = 100;
my $indelFractionMultiplier = "";
my $queue = "gsa";
my $sting = "/humgen/gsa-scr1/ebanks/Sting";
GetOptions( "i=s" => \$inputBamStr,
"o=s" => \$outputHead,
"q:s" => \$queue,
"dry!" => \$dry,
"snps!" => \$snps,
"sample:s" => \$sample,
"doc:s" => \$doc,
"mq:s" => \$mq,
"frac:s" => \$indelFractionMultiplier,
"badsnps:s" => \$badsnps,
"wait:s" => \$wait,
"sting:s" => \$sting );
usage() if ( !$inputBamStr || !$outputHead );
my $indelsHigh = "$outputHead.indels.high.calls";
my $bsub = "bsub -q $queue -o $indelsHigh.sdout";
if ($wait) {
$bsub .= " -w \"ended($wait)\"";
}
my $command = "java -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T IndelGenotyper -R /broad/1KG/reference/human_b36_both.fasta $inputBamStr -minConsensusFraction 0.5 -minCnt 2 -1kg -minFraction 0.".$indelFractionMultiplier."3 -o $indelsHigh";
execute("$bsub $command", $dry);
my $indelsLow = "$outputHead.indels.low.calls";
$bsub = "bsub -q $queue -o $indelsLow.sdout -J $outputHead";
if ($wait) {
$bsub .= " -w \"ended($wait)\"";
}
$command = "java -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T IndelGenotyper -R /broad/1KG/reference/human_b36_both.fasta $inputBamStr -minConsensusFraction 0.5 -minCnt 2 -1kg -minFraction 0.".$indelFractionMultiplier."1 -o $indelsLow";
execute("$bsub $command", $dry);
if ($snps) {
my $snpsFile = "$outputHead.snps.unfiltered.calls";
$bsub = "bsub -q $queue -o $snpsFile.sdout -J $outputHead";
if ($wait) {
$bsub .= " -w \"ended($wait)\"";
}
$command = "java -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T SingleSampleGenotyper -R /broad/1KG/reference/human_b36_both.fasta $inputBamStr -varout $snpsFile -lod 5.0";
execute("$bsub $command", $dry);
my $filterFile = "$outputHead.snps.filtered.calls";
my $vcfFile = "$outputHead.snps.vcf";
$bsub = "bsub -q $queue -o $filterFile.sdout -w \"ended($outputHead)\"";
$command = "java -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T VariantFiltration -R /broad/1KG/reference/human_b36_both.fasta $inputBamStr -vcf $vcfFile -included $filterFile -sample $sample -B dbsnp,dbsnp,/humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod,variant,Variants,$snpsFile,";
if ($badsnps) {
$command .= "cleaned,CleanedOutSNP,$badsnps,";
}
$command .= "indels,SimpleIndel,$indelsLow -X DepthOfCoverage:max=$doc -X AlleleBalance:low=0.25,high=0.75 -X FisherStrand:pvalue=0.00001 -X LodThreshold:lod=5 -X MappingQualityZero:max=$mq -X IndelArtifact -X ClusteredSnps:window=7,snps=3";
execute("$bsub $command", $dry);
}
sub execute {
my $cmd = $_[0];
my $dry = $_[1];
if ($dry) {
print "$cmd\n";
} else {
system($cmd);
}
}

View File

@ -4,34 +4,34 @@ use strict;
use Getopt::Long;
sub usage {
print "Usage: perl runCleaningPipeline.pl\n\t-i <input bam>\n\t-odir <output directory>\n\t-obam <output bam name>\n\t[-sting Sting dir]\n\t[-inject]\n\t[-q farm queue; default:gsa]\n\t[-wait farm wait id]\n\t[-job final farm job name]\n\t[-dry]\n";
print "Usage: perl runCleaningPipeline.pl\n\t-i <input bam>\n\t-obam <output bam name>\n\t[-sting Sting dir]\n\t[-badsnps badsnps file name]\n\t[-inject]\n\t[-q farm queue; default:gsa]\n\t[-wait farm wait id]\n\t[-job final farm job name]\n\t[-dry]\n";
exit(1);
}
my $inputBam = undef;
my $outputDir = undef;
my $outputBam = undef;
my $jobName = undef;
my $wait = undef;
my $badsnps = undef;
my $dry;
my $inject;
my $queue = "gsa";
my $sting = "/humgen/gsa-scr1/ebanks/Sting";
GetOptions( "i=s" => \$inputBam,
"odir=s" => \$outputDir,
"obam=s" => \$outputBam,
"q:s" => \$queue,
"dry!" => \$dry,
"inject!" => \$inject,
"job:s" => \$jobName,
"wait:s" => \$wait,
"badsnps:s" => \$badsnps,
"sting:s" => \$sting );
usage() if ( !$inputBam || !$outputDir || !$outputBam );
usage() if ( !$inputBam || !$outputBam );
my $indelIntervals = "$outputDir/$outputBam.indels.intervals";
my $command = "perl $sting/perl/splitAndEnqueueGATKjobs.pl -cmd \"java -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T IndelIntervals -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam\" -o $indelIntervals -oarg o -j $outputDir/$outputBam.intervals -q $queue";
my $indelIntervals = "$outputBam.indels.intervals";
my $command = "perl $sting/perl/splitAndEnqueueGATKjobs.pl -cmd \"java -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T IndelIntervals -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam\" -o $indelIntervals -oarg o -j $outputBam.intervals -q $queue";
if ($dry) {
$command .= " -dry";
}
@ -40,8 +40,8 @@ if ($wait) {
}
system($command);
my $mismatchIntervals = "$outputDir/$outputBam.mismatches.intervals";
$command = "perl $sting/perl/splitAndEnqueueGATKjobs.pl -cmd \"java -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T MismatchIntervals -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam\" -o $mismatchIntervals -oarg o -j $outputDir/$outputBam.intervals -q $queue";
my $mismatchIntervals = "$outputBam.mismatches.intervals";
$command = "perl $sting/perl/splitAndEnqueueGATKjobs.pl -cmd \"java -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T MismatchIntervals -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam\" -o $mismatchIntervals -oarg o -j $outputBam.intervals -q $queue";
if ($dry) {
$command .= " -dry";
}
@ -50,8 +50,8 @@ if ($wait) {
}
system($command);
my $mergedIntervals = "$outputDir/$outputBam.merged.intervals";
$command = "perl $sting/perl/splitAndEnqueueGATKjobs.pl -cmd \"java -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T IntervalMerger -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam -intervals $indelIntervals -intervals $mismatchIntervals\" -o $mergedIntervals -oarg o -wait $outputDir/$outputBam.intervals -j $outputDir/$outputBam.merged -q $queue";
my $mergedIntervals = "$outputBam.merged.intervals";
$command = "perl $sting/perl/splitAndEnqueueGATKjobs.pl -cmd \"java -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T IntervalMerger -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam -intervals $indelIntervals -intervals $mismatchIntervals\" -o $mergedIntervals -oarg o -wait $outputBam.intervals -j $outputBam.merged -q $queue";
if ($dry) {
$command .= " -dry";
}
@ -59,30 +59,45 @@ system($command);
my $cleanedBam;
if ($inject) {
$cleanedBam = "$outputDir/$outputBam.cleaned.bam";
$cleanedBam = "$outputBam.cleaned.bam";
} else {
$cleanedBam = "$outputDir/$outputBam";
$cleanedBam = "$outputBam";
}
$command = "perl $sting/perl/splitAndEnqueueGATKjobs.pl -cmd \"java -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T IntervalCleaner -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam -compress 1";
if ($inject) {
$command .= " -cleanedOnly\" -j $outputDir/$outputBam.cleaned";
$command .= " -cleanedOnly\" -j $outputBam.cleaned";
} else {
$command .= "\" -j $jobName";
}
$command .= "\" -o $cleanedBam -oarg O -wait $outputDir/$outputBam.merged -q $queue -bam -i $mergedIntervals -n 50";
$command .= " -o $cleanedBam -oarg O -wait $outputBam.merged -q $queue -bam -i $mergedIntervals -n 50";
if ($dry) {
$command .= " -dry";
}
system($command);
my $snpsFile = $badsnps;
if (!$snpsFile) {
$snpsFile = "$outputBam.badsnps";
}
$command = "perl $sting/perl/splitAndEnqueueGATKjobs.pl -cmd \"java -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T IntervalCleaner -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam\"";
if ($inject) {
$command .= " -j $outputBam.cleaned";
} else {
$command .= " -j $jobName";
}
$command .= "\" -o $snpsFile -oarg snps -wait $outputBam.merged -q $queue -i $mergedIntervals -n 50";
if ($dry) {
$command .= " -dry";
}
system($command);
if ($inject) {
my $bam = "$outputDir/$outputBam";
my $bam = "$outputBam";
$command = "bsub -q $queue -o $bam.sdout";
if ($jobName) {
$command .= " -J $jobName";
}
$command .= " -w \"ended($outputDir/$outputBam.cleaned)\" java -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T CleanedReadInjector -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam --output_bam $bam --cleaned_reads $cleanedBam -compress 1";
$command .= " -w \"ended($outputBam.cleaned)\" java -Xmx4096m -jar $sting/dist/GenomeAnalysisTK.jar -S SILENT -T CleanedReadInjector -R /broad/1KG/reference/human_b36_both.fasta -I $inputBam --output_bam $bam --cleaned_reads $cleanedBam -compress 1";
if ($dry) {
print "$command\n";
} else {

View File

@ -26,22 +26,24 @@ my @samples = ("ceu","yri","chb_jpt");
foreach my $sample (@samples) {
$inputBam = "$inputDir/low_coverage_$sample.bam";
$outputHead = "$outputDir/$sample";
clean($inputBam, $outputDir, "$outputHead.bam", $queue, $sting, $dry);
# call($outputBam, $outputHead, $queue, $sting, $dry);
my $inputBam = "$inputDir/low_coverage_$sample.bam";
my $outputHead = "$outputDir/$sample";
my $outputBam = "$outputHead.bam";
my $badsnps = "$outputBam.badsnps";
clean($inputBam, $outputBam, $queue, $sting, $dry, $badsnps);
call("-I $outputBam", $outputHead, $queue, $sting, $dry, "$inputBam.cleaningpipeline", $sample, $badsnps);
}
sub clean {
my $inputBam = $_[0];
my $outputDir = $_[1];
my $outputBam = $_[2];
my $queue = $_[3];
my $sting = $_[4];
my $dry = $_[5];
my $outputBam = $_[1];
my $queue = $_[2];
my $sting = $_[3];
my $dry = $_[4];
my $badsnps = $_[5];
my $cmd = "perl $sting/perl/1kgScripts/runCleaningPipeline.pl -i $inputBam -odir $outputDir -obam $outputBam -q $queue -j $outputBam.cleaningpipeline -sting $sting";
my $cmd = "perl $sting/perl/1kgScripts/runCleaningPipeline.pl -i $inputBam -obam $outputBam -q $queue -j $outputBam.cleaningpipeline -sting $sting -badsnps $badsnps";
if ($dry) {
$cmd .= " -dry";
}
@ -50,15 +52,21 @@ sub clean {
sub call {
my $inputBam = $_[0];
my $inputBams = $_[0];
my $outputHead = $_[1];
my $queue = $_[2];
my $sting = $_[3];
my $dry = $_[4];
my $wait = $_[5];
my $sample = $_[6];
my $badsnps = $_[7];
$cmd = "perl $sting/perl/1kgScripts/runCallingPipeline.pl -i $inputBam -o $outputHead -q $queue -wait $inputBam.cleaningpipeline -sting $sting";
my $cmd = "perl $sting/perl/1kgScripts/runCallingPipeline.pl -i $inputBams -o $outputHead -q $queue -sting $sting -frac 0 -sample $sample -badsnps $badsnps";
if ($dry) {
$cmd .= " -dry";
}
if ($wait) {
$cmd .= " -wait $wait";
}
system($cmd);
}

View File

@ -24,43 +24,60 @@ usage() if ( !$inputDir || !$outputDir );
my @samples = ("NA19238","NA19239","NA19240","NA12878","NA12891","NA12892");
# Official genome-wide Depth of Coverage tables for pilot 2, freeze 5:
# NA12878 NA12891 NA12892 NA19238 NA19239 NA19240
# 454: 36 18
# SLX: 82 91 70 56 68 86
# SOLID: 37 64
# 454+SLD: 64 77
# ALL: 138 150
my %DoC_454 = ( "NA12878" => 36, "NA19240" => 18 );
my %DoC_slx = ( "NA12878" => 82, "NA12891" => 91, "NA12892" => 70,"NA19238" => 56, "NA19239" => 68, "NA19240" => 86 );
my %DoC_solid = ( "NA12878" => 37, "NA19240" => 64 );
my %DoC_454solid = ( "NA12878" => 64, "NA19240" => 77 );
my %DoC_all = ( "NA12878" => 138, "NA19240" => 150 );
my %MQ_hash = ( "SLX" => 100, "SOLID" => 5, "454" => 5, "454SOLID" => 10, "ALL" => 110 );
foreach my $sample (@samples) {
my $inputBamSLX = "$inputDir/$sample.pilot2.SLX.bam";
my $outputHead = "$outputDir/$sample.SLX";
clean($inputBamSLX, $outputDir, "$outputHead.bam", $queue, $sting, $dry);
# call($outputBam, $outputHead, $queue, $sting, $dry);
my $outputHeadSLX = "$outputDir/$sample.SLX";
my $outputBamSLX = "$outputHeadSLX.bam";
my $badsnpsSLX = "$outputDir/$outputBamSLX.badsnps";
clean($inputBamSLX, $outputBamSLX, $queue, $sting, $dry, $badsnpsSLX);
call("-I $outputBamSLX", $outputHeadSLX, $queue, $sting, $dry, "$inputBamSLX.cleaningpipeline", $sample, $badsnpsSLX, $DoC_slx{$sample}, $MQ_hash{"SLX"});
if ($sample eq "NA12878" || $sample eq "NA19240") {
my $inputBamSOLID = "$inputDir/$sample.pilot2.SOLID.bam";
$outputHead = "$outputDir/$sample.SOLID";
clean($inputBamSOLID, $outputDir, "$outputHead.bam", $queue, $sting, $dry);
# call($outputBam, $outputHead, $queue, $sting, $dry);
my $outputHeadSOLID = "$outputDir/$sample.SOLID";
my $outputBamSOLID = "$outputHeadSOLID.bam";
my $badsnpsSOLID = "$outputDir/$outputBamSOLID.badsnps";
clean($inputBamSOLID, $outputBamSOLID, $queue, $sting, $dry, $badsnpsSOLID);
call("-I $outputBamSOLID", $outputHeadSOLID, $queue, $sting, $dry, "$inputBamSOLID.cleaningpipeline", $sample, $badsnpsSOLID, $DoC_solid{$sample}, $MQ_hash{"SOLID"});
my $inputBam454 = "$inputDir/$sample.pilot2.454.bam";
$outputHead = "$outputDir/$sample.454";
# call($inputBam454, $outputHead, $queue, $sting, $dry);
my $outputHead454 = "$outputDir/$sample.454";
call("-I $inputBam454", $outputHead454, $queue, $sting, $dry, undef, $sample, $badsnpsSLX, $DoC_454{$sample}, $MQ_hash{"454"});
my @inputBams = ($inputBamSOLID, $inputBam454);
$outputHead = "$outputDir/$sample.SOLID_454";
# multicall($inputBams, $outputHead, $queue, $sting, $dry);
my $outputHead = "$outputDir/$sample.SOLID_454";
call("-I $outputBamSOLID -I $inputBam454", $outputHead, $queue, $sting, $dry, "$inputBamSOLID.cleaningpipeline", $sample, $badsnpsSOLID, $DoC_454solid{$sample}, $MQ_hash{"454SOLID"});
@inputBams = ($inputBamSLX, $inputBamSOLID, $inputBam454);
$outputHead = "$outputDir/$sample.allTechs";
# multicall($inputBams, $outputHead, $queue, $sting, $dry);
call("-I $outputBamSLX -I $outputBamSOLID -I $inputBam454", $outputHead, $queue, $sting, $dry, "*.cleaningpipeline", $sample, $badsnpsSLX, $DoC_all{$sample}, $MQ_hash{"ALL"});
}
}
sub clean {
my $inputBam = $_[0];
my $outputDir = $_[1];
my $outputBam = $_[2];
my $queue = $_[3];
my $sting = $_[4];
my $dry = $_[5];
my $outputBam = $_[1];
my $queue = $_[2];
my $sting = $_[3];
my $dry = $_[4];
my $badsnps = $_[5];
my $cmd = "perl $sting/perl/1kgScripts/runCleaningPipeline.pl -i $inputBam -odir $outputDir -obam $outputBam -q $queue -inject -j $outputBam.cleaningpipeline -sting $sting";
my $cmd = "perl $sting/perl/1kgScripts/runCleaningPipeline.pl -i $inputBam -obam $outputBam -q $queue -inject -j $outputBam.cleaningpipeline -sting $sting -badsnps $badsnps";
if ($dry) {
$cmd .= " -dry";
}
@ -69,15 +86,23 @@ sub clean {
sub call {
my $inputBam = $_[0];
my $inputBams = $_[0];
my $outputHead = $_[1];
my $queue = $_[2];
my $sting = $_[3];
my $dry = $_[4];
my $wait = $_[5];
my $sample = $_[6];
my $badsnps = $_[7];
my $doc = $_[8];
my $mq = $_[9];
my $cmd = "perl $sting/perl/1kgScripts/runCallingPipeline.pl -i $inputBam -o $outputHead -q $queue -snps -wait $inputBam.cleaningpipeline -sting $sting";
my $cmd = "perl $sting/perl/1kgScripts/runCallingPipeline.pl -i $inputBams -o $outputHead -q $queue -snps -sting $sting -sample $sample -badsnps $badsnps -doc $doc -mq $mq";
if ($dry) {
$cmd .= " -dry";
}
if ($wait) {
$cmd .= " -wait $wait";
}
system($cmd);
}