From 8040998c15a48949999c91e3769bdc7efba37c01 Mon Sep 17 00:00:00 2001 From: kshakir Date: Mon, 7 Feb 2011 22:01:09 +0000 Subject: [PATCH] Renamed the pipeline yaml dbsnpFile to genotypeDbsnp, and added an evalDbsnp. Added a genotypeDbsnpType and evalDbsnpType to check the extensions for .vcf or .rod. Moved renaming of "recalibrated" bams to "cleaned" from sed to yaml generation template (see diff for more info). Renamed fCP.q to FCP.q. Though it's still disabled until VariantEval is updated, added changes above to the FCPTest. Removed refseq table from the queue.sh wrapper script. Only specified in the yaml. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5213 348d0f76-0448-11de-a6fe-93d51630548a --- .../datasources/pipeline/PipelineProject.java | 44 +++++++++++++++++-- ...allingPipeline.q => FullCallingPipeline.q} | 23 +++------- .../playground/FullCallingPipelineTest.scala | 14 +++--- shell/getFirehosePipelineYaml.sh | 38 ++++++++++------ shell/queue.sh | 3 +- 5 files changed, 77 insertions(+), 45 deletions(-) rename scala/qscript/playground/{fullCallingPipeline.q => FullCallingPipeline.q} (94%) diff --git a/java/src/org/broadinstitute/sting/datasources/pipeline/PipelineProject.java b/java/src/org/broadinstitute/sting/datasources/pipeline/PipelineProject.java index 9b51beb53..5f917daa6 100644 --- a/java/src/org/broadinstitute/sting/datasources/pipeline/PipelineProject.java +++ b/java/src/org/broadinstitute/sting/datasources/pipeline/PipelineProject.java @@ -35,7 +35,8 @@ public class PipelineProject { private String name; private File referenceFile; private File intervalList; - private File dbsnpFile; + private File genotypeDbsnp; + private File evalDbsnp; private File refseqTable; private Map tags = new TreeMap(); @@ -63,12 +64,40 @@ public class PipelineProject { this.referenceFile = referenceFile; } + /** use genotype dbsnp file */ + @Deprecated public File getDbsnpFile() { - return dbsnpFile; + return getGenotypeDbsnp(); } + /** use genotype dbsnp file */ + @Deprecated public void setDbsnpFile(File dbsnpFile) { - this.dbsnpFile = dbsnpFile; + this.setGenotypeDbsnp(dbsnpFile); + } + + public File getGenotypeDbsnp() { + return genotypeDbsnp; + } + + public void setGenotypeDbsnp(File genotypeDbsnp) { + this.genotypeDbsnp = genotypeDbsnp; + } + + public String getGenotypeDbsnpType() { + return getDbsnpType(genotypeDbsnp); + } + + public File getEvalDbsnp() { + return evalDbsnp; + } + + public void setEvalDbsnp(File evalDbsnp) { + this.evalDbsnp = evalDbsnp; + } + + public String getEvalDbsnpType() { + return getDbsnpType(genotypeDbsnp); } public File getRefseqTable() { @@ -86,4 +115,13 @@ public class PipelineProject { public void setTags(Map tags) { this.tags = tags; } + + private String getDbsnpType(File file) { + if (file == null) + return null; + else if (file.getName().toLowerCase().endsWith(".vcf")) + return "vcf"; + else + return "dbsnp"; + } } diff --git a/scala/qscript/playground/fullCallingPipeline.q b/scala/qscript/playground/FullCallingPipeline.q similarity index 94% rename from scala/qscript/playground/fullCallingPipeline.q rename to scala/qscript/playground/FullCallingPipeline.q index 487e2fe5a..095463a25 100755 --- a/scala/qscript/playground/fullCallingPipeline.q +++ b/scala/qscript/playground/FullCallingPipeline.q @@ -10,7 +10,7 @@ import org.broadinstitute.sting.queue.QScript import collection.JavaConversions._ import org.broadinstitute.sting.utils.yaml.YamlUtils -class fullCallingPipeline extends QScript { +class FullCallingPipeline extends QScript { qscript => @Argument(doc="the YAML file specifying inputs, interval lists, reference sequence, etc.", shortName="Y") @@ -64,8 +64,6 @@ class fullCallingPipeline extends QScript { private var pipeline: Pipeline = _ - private var dbsnpType: String = _ - trait CommandLineGATKArgs extends CommandLineGATK { this.intervals = List(qscript.pipeline.getProject.getIntervalList) this.jarFile = qscript.gatkJar @@ -79,12 +77,6 @@ class fullCallingPipeline extends QScript { def script = { pipeline = YamlUtils.load(classOf[Pipeline], qscript.yamlFile) - //var dbsnpType: String = _ //figure out how to get it so this is recognized and the whole thing has access to it. - if (qscript.pipeline.getProject.getDbsnpFile.toString.contains("rod")){ - dbsnpType = "dbSNP" - } else { - dbsnpType = "VCF" - } val projectBase: String = qscript.pipeline.getProject.getName // TODO: Fix command lines that pass -refseqTable @@ -131,7 +123,7 @@ class fullCallingPipeline extends QScript { realigner.intervals = Nil realigner.intervalsString = Nil realigner.scatterCount = num_cleaner_scatter_jobs - realigner.rodBind :+= RodBind("dbsnp", dbsnpType, qscript.pipeline.getProject.getDbsnpFile) + realigner.rodBind :+= RodBind("dbsnp", qscript.pipeline.getProject.getGenotypeDbsnpType, qscript.pipeline.getProject.getGenotypeDbsnp) realigner.rodBind :+= RodBind("indels", "VCF", swapExt(realigner.reference_sequence.getParentFile, realigner.reference_sequence, "fasta", "1kg_pilot_indels.vcf")) // if scatter count is > 1, do standard scatter gather, if not, explicitly set up fix mates @@ -214,7 +206,7 @@ class fullCallingPipeline extends QScript { snps.group :+= "Standard" snps.out = new File("SnpCalls", base+".vcf") snps.downsample_to_coverage = Some(qscript.downsampling_coverage) - snps.rodBind :+= RodBind("dbsnp", dbsnpType, qscript.pipeline.getProject.getDbsnpFile) + snps.rodBind :+= RodBind("dbsnp", qscript.pipeline.getProject.getGenotypeDbsnpType, qscript.pipeline.getProject.getGenotypeDbsnp) snps.memoryLimit = Some(6) snps.scatterCount = qscript.num_snp_scatter_jobs @@ -242,7 +234,7 @@ class fullCallingPipeline extends QScript { indels.group :+= "Standard" indels.out = new File("IndelCalls", base+".vcf") indels.downsample_to_coverage = Some(qscript.downsampling_coverage) - indels.rodBind :+= RodBind("dbsnp", dbsnpType, qscript.pipeline.getProject.getDbsnpFile) + indels.rodBind :+= RodBind("dbsnp", qscript.pipeline.getProject.getGenotypeDbsnpType, qscript.pipeline.getProject.getGenotypeDbsnp) indels.memoryLimit = Some(6) indels.genotype_likelihoods_model = Option(org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.DINDEL) @@ -308,12 +300,7 @@ class fullCallingPipeline extends QScript { eval.reportLocation = new File("SnpCalls", base+".eval") eval.reportType = Option(org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType.R) eval.analysisName = base+"_VariantEval" - if(dbsnpType=="VCF"){ - eval.DBSNP = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_b37.rod") - - } else{ - eval.rodBind :+= RodBind("dbsnp", dbsnpType, qscript.pipeline.getProject.getDbsnpFile) - } + eval.rodBind :+= RodBind("dbsnp", qscript.pipeline.getProject.getEvalDbsnpType, qscript.pipeline.getProject.getEvalDbsnp) // 5. Make the bam list val listOfBams = new File(base +".BamFiles.list") diff --git a/scala/test/org/broadinstitute/sting/queue/pipeline/playground/FullCallingPipelineTest.scala b/scala/test/org/broadinstitute/sting/queue/pipeline/playground/FullCallingPipelineTest.scala index 6e2177d02..96cea668a 100644 --- a/scala/test/org/broadinstitute/sting/queue/pipeline/playground/FullCallingPipelineTest.scala +++ b/scala/test/org/broadinstitute/sting/queue/pipeline/playground/FullCallingPipelineTest.scala @@ -23,11 +23,6 @@ class FullCallingPipelineTest { new K1gBam("C474", "NA19652", 1), new K1gBam("C474", "NA19654", 1)) - // In fullCallingPipeline.q VariantEval is always compared against 129. - // Until the newvarianteval is finalized which will allow java import of the prior results, - // we re-run VariantEval to validate the run, and replicate that behavior here. - private final val variantEvalDbsnpFile = new File(BaseTest.b37dbSNP129) - val k1gChr20Dataset = { val dataset = newK1gDataset("Barcoded_1000G_WEx_chr20") dataset.pipeline.getProject.setIntervalList(new File(BaseTest.GATKDataLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.chr20.interval_list")) @@ -64,7 +59,8 @@ class FullCallingPipelineTest { val project = new PipelineProject project.setName(projectName) project.setReferenceFile(new File(BaseTest.hg19Reference)) - project.setDbsnpFile(new File(BaseTest.b37dbSNP132)) + project.setGenotypeDbsnp(new File(BaseTest.b37dbSNP132)) + project.setEvalDbsnp(new File(BaseTest.b37dbSNP129)) project.setRefseqTable(new File(BaseTest.hg19Refseq)) var samples = List.empty[PipelineSample] @@ -94,12 +90,12 @@ class FullCallingPipelineTest { @Test(dataProvider="datasets", enabled=false) def testFullCallingPipeline(dataset: PipelineDataset) = { val projectName = dataset.pipeline.getProject.getName - val testName = "fullCallingPipeline-" + projectName + val testName = "FullCallingPipeline-" + projectName val yamlFile = writeYaml(testName, dataset.pipeline) var cleanType = "cleaned" // Run the pipeline with the expected inputs. - var pipelineCommand = ("-retry 1 -S scala/qscript/playground/fullCallingPipeline.q" + + var pipelineCommand = ("-retry 1 -S scala/qscript/playground/FullCallingPipeline.q" + " -jobProject %s -Y %s" + " -tearScript %s/R/DataProcessingReport/GetTearsheetStats.R" + " --gatkjar %s") @@ -119,7 +115,7 @@ class FullCallingPipelineTest { pipelineSpec.evalSpec.vcf = new File(PipelineTest.runDir(testName) + "SnpCalls/%s.%s.annotated.handfiltered.vcf".format(projectName, cleanType)) pipelineSpec.evalSpec.reference = dataset.pipeline.getProject.getReferenceFile pipelineSpec.evalSpec.intervals = dataset.pipeline.getProject.getIntervalList - pipelineSpec.evalSpec.dbsnp = variantEvalDbsnpFile + pipelineSpec.evalSpec.dbsnp = dataset.pipeline.getProject.getEvalDbsnp pipelineSpec.evalSpec.validations = dataset.validations // Run the test, at least checking if the command compiles diff --git a/shell/getFirehosePipelineYaml.sh b/shell/getFirehosePipelineYaml.sh index a6e1ef10c..8f03b1e4b 100755 --- a/shell/getFirehosePipelineYaml.sh +++ b/shell/getFirehosePipelineYaml.sh @@ -32,19 +32,22 @@ FIREHOSE_ANNOTATIONS=(reference_file interval_list \ # YAML templates +# Project YAML template, once per file. PROJECT_YAML_TEMPLATE='"\n\ project: {\n\ name: '"$ENTITY_SET_ID"',\n\ referenceFile: %s,\n\ - dbsnpFile: %s,\n\ + genotypeDbsnpFile: %s,\n\ + evalDbsnpFile: %s,\n\ refseqTable: %s,\n\ intervalList: %s\n\ - },", $1, dbsnp, refseq, $2' + },", $1, genotypeDbsnp, evalDbsnp, refseq, $2' +# Project YAML template, once per sample. SAMPLE_YAML_TEMPLATE='"\n\ {\n\ id: %s,\n\ - bamFiles: { recalibrated: %s },\n\ + bamFiles: { cleaned: %s },\n\ tags: {\n\ SQUIDProject: %s,\n\ CollaboratorID: %s\n\ @@ -86,20 +89,27 @@ BEGIN { refseq_dir = "/humgen/gsa-hpprojects/GATK/data/Annotations/refseq/"; dbsnp_dir = "/humgen/gsa-hpprojects/GATK/data/"; - dbsnps["Homo_sapiens_assembly18.fasta"] = dbsnp_dir "dbsnp_129_hg18.rod"; + # add hg18 specific files to awk associative arrays + genotypeDbsnps["Homo_sapiens_assembly18.fasta"] = dbsnp_dir "dbsnp_129_hg18.rod"; + evalDbsnps["Homo_sapiens_assembly18.fasta"] = dbsnp_dir "dbsnp_129_hg18.rod"; refseqs["Homo_sapiens_assembly18.fasta"] = refseq_dir "refGene-big-table-hg18.txt"; - dbsnps["Homo_sapiens_assembly19.fasta"] = dbsnp_dir "dbsnp_132_b37.vcf"; + # add hg19 specific files to awk associative arrays + genotypeDbsnps["Homo_sapiens_assembly19.fasta"] = dbsnp_dir "dbsnp_132_b37.vcf"; + evalDbsnps["Homo_sapiens_assembly19.fasta"] = dbsnp_dir "dbsnp_129_b37.rod"; refseqs["Homo_sapiens_assembly19.fasta"] = refseq_dir "refGene-big-table-hg19.txt"; printf "{" } { if (NR == 1) { + # Based on the reference of the first sample, specify the dbsnps and refseq tables. + reference_part_count = split($1, reference_parts, "/") reference_name = reference_parts[reference_part_count]; - dbsnp = dbsnps[reference_name]; + genotypeDbsnp = genotypeDbsnps[reference_name]; + evalDbsnp = evalDbsnps[reference_name]; refseq = refseqs[reference_name]; printf '"$PROJECT_YAML_TEMPLATE"' @@ -115,12 +125,14 @@ END { print "\n}" }' > $PIPELINE_YAML_FILE -hg19=`grep "assembly19" -c $PIPELINE_YAML_FILE` +#hg19=`grep "assembly19" -c $PIPELINE_YAML_FILE` -if [ "$hg19" -ne 0 ]; then - sed 's/\/humgen.*rod/\/humgen\/gsa-hpprojects\/GATK\/data\/dbsnp_132_b37.vcf/' $PIPELINE_YAML_FILE > yaml2 - mv yaml2 $PIPELINE_YAML_FILE -fi +# NOTE: DBSNP's are populated via AWK's BEGIN block above. +#if [ "$hg19" -ne 0 ]; then +# sed 's/\/humgen.*rod/\/humgen\/gsa-hpprojects\/GATK\/data\/dbsnp_132_b37.vcf/' $PIPELINE_YAML_FILE > yaml2 +# mv yaml2 $PIPELINE_YAML_FILE +#fi -sed 's/recalibrat/clean/' $PIPELINE_YAML_FILE > yaml2 -mv yaml2 $PIPELINE_YAML_FILE +# NOTE: Renamed "recalibrated" to "cleaned" in SAMPLE_YAML_TEMPLATE above. +#sed 's/recalibrat/clean/' $PIPELINE_YAML_FILE > yaml2 +#mv yaml2 $PIPELINE_YAML_FILE diff --git a/shell/queue.sh b/shell/queue.sh index a939f5525..3b2826d92 100755 --- a/shell/queue.sh +++ b/shell/queue.sh @@ -2,7 +2,6 @@ SAMPLE_SET=$1 STING_HOME=/humgen/gsa-pipeline/.repository -REFSEQ_TABLE=/humgen/gsa-hpprojects/GATK/data/Annotations/refseq/refGene-big-table-hg19.txt JOB_QUEUE=week SHORT_QUEUE=hour TMP_DIR=$PWD/tmp @@ -14,4 +13,4 @@ use R-2.10 use Oracle-full-client use .cx-oracle-5.0.2-python-2.6.5-oracle-full-client-11.1 -java -Djava.io.tmpdir="$TMP_DIR" -jar "$STING_HOME"/dist/Queue.jar -jobQueue "$JOB_QUEUE" -shortJobQueue "$SHORT_QUEUE" -jobProject "$SAMPLE_SET" -jobPrefix "$SAMPLE_SET" -tearScript "$STING_HOME"/R/DataProcessingReport/GetTearsheetStats.R -S "$STING_HOME"/scala/qscript/playground/fullCallingPipeline.q -Y "$SAMPLE_SET".yaml -refseqTable "$REFSEQ_TABLE" --gatkjar "$STING_HOME"/dist/GenomeAnalysisTK.jar -log queue_log.txt -statusTo corin -bsub $2 +java -Djava.io.tmpdir="$TMP_DIR" -jar "$STING_HOME"/dist/Queue.jar -jobQueue "$JOB_QUEUE" -shortJobQueue "$SHORT_QUEUE" -jobProject "$SAMPLE_SET" -jobPrefix "$SAMPLE_SET" -tearScript "$STING_HOME"/R/DataProcessingReport/GetTearsheetStats.R -S "$STING_HOME"/scala/qscript/playground/FullCallingPipeline.q -Y "$SAMPLE_SET".yaml --gatkjar "$STING_HOME"/dist/GenomeAnalysisTK.jar -log queue_log.txt -statusTo corin -bsub $2