diff --git a/java/src/org/broadinstitute/sting/datasources/pipeline/PipelineProject.java b/java/src/org/broadinstitute/sting/datasources/pipeline/PipelineProject.java index 9b51beb53..5f917daa6 100644 --- a/java/src/org/broadinstitute/sting/datasources/pipeline/PipelineProject.java +++ b/java/src/org/broadinstitute/sting/datasources/pipeline/PipelineProject.java @@ -35,7 +35,8 @@ public class PipelineProject { private String name; private File referenceFile; private File intervalList; - private File dbsnpFile; + private File genotypeDbsnp; + private File evalDbsnp; private File refseqTable; private Map tags = new TreeMap(); @@ -63,12 +64,40 @@ public class PipelineProject { this.referenceFile = referenceFile; } + /** use genotype dbsnp file */ + @Deprecated public File getDbsnpFile() { - return dbsnpFile; + return getGenotypeDbsnp(); } + /** use genotype dbsnp file */ + @Deprecated public void setDbsnpFile(File dbsnpFile) { - this.dbsnpFile = dbsnpFile; + this.setGenotypeDbsnp(dbsnpFile); + } + + public File getGenotypeDbsnp() { + return genotypeDbsnp; + } + + public void setGenotypeDbsnp(File genotypeDbsnp) { + this.genotypeDbsnp = genotypeDbsnp; + } + + public String getGenotypeDbsnpType() { + return getDbsnpType(genotypeDbsnp); + } + + public File getEvalDbsnp() { + return evalDbsnp; + } + + public void setEvalDbsnp(File evalDbsnp) { + this.evalDbsnp = evalDbsnp; + } + + public String getEvalDbsnpType() { + return getDbsnpType(genotypeDbsnp); } public File getRefseqTable() { @@ -86,4 +115,13 @@ public class PipelineProject { public void setTags(Map tags) { this.tags = tags; } + + private String getDbsnpType(File file) { + if (file == null) + return null; + else if (file.getName().toLowerCase().endsWith(".vcf")) + return "vcf"; + else + return "dbsnp"; + } } diff --git a/scala/qscript/playground/fullCallingPipeline.q b/scala/qscript/playground/FullCallingPipeline.q similarity index 94% rename from scala/qscript/playground/fullCallingPipeline.q rename to scala/qscript/playground/FullCallingPipeline.q index 487e2fe5a..095463a25 100755 --- a/scala/qscript/playground/fullCallingPipeline.q +++ b/scala/qscript/playground/FullCallingPipeline.q @@ -10,7 +10,7 @@ import org.broadinstitute.sting.queue.QScript import collection.JavaConversions._ import org.broadinstitute.sting.utils.yaml.YamlUtils -class fullCallingPipeline extends QScript { +class FullCallingPipeline extends QScript { qscript => @Argument(doc="the YAML file specifying inputs, interval lists, reference sequence, etc.", shortName="Y") @@ -64,8 +64,6 @@ class fullCallingPipeline extends QScript { private var pipeline: Pipeline = _ - private var dbsnpType: String = _ - trait CommandLineGATKArgs extends CommandLineGATK { this.intervals = List(qscript.pipeline.getProject.getIntervalList) this.jarFile = qscript.gatkJar @@ -79,12 +77,6 @@ class fullCallingPipeline extends QScript { def script = { pipeline = YamlUtils.load(classOf[Pipeline], qscript.yamlFile) - //var dbsnpType: String = _ //figure out how to get it so this is recognized and the whole thing has access to it. - if (qscript.pipeline.getProject.getDbsnpFile.toString.contains("rod")){ - dbsnpType = "dbSNP" - } else { - dbsnpType = "VCF" - } val projectBase: String = qscript.pipeline.getProject.getName // TODO: Fix command lines that pass -refseqTable @@ -131,7 +123,7 @@ class fullCallingPipeline extends QScript { realigner.intervals = Nil realigner.intervalsString = Nil realigner.scatterCount = num_cleaner_scatter_jobs - realigner.rodBind :+= RodBind("dbsnp", dbsnpType, qscript.pipeline.getProject.getDbsnpFile) + realigner.rodBind :+= RodBind("dbsnp", qscript.pipeline.getProject.getGenotypeDbsnpType, qscript.pipeline.getProject.getGenotypeDbsnp) realigner.rodBind :+= RodBind("indels", "VCF", swapExt(realigner.reference_sequence.getParentFile, realigner.reference_sequence, "fasta", "1kg_pilot_indels.vcf")) // if scatter count is > 1, do standard scatter gather, if not, explicitly set up fix mates @@ -214,7 +206,7 @@ class fullCallingPipeline extends QScript { snps.group :+= "Standard" snps.out = new File("SnpCalls", base+".vcf") snps.downsample_to_coverage = Some(qscript.downsampling_coverage) - snps.rodBind :+= RodBind("dbsnp", dbsnpType, qscript.pipeline.getProject.getDbsnpFile) + snps.rodBind :+= RodBind("dbsnp", qscript.pipeline.getProject.getGenotypeDbsnpType, qscript.pipeline.getProject.getGenotypeDbsnp) snps.memoryLimit = Some(6) snps.scatterCount = qscript.num_snp_scatter_jobs @@ -242,7 +234,7 @@ class fullCallingPipeline extends QScript { indels.group :+= "Standard" indels.out = new File("IndelCalls", base+".vcf") indels.downsample_to_coverage = Some(qscript.downsampling_coverage) - indels.rodBind :+= RodBind("dbsnp", dbsnpType, qscript.pipeline.getProject.getDbsnpFile) + indels.rodBind :+= RodBind("dbsnp", qscript.pipeline.getProject.getGenotypeDbsnpType, qscript.pipeline.getProject.getGenotypeDbsnp) indels.memoryLimit = Some(6) indels.genotype_likelihoods_model = Option(org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.DINDEL) @@ -308,12 +300,7 @@ class fullCallingPipeline extends QScript { eval.reportLocation = new File("SnpCalls", base+".eval") eval.reportType = Option(org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType.R) eval.analysisName = base+"_VariantEval" - if(dbsnpType=="VCF"){ - eval.DBSNP = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_b37.rod") - - } else{ - eval.rodBind :+= RodBind("dbsnp", dbsnpType, qscript.pipeline.getProject.getDbsnpFile) - } + eval.rodBind :+= RodBind("dbsnp", qscript.pipeline.getProject.getEvalDbsnpType, qscript.pipeline.getProject.getEvalDbsnp) // 5. Make the bam list val listOfBams = new File(base +".BamFiles.list") diff --git a/scala/test/org/broadinstitute/sting/queue/pipeline/playground/FullCallingPipelineTest.scala b/scala/test/org/broadinstitute/sting/queue/pipeline/playground/FullCallingPipelineTest.scala index 6e2177d02..96cea668a 100644 --- a/scala/test/org/broadinstitute/sting/queue/pipeline/playground/FullCallingPipelineTest.scala +++ b/scala/test/org/broadinstitute/sting/queue/pipeline/playground/FullCallingPipelineTest.scala @@ -23,11 +23,6 @@ class FullCallingPipelineTest { new K1gBam("C474", "NA19652", 1), new K1gBam("C474", "NA19654", 1)) - // In fullCallingPipeline.q VariantEval is always compared against 129. - // Until the newvarianteval is finalized which will allow java import of the prior results, - // we re-run VariantEval to validate the run, and replicate that behavior here. - private final val variantEvalDbsnpFile = new File(BaseTest.b37dbSNP129) - val k1gChr20Dataset = { val dataset = newK1gDataset("Barcoded_1000G_WEx_chr20") dataset.pipeline.getProject.setIntervalList(new File(BaseTest.GATKDataLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.chr20.interval_list")) @@ -64,7 +59,8 @@ class FullCallingPipelineTest { val project = new PipelineProject project.setName(projectName) project.setReferenceFile(new File(BaseTest.hg19Reference)) - project.setDbsnpFile(new File(BaseTest.b37dbSNP132)) + project.setGenotypeDbsnp(new File(BaseTest.b37dbSNP132)) + project.setEvalDbsnp(new File(BaseTest.b37dbSNP129)) project.setRefseqTable(new File(BaseTest.hg19Refseq)) var samples = List.empty[PipelineSample] @@ -94,12 +90,12 @@ class FullCallingPipelineTest { @Test(dataProvider="datasets", enabled=false) def testFullCallingPipeline(dataset: PipelineDataset) = { val projectName = dataset.pipeline.getProject.getName - val testName = "fullCallingPipeline-" + projectName + val testName = "FullCallingPipeline-" + projectName val yamlFile = writeYaml(testName, dataset.pipeline) var cleanType = "cleaned" // Run the pipeline with the expected inputs. - var pipelineCommand = ("-retry 1 -S scala/qscript/playground/fullCallingPipeline.q" + + var pipelineCommand = ("-retry 1 -S scala/qscript/playground/FullCallingPipeline.q" + " -jobProject %s -Y %s" + " -tearScript %s/R/DataProcessingReport/GetTearsheetStats.R" + " --gatkjar %s") @@ -119,7 +115,7 @@ class FullCallingPipelineTest { pipelineSpec.evalSpec.vcf = new File(PipelineTest.runDir(testName) + "SnpCalls/%s.%s.annotated.handfiltered.vcf".format(projectName, cleanType)) pipelineSpec.evalSpec.reference = dataset.pipeline.getProject.getReferenceFile pipelineSpec.evalSpec.intervals = dataset.pipeline.getProject.getIntervalList - pipelineSpec.evalSpec.dbsnp = variantEvalDbsnpFile + pipelineSpec.evalSpec.dbsnp = dataset.pipeline.getProject.getEvalDbsnp pipelineSpec.evalSpec.validations = dataset.validations // Run the test, at least checking if the command compiles diff --git a/shell/getFirehosePipelineYaml.sh b/shell/getFirehosePipelineYaml.sh index a6e1ef10c..8f03b1e4b 100755 --- a/shell/getFirehosePipelineYaml.sh +++ b/shell/getFirehosePipelineYaml.sh @@ -32,19 +32,22 @@ FIREHOSE_ANNOTATIONS=(reference_file interval_list \ # YAML templates +# Project YAML template, once per file. PROJECT_YAML_TEMPLATE='"\n\ project: {\n\ name: '"$ENTITY_SET_ID"',\n\ referenceFile: %s,\n\ - dbsnpFile: %s,\n\ + genotypeDbsnpFile: %s,\n\ + evalDbsnpFile: %s,\n\ refseqTable: %s,\n\ intervalList: %s\n\ - },", $1, dbsnp, refseq, $2' + },", $1, genotypeDbsnp, evalDbsnp, refseq, $2' +# Project YAML template, once per sample. SAMPLE_YAML_TEMPLATE='"\n\ {\n\ id: %s,\n\ - bamFiles: { recalibrated: %s },\n\ + bamFiles: { cleaned: %s },\n\ tags: {\n\ SQUIDProject: %s,\n\ CollaboratorID: %s\n\ @@ -86,20 +89,27 @@ BEGIN { refseq_dir = "/humgen/gsa-hpprojects/GATK/data/Annotations/refseq/"; dbsnp_dir = "/humgen/gsa-hpprojects/GATK/data/"; - dbsnps["Homo_sapiens_assembly18.fasta"] = dbsnp_dir "dbsnp_129_hg18.rod"; + # add hg18 specific files to awk associative arrays + genotypeDbsnps["Homo_sapiens_assembly18.fasta"] = dbsnp_dir "dbsnp_129_hg18.rod"; + evalDbsnps["Homo_sapiens_assembly18.fasta"] = dbsnp_dir "dbsnp_129_hg18.rod"; refseqs["Homo_sapiens_assembly18.fasta"] = refseq_dir "refGene-big-table-hg18.txt"; - dbsnps["Homo_sapiens_assembly19.fasta"] = dbsnp_dir "dbsnp_132_b37.vcf"; + # add hg19 specific files to awk associative arrays + genotypeDbsnps["Homo_sapiens_assembly19.fasta"] = dbsnp_dir "dbsnp_132_b37.vcf"; + evalDbsnps["Homo_sapiens_assembly19.fasta"] = dbsnp_dir "dbsnp_129_b37.rod"; refseqs["Homo_sapiens_assembly19.fasta"] = refseq_dir "refGene-big-table-hg19.txt"; printf "{" } { if (NR == 1) { + # Based on the reference of the first sample, specify the dbsnps and refseq tables. + reference_part_count = split($1, reference_parts, "/") reference_name = reference_parts[reference_part_count]; - dbsnp = dbsnps[reference_name]; + genotypeDbsnp = genotypeDbsnps[reference_name]; + evalDbsnp = evalDbsnps[reference_name]; refseq = refseqs[reference_name]; printf '"$PROJECT_YAML_TEMPLATE"' @@ -115,12 +125,14 @@ END { print "\n}" }' > $PIPELINE_YAML_FILE -hg19=`grep "assembly19" -c $PIPELINE_YAML_FILE` +#hg19=`grep "assembly19" -c $PIPELINE_YAML_FILE` -if [ "$hg19" -ne 0 ]; then - sed 's/\/humgen.*rod/\/humgen\/gsa-hpprojects\/GATK\/data\/dbsnp_132_b37.vcf/' $PIPELINE_YAML_FILE > yaml2 - mv yaml2 $PIPELINE_YAML_FILE -fi +# NOTE: DBSNP's are populated via AWK's BEGIN block above. +#if [ "$hg19" -ne 0 ]; then +# sed 's/\/humgen.*rod/\/humgen\/gsa-hpprojects\/GATK\/data\/dbsnp_132_b37.vcf/' $PIPELINE_YAML_FILE > yaml2 +# mv yaml2 $PIPELINE_YAML_FILE +#fi -sed 's/recalibrat/clean/' $PIPELINE_YAML_FILE > yaml2 -mv yaml2 $PIPELINE_YAML_FILE +# NOTE: Renamed "recalibrated" to "cleaned" in SAMPLE_YAML_TEMPLATE above. +#sed 's/recalibrat/clean/' $PIPELINE_YAML_FILE > yaml2 +#mv yaml2 $PIPELINE_YAML_FILE diff --git a/shell/queue.sh b/shell/queue.sh index a939f5525..3b2826d92 100755 --- a/shell/queue.sh +++ b/shell/queue.sh @@ -2,7 +2,6 @@ SAMPLE_SET=$1 STING_HOME=/humgen/gsa-pipeline/.repository -REFSEQ_TABLE=/humgen/gsa-hpprojects/GATK/data/Annotations/refseq/refGene-big-table-hg19.txt JOB_QUEUE=week SHORT_QUEUE=hour TMP_DIR=$PWD/tmp @@ -14,4 +13,4 @@ use R-2.10 use Oracle-full-client use .cx-oracle-5.0.2-python-2.6.5-oracle-full-client-11.1 -java -Djava.io.tmpdir="$TMP_DIR" -jar "$STING_HOME"/dist/Queue.jar -jobQueue "$JOB_QUEUE" -shortJobQueue "$SHORT_QUEUE" -jobProject "$SAMPLE_SET" -jobPrefix "$SAMPLE_SET" -tearScript "$STING_HOME"/R/DataProcessingReport/GetTearsheetStats.R -S "$STING_HOME"/scala/qscript/playground/fullCallingPipeline.q -Y "$SAMPLE_SET".yaml -refseqTable "$REFSEQ_TABLE" --gatkjar "$STING_HOME"/dist/GenomeAnalysisTK.jar -log queue_log.txt -statusTo corin -bsub $2 +java -Djava.io.tmpdir="$TMP_DIR" -jar "$STING_HOME"/dist/Queue.jar -jobQueue "$JOB_QUEUE" -shortJobQueue "$SHORT_QUEUE" -jobProject "$SAMPLE_SET" -jobPrefix "$SAMPLE_SET" -tearScript "$STING_HOME"/R/DataProcessingReport/GetTearsheetStats.R -S "$STING_HOME"/scala/qscript/playground/FullCallingPipeline.q -Y "$SAMPLE_SET".yaml --gatkjar "$STING_HOME"/dist/GenomeAnalysisTK.jar -log queue_log.txt -statusTo corin -bsub $2