From 8040998c15a48949999c91e3769bdc7efba37c01 Mon Sep 17 00:00:00 2001
From: kshakir <kshakir@348d0f76-0448-11de-a6fe-93d51630548a>
Date: Mon, 7 Feb 2011 22:01:09 +0000
Subject: [PATCH] Renamed the pipeline yaml dbsnpFile to genotypeDbsnp, and
 added an evalDbsnp. Added a genotypeDbsnpType and evalDbsnpType to check the
 extensions for .vcf or .rod. Moved renaming of "recalibrated" bams to
 "cleaned" from sed to yaml generation template (see diff for more info).
 Renamed fCP.q to FCP.q. Though it's still disabled until VariantEval is
 updated, added changes above to the FCPTest. Removed refseq table from the
 queue.sh wrapper script. Only specified in the yaml.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5213 348d0f76-0448-11de-a6fe-93d51630548a
---
 .../datasources/pipeline/PipelineProject.java | 44 +++++++++++++++++--
 ...allingPipeline.q => FullCallingPipeline.q} | 23 +++-------
 .../playground/FullCallingPipelineTest.scala  | 14 +++---
 shell/getFirehosePipelineYaml.sh              | 38 ++++++++++------
 shell/queue.sh                                |  3 +-
 5 files changed, 77 insertions(+), 45 deletions(-)
 rename scala/qscript/playground/{fullCallingPipeline.q => FullCallingPipeline.q} (94%)

diff --git a/java/src/org/broadinstitute/sting/datasources/pipeline/PipelineProject.java b/java/src/org/broadinstitute/sting/datasources/pipeline/PipelineProject.java
index 9b51beb53..5f917daa6 100644
--- a/java/src/org/broadinstitute/sting/datasources/pipeline/PipelineProject.java
+++ b/java/src/org/broadinstitute/sting/datasources/pipeline/PipelineProject.java
@@ -35,7 +35,8 @@ public class PipelineProject {
     private String name;
     private File referenceFile;
     private File intervalList;
-    private File dbsnpFile;
+    private File genotypeDbsnp;
+    private File evalDbsnp;
     private File refseqTable;
     private Map<String, String> tags = new TreeMap<String, String>();
 
@@ -63,12 +64,40 @@ public class PipelineProject {
         this.referenceFile = referenceFile;
     }
 
+    /** use genotype dbsnp file */
+    @Deprecated
     public File getDbsnpFile() {
-        return dbsnpFile;
+        return getGenotypeDbsnp();
     }
 
+    /** use genotype dbsnp file */
+    @Deprecated
     public void setDbsnpFile(File dbsnpFile) {
-        this.dbsnpFile = dbsnpFile;
+        this.setGenotypeDbsnp(dbsnpFile);
+    }
+
+    public File getGenotypeDbsnp() {
+        return genotypeDbsnp;
+    }
+
+    public void setGenotypeDbsnp(File genotypeDbsnp) {
+        this.genotypeDbsnp = genotypeDbsnp;
+    }
+
+    public String getGenotypeDbsnpType() {
+        return getDbsnpType(genotypeDbsnp);
+    }
+
+    public File getEvalDbsnp() {
+        return evalDbsnp;
+    }
+
+    public void setEvalDbsnp(File evalDbsnp) {
+        this.evalDbsnp = evalDbsnp;
+    }
+
+    public String getEvalDbsnpType() {
+        return getDbsnpType(genotypeDbsnp);
     }
 
     public File getRefseqTable() {
@@ -86,4 +115,13 @@ public class PipelineProject {
     public void setTags(Map<String, String> tags) {
         this.tags = tags;
     }
+
+    private String getDbsnpType(File file) {
+        if (file == null)
+            return null;
+        else if (file.getName().toLowerCase().endsWith(".vcf"))
+            return "vcf";
+        else
+            return "dbsnp";
+    }
 }
diff --git a/scala/qscript/playground/fullCallingPipeline.q b/scala/qscript/playground/FullCallingPipeline.q
similarity index 94%
rename from scala/qscript/playground/fullCallingPipeline.q
rename to scala/qscript/playground/FullCallingPipeline.q
index 487e2fe5a..095463a25 100755
--- a/scala/qscript/playground/fullCallingPipeline.q
+++ b/scala/qscript/playground/FullCallingPipeline.q
@@ -10,7 +10,7 @@ import org.broadinstitute.sting.queue.QScript
 import collection.JavaConversions._
 import org.broadinstitute.sting.utils.yaml.YamlUtils
 
-class fullCallingPipeline extends QScript {
+class FullCallingPipeline extends QScript {
   qscript =>
 
   @Argument(doc="the YAML file specifying inputs, interval lists, reference sequence, etc.", shortName="Y")
@@ -64,8 +64,6 @@ class fullCallingPipeline extends QScript {
 
   private var pipeline: Pipeline = _
 
-  private var dbsnpType: String = _
-
   trait CommandLineGATKArgs extends CommandLineGATK {
     this.intervals = List(qscript.pipeline.getProject.getIntervalList)
     this.jarFile = qscript.gatkJar
@@ -79,12 +77,6 @@ class fullCallingPipeline extends QScript {
 
   def script = {
     pipeline = YamlUtils.load(classOf[Pipeline], qscript.yamlFile)
-    //var dbsnpType: String = _ //figure out how to get it so this is recognized and the whole thing has access to it.
-      if (qscript.pipeline.getProject.getDbsnpFile.toString.contains("rod")){
-        dbsnpType = "dbSNP"
-      } else  {
-        dbsnpType = "VCF"
-      }
 
     val projectBase: String = qscript.pipeline.getProject.getName
     // TODO: Fix command lines that pass -refseqTable
@@ -131,7 +123,7 @@ class fullCallingPipeline extends QScript {
         realigner.intervals = Nil
         realigner.intervalsString = Nil
         realigner.scatterCount = num_cleaner_scatter_jobs
-        realigner.rodBind :+= RodBind("dbsnp", dbsnpType, qscript.pipeline.getProject.getDbsnpFile)
+        realigner.rodBind :+= RodBind("dbsnp", qscript.pipeline.getProject.getGenotypeDbsnpType, qscript.pipeline.getProject.getGenotypeDbsnp)
         realigner.rodBind :+= RodBind("indels", "VCF", swapExt(realigner.reference_sequence.getParentFile, realigner.reference_sequence, "fasta", "1kg_pilot_indels.vcf"))
 
         // if scatter count is > 1, do standard scatter gather, if not, explicitly set up fix mates
@@ -214,7 +206,7 @@ class fullCallingPipeline extends QScript {
     snps.group :+= "Standard"
     snps.out = new File("SnpCalls", base+".vcf")
     snps.downsample_to_coverage = Some(qscript.downsampling_coverage)
-    snps.rodBind :+= RodBind("dbsnp", dbsnpType, qscript.pipeline.getProject.getDbsnpFile)
+    snps.rodBind :+= RodBind("dbsnp", qscript.pipeline.getProject.getGenotypeDbsnpType, qscript.pipeline.getProject.getGenotypeDbsnp)
     snps.memoryLimit = Some(6)
 
     snps.scatterCount = qscript.num_snp_scatter_jobs
@@ -242,7 +234,7 @@ class fullCallingPipeline extends QScript {
     indels.group :+= "Standard"
     indels.out = new File("IndelCalls", base+".vcf")
     indels.downsample_to_coverage = Some(qscript.downsampling_coverage)
-    indels.rodBind :+= RodBind("dbsnp", dbsnpType, qscript.pipeline.getProject.getDbsnpFile)
+    indels.rodBind :+= RodBind("dbsnp", qscript.pipeline.getProject.getGenotypeDbsnpType, qscript.pipeline.getProject.getGenotypeDbsnp)
     indels.memoryLimit = Some(6)
     indels.genotype_likelihoods_model = Option(org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.DINDEL)
 
@@ -308,12 +300,7 @@ class fullCallingPipeline extends QScript {
     eval.reportLocation = new File("SnpCalls", base+".eval")
     eval.reportType = Option(org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType.R)
     eval.analysisName = base+"_VariantEval"
-    if(dbsnpType=="VCF"){
-     eval.DBSNP = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_b37.rod")
-
-    } else{
-      eval.rodBind :+= RodBind("dbsnp", dbsnpType, qscript.pipeline.getProject.getDbsnpFile)
-    }
+    eval.rodBind :+= RodBind("dbsnp", qscript.pipeline.getProject.getEvalDbsnpType, qscript.pipeline.getProject.getEvalDbsnp)
 
     // 5. Make the bam list
     val listOfBams =  new File(base +".BamFiles.list")
diff --git a/scala/test/org/broadinstitute/sting/queue/pipeline/playground/FullCallingPipelineTest.scala b/scala/test/org/broadinstitute/sting/queue/pipeline/playground/FullCallingPipelineTest.scala
index 6e2177d02..96cea668a 100644
--- a/scala/test/org/broadinstitute/sting/queue/pipeline/playground/FullCallingPipelineTest.scala
+++ b/scala/test/org/broadinstitute/sting/queue/pipeline/playground/FullCallingPipelineTest.scala
@@ -23,11 +23,6 @@ class FullCallingPipelineTest {
     new K1gBam("C474", "NA19652", 1),
     new K1gBam("C474", "NA19654", 1))
 
-  // In fullCallingPipeline.q VariantEval is always compared against 129.
-  // Until the newvarianteval is finalized which will allow java import of the prior results,
-  // we re-run VariantEval to validate the run, and replicate that behavior here.
-  private final val variantEvalDbsnpFile = new File(BaseTest.b37dbSNP129)
-
   val k1gChr20Dataset = {
     val dataset = newK1gDataset("Barcoded_1000G_WEx_chr20")
     dataset.pipeline.getProject.setIntervalList(new File(BaseTest.GATKDataLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.chr20.interval_list"))
@@ -64,7 +59,8 @@ class FullCallingPipelineTest {
     val project = new PipelineProject
     project.setName(projectName)
     project.setReferenceFile(new File(BaseTest.hg19Reference))
-    project.setDbsnpFile(new File(BaseTest.b37dbSNP132))
+    project.setGenotypeDbsnp(new File(BaseTest.b37dbSNP132))
+    project.setEvalDbsnp(new File(BaseTest.b37dbSNP129))
     project.setRefseqTable(new File(BaseTest.hg19Refseq))
 
     var samples = List.empty[PipelineSample]
@@ -94,12 +90,12 @@ class FullCallingPipelineTest {
   @Test(dataProvider="datasets", enabled=false)
   def testFullCallingPipeline(dataset: PipelineDataset) = {
     val projectName = dataset.pipeline.getProject.getName
-    val testName = "fullCallingPipeline-" + projectName
+    val testName = "FullCallingPipeline-" + projectName
     val yamlFile = writeYaml(testName, dataset.pipeline)
     var cleanType = "cleaned"
 
     // Run the pipeline with the expected inputs.
-    var pipelineCommand = ("-retry 1 -S scala/qscript/playground/fullCallingPipeline.q" +
+    var pipelineCommand = ("-retry 1 -S scala/qscript/playground/FullCallingPipeline.q" +
             " -jobProject %s -Y %s" +
             " -tearScript %s/R/DataProcessingReport/GetTearsheetStats.R" +
             " --gatkjar %s")
@@ -119,7 +115,7 @@ class FullCallingPipelineTest {
     pipelineSpec.evalSpec.vcf = new File(PipelineTest.runDir(testName) + "SnpCalls/%s.%s.annotated.handfiltered.vcf".format(projectName, cleanType))
     pipelineSpec.evalSpec.reference = dataset.pipeline.getProject.getReferenceFile
     pipelineSpec.evalSpec.intervals = dataset.pipeline.getProject.getIntervalList
-    pipelineSpec.evalSpec.dbsnp = variantEvalDbsnpFile
+    pipelineSpec.evalSpec.dbsnp = dataset.pipeline.getProject.getEvalDbsnp
     pipelineSpec.evalSpec.validations = dataset.validations
 
     // Run the test, at least checking if the command compiles
diff --git a/shell/getFirehosePipelineYaml.sh b/shell/getFirehosePipelineYaml.sh
index a6e1ef10c..8f03b1e4b 100755
--- a/shell/getFirehosePipelineYaml.sh
+++ b/shell/getFirehosePipelineYaml.sh
@@ -32,19 +32,22 @@ FIREHOSE_ANNOTATIONS=(reference_file interval_list \
 
 # YAML templates
 
+# Project YAML template, once per file.
 PROJECT_YAML_TEMPLATE='"\n\
   project: {\n\
     name: '"$ENTITY_SET_ID"',\n\
     referenceFile: %s,\n\
-    dbsnpFile: %s,\n\
+    genotypeDbsnpFile: %s,\n\
+    evalDbsnpFile: %s,\n\
     refseqTable: %s,\n\
     intervalList: %s\n\
-  },", $1, dbsnp, refseq, $2'
+  },", $1, genotypeDbsnp, evalDbsnp, refseq, $2'
 
+# Project YAML template, once per sample.
 SAMPLE_YAML_TEMPLATE='"\n\
     {\n\
       id: %s,\n\
-      bamFiles: { recalibrated: %s },\n\
+      bamFiles: { cleaned: %s },\n\
       tags: {\n\
         SQUIDProject: %s,\n\
         CollaboratorID: %s\n\
@@ -86,20 +89,27 @@ BEGIN {
     refseq_dir = "/humgen/gsa-hpprojects/GATK/data/Annotations/refseq/";
     dbsnp_dir = "/humgen/gsa-hpprojects/GATK/data/";
 
-    dbsnps["Homo_sapiens_assembly18.fasta"] = dbsnp_dir "dbsnp_129_hg18.rod";
+    # add hg18 specific files to awk associative arrays
+    genotypeDbsnps["Homo_sapiens_assembly18.fasta"] = dbsnp_dir "dbsnp_129_hg18.rod";
+    evalDbsnps["Homo_sapiens_assembly18.fasta"] = dbsnp_dir "dbsnp_129_hg18.rod";
     refseqs["Homo_sapiens_assembly18.fasta"] = refseq_dir "refGene-big-table-hg18.txt";
 
-    dbsnps["Homo_sapiens_assembly19.fasta"] = dbsnp_dir "dbsnp_132_b37.vcf";
+    # add hg19 specific files to awk associative arrays
+    genotypeDbsnps["Homo_sapiens_assembly19.fasta"] = dbsnp_dir "dbsnp_132_b37.vcf";
+    evalDbsnps["Homo_sapiens_assembly19.fasta"] = dbsnp_dir "dbsnp_129_b37.rod";
     refseqs["Homo_sapiens_assembly19.fasta"] = refseq_dir "refGene-big-table-hg19.txt";
 
     printf "{"
 }
 {
     if (NR == 1) {
+        # Based on the reference of the first sample, specify the dbsnps and refseq tables.
+
         reference_part_count = split($1, reference_parts, "/")
         reference_name = reference_parts[reference_part_count];
 
-        dbsnp = dbsnps[reference_name];
+        genotypeDbsnp = genotypeDbsnps[reference_name];
+        evalDbsnp = evalDbsnps[reference_name];
         refseq = refseqs[reference_name];
 
         printf '"$PROJECT_YAML_TEMPLATE"'
@@ -115,12 +125,14 @@ END {
     print "\n}"
 }' > $PIPELINE_YAML_FILE
 
-hg19=`grep "assembly19" -c $PIPELINE_YAML_FILE`
+#hg19=`grep "assembly19" -c $PIPELINE_YAML_FILE`
 
-if [ "$hg19" -ne 0 ]; then
-    sed 's/\/humgen.*rod/\/humgen\/gsa-hpprojects\/GATK\/data\/dbsnp_132_b37.vcf/' $PIPELINE_YAML_FILE > yaml2 
-    mv yaml2 $PIPELINE_YAML_FILE
-fi
+# NOTE: DBSNP's are populated via AWK's BEGIN block above.
+#if [ "$hg19" -ne 0 ]; then
+#    sed 's/\/humgen.*rod/\/humgen\/gsa-hpprojects\/GATK\/data\/dbsnp_132_b37.vcf/' $PIPELINE_YAML_FILE > yaml2
+#    mv yaml2 $PIPELINE_YAML_FILE
+#fi
 
-sed 's/recalibrat/clean/' $PIPELINE_YAML_FILE > yaml2
-mv yaml2 $PIPELINE_YAML_FILE
+# NOTE: Renamed "recalibrated" to "cleaned" in SAMPLE_YAML_TEMPLATE above.
+#sed 's/recalibrat/clean/' $PIPELINE_YAML_FILE > yaml2
+#mv yaml2 $PIPELINE_YAML_FILE
diff --git a/shell/queue.sh b/shell/queue.sh
index a939f5525..3b2826d92 100755
--- a/shell/queue.sh
+++ b/shell/queue.sh
@@ -2,7 +2,6 @@
 
 SAMPLE_SET=$1
 STING_HOME=/humgen/gsa-pipeline/.repository
-REFSEQ_TABLE=/humgen/gsa-hpprojects/GATK/data/Annotations/refseq/refGene-big-table-hg19.txt
 JOB_QUEUE=week
 SHORT_QUEUE=hour
 TMP_DIR=$PWD/tmp
@@ -14,4 +13,4 @@ use R-2.10
 use Oracle-full-client
 use .cx-oracle-5.0.2-python-2.6.5-oracle-full-client-11.1
 
-java -Djava.io.tmpdir="$TMP_DIR" -jar "$STING_HOME"/dist/Queue.jar -jobQueue "$JOB_QUEUE" -shortJobQueue "$SHORT_QUEUE" -jobProject "$SAMPLE_SET" -jobPrefix "$SAMPLE_SET" -tearScript "$STING_HOME"/R/DataProcessingReport/GetTearsheetStats.R -S "$STING_HOME"/scala/qscript/playground/fullCallingPipeline.q -Y "$SAMPLE_SET".yaml -refseqTable "$REFSEQ_TABLE" --gatkjar "$STING_HOME"/dist/GenomeAnalysisTK.jar -log queue_log.txt -statusTo corin -bsub $2
+java -Djava.io.tmpdir="$TMP_DIR" -jar "$STING_HOME"/dist/Queue.jar -jobQueue "$JOB_QUEUE" -shortJobQueue "$SHORT_QUEUE" -jobProject "$SAMPLE_SET" -jobPrefix "$SAMPLE_SET" -tearScript "$STING_HOME"/R/DataProcessingReport/GetTearsheetStats.R -S "$STING_HOME"/scala/qscript/playground/FullCallingPipeline.q -Y "$SAMPLE_SET".yaml --gatkjar "$STING_HOME"/dist/GenomeAnalysisTK.jar -log queue_log.txt -statusTo corin -bsub $2