Renamed the pipeline yaml dbsnpFile to genotypeDbsnp, and added an evalDbsnp.

Added a genotypeDbsnpType and evalDbsnpType to check the extensions for .vcf or .rod.
Moved renaming of "recalibrated" bams to "cleaned" from sed to yaml generation template (see diff for more info).
Renamed fullCallingPipeline.q to FullCallingPipeline.q.
Applied the changes above to the FullCallingPipelineTest, though the test remains disabled until VariantEval is updated.
Removed refseq table from the queue.sh wrapper script. Only specified in the yaml.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5213 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
kshakir 2011-02-07 22:01:09 +00:00
parent bceb2a9460
commit 8040998c15
5 changed files with 77 additions and 45 deletions

View File

@ -35,7 +35,8 @@ public class PipelineProject {
private String name;
private File referenceFile;
private File intervalList;
private File dbsnpFile;
private File genotypeDbsnp;
private File evalDbsnp;
private File refseqTable;
private Map<String, String> tags = new TreeMap<String, String>();
@ -63,12 +64,40 @@ public class PipelineProject {
this.referenceFile = referenceFile;
}
/** use genotype dbsnp file */
@Deprecated
public File getDbsnpFile() {
return dbsnpFile;
return getGenotypeDbsnp();
}
/** use genotype dbsnp file */
@Deprecated
public void setDbsnpFile(File dbsnpFile) {
this.dbsnpFile = dbsnpFile;
this.setGenotypeDbsnp(dbsnpFile);
}
public File getGenotypeDbsnp() {
return genotypeDbsnp;
}
public void setGenotypeDbsnp(File genotypeDbsnp) {
this.genotypeDbsnp = genotypeDbsnp;
}
public String getGenotypeDbsnpType() {
return getDbsnpType(genotypeDbsnp);
}
public File getEvalDbsnp() {
return evalDbsnp;
}
public void setEvalDbsnp(File evalDbsnp) {
this.evalDbsnp = evalDbsnp;
}
/**
 * Returns the ROD type ("vcf" or "dbsnp") of the eval dbsnp file,
 * or null when no eval dbsnp has been set.
 * Fixes a copy-paste bug: this previously derived the type from
 * genotypeDbsnp, so the eval type was wrong whenever the two files
 * had different extensions.
 */
public String getEvalDbsnpType() {
return getDbsnpType(evalDbsnp);
}
public File getRefseqTable() {
@ -86,4 +115,13 @@ public class PipelineProject {
public void setTags(Map<String, String> tags) {
this.tags = tags;
}
/**
 * Derives the ROD type string for a dbsnp file from its extension.
 * A name ending in ".vcf" (case-insensitive) maps to "vcf"; every
 * other name maps to "dbsnp". A null file yields null.
 */
private String getDbsnpType(File file) {
if (file == null) {
return null;
}
String lowerName = file.getName().toLowerCase();
return lowerName.endsWith(".vcf") ? "vcf" : "dbsnp";
}
}

View File

@ -10,7 +10,7 @@ import org.broadinstitute.sting.queue.QScript
import collection.JavaConversions._
import org.broadinstitute.sting.utils.yaml.YamlUtils
class fullCallingPipeline extends QScript {
class FullCallingPipeline extends QScript {
qscript =>
@Argument(doc="the YAML file specifying inputs, interval lists, reference sequence, etc.", shortName="Y")
@ -64,8 +64,6 @@ class fullCallingPipeline extends QScript {
private var pipeline: Pipeline = _
private var dbsnpType: String = _
trait CommandLineGATKArgs extends CommandLineGATK {
this.intervals = List(qscript.pipeline.getProject.getIntervalList)
this.jarFile = qscript.gatkJar
@ -79,12 +77,6 @@ class fullCallingPipeline extends QScript {
def script = {
pipeline = YamlUtils.load(classOf[Pipeline], qscript.yamlFile)
//var dbsnpType: String = _ //figure out how to get it so this is recognized and the whole thing has access to it.
if (qscript.pipeline.getProject.getDbsnpFile.toString.contains("rod")){
dbsnpType = "dbSNP"
} else {
dbsnpType = "VCF"
}
val projectBase: String = qscript.pipeline.getProject.getName
// TODO: Fix command lines that pass -refseqTable
@ -131,7 +123,7 @@ class fullCallingPipeline extends QScript {
realigner.intervals = Nil
realigner.intervalsString = Nil
realigner.scatterCount = num_cleaner_scatter_jobs
realigner.rodBind :+= RodBind("dbsnp", dbsnpType, qscript.pipeline.getProject.getDbsnpFile)
realigner.rodBind :+= RodBind("dbsnp", qscript.pipeline.getProject.getGenotypeDbsnpType, qscript.pipeline.getProject.getGenotypeDbsnp)
realigner.rodBind :+= RodBind("indels", "VCF", swapExt(realigner.reference_sequence.getParentFile, realigner.reference_sequence, "fasta", "1kg_pilot_indels.vcf"))
// if scatter count is > 1, do standard scatter gather, if not, explicitly set up fix mates
@ -214,7 +206,7 @@ class fullCallingPipeline extends QScript {
snps.group :+= "Standard"
snps.out = new File("SnpCalls", base+".vcf")
snps.downsample_to_coverage = Some(qscript.downsampling_coverage)
snps.rodBind :+= RodBind("dbsnp", dbsnpType, qscript.pipeline.getProject.getDbsnpFile)
snps.rodBind :+= RodBind("dbsnp", qscript.pipeline.getProject.getGenotypeDbsnpType, qscript.pipeline.getProject.getGenotypeDbsnp)
snps.memoryLimit = Some(6)
snps.scatterCount = qscript.num_snp_scatter_jobs
@ -242,7 +234,7 @@ class fullCallingPipeline extends QScript {
indels.group :+= "Standard"
indels.out = new File("IndelCalls", base+".vcf")
indels.downsample_to_coverage = Some(qscript.downsampling_coverage)
indels.rodBind :+= RodBind("dbsnp", dbsnpType, qscript.pipeline.getProject.getDbsnpFile)
indels.rodBind :+= RodBind("dbsnp", qscript.pipeline.getProject.getGenotypeDbsnpType, qscript.pipeline.getProject.getGenotypeDbsnp)
indels.memoryLimit = Some(6)
indels.genotype_likelihoods_model = Option(org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.DINDEL)
@ -308,12 +300,7 @@ class fullCallingPipeline extends QScript {
eval.reportLocation = new File("SnpCalls", base+".eval")
eval.reportType = Option(org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType.R)
eval.analysisName = base+"_VariantEval"
if(dbsnpType=="VCF"){
eval.DBSNP = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_b37.rod")
} else{
eval.rodBind :+= RodBind("dbsnp", dbsnpType, qscript.pipeline.getProject.getDbsnpFile)
}
eval.rodBind :+= RodBind("dbsnp", qscript.pipeline.getProject.getEvalDbsnpType, qscript.pipeline.getProject.getEvalDbsnp)
// 5. Make the bam list
val listOfBams = new File(base +".BamFiles.list")

View File

@ -23,11 +23,6 @@ class FullCallingPipelineTest {
new K1gBam("C474", "NA19652", 1),
new K1gBam("C474", "NA19654", 1))
// In fullCallingPipeline.q VariantEval is always compared against 129.
// Until the newvarianteval is finalized which will allow java import of the prior results,
// we re-run VariantEval to validate the run, and replicate that behavior here.
private final val variantEvalDbsnpFile = new File(BaseTest.b37dbSNP129)
val k1gChr20Dataset = {
val dataset = newK1gDataset("Barcoded_1000G_WEx_chr20")
dataset.pipeline.getProject.setIntervalList(new File(BaseTest.GATKDataLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.chr20.interval_list"))
@ -64,7 +59,8 @@ class FullCallingPipelineTest {
val project = new PipelineProject
project.setName(projectName)
project.setReferenceFile(new File(BaseTest.hg19Reference))
project.setDbsnpFile(new File(BaseTest.b37dbSNP132))
project.setGenotypeDbsnp(new File(BaseTest.b37dbSNP132))
project.setEvalDbsnp(new File(BaseTest.b37dbSNP129))
project.setRefseqTable(new File(BaseTest.hg19Refseq))
var samples = List.empty[PipelineSample]
@ -94,12 +90,12 @@ class FullCallingPipelineTest {
@Test(dataProvider="datasets", enabled=false)
def testFullCallingPipeline(dataset: PipelineDataset) = {
val projectName = dataset.pipeline.getProject.getName
val testName = "fullCallingPipeline-" + projectName
val testName = "FullCallingPipeline-" + projectName
val yamlFile = writeYaml(testName, dataset.pipeline)
var cleanType = "cleaned"
// Run the pipeline with the expected inputs.
var pipelineCommand = ("-retry 1 -S scala/qscript/playground/fullCallingPipeline.q" +
var pipelineCommand = ("-retry 1 -S scala/qscript/playground/FullCallingPipeline.q" +
" -jobProject %s -Y %s" +
" -tearScript %s/R/DataProcessingReport/GetTearsheetStats.R" +
" --gatkjar %s")
@ -119,7 +115,7 @@ class FullCallingPipelineTest {
pipelineSpec.evalSpec.vcf = new File(PipelineTest.runDir(testName) + "SnpCalls/%s.%s.annotated.handfiltered.vcf".format(projectName, cleanType))
pipelineSpec.evalSpec.reference = dataset.pipeline.getProject.getReferenceFile
pipelineSpec.evalSpec.intervals = dataset.pipeline.getProject.getIntervalList
pipelineSpec.evalSpec.dbsnp = variantEvalDbsnpFile
pipelineSpec.evalSpec.dbsnp = dataset.pipeline.getProject.getEvalDbsnp
pipelineSpec.evalSpec.validations = dataset.validations
// Run the test, at least checking if the command compiles

View File

@ -32,19 +32,22 @@ FIREHOSE_ANNOTATIONS=(reference_file interval_list \
# YAML templates
# Project YAML template, once per file.
PROJECT_YAML_TEMPLATE='"\n\
project: {\n\
name: '"$ENTITY_SET_ID"',\n\
referenceFile: %s,\n\
dbsnpFile: %s,\n\
genotypeDbsnpFile: %s,\n\
evalDbsnpFile: %s,\n\
refseqTable: %s,\n\
intervalList: %s\n\
},", $1, dbsnp, refseq, $2'
},", $1, genotypeDbsnp, evalDbsnp, refseq, $2'
# Project YAML template, once per sample.
SAMPLE_YAML_TEMPLATE='"\n\
{\n\
id: %s,\n\
bamFiles: { recalibrated: %s },\n\
bamFiles: { cleaned: %s },\n\
tags: {\n\
SQUIDProject: %s,\n\
CollaboratorID: %s\n\
@ -86,20 +89,27 @@ BEGIN {
refseq_dir = "/humgen/gsa-hpprojects/GATK/data/Annotations/refseq/";
dbsnp_dir = "/humgen/gsa-hpprojects/GATK/data/";
dbsnps["Homo_sapiens_assembly18.fasta"] = dbsnp_dir "dbsnp_129_hg18.rod";
# add hg18 specific files to awk associative arrays
genotypeDbsnps["Homo_sapiens_assembly18.fasta"] = dbsnp_dir "dbsnp_129_hg18.rod";
evalDbsnps["Homo_sapiens_assembly18.fasta"] = dbsnp_dir "dbsnp_129_hg18.rod";
refseqs["Homo_sapiens_assembly18.fasta"] = refseq_dir "refGene-big-table-hg18.txt";
dbsnps["Homo_sapiens_assembly19.fasta"] = dbsnp_dir "dbsnp_132_b37.vcf";
# add hg19 specific files to awk associative arrays
genotypeDbsnps["Homo_sapiens_assembly19.fasta"] = dbsnp_dir "dbsnp_132_b37.vcf";
evalDbsnps["Homo_sapiens_assembly19.fasta"] = dbsnp_dir "dbsnp_129_b37.rod";
refseqs["Homo_sapiens_assembly19.fasta"] = refseq_dir "refGene-big-table-hg19.txt";
printf "{"
}
{
if (NR == 1) {
# Based on the reference of the first sample, specify the dbsnps and refseq tables.
reference_part_count = split($1, reference_parts, "/")
reference_name = reference_parts[reference_part_count];
dbsnp = dbsnps[reference_name];
genotypeDbsnp = genotypeDbsnps[reference_name];
evalDbsnp = evalDbsnps[reference_name];
refseq = refseqs[reference_name];
printf '"$PROJECT_YAML_TEMPLATE"'
@ -115,12 +125,14 @@ END {
print "\n}"
}' > $PIPELINE_YAML_FILE
hg19=`grep "assembly19" -c $PIPELINE_YAML_FILE`
#hg19=`grep "assembly19" -c $PIPELINE_YAML_FILE`
if [ "$hg19" -ne 0 ]; then
sed 's/\/humgen.*rod/\/humgen\/gsa-hpprojects\/GATK\/data\/dbsnp_132_b37.vcf/' $PIPELINE_YAML_FILE > yaml2
mv yaml2 $PIPELINE_YAML_FILE
fi
# NOTE: DBSNP's are populated via AWK's BEGIN block above.
#if [ "$hg19" -ne 0 ]; then
# sed 's/\/humgen.*rod/\/humgen\/gsa-hpprojects\/GATK\/data\/dbsnp_132_b37.vcf/' $PIPELINE_YAML_FILE > yaml2
# mv yaml2 $PIPELINE_YAML_FILE
#fi
sed 's/recalibrat/clean/' $PIPELINE_YAML_FILE > yaml2
mv yaml2 $PIPELINE_YAML_FILE
# NOTE: Renamed "recalibrated" to "cleaned" in SAMPLE_YAML_TEMPLATE above.
#sed 's/recalibrat/clean/' $PIPELINE_YAML_FILE > yaml2
#mv yaml2 $PIPELINE_YAML_FILE

View File

@ -2,7 +2,6 @@
SAMPLE_SET=$1
STING_HOME=/humgen/gsa-pipeline/.repository
REFSEQ_TABLE=/humgen/gsa-hpprojects/GATK/data/Annotations/refseq/refGene-big-table-hg19.txt
JOB_QUEUE=week
SHORT_QUEUE=hour
TMP_DIR=$PWD/tmp
@ -14,4 +13,4 @@ use R-2.10
use Oracle-full-client
use .cx-oracle-5.0.2-python-2.6.5-oracle-full-client-11.1
java -Djava.io.tmpdir="$TMP_DIR" -jar "$STING_HOME"/dist/Queue.jar -jobQueue "$JOB_QUEUE" -shortJobQueue "$SHORT_QUEUE" -jobProject "$SAMPLE_SET" -jobPrefix "$SAMPLE_SET" -tearScript "$STING_HOME"/R/DataProcessingReport/GetTearsheetStats.R -S "$STING_HOME"/scala/qscript/playground/fullCallingPipeline.q -Y "$SAMPLE_SET".yaml -refseqTable "$REFSEQ_TABLE" --gatkjar "$STING_HOME"/dist/GenomeAnalysisTK.jar -log queue_log.txt -statusTo corin -bsub $2
java -Djava.io.tmpdir="$TMP_DIR" -jar "$STING_HOME"/dist/Queue.jar -jobQueue "$JOB_QUEUE" -shortJobQueue "$SHORT_QUEUE" -jobProject "$SAMPLE_SET" -jobPrefix "$SAMPLE_SET" -tearScript "$STING_HOME"/R/DataProcessingReport/GetTearsheetStats.R -S "$STING_HOME"/scala/qscript/playground/FullCallingPipeline.q -Y "$SAMPLE_SET".yaml --gatkjar "$STING_HOME"/dist/GenomeAnalysisTK.jar -log queue_log.txt -statusTo corin -bsub $2