From 20b38b38f302bb2142d6ae6c46277e2bcf0a4173 Mon Sep 17 00:00:00 2001 From: kshakir Date: Wed, 22 Sep 2010 19:47:49 +0000 Subject: [PATCH] Updated from SnakeYAML 1.6 to 1.7. Added a pipeline java bean and YAML utility to serialize java beans. Added a getFirehosePipelineYaml.sh that can pull firehose data into the pipeline yaml file format. Updated the fullCallingPipeline.q to begin using the pipeline yaml file format for bams and reference. More changes to come as this code gets tested out in the fullCallingPipeline. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4329 348d0f76-0448-11de-a6fe-93d51630548a --- ivy.xml | 2 +- .../sting/datasources/pipeline/Pipeline.java | 62 ++++++++++ .../datasources/pipeline/PipelineProject.java | 80 +++++++++++++ .../datasources/pipeline/PipelineSample.java | 62 ++++++++++ .../datasources/sample/SampleDataSource.java | 4 +- .../utils/yaml/FieldOrderComparator.java | 52 +++++++++ .../utils/yaml/StingYamlRepresenter.java | 88 ++++++++++++++ .../sting/utils/yaml/YamlUtils.java | 107 ++++++++++++++++++ .../pipeline/PipelineUnitTest.java | 81 +++++++++++++ scala/qscript/fullCallingPipeline.q | 61 +++++----- shell/getFirehosePipelineYaml.sh | 92 +++++++++++++++ 11 files changed, 657 insertions(+), 34 deletions(-) create mode 100644 java/src/org/broadinstitute/sting/datasources/pipeline/Pipeline.java create mode 100644 java/src/org/broadinstitute/sting/datasources/pipeline/PipelineProject.java create mode 100644 java/src/org/broadinstitute/sting/datasources/pipeline/PipelineSample.java create mode 100644 java/src/org/broadinstitute/sting/utils/yaml/FieldOrderComparator.java create mode 100644 java/src/org/broadinstitute/sting/utils/yaml/StingYamlRepresenter.java create mode 100644 java/src/org/broadinstitute/sting/utils/yaml/YamlUtils.java create mode 100644 java/test/org/broadinstitute/sting/datasources/pipeline/PipelineUnitTest.java create mode 100644 shell/getFirehosePipelineYaml.sh diff --git a/ivy.xml 
b/ivy.xml index c62acf8c8..a078c0f40 100644 --- a/ivy.xml +++ b/ivy.xml @@ -17,7 +17,7 @@ - + diff --git a/java/src/org/broadinstitute/sting/datasources/pipeline/Pipeline.java b/java/src/org/broadinstitute/sting/datasources/pipeline/Pipeline.java new file mode 100644 index 000000000..f8f8b2d29 --- /dev/null +++ b/java/src/org/broadinstitute/sting/datasources/pipeline/Pipeline.java @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.datasources.pipeline; + +import java.util.ArrayList; +import java.util.List; + +/** + * Java bean for storing a list of samples for a pipeline. + * + * NOTE: This class is used in a very similar way to the classes in + * org.broadinstitute.sting.gatk.datasources.sample. + * + * Both store / load sample information from the file system as YAML. 
+ * + * This package will likely be refactored to share common functionality + * with the other at a future date as requirements coalesce. + * + * - kshakir September 22, 2010 + */ +public class Pipeline { + private PipelineProject project = new PipelineProject(); + private List samples = new ArrayList(); + + public PipelineProject getProject() { + return project; + } + + public void setProject(PipelineProject project) { + this.project = project; + } + + public List getSamples() { + return samples; + } + + public void setSamples(List samples) { + this.samples = samples; + } +} diff --git a/java/src/org/broadinstitute/sting/datasources/pipeline/PipelineProject.java b/java/src/org/broadinstitute/sting/datasources/pipeline/PipelineProject.java new file mode 100644 index 000000000..8c8e8c7f1 --- /dev/null +++ b/java/src/org/broadinstitute/sting/datasources/pipeline/PipelineProject.java @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
/**
 * Java bean defining the project for a pipeline.
 */
public class PipelineProject {
    // NOTE: the declaration order of these fields is the order in which
    // FieldOrderComparator sorts the properties when the bean is dumped to YAML.
    private String name;
    private File referenceFile;
    private File intervalList;
    private File dbsnpFile;

    /** Free-form key/value tags; a TreeMap so the keys iterate in sorted order. */
    private Map<String, String> tags = new TreeMap<String, String>();

    /**
     * @return the project name, or null if it has not been set.
     */
    public String getName() {
        return name;
    }

    /**
     * @param name the project name.
     */
    public void setName(String name) {
        this.name = name;
    }

    /**
     * @return the interval list file, or null if it has not been set.
     */
    public File getIntervalList() {
        return intervalList;
    }

    /**
     * @param intervalList the interval list file.
     */
    public void setIntervalList(File intervalList) {
        this.intervalList = intervalList;
    }

    /**
     * @return the reference file, or null if it has not been set.
     */
    public File getReferenceFile() {
        return referenceFile;
    }

    /**
     * @param referenceFile the reference file.
     */
    public void setReferenceFile(File referenceFile) {
        this.referenceFile = referenceFile;
    }

    /**
     * @return the dbSNP file, or null if it has not been set.
     */
    public File getDbsnpFile() {
        return dbsnpFile;
    }

    /**
     * @param dbsnpFile the dbSNP file.
     */
    public void setDbsnpFile(File dbsnpFile) {
        this.dbsnpFile = dbsnpFile;
    }

    /**
     * @return the live (mutable) map of tags; callers put entries into it directly.
     */
    public Map<String, String> getTags() {
        return tags;
    }

    /**
     * @param tags the map of tags to use for this project.
     */
    public void setTags(Map<String, String> tags) {
        this.tags = tags;
    }
}
/**
 * Java bean defining a sample for a pipeline.
 */
public class PipelineSample {
    // NOTE: the declaration order of these fields is the order in which
    // FieldOrderComparator sorts the properties when the bean is dumped to YAML.
    private String id;

    /** BAM files keyed by a label such as "recalibrated" or "cleaned". */
    private Map<String, File> bamFiles = new TreeMap<String, File>();

    /** Free-form key/value tags; a TreeMap so the keys iterate in sorted order. */
    private Map<String, String> tags = new TreeMap<String, String>();

    /**
     * @return the sample id, or null if it has not been set.
     */
    public String getId() {
        return id;
    }

    /**
     * @param id the sample id.
     */
    public void setId(String id) {
        this.id = id;
    }

    /**
     * @return the live (mutable) map of BAM files; callers put entries into it directly.
     */
    public Map<String, File> getBamFiles() {
        return bamFiles;
    }

    /**
     * @param bamFiles the map of BAM files to use for this sample.
     */
    public void setBamFiles(Map<String, File> bamFiles) {
        this.bamFiles = bamFiles;
    }

    /**
     * @return the live (mutable) map of tags; callers put entries into it directly.
     */
    public Map<String, String> getTags() {
        return tags;
    }

    /**
     * @param tags the map of tags to use for this sample.
     */
    public void setTags(Map<String, String> tags) {
        this.tags = tags;
    }
}
org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.exceptions.StingException; -import org.yaml.snakeyaml.Loader; import org.yaml.snakeyaml.TypeDescription; import org.yaml.snakeyaml.Yaml; import org.yaml.snakeyaml.constructor.Constructor; @@ -115,8 +114,7 @@ public class SampleDataSource { desc.putListPropertyType("propertyDefinitions", PropertyDefinition.class); desc.putListPropertyType("sampleAliases", SampleAlias.class); con.addTypeDescription(desc); - Loader loader = new Loader(con); - Yaml yaml = new Yaml(loader); + Yaml yaml = new Yaml(con); // SampleFileParser stores an object representation of a sample file - this is what we'll parse SampleFileParser parser; diff --git a/java/src/org/broadinstitute/sting/utils/yaml/FieldOrderComparator.java b/java/src/org/broadinstitute/sting/utils/yaml/FieldOrderComparator.java new file mode 100644 index 000000000..2a043466a --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/yaml/FieldOrderComparator.java @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.yaml; + +import org.yaml.snakeyaml.introspector.Property; + +import java.lang.reflect.Field; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; + +/** + * Orders properties based on the order of the fields in the Java Bean. + */ +class FieldOrderComparator implements Comparator { + private final List propertyOrder; + + public FieldOrderComparator(Class clazz) { + propertyOrder = new ArrayList(); + for (Field field : clazz.getDeclaredFields()) + propertyOrder.add(field.getName()); + } + + @Override + public int compare(Property one, Property two) { + Integer index1 = propertyOrder.indexOf(one.getName()); + Integer index2 = propertyOrder.indexOf(two.getName()); + return index1.compareTo(index2); + } +} diff --git a/java/src/org/broadinstitute/sting/utils/yaml/StingYamlRepresenter.java b/java/src/org/broadinstitute/sting/utils/yaml/StingYamlRepresenter.java new file mode 100644 index 000000000..157b1ce27 --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/yaml/StingYamlRepresenter.java @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or 
substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.yaml; + +import org.yaml.snakeyaml.introspector.Property; +import org.yaml.snakeyaml.nodes.*; +import org.yaml.snakeyaml.representer.Represent; +import org.yaml.snakeyaml.representer.Representer; + +import java.beans.IntrospectionException; +import java.io.File; +import java.util.Set; +import java.util.TreeSet; + +/** + * A representer with Sting prefered settings. + * - Fields are ordered in the order of the class declaration, instead of alphabetically. + * - Empty maps and sequences are not output. + * - Files are converted to their absolute paths. 
+ */ +public class StingYamlRepresenter extends Representer { + + public StingYamlRepresenter() { + super(); + this.representers.put(File.class, new RepresentFile()); + } + + @Override + protected Set getProperties(Class type) throws IntrospectionException { + TreeSet properties = new TreeSet(new FieldOrderComparator(type)); + properties.addAll(super.getProperties(type)); + return properties; + } + + @Override + protected NodeTuple representJavaBeanProperty(Object javaBean, Property property, + Object propertyValue, Tag customTag) { + NodeTuple tuple = super.representJavaBeanProperty(javaBean, property, propertyValue, customTag); + Node valueNode = tuple.getValueNode(); + if (Tag.NULL.equals(valueNode.getTag())) { + return null;// skip 'null' values + } + if (valueNode instanceof CollectionNode) { + if (Tag.SEQ.equals(valueNode.getTag())) { + SequenceNode seq = (SequenceNode) valueNode; + if (seq.getValue().isEmpty()) { + return null;// skip empty lists + } + } + if (Tag.MAP.equals(valueNode.getTag())) { + MappingNode seq = (MappingNode) valueNode; + if (seq.getValue().isEmpty()) { + return null;// skip empty maps + } + } + } + return tuple; + } + + private class RepresentFile implements Represent { + @Override + public Node representData(Object o) { + return StingYamlRepresenter.this.representScalar(Tag.STR, ((File)o).getPath()); + } + } +} diff --git a/java/src/org/broadinstitute/sting/utils/yaml/YamlUtils.java b/java/src/org/broadinstitute/sting/utils/yaml/YamlUtils.java new file mode 100644 index 000000000..715c71efc --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/yaml/YamlUtils.java @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, 
sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.yaml; + +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.yaml.snakeyaml.DumperOptions; +import org.yaml.snakeyaml.Yaml; +import org.yaml.snakeyaml.constructor.Constructor; +import org.yaml.snakeyaml.nodes.Tag; +import org.yaml.snakeyaml.representer.Representer; + +import java.io.File; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; + +/** + * A collection of utilities for operating on YAML. + * Uses the FLOW style of writing YAML, versus the BLOCK style. + * By default uses a representer that prunes empty lists and maps. + */ +public class YamlUtils { + private static Representer representer = new StingYamlRepresenter(); + private static DumperOptions options = new DumperOptions(); + + static { + options.setCanonical(false); + options.setExplicitRoot(Tag.MAP); + options.setDefaultFlowStyle(DumperOptions.FlowStyle.FLOW); + options.setPrettyFlow(true); + } + + /** + * Serialize an object to the file system. + * @param o Object to serialize. + * @param file Path to write the serialized YAML. 
+ */ + public static void dump(Object o, File file) { + dump(o, file, representer); + } + + /** + * Serialize an object to the file system. + * @param o Object to serialize. + * @param file Path to write the serialized YAML. + * @param representer Custom representer with rules on how to serialize YAML. + */ + public static void dump(Object o, File file, Representer representer) { + Constructor constructor = new Constructor(o.getClass()); + Yaml yaml = new Yaml(constructor, representer, options); + try { + yaml.dump(o, new FileWriter(file)); + } catch (IOException ioe) { + throw new UserException.CouldNotCreateOutputFile(file, ioe); + } + } + + /** + * Deserialize an object from the file system. + * @param clazz Clazz to deserialize. + * @param file Path to read the deserialized YAML. + * @return Object deserialized from the file system. + */ + public static T load(Class clazz, File file) { + return load(clazz, file, representer); + } + + /** + * Deserialize an object from the file system. + * @param clazz Clazz to deserialize. + * @param file Path to read the deserialized YAML. + * @param representer Custom representer with rules on how to deserialize YAML. + * @return Object deserialized from the file system. 
+ */ + @SuppressWarnings("unchecked") + public static T load(Class clazz, File file, Representer representer) { + Constructor constructor = new Constructor(clazz); + Yaml yaml = new Yaml(constructor, representer, options); + try { + return (T) yaml.load(new FileReader(file)); + } catch (IOException ioe) { + throw new UserException.CouldNotReadInputFile(file, ioe); + } + } +} diff --git a/java/test/org/broadinstitute/sting/datasources/pipeline/PipelineUnitTest.java b/java/test/org/broadinstitute/sting/datasources/pipeline/PipelineUnitTest.java new file mode 100644 index 000000000..7a32bcea5 --- /dev/null +++ b/java/test/org/broadinstitute/sting/datasources/pipeline/PipelineUnitTest.java @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.datasources.pipeline; + +import org.broadinstitute.sting.utils.yaml.YamlUtils; +import org.junit.Assert; +import org.junit.Test; + +import java.io.File; +import java.util.Map; + +public class PipelineUnitTest { + @Test + public void testDumpAndLoad() throws Exception { + Pipeline pipeline = new Pipeline(); + + pipeline.getProject().setName("PRJ_NAME"); + pipeline.getProject().setReferenceFile(new File("my.fasta")); + pipeline.getProject().setDbsnpFile(new File("my.dbsnp")); + pipeline.getProject().getTags().put("testProjectTag", "project value here"); + + PipelineSample sample = new PipelineSample(); + sample.setId("SMP_ID"); + sample.getBamFiles().put("recalibrated", new File("recalibrated.bam")); + sample.getBamFiles().put("cleaned", new File("/absolute/path/to/cleaned.bam")); + sample.getTags().put("testSampleTag", "sample value here"); + + pipeline.getSamples().add(sample); + + File file = File.createTempFile("testDumpAndLoad", ".yaml"); + YamlUtils.dump(pipeline, file); + Pipeline pipelineLoad = YamlUtils.load(Pipeline.class, file); + + Assert.assertEquals(pipeline.getProject().getName(), pipelineLoad.getProject().getName()); + Assert.assertEquals(pipeline.getProject().getReferenceFile(), pipelineLoad.getProject().getReferenceFile()); + Assert.assertEquals(pipeline.getProject().getIntervalList(), pipelineLoad.getProject().getIntervalList()); + Assert.assertEquals(pipeline.getProject().getDbsnpFile(), pipelineLoad.getProject().getDbsnpFile()); + + Assert.assertEquals(pipeline.getProject().getTags().size(), pipelineLoad.getProject().getTags().size()); + for (Map.Entry entry : pipeline.getProject().getTags().entrySet()) + Assert.assertEquals(entry.getValue(), pipeline.getProject().getTags().get(entry.getKey())); + + Assert.assertEquals(pipeline.getSamples().size(), pipelineLoad.getSamples().size()); + for (int i = 0; i < pipeline.getSamples().size(); i++) { + PipelineSample pipelineSample = pipeline.getSamples().get(i); + 
PipelineSample pipelineLoadSample = pipelineLoad.getSamples().get(i); + + Assert.assertEquals(pipelineSample.getId(), pipelineLoadSample.getId()); + + Assert.assertEquals(pipelineSample.getBamFiles().size(), pipelineLoadSample.getBamFiles().size()); + for (Map.Entry entry : pipelineSample.getBamFiles().entrySet()) + Assert.assertEquals(entry.getValue(), pipelineSample.getBamFiles().get(entry.getKey())); + + Assert.assertEquals(pipelineSample.getTags().size(), pipelineLoadSample.getTags().size()); + for (Map.Entry entry : pipelineSample.getTags().entrySet()) + Assert.assertEquals(entry.getValue(), pipelineSample.getTags().get(entry.getKey())); + } + } +} diff --git a/scala/qscript/fullCallingPipeline.q b/scala/qscript/fullCallingPipeline.q index 402e9dc25..05fccf16c 100755 --- a/scala/qscript/fullCallingPipeline.q +++ b/scala/qscript/fullCallingPipeline.q @@ -1,24 +1,24 @@ +import org.broadinstitute.sting.datasources.pipeline.Pipeline import org.broadinstitute.sting.gatk.DownsampleType import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCalculationModel.Model import org.broadinstitute.sting.queue.extensions.gatk._ import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction import org.broadinstitute.sting.queue.extensions.samtools._ import org.broadinstitute.sting.queue.{QException, QScript} +import collection.JavaConversions._ +import org.broadinstitute.sting.utils.yaml.YamlUtils class fullCallingPipeline extends QScript { qscript => - @Argument(doc = "reference", shortName="R") - var reference: File = _ - @Argument(doc="contigIntervals", shortName="contigIntervals") var contigIntervals: File = _ @Argument(doc="numContigs", shortName="numContigs") var numContigs: Int = _ - @Argument(doc="project", shortName="project") - var project: String = _ + @Argument(fullName="pipeline_yaml", shortName="PY", doc="Pipeline YAML file") + var pipelineYamlFile: File = _ @Input(doc="trigger", shortName="trigger", required=false) var trigger: File = _ @@ 
-35,12 +35,6 @@ class fullCallingPipeline extends QScript { @Input(doc="Picard FixMateInformation.jar. At the Broad this can be found at /seq/software/picard/current/bin/FixMateInformation.jar. Outside the broad see http://picard.sourceforge.net/") var picardFixMatesJar: File = _ - @Input(doc="intervals") - var intervals: File = _ - - @Input(doc="bam files", shortName="I") - var bamFiles: List[File] = Nil - @Input(doc="gatk jar") var gatkJar: File = _ @@ -50,9 +44,6 @@ class fullCallingPipeline extends QScript { @Input(doc="SNP cluster filter -- window size",shortName="snpClusterWindow",required=false) var snpClusterWindow = 7 - @Input(doc="dbSNP version",shortName="D") - var dbSNP: File = _ - @Input(doc="target titv for recalibration",shortName="titv",required=false) var target_titv = 2.1 @@ -65,11 +56,12 @@ class fullCallingPipeline extends QScript { @Input(doc="Number of jobs to scatter indel genotyper",shortName="indelScatter",required=false) var num_indel_scatter_jobs = 5 + private var pipeline: Pipeline = _ trait CommandLineGATKArgs extends CommandLineGATK { - this.intervals = qscript.intervals + this.intervals = qscript.pipeline.getProject.getIntervalList this.jarFile = qscript.gatkJar - this.reference_sequence = qscript.reference + this.reference_sequence = qscript.pipeline.getProject.getReferenceFile } @@ -77,20 +69,26 @@ class fullCallingPipeline extends QScript { def script = { - val projectBase: String = qscript.project + pipeline = YamlUtils.load(classOf[Pipeline], qscript.pipelineYamlFile) + val projectBase: String = qscript.pipeline.getProject.getName val cleanedBase: String = projectBase + ".cleaned" val uncleanedBase: String = projectBase + ".uncleaned" // there are commands that use all the bam files - var cleanBamFiles = List.empty[File] + val recalibratedSamples = qscript.pipeline.getSamples + .filter(_.getBamFiles.contains("recalibrated")) - for ( bam <- qscript.bamFiles ) { + for ( sample <- recalibratedSamples ) { // put unclean bams in 
unclean genotypers // in advance, create the extension files + val bam = sample.getBamFiles.get("recalibrated") + if (!sample.getBamFiles.contains("cleaned")) + sample.getBamFiles.put("cleaned", swapExt(bam,"bam","cleaned.bam")) + val cleaned_bam = sample.getBamFiles.get("cleaned") + val indel_targets = swapExt(bam,"bam","realigner_targets.interval_list") - val cleaned_bam = swapExt(bam,"bam","cleaned.bam") // note-- the scatter is in the definition itself // create the cleaning commands @@ -131,8 +129,6 @@ class fullCallingPipeline extends QScript { gather.jarFile = qscript.picardFixMatesJar // Don't pass this AS=true to fix mates! gather.assumeSorted = None - case (gather: SimpleTextGatherFunction, _) => - throw new QException("Cannot text-gather a realignment job") } } else { realigner.out = swapExt(bam,"bam","unfixed.cleaned.bam") @@ -149,10 +145,6 @@ class fullCallingPipeline extends QScript { var samtoolsindex = new SamtoolsIndexFunction samtoolsindex.bamFile = cleaned_bam - // put clean bams in clean genotypers - - cleanBamFiles :+= cleaned_bam - // COMMENT THIS NEXT BLOCK TO SKIP CLEANING if ( realigner.scatterCount > 1 ) add(targetCreator,realigner,samtoolsindex) @@ -160,8 +152,17 @@ class fullCallingPipeline extends QScript { add(targetCreator,realigner,fixMates,samtoolsindex) } + val recalibratedBamFiles = recalibratedSamples + .map(_.getBamFiles.get("recalibrated")) + .toList + + val cleanBamFiles = qscript.pipeline.getSamples + .filter(_.getBamFiles.contains("cleaned")) + .map(_.getBamFiles.get("cleaned")) + .toList + // actually make calls - endToEnd(uncleanedBase,qscript.bamFiles) + endToEnd(uncleanedBase,recalibratedBamFiles) // COMMENT THIS NEXT LINE TO AVOID CALLING ON CLEANED FILES endToEnd(cleanedBase,cleanBamFiles) } @@ -218,7 +219,7 @@ class fullCallingPipeline extends QScript { loopNo += 1 } val mergeIndels = new CombineVariants with CommandLineGATKArgs - mergeIndels.out = new TaggedFile(qscript.project+".indels.vcf","vcf") + mergeIndels.out 
= new TaggedFile(qscript.pipeline.getProject.getName+".indels.vcf","vcf") mergeIndels.genotypemergeoption = Some(org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.GenotypeMergeType.UNIQUIFY) mergeIndels.priority = priority mergeIndels.variantmergeoption = Some(org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.VariantMergeType.UNION) @@ -259,7 +260,7 @@ class fullCallingPipeline extends QScript { // todo -- args for resources (properties file) val clusters = new GenerateVariantClusters with CommandLineGATKArgs clusters.rodBind :+= RodBind("input", "VCF", masker.out) - clusters.DBSNP = qscript.dbSNP + clusters.DBSNP = qscript.pipeline.getProject.getDbsnpFile val clusters_clusterFile = swapExt(new File(snps.out.getAbsolutePath),".vcf",".cluster") clusters.clusterFile = clusters_clusterFile clusters.memoryLimit = Some(4) @@ -271,7 +272,7 @@ class fullCallingPipeline extends QScript { // 3.ii apply gaussian clusters to the masked vcf val recalibrate = new VariantRecalibrator with CommandLineGATKArgs recalibrate.clusterFile = clusters.clusterFile - recalibrate.DBSNP = qscript.dbSNP + recalibrate.DBSNP = qscript.pipeline.getProject.getDbsnpFile recalibrate.rodBind :+= RodBind("input", "VCF", masker.out) recalibrate.out = swapExt(masker.out,".vcf",".recalibrated.vcf") recalibrate.target_titv = qscript.target_titv diff --git a/shell/getFirehosePipelineYaml.sh b/shell/getFirehosePipelineYaml.sh new file mode 100644 index 000000000..6966c2785 --- /dev/null +++ b/shell/getFirehosePipelineYaml.sh @@ -0,0 +1,92 @@ +#!/bin/sh + +# Downloads a set of samples from Firehose using the Firehose Test Harness and awk to generate a YAML file. 
ENTITY_SET_ID=$1
ENTITY_SET_TYPE=Sample_Set
ENTITY_TYPE=Sample

if [ -z "$ENTITY_SET_ID" ]; then
    echo "Usage: $0 <entity_set_id>" >&2
    exit 1
fi

# Firehose variables

FIREHOSE_SOURCE_HOME=/humgen/gsa-firehose/firehose/source
CGA_HOME=$FIREHOSE_SOURCE_HOME/CancerGenomeAnalysis
FIREHOSE_TEST_HARNESS="python $CGA_HOME/analysis_pipeline/scripts/firehose_test_harness.py"
# NOTE(review): FIREHOSE_HOST and FIREHOSE_PORT are not referenced anywhere below.
FIREHOSE_HOST=firehose
FIREHOSE_PORT=8080
FIREHOSE_DOMAIN=gsa
FIREHOSE_WORKSPACE=trunk

# YAML file to write

PIPELINE_YAML_FILE=$ENTITY_SET_ID.yaml

# Annotations to pull down from Firehose.
# Kept as a whitespace separated string (not a bash array) so the script runs under plain sh.
# The order matters: the position of each name is the awk field number used by the templates.

FIREHOSE_ANNOTATIONS="reference_file dbsnp_file interval_list \
  sample_id recalibrated_bam_file squid_project collaborator_id"

# YAML templates.
# Each template is a complete awk printf argument list: a format string followed by
# the fields to substitute. They are spliced into the awk program further down.

PROJECT_YAML_TEMPLATE='" \
  project: { \
    name: '"$ENTITY_SET_ID"', \
    referenceFile: %s, \
    dbsnpFile: %s, \
    intervalList: %s \
  },", $1, $2, $3'

SAMPLE_YAML_TEMPLATE='" \
    { \
      id: %s, \
      bamFiles: { recalibrated: %s }, \
      tags: { \
        SQUIDProject: %s, \
        CollaboratorID: %s \
      } \
    }", $4, $5, $6, $7'

FIREHOSE_VARIABLES=""
# Generated with printf so the separator is unambiguously a tab character.
TAB=$(printf '\t')

# Build the tab separated list of firehose arguments, e.g. ${reference_file}<TAB>${dbsnp_file}...
# The ${...} placeholders are passed on literally (single quoted), not expanded here.

for annotation in $FIREHOSE_ANNOTATIONS; do
  if [ -n "$FIREHOSE_VARIABLES" ]; then
    FIREHOSE_VARIABLES=$FIREHOSE_VARIABLES$TAB
  fi
  FIREHOSE_VARIABLES=$FIREHOSE_VARIABLES'${'$annotation'}'
done

# Retrieve the annotations with the test harness, which is given an echo command to
# populate. If that succeeds, source firehose-populated-commands.sh and turn its
# output into YAML: one tab separated input line per sample.
# The explicit ./ is required because a POSIX "." looks up bare file names on PATH.
# Fields are split on tabs only, so a value containing spaces stays in one field.
$FIREHOSE_TEST_HARNESS \
  -d "$FIREHOSE_DOMAIN" -w "$FIREHOSE_WORKSPACE" \
  -t "$ENTITY_TYPE" -f "$ENTITY_SET_ID" -y "$ENTITY_SET_TYPE" \
  "echo '$FIREHOSE_VARIABLES'" &&
. ./firehose-populated-commands.sh | awk -F "$TAB" '
BEGIN {
  printf "{"
}
{
  # The project section is written once, from the first line of input.
  if (NR == 1) {
    printf '"$PROJECT_YAML_TEMPLATE"'
    printf "\n  samples: ["
  } else {
    printf ","
  }
  printf '"$SAMPLE_YAML_TEMPLATE"'
}
END {
  # Only close the samples list if at least one line opened it.
  if (NR > 0)
    printf "\n  ]"
  print "\n}"
}' > "$PIPELINE_YAML_FILE"