From dd71884b0c095fc57ff3dbb1a06961ac771c97ee Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 3 Oct 2011 12:08:07 -0700 Subject: [PATCH] On path to SampleDB engine integration -- PedReader tag parser -- Separation of SampleDBBuilder from SampleDB (now immutable) -- Removed old sample engine arguments --- .../arguments/GATKArgumentCollection.java | 29 +++-- .../sting/gatk/samples/PedReader.java | 56 +++++++- .../gatk/samples/PedigreeValidationType.java | 34 +++++ .../sting/gatk/samples/SampleDB.java | 54 ++------ .../sting/gatk/samples/SampleDBBuilder.java | 121 ++++++++++++++++++ .../sting/gatk/samples/PedReaderUnitTest.java | 66 ++++++++++ ...rceUnitTest.java => SampleDBUnitTest.java} | 4 +- 7 files changed, 304 insertions(+), 60 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/samples/PedigreeValidationType.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java rename public/java/test/org/broadinstitute/sting/gatk/samples/{SampleDataSourceUnitTest.java => SampleDBUnitTest.java} (83%) diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index 9ce402cf3..c27bb26d9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -32,6 +32,7 @@ import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.gatk.DownsampleType; import org.broadinstitute.sting.gatk.DownsamplingMethod; import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; +import org.broadinstitute.sting.gatk.samples.PedigreeValidationType; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.interval.IntervalMergingRule; @@ -44,10 +45,7 @@ 
import org.simpleframework.xml.stream.HyphenStyle; import java.io.File; import java.io.InputStream; import java.io.PrintStream; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; /** * @author aaron @@ -72,11 +70,6 @@ public class GATKArgumentCollection { @Input(fullName = "input_file", shortName = "I", doc = "SAM or BAM file(s)", required = false) public List samFiles = new ArrayList(); - // parameters and their defaults - @ElementList(required = false) - @Argument(fullName = "sample_metadata", shortName = "SM", doc = "Sample file(s) in JSON format", required = false) - public List sampleFiles = new ArrayList(); - @Element(required = false) @Argument(fullName = "read_buffer_size", shortName = "rbs", doc="Number of reads per SAM file to buffer in memory", required = false) public Integer readBufferSize = null; @@ -215,9 +208,25 @@ public class GATKArgumentCollection { // -------------------------------------------------------------------------------------------------------------- // - // distributed GATK arguments + // PED (pedigree) support // // -------------------------------------------------------------------------------------------------------------- + + /** + * MARK: add documentation details + */ + @Argument(fullName="pedigree", shortName = "ped", doc="Pedigree file / string for samples",required=false) + public List pedigreeData = Collections.emptyList(); + + @Argument(fullName="pedigreeValidationType", shortName = "pedValidationType", doc="How strict should we be in validating the pedigree information?",required=false) + public PedigreeValidationType pedigreeValidationType = PedigreeValidationType.STRICT; + + // -------------------------------------------------------------------------------------------------------------- + // + // BAM indexing and sharding arguments + // + // 
-------------------------------------------------------------------------------------------------------------- + @Element(required = false) @Argument(fullName="allow_intervals_with_unindexed_bam",doc="Allow interval processing with an unsupported BAM. NO INTEGRATION TESTS are available. Use at your own risk.",required=false) @Hidden diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java b/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java index d697498be..ec49b0f60 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java @@ -114,10 +114,42 @@ public class PedReader { final static private Set CATAGORICAL_TRAIT_VALUES = new HashSet(Arrays.asList("-9", "0", "1", "2")); final static private String commentMarker = "#"; + /** + * An enum that specifies which, if any, of the standard PED fields are + * missing from the input records. For example, suppose we have the full record: + * + * "fam1 kid dad mom 1 2" + * + * indicating a male affected child. This can be parsed with the -ped x.ped argument + * to the GATK. Suppose we only have: + * + * "fam1 kid 1" + * + * we can parse the reduced version of this record with -ped:NO_PARENTS,NO_PHENOTYPE x.ped + */ public enum MissingPedField { + /** + * The PED records do not have the first (FAMILY_ID) argument. The family id + * will be set to null / empty. + */ NO_FAMILY_ID, + + /** + * The PED records do not have either the paternal or maternal IDs, so + * the corresponding IDs are set to null. + */ NO_PARENTS, + + /** + * The PED records do not have the GENDER field, so the sex of each + * sample will be set to UNKNOWN. + */ NO_SEX, + + /** + * The PED records do not have the PHENOTYPE field, so the phenotype + * of each sample will be set to UNKNOWN. 
+ */ NO_PHENOTYPE } @@ -233,8 +265,6 @@ public class PedReader { if ( mom != null ) samples.add(mom); } - - sampleDB.validate(samples); return samples; } @@ -253,4 +283,26 @@ public class PedReader { } else return null; } + + /** + * Parses a list of tags from the command line, assuming it comes from the GATK Engine + * tags, and returns the corresponding EnumSet. + * + * @param arg the actual engine arg, used for the UserException if there's an error + * @param tags a list of string tags that should be converted to the MissingPedField value + * @return the set of MissingPedField values named by tags; empty when tags is empty + */ + public static final EnumSet parseMissingFieldTags(final Object arg, final List tags) { + final EnumSet missingFields = EnumSet.noneOf(MissingPedField.class); + + for ( final String tag : tags ) { + try { + missingFields.add(MissingPedField.valueOf(tag)); + } catch ( IllegalArgumentException e ) { + throw new UserException.BadArgumentValue(arg.toString(), "Unknown tag " + tag + " allowed values are " + Arrays.asList(MissingPedField.values())); + } + } + + return missingFields; + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/PedigreeValidationType.java b/public/java/src/org/broadinstitute/sting/gatk/samples/PedigreeValidationType.java new file mode 100644 index 000000000..8a1a4f225 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/PedigreeValidationType.java @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall
be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.samples; + +/** +* +*/ +public enum PedigreeValidationType { + STRICT, + LINIENT, + SILENT, +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java index 6a2ec2ac4..75b37d758 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java @@ -38,51 +38,9 @@ public class SampleDB { } - public SampleDB(final SAMFileHeader header, final List sampleFiles) { - this(); - addSamples(header); - addSamples(sampleFiles); - } - - // -------------------------------------------------------------------------------- - // - // Functions for adding samples to the DB - // - // TODO: these should be protected, really - // - // -------------------------------------------------------------------------------- - /** - * Hallucinates sample objects for all the samples in the SAM file and stores them - */ - protected SampleDB addSamples(SAMFileHeader header) { - for (String sampleName : SampleUtils.getSAMFileSamples(header)) { - if (getSample(sampleName) == null) { - Sample newSample = new Sample(sampleName, this); - samples.put(sampleName, newSample); - } - } - return this; - } - - protected SampleDB addSamples(final List sampleFiles) { - // add files consecutively - for (File file : sampleFiles) { - 
addSamples(file); - } - return this; - } - - /** - * Parse one sample file and integrate it with samples that are already there - * Fail quickly if we find any errors in the file - */ - protected SampleDB addSamples(File sampleFile) { - return this; - } - - /** - * Add a sample to the collection + * Protected function to add a single sample to the database + * * @param sample to be added */ protected SampleDB addSample(Sample sample) { @@ -215,10 +173,14 @@ public class SampleDB { // -------------------------------------------------------------------------------- public final void validate() { - validate(getSamples()); + validate(getSamples(), PedigreeValidationType.STRICT); } - public final void validate(Collection samplesToCheck) { + public final void validate(PedigreeValidationType validationType) { + validate(getSamples(), validationType); + } + public final void validate(Collection samplesToCheck, PedigreeValidationType validationType) { + // todo -- actually do an implementation } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java new file mode 100644 index 000000000..33bed89d2 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.samples; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMReadGroupRecord; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.exceptions.StingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.Genotype; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.*; + +/** + * Builder that accumulates samples from SAM headers and pedigree arguments into a SampleDB, validated when the final SampleDB is requested. + */ +public class SampleDBBuilder { + PedigreeValidationType validationStrictness; + final SampleDB sampleDB = new SampleDB(); + final GenomeAnalysisEngine engine; + + /** + * Creates a builder that looks up pedigree argument tags through the given engine and validates the final SampleDB with the given strictness.
+ */ + public SampleDBBuilder(GenomeAnalysisEngine engine, PedigreeValidationType validationStrictness) { + this.engine = engine; + this.validationStrictness = validationStrictness; + } + + /** + * Hallucinates sample objects for all the samples in the SAM file and stores them + */ + public SampleDBBuilder addSamples(SAMFileHeader header) { + for (String sampleName : SampleUtils.getSAMFileSamples(header)) { + if (sampleDB.getSample(sampleName) == null) { + final Sample newSample = new Sample(sampleName, sampleDB); + addSample(newSample); + } + } + return this; + } + + public SampleDBBuilder addSamples(final List pedigreeArguments) { + for (final String ped : pedigreeArguments) { + final File pedFile = new File(ped); + if ( pedFile.exists() ) + addSamples(pedFile); + else + addSamples(ped); + } + + return this; + } + + /** + * Parse one sample file and integrate it with samples that are already there + * Fail quickly if we find any errors in the file + */ + protected SampleDBBuilder addSamples(File sampleFile) { + final PedReader reader = new PedReader(); + + try { + reader.parse(sampleFile, getMissingFields(sampleFile), sampleDB); + } catch ( FileNotFoundException e ) { + throw new UserException.CouldNotReadInputFile(sampleFile, e); + } + + return this; + } + + protected SampleDBBuilder addSamples(final String string) { + final PedReader reader = new PedReader(); + reader.parse(string, getMissingFields(string), sampleDB); + return this; + } + + /** + * Add a sample to the collection + * @param sample to be added + */ + protected SampleDBBuilder addSample(Sample sample) { + sampleDB.addSample(sample); + return this; + } + + public SampleDB getFinalSampleDB() { + sampleDB.validate(validationStrictness); + return sampleDB; + } + + public EnumSet getMissingFields(final Object engineArg) { + final List posTags = engine.getTags(engineArg).getPositionalTags(); + return PedReader.parseMissingFieldTags(engineArg, posTags); + } +} diff --git 
a/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java index c14995dca..57bc6cf3b 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.samples; import org.apache.log4j.Logger; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -285,4 +286,69 @@ public class PedReaderUnitTest extends BaseTest { parts.remove(field.ordinal()); return Utils.join("\t", parts); } + + // ----------------------------------------------------------------- + // parsing tags + // ----------------------------------------------------------------- + + private class PedReaderTestTagParsing extends TestDataProvider { + public EnumSet expected; + public final List tags; + + private PedReaderTestTagParsing(final List tags, EnumSet missingDesc) { + super(PedReaderTestTagParsing.class); + this.tags = tags; + this.expected = missingDesc; + + } + } + + @DataProvider(name = "readerTestTagParsing") + public Object[][] createReaderTestTagParsing() { + new PedReaderTestTagParsing( + Collections.emptyList(), + EnumSet.noneOf(PedReader.MissingPedField.class)); + + new PedReaderTestTagParsing( + Arrays.asList("NO_FAMILY_ID"), + EnumSet.of(PedReader.MissingPedField.NO_FAMILY_ID)); + + new PedReaderTestTagParsing( + Arrays.asList("NO_PARENTS"), + EnumSet.of(PedReader.MissingPedField.NO_PARENTS)); + + new PedReaderTestTagParsing( + Arrays.asList("NO_PHENOTYPE"), + EnumSet.of(PedReader.MissingPedField.NO_PHENOTYPE)); + + new PedReaderTestTagParsing( + Arrays.asList("NO_SEX"), + EnumSet.of(PedReader.MissingPedField.NO_SEX)); + + new 
PedReaderTestTagParsing( + Arrays.asList("NO_SEX", "NO_PHENOTYPE"), + EnumSet.of(PedReader.MissingPedField.NO_SEX, PedReader.MissingPedField.NO_PHENOTYPE)); + + new PedReaderTestTagParsing( + Arrays.asList("NO_SEX", "NO_PHENOTYPE", "NO_PARENTS"), + EnumSet.of(PedReader.MissingPedField.NO_SEX, PedReader.MissingPedField.NO_PHENOTYPE, PedReader.MissingPedField.NO_PARENTS)); + + return PedReaderTestTagParsing.getTests(PedReaderTestTagParsing.class); + } + + @Test(enabled = true, dataProvider = "readerTestTagParsing") + public void testPedReaderTagParsing(PedReaderTestTagParsing test) { + EnumSet parsed = PedReader.parseMissingFieldTags("test", test.tags); + Assert.assertEquals(parsed, test.expected, "Failed to properly parse tags " + test.tags); + } + + @Test(enabled = true, expectedExceptions = UserException.class) + public void testPedReaderTagParsing1() { + PedReader.parseMissingFieldTags("test", Arrays.asList("XXX")); + } + + @Test(enabled = true, expectedExceptions = UserException.class) + public void testPedReaderTagParsing2() { + PedReader.parseMissingFieldTags("test", Arrays.asList("NO_SEX", "XXX")); + } } \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDataSourceUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java similarity index 83% rename from public/java/test/org/broadinstitute/sting/gatk/samples/SampleDataSourceUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java index 90dd8e36e..500d322db 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDataSourceUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java @@ -14,7 +14,7 @@ import java.util.*; * Date: Sep 9, 2010 * Time: 8:21:00 AM */ -public class SampleDataSourceUnitTest extends BaseTest { +public class SampleDBUnitTest extends BaseTest { // this empty header used to
instantiate sampledatasource objects private static SAMFileHeader header = new SAMFileHeader(); @@ -24,6 +24,6 @@ public class SampleDataSourceUnitTest extends BaseTest { // make sure samples are created from the SAM file correctly @Test() public void loadSAMSamplesTest() { - SampleDB s = new SampleDB(header, Collections.emptyList()); + //SampleDB s = new SampleDB(header); } }