From e5f81d25d4685e007f0fe2720fd08e506c73db20 Mon Sep 17 00:00:00 2001 From: bthomas Date: Wed, 15 Sep 2010 11:50:22 +0000 Subject: [PATCH] Adding the --sample-metadata (-SM) command line argument and associated functionality. This is something Matt and I have been working on for a while. Basically, it allows you to integrate sample metadata into an analysis, by including a sample file. More detailed documentation is on the wiki: http://www.broadinstitute.org/gsa/wiki/index.php/Adding_Sample_data_to_an_analysis This commit adds two important classes: Sample, which contains data about one sample; and SampleDataSource, which manages sample data a la ReferenceDataSource and ReadsDataSource. This code should be stable, but it has not been integrated with existing walkers yet. That's the next commit. In the meantime, feel free to experiment with the code - there are two basic example walkers in the playground.sample package. And PLEASE let me know if you see any errors/inconsistencies. Note that this also adds a new dependency on SnakeYaml, a YAML parser. 
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4285 348d0f76-0448-11de-a6fe-93d51630548a --- ivy.xml | 1 + .../sting/gatk/GenomeAnalysisEngine.java | 73 ++- .../arguments/GATKArgumentCollection.java | 5 + .../sample/PropertyDefinition.java | 30 ++ .../sting/gatk/datasources/sample/Sample.java | 190 ++++++++ .../gatk/datasources/sample/SampleAlias.java | 31 ++ .../datasources/sample/SampleDataSource.java | 459 ++++++++++++++++++ .../datasources/sample/SampleFileParser.java | 65 +++ .../gatk/datasources/sample/SampleParser.java | 43 ++ .../sample/CountLociByPopulationWalker.java | 53 ++ .../playground/sample/CountMalesWalker.java | 28 ++ .../sample/SampleDataSourceTest.java | 154 ++++++ .../gatk/datasources/sample/SampleTest.java | 63 +++ 13 files changed, 1194 insertions(+), 1 deletion(-) create mode 100644 java/src/org/broadinstitute/sting/gatk/datasources/sample/PropertyDefinition.java create mode 100644 java/src/org/broadinstitute/sting/gatk/datasources/sample/Sample.java create mode 100644 java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleAlias.java create mode 100644 java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSource.java create mode 100644 java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleFileParser.java create mode 100644 java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleParser.java create mode 100644 java/src/org/broadinstitute/sting/playground/sample/CountLociByPopulationWalker.java create mode 100644 java/src/org/broadinstitute/sting/playground/sample/CountMalesWalker.java create mode 100644 java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSourceTest.java create mode 100644 java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleTest.java diff --git a/ivy.xml b/ivy.xml index 9bb0b1f03..c62acf8c8 100644 --- a/ivy.xml +++ b/ivy.xml @@ -17,6 +17,7 @@ + diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java 
b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 9ed27d046..c88de62c9 100755 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -30,6 +30,9 @@ import net.sf.picard.reference.ReferenceSequenceFile; import net.sf.samtools.*; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; +import org.broadinstitute.sting.gatk.datasources.sample.Sample; +import org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource; +import org.broadinstitute.sting.gatk.datasources.sample.SampleFileParser; import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -76,6 +79,11 @@ public class GenomeAnalysisEngine { */ private ReferenceDataSource referenceDataSource = null; + /** + * Accessor for sample metadata + */ + private SampleDataSource sampleDataSource = null; + /** * Accessor for sharded reference-ordered data. 
*/ @@ -388,6 +396,10 @@ public class GenomeAnalysisEngine { rodDataSources = getReferenceOrderedDataSources(my_walker, tracks); } + private void initializeSampleDataSource() { + this.sampleDataSource = new SampleDataSource(getSAMFileHeader(), argCollection.sampleFiles); + } + /** * setup a microscheduler * @@ -966,5 +978,64 @@ public class GenomeAnalysisEngine { } } return unpackedReads; - } + } + + /** + * Get a sample by its ID + * If an alias is passed in, return the main sample object + * @param id + * @return sample Object with this ID + */ + public Sample getSampleById(String id) { + return sampleDataSource.getSampleById(id); + } + + /** + * Get the sample for a given read group + * Must first look up ID for read group + * @param readGroup of sample + * @return sample object with ID from the read group + */ + public Sample getSampleByReadGroup(SAMReadGroupRecord readGroup) { + return sampleDataSource.getSampleByReadGroup(readGroup); + } + + /** + * Get a sample for a given read + * Must first look up read group, and then sample ID for that read group + * @param read of sample + * @return sample object of this read + */ + public Sample getSampleByRead(SAMRecord read) { + return getSampleByReadGroup(read.getReadGroup()); + } + + /** + * Get number of sample objects + * @return size of samples map + */ + public int sampleCount() { + return sampleDataSource.sampleCount(); + } + + /** + * Return all samples with a given family ID + * Note that this isn't terribly efficient (linear) - it may be worth adding a new family ID data structure for this + * @param familyId + * @return + */ + public Set getFamily(String familyId) { + return sampleDataSource.getFamily(familyId); + } + + /** + * Returns all children of a given sample + * See note on the efficiency of getFamily() - since this depends on getFamily() it's also not efficient + * @param sample + * @return + */ + public Set getChildren(Sample sample) { + return sampleDataSource.getChildren(sample); + } + } diff 
--git a/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index 6abd8c209..b73f6a7ee 100755 --- a/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -70,6 +70,11 @@ public class GATKArgumentCollection { @Input(fullName = "input_file", shortName = "I", doc = "SAM or BAM file(s)", required = false) public List samFiles = new ArrayList(); + // sample metadata files; parsed as YAML by SampleDataSource + @ElementList(required = false) + @Argument(fullName = "sample_metadata", shortName = "SM", doc = "Sample file(s) in YAML format", required = false) + public List sampleFiles = new ArrayList(); + @Element(required = false) @Argument(fullName = "read_buffer_size", shortName = "rbs", doc="Number of reads per SAM file to buffer in memory", required = false) public Integer readBufferSize = null; diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/sample/PropertyDefinition.java b/java/src/org/broadinstitute/sting/gatk/datasources/sample/PropertyDefinition.java new file mode 100644 index 000000000..433e0af40 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/sample/PropertyDefinition.java @@ -0,0 +1,30 @@ +package org.broadinstitute.sting.gatk.datasources.sample; + +/** + * Created by IntelliJ IDEA. 
+ * User: brett + * Date: Aug 12, 2010 + * Time: 2:09:16 PM + */ +public class PropertyDefinition { + + String property; + + String[] values; + + public String getProperty() { + return property; + } + + public void setProperty(String property) { + this.property = property; + } + + public String[] getValues() { + return values; + } + + public void setValues(String[] values) { + this.values = values; + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/sample/Sample.java b/java/src/org/broadinstitute/sting/gatk/datasources/sample/Sample.java new file mode 100644 index 000000000..c509df317 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/sample/Sample.java @@ -0,0 +1,190 @@ +package org.broadinstitute.sting.gatk.datasources.sample; + + +import org.broadinstitute.sting.utils.exceptions.StingException; + +import java.util.HashMap; +import java.util.Map; + +/** + * Created by IntelliJ IDEA. + * User: brett + * Date: Jul 26, 2010 + * Time: 3:31:38 PM + */ +public class Sample implements java.io.Serializable { + + private final String id; + + private boolean hasSampleFileEntry = false; // true if this sample has an entry in a sample file + + private boolean hasSAMFileEntry = false; // true if this sample has an entry in the SAM file + + private HashMap properties = new HashMap(); + + private HashMap relationships = new HashMap(); + + public enum Gender { + MALE, + FEMALE, + UNKNOWN + } + + public Sample(String id) { + if (id == null) { + throw new StingException("Error creating sample: sample ID cannot be null"); + } + this.id = id; + } + + public String getId() { + return this.id; + } + + public Map getProperties() { + return properties; + } + + public void setProperties(Map properties) { + this.properties = (HashMap) properties; + } + + + public void setSampleFileEntry(boolean value) { + this.hasSampleFileEntry = value; + } + + public boolean hasSAMFileEntry() { + return this.hasSAMFileEntry; + } + + 
public void setSAMFileEntry(boolean value) { + this.hasSAMFileEntry = value; + } + + public boolean hasSampleFileEntry() { + return this.hasSampleFileEntry; + } + + /** + * Get one property + * @param key key of property + * @return value of property as generic object + */ + public Object getProperty(String key) { + return properties.get(key); + } + + /** + * Set a property + * If property already exists, it is overwritten + * @param key key of property + * @param value object to be stored in properties array + */ + public void setProperty(String key, Object value) { + + if (relationships.containsKey(key)) { + throw new StingException("The same key cannot exist as a property and a relationship"); + } + + if (key.equals("gender") && value.getClass() != Gender.class) { + throw new StingException("'gender' property must be of type Sample.Gender"); + } + + if (key.equals("population") && value.getClass() != String.class) { + throw new StingException("'population' property must be of type String"); + } + + properties.put(key, value); + } + + /** + * Get one relationship + * @param key of relationship + * @return Sample object that this relationship points to + */ + public Sample getRelationship(String key) { + return relationships.get(key); + } + + /** + * Set one relationship + * If already set, it is overwritten + * @param key key of the relationship + * @param value Sample object this relationship points to + */ + public void setRelationship(String key, Sample value) { + if (properties.containsKey(key)) { + throw new StingException("The same key cannot exist as a property and a relationship"); + } + relationships.put(key, value); + } + + /** + * Get the sample's mother + * @return sample object with relationship mother, if exists, or null + */ + public Sample getMother() { + return getRelationship("mother"); + } + + /** + * Get the sample's father + * @return sample object with relationship father, if exists, or null + */ + public Sample getFather() { + return 
getRelationship("father"); + } + + /** + * Get gender of the sample + * @return property of key "gender" - must be of type Gender + */ + public Gender getGender() { + return (Gender) properties.get("gender"); + } + + public String getPopulation() { + return (String) properties.get("population"); + } + + public String getFamilyId() { + return (String) properties.get("familyId"); + } + + /** + * @return True if sample is male, false if female, unknown, or null + */ + public boolean isMale() { + return properties.get("gender") == Gender.MALE; + } + + /** + * @return True if sample is female, false if male, unknown or null + */ + public boolean isFemale() { + return properties.get("gender") == Gender.FEMALE; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + Sample sample = (Sample) o; + + if (hasSAMFileEntry != sample.hasSAMFileEntry) return false; + if (hasSampleFileEntry != sample.hasSampleFileEntry) return false; + if (id != null ? !id.equals(sample.id) : sample.id != null) return false; + if (properties != null ? !properties.equals(sample.properties) : sample.properties != null) return false; + if (relationships != null ? !relationships.equals(sample.relationships) : sample.relationships != null) + return false; + + return true; + } + + @Override + public int hashCode() { + return id.hashCode(); + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleAlias.java b/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleAlias.java new file mode 100644 index 000000000..ce749cb83 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleAlias.java @@ -0,0 +1,31 @@ +package org.broadinstitute.sting.gatk.datasources.sample; + +/** + * Created by IntelliJ IDEA. 
+ * User: brett + * Date: Aug 13, 2010 + * Time: 5:13:46 PM + */ +public class SampleAlias { + + String mainId; + + String[] otherIds; + + public String getMainId() { + return mainId; + } + + public void setMainId(String mainId) { + this.mainId = mainId; + } + + public String[] getOtherIds() { + return otherIds; + } + + public void setOtherIds(String[] otherIds) { + this.otherIds = otherIds; + } + +} diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSource.java new file mode 100644 index 000000000..960664832 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSource.java @@ -0,0 +1,459 @@ +package org.broadinstitute.sting.gatk.datasources.sample; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMReadGroupRecord; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.exceptions.StingException; +import org.yaml.snakeyaml.Loader; +import org.yaml.snakeyaml.TypeDescription; +import org.yaml.snakeyaml.Yaml; +import org.yaml.snakeyaml.constructor.Constructor; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +/** + * Created by IntelliJ IDEA. + * User: brett + * Date: Jul 26, 2010 + * Time: 3:30:09 PM + * + * This class stores and manages sample metadata. This data is encoded in a YAML sample file, which can be included + * in the GATK by the "--sample_metadata" (-SM) argument. This class reads and parses those files. + * + * Although there is a set of public methods for accessing sample data, they aren't used by walkers - they are really + * only used by GenomeAnalysisEngine. 
An instance of GenomeAnalysisEngine has one SampleDataSource. When a walker + * wants to access sample data, it asks GenomeAnalysisEngine to fetch this data from its SampleDataSource. + * + */ +public class SampleDataSource { + + /** + * This is where Sample objects are stored. Samples are usually accessed by their ID, which is unique, so + * this is stored as a HashMap. + */ + private HashMap samples = new HashMap(); + + /** + * Samples can have "aliases", because sometimes the same sample is referenced by different IDs in different + * datasets. If this is the case, one ID is the "primary ID" and others are "aliases". + * + * This maps ID => primary ID for all sample ID strings - both primary IDs and aliases. + */ + private HashMap sampleAliases = new HashMap(); + + /** + * While loading sample files, we must be aware of "special" properties and relationships that are always allowed + */ + public static final String[] specialProperties = new String[] {"familyId", "population", "gender"}; + public static final String[] specialRelationships = new String[] {"mother", "father"}; + + /** + * Constructor takes both a SAM header and sample files because the two must be integrated. 
+ * @param header SAMFileHeader that has been created for this analysis + * @param sampleFiles Sample files that were included on the command line + */ + public SampleDataSource(SAMFileHeader header, List sampleFiles) { + + // create empty sample object for each sample referenced in the SAM header + for (String sampleName : SampleUtils.getSAMFileSamples(header)) { + if (!hasSample(sampleName)) { + Sample newSample = new Sample(sampleName); + newSample.setSAMFileEntry(true); + samples.put(sampleName, newSample); + } + } + + // add files consecutively + if (sampleFiles != null) { + for (File file : sampleFiles) { + addFile(file); + } + } + } + + /** + * Hallucinates sample objects for all the samples in the SAM file and stores them + */ + private void getSamplesFromSAMFile() { + for (String sampleName : SampleUtils.getSAMFileSamples(GenomeAnalysisEngine.instance.getSAMFileHeader())) { + if (!hasSample(sampleName)) { + Sample newSample = new Sample(sampleName); + newSample.setSAMFileEntry(true); + samples.put(sampleName, newSample); + } + } + } + + /** + * Parse one sample file and integrate it with samples that are already there + * Fail quickly if we find any errors in the file + */ + private void addFile(File sampleFile) { + + BufferedReader reader; + try { + reader = new BufferedReader(new FileReader(sampleFile)); + } + catch (IOException e) { + throw new StingException("Could not open sample file " + sampleFile.getAbsolutePath(), e); + } + + // set up YAML reader - a "Constructor" creates java object from YAML and "Loader" loads the file + Constructor con = new Constructor(SampleFileParser.class); + TypeDescription desc = new TypeDescription(SampleFileParser.class); + desc.putListPropertyType("propertyDefinitions", PropertyDefinition.class); + desc.putListPropertyType("sampleAliases", SampleAlias.class); + con.addTypeDescription(desc); + Loader loader = new Loader(con); + Yaml yaml = new Yaml(loader); + + // SampleFileParser stores an object representation of a 
sample file - this is what we'll parse + SampleFileParser parser; + try { + parser = (SampleFileParser) yaml.load(reader); + } + catch (Exception e) { // TODO: should we have more granular exception here? + throw new StingException("There was a syntactic error with the YAML in sample file " + sampleFile.getAbsolutePath(), e); + } + + // check to see which validation options were built into the file + boolean restrictProperties = parser.getAllowedProperties() != null; + boolean restrictRelationships = parser.getAllowedRelationships() != null; + boolean restrictPropertyValues = parser.getPropertyDefinitions() != null; + + // propertyValues stores the values that are allowed for a given property + HashMap propertyValues = null; + if (restrictPropertyValues) { + propertyValues = new HashMap(); + for (PropertyDefinition def : parser.getPropertyDefinitions()) { + HashSet set = new HashSet(); + for (String value : def.getValues()) { + set.add(value); + } + propertyValues.put(def.getProperty(), set); + } + } + + // make sure the aliases are valid + validateAliases(parser); + + // loop through each sample in the file - a SampleParser stores an object that will become a Sample + for (SampleParser sampleParser : parser.getSamples()) { + + // step 1: add the sample if it doesn't already exist + Sample sample = getSampleById(sampleParser.getId()); + if (sample == null) { + sample = new Sample(sampleParser.getId()); + } + addSample(sample); + sample.setSampleFileEntry(true); + + // step 2: add the properties + if (sampleParser.getProperties() != null) { + for (String property : sampleParser.getProperties().keySet()) { + + // check that property is allowed + if (restrictProperties) { + if (!isPropertyValid(property, parser.getAllowedProperties())) { + throw new StingException(property + " is an invalid property. 
It is not included in the list " + + "of allowed properties."); + } + } + + // next check that the value is allowed + if (restrictPropertyValues) { + if (!isValueAllowed(property, sampleParser.getProperties().get(property), propertyValues)) { + throw new StingException("The value of property '" + property + "' is invalid. " + + "It is not included in the list of allowed values for this property."); + } + } + + // next check that there isn't already a conflicting property there (compare by value, not by reference) + if (sample.getProperty(property) != null && + !sample.getProperty(property).equals(sampleParser.getProperties().get(property))) + { + throw new StingException(property + " is a conflicting property!"); + } + + // checks are passed - now add the property! + saveProperty(sample, property, sampleParser.getProperties().get(property)); + } + } + + // step 3: add the relationships + if (sampleParser.getRelationships() != null) { + for (String relationship : sampleParser.getRelationships().keySet()) { + String relativeId = sampleParser.getRelationships().get(relationship); + if (relativeId == null) { + throw new StingException("The relationship cannot be null"); + } + + // first check that it's not invalid + if (restrictRelationships) { + if (!isRelationshipValid(relationship, parser.getAllowedRelationships())) { + throw new StingException(relationship + " is an invalid relationship"); + } + } + + // next check that there isn't already a conflicting relationship there (compare the relative's ID by value) + if (sample.getRelationship(relationship) != null) { + if (!sample.getRelationship(relationship).getId().equals(relativeId)) { + throw new StingException(relationship + " is a conflicting relationship!"); + } + // if the relationship is already set - and consistent with what we're reading now - no need to continue + else { + continue; + } + } + + // checks are passed - now save the relationship + saveRelationship(sample, relationship, relativeId); + } + } + } + + } + + private boolean isValueAllowed(String key, 
Object value, HashMap valuesList) { + + // if the property values weren't specified for this property, then any value is okay + if (!valuesList.containsKey(key)) { + return true; + } + + // if this property has enumerated values, it must be a string + else if (value.getClass() != String.class) + return false; + + // is the value specified or not? + else if (!valuesList.get(key).contains(value)) + return false; + + return true; + } + + /** + * Makes sure that the aliases are valid + * Checks that 1) no string is used as both a main ID and an alias (regardless of declaration order); + * 2) no alias is used more than once + * @param parser parsed sample file whose alias section is validated + */ + private void validateAliases(SampleFileParser parser) { + + // no aliases to validate + if (parser.getSampleAliases() == null) + return; + + HashSet mainIds = new HashSet(); + HashSet otherIds = new HashSet(); + + for (SampleAlias sampleAlias : parser.getSampleAliases()) { + mainIds.add(sampleAlias.getMainId()); + // a main ID may also have been declared as an alias by an earlier entry + if (otherIds.contains(sampleAlias.getMainId())) + throw new StingException(String.format("The aliases in your sample file are invalid - the alias %s cannot " + + "be both a main ID and an other ID", sampleAlias.getMainId())); + for (String otherId : sampleAlias.getOtherIds()) { + if (mainIds.contains(otherId)) + throw new StingException(String.format("The aliases in your sample file are invalid - the alias %s cannot " + + "be both a main ID and an other ID", otherId)); + + if (!otherIds.add(otherId)) + throw new StingException(String.format("The aliases in your sample file are invalid - %s is listed as an " + + "alias more than once.", otherId)); + } + } + } + + private boolean isPropertyValid(String property, String[] allowedProperties) { + + // is it a special property that is always allowed? + for (String allowedProperty : specialProperties) { + if (property.equals(allowedProperty)) + return true; + } + + // is it in the allowed properties list? + for (String allowedProperty : allowedProperties) { + if (property.equals(allowedProperty)) + return true; + } + + return false; + } + + private boolean isRelationshipValid(String relationship, String[] allowedRelationships) { + + // is it a special relationship that is always allowed? 
+ for (String allowedRelationship : specialRelationships) { + if (relationship.equals(allowedRelationship)) + return true; + } + + // is it in the allowed relationships list? + for (String allowedRelationship : allowedRelationships) { + if (relationship.equals(allowedRelationship)) + return true; + } + + return false; + } + + /** + * Saves a property as the correct type + * @param sample sample that receives the property + * @param key property key + * @param value property value, as read from YAML parser + */ + private void saveProperty(Sample sample, String key, Object value) { + + // convert gender to the right type, if it was stored as a String + if (key.equals("gender")) { + if (((String) value).toLowerCase().equals("male")) { + value = Sample.Gender.MALE; + } + else if (((String) value).toLowerCase().equals("female")) { + value = Sample.Gender.FEMALE; + } + else if (((String) value).toLowerCase().equals("unknown")) { + value = Sample.Gender.UNKNOWN; + } + else if (value != null) { + throw new StingException("'gender' property must be male, female, or unknown."); + } + } + sample.setProperty(key, value); + } + + /** + * Saves a relationship as the correct type + * @param sample sample that receives the relationship + * @param key relationship key + * @param relativeId sample ID string of the relative + */ + private void saveRelationship(Sample sample, String key, String relativeId) { + + // get the reference that we'll store as the value + Sample relative = getSampleById(relativeId); + + // create sample object for the relative, if necessary + if (relative == null) { + relative = new Sample(relativeId); + addSample(relative); + } + sample.setRelationship(key, relative); + } + + + + /** + * Filter a sample name in case it is an alias + * @param sampleId to be filtered + * @return ID of sample that stores data for this alias + */ + private String aliasFilter(String sampleId) { + if (!sampleAliases.containsKey(sampleId)) + return sampleId; + else + return 
sampleAliases.get(sampleId); + } + + /** + * Add a sample to the collection + * @param sample to be added + */ + private void addSample(Sample sample) { + samples.put(sample.getId(), sample); + } + + /** + * Check if sample with this ID exists + * Note that this will return true if name passed in is an alias + * @param id ID of sample to be checked + * @return true if sample exists; false if not + */ + public boolean hasSample(String id) { + return samples.get(aliasFilter(id)) != null; + } + + /** + * Get a sample by its ID + * If an alias is passed in, return the main sample object + * @param id + * @return sample Object with this ID + */ + public Sample getSampleById(String id) { + return samples.get(aliasFilter(id)); + } + + /** + * Get the sample for a given read group + * Must first look up ID for read group + * @param readGroup of sample + * @return sample object with ID from the read group + */ + public Sample getSampleByReadGroup(SAMReadGroupRecord readGroup) { + String nameFromReadGroup = readGroup.getSample(); + return getSampleById(nameFromReadGroup); + } + + /** + * Get a sample for a given read + * Must first look up read group, and then sample ID for that read group + * @param read of sample + * @return sample object of this read + */ + public Sample getSampleByRead(SAMRecord read) { + return getSampleByReadGroup(read.getReadGroup()); + } + + /** + * Get number of sample objects + * @return size of samples map + */ + public int sampleCount() { + return samples.size(); + } + + /** + * Return all samples with a given family ID + * Note that this isn't terribly efficient (linear) - it may be worth adding a new family ID data structure for this + * @param familyId + * @return + */ + public Set getFamily(String familyId) { + HashSet familyMembers = new HashSet(); + + for (Sample sample : samples.values()) { + if (sample.getFamilyId() != null) { + if (sample.getFamilyId().equals(familyId)) + familyMembers.add(sample); + } + } + return familyMembers; + } + + 
/** + * Returns all children of a given sample + * See note on the efficiency of getFamily() - since this depends on getFamily() it's also not efficient + * @param sample + * @return + */ + public Set getChildren(Sample sample) { + HashSet children = new HashSet(); + for (Sample familyMember : getFamily(sample.getFamilyId())) { + if (familyMember.getMother() == sample || familyMember.getFather() == sample) { + children.add(familyMember); + } + } + return children; + } + + +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleFileParser.java b/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleFileParser.java new file mode 100644 index 000000000..a362af663 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleFileParser.java @@ -0,0 +1,65 @@ +package org.broadinstitute.sting.gatk.datasources.sample; + +/** + * Created by IntelliJ IDEA. + * User: brett + * Date: Aug 12, 2010 + * Time: 1:30:44 PM + */ +public class SampleFileParser { + + private SampleAlias[] sampleAliases; + + private String[] allowedProperties; + + private String[] allowedRelationships; + + private PropertyDefinition[] propertyDefinitions; + + private SampleParser[] samples; + + public PropertyDefinition[] getPropertyDefinitions() { + return propertyDefinitions; + } + + public void setPropertyDefinitions(PropertyDefinition[] propertyDefinitions) { + this.propertyDefinitions = propertyDefinitions; + } + + public SampleFileParser() { + + } + + public String[] getAllowedProperties() { + return allowedProperties; + } + + public void setAllowedProperties(String[] allowedProperties) { + this.allowedProperties = allowedProperties; + } + + public SampleParser[] getSamples() { + return samples; + } + + public void setSamples(SampleParser[] samples) { + this.samples = samples; + } + + public String[] getAllowedRelationships() { + return allowedRelationships; + } + + public void setAllowedRelationships(String[] 
allowedRelationships) { + this.allowedRelationships = allowedRelationships; + } + + public SampleAlias[] getSampleAliases() { + return sampleAliases; + } + + public void setSampleAliases(SampleAlias[] sampleAliases) { + this.sampleAliases = sampleAliases; + } + +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleParser.java b/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleParser.java new file mode 100644 index 000000000..f5e07ca29 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleParser.java @@ -0,0 +1,43 @@ +package org.broadinstitute.sting.gatk.datasources.sample; + +import java.util.HashMap; + +/** + * Created by IntelliJ IDEA. + * User: brett + * Date: Aug 13, 2010 + * Time: 2:09:43 PM + */ +public class SampleParser { + + private String id; + + private HashMap properties; + + private HashMap relationships; + + public String getId() { + return id; + } + + public void setId(String id) { + this.id = id; + } + + public HashMap getProperties() { + return properties; + } + + public void setProperties(HashMap properties) { + this.properties = properties; + } + + public HashMap getRelationships() { + return relationships; + } + + public void setRelationships(HashMap relationships) { + this.relationships = relationships; + } + +} diff --git a/java/src/org/broadinstitute/sting/playground/sample/CountLociByPopulationWalker.java b/java/src/org/broadinstitute/sting/playground/sample/CountLociByPopulationWalker.java new file mode 100644 index 000000000..905bb9bf9 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/sample/CountLociByPopulationWalker.java @@ -0,0 +1,53 @@ +package org.broadinstitute.sting.playground.sample; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; 
+import org.broadinstitute.sting.gatk.walkers.LocusWalker;
+import org.broadinstitute.sting.gatk.walkers.TreeReducible;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+
+/**
+ * Example locus walker: at every locus, prints how many pileup reads come from each sample population.
+ * map emits 1 per locus and reduce sums those, so the traversal result is the number of loci visited.
+ */
+public class CountLociByPopulationWalker extends LocusWalker<Integer, Long> implements TreeReducible<Long> {
+    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+
+        HashMap<String, Integer> count = new HashMap<String, Integer>(); // population -> number of reads at this locus
+
+        ArrayList<SAMRecord> reads = (ArrayList<SAMRecord>) context.getBasePileup().getReads(); // NOTE(review): assumes getReads() hands back an ArrayList - confirm
+
+        for (SAMRecord read : reads) {
+            String population = getToolkit().getSampleByRead(read).getPopulation();
+            if (!count.containsKey(population)) {
+                count.put(population, 0); // seed with 0: the increment below counts this read (seeding with 1 counted it twice)
+            }
+            count.put(population, count.get(population) + 1);
+        }
+
+        System.out.println("\n\n\n***** LOCUS: " + ref.toString() + " *****");
+        for (String population : count.keySet()) {
+            System.out.println(String.format("%s | %d\n", population, count.get(population)));
+        }
+
+        return 1;
+    }
+
+    public Long reduceInit() { return 0L; }
+
+    public Long reduce(Integer value, Long sum) {
+        return value + sum;
+    }
+
+    /**
+     * Reduces two subtrees together. In this case, the implementation of the tree reduce
+     * is exactly the same as the implementation of the single reduce.
+     */
+    public Long treeReduce(Long lhs, Long rhs) {
+        return lhs + rhs;
+    }
+}
\ No newline at end of file
diff --git a/java/src/org/broadinstitute/sting/playground/sample/CountMalesWalker.java b/java/src/org/broadinstitute/sting/playground/sample/CountMalesWalker.java
new file mode 100644
index 000000000..20cb7fe3d
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/playground/sample/CountMalesWalker.java
@@ -0,0 +1,28 @@
+package org.broadinstitute.sting.playground.sample;
+
+import net.sf.samtools.SAMRecord;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.datasources.sample.Sample;
+import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.DataSource;
+import org.broadinstitute.sting.gatk.walkers.ReadWalker;
+import org.broadinstitute.sting.gatk.walkers.Requires;
+
+/**
+ * Walks over the reads in the input data set and counts those whose sample is male: map looks up
+ * the sample for each read and emits 1 if sample.isMale() and 0 otherwise, and reduce sums the results.
+ * Simple example of using sample metadata in a read-backed analysis.
+ */
+@Requires({DataSource.READS, DataSource.REFERENCE})
+public class CountMalesWalker extends ReadWalker<Integer, Integer> {
+    public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker tracker) {
+        Sample sample = getToolkit().getSampleByRead(read);
+        return sample.isMale() ? 1 : 0;
+    }
+
+    public Integer reduceInit() { return 0; }
+
+    public Integer reduce(Integer value, Integer sum) {
+        return value + sum;
+    }
+}
\ No newline at end of file
diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSourceTest.java b/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSourceTest.java
new file mode 100644
index 000000000..e66e6fce3
--- /dev/null
+++ b/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSourceTest.java
@@ -0,0 +1,154 @@
+package org.broadinstitute.sting.gatk.datasources.sample;
+
+import net.sf.samtools.SAMFileHeader;
+import org.broadinstitute.sting.BaseTest;
+import org.broadinstitute.sting.utils.exceptions.StingException;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Tests for SampleDataSource: each test builds a data source from an empty SAM header plus zero or more sample files.
+ * User: brett
+ * Date: Sep 9, 2010
+ * Time: 8:21:00 AM
+ */
+public class SampleDataSourceTest extends BaseTest {
+
+    // this empty header used to instantiate sampledatasource objects
+    private static SAMFileHeader header = new SAMFileHeader();
+
+    // all the test sample files are located here
+    private String sampleFilesDir = validationDataLocation + "sample/";
+
+    // a data source can be built from the SAM header alone (null file list); only checks that construction does not throw
+    @Test()
+    public void loadSAMSamplesTest() {
+        SampleDataSource s = new SampleDataSource(header, null);
+    }
+
+    // tests that a basic sample with relationships loads correctly
+    // Note that this is the only test for family relationships - we may want to expand this
+    @Test()
+    public void basicLoadSampleFileTest() {
+        File sampleFile = new File(sampleFilesDir + "basicSampleFile.yaml");
+        SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile));
+        Assert.assertTrue(s.sampleCount() == 4);
+        Sample sampleA = s.getSampleById("sampleA");
+        Sample sampleB = s.getSampleById("sampleB");
+        Assert.assertTrue(sampleB.getMother() == sampleA); // the very same instance is expected, not merely an equal one
+        Assert.assertTrue(s.getChildren(sampleA).contains(sampleB));
+        Set family = s.getFamily("family1");
+        Assert.assertTrue(family.size() == 2);
+        Assert.assertTrue(family.contains(sampleA));
+        Assert.assertTrue(family.contains(sampleB));
+    }
+
+    // but that file should fail if it has an extra character in it...
+    @Test(expected = StingException.class)
+    public void loadInvalidSampleExtraCharText() {
+        File sampleFile = new File(sampleFilesDir + "invalidSyntaxExtraChar.yaml");
+        SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile));
+    }
+
+    // ...or a typo...
+    @Test(expected = StingException.class)
+    public void loadInvalidSampleTypoText() {
+        File sampleFile = new File(sampleFilesDir + "invalidSyntaxTypo.yaml");
+        SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile));
+
+    }
+
+    // ...or an extra unrecognized array
+    @Test(expected = StingException.class)
+    public void loadInvalidSampleExtraArrayText() {
+        File sampleFile = new File(sampleFilesDir + "invalidSyntaxExtraArray.yaml");
+        SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile));
+    }
+
+    // make sure aliases work
+    @Test(expected = StingException.class) // NOTE(review): passes only if a StingException is thrown, which contradicts the assertions below - confirm 'expected' is intended
+    public void sampleAliasText() {
+        File sampleFile = new File(sampleFilesDir + "basicSampleFileWithAlias.yaml");
+        SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile));
+        // this file has two samples, but one has an alias. let's make sure that checks out...
+        Assert.assertTrue(s.sampleCount() == 2);
+        Assert.assertTrue(s.getSampleById("sampleA") == s.getSampleById("sampleC")); // both ids must resolve to the same instance
+    }
+
+    // error is thrown if property is included that's not in properties array
+    @Test(expected = StingException.class)
+    public void unallowedPropertySampleTest() {
+        File sampleFile = new File(sampleFilesDir + "basicSampleFileUnallowedProperty.yaml");
+        SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile));
+    }
+
+    // same as above, with relationship
+    @Test(expected = StingException.class)
+    public void unallowedRelationshipSampleTest() {
+        File sampleFile = new File(sampleFilesDir + "basicSampleFileUnallowedRelationship.yaml");
+        SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile));
+    }
+
+    // two sample files
+    @Test()
+    public void twoSampleFilesTest() {
+        File sampleFile = new File(sampleFilesDir + "basicSampleFile.yaml");
+        File secondFile = new File(sampleFilesDir + "basicSampleFileExt.yaml");
+        ArrayList files = new ArrayList();
+        files.add(sampleFile);
+        files.add(secondFile);
+        SampleDataSource s = new SampleDataSource(header, files);
+        Assert.assertTrue(s.getSampleById("sampleA").getProperty("propC").equals("valC"));
+        Assert.assertTrue(s.getSampleById("sampleA").getProperty("propA").equals("valA"));
+    }
+
+    // two sample files, with contradictory properties
+    @Test(expected = StingException.class)
+    public void twoContradictorySampleFilesTest() {
+        File sampleFile = new File(sampleFilesDir + "basicSampleFile.yaml");
+        File secondFile = new File(sampleFilesDir + "basicSampleFileInvalidExt.yaml");
+        ArrayList files = new ArrayList();
+        files.add(sampleFile);
+        files.add(secondFile);
+        SampleDataSource s = new SampleDataSource(header, files);
+    }
+
+    // three sample files
+    @Test()
+    public void threeSamplesTest() {
+        File sampleFile = new File(sampleFilesDir + "basicSampleFile.yaml");
+        ArrayList files = new ArrayList();
+        files.add(sampleFile);
+        files.add(new File(sampleFilesDir + "basicSampleFileExt.yaml"));
+        files.add(new File(sampleFilesDir + "basicSampleFileExt2.yaml"));
+        SampleDataSource s = new SampleDataSource(header, files);
+        Assert.assertTrue(s.sampleCount() == 5);
+        Assert.assertTrue(s.getSampleById("sampleE").getProperty("propC").equals("valC"));
+        Assert.assertTrue(s.getSampleById("sampleA").getProperty("propA").equals("valA"));
+    }
+
+    // make sure we can import data types other than Strings
+    @Test()
+    public void sampleTestPropertyType() {
+        File sampleFile = new File(sampleFilesDir + "sampleFileOtherTypes.yaml");
+        SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile));
+        Sample sample = s.getSampleById("sampleA");
+        Assert.assertTrue(sample.getProperty("a").getClass() == Integer.class);
+        Assert.assertTrue(sample.getProperty("b").getClass() == String.class);
+        Assert.assertTrue(sample.getProperty("c").getClass() == Double.class);
+        Assert.assertTrue(sample.getProperty("b").getClass() == String.class); // NOTE(review): repeats the "b" check above; possibly another property was intended - confirm
+    }
+
+
+    // we create lots of single item lists...
+    private ArrayList makeFileList(File file) {
+        ArrayList a = new ArrayList();
+        a.add(file);
+        return a;
+    }
+}
diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleTest.java b/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleTest.java
new file mode 100644
index 000000000..ce73103e8
--- /dev/null
+++ b/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleTest.java
@@ -0,0 +1,63 @@
+package org.broadinstitute.sting.gatk.datasources.sample;
+
+import org.broadinstitute.sting.BaseTest;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: brett
+ * Date: Sep 9, 2010
+ * Time: 8:21:00 AM
+ */
+public class SampleTest extends BaseTest {
+
+    static Sample sampleA;
+    static Sample sampleA1; // second instance built with the same id and property as sampleA
+    static Sample sampleB;
+    static Sample sampleC;
+
+    @BeforeClass
+    public static void init() {
+        sampleA = new Sample("sampleA");
+        sampleA.setProperty("uniqueProperty", "uniqueValue");
+        sampleA1 = new Sample("sampleA");
+        sampleA1.setProperty("uniqueProperty", "uniqueValue");
+        sampleB = new Sample("sampleB");
+        sampleC = new Sample("sampleC");
+        sampleC.setProperty("population", "pop1");
+        sampleC.setProperty("gender", Sample.Gender.MALE);
+    }
+
+    /**
+     * Testing equality: equal id and properties means equals(), even for distinct instances
+     */
+    @Test()
+    public void equalsTest() {
+        Assert.assertTrue(sampleA.equals(sampleA1));
+        Assert.assertFalse(sampleA == sampleA1);
+        Assert.assertFalse(sampleA.equals(sampleB));
+    }
+
+    /**
+     * And hash: equal samples must share a hash code; these two unequal samples are expected to differ
+     */
+    @Test()
+    public void basicHashTest() {
+        Assert.assertFalse(sampleA.hashCode() == sampleB.hashCode());
+        Assert.assertTrue(sampleA.hashCode() == sampleA1.hashCode());
+    }
+
+    /**
+     * Now test the special getter methods
+     */
+    @Test()
+    public void specialGettersTest() {
+        Assert.assertTrue(sampleC.getId().equals("sampleC"));
+        Assert.assertTrue(sampleC.getPopulation().equals("pop1"));
+        Assert.assertTrue(sampleC.isMale());
+        Assert.assertFalse(sampleA.isMale()); // sample A doesn't have a gender, so this should be false
+    }
+
+}
\ No newline at end of file