diff --git a/ivy.xml b/ivy.xml index 9bb0b1f03..c62acf8c8 100644 --- a/ivy.xml +++ b/ivy.xml @@ -17,6 +17,7 @@ + diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 9ed27d046..c88de62c9 100755 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -30,6 +30,9 @@ import net.sf.picard.reference.ReferenceSequenceFile; import net.sf.samtools.*; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; +import org.broadinstitute.sting.gatk.datasources.sample.Sample; +import org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource; +import org.broadinstitute.sting.gatk.datasources.sample.SampleFileParser; import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -76,6 +79,11 @@ public class GenomeAnalysisEngine { */ private ReferenceDataSource referenceDataSource = null; + /** + * Accessor for sample metadata + */ + private SampleDataSource sampleDataSource = null; + /** * Accessor for sharded reference-ordered data. 
*/ @@ -388,6 +396,10 @@ public class GenomeAnalysisEngine { rodDataSources = getReferenceOrderedDataSources(my_walker, tracks); } + private void initializeSampleDataSource() { + this.sampleDataSource = new SampleDataSource(getSAMFileHeader(), argCollection.sampleFiles); + } + /** * setup a microscheduler * @@ -966,5 +978,64 @@ public class GenomeAnalysisEngine { } } return unpackedReads; - } + } + + /** + * Get a sample by its ID + * If an alias is passed in, return the main sample object + * @param id + * @return sample Object with this ID + */ + public Sample getSampleById(String id) { + return sampleDataSource.getSampleById(id); + } + + /** + * Get the sample for a given read group + * Must first look up ID for read group + * @param readGroup of sample + * @return sample object with ID from the read group + */ + public Sample getSampleByReadGroup(SAMReadGroupRecord readGroup) { + return sampleDataSource.getSampleByReadGroup(readGroup); + } + + /** + * Get a sample for a given read + * Must first look up read group, and then sample ID for that read group + * @param read of sample + * @return sample object of this read + */ + public Sample getSampleByRead(SAMRecord read) { + return getSampleByReadGroup(read.getReadGroup()); + } + + /** + * Get number of sample objects + * @return size of samples map + */ + public int sampleCount() { + return sampleDataSource.sampleCount(); + } + + /** + * Return all samples with a given family ID + * Note that this isn't terribly efficient (linear) - it may be worth adding a new family ID data structure for this + * @param familyId + * @return + */ + public Set getFamily(String familyId) { + return sampleDataSource.getFamily(familyId); + } + + /** + * Returns all children of a given sample + * See note on the efficiency of getFamily() - since this depends on getFamily() it's also not efficient + * @param sample + * @return + */ + public Set getChildren(Sample sample) { + return sampleDataSource.getChildren(sample); + } + } diff 
--git a/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index 6abd8c209..b73f6a7ee 100755 --- a/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -70,6 +70,11 @@ public class GATKArgumentCollection { @Input(fullName = "input_file", shortName = "I", doc = "SAM or BAM file(s)", required = false) public List samFiles = new ArrayList(); + // parameters and their defaults + @ElementList(required = false) + @Argument(fullName = "sample_metadata", shortName = "SM", doc = "Sample file(s) in JSON format", required = false) + public List sampleFiles = new ArrayList(); + @Element(required = false) @Argument(fullName = "read_buffer_size", shortName = "rbs", doc="Number of reads per SAM file to buffer in memory", required = false) public Integer readBufferSize = null; diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/sample/PropertyDefinition.java b/java/src/org/broadinstitute/sting/gatk/datasources/sample/PropertyDefinition.java new file mode 100644 index 000000000..433e0af40 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/sample/PropertyDefinition.java @@ -0,0 +1,30 @@ +package org.broadinstitute.sting.gatk.datasources.sample; + +/** + * Created by IntelliJ IDEA. 
+ * User: brett + * Date: Aug 12, 2010 + * Time: 2:09:16 PM + */ +public class PropertyDefinition { + + String property; + + String[] values; + + public String getProperty() { + return property; + } + + public void setProperty(String property) { + this.property = property; + } + + public String[] getValues() { + return values; + } + + public void setValues(String[] values) { + this.values = values; + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/sample/Sample.java b/java/src/org/broadinstitute/sting/gatk/datasources/sample/Sample.java new file mode 100644 index 000000000..c509df317 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/sample/Sample.java @@ -0,0 +1,190 @@ +package org.broadinstitute.sting.gatk.datasources.sample; + + +import org.broadinstitute.sting.utils.exceptions.StingException; + +import java.util.HashMap; +import java.util.Map; + +/** + * Created by IntelliJ IDEA. + * User: brett + * Date: Jul 26, 2010 + * Time: 3:31:38 PM + */ +public class Sample implements java.io.Serializable { + + private final String id; + + private boolean hasSampleFileEntry = false; // true if this sample has an entry in a sample file + + private boolean hasSAMFileEntry = false; // true if this sample has an entry in the SAM file + + private HashMap properties = new HashMap(); + + private HashMap relationships = new HashMap(); + + public enum Gender { + MALE, + FEMALE, + UNKNOWN + } + + public Sample(String id) { + if (id == null) { + throw new StingException("Error creating sample: sample ID cannot be null"); + } + this.id = id; + } + + public String getId() { + return this.id; + } + + public Map getProperties() { + return properties; + } + + public void setProperties(Map properties) { + this.properties = (HashMap) properties; + } + + + public void setSampleFileEntry(boolean value) { + this.hasSampleFileEntry = value; + } + + public boolean hasSAMFileEntry() { + return this.hasSAMFileEntry; + } + + 
public void setSAMFileEntry(boolean value) { + this.hasSAMFileEntry = value; + } + + public boolean hasSampleFileEntry() { + return this.hasSampleFileEntry; + } + + /** + * Get one property + * @param key key of property + * @return value of property as generic object + */ + public Object getProperty(String key) { + return properties.get(key); + } + + /** + * Set a property + * If property already exists, it is overwritten + * @param key key of property + * @param value object to be stored in properties array + */ + public void setProperty(String key, Object value) { + + if (relationships.containsKey(key)) { + throw new StingException("The same key cannot exist as a property and a relationship"); + } + + if (key.equals("gender") && value.getClass() != Gender.class) { + throw new StingException("'gender' property must be of type Sample.Gender"); + } + + if (key.equals("population") && value.getClass() != String.class) { + throw new StingException("'population' property must be of type String"); + } + + properties.put(key, value); + } + + /** + * Get one relationship + * @param key of relationship + * @return Sample object that this relationship points to + */ + public Sample getRelationship(String key) { + return relationships.get(key); + } + + /** + * Set one relationship + * If already set, it is overwritten + * @param key key of the relationship + * @param value Sample object this relationship points to + */ + public void setRelationship(String key, Sample value) { + if (properties.containsKey(key)) { + throw new StingException("The same key cannot exist as a property and a relationship"); + } + relationships.put(key, value); + } + + /** + * Get the sample's mother + * @return sample object with relationship mother, if exists, or null + */ + public Sample getMother() { + return getRelationship("mother"); + } + + /** + * Get the sample's father + * @return sample object with relationship father, if exists, or null + */ + public Sample getFather() { + return 
getRelationship("father"); + } + + /** + * Get gender of the sample + * @return property of key "gender" - must be of type Gender + */ + public Gender getGender() { + return (Gender) properties.get("gender"); + } + + public String getPopulation() { + return (String) properties.get("population"); + } + + public String getFamilyId() { + return (String) properties.get("familyId"); + } + + /** + * @return True if sample is male, false if female, unknown, or null + */ + public boolean isMale() { + return properties.get("gender") == Gender.MALE; + } + + /** + * @return True if sample is female, false if male, unknown or null + */ + public boolean isFemale() { + return properties.get("gender") == Gender.FEMALE; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + Sample sample = (Sample) o; + + if (hasSAMFileEntry != sample.hasSAMFileEntry) return false; + if (hasSampleFileEntry != sample.hasSampleFileEntry) return false; + if (id != null ? !id.equals(sample.id) : sample.id != null) return false; + if (properties != null ? !properties.equals(sample.properties) : sample.properties != null) return false; + if (relationships != null ? !relationships.equals(sample.relationships) : sample.relationships != null) + return false; + + return true; + } + + @Override + public int hashCode() { + return id.hashCode(); + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleAlias.java b/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleAlias.java new file mode 100644 index 000000000..ce749cb83 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleAlias.java @@ -0,0 +1,31 @@ +package org.broadinstitute.sting.gatk.datasources.sample; + +/** + * Created by IntelliJ IDEA. 
+ * User: brett + * Date: Aug 13, 2010 + * Time: 5:13:46 PM + */ +public class SampleAlias { + + String mainId; + + String[] otherIds; + + public String getMainId() { + return mainId; + } + + public void setMainId(String mainId) { + this.mainId = mainId; + } + + public String[] getOtherIds() { + return otherIds; + } + + public void setOtherIds(String[] otherIds) { + this.otherIds = otherIds; + } + +} diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSource.java new file mode 100644 index 000000000..960664832 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSource.java @@ -0,0 +1,459 @@ +package org.broadinstitute.sting.gatk.datasources.sample; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMReadGroupRecord; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.exceptions.StingException; +import org.yaml.snakeyaml.Loader; +import org.yaml.snakeyaml.TypeDescription; +import org.yaml.snakeyaml.Yaml; +import org.yaml.snakeyaml.constructor.Constructor; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +/** + * Created by IntelliJ IDEA. + * User: brett + * Date: Jul 26, 2010 + * Time: 3:30:09 PM + * + * This class stores and manages sample metadata. This data is encoded in a sample file, which can be included + * in the GATK by the "--samples" argument. This class reads and parses those files. + * + * Although there are a set of public methods for accessing sample data, they aren't used by walkers - they are really + * only used by GenomeAnalysisEngine. 
An instance of GenomeAnalysisEngine has one SampleDataSource. When a walker + * wants to access sample data, it asks GenomeAnalysis to fetch this data from its SampleDataSource. + * + */ +public class SampleDataSource { + + /** + * This is where Sample objects are stored. Samples are usually accessed by their ID, which is unique, so + * this is stored as a HashMap. + */ + private HashMap samples = new HashMap(); + + /** + * Samples can have "aliases", because sometimes the same sample is referenced by different IDs in different + * datasets. If this is the case, one ID is the "primary ID" and others are "aliases". + * + * This maps ID => primary ID for all samples ID strings - both primary IDs and aliases. + */ + private HashMap sampleAliases = new HashMap(); + + /** + * While loading sample files, we must be aware of "special" properties and relationships that are always allowed + */ + public static final String[] specialProperties = new String[] {"familyId", "population", "gender"}; + public static final String[] specialRelationships = new String[] {"mother", "father"}; + + /** + * Constructor takes both a SAM header and sample files because the two must be integrated. 
+ * @param header SAMFileHeader that has been created for this analysis + * @param sampleFiles Sample files that were included on the command line + */ + public SampleDataSource(SAMFileHeader header, List sampleFiles) { + + // create empty sample object for each sample referenced in the SAM header + for (String sampleName : SampleUtils.getSAMFileSamples(header)) { + if (!hasSample(sampleName)) { + Sample newSample = new Sample(sampleName); + newSample.setSAMFileEntry(true); + samples.put(sampleName, newSample); + } + } + + // add files consecutively + if (sampleFiles != null) { + for (File file : sampleFiles) { + addFile(file); + } + } + } + + /** + * Hallucinates sample objects for all the samples in the SAM file and stores them + */ + private void getSamplesFromSAMFile() { + for (String sampleName : SampleUtils.getSAMFileSamples(GenomeAnalysisEngine.instance.getSAMFileHeader())) { + if (!hasSample(sampleName)) { + Sample newSample = new Sample(sampleName); + newSample.setSAMFileEntry(true); + samples.put(sampleName, newSample); + } + } + } + + /** + * Parse one sample file and integrate it with samples that are already there + * Fail quickly if we find any errors in the file + */ + private void addFile(File sampleFile) { + + BufferedReader reader; + try { + reader = new BufferedReader(new FileReader(sampleFile)); + } + catch (IOException e) { + throw new StingException("Could not open sample file " + sampleFile.getAbsolutePath(), e); + } + + // set up YAML reader - a "Constructor" creates java object from YAML and "Loader" loads the file + Constructor con = new Constructor(SampleFileParser.class); + TypeDescription desc = new TypeDescription(SampleFileParser.class); + desc.putListPropertyType("propertyDefinitions", PropertyDefinition.class); + desc.putListPropertyType("sampleAliases", SampleAlias.class); + con.addTypeDescription(desc); + Loader loader = new Loader(con); + Yaml yaml = new Yaml(loader); + + // SampleFileParser stores an object representation of a 
sample file - this is what we'll parse + SampleFileParser parser; + try { + parser = (SampleFileParser) yaml.load(reader); + } + catch (Exception e) { // TODO: should we have more granular exception here? + throw new StingException("There was a syntactic error with the YAML in sample file " + sampleFile.getAbsolutePath(), e); + } + + // check to see which validation options were built into the file + boolean restrictProperties = parser.getAllowedProperties() != null; + boolean restrictRelationships = parser.getAllowedRelationships() != null; + boolean restrictPropertyValues = parser.getPropertyDefinitions() != null; + + // propertyValues stores the values that are allowed for a given property + HashMap propertyValues = null; + if (restrictPropertyValues) { + propertyValues = new HashMap(); + for (PropertyDefinition def : parser.getPropertyDefinitions()) { + HashSet set = new HashSet(); + for (String value : def.getValues()) { + set.add(value); + } + propertyValues.put(def.getProperty(), set); + } + } + + // make sure the aliases are valid + validateAliases(parser); + + // loop through each sample in the file - a SampleParser stores an object that will become a Sample + for (SampleParser sampleParser : parser.getSamples()) { + + // step 1: add the sample if it doesn't already exist + Sample sample = getSampleById(sampleParser.getId()); + if (sample == null) { + sample = new Sample(sampleParser.getId()); + } + addSample(sample); + sample.setSampleFileEntry(true); + + // step 2: add the properties + if (sampleParser.getProperties() != null) { + for (String property : sampleParser.getProperties().keySet()) { + + // check that property is allowed + if (restrictProperties) { + if (!isPropertyValid(property, parser.getAllowedProperties())) { + throw new StingException(property + " is an invalid property. 
It is not included in the list " + + "of allowed properties."); + } + } + + // next check that the value is allowed + if (restrictPropertyValues) { + if (!isValueAllowed(property, sampleParser.getProperties().get(property), propertyValues)) { + throw new StingException("The value of property '" + property + "' is invalid. " + + "It is not included in the list of allowed values for this property."); + } + } + + // next check that there isn't already a conflicting property there + // (compare by value with equals(): equal values parsed from different files are distinct objects) + // NOTE(review): saveProperty stores "gender" as a Sample.Gender, so a re-declared gender string still compares unequal here - confirm intended handling + if (sample.getProperty(property) != null && + !sample.getProperty(property).equals(sampleParser.getProperties().get(property))) + { + throw new StingException(property + " is a conflicting property!"); + } + + // checks are passed - now add the property! + saveProperty(sample, property, sampleParser.getProperties().get(property)); + } + } + + // step 3: add the relationships + if (sampleParser.getRelationships() != null) { + for (String relationship : sampleParser.getRelationships().keySet()) { + String relativeId = sampleParser.getRelationships().get(relationship); + if (relativeId == null) { + throw new StingException("The relationship cannot be null"); + } + + // first check that it's not invalid + if (restrictRelationships) { + if (!isRelationshipValid(relationship, parser.getAllowedRelationships())) { + throw new StingException(relationship + " is an invalid relationship"); + } + } + + // next check that there isn't already a conflicting relationship there + // (compare the stored relative's ID with the ID being read, resolving aliases the same way getSampleById does) + if (sample.getRelationship(relationship) != null) { + if (!sample.getRelationship(relationship).getId().equals(aliasFilter(relativeId))) { + throw new StingException(relationship + " is a conflicting relationship!"); + } + // if the relationship is already set - and consistent with what we're reading now - no need to continue + else { + continue; + } + } + + // checks are passed - now save the relationship + saveRelationship(sample, relationship, relativeId); + } + } + } + + } + + private boolean isValueAllowed(String key, 
Object value, HashMap valuesList) { + + // if the property values weren't specified for this property, then any value is okay + if (!valuesList.containsKey(key)) { + return true; + } + + // if this property has enumerated values, it must be a string (a null value is rejected rather than dereferenced) + else if (!(value instanceof String)) + return false; + + // is the value specified or not? + else if (!valuesList.get(key).contains(value)) + return false; + + return true; + } + + /** + * Makes sure that the aliases are valid + * Checks that 1) no string is used as both a main ID and an alias, in whichever order the two uses appear; + * 2) no alias is used more than once + * @param parser parsed sample file whose sampleAliases section is checked; nothing is checked if it has none + */ + private void validateAliases(SampleFileParser parser) { + + // no aliases to validate + if (parser.getSampleAliases() == null) + return; + + HashSet mainIds = new HashSet(); + HashSet otherIds = new HashSet(); + + for (SampleAlias sampleAlias : parser.getSampleAliases()) { + String mainId = sampleAlias.getMainId(); + + // a main ID must not have been listed earlier as an alias of another entry + if (otherIds.contains(mainId)) + throw new StingException(String.format("The aliases in your sample file are invalid - the alias %s cannot " + + "be both a main ID and an other ID", mainId)); + mainIds.add(mainId); + + for (String otherId : sampleAlias.getOtherIds()) { + if (mainIds.contains(otherId)) + throw new StingException(String.format("The aliases in your sample file are invalid - the alias %s cannot " + + "be both a main ID and an other ID", otherId)); + + if (!otherIds.add(otherId)) + throw new StingException(String.format("The aliases in your sample file are invalid - %s is listed as an " + + "alias more than once.", otherId)); + } + } + } + + private boolean isPropertyValid(String property, String[] allowedProperties) { + + // is it a special property that is always allowed? + for (String allowedProperty : specialProperties) { + if (property.equals(allowedProperty)) + return true; + } + + // is it in the allowed properties list? + for (String allowedProperty : allowedProperties) { + if (property.equals(allowedProperty)) + return true; + } + + return false; + } + + private boolean isRelationshipValid(String relationship, String[] allowedRelationships) { + + // is it a special relationship that is always allowed? 
+ for (String allowedRelationship : specialRelationships) { + if (relationship.equals(allowedRelationship)) + return true; + } + + // is it in the allowed relationships list? + for (String allowedRelationship : allowedRelationships) { + if (relationship.equals(allowedRelationship)) + return true; + } + + return false; + } + + /** + * Saves a property on a sample, converting it to the correct type first + * A "gender" value of male, female or unknown (case-insensitive) is converted to the matching Sample.Gender; + * any other gender string is rejected with a StingException + * @param sample sample the property is stored on + * @param key property key + * @param value property value, as read from YAML parser + */ + private void saveProperty(Sample sample, String key, Object value) { + + // convert gender to the right type, if it was stored as a String + if (key.equals("gender")) { + if (((String) value).toLowerCase().equals("male")) { + value = Sample.Gender.MALE; + } + else if (((String) value).toLowerCase().equals("female")) { + value = Sample.Gender.FEMALE; + } + else if (((String) value).toLowerCase().equals("unknown")) { + value = Sample.Gender.UNKNOWN; + } + else if (value != null) { + throw new StingException("'gender' property must be male, female, or unknown."); + } + } + // store the (possibly converted) value; it must not be reset here or the gender conversion above is lost + sample.setProperty(key, value); + } + + /** + * Saves a relationship as the correct type + * @param key relationship key + * @param relativeId sample ID string of the relative + * @return relationship value to be stored + */ + private void saveRelationship(Sample sample, String key, String relativeId) { + + // get the reference that we'll store as the value + Sample relative = getSampleById(relativeId); + + // create sample object for the relative, if necessary + if (relative == null) { + relative = new Sample(relativeId); + addSample(relative); + } + sample.setRelationship(key, relative); + } + + + + /** + * Filter a sample name in case it is an alias + * @param sampleId to be filtered + * @return ID of sample that stores data for this alias + */ + private String aliasFilter(String sampleId) { + if (!sampleAliases.containsKey(sampleId)) + return sampleId; + else + return 
sampleAliases.get(sampleId); + } + + /** + * Add a sample to the collection + * @param sample to be added + */ + private void addSample(Sample sample) { + samples.put(sample.getId(), sample); + } + + /** + * Check if sample with this ID exists + * Note that this will return true if name passed in is an alias + * @param id ID of sample to be checked + * @return true if sample exists; false if not + */ + public boolean hasSample(String id) { + return samples.get(aliasFilter(id)) != null; + } + + /** + * Get a sample by its ID + * If an alias is passed in, return the main sample object + * @param id + * @return sample Object with this ID + */ + public Sample getSampleById(String id) { + return samples.get(aliasFilter(id)); + } + + /** + * Get the sample for a given read group + * Must first look up ID for read group + * @param readGroup of sample + * @return sample object with ID from the read group + */ + public Sample getSampleByReadGroup(SAMReadGroupRecord readGroup) { + String nameFromReadGroup = readGroup.getSample(); + return getSampleById(nameFromReadGroup); + } + + /** + * Get a sample for a given read + * Must first look up read group, and then sample ID for that read group + * @param read of sample + * @return sample object of this read + */ + public Sample getSampleByRead(SAMRecord read) { + return getSampleByReadGroup(read.getReadGroup()); + } + + /** + * Get number of sample objects + * @return size of samples map + */ + public int sampleCount() { + return samples.size(); + } + + /** + * Return all samples with a given family ID + * Note that this isn't terribly efficient (linear) - it may be worth adding a new family ID data structure for this + * @param familyId + * @return + */ + public Set getFamily(String familyId) { + HashSet familyMembers = new HashSet(); + + for (Sample sample : samples.values()) { + if (sample.getFamilyId() != null) { + if (sample.getFamilyId().equals(familyId)) + familyMembers.add(sample); + } + } + return familyMembers; + } + + 
/** + * Returns all children of a given sample + * See note on the efficiency of getFamily() - since this depends on getFamily() it's also not efficient + * @param sample + * @return + */ + public Set getChildren(Sample sample) { + HashSet children = new HashSet(); + for (Sample familyMember : getFamily(sample.getFamilyId())) { + if (familyMember.getMother() == sample || familyMember.getFather() == sample) { + children.add(familyMember); + } + } + return children; + } + + +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleFileParser.java b/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleFileParser.java new file mode 100644 index 000000000..a362af663 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleFileParser.java @@ -0,0 +1,65 @@ +package org.broadinstitute.sting.gatk.datasources.sample; + +/** + * Created by IntelliJ IDEA. + * User: brett + * Date: Aug 12, 2010 + * Time: 1:30:44 PM + */ +public class SampleFileParser { + + private SampleAlias[] sampleAliases; + + private String[] allowedProperties; + + private String[] allowedRelationships; + + private PropertyDefinition[] propertyDefinitions; + + private SampleParser[] samples; + + public PropertyDefinition[] getPropertyDefinitions() { + return propertyDefinitions; + } + + public void setPropertyDefinitions(PropertyDefinition[] propertyDefinitions) { + this.propertyDefinitions = propertyDefinitions; + } + + public SampleFileParser() { + + } + + public String[] getAllowedProperties() { + return allowedProperties; + } + + public void setAllowedProperties(String[] allowedProperties) { + this.allowedProperties = allowedProperties; + } + + public SampleParser[] getSamples() { + return samples; + } + + public void setSamples(SampleParser[] samples) { + this.samples = samples; + } + + public String[] getAllowedRelationships() { + return allowedRelationships; + } + + public void setAllowedRelationships(String[] 
allowedRelationships) { + this.allowedRelationships = allowedRelationships; + } + + public SampleAlias[] getSampleAliases() { + return sampleAliases; + } + + public void setSampleAliases(SampleAlias[] sampleAliases) { + this.sampleAliases = sampleAliases; + } + +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleParser.java b/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleParser.java new file mode 100644 index 000000000..f5e07ca29 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleParser.java @@ -0,0 +1,43 @@ +package org.broadinstitute.sting.gatk.datasources.sample; + +import java.util.HashMap; + +/** + * Created by IntelliJ IDEA. + * User: brett + * Date: Aug 13, 2010 + * Time: 2:09:43 PM + */ +public class SampleParser { + + private String id; + + private HashMap properties; + + private HashMap relationships; + + public String getId() { + return id; + } + + public void setId(String id) { + this.id = id; + } + + public HashMap getProperties() { + return properties; + } + + public void setProperties(HashMap properties) { + this.properties = properties; + } + + public HashMap getRelationships() { + return relationships; + } + + public void setRelationships(HashMap relationships) { + this.relationships = relationships; + } + +} diff --git a/java/src/org/broadinstitute/sting/playground/sample/CountLociByPopulationWalker.java b/java/src/org/broadinstitute/sting/playground/sample/CountLociByPopulationWalker.java new file mode 100644 index 000000000..905bb9bf9 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/sample/CountLociByPopulationWalker.java @@ -0,0 +1,53 @@ +package org.broadinstitute.sting.playground.sample; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; 
+import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.walkers.TreeReducible; + +import java.util.ArrayList; +import java.util.HashMap; + +/** + * Walks over the input data set, calculating the total number of covered loci for diagnostic purposes. + * Simplest example of a locus walker. + */ +public class CountLociByPopulationWalker extends LocusWalker implements TreeReducible { + /** + * Counts the reads in the pileup at this locus per sample population and prints the counts to standard output + * @return always 1, so that the reduce steps count the loci visited + */ + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + + HashMap count = new HashMap(); + + ArrayList reads = (ArrayList) context.getBasePileup().getReads(); + + for (SAMRecord read : reads) { + String population = getToolkit().getSampleByRead(read).getPopulation(); + // start a new population at zero; the increment below then counts this read exactly once + if (!count.containsKey(population)) { + count.put(population, 0); + } + count.put(population, count.get(population) + 1); + } + + System.out.println("\n\n\n***** LOCUS: " + ref.toString() + " *****"); + for (String population : count.keySet()) { + System.out.println(String.format("%s | %d\n", population, count.get(population))); + } + + return 1; + } + + public Long reduceInit() { return 0l; } + + public Long reduce(Integer value, Long sum) { + return value + sum; + } + + /** + * Reduces two subtrees together. In this case, the implementation of the tree reduce + * is exactly the same as the implementation of the single reduce. 
+ */ + public Long treeReduce(Long lhs, Long rhs) { + return lhs + rhs; + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/sample/CountMalesWalker.java b/java/src/org/broadinstitute/sting/playground/sample/CountMalesWalker.java new file mode 100644 index 000000000..20cb7fe3d --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/sample/CountMalesWalker.java @@ -0,0 +1,28 @@ +package org.broadinstitute.sting.playground.sample; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.datasources.sample.Sample; +import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.gatk.walkers.Requires; + +/** + * Walks over the input data set, calculating the number of reads seen for diagnostic purposes. + * Can also count the number of reads matching a given criterion using read filters (see the + * --read-filter command line argument). Simplest example of a read-backed analysis. + */ +@Requires({DataSource.READS, DataSource.REFERENCE}) +public class CountMalesWalker extends ReadWalker { + public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker tracker) { + Sample sample = getToolkit().getSampleByRead(read); + return sample.isMale() ? 
1 : 0; + } + + public Integer reduceInit() { return 0; } + + public Integer reduce(Integer value, Integer sum) { + return value + sum; + } +} \ No newline at end of file diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSourceTest.java b/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSourceTest.java new file mode 100644 index 000000000..e66e6fce3 --- /dev/null +++ b/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSourceTest.java @@ -0,0 +1,154 @@ +package org.broadinstitute.sting.gatk.datasources.sample; + +import net.sf.samtools.SAMFileHeader; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.exceptions.StingException; +import org.junit.Assert; +import org.junit.Test; + +import java.io.File; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Set; + +/** + * Created by IntelliJ IDEA. + * User: brett + * Date: Sep 9, 2010 + * Time: 8:21:00 AM + */ +public class SampleDataSourceTest extends BaseTest { + + // this empty header used to instantiate sampledatasource objects + private static SAMFileHeader header = new SAMFileHeader(); + + // all the test sample files are located here + private String sampleFilesDir = validationDataLocation + "sample/"; + + // make sure samples are created from the SAM file correctly + @Test() + public void loadSAMSamplesTest() { + SampleDataSource s = new SampleDataSource(header, null); + } + + // tests that a basic sample with relationships loads correctly + // Note that this is the only test for family relationships - we may want to expand this + @Test() + public void basicLoadSampleFileTest() { + File sampleFile = new File(sampleFilesDir + "basicSampleFile.yaml"); + SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile)); + Assert.assertTrue(s.sampleCount() == 4); + Sample sampleA = s.getSampleById("sampleA"); + Sample sampleB = s.getSampleById("sampleB"); + 
Assert.assertTrue(sampleB.getMother() == sampleA); + Assert.assertTrue(s.getChildren(sampleA).contains(sampleB)); + Set family = s.getFamily("family1"); + Assert.assertTrue(family.size() == 2); + Assert.assertTrue(family.contains(sampleA)); + Assert.assertTrue(family.contains(sampleB)); + } + + // but that file should fail if it has an extra character in it... + @Test(expected = StingException.class) + public void loadInvalidSampleExtraCharText() { + File sampleFile = new File(sampleFilesDir + "invalidSyntaxExtraChar.yaml"); + SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile)); + } + + // ...or a typo... + @Test(expected = StingException.class) + public void loadInvalidSampleTypoText() { + File sampleFile = new File(sampleFilesDir + "invalidSyntaxTypo.yaml"); + SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile)); + + } + + // ...or an extra unrecognized array + @Test(expected = StingException.class) + public void loadInvalidSampleExtraArrayText() { + File sampleFile = new File(sampleFilesDir + "invalidSyntaxExtraArray.yaml"); + SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile)); + } + + // make sure aliases work + @Test(expected = StingException.class) + public void sampleAliasText() { + File sampleFile = new File(sampleFilesDir + "basicSampleFileWithAlias.yaml"); + SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile)); + // this file has two samples, but one has an alias. let's make sure that checks out... 
+ Assert.assertTrue(s.sampleCount() == 2); + Assert.assertTrue(s.getSampleById("sampleA") == s.getSampleById("sampleC")); + } + + // error is thrown if property is included that's not in properties array + @Test(expected = StingException.class) + public void unallowedPropertySampleTest() { + File sampleFile = new File(sampleFilesDir + "basicSampleFileUnallowedProperty.yaml"); + SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile)); + } + + // same as above, with relationship + @Test(expected = StingException.class) + public void unallowedRelationshipSampleTest() { + File sampleFile = new File(sampleFilesDir + "basicSampleFileUnallowedRelationship.yaml"); + SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile)); + } + + // two sample files + @Test() + public void twoSampleFilesTest() { + File sampleFile = new File(sampleFilesDir + "basicSampleFile.yaml"); + File secondFile = new File(sampleFilesDir + "basicSampleFileExt.yaml"); + ArrayList files = new ArrayList(); + files.add(sampleFile); + files.add(secondFile); + SampleDataSource s = new SampleDataSource(header, files); + Assert.assertTrue(s.getSampleById("sampleA").getProperty("propC").equals("valC")); + Assert.assertTrue(s.getSampleById("sampleA").getProperty("propA").equals("valA")); + } + + // two sample files, with contradictory properties + @Test(expected = StingException.class) + public void twoContradictorySampleFilesTest() { + File sampleFile = new File(sampleFilesDir + "basicSampleFile.yaml"); + File secondFile = new File(sampleFilesDir + "basicSampleFileInvalidExt.yaml"); + ArrayList files = new ArrayList(); + files.add(sampleFile); + files.add(secondFile); + SampleDataSource s = new SampleDataSource(header, files); + } + + // three sample files + @Test() + public void threeSamplesTest() { + File sampleFile = new File(sampleFilesDir + "basicSampleFile.yaml"); + ArrayList files = new ArrayList(); + files.add(sampleFile); + files.add(new File(sampleFilesDir + 
"basicSampleFileExt.yaml")); + files.add(new File(sampleFilesDir + "basicSampleFileExt2.yaml")); + SampleDataSource s = new SampleDataSource(header, files); + Assert.assertTrue(s.sampleCount() == 5); + Assert.assertTrue(s.getSampleById("sampleE").getProperty("propC").equals("valC")); + Assert.assertTrue(s.getSampleById("sampleA").getProperty("propA").equals("valA")); + } + + // make sure we can import data types other than Strings + @Test() + public void sampleTestPropertyType() { + File sampleFile = new File(sampleFilesDir + "sampleFileOtherTypes.yaml"); + SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile)); + Sample sample = s.getSampleById("sampleA"); + Assert.assertTrue(sample.getProperty("a").getClass() == Integer.class); + Assert.assertTrue(sample.getProperty("b").getClass() == String.class); + Assert.assertTrue(sample.getProperty("c").getClass() == Double.class); + Assert.assertTrue(sample.getProperty("b").getClass() == String.class); + } + + + // we create lots of single item lists... + private ArrayList makeFileList(File file) { + ArrayList a = new ArrayList(); + a.add(file); + return a; + } +} diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleTest.java b/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleTest.java new file mode 100644 index 000000000..ce73103e8 --- /dev/null +++ b/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleTest.java @@ -0,0 +1,63 @@ +package org.broadinstitute.sting.gatk.datasources.sample; + +import org.broadinstitute.sting.BaseTest; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; + +/** + * Created by IntelliJ IDEA. 
+ * User: brett + * Date: Sep 9, 2010 + * Time: 8:21:00 AM + */ +public class SampleTest extends BaseTest { + + static Sample sampleA; + static Sample sampleA1; + static Sample sampleB; + static Sample sampleC; + + @BeforeClass + public static void init() { + sampleA = new Sample("sampleA"); + sampleA.setProperty("uniqueProperty", "uniqueValue"); + sampleA1 = new Sample("sampleA"); + sampleA1.setProperty("uniqueProperty", "uniqueValue"); + sampleB = new Sample("sampleB"); + sampleC = new Sample("sampleC"); + sampleC.setProperty("population", "pop1"); + sampleC.setProperty("gender", Sample.Gender.MALE); + } + + /** + * Testing equality + */ + @Test() + public void equalsTest() { + Assert.assertTrue(sampleA.equals(sampleA1)); + Assert.assertFalse(sampleA == sampleA1); + Assert.assertFalse(sampleA.equals(sampleB)); + } + + /** + * And hash + */ + @Test() + public void basicHashTest() { + Assert.assertFalse(sampleA.hashCode() == sampleB.hashCode()); + Assert.assertTrue(sampleA.hashCode() == sampleA1.hashCode()); + } + + /** + * Now test the special getter methods + */ + @Test() + public void specialGettersTest() { + Assert.assertTrue(sampleC.getId().equals("sampleC")); + Assert.assertTrue(sampleC.getPopulation().equals("pop1")); + Assert.assertTrue(sampleC.isMale()); + Assert.assertFalse(sampleA.isMale()); // sample A doesn't have a gender, so this should be false + } + +} \ No newline at end of file