diff --git a/ivy.xml b/ivy.xml
index 9bb0b1f03..c62acf8c8 100644
--- a/ivy.xml
+++ b/ivy.xml
@@ -17,6 +17,7 @@
+
diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
index 9ed27d046..c88de62c9 100755
--- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
+++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
@@ -30,6 +30,9 @@ import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.samtools.*;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
+import org.broadinstitute.sting.gatk.datasources.sample.Sample;
+import org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource;
+import org.broadinstitute.sting.gatk.datasources.sample.SampleFileParser;
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
@@ -76,6 +79,11 @@ public class GenomeAnalysisEngine {
*/
private ReferenceDataSource referenceDataSource = null;
+ /**
+ * Accessor for sample metadata
+ */
+ private SampleDataSource sampleDataSource = null;
+
/**
* Accessor for sharded reference-ordered data.
*/
@@ -388,6 +396,10 @@ public class GenomeAnalysisEngine {
rodDataSources = getReferenceOrderedDataSources(my_walker, tracks);
}
+ private void initializeSampleDataSource() { // NOTE(review): no call site is visible in this diff - confirm it runs before the sample accessors are used
+ this.sampleDataSource = new SampleDataSource(getSAMFileHeader(), argCollection.sampleFiles); // SAM header samples plus the sample_metadata files
+ }
+
/**
* setup a microscheduler
*
@@ -966,5 +978,64 @@ public class GenomeAnalysisEngine {
}
}
return unpackedReads;
- }
+ }
+
+ /**
+ * Get a sample by its ID
+ * If an alias is passed in, return the main sample object
+ * @param id sample ID, or an alias of one, to look up in the sample data source
+ * @return sample Object with this ID
+ */
+ public Sample getSampleById(String id) {
+ return sampleDataSource.getSampleById(id);
+ }
+
+ /**
+ * Get the sample for a given read group
+ * Must first look up ID for read group
+ * @param readGroup read group whose sample name is used as the sample ID
+ * @return sample object with ID from the read group
+ */
+ public Sample getSampleByReadGroup(SAMReadGroupRecord readGroup) {
+ return sampleDataSource.getSampleByReadGroup(readGroup);
+ }
+
+ /**
+ * Get a sample for a given read
+ * Must first look up read group, and then sample ID for that read group
+ * @param read read whose read group is used to find the sample
+ * @return sample object of this read
+ */
+ public Sample getSampleByRead(SAMRecord read) {
+ return getSampleByReadGroup(read.getReadGroup()); // NOTE(review): presumably getReadGroup() is null for reads without a read group - confirm
+ }
+
+ /**
+ * Get number of sample objects
+ * @return number of Sample objects currently held by the sample data source
+ */
+ public int sampleCount() {
+ return sampleDataSource.sampleCount();
+ }
+
+ /**
+ * Return all samples with a given family ID
+ * Note that this isn't terribly efficient (linear) - it may be worth adding a new family ID data structure for this
+ * @param familyId family ID to match against the "familyId" property of every sample
+ * @return the samples whose family ID equals familyId
+ */
+ public Set getFamily(String familyId) {
+ return sampleDataSource.getFamily(familyId);
+ }
+
+ /**
+ * Returns all children of a given sample
+ * See note on the efficiency of getFamily() - since this depends on getFamily() it's also not efficient
+ * @param sample prospective parent
+ * @return the members of the sample's family whose mother or father is this sample
+ */
+ public Set getChildren(Sample sample) {
+ return sampleDataSource.getChildren(sample);
+ }
+
}
diff --git a/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
index 6abd8c209..b73f6a7ee 100755
--- a/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
+++ b/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
@@ -70,6 +70,11 @@ public class GATKArgumentCollection {
@Input(fullName = "input_file", shortName = "I", doc = "SAM or BAM file(s)", required = false)
public List samFiles = new ArrayList();
+ // parameters and their defaults
+ @ElementList(required = false)
+ @Argument(fullName = "sample_metadata", shortName = "SM", doc = "Sample file(s) in JSON format", required = false)
+ public List sampleFiles = new ArrayList(); // handed to SampleDataSource, which reads each file with a YAML loader
+
@Element(required = false)
@Argument(fullName = "read_buffer_size", shortName = "rbs", doc="Number of reads per SAM file to buffer in memory", required = false)
public Integer readBufferSize = null;
diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/sample/PropertyDefinition.java b/java/src/org/broadinstitute/sting/gatk/datasources/sample/PropertyDefinition.java
new file mode 100644
index 000000000..433e0af40
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/gatk/datasources/sample/PropertyDefinition.java
@@ -0,0 +1,30 @@
+package org.broadinstitute.sting.gatk.datasources.sample;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: brett
+ * Date: Aug 12, 2010
+ * Time: 2:09:16 PM
+ */
+public class PropertyDefinition { // one "propertyDefinitions" entry of a sample file: a property and its allowed values
+
+ String property; // name of the property being restricted
+
+ String[] values; // values allowed for that property
+
+ public String getProperty() {
+ return property;
+ }
+
+ public void setProperty(String property) {
+ this.property = property;
+ }
+
+ public String[] getValues() {
+ return values;
+ }
+
+ public void setValues(String[] values) {
+ this.values = values;
+ }
+}
\ No newline at end of file
diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/sample/Sample.java b/java/src/org/broadinstitute/sting/gatk/datasources/sample/Sample.java
new file mode 100644
index 000000000..c509df317
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/gatk/datasources/sample/Sample.java
@@ -0,0 +1,190 @@
+package org.broadinstitute.sting.gatk.datasources.sample;
+
+
+import org.broadinstitute.sting.utils.exceptions.StingException;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Metadata for a single sample: a unique ID, free-form properties (such as gender, population and familyId)
+ * and named relationships (such as mother and father) that point to other Sample objects.
+ *
+ * Created by brett on Jul 26, 2010.
+ */
+public class Sample implements java.io.Serializable {
+
+    private final String id;
+
+    private boolean hasSampleFileEntry = false; // true if this sample has an entry in a sample file
+
+    private boolean hasSAMFileEntry = false; // true if this sample has an entry in the SAM file
+
+    private HashMap<String, Object> properties = new HashMap<String, Object>();
+
+    private HashMap<String, Sample> relationships = new HashMap<String, Sample>();
+
+    public enum Gender {
+        MALE,
+        FEMALE,
+        UNKNOWN
+    }
+
+    public Sample(String id) {
+        if (id == null) {
+            throw new StingException("Error creating sample: sample ID cannot be null");
+        }
+        this.id = id;
+    }
+
+    public String getId() {
+        return this.id;
+    }
+
+    public Map<String, Object> getProperties() {
+        return properties;
+    }
+
+    public void setProperties(Map<String, Object> properties) {
+        // copy instead of casting: the argument need not be a HashMap, and null simply means "no properties"
+        this.properties = (properties == null) ? new HashMap<String, Object>() : new HashMap<String, Object>(properties);
+    }
+
+    public void setSampleFileEntry(boolean value) {
+        this.hasSampleFileEntry = value;
+    }
+
+    public boolean hasSAMFileEntry() {
+        return this.hasSAMFileEntry;
+    }
+
+    public void setSAMFileEntry(boolean value) {
+        this.hasSAMFileEntry = value;
+    }
+
+    public boolean hasSampleFileEntry() {
+        return this.hasSampleFileEntry;
+    }
+
+    /**
+     * Get one property
+     * @param key key of property
+     * @return value of property as generic object, or null if the property is not set
+     */
+    public Object getProperty(String key) {
+        return properties.get(key);
+    }
+
+    /**
+     * Set a property
+     * If property already exists, it is overwritten
+     * @param key key of property; must not already be in use as a relationship key
+     * @param value object to be stored; "gender" requires a Sample.Gender and "population" a String
+     */
+    public void setProperty(String key, Object value) {
+        // properties and relationships share one key space, so a key may live in only one of the maps
+        if (relationships.containsKey(key)) {
+            throw new StingException("The same key cannot exist as a property and a relationship");
+        }
+
+        // instanceof (not getClass()) so that a null value is rejected cleanly instead of raising an NPE
+        if ("gender".equals(key) && !(value instanceof Gender)) {
+            throw new StingException("'gender' property must be of type Sample.Gender");
+        }
+
+        if ("population".equals(key) && !(value instanceof String)) {
+            throw new StingException("'population' property must be of type String");
+        }
+        properties.put(key, value);
+    }
+
+    /**
+     * Get one relationship
+     * @param key of relationship
+     * @return Sample object that this relationship points to, or null if it is not set
+     */
+    public Sample getRelationship(String key) {
+        return relationships.get(key);
+    }
+
+    /**
+     * Set one relationship
+     * If already set, it is overwritten
+     * @param key key of the relationship; must not already be in use as a property key
+     * @param value Sample object this relationship points to
+     */
+    public void setRelationship(String key, Sample value) {
+        if (properties.containsKey(key)) {
+            throw new StingException("The same key cannot exist as a property and a relationship");
+        }
+        relationships.put(key, value);
+    }
+
+    /**
+     * Get the sample's mother
+     * @return sample object with relationship mother, if exists, or null
+     */
+    public Sample getMother() {
+        return getRelationship("mother");
+    }
+
+    /**
+     * Get the sample's father
+     * @return sample object with relationship father, if exists, or null
+     */
+    public Sample getFather() {
+        return getRelationship("father");
+    }
+
+    /**
+     * Get gender of the sample
+     * @return property of key "gender" - must be of type Gender
+     */
+    public Gender getGender() {
+        return (Gender) properties.get("gender");
+    }
+
+    public String getPopulation() {
+        return (String) properties.get("population");
+    }
+
+    public String getFamilyId() {
+        return (String) properties.get("familyId");
+    }
+
+    /**
+     * @return True if sample is male, false if female, unknown, or null
+     */
+    public boolean isMale() {
+        return properties.get("gender") == Gender.MALE;
+    }
+
+    /**
+     * @return True if sample is female, false if male, unknown or null
+     */
+    public boolean isFemale() {
+        return properties.get("gender") == Gender.FEMALE;
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) return true;
+        if (o == null || getClass() != o.getClass()) return false;
+
+        Sample sample = (Sample) o;
+
+        if (hasSAMFileEntry != sample.hasSAMFileEntry) return false;
+        if (hasSampleFileEntry != sample.hasSampleFileEntry) return false;
+        if (id != null ? !id.equals(sample.id) : sample.id != null) return false;
+        if (properties != null ? !properties.equals(sample.properties) : sample.properties != null) return false;
+        if (relationships != null ? !relationships.equals(sample.relationships) : sample.relationships != null)
+            return false; // the map comparison applies this same equals() to every related sample
+
+        return true;
+    }
+
+    @Override
+    public int hashCode() {
+        return id.hashCode();
+    }
+}
diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleAlias.java b/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleAlias.java
new file mode 100644
index 000000000..ce749cb83
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleAlias.java
@@ -0,0 +1,31 @@
+package org.broadinstitute.sting.gatk.datasources.sample;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: brett
+ * Date: Aug 13, 2010
+ * Time: 5:13:46 PM
+ */
+public class SampleAlias { // one "sampleAliases" entry of a sample file: a main ID and the other IDs that refer to it
+
+ String mainId; // the primary ID
+
+ String[] otherIds; // the aliases of mainId; SampleDataSource rejects an alias that is also a main ID or is listed twice
+
+ public String getMainId() {
+ return mainId;
+ }
+
+ public void setMainId(String mainId) {
+ this.mainId = mainId;
+ }
+
+ public String[] getOtherIds() {
+ return otherIds;
+ }
+
+ public void setOtherIds(String[] otherIds) {
+ this.otherIds = otherIds;
+ }
+
+}
diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSource.java
new file mode 100644
index 000000000..960664832
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSource.java
@@ -0,0 +1,459 @@
+package org.broadinstitute.sting.gatk.datasources.sample;
+
+import net.sf.samtools.SAMFileHeader;
+import net.sf.samtools.SAMReadGroupRecord;
+import net.sf.samtools.SAMRecord;
+import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
+import org.broadinstitute.sting.utils.SampleUtils;
+import org.broadinstitute.sting.utils.exceptions.StingException;
+import org.yaml.snakeyaml.Loader;
+import org.yaml.snakeyaml.TypeDescription;
+import org.yaml.snakeyaml.Yaml;
+import org.yaml.snakeyaml.constructor.Constructor;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: brett
+ * Date: Jul 26, 2010
+ * Time: 3:30:09 PM
+ *
+ * This class stores and manages sample metadata. This data is encoded in a sample file, which can be included
+ * in the GATK by the "--sample_metadata" (-SM) argument. This class reads and parses those files.
+ *
+ * Although there is a set of public methods for accessing sample data, they aren't used by walkers - they are really
+ * only used by GenomeAnalysisEngine. An instance of GenomeAnalysisEngine has one SampleDataSource. When a walker
+ * wants to access sample data, it asks GenomeAnalysisEngine to fetch this data from its SampleDataSource.
+ *
+ */
+public class SampleDataSource {
+
+    /**
+     * This is where Sample objects are stored. Samples are usually accessed by their ID, which is unique, so
+     * this is stored as a HashMap.
+     */
+    private HashMap<String, Sample> samples = new HashMap<String, Sample>();
+
+    /**
+     * Samples can have "aliases", because sometimes the same sample is referenced by different IDs in different
+     * datasets. If this is the case, one ID is the "primary ID" and others are "aliases".
+     * This maps ID => primary ID for all samples ID strings - both primary IDs and aliases.
+     * NOTE(review): no code in this class adds entries to this map yet, so alias lookups return the ID unchanged.
+     */
+    private HashMap<String, String> sampleAliases = new HashMap<String, String>();
+
+    /**
+     * While loading sample files, we must be aware of "special" properties and relationships that are always allowed
+     */
+    public static final String[] specialProperties = new String[] {"familyId", "population", "gender"};
+    public static final String[] specialRelationships = new String[] {"mother", "father"};
+
+    /**
+     * Constructor takes both a SAM header and sample files because the two must be integrated.
+     * @param header SAMFileHeader that has been created for this analysis
+     * @param sampleFiles Sample files that were included on the command line; may be null
+     */
+    public SampleDataSource(SAMFileHeader header, List<File> sampleFiles) {
+
+        // create empty sample object for each sample referenced in the SAM header
+        getSamplesFromSAMFile(header);
+
+        // add files consecutively
+        if (sampleFiles != null) {
+            for (File file : sampleFiles) {
+                addFile(file);
+            }
+        }
+    }
+
+    /**
+     * Hallucinates sample objects for all the samples in the given SAM header and stores them.
+     * Samples that are already known (by ID or alias) are left untouched.
+     *
+     * The header is passed in explicitly rather than read from the GenomeAnalysisEngine singleton,
+     * so the data source depends only on what its constructor was given.
+     *
+     * @param header SAM header whose samples should be registered
+     */
+    private void getSamplesFromSAMFile(SAMFileHeader header) {
+        for (String sampleName : SampleUtils.getSAMFileSamples(header)) {
+            if (!hasSample(sampleName)) {
+                Sample newSample = new Sample(sampleName);
+                newSample.setSAMFileEntry(true);
+                samples.put(sampleName, newSample);
+            }
+        }
+    }
+
+    /**
+     * Parse one sample file and integrate it with samples that are already there
+     * Fail quickly if we find any errors in the file
+     */
+    private void addFile(File sampleFile) {
+        BufferedReader reader;
+        try {
+            reader = new BufferedReader(new FileReader(sampleFile));
+        }
+        catch (IOException e) {
+            throw new StingException("Could not open sample file " + sampleFile.getAbsolutePath(), e);
+        }
+
+        // set up YAML reader - a "Constructor" creates java object from YAML and "Loader" loads the file
+        Constructor con = new Constructor(SampleFileParser.class);
+        TypeDescription desc = new TypeDescription(SampleFileParser.class);
+        desc.putListPropertyType("propertyDefinitions", PropertyDefinition.class);
+        desc.putListPropertyType("sampleAliases", SampleAlias.class);
+        con.addTypeDescription(desc);
+        Loader loader = new Loader(con);
+        Yaml yaml = new Yaml(loader);
+
+        // SampleFileParser stores an object representation of a sample file - this is what we'll parse
+        SampleFileParser parser;
+        try {
+            parser = (SampleFileParser) yaml.load(reader);
+        }
+        catch (Exception e) { // TODO: should we have more granular exception here?
+            throw new StingException("There was a syntactic error with the YAML in sample file " + sampleFile.getAbsolutePath(), e);
+        }
+        finally { // parsing is over (successfully or not), so the file handle can be released
+            try { reader.close(); } catch (IOException ignored) { /* nothing useful can be done about it */ }
+        }
+
+        // an empty document loads as null, and a file without a "samples" list has nothing to integrate
+        if (parser == null || parser.getSamples() == null) {
+            throw new StingException("Sample file " + sampleFile.getAbsolutePath() + " does not define any samples");
+        }
+
+        // check to see which validation options were built into the file
+        boolean restrictProperties = parser.getAllowedProperties() != null;
+        boolean restrictRelationships = parser.getAllowedRelationships() != null;
+        boolean restrictPropertyValues = parser.getPropertyDefinitions() != null;
+
+        // propertyValues stores the values that are allowed for a given property
+        HashMap propertyValues = null;
+        if (restrictPropertyValues) {
+            propertyValues = new HashMap();
+            for (PropertyDefinition def : parser.getPropertyDefinitions()) {
+                HashSet<String> set = new HashSet<String>();
+                for (String value : def.getValues()) {
+                    set.add(value);
+                }
+                propertyValues.put(def.getProperty(), set);
+            }
+        }
+
+        // make sure the aliases are valid
+        validateAliases(parser);
+
+        // loop through each sample in the file - a SampleParser stores an object that will become a Sample
+        for (SampleParser sampleParser : parser.getSamples()) {
+            // step 1: add the sample if it doesn't already exist
+            Sample sample = getSampleById(sampleParser.getId());
+            if (sample == null) {
+                sample = new Sample(sampleParser.getId());
+            }
+            addSample(sample);
+            sample.setSampleFileEntry(true);
+
+            // step 2: add the properties
+            if (sampleParser.getProperties() != null) {
+                for (String property : sampleParser.getProperties().keySet()) {
+                    // check that property is allowed
+                    if (restrictProperties) {
+                        if (!isPropertyValid(property, parser.getAllowedProperties())) {
+                            throw new StingException(property + " is an invalid property. It is not included in the list " +
+                                    "of allowed properties.");
+                        }
+                    }
+                    // next check that the value is allowed
+                    if (restrictPropertyValues) {
+                        if (!isValueAllowed(property, sampleParser.getProperties().get(property), propertyValues)) {
+                            throw new StingException("The value of property '" + property + "' is invalid. " +
+                                    "It is not included in the list of allowed values for this property.");
+                        }
+                    }
+
+                    // next check for a conflicting value: compare by value, and remember gender is stored as an enum
+                    Object existing = sample.getProperty(property);
+                    Object incoming = sampleParser.getProperties().get(property);
+                    if (existing != null && !existing.equals(incoming)
+                            && !(existing instanceof Sample.Gender && existing.toString().equalsIgnoreCase(String.valueOf(incoming)))) {
+                        throw new StingException(property + " is a conflicting property!");
+                    }
+                    // checks are passed - now add the property!
+                    saveProperty(sample, property, incoming);
+                }
+            }
+
+            // step 3: add the relationships
+            if (sampleParser.getRelationships() != null) {
+                for (String relationship : sampleParser.getRelationships().keySet()) {
+                    String relativeId = sampleParser.getRelationships().get(relationship);
+                    if (relativeId == null) {
+                        throw new StingException("The relationship cannot be null");
+                    }
+                    // first check that it's not invalid
+                    if (restrictRelationships) {
+                        if (!isRelationshipValid(relationship, parser.getAllowedRelationships())) {
+                            throw new StingException(relationship + " is an invalid relationship");
+                        }
+                    }
+
+                    // next check that there isn't already a conflicting relationship there
+                    Sample existingRelative = sample.getRelationship(relationship);
+                    if (existingRelative != null) {
+                        if (!existingRelative.getId().equals(aliasFilter(relativeId))) {
+                            throw new StingException(relationship + " is a conflicting relationship!");
+                        }
+                        // the relationship is already set - and consistent with what we're reading now
+                        continue;
+                    }
+                    // checks are passed - now save the relationship
+                    saveRelationship(sample, relationship, relativeId);
+                }
+            }
+        }
+    }
+
+    /**
+     * Checks a property value against the values enumerated for that property, if the file enumerated any
+     */
+    private boolean isValueAllowed(String key, Object value, HashMap<String, ? extends Set> valuesList) {
+        // if the property values weren't specified for this property, then any value is okay
+        if (!valuesList.containsKey(key)) {
+            return true;
+        }
+
+        // if this property has enumerated values, it must be a string; instanceof also turns away null without an NPE
+        if (!(value instanceof String)) {
+            return false;
+        }
+
+        // is the value specified or not?
+        return valuesList.get(key).contains(value);
+    }
+
+    /**
+     * Makes sure that the aliases are valid
+     * Checks that 1) no string is used as both a main ID and an alias;
+     * 2) no alias is used more than once
+     * @param parser parsed sample file whose aliases (if any) are checked
+     */
+    private void validateAliases(SampleFileParser parser) {
+        // no aliases to validate
+        if (parser.getSampleAliases() == null)
+            return;
+        // gather every main ID up front, so that check 1 does not depend on the order of the entries
+        HashSet<String> mainIds = new HashSet<String>();
+        for (SampleAlias sampleAlias : parser.getSampleAliases())
+            mainIds.add(sampleAlias.getMainId());
+
+        HashSet<String> otherIds = new HashSet<String>();
+        for (SampleAlias sampleAlias : parser.getSampleAliases()) {
+            if (sampleAlias.getOtherIds() == null) continue; // entry that lists a main ID but no aliases
+            for (String otherId : sampleAlias.getOtherIds()) {
+                if (mainIds.contains(otherId))
+                    throw new StingException(String.format("The aliases in your sample file are invalid - the alias %s cannot " +
+                            "be both a main ID and an other ID", otherId));
+                if (!otherIds.add(otherId))
+                    throw new StingException(String.format("The aliases in your sample file are invalid - %s is listed as an " +
+                            "alias more than once.", otherId));
+            }
+        }
+    }
+
+    private boolean isPropertyValid(String property, String[] allowedProperties) {
+        // special properties are accepted no matter what the sample file allows
+        for (int i = 0; i < specialProperties.length; i++) {
+            if (property.equals(specialProperties[i])) {
+                return true;
+            }
+        }
+
+        // anything else has to be named in the file's own list of allowed properties
+        for (int i = 0; i < allowedProperties.length; i++) {
+            if (property.equals(allowedProperties[i])) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    private boolean isRelationshipValid(String relationship, String[] allowedRelationships) {
+        // special relationships are accepted no matter what the sample file allows
+        for (int i = 0; i < specialRelationships.length; i++) {
+            if (relationship.equals(specialRelationships[i])) {
+                return true;
+            }
+        }
+
+        // anything else has to be named in the file's own list of allowed relationships
+        for (int i = 0; i < allowedRelationships.length; i++) {
+            if (relationship.equals(allowedRelationships[i])) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Saves a property as the correct type
+     * @param sample sample that receives the property
+     * @param key property key
+     * @param value property value, as read from YAML parser
+     */
+    private void saveProperty(Sample sample, String key, Object value) {
+        // convert gender to the right type, if it was stored as a String
+        if (key.equals("gender") && !(value instanceof Sample.Gender)) {
+            // anything that is not a String (null included) cannot name a gender and is rejected below
+            String gender = (value instanceof String) ? ((String) value).toLowerCase() : null;
+            if ("male".equals(gender)) {
+                value = Sample.Gender.MALE;
+            }
+            else if ("female".equals(gender)) {
+                value = Sample.Gender.FEMALE;
+            }
+            else if ("unknown".equals(gender)) {
+                value = Sample.Gender.UNKNOWN;
+            }
+            else {
+                throw new StingException("'gender' property must be male, female, or unknown.");
+            }
+        }
+        sample.setProperty(key, value);
+    }
+
+ /**
+ * Saves a relationship as a reference to the relative's Sample object, creating that sample if it is not yet known
+ * @param sample sample that receives the relationship
+ * @param key relationship key
+ * @param relativeId sample ID string of the relative
+ */
+ private void saveRelationship(Sample sample, String key, String relativeId) {
+
+ // get the reference that we'll store as the value
+ Sample relative = getSampleById(relativeId);
+
+ // create sample object for the relative, if necessary
+ if (relative == null) {
+ relative = new Sample(relativeId);
+ addSample(relative);
+ }
+ sample.setRelationship(key, relative);
+ }
+
+
+
+    /**
+     * Filter a sample name in case it is an alias
+     * @param sampleId to be filtered
+     * @return ID of sample that stores data for this alias; sampleId itself when it has no entry in the alias map
+     */
+    private String aliasFilter(String sampleId) {
+        // only IDs registered in the alias map are translated
+        boolean registered = sampleAliases.containsKey(sampleId);
+        String primaryId = registered ? sampleAliases.get(sampleId) : sampleId;
+        return primaryId;
+    }
+
+ /**
+ * Add a sample to the collection, keyed by its ID; a sample already stored under that ID is replaced
+ * @param sample to be added
+ */
+ private void addSample(Sample sample) {
+ samples.put(sample.getId(), sample);
+ }
+
+    /**
+     * Tell whether a sample is stored under this ID
+     * An alias counts as well, because the lookup resolves aliases first
+     * @param id ID of sample to be checked
+     * @return true if sample exists; false if not
+     */
+    public boolean hasSample(String id) {
+        return getSampleById(id) != null;
+    }
+
+ /**
+ * Get a sample by its ID
+ * If an alias is passed in, return the main sample object
+ * @param id sample ID or alias; it is passed through aliasFilter() before the lookup
+ * @return sample Object with this ID
+ */
+ public Sample getSampleById(String id) {
+ return samples.get(aliasFilter(id));
+ }
+
+    /**
+     * Get the sample for a given read group
+     * Must first look up ID for read group
+     * @param readGroup of sample; may be null
+     * @return sample object with ID from the read group, or null if readGroup is null or names no known sample
+     */
+    public Sample getSampleByReadGroup(SAMReadGroupRecord readGroup) {
+        if (readGroup == null) { return null; } // nothing to look up; getSampleByRead() forwards read.getReadGroup() unchecked
+        return getSampleById(readGroup.getSample());
+    }
+
+ /**
+ * Get a sample for a given read
+ * Must first look up read group, and then sample ID for that read group
+ * @param read read whose read group is used to find the sample
+ * @return sample object of this read
+ */
+ public Sample getSampleByRead(SAMRecord read) {
+ return getSampleByReadGroup(read.getReadGroup()); // NOTE(review): presumably getReadGroup() is null for reads without a read group - confirm
+ }
+
+ /**
+ * Get number of sample objects (from the SAM header, from sample files, and relatives created while loading them)
+ * @return size of samples map
+ */
+ public int sampleCount() {
+ return samples.size();
+ }
+
+    /**
+     * Return all samples with a given family ID
+     * Note that this isn't terribly efficient (linear) - it may be worth adding a new family ID data structure for this
+     * @param familyId family ID to look for; compared against each sample's "familyId" property
+     * @return set of matching samples; empty (never null) if no sample has this family ID
+     */
+    public Set<Sample> getFamily(String familyId) {
+        HashSet<Sample> familyMembers = new HashSet<Sample>();
+
+        for (Sample sample : samples.values()) {
+            // samples without a family ID never match, not even when familyId is null
+            if (sample.getFamilyId() != null && sample.getFamilyId().equals(familyId)) {
+                familyMembers.add(sample);
+            }
+        }
+        return familyMembers;
+    }
+
+    /**
+     * Returns all children of a given sample
+     * See note on the efficiency of getFamily() - since this depends on getFamily() it's also not efficient
+     * @param sample prospective parent; only samples sharing its family ID are considered
+     * @return set of family members whose mother or father is this very Sample object; empty if there are none
+     */
+    public Set<Sample> getChildren(Sample sample) {
+        HashSet<Sample> children = new HashSet<Sample>();
+        for (Sample familyMember : getFamily(sample.getFamilyId())) {
+            if (familyMember.getMother() == sample || familyMember.getFather() == sample) {
+                children.add(familyMember);
+            }
+        }
+        return children;
+    }
+
+
+}
\ No newline at end of file
diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleFileParser.java b/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleFileParser.java
new file mode 100644
index 000000000..a362af663
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleFileParser.java
@@ -0,0 +1,65 @@
+package org.broadinstitute.sting.gatk.datasources.sample;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: brett
+ * Date: Aug 12, 2010
+ * Time: 1:30:44 PM
+ */
+public class SampleFileParser { // object form of one sample file; SampleDataSource loads it with a YAML parser
+
+ private SampleAlias[] sampleAliases; // alias declarations; may be null, in which case alias validation is skipped
+
+ private String[] allowedProperties; // when non-null, only these (plus the special) properties are accepted
+
+ private String[] allowedRelationships; // when non-null, only these (plus the special) relationships are accepted
+
+ private PropertyDefinition[] propertyDefinitions; // when non-null, restricts the values of the listed properties
+
+ private SampleParser[] samples; // the sample entries themselves
+
+ public PropertyDefinition[] getPropertyDefinitions() {
+ return propertyDefinitions;
+ }
+
+ public void setPropertyDefinitions(PropertyDefinition[] propertyDefinitions) {
+ this.propertyDefinitions = propertyDefinitions;
+ }
+
+ public SampleFileParser() {
+
+ }
+
+ public String[] getAllowedProperties() {
+ return allowedProperties;
+ }
+
+ public void setAllowedProperties(String[] allowedProperties) {
+ this.allowedProperties = allowedProperties;
+ }
+
+ public SampleParser[] getSamples() {
+ return samples;
+ }
+
+ public void setSamples(SampleParser[] samples) {
+ this.samples = samples;
+ }
+
+ public String[] getAllowedRelationships() {
+ return allowedRelationships;
+ }
+
+ public void setAllowedRelationships(String[] allowedRelationships) {
+ this.allowedRelationships = allowedRelationships;
+ }
+
+ public SampleAlias[] getSampleAliases() {
+ return sampleAliases;
+ }
+
+ public void setSampleAliases(SampleAlias[] sampleAliases) {
+ this.sampleAliases = sampleAliases;
+ }
+
+}
\ No newline at end of file
diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleParser.java b/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleParser.java
new file mode 100644
index 000000000..f5e07ca29
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleParser.java
@@ -0,0 +1,43 @@
+package org.broadinstitute.sting.gatk.datasources.sample;
+
+import java.util.HashMap;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: brett
+ * Date: Aug 13, 2010
+ * Time: 2:09:43 PM
+ */
+public class SampleParser { // one sample entry of a sample file, before it is turned into a Sample
+
+    private String id; // sample ID (or alias) as written in the file
+
+    private HashMap<String, Object> properties; // property name => value as parsed; null if the entry has none
+
+    private HashMap<String, String> relationships; // relationship name => sample ID of the relative; null if none
+
+    public String getId() {
+        return id;
+    }
+
+    public void setId(String id) {
+        this.id = id;
+    }
+
+    public HashMap<String, Object> getProperties() {
+        return properties;
+    }
+
+    public void setProperties(HashMap<String, Object> properties) {
+        this.properties = properties;
+    }
+
+    public HashMap<String, String> getRelationships() {
+        return relationships;
+    }
+
+    public void setRelationships(HashMap<String, String> relationships) {
+        this.relationships = relationships;
+    }
+
+}
diff --git a/java/src/org/broadinstitute/sting/playground/sample/CountLociByPopulationWalker.java b/java/src/org/broadinstitute/sting/playground/sample/CountLociByPopulationWalker.java
new file mode 100644
index 000000000..905bb9bf9
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/playground/sample/CountLociByPopulationWalker.java
@@ -0,0 +1,53 @@
+package org.broadinstitute.sting.playground.sample;
+
+import net.sf.samtools.SAMRecord;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.LocusWalker;
+import org.broadinstitute.sting.gatk.walkers.TreeReducible;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+
+/**
+ * Walks over the input data set and, at every locus, prints the number of reads per sample population.
+ * Each locus maps to 1, so the reduce result is the number of loci visited.
+ */
+public class CountLociByPopulationWalker extends LocusWalker<Integer, Long> implements TreeReducible<Long> {
+    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+
+        HashMap<String, Integer> count = new HashMap<String, Integer>();
+
+        ArrayList<SAMRecord> reads = (ArrayList<SAMRecord>) context.getBasePileup().getReads();
+
+        for (SAMRecord read : reads) {
+            String population = getToolkit().getSampleByRead(read).getPopulation();
+
+            // the first read of a population starts its tally at 1; every later read adds one to it
+            Integer seen = count.get(population);
+            count.put(population, seen == null ? 1 : seen + 1);
+        }
+
+        System.out.println("\n\n\n***** LOCUS: " + ref.toString() + " *****");
+        for (String population : count.keySet()) {
+            System.out.println(String.format("%s | %d\n", population, count.get(population)));
+        }
+
+        return 1;
+    }
+
+    public Long reduceInit() { return 0L; }
+
+    public Long reduce(Integer value, Long sum) {
+        return value + sum;
+    }
+
+    /**
+     * Reduces two subtrees together. In this case, the implementation of the tree reduce
+     * is exactly the same as the implementation of the single reduce.
+     */
+    public Long treeReduce(Long lhs, Long rhs) {
+        return lhs + rhs;
+    }
+}
\ No newline at end of file
diff --git a/java/src/org/broadinstitute/sting/playground/sample/CountMalesWalker.java b/java/src/org/broadinstitute/sting/playground/sample/CountMalesWalker.java
new file mode 100644
index 000000000..20cb7fe3d
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/playground/sample/CountMalesWalker.java
@@ -0,0 +1,28 @@
+package org.broadinstitute.sting.playground.sample;
+
+import net.sf.samtools.SAMRecord;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.datasources.sample.Sample;
+import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.DataSource;
+import org.broadinstitute.sting.gatk.walkers.ReadWalker;
+import org.broadinstitute.sting.gatk.walkers.Requires;
+
+/**
+ * Example read walker that counts the reads whose sample is reported as male by the sample metadata.
+ * Declares that it requires both reads and a reference.
+ */
+@Requires({DataSource.READS, DataSource.REFERENCE})
+public class CountMalesWalker extends ReadWalker<Integer, Integer> {
+    /**
+     * Scores a single read by the gender of the sample it belongs to.
+     *
+     * @param ref     reference context for the read; not used here
+     * @param read    the read whose sample is looked up through the toolkit
+     * @param tracker metadata tracker for the read; not used here
+     * @return 1 if the read's sample reports itself as male, 0 otherwise
+     */
+    public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker tracker) {
+        // NOTE(review): the isMale() call fails if the toolkit has no sample for the read --
+        // confirm that getSampleByRead() cannot return null.
+        Sample sample = getToolkit().getSampleByRead(read);
+        return sample.isMale() ? 1 : 0;
+    }
+
+    /**
+     * @return the starting value of the count, zero
+     */
+    public Integer reduceInit() { return 0; }
+
+    /**
+     * Adds the score of one read to the running count.
+     *
+     * @param value result of a single map call (0 or 1)
+     * @param sum   count accumulated so far
+     * @return the updated count
+     */
+    public Integer reduce(Integer value, Integer sum) {
+        return value + sum;
+    }
+}
\ No newline at end of file
diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSourceTest.java b/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSourceTest.java
new file mode 100644
index 000000000..e66e6fce3
--- /dev/null
+++ b/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSourceTest.java
@@ -0,0 +1,154 @@
+package org.broadinstitute.sting.gatk.datasources.sample;
+
+import net.sf.samtools.SAMFileHeader;
+import org.broadinstitute.sting.BaseTest;
+import org.broadinstitute.sting.utils.exceptions.StingException;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Unit tests for building a {@link SampleDataSource} from a SAM header and YAML sample files.
+ *
+ * Created by IntelliJ IDEA.
+ * User: brett
+ * Date: Sep 9, 2010
+ * Time: 8:21:00 AM
+ */
+public class SampleDataSourceTest extends BaseTest {
+
+    // this empty header is used to instantiate SampleDataSource objects
+    private static final SAMFileHeader header = new SAMFileHeader();
+
+    // all the test sample files are located here
+    private final String sampleFilesDir = validationDataLocation + "sample/";
+
+    // a data source can be built from the SAM header alone, with no sample files;
+    // the test passes as long as construction does not throw
+    @Test()
+    public void loadSAMSamplesTest() {
+        new SampleDataSource(header, null);
+    }
+
+    // tests that a basic sample with relationships loads correctly
+    // Note that this is the only test for family relationships - we may want to expand this
+    @Test()
+    public void basicLoadSampleFileTest() {
+        File sampleFile = new File(sampleFilesDir + "basicSampleFile.yaml");
+        SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile));
+        Assert.assertTrue("unexpected sample count: " + s.sampleCount(), s.sampleCount() == 4);
+        Sample sampleA = s.getSampleById("sampleA");
+        Sample sampleB = s.getSampleById("sampleB");
+        // the mother must be the very same object that the data source hands out for sampleA
+        Assert.assertSame(sampleA, sampleB.getMother());
+        Assert.assertTrue(s.getChildren(sampleA).contains(sampleB));
+        Set<Sample> family = s.getFamily("family1");
+        Assert.assertEquals(2, family.size());
+        Assert.assertTrue(family.contains(sampleA));
+        Assert.assertTrue(family.contains(sampleB));
+    }
+
+    // but that file should fail if it has an extra character in it...
+    @Test(expected = StingException.class)
+    public void loadInvalidSampleExtraCharText() {
+        File sampleFile = new File(sampleFilesDir + "invalidSyntaxExtraChar.yaml");
+        new SampleDataSource(header, makeFileList(sampleFile));
+    }
+
+    // ...or a typo...
+    @Test(expected = StingException.class)
+    public void loadInvalidSampleTypoText() {
+        File sampleFile = new File(sampleFilesDir + "invalidSyntaxTypo.yaml");
+        new SampleDataSource(header, makeFileList(sampleFile));
+    }
+
+    // ...or an extra unrecognized array
+    @Test(expected = StingException.class)
+    public void loadInvalidSampleExtraArrayText() {
+        File sampleFile = new File(sampleFilesDir + "invalidSyntaxExtraArray.yaml");
+        new SampleDataSource(header, makeFileList(sampleFile));
+    }
+
+    // make sure aliases work
+    // NOTE(review): this test expects a StingException, yet the assertions below describe a
+    // successful load; if the exception comes from the constructor they never run. Confirm
+    // against basicSampleFileWithAlias.yaml whether the expectation is intended.
+    @Test(expected = StingException.class)
+    public void sampleAliasText() {
+        File sampleFile = new File(sampleFilesDir + "basicSampleFileWithAlias.yaml");
+        SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile));
+        // this file has two samples, but one has an alias. let's make sure that checks out...
+        Assert.assertTrue("unexpected sample count: " + s.sampleCount(), s.sampleCount() == 2);
+        Assert.assertSame(s.getSampleById("sampleA"), s.getSampleById("sampleC"));
+    }
+
+    // error is thrown if property is included that's not in properties array
+    @Test(expected = StingException.class)
+    public void unallowedPropertySampleTest() {
+        File sampleFile = new File(sampleFilesDir + "basicSampleFileUnallowedProperty.yaml");
+        new SampleDataSource(header, makeFileList(sampleFile));
+    }
+
+    // same as above, with relationship
+    @Test(expected = StingException.class)
+    public void unallowedRelationshipSampleTest() {
+        File sampleFile = new File(sampleFilesDir + "basicSampleFileUnallowedRelationship.yaml");
+        new SampleDataSource(header, makeFileList(sampleFile));
+    }
+
+    // two sample files: properties of the same sample must be readable after loading both
+    @Test()
+    public void twoSampleFilesTest() {
+        ArrayList<File> files = new ArrayList<File>();
+        files.add(new File(sampleFilesDir + "basicSampleFile.yaml"));
+        files.add(new File(sampleFilesDir + "basicSampleFileExt.yaml"));
+        SampleDataSource s = new SampleDataSource(header, files);
+        // assertEquals reports a missing property as a failure instead of a NullPointerException
+        Assert.assertEquals("valC", s.getSampleById("sampleA").getProperty("propC"));
+        Assert.assertEquals("valA", s.getSampleById("sampleA").getProperty("propA"));
+    }
+
+    // two sample files, with contradictory properties
+    @Test(expected = StingException.class)
+    public void twoContradictorySampleFilesTest() {
+        ArrayList<File> files = new ArrayList<File>();
+        files.add(new File(sampleFilesDir + "basicSampleFile.yaml"));
+        files.add(new File(sampleFilesDir + "basicSampleFileInvalidExt.yaml"));
+        new SampleDataSource(header, files);
+    }
+
+    // three sample files
+    @Test()
+    public void threeSamplesTest() {
+        ArrayList<File> files = new ArrayList<File>();
+        files.add(new File(sampleFilesDir + "basicSampleFile.yaml"));
+        files.add(new File(sampleFilesDir + "basicSampleFileExt.yaml"));
+        files.add(new File(sampleFilesDir + "basicSampleFileExt2.yaml"));
+        SampleDataSource s = new SampleDataSource(header, files);
+        Assert.assertTrue("unexpected sample count: " + s.sampleCount(), s.sampleCount() == 5);
+        Assert.assertEquals("valC", s.getSampleById("sampleE").getProperty("propC"));
+        Assert.assertEquals("valA", s.getSampleById("sampleA").getProperty("propA"));
+    }
+
+    // make sure we can import data types other than Strings
+    @Test()
+    public void sampleTestPropertyType() {
+        File sampleFile = new File(sampleFilesDir + "sampleFileOtherTypes.yaml");
+        SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile));
+        Sample sample = s.getSampleById("sampleA");
+        Assert.assertEquals(Integer.class, sample.getProperty("a").getClass());
+        Assert.assertEquals(String.class, sample.getProperty("b").getClass());
+        Assert.assertEquals(Double.class, sample.getProperty("c").getClass());
+    }
+
+
+    // we create lots of single item lists...
+    private ArrayList<File> makeFileList(File file) {
+        ArrayList<File> a = new ArrayList<File>();
+        a.add(file);
+        return a;
+    }
+}
diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleTest.java b/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleTest.java
new file mode 100644
index 000000000..ce73103e8
--- /dev/null
+++ b/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleTest.java
@@ -0,0 +1,63 @@
+package org.broadinstitute.sting.gatk.datasources.sample;
+
+import org.broadinstitute.sting.BaseTest;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * Unit tests for {@link Sample}: equality, hashing and the convenience getters.
+ *
+ * Created by IntelliJ IDEA.
+ * User: brett
+ * Date: Sep 9, 2010
+ * Time: 8:21:00 AM
+ */
+public class SampleTest extends BaseTest {
+
+    static Sample sampleA;
+    static Sample sampleA1;
+    static Sample sampleB;
+    static Sample sampleC;
+
+    /**
+     * Builds the shared fixtures: two distinct samples with the same ID and property (A and A1),
+     * a bare sample (B), and a sample carrying a population and a gender (C).
+     */
+    @BeforeClass
+    public static void init() {
+        sampleA = sampleWithUniqueProperty("sampleA");
+        sampleA1 = sampleWithUniqueProperty("sampleA");
+        sampleB = new Sample("sampleB");
+        sampleC = new Sample("sampleC");
+        sampleC.setProperty("population", "pop1");
+        sampleC.setProperty("gender", Sample.Gender.MALE);
+    }
+
+    /**
+     * Creates a sample with the given ID and the single property shared by the A fixtures.
+     */
+    private static Sample sampleWithUniqueProperty(String id) {
+        Sample created = new Sample(id);
+        created.setProperty("uniqueProperty", "uniqueValue");
+        return created;
+    }
+
+    /**
+     * Samples with the same ID and properties are equal without being the same object;
+     * samples with different IDs are not equal.
+     */
+    @Test()
+    public void equalsTest() {
+        Assert.assertTrue(sampleA.equals(sampleA1));
+        Assert.assertNotSame(sampleA, sampleA1);
+        Assert.assertTrue(!sampleA.equals(sampleB));
+    }
+
+    /**
+     * Hash codes differ between the A and B fixtures and agree between the two A fixtures.
+     */
+    @Test()
+    public void basicHashTest() {
+        final int hashOfA = sampleA.hashCode();
+        Assert.assertTrue(hashOfA != sampleB.hashCode());
+        Assert.assertEquals(hashOfA, sampleA1.hashCode());
+    }
+
+    /**
+     * Checks the ID, population and gender getters.
+     */
+    @Test()
+    public void specialGettersTest() {
+        final boolean idMatches = sampleC.getId().equals("sampleC");
+        Assert.assertTrue(idMatches);
+        final boolean populationMatches = sampleC.getPopulation().equals("pop1");
+        Assert.assertTrue(populationMatches);
+        Assert.assertTrue(sampleC.isMale());
+        // sample A has no gender set, so it must not be reported as male
+        Assert.assertTrue(!sampleA.isMale());
+    }
+
+}
\ No newline at end of file