();
-
@Element(required = false)
@Argument(fullName = "read_buffer_size", shortName = "rbs", doc="Number of reads per SAM file to buffer in memory", required = false)
public Integer readBufferSize = null;
@@ -215,28 +208,91 @@ public class GATKArgumentCollection {
// --------------------------------------------------------------------------------------------------------------
//
- // distributed GATK arguments
+ // PED (pedigree) support
//
// --------------------------------------------------------------------------------------------------------------
- @Element(required=false)
- @Argument(fullName="processingTracker",shortName="C",doc="A lockable, shared file for coordinating distributed GATK runs",required=false)
- @Hidden
- public File processingTrackerFile = null;
- @Element(required=false)
- @Argument(fullName="restartProcessingTracker",shortName="RPT",doc="Should we delete the processing tracker file at startup?",required=false)
- @Hidden
- public boolean restartProcessingTracker = false;
+ /**
+ * Reads PED file-formatted tabular text files describing meta-data about the samples being
+ * processed in the GATK.
+ *
+ *
+ *
+ * The PED file is a white-space (space or tab) delimited file: the first six columns are mandatory:
+ *
+ *
+ * - Family ID
+ * - Individual ID
+ * - Paternal ID
+ * - Maternal ID
+ * - Sex (1=male; 2=female; other=unknown)
+ * - Phenotype
+ *
+ *
+ * The IDs are alphanumeric: the combination of family and individual ID should uniquely identify a person.
+ * A PED file must have 1 and only 1 phenotype in the sixth column. The phenotype can be either a
+ * quantitative trait or an affection status column: GATK will automatically detect which type
+ * (i.e. based on whether a value other than 0, 1, 2 or the missing genotype code is observed).
+ *
+ * If an individual's sex is unknown, then any character other than 1 or 2 can be used.
+ *
+ * You can add a comment to a PED or MAP file by starting the line with a # character. The rest of that
+ * line will be ignored. Therefore, do not start any family ID with this character.
+ *
+ * Affection status should be coded:
+ *
+ *
+ * - -9 missing
+ * - 0 missing
+ * - 1 unaffected
+ * - 2 affected
+ *
+ *
+ * If any value outside of -9,0,1,2 is detected, then the samples' phenotype values
+ * are interpreted as string phenotype values. In this case -9 uniquely
+ * represents the missing value.
+ *
+ * Genotypes (column 7 onwards) cannot be specified to the GATK.
+ *
+ * For example, here are two individuals (one row = one person):
+ *
+ *
+ * FAM001 1 0 0 1 2
+ * FAM001 2 0 0 1 2
+ *
+ *
+ * Each -ped argument can be tagged with NO_FAMILY_ID, NO_PARENTS, NO_SEX, NO_PHENOTYPE to
+ * tell the GATK PED parser that the corresponding fields are missing from the ped file.
+ *
+ * Note that most GATK walkers do not use pedigree information. Walkers that require pedigree
+ * data should clearly indicate so in their arguments and will throw errors if required pedigree
+ * information is missing.
+ */
+ @Argument(fullName="pedigree", shortName = "ped", doc="Pedigree files for samples",required=false)
+ public List pedigreeFiles = Collections.emptyList();
- @Element(required=false)
- @Argument(fullName="processingTrackerStatusFile",shortName="CSF",doc="If provided, a detailed accounting of the state of the process tracker is written to this file. For debugging, only",required=false)
- @Hidden
- public File processingTrackerStatusFile = null;
+ /**
+ * Inline PED records (see -ped argument). Each -pedString STRING can contain one or more
+ * valid PED records (see -ped) separated by semi-colons. Each -pedString supports the same
+ * tags as -ped.
+ */
+ @Argument(fullName="pedigreeString", shortName = "pedString", doc="Pedigree string for samples",required=false)
+ public List pedigreeStrings = Collections.emptyList();
- @Element(required=false)
- @Argument(fullName="processingTrackerID",shortName="CID",doc="If provided, an integer ID (starting at 1) indicating a unique id for this process within the distributed GATK group",required=false)
- @Hidden
- public int processTrackerID = -1;
+ /**
+ * How strict should we be in parsing the PED files?
+ */
+ @Argument(fullName="pedigreeValidationType", shortName = "pedValidationType", doc="How strict should we be in validating the pedigree information?",required=false)
+ public PedigreeValidationType pedigreeValidationType = PedigreeValidationType.STRICT;
+
+ // --------------------------------------------------------------------------------------------------------------
+ //
+ // BAM indexing and sharding arguments
+ //
+ // --------------------------------------------------------------------------------------------------------------
@Element(required = false)
@Argument(fullName="allow_intervals_with_unindexed_bam",doc="Allow interval processing with an unsupported BAM. NO INTEGRATION TESTS are available. Use at your own risk.",required=false)
@@ -387,7 +443,7 @@ public class GATKArgumentCollection {
return false;
}
if ((other.RODToInterval == null && RODToInterval != null) ||
- (other.RODToInterval != null && !other.RODToInterval.equals(RODToInterval))) {
+ (other.RODToInterval != null && !other.RODToInterval.equals(RODToInterval))) {
return false;
}
@@ -405,20 +461,6 @@ public class GATKArgumentCollection {
(other.performanceLog != null && !other.performanceLog.equals(this.performanceLog)))
return false;
- if ((other.processingTrackerFile == null && this.processingTrackerFile != null) ||
- (other.processingTrackerFile != null && !other.processingTrackerFile.equals(this.processingTrackerFile)))
- return false;
-
- if ((other.processingTrackerStatusFile == null && this.processingTrackerStatusFile != null) ||
- (other.processingTrackerStatusFile != null && !other.processingTrackerStatusFile.equals(this.processingTrackerStatusFile)))
- return false;
-
- if ( restartProcessingTracker != other.restartProcessingTracker )
- return false;
-
- if ( processTrackerID != other.processTrackerID )
- return false;
-
if (allowIntervalsWithUnindexedBAM != other.allowIntervalsWithUnindexedBAM)
return false;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java b/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java
index 1f9a7d705..c9506ec4c 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java
@@ -26,7 +26,6 @@
package org.broadinstitute.sting.gatk.contexts;
import net.sf.samtools.SAMReadGroupRecord;
-import org.broadinstitute.sting.gatk.datasources.sample.Sample;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
@@ -76,14 +75,6 @@ public class AlignmentContextUtils {
return splitContextBySampleName(context, null);
}
- public static Map splitContextBySample(AlignmentContext context) {
- Map m = new HashMap();
- for ( Map.Entry entry : splitContextBySampleName(context, null).entrySet() ) {
- m.put(new Sample(entry.getKey()), entry.getValue());
- }
- return m;
- }
-
/**
* Splits the given AlignmentContext into a StratifiedAlignmentContext per sample, but referencd by sample name instead
* of sample object.
@@ -97,8 +88,8 @@ public class AlignmentContextUtils {
GenomeLoc loc = context.getLocation();
HashMap contexts = new HashMap();
- for(String sample: context.getPileup().getSampleNames()) {
- ReadBackedPileup pileupBySample = context.getPileup().getPileupForSampleName(sample);
+ for(String sample: context.getPileup().getSamples()) {
+ ReadBackedPileup pileupBySample = context.getPileup().getPileupForSample(sample);
// Don't add empty pileups to the split context.
if(pileupBySample.size() == 0)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/PropertyDefinition.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/PropertyDefinition.java
deleted file mode 100644
index 433e0af40..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/PropertyDefinition.java
+++ /dev/null
@@ -1,30 +0,0 @@
-package org.broadinstitute.sting.gatk.datasources.sample;
-
-/**
- * Created by IntelliJ IDEA.
- * User: brett
- * Date: Aug 12, 2010
- * Time: 2:09:16 PM
- */
-public class PropertyDefinition {
-
- String property;
-
- String[] values;
-
- public String getProperty() {
- return property;
- }
-
- public void setProperty(String property) {
- this.property = property;
- }
-
- public String[] getValues() {
- return values;
- }
-
- public void setValues(String[] values) {
- this.values = values;
- }
-}
\ No newline at end of file
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/Sample.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/Sample.java
deleted file mode 100644
index ca8756684..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/Sample.java
+++ /dev/null
@@ -1,203 +0,0 @@
-package org.broadinstitute.sting.gatk.datasources.sample;
-
-
-import org.broadinstitute.sting.utils.exceptions.StingException;
-
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.Map;
-
-/**
- * Created by IntelliJ IDEA.
- * User: brett
- * Date: Jul 26, 2010
- * Time: 3:31:38 PM
- */
-public class Sample implements java.io.Serializable {
-
- private final String id;
-
- private boolean hasSampleFileEntry = false; // true if this sample has an entry in a sample file
-
- private boolean hasSAMFileEntry = false; // true if this sample has an entry in the SAM file
-
- private HashMap properties = new HashMap();
-
- private HashMap relationships = new HashMap();
-
- public enum Gender {
- MALE,
- FEMALE,
- UNKNOWN
- }
-
- public Sample(String id) {
-/* if (id == null) {
- throw new StingException("Error creating sample: sample ID cannot be null");
- }*/
- this.id = id;
- }
-
- public String getId() {
- return this.id;
- }
-
- public Map getProperties() {
- return properties;
- }
-
- public void setProperties(Map properties) {
- this.properties = (HashMap) properties;
- }
-
- public Map getRelationships() {
- return Collections.unmodifiableMap(this.relationships);
- }
-
- public void setSampleFileEntry(boolean value) {
- this.hasSampleFileEntry = value;
- }
-
- public boolean hasSAMFileEntry() {
- return this.hasSAMFileEntry;
- }
-
- public void setSAMFileEntry(boolean value) {
- this.hasSAMFileEntry = value;
- }
-
- public boolean hasSampleFileEntry() {
- return this.hasSampleFileEntry;
- }
-
- /**
- * Get one property
- * @param key key of property
- * @return value of property as generic object
- */
- public Object getProperty(String key) {
- return properties.get(key);
- }
-
- /**
- * Set a property
- * If property already exists, it is overwritten
- * @param key key of property
- * @param value object to be stored in properties array
- */
- public void setProperty(String key, Object value) {
-
- if (relationships.containsKey(key)) {
- throw new StingException("The same key cannot exist as a property and a relationship");
- }
-
- if (key.equals("gender") && value.getClass() != Gender.class) {
- throw new StingException("'gender' property must be of type Sample.Gender");
- }
-
- if (key.equals("population") && value.getClass() != String.class) {
- throw new StingException("'population' property must be of type String");
- }
-
- properties.put(key, value);
- }
-
- /**
- * Get one relationship
- * @param key of relationship
- * @return Sample object that this relationship points to
- */
- public Sample getRelationship(String key) {
- return relationships.get(key);
- }
-
- /**
- * Set one relationship
- * If already set, it is overwritten
- * @param key key of the relationship
- * @param value Sample object this relationship points to
- */
- public void setRelationship(String key, Sample value) {
- if (properties.containsKey(key)) {
- throw new StingException("The same key cannot exist as a property and a relationship");
- }
- relationships.put(key, value);
- }
-
- /**
- * Get the sample's mother
- * @return sample object with relationship mother, if exists, or null
- */
- public Sample getMother() {
- return getRelationship("mother");
- }
-
- /**
- * Get the sample's father
- * @return sample object with relationship father, if exists, or null
- */
- public Sample getFather() {
- return getRelationship("father");
- }
-
- /**
- * Get gender of the sample
- * @return property of key "gender" - must be of type Gender
- */
- public Gender getGender() {
- return (Gender) properties.get("gender");
- }
-
- public String getPopulation() {
- return (String) properties.get("population");
- }
-
- public String getFamilyId() {
- return (String) properties.get("familyId");
- }
-
- /**
- * @return True if sample is male, false if female, unknown, or null
- */
- public boolean isMale() {
- return properties.get("gender") == Gender.MALE;
- }
-
- /**
- * @return True if sample is female, false if male, unknown or null
- */
- public boolean isFemale() {
- return properties.get("gender") == Gender.MALE;
- }
-
- /**
- *
- * @param key property key
- * @return true if sample has this property (even if its value is null)
- */
- public boolean hasProperty(String key) {
- return properties.containsKey(key);
- }
-
- @Override
- public boolean equals(Object o) {
- if (this == o) return true;
- if (o == null || getClass() != o.getClass()) return false;
-
- Sample sample = (Sample) o;
-
- if (hasSAMFileEntry != sample.hasSAMFileEntry) return false;
- if (hasSampleFileEntry != sample.hasSampleFileEntry) return false;
- if (id != null ? !id.equals(sample.id) : sample.id != null) return false;
- if (properties != null ? !properties.equals(sample.properties) : sample.properties != null) return false;
- if (relationships != null ? !relationships.equals(sample.relationships) : sample.relationships != null)
- return false;
-
- return true;
- }
-
- @Override
- public int hashCode() {
- return id != null ? id.hashCode() : "".hashCode();
- }
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleAlias.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleAlias.java
deleted file mode 100644
index ce749cb83..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleAlias.java
+++ /dev/null
@@ -1,31 +0,0 @@
-package org.broadinstitute.sting.gatk.datasources.sample;
-
-/**
- * Created by IntelliJ IDEA.
- * User: brett
- * Date: Aug 13, 2010
- * Time: 5:13:46 PM
- */
-public class SampleAlias {
-
- String mainId;
-
- String[] otherIds;
-
- public String getMainId() {
- return mainId;
- }
-
- public void setMainId(String mainId) {
- this.mainId = mainId;
- }
-
- public String[] getOtherIds() {
- return otherIds;
- }
-
- public void setOtherIds(String[] otherIds) {
- this.otherIds = otherIds;
- }
-
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSource.java
deleted file mode 100644
index 067bf3f72..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSource.java
+++ /dev/null
@@ -1,590 +0,0 @@
-package org.broadinstitute.sting.gatk.datasources.sample;
-
-import net.sf.samtools.SAMFileHeader;
-import net.sf.samtools.SAMReadGroupRecord;
-import net.sf.samtools.SAMRecord;
-import org.broadinstitute.sting.utils.SampleUtils;
-import org.broadinstitute.sting.utils.exceptions.StingException;
-import org.broadinstitute.sting.utils.variantcontext.Genotype;
-import org.broadinstitute.sting.utils.variantcontext.VariantContext;
-import org.yaml.snakeyaml.TypeDescription;
-import org.yaml.snakeyaml.Yaml;
-import org.yaml.snakeyaml.constructor.Constructor;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileReader;
-import java.io.IOException;
-import java.util.*;
-
-/**
- * Created by IntelliJ IDEA.
- * User: brett
- * Date: Jul 26, 2010
- * Time: 3:30:09 PM
- *
- * This class stores and manages sample metadata. This data is encoded in a sample file, which can be included
- * in the GATK by the "--samples" argument. This class reads and parses those files.
- *
- * Although there are a set of public methods for accessing sample data, they aren't used by walkers - they are really
- * only used by GenomeAnalysisEngine. An instance of GenomeAnalysisEngine has one SampleDataSource. When a walker
- * wants to access sample data, it asks GenomeAnalysis to fetch this data from its SampleDataSource.
- *
- */
-public class SampleDataSource {
-
- /**
- * SAMFileHeader that has been created for this analysis.
- */
- private SAMFileHeader header;
-
- /**
- * This is where Sample objects are stored. Samples are usually accessed by their ID, which is unique, so
- * this is stored as a HashMap.
- */
- private final HashMap samples = new HashMap();
-
- /**
- * Samples can have "aliases", because sometimes the same sample is referenced by different IDs in different
- * datasets. If this is the case, one ID is the "primary ID" and others are "aliases".
- *
- * This maps ID => primary ID for all samples ID strings - both primary IDs and aliases.
- */
- private HashMap sampleAliases = new HashMap();
-
- /**
- * While loading sample files, we must be aware of "special" properties and relationships that are always allowed
- */
- public static final String[] specialProperties = new String[] {"familyId", "population", "gender"};
- public static final String[] specialRelationships = new String[] {"mother", "father"};
-
- /**
- * Constructor takes both a SAM header and sample files because the two must be integrated.
- * @param header SAMFileHeader that has been created for this analysis
- * @param sampleFiles Sample files that were included on the command line
- */
- public SampleDataSource(SAMFileHeader header, List sampleFiles) {
- this();
- this.header = header;
- // create empty sample object for each sample referenced in the SAM header
- for (String sampleName : SampleUtils.getSAMFileSamples(header)) {
- if (!hasSample(sampleName)) {
- Sample newSample = new Sample(sampleName);
- newSample.setSAMFileEntry(true);
- samples.put(sampleName, newSample);
- }
- }
-
- // add files consecutively
- if (sampleFiles != null) {
- for (File file : sampleFiles) {
- addFile(file);
- }
- }
- }
-
- public SampleDataSource() {
- samples.put(null, new Sample(null));
- }
-
- /**
- * Hallucinates sample objects for all the samples in the SAM file and stores them
- */
- public void addSamplesFromSAMHeader(SAMFileHeader header) {
- for (String sampleName : SampleUtils.getSAMFileSamples(header)) {
- if (!hasSample(sampleName)) {
- Sample newSample = new Sample(sampleName);
- newSample.setSAMFileEntry(true);
- samples.put(sampleName, newSample);
- }
- }
- }
-
- /**
- * Parse one sample file and integrate it with samples that are already there
- * Fail quickly if we find any errors in the file
- */
- public void addFile(File sampleFile) {
-
- BufferedReader reader;
- try {
- reader = new BufferedReader(new FileReader(sampleFile));
- }
- catch (IOException e) {
- throw new StingException("Could not open sample file " + sampleFile.getAbsolutePath(), e);
- }
-
- // set up YAML reader - a "Constructor" creates java object from YAML and "Loader" loads the file
- Constructor con = new Constructor(SampleFileParser.class);
- TypeDescription desc = new TypeDescription(SampleFileParser.class);
- desc.putListPropertyType("propertyDefinitions", PropertyDefinition.class);
- desc.putListPropertyType("sampleAliases", SampleAlias.class);
- con.addTypeDescription(desc);
- Yaml yaml = new Yaml(con);
-
- // SampleFileParser stores an object representation of a sample file - this is what we'll parse
- SampleFileParser parser;
- try {
- parser = (SampleFileParser) yaml.load(reader);
- }
- catch (Exception e) {
- throw new StingException("There was a syntactic error with the YAML in sample file " + sampleFile.getAbsolutePath(), e);
- }
-
- // check to see which validation options were built into the file
- boolean restrictProperties = parser.getAllowedProperties() != null;
- boolean restrictRelationships = parser.getAllowedRelationships() != null;
- boolean restrictPropertyValues = parser.getPropertyDefinitions() != null;
-
- // propertyValues stores the values that are allowed for a given property
- HashMap propertyValues = null;
- if (restrictPropertyValues) {
- propertyValues = new HashMap();
- for (PropertyDefinition def : parser.getPropertyDefinitions()) {
- HashSet set = new HashSet();
- for (String value : def.getValues()) {
- set.add(value);
- }
- propertyValues.put(def.getProperty(), set);
- }
- }
-
- // make sure the aliases are valid
- validateAliases(parser);
-
- // loop through each sample in the file - a SampleParser stores an object that will become a Sample
- for (SampleParser sampleParser : parser.getSamples()) {
-
- try {
- // step 1: add the sample if it doesn't already exist
- Sample sample = getSampleById(sampleParser.getId());
- if (sample == null) {
- sample = new Sample(sampleParser.getId());
- }
- addSample(sample);
- sample.setSampleFileEntry(true);
-
- // step 2: add the properties
- if (sampleParser.getProperties() != null) {
- for (String property : sampleParser.getProperties().keySet()) {
-
- // check that property is allowed
- if (restrictProperties) {
- if (!isPropertyValid(property, parser.getAllowedProperties())) {
- throw new StingException(property + " is an invalid property. It is not included in the list " +
- "of allowed properties.");
- }
- }
-
- // next check that the value is allowed
- if (restrictPropertyValues) {
- if (!isValueAllowed(property, sampleParser.getProperties().get(property), propertyValues)) {
- throw new StingException("The value of property '" + property + "' is invalid. " +
- "It is not included in the list of allowed values for this property.");
- }
- }
-
- // next check that there isn't already a conflicting property there
- if (sample.getProperty(property) != null &&
- sample.getProperty(property) != sampleParser.getProperties().get(property))
- {
- throw new StingException(property + " is a conflicting property!");
- }
-
- // checks are passed - now add the property!
- saveProperty(sample, property, sampleParser.getProperties().get(property));
- }
- }
-
- // step 3: add the relationships
- if (sampleParser.getRelationships() != null) {
- for (String relationship : sampleParser.getRelationships().keySet()) {
- String relativeId = sampleParser.getRelationships().get(relationship);
- if (relativeId == null) {
- throw new StingException("The relationship cannot be null");
- }
-
- // first check that it's not invalid
- if (restrictRelationships) {
- if (!isRelationshipValid(relationship, parser.getAllowedRelationships())) {
- throw new StingException(relationship + " is an invalid relationship");
- }
- }
-
- // next check that there isn't already a conflicting property there
- if (sample.getRelationship(relationship) != null) {
- if (sample.getRelationship(relationship).getId() != sampleParser.getProperties().get(relationship)) {
- throw new StingException(relationship + " is a conflicting relationship!");
- }
- // if the relationship is already set - and consistent with what we're reading now - no need to continue
- else {
- continue;
- }
- }
-
- // checks are passed - now save the relationship
- saveRelationship(sample, relationship, relativeId);
- }
- }
- } catch (Exception e) {
- throw new StingException("An error occurred while loading this sample from the sample file: " +
- sampleParser.getId(), e);
- }
- }
-
- }
-
- private boolean isValueAllowed(String key, Object value, HashMap valuesList) {
-
- // if the property values weren't specified for this property, then any value is okay
- if (!valuesList.containsKey(key)) {
- return true;
- }
-
- // if this property has enumerated values, it must be a string
- else if (value.getClass() != String.class)
- return false;
-
- // is the value specified or not?
- else if (!valuesList.get(key).contains(value))
- return false;
-
- return true;
- }
-
- /**
- * Makes sure that the aliases are valid
- * Checks that 1) no string is used as both a main ID and an alias;
- * 2) no alias is used more than once
- * @param parser
- */
- private void validateAliases(SampleFileParser parser) {
-
- // no aliases sure validate
- if (parser.getSampleAliases() == null)
- return;
-
- HashSet mainIds = new HashSet();
- HashSet otherIds = new HashSet();
-
- for (SampleAlias sampleAlias : parser.getSampleAliases()) {
- mainIds.add(sampleAlias.getMainId());
- for (String otherId : sampleAlias.getOtherIds()) {
- if (mainIds.contains(otherId))
- throw new StingException(String.format("The aliases in your sample file are invalid - the alias %s cannot " +
- "be both a main ID and an other ID", otherId));
-
- if (!otherIds.add(otherId))
- throw new StingException(String.format("The aliases in your sample file are invalid - %s is listed as an " +
- "alias more than once.", otherId));
- }
- }
- }
-
- private boolean isPropertyValid(String property, String[] allowedProperties) {
-
- // is it a special property that is always allowed?
- for (String allowedProperty : specialProperties) {
- if (property.equals(allowedProperty))
- return true;
- }
-
- // is it in the allowed properties list?
- for (String allowedProperty : allowedProperties) {
- if (property.equals(allowedProperty))
- return true;
- }
-
- return false;
- }
-
- private boolean isRelationshipValid(String relationship, String[] allowedRelationships) {
-
- // is it a special relationship that is always allowed?
- for (String allowedRelationship : specialRelationships) {
- if (relationship.equals(allowedRelationship))
- return true;
- }
-
- // is it in the allowed properties list?
- for (String allowedRelationship : allowedRelationships) {
- if (relationship.equals(allowedRelationship))
- return true;
- }
-
- return false;
- }
-
- /**
- * Saves a property as the correct type
- * @param key property key
- * @param value property value, as read from YAML parser
- * @return property value to be stored
- */
- private void saveProperty(Sample sample, String key, Object value) {
-
- // convert gender to the right type, if it was stored as a String
- if (key.equals("gender")) {
- if (((String) value).toLowerCase().equals("male")) {
- value = Sample.Gender.MALE;
- }
- else if (((String) value).toLowerCase().equals("female")) {
- value = Sample.Gender.FEMALE;
- }
- else if (((String) value).toLowerCase().equals("unknown")) {
- value = Sample.Gender.UNKNOWN;
- }
- else if (value != null) {
- throw new StingException("'gender' property must be male, female, or unknown.");
- }
- }
- try {
- sample.setProperty(key, value);
- }
- catch (Exception e) {
- throw new StingException("Could not save property " + key, e);
- }
- }
-
- /**
- * Saves a relationship as the correct type
- * @param key relationship key
- * @param relativeId sample ID string of the relative
- * @return relationship value to be stored
- */
- private void saveRelationship(Sample sample, String key, String relativeId) {
-
- // get the reference that we'll store as the value
- Sample relative = getSampleById(relativeId);
-
- // create sample object for the relative, if necessary
- if (relative == null) {
- relative = new Sample(relativeId);
- addSample(relative);
- }
- sample.setRelationship(key, relative);
- }
-
-
-
- /**
- * Filter a sample name in case it is an alias
- * @param sampleId to be filtered
- * @return ID of sample that stores data for this alias
- */
- private String aliasFilter(String sampleId) {
- if (!sampleAliases.containsKey(sampleId))
- return sampleId;
- else
- return sampleAliases.get(sampleId);
- }
-
- /**
- * Add a sample to the collection
- * @param sample to be added
- */
- private void addSample(Sample sample) {
- samples.put(sample.getId(), sample);
- }
-
- /**
- * Check if sample with this ID exists
- * Note that this will return true if name passed in is an alias
- * @param id ID of sample to be checked
- * @return true if sample exists; false if not
- */
- public boolean hasSample(String id) {
- return samples.get(aliasFilter(id)) != null;
- }
-
- /**
- * Get a sample by its ID
- * If an alias is passed in, return the main sample object
- * @param id
- * @return sample Object with this ID
- */
- public Sample getSampleById(String id) {
- return samples.get(aliasFilter(id));
- }
-
- /**
- * Get the sample for a given read group
- * Must first look up ID for read group
- * @param readGroup of sample
- * @return sample object with ID from the read group
- */
- public Sample getSampleByReadGroup(SAMReadGroupRecord readGroup) {
- String nameFromReadGroup = readGroup.getSample();
- return getSampleById(nameFromReadGroup);
- }
-
- /**
- * Get a sample for a given read
- * Must first look up read group, and then sample ID for that read group
- * @param read of sample
- * @return sample object of this read
- */
- public Sample getSampleByRead(SAMRecord read) {
- return getSampleByReadGroup(read.getReadGroup());
- }
-
- /**
- * Get number of sample objects
- * @return size of samples map
- */
- public int sampleCount() {
- return samples.size();
- }
-
- /**
- * Return all samples with a given family ID
- * Note that this isn't terribly efficient (linear) - it may be worth adding a new family ID data structure for this
- * @param familyId
- * @return
- */
- public Set getFamily(String familyId) {
- HashSet familyMembers = new HashSet();
-
- for (Sample sample : samples.values()) {
- if (sample.getFamilyId() != null) {
- if (sample.getFamilyId().equals(familyId))
- familyMembers.add(sample);
- }
- }
- return familyMembers;
- }
-
- /**
- * Returns all children of a given sample
- * See note on the efficiency of getFamily() - since this depends on getFamily() it's also not efficient
- * @param sample
- * @return
- */
- public Set getChildren(Sample sample) {
- HashSet children = new HashSet();
- for (Sample familyMember : getFamily(sample.getFamilyId())) {
- if (familyMember.getMother() == sample || familyMember.getFather() == sample) {
- children.add(familyMember);
- }
- }
- return children;
- }
-
- public Set getSamples() {
- HashSet set = new HashSet();
- set.addAll(samples.values());
- return set;
- }
-
- /**
- * Takes a collection of sample names and returns their corresponding sample objects
- * Note that, since a set is returned, if you pass in a list with duplicates names there will not be any duplicates in the returned set
- * @param sampleNameList Set of sample names
- * @return Corresponding set of samples
- */
- public Set getSamples(Collection sampleNameList) {
- HashSet samples = new HashSet();
- for (String name : sampleNameList) {
- try {
- samples.add(getSampleById(name));
- }
- catch (Exception e) {
- throw new StingException("Could not get sample with the following ID: " + name, e);
- }
- }
- return samples;
- }
-
- /**
- * Returns a set of samples that have any value (which could be null) for a given property
- * @param key Property key
- * @return Set of samples with the property
- */
- public Set getSamplesWithProperty(String key) {
- HashSet toReturn = new HashSet();
- for (Sample s : samples.values()) {
- if (s.hasProperty(key))
- toReturn.add(s);
- }
- return toReturn;
- }
-
- /**
- * Returns a set of samples that have a property with a certain value
- * Value must be a string for now - could add a similar method for matching any objects in the future
- *
- * @param key Property key
- * @param value String property value
- * @return Set of samples that match key and value
- */
- public Set getSamplesWithProperty(String key, String value) {
- Set toReturn = getSamplesWithProperty(key);
- for (Sample s : toReturn) {
- if (!s.getProperty(key).equals(value))
- toReturn.remove(s);
- }
- return toReturn;
- }
-
- public Sample getOrCreateSample(String id) {
- Sample sample = getSampleById(id);
- if (sample == null) {
- sample = new Sample(id);
- addSample(sample);
- }
- return sample;
- }
-
- /**
- * Returns all samples that were referenced in the SAM file
- */
- public Set getSAMFileSamples() {
- Set toReturn = new HashSet();
- for (Sample sample : samples.values()) {
- if (sample.hasSAMFileEntry())
- toReturn.add(sample);
- }
- return toReturn;
- }
-
- /**
- * Returns a set of sample objects for the sample names in a variant context
- *
- * @param context Any variant context
- * @return a set of the sample objects
- */
- public Set getSamplesByVariantContext(VariantContext context) {
- Set samples = new HashSet();
- for (String sampleName : context.getSampleNames()) {
- samples.add(getOrCreateSample(sampleName));
- }
- return samples;
- }
-
-
- /**
- * Return a subcontext restricted to samples with a given property key/value
- * Gets the sample names from key/value and relies on VariantContext.subContextFromGenotypes for the filtering
- * @param context VariantContext to filter
- * @param key property key
- * @param value property value (must be string)
- * @return subcontext
- */
- public VariantContext subContextFromSampleProperty(VariantContext context, String key, String value) {
-
- Set samplesWithProperty = new HashSet();
- for (String sampleName : context.getSampleNames()) {
- Sample s = samples.get(sampleName);
- if (s != null && s.hasProperty(key) && s.getProperty(key).equals(value))
- samplesWithProperty.add(sampleName);
- }
- Map genotypes = context.getGenotypes(samplesWithProperty);
- return context.subContextFromGenotypes(genotypes.values());
- }
-
- public static SampleDataSource createEmptyDataSource() {
- SAMFileHeader header = new SAMFileHeader();
- return new SampleDataSource(header, null);
- }
-
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleFileParser.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleFileParser.java
deleted file mode 100644
index a362af663..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleFileParser.java
+++ /dev/null
@@ -1,65 +0,0 @@
-package org.broadinstitute.sting.gatk.datasources.sample;
-
-/**
- * Created by IntelliJ IDEA.
- * User: brett
- * Date: Aug 12, 2010
- * Time: 1:30:44 PM
- */
-public class SampleFileParser {
-
- private SampleAlias[] sampleAliases;
-
- private String[] allowedProperties;
-
- private String[] allowedRelationships;
-
- private PropertyDefinition[] propertyDefinitions;
-
- private SampleParser[] samples;
-
- public PropertyDefinition[] getPropertyDefinitions() {
- return propertyDefinitions;
- }
-
- public void setPropertyDefinitions(PropertyDefinition[] propertyDefinitions) {
- this.propertyDefinitions = propertyDefinitions;
- }
-
- public SampleFileParser() {
-
- }
-
- public String[] getAllowedProperties() {
- return allowedProperties;
- }
-
- public void setAllowedProperties(String[] allowedProperties) {
- this.allowedProperties = allowedProperties;
- }
-
- public SampleParser[] getSamples() {
- return samples;
- }
-
- public void setSamples(SampleParser[] samples) {
- this.samples = samples;
- }
-
- public String[] getAllowedRelationships() {
- return allowedRelationships;
- }
-
- public void setAllowedRelationships(String[] allowedRelationships) {
- this.allowedRelationships = allowedRelationships;
- }
-
- public SampleAlias[] getSampleAliases() {
- return sampleAliases;
- }
-
- public void setSampleAliases(SampleAlias[] sampleAliases) {
- this.sampleAliases = sampleAliases;
- }
-
-}
\ No newline at end of file
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleParser.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleParser.java
deleted file mode 100644
index f5e07ca29..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleParser.java
+++ /dev/null
@@ -1,43 +0,0 @@
-package org.broadinstitute.sting.gatk.datasources.sample;
-
-import java.util.HashMap;
-
-/**
- * Created by IntelliJ IDEA.
- * User: brett
- * Date: Aug 13, 2010
- * Time: 2:09:43 PM
- */
-public class SampleParser {
-
- private String id;
-
- private HashMap properties;
-
- private HashMap relationships;
-
- public String getId() {
- return id;
- }
-
- public void setId(String id) {
- this.id = id;
- }
-
- public HashMap getProperties() {
- return properties;
- }
-
- public void setProperties(HashMap properties) {
- this.properties = properties;
- }
-
- public HashMap getRelationships() {
- return relationships;
- }
-
- public void setRelationships(HashMap relationships) {
- this.relationships = relationships;
- }
-
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java
index 3b9e35311..a07f735fa 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java
@@ -84,12 +84,7 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar
*/
protected HierarchicalMicroScheduler(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods, int nThreadsToUse ) {
super(engine, walker, reads, reference, rods);
-
this.threadPool = Executors.newFixedThreadPool(nThreadsToUse);
-
- if (engine.getArguments().processingTrackerFile != null) {
- throw new UserException.BadArgumentValue("-C", "Distributed GATK calculations currently not supported in multi-threaded mode. Complain to Mark depristo@broadinstitute.org to implement and test this code path");
- }
}
public Object execute( Walker walker, ShardStrategy shardStrategy ) {
diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java
index 09ab4bd44..deafcd0cc 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java
@@ -13,6 +13,7 @@ import org.broadinstitute.sting.gatk.io.DirectOutputTracker;
import org.broadinstitute.sting.gatk.io.OutputTracker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.Walker;
+import org.broadinstitute.sting.utils.SampleUtils;
import java.util.Collection;
@@ -56,7 +57,8 @@ public class LinearMicroScheduler extends MicroScheduler {
traversalEngine.startTimersIfNecessary();
if(shard.getShardType() == Shard.ShardType.LOCUS) {
LocusWalker lWalker = (LocusWalker)walker;
- WindowMaker windowMaker = new WindowMaker(shard, engine.getGenomeLocParser(), getReadIterator(shard), shard.getGenomeLocs(), engine.getSampleMetadata());
+ WindowMaker windowMaker = new WindowMaker(shard, engine.getGenomeLocParser(),
+ getReadIterator(shard), shard.getGenomeLocs(), SampleUtils.getSAMFileSamples(engine));
for(WindowMaker.WindowMakerIterator iterator: windowMaker) {
ShardDataProvider dataProvider = new LocusShardDataProvider(shard,iterator.getSourceInfo(),engine.getGenomeLocParser(),iterator.getLocus(),iterator,reference,rods);
Object result = traversalEngine.traverse(walker, dataProvider, accumulator.getReduceInit());
diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java b/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java
index 2b6488ada..11e51d99b 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java
@@ -62,7 +62,10 @@ public class ShardTraverser implements Callable {
Object accumulator = walker.reduceInit();
LocusWalker lWalker = (LocusWalker)walker;
- WindowMaker windowMaker = new WindowMaker(shard,microScheduler.getEngine().getGenomeLocParser(),microScheduler.getReadIterator(shard),shard.getGenomeLocs(), microScheduler.engine.getSampleMetadata()); // todo: microScheduler.engine is protected - is it okay to user it here?
+ WindowMaker windowMaker = new WindowMaker(shard,microScheduler.getEngine().getGenomeLocParser(),
+ microScheduler.getReadIterator(shard),
+ shard.getGenomeLocs(),
+ microScheduler.engine.getSampleDB().getSampleNames()); // todo: microScheduler.engine is protected - is it okay to user it here?
ShardDataProvider dataProvider = null;
for(WindowMaker.WindowMakerIterator iterator: windowMaker) {
diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java
index 43ea46002..d1f5d80da 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java
@@ -4,7 +4,6 @@ import net.sf.picard.util.PeekableIterator;
import org.broadinstitute.sting.gatk.ReadProperties;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
-import org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource;
import org.broadinstitute.sting.gatk.iterators.LocusIterator;
import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
@@ -12,6 +11,7 @@ import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
@@ -63,17 +63,20 @@ public class WindowMaker implements Iterable, I
* the given intervals.
* @param iterator The data source for this window.
* @param intervals The set of intervals over which to traverse.
- * @param sampleData SampleDataSource that we can reference reads with
+ * @param sampleNames The complete set of sample names in the reads in shard
*/
- public WindowMaker(Shard shard, GenomeLocParser genomeLocParser, StingSAMIterator iterator, List intervals, SampleDataSource sampleData ) {
+ public WindowMaker(Shard shard, GenomeLocParser genomeLocParser, StingSAMIterator iterator, List intervals, Collection sampleNames) {
this.sourceInfo = shard.getReadProperties();
this.readIterator = iterator;
-
- this.sourceIterator = new PeekableIterator(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser,sampleData));
+ this.sourceIterator = new PeekableIterator(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser, sampleNames));
this.intervalIterator = intervals.size()>0 ? new PeekableIterator(intervals.iterator()) : null;
}
+ public WindowMaker(Shard shard, GenomeLocParser genomeLocParser, StingSAMIterator iterator, List intervals ) {
+ this(shard, genomeLocParser, iterator, intervals, LocusIteratorByState.sampleListForSAMWithoutReadGroups());
+ }
+
public Iterator iterator() {
return this;
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java
index e13c5a764..eb5b51b33 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java
@@ -35,12 +35,11 @@ import org.broadinstitute.sting.gatk.DownsampleType;
import org.broadinstitute.sting.gatk.DownsamplingMethod;
import org.broadinstitute.sting.gatk.ReadProperties;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
-import org.broadinstitute.sting.gatk.datasources.sample.Sample;
-import org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.ReservoirDownsampler;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement;
import org.broadinstitute.sting.utils.pileup.PileupElement;
@@ -52,9 +51,6 @@ import java.util.*;
/** Iterator that traverses a SAM File, accumulating information on a per-locus basis */
public class LocusIteratorByState extends LocusIterator {
-// private static long discarded_bases = 0L;
-// private static long observed_bases = 0L;
-
/** our log, which we want to capture anything from this class */
private static Logger logger = Logger.getLogger(LocusIteratorByState.class);
@@ -69,7 +65,7 @@ public class LocusIteratorByState extends LocusIterator {
* Used to create new GenomeLocs.
*/
private final GenomeLocParser genomeLocParser;
- private final ArrayList samples;
+ private final ArrayList samples;
private final ReadStateManager readStates;
static private class SAMRecordState {
@@ -278,15 +274,30 @@ public class LocusIteratorByState extends LocusIterator {
//
// -----------------------------------------------------------------------------------------------------------------
- public LocusIteratorByState(final Iterator samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, SampleDataSource sampleData ) {
+ public LocusIteratorByState(final Iterator samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, Collection samples ) {
this.readInfo = readInformation;
this.genomeLocParser = genomeLocParser;
+ this.samples = new ArrayList(samples);
+ this.readStates = new ReadStateManager(samIterator,readInformation.getDownsamplingMethod());
- // get the list of samples
- this.samples = new ArrayList(sampleData.getSamples());
-
- readStates = new ReadStateManager(samIterator,readInformation.getDownsamplingMethod());
-
+ // currently the GATK expects this LocusIteratorByState to accept empty sample lists, when
+ // there's no read data. So we need to throw this error only when samIterator.hasNext() is true
+ if ( this.samples.isEmpty() && samIterator.hasNext() ) {
+ // actually we cannot process BAMs without read groups unless we tolerate empty
+ // sample lists. In the empty case we need to add the null element to the samples
+ this.samples.add(null);
+ //throw new IllegalArgumentException("samples list must not be empty");
+ }
+ }
+
+ /**
+ * For testing only. Assumes that the incoming SAMRecords have no read groups, so creates a dummy sample list
+ * for the system.
+ */
+ public final static Collection sampleListForSAMWithoutReadGroups() {
+ List samples = new ArrayList();
+ samples.add(null);
+ return samples;
}
public Iterator iterator() {
@@ -303,19 +314,6 @@ public class LocusIteratorByState extends LocusIterator {
//if ( DEBUG ) System.out.printf("hasNext() = %b%n", r);
}
- public void printState() {
- for(Sample sample: samples) {
- Iterator iterator = readStates.iterator(sample);
- while(iterator.hasNext()) {
- SAMRecordState state = iterator.next();
- logger.debug(String.format("printState():"));
- SAMRecord read = state.getRead();
- int offset = state.getReadOffset();
- logger.debug(String.format(" read: %s(%d)=%s, cigar=%s", read.getReadName(), offset, (char)read.getReadBases()[offset], read.getCigarString()));
- }
- }
- }
-
private GenomeLoc getLocation() {
return readStates.isEmpty() ? null : readStates.getFirst().getLocation(genomeLocParser);
}
@@ -355,14 +353,14 @@ public class LocusIteratorByState extends LocusIterator {
// In this case, the subsequent call to next() will emit the normal pileup at the current base
// and shift the position.
if (readInfo.generateExtendedEvents() && hasExtendedEvents) {
- Map fullExtendedEventPileup = new HashMap();
+ Map fullExtendedEventPileup = new HashMap();
// get current location on the reference and decrement it by 1: the indels we just stepped over
// are associated with the *previous* reference base
GenomeLoc loc = genomeLocParser.incPos(getLocation(),-1);
boolean hasBeenSampled = false;
- for(Sample sample: samples) {
+ for(final String sample: samples) {
Iterator iterator = readStates.iterator(sample);
List indelPile = new ArrayList(readStates.size(sample));
hasBeenSampled |= loc.getStart() <= readStates.getDownsamplingExtent(sample);
@@ -426,10 +424,10 @@ public class LocusIteratorByState extends LocusIterator {
nextAlignmentContext = new AlignmentContext(loc, new ReadBackedExtendedEventPileupImpl(loc, fullExtendedEventPileup), hasBeenSampled);
} else {
GenomeLoc location = getLocation();
- Map fullPileup = new HashMap();
+ Map fullPileup = new HashMap();
boolean hasBeenSampled = false;
- for(Sample sample: samples) {
+ for(final String sample: samples) {
Iterator iterator = readStates.iterator(sample);
List pile = new ArrayList(readStates.size(sample));
hasBeenSampled |= location.getStart() <= readStates.getDownsamplingExtent(sample);
@@ -495,7 +493,7 @@ public class LocusIteratorByState extends LocusIterator {
}
private void updateReadStates() {
- for(Sample sample: samples) {
+ for(final String sample: samples) {
Iterator it = readStates.iterator(sample);
while ( it.hasNext() ) {
SAMRecordState state = it.next();
@@ -522,7 +520,7 @@ public class LocusIteratorByState extends LocusIterator {
private final PeekableIterator iterator;
private final DownsamplingMethod downsamplingMethod;
private final SamplePartitioner samplePartitioner;
- private final Map readStatesBySample = new HashMap();
+ private final Map readStatesBySample = new HashMap();
private final int targetCoverage;
private int totalReadStates = 0;
@@ -540,9 +538,9 @@ public class LocusIteratorByState extends LocusIterator {
}
Map readSelectors = new HashMap();
- for(Sample sample: samples) {
+ for(final String sample: samples) {
readStatesBySample.put(sample,new PerSampleReadStateManager());
- readSelectors.put(sample.getId(),downsamplingMethod.type == DownsampleType.BY_SAMPLE ? new NRandomReadSelector(null,targetCoverage) : new AllReadsSelector());
+ readSelectors.put(sample,downsamplingMethod.type == DownsampleType.BY_SAMPLE ? new NRandomReadSelector(null,targetCoverage) : new AllReadsSelector());
}
samplePartitioner = new SamplePartitioner(readSelectors);
@@ -554,7 +552,7 @@ public class LocusIteratorByState extends LocusIterator {
* @param sample The sample.
* @return Iterator over the reads associated with that sample.
*/
- public Iterator iterator(final Sample sample) {
+ public Iterator iterator(final String sample) {
return new Iterator() {
private Iterator wrappedIterator = readStatesBySample.get(sample).iterator();
@@ -590,7 +588,7 @@ public class LocusIteratorByState extends LocusIterator {
* @param sample The sample.
* @return Total number of reads in the given sample.
*/
- public int size(final Sample sample) {
+ public int size(final String sample) {
return readStatesBySample.get(sample).size();
}
@@ -600,12 +598,12 @@ public class LocusIteratorByState extends LocusIterator {
* @param sample Sample, downsampled independently.
* @return Integer stop of the furthest undownsampled region.
*/
- public int getDownsamplingExtent(final Sample sample) {
+ public int getDownsamplingExtent(final String sample) {
return readStatesBySample.get(sample).getDownsamplingExtent();
}
public SAMRecordState getFirst() {
- for(Sample sample: samples) {
+ for(final String sample: samples) {
PerSampleReadStateManager reads = readStatesBySample.get(sample);
if(!reads.isEmpty())
return reads.peek();
@@ -639,8 +637,8 @@ public class LocusIteratorByState extends LocusIterator {
}
samplePartitioner.complete();
- for(Sample sample: samples) {
- ReadSelector aggregator = samplePartitioner.getSelectedReads(sample.getId());
+ for(final String sample: samples) {
+ ReadSelector aggregator = samplePartitioner.getSelectedReads(sample);
Collection newReads = new ArrayList(aggregator.getSelectedReads());
@@ -1072,6 +1070,3 @@ class SamplePartitioner implements ReadSelector {
}
}
-
-
-
diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Affection.java b/public/java/src/org/broadinstitute/sting/gatk/samples/Affection.java
new file mode 100644
index 000000000..83e31f672
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/samples/Affection.java
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2011, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.samples;
+
+/**
+ * Categorical sample trait for association and analysis
+ *
+ * Samples can have unknown status, be affected or unaffected by the
+ * categorical trait, or they can be marked as actually having an
+ * other trait value (stored in an associated value in the Sample class)
+ *
+ * @author Mark DePristo
+ * @since Sept. 2011
+ */
+public enum Affection {
+ /** Status is unknown */
+ UNKNOWN,
+ /** Suffers from the disease */
+ AFFECTED,
+ /** Unaffected by the disease */
+ UNAFFECTED,
+ /** An "other" trait: value of the trait is stored elsewhere and is an arbitrary string */
+ OTHER
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Gender.java b/public/java/src/org/broadinstitute/sting/gatk/samples/Gender.java
new file mode 100644
index 000000000..6fb44804a
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/samples/Gender.java
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2011, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.samples;
+
+/**
+* ENUM of possible human genders: male, female, or unknown
+*/
+public enum Gender {
+ MALE,
+ FEMALE,
+ UNKNOWN
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java b/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java
new file mode 100644
index 000000000..c442409fb
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java
@@ -0,0 +1,310 @@
+/*
+ * Copyright (c) 2011, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.samples;
+
+import org.apache.log4j.Logger;
+import org.broadinstitute.sting.utils.MathUtils;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.text.XReadLines;
+
+import java.io.*;
+import java.util.*;
+
+/**
+ * Reads PED file-formatted tabular text files
+ *
+ * See http://www.broadinstitute.org/mpg/tagger/faq.html
+ * See http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml#ped
+ *
+ * The "ped" file format refers to the widely-used format for linkage pedigree data.
+ * Each line describes a single (diploid) individual in the following format:
+ *
+ * family_ID individual_ID father_ID mother_ID gender phenotype genotype_1 genotype_2 ...
+ *
+ * If your data lacks pedigree information (for example, unrelated case/control individuals),
+ * set the father_ID and mother_ID to 0. gender denotes the individual's sex, with 1=male and 2=female.
+ * phenotype refers to the affected status (for association studies) where 0=unknown, 1=unaffected, 2=affected.
+ * Finally, each genotype is written as two (=diploid) integer numbers (separated by whitespace),
+ * where 1=A, 2=C, 3=G, 4=T. No header lines are allowed and all columns must be separated by whitespace.
+ * Check out the information at the PLINK website on the "ped" file format.
+ *
+ * The PED file is a white-space (space or tab) delimited file: the first six columns are mandatory:
+ * Family ID
+ * Individual ID
+ * Paternal ID
+ * Maternal ID
+ * Sex (1=male; 2=female; other=unknown)
+ * Phenotype
+ *
+ * The IDs are alphanumeric: the combination of family and individual ID should uniquely identify a person.
+ * A PED file must have 1 and only 1 phenotype in the sixth column. The phenotype can be either a
+ * quantitative trait or an affection status column: PLINK will automatically detect which type
+ * (i.e. based on whether a value other than 0, 1, 2 or the missing genotype code is observed).
+ * Note that the GATK actually supports arbitrary values for quantitative traits -- not just doubles --
+ * and actually represents these values as strings instead of doubles.
+ *
+ * NOTE Quantitative traits with decimal points must be coded with a period/full-stop character and
+ * not a comma, i.e. 2.394 not 2,394
+ *
+ * If an individual's sex is unknown, then any character other than 1 or 2 can be used.
+ * When new files are created (PED, FAM, or other which contain sex) then the original coding will be
+ * preserved. However, these individuals will be dropped from any analyses (i.e. phenotype set to missing also)
+ * and an error message will arise if an analysis that uses family information is requested and an
+ * individual of 'unknown' sex is specified as a father or mother.
+ *
+ *
+ * HINT You can add a comment to a PED or MAP file by starting the line with a # character. The rest of that
+ * line will be ignored. Do not start any family IDs with this character therefore.
+ *
+ * Affection status, by default, should be coded:
+ * -9 missing
+ * 0 missing
+ * 1 unaffected
+ * 2 affected
+ *
+ * If your file is coded 0/1 to represent unaffected/affected, then use the --1 flag:
+ * plink --file mydata --1 which will specify a disease phenotype coded:
+ *
+ * -9 missing
+ * 0 unaffected
+ * 1 affected
+ *
+ * The missing phenotype value for quantitative traits is, by default, -9 (this can also be used for
+ * disease traits as well as 0). In PLINK it can be reset by including the --missing-phenotype option.
+ *
+ * Genotypes (column 7 onwards) should also be white-space delimited; they can be any character
+ * (e.g. 1,2,3,4 or A,C,G,T or anything else) except 0 which is, by default, the missing genotype
+ * character. All markers should be biallelic. All SNPs (whether haploid or not) must have two
+ * alleles specified. Either both alleles should be missing (i.e. 0) or neither.
+ *
+ * No header row should be given. For example, here are two individuals typed for 3 SNPs (one row = one person):
+ *
+ * FAM001 1 0 0 1 2 A A G G A C
+ * FAM001 2 0 0 1 2 A A A G 0 0
+ * ...
+ *
+ * Note that the GATK does not support genotypes in a PED file.
+ *
+ * @author Mark DePristo
+ * @since 2011
+ */
+public class PedReader {
+ private static Logger logger = Logger.getLogger(PedReader.class);
+ final static private Set CATAGORICAL_TRAIT_VALUES = new HashSet(Arrays.asList("-9", "0", "1", "2"));
+ final static private String commentMarker = "#";
+
+ /**
+ * An enum that specifies which, if any, of the standard PED fields are
+ * missing from the input records. For example, suppose we have the full record:
+ *
+ * "fam1 kid dad mom 1 2"
+ *
+ * indicating a male affected child. This can be parsed with the -ped x.ped argument
+ * to the GATK. Suppose we only have:
+ *
+ * "fam1 kid 1"
+ *
+ * we can parse the reduced version of this record with -ped:NO_PARENTS,NO_PHENOTYPE x.ped
+ */
+ public enum MissingPedField {
+ /**
+ * The PED records do not have the first (FAMILY_ID) argument. The family id
+ * will be set to null / empty.
+ */
+ NO_FAMILY_ID,
+
+ /**
+ * The PED records do not have either the paternal or maternal IDs, so
+ * the corresponding IDs are set to null.
+ */
+ NO_PARENTS,
+
+ /**
+ * The PED records do not have the GENDER field, so the sex of each
+ * sample will be set to UNKNOWN.
+ */
+ NO_SEX,
+
+ /**
+ * The PED records do not have the PHENOTYPE field, so the phenotype
+ * of each sample will be set to UNKNOWN.
+ */
+ NO_PHENOTYPE
+ }
+
+ protected enum Field {
+ FAMILY_ID, INDIVIDUAL_ID, PATERNAL_ID, MATERNAL_ID, GENDER, PHENOTYPE
+ }
+
+ // phenotype
+ private final static String MISSING_VALUE1 = "-9";
+ private final static String MISSING_VALUE2 = "0";
+ private final static String PHENOTYPE_UNAFFECTED = "1";
+ private final static String PHENOTYPE_AFFECTED = "2";
+
+ // Sex
+ private final static String SEX_MALE = "1";
+ private final static String SEX_FEMALE = "2";
+ // other=unknown
+
+ public PedReader() { }
+
+ public final List parse(File source, EnumSet missingFields, SampleDB sampleDB) throws FileNotFoundException {
+ logger.info("Reading PED file " + source + " with missing fields: " + missingFields);
+ return parse(new FileReader(source), missingFields, sampleDB);
+ }
+
+ public final List parse(final String source, EnumSet missingFields, SampleDB sampleDB) {
+ logger.warn("Reading PED string: \"" + source + "\" with missing fields: " + missingFields);
+ return parse(new StringReader(source.replace(";", String.format("%n"))), missingFields, sampleDB);
+ }
+
+ /**
+  * Parses PED records from reader, adding one Sample per record to sampleDB plus an implicit
+  * sample for each referenced parent that sampleDB lacks. Blank and comment lines are skipped.
+  *
+  * @param reader        the PED text; surrounding whitespace on each line is ignored
+  * @param missingFields the standard PED columns that are absent from every record
+  * @param sampleDB      receives every parsed and implicit sample
+  * @return the parsed samples followed by any implicitly created parents
+  */
+ public final List<Sample> parse(Reader reader, EnumSet<MissingPedField> missingFields, SampleDB sampleDB) {
+     final List<String> lines = new XReadLines(reader).readLines();
+     // What are the record offsets?
+     final int familyPos = missingFields.contains(MissingPedField.NO_FAMILY_ID) ? -1 : 0;
+     final int samplePos = familyPos + 1;
+     final int paternalPos = missingFields.contains(MissingPedField.NO_PARENTS) ? -1 : samplePos + 1;
+     final int maternalPos = missingFields.contains(MissingPedField.NO_PARENTS) ? -1 : paternalPos + 1;
+     final int sexPos = missingFields.contains(MissingPedField.NO_SEX) ? -1 : Math.max(maternalPos, samplePos) + 1;
+     final int phenotypePos = missingFields.contains(MissingPedField.NO_PHENOTYPE) ? -1 : Math.max(sexPos, Math.max(maternalPos, samplePos)) + 1;
+     final int nExpectedFields = MathUtils.arrayMaxInt(Arrays.asList(samplePos, paternalPos, maternalPos, sexPos, phenotypePos)) + 1;
+
+     // go through once and determine properties
+     int lineNo = 0;
+     boolean isQT = false;
+     final List<String[]> splits = new ArrayList<String[]>(lines.size());
+     final List<Integer> lineNumbers = new ArrayList<Integer>(lines.size());
+     for ( final String line : lines ) {
+         lineNo++; // count every physical line, skipped or not, so reported numbers match the input
+         final String trimmed = line.trim();
+         if ( trimmed.equals("") || trimmed.startsWith(commentMarker) ) continue;
+         final String[] parts = trimmed.split("\\s+");
+         if ( parts.length != nExpectedFields )
+             throw new UserException.MalformedFile(reader.toString(), "Bad PED line " + lineNo + ": wrong number of fields");
+         if ( phenotypePos != -1 ) isQT = isQT || ! CATAGORICAL_TRAIT_VALUES.contains(parts[phenotypePos]);
+         splits.add(parts);
+         lineNumbers.add(lineNo);
+     }
+     logger.info("Phenotype is other? " + isQT);
+
+     // now go through and parse each record
+     int recordNo = 0;
+     final List<Sample> samples = new ArrayList<Sample>(splits.size());
+     for ( final String[] parts : splits ) {
+         lineNo = lineNumbers.get(recordNo++); // input line this record came from
+         String familyID = null, individualID, paternalID = null, maternalID = null;
+         Gender sex = Gender.UNKNOWN;
+         String quantitativePhenotype = Sample.UNSET_QT;
+         Affection affection = Affection.UNKNOWN;
+         if ( familyPos != -1 ) familyID = maybeMissing(parts[familyPos]);
+         individualID = parts[samplePos];
+         if ( paternalPos != -1 ) paternalID = maybeMissing(parts[paternalPos]);
+         if ( maternalPos != -1 ) maternalID = maybeMissing(parts[maternalPos]);
+
+         if ( sexPos != -1 ) {
+             if ( parts[sexPos].equals(SEX_MALE) ) sex = Gender.MALE;
+             else if ( parts[sexPos].equals(SEX_FEMALE) ) sex = Gender.FEMALE;
+             else sex = Gender.UNKNOWN;
+         }
+         if ( phenotypePos != -1 ) {
+             if ( isQT ) {
+                 if ( parts[phenotypePos].equals(MISSING_VALUE1) )
+                     affection = Affection.UNKNOWN;
+                 else {
+                     affection = Affection.OTHER;
+                     quantitativePhenotype = parts[phenotypePos];
+                 }
+             } else {
+                 if ( parts[phenotypePos].equals(MISSING_VALUE1) ) affection = Affection.UNKNOWN;
+                 else if ( parts[phenotypePos].equals(MISSING_VALUE2) ) affection = Affection.UNKNOWN;
+                 else if ( parts[phenotypePos].equals(PHENOTYPE_UNAFFECTED) ) affection = Affection.UNAFFECTED;
+                 else if ( parts[phenotypePos].equals(PHENOTYPE_AFFECTED) ) affection = Affection.AFFECTED;
+                 else throw new ReviewedStingException("Unexpected phenotype type " + parts[phenotypePos] + " at line " + lineNo);
+             }
+         }
+
+         final Sample s = new Sample(individualID, sampleDB, familyID, paternalID, maternalID, sex, affection, quantitativePhenotype);
+         samples.add(s);
+         sampleDB.addSample(s);
+     }
+
+     for ( final Sample sample : new ArrayList<Sample>(samples) ) {
+         Sample dad = maybeAddImplicitSample(sampleDB, sample.getPaternalID(), sample.getFamilyID(), Gender.MALE);
+         if ( dad != null ) samples.add(dad);
+         Sample mom = maybeAddImplicitSample(sampleDB, sample.getMaternalID(), sample.getFamilyID(), Gender.FEMALE);
+         if ( mom != null ) samples.add(mom);
+     }
+     return samples;
+ }
+
+ private final static String maybeMissing(final String string) {
+ if ( string.equals(MISSING_VALUE1) || string.equals(MISSING_VALUE2) )
+ return null;
+ else
+ return string;
+ }
+
+ private final Sample maybeAddImplicitSample(SampleDB sampleDB, final String id, final String familyID, final Gender gender) {
+ if ( id != null && sampleDB.getSample(id) == null ) {
+ Sample s = new Sample(id, sampleDB, familyID, null, null, gender, Affection.UNKNOWN, Sample.UNSET_QT);
+ sampleDB.addSample(s);
+ return s;
+ } else
+ return null;
+ }
+
+ /**
+  * Parses a list of tags from the command line, assuming it comes from the GATK Engine
+  * tags, and returns the corresponding EnumSet.
+  *
+  * @param arg the actual engine arg, used for the UserException if there's an error
+  * @param tags a list of string tags that should be converted to the MissingPedField value
+  * @return the set of MissingPedField values named by tags; empty if tags is empty
+  * @throws UserException.BadArgumentValue if a tag is not the name of a MissingPedField constant
+  */
+ public static final EnumSet<MissingPedField> parseMissingFieldTags(final Object arg, final List<String> tags) {
+     final EnumSet<MissingPedField> missingFields = EnumSet.noneOf(MissingPedField.class);
+     for ( final String tag : tags ) {
+         try {
+             missingFields.add(MissingPedField.valueOf(tag));
+         } catch ( IllegalArgumentException e ) {
+             // render the enum constants by name so the user sees the legal tags
+             throw new UserException.BadArgumentValue(arg.toString(), "Unknown tag " + tag + " allowed values are " + Arrays.toString(MissingPedField.values()));
+         }
+     }
+     return missingFields;
+ }
+}
\ No newline at end of file
diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/PedigreeValidationType.java b/public/java/src/org/broadinstitute/sting/gatk/samples/PedigreeValidationType.java
new file mode 100644
index 000000000..bbf857820
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/samples/PedigreeValidationType.java
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2011, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.samples;
+
+/**
+ * How strictly the samples seen in the data sources must be covered by the pedigree data.
+ */
+public enum PedigreeValidationType {
+ /**
+ * If a pedigree file is provided at all, require that all samples in the VCF or BAM files
+ * have a corresponding entry in the pedigree file(s).
+ */
+ STRICT,
+
+ /**
+ * Do not enforce any overlap between the VCF/BAM samples and the pedigree data.
+ */
+ SILENT
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java
new file mode 100644
index 000000000..b39fdd79d
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java
@@ -0,0 +1,222 @@
+package org.broadinstitute.sting.gatk.samples;
+
+
+import org.broadinstitute.sting.utils.exceptions.UserException;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ *
+ */
+public class Sample implements Comparable { // implements java.io.Serializable {
+ final private String familyID, paternalID, maternalID;
+ final private Gender gender;
+ final private String otherPhenotype;
+ final private Affection affection;
+ final private String ID;
+ final private SampleDB infoDB;
+ final private Map properties = new HashMap();
+
+ public final static String UNSET_QT = null;
+
+ public Sample(final String ID, final SampleDB infoDB,
+ final String familyID, final String paternalID, final String maternalID,
+ final Gender gender, final Affection affection, final String otherPhenotype) {
+ this.familyID = familyID;
+ this.paternalID = paternalID;
+ this.maternalID = maternalID;
+ this.gender = gender;
+ this.otherPhenotype = otherPhenotype;
+ this.affection = affection;
+ this.ID = ID;
+ this.infoDB = infoDB;
+ }
+
+ protected Sample(final String ID,
+ final String familyID, final String paternalID, final String maternalID,
+ final Gender gender, final Affection affection, final String otherPhenotype) {
+ this(ID, null, familyID, paternalID, maternalID, gender, affection, otherPhenotype);
+ }
+
+ protected Sample(final String ID,
+ final String familyID, final String paternalID, final String maternalID,
+ final Gender gender, final Affection affection) {
+ this(ID, null, familyID, paternalID, maternalID, gender, affection, UNSET_QT);
+ }
+
+
+ public Sample(final String ID, final SampleDB infoDB,
+ final String familyID, final String paternalID, final String maternalID, final Gender gender) {
+ this(ID, infoDB, familyID, paternalID, maternalID, gender, Affection.UNKNOWN, UNSET_QT);
+ }
+
+ public Sample(final String ID, final SampleDB infoDB, final Affection affection, final String otherPhenotype) {
+ this(ID, infoDB, null, null, null, Gender.UNKNOWN, affection, otherPhenotype);
+ }
+
+ public Sample(String id, SampleDB infoDB) {
+ this(id, infoDB, null, null, null,
+ Gender.UNKNOWN, Affection.UNKNOWN, UNSET_QT);
+ }
+
+ // -------------------------------------------------------------------------------------
+ //
+ // standard property getters
+ //
+ // -------------------------------------------------------------------------------------
+
+ public String getID() {
+ return ID;
+ }
+
+ public String getFamilyID() {
+ return familyID;
+ }
+
+ public String getPaternalID() {
+ return paternalID;
+ }
+
+ public String getMaternalID() {
+ return maternalID;
+ }
+
+ public Affection getAffection() {
+ return affection;
+ }
+
+ public boolean hasOtherPhenotype() {
+ return affection == Affection.OTHER;
+ }
+
+ public String getOtherPhenotype() {
+ return otherPhenotype;
+ }
+
+ /**
+  * Get the sample's mother
+  *
+  * @return the sample registered in this sample's database under the maternal ID, or null if
+  *         there is no such sample or this sample is not attached to a database
+  */
+ public Sample getMother() {
+     // infoDB is null for samples built through the constructors that take no SampleDB
+     return infoDB == null ? null : infoDB.getSample(maternalID);
+ }
+
+ /**
+  * Get the sample's father
+  *
+  * @return the sample registered in this sample's database under the paternal ID, or null if
+  *         there is no such sample or this sample is not attached to a database
+  */
+ public Sample getFather() {
+     // infoDB is null for samples built through the constructors that take no SampleDB
+     return infoDB == null ? null : infoDB.getSample(paternalID);
+ }
+
+ /**
+ * Get gender of the sample
+ * @return the gender this sample was constructed with
+ */
+ public Gender getGender() {
+ return gender;
+ }
+
+ @Override
+ public int compareTo(final Sample sample) {
+ return ID.compareTo(sample.getID());
+ }
+
+ @Override
+ public String toString() {
+ return String.format("Sample %s fam=%s dad=%s mom=%s gender=%s affection=%s qt=%s props=%s",
+ getID(), getFamilyID(), getPaternalID(), getMaternalID(), getGender(), getAffection(),
+ getOtherPhenotype(), properties);
+ }
+
+// // -------------------------------------------------------------------------------------
+// //
+// // code for working with additional -- non-standard -- properties
+// //
+// // -------------------------------------------------------------------------------------
+//
+// public Map getExtraProperties() {
+// return Collections.unmodifiableMap(properties);
+// }
+//
+// /**
+// * Get one property
+// * @param key key of property
+// * @return value of property as generic object
+// */
+// public Object getExtraPropertyValue(final String key) {
+// return properties.get(key);
+// }
+//
+// /**
+// *
+// * @param key property key
+// * @return true if sample has this property (even if its value is null)
+// */
+// public boolean hasExtraProperty(String key) {
+// return properties.containsKey(key);
+// }
+
+ @Override
+ public int hashCode() {
+ return ID.hashCode();
+ }
+
+ /**
+  * Two samples are equal when their IDs match and every other field (family, parents, gender,
+  * other phenotype, affection, extra properties) is equal or null on both sides.
+  */
+ @Override
+ public boolean equals(final Object o) {
+     // instanceof is false for null, so this also rejects a null argument
+     if ( !(o instanceof Sample) )
+         return false;
+
+     final Sample that = (Sample) o;
+     return ID.equals(that.ID)
+             && equalOrNull(familyID, that.familyID)
+             && equalOrNull(paternalID, that.paternalID)
+             && equalOrNull(maternalID, that.maternalID)
+             && equalOrNull(gender, that.gender)
+             && equalOrNull(otherPhenotype, that.otherPhenotype)
+             && equalOrNull(affection, that.affection)
+             && equalOrNull(properties, that.properties);
+ }
+
+ private final static boolean equalOrNull(final Object o1, final Object o2) {
+ if ( o1 == null )
+ return o2 == null;
+ else
+ return o2 == null ? false : o1.equals(o2);
+ }
+
+ private final static T mergeValues(final String name, final String field, final T o1, final T o2, final T emptyValue) {
+ if ( o1 == null || o1.equals(emptyValue) ) {
+ // take o2 if both are null, otherwise keep o2
+ return o2 == null ? null : o2;
+ } else {
+ if ( o2 == null || o2.equals(emptyValue) )
+ return o1; // keep o1, since it's a real value
+ else {
+ // both o1 and o2 have a value
+ if ( o1 == o2 )
+ return o1;
+ else
+ throw new UserException("Inconsistent values detected for " + name + " for field " + field + " value1 " + o1 + " value2 " + o2);
+ }
+ }
+ }
+
+ public final static Sample mergeSamples(final Sample prev, final Sample next) {
+ if ( prev.equals(next) )
+ return next;
+ else {
+ return new Sample(prev.getID(), prev.infoDB,
+ mergeValues(prev.getID(), "Family_ID", prev.getFamilyID(), next.getFamilyID(), null),
+ mergeValues(prev.getID(), "Paternal_ID", prev.getPaternalID(), next.getPaternalID(), null),
+ mergeValues(prev.getID(), "Material_ID", prev.getMaternalID(), next.getMaternalID(), null),
+ mergeValues(prev.getID(), "Gender", prev.getGender(), next.getGender(), Gender.UNKNOWN),
+ mergeValues(prev.getID(), "Affection", prev.getAffection(), next.getAffection(), Affection.UNKNOWN),
+ mergeValues(prev.getID(), "OtherPhenotype", prev.getOtherPhenotype(), next.getOtherPhenotype(), UNSET_QT));
+ //mergeValues(prev.getID(), "ExtraProperties", prev.getExtraProperties(), next.getExtraProperties(), Collections.emptyMap()));
+ }
+ }
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java
new file mode 100644
index 000000000..ee0873c6e
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java
@@ -0,0 +1,183 @@
+package org.broadinstitute.sting.gatk.samples;
+
+import net.sf.samtools.SAMReadGroupRecord;
+import net.sf.samtools.SAMRecord;
+import org.broadinstitute.sting.utils.exceptions.StingException;
+import org.broadinstitute.sting.utils.variantcontext.Genotype;
+
+import java.util.*;
+
+/**
+ *
+ */
+public class SampleDB {
+ /**
+ * This is where Sample objects are stored. Samples are usually accessed by their ID, which is unique, so
+ * this is stored as a HashMap.
+ */
+ private final HashMap samples = new HashMap();
+
+ /**
+ * Creates an empty sample database.
+ */
+ public SampleDB() {
+
+ }
+
+ /**
+ * Protected function to add a single sample to the database
+ *
+ * @param sample to be added
+ */
+ protected SampleDB addSample(Sample sample) {
+ Sample prev = samples.get(sample.getID());
+ if ( prev != null )
+ sample = Sample.mergeSamples(prev, sample);
+ samples.put(sample.getID(), sample);
+ return this;
+ }
+
+ // --------------------------------------------------------------------------------
+ //
+ // Functions for getting a sample from the DB
+ //
+ // --------------------------------------------------------------------------------
+
+ /**
+ * Get a sample by its ID
+ * @param id the sample ID to look up
+ * @return sample Object with this ID, or null if this does not exist
+ */
+ public Sample getSample(String id) {
+ return samples.get(id);
+ }
+
+ /**
+ *
+ * @param read
+ * @return sample Object with this ID, or null if this does not exist
+ */
+ public Sample getSample(final SAMRecord read) {
+ return getSample(read.getReadGroup());
+ }
+
+ /**
+ *
+ * @param rg
+ * @return sample Object with this ID, or null if this does not exist
+ */
+ public Sample getSample(final SAMReadGroupRecord rg) {
+ return getSample(rg.getSample());
+ }
+
+ /**
+ * @param g Genotype
+ * @return sample Object with this ID, or null if this does not exist
+ */
+ public Sample getSample(final Genotype g) {
+ return getSample(g.getSampleName());
+ }
+
+ // --------------------------------------------------------------------------------
+ //
+ // Functions for accessing samples in the DB
+ //
+ // --------------------------------------------------------------------------------
+
+ /**
+ * Get number of sample objects
+ * @return size of samples map
+ */
+ public int sampleCount() {
+ return samples.size();
+ }
+
+ public Set getSamples() {
+ return new HashSet(samples.values());
+ }
+
+ public Collection getSampleNames() {
+ return Collections.unmodifiableCollection(samples.keySet());
+ }
+
+
+ /**
+ * Takes a collection of sample names and returns their corresponding sample objects
+ * Note that, since a set is returned, if you pass in a list with duplicate names there will not be any duplicates in the returned set
+ * A name with no entry in the database contributes null to the returned set, because getSample(String) returns null for unknown IDs
+ * @param sampleNameList Collection of sample names
+ * @return Corresponding set of samples
+ */
+ public Set getSamples(Collection sampleNameList) {
+ HashSet samples = new HashSet();
+ for (String name : sampleNameList) {
+ try {
+ samples.add(getSample(name));
+ }
+ catch (Exception e) {
+ // NOTE(review): getSample(String) is a plain map lookup, so this wrapper looks unreachable -- confirm
+ throw new StingException("Could not get sample with the following ID: " + name, e);
+ }
+ }
+ return samples;
+ }
+
+ // --------------------------------------------------------------------------------
+ //
+ // Higher level pedigree functions
+ //
+ // --------------------------------------------------------------------------------
+
+ /**
+ * Returns a sorted set of the family IDs in all samples (excluding null ids)
+ * @return
+ */
+ public final Set getFamilyIDs() {
+ return getFamilies().keySet();
+ }
+
+ /**
+ * Returns a map from family ID -> set of family members for all samples with
+ * non-null family ids
+ *
+ * @return
+ */
+ public final Map> getFamilies() {
+ final Map> families = new TreeMap>();
+
+ for ( final Sample sample : samples.values() ) {
+ final String famID = sample.getFamilyID();
+ if ( famID != null ) {
+ if ( ! families.containsKey(famID) )
+ families.put(famID, new TreeSet());
+ families.get(famID).add(sample);
+ }
+ }
+
+ return families;
+ }
+
+ /**
+  * Return all samples with a given family ID
+  *
+  * @param familyId the family ID to look up, may be null
+  * @return the members of that family, or null if familyId is null or no sample has that family ID
+  */
+ public Set<Sample> getFamily(String familyId) {
+     // getFamilies() returns a TreeMap, whose get() rejects a null key with a NullPointerException
+     return familyId == null ? null : getFamilies().get(familyId);
+ }
+
+ /**
+  * Returns all children of a given sample
+  * The family is looked up through getFamily(), which rebuilds the whole family map on every call, so this is not efficient
+  *
+  * @param sample the putative parent
+  * @return the members of the sample's family whose mother or father is this sample object; empty
+  *         if the sample has no family ID or no family is found for it
+  */
+ public Set<Sample> getChildren(Sample sample) {
+     final HashSet<Sample> children = new HashSet<Sample>();
+
+     // a sample without a family ID has no family to search
+     final String familyID = sample.getFamilyID();
+     final Set<Sample> family = familyID == null ? null : getFamily(familyID);
+     if ( family == null )
+         return children;
+
+     for ( final Sample familyMember : family ) {
+         if ( familyMember.getMother() == sample || familyMember.getFather() == sample ) {
+             children.add(familyMember);
+         }
+     }
+     return children;
+ }
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java
new file mode 100644
index 000000000..44a8600b0
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2011, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.samples;
+
+import net.sf.samtools.SAMFileHeader;
+import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
+import org.broadinstitute.sting.utils.SampleUtils;
+import org.broadinstitute.sting.utils.exceptions.UserException;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.util.*;
+
+/**
+ *
+ */
+public class SampleDBBuilder {
+ PedigreeValidationType validationStrictness;
+ final SampleDB sampleDB = new SampleDB();
+ final GenomeAnalysisEngine engine;
+
+ Set samplesFromDataSources = new HashSet();
+ Set samplesFromPedigrees = new HashSet();
+
+ /** for testing only */
+ protected SampleDBBuilder(PedigreeValidationType validationStrictness) {
+ engine = null;
+ this.validationStrictness = validationStrictness;
+ }
+
+ /**
+ * Creates a builder around a new, empty SampleDB.
+ *
+ * @param engine the engine whose argument tags are consulted by getMissingFields
+ * @param validationStrictness the strictness checked by validate()
+ */
+ public SampleDBBuilder(GenomeAnalysisEngine engine, PedigreeValidationType validationStrictness) {
+ this.engine = engine;
+ this.validationStrictness = validationStrictness;
+ }
+
+ /**
+ * Hallucinates sample objects for all the samples in the SAM file and stores them
+ */
+ public SampleDBBuilder addSamplesFromSAMHeader(final SAMFileHeader header) {
+ addSamplesFromSampleNames(SampleUtils.getSAMFileSamples(header));
+ return this;
+ }
+
+ public SampleDBBuilder addSamplesFromSampleNames(final Collection sampleNames) {
+ for (final String sampleName : sampleNames) {
+ if (sampleDB.getSample(sampleName) == null) {
+ final Sample newSample = new Sample(sampleName, sampleDB);
+ sampleDB.addSample(newSample);
+ samplesFromDataSources.add(newSample); // keep track of data source samples
+ }
+ }
+ return this;
+ }
+
+ public SampleDBBuilder addSamplesFromPedigreeFiles(final List pedigreeFiles) {
+ for (final File pedFile : pedigreeFiles) {
+ Collection samples = addSamplesFromPedigreeArgument(pedFile);
+ samplesFromPedigrees.addAll(samples);
+ }
+
+ return this;
+ }
+
+ public SampleDBBuilder addSamplesFromPedigreeStrings(final List pedigreeStrings) {
+ for (final String pedString : pedigreeStrings) {
+ Collection samples = addSamplesFromPedigreeArgument(pedString);
+ samplesFromPedigrees.addAll(samples);
+ }
+
+ return this;
+ }
+
+ /**
+ * Parse one sample file and integrate it with samples that are already there
+ * Fail quickly if we find any errors in the file
+ */
+ private Collection addSamplesFromPedigreeArgument(File sampleFile) {
+ final PedReader reader = new PedReader();
+
+ try {
+ return reader.parse(sampleFile, getMissingFields(sampleFile), sampleDB);
+ } catch ( FileNotFoundException e ) {
+ throw new UserException.CouldNotReadInputFile(sampleFile, e);
+ }
+ }
+
+ private Collection addSamplesFromPedigreeArgument(final String string) {
+ final PedReader reader = new PedReader();
+ return reader.parse(string, getMissingFields(string), sampleDB);
+ }
+
+ public SampleDB getFinalSampleDB() {
+ validate();
+ return sampleDB;
+ }
+
+ public EnumSet getMissingFields(final Object engineArg) {
+ if ( engine == null )
+ return EnumSet.noneOf(PedReader.MissingPedField.class);
+ else {
+ final List posTags = engine.getTags(engineArg).getPositionalTags();
+ return PedReader.parseMissingFieldTags(engineArg, posTags);
+ }
+ }
+
+ // --------------------------------------------------------------------------------
+ //
+ // Validation
+ //
+ // --------------------------------------------------------------------------------
+
+ protected final void validate() {
+ if ( validationStrictness == PedigreeValidationType.SILENT )
+ return;
+ else {
+ // check that samples in data sources are all annotated, if anything is annotated
+ if ( ! samplesFromPedigrees.isEmpty() && ! samplesFromDataSources.isEmpty() ) {
+ final Set sampleNamesFromPedigrees = new HashSet();
+ for ( final Sample pSample : samplesFromPedigrees )
+ sampleNamesFromPedigrees.add(pSample.getID());
+
+ for ( final Sample dsSample : samplesFromDataSources )
+ if ( ! sampleNamesFromPedigrees.contains(dsSample.getID()) )
+ throw new UserException("Sample " + dsSample.getID() + " found in data sources but not in pedigree files with STRICT pedigree validation");
+ }
+ }
+ }
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java
index 10261112c..792fef9c3 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java
@@ -30,11 +30,12 @@ import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.filters.MalformedReadFilter;
+import org.broadinstitute.sting.gatk.samples.Sample;
+import org.broadinstitute.sting.gatk.samples.SampleDB;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
-import org.broadinstitute.sting.utils.help.GenericDocumentationHandler;
import java.util.List;
@@ -87,6 +88,14 @@ public abstract class Walker {
return getToolkit().getMasterSequenceDictionary();
}
+ protected SampleDB getSampleDB() {
+ return getToolkit().getSampleDB();
+ }
+
+ protected Sample getSample(final String id) {
+ return getToolkit().getSampleDB().getSample(id);
+ }
+
/**
* (conceptual static) method that states whether you want to see reads piling up at a locus
* that contain a deletion at the locus.
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java
index 87695077d..b722220f9 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java
@@ -31,6 +31,7 @@ import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgume
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.samples.Gender;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.variantrecalibration.VQSRCalibrationCurve;
import org.broadinstitute.sting.utils.GenomeLoc;
@@ -247,7 +248,7 @@ public class ProduceBeagleInputWalker extends RodWalker {
Map preferredGenotypes = preferredVC.getGenotypes();
Map otherGenotypes = goodSite(otherVC) ? otherVC.getGenotypes() : null;
for ( String sample : samples ) {
- boolean isMaleOnChrX = CHECK_IS_MALE_ON_CHR_X && getToolkit().getSampleById(sample).isMale();
+ boolean isMaleOnChrX = CHECK_IS_MALE_ON_CHR_X && getSample(sample).getGender() == Gender.MALE;
Genotype genotype;
boolean isValidation;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java
index 32875a098..1dfc6fea0 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java
@@ -227,9 +227,8 @@ public class CallableLociWalker extends LocusWalker getSamplesFromToolKit(DoCOutputType.Partition type) {
HashSet partition = new HashSet();
if ( type == DoCOutputType.Partition.sample ) {
- for ( Set sampleSet : getToolkit().getSamplesByReaders() ) {
- for ( String s : sampleSet ) {
- partition.add(s);
- }
- }
+ partition.addAll(SampleUtils.getSAMFileSamples(getToolkit()));
} else if ( type == DoCOutputType.Partition.readgroup ) {
for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) {
partition.add(rg.getSample()+"_rg_"+rg.getReadGroupId());
}
} else if ( type == DoCOutputType.Partition.library ) {
- for ( Set libraries : getToolkit().getLibrariesByReaders() ) {
- for ( String l : libraries ) {
- partition.add(l);
- }
+ for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) {
+ partition.add(rg.getLibrary());
}
} else if ( type == DoCOutputType.Partition.center ) {
for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) {
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java
index 8680f3537..36e4db1c5 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java
@@ -248,6 +248,9 @@ public class IndelRealigner extends ReadWalker {
@Argument(fullName="nWayOut", shortName="nWayOut", required=false, doc="Generate one output file for each input (-I) bam file")
protected String N_WAY_OUT = null;
+ @Hidden
+ @Argument(fullName="generate_nWayOut_md5s",doc="Generate md5sums for BAMs")
+ protected boolean generateMD5s = false;
// DEBUGGING OPTIONS FOLLOW
@@ -401,9 +404,9 @@ public class IndelRealigner extends ReadWalker {
// if ( args.containsKey("disable_bam_indexing") ) { System.out.println("NO INDEXING!!"); System.exit(1); createIndex = false; }
if ( N_WAY_OUT.toUpperCase().endsWith(".MAP") ) {
- writerToUse = new NWaySAMFileWriter(getToolkit(),loadFileNameMap(N_WAY_OUT),SAMFileHeader.SortOrder.coordinate,true, createIndex);
+ writerToUse = new NWaySAMFileWriter(getToolkit(),loadFileNameMap(N_WAY_OUT),SAMFileHeader.SortOrder.coordinate,true, createIndex, generateMD5s);
} else {
- writerToUse = new NWaySAMFileWriter(getToolkit(),N_WAY_OUT,SAMFileHeader.SortOrder.coordinate,true, createIndex);
+ writerToUse = new NWaySAMFileWriter(getToolkit(),N_WAY_OUT,SAMFileHeader.SortOrder.coordinate,true, createIndex, generateMD5s);
}
} else {
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java
index 7c436ce44..9edf5b5d4 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java
@@ -32,16 +32,11 @@ import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.Haplotype;
import org.broadinstitute.sting.utils.MathUtils;
-import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
-import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
-import java.io.File;
-import java.io.FileWriter;
-import java.io.PrintStream;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashMap;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java
index 5b10a79c6..74cbfa05f 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java
@@ -36,12 +36,8 @@ import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter;
import org.broadinstitute.sting.gatk.filters.Platform454Filter;
import org.broadinstitute.sting.gatk.filters.PlatformUnitFilter;
-import org.broadinstitute.sting.gatk.filters.PlatformUnitFilterHelper;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.SeekableRODIterator;
-import org.broadinstitute.sting.utils.codecs.refseq.Transcript;
-import org.broadinstitute.sting.utils.codecs.refseq.RefSeqCodec;
-import org.broadinstitute.sting.utils.codecs.refseq.RefSeqFeature;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder;
import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator;
@@ -51,6 +47,9 @@ import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.SampleUtils;
+import org.broadinstitute.sting.utils.codecs.refseq.RefSeqCodec;
+import org.broadinstitute.sting.utils.codecs.refseq.RefSeqFeature;
+import org.broadinstitute.sting.utils.codecs.refseq.Transcript;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.collections.CircularArray;
import org.broadinstitute.sting.utils.collections.PrimitivePair;
@@ -392,7 +391,7 @@ public class SomaticIndelDetectorWalker extends ReadWalker {
location = getToolkit().getGenomeLocParser().createGenomeLoc(getToolkit().getSAMFileHeader().getSequence(0).getSequenceName(),1);
- normalSamples = getToolkit().getSamplesByReaders().get(0);
+ normalSamples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeaders().get(0));
try {
// we already checked that bedOutput and output_file are not set simultaneously
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java
index 17a6e20f1..998cfa654 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java
@@ -30,7 +30,6 @@ import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
-import org.broadinstitute.sting.gatk.datasources.sample.Sample;
import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
@@ -1095,14 +1094,14 @@ public class ReadBackedPhasingWalker extends RodWalker {
+ public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker tracker) {
+ Sample sample = getSampleDB().getSample(read);
+ return sample.getGender() == Gender.MALE ? 1 : 0;
+ }
+
+ public Integer reduceInit() { return 0; }
+
+ public Integer reduce(Integer value, Integer sum) {
+ return value + sum;
+ }
+}
\ No newline at end of file
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java
index 72058ba7b..e83434037 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java
@@ -130,6 +130,10 @@ public class CountVariants extends VariantEvaluator implements StandardEval {
nVariantLoci++;
nMixed++;
break;
+ case SYMBOLIC:
+ // ignore symbolic alleles, but don't fail
+ // todo - consistent way of treating symbolic alleles throughout codebase?
+ break;
default:
throw new ReviewedStingException("Unexpected VariantContext type " + vc1.getType());
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java
index c44d84136..81d0c36ac 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java
@@ -133,7 +133,7 @@ public class VariantsToTable extends RodWalker {
/**
* By default, this tool throws a UserException when it encounters a field without a value in some record. This
- * is generally useful when you mistype -F CHRMO, so that you get a friendly warning about CHRMO not being
+ * is generally useful when you mistype -F CHROM as CHRMO, so that you get a friendly warning about CHRMO not being
* found before the tool runs through 40M 1000G records. However, in some cases you genuinely want to allow such
* fields (e.g., AC not being calculated for filtered records, if included). When provided, this argument
* will cause VariantsToTable to write out NA values for missing fields instead of throwing an error.
diff --git a/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java b/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java
index 8da118174..e62a7e512 100755
--- a/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java
+++ b/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java
@@ -1,13 +1,10 @@
package org.broadinstitute.sting.utils;
-import org.apache.commons.lang.ArrayUtils;
-import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
-import org.broadinstitute.sting.gatk.datasources.sample.Sample;
+import org.broadinstitute.sting.gatk.samples.Sample;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
-import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.regex.Matcher;
@@ -19,9 +16,6 @@ import java.util.regex.Pattern;
* Time: 12:38 PM
*/
public class MendelianViolation {
-
-
-
String sampleMom;
String sampleDad;
String sampleChild;
@@ -34,22 +28,15 @@ public class MendelianViolation {
private static Pattern FAMILY_PATTERN = Pattern.compile("(.*)\\+(.*)=(.*)");
- static final int[] mvOffsets = new int[] { 1,2,5,6,8,11,15,18,20,21,24,25 };
- static final int[] nonMVOffsets = new int[]{ 0,3,4,7,9,10,12,13,14,16,17,19,22,23,26 };
-
-
public String getSampleMom() {
return sampleMom;
}
-
public String getSampleDad() {
return sampleDad;
}
-
public String getSampleChild() {
return sampleChild;
}
-
public double getMinGenotypeQuality() {
return minGenotypeQuality;
}
@@ -90,37 +77,12 @@ public class MendelianViolation {
* @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to asses mendelian violation
*/
public MendelianViolation(Sample sample, double minGenotypeQualityP) {
- sampleMom = sample.getMother().getId();
- sampleDad = sample.getFather().getId();
- sampleChild = sample.getId();
+ sampleMom = sample.getMother().getID();
+ sampleDad = sample.getFather().getID();
+ sampleChild = sample.getID();
minGenotypeQuality = minGenotypeQualityP;
}
-
- /**
- * The most common constructor to be used when give a YAML file with the relationships to the engine with the -SM option.
- * @param engine - The GATK engine, use getToolkit(). That's where the sample information is stored.
- * @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to asses mendelian violation
- */
- public MendelianViolation(GenomeAnalysisEngine engine, double minGenotypeQualityP) {
- boolean gotSampleInformation = false;
- Collection samples = engine.getSamples();
- // Iterate through all samples in the sample_metadata file but we really can only take one.
- for (Sample sample : samples) {
- if (sample.getMother() != null && sample.getFather() != null) {
- sampleMom = sample.getMother().getId();
- sampleDad = sample.getFather().getId();
- sampleChild = sample.getId();
- minGenotypeQuality = minGenotypeQualityP;
- gotSampleInformation = true;
- break; // we can only deal with one trio information
- }
- }
- if (!gotSampleInformation)
- throw new UserException("YAML file has no sample with relationship information (mother/father)");
- }
-
-
/**
* This method prepares the object to evaluate for violation. Typically you won't call it directly, a call to
* isViolation(vc) will take care of this. But if you want to know whether your site was a valid comparison site
@@ -158,7 +120,7 @@ public class MendelianViolation {
* @return False if we can't determine (lack of information), or it's not a violation. True if it is a violation.
*
*/
- public boolean isViolation (VariantContext vc)
+ public boolean isViolation(VariantContext vc)
{
return setAlleles(vc) && isViolation();
}
@@ -172,42 +134,4 @@ public class MendelianViolation {
return false;
return true;
}
-
- /**
- * @return the likelihood ratio for a mendelian violation
- */
- public double violationLikelihoodRatio(VariantContext vc) {
- double[] logLikAssignments = new double[27];
- // the matrix to set up is
- // MOM DAD CHILD
- // |- AA
- // AA AA | AB
- // |- BB
- // |- AA
- // AA AB | AB
- // |- BB
- // etc. The leaves are counted as 0-11 for MVs and 0-14 for non-MVs
- double[] momGL = vc.getGenotype(sampleMom).getLikelihoods().getAsVector();
- double[] dadGL = vc.getGenotype(sampleDad).getLikelihoods().getAsVector();
- double[] childGL = vc.getGenotype(sampleChild).getLikelihoods().getAsVector();
- int offset = 0;
- for ( int oMom = 0; oMom < 3; oMom++ ) {
- for ( int oDad = 0; oDad < 3; oDad++ ) {
- for ( int oChild = 0; oChild < 3; oChild ++ ) {
- logLikAssignments[offset++] = momGL[oMom] + dadGL[oDad] + childGL[oChild];
- }
- }
- }
- double[] mvLiks = new double[12];
- double[] nonMVLiks = new double[15];
- for ( int i = 0; i < 12; i ++ ) {
- mvLiks[i] = logLikAssignments[mvOffsets[i]];
- }
-
- for ( int i = 0; i < 15; i++) {
- nonMVLiks[i] = logLikAssignments[nonMVOffsets[i]];
- }
-
- return MathUtils.log10sumLog10(mvLiks) - MathUtils.log10sumLog10(nonMVLiks);
- }
}
diff --git a/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java b/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java
index 1b4703e4a..edc1413ba 100755
--- a/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java
@@ -69,6 +69,18 @@ public class SampleUtils {
return samples;
}
+
+ /**
+ * Same as {@link #getSAMFileSamples} but gets all of the samples
+ * in the SAM files loaded by the engine
+ *
+ * @param engine the engine whose SAM file header is read
+ * @return the samples obtained from that header
+ */
+ public final static Set getSAMFileSamples(GenomeAnalysisEngine engine) {
+ return SampleUtils.getSAMFileSamples(engine.getSAMFileHeader());
+ }
+
/**
* Gets all of the unique sample names from all VCF rods input by the user
*
diff --git a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java
index 11a59de10..0106442e0 100644
--- a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java
+++ b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java
@@ -4,7 +4,6 @@ import com.google.java.contract.Requires;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMRecord;
-import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.sam.ReadUtils;
diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java
index 70f7387f4..77f1ed6c0 100755
--- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java
+++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java
@@ -215,6 +215,10 @@ public class UserException extends ReviewedStingException {
super(String.format("File %s is malformed: %s caused by %s", f.getAbsolutePath(), message, e.getMessage()));
}
+ public MalformedFile(String name, String message) {
+ super(String.format("File associated with name %s is malformed: %s", name, message));
+ }
+
public MalformedFile(String name, String message, Exception e) {
super(String.format("File associated with name %s is malformed: %s caused by %s", name, message, e.getMessage()));
}
diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java
index 3821c9c8a..b3f2bc6b0 100644
--- a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java
+++ b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java
@@ -26,11 +26,9 @@ package org.broadinstitute.sting.utils.pileup;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
-import org.broadinstitute.sting.gatk.datasources.sample.Sample;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
-import org.broadinstitute.sting.utils.exceptions.StingException;
import java.util.*;
@@ -114,10 +112,10 @@ public abstract class AbstractReadBackedPileup> pileupsBySample) {
+ protected AbstractReadBackedPileup(GenomeLoc loc, Map> pileupsBySample) {
this.loc = loc;
PerSamplePileupElementTracker tracker = new PerSamplePileupElementTracker();
- for(Map.Entry> pileupEntry: pileupsBySample.entrySet()) {
+ for(Map.Entry> pileupEntry: pileupsBySample.entrySet()) {
tracker.addElements(pileupEntry.getKey(),pileupEntry.getValue().pileupElementTracker);
addPileupToCumulativeStats(pileupEntry.getValue());
}
@@ -213,7 +211,7 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker;
PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker();
- for(Sample sample: tracker.getSamples()) {
+ for(final String sample: tracker.getSamples()) {
PileupElementTracker perSampleElements = tracker.getElements(sample);
AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getPileupWithoutDeletions();
filteredTracker.addElements(sample,pileup.pileupElementTracker);
@@ -251,7 +249,7 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker;
PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker();
- for(Sample sample: tracker.getSamples()) {
+ for(final String sample: tracker.getSamples()) {
PileupElementTracker perSampleElements = tracker.getElements(sample);
AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getOverlappingFragmentFilteredPileup();
filteredTracker.addElements(sample,pileup.pileupElementTracker);
@@ -305,7 +303,7 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker;
PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker();
- for(Sample sample: tracker.getSamples()) {
+ for(final String sample: tracker.getSamples()) {
PileupElementTracker perSampleElements = tracker.getElements(sample);
AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getPileupWithoutMappingQualityZeroReads();
filteredTracker.addElements(sample,pileup.pileupElementTracker);
@@ -334,7 +332,7 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker;
PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker();
- for(Sample sample: tracker.getSamples()) {
+ for(final String sample: tracker.getSamples()) {
PileupElementTracker perSampleElements = tracker.getElements(sample);
AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getPositiveStrandPileup();
filteredTracker.addElements(sample,pileup.pileupElementTracker);
@@ -363,7 +361,7 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker;
PerSamplePileupElementTracker