From 4d31673cc5980f8ffdd78c78e5f312514dcc9d3b Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 29 Sep 2011 09:43:31 -0400 Subject: [PATCH 01/63] No longer supporting YAML file allows us to delete 75% of the sample's codebase --- .../sample/PropertyDefinition.java | 30 --------- .../gatk/datasources/sample/SampleAlias.java | 31 --------- .../datasources/sample/SampleFileParser.java | 65 ------------------- .../gatk/datasources/sample/SampleParser.java | 43 ------------ 4 files changed, 169 deletions(-) delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/datasources/sample/PropertyDefinition.java delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleAlias.java delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleFileParser.java delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleParser.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/PropertyDefinition.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/PropertyDefinition.java deleted file mode 100644 index 433e0af40..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/PropertyDefinition.java +++ /dev/null @@ -1,30 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.sample; - -/** - * Created by IntelliJ IDEA. 
- * User: brett - * Date: Aug 12, 2010 - * Time: 2:09:16 PM - */ -public class PropertyDefinition { - - String property; - - String[] values; - - public String getProperty() { - return property; - } - - public void setProperty(String property) { - this.property = property; - } - - public String[] getValues() { - return values; - } - - public void setValues(String[] values) { - this.values = values; - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleAlias.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleAlias.java deleted file mode 100644 index ce749cb83..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleAlias.java +++ /dev/null @@ -1,31 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.sample; - -/** - * Created by IntelliJ IDEA. - * User: brett - * Date: Aug 13, 2010 - * Time: 5:13:46 PM - */ -public class SampleAlias { - - String mainId; - - String[] otherIds; - - public String getMainId() { - return mainId; - } - - public void setMainId(String mainId) { - this.mainId = mainId; - } - - public String[] getOtherIds() { - return otherIds; - } - - public void setOtherIds(String[] otherIds) { - this.otherIds = otherIds; - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleFileParser.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleFileParser.java deleted file mode 100644 index a362af663..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleFileParser.java +++ /dev/null @@ -1,65 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.sample; - -/** - * Created by IntelliJ IDEA. 
- * User: brett - * Date: Aug 12, 2010 - * Time: 1:30:44 PM - */ -public class SampleFileParser { - - private SampleAlias[] sampleAliases; - - private String[] allowedProperties; - - private String[] allowedRelationships; - - private PropertyDefinition[] propertyDefinitions; - - private SampleParser[] samples; - - public PropertyDefinition[] getPropertyDefinitions() { - return propertyDefinitions; - } - - public void setPropertyDefinitions(PropertyDefinition[] propertyDefinitions) { - this.propertyDefinitions = propertyDefinitions; - } - - public SampleFileParser() { - - } - - public String[] getAllowedProperties() { - return allowedProperties; - } - - public void setAllowedProperties(String[] allowedProperties) { - this.allowedProperties = allowedProperties; - } - - public SampleParser[] getSamples() { - return samples; - } - - public void setSamples(SampleParser[] samples) { - this.samples = samples; - } - - public String[] getAllowedRelationships() { - return allowedRelationships; - } - - public void setAllowedRelationships(String[] allowedRelationships) { - this.allowedRelationships = allowedRelationships; - } - - public SampleAlias[] getSampleAliases() { - return sampleAliases; - } - - public void setSampleAliases(SampleAlias[] sampleAliases) { - this.sampleAliases = sampleAliases; - } - -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleParser.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleParser.java deleted file mode 100644 index f5e07ca29..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleParser.java +++ /dev/null @@ -1,43 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.sample; - -import java.util.HashMap; - -/** - * Created by IntelliJ IDEA. 
- * User: brett - * Date: Aug 13, 2010 - * Time: 2:09:43 PM - */ -public class SampleParser { - - private String id; - - private HashMap properties; - - private HashMap relationships; - - public String getId() { - return id; - } - - public void setId(String id) { - this.id = id; - } - - public HashMap getProperties() { - return properties; - } - - public void setProperties(HashMap properties) { - this.properties = properties; - } - - public HashMap getRelationships() { - return relationships; - } - - public void setRelationships(HashMap relationships) { - this.relationships = relationships; - } - -} From e197dcd1f3bf87d8d55831215fa19ff700c383b2 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 29 Sep 2011 09:44:18 -0400 Subject: [PATCH 02/63] Pre-cleanup commit of Sample and SampleDataSource -- SampleDataSource has all reader functionality disabled --- .../sting/gatk/datasources/sample/Sample.java | 52 +- .../datasources/sample/SampleDataSource.java | 519 +++++++++--------- 2 files changed, 290 insertions(+), 281 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/Sample.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/Sample.java index ca8756684..db53d1236 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/Sample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/Sample.java @@ -14,6 +14,13 @@ import java.util.Map; * Time: 3:31:38 PM */ public class Sample implements java.io.Serializable { + private final static String MOTHER = "mother"; + private final static String FATHER = "father"; + private final static String GENDER = "gender"; + private final static String POPULATION = "population"; + private final static String FAMILY = "familyId"; + private final static String AFFECTION = "affection"; + private final static String QUANT_TRAIT = "quantTrait"; private final String id; @@ -31,6 +38,18 @@ public class Sample implements java.io.Serializable { 
UNKNOWN } + public enum Affection { + /** Status is unknown */ + UNKNOWN, + /** Suffers from the disease */ + AFFECTED, + /** Unaffected by the disease */ + UNAFFECTED, + /** A quantitative trait: value of the trait is stored elsewhere */ + QUANTITATIVE + } + public final static double UNSET_QUANTITIATIVE_TRAIT_VALUE = Double.NaN; + public Sample(String id) { /* if (id == null) { throw new StingException("Error creating sample: sample ID cannot be null"); @@ -46,30 +65,21 @@ public class Sample implements java.io.Serializable { return properties; } - public void setProperties(Map properties) { - this.properties = (HashMap) properties; - } - - public Map getRelationships() { - return Collections.unmodifiableMap(this.relationships); - } - + @Deprecated public void setSampleFileEntry(boolean value) { this.hasSampleFileEntry = value; } + @Deprecated public boolean hasSAMFileEntry() { return this.hasSAMFileEntry; } + @Deprecated public void setSAMFileEntry(boolean value) { this.hasSAMFileEntry = value; } - public boolean hasSampleFileEntry() { - return this.hasSampleFileEntry; - } - /** * Get one property * @param key key of property @@ -91,11 +101,11 @@ public class Sample implements java.io.Serializable { throw new StingException("The same key cannot exist as a property and a relationship"); } - if (key.equals("gender") && value.getClass() != Gender.class) { + if (key.equals(GENDER) && value.getClass() != Gender.class) { throw new StingException("'gender' property must be of type Sample.Gender"); } - if (key.equals("population") && value.getClass() != String.class) { + if (key.equals(POPULATION) && value.getClass() != String.class) { throw new StingException("'population' property must be of type String"); } @@ -129,7 +139,7 @@ public class Sample implements java.io.Serializable { * @return sample object with relationship mother, if exists, or null */ public Sample getMother() { - return getRelationship("mother"); + return getRelationship(MOTHER); } /** @@ -137,7 
+147,7 @@ public class Sample implements java.io.Serializable { * @return sample object with relationship father, if exists, or null */ public Sample getFather() { - return getRelationship("father"); + return getRelationship(FATHER); } /** @@ -145,29 +155,29 @@ public class Sample implements java.io.Serializable { * @return property of key "gender" - must be of type Gender */ public Gender getGender() { - return (Gender) properties.get("gender"); + return (Gender) properties.get(GENDER); } public String getPopulation() { - return (String) properties.get("population"); + return (String) properties.get(POPULATION); } public String getFamilyId() { - return (String) properties.get("familyId"); + return (String) properties.get(FAMILY); } /** * @return True if sample is male, false if female, unknown, or null */ public boolean isMale() { - return properties.get("gender") == Gender.MALE; + return properties.get(GENDER) == Gender.MALE; } /** * @return True if sample is female, false if male, unknown or null */ public boolean isFemale() { - return properties.get("gender") == Gender.MALE; + return properties.get(GENDER) == Gender.MALE; } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSource.java index 067bf3f72..5b2c06061 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSource.java @@ -104,266 +104,265 @@ public class SampleDataSource { * Parse one sample file and integrate it with samples that are already there * Fail quickly if we find any errors in the file */ - public void addFile(File sampleFile) { - - BufferedReader reader; - try { - reader = new BufferedReader(new FileReader(sampleFile)); - } - catch (IOException e) { - throw new StingException("Could not open sample file " + sampleFile.getAbsolutePath(), 
e); - } - - // set up YAML reader - a "Constructor" creates java object from YAML and "Loader" loads the file - Constructor con = new Constructor(SampleFileParser.class); - TypeDescription desc = new TypeDescription(SampleFileParser.class); - desc.putListPropertyType("propertyDefinitions", PropertyDefinition.class); - desc.putListPropertyType("sampleAliases", SampleAlias.class); - con.addTypeDescription(desc); - Yaml yaml = new Yaml(con); - - // SampleFileParser stores an object representation of a sample file - this is what we'll parse - SampleFileParser parser; - try { - parser = (SampleFileParser) yaml.load(reader); - } - catch (Exception e) { - throw new StingException("There was a syntactic error with the YAML in sample file " + sampleFile.getAbsolutePath(), e); - } - - // check to see which validation options were built into the file - boolean restrictProperties = parser.getAllowedProperties() != null; - boolean restrictRelationships = parser.getAllowedRelationships() != null; - boolean restrictPropertyValues = parser.getPropertyDefinitions() != null; - - // propertyValues stores the values that are allowed for a given property - HashMap propertyValues = null; - if (restrictPropertyValues) { - propertyValues = new HashMap(); - for (PropertyDefinition def : parser.getPropertyDefinitions()) { - HashSet set = new HashSet(); - for (String value : def.getValues()) { - set.add(value); - } - propertyValues.put(def.getProperty(), set); - } - } - - // make sure the aliases are valid - validateAliases(parser); - - // loop through each sample in the file - a SampleParser stores an object that will become a Sample - for (SampleParser sampleParser : parser.getSamples()) { - - try { - // step 1: add the sample if it doesn't already exist - Sample sample = getSampleById(sampleParser.getId()); - if (sample == null) { - sample = new Sample(sampleParser.getId()); - } - addSample(sample); - sample.setSampleFileEntry(true); - - // step 2: add the properties - if 
(sampleParser.getProperties() != null) { - for (String property : sampleParser.getProperties().keySet()) { - - // check that property is allowed - if (restrictProperties) { - if (!isPropertyValid(property, parser.getAllowedProperties())) { - throw new StingException(property + " is an invalid property. It is not included in the list " + - "of allowed properties."); - } - } - - // next check that the value is allowed - if (restrictPropertyValues) { - if (!isValueAllowed(property, sampleParser.getProperties().get(property), propertyValues)) { - throw new StingException("The value of property '" + property + "' is invalid. " + - "It is not included in the list of allowed values for this property."); - } - } - - // next check that there isn't already a conflicting property there - if (sample.getProperty(property) != null && - sample.getProperty(property) != sampleParser.getProperties().get(property)) - { - throw new StingException(property + " is a conflicting property!"); - } - - // checks are passed - now add the property! 
- saveProperty(sample, property, sampleParser.getProperties().get(property)); - } - } - - // step 3: add the relationships - if (sampleParser.getRelationships() != null) { - for (String relationship : sampleParser.getRelationships().keySet()) { - String relativeId = sampleParser.getRelationships().get(relationship); - if (relativeId == null) { - throw new StingException("The relationship cannot be null"); - } - - // first check that it's not invalid - if (restrictRelationships) { - if (!isRelationshipValid(relationship, parser.getAllowedRelationships())) { - throw new StingException(relationship + " is an invalid relationship"); - } - } - - // next check that there isn't already a conflicting property there - if (sample.getRelationship(relationship) != null) { - if (sample.getRelationship(relationship).getId() != sampleParser.getProperties().get(relationship)) { - throw new StingException(relationship + " is a conflicting relationship!"); - } - // if the relationship is already set - and consistent with what we're reading now - no need to continue - else { - continue; - } - } - - // checks are passed - now save the relationship - saveRelationship(sample, relationship, relativeId); - } - } - } catch (Exception e) { - throw new StingException("An error occurred while loading this sample from the sample file: " + - sampleParser.getId(), e); - } - } - - } - - private boolean isValueAllowed(String key, Object value, HashMap valuesList) { - - // if the property values weren't specified for this property, then any value is okay - if (!valuesList.containsKey(key)) { - return true; - } - - // if this property has enumerated values, it must be a string - else if (value.getClass() != String.class) - return false; - - // is the value specified or not? 
- else if (!valuesList.get(key).contains(value)) - return false; - - return true; - } - - /** - * Makes sure that the aliases are valid - * Checks that 1) no string is used as both a main ID and an alias; - * 2) no alias is used more than once - * @param parser - */ - private void validateAliases(SampleFileParser parser) { - - // no aliases sure validate - if (parser.getSampleAliases() == null) - return; - - HashSet mainIds = new HashSet(); - HashSet otherIds = new HashSet(); - - for (SampleAlias sampleAlias : parser.getSampleAliases()) { - mainIds.add(sampleAlias.getMainId()); - for (String otherId : sampleAlias.getOtherIds()) { - if (mainIds.contains(otherId)) - throw new StingException(String.format("The aliases in your sample file are invalid - the alias %s cannot " + - "be both a main ID and an other ID", otherId)); - - if (!otherIds.add(otherId)) - throw new StingException(String.format("The aliases in your sample file are invalid - %s is listed as an " + - "alias more than once.", otherId)); - } - } - } - - private boolean isPropertyValid(String property, String[] allowedProperties) { - - // is it a special property that is always allowed? - for (String allowedProperty : specialProperties) { - if (property.equals(allowedProperty)) - return true; - } - - // is it in the allowed properties list? - for (String allowedProperty : allowedProperties) { - if (property.equals(allowedProperty)) - return true; - } - - return false; - } - - private boolean isRelationshipValid(String relationship, String[] allowedRelationships) { - - // is it a special relationship that is always allowed? - for (String allowedRelationship : specialRelationships) { - if (relationship.equals(allowedRelationship)) - return true; - } - - // is it in the allowed properties list? 
- for (String allowedRelationship : allowedRelationships) { - if (relationship.equals(allowedRelationship)) - return true; - } - - return false; - } - - /** - * Saves a property as the correct type - * @param key property key - * @param value property value, as read from YAML parser - * @return property value to be stored - */ - private void saveProperty(Sample sample, String key, Object value) { - - // convert gender to the right type, if it was stored as a String - if (key.equals("gender")) { - if (((String) value).toLowerCase().equals("male")) { - value = Sample.Gender.MALE; - } - else if (((String) value).toLowerCase().equals("female")) { - value = Sample.Gender.FEMALE; - } - else if (((String) value).toLowerCase().equals("unknown")) { - value = Sample.Gender.UNKNOWN; - } - else if (value != null) { - throw new StingException("'gender' property must be male, female, or unknown."); - } - } - try { - sample.setProperty(key, value); - } - catch (Exception e) { - throw new StingException("Could not save property " + key, e); - } - } - - /** - * Saves a relationship as the correct type - * @param key relationship key - * @param relativeId sample ID string of the relative - * @return relationship value to be stored - */ - private void saveRelationship(Sample sample, String key, String relativeId) { - - // get the reference that we'll store as the value - Sample relative = getSampleById(relativeId); - - // create sample object for the relative, if necessary - if (relative == null) { - relative = new Sample(relativeId); - addSample(relative); - } - sample.setRelationship(key, relative); - } + public void addFile(File sampleFile) {} +// +// BufferedReader reader; +// try { +// reader = new BufferedReader(new FileReader(sampleFile)); +// } +// catch (IOException e) { +// throw new StingException("Could not open sample file " + sampleFile.getAbsolutePath(), e); +// } +// +// // set up YAML reader - a "Constructor" creates java object from YAML and "Loader" loads the file 
+// Constructor con = new Constructor(SampleFileParser.class); +// TypeDescription desc = new TypeDescription(SampleFileParser.class); +// desc.putListPropertyType("propertyDefinitions", PropertyDefinition.class); +// desc.putListPropertyType("sampleAliases", SampleAlias.class); +// con.addTypeDescription(desc); +// Yaml yaml = new Yaml(con); +// +// // SampleFileParser stores an object representation of a sample file - this is what we'll parse +// SampleFileParser parser; +// try { +// parser = (SampleFileParser) yaml.load(reader); +// } +// catch (Exception e) { +// throw new StingException("There was a syntactic error with the YAML in sample file " + sampleFile.getAbsolutePath(), e); +// } +// +// // check to see which validation options were built into the file +// boolean restrictProperties = parser.getAllowedProperties() != null; +// boolean restrictRelationships = parser.getAllowedRelationships() != null; +// boolean restrictPropertyValues = parser.getPropertyDefinitions() != null; +// +// // propertyValues stores the values that are allowed for a given property +// HashMap propertyValues = null; +// if (restrictPropertyValues) { +// propertyValues = new HashMap(); +// for (PropertyDefinition def : parser.getPropertyDefinitions()) { +// HashSet set = new HashSet(); +// for (String value : def.getValues()) { +// set.add(value); +// } +// propertyValues.put(def.getProperty(), set); +// } +// } +// +// // make sure the aliases are valid +// validateAliases(parser); +// +// // loop through each sample in the file - a SampleParser stores an object that will become a Sample +// for (SampleParser sampleParser : parser.getSamples()) { +// +// try { +// // step 1: add the sample if it doesn't already exist +// Sample sample = getSampleById(sampleParser.getId()); +// if (sample == null) { +// sample = new Sample(sampleParser.getId()); +// } +// addSample(sample); +// sample.setSampleFileEntry(true); +// +// // step 2: add the properties +// if 
(sampleParser.getProperties() != null) { +// for (String property : sampleParser.getProperties().keySet()) { +// +// // check that property is allowed +// if (restrictProperties) { +// if (!isPropertyValid(property, parser.getAllowedProperties())) { +// throw new StingException(property + " is an invalid property. It is not included in the list " + +// "of allowed properties."); +// } +// } +// +// // next check that the value is allowed +// if (restrictPropertyValues) { +// if (!isValueAllowed(property, sampleParser.getProperties().get(property), propertyValues)) { +// throw new StingException("The value of property '" + property + "' is invalid. " + +// "It is not included in the list of allowed values for this property."); +// } +// } +// +// // next check that there isn't already a conflicting property there +// if (sample.getProperty(property) != null && +// sample.getProperty(property) != sampleParser.getProperties().get(property)) +// { +// throw new StingException(property + " is a conflicting property!"); +// } +// +// // checks are passed - now add the property! 
+// saveProperty(sample, property, sampleParser.getProperties().get(property)); +// } +// } +// +// // step 3: add the relationships +// if (sampleParser.getRelationships() != null) { +// for (String relationship : sampleParser.getRelationships().keySet()) { +// String relativeId = sampleParser.getRelationships().get(relationship); +// if (relativeId == null) { +// throw new StingException("The relationship cannot be null"); +// } +// +// // first check that it's not invalid +// if (restrictRelationships) { +// if (!isRelationshipValid(relationship, parser.getAllowedRelationships())) { +// throw new StingException(relationship + " is an invalid relationship"); +// } +// } +// +// // next check that there isn't already a conflicting property there +// if (sample.getRelationship(relationship) != null) { +// if (sample.getRelationship(relationship).getId() != sampleParser.getProperties().get(relationship)) { +// throw new StingException(relationship + " is a conflicting relationship!"); +// } +// // if the relationship is already set - and consistent with what we're reading now - no need to continue +// else { +// continue; +// } +// } +// +// // checks are passed - now save the relationship +// saveRelationship(sample, relationship, relativeId); +// } +// } +// } catch (Exception e) { +// throw new StingException("An error occurred while loading this sample from the sample file: " + +// sampleParser.getId(), e); +// } +// } +// } +// +// private boolean isValueAllowed(String key, Object value, HashMap valuesList) { +// +// // if the property values weren't specified for this property, then any value is okay +// if (!valuesList.containsKey(key)) { +// return true; +// } +// +// // if this property has enumerated values, it must be a string +// else if (value.getClass() != String.class) +// return false; +// +// // is the value specified or not? 
+// else if (!valuesList.get(key).contains(value)) +// return false; +// +// return true; +// } +// +// /** +// * Makes sure that the aliases are valid +// * Checks that 1) no string is used as both a main ID and an alias; +// * 2) no alias is used more than once +// * @param parser +// */ +// private void validateAliases(SampleFileParser parser) { +// +// // no aliases sure validate +// if (parser.getSampleAliases() == null) +// return; +// +// HashSet mainIds = new HashSet(); +// HashSet otherIds = new HashSet(); +// +// for (SampleAlias sampleAlias : parser.getSampleAliases()) { +// mainIds.add(sampleAlias.getMainId()); +// for (String otherId : sampleAlias.getOtherIds()) { +// if (mainIds.contains(otherId)) +// throw new StingException(String.format("The aliases in your sample file are invalid - the alias %s cannot " + +// "be both a main ID and an other ID", otherId)); +// +// if (!otherIds.add(otherId)) +// throw new StingException(String.format("The aliases in your sample file are invalid - %s is listed as an " + +// "alias more than once.", otherId)); +// } +// } +// } +// +// private boolean isPropertyValid(String property, String[] allowedProperties) { +// +// // is it a special property that is always allowed? +// for (String allowedProperty : specialProperties) { +// if (property.equals(allowedProperty)) +// return true; +// } +// +// // is it in the allowed properties list? +// for (String allowedProperty : allowedProperties) { +// if (property.equals(allowedProperty)) +// return true; +// } +// +// return false; +// } +// +// private boolean isRelationshipValid(String relationship, String[] allowedRelationships) { +// +// // is it a special relationship that is always allowed? +// for (String allowedRelationship : specialRelationships) { +// if (relationship.equals(allowedRelationship)) +// return true; +// } +// +// // is it in the allowed properties list? 
+// for (String allowedRelationship : allowedRelationships) { +// if (relationship.equals(allowedRelationship)) +// return true; +// } +// +// return false; +// } +// +// /** +// * Saves a property as the correct type +// * @param key property key +// * @param value property value, as read from YAML parser +// * @return property value to be stored +// */ +// private void saveProperty(Sample sample, String key, Object value) { +// +// // convert gender to the right type, if it was stored as a String +// if (key.equals("gender")) { +// if (((String) value).toLowerCase().equals("male")) { +// value = Sample.Gender.MALE; +// } +// else if (((String) value).toLowerCase().equals("female")) { +// value = Sample.Gender.FEMALE; +// } +// else if (((String) value).toLowerCase().equals("unknown")) { +// value = Sample.Gender.UNKNOWN; +// } +// else if (value != null) { +// throw new StingException("'gender' property must be male, female, or unknown."); +// } +// } +// try { +// sample.setProperty(key, value); +// } +// catch (Exception e) { +// throw new StingException("Could not save property " + key, e); +// } +// } +// +// /** +// * Saves a relationship as the correct type +// * @param key relationship key +// * @param relativeId sample ID string of the relative +// * @return relationship value to be stored +// */ +// private void saveRelationship(Sample sample, String key, String relativeId) { +// +// // get the reference that we'll store as the value +// Sample relative = getSampleById(relativeId); +// +// // create sample object for the relative, if necessary +// if (relative == null) { +// relative = new Sample(relativeId); +// addSample(relative); +// } +// sample.setRelationship(key, relative); +// } From e76f3816289954bb2d5d2b31bb4ddb9dbce1ead5 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 29 Sep 2011 09:57:15 -0400 Subject: [PATCH 03/63] Moved sample package from DataSources to gatk, and renamed it samples -- All associated changes to the codebase are 
just header updates --- .../sting/gatk/GenomeAnalysisEngine.java | 5 +- .../gatk/contexts/AlignmentContextUtils.java | 2 +- .../sting/gatk/executive/WindowMaker.java | 2 +- .../gatk/iterators/LocusIteratorByState.java | 4 +- .../sample => samples}/Sample.java | 3 +- .../sample => samples}/SampleDataSource.java | 8 +- .../phasing/ReadBackedPhasingWalker.java | 2 +- .../qc/CountLociByPopulationWalker.java | 59 ++++ .../gatk/walkers/qc/CountMalesWalker.java | 28 ++ .../sting/utils/MendelianViolation.java | 4 +- .../sting/utils/ped/PedReader.java | 254 ++++++++++++++++++ .../pileup/AbstractReadBackedPileup.java | 2 +- .../pileup/MergingPileupElementIterator.java | 2 +- .../utils/pileup/PileupElementTracker.java | 2 +- .../pileup/ReadBackedExtendedEventPileup.java | 2 +- .../ReadBackedExtendedEventPileupImpl.java | 2 +- .../sting/utils/pileup/ReadBackedPileup.java | 2 +- .../utils/pileup/ReadBackedPileupImpl.java | 2 +- .../providers/LocusViewTemplate.java | 3 +- .../reads/DownsamplerBenchmark.java | 5 +- .../LocusIteratorByStateUnitTest.java | 3 +- .../SampleDataSourceUnitTest.java | 2 +- .../sample => samples}/SampleUnitTest.java | 2 +- .../pileup/ReadBackedPileupUnitTest.java | 2 +- 24 files changed, 364 insertions(+), 38 deletions(-) rename public/java/src/org/broadinstitute/sting/gatk/{datasources/sample => samples}/Sample.java (98%) rename public/java/src/org/broadinstitute/sting/gatk/{datasources/sample => samples}/SampleDataSource.java (98%) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLociByPopulationWalker.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountMalesWalker.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/ped/PedReader.java rename public/java/test/org/broadinstitute/sting/gatk/{datasources/sample => samples}/SampleDataSourceUnitTest.java (99%) rename public/java/test/org/broadinstitute/sting/gatk/{datasources/sample => samples}/SampleUnitTest.java (96%) diff 
--git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 972943e26..f5590b708 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -34,15 +34,14 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.datasources.reads.*; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; -import org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource; +import org.broadinstitute.sting.gatk.samples.Sample; +import org.broadinstitute.sting.gatk.samples.SampleDataSource; import org.broadinstitute.sting.gatk.executive.MicroScheduler; import org.broadinstitute.sting.gatk.filters.FilterManager; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.filters.ReadGroupBlackListFilter; import org.broadinstitute.sting.gatk.io.OutputTracker; import org.broadinstitute.sting.gatk.io.stubs.Stub; -import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; import org.broadinstitute.sting.gatk.refdata.utils.RMDIntervalGenerator; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; diff --git a/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java b/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java index 1f9a7d705..707f4e97c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java @@ -26,7 +26,7 @@ package org.broadinstitute.sting.gatk.contexts; import 
net.sf.samtools.SAMReadGroupRecord; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; +import org.broadinstitute.sting.gatk.samples.Sample; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java index cfbce58ee..42fe89dc9 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java @@ -4,7 +4,7 @@ import net.sf.picard.util.PeekableIterator; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.datasources.reads.Shard; -import org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource; +import org.broadinstitute.sting.gatk.samples.SampleDataSource; import org.broadinstitute.sting.gatk.iterators.LocusIterator; import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java index e13c5a764..2f25bf7b1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java @@ -35,8 +35,8 @@ import org.broadinstitute.sting.gatk.DownsampleType; import org.broadinstitute.sting.gatk.DownsamplingMethod; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; -import 
org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource; +import org.broadinstitute.sting.gatk.samples.Sample; +import org.broadinstitute.sting.gatk.samples.SampleDataSource; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.MathUtils; diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/Sample.java b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java similarity index 98% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/sample/Sample.java rename to public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java index db53d1236..c37796bb6 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/Sample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java @@ -1,9 +1,8 @@ -package org.broadinstitute.sting.gatk.datasources.sample; +package org.broadinstitute.sting.gatk.samples; import org.broadinstitute.sting.utils.exceptions.StingException; -import java.util.Collections; import java.util.HashMap; import java.util.Map; diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDataSource.java similarity index 98% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSource.java rename to public/java/src/org/broadinstitute/sting/gatk/samples/SampleDataSource.java index 5b2c06061..d3c59d9f4 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDataSource.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.gatk.datasources.sample; +package org.broadinstitute.sting.gatk.samples; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMReadGroupRecord; @@ -7,14 +7,8 @@ import 
org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.yaml.snakeyaml.TypeDescription; -import org.yaml.snakeyaml.Yaml; -import org.yaml.snakeyaml.constructor.Constructor; -import java.io.BufferedReader; import java.io.File; -import java.io.FileReader; -import java.io.IOException; import java.util.*; /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java index 17a6e20f1..ccbcca4f5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java @@ -30,7 +30,7 @@ import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; +import org.broadinstitute.sting.gatk.samples.Sample; import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLociByPopulationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLociByPopulationWalker.java new file mode 100644 index 000000000..6802e9c8d --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLociByPopulationWalker.java @@ -0,0 +1,59 @@ +package org.broadinstitute.sting.gatk.walkers.qc; + +import net.sf.samtools.SAMRecord; +import 
org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.samples.Sample; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.walkers.TreeReducible; + +import java.util.ArrayList; +import java.util.HashMap; + +/** + * Extends locus walker to print how many reads there are at each locus, by population + */ +public class CountLociByPopulationWalker extends LocusWalker implements TreeReducible { + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + + // in this HashMap, we'll keep count of how many + HashMap count = new HashMap(); + + ArrayList reads = (ArrayList) context.getBasePileup().getReads(); + + for (SAMRecord read : reads) { + + // get the sample + Sample sample = getToolkit().getSampleByRead(read); + if (sample == null) + return 1; + + if (!count.containsKey(sample.getPopulation())) { + count.put(sample.getPopulation(), 1); + } + count.put(sample.getPopulation(), count.get(sample.getPopulation()) + 1); + } + + System.out.println("\n\n\n***** LOCUS: " + ref.getLocus().toString() + " *****"); + for (String population : count.keySet()) { + System.out.println(String.format("%s | %d", population, count.get(population))); + } + + return 1; + } + + public Long reduceInit() { return 0l; } + + public Long reduce(Integer value, Long sum) { + return value + sum; + } + + /** + * Reduces two subtrees together. In this case, the implementation of the tree reduce + * is exactly the same as the implementation of the single reduce. 
+ */ + public Long treeReduce(Long lhs, Long rhs) { + return lhs + rhs; + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountMalesWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountMalesWalker.java new file mode 100644 index 000000000..3c93f0786 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountMalesWalker.java @@ -0,0 +1,28 @@ +package org.broadinstitute.sting.gatk.walkers.qc; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.samples.Sample; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.gatk.walkers.Requires; + +/** + * Walks over the input reads and counts those whose sample is male (as reported by Sample.isMale()). + * Each read contributes 1 if its sample is male and 0 otherwise; the reduce step sums these values. + * Simple example of a read-backed analysis that looks up the Sample associated with each read. + */ +@Requires({DataSource.READS, DataSource.REFERENCE}) +public class CountMalesWalker extends ReadWalker { + public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker tracker) { + Sample sample = getToolkit().getSampleByRead(read); + return sample.isMale() ? 
1 : 0; + } + + public Integer reduceInit() { return 0; } + + public Integer reduce(Integer value, Integer sum) { + return value + sum; + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java b/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java index 8da118174..7a044e4d1 100755 --- a/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java +++ b/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java @@ -1,13 +1,11 @@ package org.broadinstitute.sting.utils; -import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; +import org.broadinstitute.sting.gatk.samples.Sample; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import java.util.Arrays; import java.util.Collection; import java.util.List; import java.util.regex.Matcher; diff --git a/public/java/src/org/broadinstitute/sting/utils/ped/PedReader.java b/public/java/src/org/broadinstitute/sting/utils/ped/PedReader.java new file mode 100644 index 000000000..524ba7495 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/ped/PedReader.java @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * 
included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.ped; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.samples.Sample; +import org.broadinstitute.sting.gatk.samples.SampleDataSource; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.text.XReadLines; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.*; + +/** + * Reads PED file-formatted tabular text files + * + * See http://www.broadinstitute.org/mpg/tagger/faq.html + * See http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml#ped + * + * The "ped" file format refers to the widely-used format for linkage pedigree data. + * Each line describes a single (diploid) individual in the following format: + * + * family_ID individual_ID father_ID mother_ID gender phenotype genotype_1 genotype_2 ... + * + * If your data lacks pedigree information (for example, unrelated case/control individuals), + * set the father_ID and mother_ID to 0. sex denotes the individual's gender with 1=male and 2=female. + * phenotype refers to the affected status (for association studies) where 0=unknown, 1=unaffected, 2=affected. + * Finally, each genotype is written as two (=diploid) integer numbers (separated by whitespace), + * where 1=A, 2=C, 3=G, 4=T. 
No header lines are allowed and all columns must be separated by whitespace. + * Check out the information at the PLINK website on the "ped" file format. + * + * The PED file is a white-space (space or tab) delimited file: the first six columns are mandatory: + * Family ID + * Individual ID + * Paternal ID + * Maternal ID + * Sex (1=male; 2=female; other=unknown) + * Phenotype + * + * The IDs are alphanumeric: the combination of family and individual ID should uniquely identify a person. + * A PED file must have 1 and only 1 phenotype in the sixth column. The phenotype can be either a + * quantitative trait or an affection status column: PLINK will automatically detect which type + * (i.e. based on whether a value other than 0, 1, 2 or the missing genotype code is observed). + * + * NOTE Quantitative traits with decimal points must be coded with a period/full-stop character and + * not a comma, i.e. 2.394 not 2,394 + * + * If an individual's sex is unknown, then any character other than 1 or 2 can be used. + * When new files are created (PED, FAM, or other which contain sex) then the original coding will be + * preserved. However, these individuals will be dropped from any analyses (i.e. phenotype set to missing also) + * and an error message will arise if an analysis that uses family information is requested and an + * individual of 'unknown' sex is specified as a father or mother. + * + * + * HINT You can add a comment to a PED or MAP file by starting the line with a # character. The rest of that + * line will be ignored. Do not start any family IDs with this character therefore. 
+ * + * Affection status, by default, should be coded: + * -9 missing + * 0 missing + * 1 unaffected + * 2 affected + * + * If your file is coded 0/1 to represent unaffected/affected, then use the --1 flag: + * plink --file mydata --1 which will specify a disease phenotype coded: + * + * -9 missing + * 0 unaffected + * 1 affected + * + * The missing phenotype value for quantitative traits is, by default, -9 (this can also be used for + * disease traits as well as 0). It can be reset by including the --missing-phenotype option. + * + * Genotypes (column 7 onwards) should also be white-space delimited; they can be any character + * (e.g. 1,2,3,4 or A,C,G,T or anything else) except 0 which is, by default, the missing genotype + * character. All markers should be biallelic. All SNPs (whether haploid or not) must have two + * alleles specified. Either both alleles should be missing (i.e. 0) or neither. + * + * No header row should be given. For example, here are two individuals typed for 3 SNPs (one row = one person): + * + * FAM001 1 0 0 1 2 A A G G A C + * FAM001 2 0 0 1 2 A A A G 0 0 + * ... + * + * Note that the GATK does not support genotypes in a PED file. 
+ * + * @author Mark DePristo + * @since 2011 + */ +public class PedReader { + private static Logger logger = Logger.getLogger(PedReader.class); + final static private Set CATAGORICAL_TRAIT_VALUES = new HashSet(Arrays.asList("-9", "0", "1", "2")); + final static private String commentMarker = "#"; + + private final File source; + private final List records; + + + public enum MissingPedFields { + NO_FAMILY_ID, + NO_PARENTS, + NO_SEX, + NO_PHENOTYPE + } + + // phenotype + private final static String PHENOTYPE_MISSING_VALUE = "-9"; + private final static String PHENOTYPE_MISSING_VALUE_SECONDARY = "0"; + private final static String PHENOTYPE_UNAFFECTED = "1"; + private final static String PHENOTYPE_AFFECTED = "2"; + + // Sex + private final static String SEX_MALE = "1"; + private final static String SEX_FEMALE = "2"; + // other=unknown + + public PedReader(File source, EnumSet missingFields) throws FileNotFoundException { + this.source = source; + List lines = new XReadLines(source).readLines(); + this.records = parsePedLines(lines, missingFields); + } + + private final List parsePedLines(final List lines, EnumSet missingFields) { + logger.info("Reading PED file " + source + " with missing fields: " + missingFields); + + // What are the record offsets? + final int familyPos = missingFields.contains(MissingPedFields.NO_FAMILY_ID) ? -1 : 0; + final int samplePos = familyPos + 1; + final int paternalPos = missingFields.contains(MissingPedFields.NO_PARENTS) ? -1 : samplePos + 1; + final int maternalPos = missingFields.contains(MissingPedFields.NO_PARENTS) ? -1 : paternalPos + 1; + final int sexPos = missingFields.contains(MissingPedFields.NO_SEX) ? -1 : Math.max(maternalPos, samplePos) + 1; + final int phenotypePos = missingFields.contains(MissingPedFields.NO_PHENOTYPE) ? 
-1 : Math.max(sexPos, Math.max(maternalPos, samplePos)) + 1; + final int nExpectedFields = MathUtils.arrayMaxInt(Arrays.asList(samplePos, paternalPos, maternalPos, sexPos, phenotypePos)); + + // go through once and determine properties + int lineNo = 1; + boolean isQT = false; + final List splits = new ArrayList(lines.size()); + for ( final String line : lines ) { + if ( line.startsWith(commentMarker)) continue; + String[] parts = line.split("\\W+"); + + if ( parts.length != nExpectedFields ) + throw new UserException.MalformedFile(source, "Bad PED line " + lineNo + ": wrong number of fields"); + + if ( phenotypePos != -1 ) { + isQT = isQT || CATAGORICAL_TRAIT_VALUES.contains(parts[phenotypePos]); + } + + splits.add(parts); + lineNo++; + } + logger.info("Trait is quantitative? " + isQT); + + // now go through and parse each record + lineNo = 1; + final List recs = new ArrayList(splits.size()); + for ( final String[] parts : splits ) { + String familyID = null, individualID, paternalID = null, maternalID = null; + Sample.Gender sex = Sample.Gender.UNKNOWN; + double quantitativePhenotype = Sample.UNSET_QUANTITIATIVE_TRAIT_VALUE; + Sample.Affection affection = Sample.Affection.UNKNOWN; + + if ( familyPos != -1 ) familyID = parts[familyPos]; + individualID = parts[samplePos]; + if ( paternalPos != -1 ) paternalID = parts[paternalPos]; + if ( maternalPos != -1 ) maternalID = parts[maternalPos]; + + if ( sexPos != -1 ) { + if ( parts[sexPos].equals(SEX_MALE) ) sex = Sample.Gender.MALE; + else if ( parts[sexPos].equals(SEX_FEMALE) ) sex = Sample.Gender.FEMALE; + else sex = Sample.Gender.UNKNOWN; + } + + if ( phenotypePos != -1 ) { + if ( isQT ) { + if ( parts[phenotypePos].equals(PHENOTYPE_MISSING_VALUE) ) + affection = Sample.Affection.UNKNOWN; + else { + affection = Sample.Affection.QUANTITATIVE; + quantitativePhenotype = Double.valueOf(parts[phenotypePos]); + } + } else { + if ( parts[phenotypePos].equals(PHENOTYPE_MISSING_VALUE) ) affection = Sample.Affection.UNKNOWN; 
+ else if ( parts[phenotypePos].equals(PHENOTYPE_MISSING_VALUE_SECONDARY) ) affection = Sample.Affection.UNKNOWN; + else if ( parts[phenotypePos].equals(PHENOTYPE_UNAFFECTED) ) affection = Sample.Affection.UNAFFECTED; + else if ( parts[phenotypePos].equals(PHENOTYPE_AFFECTED) ) affection = Sample.Affection.AFFECTED; + else throw new ReviewedStingException("Unexpected phenotype type " + parts[phenotypePos] + " at line " + lineNo); + } + } + + recs.add(new PedRecord(familyID, individualID, paternalID, maternalID, sex, quantitativePhenotype, affection)); + + lineNo++; + } + + return Collections.unmodifiableList(recs); + } + + public List getRecords() { + return records; + } + + public void fillSampleDB(SampleDataSource db) { + for ( final PedRecord rec : getRecords() ) { + Sample s = db.getOrCreateSample(rec.individualID); + } + } +} + +class PedRecord { + final String familyID, individualID, paternalID, maternalID; + final Sample.Gender sex; + final double quantitativePhenotype; + final Sample.Affection affection; + + PedRecord(final String familyID, final String individualID, + final String paternalID, final String maternalID, + final Sample.Gender sex, + final double quantitativePhenotype, final Sample.Affection affection) { + this.familyID = familyID; + this.individualID = individualID; + this.paternalID = paternalID; + this.maternalID = maternalID; + this.sex = sex; + this.quantitativePhenotype = quantitativePhenotype; + this.affection = affection; + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java index 3821c9c8a..92915f590 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java @@ -26,7 +26,7 @@ package org.broadinstitute.sting.utils.pileup; import 
net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; +import org.broadinstitute.sting.gatk.samples.Sample; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/MergingPileupElementIterator.java b/public/java/src/org/broadinstitute/sting/utils/pileup/MergingPileupElementIterator.java index 7005cf869..58afc35e9 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/MergingPileupElementIterator.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/MergingPileupElementIterator.java @@ -25,7 +25,7 @@ package org.broadinstitute.sting.utils.pileup; import net.sf.picard.util.PeekableIterator; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; +import org.broadinstitute.sting.gatk.samples.Sample; import java.util.Comparator; import java.util.Iterator; diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElementTracker.java b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElementTracker.java index 29e431695..137167dfc 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElementTracker.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElementTracker.java @@ -24,7 +24,7 @@ package org.broadinstitute.sting.utils.pileup; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; +import org.broadinstitute.sting.gatk.samples.Sample; import java.util.*; diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileup.java index 8d43a368a..4e29be934 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileup.java +++ 
b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileup.java @@ -25,7 +25,7 @@ package org.broadinstitute.sting.utils.pileup; import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; +import org.broadinstitute.sting.gatk.samples.Sample; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.collections.Pair; diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java index 31d29430a..6a3de5570 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java @@ -24,7 +24,7 @@ package org.broadinstitute.sting.utils.pileup; import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; +import org.broadinstitute.sting.gatk.samples.Sample; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java index 36b8a8c65..449ead56e 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java @@ -25,7 +25,7 @@ package org.broadinstitute.sting.utils.pileup; import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; +import org.broadinstitute.sting.gatk.samples.Sample; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.HasGenomeLocation; diff --git 
a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java index e5b054961..7ebf6281b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java @@ -24,7 +24,7 @@ package org.broadinstitute.sting.utils.pileup; import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; +import org.broadinstitute.sting.gatk.samples.Sample; import org.broadinstitute.sting.utils.GenomeLoc; import java.util.List; diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java index acfefd627..61977c99a 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java @@ -8,11 +8,10 @@ import org.broadinstitute.sting.gatk.datasources.reads.MockLocusShard; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; import org.broadinstitute.sting.gatk.datasources.reads.Shard; import org.broadinstitute.sting.gatk.executive.WindowMaker; -import org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource; +import org.broadinstitute.sting.gatk.samples.SampleDataSource; import org.broadinstitute.sting.gatk.datasources.reads.LocusShard; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; -import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.testng.annotations.BeforeClass; diff --git 
a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java index 2ecd75754..17700cf7c 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java @@ -26,7 +26,6 @@ package org.broadinstitute.sting.gatk.datasources.reads; import com.google.caliper.Param; import net.sf.picard.filter.FilteringIterator; -import net.sf.picard.filter.SamRecordFilter; import net.sf.samtools.SAMFileReader; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.commandline.Tags; @@ -34,15 +33,13 @@ import org.broadinstitute.sting.gatk.DownsamplingMethod; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; -import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; -import org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource; +import org.broadinstitute.sting.gatk.samples.SampleDataSource; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter; import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.baq.BAQ; -import java.io.File; import java.util.Collections; import java.util.Iterator; diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java index 32d3675b7..c8cfdac9a 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java +++ 
b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java @@ -1,6 +1,5 @@ package org.broadinstitute.sting.gatk.iterators; -import net.sf.picard.filter.SamRecordFilter; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMFileReader; import net.sf.samtools.SAMRecord; @@ -12,7 +11,7 @@ import org.testng.Assert; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; -import org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource; +import org.broadinstitute.sting.gatk.samples.SampleDataSource; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.utils.GenomeLocParser; diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSourceUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDataSourceUnitTest.java similarity index 99% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSourceUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/samples/SampleDataSourceUnitTest.java index 59405c065..390ed95ae 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSourceUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDataSourceUnitTest.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.gatk.datasources.sample; +package org.broadinstitute.sting.gatk.samples; import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.utils.variantcontext.Allele; diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java similarity index 96% rename from 
public/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java index 67e84cdd8..7f9a57f08 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.gatk.datasources.sample; +package org.broadinstitute.sting.gatk.samples; import org.testng.Assert; import org.broadinstitute.sting.BaseTest; diff --git a/public/java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java index fb479ab47..d982d54a2 100644 --- a/public/java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java @@ -28,7 +28,7 @@ import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMReadGroupRecord; import net.sf.samtools.SAMRecord; import org.testng.Assert; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; +import org.broadinstitute.sting.gatk.samples.Sample; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.testng.annotations.Test; From 2a0cd556d39fbe3a376f8c8e66a91f3004d79611 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 29 Sep 2011 10:34:51 -0400 Subject: [PATCH 04/63] Further cleanup of Sample -- Cleaned up interface functions in GAE -- Added Walker.getSampleDB() function which is an easier option for tools to get the samples db --- .../sting/gatk/GenomeAnalysisEngine.java | 133 +------------- .../gatk/contexts/AlignmentContextUtils.java | 9 - .../gatk/executive/LinearMicroScheduler.java | 2 +- .../sting/gatk/executive/ShardTraverser.java | 2 +- .../gatk/iterators/LocusIteratorByState.java | 4 +- .../sting/gatk/samples/Sample.java | 171 +++++++++--------- 
.../sting/gatk/samples/SampleDataSource.java | 37 ++-- .../sting/gatk/walkers/Walker.java | 10 + .../beagle/ProduceBeagleInputWalker.java | 2 +- .../walkers/coverage/CallableLociWalker.java | 4 +- .../phasing/ReadBackedPhasingWalker.java | 2 +- .../qc/CountLociByPopulationWalker.java | 2 +- .../gatk/walkers/qc/CountMalesWalker.java | 2 +- .../sting/utils/MendelianViolation.java | 14 +- .../pileup/AbstractReadBackedPileup.java | 4 +- .../utils/pileup/PileupElementTracker.java | 4 +- .../samples/SampleDataSourceUnitTest.java | 8 +- .../sting/gatk/samples/SampleUnitTest.java | 2 +- .../pileup/ReadBackedPileupUnitTest.java | 6 +- 19 files changed, 140 insertions(+), 278 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index f5590b708..38ef2879b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -1034,123 +1034,16 @@ public class GenomeAnalysisEngine { return readsDataSource == null ? 
null : readsDataSource.getCumulativeReadMetrics(); } - public SampleDataSource getSampleMetadata() { + // ------------------------------------------------------------------------------------- + // + // code for working with Samples database + // + // ------------------------------------------------------------------------------------- + + public SampleDataSource getSampleDB() { return this.sampleDataSource; } - /** - * Get a sample by its ID - * If an alias is passed in, return the main sample object - * @param id sample id - * @return sample Object with this ID - */ - public Sample getSampleById(String id) { - return sampleDataSource.getSampleById(id); - } - - /** - * Get the sample for a given read group - * Must first look up ID for read group - * @param readGroup of sample - * @return sample object with ID from the read group - */ - public Sample getSampleByReadGroup(SAMReadGroupRecord readGroup) { - return sampleDataSource.getSampleByReadGroup(readGroup); - } - - /** - * Get a sample for a given read - * Must first look up read group, and then sample ID for that read group - * @param read of sample - * @return sample object of this read - */ - public Sample getSampleByRead(SAMRecord read) { - return getSampleByReadGroup(read.getReadGroup()); - } - - /** - * Get number of sample objects - * @return size of samples map - */ - public int sampleCount() { - return sampleDataSource.sampleCount(); - } - - /** - * Return all samples with a given family ID - * Note that this isn't terribly efficient (linear) - it may be worth adding a new family ID data structure for this - * @param familyId family ID - * @return Samples with the given family ID - */ - public Set getFamily(String familyId) { - return sampleDataSource.getFamily(familyId); - } - - /** - * Returns all children of a given sample - * See note on the efficiency of getFamily() - since this depends on getFamily() it's also not efficient - * @param sample parent sample - * @return children of the given sample - 
*/ - public Set getChildren(Sample sample) { - return sampleDataSource.getChildren(sample); - } - - /** - * Gets all the samples - * @return - */ - public Collection getSamples() { - return sampleDataSource.getSamples(); - } - - /** - * Takes a list of sample names and returns their corresponding sample objects - * - * @param sampleNameList List of sample names - * @return Corresponding set of samples - */ - public Set getSamples(Collection sampleNameList) { - return sampleDataSource.getSamples(sampleNameList); - } - - - /** - * Returns a set of samples that have any value (which could be null) for a given property - * @param key Property key - * @return Set of samples with the property - */ - public Set getSamplesWithProperty(String key) { - return sampleDataSource.getSamplesWithProperty(key); - } - - /** - * Returns a set of samples that have a property with a certain value - * Value must be a string for now - could add a similar method for matching any objects in the future - * - * @param key Property key - * @param value String property value - * @return Set of samples that match key and value - */ - public Set getSamplesWithProperty(String key, String value) { - return sampleDataSource.getSamplesWithProperty(key, value); - - } - - /** - * Returns a set of sample objects for the sample names in a variant context - * - * @param context Any variant context - * @return a set of the sample objects - */ - public Set getSamplesByVariantContext(VariantContext context) { - Set samples = new HashSet(); - for (String sampleName : context.getSampleNames()) { - samples.add(sampleDataSource.getOrCreateSample(sampleName)); - } - return samples; - } - /** * Returns all samples that were referenced in the SAM file */ @@ -1158,18 +1051,6 @@ public class GenomeAnalysisEngine { return sampleDataSource.getSAMFileSamples(); } - /** - * Return a subcontext restricted to samples with a given property key/value - * Gets the sample names from key/value and relies on 
VariantContext.subContextFromGenotypes for the filtering - * @param context VariantContext to filter - * @param key property key - * @param value property value (must be string) - * @return subcontext - */ - public VariantContext subContextFromSampleProperty(VariantContext context, String key, String value) { - return sampleDataSource.subContextFromSampleProperty(context, key, value); - } - public Map getApproximateCommandLineArguments(Object... argumentProviders) { return CommandLineUtils.getApproximateCommandLineArguments(parsingEngine,argumentProviders); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java b/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java index 707f4e97c..f77fbe4e9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java @@ -26,7 +26,6 @@ package org.broadinstitute.sting.gatk.contexts; import net.sf.samtools.SAMReadGroupRecord; -import org.broadinstitute.sting.gatk.samples.Sample; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -76,14 +75,6 @@ public class AlignmentContextUtils { return splitContextBySampleName(context, null); } - public static Map splitContextBySample(AlignmentContext context) { - Map m = new HashMap(); - for ( Map.Entry entry : splitContextBySampleName(context, null).entrySet() ) { - m.put(new Sample(entry.getKey()), entry.getValue()); - } - return m; - } - /** * Splits the given AlignmentContext into a StratifiedAlignmentContext per sample, but referencd by sample name instead * of sample object. 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index 09ab4bd44..b7846399f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -56,7 +56,7 @@ public class LinearMicroScheduler extends MicroScheduler { traversalEngine.startTimersIfNecessary(); if(shard.getShardType() == Shard.ShardType.LOCUS) { LocusWalker lWalker = (LocusWalker)walker; - WindowMaker windowMaker = new WindowMaker(shard, engine.getGenomeLocParser(), getReadIterator(shard), shard.getGenomeLocs(), engine.getSampleMetadata()); + WindowMaker windowMaker = new WindowMaker(shard, engine.getGenomeLocParser(), getReadIterator(shard), shard.getGenomeLocs(), engine.getSampleDB()); for(WindowMaker.WindowMakerIterator iterator: windowMaker) { ShardDataProvider dataProvider = new LocusShardDataProvider(shard,iterator.getSourceInfo(),engine.getGenomeLocParser(),iterator.getLocus(),iterator,reference,rods); Object result = traversalEngine.traverse(walker, dataProvider, accumulator.getReduceInit()); diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java b/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java index 2b6488ada..428813b71 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java @@ -62,7 +62,7 @@ public class ShardTraverser implements Callable { Object accumulator = walker.reduceInit(); LocusWalker lWalker = (LocusWalker)walker; - WindowMaker windowMaker = new WindowMaker(shard,microScheduler.getEngine().getGenomeLocParser(),microScheduler.getReadIterator(shard),shard.getGenomeLocs(), microScheduler.engine.getSampleMetadata()); // todo: microScheduler.engine is protected - is 
it okay to user it here? + WindowMaker windowMaker = new WindowMaker(shard,microScheduler.getEngine().getGenomeLocParser(),microScheduler.getReadIterator(shard),shard.getGenomeLocs(), microScheduler.engine.getSampleDB()); // todo: microScheduler.engine is protected - is it okay to user it here? ShardDataProvider dataProvider = null; for(WindowMaker.WindowMakerIterator iterator: windowMaker) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java index 2f25bf7b1..61b861fd6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java @@ -542,7 +542,7 @@ public class LocusIteratorByState extends LocusIterator { Map readSelectors = new HashMap(); for(Sample sample: samples) { readStatesBySample.put(sample,new PerSampleReadStateManager()); - readSelectors.put(sample.getId(),downsamplingMethod.type == DownsampleType.BY_SAMPLE ? new NRandomReadSelector(null,targetCoverage) : new AllReadsSelector()); + readSelectors.put(sample.getID(),downsamplingMethod.type == DownsampleType.BY_SAMPLE ? 
new NRandomReadSelector(null,targetCoverage) : new AllReadsSelector()); } samplePartitioner = new SamplePartitioner(readSelectors); @@ -640,7 +640,7 @@ public class LocusIteratorByState extends LocusIterator { samplePartitioner.complete(); for(Sample sample: samples) { - ReadSelector aggregator = samplePartitioner.getSelectedReads(sample.getId()); + ReadSelector aggregator = samplePartitioner.getSelectedReads(sample.getID()); Collection newReads = new ArrayList(aggregator.getSelectedReads()); diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java index c37796bb6..f92533f08 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.samples; import org.broadinstitute.sting.utils.exceptions.StingException; +import java.util.Collections; import java.util.HashMap; import java.util.Map; @@ -13,23 +14,16 @@ import java.util.Map; * Time: 3:31:38 PM */ public class Sample implements java.io.Serializable { - private final static String MOTHER = "mother"; - private final static String FATHER = "father"; - private final static String GENDER = "gender"; - private final static String POPULATION = "population"; - private final static String FAMILY = "familyId"; - private final static String AFFECTION = "affection"; - private final static String QUANT_TRAIT = "quantTrait"; - - private final String id; - - private boolean hasSampleFileEntry = false; // true if this sample has an entry in a sample file + final private String familyID, paternalID, maternalID; + final private Sample.Gender gender; + final private double quantitativePhenotype; + final private Sample.Affection affection; + final private String population; + final private String ID; + final private SampleDataSource dataSource; private boolean hasSAMFileEntry = false; // true if this 
sample has an entry in the SAM file - - private HashMap properties = new HashMap(); - - private HashMap relationships = new HashMap(); + private Map properties = new HashMap(); public enum Gender { MALE, @@ -47,26 +41,28 @@ public class Sample implements java.io.Serializable { /** A quantitative trait: value of the trait is stored elsewhere */ QUANTITATIVE } + public final static double UNSET_QUANTITIATIVE_TRAIT_VALUE = Double.NaN; - public Sample(String id) { -/* if (id == null) { - throw new StingException("Error creating sample: sample ID cannot be null"); - }*/ - this.id = id; + public Sample(final String ID, final SampleDataSource dataSource, + final String familyID, final String paternalID, final String maternalID, + final Gender gender, final double quantitativePhenotype, final Affection affection, + final String population) { + this.familyID = familyID; + this.paternalID = paternalID; + this.maternalID = maternalID; + this.gender = gender; + this.quantitativePhenotype = quantitativePhenotype; + this.affection = affection; + this.population = population; + this.ID = ID; + this.dataSource = dataSource; } - public String getId() { - return this.id; - } - - public Map getProperties() { - return properties; - } - - @Deprecated - public void setSampleFileEntry(boolean value) { - this.hasSampleFileEntry = value; + public Sample(String id, SampleDataSource dataSource) { + this(id, dataSource, + null, null, null, + Gender.UNKNOWN, UNSET_QUANTITIATIVE_TRAIT_VALUE, Affection.UNKNOWN, null); } @Deprecated @@ -79,58 +75,39 @@ public class Sample implements java.io.Serializable { this.hasSAMFileEntry = value; } - /** - * Get one property - * @param key key of property - * @return value of property as generic object - */ - public Object getProperty(String key) { - return properties.get(key); + // ------------------------------------------------------------------------------------- + // + // standard property getters + // + // 
------------------------------------------------------------------------------------- + + public String getID() { + return ID; } - /** - * Set a property - * If property already exists, it is overwritten - * @param key key of property - * @param value object to be stored in properties array - */ - public void setProperty(String key, Object value) { - if (relationships.containsKey(key)) { - throw new StingException("The same key cannot exist as a property and a relationship"); - } - - if (key.equals(GENDER) && value.getClass() != Gender.class) { - throw new StingException("'gender' property must be of type Sample.Gender"); - } - - if (key.equals(POPULATION) && value.getClass() != String.class) { - throw new StingException("'population' property must be of type String"); - } - - properties.put(key, value); + public String getFamilyID() { + return familyID; } - /** - * Get one relationship - * @param key of relationship - * @return Sample object that this relationship points to - */ - public Sample getRelationship(String key) { - return relationships.get(key); + public String getPaternalID() { + return paternalID; } - /** - * Set one relationship - * If already set, it is overwritten - * @param key key of the relationship - * @param value Sample object this relationship points to - */ - public void setRelationship(String key, Sample value) { - if (properties.containsKey(key)) { - throw new StingException("The same key cannot exist as a property and a relationship"); - } - relationships.put(key, value); + public String getMaternalID() { + return maternalID; + } + + public Affection getAffection() { + return affection; + } + + public boolean hasQuantitativeTrait() { + return affection == Affection.QUANTITATIVE; + } + + public double getQuantitativePhenotype() { + return quantitativePhenotype; } /** @@ -138,7 +115,7 @@ public class Sample implements java.io.Serializable { * @return sample object with relationship mother, if exists, or null */ public Sample getMother() { 
- return getRelationship(MOTHER); + return dataSource.getSampleById(maternalID); } /** @@ -146,7 +123,7 @@ public class Sample implements java.io.Serializable { * @return sample object with relationship father, if exists, or null */ public Sample getFather() { - return getRelationship(FATHER); + return dataSource.getSampleById(paternalID); } /** @@ -154,29 +131,48 @@ public class Sample implements java.io.Serializable { * @return property of key "gender" - must be of type Gender */ public Gender getGender() { - return (Gender) properties.get(GENDER); + return gender; } public String getPopulation() { - return (String) properties.get(POPULATION); + return population; } public String getFamilyId() { - return (String) properties.get(FAMILY); + return familyID; } /** * @return True if sample is male, false if female, unknown, or null */ public boolean isMale() { - return properties.get(GENDER) == Gender.MALE; + return getGender() == Gender.MALE; } /** * @return True if sample is female, false if male, unknown or null */ public boolean isFemale() { - return properties.get(GENDER) == Gender.MALE; + return getGender() == Gender.MALE; + } + + // ------------------------------------------------------------------------------------- + // + // code for working with additional -- none standard -- properites + // + // ------------------------------------------------------------------------------------- + + public Map getExtraProperties() { + return Collections.unmodifiableMap(properties); + } + + /** + * Get one property + * @param key key of property + * @return value of property as generic object + */ + public Object getExtraPropertyValue(final String key) { + return properties.get(key); } /** @@ -184,7 +180,7 @@ public class Sample implements java.io.Serializable { * @param key property key * @return true if sample has this property (even if its value is null) */ - public boolean hasProperty(String key) { + public boolean hasExtraProperty(String key) { return 
properties.containsKey(key); } @@ -196,17 +192,14 @@ public class Sample implements java.io.Serializable { Sample sample = (Sample) o; if (hasSAMFileEntry != sample.hasSAMFileEntry) return false; - if (hasSampleFileEntry != sample.hasSampleFileEntry) return false; - if (id != null ? !id.equals(sample.id) : sample.id != null) return false; + if (ID != null ? !ID.equals(sample.ID) : sample.ID != null) return false; if (properties != null ? !properties.equals(sample.properties) : sample.properties != null) return false; - if (relationships != null ? !relationships.equals(sample.relationships) : sample.relationships != null) - return false; return true; } @Override public int hashCode() { - return id != null ? id.hashCode() : "".hashCode(); + return ID != null ? ID.hashCode() : "".hashCode(); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDataSource.java index d3c59d9f4..f4855b27f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDataSource.java @@ -46,12 +46,6 @@ public class SampleDataSource { */ private HashMap sampleAliases = new HashMap(); - /** - * While loading sample files, we must be aware of "special" properties and relationships that are always allowed - */ - public static final String[] specialProperties = new String[] {"familyId", "population", "gender"}; - public static final String[] specialRelationships = new String[] {"mother", "father"}; - /** * Constructor takes both a SAM header and sample files because the two must be integrated. 
* @param header SAMFileHeader that has been created for this analysis @@ -63,8 +57,7 @@ public class SampleDataSource { // create empty sample object for each sample referenced in the SAM header for (String sampleName : SampleUtils.getSAMFileSamples(header)) { if (!hasSample(sampleName)) { - Sample newSample = new Sample(sampleName); - newSample.setSAMFileEntry(true); + Sample newSample = new Sample(sampleName, this); samples.put(sampleName, newSample); } } @@ -78,7 +71,7 @@ public class SampleDataSource { } public SampleDataSource() { - samples.put(null, new Sample(null)); + samples.put(null, new Sample(null, this)); } /** @@ -87,7 +80,7 @@ public class SampleDataSource { public void addSamplesFromSAMHeader(SAMFileHeader header) { for (String sampleName : SampleUtils.getSAMFileSamples(header)) { if (!hasSample(sampleName)) { - Sample newSample = new Sample(sampleName); + Sample newSample = new Sample(sampleName, this); newSample.setSAMFileEntry(true); samples.put(sampleName, newSample); } @@ -151,9 +144,9 @@ public class SampleDataSource { // // try { // // step 1: add the sample if it doesn't already exist -// Sample sample = getSampleById(sampleParser.getId()); +// Sample sample = getSampleById(sampleParser.getID()); // if (sample == null) { -// sample = new Sample(sampleParser.getId()); +// sample = new Sample(sampleParser.getID()); // } // addSample(sample); // sample.setSampleFileEntry(true); @@ -207,7 +200,7 @@ public class SampleDataSource { // // // next check that there isn't already a conflicting property there // if (sample.getRelationship(relationship) != null) { -// if (sample.getRelationship(relationship).getId() != sampleParser.getProperties().get(relationship)) { +// if (sample.getRelationship(relationship).getID() != sampleParser.getProperties().get(relationship)) { // throw new StingException(relationship + " is a conflicting relationship!"); // } // // if the relationship is already set - and consistent with what we're reading now - no need to 
continue @@ -222,7 +215,7 @@ public class SampleDataSource { // } // } catch (Exception e) { // throw new StingException("An error occurred while loading this sample from the sample file: " + -// sampleParser.getId(), e); +// sampleParser.getID(), e); // } // } // } @@ -377,7 +370,7 @@ public class SampleDataSource { * @param sample to be added */ private void addSample(Sample sample) { - samples.put(sample.getId(), sample); + samples.put(sample.getID(), sample); } /** @@ -496,7 +489,7 @@ public class SampleDataSource { public Set getSamplesWithProperty(String key) { HashSet toReturn = new HashSet(); for (Sample s : samples.values()) { - if (s.hasProperty(key)) + if (s.hasExtraProperty(key)) toReturn.add(s); } return toReturn; @@ -513,7 +506,7 @@ public class SampleDataSource { public Set getSamplesWithProperty(String key, String value) { Set toReturn = getSamplesWithProperty(key); for (Sample s : toReturn) { - if (!s.getProperty(key).equals(value)) + if (!s.getExtraPropertyValue(key).equals(value)) toReturn.remove(s); } return toReturn; @@ -522,7 +515,7 @@ public class SampleDataSource { public Sample getOrCreateSample(String id) { Sample sample = getSampleById(id); if (sample == null) { - sample = new Sample(id); + sample = new Sample(id, this); addSample(sample); } return sample; @@ -568,16 +561,10 @@ public class SampleDataSource { Set samplesWithProperty = new HashSet(); for (String sampleName : context.getSampleNames()) { Sample s = samples.get(sampleName); - if (s != null && s.hasProperty(key) && s.getProperty(key).equals(value)) + if (s != null && s.hasExtraProperty(key) && s.getExtraPropertyValue(key).equals(value)) samplesWithProperty.add(sampleName); } Map genotypes = context.getGenotypes(samplesWithProperty); return context.subContextFromGenotypes(genotypes.values()); } - - public static SampleDataSource createEmptyDataSource() { - SAMFileHeader header = new SAMFileHeader(); - return new SampleDataSource(header, null); - } - } diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java index 10261112c..ef791f12f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java @@ -30,6 +30,8 @@ import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.filters.MalformedReadFilter; +import org.broadinstitute.sting.gatk.samples.Sample; +import org.broadinstitute.sting.gatk.samples.SampleDataSource; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.collections.Pair; @@ -87,6 +89,14 @@ public abstract class Walker { return getToolkit().getMasterSequenceDictionary(); } + protected SampleDataSource getSampleDB() { + return getToolkit().getSampleDB(); + } + + protected Sample getSampleByID(final String id) { + return getToolkit().getSampleDB().getSampleById(id); + } + /** * (conceptual static) method that states whether you want to see reads piling up at a locus * that contain a deletion at the locus. diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java index 87695077d..ecc4ad793 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java @@ -247,7 +247,7 @@ public class ProduceBeagleInputWalker extends RodWalker { Map preferredGenotypes = preferredVC.getGenotypes(); Map otherGenotypes = goodSite(otherVC) ? 
otherVC.getGenotypes() : null; for ( String sample : samples ) { - boolean isMaleOnChrX = CHECK_IS_MALE_ON_CHR_X && getToolkit().getSampleById(sample).isMale(); + boolean isMaleOnChrX = CHECK_IS_MALE_ON_CHR_X && getSampleByID(sample).isMale(); Genotype genotype; boolean isValidation; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java index 32875a098..1e2d40271 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java @@ -227,9 +227,9 @@ public class CallableLociWalker extends LocusWalker impl for (SAMRecord read : reads) { // get the sample - Sample sample = getToolkit().getSampleByRead(read); + Sample sample = getSampleDB().getSampleByRead(read); if (sample == null) return 1; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountMalesWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountMalesWalker.java index 3c93f0786..2d89a7d44 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountMalesWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountMalesWalker.java @@ -16,7 +16,7 @@ import org.broadinstitute.sting.gatk.walkers.Requires; @Requires({DataSource.READS, DataSource.REFERENCE}) public class CountMalesWalker extends ReadWalker { public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker tracker) { - Sample sample = getToolkit().getSampleByRead(read); + Sample sample = getSampleDB().getSampleByRead(read); return sample.isMale() ? 
1 : 0; } diff --git a/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java b/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java index 7a044e4d1..a87a73a2d 100755 --- a/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java +++ b/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java @@ -88,9 +88,9 @@ public class MendelianViolation { * @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to asses mendelian violation */ public MendelianViolation(Sample sample, double minGenotypeQualityP) { - sampleMom = sample.getMother().getId(); - sampleDad = sample.getFather().getId(); - sampleChild = sample.getId(); + sampleMom = sample.getMother().getID(); + sampleDad = sample.getFather().getID(); + sampleChild = sample.getID(); minGenotypeQuality = minGenotypeQualityP; } @@ -102,13 +102,13 @@ public class MendelianViolation { */ public MendelianViolation(GenomeAnalysisEngine engine, double minGenotypeQualityP) { boolean gotSampleInformation = false; - Collection samples = engine.getSamples(); + Collection samples = engine.getSampleDB().getSamples(); // Iterate through all samples in the sample_metadata file but we really can only take one. 
for (Sample sample : samples) { if (sample.getMother() != null && sample.getFather() != null) { - sampleMom = sample.getMother().getId(); - sampleDad = sample.getFather().getId(); - sampleChild = sample.getId(); + sampleMom = sample.getMother().getID(); + sampleDad = sample.getFather().getID(); + sampleChild = sample.getID(); minGenotypeQuality = minGenotypeQualityP; gotSampleInformation = true; break; // we can only deal with one trio information diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java index 92915f590..3bc325cea 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java @@ -555,7 +555,7 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker; Collection sampleNames = new HashSet(); for (Sample sample : tracker.getSamples()) { - sampleNames.add(sample.getId()); + sampleNames.add(sample.getID()); } return sampleNames; } @@ -700,7 +700,7 @@ public abstract class AbstractReadBackedPileup extends PileupElem Sample sample = entry.getKey(); AbstractReadBackedPileup pileupBySample = entry.getValue(); pileup.put(sample,pileupBySample.pileupElementTracker); - sampleNames.put(sample.getId(), sample); + sampleNames.put(sample.getID(), sample); } } @@ -105,7 +105,7 @@ class PerSamplePileupElementTracker extends PileupElem public void addElements(final Sample sample, PileupElementTracker elements) { pileup.put(sample,elements); - sampleNames.put(sample.getId(), sample); + sampleNames.put(sample.getID(), sample); size += elements.size(); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDataSourceUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDataSourceUnitTest.java index 390ed95ae..61aed7b34 100644 --- 
a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDataSourceUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDataSourceUnitTest.java @@ -149,8 +149,8 @@ public class SampleDataSourceUnitTest extends BaseTest { Iterator i = ceuSamples.iterator(); ArrayList sampleNames = new ArrayList(); - sampleNames.add(i.next().getId()); - sampleNames.add(i.next().getId()); + sampleNames.add(i.next().getID()); + sampleNames.add(i.next().getID()); Assert.assertTrue(sampleNames.contains("sampleA")); Assert.assertTrue(sampleNames.contains("sampleB")); } @@ -191,8 +191,8 @@ public class SampleDataSourceUnitTest extends BaseTest { // make sure both samples are included Iterator i = set.iterator(); ArrayList sampleNames = new ArrayList(); - sampleNames.add(i.next().getId()); - sampleNames.add(i.next().getId()); + sampleNames.add(i.next().getID()); + sampleNames.add(i.next().getID()); Assert.assertTrue(sampleNames.contains("NA123")); Assert.assertTrue(sampleNames.contains("NA456")); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java index 7f9a57f08..ca777200b 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java @@ -55,7 +55,7 @@ public class SampleUnitTest extends BaseTest { */ @Test() public void specialGettersTest() { - Assert.assertTrue(sampleC.getId().equals("sampleC")); + Assert.assertTrue(sampleC.getID().equals("sampleC")); Assert.assertTrue(sampleC.getPopulation().equals("pop1")); Assert.assertTrue(sampleC.isMale()); Assert.assertFalse(sampleA.isMale()); // sample A doesn't have a gender, so this should be false diff --git a/public/java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java index 
d982d54a2..2b9ff7113 100644 --- a/public/java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java @@ -168,9 +168,9 @@ public class ReadBackedPileupUnitTest { Sample sample2 = new Sample("sample2"); SAMReadGroupRecord readGroupOne = new SAMReadGroupRecord("rg1"); - readGroupOne.setSample(sample1.getId()); + readGroupOne.setSample(sample1.getID()); SAMReadGroupRecord readGroupTwo = new SAMReadGroupRecord("rg2"); - readGroupTwo.setSample(sample2.getId()); + readGroupTwo.setSample(sample2.getID()); SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1,1,1000); header.addReadGroup(readGroupOne); @@ -191,7 +191,7 @@ public class ReadBackedPileupUnitTest { Assert.assertEquals(sample1Pileup.size(),1,"Sample 1 pileup has wrong number of elements"); Assert.assertEquals(sample1Pileup.getReads().get(0),read1,"Sample 1 pileup has incorrect read"); - ReadBackedPileup sample2Pileup = pileup.getPileupForSampleName(sample2.getId()); + ReadBackedPileup sample2Pileup = pileup.getPileupForSampleName(sample2.getID()); Assert.assertEquals(sample2Pileup.size(),1,"Sample 2 pileup has wrong number of elements"); Assert.assertEquals(sample2Pileup.getReads().get(0),read2,"Sample 2 pileup has incorrect read"); From 5c9227cf5e9eefc24e3b42d1b8f8dd6f49c54f27 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 29 Sep 2011 11:50:05 -0400 Subject: [PATCH 05/63] Further cleanup of Sample database -- Removing more and more unnecessary code -- Partial removal of type safe Sample usage. 
On the road to SampleDB only --- .../sting/gatk/GenomeAnalysisEngine.java | 7 +- .../sting/gatk/samples/Sample.java | 89 ++-- .../sting/gatk/samples/SampleDataSource.java | 491 +++--------------- .../sting/gatk/walkers/Walker.java | 5 +- .../beagle/ProduceBeagleInputWalker.java | 3 +- .../phasing/ReadBackedPhasingWalker.java | 6 +- .../qc/CountLociByPopulationWalker.java | 59 --- .../gatk/walkers/qc/CountMalesWalker.java | 4 +- .../sting/utils/ped/PedReader.java | 1 - .../pileup/AbstractReadBackedPileup.java | 34 -- .../pileup/ReadBackedExtendedEventPileup.java | 13 - .../sting/utils/pileup/ReadBackedPileup.java | 14 - .../reads/DownsamplerBenchmark.java | 3 +- .../samples/SampleDataSourceUnitTest.java | 207 -------- .../sting/gatk/samples/SampleUnitTest.java | 60 +-- .../pileup/ReadBackedPileupUnitTest.java | 14 +- 16 files changed, 139 insertions(+), 871 deletions(-) delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLociByPopulationWalker.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 38ef2879b..050128740 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -46,10 +46,7 @@ import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; import org.broadinstitute.sting.gatk.refdata.utils.RMDIntervalGenerator; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; -import org.broadinstitute.sting.utils.SequenceDictionaryUtils; +import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.baq.BAQ; import 
org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -1048,7 +1045,7 @@ public class GenomeAnalysisEngine { * Returns all samples that were referenced in the SAM file */ public Set getSAMFileSamples() { - return sampleDataSource.getSAMFileSamples(); + return sampleDataSource.getSamples(SampleUtils.getSAMFileSamples(getSAMFileHeader())); } public Map getApproximateCommandLineArguments(Object... argumentProviders) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java index f92533f08..c6fcbbc2a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java @@ -1,8 +1,6 @@ package org.broadinstitute.sting.gatk.samples; -import org.broadinstitute.sting.utils.exceptions.StingException; - import java.util.Collections; import java.util.HashMap; import java.util.Map; @@ -18,12 +16,11 @@ public class Sample implements java.io.Serializable { final private Sample.Gender gender; final private double quantitativePhenotype; final private Sample.Affection affection; - final private String population; final private String ID; final private SampleDataSource dataSource; - private boolean hasSAMFileEntry = false; // true if this sample has an entry in the SAM file - private Map properties = new HashMap(); + // todo -- conditionally add the property map -- should be empty by default + private final Map properties = new HashMap(); public enum Gender { MALE, @@ -46,33 +43,31 @@ public class Sample implements java.io.Serializable { public Sample(final String ID, final SampleDataSource dataSource, final String familyID, final String paternalID, final String maternalID, - final Gender gender, final double quantitativePhenotype, final Affection affection, - final String population) { + final Gender gender, final double 
quantitativePhenotype, final Affection affection) { this.familyID = familyID; this.paternalID = paternalID; this.maternalID = maternalID; this.gender = gender; this.quantitativePhenotype = quantitativePhenotype; this.affection = affection; - this.population = population; this.ID = ID; this.dataSource = dataSource; } + public Sample(final String ID, final SampleDataSource dataSource, + final String familyID, final String paternalID, final String maternalID, final Gender gender) { + this(ID, dataSource, familyID, paternalID, maternalID, gender, + UNSET_QUANTITIATIVE_TRAIT_VALUE, Affection.UNKNOWN); + } + + public Sample(final String ID, final SampleDataSource dataSource, final double quantitativePhenotype, final Affection affection) { + this(ID, dataSource, null, null, null, Gender.UNKNOWN, quantitativePhenotype, affection); + } + public Sample(String id, SampleDataSource dataSource) { this(id, dataSource, null, null, null, - Gender.UNKNOWN, UNSET_QUANTITIATIVE_TRAIT_VALUE, Affection.UNKNOWN, null); - } - - @Deprecated - public boolean hasSAMFileEntry() { - return this.hasSAMFileEntry; - } - - @Deprecated - public void setSAMFileEntry(boolean value) { - this.hasSAMFileEntry = value; + Gender.UNKNOWN, UNSET_QUANTITIATIVE_TRAIT_VALUE, Affection.UNKNOWN); } // ------------------------------------------------------------------------------------- @@ -115,7 +110,7 @@ public class Sample implements java.io.Serializable { * @return sample object with relationship mother, if exists, or null */ public Sample getMother() { - return dataSource.getSampleById(maternalID); + return dataSource.getSample(maternalID); } /** @@ -123,7 +118,7 @@ public class Sample implements java.io.Serializable { * @return sample object with relationship father, if exists, or null */ public Sample getFather() { - return dataSource.getSampleById(paternalID); + return dataSource.getSample(paternalID); } /** @@ -134,28 +129,10 @@ public class Sample implements java.io.Serializable { return gender; } - 
public String getPopulation() { - return population; - } - public String getFamilyId() { return familyID; } - /** - * @return True if sample is male, false if female, unknown, or null - */ - public boolean isMale() { - return getGender() == Gender.MALE; - } - - /** - * @return True if sample is female, false if male, unknown or null - */ - public boolean isFemale() { - return getGender() == Gender.MALE; - } - // ------------------------------------------------------------------------------------- // // code for working with additional -- none standard -- properites @@ -184,22 +161,20 @@ public class Sample implements java.io.Serializable { return properties.containsKey(key); } - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - - Sample sample = (Sample) o; - - if (hasSAMFileEntry != sample.hasSAMFileEntry) return false; - if (ID != null ? !ID.equals(sample.ID) : sample.ID != null) return false; - if (properties != null ? !properties.equals(sample.properties) : sample.properties != null) return false; - - return true; - } - - @Override - public int hashCode() { - return ID != null ? ID.hashCode() : "".hashCode(); - } +// @Override +// public boolean equals(Object o) { +// if (this == o) return true; +// if (o == null || getClass() != o.getClass()) return false; +// +// Sample sample = (Sample) o; +// if (ID != null ? !ID.equals(sample.ID) : sample.ID != null) return false; +// if (properties != null ? !properties.equals(sample.properties) : sample.properties != null) return false; +// +// return true; +// } +// +// @Override +// public int hashCode() { +// return ID != null ? 
ID.hashCode() : "".hashCode(); +// } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDataSource.java index f4855b27f..fec82a71f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDataSource.java @@ -6,7 +6,6 @@ import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.File; import java.util.*; @@ -26,394 +25,121 @@ import java.util.*; * */ public class SampleDataSource { - - /** - * SAMFileHeader that has been created for this analysis. - */ - private SAMFileHeader header; - /** * This is where Sample objects are stored. Samples are usually accessed by their ID, which is unique, so * this is stored as a HashMap. */ private final HashMap samples = new HashMap(); - /** - * Samples can have "aliases", because sometimes the same sample is referenced by different IDs in different - * datasets. If this is the case, one ID is the "primary ID" and others are "aliases". - * - * This maps ID => primary ID for all samples ID strings - both primary IDs and aliases. - */ - private HashMap sampleAliases = new HashMap(); - /** * Constructor takes both a SAM header and sample files because the two must be integrated. 
- * @param header SAMFileHeader that has been created for this analysis - * @param sampleFiles Sample files that were included on the command line */ - public SampleDataSource(SAMFileHeader header, List sampleFiles) { - this(); - this.header = header; - // create empty sample object for each sample referenced in the SAM header - for (String sampleName : SampleUtils.getSAMFileSamples(header)) { - if (!hasSample(sampleName)) { - Sample newSample = new Sample(sampleName, this); - samples.put(sampleName, newSample); - } - } - - // add files consecutively - if (sampleFiles != null) { - for (File file : sampleFiles) { - addFile(file); - } - } - } - public SampleDataSource() { samples.put(null, new Sample(null, this)); } + public SampleDataSource(final SAMFileHeader header, final List sampleFiles) { + this(); + addSamples(header); + addSamples(sampleFiles); + } + + // -------------------------------------------------------------------------------- + // + // Functions for adding samples to the DB + // + // TODO: these should be protected, really + // + // -------------------------------------------------------------------------------- + /** * Hallucinates sample objects for all the samples in the SAM file and stores them */ - public void addSamplesFromSAMHeader(SAMFileHeader header) { + public SampleDataSource addSamples(SAMFileHeader header) { for (String sampleName : SampleUtils.getSAMFileSamples(header)) { - if (!hasSample(sampleName)) { + if (getSample(sampleName) == null) { Sample newSample = new Sample(sampleName, this); - newSample.setSAMFileEntry(true); samples.put(sampleName, newSample); } } + return this; + } + + public SampleDataSource addSamples(final List sampleFiles) { + // add files consecutively + for (File file : sampleFiles) { + addSamples(file); + } + return this; } /** * Parse one sample file and integrate it with samples that are already there * Fail quickly if we find any errors in the file */ - public void addFile(File sampleFile) {} -// -// 
BufferedReader reader; -// try { -// reader = new BufferedReader(new FileReader(sampleFile)); -// } -// catch (IOException e) { -// throw new StingException("Could not open sample file " + sampleFile.getAbsolutePath(), e); -// } -// -// // set up YAML reader - a "Constructor" creates java object from YAML and "Loader" loads the file -// Constructor con = new Constructor(SampleFileParser.class); -// TypeDescription desc = new TypeDescription(SampleFileParser.class); -// desc.putListPropertyType("propertyDefinitions", PropertyDefinition.class); -// desc.putListPropertyType("sampleAliases", SampleAlias.class); -// con.addTypeDescription(desc); -// Yaml yaml = new Yaml(con); -// -// // SampleFileParser stores an object representation of a sample file - this is what we'll parse -// SampleFileParser parser; -// try { -// parser = (SampleFileParser) yaml.load(reader); -// } -// catch (Exception e) { -// throw new StingException("There was a syntactic error with the YAML in sample file " + sampleFile.getAbsolutePath(), e); -// } -// -// // check to see which validation options were built into the file -// boolean restrictProperties = parser.getAllowedProperties() != null; -// boolean restrictRelationships = parser.getAllowedRelationships() != null; -// boolean restrictPropertyValues = parser.getPropertyDefinitions() != null; -// -// // propertyValues stores the values that are allowed for a given property -// HashMap propertyValues = null; -// if (restrictPropertyValues) { -// propertyValues = new HashMap(); -// for (PropertyDefinition def : parser.getPropertyDefinitions()) { -// HashSet set = new HashSet(); -// for (String value : def.getValues()) { -// set.add(value); -// } -// propertyValues.put(def.getProperty(), set); -// } -// } -// -// // make sure the aliases are valid -// validateAliases(parser); -// -// // loop through each sample in the file - a SampleParser stores an object that will become a Sample -// for (SampleParser sampleParser : parser.getSamples()) { 
-// -// try { -// // step 1: add the sample if it doesn't already exist -// Sample sample = getSampleById(sampleParser.getID()); -// if (sample == null) { -// sample = new Sample(sampleParser.getID()); -// } -// addSample(sample); -// sample.setSampleFileEntry(true); -// -// // step 2: add the properties -// if (sampleParser.getProperties() != null) { -// for (String property : sampleParser.getProperties().keySet()) { -// -// // check that property is allowed -// if (restrictProperties) { -// if (!isPropertyValid(property, parser.getAllowedProperties())) { -// throw new StingException(property + " is an invalid property. It is not included in the list " + -// "of allowed properties."); -// } -// } -// -// // next check that the value is allowed -// if (restrictPropertyValues) { -// if (!isValueAllowed(property, sampleParser.getProperties().get(property), propertyValues)) { -// throw new StingException("The value of property '" + property + "' is invalid. " + -// "It is not included in the list of allowed values for this property."); -// } -// } -// -// // next check that there isn't already a conflicting property there -// if (sample.getProperty(property) != null && -// sample.getProperty(property) != sampleParser.getProperties().get(property)) -// { -// throw new StingException(property + " is a conflicting property!"); -// } -// -// // checks are passed - now add the property! 
-// saveProperty(sample, property, sampleParser.getProperties().get(property)); -// } -// } -// -// // step 3: add the relationships -// if (sampleParser.getRelationships() != null) { -// for (String relationship : sampleParser.getRelationships().keySet()) { -// String relativeId = sampleParser.getRelationships().get(relationship); -// if (relativeId == null) { -// throw new StingException("The relationship cannot be null"); -// } -// -// // first check that it's not invalid -// if (restrictRelationships) { -// if (!isRelationshipValid(relationship, parser.getAllowedRelationships())) { -// throw new StingException(relationship + " is an invalid relationship"); -// } -// } -// -// // next check that there isn't already a conflicting property there -// if (sample.getRelationship(relationship) != null) { -// if (sample.getRelationship(relationship).getID() != sampleParser.getProperties().get(relationship)) { -// throw new StingException(relationship + " is a conflicting relationship!"); -// } -// // if the relationship is already set - and consistent with what we're reading now - no need to continue -// else { -// continue; -// } -// } -// -// // checks are passed - now save the relationship -// saveRelationship(sample, relationship, relativeId); -// } -// } -// } catch (Exception e) { -// throw new StingException("An error occurred while loading this sample from the sample file: " + -// sampleParser.getID(), e); -// } -// } -// } -// -// private boolean isValueAllowed(String key, Object value, HashMap valuesList) { -// -// // if the property values weren't specified for this property, then any value is okay -// if (!valuesList.containsKey(key)) { -// return true; -// } -// -// // if this property has enumerated values, it must be a string -// else if (value.getClass() != String.class) -// return false; -// -// // is the value specified or not? 
-// else if (!valuesList.get(key).contains(value)) -// return false; -// -// return true; -// } -// -// /** -// * Makes sure that the aliases are valid -// * Checks that 1) no string is used as both a main ID and an alias; -// * 2) no alias is used more than once -// * @param parser -// */ -// private void validateAliases(SampleFileParser parser) { -// -// // no aliases sure validate -// if (parser.getSampleAliases() == null) -// return; -// -// HashSet mainIds = new HashSet(); -// HashSet otherIds = new HashSet(); -// -// for (SampleAlias sampleAlias : parser.getSampleAliases()) { -// mainIds.add(sampleAlias.getMainId()); -// for (String otherId : sampleAlias.getOtherIds()) { -// if (mainIds.contains(otherId)) -// throw new StingException(String.format("The aliases in your sample file are invalid - the alias %s cannot " + -// "be both a main ID and an other ID", otherId)); -// -// if (!otherIds.add(otherId)) -// throw new StingException(String.format("The aliases in your sample file are invalid - %s is listed as an " + -// "alias more than once.", otherId)); -// } -// } -// } -// -// private boolean isPropertyValid(String property, String[] allowedProperties) { -// -// // is it a special property that is always allowed? -// for (String allowedProperty : specialProperties) { -// if (property.equals(allowedProperty)) -// return true; -// } -// -// // is it in the allowed properties list? -// for (String allowedProperty : allowedProperties) { -// if (property.equals(allowedProperty)) -// return true; -// } -// -// return false; -// } -// -// private boolean isRelationshipValid(String relationship, String[] allowedRelationships) { -// -// // is it a special relationship that is always allowed? -// for (String allowedRelationship : specialRelationships) { -// if (relationship.equals(allowedRelationship)) -// return true; -// } -// -// // is it in the allowed properties list? 
-// for (String allowedRelationship : allowedRelationships) { -// if (relationship.equals(allowedRelationship)) -// return true; -// } -// -// return false; -// } -// -// /** -// * Saves a property as the correct type -// * @param key property key -// * @param value property value, as read from YAML parser -// * @return property value to be stored -// */ -// private void saveProperty(Sample sample, String key, Object value) { -// -// // convert gender to the right type, if it was stored as a String -// if (key.equals("gender")) { -// if (((String) value).toLowerCase().equals("male")) { -// value = Sample.Gender.MALE; -// } -// else if (((String) value).toLowerCase().equals("female")) { -// value = Sample.Gender.FEMALE; -// } -// else if (((String) value).toLowerCase().equals("unknown")) { -// value = Sample.Gender.UNKNOWN; -// } -// else if (value != null) { -// throw new StingException("'gender' property must be male, female, or unknown."); -// } -// } -// try { -// sample.setProperty(key, value); -// } -// catch (Exception e) { -// throw new StingException("Could not save property " + key, e); -// } -// } -// -// /** -// * Saves a relationship as the correct type -// * @param key relationship key -// * @param relativeId sample ID string of the relative -// * @return relationship value to be stored -// */ -// private void saveRelationship(Sample sample, String key, String relativeId) { -// -// // get the reference that we'll store as the value -// Sample relative = getSampleById(relativeId); -// -// // create sample object for the relative, if necessary -// if (relative == null) { -// relative = new Sample(relativeId); -// addSample(relative); -// } -// sample.setRelationship(key, relative); -// } - - - - /** - * Filter a sample name in case it is an alias - * @param sampleId to be filtered - * @return ID of sample that stores data for this alias - */ - private String aliasFilter(String sampleId) { - if (!sampleAliases.containsKey(sampleId)) - return sampleId; - 
else - return sampleAliases.get(sampleId); + public SampleDataSource addSamples(File sampleFile) { + return this; } /** * Add a sample to the collection * @param sample to be added */ - private void addSample(Sample sample) { + private SampleDataSource addSample(Sample sample) { samples.put(sample.getID(), sample); + return this; } - /** - * Check if sample with this ID exists - * Note that this will return true if name passed in is an alias - * @param id ID of sample to be checked - * @return true if sample exists; false if not - */ - public boolean hasSample(String id) { - return samples.get(aliasFilter(id)) != null; - } + // -------------------------------------------------------------------------------- + // + // Functions for getting a sample from the DB + // + // -------------------------------------------------------------------------------- /** * Get a sample by its ID * If an alias is passed in, return the main sample object * @param id - * @return sample Object with this ID + * @return sample Object with this ID, or null if this does not exist */ - public Sample getSampleById(String id) { - return samples.get(aliasFilter(id)); + public Sample getSample(String id) { + return samples.get(id); } /** - * Get the sample for a given read group - * Must first look up ID for read group - * @param readGroup of sample - * @return sample object with ID from the read group + * + * @param read + * @return sample Object with this ID, or null if this does not exist */ - public Sample getSampleByReadGroup(SAMReadGroupRecord readGroup) { - String nameFromReadGroup = readGroup.getSample(); - return getSampleById(nameFromReadGroup); + public Sample getSample(final SAMRecord read) { + return getSample(read.getReadGroup()); } /** - * Get a sample for a given read - * Must first look up read group, and then sample ID for that read group - * @param read of sample - * @return sample object of this read + * + * @param rg + * @return sample Object with this ID, or null if this 
does not exist */ - public Sample getSampleByRead(SAMRecord read) { - return getSampleByReadGroup(read.getReadGroup()); + public Sample getSample(final SAMReadGroupRecord rg) { + return getSample(rg.getSample()); } + /** + * @param g Genotype + * @return sample Object with this ID, or null if this does not exist + */ + public Sample getSample(final Genotype g) { + return getSample(g.getSampleName()); + } + + // -------------------------------------------------------------------------------- + // + // Functions for accessing samples in the DB + // + // -------------------------------------------------------------------------------- + + + /** * Get number of sample objects * @return size of samples map @@ -469,10 +195,10 @@ public class SampleDataSource { * @return Corresponding set of samples */ public Set getSamples(Collection sampleNameList) { - HashSet samples = new HashSet(); + HashSet samples = new HashSet(); for (String name : sampleNameList) { try { - samples.add(getSampleById(name)); + samples.add(getSample(name)); } catch (Exception e) { throw new StingException("Could not get sample with the following ID: " + name, e); @@ -480,91 +206,4 @@ public class SampleDataSource { } return samples; } - - /** - * Returns a set of samples that have any value (which could be null) for a given property - * @param key Property key - * @return Set of samples with the property - */ - public Set getSamplesWithProperty(String key) { - HashSet toReturn = new HashSet(); - for (Sample s : samples.values()) { - if (s.hasExtraProperty(key)) - toReturn.add(s); - } - return toReturn; - } - - /** - * Returns a set of samples that have a property with a certain value - * Value must be a string for now - could add a similar method for matching any objects in the future - * - * @param key Property key - * @param value String property value - * @return Set of samples that match key and value - */ - public Set getSamplesWithProperty(String key, String value) { - Set toReturn = 
getSamplesWithProperty(key); - for (Sample s : toReturn) { - if (!s.getExtraPropertyValue(key).equals(value)) - toReturn.remove(s); - } - return toReturn; - } - - public Sample getOrCreateSample(String id) { - Sample sample = getSampleById(id); - if (sample == null) { - sample = new Sample(id, this); - addSample(sample); - } - return sample; - } - - /** - * Returns all samples that were referenced in the SAM file - */ - public Set getSAMFileSamples() { - Set toReturn = new HashSet(); - for (Sample sample : samples.values()) { - if (sample.hasSAMFileEntry()) - toReturn.add(sample); - } - return toReturn; - } - - /** - * Returns a set of sample objects for the sample names in a variant context - * - * @param context Any variant context - * @return a set of the sample objects - */ - public Set getSamplesByVariantContext(VariantContext context) { - Set samples = new HashSet(); - for (String sampleName : context.getSampleNames()) { - samples.add(getOrCreateSample(sampleName)); - } - return samples; - } - - - /** - * Return a subcontext restricted to samples with a given property key/value - * Gets the sample names from key/value and relies on VariantContext.subContextFromGenotypes for the filtering - * @param context VariantContext to filter - * @param key property key - * @param value property value (must be string) - * @return subcontext - */ - public VariantContext subContextFromSampleProperty(VariantContext context, String key, String value) { - - Set samplesWithProperty = new HashSet(); - for (String sampleName : context.getSampleNames()) { - Sample s = samples.get(sampleName); - if (s != null && s.hasExtraProperty(key) && s.getExtraPropertyValue(key).equals(value)) - samplesWithProperty.add(sampleName); - } - Map genotypes = context.getGenotypes(samplesWithProperty); - return context.subContextFromGenotypes(genotypes.values()); - } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java index ef791f12f..f67dace2c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java @@ -36,7 +36,6 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.help.GenericDocumentationHandler; import java.util.List; @@ -93,8 +92,8 @@ public abstract class Walker { return getToolkit().getSampleDB(); } - protected Sample getSampleByID(final String id) { - return getToolkit().getSampleDB().getSampleById(id); + protected Sample getSample(final String id) { + return getToolkit().getSampleDB().getSample(id); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java index ecc4ad793..cdf1913f7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java @@ -31,6 +31,7 @@ import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgume import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.samples.Sample; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.variantrecalibration.VQSRCalibrationCurve; import org.broadinstitute.sting.utils.GenomeLoc; @@ -247,7 +248,7 @@ public class ProduceBeagleInputWalker extends RodWalker { Map preferredGenotypes = preferredVC.getGenotypes(); Map otherGenotypes = 
goodSite(otherVC) ? otherVC.getGenotypes() : null; for ( String sample : samples ) { - boolean isMaleOnChrX = CHECK_IS_MALE_ON_CHR_X && getSampleByID(sample).isMale(); + boolean isMaleOnChrX = CHECK_IS_MALE_ON_CHR_X && getSample(sample).getGender() == Sample.Gender.MALE; Genotype genotype; boolean isValidation; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java index a4729371a..bbbdf5f1a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java @@ -1095,14 +1095,14 @@ public class ReadBackedPhasingWalker extends RodWalker implements TreeReducible { - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - - // in this HashMap, we'll keep count of how many - HashMap count = new HashMap(); - - ArrayList reads = (ArrayList) context.getBasePileup().getReads(); - - for (SAMRecord read : reads) { - - // get the sample - Sample sample = getSampleDB().getSampleByRead(read); - if (sample == null) - return 1; - - if (!count.containsKey(sample.getPopulation())) { - count.put(sample.getPopulation(), 1); - } - count.put(sample.getPopulation(), count.get(sample.getPopulation()) + 1); - } - - System.out.println("\n\n\n***** LOCUS: " + ref.getLocus().toString() + " *****"); - for (String population : count.keySet()) { - System.out.println(String.format("%s | %d", population, count.get(population))); - } - - return 1; - } - - public Long reduceInit() { return 0l; } - - public Long reduce(Integer value, Long sum) { - return value + sum; - } - - /** - * Reduces two subtrees together. In this case, the implementation of the tree reduce - * is exactly the same as the implementation of the single reduce. 
- */ - public Long treeReduce(Long lhs, Long rhs) { - return lhs + rhs; - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountMalesWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountMalesWalker.java index 2d89a7d44..f2c035c3c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountMalesWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountMalesWalker.java @@ -16,8 +16,8 @@ import org.broadinstitute.sting.gatk.walkers.Requires; @Requires({DataSource.READS, DataSource.REFERENCE}) public class CountMalesWalker extends ReadWalker { public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker tracker) { - Sample sample = getSampleDB().getSampleByRead(read); - return sample.isMale() ? 1 : 0; + Sample sample = getSampleDB().getSample(read); + return sample.getGender() == Sample.Gender.MALE ? 1 : 0; } public Integer reduceInit() { return 0; } diff --git a/public/java/src/org/broadinstitute/sting/utils/ped/PedReader.java b/public/java/src/org/broadinstitute/sting/utils/ped/PedReader.java index 524ba7495..4d282d821 100644 --- a/public/java/src/org/broadinstitute/sting/utils/ped/PedReader.java +++ b/public/java/src/org/broadinstitute/sting/utils/ped/PedReader.java @@ -228,7 +228,6 @@ public class PedReader { public void fillSampleDB(SampleDataSource db) { for ( final PedRecord rec : getRecords() ) { - Sample s = db.getOrCreateSample(rec.individualID); } } } diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java index 3bc325cea..38ffcae8f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java @@ -570,16 +570,6 @@ public abstract class AbstractReadBackedPileup getSamples() { - 
if(!(pileupElementTracker instanceof PerSamplePileupElementTracker)) { - throw new StingException("Must be an instance of PerSampleElementTracker"); - } - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker)pileupElementTracker; - return tracker.getSamples(); - } - - /** * Returns a pileup randomly downsampled to the desiredCoverage. * @@ -688,30 +678,6 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker; - PileupElementTracker filteredElements = tracker.getElements(sample); - return filteredElements != null ? (RBP)createNewPileup(loc,filteredElements) : null; - } - else { - UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - for(PE p: pileupElementTracker) { - SAMRecord read = p.getRead(); - if(sample != null) { - if(read.getReadGroup() != null && sample.getID().equals(read.getReadGroup().getSample())) - filteredTracker.add(p); - } - else { - if(read.getReadGroup() == null || read.getReadGroup().getSample() == null) - filteredTracker.add(p); - } - } - return filteredTracker.size()>0 ? (RBP)createNewPileup(loc,filteredTracker) : null; - } - } - // -------------------------------------------------------- // // iterators diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileup.java index 4e29be934..8dd2394cf 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileup.java @@ -123,19 +123,6 @@ public interface ReadBackedExtendedEventPileup extends ReadBackedPileup { */ public Collection getSampleNames(); - /** - * Gets a list of all the samples stored in this pileup. - * @return List of samples in this pileup. 
- */ - public Collection getSamples(); - - /** - * Gets the particular subset of this pileup with the given sample name. - * @param sample Name of the sample to use. - * @return A subset of this pileup containing only reads with the given sample. - */ - public ReadBackedExtendedEventPileup getPileupForSample(Sample sample); - public Iterable toExtendedIterable(); /** diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java index 449ead56e..b76619cd7 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java @@ -137,13 +137,6 @@ public interface ReadBackedPileup extends Iterable, HasGenomeLoca */ public ReadBackedPileup getPileupForLane(String laneID); - - /** - * Gets a collection of all the samples stored in this pileup. - * @return Collection of samples in this pileup. - */ - public Collection getSamples(); - /** * Gets a collection of *names* of all the samples stored in this pileup. * @return Collection of names @@ -165,13 +158,6 @@ public interface ReadBackedPileup extends Iterable, HasGenomeLoca * @return A subset of this pileup containing only reads with the given sample. */ public ReadBackedPileup getPileupForSampleName(String sampleName); - - /** - * Gets the particular subset of this pileup with the given sample. - * @param sample Sample to use. - * @return A subset of this pileup containing only reads with the given sample. 
- */ - public ReadBackedPileup getPileupForSample(Sample sample); /** * Simple useful routine to count the number of deletion bases in this pileup diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java index 17700cf7c..9f3c2bb29 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java @@ -85,8 +85,7 @@ public class DownsamplerBenchmark extends ReadProcessingBenchmark { (byte)0); GenomeLocParser genomeLocParser = new GenomeLocParser(reader.getFileHeader().getSequenceDictionary()); - SampleDataSource sampleDataSource = new SampleDataSource(); - sampleDataSource.addSamplesFromSAMHeader(reader.getFileHeader()); + SampleDataSource sampleDataSource = new SampleDataSource().addSamples(reader.getFileHeader()); // Filter unmapped reads. TODO: is this always strictly necessary? Who in the GATK normally filters these out? 
Iterator readIterator = new FilteringIterator(reader.iterator(),new UnmappedReadFilter()); diff --git a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDataSourceUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDataSourceUnitTest.java index 61aed7b34..ccbfd5c99 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDataSourceUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDataSourceUnitTest.java @@ -20,7 +20,6 @@ import java.util.*; * Time: 8:21:00 AM */ public class SampleDataSourceUnitTest extends BaseTest { - // this empty header used to instantiate sampledatasource objects private static SAMFileHeader header = new SAMFileHeader(); @@ -32,210 +31,4 @@ public class SampleDataSourceUnitTest extends BaseTest { public void loadSAMSamplesTest() { SampleDataSource s = new SampleDataSource(header, null); } - - // tests that a basic sample with relationships loads correctly - // Note that this is the only test for family relationships - we may want to expand this - @Test() - public void basicLoadSampleFileTest() { - File sampleFile = new File(sampleFilesDir + "basicSampleFile.yaml"); - SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile)); - Assert.assertTrue(s.sampleCount() == 5); - Sample sampleA = s.getSampleById("sampleA"); - Sample sampleB = s.getSampleById("sampleB"); - Assert.assertTrue(sampleB.getMother() == sampleA); - Assert.assertTrue(s.getChildren(sampleA).contains(sampleB)); - Set family = s.getFamily("family1"); - Assert.assertTrue(family.size() == 2); - Assert.assertTrue(family.contains(sampleA)); - Assert.assertTrue(family.contains(sampleB)); - } - - // but that file should fail if it has an extra character in it... 
- @Test(expectedExceptions=StingException.class) - public void loadInvalidSampleExtraCharText() { - File sampleFile = new File(sampleFilesDir + "invalidSyntaxExtraChar.yaml"); - SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile)); - } - - // ...or a typo... - @Test(expectedExceptions=StingException.class) - public void loadInvalidSampleTypoText() { - File sampleFile = new File(sampleFilesDir + "invalidSyntaxTypo.yaml"); - SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile)); - - } - - // ...or an extra unrecognized array - @Test(expectedExceptions=StingException.class) - public void loadInvalidSampleExtraArrayText() { - File sampleFile = new File(sampleFilesDir + "invalidSyntaxExtraArray.yaml"); - SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile)); - } - - // make sure aliases work - @Test(expectedExceptions=StingException.class) - public void sampleAliasText() { - File sampleFile = new File(sampleFilesDir + "basicSampleFileWithAlias.yaml"); - SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile)); - // this file has two samples, but one has an alias. let's make sure that checks out... 
- Assert.assertTrue(s.sampleCount() == 3); - Assert.assertTrue(s.getSampleById("sampleA") == s.getSampleById("sampleC")); - } - - // error is thrown if property is included that's not in properties array - @Test(expectedExceptions=StingException.class) - public void unallowedPropertySampleTest() { - File sampleFile = new File(sampleFilesDir + "basicSampleFileUnallowedProperty.yaml"); - SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile)); - } - - // same as above, with relationship - @Test(expectedExceptions=StingException.class) - public void unallowedRelationshipSampleTest() { - File sampleFile = new File(sampleFilesDir + "basicSampleFileUnallowedRelationship.yaml"); - SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile)); - } - - // two sample files - @Test() - public void twoSampleFilesTest() { - File sampleFile = new File(sampleFilesDir + "basicSampleFile.yaml"); - File secondFile = new File(sampleFilesDir + "basicSampleFileExt.yaml"); - ArrayList files = new ArrayList(); - files.add(sampleFile); - files.add(secondFile); - SampleDataSource s = new SampleDataSource(header, files); - Assert.assertTrue(s.getSampleById("sampleA").getProperty("propC").equals("valC")); - Assert.assertTrue(s.getSampleById("sampleA").getProperty("propA").equals("valA")); - } - - // two sample files, with contradictory properties - @Test(expectedExceptions=StingException.class) - public void twoContradictorySampleFilesTest() { - File sampleFile = new File(sampleFilesDir + "basicSampleFile.yaml"); - File secondFile = new File(sampleFilesDir + "basicSampleFileInvalidExt.yaml"); - ArrayList files = new ArrayList(); - files.add(sampleFile); - files.add(secondFile); - SampleDataSource s = new SampleDataSource(header, files); - } - - // three sample files - @Test() - public void threeSamplesTest() { - File sampleFile = new File(sampleFilesDir + "basicSampleFile.yaml"); - ArrayList files = new ArrayList(); - files.add(sampleFile); - files.add(new 
File(sampleFilesDir + "basicSampleFileExt.yaml")); - files.add(new File(sampleFilesDir + "basicSampleFileExt2.yaml")); - SampleDataSource s = new SampleDataSource(header, files); - Assert.assertTrue(s.sampleCount() == 6); - Assert.assertTrue(s.getSampleById("sampleE").getProperty("propC").equals("valC")); - Assert.assertTrue(s.getSampleById("sampleA").getProperty("propA").equals("valA")); - } - - /** - * testing getSamplesWithProperty - * in this file there are 5 samples - 2 with population "CEU", 1 with population "ABC", 1 with no population, - * and then the default null sample - */ - @Test() - public void getSamplesWithPropertyTest() { - File sampleFile = new File(sampleFilesDir + "sampleFileWithProperties.yaml"); - SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile)); - Assert.assertTrue(s.sampleCount() == 5); - Set ceuSamples = s.getSamplesWithProperty("population", "CEU"); - Assert.assertTrue(ceuSamples.size() == 2); - - Iterator i = ceuSamples.iterator(); - ArrayList sampleNames = new ArrayList(); - sampleNames.add(i.next().getID()); - sampleNames.add(i.next().getID()); - Assert.assertTrue(sampleNames.contains("sampleA")); - Assert.assertTrue(sampleNames.contains("sampleB")); - } - - // make sure we can import data types other than Strings - @Test() - public void sampleTestPropertyType() { - File sampleFile = new File(sampleFilesDir + "sampleFileOtherTypes.yaml"); - SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile)); - Sample sample = s.getSampleById("sampleA"); - Assert.assertTrue(sample.getProperty("a").getClass() == Integer.class); - Assert.assertTrue(sample.getProperty("b").getClass() == String.class); - Assert.assertTrue(sample.getProperty("c").getClass() == Double.class); - Assert.assertTrue(sample.getProperty("b").getClass() == String.class); - } - - /** - * check that getSamplesFromVariantContext works - * create a variant context with two sample names, and make sure the right samples are there - */ - 
@Test() - public void variantContextTest() { - SampleDataSource s = new SampleDataSource(header, null); - List alleleCollection = new ArrayList(); - Allele a1 = Allele.create("A", true); - alleleCollection.add(a1); - - Set genotypeCollection = new HashSet(); - genotypeCollection.add(new Genotype("NA123", alleleCollection)); - genotypeCollection.add(new Genotype("NA456", alleleCollection)); - - VariantContext v = new VariantContext("contextName", "chr1", 1, 1, alleleCollection, genotypeCollection); - - // make sure the set that's returned is the right size - HashSet set = (HashSet) s.getSamplesByVariantContext(v); - Assert.assertTrue(set.size() == 2); - - // make sure both samples are included - Iterator i = set.iterator(); - ArrayList sampleNames = new ArrayList(); - sampleNames.add(i.next().getID()); - sampleNames.add(i.next().getID()); - Assert.assertTrue(sampleNames.contains("NA123")); - Assert.assertTrue(sampleNames.contains("NA456")); - } - - /** - * checking subContextFromSampleProperty - */ - - /** - * check that subContextFromSampleProperty works - * create a variant context with four sample names, make sure that it filters correctly to 2 - */ - @Test() - public void subContextFromSamplePropertyTest() { - - File sampleFile = new File(sampleFilesDir + "sampleFileWithProperties.yaml"); - SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile)); - Assert.assertTrue(s.sampleCount() == 5); - - List alleleCollection = new ArrayList(); - Allele a1 = Allele.create("A", true); - alleleCollection.add(a1); - - Set genotypeCollection = new HashSet(); - genotypeCollection.add(new Genotype("NA123", alleleCollection)); - genotypeCollection.add(new Genotype("sampleA", alleleCollection)); - genotypeCollection.add(new Genotype("sampleB", alleleCollection)); - genotypeCollection.add(new Genotype("sampleC", alleleCollection)); - - VariantContext v = new VariantContext("contextName", "chr1", 1, 1, alleleCollection, genotypeCollection); - VariantContext 
subContext = s.subContextFromSampleProperty(v, "population", "CEU"); - - Assert.assertTrue(subContext.getSampleNames().contains("sampleA")); - Assert.assertTrue(subContext.getSampleNames().contains("sampleA")); - Assert.assertTrue(subContext.getSampleNames().size() == 2); - - } - - - // we create lots of single item lists... - private ArrayList makeFileList(File file) { - ArrayList a = new ArrayList(); - a.add(file); - return a; - } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java index ca777200b..e8d1772b8 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java @@ -13,41 +13,26 @@ import org.testng.annotations.Test; * Time: 8:21:00 AM */ public class SampleUnitTest extends BaseTest { - - static Sample sampleA; - static Sample sampleA1; - static Sample sampleB; - static Sample sampleC; + SampleDataSource db; + static Sample fam1A, fam1B, fam1C; + static Sample s1, s2; + static Sample trait1, trait2, trait3, trait4; @BeforeClass public void init() { - sampleA = new Sample("sampleA"); - sampleA.setProperty("uniqueProperty", "uniqueValue"); - sampleA1 = new Sample("sampleA"); - sampleA1.setProperty("uniqueProperty", "uniqueValue"); - sampleB = new Sample("sampleB"); - sampleC = new Sample("sampleC"); - sampleC.setProperty("population", "pop1"); - sampleC.setProperty("gender", Sample.Gender.MALE); - } + db = new SampleDataSource(); - /** - * Testing equality - */ - @Test() - public void equalsTest() { - Assert.assertTrue(sampleA.equals(sampleA1)); - Assert.assertFalse(sampleA == sampleA1); - Assert.assertFalse(sampleA.equals(sampleB)); - } + fam1A = new Sample("1A", db, "fam1", "1B", "1C", Sample.Gender.UNKNOWN); + fam1B = new Sample("1B", db, "fam1", null, null, Sample.Gender.MALE); + fam1C = new Sample("1C", db, "fam1", null, null, 
Sample.Gender.FEMALE); - /** - * And hash - */ - @Test() - public void basicHashTest() { - Assert.assertFalse(sampleA.hashCode() == sampleB.hashCode()); - Assert.assertTrue(sampleA.hashCode() == sampleA1.hashCode()); + s1 = new Sample("s1", db); + s2 = new Sample("s2", db); + + trait1 = new Sample("t1", db, Sample.UNSET_QUANTITIATIVE_TRAIT_VALUE, Sample.Affection.AFFECTED); + trait2 = new Sample("t2", db, Sample.UNSET_QUANTITIATIVE_TRAIT_VALUE, Sample.Affection.UNAFFECTED); + trait3 = new Sample("t3", db, Sample.UNSET_QUANTITIATIVE_TRAIT_VALUE, Sample.Affection.UNKNOWN); + trait4 = new Sample("t4", db, 1.0, Sample.Affection.QUANTITATIVE); } /** @@ -55,10 +40,15 @@ public class SampleUnitTest extends BaseTest { */ @Test() public void specialGettersTest() { - Assert.assertTrue(sampleC.getID().equals("sampleC")); - Assert.assertTrue(sampleC.getPopulation().equals("pop1")); - Assert.assertTrue(sampleC.isMale()); - Assert.assertFalse(sampleA.isMale()); // sample A doesn't have a gender, so this should be false + // todo -- test for sample with extra properties, like population +// Assert.assertTrue(sampleC.getID().equals("sampleC")); +// Assert.assertTrue(sampleC.getPopulation().equals("pop1")); } -} + @Test() + public void testGenders() { + Assert.assertTrue(fam1A.getGender() == Sample.Gender.UNKNOWN); + Assert.assertTrue(fam1B.getGender() == Sample.Gender.MALE); + Assert.assertTrue(fam1C.getGender() == Sample.Gender.FEMALE); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java index 2b9ff7113..7b1ee9768 100644 --- a/public/java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java @@ -142,8 +142,8 @@ public class ReadBackedPileupUnitTest { Arrays.asList(read2,read4), Arrays.asList(1,1)); Map sampleToPileupMap = new 
HashMap(); - sampleToPileupMap.put(new Sample(readGroupOne.getSample()),sample1Pileup); - sampleToPileupMap.put(new Sample(readGroupTwo.getSample()),sample2Pileup); + sampleToPileupMap.put(new Sample(readGroupOne.getSample(), null),sample1Pileup); + sampleToPileupMap.put(new Sample(readGroupTwo.getSample(), null),sample2Pileup); ReadBackedPileup compositePileup = new ReadBackedPileupImpl(null,sampleToPileupMap); @@ -164,8 +164,8 @@ public class ReadBackedPileupUnitTest { @Test public void testGetPileupForSample() { - Sample sample1 = new Sample("sample1"); - Sample sample2 = new Sample("sample2"); + Sample sample1 = new Sample("sample1", null); + Sample sample2 = new Sample("sample2", null); SAMReadGroupRecord readGroupOne = new SAMReadGroupRecord("rg1"); readGroupOne.setSample(sample1.getID()); @@ -187,15 +187,11 @@ public class ReadBackedPileupUnitTest { ReadBackedPileup pileup = new ReadBackedPileupImpl(null,sampleToPileupMap); - ReadBackedPileup sample1Pileup = pileup.getPileupForSample(sample1); - Assert.assertEquals(sample1Pileup.size(),1,"Sample 1 pileup has wrong number of elements"); - Assert.assertEquals(sample1Pileup.getReads().get(0),read1,"Sample 1 pileup has incorrect read"); - ReadBackedPileup sample2Pileup = pileup.getPileupForSampleName(sample2.getID()); Assert.assertEquals(sample2Pileup.size(),1,"Sample 2 pileup has wrong number of elements"); Assert.assertEquals(sample2Pileup.getReads().get(0),read2,"Sample 2 pileup has incorrect read"); - ReadBackedPileup missingSamplePileup = pileup.getPileupForSample(new Sample("missing")); + ReadBackedPileup missingSamplePileup = pileup.getPileupForSampleName("missing"); Assert.assertNull(missingSamplePileup,"Pileup for sample 'missing' should be null but isn't"); missingSamplePileup = pileup.getPileupForSampleName("not here"); From 5043d76c3d308d45db816cfb685fe14085d41acb Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 29 Sep 2011 12:16:34 -0400 Subject: [PATCH 06/63] Removing more bad uses of 
SampleDataSource creation --- .../sting/utils/MendelianViolation.java | 26 ------------------- .../samples/SampleDataSourceUnitTest.java | 2 +- 2 files changed, 1 insertion(+), 27 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java b/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java index a87a73a2d..a89bad2fd 100755 --- a/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java +++ b/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java @@ -1,6 +1,5 @@ package org.broadinstitute.sting.utils; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.samples.Sample; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Genotype; @@ -94,31 +93,6 @@ public class MendelianViolation { minGenotypeQuality = minGenotypeQualityP; } - - /** - * The most common constructor to be used when give a YAML file with the relationships to the engine with the -SM option. - * @param engine - The GATK engine, use getToolkit(). That's where the sample information is stored. - * @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to asses mendelian violation - */ - public MendelianViolation(GenomeAnalysisEngine engine, double minGenotypeQualityP) { - boolean gotSampleInformation = false; - Collection samples = engine.getSampleDB().getSamples(); - // Iterate through all samples in the sample_metadata file but we really can only take one. 
- for (Sample sample : samples) { - if (sample.getMother() != null && sample.getFather() != null) { - sampleMom = sample.getMother().getID(); - sampleDad = sample.getFather().getID(); - sampleChild = sample.getID(); - minGenotypeQuality = minGenotypeQualityP; - gotSampleInformation = true; - break; // we can only deal with one trio information - } - } - if (!gotSampleInformation) - throw new UserException("YAML file has no sample with relationship information (mother/father)"); - } - - /** * This method prepares the object to evaluate for violation. Typically you won't call it directly, a call to * isViolation(vc) will take care of this. But if you want to know whether your site was a valid comparison site diff --git a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDataSourceUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDataSourceUnitTest.java index ccbfd5c99..3d40d4de8 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDataSourceUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDataSourceUnitTest.java @@ -29,6 +29,6 @@ public class SampleDataSourceUnitTest extends BaseTest { // make sure samples are created from the SAM file correctly @Test() public void loadSAMSamplesTest() { - SampleDataSource s = new SampleDataSource(header, null); + SampleDataSource s = new SampleDataSource(header, Collections.emptyList()); } } From 9536845e3544fb5323c740216a0080009b8e61d9 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 29 Sep 2011 12:20:07 -0400 Subject: [PATCH 07/63] Cleaning up unused code in MV --- .../sting/utils/MendelianViolation.java | 50 +------------------ 1 file changed, 1 insertion(+), 49 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java b/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java index a89bad2fd..e62a7e512 100755 --- 
a/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java +++ b/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java @@ -16,9 +16,6 @@ import java.util.regex.Pattern; * Time: 12:38 PM */ public class MendelianViolation { - - - String sampleMom; String sampleDad; String sampleChild; @@ -31,22 +28,15 @@ public class MendelianViolation { private static Pattern FAMILY_PATTERN = Pattern.compile("(.*)\\+(.*)=(.*)"); - static final int[] mvOffsets = new int[] { 1,2,5,6,8,11,15,18,20,21,24,25 }; - static final int[] nonMVOffsets = new int[]{ 0,3,4,7,9,10,12,13,14,16,17,19,22,23,26 }; - - public String getSampleMom() { return sampleMom; } - public String getSampleDad() { return sampleDad; } - public String getSampleChild() { return sampleChild; } - public double getMinGenotypeQuality() { return minGenotypeQuality; } @@ -130,7 +120,7 @@ public class MendelianViolation { * @return False if we can't determine (lack of information), or it's not a violation. True if it is a violation. * */ - public boolean isViolation (VariantContext vc) + public boolean isViolation(VariantContext vc) { return setAlleles(vc) && isViolation(); } @@ -144,42 +134,4 @@ public class MendelianViolation { return false; return true; } - - /** - * @return the likelihood ratio for a mendelian violation - */ - public double violationLikelihoodRatio(VariantContext vc) { - double[] logLikAssignments = new double[27]; - // the matrix to set up is - // MOM DAD CHILD - // |- AA - // AA AA | AB - // |- BB - // |- AA - // AA AB | AB - // |- BB - // etc. 
The leaves are counted as 0-11 for MVs and 0-14 for non-MVs - double[] momGL = vc.getGenotype(sampleMom).getLikelihoods().getAsVector(); - double[] dadGL = vc.getGenotype(sampleDad).getLikelihoods().getAsVector(); - double[] childGL = vc.getGenotype(sampleChild).getLikelihoods().getAsVector(); - int offset = 0; - for ( int oMom = 0; oMom < 3; oMom++ ) { - for ( int oDad = 0; oDad < 3; oDad++ ) { - for ( int oChild = 0; oChild < 3; oChild ++ ) { - logLikAssignments[offset++] = momGL[oMom] + dadGL[oDad] + childGL[oChild]; - } - } - } - double[] mvLiks = new double[12]; - double[] nonMVLiks = new double[15]; - for ( int i = 0; i < 12; i ++ ) { - mvLiks[i] = logLikAssignments[mvOffsets[i]]; - } - - for ( int i = 0; i < 15; i++) { - nonMVLiks[i] = logLikAssignments[nonMVOffsets[i]]; - } - - return MathUtils.log10sumLog10(mvLiks) - MathUtils.log10sumLog10(nonMVLiks); - } } From 68761a6e2882ce23744e13cf3c70073c90fca267 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 29 Sep 2011 14:13:05 -0400 Subject: [PATCH 08/63] Removed sample from header --- .../org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java | 1 - 1 file changed, 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java index b76619cd7..32ab50695 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java @@ -25,7 +25,6 @@ package org.broadinstitute.sting.utils.pileup; import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.samples.Sample; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.HasGenomeLocation; From 625ffb6a075f5233cd0730bb71899463bfec4ad3 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 29 Sep 2011 14:52:11 -0400 Subject: [PATCH 09/63] LocusIteratorByState and ReadBackedPileups no long use 
Sample --- .../sting/gatk/executive/WindowMaker.java | 2 +- .../gatk/iterators/LocusIteratorByState.java | 60 +++++++------------ .../sting/gatk/samples/SampleDataSource.java | 11 ++-- .../pileup/AbstractReadBackedPileup.java | 34 +++++------ .../pileup/MergingPileupElementIterator.java | 3 +- .../utils/pileup/PileupElementTracker.java | 33 +++------- .../ReadBackedExtendedEventPileupImpl.java | 3 +- .../utils/pileup/ReadBackedPileupImpl.java | 3 +- .../LocusIteratorByStateUnitTest.java | 13 ++-- 9 files changed, 59 insertions(+), 103 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java index da0395ba8..fb207087f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java @@ -70,7 +70,7 @@ public class WindowMaker implements Iterable, I this.sourceInfo = shard.getReadProperties(); this.readIterator = iterator; - this.sourceIterator = new PeekableIterator(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser,sampleData)); + this.sourceIterator = new PeekableIterator(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser,sampleData.getSampleNames())); this.intervalIterator = intervals.size()>0 ? 
new PeekableIterator(intervals.iterator()) : null; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java index 61b861fd6..e466aa325 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java @@ -35,8 +35,6 @@ import org.broadinstitute.sting.gatk.DownsampleType; import org.broadinstitute.sting.gatk.DownsamplingMethod; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.samples.Sample; -import org.broadinstitute.sting.gatk.samples.SampleDataSource; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.MathUtils; @@ -69,7 +67,7 @@ public class LocusIteratorByState extends LocusIterator { * Used to create new GenomeLocs. 
*/ private final GenomeLocParser genomeLocParser; - private final ArrayList samples; + private final ArrayList samples; private final ReadStateManager readStates; static private class SAMRecordState { @@ -278,15 +276,15 @@ public class LocusIteratorByState extends LocusIterator { // // ----------------------------------------------------------------------------------------------------------------- - public LocusIteratorByState(final Iterator samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, SampleDataSource sampleData ) { + public LocusIteratorByState(final Iterator samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, Collection samples ) { this.readInfo = readInformation; this.genomeLocParser = genomeLocParser; + this.samples = new ArrayList(samples); + this.readStates = new ReadStateManager(samIterator,readInformation.getDownsamplingMethod()); + } - // get the list of samples - this.samples = new ArrayList(sampleData.getSamples()); - - readStates = new ReadStateManager(samIterator,readInformation.getDownsamplingMethod()); - + public LocusIteratorByState(final Iterator samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser ) { + this(samIterator, readInformation, genomeLocParser, Collections.emptySet()); } public Iterator iterator() { @@ -303,19 +301,6 @@ public class LocusIteratorByState extends LocusIterator { //if ( DEBUG ) System.out.printf("hasNext() = %b%n", r); } - public void printState() { - for(Sample sample: samples) { - Iterator iterator = readStates.iterator(sample); - while(iterator.hasNext()) { - SAMRecordState state = iterator.next(); - logger.debug(String.format("printState():")); - SAMRecord read = state.getRead(); - int offset = state.getReadOffset(); - logger.debug(String.format(" read: %s(%d)=%s, cigar=%s", read.getReadName(), offset, (char)read.getReadBases()[offset], read.getCigarString())); - } - } - } - private GenomeLoc getLocation() { return readStates.isEmpty() ? 
null : readStates.getFirst().getLocation(genomeLocParser); } @@ -355,14 +340,14 @@ public class LocusIteratorByState extends LocusIterator { // In this case, the subsequent call to next() will emit the normal pileup at the current base // and shift the position. if (readInfo.generateExtendedEvents() && hasExtendedEvents) { - Map fullExtendedEventPileup = new HashMap(); + Map fullExtendedEventPileup = new HashMap(); // get current location on the reference and decrement it by 1: the indels we just stepped over // are associated with the *previous* reference base GenomeLoc loc = genomeLocParser.incPos(getLocation(),-1); boolean hasBeenSampled = false; - for(Sample sample: samples) { + for(final String sample: samples) { Iterator iterator = readStates.iterator(sample); List indelPile = new ArrayList(readStates.size(sample)); hasBeenSampled |= loc.getStart() <= readStates.getDownsamplingExtent(sample); @@ -426,10 +411,10 @@ public class LocusIteratorByState extends LocusIterator { nextAlignmentContext = new AlignmentContext(loc, new ReadBackedExtendedEventPileupImpl(loc, fullExtendedEventPileup), hasBeenSampled); } else { GenomeLoc location = getLocation(); - Map fullPileup = new HashMap(); + Map fullPileup = new HashMap(); boolean hasBeenSampled = false; - for(Sample sample: samples) { + for(final String sample: samples) { Iterator iterator = readStates.iterator(sample); List pile = new ArrayList(readStates.size(sample)); hasBeenSampled |= location.getStart() <= readStates.getDownsamplingExtent(sample); @@ -495,7 +480,7 @@ public class LocusIteratorByState extends LocusIterator { } private void updateReadStates() { - for(Sample sample: samples) { + for(final String sample: samples) { Iterator it = readStates.iterator(sample); while ( it.hasNext() ) { SAMRecordState state = it.next(); @@ -522,7 +507,7 @@ public class LocusIteratorByState extends LocusIterator { private final PeekableIterator iterator; private final DownsamplingMethod downsamplingMethod; private final 
SamplePartitioner samplePartitioner; - private final Map readStatesBySample = new HashMap(); + private final Map readStatesBySample = new HashMap(); private final int targetCoverage; private int totalReadStates = 0; @@ -540,9 +525,9 @@ public class LocusIteratorByState extends LocusIterator { } Map readSelectors = new HashMap(); - for(Sample sample: samples) { + for(final String sample: samples) { readStatesBySample.put(sample,new PerSampleReadStateManager()); - readSelectors.put(sample.getID(),downsamplingMethod.type == DownsampleType.BY_SAMPLE ? new NRandomReadSelector(null,targetCoverage) : new AllReadsSelector()); + readSelectors.put(sample,downsamplingMethod.type == DownsampleType.BY_SAMPLE ? new NRandomReadSelector(null,targetCoverage) : new AllReadsSelector()); } samplePartitioner = new SamplePartitioner(readSelectors); @@ -554,7 +539,7 @@ public class LocusIteratorByState extends LocusIterator { * @param sample The sample. * @return Iterator over the reads associated with that sample. */ - public Iterator iterator(final Sample sample) { + public Iterator iterator(final String sample) { return new Iterator() { private Iterator wrappedIterator = readStatesBySample.get(sample).iterator(); @@ -590,7 +575,7 @@ public class LocusIteratorByState extends LocusIterator { * @param sample The sample. * @return Total number of reads in the given sample. */ - public int size(final Sample sample) { + public int size(final String sample) { return readStatesBySample.get(sample).size(); } @@ -600,12 +585,12 @@ public class LocusIteratorByState extends LocusIterator { * @param sample Sample, downsampled independently. * @return Integer stop of the furthest undownsampled region. 
*/ - public int getDownsamplingExtent(final Sample sample) { + public int getDownsamplingExtent(final String sample) { return readStatesBySample.get(sample).getDownsamplingExtent(); } public SAMRecordState getFirst() { - for(Sample sample: samples) { + for(final String sample: samples) { PerSampleReadStateManager reads = readStatesBySample.get(sample); if(!reads.isEmpty()) return reads.peek(); @@ -639,8 +624,8 @@ public class LocusIteratorByState extends LocusIterator { } samplePartitioner.complete(); - for(Sample sample: samples) { - ReadSelector aggregator = samplePartitioner.getSelectedReads(sample.getID()); + for(final String sample: samples) { + ReadSelector aggregator = samplePartitioner.getSelectedReads(sample); Collection newReads = new ArrayList(aggregator.getSelectedReads()); @@ -1072,6 +1057,3 @@ class SamplePartitioner implements ReadSelector { } } - - - diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDataSource.java index fec82a71f..9543c834a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDataSource.java @@ -182,12 +182,15 @@ public class SampleDataSource { return children; } - public Set getSamples() { - HashSet set = new HashSet(); - set.addAll(samples.values()); - return set; + public Collection getSamples() { + return Collections.unmodifiableCollection(samples.values()); } + public Collection getSampleNames() { + return Collections.unmodifiableCollection(samples.keySet()); + } + + /** * Takes a collection of sample names and returns their corresponding sample objects * Note that, since a set is returned, if you pass in a list with duplicates names there will not be any duplicates in the returned set diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java 
b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java index 38ffcae8f..fd1a041c5 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java @@ -26,11 +26,9 @@ package org.broadinstitute.sting.utils.pileup; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.samples.Sample; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.StingException; import java.util.*; @@ -114,10 +112,10 @@ public abstract class AbstractReadBackedPileup> pileupsBySample) { + protected AbstractReadBackedPileup(GenomeLoc loc, Map> pileupsBySample) { this.loc = loc; PerSamplePileupElementTracker tracker = new PerSamplePileupElementTracker(); - for(Map.Entry> pileupEntry: pileupsBySample.entrySet()) { + for(Map.Entry> pileupEntry: pileupsBySample.entrySet()) { tracker.addElements(pileupEntry.getKey(),pileupEntry.getValue().pileupElementTracker); addPileupToCumulativeStats(pileupEntry.getValue()); } @@ -213,7 +211,7 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - for(Sample sample: tracker.getSamples()) { + for(final String sample: tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getPileupWithoutDeletions(); filteredTracker.addElements(sample,pileup.pileupElementTracker); @@ -251,7 +249,7 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker; PerSamplePileupElementTracker 
filteredTracker = new PerSamplePileupElementTracker(); - for(Sample sample: tracker.getSamples()) { + for(final String sample: tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getOverlappingFragmentFilteredPileup(); filteredTracker.addElements(sample,pileup.pileupElementTracker); @@ -305,7 +303,7 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - for(Sample sample: tracker.getSamples()) { + for(final String sample: tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getPileupWithoutMappingQualityZeroReads(); filteredTracker.addElements(sample,pileup.pileupElementTracker); @@ -334,7 +332,7 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - for(Sample sample: tracker.getSamples()) { + for(final String sample: tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getPositiveStrandPileup(); filteredTracker.addElements(sample,pileup.pileupElementTracker); @@ -363,7 +361,7 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - for(Sample sample: tracker.getSamples()) { + for(final String sample: tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getNegativeStrandPileup(); 
filteredTracker.addElements(sample,pileup.pileupElementTracker); @@ -393,7 +391,7 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - for(Sample sample: tracker.getSamples()) { + for(final String sample: tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getFilteredPileup(filter); filteredTracker.addElements(sample,pileup.pileupElementTracker); @@ -425,7 +423,7 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - for(Sample sample: tracker.getSamples()) { + for(final String sample: tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getBaseAndMappingFilteredPileup(minBaseQ,minMapQ); filteredTracker.addElements(sample,pileup.pileupElementTracker); @@ -492,7 +490,7 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - for(Sample sample: tracker.getSamples()) { + for(final String sample: tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getPileupForReadGroup(targetReadGroupId); if(pileup != null) @@ -523,7 +521,7 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - for(Sample sample: tracker.getSamples()) { + for(final String sample: 
tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getPileupForLane(laneID); if(pileup != null) @@ -553,11 +551,7 @@ public abstract class AbstractReadBackedPileup getSampleNames() { if(pileupElementTracker instanceof PerSamplePileupElementTracker) { PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker)pileupElementTracker; - Collection sampleNames = new HashSet(); - for (Sample sample : tracker.getSamples()) { - sampleNames.add(sample.getID()); - } - return sampleNames; + return new HashSet(tracker.getSamples()); } else { Collection sampleNames = new HashSet(); @@ -594,7 +588,7 @@ public abstract class AbstractReadBackedPileup perSampleElements = tracker.getElements(sample); List filteredPileup = new ArrayList(); @@ -767,7 +761,7 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker; - for(Sample sample: tracker.getSamples()) { + for(final String sample: tracker.getSamples()) { int[] countsBySample = createNewPileup(loc,tracker.getElements(sample)).getBaseCounts(); for(int i = 0; i < counts.length; i++) counts[i] += countsBySample[i]; diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/MergingPileupElementIterator.java b/public/java/src/org/broadinstitute/sting/utils/pileup/MergingPileupElementIterator.java index 58afc35e9..c00ed24f2 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/MergingPileupElementIterator.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/MergingPileupElementIterator.java @@ -25,7 +25,6 @@ package org.broadinstitute.sting.utils.pileup; import net.sf.picard.util.PeekableIterator; -import org.broadinstitute.sting.gatk.samples.Sample; import java.util.Comparator; import java.util.Iterator; @@ -42,7 +41,7 @@ class MergingPileupElementIterator implements Iterator public 
MergingPileupElementIterator(PerSamplePileupElementTracker tracker) { perSampleIterators = new PriorityQueue>(Math.max(1,tracker.getSamples().size()),new PileupElementIteratorComparator()); - for(Sample sample: tracker.getSamples()) { + for(final String sample: tracker.getSamples()) { PileupElementTracker trackerPerSample = tracker.getElements(sample); if(trackerPerSample.size() != 0) perSampleIterators.add(new PeekableIterator(trackerPerSample.iterator())); diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElementTracker.java b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElementTracker.java index 56e06c3d7..09b907e00 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElementTracker.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElementTracker.java @@ -24,8 +24,6 @@ package org.broadinstitute.sting.utils.pileup; -import org.broadinstitute.sting.gatk.samples.Sample; - import java.util.*; /** @@ -60,52 +58,35 @@ class UnifiedPileupElementTracker extends PileupElemen } class PerSamplePileupElementTracker extends PileupElementTracker { - private final Map> pileup; - private final Map sampleNames = new HashMap(); + private final Map> pileup; private int size = 0; public PerSamplePileupElementTracker() { - pileup = new HashMap>(); - } - - public PerSamplePileupElementTracker(Map> pileupsBySample) { - pileup = new HashMap>(); - for(Map.Entry> entry: pileupsBySample.entrySet()) { - Sample sample = entry.getKey(); - AbstractReadBackedPileup pileupBySample = entry.getValue(); - pileup.put(sample,pileupBySample.pileupElementTracker); - sampleNames.put(sample.getID(), sample); - } + pileup = new HashMap>(); } /** * Gets a list of all the samples stored in this pileup. * @return List of samples in this pileup. 
*/ - public Collection getSamples() { + public Collection getSamples() { return pileup.keySet(); } - public PileupElementTracker getElements(final Sample sample) { + public PileupElementTracker getElements(final String sample) { return pileup.get(sample); } - public PileupElementTracker getElements(final String sampleName) { - return pileup.get(sampleNames.get(sampleName)); - } - public PileupElementTracker getElements(final Collection selectSampleNames) { PerSamplePileupElementTracker result = new PerSamplePileupElementTracker(); - for (String sample : selectSampleNames) { - Sample sampleObject = sampleNames.get(sample); - result.addElements(sampleObject, pileup.get(sampleObject)); + for (final String sample : selectSampleNames) { + result.addElements(sample, pileup.get(sample)); } return result; } - public void addElements(final Sample sample, PileupElementTracker elements) { + public void addElements(final String sample, PileupElementTracker elements) { pileup.put(sample,elements); - sampleNames.put(sample.getID(), sample); size += elements.size(); } diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java index 6a3de5570..21dfee8b8 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java @@ -24,7 +24,6 @@ package org.broadinstitute.sting.utils.pileup; import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.samples.Sample; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -56,7 +55,7 @@ public class ReadBackedExtendedEventPileupImpl extends AbstractReadBackedPileup< } // this is the good new one - public 
ReadBackedExtendedEventPileupImpl(GenomeLoc loc, Map pileupElementsBySample) { + public ReadBackedExtendedEventPileupImpl(GenomeLoc loc, Map pileupElementsBySample) { super(loc,pileupElementsBySample); } diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java index 7ebf6281b..18e6d9134 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java @@ -24,7 +24,6 @@ package org.broadinstitute.sting.utils.pileup; import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.samples.Sample; import org.broadinstitute.sting.utils.GenomeLoc; import java.util.List; @@ -48,7 +47,7 @@ public class ReadBackedPileupImpl extends AbstractReadBackedPileup pileupElementsBySample) { + public ReadBackedPileupImpl(GenomeLoc loc, Map pileupElementsBySample) { super(loc,pileupElementsBySample); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java index c8cfdac9a..0db2e466c 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java @@ -28,7 +28,6 @@ import java.util.*; * testing of the LocusIteratorByState */ public class LocusIteratorByStateUnitTest extends BaseTest { - private final int MAX_READS = 10; private static SAMFileHeader header; private LocusIteratorByState li; @@ -67,7 +66,7 @@ public class LocusIteratorByStateUnitTest extends BaseTest { List reads = Arrays.asList(before,during,after); // create the iterator by state with the fake reads and fake records - li = new LocusIteratorByState(new 
FakeCloseableIterator(reads.iterator()),readAttributes,genomeLocParser, new SampleDataSource()); + li = new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()),readAttributes,genomeLocParser); boolean foundExtendedEventPileup = false; while (li.hasNext()) { @@ -119,7 +118,7 @@ public class LocusIteratorByStateUnitTest extends BaseTest { List reads = Arrays.asList(before,during,after); // create the iterator by state with the fake reads and fake records - li = new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()),readAttributes,genomeLocParser, new SampleDataSource()); + li = new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()),readAttributes,genomeLocParser); boolean foundExtendedEventPileup = false; while (li.hasNext()) { @@ -153,7 +152,7 @@ public class LocusIteratorByStateUnitTest extends BaseTest { List reads = Arrays.asList(indelOnlyRead); // create the iterator by state with the fake reads and fake records - li = new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()),readAttributes,genomeLocParser,new SampleDataSource()); + li = new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()),readAttributes,genomeLocParser); // Traditionally, reads that end with indels bleed into the pileup at the following locus. Verify that the next pileup contains this read // and considers it to be an indel-containing read. @@ -166,7 +165,7 @@ public class LocusIteratorByStateUnitTest extends BaseTest { // Turn on extended events, and make sure the event is found. 
JVMUtils.setFieldValue(JVMUtils.findField(ReadProperties.class,"generateExtendedEvents"),readAttributes,true); - li = new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()),readAttributes,genomeLocParser,new SampleDataSource()); + li = new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()),readAttributes,genomeLocParser); Assert.assertTrue(li.hasNext(),"LocusIteratorByState with extended events should contain exactly one pileup"); alignmentContext = li.next(); @@ -202,7 +201,7 @@ public class LocusIteratorByStateUnitTest extends BaseTest { List reads = Arrays.asList(leadingRead,indelOnlyRead,fullMatchAfterIndel); // create the iterator by state with the fake reads and fake records - li = new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()),createTestReadProperties(),genomeLocParser,new SampleDataSource()); + li = new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()),createTestReadProperties(),genomeLocParser); int currentLocus = firstLocus; int numAlignmentContextsFound = 0; @@ -259,7 +258,7 @@ public class LocusIteratorByStateUnitTest extends BaseTest { List reads = Arrays.asList(leadingRead,indelOnlyRead,fullMatchAfterIndel); // create the iterator by state with the fake reads and fake records - li = new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()),readAttributes,genomeLocParser,new SampleDataSource()); + li = new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()),readAttributes,genomeLocParser); Assert.assertTrue(li.hasNext(),"Missing first locus at " + firstLocus); AlignmentContext alignmentContext = li.next(); From 9458f01409b5307981992bdeb7f26d460465f01f Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 29 Sep 2011 15:13:05 -0400 Subject: [PATCH 10/63] Test cleanup of Sample object --- .../reads/DownsamplerBenchmark.java | 2 +- .../utils/pileup/ReadBackedPileupUnitTest.java | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git 
a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java index 9f3c2bb29..0e1f4253a 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java @@ -89,7 +89,7 @@ public class DownsamplerBenchmark extends ReadProcessingBenchmark { // Filter unmapped reads. TODO: is this always strictly necessary? Who in the GATK normally filters these out? Iterator readIterator = new FilteringIterator(reader.iterator(),new UnmappedReadFilter()); - LocusIteratorByState locusIteratorByState = new LocusIteratorByState(readIterator,readProperties,genomeLocParser,sampleDataSource); + LocusIteratorByState locusIteratorByState = new LocusIteratorByState(readIterator,readProperties,genomeLocParser,sampleDataSource.getSampleNames()); while(locusIteratorByState.hasNext()) { locusIteratorByState.next().getLocation(); } diff --git a/public/java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java index 7b1ee9768..4e0541522 100644 --- a/public/java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java @@ -141,9 +141,9 @@ public class ReadBackedPileupUnitTest { ReadBackedPileupImpl sample2Pileup = new ReadBackedPileupImpl(null, Arrays.asList(read2,read4), Arrays.asList(1,1)); - Map sampleToPileupMap = new HashMap(); - sampleToPileupMap.put(new Sample(readGroupOne.getSample(), null),sample1Pileup); - sampleToPileupMap.put(new Sample(readGroupTwo.getSample(), null),sample2Pileup); + Map sampleToPileupMap = new HashMap(); + sampleToPileupMap.put(readGroupOne.getSample(),sample1Pileup); + 
sampleToPileupMap.put(readGroupTwo.getSample(),sample2Pileup); ReadBackedPileup compositePileup = new ReadBackedPileupImpl(null,sampleToPileupMap); @@ -164,13 +164,13 @@ public class ReadBackedPileupUnitTest { @Test public void testGetPileupForSample() { - Sample sample1 = new Sample("sample1", null); - Sample sample2 = new Sample("sample2", null); + String sample1 = "sample1"; + String sample2 = "sample2"; SAMReadGroupRecord readGroupOne = new SAMReadGroupRecord("rg1"); - readGroupOne.setSample(sample1.getID()); + readGroupOne.setSample(sample1); SAMReadGroupRecord readGroupTwo = new SAMReadGroupRecord("rg2"); - readGroupTwo.setSample(sample2.getID()); + readGroupTwo.setSample(sample2); SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1,1,1000); header.addReadGroup(readGroupOne); @@ -181,13 +181,13 @@ public class ReadBackedPileupUnitTest { SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header,"read2",0,1,10); read2.setAttribute("RG",readGroupTwo.getId()); - Map sampleToPileupMap = new HashMap(); + Map sampleToPileupMap = new HashMap(); sampleToPileupMap.put(sample1,new ReadBackedPileupImpl(null,Collections.singletonList(read1),0)); sampleToPileupMap.put(sample2,new ReadBackedPileupImpl(null,Collections.singletonList(read2),0)); ReadBackedPileup pileup = new ReadBackedPileupImpl(null,sampleToPileupMap); - ReadBackedPileup sample2Pileup = pileup.getPileupForSampleName(sample2.getID()); + ReadBackedPileup sample2Pileup = pileup.getPileupForSampleName(sample2); Assert.assertEquals(sample2Pileup.size(),1,"Sample 2 pileup has wrong number of elements"); Assert.assertEquals(sample2Pileup.getReads().get(0),read2,"Sample 2 pileup has incorrect read"); From b71b51751ecc7ea38e4bf88c11c3420e102b7b14 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 29 Sep 2011 17:30:01 -0400 Subject: [PATCH 11/63] Bug fix for UnitTest -- Provide the null sample to the LIBS, as this seems to be required for correctly passing this unit test -- Will be fixed 
in a future update --- .../iterators/LocusIteratorByStateUnitTest.java | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java index 0db2e466c..6e9fa0372 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java @@ -40,6 +40,11 @@ public class LocusIteratorByStateUnitTest extends BaseTest { genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); } + private final LocusIteratorByState makeLTBS(List reads, ReadProperties readAttributes) { + return new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()), + readAttributes,genomeLocParser, new SampleDataSource().getSampleNames()); + } + @Test public void testIndelBaseQualityFiltering() { final byte[] bases = new byte[] {'A','A','A','A','A','A','A','A','A','A'}; @@ -66,7 +71,7 @@ public class LocusIteratorByStateUnitTest extends BaseTest { List reads = Arrays.asList(before,during,after); // create the iterator by state with the fake reads and fake records - li = new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()),readAttributes,genomeLocParser); + li = makeLTBS(reads,readAttributes); boolean foundExtendedEventPileup = false; while (li.hasNext()) { @@ -118,7 +123,7 @@ public class LocusIteratorByStateUnitTest extends BaseTest { List reads = Arrays.asList(before,during,after); // create the iterator by state with the fake reads and fake records - li = new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()),readAttributes,genomeLocParser); + li = makeLTBS(reads,readAttributes); boolean foundExtendedEventPileup = false; while (li.hasNext()) { @@ -152,7 +157,7 @@ public class LocusIteratorByStateUnitTest extends 
BaseTest { List reads = Arrays.asList(indelOnlyRead); // create the iterator by state with the fake reads and fake records - li = new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()),readAttributes,genomeLocParser); + li = makeLTBS(reads, readAttributes); // Traditionally, reads that end with indels bleed into the pileup at the following locus. Verify that the next pileup contains this read // and considers it to be an indel-containing read. @@ -165,7 +170,7 @@ public class LocusIteratorByStateUnitTest extends BaseTest { // Turn on extended events, and make sure the event is found. JVMUtils.setFieldValue(JVMUtils.findField(ReadProperties.class,"generateExtendedEvents"),readAttributes,true); - li = new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()),readAttributes,genomeLocParser); + li = makeLTBS(reads, readAttributes); Assert.assertTrue(li.hasNext(),"LocusIteratorByState with extended events should contain exactly one pileup"); alignmentContext = li.next(); @@ -201,7 +206,7 @@ public class LocusIteratorByStateUnitTest extends BaseTest { List reads = Arrays.asList(leadingRead,indelOnlyRead,fullMatchAfterIndel); // create the iterator by state with the fake reads and fake records - li = new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()),createTestReadProperties(),genomeLocParser); + li = makeLTBS(reads, createTestReadProperties()); int currentLocus = firstLocus; int numAlignmentContextsFound = 0; @@ -258,7 +263,7 @@ public class LocusIteratorByStateUnitTest extends BaseTest { List reads = Arrays.asList(leadingRead,indelOnlyRead,fullMatchAfterIndel); // create the iterator by state with the fake reads and fake records - li = new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()),readAttributes,genomeLocParser); + li = makeLTBS(reads,readAttributes); Assert.assertTrue(li.hasNext(),"Missing first locus at " + firstLocus); AlignmentContext alignmentContext = li.next(); From 
e055a78f6e479eabcb172a5aa2ca8164fd2ea974 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 30 Sep 2011 09:49:35 -0400 Subject: [PATCH 12/63] LIBS now requires at least one sample be present -- UnitTest provides a "null" sample for matching the reads without read groups --- .../sting/gatk/iterators/LocusIteratorByState.java | 9 +++------ .../gatk/iterators/LocusIteratorByStateUnitTest.java | 7 ++++--- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java index e466aa325..9a468f482 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java @@ -39,6 +39,7 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.ReservoirDownsampler; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement; import org.broadinstitute.sting.utils.pileup.PileupElement; @@ -50,9 +51,6 @@ import java.util.*; /** Iterator that traverses a SAM File, accumulating information on a per-locus basis */ public class LocusIteratorByState extends LocusIterator { -// private static long discarded_bases = 0L; -// private static long observed_bases = 0L; - /** our log, which we want to capture anything from this class */ private static Logger logger = Logger.getLogger(LocusIteratorByState.class); @@ -281,10 +279,9 @@ public class LocusIteratorByState extends LocusIterator { this.genomeLocParser = genomeLocParser; this.samples = new ArrayList(samples); this.readStates = new 
ReadStateManager(samIterator,readInformation.getDownsamplingMethod()); - } - public LocusIteratorByState(final Iterator samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser ) { - this(samIterator, readInformation, genomeLocParser, Collections.emptySet()); + if ( this.samples.isEmpty() ) + throw new IllegalArgumentException("samples list must not be empty"); } public Iterator iterator() { diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java index 6e9fa0372..dc43b4968 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java @@ -28,10 +28,8 @@ import java.util.*; * testing of the LocusIteratorByState */ public class LocusIteratorByStateUnitTest extends BaseTest { - private final int MAX_READS = 10; private static SAMFileHeader header; private LocusIteratorByState li; - private GenomeLocParser genomeLocParser; @BeforeClass @@ -41,8 +39,11 @@ public class LocusIteratorByStateUnitTest extends BaseTest { } private final LocusIteratorByState makeLTBS(List reads, ReadProperties readAttributes) { + List samples = new ArrayList(); + samples.add(null); + return new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()), - readAttributes,genomeLocParser, new SampleDataSource().getSampleNames()); + readAttributes,genomeLocParser, samples); } @Test From a69a4dda2fcf14ef3f7e8c6e97b92645fdf62e6d Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 30 Sep 2011 09:56:23 -0400 Subject: [PATCH 13/63] SamplesDB no longer has null sample -- Updated getSamples().size() == 2 test in CallableLociWalker that really ensured there was one sample in the system --- .../sting/gatk/samples/SampleDataSource.java | 2 +- .../walkers/coverage/CallableLociWalker.java | 3 +-- 
.../gatk/walkers/qc/CountMalesWalker.java | 24 +++++++++++++++++++ 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDataSource.java index 9543c834a..d5285271b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDataSource.java @@ -35,7 +35,7 @@ public class SampleDataSource { * Constructor takes both a SAM header and sample files because the two must be integrated. */ public SampleDataSource() { - samples.put(null, new Sample(null, this)); + } public SampleDataSource(final SAMFileHeader header, final List sampleFiles) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java index 1e2d40271..1dfc6fea0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java @@ -227,8 +227,7 @@ public class CallableLociWalker extends LocusWalker Date: Fri, 30 Sep 2011 09:57:39 -0400 Subject: [PATCH 14/63] Removed final use of Sample in RBP --- .../sting/utils/pileup/ReadBackedExtendedEventPileup.java | 1 - 1 file changed, 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileup.java index 8dd2394cf..afed68177 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileup.java @@ -25,7 +25,6 @@ package org.broadinstitute.sting.utils.pileup; import net.sf.samtools.SAMRecord; -import 
org.broadinstitute.sting.gatk.samples.Sample; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.collections.Pair; From 30d23942b1f481568ca95fd6ff82a0b1099ad237 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 30 Sep 2011 10:02:57 -0400 Subject: [PATCH 15/63] Renamed ReadBackedPileup getXSampleName() functions to getXSample -- now that we don't have Sample objects floating around we don't have to have all of the Name extensions on our functions --- .../sting/gatk/contexts/AlignmentContextUtils.java | 4 ++-- .../gatk/walkers/phasing/ReadBackedPhasingWalker.java | 5 ++--- .../sting/utils/pileup/AbstractReadBackedPileup.java | 6 +++--- .../sting/utils/pileup/ReadBackedExtendedEventPileup.java | 2 +- .../sting/utils/pileup/ReadBackedPileup.java | 6 +++--- .../sting/utils/variantcontext/VariantContext.java | 2 +- .../sting/utils/pileup/ReadBackedPileupUnitTest.java | 7 +++---- 7 files changed, 15 insertions(+), 17 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java b/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java index f77fbe4e9..c9506ec4c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java @@ -88,8 +88,8 @@ public class AlignmentContextUtils { GenomeLoc loc = context.getLocation(); HashMap contexts = new HashMap(); - for(String sample: context.getPileup().getSampleNames()) { - ReadBackedPileup pileupBySample = context.getPileup().getPileupForSampleName(sample); + for(String sample: context.getPileup().getSamples()) { + ReadBackedPileup pileupBySample = context.getPileup().getPileupForSample(sample); // Don't add empty pileups to the split context. 
if(pileupBySample.size() == 0) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java index bbbdf5f1a..998cfa654 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java @@ -30,7 +30,6 @@ import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.samples.Sample; import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; @@ -1095,8 +1094,8 @@ public class ReadBackedPhasingWalker extends RodWalker getSampleNames() { + public Collection getSamples() { if(pileupElementTracker instanceof PerSamplePileupElementTracker) { PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker)pileupElementTracker; return new HashSet(tracker.getSamples()); @@ -623,7 +623,7 @@ public abstract class AbstractReadBackedPileup sampleNames) { + public RBP getPileupForSamples(Collection sampleNames) { if(pileupElementTracker instanceof PerSamplePileupElementTracker) { PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker)pileupElementTracker; PileupElementTracker filteredElements = tracker.getElements(sampleNames); @@ -649,7 +649,7 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker; PileupElementTracker filteredElements = tracker.getElements(sampleName); diff --git 
a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileup.java index afed68177..e7c0bc18f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileup.java @@ -120,7 +120,7 @@ public interface ReadBackedExtendedEventPileup extends ReadBackedPileup { * Gets a list of all the samples stored in this pileup. * @return List of samples in this pileup. */ - public Collection getSampleNames(); + public Collection getSamples(); public Iterable toExtendedIterable(); diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java index 32ab50695..3d30aa11b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java @@ -140,7 +140,7 @@ public interface ReadBackedPileup extends Iterable, HasGenomeLoca * Gets a collection of *names* of all the samples stored in this pileup. * @return Collection of names */ - public Collection getSampleNames(); + public Collection getSamples(); /** @@ -148,7 +148,7 @@ public interface ReadBackedPileup extends Iterable, HasGenomeLoca * @param sampleNames Name of the sample to use. * @return A subset of this pileup containing only reads with the given sample. */ - public ReadBackedPileup getPileupForSampleNames(Collection sampleNames); + public ReadBackedPileup getPileupForSamples(Collection sampleNames); /** @@ -156,7 +156,7 @@ public interface ReadBackedPileup extends Iterable, HasGenomeLoca * @param sampleName Name of the sample to use. * @return A subset of this pileup containing only reads with the given sample. 
*/ - public ReadBackedPileup getPileupForSampleName(String sampleName); + public ReadBackedPileup getPileupForSample(String sampleName); /** * Simple useful routine to count the number of deletion bases in this pileup diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index 412cbd90b..05e21c8b8 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -132,7 +132,7 @@ import java.util.*; * vc.hasGenotypes() * vc.isMonomorphic() * vc.isPolymorphic() - * vc.getSampleNames().size() + * vc.getSamples().size() * * vc.getGenotypes() * vc.getGenotypes().get("g1") diff --git a/public/java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java index 4e0541522..b07da7cc8 100644 --- a/public/java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java @@ -28,7 +28,6 @@ import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMReadGroupRecord; import net.sf.samtools.SAMRecord; import org.testng.Assert; -import org.broadinstitute.sting.gatk.samples.Sample; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.testng.annotations.Test; @@ -187,14 +186,14 @@ public class ReadBackedPileupUnitTest { ReadBackedPileup pileup = new ReadBackedPileupImpl(null,sampleToPileupMap); - ReadBackedPileup sample2Pileup = pileup.getPileupForSampleName(sample2); + ReadBackedPileup sample2Pileup = pileup.getPileupForSample(sample2); Assert.assertEquals(sample2Pileup.size(),1,"Sample 2 pileup has wrong number of elements"); Assert.assertEquals(sample2Pileup.getReads().get(0),read2,"Sample 2 pileup has incorrect 
read"); - ReadBackedPileup missingSamplePileup = pileup.getPileupForSampleName("missing"); + ReadBackedPileup missingSamplePileup = pileup.getPileupForSample("missing"); Assert.assertNull(missingSamplePileup,"Pileup for sample 'missing' should be null but isn't"); - missingSamplePileup = pileup.getPileupForSampleName("not here"); + missingSamplePileup = pileup.getPileupForSample("not here"); Assert.assertNull(missingSamplePileup,"Pileup for sample 'not here' should be null but isn't"); } } From 178ba24c27af683602b34cc6e6b3d685b896db54 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 30 Sep 2011 10:28:18 -0400 Subject: [PATCH 16/63] Move getSamplesForSamFile to SampleUtils -- A nearly identical piece of code already lived in SampleUtils. Now there are two functions, one taking a regular header and another grabbing the merged header from the GATK engine itself. Much cleaner --- .../sting/gatk/GenomeAnalysisEngine.java | 7 ------- .../org/broadinstitute/sting/utils/SampleUtils.java | 12 ++++++++++++ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 050128740..4884452d2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -1041,13 +1041,6 @@ public class GenomeAnalysisEngine { return this.sampleDataSource; } - /** - * Returns all samples that were referenced in the SAM file - */ - public Set getSAMFileSamples() { - return sampleDataSource.getSamples(SampleUtils.getSAMFileSamples(getSAMFileHeader())); - } - public Map getApproximateCommandLineArguments(Object... 
argumentProviders) { return CommandLineUtils.getApproximateCommandLineArguments(parsingEngine,argumentProviders); } diff --git a/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java b/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java index 1b4703e4a..edc1413ba 100755 --- a/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java @@ -69,6 +69,18 @@ public class SampleUtils { return samples; } + + /** + * Same as @link getSAMFileSamples but gets all of the samples + * in the SAM files loaded by the engine + * + * @param engine + * @return + */ + public final static Set getSAMFileSamples(GenomeAnalysisEngine engine) { + return SampleUtils.getSAMFileSamples(engine.getSAMFileHeader()); + } + /** * Gets all of the unique sample names from all VCF rods input by the user * From 810e8ad0118bdc486a81b23378f12290aaa072fc Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 30 Sep 2011 10:43:51 -0400 Subject: [PATCH 18/63] Removed getXByReaders() function from the engine -- These could be simplied in their downstream uses -- Or they could be replaced with a generic getSAMFileHeaders() function and then apply the getSamples(header) as desired downstream --- .../sting/gatk/GenomeAnalysisEngine.java | 108 ++---------------- .../coverage/DepthOfCoverageWalker.java | 13 +-- .../indels/SomaticIndelDetectorWalker.java | 2 +- 3 files changed, 17 insertions(+), 106 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 4884452d2..52544fbd2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -712,100 +712,6 @@ public class GenomeAnalysisEngine { return getReadsDataSource().getSAMFile(id); } - /** - * Returns sets of samples present in the (merged) 
input SAM stream, grouped by readers (i.e. underlying - * individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list - * returned by this method will contain 3 elements (one for each reader), with each element being a set of sample names - * found in the corresponding bam file. - * - * @return Sets of samples in the merged input SAM stream, grouped by readers - */ - public List> getSamplesByReaders() { - Collection readers = getReadsDataSource().getReaderIDs(); - - List> sample_sets = new ArrayList>(readers.size()); - - for (SAMReaderID r : readers) { - - Set samples = new HashSet(1); - sample_sets.add(samples); - - for (SAMReadGroupRecord g : getReadsDataSource().getHeader(r).getReadGroups()) { - samples.add(g.getSample()); - } - } - - return sample_sets; - - } - - /** - * Returns sets of libraries present in the (merged) input SAM stream, grouped by readers (i.e. underlying - * individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list - * returned by this method will contain 3 elements (one for each reader), with each element being a set of library names - * found in the corresponding bam file. - * - * @return Sets of libraries present in the (merged) input SAM stream, grouped by readers - */ - public List> getLibrariesByReaders() { - - - Collection readers = getReadsDataSource().getReaderIDs(); - - List> lib_sets = new ArrayList>(readers.size()); - - for (SAMReaderID r : readers) { - - Set libs = new HashSet(2); - lib_sets.add(libs); - - for (SAMReadGroupRecord g : getReadsDataSource().getHeader(r).getReadGroups()) { - libs.add(g.getLibrary()); - } - } - - return lib_sets; - - } - - /** - * **** UNLESS YOU HAVE GOOD REASON TO, DO NOT USE THIS METHOD; USE getFileToReadGroupIdMapping() INSTEAD **** - * - * Returns sets of (remapped) read groups in input SAM stream, grouped by readers (i.e. underlying - * individual bam files). 
For instance: if GATK is run with three input bam files (three -I arguments), then the list - * returned by this method will contain 3 elements (one for each reader), with each element being a set of remapped read groups - * (i.e. as seen by read.getReadGroup().getReadGroupId() in the merged stream) that come from the corresponding bam file. - * - * @return sets of (merged) read group ids in order of input bams - */ - public List> getMergedReadGroupsByReaders() { - - - Collection readers = getReadsDataSource().getReaderIDs(); - - List> rg_sets = new ArrayList>(readers.size()); - - for (SAMReaderID r : readers) { - - Set groups = new HashSet(5); - rg_sets.add(groups); - - for (SAMReadGroupRecord g : getReadsDataSource().getHeader(r).getReadGroups()) { - if (getReadsDataSource().hasReadGroupCollisions()) { // Check if there were read group clashes with hasGroupIdDuplicates and if so: - // use HeaderMerger to translate original read group id from the reader into the read group id in the - // merged stream, and save that remapped read group id to associate it with specific reader - groups.add(getReadsDataSource().getReadGroupId(r, g.getReadGroupId())); - } else { - // otherwise, pass through the unmapped read groups since this is what Picard does as well - groups.add(g.getReadGroupId()); - } - } - } - - return rg_sets; - - } - /** * Now that all files are open, validate the sequence dictionaries of the reads vs. the reference vrs the reference ordered data (if available). * @@ -925,6 +831,18 @@ public class GenomeAnalysisEngine { return readsDataSource.getHeader(reader); } + /** + * Returns an ordered list of the unmerged SAM file headers known to this engine. 
+ * @return list of header for each input SAM file, in command line order + */ + public List getSAMFileHeaders() { + final List headers = new ArrayList(); + for ( final SAMReaderID id : getReadsDataSource().getReaderIDs() ) { + headers.add(getReadsDataSource().getHeader(id)); + } + return headers; + } + /** * Gets the master sequence dictionary for this GATK engine instance * @return a never-null dictionary listing all of the contigs known to this engine instance @@ -943,8 +861,6 @@ public class GenomeAnalysisEngine { return this.readsDataSource; } - - /** * Sets the collection of GATK main application arguments. * diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java index 664c319ab..3168d9a6b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java @@ -32,6 +32,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.SeekableRODIterator; +import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.refseq.RefSeqCodec; import org.broadinstitute.sting.utils.codecs.refseq.RefSeqFeature; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; @@ -281,20 +282,14 @@ public class DepthOfCoverageWalker extends LocusWalker getSamplesFromToolKit(DoCOutputType.Partition type) { HashSet partition = new HashSet(); if ( type == DoCOutputType.Partition.sample ) { - for ( Set sampleSet : getToolkit().getSamplesByReaders() ) { - for ( String s : sampleSet ) { - partition.add(s); - } - } + partition.addAll(SampleUtils.getSAMFileSamples(getToolkit())); } else if ( type == 
DoCOutputType.Partition.readgroup ) { for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { partition.add(rg.getSample()+"_rg_"+rg.getReadGroupId()); } } else if ( type == DoCOutputType.Partition.library ) { - for ( Set libraries : getToolkit().getLibrariesByReaders() ) { - for ( String l : libraries ) { - partition.add(l); - } + for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { + partition.add(rg.getLibrary()); } } else if ( type == DoCOutputType.Partition.center ) { for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java index 8bba8eac2..84cb69b07 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java @@ -392,7 +392,7 @@ public class SomaticIndelDetectorWalker extends ReadWalker { location = getToolkit().getGenomeLocParser().createGenomeLoc(getToolkit().getSAMFileHeader().getSequence(0).getSequenceName(),1); - normalSamples = getToolkit().getSamplesByReaders().get(0); + normalSamples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeaders().get(0)); try { // we already checked that bedOutput and output_file are not set simultaneously From 56f10b40a8dcae431ff9fc423d6a8bcf98076879 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 30 Sep 2011 14:18:27 -0400 Subject: [PATCH 19/63] Fixing test bugs for WindowMaker that required empty sample list --- .../gatk/executive/LinearMicroScheduler.java | 3 ++- .../sting/gatk/executive/ShardTraverser.java | 5 +++- .../sting/gatk/executive/WindowMaker.java | 12 ++++++--- .../gatk/iterators/LocusIteratorByState.java | 10 +++++++ .../providers/LocusViewTemplate.java | 26 
+++++++++---------- .../reads/DownsamplerBenchmark.java | 2 +- .../LocusIteratorByStateUnitTest.java | 6 +---- 7 files changed, 39 insertions(+), 25 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index b7846399f..a5d1370ba 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -56,7 +56,8 @@ public class LinearMicroScheduler extends MicroScheduler { traversalEngine.startTimersIfNecessary(); if(shard.getShardType() == Shard.ShardType.LOCUS) { LocusWalker lWalker = (LocusWalker)walker; - WindowMaker windowMaker = new WindowMaker(shard, engine.getGenomeLocParser(), getReadIterator(shard), shard.getGenomeLocs(), engine.getSampleDB()); + WindowMaker windowMaker = new WindowMaker(shard, engine.getGenomeLocParser(), + getReadIterator(shard), shard.getGenomeLocs(), engine.getSampleDB().getSampleNames()); for(WindowMaker.WindowMakerIterator iterator: windowMaker) { ShardDataProvider dataProvider = new LocusShardDataProvider(shard,iterator.getSourceInfo(),engine.getGenomeLocParser(),iterator.getLocus(),iterator,reference,rods); Object result = traversalEngine.traverse(walker, dataProvider, accumulator.getReduceInit()); diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java b/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java index 428813b71..11e51d99b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java @@ -62,7 +62,10 @@ public class ShardTraverser implements Callable { Object accumulator = walker.reduceInit(); LocusWalker lWalker = (LocusWalker)walker; - WindowMaker windowMaker = new 
WindowMaker(shard,microScheduler.getEngine().getGenomeLocParser(),microScheduler.getReadIterator(shard),shard.getGenomeLocs(), microScheduler.engine.getSampleDB()); // todo: microScheduler.engine is protected - is it okay to user it here? + WindowMaker windowMaker = new WindowMaker(shard,microScheduler.getEngine().getGenomeLocParser(), + microScheduler.getReadIterator(shard), + shard.getGenomeLocs(), + microScheduler.engine.getSampleDB().getSampleNames()); // todo: microScheduler.engine is protected - is it okay to user it here? ShardDataProvider dataProvider = null; for(WindowMaker.WindowMakerIterator iterator: windowMaker) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java index fb207087f..825a81e64 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java @@ -12,6 +12,7 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import java.util.Collection; import java.util.Iterator; import java.util.List; import java.util.NoSuchElementException; @@ -63,17 +64,20 @@ public class WindowMaker implements Iterable, I * the given intervals. * @param iterator The data source for this window. * @param intervals The set of intervals over which to traverse. 
- * @param sampleData SampleDataSource that we can reference reads with + * @param sampleNames The complete set of sample names in the reads in shard */ - public WindowMaker(Shard shard, GenomeLocParser genomeLocParser, StingSAMIterator iterator, List intervals, SampleDataSource sampleData ) { + public WindowMaker(Shard shard, GenomeLocParser genomeLocParser, StingSAMIterator iterator, List intervals, Collection sampleNames) { this.sourceInfo = shard.getReadProperties(); this.readIterator = iterator; - - this.sourceIterator = new PeekableIterator(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser,sampleData.getSampleNames())); + this.sourceIterator = new PeekableIterator(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser, sampleNames)); this.intervalIterator = intervals.size()>0 ? new PeekableIterator(intervals.iterator()) : null; } + public WindowMaker(Shard shard, GenomeLocParser genomeLocParser, StingSAMIterator iterator, List intervals ) { + this(shard, genomeLocParser, iterator, intervals, LocusIteratorByState.sampleListForSAMWithoutReadGroups()); + } + public Iterator iterator() { return this; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java index 9a468f482..d16502b1d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java @@ -284,6 +284,16 @@ public class LocusIteratorByState extends LocusIterator { throw new IllegalArgumentException("samples list must not be empty"); } + /** + * For testing only. Assumes that the incoming SAMRecords have no read groups, so creates a dummy sample list + * for the system. 
+ */ + public final static Collection sampleListForSAMWithoutReadGroups() { + List samples = new ArrayList(); + samples.add(null); + return samples; + } + public Iterator iterator() { return this; } diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java index d962f7dc8..8b226101a 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java @@ -50,7 +50,7 @@ public abstract class LocusViewTemplate extends BaseTest { GenomeLoc shardBounds = genomeLocParser.createGenomeLoc("chr1", 1, 5); Shard shard = new LocusShard(genomeLocParser, new SAMDataSource(Collections.emptyList(),genomeLocParser),Collections.singletonList(shardBounds),Collections.emptyMap()); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs(), new SampleDataSource()); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, null, genomeLocParser, window.getLocus(), window, null, null); @@ -66,7 +66,7 @@ public abstract class LocusViewTemplate extends BaseTest { GenomeLoc shardBounds = genomeLocParser.createGenomeLoc("chr1", 1, 5); Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(shardBounds)); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs(), new SampleDataSource()); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, 
window.getLocus(), window, null, null); @@ -81,7 +81,7 @@ public abstract class LocusViewTemplate extends BaseTest { SAMRecordIterator iterator = new SAMRecordIterator(read); Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs(),new SampleDataSource()); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); LocusView view = createView(dataProvider); @@ -95,7 +95,7 @@ public abstract class LocusViewTemplate extends BaseTest { SAMRecordIterator iterator = new SAMRecordIterator(read); Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs(), new SampleDataSource()); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); LocusView view = createView(dataProvider); @@ -109,7 +109,7 @@ public abstract class LocusViewTemplate extends BaseTest { SAMRecordIterator iterator = new SAMRecordIterator(read); Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs(), new SampleDataSource()); + WindowMaker windowMaker = new 
WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); LocusView view = createView(dataProvider); @@ -123,7 +123,7 @@ public abstract class LocusViewTemplate extends BaseTest { SAMRecordIterator iterator = new SAMRecordIterator(read); Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 5, 5))); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs(),new SampleDataSource()); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); LocusView view = createView(dataProvider); @@ -137,7 +137,7 @@ public abstract class LocusViewTemplate extends BaseTest { SAMRecordIterator iterator = new SAMRecordIterator(read); Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 6, 15))); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs(), new SampleDataSource()); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); LocusView view = createView(dataProvider); @@ -151,7 +151,7 @@ public abstract class LocusViewTemplate extends BaseTest { SAMRecordIterator iterator = new SAMRecordIterator(read); Shard shard = new 
MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs(),new SampleDataSource()); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); LocusView view = createView(dataProvider); @@ -166,7 +166,7 @@ public abstract class LocusViewTemplate extends BaseTest { SAMRecordIterator iterator = new SAMRecordIterator(read1, read2); Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs(),new SampleDataSource()); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); LocusView view = createView(dataProvider); @@ -185,7 +185,7 @@ public abstract class LocusViewTemplate extends BaseTest { SAMRecordIterator iterator = new SAMRecordIterator(read1, read2, read3, read4); Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs(),new SampleDataSource()); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), 
genomeLocParser, window.getLocus(), window, null, null); LocusView view = createView(dataProvider); @@ -204,7 +204,7 @@ public abstract class LocusViewTemplate extends BaseTest { SAMRecordIterator iterator = new SAMRecordIterator(read1, read2, read3, read4); Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs(),new SampleDataSource()); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); LocusView view = createView(dataProvider); @@ -225,7 +225,7 @@ public abstract class LocusViewTemplate extends BaseTest { SAMRecordIterator iterator = new SAMRecordIterator(read1, read2, read3, read4, read5, read6); Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs(), new SampleDataSource()); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); LocusView view = createView(dataProvider); @@ -253,7 +253,7 @@ public abstract class LocusViewTemplate extends BaseTest { read07, read08, read09, read10, read11, read12); Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 6, 15))); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs(), new 
SampleDataSource()); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); LocusView view = createView(dataProvider); diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java index 0e1f4253a..41d7a23c6 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java @@ -89,7 +89,7 @@ public class DownsamplerBenchmark extends ReadProcessingBenchmark { // Filter unmapped reads. TODO: is this always strictly necessary? Who in the GATK normally filters these out? Iterator readIterator = new FilteringIterator(reader.iterator(),new UnmappedReadFilter()); - LocusIteratorByState locusIteratorByState = new LocusIteratorByState(readIterator,readProperties,genomeLocParser,sampleDataSource.getSampleNames()); + LocusIteratorByState locusIteratorByState = new LocusIteratorByState(readIterator,readProperties,genomeLocParser, LocusIteratorByState.sampleListForSAMWithoutReadGroups()); while(locusIteratorByState.hasNext()) { locusIteratorByState.next().getLocation(); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java index dc43b4968..297a8501a 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java @@ -11,7 +11,6 @@ import org.testng.Assert; import 
org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; -import org.broadinstitute.sting.gatk.samples.SampleDataSource; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -39,11 +38,8 @@ public class LocusIteratorByStateUnitTest extends BaseTest { } private final LocusIteratorByState makeLTBS(List reads, ReadProperties readAttributes) { - List samples = new ArrayList(); - samples.add(null); - return new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()), - readAttributes,genomeLocParser, samples); + readAttributes, genomeLocParser, LocusIteratorByState.sampleListForSAMWithoutReadGroups()); } @Test From c1cf6bc45ac8dfed24c7ec13bbf0e843f6d7cf2e Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 30 Sep 2011 14:22:19 -0400 Subject: [PATCH 20/63] PEDReader should be in samples --- .../sting/{utils/ped => gatk/samples}/PedReader.java | 4 +--- .../src/org/broadinstitute/sting/gatk/samples/Sample.java | 5 +---- 2 files changed, 2 insertions(+), 7 deletions(-) rename public/java/src/org/broadinstitute/sting/{utils/ped => gatk/samples}/PedReader.java (98%) diff --git a/public/java/src/org/broadinstitute/sting/utils/ped/PedReader.java b/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java similarity index 98% rename from public/java/src/org/broadinstitute/sting/utils/ped/PedReader.java rename to public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java index 4d282d821..6514cffe4 100644 --- a/public/java/src/org/broadinstitute/sting/utils/ped/PedReader.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java @@ -22,11 +22,9 @@ * OTHER DEALINGS IN THE SOFTWARE. 
*/ -package org.broadinstitute.sting.utils.ped; +package org.broadinstitute.sting.gatk.samples; import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.samples.Sample; -import org.broadinstitute.sting.gatk.samples.SampleDataSource; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java index c6fcbbc2a..db905a16e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java @@ -6,10 +6,7 @@ import java.util.HashMap; import java.util.Map; /** - * Created by IntelliJ IDEA. - * User: brett - * Date: Jul 26, 2010 - * Time: 3:31:38 PM + * */ public class Sample implements java.io.Serializable { final private String familyID, paternalID, maternalID; From 84160bd83fd92cc6f89f715f41976d2d1512cfb0 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 30 Sep 2011 15:50:54 -0400 Subject: [PATCH 21/63] Reorganization of Sample -- Moved Gender and Affection to separate public enums -- PedReader 90% implemented -- Improve interface cleanup to XReadLines and UserException --- .../sting/gatk/samples/Affection.java | 46 ++++ .../sting/gatk/samples/Gender.java | 34 +++ .../sting/gatk/samples/PedReader.java | 101 ++++----- .../sting/gatk/samples/Sample.java | 76 +++---- .../sting/gatk/samples/SampleDataSource.java | 24 ++- .../beagle/ProduceBeagleInputWalker.java | 4 +- .../gatk/walkers/qc/CountMalesWalker.java | 3 +- .../sting/utils/exceptions/UserException.java | 4 + .../sting/utils/text/XReadLines.java | 6 +- .../reads/DownsamplerBenchmark.java | 3 +- .../sting/gatk/samples/PedReaderUnitTest.java | 201 ++++++++++++++++++ .../sting/gatk/samples/SampleUnitTest.java | 20 +- 12 files changed, 386
insertions(+), 136 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/samples/Affection.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/samples/Gender.java create mode 100644 public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Affection.java b/public/java/src/org/broadinstitute/sting/gatk/samples/Affection.java new file mode 100644 index 000000000..de0dba884 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/Affection.java @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.samples; + +/** + * Categorical sample trait for association and analysis + * + * Samples can have unknown status, be affected or unaffected by the + * categorical trait, or they can be marked as actually having a + * quantitative trait value (stored in an associated value in the Sample class) + * + * @author Mark DePristo + * @since Sept. 2011 + */ +public enum Affection { + /** Status is unknown */ + UNKNOWN, + /** Suffers from the disease */ + AFFECTED, + /** Unaffected by the disease */ + UNAFFECTED, + /** A quantitative trait: value of the trait is stored elsewhere */ + QUANTITATIVE +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Gender.java b/public/java/src/org/broadinstitute/sting/gatk/samples/Gender.java new file mode 100644 index 000000000..6fb44804a --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/Gender.java @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.samples; + +/** +* ENUM of possible human genders: male, female, or unknown +*/ +public enum Gender { + MALE, + FEMALE, + UNKNOWN +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java b/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java index 6514cffe4..added09b6 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java @@ -32,6 +32,8 @@ import org.broadinstitute.sting.utils.text.XReadLines; import java.io.File; import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.Reader; import java.util.*; /** @@ -115,10 +117,6 @@ public class PedReader { final static private Set CATAGORICAL_TRAIT_VALUES = new HashSet(Arrays.asList("-9", "0", "1", "2")); final static private String commentMarker = "#"; - private final File source; - private final List records; - - public enum MissingPedFields { NO_FAMILY_ID, NO_PARENTS, @@ -127,8 +125,8 @@ public class PedReader { } // phenotype - private final static String PHENOTYPE_MISSING_VALUE = "-9"; - private final static String PHENOTYPE_MISSING_VALUE_SECONDARY = "0"; + private final static String MISSING_VALUE1 = "-9"; + private final static String MISSING_VALUE2 = "0"; private final static String PHENOTYPE_UNAFFECTED = "1"; private final static String PHENOTYPE_AFFECTED = "2"; @@ -137,14 +135,15 @@ public class PedReader { private final static String SEX_FEMALE = "2"; // other=unknown - public PedReader(File source, EnumSet missingFields) throws FileNotFoundException { - this.source = source; - List lines = new XReadLines(source).readLines(); - 
this.records = parsePedLines(lines, missingFields); + public PedReader() { } + + public final List parse(File source, EnumSet missingFields, SampleDataSource sampleDB) throws FileNotFoundException { + logger.info("Reading PED file " + source + " with missing fields: " + missingFields); + return parse(new FileReader(source), missingFields, sampleDB); } - private final List parsePedLines(final List lines, EnumSet missingFields) { - logger.info("Reading PED file " + source + " with missing fields: " + missingFields); + public final List parse(Reader reader, EnumSet missingFields, SampleDataSource sampleDB) { + final List lines = new XReadLines(reader).readLines(); // What are the record offsets? final int familyPos = missingFields.contains(MissingPedFields.NO_FAMILY_ID) ? -1 : 0; @@ -153,7 +152,7 @@ public class PedReader { final int maternalPos = missingFields.contains(MissingPedFields.NO_PARENTS) ? -1 : paternalPos + 1; final int sexPos = missingFields.contains(MissingPedFields.NO_SEX) ? -1 : Math.max(maternalPos, samplePos) + 1; final int phenotypePos = missingFields.contains(MissingPedFields.NO_PHENOTYPE) ? 
-1 : Math.max(sexPos, Math.max(maternalPos, samplePos)) + 1; - final int nExpectedFields = MathUtils.arrayMaxInt(Arrays.asList(samplePos, paternalPos, maternalPos, sexPos, phenotypePos)); + final int nExpectedFields = MathUtils.arrayMaxInt(Arrays.asList(samplePos, paternalPos, maternalPos, sexPos, phenotypePos)) + 1; // go through once and determine properties int lineNo = 1; @@ -164,7 +163,7 @@ public class PedReader { String[] parts = line.split("\\W+"); if ( parts.length != nExpectedFields ) - throw new UserException.MalformedFile(source, "Bad PED line " + lineNo + ": wrong number of fields"); + throw new UserException.MalformedFile(reader.toString(), "Bad PED line " + lineNo + ": wrong number of fields"); if ( phenotypePos != -1 ) { isQT = isQT || CATAGORICAL_TRAIT_VALUES.contains(parts[phenotypePos]); @@ -177,75 +176,55 @@ public class PedReader { // now go through and parse each record lineNo = 1; - final List recs = new ArrayList(splits.size()); + final List samples = new ArrayList(splits.size()); for ( final String[] parts : splits ) { String familyID = null, individualID, paternalID = null, maternalID = null; - Sample.Gender sex = Sample.Gender.UNKNOWN; - double quantitativePhenotype = Sample.UNSET_QUANTITIATIVE_TRAIT_VALUE; - Sample.Affection affection = Sample.Affection.UNKNOWN; + Gender sex = Gender.UNKNOWN; + double quantitativePhenotype = Sample.UNSET_QT; + Affection affection = Affection.UNKNOWN; - if ( familyPos != -1 ) familyID = parts[familyPos]; + if ( familyPos != -1 ) familyID = maybeMissing(parts[familyPos]); individualID = parts[samplePos]; - if ( paternalPos != -1 ) paternalID = parts[paternalPos]; - if ( maternalPos != -1 ) maternalID = parts[maternalPos]; + if ( paternalPos != -1 ) paternalID = maybeMissing(parts[paternalPos]); + if ( maternalPos != -1 ) maternalID = maybeMissing(parts[maternalPos]); if ( sexPos != -1 ) { - if ( parts[sexPos].equals(SEX_MALE) ) sex = Sample.Gender.MALE; - else if ( parts[sexPos].equals(SEX_FEMALE) ) sex = 
Sample.Gender.FEMALE; - else sex = Sample.Gender.UNKNOWN; + if ( parts[sexPos].equals(SEX_MALE) ) sex = Gender.MALE; + else if ( parts[sexPos].equals(SEX_FEMALE) ) sex = Gender.FEMALE; + else sex = Gender.UNKNOWN; } if ( phenotypePos != -1 ) { if ( isQT ) { - if ( parts[phenotypePos].equals(PHENOTYPE_MISSING_VALUE) ) - affection = Sample.Affection.UNKNOWN; + if ( parts[phenotypePos].equals(MISSING_VALUE1) ) + affection = Affection.UNKNOWN; else { - affection = Sample.Affection.QUANTITATIVE; + affection = Affection.QUANTITATIVE; quantitativePhenotype = Double.valueOf(parts[phenotypePos]); } } else { - if ( parts[phenotypePos].equals(PHENOTYPE_MISSING_VALUE) ) affection = Sample.Affection.UNKNOWN; - else if ( parts[phenotypePos].equals(PHENOTYPE_MISSING_VALUE_SECONDARY) ) affection = Sample.Affection.UNKNOWN; - else if ( parts[phenotypePos].equals(PHENOTYPE_UNAFFECTED) ) affection = Sample.Affection.UNAFFECTED; - else if ( parts[phenotypePos].equals(PHENOTYPE_AFFECTED) ) affection = Sample.Affection.AFFECTED; + if ( parts[phenotypePos].equals(MISSING_VALUE1) ) affection = Affection.UNKNOWN; + else if ( parts[phenotypePos].equals(MISSING_VALUE2) ) affection = Affection.UNKNOWN; + else if ( parts[phenotypePos].equals(PHENOTYPE_UNAFFECTED) ) affection = Affection.UNAFFECTED; + else if ( parts[phenotypePos].equals(PHENOTYPE_AFFECTED) ) affection = Affection.AFFECTED; else throw new ReviewedStingException("Unexpected phenotype type " + parts[phenotypePos] + " at line " + lineNo); } } - recs.add(new PedRecord(familyID, individualID, paternalID, maternalID, sex, quantitativePhenotype, affection)); - + final Sample s = new Sample(familyID, sampleDB, individualID, paternalID, maternalID, sex, affection, quantitativePhenotype); + samples.add(s); + sampleDB.addSample(s); lineNo++; } - return Collections.unmodifiableList(recs); + sampleDB.validate(samples); + return samples; } - public List getRecords() { - return records; - } - - public void fillSampleDB(SampleDataSource db) { 
- for ( final PedRecord rec : getRecords() ) { - } - } -} - -class PedRecord { - final String familyID, individualID, paternalID, maternalID; - final Sample.Gender sex; - final double quantitativePhenotype; - final Sample.Affection affection; - - PedRecord(final String familyID, final String individualID, - final String paternalID, final String maternalID, - final Sample.Gender sex, - final double quantitativePhenotype, final Sample.Affection affection) { - this.familyID = familyID; - this.individualID = individualID; - this.paternalID = paternalID; - this.maternalID = maternalID; - this.sex = sex; - this.quantitativePhenotype = quantitativePhenotype; - this.affection = affection; + private final static String maybeMissing(final String string) { + if ( string.equals(MISSING_VALUE1) || string.equals(MISSING_VALUE2) ) + return null; + else + return string; } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java index db905a16e..3426cf678 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java @@ -10,37 +10,18 @@ import java.util.Map; */ public class Sample implements java.io.Serializable { final private String familyID, paternalID, maternalID; - final private Sample.Gender gender; + final private Gender gender; final private double quantitativePhenotype; - final private Sample.Affection affection; + final private Affection affection; final private String ID; final private SampleDataSource dataSource; + final private Map properties = new HashMap(); - // todo -- conditionally add the property map -- should be empty by default - private final Map properties = new HashMap(); - - public enum Gender { - MALE, - FEMALE, - UNKNOWN - } - - public enum Affection { - /** Status is unknown */ - UNKNOWN, - /** Suffers from the disease */ - AFFECTED, - /** Unaffected 
by the disease */ - UNAFFECTED, - /** A quantitative trait: value of the trait is stored elsewhere */ - QUANTITATIVE - } - - public final static double UNSET_QUANTITIATIVE_TRAIT_VALUE = Double.NaN; + public final static double UNSET_QT = Double.NaN; public Sample(final String ID, final SampleDataSource dataSource, final String familyID, final String paternalID, final String maternalID, - final Gender gender, final double quantitativePhenotype, final Affection affection) { + final Gender gender, final Affection affection, final double quantitativePhenotype) { this.familyID = familyID; this.paternalID = paternalID; this.maternalID = maternalID; @@ -51,20 +32,31 @@ public class Sample implements java.io.Serializable { this.dataSource = dataSource; } - public Sample(final String ID, final SampleDataSource dataSource, - final String familyID, final String paternalID, final String maternalID, final Gender gender) { - this(ID, dataSource, familyID, paternalID, maternalID, gender, - UNSET_QUANTITIATIVE_TRAIT_VALUE, Affection.UNKNOWN); + protected Sample(final String ID, + final String familyID, final String paternalID, final String maternalID, + final Gender gender, final Affection affection, final double quantitativePhenotype) { + this(ID, null, familyID, paternalID, maternalID, gender, affection, quantitativePhenotype); } - public Sample(final String ID, final SampleDataSource dataSource, final double quantitativePhenotype, final Affection affection) { - this(ID, dataSource, null, null, null, Gender.UNKNOWN, quantitativePhenotype, affection); + protected Sample(final String ID, + final String familyID, final String paternalID, final String maternalID, + final Gender gender, final Affection affection) { + this(ID, null, familyID, paternalID, maternalID, gender, affection, UNSET_QT); + } + + + public Sample(final String ID, final SampleDataSource dataSource, + final String familyID, final String paternalID, final String maternalID, final Gender gender) { + this(ID, 
dataSource, familyID, paternalID, maternalID, gender, Affection.UNKNOWN, UNSET_QT); + } + + public Sample(final String ID, final SampleDataSource dataSource, final Affection affection, final double quantitativePhenotype) { + this(ID, dataSource, null, null, null, Gender.UNKNOWN, affection, quantitativePhenotype); } public Sample(String id, SampleDataSource dataSource) { - this(id, dataSource, - null, null, null, - Gender.UNKNOWN, UNSET_QUANTITIATIVE_TRAIT_VALUE, Affection.UNKNOWN); + this(id, dataSource, null, null, null, + Gender.UNKNOWN, Affection.UNKNOWN, UNSET_QT); } // ------------------------------------------------------------------------------------- @@ -77,7 +69,6 @@ public class Sample implements java.io.Serializable { return ID; } - public String getFamilyID() { return familyID; } @@ -157,21 +148,4 @@ public class Sample implements java.io.Serializable { public boolean hasExtraProperty(String key) { return properties.containsKey(key); } - -// @Override -// public boolean equals(Object o) { -// if (this == o) return true; -// if (o == null || getClass() != o.getClass()) return false; -// -// Sample sample = (Sample) o; -// if (ID != null ? !ID.equals(sample.ID) : sample.ID != null) return false; -// if (properties != null ? !properties.equals(sample.properties) : sample.properties != null) return false; -// -// return true; -// } -// -// @Override -// public int hashCode() { -// return ID != null ? 
ID.hashCode() : "".hashCode(); -// } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDataSource.java index d5285271b..e0d159947 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDataSource.java @@ -55,7 +55,7 @@ public class SampleDataSource { /** * Hallucinates sample objects for all the samples in the SAM file and stores them */ - public SampleDataSource addSamples(SAMFileHeader header) { + protected SampleDataSource addSamples(SAMFileHeader header) { for (String sampleName : SampleUtils.getSAMFileSamples(header)) { if (getSample(sampleName) == null) { Sample newSample = new Sample(sampleName, this); @@ -65,7 +65,7 @@ public class SampleDataSource { return this; } - public SampleDataSource addSamples(final List sampleFiles) { + protected SampleDataSource addSamples(final List sampleFiles) { // add files consecutively for (File file : sampleFiles) { addSamples(file); @@ -77,7 +77,7 @@ public class SampleDataSource { * Parse one sample file and integrate it with samples that are already there * Fail quickly if we find any errors in the file */ - public SampleDataSource addSamples(File sampleFile) { + protected SampleDataSource addSamples(File sampleFile) { return this; } @@ -85,7 +85,7 @@ public class SampleDataSource { * Add a sample to the collection * @param sample to be added */ - private SampleDataSource addSample(Sample sample) { + protected SampleDataSource addSample(Sample sample) { samples.put(sample.getID(), sample); return this; } @@ -138,8 +138,6 @@ public class SampleDataSource { // // -------------------------------------------------------------------------------- - - /** * Get number of sample objects * @return size of samples map @@ -209,4 +207,18 @@ public class SampleDataSource { } return samples; } + + // 
-------------------------------------------------------------------------------- + // + // Validation + // + // -------------------------------------------------------------------------------- + + public final void validate() { + validate(getSamples()); + } + + public final void validate(Collection samplesToCheck) { + + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java index cdf1913f7..b722220f9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java @@ -31,7 +31,7 @@ import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgume import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.samples.Sample; +import org.broadinstitute.sting.gatk.samples.Gender; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.variantrecalibration.VQSRCalibrationCurve; import org.broadinstitute.sting.utils.GenomeLoc; @@ -248,7 +248,7 @@ public class ProduceBeagleInputWalker extends RodWalker { Map preferredGenotypes = preferredVC.getGenotypes(); Map otherGenotypes = goodSite(otherVC) ? 
otherVC.getGenotypes() : null; for ( String sample : samples ) { - boolean isMaleOnChrX = CHECK_IS_MALE_ON_CHR_X && getSample(sample).getGender() == Sample.Gender.MALE; + boolean isMaleOnChrX = CHECK_IS_MALE_ON_CHR_X && getSample(sample).getGender() == Gender.MALE; Genotype genotype; boolean isValidation; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountMalesWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountMalesWalker.java index d776fe415..24c06d101 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountMalesWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountMalesWalker.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.qc; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.samples.Gender; import org.broadinstitute.sting.gatk.samples.Sample; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.ReadWalker; @@ -41,7 +42,7 @@ import org.broadinstitute.sting.gatk.walkers.Requires; public class CountMalesWalker extends ReadWalker { public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker tracker) { Sample sample = getSampleDB().getSample(read); - return sample.getGender() == Sample.Gender.MALE ? 1 : 0; + return sample.getGender() == Gender.MALE ? 
1 : 0; } public Integer reduceInit() { return 0; } diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index 70f7387f4..77f1ed6c0 100755 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -215,6 +215,10 @@ public class UserException extends ReviewedStingException { super(String.format("File %s is malformed: %s caused by %s", f.getAbsolutePath(), message, e.getMessage())); } + public MalformedFile(String name, String message) { + super(String.format("File associated with name %s is malformed: %s", name, message)); + } + public MalformedFile(String name, String message, Exception e) { super(String.format("File associated with name %s is malformed: %s caused by %s", name, message, e.getMessage())); } diff --git a/public/java/src/org/broadinstitute/sting/utils/text/XReadLines.java b/public/java/src/org/broadinstitute/sting/utils/text/XReadLines.java index 52b6f3b01..49e9ddf52 100644 --- a/public/java/src/org/broadinstitute/sting/utils/text/XReadLines.java +++ b/public/java/src/org/broadinstitute/sting/utils/text/XReadLines.java @@ -99,9 +99,9 @@ public class XReadLines implements Iterator, Iterable { * * @param reader */ - public XReadLines(final BufferedReader reader, final boolean trimWhitespace) { + public XReadLines(final Reader reader, final boolean trimWhitespace) { try { - this.in = reader; + this.in = new BufferedReader(reader); nextline = readNextLine(); this.trimWhitespace = trimWhitespace; } catch(IOException e) { @@ -109,7 +109,7 @@ public class XReadLines implements Iterator, Iterable { } } - public XReadLines(final BufferedReader reader) throws FileNotFoundException { + public XReadLines(final Reader reader) { this(reader, true); } diff --git 
a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java index 41d7a23c6..0d5734d43 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java @@ -40,6 +40,7 @@ import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.baq.BAQ; +import java.io.File; import java.util.Collections; import java.util.Iterator; @@ -85,8 +86,6 @@ public class DownsamplerBenchmark extends ReadProcessingBenchmark { (byte)0); GenomeLocParser genomeLocParser = new GenomeLocParser(reader.getFileHeader().getSequenceDictionary()); - SampleDataSource sampleDataSource = new SampleDataSource().addSamples(reader.getFileHeader()); - // Filter unmapped reads. TODO: is this always strictly necessary? Who in the GATK normally filters these out? 
Iterator readIterator = new FilteringIterator(reader.iterator(),new UnmappedReadFilter()); LocusIteratorByState locusIteratorByState = new LocusIteratorByState(readIterator,readProperties,genomeLocParser, LocusIteratorByState.sampleListForSAMWithoutReadGroups()); diff --git a/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java new file mode 100644 index 000000000..1cad634dd --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.samples; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.StringReader; +import java.util.Arrays; +import java.util.EnumSet; +import java.util.List; + +/** + * UnitTest for PedReader + * + * @author Mark DePristo + * @since 2011 + */ +public class PedReaderUnitTest extends BaseTest { + private static Logger logger = Logger.getLogger(PedReaderUnitTest.class); + + private class PedReaderTest extends TestDataProvider { + public String fileContents; + public List expectedSamples; + + private PedReaderTest(final String name, final List expectedSamples, final String fileContents) { + super(PedReaderTest.class, name); + this.fileContents = fileContents; + this.expectedSamples = expectedSamples; + } + } + +// Family ID +// Individual ID +// Paternal ID +// Maternal ID +// Sex (1=male; 2=female; other=unknown) +// Phenotype +// +// -9 missing +// 0 missing +// 1 unaffected +// 2 affected + + @DataProvider(name = "readerTest") + public Object[][] createPEDFiles() { + new PedReaderTest("singleRecordMale", + Arrays.asList(new Sample("kid", "fam1", null, null, Gender.MALE, Affection.UNAFFECTED)), + "fam1 kid 0 0 1 1"); + + new PedReaderTest("singleRecordFemale", + Arrays.asList(new Sample("kid", "fam1", null, null, Gender.FEMALE, Affection.UNAFFECTED)), + "fam1 kid 0 0 2 1"); + + new PedReaderTest("singleRecordMissingGender", + Arrays.asList(new Sample("kid", "fam1", null, null, Gender.UNKNOWN, Affection.UNKNOWN)), + "fam1 kid 0 0 0 0"); + + // Affection + new PedReaderTest("singleRecordAffected", + Arrays.asList(new Sample("kid", "fam1", null, null, Gender.MALE, Affection.AFFECTED)), + "fam1 kid 0 0 1 2"); + + new PedReaderTest("singleRecordUnaffected", + Arrays.asList(new Sample("kid", "fam1", null, null, Gender.MALE, Affection.UNAFFECTED)), + "fam1 kid 0 0 1 1"); + + new 
PedReaderTest("singleRecordMissingAffection-9", + Arrays.asList(new Sample("kid", "fam1", null, null, Gender.MALE, Affection.UNKNOWN)), + "fam1 kid 0 0 1 -9"); + + new PedReaderTest("singleRecordMissingAffection0", + Arrays.asList(new Sample("kid", "fam1", null, null, Gender.MALE, Affection.UNKNOWN)), + "fam1 kid 0 0 1 0"); + + new PedReaderTest("multipleUnrelated", + Arrays.asList( + new Sample("s1", "fam1", null, null, Gender.MALE, Affection.AFFECTED), + new Sample("s2", "fam2", null, null, Gender.FEMALE, Affection.UNAFFECTED)), + String.format("%s\n%s", + "fam1 s1 0 0 1 1", + "fam2 s2 0 0 2 2")); + + new PedReaderTest("explicitTrio", + Arrays.asList( + new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED), + new Sample("dad", "fam1", null, null, Gender.MALE, Affection.UNAFFECTED), + new Sample("mom", "fam1", null, null, Gender.FEMALE, Affection.AFFECTED)), + String.format("%s\n%s\n%s", + "fam1 kid dad mom 1 2", + "fam1 dad 0 0 1 1", + "fam1 mom 0 0 2 2")); + + new PedReaderTest("implicitTrio", + Arrays.asList( + new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED), + new Sample("dad", "fam1", null, null, Gender.MALE, Affection.UNKNOWN), + new Sample("mom", "fam1", null, null, Gender.FEMALE, Affection.UNKNOWN)), + "fam1 kid dad mom 1 1"); + + new PedReaderTest("partialTrio", + Arrays.asList( + new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED), + new Sample("dad", "fam1", null, null, Gender.MALE, Affection.UNAFFECTED), + new Sample("mom", "fam1", null, null, Gender.FEMALE, Affection.UNKNOWN)), + String.format("%s\n%s", + "fam1 kid dad mom 1 2", + "fam1 dad 0 0 1 1")); + + new PedReaderTest("bigPedigree", + Arrays.asList( + new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED), + new Sample("dad", "fam1", "granddad1", "grandma1", Gender.MALE, Affection.UNAFFECTED), + new Sample("granddad1", "fam1", null, null, Gender.MALE, Affection.UNKNOWN), + new Sample("grandma1", "fam1", null, 
null, Gender.FEMALE, Affection.UNKNOWN), + new Sample("mom", "fam1", "granddad2", "grandma2", Gender.FEMALE, Affection.AFFECTED), + new Sample("granddad2", "fam1", null, null, Gender.MALE, Affection.UNKNOWN), + new Sample("grandma2", "fam1", null, null, Gender.FEMALE, Affection.UNKNOWN)), + String.format("%s\n%s\n%s", + "fam1 kid dad mom 1 2", + "fam1 dad granddad1 grandma1 1 1", + "fam1 mom granddad2 grandma2 2 2")); + + // Quantitative trait + new PedReaderTest("QuantitativeTrait", + Arrays.asList( + new Sample("s1", "fam1", null, null, Gender.MALE, Affection.QUANTITATIVE, 1.0), + new Sample("s2", "fam2", null, null, Gender.FEMALE, Affection.QUANTITATIVE, 10.0)), + String.format("%s\n%s", + "fam1 s1 0 0 1 1", + "fam2 s2 0 0 2 10.0")); + + new PedReaderTest("QuantitativeTraitWithMissing", + Arrays.asList( + new Sample("s1", "fam1", null, null, Gender.MALE, Affection.UNKNOWN, Sample.UNSET_QT), + new Sample("s2", "fam2", null, null, Gender.FEMALE, Affection.QUANTITATIVE, 10.0)), + String.format("%s\n%s", + "fam1 s1 0 0 1 -9", + "fam2 s2 0 0 2 10.0")); + + new PedReaderTest("QuantitativeTraitOnlyInts", + Arrays.asList( + new Sample("s1", "fam1", null, null, Gender.MALE, Affection.QUANTITATIVE, 1.0), + new Sample("s2", "fam2", null, null, Gender.FEMALE, Affection.QUANTITATIVE, 10.0)), + String.format("%s\n%s", + "fam1 s1 0 0 1 1", + "fam2 s2 0 0 2 10")); + + return PedReaderTest.getTests(PedReaderTest.class); + } + + private static final void runTest(PedReaderTest test, String myFileContents, EnumSet missing) { + logger.warn("Test " + test); + PedReader reader = new PedReader(); + SampleDataSource sampleDB = new SampleDataSource(); + List readSamples = reader.parse(new StringReader(myFileContents), missing, sampleDB); + Assert.assertEquals(test.expectedSamples, readSamples, "Parsed incorrect number of samples"); + } + + @Test(enabled = true, dataProvider = "readerTest") + public void testPedReader(PedReaderTest test) { + runTest(test, test.fileContents, 
EnumSet.noneOf(PedReader.MissingPedFields.class)); + } + + @Test(enabled = true, dataProvider = "readerTest", dependsOnMethods = "testPedReader") + public void testPedReaderWithComments(PedReaderTest test) { + runTest(test, "#comment\n" + test.fileContents, EnumSet.noneOf(PedReader.MissingPedFields.class)); + } + + @Test(enabled = true, dataProvider = "readerTest", dependsOnMethods = "testPedReader") + public void testPedReaderWithMissing(PedReaderTest test) { + // todo -- test MISSING by splicing strings + //runTest(test, "#comment\n" + test.fileContents, EnumSet.noneOf(PedReader.MissingPedFields.class)); + } + +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java index e8d1772b8..279319edb 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java @@ -22,17 +22,17 @@ public class SampleUnitTest extends BaseTest { public void init() { db = new SampleDataSource(); - fam1A = new Sample("1A", db, "fam1", "1B", "1C", Sample.Gender.UNKNOWN); - fam1B = new Sample("1B", db, "fam1", null, null, Sample.Gender.MALE); - fam1C = new Sample("1C", db, "fam1", null, null, Sample.Gender.FEMALE); + fam1A = new Sample("1A", db, "fam1", "1B", "1C", Gender.UNKNOWN); + fam1B = new Sample("1B", db, "fam1", null, null, Gender.MALE); + fam1C = new Sample("1C", db, "fam1", null, null, Gender.FEMALE); s1 = new Sample("s1", db); s2 = new Sample("s2", db); - trait1 = new Sample("t1", db, Sample.UNSET_QUANTITIATIVE_TRAIT_VALUE, Sample.Affection.AFFECTED); - trait2 = new Sample("t2", db, Sample.UNSET_QUANTITIATIVE_TRAIT_VALUE, Sample.Affection.UNAFFECTED); - trait3 = new Sample("t3", db, Sample.UNSET_QUANTITIATIVE_TRAIT_VALUE, Sample.Affection.UNKNOWN); - trait4 = new Sample("t4", db, 1.0, Sample.Affection.QUANTITATIVE); + trait1 = new 
Sample("t1", db, Affection.AFFECTED, Sample.UNSET_QT); + trait2 = new Sample("t2", db, Affection.UNAFFECTED, Sample.UNSET_QT); + trait3 = new Sample("t3", db, Affection.UNKNOWN, Sample.UNSET_QT); + trait4 = new Sample("t4", db, Affection.QUANTITATIVE, 1.0); } /** @@ -47,8 +47,8 @@ public class SampleUnitTest extends BaseTest { @Test() public void testGenders() { - Assert.assertTrue(fam1A.getGender() == Sample.Gender.UNKNOWN); - Assert.assertTrue(fam1B.getGender() == Sample.Gender.MALE); - Assert.assertTrue(fam1C.getGender() == Sample.Gender.FEMALE); + Assert.assertTrue(fam1A.getGender() == Gender.UNKNOWN); + Assert.assertTrue(fam1B.getGender() == Gender.MALE); + Assert.assertTrue(fam1C.getGender() == Gender.FEMALE); } } From dd75ad9f49ed3a07dded0cc4eab7318cd60fda1e Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 30 Sep 2011 18:03:34 -0400 Subject: [PATCH 22/63] 95% PedReader -- Passes significiant unit tests -- Implicit sample creation for mom / dad when you create single samples -- Continuing cleanup of Sample and SampleDataSource --- .../sting/gatk/samples/PedReader.java | 24 ++++++- .../sting/gatk/samples/Sample.java | 37 ++++++++++- .../sting/gatk/samples/SampleDataSource.java | 6 +- .../sting/gatk/samples/PedReaderUnitTest.java | 66 ++++++++++++++----- 4 files changed, 109 insertions(+), 24 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java b/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java index added09b6..e581c3718 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java @@ -160,13 +160,13 @@ public class PedReader { final List splits = new ArrayList(lines.size()); for ( final String line : lines ) { if ( line.startsWith(commentMarker)) continue; - String[] parts = line.split("\\W+"); + String[] parts = line.split("\\s+"); if ( parts.length != nExpectedFields ) throw new 
UserException.MalformedFile(reader.toString(), "Bad PED line " + lineNo + ": wrong number of fields"); if ( phenotypePos != -1 ) { - isQT = isQT || CATAGORICAL_TRAIT_VALUES.contains(parts[phenotypePos]); + isQT = isQT || ! CATAGORICAL_TRAIT_VALUES.contains(parts[phenotypePos]); } splits.add(parts); @@ -211,12 +211,21 @@ public class PedReader { } } - final Sample s = new Sample(familyID, sampleDB, individualID, paternalID, maternalID, sex, affection, quantitativePhenotype); + final Sample s = new Sample(individualID, sampleDB, familyID, paternalID, maternalID, sex, affection, quantitativePhenotype); samples.add(s); sampleDB.addSample(s); lineNo++; } + for ( final Sample sample : new ArrayList(samples) ) { + Sample dad = maybeAddImplicitSample(sampleDB, sample.getPaternalID(), sample.getFamilyID(), Gender.MALE); + if ( dad != null ) samples.add(dad); + + Sample mom = maybeAddImplicitSample(sampleDB, sample.getMaternalID(), sample.getFamilyID(), Gender.FEMALE); + if ( mom != null ) samples.add(mom); + } + + sampleDB.validate(samples); return samples; } @@ -227,4 +236,13 @@ public class PedReader { else return string; } + + private final Sample maybeAddImplicitSample(SampleDataSource sampleDB, final String id, final String familyID, final Gender gender) { + if ( id != null && sampleDB.getSample(id) == null ) { + Sample s = new Sample(id, sampleDB, familyID, null, null, gender, Affection.UNKNOWN, Sample.UNSET_QT); + sampleDB.addSample(s); + return s; + } else + return null; + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java index 3426cf678..0a5043013 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java @@ -117,8 +117,11 @@ public class Sample implements java.io.Serializable { return gender; } - public String getFamilyId() { - return familyID; + 
@Override + public String toString() { + return String.format("Sample %s fam=%s dad=%s mom=%s gender=%s affection=%s qt=%s props=%s", + getID(), getFamilyID(), getPaternalID(), getMaternalID(), getGender(), getAffection(), + getQuantitativePhenotype(), getExtraProperties()); } // ------------------------------------------------------------------------------------- @@ -148,4 +151,34 @@ public class Sample implements java.io.Serializable { public boolean hasExtraProperty(String key) { return properties.containsKey(key); } + + @Override + public int hashCode() { + return ID.hashCode(); + } + + @Override + public boolean equals(final Object o) { + if(o == null) + return false; + if(o instanceof Sample) { + Sample otherSample = (Sample)o; + return ID.equals(otherSample.ID) && + equalOrNull(familyID, otherSample.familyID) && + equalOrNull(paternalID, otherSample.paternalID) && + equalOrNull(maternalID, otherSample.maternalID) && + equalOrNull(gender, otherSample.gender) && + equalOrNull(quantitativePhenotype, otherSample.quantitativePhenotype) && + equalOrNull(affection, otherSample.affection) && + equalOrNull(properties, otherSample.properties); + } + return false; + } + + private final static boolean equalOrNull(final Object o1, final Object o2) { + if ( o1 == null ) + return o2 == null; + else + return o2 == null ? 
false : o1.equals(o2); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDataSource.java index e0d159947..b85759de2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDataSource.java @@ -156,8 +156,8 @@ public class SampleDataSource { HashSet familyMembers = new HashSet(); for (Sample sample : samples.values()) { - if (sample.getFamilyId() != null) { - if (sample.getFamilyId().equals(familyId)) + if (sample.getFamilyID() != null) { + if (sample.getFamilyID().equals(familyId)) familyMembers.add(sample); } } @@ -172,7 +172,7 @@ public class SampleDataSource { */ public Set getChildren(Sample sample) { HashSet children = new HashSet(); - for (Sample familyMember : getFamily(sample.getFamilyId())) { + for (Sample familyMember : getFamily(sample.getFamilyID())) { if (familyMember.getMother() == sample || familyMember.getFather() == sample) { children.add(familyMember); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java index 1cad634dd..5eec0e8c8 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java @@ -33,6 +33,7 @@ import org.testng.annotations.Test; import java.io.StringReader; import java.util.Arrays; import java.util.EnumSet; +import java.util.HashSet; import java.util.List; /** @@ -47,6 +48,7 @@ public class PedReaderUnitTest extends BaseTest { private class PedReaderTest extends TestDataProvider { public String fileContents; public List expectedSamples; + EnumSet missing; private PedReaderTest(final String name, final List expectedSamples, final String fileContents) { super(PedReaderTest.class, name); @@ -55,6 +57,19 
@@ public class PedReaderUnitTest extends BaseTest { } } + private class PedReaderTestMissing extends TestDataProvider { + public String fileContents; + public List expectedSamples; + EnumSet missing; + + private PedReaderTestMissing(final String name, EnumSet missing, final List expectedSamples, final String fileContents) { + super(PedReaderTest.class, name); + this.fileContents = fileContents; + this.expectedSamples = expectedSamples; + this.missing = missing; + } + } + // Family ID // Individual ID // Paternal ID @@ -100,9 +115,9 @@ public class PedReaderUnitTest extends BaseTest { new PedReaderTest("multipleUnrelated", Arrays.asList( - new Sample("s1", "fam1", null, null, Gender.MALE, Affection.AFFECTED), - new Sample("s2", "fam2", null, null, Gender.FEMALE, Affection.UNAFFECTED)), - String.format("%s\n%s", + new Sample("s1", "fam1", null, null, Gender.MALE, Affection.UNAFFECTED), + new Sample("s2", "fam2", null, null, Gender.FEMALE, Affection.AFFECTED)), + String.format("%s%n%s", "fam1 s1 0 0 1 1", "fam2 s2 0 0 2 2")); @@ -111,7 +126,7 @@ public class PedReaderUnitTest extends BaseTest { new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED), new Sample("dad", "fam1", null, null, Gender.MALE, Affection.UNAFFECTED), new Sample("mom", "fam1", null, null, Gender.FEMALE, Affection.AFFECTED)), - String.format("%s\n%s\n%s", + String.format("%s%n%s%n%s", "fam1 kid dad mom 1 2", "fam1 dad 0 0 1 1", "fam1 mom 0 0 2 2")); @@ -121,14 +136,14 @@ public class PedReaderUnitTest extends BaseTest { new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED), new Sample("dad", "fam1", null, null, Gender.MALE, Affection.UNKNOWN), new Sample("mom", "fam1", null, null, Gender.FEMALE, Affection.UNKNOWN)), - "fam1 kid dad mom 1 1"); + "fam1 kid dad mom 1 2"); new PedReaderTest("partialTrio", Arrays.asList( new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED), new Sample("dad", "fam1", null, null, Gender.MALE, Affection.UNAFFECTED), 
new Sample("mom", "fam1", null, null, Gender.FEMALE, Affection.UNKNOWN)), - String.format("%s\n%s", + String.format("%s%n%s", "fam1 kid dad mom 1 2", "fam1 dad 0 0 1 1")); @@ -141,7 +156,7 @@ public class PedReaderUnitTest extends BaseTest { new Sample("mom", "fam1", "granddad2", "grandma2", Gender.FEMALE, Affection.AFFECTED), new Sample("granddad2", "fam1", null, null, Gender.MALE, Affection.UNKNOWN), new Sample("grandma2", "fam1", null, null, Gender.FEMALE, Affection.UNKNOWN)), - String.format("%s\n%s\n%s", + String.format("%s%n%s%n%s", "fam1 kid dad mom 1 2", "fam1 dad granddad1 grandma1 1 1", "fam1 mom granddad2 grandma2 2 2")); @@ -151,7 +166,7 @@ public class PedReaderUnitTest extends BaseTest { Arrays.asList( new Sample("s1", "fam1", null, null, Gender.MALE, Affection.QUANTITATIVE, 1.0), new Sample("s2", "fam2", null, null, Gender.FEMALE, Affection.QUANTITATIVE, 10.0)), - String.format("%s\n%s", + String.format("%s%n%s", "fam1 s1 0 0 1 1", "fam2 s2 0 0 2 10.0")); @@ -159,7 +174,7 @@ public class PedReaderUnitTest extends BaseTest { Arrays.asList( new Sample("s1", "fam1", null, null, Gender.MALE, Affection.UNKNOWN, Sample.UNSET_QT), new Sample("s2", "fam2", null, null, Gender.FEMALE, Affection.QUANTITATIVE, 10.0)), - String.format("%s\n%s", + String.format("%s%n%s", "fam1 s1 0 0 1 -9", "fam2 s2 0 0 2 10.0")); @@ -167,7 +182,7 @@ public class PedReaderUnitTest extends BaseTest { Arrays.asList( new Sample("s1", "fam1", null, null, Gender.MALE, Affection.QUANTITATIVE, 1.0), new Sample("s2", "fam2", null, null, Gender.FEMALE, Affection.QUANTITATIVE, 10.0)), - String.format("%s\n%s", + String.format("%s%n%s", "fam1 s1 0 0 1 1", "fam2 s2 0 0 2 10")); @@ -179,7 +194,7 @@ public class PedReaderUnitTest extends BaseTest { PedReader reader = new PedReader(); SampleDataSource sampleDB = new SampleDataSource(); List readSamples = reader.parse(new StringReader(myFileContents), missing, sampleDB); - Assert.assertEquals(test.expectedSamples, readSamples, "Parsed incorrect 
number of samples"); + Assert.assertEquals(new HashSet(test.expectedSamples), new HashSet(readSamples), "Parsed incorrect number of samples"); } @Test(enabled = true, dataProvider = "readerTest") @@ -189,13 +204,32 @@ public class PedReaderUnitTest extends BaseTest { @Test(enabled = true, dataProvider = "readerTest", dependsOnMethods = "testPedReader") public void testPedReaderWithComments(PedReaderTest test) { - runTest(test, "#comment\n" + test.fileContents, EnumSet.noneOf(PedReader.MissingPedFields.class)); + runTest(test, String.format("#comment%n%s", test.fileContents), EnumSet.noneOf(PedReader.MissingPedFields.class)); } - @Test(enabled = true, dataProvider = "readerTest", dependsOnMethods = "testPedReader") + @DataProvider(name = "readerTestMissing") + public Object[][] createPEDFilesWithMissing() { + new PedReaderTestMissing("trioMissingFam", EnumSet.of(PedReader.MissingPedFields.NO_FAMILY_ID), + Arrays.asList( + new Sample("kid", null, "dad", "mom", Gender.MALE, Affection.AFFECTED), + new Sample("dad", null, null, null, Gender.MALE, Affection.UNAFFECTED), + new Sample("mom", null, null, null, Gender.FEMALE, Affection.AFFECTED)), + String.format("%s%n%s%n%s", + "kid dad mom 1 2", + "dad 0 0 1 1", + "mom 0 0 2 2")); + + return PedReaderTestMissing.getTests(PedReaderTestMissing.class); + } + + @Test(enabled = true, dataProvider = "readerTestMissing", dependsOnMethods = "testPedReader") public void testPedReaderWithMissing(PedReaderTest test) { - // todo -- test MISSING by splicing strings - //runTest(test, "#comment\n" + test.fileContents, EnumSet.noneOf(PedReader.MissingPedFields.class)); +// public enum MissingPedFields { +// NO_FAMILY_ID, +// NO_PARENTS, +// NO_SEX, +// NO_PHENOTYPE +// } +// runTest(test, sliceContents(0, test.fileContents), EnumSet.of(PedReader.MissingPedFields.NO_FAMILY_ID)); } - } \ No newline at end of file From bf6a3a65320cf5d1fa9aed9f3db5154579921443 Mon Sep 17 00:00:00 2001 From: Roger Zurawicki Date: Sun, 2 Oct 2011 22:33:46 -0400 
Subject: [PATCH 23/63] Added framework to do batch CigarClip Testing *NOTE: This commit has not been compiled! --- .../sting/utils/clipreads/ReadClipperUnitTest.java | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java index 1415379db..38eee762a 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java @@ -62,6 +62,20 @@ public class ReadClipperUnitTest extends BaseTest { readClipper = new ReadClipper(read); } + private void testHardClipCigarByReadCoordinate( SAMRecord read, String inputCigar, String expectedCigar, int expectedStart, int expectedStop) { + read.setCigar(TextCigarCodec.getSingleton().decode(inputCigar) ); + SAMRecord clipped = readClipper.hardClipByReadCoordinates(expectedStart,expectedStop); + Assert.assertEquals(clipped.getCigarString(), expectedCigar, "Clipped Cigar string is different than expected"); + } +/* + private void testReadBasesAndQuals(SAMRecord read, int expectedStart, int expectedStop) { + SAMRecord clipped = ReadUtils.hardClipBases(read, expectedStart, expectedStop - 1, null); + String expectedBases = BASES.substring(expectedStart, expectedStop); + String expectedQuals = QUALS.substring(expectedStart, expectedStop); + Assert.assertEquals(clipped.getReadBases(), expectedBases.getBytes(), "Clipped bases not those expected"); + Assert.assertEquals(clipped.getBaseQualityString(), expectedQuals, "Clipped quals not those expected"); + } +*/ @Test public void testHardClipBothEndsByReferenceCoordinates() { logger.warn("Executing testHardClipBothEndsByReferenceCoordinates"); From 52f670c8b86787d9e72c5e179fe82d37c5e4729f Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 3 Oct 2011 06:12:58 -0700 Subject: [PATCH 24/63] 100% version of 
PedReader -- Passes all unit tests -- Added unit tests for missing fields --- .../sting/gatk/samples/PedReader.java | 20 ++- .../sting/gatk/samples/PedReaderUnitTest.java | 164 +++++++++++------- 2 files changed, 114 insertions(+), 70 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java b/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java index e581c3718..27b9181de 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java @@ -117,13 +117,17 @@ public class PedReader { final static private Set CATAGORICAL_TRAIT_VALUES = new HashSet(Arrays.asList("-9", "0", "1", "2")); final static private String commentMarker = "#"; - public enum MissingPedFields { + public enum MissingPedField { NO_FAMILY_ID, NO_PARENTS, NO_SEX, NO_PHENOTYPE } + protected enum Field { + FAMILY_ID, INDIVIDUAL_ID, PATERNAL_ID, MATERNAL_ID, GENDER, PHENOTYPE + } + // phenotype private final static String MISSING_VALUE1 = "-9"; private final static String MISSING_VALUE2 = "0"; @@ -137,21 +141,21 @@ public class PedReader { public PedReader() { } - public final List parse(File source, EnumSet missingFields, SampleDataSource sampleDB) throws FileNotFoundException { + public final List parse(File source, EnumSet missingFields, SampleDataSource sampleDB) throws FileNotFoundException { logger.info("Reading PED file " + source + " with missing fields: " + missingFields); return parse(new FileReader(source), missingFields, sampleDB); } - public final List parse(Reader reader, EnumSet missingFields, SampleDataSource sampleDB) { + public final List parse(Reader reader, EnumSet missingFields, SampleDataSource sampleDB) { final List lines = new XReadLines(reader).readLines(); // What are the record offsets? - final int familyPos = missingFields.contains(MissingPedFields.NO_FAMILY_ID) ? 
-1 : 0; + final int familyPos = missingFields.contains(MissingPedField.NO_FAMILY_ID) ? -1 : 0; final int samplePos = familyPos + 1; - final int paternalPos = missingFields.contains(MissingPedFields.NO_PARENTS) ? -1 : samplePos + 1; - final int maternalPos = missingFields.contains(MissingPedFields.NO_PARENTS) ? -1 : paternalPos + 1; - final int sexPos = missingFields.contains(MissingPedFields.NO_SEX) ? -1 : Math.max(maternalPos, samplePos) + 1; - final int phenotypePos = missingFields.contains(MissingPedFields.NO_PHENOTYPE) ? -1 : Math.max(sexPos, Math.max(maternalPos, samplePos)) + 1; + final int paternalPos = missingFields.contains(MissingPedField.NO_PARENTS) ? -1 : samplePos + 1; + final int maternalPos = missingFields.contains(MissingPedField.NO_PARENTS) ? -1 : paternalPos + 1; + final int sexPos = missingFields.contains(MissingPedField.NO_SEX) ? -1 : Math.max(maternalPos, samplePos) + 1; + final int phenotypePos = missingFields.contains(MissingPedField.NO_PHENOTYPE) ? -1 : Math.max(sexPos, Math.max(maternalPos, samplePos)) + 1; final int nExpectedFields = MathUtils.arrayMaxInt(Arrays.asList(samplePos, paternalPos, maternalPos, sexPos, phenotypePos)) + 1; // go through once and determine properties diff --git a/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java index 5eec0e8c8..35be45bc7 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java @@ -26,15 +26,14 @@ package org.broadinstitute.sting.gatk.samples; import org.apache.log4j.Logger; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.Utils; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.StringReader; -import java.util.Arrays; -import java.util.EnumSet; -import java.util.HashSet; 
-import java.util.List; +import java.lang.reflect.Array; +import java.util.*; /** * UnitTest for PedReader @@ -48,7 +47,7 @@ public class PedReaderUnitTest extends BaseTest { private class PedReaderTest extends TestDataProvider { public String fileContents; public List expectedSamples; - EnumSet missing; + EnumSet missing; private PedReaderTest(final String name, final List expectedSamples, final String fileContents) { super(PedReaderTest.class, name); @@ -57,19 +56,6 @@ public class PedReaderUnitTest extends BaseTest { } } - private class PedReaderTestMissing extends TestDataProvider { - public String fileContents; - public List expectedSamples; - EnumSet missing; - - private PedReaderTestMissing(final String name, EnumSet missing, final List expectedSamples, final String fileContents) { - super(PedReaderTest.class, name); - this.fileContents = fileContents; - this.expectedSamples = expectedSamples; - this.missing = missing; - } - } - // Family ID // Individual ID // Paternal ID @@ -115,17 +101,17 @@ public class PedReaderUnitTest extends BaseTest { new PedReaderTest("multipleUnrelated", Arrays.asList( - new Sample("s1", "fam1", null, null, Gender.MALE, Affection.UNAFFECTED), - new Sample("s2", "fam2", null, null, Gender.FEMALE, Affection.AFFECTED)), + new Sample("s1", "fam1", null, null, Gender.MALE, Affection.UNAFFECTED), + new Sample("s2", "fam2", null, null, Gender.FEMALE, Affection.AFFECTED)), String.format("%s%n%s", "fam1 s1 0 0 1 1", "fam2 s2 0 0 2 2")); new PedReaderTest("explicitTrio", Arrays.asList( - new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED), - new Sample("dad", "fam1", null, null, Gender.MALE, Affection.UNAFFECTED), - new Sample("mom", "fam1", null, null, Gender.FEMALE, Affection.AFFECTED)), + new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED), + new Sample("dad", "fam1", null, null, Gender.MALE, Affection.UNAFFECTED), + new Sample("mom", "fam1", null, null, Gender.FEMALE, Affection.AFFECTED)), 
String.format("%s%n%s%n%s", "fam1 kid dad mom 1 2", "fam1 dad 0 0 1 1", @@ -133,29 +119,29 @@ public class PedReaderUnitTest extends BaseTest { new PedReaderTest("implicitTrio", Arrays.asList( - new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED), - new Sample("dad", "fam1", null, null, Gender.MALE, Affection.UNKNOWN), - new Sample("mom", "fam1", null, null, Gender.FEMALE, Affection.UNKNOWN)), + new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED), + new Sample("dad", "fam1", null, null, Gender.MALE, Affection.UNKNOWN), + new Sample("mom", "fam1", null, null, Gender.FEMALE, Affection.UNKNOWN)), "fam1 kid dad mom 1 2"); new PedReaderTest("partialTrio", Arrays.asList( - new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED), - new Sample("dad", "fam1", null, null, Gender.MALE, Affection.UNAFFECTED), - new Sample("mom", "fam1", null, null, Gender.FEMALE, Affection.UNKNOWN)), + new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED), + new Sample("dad", "fam1", null, null, Gender.MALE, Affection.UNAFFECTED), + new Sample("mom", "fam1", null, null, Gender.FEMALE, Affection.UNKNOWN)), String.format("%s%n%s", "fam1 kid dad mom 1 2", "fam1 dad 0 0 1 1")); new PedReaderTest("bigPedigree", Arrays.asList( - new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED), - new Sample("dad", "fam1", "granddad1", "grandma1", Gender.MALE, Affection.UNAFFECTED), - new Sample("granddad1", "fam1", null, null, Gender.MALE, Affection.UNKNOWN), - new Sample("grandma1", "fam1", null, null, Gender.FEMALE, Affection.UNKNOWN), - new Sample("mom", "fam1", "granddad2", "grandma2", Gender.FEMALE, Affection.AFFECTED), - new Sample("granddad2", "fam1", null, null, Gender.MALE, Affection.UNKNOWN), - new Sample("grandma2", "fam1", null, null, Gender.FEMALE, Affection.UNKNOWN)), + new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED), + new Sample("dad", "fam1", "granddad1", "grandma1", 
Gender.MALE, Affection.UNAFFECTED), + new Sample("granddad1", "fam1", null, null, Gender.MALE, Affection.UNKNOWN), + new Sample("grandma1", "fam1", null, null, Gender.FEMALE, Affection.UNKNOWN), + new Sample("mom", "fam1", "granddad2", "grandma2", Gender.FEMALE, Affection.AFFECTED), + new Sample("granddad2", "fam1", null, null, Gender.MALE, Affection.UNKNOWN), + new Sample("grandma2", "fam1", null, null, Gender.FEMALE, Affection.UNKNOWN)), String.format("%s%n%s%n%s", "fam1 kid dad mom 1 2", "fam1 dad granddad1 grandma1 1 1", @@ -164,24 +150,24 @@ public class PedReaderUnitTest extends BaseTest { // Quantitative trait new PedReaderTest("QuantitativeTrait", Arrays.asList( - new Sample("s1", "fam1", null, null, Gender.MALE, Affection.QUANTITATIVE, 1.0), - new Sample("s2", "fam2", null, null, Gender.FEMALE, Affection.QUANTITATIVE, 10.0)), + new Sample("s1", "fam1", null, null, Gender.MALE, Affection.QUANTITATIVE, 1.0), + new Sample("s2", "fam2", null, null, Gender.FEMALE, Affection.QUANTITATIVE, 10.0)), String.format("%s%n%s", "fam1 s1 0 0 1 1", "fam2 s2 0 0 2 10.0")); new PedReaderTest("QuantitativeTraitWithMissing", Arrays.asList( - new Sample("s1", "fam1", null, null, Gender.MALE, Affection.UNKNOWN, Sample.UNSET_QT), - new Sample("s2", "fam2", null, null, Gender.FEMALE, Affection.QUANTITATIVE, 10.0)), + new Sample("s1", "fam1", null, null, Gender.MALE, Affection.UNKNOWN, Sample.UNSET_QT), + new Sample("s2", "fam2", null, null, Gender.FEMALE, Affection.QUANTITATIVE, 10.0)), String.format("%s%n%s", "fam1 s1 0 0 1 -9", "fam2 s2 0 0 2 10.0")); new PedReaderTest("QuantitativeTraitOnlyInts", Arrays.asList( - new Sample("s1", "fam1", null, null, Gender.MALE, Affection.QUANTITATIVE, 1.0), - new Sample("s2", "fam2", null, null, Gender.FEMALE, Affection.QUANTITATIVE, 10.0)), + new Sample("s1", "fam1", null, null, Gender.MALE, Affection.QUANTITATIVE, 1.0), + new Sample("s2", "fam2", null, null, Gender.FEMALE, Affection.QUANTITATIVE, 10.0)), String.format("%s%n%s", "fam1 s1 0 0 
1 1", "fam2 s2 0 0 2 10")); @@ -189,7 +175,7 @@ public class PedReaderUnitTest extends BaseTest { return PedReaderTest.getTests(PedReaderTest.class); } - private static final void runTest(PedReaderTest test, String myFileContents, EnumSet missing) { + private static final void runTest(PedReaderTest test, String myFileContents, EnumSet missing) { logger.warn("Test " + test); PedReader reader = new PedReader(); SampleDataSource sampleDB = new SampleDataSource(); @@ -199,37 +185,91 @@ public class PedReaderUnitTest extends BaseTest { @Test(enabled = true, dataProvider = "readerTest") public void testPedReader(PedReaderTest test) { - runTest(test, test.fileContents, EnumSet.noneOf(PedReader.MissingPedFields.class)); + runTest(test, test.fileContents, EnumSet.noneOf(PedReader.MissingPedField.class)); } @Test(enabled = true, dataProvider = "readerTest", dependsOnMethods = "testPedReader") public void testPedReaderWithComments(PedReaderTest test) { - runTest(test, String.format("#comment%n%s", test.fileContents), EnumSet.noneOf(PedReader.MissingPedFields.class)); + runTest(test, String.format("#comment%n%s", test.fileContents), EnumSet.noneOf(PedReader.MissingPedField.class)); + } + + // ----------------------------------------------------------------- + // missing format field tests + // ----------------------------------------------------------------- + + private class PedReaderTestMissing extends TestDataProvider { + public EnumSet missingDesc; + public EnumSet missingFields; + public final String fileContents; + public Sample expected; + + + private PedReaderTestMissing(final String name, final String fileContents, + EnumSet missingDesc, + EnumSet missingFields, + final Sample expected) { + super(PedReaderTestMissing.class, name); + this.fileContents = fileContents; + this.missingDesc = missingDesc; + this.missingFields = missingFields; + this.expected = expected; + } } @DataProvider(name = "readerTestMissing") public Object[][] createPEDFilesWithMissing() { - new 
PedReaderTestMissing("trioMissingFam", EnumSet.of(PedReader.MissingPedFields.NO_FAMILY_ID), - Arrays.asList( - new Sample("kid", null, "dad", "mom", Gender.MALE, Affection.AFFECTED), - new Sample("dad", null, null, null, Gender.MALE, Affection.UNAFFECTED), - new Sample("mom", null, null, null, Gender.FEMALE, Affection.AFFECTED)), - String.format("%s%n%s%n%s", - "kid dad mom 1 2", - "dad 0 0 1 1", - "mom 0 0 2 2")); + + new PedReaderTestMissing("missingFam", + "fam1 kid dad mom 1 2", + EnumSet.of(PedReader.MissingPedField.NO_FAMILY_ID), + EnumSet.of(PedReader.Field.FAMILY_ID), + new Sample("kid", null, "dad", "mom", Gender.MALE, Affection.AFFECTED)); + + new PedReaderTestMissing("missingParents", + "fam1 kid dad mom 1 2", + EnumSet.of(PedReader.MissingPedField.NO_PARENTS), + EnumSet.of(PedReader.Field.PATERNAL_ID, PedReader.Field.MATERNAL_ID), + new Sample("kid", "fam1", null, null, Gender.MALE, Affection.AFFECTED)); + + new PedReaderTestMissing("missingSex", + "fam1 kid dad mom 1 2", + EnumSet.of(PedReader.MissingPedField.NO_SEX), + EnumSet.of(PedReader.Field.GENDER), + new Sample("kid", "fam1", "dad", "mom", Gender.UNKNOWN, Affection.AFFECTED)); + + new PedReaderTestMissing("missingPhenotype", + "fam1 kid dad mom 1 2", + EnumSet.of(PedReader.MissingPedField.NO_PHENOTYPE), + EnumSet.of(PedReader.Field.PHENOTYPE), + new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.UNKNOWN)); + + new PedReaderTestMissing("missingEverythingButGender", + "fam1 kid dad mom 1 2", + EnumSet.of(PedReader.MissingPedField.NO_PHENOTYPE, PedReader.MissingPedField.NO_PARENTS, PedReader.MissingPedField.NO_FAMILY_ID), + EnumSet.of(PedReader.Field.FAMILY_ID, PedReader.Field.PATERNAL_ID, PedReader.Field.MATERNAL_ID, PedReader.Field.PHENOTYPE), + new Sample("kid", null, null, null, Gender.MALE, Affection.UNKNOWN)); + return PedReaderTestMissing.getTests(PedReaderTestMissing.class); } @Test(enabled = true, dataProvider = "readerTestMissing", dependsOnMethods = "testPedReader") - public 
void testPedReaderWithMissing(PedReaderTest test) { -// public enum MissingPedFields { -// NO_FAMILY_ID, -// NO_PARENTS, -// NO_SEX, -// NO_PHENOTYPE -// } -// runTest(test, sliceContents(0, test.fileContents), EnumSet.of(PedReader.MissingPedFields.NO_FAMILY_ID)); + public void testPedReaderWithMissing(PedReaderTestMissing test) { + final String contents = sliceContents(test.missingFields, test.fileContents); + logger.warn("Test " + test); + PedReader reader = new PedReader(); + SampleDataSource sampleDB = new SampleDataSource(); + reader.parse(new StringReader(contents), test.missingDesc, sampleDB); + final Sample missingSample = sampleDB.getSample("kid"); + Assert.assertEquals(test.expected, missingSample, "Missing field value not expected value for " + test); + } + + private final static String sliceContents(EnumSet missingFieldsSet, String full) { + List parts = new ArrayList(Arrays.asList(full.split("\\s+"))); + final List missingFields = new ArrayList(missingFieldsSet); + Collections.reverse(missingFields); + for ( PedReader.Field field : missingFields ) + parts.remove(field.ordinal()); + return Utils.join("\t", parts); } } \ No newline at end of file From 0604ce55d1b4d418b30d3ece80b740fbb6a9e57d Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 3 Oct 2011 09:19:58 -0700 Subject: [PATCH 25/63] PedReader support for ; separated lines, not only newline --- .../sting/gatk/samples/PedReader.java | 10 ++++++---- .../sting/gatk/samples/PedReaderUnitTest.java | 14 ++++++++++---- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java b/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java index 27b9181de..72c5ec12c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java @@ -30,10 +30,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import 
org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.text.XReadLines; -import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileReader; -import java.io.Reader; +import java.io.*; import java.util.*; /** @@ -146,6 +143,11 @@ public class PedReader { return parse(new FileReader(source), missingFields, sampleDB); } + public final List parse(final String source, EnumSet missingFields, SampleDataSource sampleDB) { + logger.warn("Reading PED string: \"" + source + "\" with missing fields: " + missingFields); + return parse(new StringReader(source.replace(";", String.format("%n"))), missingFields, sampleDB); + } + public final List parse(Reader reader, EnumSet missingFields, SampleDataSource sampleDB) { final List lines = new XReadLines(reader).readLines(); diff --git a/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java index 35be45bc7..e68d169ea 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java @@ -179,7 +179,7 @@ public class PedReaderUnitTest extends BaseTest { logger.warn("Test " + test); PedReader reader = new PedReader(); SampleDataSource sampleDB = new SampleDataSource(); - List readSamples = reader.parse(new StringReader(myFileContents), missing, sampleDB); + List readSamples = reader.parse(myFileContents, missing, sampleDB); Assert.assertEquals(new HashSet(test.expectedSamples), new HashSet(readSamples), "Parsed incorrect number of samples"); } @@ -188,11 +188,18 @@ public class PedReaderUnitTest extends BaseTest { runTest(test, test.fileContents, EnumSet.noneOf(PedReader.MissingPedField.class)); } - @Test(enabled = true, dataProvider = "readerTest", dependsOnMethods = "testPedReader") + @Test(enabled = true, dataProvider = "readerTest") public void 
testPedReaderWithComments(PedReaderTest test) { runTest(test, String.format("#comment%n%s", test.fileContents), EnumSet.noneOf(PedReader.MissingPedField.class)); } + @Test(enabled = true, dataProvider = "readerTest") + public void testPedReaderWithSemicolons(PedReaderTest test) { + runTest(test, + test.fileContents.replace(String.format("%n"), ";"), + EnumSet.noneOf(PedReader.MissingPedField.class)); + } + // ----------------------------------------------------------------- // missing format field tests // ----------------------------------------------------------------- @@ -218,7 +225,6 @@ public class PedReaderUnitTest extends BaseTest { @DataProvider(name = "readerTestMissing") public Object[][] createPEDFilesWithMissing() { - new PedReaderTestMissing("missingFam", "fam1 kid dad mom 1 2", EnumSet.of(PedReader.MissingPedField.NO_FAMILY_ID), @@ -253,7 +259,7 @@ public class PedReaderUnitTest extends BaseTest { return PedReaderTestMissing.getTests(PedReaderTestMissing.class); } - @Test(enabled = true, dataProvider = "readerTestMissing", dependsOnMethods = "testPedReader") + @Test(enabled = true, dataProvider = "readerTestMissing") public void testPedReaderWithMissing(PedReaderTestMissing test) { final String contents = sliceContents(test.missingFields, test.fileContents); logger.warn("Test " + test); From 93fba06cb545d331df4d2ebb67579987e1581cf7 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 3 Oct 2011 09:30:10 -0700 Subject: [PATCH 26/63] Support for whitespace only lines --- .../org/broadinstitute/sting/gatk/samples/PedReader.java | 4 +++- .../sting/gatk/samples/PedReaderUnitTest.java | 8 ++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java b/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java index 72c5ec12c..648637b09 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java @@ -166,7 +166,9 @@ public class PedReader { final List splits = new ArrayList(lines.size()); for ( final String line : lines ) { if ( line.startsWith(commentMarker)) continue; - String[] parts = line.split("\\s+"); + if ( line.trim().equals("") ) continue; + + final String[] parts = line.split("\\s+"); if ( parts.length != nExpectedFields ) throw new UserException.MalformedFile(reader.toString(), "Bad PED line " + lineNo + ": wrong number of fields"); diff --git a/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java index e68d169ea..16c1d178b 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java @@ -107,6 +107,14 @@ public class PedReaderUnitTest extends BaseTest { "fam1 s1 0 0 1 1", "fam2 s2 0 0 2 2")); + new PedReaderTest("multipleUnrelatedExtraLine", + Arrays.asList( + new Sample("s1", "fam1", null, null, Gender.MALE, Affection.UNAFFECTED), + new Sample("s2", "fam2", null, null, Gender.FEMALE, Affection.AFFECTED)), + String.format("%s%n%s%n %n", // note extra newlines and whitespace + "fam1 s1 0 0 1 1", + "fam2 s2 0 0 2 2")); + new PedReaderTest("explicitTrio", Arrays.asList( new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED), From 89ac50e86e52bd014f11efd568e73ae52ade6ab2 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 3 Oct 2011 09:33:30 -0700 Subject: [PATCH 27/63] SampleDataSource -> SampleDB --- .../sting/gatk/GenomeAnalysisEngine.java | 12 +++++----- .../sting/gatk/executive/WindowMaker.java | 1 - .../sting/gatk/samples/PedReader.java | 8 +++---- .../sting/gatk/samples/Sample.java | 22 +++++++++---------- .../{SampleDataSource.java => SampleDB.java} | 14 ++++++------ .../sting/gatk/walkers/Walker.java | 4 ++-- 
.../providers/LocusViewTemplate.java | 1 - .../reads/DownsamplerBenchmark.java | 2 -- .../sting/gatk/samples/PedReaderUnitTest.java | 5 ++--- .../samples/SampleDataSourceUnitTest.java | 7 +----- .../sting/gatk/samples/SampleUnitTest.java | 4 ++-- 11 files changed, 34 insertions(+), 46 deletions(-) rename public/java/src/org/broadinstitute/sting/gatk/samples/{SampleDataSource.java => SampleDB.java} (94%) diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 52544fbd2..a9a7de75f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -34,8 +34,7 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.datasources.reads.*; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.samples.Sample; -import org.broadinstitute.sting.gatk.samples.SampleDataSource; +import org.broadinstitute.sting.gatk.samples.SampleDB; import org.broadinstitute.sting.gatk.executive.MicroScheduler; import org.broadinstitute.sting.gatk.filters.FilterManager; import org.broadinstitute.sting.gatk.filters.ReadFilter; @@ -51,7 +50,6 @@ import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.interval.IntervalUtils; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.File; import java.util.*; @@ -88,7 +86,7 @@ public class GenomeAnalysisEngine { /** * Accessor for sample metadata */ - private SampleDataSource sampleDataSource = null; + private SampleDB sampleDB = null; /** * Accessor for sharded 
reference-ordered data. @@ -688,7 +686,7 @@ public class GenomeAnalysisEngine { for (ReadFilter filter : filters) filter.initialize(this); - sampleDataSource = new SampleDataSource(getSAMFileHeader(), argCollection.sampleFiles); + sampleDB = new SampleDB(getSAMFileHeader(), argCollection.sampleFiles); // set the sequence dictionary of all of Tribble tracks to the sequence dictionary of our reference rodDataSources = getReferenceOrderedDataSources(referenceMetaDataFiles,referenceDataSource.getReference().getSequenceDictionary(),genomeLocParser,argCollection.unsafe); @@ -953,8 +951,8 @@ public class GenomeAnalysisEngine { // // ------------------------------------------------------------------------------------- - public SampleDataSource getSampleDB() { - return this.sampleDataSource; + public SampleDB getSampleDB() { + return this.sampleDB; } public Map getApproximateCommandLineArguments(Object... argumentProviders) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java index 825a81e64..d1f5d80da 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java @@ -4,7 +4,6 @@ import net.sf.picard.util.PeekableIterator; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.datasources.reads.Shard; -import org.broadinstitute.sting.gatk.samples.SampleDataSource; import org.broadinstitute.sting.gatk.iterators.LocusIterator; import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java b/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java index 648637b09..d697498be 100644 --- 
a/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java @@ -138,17 +138,17 @@ public class PedReader { public PedReader() { } - public final List parse(File source, EnumSet missingFields, SampleDataSource sampleDB) throws FileNotFoundException { + public final List parse(File source, EnumSet missingFields, SampleDB sampleDB) throws FileNotFoundException { logger.info("Reading PED file " + source + " with missing fields: " + missingFields); return parse(new FileReader(source), missingFields, sampleDB); } - public final List parse(final String source, EnumSet missingFields, SampleDataSource sampleDB) { + public final List parse(final String source, EnumSet missingFields, SampleDB sampleDB) { logger.warn("Reading PED string: \"" + source + "\" with missing fields: " + missingFields); return parse(new StringReader(source.replace(";", String.format("%n"))), missingFields, sampleDB); } - public final List parse(Reader reader, EnumSet missingFields, SampleDataSource sampleDB) { + public final List parse(Reader reader, EnumSet missingFields, SampleDB sampleDB) { final List lines = new XReadLines(reader).readLines(); // What are the record offsets? 
@@ -245,7 +245,7 @@ public class PedReader { return string; } - private final Sample maybeAddImplicitSample(SampleDataSource sampleDB, final String id, final String familyID, final Gender gender) { + private final Sample maybeAddImplicitSample(SampleDB sampleDB, final String id, final String familyID, final Gender gender) { if ( id != null && sampleDB.getSample(id) == null ) { Sample s = new Sample(id, sampleDB, familyID, null, null, gender, Affection.UNKNOWN, Sample.UNSET_QT); sampleDB.addSample(s); diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java index 0a5043013..e68d92a9f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java @@ -14,12 +14,12 @@ public class Sample implements java.io.Serializable { final private double quantitativePhenotype; final private Affection affection; final private String ID; - final private SampleDataSource dataSource; + final private SampleDB infoDB; final private Map properties = new HashMap(); public final static double UNSET_QT = Double.NaN; - public Sample(final String ID, final SampleDataSource dataSource, + public Sample(final String ID, final SampleDB infoDB, final String familyID, final String paternalID, final String maternalID, final Gender gender, final Affection affection, final double quantitativePhenotype) { this.familyID = familyID; @@ -29,7 +29,7 @@ public class Sample implements java.io.Serializable { this.quantitativePhenotype = quantitativePhenotype; this.affection = affection; this.ID = ID; - this.dataSource = dataSource; + this.infoDB = infoDB; } protected Sample(final String ID, @@ -45,17 +45,17 @@ public class Sample implements java.io.Serializable { } - public Sample(final String ID, final SampleDataSource dataSource, + public Sample(final String ID, final SampleDB infoDB, final String familyID, final String paternalID, 
final String maternalID, final Gender gender) { - this(ID, dataSource, familyID, paternalID, maternalID, gender, Affection.UNKNOWN, UNSET_QT); + this(ID, infoDB, familyID, paternalID, maternalID, gender, Affection.UNKNOWN, UNSET_QT); } - public Sample(final String ID, final SampleDataSource dataSource, final Affection affection, final double quantitativePhenotype) { - this(ID, dataSource, null, null, null, Gender.UNKNOWN, affection, quantitativePhenotype); + public Sample(final String ID, final SampleDB infoDB, final Affection affection, final double quantitativePhenotype) { + this(ID, infoDB, null, null, null, Gender.UNKNOWN, affection, quantitativePhenotype); } - public Sample(String id, SampleDataSource dataSource) { - this(id, dataSource, null, null, null, + public Sample(String id, SampleDB infoDB) { + this(id, infoDB, null, null, null, Gender.UNKNOWN, Affection.UNKNOWN, UNSET_QT); } @@ -98,7 +98,7 @@ public class Sample implements java.io.Serializable { * @return sample object with relationship mother, if exists, or null */ public Sample getMother() { - return dataSource.getSample(maternalID); + return infoDB.getSample(maternalID); } /** @@ -106,7 +106,7 @@ public class Sample implements java.io.Serializable { * @return sample object with relationship father, if exists, or null */ public Sample getFather() { - return dataSource.getSample(paternalID); + return infoDB.getSample(paternalID); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java similarity index 94% rename from public/java/src/org/broadinstitute/sting/gatk/samples/SampleDataSource.java rename to public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java index b85759de2..6a2ec2ac4 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java @@ -24,7 +24,7 @@ import java.util.*; * 
wants to access sample data, it asks GenomeAnalysis to fetch this data from its SampleDataSource. * */ -public class SampleDataSource { +public class SampleDB { /** * This is where Sample objects are stored. Samples are usually accessed by their ID, which is unique, so * this is stored as a HashMap. @@ -34,11 +34,11 @@ public class SampleDataSource { /** * Constructor takes both a SAM header and sample files because the two must be integrated. */ - public SampleDataSource() { + public SampleDB() { } - public SampleDataSource(final SAMFileHeader header, final List sampleFiles) { + public SampleDB(final SAMFileHeader header, final List sampleFiles) { this(); addSamples(header); addSamples(sampleFiles); @@ -55,7 +55,7 @@ public class SampleDataSource { /** * Hallucinates sample objects for all the samples in the SAM file and stores them */ - protected SampleDataSource addSamples(SAMFileHeader header) { + protected SampleDB addSamples(SAMFileHeader header) { for (String sampleName : SampleUtils.getSAMFileSamples(header)) { if (getSample(sampleName) == null) { Sample newSample = new Sample(sampleName, this); @@ -65,7 +65,7 @@ public class SampleDataSource { return this; } - protected SampleDataSource addSamples(final List sampleFiles) { + protected SampleDB addSamples(final List sampleFiles) { // add files consecutively for (File file : sampleFiles) { addSamples(file); @@ -77,7 +77,7 @@ public class SampleDataSource { * Parse one sample file and integrate it with samples that are already there * Fail quickly if we find any errors in the file */ - protected SampleDataSource addSamples(File sampleFile) { + protected SampleDB addSamples(File sampleFile) { return this; } @@ -85,7 +85,7 @@ public class SampleDataSource { * Add a sample to the collection * @param sample to be added */ - protected SampleDataSource addSample(Sample sample) { + protected SampleDB addSample(Sample sample) { samples.put(sample.getID(), sample); return this; } diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java index f67dace2c..792fef9c3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java @@ -31,7 +31,7 @@ import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.filters.MalformedReadFilter; import org.broadinstitute.sting.gatk.samples.Sample; -import org.broadinstitute.sting.gatk.samples.SampleDataSource; +import org.broadinstitute.sting.gatk.samples.SampleDB; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.collections.Pair; @@ -88,7 +88,7 @@ public abstract class Walker { return getToolkit().getMasterSequenceDictionary(); } - protected SampleDataSource getSampleDB() { + protected SampleDB getSampleDB() { return getToolkit().getSampleDB(); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java index 8b226101a..2adb4864c 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java @@ -8,7 +8,6 @@ import org.broadinstitute.sting.gatk.datasources.reads.MockLocusShard; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; import org.broadinstitute.sting.gatk.datasources.reads.Shard; import org.broadinstitute.sting.gatk.executive.WindowMaker; -import org.broadinstitute.sting.gatk.samples.SampleDataSource; import org.broadinstitute.sting.gatk.datasources.reads.LocusShard; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import 
org.broadinstitute.sting.gatk.iterators.StingSAMIterator; diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java index 0d5734d43..5ee373e4f 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java @@ -33,14 +33,12 @@ import org.broadinstitute.sting.gatk.DownsamplingMethod; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; -import org.broadinstitute.sting.gatk.samples.SampleDataSource; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter; import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.baq.BAQ; -import java.io.File; import java.util.Collections; import java.util.Iterator; diff --git a/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java index 16c1d178b..c14995dca 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java @@ -32,7 +32,6 @@ import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.StringReader; -import java.lang.reflect.Array; import java.util.*; /** @@ -186,7 +185,7 @@ public class PedReaderUnitTest extends BaseTest { private static final void runTest(PedReaderTest test, String myFileContents, EnumSet missing) { logger.warn("Test " + test); PedReader reader = new PedReader(); - SampleDataSource 
sampleDB = new SampleDataSource(); + SampleDB sampleDB = new SampleDB(); List readSamples = reader.parse(myFileContents, missing, sampleDB); Assert.assertEquals(new HashSet(test.expectedSamples), new HashSet(readSamples), "Parsed incorrect number of samples"); } @@ -272,7 +271,7 @@ public class PedReaderUnitTest extends BaseTest { final String contents = sliceContents(test.missingFields, test.fileContents); logger.warn("Test " + test); PedReader reader = new PedReader(); - SampleDataSource sampleDB = new SampleDataSource(); + SampleDB sampleDB = new SampleDB(); reader.parse(new StringReader(contents), test.missingDesc, sampleDB); final Sample missingSample = sampleDB.getSample("kid"); Assert.assertEquals(test.expected, missingSample, "Missing field value not expected value for " + test); diff --git a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDataSourceUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDataSourceUnitTest.java index 3d40d4de8..90dd8e36e 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDataSourceUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDataSourceUnitTest.java @@ -1,12 +1,7 @@ package org.broadinstitute.sting.gatk.samples; import net.sf.samtools.SAMFileHeader; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.testng.Assert; import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.exceptions.StingException; import org.testng.annotations.Test; @@ -29,6 +24,6 @@ public class SampleDataSourceUnitTest extends BaseTest { // make sure samples are created from the SAM file correctly @Test() public void loadSAMSamplesTest() { - SampleDataSource s = new SampleDataSource(header, Collections.emptyList()); + SampleDB s = new SampleDB(header, Collections.emptyList()); } } diff 
--git a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java index 279319edb..372b59353 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java @@ -13,14 +13,14 @@ import org.testng.annotations.Test; * Time: 8:21:00 AM */ public class SampleUnitTest extends BaseTest { - SampleDataSource db; + SampleDB db; static Sample fam1A, fam1B, fam1C; static Sample s1, s2; static Sample trait1, trait2, trait3, trait4; @BeforeClass public void init() { - db = new SampleDataSource(); + db = new SampleDB(); fam1A = new Sample("1A", db, "fam1", "1B", "1C", Gender.UNKNOWN); fam1B = new Sample("1B", db, "fam1", null, null, Gender.MALE); From 8ee0f91904433e89cef8a3a3599504e5a71390da Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 3 Oct 2011 09:50:01 -0700 Subject: [PATCH 28/63] Remove residual processing tracker arguments --- .../arguments/GATKArgumentCollection.java | 34 ------------------- .../executive/HierarchicalMicroScheduler.java | 5 --- 2 files changed, 39 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index fd39d46b0..9ce402cf3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -218,26 +218,6 @@ public class GATKArgumentCollection { // distributed GATK arguments // // -------------------------------------------------------------------------------------------------------------- - @Element(required=false) - @Argument(fullName="processingTracker",shortName="C",doc="A lockable, shared file for coordinating distributed GATK runs",required=false) - @Hidden - public File 
processingTrackerFile = null; - - @Element(required=false) - @Argument(fullName="restartProcessingTracker",shortName="RPT",doc="Should we delete the processing tracker file at startup?",required=false) - @Hidden - public boolean restartProcessingTracker = false; - - @Element(required=false) - @Argument(fullName="processingTrackerStatusFile",shortName="CSF",doc="If provided, a detailed accounting of the state of the process tracker is written to this file. For debugging, only",required=false) - @Hidden - public File processingTrackerStatusFile = null; - - @Element(required=false) - @Argument(fullName="processingTrackerID",shortName="CID",doc="If provided, an integer ID (starting at 1) indicating a unique id for this process within the distributed GATK group",required=false) - @Hidden - public int processTrackerID = -1; - @Element(required = false) @Argument(fullName="allow_intervals_with_unindexed_bam",doc="Allow interval processing with an unsupported BAM. NO INTEGRATION TESTS are available. 
Use at your own risk.",required=false) @Hidden @@ -405,20 +385,6 @@ public class GATKArgumentCollection { (other.performanceLog != null && !other.performanceLog.equals(this.performanceLog))) return false; - if ((other.processingTrackerFile == null && this.processingTrackerFile != null) || - (other.processingTrackerFile != null && !other.processingTrackerFile.equals(this.processingTrackerFile))) - return false; - - if ((other.processingTrackerStatusFile == null && this.processingTrackerStatusFile != null) || - (other.processingTrackerStatusFile != null && !other.processingTrackerStatusFile.equals(this.processingTrackerStatusFile))) - return false; - - if ( restartProcessingTracker != other.restartProcessingTracker ) - return false; - - if ( processTrackerID != other.processTrackerID ) - return false; - if (allowIntervalsWithUnindexedBAM != other.allowIntervalsWithUnindexedBAM) return false; diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java index 3b9e35311..a07f735fa 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java @@ -84,12 +84,7 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar */ protected HierarchicalMicroScheduler(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods, int nThreadsToUse ) { super(engine, walker, reads, reference, rods); - this.threadPool = Executors.newFixedThreadPool(nThreadsToUse); - - if (engine.getArguments().processingTrackerFile != null) { - throw new UserException.BadArgumentValue("-C", "Distributed GATK calculations currently not supported in multi-threaded mode. 
Complain to Mark depristo@broadinstitute.org to implement and test this code path"); - } } public Object execute( Walker walker, ShardStrategy shardStrategy ) { From dd71884b0c095fc57ff3dbb1a06961ac771c97ee Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 3 Oct 2011 12:08:07 -0700 Subject: [PATCH 29/63] On path to SampleDB engine integration -- PedReader tag parser -- Separation of SampleDBBuilder from SampleDB (now immutable) -- Removed old sample engine arguments --- .../arguments/GATKArgumentCollection.java | 29 +++-- .../sting/gatk/samples/PedReader.java | 56 +++++++- .../gatk/samples/PedigreeValidationType.java | 34 +++++ .../sting/gatk/samples/SampleDB.java | 54 ++------ .../sting/gatk/samples/SampleDBBuilder.java | 121 ++++++++++++++++++ .../sting/gatk/samples/PedReaderUnitTest.java | 66 ++++++++++ ...rceUnitTest.java => SampleDBUnitTest.java} | 4 +- 7 files changed, 304 insertions(+), 60 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/samples/PedigreeValidationType.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java rename public/java/test/org/broadinstitute/sting/gatk/samples/{SampleDataSourceUnitTest.java => SampleDBUnitTest.java} (83%) diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index 9ce402cf3..c27bb26d9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -32,6 +32,7 @@ import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.gatk.DownsampleType; import org.broadinstitute.sting.gatk.DownsamplingMethod; import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; +import org.broadinstitute.sting.gatk.samples.PedigreeValidationType; import 
org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.interval.IntervalMergingRule; @@ -44,10 +45,7 @@ import org.simpleframework.xml.stream.HyphenStyle; import java.io.File; import java.io.InputStream; import java.io.PrintStream; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; /** * @author aaron @@ -72,11 +70,6 @@ public class GATKArgumentCollection { @Input(fullName = "input_file", shortName = "I", doc = "SAM or BAM file(s)", required = false) public List samFiles = new ArrayList(); - // parameters and their defaults - @ElementList(required = false) - @Argument(fullName = "sample_metadata", shortName = "SM", doc = "Sample file(s) in JSON format", required = false) - public List sampleFiles = new ArrayList(); - @Element(required = false) @Argument(fullName = "read_buffer_size", shortName = "rbs", doc="Number of reads per SAM file to buffer in memory", required = false) public Integer readBufferSize = null; @@ -215,9 +208,25 @@ public class GATKArgumentCollection { // -------------------------------------------------------------------------------------------------------------- // - // distributed GATK arguments + // PED (pedigree) support // // -------------------------------------------------------------------------------------------------------------- + + /** + * MARK: add documentation details + */ + @Argument(fullName="pedigree", shortName = "ped", doc="Pedigree file / string for samples",required=false) + public List pedigreeData = Collections.emptyList(); + + @Argument(fullName="pedigreeValidationType", shortName = "pedValidationType", doc="How strict should we be in validating the pedigree information?",required=false) + public PedigreeValidationType pedigreeValidationType = PedigreeValidationType.STRICT; + + // 
-------------------------------------------------------------------------------------------------------------- + // + // BAM indexing and sharding arguments + // + // -------------------------------------------------------------------------------------------------------------- + @Element(required = false) @Argument(fullName="allow_intervals_with_unindexed_bam",doc="Allow interval processing with an unsupported BAM. NO INTEGRATION TESTS are available. Use at your own risk.",required=false) @Hidden diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java b/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java index d697498be..ec49b0f60 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java @@ -114,10 +114,42 @@ public class PedReader { final static private Set CATAGORICAL_TRAIT_VALUES = new HashSet(Arrays.asList("-9", "0", "1", "2")); final static private String commentMarker = "#"; + /** + * An enum that specifies which, if any, of the standard PED fields are + * missing from the input records. For example, suppose we have the full record: + * + * "fam1 kid dad mom 1 2" + * + * indicating a male affected child. This can be parsed with the -ped x.ped argument + * to the GATK. Suppose we only have: + * + * "fam1 kid 1" + * + * we can parse the reduced version of this record with -ped:NO_PARENTS,NO_PHENOTYPE x.ped + */ public enum MissingPedField { + /** + * The PED records do not have the first (FAMILY_ID) argument. The family id + * will be set to null / empty. + */ NO_FAMILY_ID, + + /** + * The PED records do not have either the paternal or maternal IDs, so + * the corresponding IDs are set to null. + */ NO_PARENTS, + + /** + * The PED records do not have the GENDER field, so the sex of each + * sample will be set to UNKNOWN. 
+ */ NO_SEX, + + /** + * The PED records do not have the PHENOTYPE field, so the phenotype + * of each sample will be set to UNKNOWN. + */ NO_PHENOTYPE } @@ -233,8 +265,6 @@ public class PedReader { if ( mom != null ) samples.add(mom); } - - sampleDB.validate(samples); return samples; } @@ -253,4 +283,26 @@ public class PedReader { } else return null; } + + /** + * Parses a list of tags from the command line, assuming it comes from the GATK Engine + * tags, and returns the corresponding EnumSet. + * + * @param arg the actual engine arg, used for the UserException if there's an error + * @param tags a list of string tags that should be converted to the MissingPedField value + * @return the set of MissingPedField values named by tags (empty when tags is empty); an unrecognized tag raises UserException.BadArgumentValue + */ + public static final EnumSet parseMissingFieldTags(final Object arg, final List tags) { + final EnumSet missingFields = EnumSet.noneOf(MissingPedField.class); + + for ( final String tag : tags ) { + try { + missingFields.add(MissingPedField.valueOf(tag)); + } catch ( IllegalArgumentException e ) { + throw new UserException.BadArgumentValue(arg.toString(), "Unknown tag " + tag + " allowed values are " + Arrays.toString(MissingPedField.values())); + } + } + + return missingFields; + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/PedigreeValidationType.java b/public/java/src/org/broadinstitute/sting/gatk/samples/PedigreeValidationType.java new file mode 100644 index 000000000..8a1a4f225 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/PedigreeValidationType.java @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software
is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.samples; + +/** +* +*/ +public enum PedigreeValidationType { + STRICT, + LINIENT, + SILENT, +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java index 6a2ec2ac4..75b37d758 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java @@ -38,51 +38,9 @@ public class SampleDB { } - public SampleDB(final SAMFileHeader header, final List sampleFiles) { - this(); - addSamples(header); - addSamples(sampleFiles); - } - - // -------------------------------------------------------------------------------- - // - // Functions for adding samples to the DB - // - // TODO: these should be protected, really - // - // -------------------------------------------------------------------------------- - /** - * Hallucinates sample objects for all the samples in the SAM file and stores them - */ - protected SampleDB addSamples(SAMFileHeader header) { - for (String sampleName : SampleUtils.getSAMFileSamples(header)) { - if (getSample(sampleName) == null) { - Sample newSample = new Sample(sampleName, this); - samples.put(sampleName, newSample); - } - } - return this; - 
} - - protected SampleDB addSamples(final List sampleFiles) { - // add files consecutively - for (File file : sampleFiles) { - addSamples(file); - } - return this; - } - - /** - * Parse one sample file and integrate it with samples that are already there - * Fail quickly if we find any errors in the file - */ - protected SampleDB addSamples(File sampleFile) { - return this; - } - - /** - * Add a sample to the collection + * Protected function to add a single sample to the database + * * @param sample to be added */ protected SampleDB addSample(Sample sample) { @@ -215,10 +173,14 @@ public class SampleDB { // -------------------------------------------------------------------------------- public final void validate() { - validate(getSamples()); + validate(getSamples(), PedigreeValidationType.STRICT); } - public final void validate(Collection samplesToCheck) { + public final void validate(PedigreeValidationType validationType) { + validate(getSamples(), validationType); + } + public final void validate(Collection samplesToCheck, PedigreeValidationType validationType) { + // todo -- actually do an implementation } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java new file mode 100644 index 000000000..33bed89d2 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above 
copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.samples; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMReadGroupRecord; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.exceptions.StingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.Genotype; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.*; + +/** + * + */ +public class SampleDBBuilder { + PedigreeValidationType validationStrictness; + final SampleDB sampleDB = new SampleDB(); + final GenomeAnalysisEngine engine; + + /** + * Creates a builder that uses the engine to look up the tags on pedigree arguments and validates the final SampleDB with the given strictness; samples are supplied afterwards through the addSamples methods. 
+ */ + public SampleDBBuilder(GenomeAnalysisEngine engine, PedigreeValidationType validationStrictness) { + this.engine = engine; + this.validationStrictness = validationStrictness; + } + + /** + * Hallucinates sample objects for all the samples in the SAM file and stores them + */ + public SampleDBBuilder addSamples(SAMFileHeader header) { + for (String sampleName : SampleUtils.getSAMFileSamples(header)) { + if (sampleDB.getSample(sampleName) == null) { + final Sample newSample = new Sample(sampleName, sampleDB); + addSample(newSample); + } + } + return this; + } + + public SampleDBBuilder addSamples(final List pedigreeArguments) { + for (final String ped : pedigreeArguments) { + final File pedFile = new File(ped); + if ( pedFile.exists() ) + addSamples(pedFile); + else + addSamples(ped); + } + + return this; + } + + /** + * Parse one sample file and integrate it with samples that are already there + * Fail quickly if we find any errors in the file + */ + protected SampleDBBuilder addSamples(File sampleFile) { + final PedReader reader = new PedReader(); + + try { + reader.parse(sampleFile, getMissingFields(sampleFile), sampleDB); + } catch ( FileNotFoundException e ) { + throw new UserException.CouldNotReadInputFile(sampleFile, e); + } + + return this; + } + + protected SampleDBBuilder addSamples(final String string) { + final PedReader reader = new PedReader(); + reader.parse(string, getMissingFields(string), sampleDB); + return this; + } + + /** + * Add a sample to the collection + * @param sample to be added + */ + protected SampleDBBuilder addSample(Sample sample) { + sampleDB.addSample(sample); + return this; + } + + public SampleDB getFinalSampleDB() { + sampleDB.validate(validationStrictness); + return sampleDB; + } + + public EnumSet getMissingFields(final Object engineArg) { + final List posTags = engine.getTags(engineArg).getPositionalTags(); + return PedReader.parseMissingFieldTags(engineArg, posTags); + } +} diff --git 
a/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java index c14995dca..57bc6cf3b 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.samples; import org.apache.log4j.Logger; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -285,4 +286,69 @@ public class PedReaderUnitTest extends BaseTest { parts.remove(field.ordinal()); return Utils.join("\t", parts); } + + // ----------------------------------------------------------------- + // parsing tags + // ----------------------------------------------------------------- + + private class PedReaderTestTagParsing extends TestDataProvider { + public EnumSet expected; + public final List tags; + + private PedReaderTestTagParsing(final List tags, EnumSet missingDesc) { + super(PedReaderTestTagParsing.class); + this.tags = tags; + this.expected = missingDesc; + + } + } + + @DataProvider(name = "readerTestTagParsing") + public Object[][] createReaderTestTagParsing() { + new PedReaderTestTagParsing( + Collections.emptyList(), + EnumSet.noneOf(PedReader.MissingPedField.class)); + + new PedReaderTestTagParsing( + Arrays.asList("NO_FAMILY_ID"), + EnumSet.of(PedReader.MissingPedField.NO_FAMILY_ID)); + + new PedReaderTestTagParsing( + Arrays.asList("NO_PARENTS"), + EnumSet.of(PedReader.MissingPedField.NO_PARENTS)); + + new PedReaderTestTagParsing( + Arrays.asList("NO_PHENOTYPE"), + EnumSet.of(PedReader.MissingPedField.NO_PHENOTYPE)); + + new PedReaderTestTagParsing( + Arrays.asList("NO_SEX"), + EnumSet.of(PedReader.MissingPedField.NO_SEX)); + + new 
PedReaderTestTagParsing( + Arrays.asList("NO_SEX", "NO_PHENOTYPE"), + EnumSet.of(PedReader.MissingPedField.NO_SEX, PedReader.MissingPedField.NO_PHENOTYPE)); + + new PedReaderTestTagParsing( + Arrays.asList("NO_SEX", "NO_PHENOTYPE", "NO_PARENTS"), + EnumSet.of(PedReader.MissingPedField.NO_SEX, PedReader.MissingPedField.NO_PHENOTYPE, PedReader.MissingPedField.NO_PARENTS)); + + return PedReaderTestTagParsing.getTests(PedReaderTestTagParsing.class); + } + + @Test(enabled = true, dataProvider = "readerTestTagParsing") + public void testPedReaderTagParsing(PedReaderTestTagParsing test) { + EnumSet parsed = PedReader.parseMissingFieldTags("test", test.tags); + Assert.assertEquals(test.expected, parsed, "Failed to properly parse tags " + test.tags); + } + + @Test(enabled = true, expectedExceptions = UserException.class) + public void testPedReaderTagParsing1() { + EnumSet parsed = PedReader.parseMissingFieldTags("test", Arrays.asList("XXX")); + } + + @Test(enabled = true, expectedExceptions = UserException.class) + public void testPedReaderTagParsing2() { + EnumSet parsed = PedReader.parseMissingFieldTags("test", Arrays.asList("NO_SEX", "XXX")); + } } \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDataSourceUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java similarity index 83% rename from public/java/test/org/broadinstitute/sting/gatk/samples/SampleDataSourceUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java index 90dd8e36e..500d322db 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDataSourceUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java @@ -14,7 +14,7 @@ import java.util.*; * Date: Sep 9, 2010 * Time: 8:21:00 AM */ -public class SampleDataSourceUnitTest extends BaseTest { +public class SampleDBUnitTest extends BaseTest { // this empty header used to 
instantiate sampledatasource objects private static SAMFileHeader header = new SAMFileHeader(); @@ -24,6 +24,6 @@ public class SampleDataSourceUnitTest extends BaseTest { // make sure samples are created from the SAM file correctly @Test() public void loadSAMSamplesTest() { - SampleDB s = new SampleDB(header, Collections.emptyList()); + //SampleDB s = new SampleDB(header); } } From 2e3dc520882ebe9037f2ff001193c39f3313be4e Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 3 Oct 2011 14:41:13 -0700 Subject: [PATCH 30/63] Minor function renaming --- .../sting/gatk/GenomeAnalysisEngine.java | 17 +++++++++++++++-- .../sting/gatk/samples/SampleDBBuilder.java | 11 ++++++++--- .../sting/gatk/samples/PedReaderUnitTest.java | 1 - 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index a9a7de75f..71e65f2fb 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -44,6 +44,7 @@ import org.broadinstitute.sting.gatk.io.stubs.Stub; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; import org.broadinstitute.sting.gatk.refdata.utils.RMDIntervalGenerator; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; +import org.broadinstitute.sting.gatk.samples.SampleDBBuilder; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.baq.BAQ; @@ -686,10 +687,22 @@ public class GenomeAnalysisEngine { for (ReadFilter filter : filters) filter.initialize(this); - sampleDB = new SampleDB(getSAMFileHeader(), argCollection.sampleFiles); - // set the sequence dictionary of all of Tribble tracks to the sequence dictionary of our reference rodDataSources = 
getReferenceOrderedDataSources(referenceMetaDataFiles,referenceDataSource.getReference().getSequenceDictionary(),genomeLocParser,argCollection.unsafe); + + // set up sample db + initializeSampleDB(); + } + + /** + * Entry-point function to initialize the samples database from input data and pedigree arguments + */ + private void initializeSampleDB() { + SampleDBBuilder sampleDBBuilder = new SampleDBBuilder(this, argCollection.pedigreeValidationType); + sampleDBBuilder.addSamplesFromSAMHeader(getSAMFileHeader()); + sampleDBBuilder.addSamplesFromSampleNames(SampleUtils.getUniqueSamplesFromRods(this)); + sampleDBBuilder.addSamplesFromPedigreeArgument(argCollection.pedigreeData); + sampleDB = sampleDBBuilder.getFinalSampleDB(); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java index 33bed89d2..fd42a24f4 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java @@ -56,8 +56,12 @@ public class SampleDBBuilder { /** * Hallucinates sample objects for all the samples in the SAM file and stores them */ - public SampleDBBuilder addSamples(SAMFileHeader header) { - for (String sampleName : SampleUtils.getSAMFileSamples(header)) { + public SampleDBBuilder addSamplesFromSAMHeader(final SAMFileHeader header) { + return addSamplesFromSampleNames(SampleUtils.getSAMFileSamples(header)); + } + + public SampleDBBuilder addSamplesFromSampleNames(final Collection sampleNames) { + for (final String sampleName : sampleNames) { if (sampleDB.getSample(sampleName) == null) { final Sample newSample = new Sample(sampleName, sampleDB); addSample(newSample); @@ -66,7 +70,7 @@ public class SampleDBBuilder { return this; } - public SampleDBBuilder addSamples(final List pedigreeArguments) { + public SampleDBBuilder addSamplesFromPedigreeArgument(final List 
pedigreeArguments) { for (final String ped : pedigreeArguments) { final File pedFile = new File(ped); if ( pedFile.exists() ) @@ -105,6 +109,7 @@ public class SampleDBBuilder { * @param sample to be added */ protected SampleDBBuilder addSample(Sample sample) { + // todo -- merge with existing record if we have one sampleDB.addSample(sample); return this; } diff --git a/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java index 57bc6cf3b..e63fc7feb 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java @@ -299,7 +299,6 @@ public class PedReaderUnitTest extends BaseTest { super(PedReaderTestTagParsing.class); this.tags = tags; this.expected = missingDesc; - } } From 867a7476c1cd22cd2d65f76fff0b36be539e1912 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 3 Oct 2011 19:09:02 -0700 Subject: [PATCH 31/63] Systematic unit tests for the sample object --- .../sting/gatk/samples/SampleUnitTest.java | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java index 372b59353..c2c9d77c6 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java @@ -36,13 +36,25 @@ public class SampleUnitTest extends BaseTest { } /** - * Now test the special getter methods + * Now basic getters */ @Test() - public void specialGettersTest() { - // todo -- test for sample with extra properties, like population -// Assert.assertTrue(sampleC.getID().equals("sampleC")); -// Assert.assertTrue(sampleC.getPopulation().equals("pop1")); + public void normalGettersTest() { + 
Assert.assertEquals("1A", fam1A.getID()); + Assert.assertEquals("fam1", fam1A.getFamilyID()); + Assert.assertEquals("1B", fam1A.getPaternalID()); + Assert.assertEquals("1C", fam1A.getMaternalID()); + Assert.assertEquals(null, fam1B.getPaternalID()); + Assert.assertEquals(null, fam1B.getMaternalID()); + + Assert.assertEquals(Affection.AFFECTED, trait1.getAffection()); + Assert.assertEquals(Sample.UNSET_QT, trait1.getQuantitativePhenotype()); + Assert.assertEquals(Affection.UNAFFECTED, trait2.getAffection()); + Assert.assertEquals(Sample.UNSET_QT, trait2.getQuantitativePhenotype()); + Assert.assertEquals(Affection.UNKNOWN, trait3.getAffection()); + Assert.assertEquals(Sample.UNSET_QT, trait3.getQuantitativePhenotype()); + Assert.assertEquals(Affection.QUANTITATIVE, trait4.getAffection()); + Assert.assertEquals(1.0, trait4.getQuantitativePhenotype()); } @Test() From b20689ff5500f1ba37044b2289e1e867efdd1dd0 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 3 Oct 2011 19:20:33 -0700 Subject: [PATCH 32/63] No longer supports extraProperties -- the underlying data structure is still present, but until I decide what to do for the extensible system I've completely disabled the subsystem -- Added code to merge Samples, so that a mostly full record can be merged with a consistent empty record. 
If the two records are inconsistent, an error is thrown -- addSample() in Sample.class now invokes mergeSample() when appropriate -- Validation types are now only STRICT or SILENT -- Validation code implemented in SampleDBBuilder -- Extensive unit tests for SampleDBBuilder --- build.xml | 14 ++- .../sting/gatk/GenomeAnalysisEngine.java | 3 +- .../arguments/GATKArgumentCollection.java | 7 +- .../gatk/samples/PedigreeValidationType.java | 3 +- .../sting/gatk/samples/Sample.java | 90 ++++++++++----- .../sting/gatk/samples/SampleDB.java | 25 +---- .../sting/gatk/samples/SampleDBBuilder.java | 89 ++++++++++----- .../sting/gatk/samples/SampleDBUnitTest.java | 105 ++++++++++++++++-- 8 files changed, 241 insertions(+), 95 deletions(-) diff --git a/build.xml b/build.xml index 1f26e7b7a..ef662a160 100644 --- a/build.xml +++ b/build.xml @@ -146,12 +146,14 @@ - - + + + + + + + + diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 71e65f2fb..9cfe7d48b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -701,7 +701,8 @@ public class GenomeAnalysisEngine { SampleDBBuilder sampleDBBuilder = new SampleDBBuilder(this, argCollection.pedigreeValidationType); sampleDBBuilder.addSamplesFromSAMHeader(getSAMFileHeader()); sampleDBBuilder.addSamplesFromSampleNames(SampleUtils.getUniqueSamplesFromRods(this)); - sampleDBBuilder.addSamplesFromPedigreeArgument(argCollection.pedigreeData); + sampleDBBuilder.addSamplesFromPedigreeFiles(argCollection.pedigreeFiles); + sampleDBBuilder.addSamplesFromPedigreeStrings(argCollection.pedigreeStrings); sampleDB = sampleDBBuilder.getFinalSampleDB(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java 
index c27bb26d9..c71b3ce2c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -215,8 +215,11 @@ public class GATKArgumentCollection { /** * MARK: add documentation details */ - @Argument(fullName="pedigree", shortName = "ped", doc="Pedigree file / string for samples",required=false) - public List pedigreeData = Collections.emptyList(); + @Argument(fullName="pedigree", shortName = "ped", doc="Pedigree files for samples",required=false) + public List pedigreeFiles = Collections.emptyList(); + + @Argument(fullName="pedigreeString", shortName = "pedString", doc="Pedigree string for samples",required=false) + public List pedigreeStrings = Collections.emptyList(); @Argument(fullName="pedigreeValidationType", shortName = "pedValidationType", doc="How strict should we be in validating the pedigree information?",required=false) public PedigreeValidationType pedigreeValidationType = PedigreeValidationType.STRICT; diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/PedigreeValidationType.java b/public/java/src/org/broadinstitute/sting/gatk/samples/PedigreeValidationType.java index 8a1a4f225..209636b54 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/PedigreeValidationType.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/PedigreeValidationType.java @@ -29,6 +29,5 @@ package org.broadinstitute.sting.gatk.samples; */ public enum PedigreeValidationType { STRICT, - LINIENT, - SILENT, + SILENT } diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java index e68d92a9f..3e61e03d9 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java @@ -1,6 +1,8 @@ package org.broadinstitute.sting.gatk.samples; +import 
org.broadinstitute.sting.utils.exceptions.UserException; + import java.util.Collections; import java.util.HashMap; import java.util.Map; @@ -121,36 +123,36 @@ public class Sample implements java.io.Serializable { public String toString() { return String.format("Sample %s fam=%s dad=%s mom=%s gender=%s affection=%s qt=%s props=%s", getID(), getFamilyID(), getPaternalID(), getMaternalID(), getGender(), getAffection(), - getQuantitativePhenotype(), getExtraProperties()); + getQuantitativePhenotype(), properties); } - // ------------------------------------------------------------------------------------- - // - // code for working with additional -- none standard -- properites - // - // ------------------------------------------------------------------------------------- - - public Map getExtraProperties() { - return Collections.unmodifiableMap(properties); - } - - /** - * Get one property - * @param key key of property - * @return value of property as generic object - */ - public Object getExtraPropertyValue(final String key) { - return properties.get(key); - } - - /** - * - * @param key property key - * @return true if sample has this property (even if its value is null) - */ - public boolean hasExtraProperty(String key) { - return properties.containsKey(key); - } +// // ------------------------------------------------------------------------------------- +// // +// // code for working with additional -- none standard -- properites +// // +// // ------------------------------------------------------------------------------------- +// +// public Map getExtraProperties() { +// return Collections.unmodifiableMap(properties); +// } +// +// /** +// * Get one property +// * @param key key of property +// * @return value of property as generic object +// */ +// public Object getExtraPropertyValue(final String key) { +// return properties.get(key); +// } +// +// /** +// * +// * @param key property key +// * @return true if sample has this property (even if its value is 
null) +// */ +// public boolean hasExtraProperty(String key) { +// return properties.containsKey(key); +// } @Override public int hashCode() { @@ -181,4 +183,36 @@ public class Sample implements java.io.Serializable { else return o2 == null ? false : o1.equals(o2); } + + private final static T mergeValues(final String name, final String field, final T o1, final T o2, final T emptyValue) { + if ( o1 == null || o1.equals(emptyValue) ) { + // take o2 if both are null, otherwise keep o2 + return o2 == null ? null : o2; + } else { + if ( o2 == null || o2.equals(emptyValue) ) + return o1; // keep o1, since it's a real value + else { + // both o1 and o2 have a value + if ( o1 == o2 ) + return o1; + else + throw new UserException("Inconsistent values detected for " + name + " for field " + field + " value1 " + o1 + " value2 " + o2); + } + } + } + + public final static Sample mergeSamples(final Sample prev, final Sample next) { + if ( prev.equals(next) ) + return next; + else { + return new Sample(prev.getID(), prev.infoDB, + mergeValues(prev.getID(), "Family_ID", prev.getFamilyID(), next.getFamilyID(), null), + mergeValues(prev.getID(), "Paternal_ID", prev.getPaternalID(), next.getPaternalID(), null), + mergeValues(prev.getID(), "Material_ID", prev.getMaternalID(), next.getMaternalID(), null), + mergeValues(prev.getID(), "Gender", prev.getGender(), next.getGender(), Gender.UNKNOWN), + mergeValues(prev.getID(), "Affection", prev.getAffection(), next.getAffection(), Affection.UNKNOWN), + mergeValues(prev.getID(), "QuantitativeTrait", prev.getQuantitativePhenotype(), next.getQuantitativePhenotype(), UNSET_QT)); + //mergeValues(prev.getID(), "ExtraProperties", prev.getExtraProperties(), next.getExtraProperties(), Collections.emptyMap())); + } + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java index 75b37d758..9abc28517 100644 --- 
a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java @@ -44,6 +44,9 @@ public class SampleDB { * @param sample to be added */ protected SampleDB addSample(Sample sample) { + Sample prev = samples.get(sample.getID()); + if ( prev != null ) + sample = Sample.mergeSamples(prev, sample); samples.put(sample.getID(), sample); return this; } @@ -138,8 +141,8 @@ public class SampleDB { return children; } - public Collection getSamples() { - return Collections.unmodifiableCollection(samples.values()); + public Set getSamples() { + return new HashSet(samples.values()); } public Collection getSampleNames() { @@ -165,22 +168,4 @@ public class SampleDB { } return samples; } - - // -------------------------------------------------------------------------------- - // - // Validation - // - // -------------------------------------------------------------------------------- - - public final void validate() { - validate(getSamples(), PedigreeValidationType.STRICT); - } - - public final void validate(PedigreeValidationType validationType) { - validate(getSamples(), validationType); - } - - public final void validate(Collection samplesToCheck, PedigreeValidationType validationType) { - // todo -- actually do an implementation - } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java index fd42a24f4..87733d1f6 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java @@ -45,6 +45,15 @@ public class SampleDBBuilder { final SampleDB sampleDB = new SampleDB(); final GenomeAnalysisEngine engine; + Set samplesFromDataSources = new HashSet(); + Set samplesFromPedigrees = new HashSet(); + + /** for testing only */ + protected SampleDBBuilder(PedigreeValidationType 
validationStrictness) { + engine = null; + this.validationStrictness = validationStrictness; + } + /** * Constructor takes both a SAM header and sample files because the two must be integrated. */ @@ -57,26 +66,34 @@ public class SampleDBBuilder { * Hallucinates sample objects for all the samples in the SAM file and stores them */ public SampleDBBuilder addSamplesFromSAMHeader(final SAMFileHeader header) { - return addSamplesFromSampleNames(SampleUtils.getSAMFileSamples(header)); + addSamplesFromSampleNames(SampleUtils.getSAMFileSamples(header)); + return this; } public SampleDBBuilder addSamplesFromSampleNames(final Collection sampleNames) { for (final String sampleName : sampleNames) { if (sampleDB.getSample(sampleName) == null) { final Sample newSample = new Sample(sampleName, sampleDB); - addSample(newSample); + sampleDB.addSample(newSample); + samplesFromDataSources.add(newSample); // keep track of data source samples } } return this; } - public SampleDBBuilder addSamplesFromPedigreeArgument(final List pedigreeArguments) { - for (final String ped : pedigreeArguments) { - final File pedFile = new File(ped); - if ( pedFile.exists() ) - addSamples(pedFile); - else - addSamples(ped); + public SampleDBBuilder addSamplesFromPedigreeFiles(final List pedigreeFiles) { + for (final File pedFile : pedigreeFiles) { + Collection samples = addSamplesFromPedigreeArgument(pedFile); + samplesFromPedigrees.addAll(samples); + } + + return this; + } + + public SampleDBBuilder addSamplesFromPedigreeStrings(final List pedigreeStrings) { + for (final String pedString : pedigreeStrings) { + Collection samples = addSamplesFromPedigreeArgument(pedString); + samplesFromPedigrees.addAll(samples); } return this; @@ -86,41 +103,55 @@ public class SampleDBBuilder { * Parse one sample file and integrate it with samples that are already there * Fail quickly if we find any errors in the file */ - protected SampleDBBuilder addSamples(File sampleFile) { + private Collection 
addSamplesFromPedigreeArgument(File sampleFile) { final PedReader reader = new PedReader(); try { - reader.parse(sampleFile, getMissingFields(sampleFile), sampleDB); + return reader.parse(sampleFile, getMissingFields(sampleFile), sampleDB); } catch ( FileNotFoundException e ) { throw new UserException.CouldNotReadInputFile(sampleFile, e); } - - return this; } - protected SampleDBBuilder addSamples(final String string) { + private Collection addSamplesFromPedigreeArgument(final String string) { final PedReader reader = new PedReader(); - reader.parse(string, getMissingFields(string), sampleDB); - return this; - } - - /** - * Add a sample to the collection - * @param sample to be added - */ - protected SampleDBBuilder addSample(Sample sample) { - // todo -- merge with existing record if we have one - sampleDB.addSample(sample); - return this; + return reader.parse(string, getMissingFields(string), sampleDB); } public SampleDB getFinalSampleDB() { - sampleDB.validate(validationStrictness); + validate(); return sampleDB; } public EnumSet getMissingFields(final Object engineArg) { - final List posTags = engine.getTags(engineArg).getPositionalTags(); - return PedReader.parseMissingFieldTags(engineArg, posTags); + if ( engine == null ) + return EnumSet.noneOf(PedReader.MissingPedField.class); + else { + final List posTags = engine.getTags(engineArg).getPositionalTags(); + return PedReader.parseMissingFieldTags(engineArg, posTags); + } + } + + // -------------------------------------------------------------------------------- + // + // Validation + // + // -------------------------------------------------------------------------------- + + protected final void validate() { + if ( validationStrictness == PedigreeValidationType.SILENT ) + return; + else { + // check that samples in data sources are all annotated, if anything is annotated + if ( ! samplesFromPedigrees.isEmpty() && ! 
samplesFromDataSources.isEmpty() ) { + final Set sampleNamesFromPedigrees = new HashSet(); + for ( final Sample pSample : samplesFromPedigrees ) + sampleNamesFromPedigrees.add(pSample.getID()); + + for ( final Sample dsSample : samplesFromDataSources ) + if ( ! sampleNamesFromPedigrees.contains(dsSample.getID()) ) + throw new UserException("Sample " + dsSample.getID() + " found in data sources but not in pedigree files"); + } + } } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java index 500d322db..f6d3b42b8 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java @@ -3,6 +3,11 @@ package org.broadinstitute.sting.gatk.samples; import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; import org.testng.annotations.Test; import java.io.File; @@ -15,15 +20,101 @@ import java.util.*; * Time: 8:21:00 AM */ public class SampleDBUnitTest extends BaseTest { - // this empty header used to instantiate sampledatasource objects - private static SAMFileHeader header = new SAMFileHeader(); - + private static SampleDBBuilder builder; // all the test sample files are located here - private String sampleFilesDir = validationDataLocation + "samples/"; + private File testPED = new File(testDir + "ceutrio.ped"); + + private static final Set testPEDSamples = new HashSet(Arrays.asList( + new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED), + new Sample("dad", "fam1", null, null, Gender.MALE, Affection.UNAFFECTED), + new Sample("mom", "fam1", null, null, 
Gender.FEMALE, Affection.AFFECTED))); + + private static final Set testSAMSamples = new HashSet(Arrays.asList( + new Sample("kid", null, null, null, Gender.UNKNOWN, Affection.UNKNOWN), + new Sample("mom", null, null, null, Gender.UNKNOWN, Affection.UNKNOWN), + new Sample("dad", null, null, null, Gender.UNKNOWN, Affection.UNKNOWN))); + + private static final String testPEDString = + String.format("%s%n%s%n%s", + "fam1 kid dad mom 1 2", + "fam1 dad 0 0 1 1", + "fam1 mom 0 0 2 2"); + + private static final String testPEDStringInconsistentGender = + "fam1 kid 0 0 2 2"; + + private static final Set testPEDSamplesAsSet = + new HashSet(testPEDSamples); + + + @BeforeMethod + public void before() { + builder = new SampleDBBuilder(PedigreeValidationType.STRICT); + } - // make sure samples are created from the SAM file correctly @Test() - public void loadSAMSamplesTest() { - //SampleDB s = new SampleDB(header); + public void loadPEDFile() { + builder.addSamplesFromPedigreeFiles(Arrays.asList(testPED)); + SampleDB db = builder.getFinalSampleDB(); + Assert.assertEquals(testPEDSamplesAsSet, db.getSamples()); + } + + @Test() + public void loadPEDString() { + builder.addSamplesFromPedigreeStrings(Arrays.asList(testPEDString)); + SampleDB db = builder.getFinalSampleDB(); + Assert.assertEquals(testPEDSamplesAsSet, db.getSamples()); + } + + private static final void addSAMHeader() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10); + ArtificialSAMUtils.createEnumeratedReadGroups(header, Arrays.asList("1", "2", "3"), + Arrays.asList("kid", "mom", "dad")); + builder.addSamplesFromSAMHeader(header); + } + + @Test() + public void loadSAMHeader() { + addSAMHeader(); + SampleDB db = builder.getFinalSampleDB(); + Assert.assertEquals(testSAMSamples, db.getSamples()); + } + + @Test() + public void loadSAMHeaderPlusPED() { + addSAMHeader(); + builder.addSamplesFromPedigreeFiles(Arrays.asList(testPED)); + SampleDB db = builder.getFinalSampleDB(); + 
Assert.assertEquals(testPEDSamples, db.getSamples()); + } + + @Test() + public void loadDuplicateData() { + builder.addSamplesFromPedigreeFiles(Arrays.asList(testPED)); + builder.addSamplesFromPedigreeFiles(Arrays.asList(testPED)); + SampleDB db = builder.getFinalSampleDB(); + Assert.assertEquals(testPEDSamples, db.getSamples()); + } + + @Test(expectedExceptions = UserException.class) + public void loadNonExistentFile() { + builder.addSamplesFromPedigreeFiles(Arrays.asList(new File("non-existence-file.txt"))); + SampleDB db = builder.getFinalSampleDB(); + Assert.assertEquals(testSAMSamples, db.getSamples()); + } + + @Test(expectedExceptions = UserException.class) + public void loadInconsistentData() { + builder = new SampleDBBuilder(PedigreeValidationType.STRICT); + builder.addSamplesFromPedigreeFiles(Arrays.asList(testPED)); + builder.addSamplesFromPedigreeStrings(Arrays.asList(testPEDStringInconsistentGender)); + builder.getFinalSampleDB(); + } + + @Test(expectedExceptions = UserException.class) + public void sampleInSAMHeaderNotInSamplesDB() { + addSAMHeader(); + builder.addSamplesFromPedigreeStrings(Arrays.asList(testPEDStringInconsistentGender)); + builder.getFinalSampleDB(); } } From a27641e1fc549f631091cd34f2a9343d22eb2f5a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 4 Oct 2011 06:28:36 -0700 Subject: [PATCH 34/63] Cleaned up imports --- .../src/org/broadinstitute/sting/gatk/samples/Sample.java | 1 - .../src/org/broadinstitute/sting/gatk/samples/SampleDB.java | 3 --- .../broadinstitute/sting/gatk/samples/SampleDBBuilder.java | 4 ---- .../broadinstitute/sting/gatk/samples/SampleDBUnitTest.java | 6 +++--- .../broadinstitute/sting/gatk/samples/SampleUnitTest.java | 3 +-- 5 files changed, 4 insertions(+), 13 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java index 3e61e03d9..d57668715 100644 --- 
a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java @@ -3,7 +3,6 @@ package org.broadinstitute.sting.gatk.samples; import org.broadinstitute.sting.utils.exceptions.UserException; -import java.util.Collections; import java.util.HashMap; import java.util.Map; diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java index 9abc28517..4bcf3c938 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java @@ -1,13 +1,10 @@ package org.broadinstitute.sting.gatk.samples; -import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMReadGroupRecord; import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.variantcontext.Genotype; -import java.io.File; import java.util.*; /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java index 87733d1f6..807b150b2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java @@ -25,13 +25,9 @@ package org.broadinstitute.sting.gatk.samples; import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMReadGroupRecord; -import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.Genotype; import java.io.File; import java.io.FileNotFoundException; diff 
--git a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java index f6d3b42b8..b6b4fab54 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java @@ -2,8 +2,6 @@ package org.broadinstitute.sting.gatk.samples; import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; - -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.testng.Assert; @@ -11,7 +9,9 @@ import org.testng.annotations.BeforeMethod; import org.testng.annotations.Test; import java.io.File; -import java.util.*; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; /** * Created by IntelliJ IDEA. diff --git a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java index c2c9d77c6..bc8a98c22 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java @@ -1,8 +1,7 @@ package org.broadinstitute.sting.gatk.samples; -import org.testng.Assert; import org.broadinstitute.sting.BaseTest; - +import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; From f552aede420a62173784d19a5cfd4a13b77d17cc Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 4 Oct 2011 06:50:12 -0700 Subject: [PATCH 35/63] Only provide the sample names in the BAM file for efficiency --- .../sting/gatk/executive/LinearMicroScheduler.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java 
b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index a5d1370ba..deafcd0cc 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -13,6 +13,7 @@ import org.broadinstitute.sting.gatk.io.DirectOutputTracker; import org.broadinstitute.sting.gatk.io.OutputTracker; import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.utils.SampleUtils; import java.util.Collection; @@ -57,7 +58,7 @@ public class LinearMicroScheduler extends MicroScheduler { if(shard.getShardType() == Shard.ShardType.LOCUS) { LocusWalker lWalker = (LocusWalker)walker; WindowMaker windowMaker = new WindowMaker(shard, engine.getGenomeLocParser(), - getReadIterator(shard), shard.getGenomeLocs(), engine.getSampleDB().getSampleNames()); + getReadIterator(shard), shard.getGenomeLocs(), SampleUtils.getSAMFileSamples(engine)); for(WindowMaker.WindowMakerIterator iterator: windowMaker) { ShardDataProvider dataProvider = new LocusShardDataProvider(shard,iterator.getSourceInfo(),engine.getGenomeLocParser(),iterator.getLocus(),iterator,reference,rods); Object result = traversalEngine.traverse(walker, dataProvider, accumulator.getReduceInit()); From fee89e47ffccd57a28019766a3e3a0ddff77b6c6 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 4 Oct 2011 06:50:54 -0700 Subject: [PATCH 36/63] Only throws an error when there are no samples but there are reads -- Handles the case when you are running a ROD traversal and yet the LIBS is still used to return null everywhere. 
--- .../sting/gatk/iterators/LocusIteratorByState.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java index d16502b1d..896d6e3a2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java @@ -280,7 +280,9 @@ public class LocusIteratorByState extends LocusIterator { this.samples = new ArrayList(samples); this.readStates = new ReadStateManager(samIterator,readInformation.getDownsamplingMethod()); - if ( this.samples.isEmpty() ) + // currently the GATK expects this LocusIteratorByState to accept empty sample lists, when + // there's no read data. So we need to throw this error only when samIterator.hasNext() is true + if ( this.samples.isEmpty() && samIterator.hasNext() ) throw new IllegalArgumentException("samples list must not be empty"); } From 343a7b6b2f93a1cc867df19a0dfb939d365af677 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 4 Oct 2011 08:14:00 -0700 Subject: [PATCH 37/63] Updating UG integration tests for arbitrary impact of sample order changes on downsampling --- .../walkers/genotyper/UnifiedGenotyperIntegrationTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 488b3ccd9..07b2f0566 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -29,7 +29,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void 
testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("e6639ea2dc81635c706e6c35921406d7")); + Arrays.asList("b27939251539439a382538e507e03507")); executeTest("test MultiSample Pilot1", spec); } @@ -280,7 +280,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec( baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation + "phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1, - Arrays.asList("4be308fd9e8167ebee677f62a7a753b7")); + Arrays.asList("37e891bf1ac40caec9ea228f39c27e44")); executeTest("test MultiSample 1000G Phase1 indels with complicated records emitting all sites", spec4); } From e1d6c7a50ade2d6a80e539a4fc451d03885b0779 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 4 Oct 2011 09:33:23 -0700 Subject: [PATCH 38/63] Updating MD5 that have changed due to sample ordering differences --- .../gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java index 59ac1a41e..411a5d0d2 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java @@ -73,7 +73,7 @@ public class DepthOfCoverageIntegrationTest extends WalkerTest { spec.addAuxFile("df0ba76e0e6082c0d29fcfd68efc6b77", 
createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_statistics")); spec.addAuxFile("7dcac2e8962c778081486332a4576dc3", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_summary")); spec.addAuxFile("a50011571334f17e950ad3ed1149e350", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_statistics")); - spec.addAuxFile("6f3260504295695d765af639539585c9", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_summary")); + spec.addAuxFile("c95a7a6840334cadd0e520939615c77b", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_summary")); execute("testBaseOutputNoFiltering",spec); } @@ -90,7 +90,7 @@ public class DepthOfCoverageIntegrationTest extends WalkerTest { spec.setOutputFileLocation(baseOutputFile); spec.addAuxFile("6ccd7d8970ba98cb95fe41636a070c1c",baseOutputFile); - spec.addAuxFile("0ee40f3e5091536c14e077b77557083a",createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_interval_summary")); + spec.addAuxFile("7d87783b3d98b928cac16d383ceca807",createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_interval_summary")); execute("testNoCoverageDueToFiltering",spec); } From 88c2fad64f193a42c8e7a66715b27b036a439389 Mon Sep 17 00:00:00 2001 From: Matt Hanna Date: Tue, 4 Oct 2011 13:14:39 -0400 Subject: [PATCH 39/63] Change vcf jar to use a classfileset to pull all dependencies. Should save Jim Robinson some detective work in the long run. 
--- build.xml | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/build.xml b/build.xml index 1f26e7b7a..34de6cee6 100644 --- a/build.xml +++ b/build.xml @@ -545,12 +545,11 @@ - - - - - - + + + + + From 941317167eb1d091603aa03b323de0ab36fa608b Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 4 Oct 2011 14:08:00 -0700 Subject: [PATCH 40/63] Updating MD5 for BAMs that I added a read group to --- .../sting/utils/interval/IntervalIntegrationTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java index 2fab1f287..178c09fa4 100644 --- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java @@ -76,7 +76,7 @@ public class IntervalIntegrationTest extends WalkerTest { // our base file File baseOutputFile = createTempFile("testUnmappedReadInclusion",".bam"); spec.setOutputFileLocation(baseOutputFile); - spec.addAuxFile("99c266d777e2e167b8153c858c305fda",createTempFileFromBase(baseOutputFile.getAbsolutePath())); + spec.addAuxFile("748a38ed5eb0a043dfc7b82f0d1e8063",createTempFileFromBase(baseOutputFile.getAbsolutePath())); spec.addAuxFile("fadcdf88597b9609c5f2a17f4c6eb455", createTempFileFromBase(baseOutputFile.getAbsolutePath().substring(0,baseOutputFile.getAbsolutePath().indexOf(".bam"))+".bai")); executeTest("testUnmappedReadInclusion",spec); @@ -97,7 +97,7 @@ public class IntervalIntegrationTest extends WalkerTest { File baseOutputFile = createTempFile("testUnmappedReadExclusion",".bam"); spec.setOutputFileLocation(baseOutputFile); spec.addAuxFile("8236f0b2df5a692e54751b08bc3836fa",createTempFileFromBase(baseOutputFile.getAbsolutePath())); - spec.addAuxFile("651b42456d31ba24e913297b71b32143", 
createTempFileFromBase(baseOutputFile.getAbsolutePath().substring(0,baseOutputFile.getAbsolutePath().indexOf(".bam"))+".bai")); + spec.addAuxFile("b341d808ecc33217f37c0c0cde2a3e2f", createTempFileFromBase(baseOutputFile.getAbsolutePath().substring(0,baseOutputFile.getAbsolutePath().indexOf(".bam"))+".bai")); executeTest("testUnmappedReadExclusion",spec); } From 463eab760447771e53b8831f4136cafbc5def736 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 4 Oct 2011 15:53:52 -0700 Subject: [PATCH 41/63] All MD5 mismatches for test are shown -- Now for tests like DoC, with 20 output md5s, you see all of the differences before failing. --- .../test/org/broadinstitute/sting/MD5DB.java | 31 ++++++++++++++----- .../org/broadinstitute/sting/WalkerTest.java | 21 ++++++++++--- 2 files changed, 40 insertions(+), 12 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/MD5DB.java b/public/java/test/org/broadinstitute/sting/MD5DB.java index 0194e114a..374a9f8da 100644 --- a/public/java/test/org/broadinstitute/sting/MD5DB.java +++ b/public/java/test/org/broadinstitute/sting/MD5DB.java @@ -129,7 +129,7 @@ public class MD5DB { System.out.printf("##### Skipping update, cannot write file %s%n", dbFile); } } else { - System.out.printf("##### MD5 file is up to date: %s%n", dbFile.getPath()); + //System.out.printf("##### MD5 file is up to date: %s%n", dbFile.getPath()); } } @@ -170,6 +170,18 @@ public class MD5DB { return bytes; } + public static class MD5Match { + final String md5; + final String failMessage; + boolean failed; + + public MD5Match(final String md5, final String failMessage, final boolean failed) { + this.md5 = md5; + this.failMessage = failMessage; + this.failed = failed; + } + } + /** * Tests a file MD5 against an expected value, returning the MD5. NOTE: This function WILL throw an exception if the MD5s are different. * @param name Name of the test. 
@@ -178,18 +190,21 @@ public class MD5DB { * @param parameterize If true or if expectedMD5 is an empty string, will print out the calculated MD5 instead of error text. * @return The calculated MD5. */ - public static String assertMatchingMD5(final String name, final File resultsFile, final String expectedMD5, final boolean parameterize) { - String filemd5sum = testFileMD5(name, resultsFile, expectedMD5, parameterize); + public static MD5Match assertMatchingMD5(final String name, final File resultsFile, final String expectedMD5, final boolean parameterize) { + final String filemd5sum = testFileMD5(name, resultsFile, expectedMD5, parameterize); + String failMessage = null; + boolean failed = false; if (parameterize || expectedMD5.equals("")) { // Don't assert } else if ( filemd5sum.equals(expectedMD5) ) { - System.out.println(String.format(" => %s PASSED", name)); + System.out.println(String.format(" => %s PASSED (expected=%s)", name, expectedMD5)); } else { - Assert.fail(String.format("%s has mismatching MD5s: expected=%s observed=%s", name, expectedMD5, filemd5sum)); + failed = true; + failMessage = String.format("%s has mismatching MD5s: expected=%s observed=%s", name, expectedMD5, filemd5sum); } - return filemd5sum; + return new MD5Match(filemd5sum, failMessage, failed); } @@ -218,8 +233,8 @@ public class MD5DB { System.out.println(String.format("PARAMETERIZATION[%s]: file %s has md5 = %s, stated expectation is %s, equal? = %b", name, resultsFile, filemd5sum, expectedMD5, filemd5sum.equals(expectedMD5))); } else { - System.out.println(String.format("Checking MD5 for %s [calculated=%s, expected=%s]", resultsFile, filemd5sum, expectedMD5)); - System.out.flush(); + //System.out.println(String.format("Checking MD5 for %s [calculated=%s, expected=%s]", resultsFile, filemd5sum, expectedMD5)); + //System.out.flush(); if ( ! expectedMD5.equals(filemd5sum) ) { // we are going to fail for real in assertEquals (so we are counted by the testing framework). 
diff --git a/public/java/test/org/broadinstitute/sting/WalkerTest.java b/public/java/test/org/broadinstitute/sting/WalkerTest.java index a1817e3c7..ca7653b58 100755 --- a/public/java/test/org/broadinstitute/sting/WalkerTest.java +++ b/public/java/test/org/broadinstitute/sting/WalkerTest.java @@ -52,7 +52,7 @@ public class WalkerTest extends BaseTest { GenomeAnalysisEngine.resetRandomGenerator(); } - public String assertMatchingMD5(final String name, final File resultsFile, final String expectedMD5) { + public MD5DB.MD5Match assertMatchingMD5(final String name, final File resultsFile, final String expectedMD5) { return MD5DB.assertMatchingMD5(name, resultsFile, expectedMD5, parameterize()); } @@ -84,10 +84,23 @@ public class WalkerTest extends BaseTest { public List assertMatchingMD5s(final String name, List resultFiles, List expectedMD5s) { List md5s = new ArrayList(); + List fails = new ArrayList(); + for (int i = 0; i < resultFiles.size(); i++) { - String md5 = assertMatchingMD5(name, resultFiles.get(i), expectedMD5s.get(i)); - maybeValidateSupplementaryFile(name, resultFiles.get(i)); - md5s.add(i, md5); + MD5DB.MD5Match result = assertMatchingMD5(name, resultFiles.get(i), expectedMD5s.get(i)); + if ( ! result.failed ) { + maybeValidateSupplementaryFile(name, resultFiles.get(i)); + md5s.add(result.md5); + } else { + fails.add(result); + } + } + + if ( ! 
fails.isEmpty() ) { + for ( final MD5DB.MD5Match fail : fails ) { + logger.warn("Fail: " + fail.failMessage); + } + Assert.fail("Test failed: " + name); } return md5s; From a45d985818d9cec38d2956847cb4f64375c71fcc Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 4 Oct 2011 15:54:09 -0700 Subject: [PATCH 42/63] TODO method stubs --- .../sting/gatk/samples/SampleDB.java | 71 +++++++++++-------- 1 file changed, 43 insertions(+), 28 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java index 4bcf3c938..5ba2252e4 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java @@ -4,6 +4,7 @@ import net.sf.samtools.SAMReadGroupRecord; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.variantcontext.Genotype; +import sun.reflect.generics.reflectiveObjects.NotImplementedException; import java.util.*; @@ -104,6 +105,48 @@ public class SampleDB { return samples.size(); } + public Set getSamples() { + return new HashSet(samples.values()); + } + + public Collection getSampleNames() { + return Collections.unmodifiableCollection(samples.keySet()); + } + + + /** + * Takes a collection of sample names and returns their corresponding sample objects + * Note that, since a set is returned, if you pass in a list with duplicates names there will not be any duplicates in the returned set + * @param sampleNameList Set of sample names + * @return Corresponding set of samples + */ + public Set getSamples(Collection sampleNameList) { + HashSet samples = new HashSet(); + for (String name : sampleNameList) { + try { + samples.add(getSample(name)); + } + catch (Exception e) { + throw new StingException("Could not get sample with the following ID: " + name, e); + } + } + return samples; + } + + // 
-------------------------------------------------------------------------------- + // + // Higher level pedigree functions + // + // -------------------------------------------------------------------------------- + + public Set getFamilyIDs() { + throw new NotImplementedException(); + } + + public Map> getFamilies() { + throw new NotImplementedException(); + } + /** * Return all samples with a given family ID * Note that this isn't terribly efficient (linear) - it may be worth adding a new family ID data structure for this @@ -137,32 +180,4 @@ public class SampleDB { } return children; } - - public Set getSamples() { - return new HashSet(samples.values()); - } - - public Collection getSampleNames() { - return Collections.unmodifiableCollection(samples.keySet()); - } - - - /** - * Takes a collection of sample names and returns their corresponding sample objects - * Note that, since a set is returned, if you pass in a list with duplicates names there will not be any duplicates in the returned set - * @param sampleNameList Set of sample names - * @return Corresponding set of samples - */ - public Set getSamples(Collection sampleNameList) { - HashSet samples = new HashSet(); - for (String name : sampleNameList) { - try { - samples.add(getSample(name)); - } - catch (Exception e) { - throw new StingException("Could not get sample with the following ID: " + name, e); - } - } - return samples; - } } From ffdfdcde3ff3340d822693cb27efc8f7b6aaeeb4 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 4 Oct 2011 15:54:45 -0700 Subject: [PATCH 43/63] Updating MD5s -- Interval test now uses RG containing BAM -- DoC sample name ordering has changed. 
--- .../coverage/DepthOfCoverageIntegrationTest.java | 14 +++++++------- .../utils/interval/IntervalIntegrationTest.java | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java index 411a5d0d2..646fb5e77 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java @@ -57,11 +57,11 @@ public class DepthOfCoverageIntegrationTest extends WalkerTest { // now add the expected files that get generated spec.addAuxFile("423571e4c05e7934322172654ac6dbb7", baseOutputFile); spec.addAuxFile("9df5e7e07efeb34926c94a724714c219", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_cumulative_coverage_counts")); - spec.addAuxFile("b9a7748e5aec4dc06daed893c901c00d", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_cumulative_coverage_proportions")); + spec.addAuxFile("229b9b5bc2141c86dbc69c8acc9eba6a", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_cumulative_coverage_proportions")); spec.addAuxFile("9cd395f47b329b9dd00ad024fcac9929", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_interval_statistics")); - spec.addAuxFile("aec669d64d9dd652dd088a5341835ea5", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_interval_summary")); - spec.addAuxFile("f6dbd74d32a48abe71ce08d300bce983", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_statistics")); - spec.addAuxFile("e3a3467ed259ee3680f8d01980f525b7", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_summary")); + spec.addAuxFile("471c34ad2e4f7228efd20702d5941ba9", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_interval_summary")); + 
spec.addAuxFile("9667c77284c2c08e647b162d0e9652d4", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_statistics")); + spec.addAuxFile("5a96c75f96d6fa6ee617451d731dae37", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_summary")); spec.addAuxFile("b82846df660f0aac8429aec57c2a62d6", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_cumulative_coverage_counts")); spec.addAuxFile("d32a8c425fadcc4c048bd8b48d0f61e5", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_cumulative_coverage_proportions")); spec.addAuxFile("7b9d0e93bf5b5313995be7010ef1f528", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_interval_statistics")); @@ -69,10 +69,10 @@ public class DepthOfCoverageIntegrationTest extends WalkerTest { spec.addAuxFile("e70952f241eebb9b5448f2e7cb288131", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_statistics")); spec.addAuxFile("054ed1e184f46d6a170dc9bf6524270c", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_summary")); spec.addAuxFile("d53431022f7387fe9ac47814ab1fcd88", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_cumulative_coverage_counts")); - spec.addAuxFile("650ee3714da7fbad7832c9d4ad49eb51", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_cumulative_coverage_proportions")); + spec.addAuxFile("a395dafde101971d2b9e5ddb6cd4b7d0", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_cumulative_coverage_proportions")); spec.addAuxFile("df0ba76e0e6082c0d29fcfd68efc6b77", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_statistics")); - spec.addAuxFile("7dcac2e8962c778081486332a4576dc3", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_summary")); - spec.addAuxFile("a50011571334f17e950ad3ed1149e350", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_statistics")); + spec.addAuxFile("e013cb5b11b0321a81c8dbd7c1863787", 
createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_summary")); + spec.addAuxFile("661160f571def8c323345b5859cfb9da", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_statistics")); spec.addAuxFile("c95a7a6840334cadd0e520939615c77b", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_summary")); execute("testBaseOutputNoFiltering",spec); diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java index 2fab1f287..178c09fa4 100644 --- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java @@ -76,7 +76,7 @@ public class IntervalIntegrationTest extends WalkerTest { // our base file File baseOutputFile = createTempFile("testUnmappedReadInclusion",".bam"); spec.setOutputFileLocation(baseOutputFile); - spec.addAuxFile("99c266d777e2e167b8153c858c305fda",createTempFileFromBase(baseOutputFile.getAbsolutePath())); + spec.addAuxFile("748a38ed5eb0a043dfc7b82f0d1e8063",createTempFileFromBase(baseOutputFile.getAbsolutePath())); spec.addAuxFile("fadcdf88597b9609c5f2a17f4c6eb455", createTempFileFromBase(baseOutputFile.getAbsolutePath().substring(0,baseOutputFile.getAbsolutePath().indexOf(".bam"))+".bai")); executeTest("testUnmappedReadInclusion",spec); @@ -97,7 +97,7 @@ public class IntervalIntegrationTest extends WalkerTest { File baseOutputFile = createTempFile("testUnmappedReadExclusion",".bam"); spec.setOutputFileLocation(baseOutputFile); spec.addAuxFile("8236f0b2df5a692e54751b08bc3836fa",createTempFileFromBase(baseOutputFile.getAbsolutePath())); - spec.addAuxFile("651b42456d31ba24e913297b71b32143", createTempFileFromBase(baseOutputFile.getAbsolutePath().substring(0,baseOutputFile.getAbsolutePath().indexOf(".bam"))+".bai")); + spec.addAuxFile("b341d808ecc33217f37c0c0cde2a3e2f", 
createTempFileFromBase(baseOutputFile.getAbsolutePath().substring(0,baseOutputFile.getAbsolutePath().indexOf(".bam"))+".bai")); executeTest("testUnmappedReadExclusion",spec); } From 9bd3ba4c7ed3a634a3abff5cae73a933e36ab61f Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 4 Oct 2011 16:04:52 -0700 Subject: [PATCH 44/63] Missed one MD5 --- .../sting/utils/interval/IntervalIntegrationTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java index 178c09fa4..379d79c84 100644 --- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java @@ -96,7 +96,7 @@ public class IntervalIntegrationTest extends WalkerTest { // our base file File baseOutputFile = createTempFile("testUnmappedReadExclusion",".bam"); spec.setOutputFileLocation(baseOutputFile); - spec.addAuxFile("8236f0b2df5a692e54751b08bc3836fa",createTempFileFromBase(baseOutputFile.getAbsolutePath())); + spec.addAuxFile("80887ba488e53dabd9596ff93070ae75",createTempFileFromBase(baseOutputFile.getAbsolutePath())); spec.addAuxFile("b341d808ecc33217f37c0c0cde2a3e2f", createTempFileFromBase(baseOutputFile.getAbsolutePath().substring(0,baseOutputFile.getAbsolutePath().indexOf(".bam"))+".bai")); executeTest("testUnmappedReadExclusion",spec); From d1d39943d0475af51e438bdbb945c79471142bdd Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 4 Oct 2011 21:00:15 -0700 Subject: [PATCH 45/63] Updating MD5 for BAMs that I added a read group to, part 2 --- .../sting/utils/interval/IntervalIntegrationTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java 
b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java index 178c09fa4..379d79c84 100644 --- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java @@ -96,7 +96,7 @@ public class IntervalIntegrationTest extends WalkerTest { // our base file File baseOutputFile = createTempFile("testUnmappedReadExclusion",".bam"); spec.setOutputFileLocation(baseOutputFile); - spec.addAuxFile("8236f0b2df5a692e54751b08bc3836fa",createTempFileFromBase(baseOutputFile.getAbsolutePath())); + spec.addAuxFile("80887ba488e53dabd9596ff93070ae75",createTempFileFromBase(baseOutputFile.getAbsolutePath())); spec.addAuxFile("b341d808ecc33217f37c0c0cde2a3e2f", createTempFileFromBase(baseOutputFile.getAbsolutePath().substring(0,baseOutputFile.getAbsolutePath().indexOf(".bam"))+".bai")); executeTest("testUnmappedReadExclusion",spec); From 51ecc20867004411b2e683d5f1281b205f5c029d Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 5 Oct 2011 09:55:05 -0700 Subject: [PATCH 46/63] getFamily() and associated methods implemented and tested -- Sample no longer serializable -- Sample now implements Comparable --- .../sting/gatk/samples/Sample.java | 7 ++- .../sting/gatk/samples/SampleDB.java | 47 +++++++++++------- .../sting/gatk/samples/SampleDBUnitTest.java | 49 ++++++++++++++++--- 3 files changed, 78 insertions(+), 25 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java index d57668715..8d19eb246 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java @@ -9,7 +9,7 @@ import java.util.Map; /** * */ -public class Sample implements java.io.Serializable { +public class Sample implements Comparable { // implements java.io.Serializable { final private 
String familyID, paternalID, maternalID; final private Gender gender; final private double quantitativePhenotype; @@ -118,6 +118,11 @@ public class Sample implements java.io.Serializable { return gender; } + @Override + public int compareTo(final Sample sample) { + return ID.compareTo(sample.getID()); + } + @Override public String toString() { return String.format("Sample %s fam=%s dad=%s mom=%s gender=%s affection=%s qt=%s props=%s", diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java index 5ba2252e4..2c63f93ff 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java @@ -4,7 +4,6 @@ import net.sf.samtools.SAMReadGroupRecord; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.variantcontext.Genotype; -import sun.reflect.generics.reflectiveObjects.NotImplementedException; import java.util.*; @@ -139,30 +138,42 @@ public class SampleDB { // // -------------------------------------------------------------------------------- - public Set getFamilyIDs() { - throw new NotImplementedException(); + /** + * Returns a sorted set of the family IDs in all samples (excluding null ids) + * @return + */ + public final Set getFamilyIDs() { + return getFamilies().keySet(); } - public Map> getFamilies() { - throw new NotImplementedException(); + /** + * Returns a map from family ID -> set of family members for all samples with + * non-null family ids + * + * @return + */ + public final Map> getFamilies() { + final Map> families = new TreeMap>(); + + for ( final Sample sample : samples.values() ) { + final String famID = sample.getFamilyID(); + if ( famID != null ) { + if ( ! 
families.containsKey(famID) ) + families.put(famID, new TreeSet()); + families.get(famID).add(sample); + } + } + + return families; } /** * Return all samples with a given family ID - * Note that this isn't terribly efficient (linear) - it may be worth adding a new family ID data structure for this * @param familyId * @return */ public Set getFamily(String familyId) { - HashSet familyMembers = new HashSet(); - - for (Sample sample : samples.values()) { - if (sample.getFamilyID() != null) { - if (sample.getFamilyID().equals(familyId)) - familyMembers.add(sample); - } - } - return familyMembers; + return getFamilies().get(familyId); } /** @@ -172,9 +183,9 @@ public class SampleDB { * @return */ public Set getChildren(Sample sample) { - HashSet children = new HashSet(); - for (Sample familyMember : getFamily(sample.getFamilyID())) { - if (familyMember.getMother() == sample || familyMember.getFather() == sample) { + final HashSet children = new HashSet(); + for ( final Sample familyMember : getFamily(sample.getFamilyID())) { + if ( familyMember.getMother() == sample || familyMember.getFather() == sample ) { children.add(familyMember); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java index b6b4fab54..d498ee61a 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java @@ -9,9 +9,7 @@ import org.testng.annotations.BeforeMethod; import org.testng.annotations.Test; import java.io.File; -import java.util.Arrays; -import java.util.HashSet; -import java.util.Set; +import java.util.*; /** * Created by IntelliJ IDEA. 
@@ -36,9 +34,17 @@ public class SampleDBUnitTest extends BaseTest { private static final String testPEDString = String.format("%s%n%s%n%s", - "fam1 kid dad mom 1 2", - "fam1 dad 0 0 1 1", - "fam1 mom 0 0 2 2"); + "fam1 kid dad mom 1 2", + "fam1 dad 0 0 1 1", + "fam1 mom 0 0 2 2"); + + private static final String testPEDMultipleFamilies = + String.format("%s%n%s%n%s%n%s%n%s", + "fam1 kid dad mom 1 2", + "fam1 dad 0 0 1 1", + "fam1 mom 0 0 2 2", + "fam3 s1 d1 m1 2 2", + "fam2 s2 d2 m2 2 2"); private static final String testPEDStringInconsistentGender = "fam1 kid 0 0 2 2"; @@ -117,4 +123,35 @@ public class SampleDBUnitTest extends BaseTest { builder.addSamplesFromPedigreeStrings(Arrays.asList(testPEDStringInconsistentGender)); builder.getFinalSampleDB(); } + + @Test() + public void getFamilyIDs() { + builder.addSamplesFromPedigreeStrings(Arrays.asList(testPEDMultipleFamilies)); + SampleDB db = builder.getFinalSampleDB(); + Assert.assertEquals(db.getFamilyIDs(), new TreeSet(Arrays.asList("fam1", "fam2", "fam3"))); + } + + @Test() + public void getFamily() { + builder.addSamplesFromPedigreeStrings(Arrays.asList(testPEDMultipleFamilies)); + SampleDB db = builder.getFinalSampleDB(); + Assert.assertEquals(db.getFamily("fam1"), testPEDSamplesAsSet); + } + + @Test() + public void loadFamilyIDs() { + builder.addSamplesFromPedigreeStrings(Arrays.asList(testPEDMultipleFamilies)); + SampleDB db = builder.getFinalSampleDB(); + Map> families = db.getFamilies(); + Assert.assertEquals(families.size(), 3); + Assert.assertEquals(families.keySet(), new TreeSet(Arrays.asList("fam1", "fam2", "fam3"))); + + for ( final String famID : families.keySet() ) { + final Set fam = families.get(famID); + Assert.assertEquals(fam.size(), 3); + for ( final Sample sample : fam ) { + Assert.assertEquals(sample.getFamilyID(), famID); + } + } + } } From e7c80f7c451255c21c05116ca3ca8cc651fee109 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 5 Oct 2011 12:26:33 -0700 Subject: [PATCH 47/63] 
Renaming quantitative trait to OtherPhenotype which is now a String not a double -- we can now use PED file to represent population data or other arbitrary phenotype data, not just doubles --- .../sting/gatk/samples/Affection.java | 8 ++--- .../sting/gatk/samples/PedReader.java | 10 ++++--- .../sting/gatk/samples/Sample.java | 30 +++++++++---------- .../sting/gatk/samples/SampleDB.java | 11 ------- .../sting/gatk/samples/PedReaderUnitTest.java | 16 +++++----- .../sting/gatk/samples/SampleUnitTest.java | 21 +++++++------ 6 files changed, 43 insertions(+), 53 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Affection.java b/public/java/src/org/broadinstitute/sting/gatk/samples/Affection.java index de0dba884..83e31f672 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/Affection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/Affection.java @@ -28,8 +28,8 @@ package org.broadinstitute.sting.gatk.samples; * Categorical sample trait for association and analysis * * Samples can have unknown status, be affected or unaffected by the - * categorical trait, or they can be marked as actually having a - * quantitative trait value (stored in an associated value in the Sample class) + * categorical trait, or they can be marked as actually having an + * other trait value (stored in an associated value in the Sample class) * * @author Mark DePristo * @since Sept. 
2011 @@ -41,6 +41,6 @@ public enum Affection { AFFECTED, /** Unaffected by the disease */ UNAFFECTED, - /** A quantitative trait: value of the trait is stored elsewhere */ - QUANTITATIVE + /** An "other" trait: value of the trait is stored elsewhere and is an arbitrary string */ + OTHER } diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java b/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java index ec49b0f60..c442409fb 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java @@ -63,6 +63,8 @@ import java.util.*; * A PED file must have 1 and only 1 phenotype in the sixth column. The phenotype can be either a * quantitative trait or an affection status column: PLINK will automatically detect which type * (i.e. based on whether a value other than 0, 1, 2 or the missing genotype code is observed). + * Note that the GATK actually supports arbitrary values for quantitative trait -- not just doubles -- + * and are actually representing these values as strings instead of doubles * * NOTE Quantitative traits with decimal points must be coded with a period/full-stop character and * not a comma, i.e. 2.394 not 2,394 @@ -212,7 +214,7 @@ public class PedReader { splits.add(parts); lineNo++; } - logger.info("Trait is quantitative? " + isQT); + logger.info("Phenotype is other? 
" + isQT); // now go through and parse each record lineNo = 1; @@ -220,7 +222,7 @@ public class PedReader { for ( final String[] parts : splits ) { String familyID = null, individualID, paternalID = null, maternalID = null; Gender sex = Gender.UNKNOWN; - double quantitativePhenotype = Sample.UNSET_QT; + String quantitativePhenotype = Sample.UNSET_QT; Affection affection = Affection.UNKNOWN; if ( familyPos != -1 ) familyID = maybeMissing(parts[familyPos]); @@ -239,8 +241,8 @@ public class PedReader { if ( parts[phenotypePos].equals(MISSING_VALUE1) ) affection = Affection.UNKNOWN; else { - affection = Affection.QUANTITATIVE; - quantitativePhenotype = Double.valueOf(parts[phenotypePos]); + affection = Affection.OTHER; + quantitativePhenotype = parts[phenotypePos]; } } else { if ( parts[phenotypePos].equals(MISSING_VALUE1) ) affection = Affection.UNKNOWN; diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java index 8d19eb246..b39fdd79d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java @@ -12,22 +12,22 @@ import java.util.Map; public class Sample implements Comparable { // implements java.io.Serializable { final private String familyID, paternalID, maternalID; final private Gender gender; - final private double quantitativePhenotype; + final private String otherPhenotype; final private Affection affection; final private String ID; final private SampleDB infoDB; final private Map properties = new HashMap(); - public final static double UNSET_QT = Double.NaN; + public final static String UNSET_QT = null; public Sample(final String ID, final SampleDB infoDB, final String familyID, final String paternalID, final String maternalID, - final Gender gender, final Affection affection, final double quantitativePhenotype) { + final Gender gender, final Affection affection, final String 
otherPhenotype) { this.familyID = familyID; this.paternalID = paternalID; this.maternalID = maternalID; this.gender = gender; - this.quantitativePhenotype = quantitativePhenotype; + this.otherPhenotype = otherPhenotype; this.affection = affection; this.ID = ID; this.infoDB = infoDB; @@ -35,8 +35,8 @@ public class Sample implements Comparable { // implements java.io.Serial protected Sample(final String ID, final String familyID, final String paternalID, final String maternalID, - final Gender gender, final Affection affection, final double quantitativePhenotype) { - this(ID, null, familyID, paternalID, maternalID, gender, affection, quantitativePhenotype); + final Gender gender, final Affection affection, final String otherPhenotype) { + this(ID, null, familyID, paternalID, maternalID, gender, affection, otherPhenotype); } protected Sample(final String ID, @@ -51,8 +51,8 @@ public class Sample implements Comparable { // implements java.io.Serial this(ID, infoDB, familyID, paternalID, maternalID, gender, Affection.UNKNOWN, UNSET_QT); } - public Sample(final String ID, final SampleDB infoDB, final Affection affection, final double quantitativePhenotype) { - this(ID, infoDB, null, null, null, Gender.UNKNOWN, affection, quantitativePhenotype); + public Sample(final String ID, final SampleDB infoDB, final Affection affection, final String otherPhenotype) { + this(ID, infoDB, null, null, null, Gender.UNKNOWN, affection, otherPhenotype); } public Sample(String id, SampleDB infoDB) { @@ -86,12 +86,12 @@ public class Sample implements Comparable { // implements java.io.Serial return affection; } - public boolean hasQuantitativeTrait() { - return affection == Affection.QUANTITATIVE; + public boolean hasOtherPhenotype() { + return affection == Affection.OTHER; } - public double getQuantitativePhenotype() { - return quantitativePhenotype; + public String getOtherPhenotype() { + return otherPhenotype; } /** @@ -127,7 +127,7 @@ public class Sample implements Comparable { // 
implements java.io.Serial public String toString() { return String.format("Sample %s fam=%s dad=%s mom=%s gender=%s affection=%s qt=%s props=%s", getID(), getFamilyID(), getPaternalID(), getMaternalID(), getGender(), getAffection(), - getQuantitativePhenotype(), properties); + getOtherPhenotype(), properties); } // // ------------------------------------------------------------------------------------- @@ -174,7 +174,7 @@ public class Sample implements Comparable { // implements java.io.Serial equalOrNull(paternalID, otherSample.paternalID) && equalOrNull(maternalID, otherSample.maternalID) && equalOrNull(gender, otherSample.gender) && - equalOrNull(quantitativePhenotype, otherSample.quantitativePhenotype) && + equalOrNull(otherPhenotype, otherSample.otherPhenotype) && equalOrNull(affection, otherSample.affection) && equalOrNull(properties, otherSample.properties); } @@ -215,7 +215,7 @@ public class Sample implements Comparable { // implements java.io.Serial mergeValues(prev.getID(), "Material_ID", prev.getMaternalID(), next.getMaternalID(), null), mergeValues(prev.getID(), "Gender", prev.getGender(), next.getGender(), Gender.UNKNOWN), mergeValues(prev.getID(), "Affection", prev.getAffection(), next.getAffection(), Affection.UNKNOWN), - mergeValues(prev.getID(), "QuantitativeTrait", prev.getQuantitativePhenotype(), next.getQuantitativePhenotype(), UNSET_QT)); + mergeValues(prev.getID(), "OtherPhenotype", prev.getOtherPhenotype(), next.getOtherPhenotype(), UNSET_QT)); //mergeValues(prev.getID(), "ExtraProperties", prev.getExtraProperties(), next.getExtraProperties(), Collections.emptyMap())); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java index 2c63f93ff..ee0873c6e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java @@ -8,17 +8,6 @@ import 
org.broadinstitute.sting.utils.variantcontext.Genotype; import java.util.*; /** - * Created by IntelliJ IDEA. - * User: brett - * Date: Jul 26, 2010 - * Time: 3:30:09 PM - * - * This class stores and manages sample metadata. This data is encoded in a sample file, which can be included - * in the GATK by the "--samples" argument. This class reads and parses those files. - * - * Although there are a set of public methods for accessing sample data, they aren't used by walkers - they are really - * only used by GenomeAnalysisEngine. An instance of GenomeAnalysisEngine has one SampleDataSource. When a walker - * wants to access sample data, it asks GenomeAnalysis to fetch this data from its SampleDataSource. * */ public class SampleDB { diff --git a/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java index e63fc7feb..c2a94acc1 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java @@ -156,26 +156,26 @@ public class PedReaderUnitTest extends BaseTest { "fam1 mom granddad2 grandma2 2 2")); // Quantitative trait - new PedReaderTest("QuantitativeTrait", + new PedReaderTest("OtherPhenotype", Arrays.asList( - new Sample("s1", "fam1", null, null, Gender.MALE, Affection.QUANTITATIVE, 1.0), - new Sample("s2", "fam2", null, null, Gender.FEMALE, Affection.QUANTITATIVE, 10.0)), + new Sample("s1", "fam1", null, null, Gender.MALE, Affection.OTHER, "1.0"), + new Sample("s2", "fam2", null, null, Gender.FEMALE, Affection.OTHER, "10.0")), String.format("%s%n%s", "fam1 s1 0 0 1 1", "fam2 s2 0 0 2 10.0")); - new PedReaderTest("QuantitativeTraitWithMissing", + new PedReaderTest("OtherPhenotypeWithMissing", Arrays.asList( new Sample("s1", "fam1", null, null, Gender.MALE, Affection.UNKNOWN, Sample.UNSET_QT), - new Sample("s2", "fam2", null, null, Gender.FEMALE, 
Affection.QUANTITATIVE, 10.0)), + new Sample("s2", "fam2", null, null, Gender.FEMALE, Affection.OTHER, "10.0")), String.format("%s%n%s", "fam1 s1 0 0 1 -9", "fam2 s2 0 0 2 10.0")); - new PedReaderTest("QuantitativeTraitOnlyInts", + new PedReaderTest("OtherPhenotypeOnlyInts", Arrays.asList( - new Sample("s1", "fam1", null, null, Gender.MALE, Affection.QUANTITATIVE, 1.0), - new Sample("s2", "fam2", null, null, Gender.FEMALE, Affection.QUANTITATIVE, 10.0)), + new Sample("s1", "fam1", null, null, Gender.MALE, Affection.OTHER, "1"), + new Sample("s2", "fam2", null, null, Gender.FEMALE, Affection.OTHER, "10")), String.format("%s%n%s", "fam1 s1 0 0 1 1", "fam2 s2 0 0 2 10")); diff --git a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java index bc8a98c22..3af40adbe 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java @@ -6,16 +6,13 @@ import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; /** - * Created by IntelliJ IDEA. 
- * User: brett - * Date: Sep 9, 2010 - * Time: 8:21:00 AM + * */ public class SampleUnitTest extends BaseTest { SampleDB db; static Sample fam1A, fam1B, fam1C; static Sample s1, s2; - static Sample trait1, trait2, trait3, trait4; + static Sample trait1, trait2, trait3, trait4, trait5; @BeforeClass public void init() { @@ -31,7 +28,8 @@ public class SampleUnitTest extends BaseTest { trait1 = new Sample("t1", db, Affection.AFFECTED, Sample.UNSET_QT); trait2 = new Sample("t2", db, Affection.UNAFFECTED, Sample.UNSET_QT); trait3 = new Sample("t3", db, Affection.UNKNOWN, Sample.UNSET_QT); - trait4 = new Sample("t4", db, Affection.QUANTITATIVE, 1.0); + trait4 = new Sample("t4", db, Affection.OTHER, "1.0"); + trait5 = new Sample("t4", db, Affection.OTHER, "CEU"); } /** @@ -47,13 +45,14 @@ public class SampleUnitTest extends BaseTest { Assert.assertEquals(null, fam1B.getMaternalID()); Assert.assertEquals(Affection.AFFECTED, trait1.getAffection()); - Assert.assertEquals(Sample.UNSET_QT, trait1.getQuantitativePhenotype()); + Assert.assertEquals(Sample.UNSET_QT, trait1.getOtherPhenotype()); Assert.assertEquals(Affection.UNAFFECTED, trait2.getAffection()); - Assert.assertEquals(Sample.UNSET_QT, trait2.getQuantitativePhenotype()); + Assert.assertEquals(Sample.UNSET_QT, trait2.getOtherPhenotype()); Assert.assertEquals(Affection.UNKNOWN, trait3.getAffection()); - Assert.assertEquals(Sample.UNSET_QT, trait3.getQuantitativePhenotype()); - Assert.assertEquals(Affection.QUANTITATIVE, trait4.getAffection()); - Assert.assertEquals(1.0, trait4.getQuantitativePhenotype()); + Assert.assertEquals(Sample.UNSET_QT, trait3.getOtherPhenotype()); + Assert.assertEquals(Affection.OTHER, trait4.getAffection()); + Assert.assertEquals("1.0", trait4.getOtherPhenotype()); + Assert.assertEquals("CEU", trait5.getOtherPhenotype()); } @Test() From b732f740d2e0d2bb7a0bf8c2457f630cb0e61782 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Wed, 5 Oct 2011 16:51:30 -0400 Subject: [PATCH 48/63] Revert "Change 
vcf jar to use a classfileset to pull all dependencies. Should save" This reverts commit 441022c4c600624928da46419a6a289200700f3e. --- build.xml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/build.xml b/build.xml index 34de6cee6..1f26e7b7a 100644 --- a/build.xml +++ b/build.xml @@ -545,11 +545,12 @@ - - - - - + + + + + + From 6a573437af6ad996864df9295bff2f0b9ed0a19e Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 5 Oct 2011 15:00:58 -0700 Subject: [PATCH 49/63] Details documentation arguments for -ped --- .../sting/gatk/GenomeAnalysisEngine.java | 6 +- .../arguments/GATKArgumentCollection.java | 60 ++++++++++++++++++- .../sting/gatk/samples/SampleDBBuilder.java | 2 +- 3 files changed, 62 insertions(+), 6 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 9cfe7d48b..a35cd3690 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -201,6 +201,9 @@ public class GenomeAnalysisEngine { // Prepare the data for traversal. 
initializeDataSources(); + // initialize sampleDB + initializeSampleDB(); + // initialize and validate the interval list initializeIntervals(); validateSuppliedIntervals(); @@ -689,9 +692,6 @@ public class GenomeAnalysisEngine { // set the sequence dictionary of all of Tribble tracks to the sequence dictionary of our reference rodDataSources = getReferenceOrderedDataSources(referenceMetaDataFiles,referenceDataSource.getReference().getSequenceDictionary(),genomeLocParser,argCollection.unsafe); - - // set up sample db - initializeSampleDB(); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index c71b3ce2c..cd9068a64 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -213,14 +213,70 @@ public class GATKArgumentCollection { // -------------------------------------------------------------------------------------------------------------- /** - * MARK: add documentation details + * Reads PED file-formatted tabular text files describing meta-data about the samples being + * processed in the GATK. + * + * See http://www.broadinstitute.org/mpg/tagger/faq.html + * See http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml#ped + * + * The PED file is a white-space (space or tab) delimited file: the first six columns are mandatory: + * + * Family ID + * Individual ID + * Paternal ID + * Maternal ID + * Sex (1=male; 2=female; other=unknown) + * Phenotype + * + * The IDs are alphanumeric: the combination of family and individual ID should uniquely identify a person. + * A PED file must have 1 and only 1 phenotype in the sixth column. The phenotype can be either a + * quantitative trait or an affection status column: GATK will automatically detect which type + * (i.e. 
based on whether a value other than 0, 1, 2 or the missing genotype code is observed). + * + * If an individual's sex is unknown, then any character other than 1 or 2 can be used. + * + * You can add a comment to a PED or MAP file by starting the line with a # character. The rest of that + * line will be ignored. Do not start any family IDs with this character therefore. + * + * Affection status should be coded: + * + * -9 missing + * 0 missing + * 1 unaffected + * 2 affected + * + * If any value outside of -9,0,1,2 is detected than the samples are assumed + * to phenotype values are interpreted as string phenotype values. In this case -9 uniquely + * represents the missing value. + * + * Genotypes (column 7 onwards) cannot be specified to the GATK. + * + * For example, here are two individuals (one row = one person): + * + * FAM001 1 0 0 1 2 + * FAM001 2 0 0 1 2 + * + * Each -ped argument can be tagged with NO_FAMILY_ID, NO_PARENTS, NO_SEX, NO_PHENOTYPE to + * tell the GATK PED parser that the corresponding fields are missing from the ped file. + * + * Note that most GATK walkers do not use pedigree information. Walkers that require pedigree + * data should clearly indicate so in their arguments and will throw errors if required pedigree + * information is missing. */ @Argument(fullName="pedigree", shortName = "ped", doc="Pedigree files for samples",required=false) public List pedigreeFiles = Collections.emptyList(); + /** + * Inline PED records (see -ped argument). Each -pedString STRING can contain one or more + * valid PED records (see -ped) separated by semi-colons. Supports all tags for each pedString + * as -ped supports + */ @Argument(fullName="pedigreeString", shortName = "pedString", doc="Pedigree string for samples",required=false) public List pedigreeStrings = Collections.emptyList(); + /** + * How strict should we be in parsing the PED files? 
+ */ @Argument(fullName="pedigreeValidationType", shortName = "pedValidationType", doc="How strict should we be in validating the pedigree information?",required=false) public PedigreeValidationType pedigreeValidationType = PedigreeValidationType.STRICT; @@ -379,7 +435,7 @@ public class GATKArgumentCollection { return false; } if ((other.RODToInterval == null && RODToInterval != null) || - (other.RODToInterval != null && !other.RODToInterval.equals(RODToInterval))) { + (other.RODToInterval != null && !other.RODToInterval.equals(RODToInterval))) { return false; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java index 807b150b2..44a8600b0 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java @@ -146,7 +146,7 @@ public class SampleDBBuilder { for ( final Sample dsSample : samplesFromDataSources ) if ( ! 
sampleNamesFromPedigrees.contains(dsSample.getID()) ) - throw new UserException("Sample " + dsSample.getID() + " found in data sources but not in pedigree files"); + throw new UserException("Sample " + dsSample.getID() + " found in data sources but not in pedigree files with STRICT pedigree validation"); } } } From be2d29ce69a2fd1b58938a9e9e47faff870c657a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 5 Oct 2011 15:17:41 -0700 Subject: [PATCH 50/63] Final PED documentation --- .../arguments/GATKArgumentCollection.java | 66 +++++++++++-------- .../gatk/samples/PedigreeValidationType.java | 8 +++ 2 files changed, 45 insertions(+), 29 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index cd9068a64..486868dc2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -213,55 +213,63 @@ public class GATKArgumentCollection { // -------------------------------------------------------------------------------------------------------------- /** - * Reads PED file-formatted tabular text files describing meta-data about the samples being - * processed in the GATK. + *

Reads PED file-formatted tabular text files describing meta-data about the samples being + * processed in the GATK.

* - * See http://www.broadinstitute.org/mpg/tagger/faq.html - * See http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml#ped + * * - * The PED file is a white-space (space or tab) delimited file: the first six columns are mandatory: + *

The PED file is a white-space (space or tab) delimited file: the first six columns are mandatory:

* - * Family ID - * Individual ID - * Paternal ID - * Maternal ID - * Sex (1=male; 2=female; other=unknown) - * Phenotype + *
    + *
  • Family ID
  • + *
  • Individual ID
  • + *
  • Paternal ID
  • + *
  • Maternal ID
  • + *
  • Sex (1=male; 2=female; other=unknown)
  • + *
  • Phenotype
  • + *
* - * The IDs are alphanumeric: the combination of family and individual ID should uniquely identify a person. + *

The IDs are alphanumeric: the combination of family and individual ID should uniquely identify a person. * A PED file must have 1 and only 1 phenotype in the sixth column. The phenotype can be either a * quantitative trait or an affection status column: GATK will automatically detect which type - * (i.e. based on whether a value other than 0, 1, 2 or the missing genotype code is observed). + * (i.e. based on whether a value other than 0, 1, 2 or the missing genotype code is observed).

* - * If an individual's sex is unknown, then any character other than 1 or 2 can be used. + *

If an individual's sex is unknown, then any character other than 1 or 2 can be used.

* - * You can add a comment to a PED or MAP file by starting the line with a # character. The rest of that - * line will be ignored. Do not start any family IDs with this character therefore. + *

You can add a comment to a PED or MAP file by starting the line with a # character. The rest of that + * line will be ignored. Do not start any family IDs with this character therefore.

* - * Affection status should be coded: + *

Affection status should be coded:

* - * -9 missing - * 0 missing - * 1 unaffected - * 2 affected + *
    + *
  • -9 missing
  • + *
  • 0 missing
  • + *
  • 1 unaffected
  • + *
  • 2 affected
  • + *
* - * If any value outside of -9,0,1,2 is detected than the samples are assumed + *

If any value outside of -9,0,1,2 is detected then the samples are assumed * to phenotype values are interpreted as string phenotype values. In this case -9 uniquely - * represents the missing value. + * represents the missing value.

* - * Genotypes (column 7 onwards) cannot be specified to the GATK. + *

Genotypes (column 7 onwards) cannot be specified to the GATK.

* - * For example, here are two individuals (one row = one person): + *

For example, here are two individuals (one row = one person):

* + *
      *   FAM001  1  0 0  1  2
      *   FAM001  2  0 0  1  2
+     * 
* - * Each -ped argument can be tagged with NO_FAMILY_ID, NO_PARENTS, NO_SEX, NO_PHENOTYPE to - * tell the GATK PED parser that the corresponding fields are missing from the ped file. + *

Each -ped argument can be tagged with NO_FAMILY_ID, NO_PARENTS, NO_SEX, NO_PHENOTYPE to + * tell the GATK PED parser that the corresponding fields are missing from the ped file.

* - * Note that most GATK walkers do not use pedigree information. Walkers that require pedigree + *

Note that most GATK walkers do not use pedigree information. Walkers that require pedigree * data should clearly indicate so in their arguments and will throw errors if required pedigree - * information is missing. + * information is missing.

*/ @Argument(fullName="pedigree", shortName = "ped", doc="Pedigree files for samples",required=false) public List pedigreeFiles = Collections.emptyList(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/PedigreeValidationType.java b/public/java/src/org/broadinstitute/sting/gatk/samples/PedigreeValidationType.java index 209636b54..bbf857820 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/PedigreeValidationType.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/PedigreeValidationType.java @@ -28,6 +28,14 @@ package org.broadinstitute.sting.gatk.samples; * */ public enum PedigreeValidationType { + /** + * Require that, if a pedigree file is provided, all samples in the VCF or BAM files have a corresponding + * entry in the pedigree file(s). + */ STRICT, + + /** + * Do not enforce any overlap between the VCF/BAM samples and the pedigree data + * */ SILENT } From a91509e7dd7f098315c91475936e9eb29dc7f18b Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 5 Oct 2011 15:22:57 -0700 Subject: [PATCH 51/63] Shouldn't be public --- .../qscripts/StandardVariantEvaluation.scala | 202 ------------------ 1 file changed, 202 deletions(-) delete mode 100755 public/scala/qscript/org/broadinstitute/sting/queue/qscripts/StandardVariantEvaluation.scala diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/StandardVariantEvaluation.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/StandardVariantEvaluation.scala deleted file mode 100755 index d333e1dc0..000000000 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/StandardVariantEvaluation.scala +++ /dev/null @@ -1,202 +0,0 @@ -package org.broadinstitute.sting.queue.qscripts - -import org.broadinstitute.sting.queue.QScript -import org.broadinstitute.sting.queue.extensions.gatk.RodBind -import org.broadinstitute.sting.queue.extensions.gatk._ - -class StandardVariantEvaluation extends QScript { - // todo -- update to released version
when things stabilize - @Argument(doc="gatkJarFile", required=false) - var gatkJarFile: File = new File("/home/radon01/depristo/dev/GenomeAnalysisTKFromLaptop/trunk/dist/GenomeAnalysisTK.jar") - - @Argument(shortName = "R", doc="B37 reference sequence: defaults to broad standard location", required=false) - var referenceFile: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta") - - @Argument(shortName = "intervals", doc="intervals to evaluate. Only supports evaluation on chromosome 20 now, as most evaluation data is there", required=false) - val TARGET_INTERVAL: String = "20" - - @Argument(shortName = "includeUnion", doc="If provided, we'll create a union of the evaluation data sets for evaluation", required=false) - val CREATE_UNION: Boolean = false - - @Argument(shortName = "dataDir", doc="Path to the standard evaluation data files", required=false) - val DATA_DIR = "/humgen/gsa-hpprojects/GATK/data/Comparisons/StandardForEvaluation/b37/" - - @Argument(shortName = "evalStandard1000GCalls", doc="If provided, we'll include some standard 1000G data for evaluation", required=false) - val EVAL_STANDARD_1000G_CALLS: Boolean = false - - val COMPS_DIR = DATA_DIR + "/comps/" - val EVALS_DIR = DATA_DIR + "/evals/" - - @Argument(shortName = "moreSNPsToEval", doc="Path to additional SNP call sets for evaluation", required=false) - val moreSNPsToEval: List[File] = Nil - - @Argument(shortName = "moreIndelsToEval", doc="Path to additional Indel call sets for evaluation", required=false) - val moreIndelsToEval: List[File] = Nil - - val VARIANT_TYPES: List[String] = List("indels", "snps") - val VARIANT_TYPE_VT: Map[String, List[org.broad.tribble.util.variantcontext.VariantContext.Type]] = Map( - "indels" -> List(org.broad.tribble.util.variantcontext.VariantContext.Type.INDEL, org.broad.tribble.util.variantcontext.VariantContext.Type.MIXED, org.broad.tribble.util.variantcontext.VariantContext.Type.NO_VARIATION), - "snps" -> 
List(org.broad.tribble.util.variantcontext.VariantContext.Type.SNP, org.broad.tribble.util.variantcontext.VariantContext.Type.NO_VARIATION) - ) - - val SITES_DIR: String = "sitesFiles" - - // path to b37 DBSNP - @Argument(shortName = "dbsnp", doc="Path to DBSNP **VCF** for evaluation", required=false) - val MY_DBSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_129_b37.leftAligned.vcf") - //val MY_DBSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf"); - - class Comp(val name: String, val evalType: String, val filename: String, val MakeHomVar: Boolean = false) { - val originalFile = new File(COMPS_DIR + filename) - val file: File = if ( MakeHomVar ) swapExt(originalFile, ".vcf",".homvar.vcf") else originalFile - val sitesFile = new File(SITES_DIR + "/" + swapExt(file, ".vcf", ".sites.vcf").getName) - } - - class Eval(val name: String, val evalType: String, val filename: String, val overrideFile: File = null ) { - val file: File = if ( overrideFile != null ) overrideFile else new File(EVALS_DIR + "/" + filename) - } - - var COMPS: List[Comp] = Nil - def addComp(comp: Comp) { COMPS = comp :: COMPS } - - var EVALS: List[Eval] = Nil - def addEval(eval: Eval) { EVALS = eval :: EVALS } - def addEvalFromCMD(file: File, t: String) { addEval(new Eval(file.getName, t, null, file)) } - - trait UNIVERSAL_GATK_ARGS extends CommandLineGATK { - this.logging_level = "INFO"; - this.jarFile = gatkJarFile; - this.intervalsString = List(TARGET_INTERVAL); - this.reference_sequence = referenceFile; - this.memoryLimit = 2 - } - - def initializeStandardDataFiles() = { - // - // Standard evaluation files for indels - // - addComp(new Comp("NA12878.homvar.GATK", "indels", "Indels.NA12878_WGS.filtered_Q50.0_QD5.0_SB-1.0_HR18.vcf", true)) - addComp(new Comp("CG.38samples", "indels", "CG.Indels.leftAligned.b37.vcf")) - addComp(new Comp("NA12878.homvar.CG", "indels", "NA12878.CG.b37.indels.vcf", true)) - addComp(new 
Comp("g1k.pilot1.validation", "indels", "pilot1_indel_validation_2009.b37.vcf")) - addComp(new Comp("NA12878.hand_curated", "indels", "NA12878.validated.curated.polymorphic.indels.vcf")) - addComp(new Comp("NA12878.Mullikin", "indels", "NA12878.DIPline.NQScm.expanded.chr20.b37.minReads_2_or_gt2bp.vcf")) - - - // - // INDEL call sets - // - if ( EVAL_STANDARD_1000G_CALLS ) { - addEval(new Eval("dindel", "indels", "20110208.chr20.dindel2.EUR.sites.vcf")) - addEval(new Eval("si", "indels", "20101123.chr20.si.v2.EUR.sites.vcf")) - addEval(new Eval("gatk", "indels", "EUR.phase1.chr20.broad.filtered.indels.sites.vcf")) - } - - // - // Standard evaluation files for SNPs - // - addComp(new Comp("NA12878.homvar.GATK", "snps", "NA12878.HiSeq19.cut.vcf", true)) - addComp(new Comp("CG.38samples", "snps", "CG.38samples.b37.vcf")) - addComp(new Comp("NA12878.homvar.CG", "snps", "NA12878.CG.b37.snps.vcf", true)) - addComp(new Comp("HapMap3.3", "snps", "hapmap3.3.sites_r27_nr.b37_fwd.vcf")) - addComp(new Comp("OMNI.2.5M", "snps", "omni2.5.1212samples.b37.sites.chr20.monoAreAC0.vcf")) - addComp(new Comp("g1k.pilot1.validation", "snps", "1000G.snp.validation.b37.vcf")) - - // - // SNP call sets - // - if ( EVAL_STANDARD_1000G_CALLS ) { - addEval(new Eval("1000G.gatk.eurPlus.phase1", "snps", "EUR+.phase1.chr20.broad.recal.vrcut1p0.sites.vcf")) - addEval(new Eval("1000G.high_specificity.phase1", "snps", "ALL.phase1.chr20.projectConsensus.highSpecificity.snps.genotypes.sites.vcf")) - } - } - - def script = { - val sitesDir = new File(SITES_DIR) - if ( ! 
sitesDir.exists ) sitesDir.mkdirs() - - initializeStandardDataFiles(); - - // add additional files for evaluation, if necessary - moreSNPsToEval.foreach(addEvalFromCMD(_, "snps")) - moreIndelsToEval.foreach(addEvalFromCMD(_, "indels")) - - // - // create hom-var versions of key files - // - for ( comp <- COMPS ) - if ( comp.MakeHomVar ) - add(new SelectHomVars(comp.originalFile, comp.file)) - - for ( comp <- COMPS ) - add(new JustSites(comp.file, comp.sitesFile)) - - // - // Loop over evaluation types - // - for ( evalType <- VARIANT_TYPES ) { - var evalsOfType = EVALS.filter(_.evalType == evalType) - val compsOfType = COMPS.filter(_.evalType == evalType) - - if ( evalsOfType.size > 0 ) { - - // if desired and possible, create a union.X.vcf file - if ( CREATE_UNION && evalsOfType.size > 1 ) { - val union: File = new File("union.%s.vcf".format(evalType)) - add(new MyCombine(evalsOfType.map(_.file), union)); - evalsOfType = new Eval("union", evalType, null, union) :: evalsOfType - } - - // our root VE - val VE = new MyEval() - VE.VT = VARIANT_TYPE_VT(evalType) - VE.o = new File(evalType + ".eval") - - // add evals - for ( calls <- evalsOfType ) - VE.rodBind :+= RodBind("eval_" + calls.name, "VCF", calls.file) - - // add comps - //VE.rodBind :+= RodBind("dbsnp", "VCF", MY_DBSNP) - for ( comp <- compsOfType ) - VE.rodBind :+= RodBind("comp_" + comp.name, "VCF", comp.sitesFile) - - add(VE) - } - } - } - - /** - * Select homozygous non-reference sites from a single deep data set - */ - class SelectHomVars(@Input(doc="foo") vcf: File, @Output(doc="foo") out: File) extends SelectVariants with UNIVERSAL_GATK_ARGS { - this.rodBind :+= RodBind("variant", "VCF", vcf) - this.o = out - this.select ++= List("\"AC == 2\"") - } - - /** - * A simple union - */ - class MyCombine(@Input(doc="foo") vcfs: List[File], @Output(doc="foo") out: File) extends CombineVariants with UNIVERSAL_GATK_ARGS { - for ( vcf <- vcfs ) - this.rodBind :+= RodBind(vcf.getName, "VCF", vcf) - this.o = out - 
} - - /** - * A command line (cut) that removes all genotyping information from a file - */ - class JustSites(@Input(doc="foo") in: File, @Output(doc="foo") out: File) extends CommandLineFunction { - def commandLine = "cut -f 1-8 %s > %s".format(in, out) - } - - /** - * Base class for VariantEval used here - */ - class MyEval() extends VariantEval with UNIVERSAL_GATK_ARGS { - this.noST = true - this.evalModule :+= "ValidationReport" - } -} - From dd780dba5fa5b07591bda94b58ac505e9131ca89 Mon Sep 17 00:00:00 2001 From: Matt Hanna Date: Wed, 5 Oct 2011 18:25:26 -0400 Subject: [PATCH 52/63] Rather than just picking a few classes to be the roots of the vcf jar and including all dependencies, use the entire codecs.vcf and variantcontext packages as roots. Fix for my fix for Jim Robinson. --- build.xml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/build.xml b/build.xml index 1f26e7b7a..1e5aaf644 100644 --- a/build.xml +++ b/build.xml @@ -545,12 +545,10 @@ - - - - - - + + + + From 6d6149b9a2ecdb6c401460ac10db6ee90de27ede Mon Sep 17 00:00:00 2001 From: Khalid Shakir Date: Wed, 5 Oct 2011 18:30:40 -0400 Subject: [PATCH 53/63] Updated gsalib gsa.read.gatkreport to return all reports, even those beginning with '.'. In PreQC using geom_blank() so MEDIAN_INSERT_SIZE plot doesn't crash on facet_grid(scales='free') when data doesn't contain points for 'RF' or 'TANDEM'. 
--- public/R/src/gsalib/R/gsa.read.gatkreport.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/R/src/gsalib/R/gsa.read.gatkreport.R b/public/R/src/gsalib/R/gsa.read.gatkreport.R index 011b5240d..46bbf7eda 100644 --- a/public/R/src/gsalib/R/gsa.read.gatkreport.R +++ b/public/R/src/gsalib/R/gsa.read.gatkreport.R @@ -99,5 +99,5 @@ gsa.read.gatkreport <- function(filename) { .gsa.assignGATKTableToEnvironment(tableName, tableHeader, tableRows, tableEnv); } - gatkreport = as.list(tableEnv); + gatkreport = as.list(tableEnv, all.names=TRUE); } From b945e97de19f6f4b29aae9c14b5343cf1b1de140 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 5 Oct 2011 17:12:48 -0700 Subject: [PATCH 54/63] Shouldn't have committed the non-fetching version by default --- build.xml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/build.xml b/build.xml index f3f88fdf7..ccd7a8aa0 100644 --- a/build.xml +++ b/build.xml @@ -147,12 +147,12 @@ - - - - - - + + From 55b9f065279f934443039f163af9f842529c0a7e Mon Sep 17 00:00:00 2001 From: Matt Hanna Date: Wed, 5 Oct 2011 20:36:28 -0400 Subject: [PATCH 55/63] Ensure that IndelRealigner n-way out option supports MD5 generation. 
--- .../gatk/walkers/indels/IndelRealigner.java | 7 ++++-- .../sting/utils/sam/NWaySAMFileWriter.java | 24 ++++++++++--------- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index 8680f3537..36e4db1c5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -248,6 +248,9 @@ public class IndelRealigner extends ReadWalker { @Argument(fullName="nWayOut", shortName="nWayOut", required=false, doc="Generate one output file for each input (-I) bam file") protected String N_WAY_OUT = null; + @Hidden + @Argument(fullName="generate_nWayOut_md5s",doc="Generate md5sums for BAMs") + protected boolean generateMD5s = false; // DEBUGGING OPTIONS FOLLOW @@ -401,9 +404,9 @@ public class IndelRealigner extends ReadWalker { // if ( args.containsKey("disable_bam_indexing") ) { System.out.println("NO INDEXING!!"); System.exit(1); createIndex = false; } if ( N_WAY_OUT.toUpperCase().endsWith(".MAP") ) { - writerToUse = new NWaySAMFileWriter(getToolkit(),loadFileNameMap(N_WAY_OUT),SAMFileHeader.SortOrder.coordinate,true, createIndex); + writerToUse = new NWaySAMFileWriter(getToolkit(),loadFileNameMap(N_WAY_OUT),SAMFileHeader.SortOrder.coordinate,true, createIndex, generateMD5s); } else { - writerToUse = new NWaySAMFileWriter(getToolkit(),N_WAY_OUT,SAMFileHeader.SortOrder.coordinate,true, createIndex); + writerToUse = new NWaySAMFileWriter(getToolkit(),N_WAY_OUT,SAMFileHeader.SortOrder.coordinate,true, createIndex, generateMD5s); } } else { diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java b/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java index 70417889b..07bfc52c7 100644 --- 
a/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java @@ -51,18 +51,18 @@ public class NWaySAMFileWriter implements SAMFileWriter { private boolean presorted ; GenomeAnalysisEngine toolkit; - public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, Map in2out, SAMFileHeader.SortOrder order, boolean presorted, boolean indexOnTheFly) { + public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, Map in2out, SAMFileHeader.SortOrder order, boolean presorted, boolean indexOnTheFly, boolean generateMD5) { this.presorted = presorted; this.toolkit = toolkit; writerMap = new HashMap(); - setupByReader(toolkit,in2out,order, presorted, indexOnTheFly); + setupByReader(toolkit,in2out,order, presorted, indexOnTheFly, generateMD5); } - public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, boolean presorted, boolean indexOnTheFly ) { + public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, boolean presorted, boolean indexOnTheFly , boolean generateMD5) { this.presorted = presorted; this.toolkit = toolkit; writerMap = new HashMap(); - setupByReader(toolkit,ext,order, presorted, indexOnTheFly); + setupByReader(toolkit,ext,order, presorted, indexOnTheFly, generateMD5); } @@ -73,8 +73,7 @@ public class NWaySAMFileWriter implements SAMFileWriter { * @param toolkit * @param in2out */ - public void setupByReader(GenomeAnalysisEngine toolkit, Map in2out, SAMFileHeader.SortOrder order, boolean presorted, boolean indexOnTheFly) { - + public void setupByReader(GenomeAnalysisEngine toolkit, Map in2out, SAMFileHeader.SortOrder order, boolean presorted, boolean indexOnTheFly, boolean generateMD5) { if ( in2out==null ) throw new StingException("input-output bam filename map for n-way-out writing is NULL"); for ( SAMReaderID rid : toolkit.getReadsDataSource().getReaderIDs() ) { @@ -88,7 +87,7 @@ public class 
NWaySAMFileWriter implements SAMFileWriter { if ( writerMap.containsKey( rid ) ) throw new StingException("nWayOut mode: Reader id for input sam file "+fName+" is already registered"); - addWriter(rid,outName, order, presorted, indexOnTheFly); + addWriter(rid,outName, order, presorted, indexOnTheFly, generateMD5); } } @@ -101,7 +100,7 @@ public class NWaySAMFileWriter implements SAMFileWriter { * @param toolkit * @param ext */ - public void setupByReader(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, boolean presorted, boolean indexOnTheFly) { + public void setupByReader(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, boolean presorted, boolean indexOnTheFly, boolean generateMD5) { for ( SAMReaderID rid : toolkit.getReadsDataSource().getReaderIDs() ) { String fName = toolkit.getReadsDataSource().getSAMFile(rid).getName(); @@ -119,16 +118,19 @@ public class NWaySAMFileWriter implements SAMFileWriter { if ( writerMap.containsKey( rid ) ) throw new StingException("nWayOut mode: Reader id for input sam file "+fName+" is already registered"); - addWriter(rid,outName, order, presorted, indexOnTheFly); + addWriter(rid,outName, order, presorted, indexOnTheFly, generateMD5); } } - private void addWriter(SAMReaderID id , String outName, SAMFileHeader.SortOrder order, boolean presorted, boolean indexOnTheFly) { + private void addWriter(SAMReaderID id , String outName, SAMFileHeader.SortOrder order, boolean presorted, boolean indexOnTheFly, boolean generateMD5) { File f = new File(outName); SAMFileHeader header = toolkit.getSAMFileHeader(id).clone(); header.setSortOrder(order); - SAMFileWriter sw = new SAMFileWriterFactory().setCreateIndex(indexOnTheFly).makeSAMOrBAMWriter(header, presorted, f); + SAMFileWriterFactory factory = new SAMFileWriterFactory(); + factory.setCreateIndex(indexOnTheFly); + factory.setCreateMd5File(generateMD5); + SAMFileWriter sw = factory.makeSAMOrBAMWriter(header, presorted, f); writerMap.put(id,sw); 
} From a3c5a316860d9ed4d98dd4a92c8e7b0031755613 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 5 Oct 2011 21:09:08 -0700 Subject: [PATCH 56/63] Oops, forgot the PED test file --- public/testdata/ceutrio.ped | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 public/testdata/ceutrio.ped diff --git a/public/testdata/ceutrio.ped b/public/testdata/ceutrio.ped new file mode 100644 index 000000000..1302e1a2d --- /dev/null +++ b/public/testdata/ceutrio.ped @@ -0,0 +1,3 @@ +fam1 kid dad mom 1 2 +fam1 dad 0 0 1 1 +fam1 mom 0 0 2 2 From 8e6845806ad31bb1f7b42714828d27eee6c20377 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 5 Oct 2011 21:26:21 -0700 Subject: [PATCH 57/63] Allowing empty samples list in LIBS -- Right now we cannot process BAM files without read groups because we enforce the samples list to not be empty when there's a SAM record. Now if there are reads and there are no samples we add the "null" sample so that LIBS walks the reads properly --- .../sting/gatk/iterators/LocusIteratorByState.java | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java index 896d6e3a2..eb5b51b33 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java @@ -282,8 +282,12 @@ public class LocusIteratorByState extends LocusIterator { // currently the GATK expects this LocusIteratorByState to accept empty sample lists, when // there's no read data. 
So we need to throw this error only when samIterator.hasNext() is true - if ( this.samples.isEmpty() && samIterator.hasNext() ) - throw new IllegalArgumentException("samples list must not be empty"); + if ( this.samples.isEmpty() && samIterator.hasNext() ) { + // actually we cannot process BAMs without read groups unless we tolerate empty + // sample lists. In the empty case we need to add the null element to the samples + this.samples.add(null); + //throw new IllegalArgumentException("samples list must not be empty"); + } } /** From 93f7e632bd2febd5f8af2e846bf054893997dee0 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Thu, 6 Oct 2011 10:07:46 -0400 Subject: [PATCH 58/63] Minor fix/enhancement for VariantEval: if a vcf has symbolic alleles, program would crash ungracefully - now we'll just skip record without processing. This is a big issue since we can't process 1000G integration files with code as is. --- .../gatk/walkers/varianteval/evaluators/CountVariants.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java index 72058ba7b..e83434037 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java @@ -130,6 +130,10 @@ public class CountVariants extends VariantEvaluator implements StandardEval { nVariantLoci++; nMixed++; break; + case SYMBOLIC: + // ignore symbolic alleles, but don't fail + // todo - consistent way of treating symbolic alleles throughout codebase? 
+ break; default: throw new ReviewedStingException("Unexpected VariantContext type " + vc1.getType()); } From efca1fdfd8a5e6cf25062911f142bb1bfcabea9b Mon Sep 17 00:00:00 2001 From: Matt Hanna Date: Thu, 6 Oct 2011 10:08:48 -0400 Subject: [PATCH 59/63] Revert change until I figure out how to use classfileset/rootfileset with a vanilla ant install. This reverts commit 4c9022872beec8dc0700a1320a267e3603c8212d. --- build.xml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/build.xml b/build.xml index 1e5aaf644..1f26e7b7a 100644 --- a/build.xml +++ b/build.xml @@ -545,10 +545,12 @@ - - - - + + + + + + From daa5999489fea84502cafab7c6d5bea2a0e56bac Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 6 Oct 2011 08:16:25 -0700 Subject: [PATCH 60/63] Fixed typo in argument description --- .../sting/gatk/walkers/variantutils/VariantsToTable.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java index c44d84136..81d0c36ac 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java @@ -133,7 +133,7 @@ public class VariantsToTable extends RodWalker { /** * By default, this tool throws a UserException when it encounters a field without a value in some record. This - * is generally useful when you mistype -F CHRMO, so that you get a friendly warning about CHRMO not being + * is generally useful when you mistype -F CHROM, so that you get a friendly warning about CHRMO not being * found before the tool runs through 40M 1000G records. However, in some cases you genuinely want to allow such * fields (e.g., AC not being calculated for filtered records, if included). 
When provided, this argument * will cause VariantsToTable to write out NA values for missing fields instead of throwing an error. From 4b5b9155a9be837f0cb799cb6605a5328f4b25ec Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 6 Oct 2011 08:16:47 -0700 Subject: [PATCH 61/63] Fixed bad expected value in PedReaderUnitTest --- .../broadinstitute/sting/gatk/samples/PedReaderUnitTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java index c2a94acc1..1601845cd 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java @@ -158,7 +158,7 @@ public class PedReaderUnitTest extends BaseTest { // Quantitative trait new PedReaderTest("OtherPhenotype", Arrays.asList( - new Sample("s1", "fam1", null, null, Gender.MALE, Affection.OTHER, "1.0"), + new Sample("s1", "fam1", null, null, Gender.MALE, Affection.OTHER, "1"), new Sample("s2", "fam2", null, null, Gender.FEMALE, Affection.OTHER, "10.0")), String.format("%s%n%s", "fam1 s1 0 0 1 1", @@ -188,7 +188,7 @@ public class PedReaderUnitTest extends BaseTest { PedReader reader = new PedReader(); SampleDB sampleDB = new SampleDB(); List readSamples = reader.parse(myFileContents, missing, sampleDB); - Assert.assertEquals(new HashSet(test.expectedSamples), new HashSet(readSamples), "Parsed incorrect number of samples"); + Assert.assertEquals(new HashSet(test.expectedSamples), new HashSet(readSamples)); } @Test(enabled = true, dataProvider = "readerTest") From 4fa5045e84b1cf7e1a0c57041e3e01bb9f587baf Mon Sep 17 00:00:00 2001 From: Matt Hanna Date: Thu, 6 Oct 2011 12:49:51 -0400 Subject: [PATCH 63/63] Abandoning classfileset/rootfileset approach due to difficulty managing classloading of bcel*.jar/ant-apache-bcel*.jar. 
Switching instead to manually specifying a minimal set of packages/classes to include in the vcf.jar via build.xml, and adding a unit test which creates a limited classloader only aware of vcf.jar and tribble.jar and tries to use it to load the core classes in the vcf jar. Hopefully third time's the charm. --- build.xml | 7 +-- .../VCFJarClassLoadingUnitTest.java | 51 +++++++++++++++++++ 2 files changed, 55 insertions(+), 3 deletions(-) create mode 100644 public/java/test/org/broadinstitute/sting/utils/variantcontext/VCFJarClassLoadingUnitTest.java diff --git a/build.xml b/build.xml index 1f26e7b7a..565d18c58 100644 --- a/build.xml +++ b/build.xml @@ -546,8 +546,9 @@ - - + + + @@ -914,7 +915,7 @@ - + diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VCFJarClassLoadingUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VCFJarClassLoadingUnitTest.java new file mode 100644 index 000000000..50eebe179 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VCFJarClassLoadingUnitTest.java @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.variantcontext; + +import org.testng.annotations.Test; + +import java.io.File; +import java.net.MalformedURLException; +import java.net.URI; +import java.net.URL; +import java.net.URLClassLoader; + +/** + * Test to ensure that, given only the VCF jar and its expected dependencies, core VCF classes will load. + */ +public class VCFJarClassLoadingUnitTest { + @Test + public void testVCFJarClassLoading() throws ClassNotFoundException, MalformedURLException { + URI vcfURI = new File("dist/vcf.jar").toURI(); + URI tribbleURI = new File("lib/tribble-24.jar").toURI(); + + ClassLoader classLoader = new URLClassLoader(new URL[] {vcfURI.toURL(),tribbleURI.toURL()}, null); + classLoader.loadClass("org.broadinstitute.sting.utils.variantcontext.VariantContext"); + classLoader.loadClass("org.broadinstitute.sting.utils.codecs.vcf.VCFCodec"); + classLoader.loadClass("org.broadinstitute.sting.utils.codecs.vcf.VCF3Codec"); + classLoader.loadClass("org.broadinstitute.sting.utils.codecs.vcf.VCFWriter"); + classLoader.loadClass("org.broadinstitute.sting.utils.codecs.vcf.StandardVCFWriter"); + } +}