diff --git a/build.xml b/build.xml
index 1f26e7b7a..ef662a160 100644
--- a/build.xml
+++ b/build.xml
@@ -146,12 +146,14 @@
-
-
+
+
+
+
+
+
+
+
diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
index 71e65f2fb..9cfe7d48b 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
@@ -701,7 +701,8 @@ public class GenomeAnalysisEngine {
SampleDBBuilder sampleDBBuilder = new SampleDBBuilder(this, argCollection.pedigreeValidationType);
sampleDBBuilder.addSamplesFromSAMHeader(getSAMFileHeader());
sampleDBBuilder.addSamplesFromSampleNames(SampleUtils.getUniqueSamplesFromRods(this));
- sampleDBBuilder.addSamplesFromPedigreeArgument(argCollection.pedigreeData);
+ sampleDBBuilder.addSamplesFromPedigreeFiles(argCollection.pedigreeFiles);
+ sampleDBBuilder.addSamplesFromPedigreeStrings(argCollection.pedigreeStrings);
sampleDB = sampleDBBuilder.getFinalSampleDB();
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
index c27bb26d9..c71b3ce2c 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
@@ -215,8 +215,11 @@ public class GATKArgumentCollection {
/**
* MARK: add documentation details
*/
- @Argument(fullName="pedigree", shortName = "ped", doc="Pedigree file / string for samples",required=false)
- public List pedigreeData = Collections.emptyList();
+ @Argument(fullName="pedigree", shortName = "ped", doc="Pedigree files for samples",required=false)
+ public List pedigreeFiles = Collections.emptyList();
+
+ @Argument(fullName="pedigreeString", shortName = "pedString", doc="Pedigree string for samples",required=false)
+ public List pedigreeStrings = Collections.emptyList();
@Argument(fullName="pedigreeValidationType", shortName = "pedValidationType", doc="How strict should we be in validating the pedigree information?",required=false)
public PedigreeValidationType pedigreeValidationType = PedigreeValidationType.STRICT;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/PedigreeValidationType.java b/public/java/src/org/broadinstitute/sting/gatk/samples/PedigreeValidationType.java
index 8a1a4f225..209636b54 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/samples/PedigreeValidationType.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/samples/PedigreeValidationType.java
@@ -29,6 +29,5 @@ package org.broadinstitute.sting.gatk.samples;
*/
public enum PedigreeValidationType {
STRICT,
- LINIENT,
- SILENT,
+ SILENT
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java
index e68d92a9f..3e61e03d9 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java
@@ -1,6 +1,8 @@
package org.broadinstitute.sting.gatk.samples;
+import org.broadinstitute.sting.utils.exceptions.UserException;
+
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
@@ -121,36 +123,36 @@ public class Sample implements java.io.Serializable {
public String toString() {
return String.format("Sample %s fam=%s dad=%s mom=%s gender=%s affection=%s qt=%s props=%s",
getID(), getFamilyID(), getPaternalID(), getMaternalID(), getGender(), getAffection(),
- getQuantitativePhenotype(), getExtraProperties());
+ getQuantitativePhenotype(), properties);
}
- // -------------------------------------------------------------------------------------
- //
- // code for working with additional -- none standard -- properites
- //
- // -------------------------------------------------------------------------------------
-
- public Map getExtraProperties() {
- return Collections.unmodifiableMap(properties);
- }
-
- /**
- * Get one property
- * @param key key of property
- * @return value of property as generic object
- */
- public Object getExtraPropertyValue(final String key) {
- return properties.get(key);
- }
-
- /**
- *
- * @param key property key
- * @return true if sample has this property (even if its value is null)
- */
- public boolean hasExtraProperty(String key) {
- return properties.containsKey(key);
- }
+// // -------------------------------------------------------------------------------------
+// //
+// // code for working with additional -- non-standard -- properties
+// //
+// // -------------------------------------------------------------------------------------
+//
+// public Map getExtraProperties() {
+// return Collections.unmodifiableMap(properties);
+// }
+//
+// /**
+// * Get one property
+// * @param key key of property
+// * @return value of property as generic object
+// */
+// public Object getExtraPropertyValue(final String key) {
+// return properties.get(key);
+// }
+//
+// /**
+// *
+// * @param key property key
+// * @return true if sample has this property (even if its value is null)
+// */
+// public boolean hasExtraProperty(String key) {
+// return properties.containsKey(key);
+// }
@Override
public int hashCode() {
@@ -181,4 +183,36 @@ public class Sample implements java.io.Serializable {
else
return o2 == null ? false : o1.equals(o2);
}
+
+    /** Merges two values of one field of sample {@code name}: a missing value (null or emptyValue) yields to the other; two real values must be equal. */
+    private final static <T> T mergeValues(final String name, final String field, final T o1, final T o2, final T emptyValue) {
+        if ( o1 == null || o1.equals(emptyValue) ) {
+            // o1 carries no information, so o2 wins whatever it is (possibly null or emptyValue as well)
+            return o2;
+        } else if ( o2 == null || o2.equals(emptyValue) ) {
+            return o1; // keep o1, since it's a real value
+        } else if ( o1.equals(o2) ) {
+            // both sides are real values: compare with equals(), not ==, because equal Strings
+            // or boxed numbers need not be the same object
+            return o1;
+        } else {
+            // both o1 and o2 have a value, and the two values disagree
+            throw new UserException("Inconsistent values detected for " + name + " for field " + field + " value1 " + o1 + " value2 " + o2);
+        }
+    }
+
+    /** Combines two records expected to describe the same sample, field by field; throws UserException (from mergeValues) when two real values disagree. */
+    public final static Sample mergeSamples(final Sample prev, final Sample next) {
+        if ( prev.equals(next) )
+            return next; // identical records, nothing to reconcile
+        // the merged record takes prev's ID and infoDB; every other field is reconciled by mergeValues
+        return new Sample(prev.getID(), prev.infoDB,
+                mergeValues(prev.getID(), "Family_ID", prev.getFamilyID(), next.getFamilyID(), null),
+                mergeValues(prev.getID(), "Paternal_ID", prev.getPaternalID(), next.getPaternalID(), null),
+                mergeValues(prev.getID(), "Maternal_ID", prev.getMaternalID(), next.getMaternalID(), null),
+                mergeValues(prev.getID(), "Gender", prev.getGender(), next.getGender(), Gender.UNKNOWN),
+                mergeValues(prev.getID(), "Affection", prev.getAffection(), next.getAffection(), Affection.UNKNOWN),
+                mergeValues(prev.getID(), "QuantitativeTrait", prev.getQuantitativePhenotype(), next.getQuantitativePhenotype(), UNSET_QT));
+        // NOTE: extra properties are not merged here; their accessors in Sample are currently commented out
+    }
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java
index 75b37d758..9abc28517 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java
@@ -44,6 +44,9 @@ public class SampleDB {
* @param sample to be added
*/
protected SampleDB addSample(Sample sample) {
+ Sample prev = samples.get(sample.getID());
+ if ( prev != null )
+ sample = Sample.mergeSamples(prev, sample);
samples.put(sample.getID(), sample);
return this;
}
@@ -138,8 +141,8 @@ public class SampleDB {
return children;
}
- public Collection getSamples() {
- return Collections.unmodifiableCollection(samples.values());
+ public Set getSamples() {
+ return new HashSet(samples.values());
}
public Collection getSampleNames() {
@@ -165,22 +168,4 @@ public class SampleDB {
}
return samples;
}
-
- // --------------------------------------------------------------------------------
- //
- // Validation
- //
- // --------------------------------------------------------------------------------
-
- public final void validate() {
- validate(getSamples(), PedigreeValidationType.STRICT);
- }
-
- public final void validate(PedigreeValidationType validationType) {
- validate(getSamples(), validationType);
- }
-
- public final void validate(Collection samplesToCheck, PedigreeValidationType validationType) {
- // todo -- actually do an implementation
- }
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java
index fd42a24f4..87733d1f6 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java
@@ -45,6 +45,15 @@ public class SampleDBBuilder {
final SampleDB sampleDB = new SampleDB();
final GenomeAnalysisEngine engine;
+ Set samplesFromDataSources = new HashSet();
+ Set samplesFromPedigrees = new HashSet();
+
+ /** For testing only: leaves engine null, in which case getMissingFields() returns an empty set instead of consulting engine tags. */
+ protected SampleDBBuilder(PedigreeValidationType validationStrictness) {
+ engine = null;
+ this.validationStrictness = validationStrictness;
+ }
+
/**
* Constructor takes both a SAM header and sample files because the two must be integrated.
*/
@@ -57,26 +66,34 @@ public class SampleDBBuilder {
* Hallucinates sample objects for all the samples in the SAM file and stores them
*/
public SampleDBBuilder addSamplesFromSAMHeader(final SAMFileHeader header) {
- return addSamplesFromSampleNames(SampleUtils.getSAMFileSamples(header));
+ addSamplesFromSampleNames(SampleUtils.getSAMFileSamples(header));
+ return this;
}
public SampleDBBuilder addSamplesFromSampleNames(final Collection sampleNames) {
for (final String sampleName : sampleNames) {
if (sampleDB.getSample(sampleName) == null) {
final Sample newSample = new Sample(sampleName, sampleDB);
- addSample(newSample);
+ sampleDB.addSample(newSample);
+ samplesFromDataSources.add(newSample); // keep track of data source samples
}
}
return this;
}
- public SampleDBBuilder addSamplesFromPedigreeArgument(final List pedigreeArguments) {
- for (final String ped : pedigreeArguments) {
- final File pedFile = new File(ped);
- if ( pedFile.exists() )
- addSamples(pedFile);
- else
- addSamples(ped);
+    public SampleDBBuilder addSamplesFromPedigreeFiles(final List pedigreeFiles) {
+        // Parse every PED file against sampleDB and record the samples it returned in
+        // samplesFromPedigrees, which validate() compares with the data-source samples.
+        for (final File pedigreeFile : pedigreeFiles)
+            samplesFromPedigrees.addAll(addSamplesFromPedigreeArgument(pedigreeFile));
+
+        return this;
+    }
+
+ public SampleDBBuilder addSamplesFromPedigreeStrings(final List pedigreeStrings) {
+ for (final String pedString : pedigreeStrings) {
+ Collection samples = addSamplesFromPedigreeArgument(pedString);
+ samplesFromPedigrees.addAll(samples);
}
return this;
@@ -86,41 +103,55 @@ public class SampleDBBuilder {
* Parse one sample file and integrate it with samples that are already there
* Fail quickly if we find any errors in the file
*/
- protected SampleDBBuilder addSamples(File sampleFile) {
+ private Collection addSamplesFromPedigreeArgument(File sampleFile) {
final PedReader reader = new PedReader();
try {
- reader.parse(sampleFile, getMissingFields(sampleFile), sampleDB);
+ return reader.parse(sampleFile, getMissingFields(sampleFile), sampleDB);
} catch ( FileNotFoundException e ) {
throw new UserException.CouldNotReadInputFile(sampleFile, e);
}
-
- return this;
}
- protected SampleDBBuilder addSamples(final String string) {
+ private Collection addSamplesFromPedigreeArgument(final String string) {
final PedReader reader = new PedReader();
- reader.parse(string, getMissingFields(string), sampleDB);
- return this;
- }
-
- /**
- * Add a sample to the collection
- * @param sample to be added
- */
- protected SampleDBBuilder addSample(Sample sample) {
- // todo -- merge with existing record if we have one
- sampleDB.addSample(sample);
- return this;
+ return reader.parse(string, getMissingFields(string), sampleDB);
}
public SampleDB getFinalSampleDB() {
- sampleDB.validate(validationStrictness);
+ validate();
return sampleDB;
}
public EnumSet getMissingFields(final Object engineArg) {
- final List posTags = engine.getTags(engineArg).getPositionalTags();
- return PedReader.parseMissingFieldTags(engineArg, posTags);
+ if ( engine == null )
+ return EnumSet.noneOf(PedReader.MissingPedField.class);
+ else {
+ final List posTags = engine.getTags(engineArg).getPositionalTags();
+ return PedReader.parseMissingFieldTags(engineArg, posTags);
+ }
+ }
+
+ // --------------------------------------------------------------------------------
+ //
+ // Validation
+ //
+ // --------------------------------------------------------------------------------
+
+    protected final void validate() {
+        // SILENT turns off all pedigree consistency checking
+        if ( validationStrictness == PedigreeValidationType.SILENT )
+            return;
+
+        // nothing to cross-check unless both pedigrees and data sources supplied samples
+        if ( samplesFromPedigrees.isEmpty() || samplesFromDataSources.isEmpty() )
+            return;
+
+        final Set annotatedIDs = new HashSet();
+        for ( final Sample fromPedigree : samplesFromPedigrees )
+            annotatedIDs.add(fromPedigree.getID());
+        for ( final Sample fromDataSource : samplesFromDataSources )
+            if ( ! annotatedIDs.contains(fromDataSource.getID()) )
+                throw new UserException("Sample " + fromDataSource.getID() + " found in data sources but not in pedigree files");
+    }
}
diff --git a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java
index 500d322db..f6d3b42b8 100644
--- a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java
@@ -3,6 +3,11 @@ package org.broadinstitute.sting.gatk.samples;
import net.sf.samtools.SAMFileHeader;
import org.broadinstitute.sting.BaseTest;
+import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
+import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
+import org.testng.Assert;
+import org.testng.annotations.BeforeMethod;
import org.testng.annotations.Test;
import java.io.File;
@@ -15,15 +20,101 @@ import java.util.*;
* Time: 8:21:00 AM
*/
public class SampleDBUnitTest extends BaseTest {
- // this empty header used to instantiate sampledatasource objects
- private static SAMFileHeader header = new SAMFileHeader();
-
+ private static SampleDBBuilder builder;
// all the test sample files are located here
- private String sampleFilesDir = validationDataLocation + "samples/";
+ private File testPED = new File(testDir + "ceutrio.ped");
+
+ private static final Set testPEDSamples = new HashSet(Arrays.asList(
+ new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED),
+ new Sample("dad", "fam1", null, null, Gender.MALE, Affection.UNAFFECTED),
+ new Sample("mom", "fam1", null, null, Gender.FEMALE, Affection.AFFECTED)));
+
+ private static final Set testSAMSamples = new HashSet(Arrays.asList(
+ new Sample("kid", null, null, null, Gender.UNKNOWN, Affection.UNKNOWN),
+ new Sample("mom", null, null, null, Gender.UNKNOWN, Affection.UNKNOWN),
+ new Sample("dad", null, null, null, Gender.UNKNOWN, Affection.UNKNOWN)));
+
+ private static final String testPEDString =
+ String.format("%s%n%s%n%s",
+ "fam1 kid dad mom 1 2",
+ "fam1 dad 0 0 1 1",
+ "fam1 mom 0 0 2 2");
+
+ private static final String testPEDStringInconsistentGender =
+ "fam1 kid 0 0 2 2";
+
+ private static final Set testPEDSamplesAsSet =
+ new HashSet(testPEDSamples);
+
+
+ @BeforeMethod
+ public void before() {
+ builder = new SampleDBBuilder(PedigreeValidationType.STRICT); // fresh engine-less STRICT builder, assigned to the shared static field
+ }
- // make sure samples are created from the SAM file correctly
@Test()
- public void loadSAMSamplesTest() {
- //SampleDB s = new SampleDB(header);
+ public void loadPEDFile() {
+ builder.addSamplesFromPedigreeFiles(Arrays.asList(testPED));
+ SampleDB db = builder.getFinalSampleDB();
+ Assert.assertEquals(testPEDSamplesAsSet, db.getSamples());
+ }
+
+    @Test()
+    public void loadPEDString() {
+        builder.addSamplesFromPedigreeStrings(Arrays.asList(testPEDString)); // three-line in-memory pedigree
+        final SampleDB db = builder.getFinalSampleDB();
+        Assert.assertEquals(db.getSamples(), testPEDSamplesAsSet); // TestNG argument order is (actual, expected)
+    }
+
+ private static final void addSAMHeader() {
+ SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10);
+ ArtificialSAMUtils.createEnumeratedReadGroups(header, Arrays.asList("1", "2", "3"), // presumably the read group IDs -- confirm against ArtificialSAMUtils
+ Arrays.asList("kid", "mom", "dad")); // presumably the sample name attached to each read group
+ builder.addSamplesFromSAMHeader(header); // feeds the header's samples into the shared static builder
+ }
+
+    @Test()
+    public void loadSAMHeader() {
+        addSAMHeader(); // only source of samples in this test: the artificial SAM header
+        final SampleDB db = builder.getFinalSampleDB();
+        Assert.assertEquals(db.getSamples(), testSAMSamples); // TestNG argument order is (actual, expected)
+    }
+
+    @Test()
+    public void loadSAMHeaderPlusPED() {
+        addSAMHeader(); // header samples first ...
+        builder.addSamplesFromPedigreeFiles(Arrays.asList(testPED)); // ... then the PED file on top of them
+        final SampleDB db = builder.getFinalSampleDB();
+        Assert.assertEquals(db.getSamples(), testPEDSamples); // TestNG argument order is (actual, expected)
+    }
+
+    @Test()
+    public void loadDuplicateData() {
+        builder.addSamplesFromPedigreeFiles(Arrays.asList(testPED));
+        builder.addSamplesFromPedigreeFiles(Arrays.asList(testPED)); // same file a second time must not change the result
+        final SampleDB db = builder.getFinalSampleDB();
+        Assert.assertEquals(db.getSamples(), testPEDSamples); // TestNG argument order is (actual, expected)
+    }
+
+    @Test(expectedExceptions = UserException.class)
+    public void loadNonExistentFile() {
+        builder.addSamplesFromPedigreeFiles(Arrays.asList(new File("non-existence-file.txt"))); // expected to raise the UserException
+        final SampleDB db = builder.getFinalSampleDB(); // NOTE(review): presumably unreachable once the line above throws -- confirm
+        Assert.assertEquals(db.getSamples(), testSAMSamples); // TestNG argument order is (actual, expected)
+    }
+
+ @Test(expectedExceptions = UserException.class)
+ public void loadInconsistentData() {
+ builder = new SampleDBBuilder(PedigreeValidationType.STRICT); // NOTE(review): before() already assigns a fresh STRICT builder
+ builder.addSamplesFromPedigreeFiles(Arrays.asList(testPED));
+ builder.addSamplesFromPedigreeStrings(Arrays.asList(testPEDStringInconsistentGender)); // "kid" again, with 2 in the fifth column where testPEDString has 1
+ builder.getFinalSampleDB();
+ }
+
+ @Test(expectedExceptions = UserException.class)
+ public void sampleInSAMHeaderNotInSamplesDB() {
+ addSAMHeader();
+ builder.addSamplesFromPedigreeStrings(Arrays.asList(testPEDStringInconsistentGender));
+ builder.getFinalSampleDB();
}
}