diff --git a/build.xml b/build.xml index 1f26e7b7a..ef662a160 100644 --- a/build.xml +++ b/build.xml @@ -146,12 +146,14 @@ - - + + + + + + + + diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 71e65f2fb..9cfe7d48b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -701,7 +701,8 @@ public class GenomeAnalysisEngine { SampleDBBuilder sampleDBBuilder = new SampleDBBuilder(this, argCollection.pedigreeValidationType); sampleDBBuilder.addSamplesFromSAMHeader(getSAMFileHeader()); sampleDBBuilder.addSamplesFromSampleNames(SampleUtils.getUniqueSamplesFromRods(this)); - sampleDBBuilder.addSamplesFromPedigreeArgument(argCollection.pedigreeData); + sampleDBBuilder.addSamplesFromPedigreeFiles(argCollection.pedigreeFiles); + sampleDBBuilder.addSamplesFromPedigreeStrings(argCollection.pedigreeStrings); sampleDB = sampleDBBuilder.getFinalSampleDB(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index c27bb26d9..c71b3ce2c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -215,8 +215,11 @@ public class GATKArgumentCollection { /** * MARK: add documentation details */ - @Argument(fullName="pedigree", shortName = "ped", doc="Pedigree file / string for samples",required=false) - public List pedigreeData = Collections.emptyList(); + @Argument(fullName="pedigree", shortName = "ped", doc="Pedigree files for samples",required=false) + public List pedigreeFiles = Collections.emptyList(); + + @Argument(fullName="pedigreeString", shortName = "pedString", doc="Pedigree string for 
samples",required=false) + public List pedigreeStrings = Collections.emptyList(); @Argument(fullName="pedigreeValidationType", shortName = "pedValidationType", doc="How strict should we be in validating the pedigree information?",required=false) public PedigreeValidationType pedigreeValidationType = PedigreeValidationType.STRICT; diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/PedigreeValidationType.java b/public/java/src/org/broadinstitute/sting/gatk/samples/PedigreeValidationType.java index 8a1a4f225..209636b54 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/PedigreeValidationType.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/PedigreeValidationType.java @@ -29,6 +29,5 @@ package org.broadinstitute.sting.gatk.samples; */ public enum PedigreeValidationType { STRICT, - LINIENT, - SILENT, + SILENT } diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java index e68d92a9f..3e61e03d9 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java @@ -1,6 +1,8 @@ package org.broadinstitute.sting.gatk.samples; +import org.broadinstitute.sting.utils.exceptions.UserException; + import java.util.Collections; import java.util.HashMap; import java.util.Map; @@ -121,36 +123,36 @@ public class Sample implements java.io.Serializable { public String toString() { return String.format("Sample %s fam=%s dad=%s mom=%s gender=%s affection=%s qt=%s props=%s", getID(), getFamilyID(), getPaternalID(), getMaternalID(), getGender(), getAffection(), - getQuantitativePhenotype(), getExtraProperties()); + getQuantitativePhenotype(), properties); } - // ------------------------------------------------------------------------------------- - // - // code for working with additional -- none standard -- properites - // - // 
------------------------------------------------------------------------------------- - - public Map getExtraProperties() { - return Collections.unmodifiableMap(properties); - } - - /** - * Get one property - * @param key key of property - * @return value of property as generic object - */ - public Object getExtraPropertyValue(final String key) { - return properties.get(key); - } - - /** - * - * @param key property key - * @return true if sample has this property (even if its value is null) - */ - public boolean hasExtraProperty(String key) { - return properties.containsKey(key); - } +// // ------------------------------------------------------------------------------------- +// // +// // code for working with additional -- none standard -- properites +// // +// // ------------------------------------------------------------------------------------- +// +// public Map getExtraProperties() { +// return Collections.unmodifiableMap(properties); +// } +// +// /** +// * Get one property +// * @param key key of property +// * @return value of property as generic object +// */ +// public Object getExtraPropertyValue(final String key) { +// return properties.get(key); +// } +// +// /** +// * +// * @param key property key +// * @return true if sample has this property (even if its value is null) +// */ +// public boolean hasExtraProperty(String key) { +// return properties.containsKey(key); +// } @Override public int hashCode() { @@ -181,4 +183,36 @@ public class Sample implements java.io.Serializable { else return o2 == null ? false : o1.equals(o2); } + + private final static <T> T mergeValues(final String name, final String field, final T o1, final T o2, final T emptyValue) { + if ( o1 == null || o1.equals(emptyValue) ) { + // o1 is unset (null or the empty value), so take o2 whether or not it is set + return o2 == null ? 
null : o2; + } else { + if ( o2 == null || o2.equals(emptyValue) ) + return o1; // keep o1, since it's a real value + else { + // both o1 and o2 have a value; compare by value, not by object identity + if ( o1.equals(o2) ) + return o1; + else + throw new UserException("Inconsistent values detected for " + name + " for field " + field + " value1 " + o1 + " value2 " + o2); + } + } + } + + public final static Sample mergeSamples(final Sample prev, final Sample next) { + if ( prev.equals(next) ) + return next; + else { + return new Sample(prev.getID(), prev.infoDB, + mergeValues(prev.getID(), "Family_ID", prev.getFamilyID(), next.getFamilyID(), null), + mergeValues(prev.getID(), "Paternal_ID", prev.getPaternalID(), next.getPaternalID(), null), + mergeValues(prev.getID(), "Maternal_ID", prev.getMaternalID(), next.getMaternalID(), null), + mergeValues(prev.getID(), "Gender", prev.getGender(), next.getGender(), Gender.UNKNOWN), + mergeValues(prev.getID(), "Affection", prev.getAffection(), next.getAffection(), Affection.UNKNOWN), + mergeValues(prev.getID(), "QuantitativeTrait", prev.getQuantitativePhenotype(), next.getQuantitativePhenotype(), UNSET_QT)); + //mergeValues(prev.getID(), "ExtraProperties", prev.getExtraProperties(), next.getExtraProperties(), Collections.emptyMap())); + } + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java index 75b37d758..9abc28517 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java @@ -44,6 +44,9 @@ public class SampleDB { * @param sample to be added */ protected SampleDB addSample(Sample sample) { + Sample prev = samples.get(sample.getID()); + if ( prev != null ) + sample = Sample.mergeSamples(prev, sample); samples.put(sample.getID(), sample); return this; } @@ -138,8 +141,8 @@ public class SampleDB { return children; } - public Collection getSamples() { - return 
Collections.unmodifiableCollection(samples.values()); + public Set getSamples() { + return new HashSet(samples.values()); } public Collection getSampleNames() { @@ -165,22 +168,4 @@ public class SampleDB { } return samples; } - - // -------------------------------------------------------------------------------- - // - // Validation - // - // -------------------------------------------------------------------------------- - - public final void validate() { - validate(getSamples(), PedigreeValidationType.STRICT); - } - - public final void validate(PedigreeValidationType validationType) { - validate(getSamples(), validationType); - } - - public final void validate(Collection samplesToCheck, PedigreeValidationType validationType) { - // todo -- actually do an implementation - } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java index fd42a24f4..87733d1f6 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java @@ -45,6 +45,15 @@ public class SampleDBBuilder { final SampleDB sampleDB = new SampleDB(); final GenomeAnalysisEngine engine; + Set samplesFromDataSources = new HashSet(); + Set samplesFromPedigrees = new HashSet(); + + /** for testing only */ + protected SampleDBBuilder(PedigreeValidationType validationStrictness) { + engine = null; + this.validationStrictness = validationStrictness; + } + /** * Constructor takes both a SAM header and sample files because the two must be integrated. 
*/ @@ -57,26 +66,34 @@ public class SampleDBBuilder { * Hallucinates sample objects for all the samples in the SAM file and stores them */ public SampleDBBuilder addSamplesFromSAMHeader(final SAMFileHeader header) { - return addSamplesFromSampleNames(SampleUtils.getSAMFileSamples(header)); + addSamplesFromSampleNames(SampleUtils.getSAMFileSamples(header)); + return this; } public SampleDBBuilder addSamplesFromSampleNames(final Collection sampleNames) { for (final String sampleName : sampleNames) { if (sampleDB.getSample(sampleName) == null) { final Sample newSample = new Sample(sampleName, sampleDB); - addSample(newSample); + sampleDB.addSample(newSample); + samplesFromDataSources.add(newSample); // keep track of data source samples } } return this; } - public SampleDBBuilder addSamplesFromPedigreeArgument(final List pedigreeArguments) { - for (final String ped : pedigreeArguments) { - final File pedFile = new File(ped); - if ( pedFile.exists() ) - addSamples(pedFile); - else - addSamples(ped); + public SampleDBBuilder addSamplesFromPedigreeFiles(final List pedigreeFiles) { + for (final File pedFile : pedigreeFiles) { + Collection samples = addSamplesFromPedigreeArgument(pedFile); + samplesFromPedigrees.addAll(samples); + } + + return this; + } + + public SampleDBBuilder addSamplesFromPedigreeStrings(final List pedigreeStrings) { + for (final String pedString : pedigreeStrings) { + Collection samples = addSamplesFromPedigreeArgument(pedString); + samplesFromPedigrees.addAll(samples); } return this; @@ -86,41 +103,55 @@ public class SampleDBBuilder { * Parse one sample file and integrate it with samples that are already there * Fail quickly if we find any errors in the file */ - protected SampleDBBuilder addSamples(File sampleFile) { + private Collection addSamplesFromPedigreeArgument(File sampleFile) { final PedReader reader = new PedReader(); try { - reader.parse(sampleFile, getMissingFields(sampleFile), sampleDB); + return reader.parse(sampleFile, 
getMissingFields(sampleFile), sampleDB); } catch ( FileNotFoundException e ) { throw new UserException.CouldNotReadInputFile(sampleFile, e); } - - return this; } - protected SampleDBBuilder addSamples(final String string) { + private Collection addSamplesFromPedigreeArgument(final String string) { final PedReader reader = new PedReader(); - reader.parse(string, getMissingFields(string), sampleDB); - return this; - } - - /** - * Add a sample to the collection - * @param sample to be added - */ - protected SampleDBBuilder addSample(Sample sample) { - // todo -- merge with existing record if we have one - sampleDB.addSample(sample); - return this; + return reader.parse(string, getMissingFields(string), sampleDB); } public SampleDB getFinalSampleDB() { - sampleDB.validate(validationStrictness); + validate(); return sampleDB; } public EnumSet getMissingFields(final Object engineArg) { - final List posTags = engine.getTags(engineArg).getPositionalTags(); - return PedReader.parseMissingFieldTags(engineArg, posTags); + if ( engine == null ) + return EnumSet.noneOf(PedReader.MissingPedField.class); + else { + final List posTags = engine.getTags(engineArg).getPositionalTags(); + return PedReader.parseMissingFieldTags(engineArg, posTags); + } + } + + // -------------------------------------------------------------------------------- + // + // Validation + // + // -------------------------------------------------------------------------------- + + protected final void validate() { + if ( validationStrictness == PedigreeValidationType.SILENT ) + return; + else { + // check that samples in data sources are all annotated, if anything is annotated + if ( ! samplesFromPedigrees.isEmpty() && ! samplesFromDataSources.isEmpty() ) { + final Set sampleNamesFromPedigrees = new HashSet(); + for ( final Sample pSample : samplesFromPedigrees ) + sampleNamesFromPedigrees.add(pSample.getID()); + + for ( final Sample dsSample : samplesFromDataSources ) + if ( ! 
sampleNamesFromPedigrees.contains(dsSample.getID()) ) + throw new UserException("Sample " + dsSample.getID() + " found in data sources but not in pedigree files"); + } + } } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java index 500d322db..f6d3b42b8 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java @@ -3,6 +3,11 @@ package org.broadinstitute.sting.gatk.samples; import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; import org.testng.annotations.Test; import java.io.File; @@ -15,15 +20,101 @@ import java.util.*; * Time: 8:21:00 AM */ public class SampleDBUnitTest extends BaseTest { - // this empty header used to instantiate sampledatasource objects - private static SAMFileHeader header = new SAMFileHeader(); - + private static SampleDBBuilder builder; // all the test sample files are located here - private String sampleFilesDir = validationDataLocation + "samples/"; + private File testPED = new File(testDir + "ceutrio.ped"); + + private static final Set testPEDSamples = new HashSet(Arrays.asList( + new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED), + new Sample("dad", "fam1", null, null, Gender.MALE, Affection.UNAFFECTED), + new Sample("mom", "fam1", null, null, Gender.FEMALE, Affection.AFFECTED))); + + private static final Set testSAMSamples = new HashSet(Arrays.asList( + new Sample("kid", null, null, null, Gender.UNKNOWN, Affection.UNKNOWN), + new Sample("mom", null, null, null, Gender.UNKNOWN, Affection.UNKNOWN), + new 
Sample("dad", null, null, null, Gender.UNKNOWN, Affection.UNKNOWN))); + + private static final String testPEDString = + String.format("%s%n%s%n%s", + "fam1 kid dad mom 1 2", + "fam1 dad 0 0 1 1", + "fam1 mom 0 0 2 2"); + + private static final String testPEDStringInconsistentGender = + "fam1 kid 0 0 2 2"; + + private static final Set testPEDSamplesAsSet = + new HashSet(testPEDSamples); + + + @BeforeMethod + public void before() { + builder = new SampleDBBuilder(PedigreeValidationType.STRICT); + } - // make sure samples are created from the SAM file correctly @Test() - public void loadSAMSamplesTest() { - //SampleDB s = new SampleDB(header); + public void loadPEDFile() { + builder.addSamplesFromPedigreeFiles(Arrays.asList(testPED)); + SampleDB db = builder.getFinalSampleDB(); + Assert.assertEquals(testPEDSamplesAsSet, db.getSamples()); + } + + @Test() + public void loadPEDString() { + builder.addSamplesFromPedigreeStrings(Arrays.asList(testPEDString)); + SampleDB db = builder.getFinalSampleDB(); + Assert.assertEquals(testPEDSamplesAsSet, db.getSamples()); + } + + private static final void addSAMHeader() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10); + ArtificialSAMUtils.createEnumeratedReadGroups(header, Arrays.asList("1", "2", "3"), + Arrays.asList("kid", "mom", "dad")); + builder.addSamplesFromSAMHeader(header); + } + + @Test() + public void loadSAMHeader() { + addSAMHeader(); + SampleDB db = builder.getFinalSampleDB(); + Assert.assertEquals(testSAMSamples, db.getSamples()); + } + + @Test() + public void loadSAMHeaderPlusPED() { + addSAMHeader(); + builder.addSamplesFromPedigreeFiles(Arrays.asList(testPED)); + SampleDB db = builder.getFinalSampleDB(); + Assert.assertEquals(testPEDSamples, db.getSamples()); + } + + @Test() + public void loadDuplicateData() { + builder.addSamplesFromPedigreeFiles(Arrays.asList(testPED)); + builder.addSamplesFromPedigreeFiles(Arrays.asList(testPED)); + SampleDB db = builder.getFinalSampleDB(); + 
Assert.assertEquals(testPEDSamples, db.getSamples()); + } + + @Test(expectedExceptions = UserException.class) + public void loadNonExistentFile() { + builder.addSamplesFromPedigreeFiles(Arrays.asList(new File("non-existence-file.txt"))); + SampleDB db = builder.getFinalSampleDB(); + Assert.assertEquals(testSAMSamples, db.getSamples()); + } + + @Test(expectedExceptions = UserException.class) + public void loadInconsistentData() { + builder = new SampleDBBuilder(PedigreeValidationType.STRICT); + builder.addSamplesFromPedigreeFiles(Arrays.asList(testPED)); + builder.addSamplesFromPedigreeStrings(Arrays.asList(testPEDStringInconsistentGender)); + builder.getFinalSampleDB(); + } + + @Test(expectedExceptions = UserException.class) + public void sampleInSAMHeaderNotInSamplesDB() { + addSAMHeader(); + builder.addSamplesFromPedigreeStrings(Arrays.asList(testPEDStringInconsistentGender)); + builder.getFinalSampleDB(); } }