No longer supports extraProperties

-- the underlying data structure is still present, but until I decide what to do for the extensible system I've completely disabled the subsystem
-- Added code to merge Samples, so that a mostly full record can be merged with a consistent empty record.  If the two records are inconsistent, an error is thrown
-- addSample() in SampleDB now invokes Sample.mergeSamples() when a sample with the same ID is already present
-- Validation types are now only STRICT or SILENT
-- Validation code implemented in SampleDBBuilder
-- Extensive unit tests for SampleDBBuilder
This commit is contained in:
Mark DePristo 2011-10-03 19:20:33 -07:00
parent 867a7476c1
commit b20689ff55
8 changed files with 241 additions and 95 deletions

View File

@ -146,12 +146,14 @@
<mkdir dir="${lib.dir}"/>
<mkdir dir="${ivy.jar.dir}"/>
<get src="http://repo1.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/${ivy.jar.file}"
dest="${ivy.jar.dir}/${ivy.jar.file}"
usetimestamp="true"/>
<taskdef resource="org/apache/ivy/ant/antlib.xml"
uri="antlib:org.apache.ivy.ant"
classpath="${ivy.jar.dir}/${ivy.jar.file}"/>
<!-- Comment out the following two lines to build the GATK without a network connection, assuming you have all of the libraries cached already -->
<!-- <get src="http://repo1.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/${ivy.jar.file}" -->
<!-- dest="${ivy.jar.dir}/${ivy.jar.file}" -->
<!-- usetimestamp="true"/> -->
<!-- <taskdef resource="org/apache/ivy/ant/antlib.xml" -->
<!-- uri="antlib:org.apache.ivy.ant" -->
<!-- classpath="${ivy.jar.dir}/${ivy.jar.file}"/> -->
<ivy:settings file="${ivy.settings.dir}/ivysettings.xml"/>
<property name="init.resolve.done" value="true"/>
</target>

View File

@ -701,7 +701,8 @@ public class GenomeAnalysisEngine {
SampleDBBuilder sampleDBBuilder = new SampleDBBuilder(this, argCollection.pedigreeValidationType);
sampleDBBuilder.addSamplesFromSAMHeader(getSAMFileHeader());
sampleDBBuilder.addSamplesFromSampleNames(SampleUtils.getUniqueSamplesFromRods(this));
sampleDBBuilder.addSamplesFromPedigreeArgument(argCollection.pedigreeData);
sampleDBBuilder.addSamplesFromPedigreeFiles(argCollection.pedigreeFiles);
sampleDBBuilder.addSamplesFromPedigreeStrings(argCollection.pedigreeStrings);
sampleDB = sampleDBBuilder.getFinalSampleDB();
}

View File

@ -215,8 +215,11 @@ public class GATKArgumentCollection {
/**
* MARK: add documentation details
*/
@Argument(fullName="pedigree", shortName = "ped", doc="Pedigree file / string for samples",required=false)
public List<String> pedigreeData = Collections.emptyList();
@Argument(fullName="pedigree", shortName = "ped", doc="Pedigree files for samples",required=false)
public List<File> pedigreeFiles = Collections.emptyList();
@Argument(fullName="pedigreeString", shortName = "pedString", doc="Pedigree string for samples",required=false)
public List<String> pedigreeStrings = Collections.emptyList();
@Argument(fullName="pedigreeValidationType", shortName = "pedValidationType", doc="How strict should we be in validating the pedigree information?",required=false)
public PedigreeValidationType pedigreeValidationType = PedigreeValidationType.STRICT;

View File

@ -29,6 +29,5 @@ package org.broadinstitute.sting.gatk.samples;
*/
public enum PedigreeValidationType {
STRICT,
LINIENT,
SILENT,
SILENT
}

View File

@ -1,6 +1,8 @@
package org.broadinstitute.sting.gatk.samples;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
@ -121,36 +123,36 @@ public class Sample implements java.io.Serializable {
public String toString() {
return String.format("Sample %s fam=%s dad=%s mom=%s gender=%s affection=%s qt=%s props=%s",
getID(), getFamilyID(), getPaternalID(), getMaternalID(), getGender(), getAffection(),
getQuantitativePhenotype(), getExtraProperties());
getQuantitativePhenotype(), properties);
}
// -------------------------------------------------------------------------------------
//
// code for working with additional -- non-standard -- properties
//
// -------------------------------------------------------------------------------------
public Map<String, Object> getExtraProperties() {
return Collections.unmodifiableMap(properties);
}
/**
* Get one property
* @param key key of property
* @return value of property as generic object
*/
public Object getExtraPropertyValue(final String key) {
return properties.get(key);
}
/**
*
* @param key property key
* @return true if sample has this property (even if its value is null)
*/
public boolean hasExtraProperty(String key) {
return properties.containsKey(key);
}
// // -------------------------------------------------------------------------------------
// //
// // code for working with additional -- non-standard -- properties
// //
// // -------------------------------------------------------------------------------------
//
// public Map<String, Object> getExtraProperties() {
// return Collections.unmodifiableMap(properties);
// }
//
// /**
// * Get one property
// * @param key key of property
// * @return value of property as generic object
// */
// public Object getExtraPropertyValue(final String key) {
// return properties.get(key);
// }
//
// /**
// *
// * @param key property key
// * @return true if sample has this property (even if its value is null)
// */
// public boolean hasExtraProperty(String key) {
// return properties.containsKey(key);
// }
@Override
public int hashCode() {
@ -181,4 +183,36 @@ public class Sample implements java.io.Serializable {
else
return o2 == null ? false : o1.equals(o2);
}
/**
 * Merges two values of a single sample field, preferring whichever one carries real information.
 *
 * A value is treated as "empty" when it is null or equal to emptyValue.
 *
 * @param name       ID of the sample being merged; only used in the error message
 * @param field      name of the field being merged; only used in the error message
 * @param o1         value from the first record, may be null
 * @param o2         value from the second record, may be null
 * @param emptyValue sentinel that means "unset" for this field; may itself be null
 * @return o2 when o1 is empty, o1 when only o2 is empty, or the common value when both are set and equal
 * @throws UserException if both values are set and they are not equal
 */
private final static <T> T mergeValues(final String name, final String field, final T o1, final T o2, final T emptyValue) {
    if ( o1 == null || o1.equals(emptyValue) ) {
        // o1 carries no information, so o2 wins whatever it is (it may be null or empty as well)
        return o2;
    } else if ( o2 == null || o2.equals(emptyValue) ) {
        return o1; // keep o1, since it's a real value
    } else if ( o1.equals(o2) ) {
        // compare by value, not identity: equal Strings or boxed values coming from
        // different sources are generally distinct objects
        return o1;
    } else {
        throw new UserException("Inconsistent values detected for " + name + " for field " + field + " value1 " + o1 + " value2 " + o2);
    }
}
/**
 * Combines two records for the same sample into one.
 *
 * If the two records are equal, next is returned unchanged. Otherwise a new Sample is built
 * from prev's ID and infoDB, with each remaining field merged by mergeValues so that a set
 * value replaces an unset one.
 *
 * @param prev the record that is already known
 * @param next the record being added
 * @return next if both records are equal, otherwise a new merged Sample
 * @throws UserException if any field is set to different values in the two records
 */
public final static Sample mergeSamples(final Sample prev, final Sample next) {
    if ( prev.equals(next) )
        return next;

    final String id = prev.getID(); // also used to label any inconsistency error
    // extra properties are not merged while that subsystem is disabled
    return new Sample(id, prev.infoDB,
            mergeValues(id, "Family_ID", prev.getFamilyID(), next.getFamilyID(), null),
            mergeValues(id, "Paternal_ID", prev.getPaternalID(), next.getPaternalID(), null),
            mergeValues(id, "Maternal_ID", prev.getMaternalID(), next.getMaternalID(), null),
            mergeValues(id, "Gender", prev.getGender(), next.getGender(), Gender.UNKNOWN),
            mergeValues(id, "Affection", prev.getAffection(), next.getAffection(), Affection.UNKNOWN),
            mergeValues(id, "QuantitativeTrait", prev.getQuantitativePhenotype(), next.getQuantitativePhenotype(), UNSET_QT));
}
}

View File

@ -44,6 +44,9 @@ public class SampleDB {
* @param sample to be added
*/
protected SampleDB addSample(Sample sample) {
    // a record already stored under this ID is merged with the incoming one rather than overwritten
    final Sample existing = samples.get(sample.getID());
    final Sample toStore = existing == null ? sample : Sample.mergeSamples(existing, sample);
    samples.put(toStore.getID(), toStore);
    return this;
}
@ -138,8 +141,8 @@ public class SampleDB {
return children;
}
public Collection<Sample> getSamples() {
return Collections.unmodifiableCollection(samples.values());
public Set<Sample> getSamples() {
return new HashSet<Sample>(samples.values());
}
public Collection<String> getSampleNames() {
@ -165,22 +168,4 @@ public class SampleDB {
}
return samples;
}
// --------------------------------------------------------------------------------
//
// Validation
//
// --------------------------------------------------------------------------------
public final void validate() {
validate(getSamples(), PedigreeValidationType.STRICT);
}
public final void validate(PedigreeValidationType validationType) {
validate(getSamples(), validationType);
}
public final void validate(Collection<Sample> samplesToCheck, PedigreeValidationType validationType) {
// todo -- actually do an implementation
}
}

View File

@ -45,6 +45,15 @@ public class SampleDBBuilder {
final SampleDB sampleDB = new SampleDB();
final GenomeAnalysisEngine engine;
Set<Sample> samplesFromDataSources = new HashSet<Sample>();
Set<Sample> samplesFromPedigrees = new HashSet<Sample>();
/**
 * For testing only: creates a builder that is not attached to a GenomeAnalysisEngine.
 *
 * @param validationStrictness validation mode stored on this builder
 */
protected SampleDBBuilder(PedigreeValidationType validationStrictness) {
    this.validationStrictness = validationStrictness;
    this.engine = null; // no engine in this mode
}
/**
* Constructor takes both a SAM header and sample files because the two must be integrated.
*/
@ -57,26 +66,34 @@ public class SampleDBBuilder {
* Hallucinates sample objects for all the samples in the SAM file and stores them
*/
public SampleDBBuilder addSamplesFromSAMHeader(final SAMFileHeader header) {
return addSamplesFromSampleNames(SampleUtils.getSAMFileSamples(header));
addSamplesFromSampleNames(SampleUtils.getSAMFileSamples(header));
return this;
}
public SampleDBBuilder addSamplesFromSampleNames(final Collection<String> sampleNames) {
for (final String sampleName : sampleNames) {
if (sampleDB.getSample(sampleName) == null) {
final Sample newSample = new Sample(sampleName, sampleDB);
addSample(newSample);
sampleDB.addSample(newSample);
samplesFromDataSources.add(newSample); // keep track of data source samples
}
}
return this;
}
public SampleDBBuilder addSamplesFromPedigreeArgument(final List<String> pedigreeArguments) {
for (final String ped : pedigreeArguments) {
final File pedFile = new File(ped);
if ( pedFile.exists() )
addSamples(pedFile);
else
addSamples(ped);
public SampleDBBuilder addSamplesFromPedigreeFiles(final List<File> pedigreeFiles) {
for (final File pedFile : pedigreeFiles) {
Collection<Sample> samples = addSamplesFromPedigreeArgument(pedFile);
samplesFromPedigrees.addAll(samples);
}
return this;
}
public SampleDBBuilder addSamplesFromPedigreeStrings(final List<String> pedigreeStrings) {
for (final String pedString : pedigreeStrings) {
Collection<Sample> samples = addSamplesFromPedigreeArgument(pedString);
samplesFromPedigrees.addAll(samples);
}
return this;
@ -86,41 +103,55 @@ public class SampleDBBuilder {
* Parse one sample file and integrate it with samples that are already there
* Fail quickly if we find any errors in the file
*/
protected SampleDBBuilder addSamples(File sampleFile) {
private Collection<Sample> addSamplesFromPedigreeArgument(File sampleFile) {
final PedReader reader = new PedReader();
try {
reader.parse(sampleFile, getMissingFields(sampleFile), sampleDB);
return reader.parse(sampleFile, getMissingFields(sampleFile), sampleDB);
} catch ( FileNotFoundException e ) {
throw new UserException.CouldNotReadInputFile(sampleFile, e);
}
return this;
}
protected SampleDBBuilder addSamples(final String string) {
private Collection<Sample> addSamplesFromPedigreeArgument(final String string) {
final PedReader reader = new PedReader();
reader.parse(string, getMissingFields(string), sampleDB);
return this;
}
/**
* Add a sample to the collection
* @param sample to be added
*/
protected SampleDBBuilder addSample(Sample sample) {
// todo -- merge with existing record if we have one
sampleDB.addSample(sample);
return this;
return reader.parse(string, getMissingFields(string), sampleDB);
}
public SampleDB getFinalSampleDB() {
sampleDB.validate(validationStrictness);
validate();
return sampleDB;
}
public EnumSet<PedReader.MissingPedField> getMissingFields(final Object engineArg) {
final List<String> posTags = engine.getTags(engineArg).getPositionalTags();
return PedReader.parseMissingFieldTags(engineArg, posTags);
if ( engine == null )
return EnumSet.noneOf(PedReader.MissingPedField.class);
else {
final List<String> posTags = engine.getTags(engineArg).getPositionalTags();
return PedReader.parseMissingFieldTags(engineArg, posTags);
}
}
// --------------------------------------------------------------------------------
//
// Validation
//
// --------------------------------------------------------------------------------
/**
 * Cross-checks the samples collected from data sources against those collected from pedigrees.
 *
 * Does nothing when the validation type is SILENT, or when either collection is empty.
 * Otherwise every data-source sample ID must also appear among the pedigree sample IDs.
 *
 * @throws UserException if a data-source sample has no pedigree record with the same ID
 */
protected final void validate() {
    if ( validationStrictness == PedigreeValidationType.SILENT )
        return;

    // the check only applies when both kinds of input contributed samples
    if ( samplesFromPedigrees.isEmpty() || samplesFromDataSources.isEmpty() )
        return;

    final Set<String> pedigreeIDs = new HashSet<String>();
    for ( final Sample pedigreeSample : samplesFromPedigrees ) {
        pedigreeIDs.add(pedigreeSample.getID());
    }

    for ( final Sample dataSourceSample : samplesFromDataSources ) {
        if ( pedigreeIDs.contains(dataSourceSample.getID()) )
            continue;
        throw new UserException("Sample " + dataSourceSample.getID() + " found in data sources but not in pedigree files");
    }
}
}

View File

@ -3,6 +3,11 @@ package org.broadinstitute.sting.gatk.samples;
import net.sf.samtools.SAMFileHeader;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.testng.Assert;
import org.testng.annotations.BeforeMethod;
import org.testng.annotations.Test;
import java.io.File;
@ -15,15 +20,101 @@ import java.util.*;
* Time: 8:21:00 AM
*/
public class SampleDBUnitTest extends BaseTest {
// this empty header used to instantiate sampledatasource objects
private static SAMFileHeader header = new SAMFileHeader();
private static SampleDBBuilder builder;
// all the test sample files are located here
private String sampleFilesDir = validationDataLocation + "samples/";
private File testPED = new File(testDir + "ceutrio.ped");
private static final Set<Sample> testPEDSamples = new HashSet<Sample>(Arrays.asList(
new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED),
new Sample("dad", "fam1", null, null, Gender.MALE, Affection.UNAFFECTED),
new Sample("mom", "fam1", null, null, Gender.FEMALE, Affection.AFFECTED)));
private static final Set<Sample> testSAMSamples = new HashSet<Sample>(Arrays.asList(
new Sample("kid", null, null, null, Gender.UNKNOWN, Affection.UNKNOWN),
new Sample("mom", null, null, null, Gender.UNKNOWN, Affection.UNKNOWN),
new Sample("dad", null, null, null, Gender.UNKNOWN, Affection.UNKNOWN)));
private static final String testPEDString =
String.format("%s%n%s%n%s",
"fam1 kid dad mom 1 2",
"fam1 dad 0 0 1 1",
"fam1 mom 0 0 2 2");
private static final String testPEDStringInconsistentGender =
"fam1 kid 0 0 2 2";
private static final Set<Sample> testPEDSamplesAsSet =
new HashSet<Sample>(testPEDSamples);
/**
 * Replaces the shared static builder with a fresh STRICT-validation builder before each
 * test method, using the test-only constructor that takes no engine.
 */
@BeforeMethod
public void before() {
builder = new SampleDBBuilder(PedigreeValidationType.STRICT);
}
// make sure samples are created from the SAM file correctly
@Test()
public void loadSAMSamplesTest() {
//SampleDB s = new SampleDB(header);
public void loadPEDFile() {
builder.addSamplesFromPedigreeFiles(Arrays.asList(testPED));
SampleDB db = builder.getFinalSampleDB();
Assert.assertEquals(testPEDSamplesAsSet, db.getSamples());
}
/** Samples parsed from an inline pedigree string must match the expected trio. */
@Test()
public void loadPEDString() {
    builder.addSamplesFromPedigreeStrings(Arrays.asList(testPEDString));
    final SampleDB db = builder.getFinalSampleDB();
    // TestNG argument order is (actual, expected)
    Assert.assertEquals(db.getSamples(), testPEDSamplesAsSet);
}
/**
 * Feeds the shared builder an artificial SAM header whose read groups reference the
 * samples "kid", "mom" and "dad".
 */
private static final void addSAMHeader() {
    // NOTE(review): first list is presumably the read-group IDs and the second the sample names — confirm against ArtificialSAMUtils
    final List<String> readGroupIDs = Arrays.asList("1", "2", "3");
    final List<String> sampleNames = Arrays.asList("kid", "mom", "dad");

    final SAMFileHeader samHeader = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10);
    ArtificialSAMUtils.createEnumeratedReadGroups(samHeader, readGroupIDs, sampleNames);
    builder.addSamplesFromSAMHeader(samHeader);
}
/** Samples taken only from a SAM header must come out as records with no pedigree information. */
@Test()
public void loadSAMHeader() {
    addSAMHeader();
    final SampleDB db = builder.getFinalSampleDB();
    // TestNG argument order is (actual, expected)
    Assert.assertEquals(db.getSamples(), testSAMSamples);
}
/** Loading the SAM header and then the PED file must yield the fully annotated pedigree samples. */
@Test()
public void loadSAMHeaderPlusPED() {
    addSAMHeader();
    builder.addSamplesFromPedigreeFiles(Arrays.asList(testPED));
    final SampleDB db = builder.getFinalSampleDB();
    // TestNG argument order is (actual, expected)
    Assert.assertEquals(db.getSamples(), testPEDSamples);
}
/** Loading the same PED file twice must leave the same set of samples as loading it once. */
@Test()
public void loadDuplicateData() {
    builder.addSamplesFromPedigreeFiles(Arrays.asList(testPED));
    builder.addSamplesFromPedigreeFiles(Arrays.asList(testPED));
    final SampleDB db = builder.getFinalSampleDB();
    // TestNG argument order is (actual, expected)
    Assert.assertEquals(db.getSamples(), testPEDSamples);
}
/** A pedigree file that does not exist must be rejected with a UserException. */
@Test(expectedExceptions = UserException.class)
public void loadNonExistentFile() {
    // the call itself is expected to throw, so no further statements or assertions belong here
    builder.addSamplesFromPedigreeFiles(Arrays.asList(new File("non-existence-file.txt")));
}
/**
 * Loading the PED file and then a pedigree string that gives "kid" a different gender
 * must fail with a UserException.
 */
@Test(expectedExceptions = UserException.class)
public void loadInconsistentData() {
    // the @BeforeMethod hook has already installed a fresh STRICT builder
    builder.addSamplesFromPedigreeFiles(Arrays.asList(testPED));
    builder.addSamplesFromPedigreeStrings(Arrays.asList(testPEDStringInconsistentGender));
    builder.getFinalSampleDB();
}
/**
 * The SAM header contributes kid, mom and dad, while the pedigree string used here only
 * describes kid. Building the final DB under STRICT validation is therefore expected to
 * throw a UserException for a data-source sample that has no pedigree record.
 */
@Test(expectedExceptions = UserException.class)
public void sampleInSAMHeaderNotInSamplesDB() {
addSAMHeader();
// this pedigree string has a single line, for "kid" only
builder.addSamplesFromPedigreeStrings(Arrays.asList(testPEDStringInconsistentGender));
// validation runs inside getFinalSampleDB()
builder.getFinalSampleDB();
}
}