On path to SampleDB engine integration

-- PedReader tag parser
-- Separation of SampleDBBuilder from SampleDB (now immutable)
-- Removed old sample engine arguments
This commit is contained in:
Mark DePristo 2011-10-03 12:08:07 -07:00
parent 8ee0f91904
commit dd71884b0c
7 changed files with 304 additions and 60 deletions

View File

@ -32,6 +32,7 @@ import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.gatk.DownsampleType;
import org.broadinstitute.sting.gatk.DownsamplingMethod;
import org.broadinstitute.sting.gatk.phonehome.GATKRunReport;
import org.broadinstitute.sting.gatk.samples.PedigreeValidationType;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
@ -44,10 +45,7 @@ import org.simpleframework.xml.stream.HyphenStyle;
import java.io.File;
import java.io.InputStream;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.*;
/**
* @author aaron
@ -72,11 +70,6 @@ public class GATKArgumentCollection {
@Input(fullName = "input_file", shortName = "I", doc = "SAM or BAM file(s)", required = false)
public List<String> samFiles = new ArrayList<String>();
// parameters and their defaults
@ElementList(required = false)
@Argument(fullName = "sample_metadata", shortName = "SM", doc = "Sample file(s) in JSON format", required = false)
public List<File> sampleFiles = new ArrayList<File>();
@Element(required = false)
@Argument(fullName = "read_buffer_size", shortName = "rbs", doc="Number of reads per SAM file to buffer in memory", required = false)
public Integer readBufferSize = null;
@ -215,9 +208,25 @@ public class GATKArgumentCollection {
// --------------------------------------------------------------------------------------------------------------
//
// distributed GATK arguments
// PED (pedigree) support
//
// --------------------------------------------------------------------------------------------------------------
/**
* Pedigree information for the samples, as given with --pedigree / -ped. The default is an
* immutable empty list (Collections.emptyList()).
*
* NOTE(review): SampleDBBuilder.addSamples(List&lt;String&gt;) treats an entry as a PED file when a
* file of that name exists and otherwise parses the entry itself as PED text, reading the
* entry's positional tags as PedReader.MissingPedField names -- presumably this list is what
* is handed to it; confirm against the engine.
*
* TODO (MARK): add documentation details
*/
@Argument(fullName="pedigree", shortName = "ped", doc="Pedigree file / string for samples",required=false)
public List<String> pedigreeData = Collections.emptyList();
/**
* How strictly the pedigree information is validated; defaults to PedigreeValidationType.STRICT.
*/
@Argument(fullName="pedigreeValidationType", shortName = "pedValidationType", doc="How strict should we be in validating the pedigree information?",required=false)
public PedigreeValidationType pedigreeValidationType = PedigreeValidationType.STRICT;
// --------------------------------------------------------------------------------------------------------------
//
// BAM indexing and sharding arguments
//
// --------------------------------------------------------------------------------------------------------------
@Element(required = false)
@Argument(fullName="allow_intervals_with_unindexed_bam",doc="Allow interval processing with an unsupported BAM. NO INTEGRATION TESTS are available. Use at your own risk.",required=false)
@Hidden

View File

@ -114,10 +114,42 @@ public class PedReader {
final static private Set<String> CATAGORICAL_TRAIT_VALUES = new HashSet<String>(Arrays.asList("-9", "0", "1", "2"));
final static private String commentMarker = "#";
/**
 * Flags naming the standard PED columns that are absent from the input records.
 *
 * A complete record such as
 *
 *   "fam1 kid dad mom 1 2"
 *
 * describes a male affected child and is read with a plain -ped x.ped argument to the GATK.
 * A reduced record such as
 *
 *   "fam1 kid 1"
 *
 * is read by tagging the argument: -ped:NO_PARENTS,NO_PHENOTYPE x.ped
 */
public enum MissingPedField {
    /** Records lack the first (FAMILY_ID) column; the family id is set to null / empty. */
    NO_FAMILY_ID,

    /** Records lack the paternal and maternal ID columns; the corresponding IDs are set to null. */
    NO_PARENTS,

    /** Records lack the GENDER column; the sex of each sample is set to UNKNOWN. */
    NO_SEX,

    /** Records lack the PHENOTYPE column; the phenotype of each sample is set to UNKNOWN. */
    NO_PHENOTYPE
}
@ -233,8 +265,6 @@ public class PedReader {
if ( mom != null ) samples.add(mom);
}
sampleDB.validate(samples);
return samples;
}
@ -253,4 +283,26 @@ public class PedReader {
} else
return null;
}
/**
 * Converts the tags attached to a pedigree engine argument into the set of PED fields
 * that are missing from that argument's records.
 *
 * Every tag must be exactly the name of a MissingPedField constant; the lookup goes through
 * MissingPedField.valueOf, so it is case-sensitive.
 *
 * @param arg the actual engine arg, used (via toString) for the UserException if there's an error
 * @param tags a list of string tags that should be converted to the MissingPedField value
 * @return a new EnumSet with one MissingPedField per distinct tag; empty when tags is empty
 * @throws UserException.BadArgumentValue if a tag is not the name of a MissingPedField constant
 */
public static final EnumSet<MissingPedField> parseMissingFieldTags(final Object arg, final List<String> tags) {
    final EnumSet<MissingPedField> missingFields = EnumSet.noneOf(MissingPedField.class);

    for ( final String tag : tags ) {
        try {
            missingFields.add(MissingPedField.valueOf(tag));
        } catch ( IllegalArgumentException e ) {
            // Print the allowed names through a collection: concatenating the values() array
            // directly would only show its type and identity hash.
            throw new UserException.BadArgumentValue(arg.toString(),
                    "Unknown tag " + tag + " allowed values are " + EnumSet.allOf(MissingPedField.class));
        }
    }

    return missingFields;
}
}

View File

@ -0,0 +1,34 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.samples;
/**
 * The levels offered for validating pedigree information, i.e. the possible answers to
 * "how strict should we be in validating the pedigree information?".
 *
 * NOTE(review): the validation logic is not part of this type, so the exact behavior
 * attached to each level should be confirmed where the levels are consumed.
 */
public enum PedigreeValidationType {
    STRICT,

    /**
     * NOTE(review): presumably a misspelling of "lenient". The spelling is part of the public
     * constant name, so it is kept as is.
     */
    LINIENT,

    SILENT
}

View File

@ -38,51 +38,9 @@ public class SampleDB {
}
/**
* Builds a database from a SAM file header and a list of sample files.
*
* @param header SAM file header whose samples are added first
* @param sampleFiles sample files whose contents are added after the header's samples
*/
public SampleDB(final SAMFileHeader header, final List<File> sampleFiles) {
// start from the state set up by the no-argument constructor
this();
// order matters: header samples first, then the sample files
addSamples(header);
addSamples(sampleFiles);
}
// --------------------------------------------------------------------------------
//
// Functions for adding samples to the DB
//
// TODO: these should be protected, really
//
// --------------------------------------------------------------------------------
/**
 * Ensures that every sample name found in the SAM file header has an entry in this database,
 * creating a new Sample for each name that has none yet.
 *
 * @param header SAM file header; its sample names are obtained via SampleUtils.getSAMFileSamples
 * @return this database
 */
protected SampleDB addSamples(SAMFileHeader header) {
    for (String name : SampleUtils.getSAMFileSamples(header)) {
        // names that already resolve to a sample are left untouched
        if (getSample(name) != null) {
            continue;
        }
        samples.put(name, new Sample(name, this));
    }
    return this;
}
/**
 * Adds the samples of each given file, one file after the other in list order.
 *
 * @param sampleFiles the sample files to add
 * @return this database
 */
protected SampleDB addSamples(final List<File> sampleFiles) {
    for (File oneFile : sampleFiles) {
        // delegate to the single-file overload
        addSamples(oneFile);
    }
    return this;
}
/**
 * Hook for integrating one sample file with the samples that are already present.
 * The body currently performs no parsing and simply hands back this database.
 *
 * @param sampleFile the sample file (not read by the current body)
 * @return this database
 */
protected SampleDB addSamples(File sampleFile) {
    return this;
}
/**
* Add a sample to the collection
* Protected function to add a single sample to the database
*
* @param sample to be added
*/
protected SampleDB addSample(Sample sample) {
@ -215,10 +173,14 @@ public class SampleDB {
// --------------------------------------------------------------------------------
public final void validate() {
validate(getSamples());
validate(getSamples(), PedigreeValidationType.STRICT);
}
public final void validate(Collection<Sample> samplesToCheck) {
public final void validate(PedigreeValidationType validationType) {
validate(getSamples(), validationType);
}
public final void validate(Collection<Sample> samplesToCheck, PedigreeValidationType validationType) {
// todo -- actually do an implementation
}
}

View File

@ -0,0 +1,121 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.samples;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMReadGroupRecord;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.*;
/**
 * Assembles a SampleDB step by step. Samples can be added from a SAM file header and from
 * pedigree arguments (PED files or PED text given directly); getFinalSampleDB() validates
 * the result and hands it out.
 */
public class SampleDBBuilder {
    PedigreeValidationType validationStrictness;
    final SampleDB sampleDB = new SampleDB();
    final GenomeAnalysisEngine engine;

    /**
     * Creates a builder around a fresh SampleDB.
     *
     * @param engine engine that is asked for the tags attached to each pedigree argument
     * @param validationStrictness level passed to SampleDB.validate when the database is finalized
     */
    public SampleDBBuilder(GenomeAnalysisEngine engine, PedigreeValidationType validationStrictness) {
        this.validationStrictness = validationStrictness;
        this.engine = engine;
    }

    /**
     * Creates a sample for every sample name in the SAM file header that the database
     * does not know yet.
     *
     * @param header SAM file header; its sample names are obtained via SampleUtils.getSAMFileSamples
     * @return this builder
     */
    public SampleDBBuilder addSamples(SAMFileHeader header) {
        for (String name : SampleUtils.getSAMFileSamples(header)) {
            // names that already resolve to a sample are left untouched
            if (sampleDB.getSample(name) != null) {
                continue;
            }
            addSample(new Sample(name, sampleDB));
        }
        return this;
    }

    /**
     * Adds the samples described by each pedigree argument. An argument that names an
     * existing file is read as a PED file; any other argument is parsed as PED text itself.
     *
     * @param pedigreeArguments the pedigree arguments, processed in list order
     * @return this builder
     */
    public SampleDBBuilder addSamples(final List<String> pedigreeArguments) {
        for (final String pedArg : pedigreeArguments) {
            final File candidate = new File(pedArg);
            if ( !candidate.exists() ) {
                addSamples(pedArg);
            } else {
                addSamples(candidate);
            }
        }
        return this;
    }

    /**
     * Reads one PED file into the database, using the missing-field tags of the argument.
     *
     * NOTE(review): the tags are looked up with this File object; when called from
     * addSamples(List&lt;String&gt;) that is a File freshly built from the argument string --
     * confirm that engine.getTags resolves it to the tags of the original argument.
     *
     * @param sampleFile the PED file to parse
     * @return this builder
     * @throws UserException.CouldNotReadInputFile if parsing reports a FileNotFoundException
     */
    protected SampleDBBuilder addSamples(File sampleFile) {
        try {
            new PedReader().parse(sampleFile, getMissingFields(sampleFile), sampleDB);
        } catch ( FileNotFoundException e ) {
            throw new UserException.CouldNotReadInputFile(sampleFile, e);
        }
        return this;
    }

    /**
     * Parses PED text given directly as an argument value into the database, using the
     * missing-field tags of that argument.
     *
     * @param string the PED text
     * @return this builder
     */
    protected SampleDBBuilder addSamples(final String string) {
        new PedReader().parse(string, getMissingFields(string), sampleDB);
        return this;
    }

    /**
     * Adds a single sample to the database under construction.
     *
     * @param sample to be added
     * @return this builder
     */
    protected SampleDBBuilder addSample(Sample sample) {
        sampleDB.addSample(sample);
        return this;
    }

    /**
     * Validates the database with the configured strictness and returns it.
     *
     * @return the database this builder has been filling
     */
    public SampleDB getFinalSampleDB() {
        sampleDB.validate(validationStrictness);
        return sampleDB;
    }

    /**
     * Translates the positional tags the engine holds for an argument into MissingPedField values.
     *
     * @param engineArg the engine argument whose tags are looked up
     * @return the set produced by PedReader.parseMissingFieldTags for those tags
     */
    public EnumSet<PedReader.MissingPedField> getMissingFields(final Object engineArg) {
        return PedReader.parseMissingFieldTags(engineArg, engine.getTags(engineArg).getPositionalTags());
    }
}

View File

@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.samples;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
@ -285,4 +286,69 @@ public class PedReaderUnitTest extends BaseTest {
parts.remove(field.ordinal());
return Utils.join("\t", parts);
}
// -----------------------------------------------------------------
// parsing tags
// -----------------------------------------------------------------
/**
 * One tag-parsing test case: the tags handed to PedReader.parseMissingFieldTags and the set
 * of MissingPedField values expected back. Both fields are fixed at construction.
 */
private class PedReaderTestTagParsing extends TestDataProvider {
    public final EnumSet<PedReader.MissingPedField> expected;
    public final List<String> tags;

    /**
     * @param tags the tags to parse
     * @param missingDesc the set the parser is expected to return for those tags
     */
    private PedReaderTestTagParsing(final List<String> tags, EnumSet<PedReader.MissingPedField> missingDesc) {
        // presumably registers this case under the class so getTests can return it -- see TestDataProvider
        super(PedReaderTestTagParsing.class);
        this.tags = tags;
        this.expected = missingDesc;
    }
}
/**
 * Builds the tag-parsing cases: no tags, each single tag, and combinations of two and
 * three tags.
 *
 * @return the cases collected for PedReaderTestTagParsing
 */
@DataProvider(name = "readerTestTagParsing")
public Object[][] createReaderTestTagParsing() {
    final PedReader.MissingPedField noFamilyId = PedReader.MissingPedField.NO_FAMILY_ID;
    final PedReader.MissingPedField noParents = PedReader.MissingPedField.NO_PARENTS;
    final PedReader.MissingPedField noPhenotype = PedReader.MissingPedField.NO_PHENOTYPE;
    final PedReader.MissingPedField noSex = PedReader.MissingPedField.NO_SEX;

    // no tags at all
    new PedReaderTestTagParsing(Collections.<String>emptyList(), EnumSet.noneOf(PedReader.MissingPedField.class));

    // every tag on its own
    new PedReaderTestTagParsing(Arrays.asList("NO_FAMILY_ID"), EnumSet.of(noFamilyId));
    new PedReaderTestTagParsing(Arrays.asList("NO_PARENTS"), EnumSet.of(noParents));
    new PedReaderTestTagParsing(Arrays.asList("NO_PHENOTYPE"), EnumSet.of(noPhenotype));
    new PedReaderTestTagParsing(Arrays.asList("NO_SEX"), EnumSet.of(noSex));

    // several tags together
    new PedReaderTestTagParsing(Arrays.asList("NO_SEX", "NO_PHENOTYPE"), EnumSet.of(noSex, noPhenotype));
    new PedReaderTestTagParsing(Arrays.asList("NO_SEX", "NO_PHENOTYPE", "NO_PARENTS"), EnumSet.of(noSex, noPhenotype, noParents));

    return PedReaderTestTagParsing.getTests(PedReaderTestTagParsing.class);
}
/**
 * Checks that the tags of each case parse to exactly the expected set of missing fields.
 *
 * @param test the case supplying the tags and the expected set
 */
@Test(enabled = true, dataProvider = "readerTestTagParsing")
public void testPedReaderTagParsing(PedReaderTestTagParsing test) {
    EnumSet<PedReader.MissingPedField> parsed = PedReader.parseMissingFieldTags("test", test.tags);
    // TestNG order is (actual, expected, message); the reverse mislabels the values on failure
    Assert.assertEquals(parsed, test.expected, "Failed to properly parse tags " + test.tags);
}
/**
 * A single unknown tag must be rejected with a UserException.
 */
@Test(enabled = true, expectedExceptions = UserException.class)
public void testPedReaderTagParsing1() {
    // the result is irrelevant: the call itself is expected to throw
    PedReader.parseMissingFieldTags("test", Arrays.asList("XXX"));
}
/**
 * An unknown tag must be rejected with a UserException even when it follows a valid tag.
 */
@Test(enabled = true, expectedExceptions = UserException.class)
public void testPedReaderTagParsing2() {
    // the result is irrelevant: the call itself is expected to throw
    PedReader.parseMissingFieldTags("test", Arrays.asList("NO_SEX", "XXX"));
}
}

View File

@ -14,7 +14,7 @@ import java.util.*;
* Date: Sep 9, 2010
* Time: 8:21:00 AM
*/
public class SampleDataSourceUnitTest extends BaseTest {
public class SampleDBUnitTest extends BaseTest {
// this empty header used to instantiate sampledatasource objects
private static SAMFileHeader header = new SAMFileHeader();
@ -24,6 +24,6 @@ public class SampleDataSourceUnitTest extends BaseTest {
// make sure samples are created from the SAM file correctly
@Test()
public void loadSAMSamplesTest() {
SampleDB s = new SampleDB(header, Collections.<File>emptyList());
//SampleDB s = new SampleDB(header);
}
}