On path to SampleDB engine integration
-- PedReader tag parser -- Separation of SampleDBBuilder from SampleDB (now immutable) -- Removed old sample engine arguments
This commit is contained in:
parent
8ee0f91904
commit
dd71884b0c
|
|
@ -32,6 +32,7 @@ import org.broadinstitute.sting.commandline.Input;
|
|||
import org.broadinstitute.sting.gatk.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.DownsamplingMethod;
|
||||
import org.broadinstitute.sting.gatk.phonehome.GATKRunReport;
|
||||
import org.broadinstitute.sting.gatk.samples.PedigreeValidationType;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
|
||||
|
|
@ -44,10 +45,7 @@ import org.simpleframework.xml.stream.HyphenStyle;
|
|||
import java.io.File;
|
||||
import java.io.InputStream;
|
||||
import java.io.PrintStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* @author aaron
|
||||
|
|
@ -72,11 +70,6 @@ public class GATKArgumentCollection {
|
|||
@Input(fullName = "input_file", shortName = "I", doc = "SAM or BAM file(s)", required = false)
|
||||
public List<String> samFiles = new ArrayList<String>();
|
||||
|
||||
// parameters and their defaults
|
||||
@ElementList(required = false)
|
||||
@Argument(fullName = "sample_metadata", shortName = "SM", doc = "Sample file(s) in JSON format", required = false)
|
||||
public List<File> sampleFiles = new ArrayList<File>();
|
||||
|
||||
@Element(required = false)
|
||||
@Argument(fullName = "read_buffer_size", shortName = "rbs", doc="Number of reads per SAM file to buffer in memory", required = false)
|
||||
public Integer readBufferSize = null;
|
||||
|
|
@ -215,9 +208,25 @@ public class GATKArgumentCollection {
|
|||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// distributed GATK arguments
|
||||
// PED (pedigree) support
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* MARK: add documentation details
|
||||
*/
|
||||
@Argument(fullName="pedigree", shortName = "ped", doc="Pedigree file / string for samples",required=false)
|
||||
public List<String> pedigreeData = Collections.emptyList();
|
||||
|
||||
@Argument(fullName="pedigreeValidationType", shortName = "pedValidationType", doc="How strict should we be in validating the pedigree information?",required=false)
|
||||
public PedigreeValidationType pedigreeValidationType = PedigreeValidationType.STRICT;
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// BAM indexing and sharding arguments
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
@Element(required = false)
|
||||
@Argument(fullName="allow_intervals_with_unindexed_bam",doc="Allow interval processing with an unsupported BAM. NO INTEGRATION TESTS are available. Use at your own risk.",required=false)
|
||||
@Hidden
|
||||
|
|
|
|||
|
|
@ -114,10 +114,42 @@ public class PedReader {
|
|||
final static private Set<String> CATAGORICAL_TRAIT_VALUES = new HashSet<String>(Arrays.asList("-9", "0", "1", "2"));
|
||||
final static private String commentMarker = "#";
|
||||
|
||||
/**
 * Flags describing which of the standard PED columns are absent from the
 * records being read.
 *
 * A complete record such as
 *
 *   "fam1 kid dad mom 1 2"
 *
 * describes a male affected child and is read with the plain -ped x.ped
 * argument to the GATK.  A reduced record such as
 *
 *   "fam1 kid 1"
 *
 * has neither parent IDs nor a phenotype, and is read with
 * -ped:NO_PARENTS,NO_PHENOTYPE x.ped
 */
public enum MissingPedField {
    /** Records lack the first (FAMILY_ID) column; the family id will be set to null / empty. */
    NO_FAMILY_ID,

    /** Records lack the paternal and maternal ID columns; the corresponding IDs are set to null. */
    NO_PARENTS,

    /** Records lack the GENDER column; the sex of each sample will be set to UNKNOWN. */
    NO_SEX,

    /** Records lack the PHENOTYPE column; the phenotype of each sample will be set to UNKNOWN. */
    NO_PHENOTYPE
}
|
||||
|
||||
|
|
@ -233,8 +265,6 @@ public class PedReader {
|
|||
if ( mom != null ) samples.add(mom);
|
||||
}
|
||||
|
||||
|
||||
sampleDB.validate(samples);
|
||||
return samples;
|
||||
}
|
||||
|
||||
|
|
@ -253,4 +283,26 @@ public class PedReader {
|
|||
} else
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses a list of tags from the command line, assuming it comes from the GATK Engine
|
||||
* tags, and returns the corresponding EnumSet.
|
||||
*
|
||||
* @param arg the actual engine arg, used for the UserException if there's an error
|
||||
* @param tags a list of string tags that should be converted to the MissingPedField value
|
||||
* @return
|
||||
*/
|
||||
public static final EnumSet<MissingPedField> parseMissingFieldTags(final Object arg, final List<String> tags) {
|
||||
final EnumSet<MissingPedField> missingFields = EnumSet.noneOf(MissingPedField.class);
|
||||
|
||||
for ( final String tag : tags ) {
|
||||
try {
|
||||
missingFields.add(MissingPedField.valueOf(tag));
|
||||
} catch ( IllegalArgumentException e ) {
|
||||
throw new UserException.BadArgumentValue(arg.toString(), "Unknown tag " + tag + " allowed values are " + MissingPedField.values());
|
||||
}
|
||||
}
|
||||
|
||||
return missingFields;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,34 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.samples;
|
||||
|
||||
/**
 * Selects how strictly pedigree information is validated.
 *
 * NOTE(review): the behaviour attached to each level is not visible from this
 * file -- confirm against the validation code before relying on it.
 */
public enum PedigreeValidationType {
    STRICT,
    // NOTE(review): presumably meant to read "LENIENT"; the spelling is part of the
    // public constant name, so it is left unchanged here.
    LINIENT,
    SILENT
}
|
||||
|
|
@ -38,51 +38,9 @@ public class SampleDB {
|
|||
|
||||
}
|
||||
|
||||
public SampleDB(final SAMFileHeader header, final List<File> sampleFiles) {
|
||||
this();
|
||||
addSamples(header);
|
||||
addSamples(sampleFiles);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Functions for adding samples to the DB
|
||||
//
|
||||
// TODO: these should be protected, really
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Hallucinates sample objects for all the samples in the SAM file and stores them
|
||||
*/
|
||||
protected SampleDB addSamples(SAMFileHeader header) {
|
||||
for (String sampleName : SampleUtils.getSAMFileSamples(header)) {
|
||||
if (getSample(sampleName) == null) {
|
||||
Sample newSample = new Sample(sampleName, this);
|
||||
samples.put(sampleName, newSample);
|
||||
}
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
protected SampleDB addSamples(final List<File> sampleFiles) {
|
||||
// add files consecutively
|
||||
for (File file : sampleFiles) {
|
||||
addSamples(file);
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse one sample file and integrate it with samples that are already there
|
||||
* Fail quickly if we find any errors in the file
|
||||
*/
|
||||
protected SampleDB addSamples(File sampleFile) {
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a sample to the collection
|
||||
* Protected function to add a single sample to the database
|
||||
*
|
||||
* @param sample to be added
|
||||
*/
|
||||
protected SampleDB addSample(Sample sample) {
|
||||
|
|
@ -215,10 +173,14 @@ public class SampleDB {
|
|||
// --------------------------------------------------------------------------------
|
||||
|
||||
public final void validate() {
|
||||
validate(getSamples());
|
||||
validate(getSamples(), PedigreeValidationType.STRICT);
|
||||
}
|
||||
|
||||
public final void validate(Collection<Sample> samplesToCheck) {
|
||||
public final void validate(PedigreeValidationType validationType) {
|
||||
validate(getSamples(), validationType);
|
||||
}
|
||||
|
||||
public final void validate(Collection<Sample> samplesToCheck, PedigreeValidationType validationType) {
|
||||
// todo -- actually do an implementation
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,121 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.samples;
|
||||
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
public class SampleDBBuilder {
|
||||
PedigreeValidationType validationStrictness;
|
||||
final SampleDB sampleDB = new SampleDB();
|
||||
final GenomeAnalysisEngine engine;
|
||||
|
||||
/**
|
||||
* Constructor takes both a SAM header and sample files because the two must be integrated.
|
||||
*/
|
||||
public SampleDBBuilder(GenomeAnalysisEngine engine, PedigreeValidationType validationStrictness) {
|
||||
this.engine = engine;
|
||||
this.validationStrictness = validationStrictness;
|
||||
}
|
||||
|
||||
/**
|
||||
* Hallucinates sample objects for all the samples in the SAM file and stores them
|
||||
*/
|
||||
public SampleDBBuilder addSamples(SAMFileHeader header) {
|
||||
for (String sampleName : SampleUtils.getSAMFileSamples(header)) {
|
||||
if (sampleDB.getSample(sampleName) == null) {
|
||||
final Sample newSample = new Sample(sampleName, sampleDB);
|
||||
addSample(newSample);
|
||||
}
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
public SampleDBBuilder addSamples(final List<String> pedigreeArguments) {
|
||||
for (final String ped : pedigreeArguments) {
|
||||
final File pedFile = new File(ped);
|
||||
if ( pedFile.exists() )
|
||||
addSamples(pedFile);
|
||||
else
|
||||
addSamples(ped);
|
||||
}
|
||||
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse one sample file and integrate it with samples that are already there
|
||||
* Fail quickly if we find any errors in the file
|
||||
*/
|
||||
protected SampleDBBuilder addSamples(File sampleFile) {
|
||||
final PedReader reader = new PedReader();
|
||||
|
||||
try {
|
||||
reader.parse(sampleFile, getMissingFields(sampleFile), sampleDB);
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new UserException.CouldNotReadInputFile(sampleFile, e);
|
||||
}
|
||||
|
||||
return this;
|
||||
}
|
||||
|
||||
protected SampleDBBuilder addSamples(final String string) {
|
||||
final PedReader reader = new PedReader();
|
||||
reader.parse(string, getMissingFields(string), sampleDB);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a sample to the collection
|
||||
* @param sample to be added
|
||||
*/
|
||||
protected SampleDBBuilder addSample(Sample sample) {
|
||||
sampleDB.addSample(sample);
|
||||
return this;
|
||||
}
|
||||
|
||||
public SampleDB getFinalSampleDB() {
|
||||
sampleDB.validate(validationStrictness);
|
||||
return sampleDB;
|
||||
}
|
||||
|
||||
public EnumSet<PedReader.MissingPedField> getMissingFields(final Object engineArg) {
|
||||
final List<String> posTags = engine.getTags(engineArg).getPositionalTags();
|
||||
return PedReader.parseMissingFieldTags(engineArg, posTags);
|
||||
}
|
||||
}
|
||||
|
|
@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.samples;
|
|||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
|
@ -285,4 +286,69 @@ public class PedReaderUnitTest extends BaseTest {
|
|||
parts.remove(field.ordinal());
|
||||
return Utils.join("\t", parts);
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------
|
||||
// parsing tags
|
||||
// -----------------------------------------------------------------
|
||||
|
||||
private class PedReaderTestTagParsing extends TestDataProvider {
|
||||
public EnumSet<PedReader.MissingPedField> expected;
|
||||
public final List<String> tags;
|
||||
|
||||
private PedReaderTestTagParsing(final List<String> tags, EnumSet<PedReader.MissingPedField> missingDesc) {
|
||||
super(PedReaderTestTagParsing.class);
|
||||
this.tags = tags;
|
||||
this.expected = missingDesc;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "readerTestTagParsing")
|
||||
public Object[][] createReaderTestTagParsing() {
|
||||
new PedReaderTestTagParsing(
|
||||
Collections.<String>emptyList(),
|
||||
EnumSet.noneOf(PedReader.MissingPedField.class));
|
||||
|
||||
new PedReaderTestTagParsing(
|
||||
Arrays.asList("NO_FAMILY_ID"),
|
||||
EnumSet.of(PedReader.MissingPedField.NO_FAMILY_ID));
|
||||
|
||||
new PedReaderTestTagParsing(
|
||||
Arrays.asList("NO_PARENTS"),
|
||||
EnumSet.of(PedReader.MissingPedField.NO_PARENTS));
|
||||
|
||||
new PedReaderTestTagParsing(
|
||||
Arrays.asList("NO_PHENOTYPE"),
|
||||
EnumSet.of(PedReader.MissingPedField.NO_PHENOTYPE));
|
||||
|
||||
new PedReaderTestTagParsing(
|
||||
Arrays.asList("NO_SEX"),
|
||||
EnumSet.of(PedReader.MissingPedField.NO_SEX));
|
||||
|
||||
new PedReaderTestTagParsing(
|
||||
Arrays.asList("NO_SEX", "NO_PHENOTYPE"),
|
||||
EnumSet.of(PedReader.MissingPedField.NO_SEX, PedReader.MissingPedField.NO_PHENOTYPE));
|
||||
|
||||
new PedReaderTestTagParsing(
|
||||
Arrays.asList("NO_SEX", "NO_PHENOTYPE", "NO_PARENTS"),
|
||||
EnumSet.of(PedReader.MissingPedField.NO_SEX, PedReader.MissingPedField.NO_PHENOTYPE, PedReader.MissingPedField.NO_PARENTS));
|
||||
|
||||
return PedReaderTestTagParsing.getTests(PedReaderTestTagParsing.class);
|
||||
}
|
||||
|
||||
@Test(enabled = true, dataProvider = "readerTestTagParsing")
|
||||
public void testPedReaderTagParsing(PedReaderTestTagParsing test) {
|
||||
EnumSet<PedReader.MissingPedField> parsed = PedReader.parseMissingFieldTags("test", test.tags);
|
||||
Assert.assertEquals(test.expected, parsed, "Failed to properly parse tags " + test.tags);
|
||||
}
|
||||
|
||||
@Test(enabled = true, expectedExceptions = UserException.class)
|
||||
public void testPedReaderTagParsing1() {
|
||||
EnumSet<PedReader.MissingPedField> parsed = PedReader.parseMissingFieldTags("test", Arrays.asList("XXX"));
|
||||
}
|
||||
|
||||
@Test(enabled = true, expectedExceptions = UserException.class)
|
||||
public void testPedReaderTagParsing2() {
|
||||
EnumSet<PedReader.MissingPedField> parsed = PedReader.parseMissingFieldTags("test", Arrays.asList("NO_SEX", "XXX"));
|
||||
}
|
||||
}
|
||||
|
|
@ -14,7 +14,7 @@ import java.util.*;
|
|||
* Date: Sep 9, 2010
|
||||
* Time: 8:21:00 AM
|
||||
*/
|
||||
public class SampleDataSourceUnitTest extends BaseTest {
|
||||
public class SampleDBUnitTest extends BaseTest {
|
||||
// this empty header used to instantiate sampledatasource objects
|
||||
private static SAMFileHeader header = new SAMFileHeader();
|
||||
|
||||
|
|
@ -24,6 +24,6 @@ public class SampleDataSourceUnitTest extends BaseTest {
|
|||
// make sure samples are created from the SAM file correctly
|
||||
@Test()
|
||||
public void loadSAMSamplesTest() {
|
||||
SampleDB s = new SampleDB(header, Collections.<File>emptyList());
|
||||
//SampleDB s = new SampleDB(header);
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue