diff --git a/build.xml b/build.xml index 1f26e7b7a..f266ee7fa 100644 --- a/build.xml +++ b/build.xml @@ -146,12 +146,14 @@ + + @@ -546,8 +548,9 @@ - - + + + @@ -914,7 +917,7 @@ - + diff --git a/public/R/src/gsalib/R/gsa.read.gatkreport.R b/public/R/src/gsalib/R/gsa.read.gatkreport.R index 011b5240d..46bbf7eda 100644 --- a/public/R/src/gsalib/R/gsa.read.gatkreport.R +++ b/public/R/src/gsalib/R/gsa.read.gatkreport.R @@ -99,5 +99,5 @@ gsa.read.gatkreport <- function(filename) { .gsa.assignGATKTableToEnvironment(tableName, tableHeader, tableRows, tableEnv); } - gatkreport = as.list(tableEnv); + gatkreport = as.list(tableEnv, all.names=TRUE); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 972943e26..a35cd3690 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -34,28 +34,23 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.datasources.reads.*; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; -import org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource; +import org.broadinstitute.sting.gatk.samples.SampleDB; import org.broadinstitute.sting.gatk.executive.MicroScheduler; import org.broadinstitute.sting.gatk.filters.FilterManager; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.filters.ReadGroupBlackListFilter; import org.broadinstitute.sting.gatk.io.OutputTracker; import org.broadinstitute.sting.gatk.io.stubs.Stub; -import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; import 
org.broadinstitute.sting.gatk.refdata.utils.RMDIntervalGenerator; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; +import org.broadinstitute.sting.gatk.samples.SampleDBBuilder; import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; -import org.broadinstitute.sting.utils.SequenceDictionaryUtils; +import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.interval.IntervalUtils; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.File; import java.util.*; @@ -92,7 +87,7 @@ public class GenomeAnalysisEngine { /** * Accessor for sample metadata */ - private SampleDataSource sampleDataSource = null; + private SampleDB sampleDB = null; /** * Accessor for sharded reference-ordered data. @@ -206,6 +201,9 @@ public class GenomeAnalysisEngine { // Prepare the data for traversal. 
initializeDataSources(); + // initialize sampleDB + initializeSampleDB(); + // initialize and validate the interval list initializeIntervals(); validateSuppliedIntervals(); @@ -692,12 +690,22 @@ public class GenomeAnalysisEngine { for (ReadFilter filter : filters) filter.initialize(this); - sampleDataSource = new SampleDataSource(getSAMFileHeader(), argCollection.sampleFiles); - // set the sequence dictionary of all of Tribble tracks to the sequence dictionary of our reference rodDataSources = getReferenceOrderedDataSources(referenceMetaDataFiles,referenceDataSource.getReference().getSequenceDictionary(),genomeLocParser,argCollection.unsafe); } + /** + * Entry-point function to initialize the samples database from input data and pedigree arguments + */ + private void initializeSampleDB() { + SampleDBBuilder sampleDBBuilder = new SampleDBBuilder(this, argCollection.pedigreeValidationType); + sampleDBBuilder.addSamplesFromSAMHeader(getSAMFileHeader()); + sampleDBBuilder.addSamplesFromSampleNames(SampleUtils.getUniqueSamplesFromRods(this)); + sampleDBBuilder.addSamplesFromPedigreeFiles(argCollection.pedigreeFiles); + sampleDBBuilder.addSamplesFromPedigreeStrings(argCollection.pedigreeStrings); + sampleDB = sampleDBBuilder.getFinalSampleDB(); + } + /** * Gets a unique identifier for the reader sourcing this read. * @param read Read to examine. @@ -716,100 +724,6 @@ public class GenomeAnalysisEngine { return getReadsDataSource().getSAMFile(id); } - /** - * Returns sets of samples present in the (merged) input SAM stream, grouped by readers (i.e. underlying - * individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list - * returned by this method will contain 3 elements (one for each reader), with each element being a set of sample names - * found in the corresponding bam file. 
- * - * @return Sets of samples in the merged input SAM stream, grouped by readers - */ - public List> getSamplesByReaders() { - Collection readers = getReadsDataSource().getReaderIDs(); - - List> sample_sets = new ArrayList>(readers.size()); - - for (SAMReaderID r : readers) { - - Set samples = new HashSet(1); - sample_sets.add(samples); - - for (SAMReadGroupRecord g : getReadsDataSource().getHeader(r).getReadGroups()) { - samples.add(g.getSample()); - } - } - - return sample_sets; - - } - - /** - * Returns sets of libraries present in the (merged) input SAM stream, grouped by readers (i.e. underlying - * individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list - * returned by this method will contain 3 elements (one for each reader), with each element being a set of library names - * found in the corresponding bam file. - * - * @return Sets of libraries present in the (merged) input SAM stream, grouped by readers - */ - public List> getLibrariesByReaders() { - - - Collection readers = getReadsDataSource().getReaderIDs(); - - List> lib_sets = new ArrayList>(readers.size()); - - for (SAMReaderID r : readers) { - - Set libs = new HashSet(2); - lib_sets.add(libs); - - for (SAMReadGroupRecord g : getReadsDataSource().getHeader(r).getReadGroups()) { - libs.add(g.getLibrary()); - } - } - - return lib_sets; - - } - - /** - * **** UNLESS YOU HAVE GOOD REASON TO, DO NOT USE THIS METHOD; USE getFileToReadGroupIdMapping() INSTEAD **** - * - * Returns sets of (remapped) read groups in input SAM stream, grouped by readers (i.e. underlying - * individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list - * returned by this method will contain 3 elements (one for each reader), with each element being a set of remapped read groups - * (i.e. as seen by read.getReadGroup().getReadGroupId() in the merged stream) that come from the corresponding bam file. 
- * - * @return sets of (merged) read group ids in order of input bams - */ - public List> getMergedReadGroupsByReaders() { - - - Collection readers = getReadsDataSource().getReaderIDs(); - - List> rg_sets = new ArrayList>(readers.size()); - - for (SAMReaderID r : readers) { - - Set groups = new HashSet(5); - rg_sets.add(groups); - - for (SAMReadGroupRecord g : getReadsDataSource().getHeader(r).getReadGroups()) { - if (getReadsDataSource().hasReadGroupCollisions()) { // Check if there were read group clashes with hasGroupIdDuplicates and if so: - // use HeaderMerger to translate original read group id from the reader into the read group id in the - // merged stream, and save that remapped read group id to associate it with specific reader - groups.add(getReadsDataSource().getReadGroupId(r, g.getReadGroupId())); - } else { - // otherwise, pass through the unmapped read groups since this is what Picard does as well - groups.add(g.getReadGroupId()); - } - } - } - - return rg_sets; - - } - /** * Now that all files are open, validate the sequence dictionaries of the reads vs. the reference vrs the reference ordered data (if available). * @@ -929,6 +843,18 @@ public class GenomeAnalysisEngine { return readsDataSource.getHeader(reader); } + /** + * Returns an ordered list of the unmerged SAM file headers known to this engine. + * @return list of header for each input SAM file, in command line order + */ + public List getSAMFileHeaders() { + final List headers = new ArrayList(); + for ( final SAMReaderID id : getReadsDataSource().getReaderIDs() ) { + headers.add(getReadsDataSource().getHeader(id)); + } + return headers; + } + /** * Gets the master sequence dictionary for this GATK engine instance * @return a never-null dictionary listing all of the contigs known to this engine instance @@ -947,8 +873,6 @@ public class GenomeAnalysisEngine { return this.readsDataSource; } - - /** * Sets the collection of GATK main application arguments. 
* @@ -1035,140 +959,14 @@ public class GenomeAnalysisEngine { return readsDataSource == null ? null : readsDataSource.getCumulativeReadMetrics(); } - public SampleDataSource getSampleMetadata() { - return this.sampleDataSource; - } + // ------------------------------------------------------------------------------------- + // + // code for working with Samples database + // + // ------------------------------------------------------------------------------------- - /** - * Get a sample by its ID - * If an alias is passed in, return the main sample object - * @param id sample id - * @return sample Object with this ID - */ - public Sample getSampleById(String id) { - return sampleDataSource.getSampleById(id); - } - - /** - * Get the sample for a given read group - * Must first look up ID for read group - * @param readGroup of sample - * @return sample object with ID from the read group - */ - public Sample getSampleByReadGroup(SAMReadGroupRecord readGroup) { - return sampleDataSource.getSampleByReadGroup(readGroup); - } - - /** - * Get a sample for a given read - * Must first look up read group, and then sample ID for that read group - * @param read of sample - * @return sample object of this read - */ - public Sample getSampleByRead(SAMRecord read) { - return getSampleByReadGroup(read.getReadGroup()); - } - - /** - * Get number of sample objects - * @return size of samples map - */ - public int sampleCount() { - return sampleDataSource.sampleCount(); - } - - /** - * Return all samples with a given family ID - * Note that this isn't terribly efficient (linear) - it may be worth adding a new family ID data structure for this - * @param familyId family ID - * @return Samples with the given family ID - */ - public Set getFamily(String familyId) { - return sampleDataSource.getFamily(familyId); - } - - /** - * Returns all children of a given sample - * See note on the efficiency of getFamily() - since this depends on getFamily() it's also not efficient - * @param sample 
parent sample - * @return children of the given sample - */ - public Set getChildren(Sample sample) { - return sampleDataSource.getChildren(sample); - } - - /** - * Gets all the samples - * @return - */ - public Collection getSamples() { - return sampleDataSource.getSamples(); - } - - /** - * Takes a list of sample names and returns their corresponding sample objects - * - * @param sampleNameList List of sample names - * @return Corresponding set of samples - */ - public Set getSamples(Collection sampleNameList) { - return sampleDataSource.getSamples(sampleNameList); - } - - - /** - * Returns a set of samples that have any value (which could be null) for a given property - * @param key Property key - * @return Set of samples with the property - */ - public Set getSamplesWithProperty(String key) { - return sampleDataSource.getSamplesWithProperty(key); - } - - /** - * Returns a set of samples that have a property with a certain value - * Value must be a string for now - could add a similar method for matching any objects in the future - * - * @param key Property key - * @param value String property value - * @return Set of samples that match key and value - */ - public Set getSamplesWithProperty(String key, String value) { - return sampleDataSource.getSamplesWithProperty(key, value); - - } - - /** - * Returns a set of sample objects for the sample names in a variant context - * - * @param context Any variant context - * @return a set of the sample objects - */ - public Set getSamplesByVariantContext(VariantContext context) { - Set samples = new HashSet(); - for (String sampleName : context.getSampleNames()) { - samples.add(sampleDataSource.getOrCreateSample(sampleName)); - } - return samples; - } - - /** - * Returns all samples that were referenced in the SAM file - */ - public Set getSAMFileSamples() { - return sampleDataSource.getSAMFileSamples(); - } - - /** - * Return a subcontext restricted to samples with a given property key/value - * Gets the sample names 
from key/value and relies on VariantContext.subContextFromGenotypes for the filtering - * @param context VariantContext to filter - * @param key property key - * @param value property value (must be string) - * @return subcontext - */ - public VariantContext subContextFromSampleProperty(VariantContext context, String key, String value) { - return sampleDataSource.subContextFromSampleProperty(context, key, value); + public SampleDB getSampleDB() { + return this.sampleDB; } public Map getApproximateCommandLineArguments(Object... argumentProviders) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index fd39d46b0..486868dc2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -32,6 +32,7 @@ import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.gatk.DownsampleType; import org.broadinstitute.sting.gatk.DownsamplingMethod; import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; +import org.broadinstitute.sting.gatk.samples.PedigreeValidationType; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.interval.IntervalMergingRule; @@ -44,10 +45,7 @@ import org.simpleframework.xml.stream.HyphenStyle; import java.io.File; import java.io.InputStream; import java.io.PrintStream; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; /** * @author aaron @@ -72,11 +70,6 @@ public class GATKArgumentCollection { @Input(fullName = "input_file", shortName = "I", doc = "SAM or BAM file(s)", required = false) public List samFiles = new ArrayList(); - // parameters and their defaults - @ElementList(required = 
false) - @Argument(fullName = "sample_metadata", shortName = "SM", doc = "Sample file(s) in JSON format", required = false) - public List sampleFiles = new ArrayList(); - @Element(required = false) @Argument(fullName = "read_buffer_size", shortName = "rbs", doc="Number of reads per SAM file to buffer in memory", required = false) public Integer readBufferSize = null; @@ -215,28 +208,91 @@ public class GATKArgumentCollection { // -------------------------------------------------------------------------------------------------------------- // - // distributed GATK arguments + // PED (pedigree) support // // -------------------------------------------------------------------------------------------------------------- - @Element(required=false) - @Argument(fullName="processingTracker",shortName="C",doc="A lockable, shared file for coordinating distributed GATK runs",required=false) - @Hidden - public File processingTrackerFile = null; - @Element(required=false) - @Argument(fullName="restartProcessingTracker",shortName="RPT",doc="Should we delete the processing tracker file at startup?",required=false) - @Hidden - public boolean restartProcessingTracker = false; + /** + *

Reads PED file-formatted tabular text files describing meta-data about the samples being + * processed in the GATK.

+ * + * + * + *

The PED file is a white-space (space or tab) delimited file: the first six columns are mandatory:

+ * + *
    + *
  • Family ID
  • + *
  • Individual ID
  • + *
  • Paternal ID
  • + *
  • Maternal ID
  • + *
  • Sex (1=male; 2=female; other=unknown)
  • + *
  • Phenotype
  • + *
+ * + *

The IDs are alphanumeric: the combination of family and individual ID should uniquely identify a person. + * A PED file must have 1 and only 1 phenotype in the sixth column. The phenotype can be either a + * quantitative trait or an affection status column: GATK will automatically detect which type + * (i.e. based on whether a value other than 0, 1, 2 or the missing genotype code is observed).

+ * + *

If an individual's sex is unknown, then any character other than 1 or 2 can be used.

+ * + *

You can add a comment to a PED or MAP file by starting the line with a # character. The rest of that + * line will be ignored. Therefore, do not start any family IDs with this character.

+ * + *

Affection status should be coded:

+ * + *
    + *
  • -9 missing
  • + *
  • 0 missing
  • + *
  • 1 unaffected
  • + *
  • 2 affected
  • + *
+ * + *

If any value outside of -9,0,1,2 is detected, then the phenotype values + * are interpreted as string phenotype values. In this case -9 uniquely + * represents the missing value.

+ * + *

Genotypes (column 7 onwards) cannot be specified to the GATK.

+ * + *

For example, here are two individuals (one row = one person):

+ * + *
+     *   FAM001  1  0 0  1  2
+     *   FAM001  2  0 0  1  2
+     * 
+ * + *

Each -ped argument can be tagged with NO_FAMILY_ID, NO_PARENTS, NO_SEX, NO_PHENOTYPE to + * tell the GATK PED parser that the corresponding fields are missing from the ped file.

+ * + *

Note that most GATK walkers do not use pedigree information. Walkers that require pedigree + * data should clearly indicate so in their arguments and will throw errors if required pedigree + * information is missing.

+ */ + @Argument(fullName="pedigree", shortName = "ped", doc="Pedigree files for samples",required=false) + public List pedigreeFiles = Collections.emptyList(); - @Element(required=false) - @Argument(fullName="processingTrackerStatusFile",shortName="CSF",doc="If provided, a detailed accounting of the state of the process tracker is written to this file. For debugging, only",required=false) - @Hidden - public File processingTrackerStatusFile = null; + /** + * Inline PED records (see -ped argument). Each -pedString STRING can contain one or more + * valid PED records (see -ped) separated by semi-colons. Supports all tags for each pedString + * as -ped supports + */ + @Argument(fullName="pedigreeString", shortName = "pedString", doc="Pedigree string for samples",required=false) + public List pedigreeStrings = Collections.emptyList(); - @Element(required=false) - @Argument(fullName="processingTrackerID",shortName="CID",doc="If provided, an integer ID (starting at 1) indicating a unique id for this process within the distributed GATK group",required=false) - @Hidden - public int processTrackerID = -1; + /** + * How strict should we be in parsing the PED files? + */ + @Argument(fullName="pedigreeValidationType", shortName = "pedValidationType", doc="How strict should we be in validating the pedigree information?",required=false) + public PedigreeValidationType pedigreeValidationType = PedigreeValidationType.STRICT; + + // -------------------------------------------------------------------------------------------------------------- + // + // BAM indexing and sharding arguments + // + // -------------------------------------------------------------------------------------------------------------- @Element(required = false) @Argument(fullName="allow_intervals_with_unindexed_bam",doc="Allow interval processing with an unsupported BAM. NO INTEGRATION TESTS are available. 
Use at your own risk.",required=false) @@ -387,7 +443,7 @@ public class GATKArgumentCollection { return false; } if ((other.RODToInterval == null && RODToInterval != null) || - (other.RODToInterval != null && !other.RODToInterval.equals(RODToInterval))) { + (other.RODToInterval != null && !other.RODToInterval.equals(RODToInterval))) { return false; } @@ -405,20 +461,6 @@ public class GATKArgumentCollection { (other.performanceLog != null && !other.performanceLog.equals(this.performanceLog))) return false; - if ((other.processingTrackerFile == null && this.processingTrackerFile != null) || - (other.processingTrackerFile != null && !other.processingTrackerFile.equals(this.processingTrackerFile))) - return false; - - if ((other.processingTrackerStatusFile == null && this.processingTrackerStatusFile != null) || - (other.processingTrackerStatusFile != null && !other.processingTrackerStatusFile.equals(this.processingTrackerStatusFile))) - return false; - - if ( restartProcessingTracker != other.restartProcessingTracker ) - return false; - - if ( processTrackerID != other.processTrackerID ) - return false; - if (allowIntervalsWithUnindexedBAM != other.allowIntervalsWithUnindexedBAM) return false; diff --git a/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java b/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java index 1f9a7d705..c9506ec4c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java @@ -26,7 +26,6 @@ package org.broadinstitute.sting.gatk.contexts; import net.sf.samtools.SAMReadGroupRecord; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -76,14 +75,6 @@ public class 
AlignmentContextUtils { return splitContextBySampleName(context, null); } - public static Map splitContextBySample(AlignmentContext context) { - Map m = new HashMap(); - for ( Map.Entry entry : splitContextBySampleName(context, null).entrySet() ) { - m.put(new Sample(entry.getKey()), entry.getValue()); - } - return m; - } - /** * Splits the given AlignmentContext into a StratifiedAlignmentContext per sample, but referencd by sample name instead * of sample object. @@ -97,8 +88,8 @@ public class AlignmentContextUtils { GenomeLoc loc = context.getLocation(); HashMap contexts = new HashMap(); - for(String sample: context.getPileup().getSampleNames()) { - ReadBackedPileup pileupBySample = context.getPileup().getPileupForSampleName(sample); + for(String sample: context.getPileup().getSamples()) { + ReadBackedPileup pileupBySample = context.getPileup().getPileupForSample(sample); // Don't add empty pileups to the split context. if(pileupBySample.size() == 0) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/PropertyDefinition.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/PropertyDefinition.java deleted file mode 100644 index 433e0af40..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/PropertyDefinition.java +++ /dev/null @@ -1,30 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.sample; - -/** - * Created by IntelliJ IDEA. 
- * User: brett - * Date: Aug 12, 2010 - * Time: 2:09:16 PM - */ -public class PropertyDefinition { - - String property; - - String[] values; - - public String getProperty() { - return property; - } - - public void setProperty(String property) { - this.property = property; - } - - public String[] getValues() { - return values; - } - - public void setValues(String[] values) { - this.values = values; - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/Sample.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/Sample.java deleted file mode 100644 index ca8756684..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/Sample.java +++ /dev/null @@ -1,203 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.sample; - - -import org.broadinstitute.sting.utils.exceptions.StingException; - -import java.util.Collections; -import java.util.HashMap; -import java.util.Map; - -/** - * Created by IntelliJ IDEA. 
- * User: brett - * Date: Jul 26, 2010 - * Time: 3:31:38 PM - */ -public class Sample implements java.io.Serializable { - - private final String id; - - private boolean hasSampleFileEntry = false; // true if this sample has an entry in a sample file - - private boolean hasSAMFileEntry = false; // true if this sample has an entry in the SAM file - - private HashMap properties = new HashMap(); - - private HashMap relationships = new HashMap(); - - public enum Gender { - MALE, - FEMALE, - UNKNOWN - } - - public Sample(String id) { -/* if (id == null) { - throw new StingException("Error creating sample: sample ID cannot be null"); - }*/ - this.id = id; - } - - public String getId() { - return this.id; - } - - public Map getProperties() { - return properties; - } - - public void setProperties(Map properties) { - this.properties = (HashMap) properties; - } - - public Map getRelationships() { - return Collections.unmodifiableMap(this.relationships); - } - - public void setSampleFileEntry(boolean value) { - this.hasSampleFileEntry = value; - } - - public boolean hasSAMFileEntry() { - return this.hasSAMFileEntry; - } - - public void setSAMFileEntry(boolean value) { - this.hasSAMFileEntry = value; - } - - public boolean hasSampleFileEntry() { - return this.hasSampleFileEntry; - } - - /** - * Get one property - * @param key key of property - * @return value of property as generic object - */ - public Object getProperty(String key) { - return properties.get(key); - } - - /** - * Set a property - * If property already exists, it is overwritten - * @param key key of property - * @param value object to be stored in properties array - */ - public void setProperty(String key, Object value) { - - if (relationships.containsKey(key)) { - throw new StingException("The same key cannot exist as a property and a relationship"); - } - - if (key.equals("gender") && value.getClass() != Gender.class) { - throw new StingException("'gender' property must be of type Sample.Gender"); - } - - if 
(key.equals("population") && value.getClass() != String.class) { - throw new StingException("'population' property must be of type String"); - } - - properties.put(key, value); - } - - /** - * Get one relationship - * @param key of relationship - * @return Sample object that this relationship points to - */ - public Sample getRelationship(String key) { - return relationships.get(key); - } - - /** - * Set one relationship - * If already set, it is overwritten - * @param key key of the relationship - * @param value Sample object this relationship points to - */ - public void setRelationship(String key, Sample value) { - if (properties.containsKey(key)) { - throw new StingException("The same key cannot exist as a property and a relationship"); - } - relationships.put(key, value); - } - - /** - * Get the sample's mother - * @return sample object with relationship mother, if exists, or null - */ - public Sample getMother() { - return getRelationship("mother"); - } - - /** - * Get the sample's father - * @return sample object with relationship father, if exists, or null - */ - public Sample getFather() { - return getRelationship("father"); - } - - /** - * Get gender of the sample - * @return property of key "gender" - must be of type Gender - */ - public Gender getGender() { - return (Gender) properties.get("gender"); - } - - public String getPopulation() { - return (String) properties.get("population"); - } - - public String getFamilyId() { - return (String) properties.get("familyId"); - } - - /** - * @return True if sample is male, false if female, unknown, or null - */ - public boolean isMale() { - return properties.get("gender") == Gender.MALE; - } - - /** - * @return True if sample is female, false if male, unknown or null - */ - public boolean isFemale() { - return properties.get("gender") == Gender.MALE; - } - - /** - * - * @param key property key - * @return true if sample has this property (even if its value is null) - */ - public boolean hasProperty(String key) 
{ - return properties.containsKey(key); - } - - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - - Sample sample = (Sample) o; - - if (hasSAMFileEntry != sample.hasSAMFileEntry) return false; - if (hasSampleFileEntry != sample.hasSampleFileEntry) return false; - if (id != null ? !id.equals(sample.id) : sample.id != null) return false; - if (properties != null ? !properties.equals(sample.properties) : sample.properties != null) return false; - if (relationships != null ? !relationships.equals(sample.relationships) : sample.relationships != null) - return false; - - return true; - } - - @Override - public int hashCode() { - return id != null ? id.hashCode() : "".hashCode(); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleAlias.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleAlias.java deleted file mode 100644 index ce749cb83..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleAlias.java +++ /dev/null @@ -1,31 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.sample; - -/** - * Created by IntelliJ IDEA. 
- * User: brett - * Date: Aug 13, 2010 - * Time: 5:13:46 PM - */ -public class SampleAlias { - - String mainId; - - String[] otherIds; - - public String getMainId() { - return mainId; - } - - public void setMainId(String mainId) { - this.mainId = mainId; - } - - public String[] getOtherIds() { - return otherIds; - } - - public void setOtherIds(String[] otherIds) { - this.otherIds = otherIds; - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSource.java deleted file mode 100644 index 067bf3f72..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSource.java +++ /dev/null @@ -1,590 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.sample; - -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMReadGroupRecord; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.exceptions.StingException; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.yaml.snakeyaml.TypeDescription; -import org.yaml.snakeyaml.Yaml; -import org.yaml.snakeyaml.constructor.Constructor; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileReader; -import java.io.IOException; -import java.util.*; - -/** - * Created by IntelliJ IDEA. - * User: brett - * Date: Jul 26, 2010 - * Time: 3:30:09 PM - * - * This class stores and manages sample metadata. This data is encoded in a sample file, which can be included - * in the GATK by the "--samples" argument. This class reads and parses those files. - * - * Although there are a set of public methods for accessing sample data, they aren't used by walkers - they are really - * only used by GenomeAnalysisEngine. An instance of GenomeAnalysisEngine has one SampleDataSource. 
When a walker - * wants to access sample data, it asks GenomeAnalysis to fetch this data from its SampleDataSource. - * - */ -public class SampleDataSource { - - /** - * SAMFileHeader that has been created for this analysis. - */ - private SAMFileHeader header; - - /** - * This is where Sample objects are stored. Samples are usually accessed by their ID, which is unique, so - * this is stored as a HashMap. - */ - private final HashMap samples = new HashMap(); - - /** - * Samples can have "aliases", because sometimes the same sample is referenced by different IDs in different - * datasets. If this is the case, one ID is the "primary ID" and others are "aliases". - * - * This maps ID => primary ID for all samples ID strings - both primary IDs and aliases. - */ - private HashMap sampleAliases = new HashMap(); - - /** - * While loading sample files, we must be aware of "special" properties and relationships that are always allowed - */ - public static final String[] specialProperties = new String[] {"familyId", "population", "gender"}; - public static final String[] specialRelationships = new String[] {"mother", "father"}; - - /** - * Constructor takes both a SAM header and sample files because the two must be integrated. 
- * @param header SAMFileHeader that has been created for this analysis - * @param sampleFiles Sample files that were included on the command line - */ - public SampleDataSource(SAMFileHeader header, List sampleFiles) { - this(); - this.header = header; - // create empty sample object for each sample referenced in the SAM header - for (String sampleName : SampleUtils.getSAMFileSamples(header)) { - if (!hasSample(sampleName)) { - Sample newSample = new Sample(sampleName); - newSample.setSAMFileEntry(true); - samples.put(sampleName, newSample); - } - } - - // add files consecutively - if (sampleFiles != null) { - for (File file : sampleFiles) { - addFile(file); - } - } - } - - public SampleDataSource() { - samples.put(null, new Sample(null)); - } - - /** - * Hallucinates sample objects for all the samples in the SAM file and stores them - */ - public void addSamplesFromSAMHeader(SAMFileHeader header) { - for (String sampleName : SampleUtils.getSAMFileSamples(header)) { - if (!hasSample(sampleName)) { - Sample newSample = new Sample(sampleName); - newSample.setSAMFileEntry(true); - samples.put(sampleName, newSample); - } - } - } - - /** - * Parse one sample file and integrate it with samples that are already there - * Fail quickly if we find any errors in the file - */ - public void addFile(File sampleFile) { - - BufferedReader reader; - try { - reader = new BufferedReader(new FileReader(sampleFile)); - } - catch (IOException e) { - throw new StingException("Could not open sample file " + sampleFile.getAbsolutePath(), e); - } - - // set up YAML reader - a "Constructor" creates java object from YAML and "Loader" loads the file - Constructor con = new Constructor(SampleFileParser.class); - TypeDescription desc = new TypeDescription(SampleFileParser.class); - desc.putListPropertyType("propertyDefinitions", PropertyDefinition.class); - desc.putListPropertyType("sampleAliases", SampleAlias.class); - con.addTypeDescription(desc); - Yaml yaml = new Yaml(con); - - // 
SampleFileParser stores an object representation of a sample file - this is what we'll parse - SampleFileParser parser; - try { - parser = (SampleFileParser) yaml.load(reader); - } - catch (Exception e) { - throw new StingException("There was a syntactic error with the YAML in sample file " + sampleFile.getAbsolutePath(), e); - } - - // check to see which validation options were built into the file - boolean restrictProperties = parser.getAllowedProperties() != null; - boolean restrictRelationships = parser.getAllowedRelationships() != null; - boolean restrictPropertyValues = parser.getPropertyDefinitions() != null; - - // propertyValues stores the values that are allowed for a given property - HashMap propertyValues = null; - if (restrictPropertyValues) { - propertyValues = new HashMap(); - for (PropertyDefinition def : parser.getPropertyDefinitions()) { - HashSet set = new HashSet(); - for (String value : def.getValues()) { - set.add(value); - } - propertyValues.put(def.getProperty(), set); - } - } - - // make sure the aliases are valid - validateAliases(parser); - - // loop through each sample in the file - a SampleParser stores an object that will become a Sample - for (SampleParser sampleParser : parser.getSamples()) { - - try { - // step 1: add the sample if it doesn't already exist - Sample sample = getSampleById(sampleParser.getId()); - if (sample == null) { - sample = new Sample(sampleParser.getId()); - } - addSample(sample); - sample.setSampleFileEntry(true); - - // step 2: add the properties - if (sampleParser.getProperties() != null) { - for (String property : sampleParser.getProperties().keySet()) { - - // check that property is allowed - if (restrictProperties) { - if (!isPropertyValid(property, parser.getAllowedProperties())) { - throw new StingException(property + " is an invalid property. 
It is not included in the list " + - "of allowed properties."); - } - } - - // next check that the value is allowed - if (restrictPropertyValues) { - if (!isValueAllowed(property, sampleParser.getProperties().get(property), propertyValues)) { - throw new StingException("The value of property '" + property + "' is invalid. " + - "It is not included in the list of allowed values for this property."); - } - } - - // next check that there isn't already a conflicting property there - if (sample.getProperty(property) != null && - sample.getProperty(property) != sampleParser.getProperties().get(property)) - { - throw new StingException(property + " is a conflicting property!"); - } - - // checks are passed - now add the property! - saveProperty(sample, property, sampleParser.getProperties().get(property)); - } - } - - // step 3: add the relationships - if (sampleParser.getRelationships() != null) { - for (String relationship : sampleParser.getRelationships().keySet()) { - String relativeId = sampleParser.getRelationships().get(relationship); - if (relativeId == null) { - throw new StingException("The relationship cannot be null"); - } - - // first check that it's not invalid - if (restrictRelationships) { - if (!isRelationshipValid(relationship, parser.getAllowedRelationships())) { - throw new StingException(relationship + " is an invalid relationship"); - } - } - - // next check that there isn't already a conflicting property there - if (sample.getRelationship(relationship) != null) { - if (sample.getRelationship(relationship).getId() != sampleParser.getProperties().get(relationship)) { - throw new StingException(relationship + " is a conflicting relationship!"); - } - // if the relationship is already set - and consistent with what we're reading now - no need to continue - else { - continue; - } - } - - // checks are passed - now save the relationship - saveRelationship(sample, relationship, relativeId); - } - } - } catch (Exception e) { - throw new StingException("An 
error occurred while loading this sample from the sample file: " + - sampleParser.getId(), e); - } - } - - } - - private boolean isValueAllowed(String key, Object value, HashMap valuesList) { - - // if the property values weren't specified for this property, then any value is okay - if (!valuesList.containsKey(key)) { - return true; - } - - // if this property has enumerated values, it must be a string - else if (value.getClass() != String.class) - return false; - - // is the value specified or not? - else if (!valuesList.get(key).contains(value)) - return false; - - return true; - } - - /** - * Makes sure that the aliases are valid - * Checks that 1) no string is used as both a main ID and an alias; - * 2) no alias is used more than once - * @param parser - */ - private void validateAliases(SampleFileParser parser) { - - // no aliases sure validate - if (parser.getSampleAliases() == null) - return; - - HashSet mainIds = new HashSet(); - HashSet otherIds = new HashSet(); - - for (SampleAlias sampleAlias : parser.getSampleAliases()) { - mainIds.add(sampleAlias.getMainId()); - for (String otherId : sampleAlias.getOtherIds()) { - if (mainIds.contains(otherId)) - throw new StingException(String.format("The aliases in your sample file are invalid - the alias %s cannot " + - "be both a main ID and an other ID", otherId)); - - if (!otherIds.add(otherId)) - throw new StingException(String.format("The aliases in your sample file are invalid - %s is listed as an " + - "alias more than once.", otherId)); - } - } - } - - private boolean isPropertyValid(String property, String[] allowedProperties) { - - // is it a special property that is always allowed? - for (String allowedProperty : specialProperties) { - if (property.equals(allowedProperty)) - return true; - } - - // is it in the allowed properties list? 
- for (String allowedProperty : allowedProperties) { - if (property.equals(allowedProperty)) - return true; - } - - return false; - } - - private boolean isRelationshipValid(String relationship, String[] allowedRelationships) { - - // is it a special relationship that is always allowed? - for (String allowedRelationship : specialRelationships) { - if (relationship.equals(allowedRelationship)) - return true; - } - - // is it in the allowed properties list? - for (String allowedRelationship : allowedRelationships) { - if (relationship.equals(allowedRelationship)) - return true; - } - - return false; - } - - /** - * Saves a property as the correct type - * @param key property key - * @param value property value, as read from YAML parser - * @return property value to be stored - */ - private void saveProperty(Sample sample, String key, Object value) { - - // convert gender to the right type, if it was stored as a String - if (key.equals("gender")) { - if (((String) value).toLowerCase().equals("male")) { - value = Sample.Gender.MALE; - } - else if (((String) value).toLowerCase().equals("female")) { - value = Sample.Gender.FEMALE; - } - else if (((String) value).toLowerCase().equals("unknown")) { - value = Sample.Gender.UNKNOWN; - } - else if (value != null) { - throw new StingException("'gender' property must be male, female, or unknown."); - } - } - try { - sample.setProperty(key, value); - } - catch (Exception e) { - throw new StingException("Could not save property " + key, e); - } - } - - /** - * Saves a relationship as the correct type - * @param key relationship key - * @param relativeId sample ID string of the relative - * @return relationship value to be stored - */ - private void saveRelationship(Sample sample, String key, String relativeId) { - - // get the reference that we'll store as the value - Sample relative = getSampleById(relativeId); - - // create sample object for the relative, if necessary - if (relative == null) { - relative = new 
Sample(relativeId); - addSample(relative); - } - sample.setRelationship(key, relative); - } - - - - /** - * Filter a sample name in case it is an alias - * @param sampleId to be filtered - * @return ID of sample that stores data for this alias - */ - private String aliasFilter(String sampleId) { - if (!sampleAliases.containsKey(sampleId)) - return sampleId; - else - return sampleAliases.get(sampleId); - } - - /** - * Add a sample to the collection - * @param sample to be added - */ - private void addSample(Sample sample) { - samples.put(sample.getId(), sample); - } - - /** - * Check if sample with this ID exists - * Note that this will return true if name passed in is an alias - * @param id ID of sample to be checked - * @return true if sample exists; false if not - */ - public boolean hasSample(String id) { - return samples.get(aliasFilter(id)) != null; - } - - /** - * Get a sample by its ID - * If an alias is passed in, return the main sample object - * @param id - * @return sample Object with this ID - */ - public Sample getSampleById(String id) { - return samples.get(aliasFilter(id)); - } - - /** - * Get the sample for a given read group - * Must first look up ID for read group - * @param readGroup of sample - * @return sample object with ID from the read group - */ - public Sample getSampleByReadGroup(SAMReadGroupRecord readGroup) { - String nameFromReadGroup = readGroup.getSample(); - return getSampleById(nameFromReadGroup); - } - - /** - * Get a sample for a given read - * Must first look up read group, and then sample ID for that read group - * @param read of sample - * @return sample object of this read - */ - public Sample getSampleByRead(SAMRecord read) { - return getSampleByReadGroup(read.getReadGroup()); - } - - /** - * Get number of sample objects - * @return size of samples map - */ - public int sampleCount() { - return samples.size(); - } - - /** - * Return all samples with a given family ID - * Note that this isn't terribly efficient (linear) - it 
may be worth adding a new family ID data structure for this - * @param familyId - * @return - */ - public Set getFamily(String familyId) { - HashSet familyMembers = new HashSet(); - - for (Sample sample : samples.values()) { - if (sample.getFamilyId() != null) { - if (sample.getFamilyId().equals(familyId)) - familyMembers.add(sample); - } - } - return familyMembers; - } - - /** - * Returns all children of a given sample - * See note on the efficiency of getFamily() - since this depends on getFamily() it's also not efficient - * @param sample - * @return - */ - public Set getChildren(Sample sample) { - HashSet children = new HashSet(); - for (Sample familyMember : getFamily(sample.getFamilyId())) { - if (familyMember.getMother() == sample || familyMember.getFather() == sample) { - children.add(familyMember); - } - } - return children; - } - - public Set getSamples() { - HashSet set = new HashSet(); - set.addAll(samples.values()); - return set; - } - - /** - * Takes a collection of sample names and returns their corresponding sample objects - * Note that, since a set is returned, if you pass in a list with duplicates names there will not be any duplicates in the returned set - * @param sampleNameList Set of sample names - * @return Corresponding set of samples - */ - public Set getSamples(Collection sampleNameList) { - HashSet samples = new HashSet(); - for (String name : sampleNameList) { - try { - samples.add(getSampleById(name)); - } - catch (Exception e) { - throw new StingException("Could not get sample with the following ID: " + name, e); - } - } - return samples; - } - - /** - * Returns a set of samples that have any value (which could be null) for a given property - * @param key Property key - * @return Set of samples with the property - */ - public Set getSamplesWithProperty(String key) { - HashSet toReturn = new HashSet(); - for (Sample s : samples.values()) { - if (s.hasProperty(key)) - toReturn.add(s); - } - return toReturn; - } - - /** - * Returns a set 
of samples that have a property with a certain value - * Value must be a string for now - could add a similar method for matching any objects in the future - * - * @param key Property key - * @param value String property value - * @return Set of samples that match key and value - */ - public Set getSamplesWithProperty(String key, String value) { - Set toReturn = getSamplesWithProperty(key); - for (Sample s : toReturn) { - if (!s.getProperty(key).equals(value)) - toReturn.remove(s); - } - return toReturn; - } - - public Sample getOrCreateSample(String id) { - Sample sample = getSampleById(id); - if (sample == null) { - sample = new Sample(id); - addSample(sample); - } - return sample; - } - - /** - * Returns all samples that were referenced in the SAM file - */ - public Set getSAMFileSamples() { - Set toReturn = new HashSet(); - for (Sample sample : samples.values()) { - if (sample.hasSAMFileEntry()) - toReturn.add(sample); - } - return toReturn; - } - - /** - * Returns a set of sample objects for the sample names in a variant context - * - * @param context Any variant context - * @return a set of the sample objects - */ - public Set getSamplesByVariantContext(VariantContext context) { - Set samples = new HashSet(); - for (String sampleName : context.getSampleNames()) { - samples.add(getOrCreateSample(sampleName)); - } - return samples; - } - - - /** - * Return a subcontext restricted to samples with a given property key/value - * Gets the sample names from key/value and relies on VariantContext.subContextFromGenotypes for the filtering - * @param context VariantContext to filter - * @param key property key - * @param value property value (must be string) - * @return subcontext - */ - public VariantContext subContextFromSampleProperty(VariantContext context, String key, String value) { - - Set samplesWithProperty = new HashSet(); - for (String sampleName : context.getSampleNames()) { - Sample s = samples.get(sampleName); - if (s != null && s.hasProperty(key) && 
s.getProperty(key).equals(value)) - samplesWithProperty.add(sampleName); - } - Map genotypes = context.getGenotypes(samplesWithProperty); - return context.subContextFromGenotypes(genotypes.values()); - } - - public static SampleDataSource createEmptyDataSource() { - SAMFileHeader header = new SAMFileHeader(); - return new SampleDataSource(header, null); - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleFileParser.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleFileParser.java deleted file mode 100644 index a362af663..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleFileParser.java +++ /dev/null @@ -1,65 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.sample; - -/** - * Created by IntelliJ IDEA. - * User: brett - * Date: Aug 12, 2010 - * Time: 1:30:44 PM - */ -public class SampleFileParser { - - private SampleAlias[] sampleAliases; - - private String[] allowedProperties; - - private String[] allowedRelationships; - - private PropertyDefinition[] propertyDefinitions; - - private SampleParser[] samples; - - public PropertyDefinition[] getPropertyDefinitions() { - return propertyDefinitions; - } - - public void setPropertyDefinitions(PropertyDefinition[] propertyDefinitions) { - this.propertyDefinitions = propertyDefinitions; - } - - public SampleFileParser() { - - } - - public String[] getAllowedProperties() { - return allowedProperties; - } - - public void setAllowedProperties(String[] allowedProperties) { - this.allowedProperties = allowedProperties; - } - - public SampleParser[] getSamples() { - return samples; - } - - public void setSamples(SampleParser[] samples) { - this.samples = samples; - } - - public String[] getAllowedRelationships() { - return allowedRelationships; - } - - public void setAllowedRelationships(String[] allowedRelationships) { - this.allowedRelationships = allowedRelationships; - } - - public SampleAlias[] 
getSampleAliases() { - return sampleAliases; - } - - public void setSampleAliases(SampleAlias[] sampleAliases) { - this.sampleAliases = sampleAliases; - } - -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleParser.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleParser.java deleted file mode 100644 index f5e07ca29..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleParser.java +++ /dev/null @@ -1,43 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.sample; - -import java.util.HashMap; - -/** - * Created by IntelliJ IDEA. - * User: brett - * Date: Aug 13, 2010 - * Time: 2:09:43 PM - */ -public class SampleParser { - - private String id; - - private HashMap properties; - - private HashMap relationships; - - public String getId() { - return id; - } - - public void setId(String id) { - this.id = id; - } - - public HashMap getProperties() { - return properties; - } - - public void setProperties(HashMap properties) { - this.properties = properties; - } - - public HashMap getRelationships() { - return relationships; - } - - public void setRelationships(HashMap relationships) { - this.relationships = relationships; - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java index 3b9e35311..a07f735fa 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java @@ -84,12 +84,7 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar */ protected HierarchicalMicroScheduler(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods, int nThreadsToUse ) { super(engine, walker, reads, 
reference, rods); - this.threadPool = Executors.newFixedThreadPool(nThreadsToUse); - - if (engine.getArguments().processingTrackerFile != null) { - throw new UserException.BadArgumentValue("-C", "Distributed GATK calculations currently not supported in multi-threaded mode. Complain to Mark depristo@broadinstitute.org to implement and test this code path"); - } } public Object execute( Walker walker, ShardStrategy shardStrategy ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index 09ab4bd44..deafcd0cc 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -13,6 +13,7 @@ import org.broadinstitute.sting.gatk.io.DirectOutputTracker; import org.broadinstitute.sting.gatk.io.OutputTracker; import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.utils.SampleUtils; import java.util.Collection; @@ -56,7 +57,8 @@ public class LinearMicroScheduler extends MicroScheduler { traversalEngine.startTimersIfNecessary(); if(shard.getShardType() == Shard.ShardType.LOCUS) { LocusWalker lWalker = (LocusWalker)walker; - WindowMaker windowMaker = new WindowMaker(shard, engine.getGenomeLocParser(), getReadIterator(shard), shard.getGenomeLocs(), engine.getSampleMetadata()); + WindowMaker windowMaker = new WindowMaker(shard, engine.getGenomeLocParser(), + getReadIterator(shard), shard.getGenomeLocs(), SampleUtils.getSAMFileSamples(engine)); for(WindowMaker.WindowMakerIterator iterator: windowMaker) { ShardDataProvider dataProvider = new LocusShardDataProvider(shard,iterator.getSourceInfo(),engine.getGenomeLocParser(),iterator.getLocus(),iterator,reference,rods); Object result = traversalEngine.traverse(walker, dataProvider, 
accumulator.getReduceInit()); diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java b/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java index 2b6488ada..11e51d99b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java @@ -62,7 +62,10 @@ public class ShardTraverser implements Callable { Object accumulator = walker.reduceInit(); LocusWalker lWalker = (LocusWalker)walker; - WindowMaker windowMaker = new WindowMaker(shard,microScheduler.getEngine().getGenomeLocParser(),microScheduler.getReadIterator(shard),shard.getGenomeLocs(), microScheduler.engine.getSampleMetadata()); // todo: microScheduler.engine is protected - is it okay to user it here? + WindowMaker windowMaker = new WindowMaker(shard,microScheduler.getEngine().getGenomeLocParser(), + microScheduler.getReadIterator(shard), + shard.getGenomeLocs(), + microScheduler.engine.getSampleDB().getSampleNames()); // todo: microScheduler.engine is protected - is it okay to user it here? 
ShardDataProvider dataProvider = null; for(WindowMaker.WindowMakerIterator iterator: windowMaker) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java index 43ea46002..d1f5d80da 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java @@ -4,7 +4,6 @@ import net.sf.picard.util.PeekableIterator; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.datasources.reads.Shard; -import org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource; import org.broadinstitute.sting.gatk.iterators.LocusIterator; import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; @@ -12,6 +11,7 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import java.util.Collection; import java.util.Iterator; import java.util.List; import java.util.NoSuchElementException; @@ -63,17 +63,20 @@ public class WindowMaker implements Iterable, I * the given intervals. * @param iterator The data source for this window. * @param intervals The set of intervals over which to traverse. 
- * @param sampleData SampleDataSource that we can reference reads with + * @param sampleNames The complete set of sample names in the reads in shard */ - public WindowMaker(Shard shard, GenomeLocParser genomeLocParser, StingSAMIterator iterator, List intervals, SampleDataSource sampleData ) { + public WindowMaker(Shard shard, GenomeLocParser genomeLocParser, StingSAMIterator iterator, List intervals, Collection sampleNames) { this.sourceInfo = shard.getReadProperties(); this.readIterator = iterator; - - this.sourceIterator = new PeekableIterator(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser,sampleData)); + this.sourceIterator = new PeekableIterator(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser, sampleNames)); this.intervalIterator = intervals.size()>0 ? new PeekableIterator(intervals.iterator()) : null; } + public WindowMaker(Shard shard, GenomeLocParser genomeLocParser, StingSAMIterator iterator, List intervals ) { + this(shard, genomeLocParser, iterator, intervals, LocusIteratorByState.sampleListForSAMWithoutReadGroups()); + } + public Iterator iterator() { return this; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java index e13c5a764..eb5b51b33 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java @@ -35,12 +35,11 @@ import org.broadinstitute.sting.gatk.DownsampleType; import org.broadinstitute.sting.gatk.DownsamplingMethod; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; -import org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource; import org.broadinstitute.sting.utils.GenomeLoc; import 
org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.ReservoirDownsampler; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement; import org.broadinstitute.sting.utils.pileup.PileupElement; @@ -52,9 +51,6 @@ import java.util.*; /** Iterator that traverses a SAM File, accumulating information on a per-locus basis */ public class LocusIteratorByState extends LocusIterator { -// private static long discarded_bases = 0L; -// private static long observed_bases = 0L; - /** our log, which we want to capture anything from this class */ private static Logger logger = Logger.getLogger(LocusIteratorByState.class); @@ -69,7 +65,7 @@ public class LocusIteratorByState extends LocusIterator { * Used to create new GenomeLocs. */ private final GenomeLocParser genomeLocParser; - private final ArrayList samples; + private final ArrayList samples; private final ReadStateManager readStates; static private class SAMRecordState { @@ -278,15 +274,30 @@ public class LocusIteratorByState extends LocusIterator { // // ----------------------------------------------------------------------------------------------------------------- - public LocusIteratorByState(final Iterator samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, SampleDataSource sampleData ) { + public LocusIteratorByState(final Iterator samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, Collection samples ) { this.readInfo = readInformation; this.genomeLocParser = genomeLocParser; + this.samples = new ArrayList(samples); + this.readStates = new ReadStateManager(samIterator,readInformation.getDownsamplingMethod()); - // get the list of samples - this.samples = new ArrayList(sampleData.getSamples()); - - readStates = new 
ReadStateManager(samIterator,readInformation.getDownsamplingMethod()); - + // currently the GATK expects this LocusIteratorByState to accept empty sample lists, when + // there's no read data. So we need to throw this error only when samIterator.hasNext() is true + if ( this.samples.isEmpty() && samIterator.hasNext() ) { + // actually we cannot process BAMs without read groups unless we tolerate empty + // sample lists. In the empty case we need to add the null element to the samples + this.samples.add(null); + //throw new IllegalArgumentException("samples list must not be empty"); + } + } + + /** + * For testing only. Assumes that the incoming SAMRecords have no read groups, so creates a dummy sample list + * for the system. + */ + public final static Collection sampleListForSAMWithoutReadGroups() { + List samples = new ArrayList(); + samples.add(null); + return samples; } public Iterator iterator() { @@ -303,19 +314,6 @@ public class LocusIteratorByState extends LocusIterator { //if ( DEBUG ) System.out.printf("hasNext() = %b%n", r); } - public void printState() { - for(Sample sample: samples) { - Iterator iterator = readStates.iterator(sample); - while(iterator.hasNext()) { - SAMRecordState state = iterator.next(); - logger.debug(String.format("printState():")); - SAMRecord read = state.getRead(); - int offset = state.getReadOffset(); - logger.debug(String.format(" read: %s(%d)=%s, cigar=%s", read.getReadName(), offset, (char)read.getReadBases()[offset], read.getCigarString())); - } - } - } - private GenomeLoc getLocation() { return readStates.isEmpty() ? null : readStates.getFirst().getLocation(genomeLocParser); } @@ -355,14 +353,14 @@ public class LocusIteratorByState extends LocusIterator { // In this case, the subsequent call to next() will emit the normal pileup at the current base // and shift the position. 
if (readInfo.generateExtendedEvents() && hasExtendedEvents) { - Map fullExtendedEventPileup = new HashMap(); + Map fullExtendedEventPileup = new HashMap(); // get current location on the reference and decrement it by 1: the indels we just stepped over // are associated with the *previous* reference base GenomeLoc loc = genomeLocParser.incPos(getLocation(),-1); boolean hasBeenSampled = false; - for(Sample sample: samples) { + for(final String sample: samples) { Iterator iterator = readStates.iterator(sample); List indelPile = new ArrayList(readStates.size(sample)); hasBeenSampled |= loc.getStart() <= readStates.getDownsamplingExtent(sample); @@ -426,10 +424,10 @@ public class LocusIteratorByState extends LocusIterator { nextAlignmentContext = new AlignmentContext(loc, new ReadBackedExtendedEventPileupImpl(loc, fullExtendedEventPileup), hasBeenSampled); } else { GenomeLoc location = getLocation(); - Map fullPileup = new HashMap(); + Map fullPileup = new HashMap(); boolean hasBeenSampled = false; - for(Sample sample: samples) { + for(final String sample: samples) { Iterator iterator = readStates.iterator(sample); List pile = new ArrayList(readStates.size(sample)); hasBeenSampled |= location.getStart() <= readStates.getDownsamplingExtent(sample); @@ -495,7 +493,7 @@ public class LocusIteratorByState extends LocusIterator { } private void updateReadStates() { - for(Sample sample: samples) { + for(final String sample: samples) { Iterator it = readStates.iterator(sample); while ( it.hasNext() ) { SAMRecordState state = it.next(); @@ -522,7 +520,7 @@ public class LocusIteratorByState extends LocusIterator { private final PeekableIterator iterator; private final DownsamplingMethod downsamplingMethod; private final SamplePartitioner samplePartitioner; - private final Map readStatesBySample = new HashMap(); + private final Map readStatesBySample = new HashMap(); private final int targetCoverage; private int totalReadStates = 0; @@ -540,9 +538,9 @@ public class 
LocusIteratorByState extends LocusIterator { } Map readSelectors = new HashMap(); - for(Sample sample: samples) { + for(final String sample: samples) { readStatesBySample.put(sample,new PerSampleReadStateManager()); - readSelectors.put(sample.getId(),downsamplingMethod.type == DownsampleType.BY_SAMPLE ? new NRandomReadSelector(null,targetCoverage) : new AllReadsSelector()); + readSelectors.put(sample,downsamplingMethod.type == DownsampleType.BY_SAMPLE ? new NRandomReadSelector(null,targetCoverage) : new AllReadsSelector()); } samplePartitioner = new SamplePartitioner(readSelectors); @@ -554,7 +552,7 @@ public class LocusIteratorByState extends LocusIterator { * @param sample The sample. * @return Iterator over the reads associated with that sample. */ - public Iterator iterator(final Sample sample) { + public Iterator iterator(final String sample) { return new Iterator() { private Iterator wrappedIterator = readStatesBySample.get(sample).iterator(); @@ -590,7 +588,7 @@ public class LocusIteratorByState extends LocusIterator { * @param sample The sample. * @return Total number of reads in the given sample. */ - public int size(final Sample sample) { + public int size(final String sample) { return readStatesBySample.get(sample).size(); } @@ -600,12 +598,12 @@ public class LocusIteratorByState extends LocusIterator { * @param sample Sample, downsampled independently. * @return Integer stop of the furthest undownsampled region. 
*/ - public int getDownsamplingExtent(final Sample sample) { + public int getDownsamplingExtent(final String sample) { return readStatesBySample.get(sample).getDownsamplingExtent(); } public SAMRecordState getFirst() { - for(Sample sample: samples) { + for(final String sample: samples) { PerSampleReadStateManager reads = readStatesBySample.get(sample); if(!reads.isEmpty()) return reads.peek(); @@ -639,8 +637,8 @@ public class LocusIteratorByState extends LocusIterator { } samplePartitioner.complete(); - for(Sample sample: samples) { - ReadSelector aggregator = samplePartitioner.getSelectedReads(sample.getId()); + for(final String sample: samples) { + ReadSelector aggregator = samplePartitioner.getSelectedReads(sample); Collection newReads = new ArrayList(aggregator.getSelectedReads()); @@ -1072,6 +1070,3 @@ class SamplePartitioner implements ReadSelector { } } - - - diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Affection.java b/public/java/src/org/broadinstitute/sting/gatk/samples/Affection.java new file mode 100644 index 000000000..83e31f672 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/Affection.java @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.samples; + +/** + * Categorical sample trait for association and analysis + * + * Samples can have unknown status, be affected or unaffected by the + * categorical trait, or they can be marked as actually having an + * other trait value (stored in an associated value in the Sample class) + * + * @author Mark DePristo + * @since Sept. 2011 + */ +public enum Affection { + /** Status is unknown */ + UNKNOWN, + /** Suffers from the disease */ + AFFECTED, + /** Unaffected by the disease */ + UNAFFECTED, + /** An "other" trait: value of the trait is stored elsewhere and is an arbitrary string */ + OTHER +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Gender.java b/public/java/src/org/broadinstitute/sting/gatk/samples/Gender.java new file mode 100644 index 000000000..6fb44804a --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/Gender.java @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice 
and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.samples; + +/** +* ENUM of possible human genders: male, female, or unknown +*/ +public enum Gender { + MALE, + FEMALE, + UNKNOWN +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java b/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java new file mode 100644 index 000000000..c442409fb --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java @@ -0,0 +1,310 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.samples; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.text.XReadLines; + +import java.io.*; +import java.util.*; + +/** + * Reads PED file-formatted tabular text files + * + * See http://www.broadinstitute.org/mpg/tagger/faq.html + * See http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml#ped + * + * The "ped" file format refers to the widely-used format for linkage pedigree data. + * Each line describes a single (diploid) individual in the following format: + * + * family_ID individual_ID father_ID mother_ID gender phenotype genotype_1 genotype_2 ... + * + * If your data lacks pedigree information (for example, unrelated case/control individuals), + * set the father_ID and mother_ID to 0. sex denotes the individual's gender with 1=male and 2=female. + * phenotype refers to the affected status (for association studies) where 0=unknown, 1=unaffected, 2=affected. + * Finally, each genotype is written as two (=diploid) integer numbers (separated by whitespace), + * where 1=A, 2=C, 3=G, 4=T. No header lines are allowed and all columns must be separated by whitespace. + * Check out the information at the PLINK website on the "ped" file format. 
+ * + * The PED file is a white-space (space or tab) delimited file: the first six columns are mandatory: + * Family ID + * Individual ID + * Paternal ID + * Maternal ID + * Sex (1=male; 2=female; other=unknown) + * Phenotype + * + * The IDs are alphanumeric: the combination of family and individual ID should uniquely identify a person. + * A PED file must have 1 and only 1 phenotype in the sixth column. The phenotype can be either a + * quantitative trait or an affection status column: PLINK will automatically detect which type + * (i.e. based on whether a value other than 0, 1, 2 or the missing genotype code is observed). + * Note that the GATK actually supports arbitrary values for quantitative trait -- not just doubles -- + * and are actually representing these values as strings instead of doubles + * + * NOTE Quantitative traits with decimal points must be coded with a period/full-stop character and + * not a comma, i.e. 2.394 not 2,394 + * + * If an individual's sex is unknown, then any character other than 1 or 2 can be used. + * When new files are created (PED, FAM, or other which contain sex) then the original coding will be + * preserved. However, these individuals will be dropped from any analyses (i.e. phenotype set to missing also) + * and an error message will arise if an analysis that uses family information is requested and an + * individual of 'unknown' sex is specified as a father or mother. + * + * + * HINT You can add a comment to a PED or MAP file by starting the line with a # character. The rest of that + * line will be ignored. Do not start any family IDs with this character therefore. 
+ * + * Affection status, by default, should be coded: + * -9 missing + * 0 missing + * 1 unaffected + * 2 affected + * + * If your file is coded 0/1 to represent unaffected/affected, then use the --1 flag: + * plink --file mydata --1 which will specify a disease phenotype coded: + * + * -9 missing + * 0 unaffected + * 1 affected + * + * The missing phenotype value for quantitative traits is, by default, -9 (this can also be used for + * disease traits as well as 0). It can be reset by including the --missing-phenotype option: + * + * Genotypes (column 7 onwards) should also be white-space delimited; they can be any character + * (e.g. 1,2,3,4 or A,C,G,T or anything else) except 0 which is, by default, the missing genotype + * character. All markers should be biallelic. All SNPs (whether haploid or not) must have two + * alleles specified. Either Both alleles should be missing (i.e. 0) or neither. + * + * No header row should be given. For example, here are two individuals typed for 3 SNPs (one row = one person): + * + * FAM001 1 0 0 1 2 A A G G A C + * FAM001 2 0 0 1 2 A A A G 0 0 + * ... + * + * Note that the GATK does not support genotypes in a PED file. + * + * @author Mark DePristo + * @since 2011 + */ +public class PedReader { + private static Logger logger = Logger.getLogger(PedReader.class); + final static private Set CATAGORICAL_TRAIT_VALUES = new HashSet(Arrays.asList("-9", "0", "1", "2")); + final static private String commentMarker = "#"; + + /** + * An enum that specifies which, if any, of the standard PED fields are + * missing from the input records. For example, suppose we have the full record: + * + * "fam1 kid dad mom 1 2" + * + * indicating a male affected child. This can be parsed with the -ped x.ped argument + * to the GATK. 
Suppose we only have: + * + * "fam1 kid 1" + * + * we can parse the reduced version of this record with -ped:NO_PARENTS,NO_PHENOTYPE x.ped + */ + public enum MissingPedField { + /** + * The PED records do not have the first (FAMILY_ID) argument. The family id + * will be set to null / empty. + */ + NO_FAMILY_ID, + + /** + * The PED records do not have either the paternal or maternal IDs, so + * the corresponding IDs are set to null. + */ + NO_PARENTS, + + /** + * The PED records do not have the GENDER field, so the sex of each + * sample will be set to UNKNOWN. + */ + NO_SEX, + + /** + * The PED records do not have the PHENOTYPE field, so the phenotype + * of each sample will be set to UNKNOWN. + */ + NO_PHENOTYPE + } + + protected enum Field { + FAMILY_ID, INDIVIDUAL_ID, PATERNAL_ID, MATERNAL_ID, GENDER, PHENOTYPE + } + + // phenotype + private final static String MISSING_VALUE1 = "-9"; + private final static String MISSING_VALUE2 = "0"; + private final static String PHENOTYPE_UNAFFECTED = "1"; + private final static String PHENOTYPE_AFFECTED = "2"; + + // Sex + private final static String SEX_MALE = "1"; + private final static String SEX_FEMALE = "2"; + // other=unknown + + public PedReader() { } + + public final List parse(File source, EnumSet missingFields, SampleDB sampleDB) throws FileNotFoundException { + logger.info("Reading PED file " + source + " with missing fields: " + missingFields); + return parse(new FileReader(source), missingFields, sampleDB); + } + + public final List parse(final String source, EnumSet missingFields, SampleDB sampleDB) { + logger.warn("Reading PED string: \"" + source + "\" with missing fields: " + missingFields); + return parse(new StringReader(source.replace(";", String.format("%n"))), missingFields, sampleDB); + } + + public final List parse(Reader reader, EnumSet missingFields, SampleDB sampleDB) { + final List lines = new XReadLines(reader).readLines(); + + // What are the record offsets? 
+ final int familyPos = missingFields.contains(MissingPedField.NO_FAMILY_ID) ? -1 : 0; + final int samplePos = familyPos + 1; + final int paternalPos = missingFields.contains(MissingPedField.NO_PARENTS) ? -1 : samplePos + 1; + final int maternalPos = missingFields.contains(MissingPedField.NO_PARENTS) ? -1 : paternalPos + 1; + final int sexPos = missingFields.contains(MissingPedField.NO_SEX) ? -1 : Math.max(maternalPos, samplePos) + 1; + final int phenotypePos = missingFields.contains(MissingPedField.NO_PHENOTYPE) ? -1 : Math.max(sexPos, Math.max(maternalPos, samplePos)) + 1; + final int nExpectedFields = MathUtils.arrayMaxInt(Arrays.asList(samplePos, paternalPos, maternalPos, sexPos, phenotypePos)) + 1; + + // go through once and determine properties + int lineNo = 1; + boolean isQT = false; + final List splits = new ArrayList(lines.size()); + for ( final String line : lines ) { + if ( line.startsWith(commentMarker)) continue; + if ( line.trim().equals("") ) continue; + + final String[] parts = line.split("\\s+"); + + if ( parts.length != nExpectedFields ) + throw new UserException.MalformedFile(reader.toString(), "Bad PED line " + lineNo + ": wrong number of fields"); + + if ( phenotypePos != -1 ) { + isQT = isQT || ! CATAGORICAL_TRAIT_VALUES.contains(parts[phenotypePos]); + } + + splits.add(parts); + lineNo++; + } + logger.info("Phenotype is other? 
" + isQT); + + // now go through and parse each record + lineNo = 1; + final List samples = new ArrayList(splits.size()); + for ( final String[] parts : splits ) { + String familyID = null, individualID, paternalID = null, maternalID = null; + Gender sex = Gender.UNKNOWN; + String quantitativePhenotype = Sample.UNSET_QT; + Affection affection = Affection.UNKNOWN; + + if ( familyPos != -1 ) familyID = maybeMissing(parts[familyPos]); + individualID = parts[samplePos]; + if ( paternalPos != -1 ) paternalID = maybeMissing(parts[paternalPos]); + if ( maternalPos != -1 ) maternalID = maybeMissing(parts[maternalPos]); + + if ( sexPos != -1 ) { + if ( parts[sexPos].equals(SEX_MALE) ) sex = Gender.MALE; + else if ( parts[sexPos].equals(SEX_FEMALE) ) sex = Gender.FEMALE; + else sex = Gender.UNKNOWN; + } + + if ( phenotypePos != -1 ) { + if ( isQT ) { + if ( parts[phenotypePos].equals(MISSING_VALUE1) ) + affection = Affection.UNKNOWN; + else { + affection = Affection.OTHER; + quantitativePhenotype = parts[phenotypePos]; + } + } else { + if ( parts[phenotypePos].equals(MISSING_VALUE1) ) affection = Affection.UNKNOWN; + else if ( parts[phenotypePos].equals(MISSING_VALUE2) ) affection = Affection.UNKNOWN; + else if ( parts[phenotypePos].equals(PHENOTYPE_UNAFFECTED) ) affection = Affection.UNAFFECTED; + else if ( parts[phenotypePos].equals(PHENOTYPE_AFFECTED) ) affection = Affection.AFFECTED; + else throw new ReviewedStingException("Unexpected phenotype type " + parts[phenotypePos] + " at line " + lineNo); + } + } + + final Sample s = new Sample(individualID, sampleDB, familyID, paternalID, maternalID, sex, affection, quantitativePhenotype); + samples.add(s); + sampleDB.addSample(s); + lineNo++; + } + + for ( final Sample sample : new ArrayList(samples) ) { + Sample dad = maybeAddImplicitSample(sampleDB, sample.getPaternalID(), sample.getFamilyID(), Gender.MALE); + if ( dad != null ) samples.add(dad); + + Sample mom = maybeAddImplicitSample(sampleDB, sample.getMaternalID(), 
sample.getFamilyID(), Gender.FEMALE); + if ( mom != null ) samples.add(mom); + } + + return samples; + } + + private final static String maybeMissing(final String string) { + if ( string.equals(MISSING_VALUE1) || string.equals(MISSING_VALUE2) ) + return null; + else + return string; + } + + private final Sample maybeAddImplicitSample(SampleDB sampleDB, final String id, final String familyID, final Gender gender) { + if ( id != null && sampleDB.getSample(id) == null ) { + Sample s = new Sample(id, sampleDB, familyID, null, null, gender, Affection.UNKNOWN, Sample.UNSET_QT); + sampleDB.addSample(s); + return s; + } else + return null; + } + + /** + * Parses a list of tags from the command line, assuming it comes from the GATK Engine + * tags, and returns the corresponding EnumSet. + * + * @param arg the actual engine arg, used for the UserException if there's an error + * @param tags a list of string tags that should be converted to the MissingPedField value + * @return + */ + public static final EnumSet parseMissingFieldTags(final Object arg, final List tags) { + final EnumSet missingFields = EnumSet.noneOf(MissingPedField.class); + + for ( final String tag : tags ) { + try { + missingFields.add(MissingPedField.valueOf(tag)); + } catch ( IllegalArgumentException e ) { + throw new UserException.BadArgumentValue(arg.toString(), "Unknown tag " + tag + " allowed values are " + MissingPedField.values()); + } + } + + return missingFields; + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/PedigreeValidationType.java b/public/java/src/org/broadinstitute/sting/gatk/samples/PedigreeValidationType.java new file mode 100644 index 000000000..bbf857820 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/PedigreeValidationType.java @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and 
associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.samples; + +/** +* +*/ +public enum PedigreeValidationType { + /** + * Require if a pedigree file is provided at all samples in the VCF or BAM files have a corresponding + * entry in the pedigree file(s). 
+ */ + STRICT, + + /** + * Do not enforce any overlap between the VCF/BAM samples and the pedigree data + * */ + SILENT +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java new file mode 100644 index 000000000..b39fdd79d --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java @@ -0,0 +1,222 @@ +package org.broadinstitute.sting.gatk.samples; + + +import org.broadinstitute.sting.utils.exceptions.UserException; + +import java.util.HashMap; +import java.util.Map; + +/** + * + */ +public class Sample implements Comparable { // implements java.io.Serializable { + final private String familyID, paternalID, maternalID; + final private Gender gender; + final private String otherPhenotype; + final private Affection affection; + final private String ID; + final private SampleDB infoDB; + final private Map properties = new HashMap(); + + public final static String UNSET_QT = null; + + public Sample(final String ID, final SampleDB infoDB, + final String familyID, final String paternalID, final String maternalID, + final Gender gender, final Affection affection, final String otherPhenotype) { + this.familyID = familyID; + this.paternalID = paternalID; + this.maternalID = maternalID; + this.gender = gender; + this.otherPhenotype = otherPhenotype; + this.affection = affection; + this.ID = ID; + this.infoDB = infoDB; + } + + protected Sample(final String ID, + final String familyID, final String paternalID, final String maternalID, + final Gender gender, final Affection affection, final String otherPhenotype) { + this(ID, null, familyID, paternalID, maternalID, gender, affection, otherPhenotype); + } + + protected Sample(final String ID, + final String familyID, final String paternalID, final String maternalID, + final Gender gender, final Affection affection) { + this(ID, null, familyID, paternalID, maternalID, gender, affection, UNSET_QT); + } + + + 
public Sample(final String ID, final SampleDB infoDB, + final String familyID, final String paternalID, final String maternalID, final Gender gender) { + this(ID, infoDB, familyID, paternalID, maternalID, gender, Affection.UNKNOWN, UNSET_QT); + } + + public Sample(final String ID, final SampleDB infoDB, final Affection affection, final String otherPhenotype) { + this(ID, infoDB, null, null, null, Gender.UNKNOWN, affection, otherPhenotype); + } + + public Sample(String id, SampleDB infoDB) { + this(id, infoDB, null, null, null, + Gender.UNKNOWN, Affection.UNKNOWN, UNSET_QT); + } + + // ------------------------------------------------------------------------------------- + // + // standard property getters + // + // ------------------------------------------------------------------------------------- + + public String getID() { + return ID; + } + + public String getFamilyID() { + return familyID; + } + + public String getPaternalID() { + return paternalID; + } + + public String getMaternalID() { + return maternalID; + } + + public Affection getAffection() { + return affection; + } + + public boolean hasOtherPhenotype() { + return affection == Affection.OTHER; + } + + public String getOtherPhenotype() { + return otherPhenotype; + } + + /** + * Get the sample's mother + * @return sample object with relationship mother, if exists, or null + */ + public Sample getMother() { + return infoDB.getSample(maternalID); + } + + /** + * Get the sample's father + * @return sample object with relationship father, if exists, or null + */ + public Sample getFather() { + return infoDB.getSample(paternalID); + } + + /** + * Get gender of the sample + * @return property of key "gender" - must be of type Gender + */ + public Gender getGender() { + return gender; + } + + @Override + public int compareTo(final Sample sample) { + return ID.compareTo(sample.getID()); + } + + @Override + public String toString() { + return String.format("Sample %s fam=%s dad=%s mom=%s gender=%s affection=%s 
qt=%s props=%s", + getID(), getFamilyID(), getPaternalID(), getMaternalID(), getGender(), getAffection(), + getOtherPhenotype(), properties); + } + +// // ------------------------------------------------------------------------------------- +// // +// // code for working with additional -- none standard -- properites +// // +// // ------------------------------------------------------------------------------------- +// +// public Map getExtraProperties() { +// return Collections.unmodifiableMap(properties); +// } +// +// /** +// * Get one property +// * @param key key of property +// * @return value of property as generic object +// */ +// public Object getExtraPropertyValue(final String key) { +// return properties.get(key); +// } +// +// /** +// * +// * @param key property key +// * @return true if sample has this property (even if its value is null) +// */ +// public boolean hasExtraProperty(String key) { +// return properties.containsKey(key); +// } + + @Override + public int hashCode() { + return ID.hashCode(); + } + + @Override + public boolean equals(final Object o) { + if(o == null) + return false; + if(o instanceof Sample) { + Sample otherSample = (Sample)o; + return ID.equals(otherSample.ID) && + equalOrNull(familyID, otherSample.familyID) && + equalOrNull(paternalID, otherSample.paternalID) && + equalOrNull(maternalID, otherSample.maternalID) && + equalOrNull(gender, otherSample.gender) && + equalOrNull(otherPhenotype, otherSample.otherPhenotype) && + equalOrNull(affection, otherSample.affection) && + equalOrNull(properties, otherSample.properties); + } + return false; + } + + private final static boolean equalOrNull(final Object o1, final Object o2) { + if ( o1 == null ) + return o2 == null; + else + return o2 == null ? 
false : o1.equals(o2); + } + + private final static T mergeValues(final String name, final String field, final T o1, final T o2, final T emptyValue) { + if ( o1 == null || o1.equals(emptyValue) ) { + // take o2 if both are null, otherwise keep o2 + return o2 == null ? null : o2; + } else { + if ( o2 == null || o2.equals(emptyValue) ) + return o1; // keep o1, since it's a real value + else { + // both o1 and o2 have a value + if ( o1 == o2 ) + return o1; + else + throw new UserException("Inconsistent values detected for " + name + " for field " + field + " value1 " + o1 + " value2 " + o2); + } + } + } + + public final static Sample mergeSamples(final Sample prev, final Sample next) { + if ( prev.equals(next) ) + return next; + else { + return new Sample(prev.getID(), prev.infoDB, + mergeValues(prev.getID(), "Family_ID", prev.getFamilyID(), next.getFamilyID(), null), + mergeValues(prev.getID(), "Paternal_ID", prev.getPaternalID(), next.getPaternalID(), null), + mergeValues(prev.getID(), "Material_ID", prev.getMaternalID(), next.getMaternalID(), null), + mergeValues(prev.getID(), "Gender", prev.getGender(), next.getGender(), Gender.UNKNOWN), + mergeValues(prev.getID(), "Affection", prev.getAffection(), next.getAffection(), Affection.UNKNOWN), + mergeValues(prev.getID(), "OtherPhenotype", prev.getOtherPhenotype(), next.getOtherPhenotype(), UNSET_QT)); + //mergeValues(prev.getID(), "ExtraProperties", prev.getExtraProperties(), next.getExtraProperties(), Collections.emptyMap())); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java new file mode 100644 index 000000000..ee0873c6e --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java @@ -0,0 +1,183 @@ +package org.broadinstitute.sting.gatk.samples; + +import net.sf.samtools.SAMReadGroupRecord; +import net.sf.samtools.SAMRecord; +import 
org.broadinstitute.sting.utils.exceptions.StingException; +import org.broadinstitute.sting.utils.variantcontext.Genotype; + +import java.util.*; + +/** + * + */ +public class SampleDB { + /** + * This is where Sample objects are stored. Samples are usually accessed by their ID, which is unique, so + * this is stored as a HashMap. + */ + private final HashMap samples = new HashMap(); + + /** + * Constructor takes both a SAM header and sample files because the two must be integrated. + */ + public SampleDB() { + + } + + /** + * Protected function to add a single sample to the database + * + * @param sample to be added + */ + protected SampleDB addSample(Sample sample) { + Sample prev = samples.get(sample.getID()); + if ( prev != null ) + sample = Sample.mergeSamples(prev, sample); + samples.put(sample.getID(), sample); + return this; + } + + // -------------------------------------------------------------------------------- + // + // Functions for getting a sample from the DB + // + // -------------------------------------------------------------------------------- + + /** + * Get a sample by its ID + * If an alias is passed in, return the main sample object + * @param id + * @return sample Object with this ID, or null if this does not exist + */ + public Sample getSample(String id) { + return samples.get(id); + } + + /** + * + * @param read + * @return sample Object with this ID, or null if this does not exist + */ + public Sample getSample(final SAMRecord read) { + return getSample(read.getReadGroup()); + } + + /** + * + * @param rg + * @return sample Object with this ID, or null if this does not exist + */ + public Sample getSample(final SAMReadGroupRecord rg) { + return getSample(rg.getSample()); + } + + /** + * @param g Genotype + * @return sample Object with this ID, or null if this does not exist + */ + public Sample getSample(final Genotype g) { + return getSample(g.getSampleName()); + } + + // 
-------------------------------------------------------------------------------- + // + // Functions for accessing samples in the DB + // + // -------------------------------------------------------------------------------- + + /** + * Get number of sample objects + * @return size of samples map + */ + public int sampleCount() { + return samples.size(); + } + + public Set getSamples() { + return new HashSet(samples.values()); + } + + public Collection getSampleNames() { + return Collections.unmodifiableCollection(samples.keySet()); + } + + + /** + * Takes a collection of sample names and returns their corresponding sample objects + * Note that, since a set is returned, if you pass in a list with duplicates names there will not be any duplicates in the returned set + * @param sampleNameList Set of sample names + * @return Corresponding set of samples + */ + public Set getSamples(Collection sampleNameList) { + HashSet samples = new HashSet(); + for (String name : sampleNameList) { + try { + samples.add(getSample(name)); + } + catch (Exception e) { + throw new StingException("Could not get sample with the following ID: " + name, e); + } + } + return samples; + } + + // -------------------------------------------------------------------------------- + // + // Higher level pedigree functions + // + // -------------------------------------------------------------------------------- + + /** + * Returns a sorted set of the family IDs in all samples (excluding null ids) + * @return + */ + public final Set getFamilyIDs() { + return getFamilies().keySet(); + } + + /** + * Returns a map from family ID -> set of family members for all samples with + * non-null family ids + * + * @return + */ + public final Map> getFamilies() { + final Map> families = new TreeMap>(); + + for ( final Sample sample : samples.values() ) { + final String famID = sample.getFamilyID(); + if ( famID != null ) { + if ( ! 
families.containsKey(famID) ) + families.put(famID, new TreeSet()); + families.get(famID).add(sample); + } + } + + return families; + } + + /** + * Return all samples with a given family ID + * @param familyId + * @return + */ + public Set getFamily(String familyId) { + return getFamilies().get(familyId); + } + + /** + * Returns all children of a given sample + * See note on the efficiency of getFamily() - since this depends on getFamily() it's also not efficient + * @param sample + * @return + */ + public Set getChildren(Sample sample) { + final HashSet children = new HashSet(); + for ( final Sample familyMember : getFamily(sample.getFamilyID())) { + if ( familyMember.getMother() == sample || familyMember.getFather() == sample ) { + children.add(familyMember); + } + } + return children; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java new file mode 100644 index 000000000..44a8600b0 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.samples; + +import net.sf.samtools.SAMFileHeader; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.exceptions.UserException; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.*; + +/** + * + */ +public class SampleDBBuilder { + PedigreeValidationType validationStrictness; + final SampleDB sampleDB = new SampleDB(); + final GenomeAnalysisEngine engine; + + Set samplesFromDataSources = new HashSet(); + Set samplesFromPedigrees = new HashSet(); + + /** for testing only */ + protected SampleDBBuilder(PedigreeValidationType validationStrictness) { + engine = null; + this.validationStrictness = validationStrictness; + } + + /** + * Constructor takes both a SAM header and sample files because the two must be integrated. 
+ */ + public SampleDBBuilder(GenomeAnalysisEngine engine, PedigreeValidationType validationStrictness) { + this.engine = engine; + this.validationStrictness = validationStrictness; + } + + /** + * Hallucinates sample objects for all the samples in the SAM file and stores them + */ + public SampleDBBuilder addSamplesFromSAMHeader(final SAMFileHeader header) { + addSamplesFromSampleNames(SampleUtils.getSAMFileSamples(header)); + return this; + } + + public SampleDBBuilder addSamplesFromSampleNames(final Collection sampleNames) { + for (final String sampleName : sampleNames) { + if (sampleDB.getSample(sampleName) == null) { + final Sample newSample = new Sample(sampleName, sampleDB); + sampleDB.addSample(newSample); + samplesFromDataSources.add(newSample); // keep track of data source samples + } + } + return this; + } + + public SampleDBBuilder addSamplesFromPedigreeFiles(final List pedigreeFiles) { + for (final File pedFile : pedigreeFiles) { + Collection samples = addSamplesFromPedigreeArgument(pedFile); + samplesFromPedigrees.addAll(samples); + } + + return this; + } + + public SampleDBBuilder addSamplesFromPedigreeStrings(final List pedigreeStrings) { + for (final String pedString : pedigreeStrings) { + Collection samples = addSamplesFromPedigreeArgument(pedString); + samplesFromPedigrees.addAll(samples); + } + + return this; + } + + /** + * Parse one sample file and integrate it with samples that are already there + * Fail quickly if we find any errors in the file + */ + private Collection addSamplesFromPedigreeArgument(File sampleFile) { + final PedReader reader = new PedReader(); + + try { + return reader.parse(sampleFile, getMissingFields(sampleFile), sampleDB); + } catch ( FileNotFoundException e ) { + throw new UserException.CouldNotReadInputFile(sampleFile, e); + } + } + + private Collection addSamplesFromPedigreeArgument(final String string) { + final PedReader reader = new PedReader(); + return reader.parse(string, getMissingFields(string), sampleDB); + 
} + + public SampleDB getFinalSampleDB() { + validate(); + return sampleDB; + } + + public EnumSet getMissingFields(final Object engineArg) { + if ( engine == null ) + return EnumSet.noneOf(PedReader.MissingPedField.class); + else { + final List posTags = engine.getTags(engineArg).getPositionalTags(); + return PedReader.parseMissingFieldTags(engineArg, posTags); + } + } + + // -------------------------------------------------------------------------------- + // + // Validation + // + // -------------------------------------------------------------------------------- + + protected final void validate() { + if ( validationStrictness == PedigreeValidationType.SILENT ) + return; + else { + // check that samples in data sources are all annotated, if anything is annotated + if ( ! samplesFromPedigrees.isEmpty() && ! samplesFromDataSources.isEmpty() ) { + final Set sampleNamesFromPedigrees = new HashSet(); + for ( final Sample pSample : samplesFromPedigrees ) + sampleNamesFromPedigrees.add(pSample.getID()); + + for ( final Sample dsSample : samplesFromDataSources ) + if ( ! 
sampleNamesFromPedigrees.contains(dsSample.getID()) ) + throw new UserException("Sample " + dsSample.getID() + " found in data sources but not in pedigree files with STRICT pedigree validation"); + } + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java index 10261112c..792fef9c3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java @@ -30,11 +30,12 @@ import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.filters.MalformedReadFilter; +import org.broadinstitute.sting.gatk.samples.Sample; +import org.broadinstitute.sting.gatk.samples.SampleDB; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.help.GenericDocumentationHandler; import java.util.List; @@ -87,6 +88,14 @@ public abstract class Walker { return getToolkit().getMasterSequenceDictionary(); } + protected SampleDB getSampleDB() { + return getToolkit().getSampleDB(); + } + + protected Sample getSample(final String id) { + return getToolkit().getSampleDB().getSample(id); + } + /** * (conceptual static) method that states whether you want to see reads piling up at a locus * that contain a deletion at the locus. 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java index 87695077d..b722220f9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java @@ -31,6 +31,7 @@ import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgume import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.samples.Gender; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.variantrecalibration.VQSRCalibrationCurve; import org.broadinstitute.sting.utils.GenomeLoc; @@ -247,7 +248,7 @@ public class ProduceBeagleInputWalker extends RodWalker { Map preferredGenotypes = preferredVC.getGenotypes(); Map otherGenotypes = goodSite(otherVC) ? 
otherVC.getGenotypes() : null; for ( String sample : samples ) { - boolean isMaleOnChrX = CHECK_IS_MALE_ON_CHR_X && getToolkit().getSampleById(sample).isMale(); + boolean isMaleOnChrX = CHECK_IS_MALE_ON_CHR_X && getSample(sample).getGender() == Gender.MALE; Genotype genotype; boolean isValidation; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java index 32875a098..1dfc6fea0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java @@ -227,9 +227,8 @@ public class CallableLociWalker extends LocusWalker getSamplesFromToolKit(DoCOutputType.Partition type) { HashSet partition = new HashSet(); if ( type == DoCOutputType.Partition.sample ) { - for ( Set sampleSet : getToolkit().getSamplesByReaders() ) { - for ( String s : sampleSet ) { - partition.add(s); - } - } + partition.addAll(SampleUtils.getSAMFileSamples(getToolkit())); } else if ( type == DoCOutputType.Partition.readgroup ) { for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { partition.add(rg.getSample()+"_rg_"+rg.getReadGroupId()); } } else if ( type == DoCOutputType.Partition.library ) { - for ( Set libraries : getToolkit().getLibrariesByReaders() ) { - for ( String l : libraries ) { - partition.add(l); - } + for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { + partition.add(rg.getLibrary()); } } else if ( type == DoCOutputType.Partition.center ) { for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index 8680f3537..36e4db1c5 100755 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -248,6 +248,9 @@ public class IndelRealigner extends ReadWalker { @Argument(fullName="nWayOut", shortName="nWayOut", required=false, doc="Generate one output file for each input (-I) bam file") protected String N_WAY_OUT = null; + @Hidden + @Argument(fullName="generate_nWayOut_md5s",doc="Generate md5sums for BAMs") + protected boolean generateMD5s = false; // DEBUGGING OPTIONS FOLLOW @@ -401,9 +404,9 @@ public class IndelRealigner extends ReadWalker { // if ( args.containsKey("disable_bam_indexing") ) { System.out.println("NO INDEXING!!"); System.exit(1); createIndex = false; } if ( N_WAY_OUT.toUpperCase().endsWith(".MAP") ) { - writerToUse = new NWaySAMFileWriter(getToolkit(),loadFileNameMap(N_WAY_OUT),SAMFileHeader.SortOrder.coordinate,true, createIndex); + writerToUse = new NWaySAMFileWriter(getToolkit(),loadFileNameMap(N_WAY_OUT),SAMFileHeader.SortOrder.coordinate,true, createIndex, generateMD5s); } else { - writerToUse = new NWaySAMFileWriter(getToolkit(),N_WAY_OUT,SAMFileHeader.SortOrder.coordinate,true, createIndex); + writerToUse = new NWaySAMFileWriter(getToolkit(),N_WAY_OUT,SAMFileHeader.SortOrder.coordinate,true, createIndex, generateMD5s); } } else { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 7c436ce44..9edf5b5d4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -32,16 +32,11 @@ import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; -import 
org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; -import java.io.File; -import java.io.FileWriter; -import java.io.PrintStream; import java.util.Arrays; import java.util.HashMap; import java.util.LinkedHashMap; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java index 5b10a79c6..74cbfa05f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java @@ -36,12 +36,8 @@ import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter; import org.broadinstitute.sting.gatk.filters.Platform454Filter; import org.broadinstitute.sting.gatk.filters.PlatformUnitFilter; -import org.broadinstitute.sting.gatk.filters.PlatformUnitFilterHelper; import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.SeekableRODIterator; -import org.broadinstitute.sting.utils.codecs.refseq.Transcript; -import org.broadinstitute.sting.utils.codecs.refseq.RefSeqCodec; -import org.broadinstitute.sting.utils.codecs.refseq.RefSeqFeature; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator; @@ -51,6 +47,9 @@ import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.utils.GenomeLoc; import 
org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.codecs.refseq.RefSeqCodec; +import org.broadinstitute.sting.utils.codecs.refseq.RefSeqFeature; +import org.broadinstitute.sting.utils.codecs.refseq.Transcript; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.collections.CircularArray; import org.broadinstitute.sting.utils.collections.PrimitivePair; @@ -392,7 +391,7 @@ public class SomaticIndelDetectorWalker extends ReadWalker { location = getToolkit().getGenomeLocParser().createGenomeLoc(getToolkit().getSAMFileHeader().getSequence(0).getSequenceName(),1); - normalSamples = getToolkit().getSamplesByReaders().get(0); + normalSamples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeaders().get(0)); try { // we already checked that bedOutput and output_file are not set simultaneously diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java index 17a6e20f1..998cfa654 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java @@ -30,7 +30,6 @@ import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; @@ -1095,14 +1094,14 @@ public class ReadBackedPhasingWalker extends RodWalker { + public Integer 
map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker tracker) { + Sample sample = getSampleDB().getSample(read); + return sample.getGender() == Gender.MALE ? 1 : 0; + } + + public Integer reduceInit() { return 0; } + + public Integer reduce(Integer value, Integer sum) { + return value + sum; + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java index 72058ba7b..e83434037 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java @@ -130,6 +130,10 @@ public class CountVariants extends VariantEvaluator implements StandardEval { nVariantLoci++; nMixed++; break; + case SYMBOLIC: + // ignore symbolic alleles, but don't fail + // todo - consistent way of treating symbolic alleles throughout codebase? + break; default: throw new ReviewedStingException("Unexpected VariantContext type " + vc1.getType()); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java index c44d84136..81d0c36ac 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java @@ -133,7 +133,7 @@ public class VariantsToTable extends RodWalker { /** * By default, this tool throws a UserException when it encounters a field without a value in some record. 
This - * is generally useful when you mistype -F CHRMO, so that you get a friendly warning about CHRMO not being + * is generally useful when you mistype -F CHROM, so that you get a friendly warning about CHRMO not being * found before the tool runs through 40M 1000G records. However, in some cases you genuinely want to allow such * fields (e.g., AC not being calculated for filtered records, if included). When provided, this argument * will cause VariantsToTable to write out NA values for missing fields instead of throwing an error. diff --git a/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java b/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java index 8da118174..e62a7e512 100755 --- a/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java +++ b/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java @@ -1,13 +1,10 @@ package org.broadinstitute.sting.utils; -import org.apache.commons.lang.ArrayUtils; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; +import org.broadinstitute.sting.gatk.samples.Sample; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import java.util.Arrays; import java.util.Collection; import java.util.List; import java.util.regex.Matcher; @@ -19,9 +16,6 @@ import java.util.regex.Pattern; * Time: 12:38 PM */ public class MendelianViolation { - - - String sampleMom; String sampleDad; String sampleChild; @@ -34,22 +28,15 @@ public class MendelianViolation { private static Pattern FAMILY_PATTERN = Pattern.compile("(.*)\\+(.*)=(.*)"); - static final int[] mvOffsets = new int[] { 1,2,5,6,8,11,15,18,20,21,24,25 }; - static final int[] nonMVOffsets = new int[]{ 0,3,4,7,9,10,12,13,14,16,17,19,22,23,26 }; - - public String getSampleMom() { return sampleMom; } - 
public String getSampleDad() { return sampleDad; } - public String getSampleChild() { return sampleChild; } - public double getMinGenotypeQuality() { return minGenotypeQuality; } @@ -90,37 +77,12 @@ public class MendelianViolation { * @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to asses mendelian violation */ public MendelianViolation(Sample sample, double minGenotypeQualityP) { - sampleMom = sample.getMother().getId(); - sampleDad = sample.getFather().getId(); - sampleChild = sample.getId(); + sampleMom = sample.getMother().getID(); + sampleDad = sample.getFather().getID(); + sampleChild = sample.getID(); minGenotypeQuality = minGenotypeQualityP; } - - /** - * The most common constructor to be used when give a YAML file with the relationships to the engine with the -SM option. - * @param engine - The GATK engine, use getToolkit(). That's where the sample information is stored. - * @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to asses mendelian violation - */ - public MendelianViolation(GenomeAnalysisEngine engine, double minGenotypeQualityP) { - boolean gotSampleInformation = false; - Collection samples = engine.getSamples(); - // Iterate through all samples in the sample_metadata file but we really can only take one. - for (Sample sample : samples) { - if (sample.getMother() != null && sample.getFather() != null) { - sampleMom = sample.getMother().getId(); - sampleDad = sample.getFather().getId(); - sampleChild = sample.getId(); - minGenotypeQuality = minGenotypeQualityP; - gotSampleInformation = true; - break; // we can only deal with one trio information - } - } - if (!gotSampleInformation) - throw new UserException("YAML file has no sample with relationship information (mother/father)"); - } - - /** * This method prepares the object to evaluate for violation. Typically you won't call it directly, a call to * isViolation(vc) will take care of this. 
But if you want to know whether your site was a valid comparison site @@ -158,7 +120,7 @@ public class MendelianViolation { * @return False if we can't determine (lack of information), or it's not a violation. True if it is a violation. * */ - public boolean isViolation (VariantContext vc) + public boolean isViolation(VariantContext vc) { return setAlleles(vc) && isViolation(); } @@ -172,42 +134,4 @@ public class MendelianViolation { return false; return true; } - - /** - * @return the likelihood ratio for a mendelian violation - */ - public double violationLikelihoodRatio(VariantContext vc) { - double[] logLikAssignments = new double[27]; - // the matrix to set up is - // MOM DAD CHILD - // |- AA - // AA AA | AB - // |- BB - // |- AA - // AA AB | AB - // |- BB - // etc. The leaves are counted as 0-11 for MVs and 0-14 for non-MVs - double[] momGL = vc.getGenotype(sampleMom).getLikelihoods().getAsVector(); - double[] dadGL = vc.getGenotype(sampleDad).getLikelihoods().getAsVector(); - double[] childGL = vc.getGenotype(sampleChild).getLikelihoods().getAsVector(); - int offset = 0; - for ( int oMom = 0; oMom < 3; oMom++ ) { - for ( int oDad = 0; oDad < 3; oDad++ ) { - for ( int oChild = 0; oChild < 3; oChild ++ ) { - logLikAssignments[offset++] = momGL[oMom] + dadGL[oDad] + childGL[oChild]; - } - } - } - double[] mvLiks = new double[12]; - double[] nonMVLiks = new double[15]; - for ( int i = 0; i < 12; i ++ ) { - mvLiks[i] = logLikAssignments[mvOffsets[i]]; - } - - for ( int i = 0; i < 15; i++) { - nonMVLiks[i] = logLikAssignments[nonMVOffsets[i]]; - } - - return MathUtils.log10sumLog10(mvLiks) - MathUtils.log10sumLog10(nonMVLiks); - } } diff --git a/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java b/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java index 1b4703e4a..edc1413ba 100755 --- a/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java @@ -69,6 +69,18 @@ 
public class SampleUtils { return samples; } + + /** + * Same as @link getSAMFileSamples but gets all of the samples + * in the SAM files loaded by the engine + * + * @param engine + * @return + */ + public final static Set getSAMFileSamples(GenomeAnalysisEngine engine) { + return SampleUtils.getSAMFileSamples(engine.getSAMFileHeader()); + } + /** * Gets all of the unique sample names from all VCF rods input by the user * diff --git a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java index 11a59de10..0106442e0 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java @@ -4,7 +4,6 @@ import com.google.java.contract.Requires; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.sam.ReadUtils; diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index 70f7387f4..77f1ed6c0 100755 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -215,6 +215,10 @@ public class UserException extends ReviewedStingException { super(String.format("File %s is malformed: %s caused by %s", f.getAbsolutePath(), message, e.getMessage())); } + public MalformedFile(String name, String message) { + super(String.format("File associated with name %s is malformed: %s", name, message)); + } + public MalformedFile(String name, String message, Exception e) { super(String.format("File associated with name %s is malformed: %s caused by %s", name, 
message, e.getMessage())); } diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java index 3821c9c8a..b3f2bc6b0 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java @@ -26,11 +26,9 @@ package org.broadinstitute.sting.utils.pileup; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.StingException; import java.util.*; @@ -114,10 +112,10 @@ public abstract class AbstractReadBackedPileup> pileupsBySample) { + protected AbstractReadBackedPileup(GenomeLoc loc, Map> pileupsBySample) { this.loc = loc; PerSamplePileupElementTracker tracker = new PerSamplePileupElementTracker(); - for(Map.Entry> pileupEntry: pileupsBySample.entrySet()) { + for(Map.Entry> pileupEntry: pileupsBySample.entrySet()) { tracker.addElements(pileupEntry.getKey(),pileupEntry.getValue().pileupElementTracker); addPileupToCumulativeStats(pileupEntry.getValue()); } @@ -213,7 +211,7 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - for(Sample sample: tracker.getSamples()) { + for(final String sample: tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getPileupWithoutDeletions(); filteredTracker.addElements(sample,pileup.pileupElementTracker); @@ -251,7 +249,7 @@ 
public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - for(Sample sample: tracker.getSamples()) { + for(final String sample: tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getOverlappingFragmentFilteredPileup(); filteredTracker.addElements(sample,pileup.pileupElementTracker); @@ -305,7 +303,7 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - for(Sample sample: tracker.getSamples()) { + for(final String sample: tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getPileupWithoutMappingQualityZeroReads(); filteredTracker.addElements(sample,pileup.pileupElementTracker); @@ -334,7 +332,7 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - for(Sample sample: tracker.getSamples()) { + for(final String sample: tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getPositiveStrandPileup(); filteredTracker.addElements(sample,pileup.pileupElementTracker); @@ -363,7 +361,7 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - for(Sample sample: tracker.getSamples()) { + for(final String sample: tracker.getSamples()) { PileupElementTracker perSampleElements = 
tracker.getElements(sample); AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getNegativeStrandPileup(); filteredTracker.addElements(sample,pileup.pileupElementTracker); @@ -393,7 +391,7 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - for(Sample sample: tracker.getSamples()) { + for(final String sample: tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getFilteredPileup(filter); filteredTracker.addElements(sample,pileup.pileupElementTracker); @@ -425,7 +423,7 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - for(Sample sample: tracker.getSamples()) { + for(final String sample: tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getBaseAndMappingFilteredPileup(minBaseQ,minMapQ); filteredTracker.addElements(sample,pileup.pileupElementTracker); @@ -492,7 +490,7 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - for(Sample sample: tracker.getSamples()) { + for(final String sample: tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getPileupForReadGroup(targetReadGroupId); if(pileup != null) @@ -523,7 +521,7 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker; PerSamplePileupElementTracker 
filteredTracker = new PerSamplePileupElementTracker(); - for(Sample sample: tracker.getSamples()) { + for(final String sample: tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getPileupForLane(laneID); if(pileup != null) @@ -550,14 +548,10 @@ public abstract class AbstractReadBackedPileup getSampleNames() { + public Collection getSamples() { if(pileupElementTracker instanceof PerSamplePileupElementTracker) { PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker)pileupElementTracker; - Collection sampleNames = new HashSet(); - for (Sample sample : tracker.getSamples()) { - sampleNames.add(sample.getId()); - } - return sampleNames; + return new HashSet(tracker.getSamples()); } else { Collection sampleNames = new HashSet(); @@ -570,16 +564,6 @@ public abstract class AbstractReadBackedPileup getSamples() { - if(!(pileupElementTracker instanceof PerSamplePileupElementTracker)) { - throw new StingException("Must be an instance of PerSampleElementTracker"); - } - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker)pileupElementTracker; - return tracker.getSamples(); - } - - /** * Returns a pileup randomly downsampled to the desiredCoverage. 
* @@ -604,7 +588,7 @@ public abstract class AbstractReadBackedPileup perSampleElements = tracker.getElements(sample); List filteredPileup = new ArrayList(); @@ -639,7 +623,7 @@ public abstract class AbstractReadBackedPileup sampleNames) { + public RBP getPileupForSamples(Collection sampleNames) { if(pileupElementTracker instanceof PerSamplePileupElementTracker) { PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker)pileupElementTracker; PileupElementTracker filteredElements = tracker.getElements(sampleNames); @@ -665,7 +649,7 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker; PileupElementTracker filteredElements = tracker.getElements(sampleName); @@ -688,30 +672,6 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker; - PileupElementTracker filteredElements = tracker.getElements(sample); - return filteredElements != null ? (RBP)createNewPileup(loc,filteredElements) : null; - } - else { - UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - for(PE p: pileupElementTracker) { - SAMRecord read = p.getRead(); - if(sample != null) { - if(read.getReadGroup() != null && sample.getId().equals(read.getReadGroup().getSample())) - filteredTracker.add(p); - } - else { - if(read.getReadGroup() == null || read.getReadGroup().getSample() == null) - filteredTracker.add(p); - } - } - return filteredTracker.size()>0 ? 
(RBP)createNewPileup(loc,filteredTracker) : null; - } - } - // -------------------------------------------------------- // // iterators @@ -801,7 +761,7 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker; - for(Sample sample: tracker.getSamples()) { + for(final String sample: tracker.getSamples()) { int[] countsBySample = createNewPileup(loc,tracker.getElements(sample)).getBaseCounts(); for(int i = 0; i < counts.length; i++) counts[i] += countsBySample[i]; diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/MergingPileupElementIterator.java b/public/java/src/org/broadinstitute/sting/utils/pileup/MergingPileupElementIterator.java index 7005cf869..c00ed24f2 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/MergingPileupElementIterator.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/MergingPileupElementIterator.java @@ -25,7 +25,6 @@ package org.broadinstitute.sting.utils.pileup; import net.sf.picard.util.PeekableIterator; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; import java.util.Comparator; import java.util.Iterator; @@ -42,7 +41,7 @@ class MergingPileupElementIterator implements Iterator public MergingPileupElementIterator(PerSamplePileupElementTracker tracker) { perSampleIterators = new PriorityQueue>(Math.max(1,tracker.getSamples().size()),new PileupElementIteratorComparator()); - for(Sample sample: tracker.getSamples()) { + for(final String sample: tracker.getSamples()) { PileupElementTracker trackerPerSample = tracker.getElements(sample); if(trackerPerSample.size() != 0) perSampleIterators.add(new PeekableIterator(trackerPerSample.iterator())); diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElementTracker.java b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElementTracker.java index 29e431695..09b907e00 100644 --- 
a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElementTracker.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElementTracker.java @@ -24,8 +24,6 @@ package org.broadinstitute.sting.utils.pileup; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; - import java.util.*; /** @@ -60,52 +58,35 @@ class UnifiedPileupElementTracker extends PileupElemen } class PerSamplePileupElementTracker extends PileupElementTracker { - private final Map> pileup; - private final Map sampleNames = new HashMap(); + private final Map> pileup; private int size = 0; public PerSamplePileupElementTracker() { - pileup = new HashMap>(); - } - - public PerSamplePileupElementTracker(Map> pileupsBySample) { - pileup = new HashMap>(); - for(Map.Entry> entry: pileupsBySample.entrySet()) { - Sample sample = entry.getKey(); - AbstractReadBackedPileup pileupBySample = entry.getValue(); - pileup.put(sample,pileupBySample.pileupElementTracker); - sampleNames.put(sample.getId(), sample); - } + pileup = new HashMap>(); } /** * Gets a list of all the samples stored in this pileup. * @return List of samples in this pileup. 
*/ - public Collection getSamples() { + public Collection getSamples() { return pileup.keySet(); } - public PileupElementTracker getElements(final Sample sample) { + public PileupElementTracker getElements(final String sample) { return pileup.get(sample); } - public PileupElementTracker getElements(final String sampleName) { - return pileup.get(sampleNames.get(sampleName)); - } - public PileupElementTracker getElements(final Collection selectSampleNames) { PerSamplePileupElementTracker result = new PerSamplePileupElementTracker(); - for (String sample : selectSampleNames) { - Sample sampleObject = sampleNames.get(sample); - result.addElements(sampleObject, pileup.get(sampleObject)); + for (final String sample : selectSampleNames) { + result.addElements(sample, pileup.get(sample)); } return result; } - public void addElements(final Sample sample, PileupElementTracker elements) { + public void addElements(final String sample, PileupElementTracker elements) { pileup.put(sample,elements); - sampleNames.put(sample.getId(), sample); size += elements.size(); } diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileup.java index 8d43a368a..e7c0bc18f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileup.java @@ -25,7 +25,6 @@ package org.broadinstitute.sting.utils.pileup; import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.collections.Pair; @@ -121,20 +120,7 @@ public interface ReadBackedExtendedEventPileup extends ReadBackedPileup { * Gets a list of all the samples stored in this pileup. * @return List of samples in this pileup. 
*/ - public Collection getSampleNames(); - - /** - * Gets a list of all the samples stored in this pileup. - * @return List of samples in this pileup. - */ - public Collection getSamples(); - - /** - * Gets the particular subset of this pileup with the given sample name. - * @param sample Name of the sample to use. - * @return A subset of this pileup containing only reads with the given sample. - */ - public ReadBackedExtendedEventPileup getPileupForSample(Sample sample); + public Collection getSamples(); public Iterable toExtendedIterable(); diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java index 31d29430a..21dfee8b8 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java @@ -24,7 +24,6 @@ package org.broadinstitute.sting.utils.pileup; import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -56,7 +55,7 @@ public class ReadBackedExtendedEventPileupImpl extends AbstractReadBackedPileup< } // this is the good new one - public ReadBackedExtendedEventPileupImpl(GenomeLoc loc, Map pileupElementsBySample) { + public ReadBackedExtendedEventPileupImpl(GenomeLoc loc, Map pileupElementsBySample) { super(loc,pileupElementsBySample); } diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java index 36b8a8c65..3d30aa11b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java +++ 
b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java @@ -25,7 +25,6 @@ package org.broadinstitute.sting.utils.pileup; import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.HasGenomeLocation; @@ -137,18 +136,11 @@ public interface ReadBackedPileup extends Iterable, HasGenomeLoca */ public ReadBackedPileup getPileupForLane(String laneID); - - /** - * Gets a collection of all the samples stored in this pileup. - * @return Collection of samples in this pileup. - */ - public Collection getSamples(); - /** * Gets a collection of *names* of all the samples stored in this pileup. * @return Collection of names */ - public Collection getSampleNames(); + public Collection getSamples(); /** @@ -156,7 +148,7 @@ public interface ReadBackedPileup extends Iterable, HasGenomeLoca * @param sampleNames Name of the sample to use. * @return A subset of this pileup containing only reads with the given sample. */ - public ReadBackedPileup getPileupForSampleNames(Collection sampleNames); + public ReadBackedPileup getPileupForSamples(Collection sampleNames); /** @@ -164,14 +156,7 @@ public interface ReadBackedPileup extends Iterable, HasGenomeLoca * @param sampleName Name of the sample to use. * @return A subset of this pileup containing only reads with the given sample. */ - public ReadBackedPileup getPileupForSampleName(String sampleName); - - /** - * Gets the particular subset of this pileup with the given sample. - * @param sample Sample to use. - * @return A subset of this pileup containing only reads with the given sample. 
- */ - public ReadBackedPileup getPileupForSample(Sample sample); + public ReadBackedPileup getPileupForSample(String sampleName); /** * Simple useful routine to count the number of deletion bases in this pileup diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java index e5b054961..18e6d9134 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java @@ -24,7 +24,6 @@ package org.broadinstitute.sting.utils.pileup; import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; import org.broadinstitute.sting.utils.GenomeLoc; import java.util.List; @@ -48,7 +47,7 @@ public class ReadBackedPileupImpl extends AbstractReadBackedPileup pileupElementsBySample) { + public ReadBackedPileupImpl(GenomeLoc loc, Map pileupElementsBySample) { super(loc,pileupElementsBySample); } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java b/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java index 70417889b..07bfc52c7 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java @@ -51,18 +51,18 @@ public class NWaySAMFileWriter implements SAMFileWriter { private boolean presorted ; GenomeAnalysisEngine toolkit; - public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, Map in2out, SAMFileHeader.SortOrder order, boolean presorted, boolean indexOnTheFly) { + public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, Map in2out, SAMFileHeader.SortOrder order, boolean presorted, boolean indexOnTheFly, boolean generateMD5) { this.presorted = presorted; this.toolkit = toolkit; writerMap = new HashMap(); - setupByReader(toolkit,in2out,order, presorted, indexOnTheFly); + 
setupByReader(toolkit,in2out,order, presorted, indexOnTheFly, generateMD5); } - public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, boolean presorted, boolean indexOnTheFly ) { + public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, boolean presorted, boolean indexOnTheFly , boolean generateMD5) { this.presorted = presorted; this.toolkit = toolkit; writerMap = new HashMap(); - setupByReader(toolkit,ext,order, presorted, indexOnTheFly); + setupByReader(toolkit,ext,order, presorted, indexOnTheFly, generateMD5); } @@ -73,8 +73,7 @@ public class NWaySAMFileWriter implements SAMFileWriter { * @param toolkit * @param in2out */ - public void setupByReader(GenomeAnalysisEngine toolkit, Map in2out, SAMFileHeader.SortOrder order, boolean presorted, boolean indexOnTheFly) { - + public void setupByReader(GenomeAnalysisEngine toolkit, Map in2out, SAMFileHeader.SortOrder order, boolean presorted, boolean indexOnTheFly, boolean generateMD5) { if ( in2out==null ) throw new StingException("input-output bam filename map for n-way-out writing is NULL"); for ( SAMReaderID rid : toolkit.getReadsDataSource().getReaderIDs() ) { @@ -88,7 +87,7 @@ public class NWaySAMFileWriter implements SAMFileWriter { if ( writerMap.containsKey( rid ) ) throw new StingException("nWayOut mode: Reader id for input sam file "+fName+" is already registered"); - addWriter(rid,outName, order, presorted, indexOnTheFly); + addWriter(rid,outName, order, presorted, indexOnTheFly, generateMD5); } } @@ -101,7 +100,7 @@ public class NWaySAMFileWriter implements SAMFileWriter { * @param toolkit * @param ext */ - public void setupByReader(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, boolean presorted, boolean indexOnTheFly) { + public void setupByReader(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, boolean presorted, boolean indexOnTheFly, boolean generateMD5) { for ( 
SAMReaderID rid : toolkit.getReadsDataSource().getReaderIDs() ) { String fName = toolkit.getReadsDataSource().getSAMFile(rid).getName(); @@ -119,16 +118,19 @@ public class NWaySAMFileWriter implements SAMFileWriter { if ( writerMap.containsKey( rid ) ) throw new StingException("nWayOut mode: Reader id for input sam file "+fName+" is already registered"); - addWriter(rid,outName, order, presorted, indexOnTheFly); + addWriter(rid,outName, order, presorted, indexOnTheFly, generateMD5); } } - private void addWriter(SAMReaderID id , String outName, SAMFileHeader.SortOrder order, boolean presorted, boolean indexOnTheFly) { + private void addWriter(SAMReaderID id , String outName, SAMFileHeader.SortOrder order, boolean presorted, boolean indexOnTheFly, boolean generateMD5) { File f = new File(outName); SAMFileHeader header = toolkit.getSAMFileHeader(id).clone(); header.setSortOrder(order); - SAMFileWriter sw = new SAMFileWriterFactory().setCreateIndex(indexOnTheFly).makeSAMOrBAMWriter(header, presorted, f); + SAMFileWriterFactory factory = new SAMFileWriterFactory(); + factory.setCreateIndex(indexOnTheFly); + factory.setCreateMd5File(generateMD5); + SAMFileWriter sw = factory.makeSAMOrBAMWriter(header, presorted, f); writerMap.put(id,sw); } diff --git a/public/java/src/org/broadinstitute/sting/utils/text/XReadLines.java b/public/java/src/org/broadinstitute/sting/utils/text/XReadLines.java index 52b6f3b01..49e9ddf52 100644 --- a/public/java/src/org/broadinstitute/sting/utils/text/XReadLines.java +++ b/public/java/src/org/broadinstitute/sting/utils/text/XReadLines.java @@ -99,9 +99,9 @@ public class XReadLines implements Iterator, Iterable { * * @param reader */ - public XReadLines(final BufferedReader reader, final boolean trimWhitespace) { + public XReadLines(final Reader reader, final boolean trimWhitespace) { try { - this.in = reader; + this.in = new BufferedReader(reader); nextline = readNextLine(); this.trimWhitespace = trimWhitespace; } catch(IOException e) { @@ 
-109,7 +109,7 @@ public class XReadLines implements Iterator, Iterable { } } - public XReadLines(final BufferedReader reader) throws FileNotFoundException { + public XReadLines(final Reader reader) { this(reader, true); } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index 412cbd90b..05e21c8b8 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -132,7 +132,7 @@ import java.util.*; * vc.hasGenotypes() * vc.isMonomorphic() * vc.isPolymorphic() - * vc.getSampleNames().size() + * vc.getSamples().size() * * vc.getGenotypes() * vc.getGenotypes().get("g1") diff --git a/public/java/test/org/broadinstitute/sting/MD5DB.java b/public/java/test/org/broadinstitute/sting/MD5DB.java index 0194e114a..374a9f8da 100644 --- a/public/java/test/org/broadinstitute/sting/MD5DB.java +++ b/public/java/test/org/broadinstitute/sting/MD5DB.java @@ -129,7 +129,7 @@ public class MD5DB { System.out.printf("##### Skipping update, cannot write file %s%n", dbFile); } } else { - System.out.printf("##### MD5 file is up to date: %s%n", dbFile.getPath()); + //System.out.printf("##### MD5 file is up to date: %s%n", dbFile.getPath()); } } @@ -170,6 +170,18 @@ public class MD5DB { return bytes; } + public static class MD5Match { + final String md5; + final String failMessage; + boolean failed; + + public MD5Match(final String md5, final String failMessage, final boolean failed) { + this.md5 = md5; + this.failMessage = failMessage; + this.failed = failed; + } + } + /** * Tests a file MD5 against an expected value, returning the MD5. NOTE: This function WILL throw an exception if the MD5s are different. * @param name Name of the test. 
@@ -178,18 +190,21 @@ public class MD5DB { * @param parameterize If true or if expectedMD5 is an empty string, will print out the calculated MD5 instead of error text. * @return The calculated MD5. */ - public static String assertMatchingMD5(final String name, final File resultsFile, final String expectedMD5, final boolean parameterize) { - String filemd5sum = testFileMD5(name, resultsFile, expectedMD5, parameterize); + public static MD5Match assertMatchingMD5(final String name, final File resultsFile, final String expectedMD5, final boolean parameterize) { + final String filemd5sum = testFileMD5(name, resultsFile, expectedMD5, parameterize); + String failMessage = null; + boolean failed = false; if (parameterize || expectedMD5.equals("")) { // Don't assert } else if ( filemd5sum.equals(expectedMD5) ) { - System.out.println(String.format(" => %s PASSED", name)); + System.out.println(String.format(" => %s PASSED (expected=%s)", name, expectedMD5)); } else { - Assert.fail(String.format("%s has mismatching MD5s: expected=%s observed=%s", name, expectedMD5, filemd5sum)); + failed = true; + failMessage = String.format("%s has mismatching MD5s: expected=%s observed=%s", name, expectedMD5, filemd5sum); } - return filemd5sum; + return new MD5Match(filemd5sum, failMessage, failed); } @@ -218,8 +233,8 @@ public class MD5DB { System.out.println(String.format("PARAMETERIZATION[%s]: file %s has md5 = %s, stated expectation is %s, equal? = %b", name, resultsFile, filemd5sum, expectedMD5, filemd5sum.equals(expectedMD5))); } else { - System.out.println(String.format("Checking MD5 for %s [calculated=%s, expected=%s]", resultsFile, filemd5sum, expectedMD5)); - System.out.flush(); + //System.out.println(String.format("Checking MD5 for %s [calculated=%s, expected=%s]", resultsFile, filemd5sum, expectedMD5)); + //System.out.flush(); if ( ! expectedMD5.equals(filemd5sum) ) { // we are going to fail for real in assertEquals (so we are counted by the testing framework). 
diff --git a/public/java/test/org/broadinstitute/sting/WalkerTest.java b/public/java/test/org/broadinstitute/sting/WalkerTest.java index a1817e3c7..ca7653b58 100755 --- a/public/java/test/org/broadinstitute/sting/WalkerTest.java +++ b/public/java/test/org/broadinstitute/sting/WalkerTest.java @@ -52,7 +52,7 @@ public class WalkerTest extends BaseTest { GenomeAnalysisEngine.resetRandomGenerator(); } - public String assertMatchingMD5(final String name, final File resultsFile, final String expectedMD5) { + public MD5DB.MD5Match assertMatchingMD5(final String name, final File resultsFile, final String expectedMD5) { return MD5DB.assertMatchingMD5(name, resultsFile, expectedMD5, parameterize()); } @@ -84,10 +84,23 @@ public class WalkerTest extends BaseTest { public List assertMatchingMD5s(final String name, List resultFiles, List expectedMD5s) { List md5s = new ArrayList(); + List fails = new ArrayList(); + for (int i = 0; i < resultFiles.size(); i++) { - String md5 = assertMatchingMD5(name, resultFiles.get(i), expectedMD5s.get(i)); - maybeValidateSupplementaryFile(name, resultFiles.get(i)); - md5s.add(i, md5); + MD5DB.MD5Match result = assertMatchingMD5(name, resultFiles.get(i), expectedMD5s.get(i)); + if ( ! result.failed ) { + maybeValidateSupplementaryFile(name, resultFiles.get(i)); + md5s.add(result.md5); + } else { + fails.add(result); + } + } + + if ( ! 
fails.isEmpty() ) { + for ( final MD5DB.MD5Match fail : fails ) { + logger.warn("Fail: " + fail.failMessage); + } + Assert.fail("Test failed: " + name); } return md5s; diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java index e5cf80826..2adb4864c 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java @@ -8,11 +8,9 @@ import org.broadinstitute.sting.gatk.datasources.reads.MockLocusShard; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; import org.broadinstitute.sting.gatk.datasources.reads.Shard; import org.broadinstitute.sting.gatk.executive.WindowMaker; -import org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource; import org.broadinstitute.sting.gatk.datasources.reads.LocusShard; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; -import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.testng.annotations.BeforeClass; @@ -51,7 +49,7 @@ public abstract class LocusViewTemplate extends BaseTest { GenomeLoc shardBounds = genomeLocParser.createGenomeLoc("chr1", 1, 5); Shard shard = new LocusShard(genomeLocParser, new SAMDataSource(Collections.emptyList(),genomeLocParser),Collections.singletonList(shardBounds),Collections.emptyMap()); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs(), new SampleDataSource()); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider 
dataProvider = new LocusShardDataProvider(shard, null, genomeLocParser, window.getLocus(), window, null, null); @@ -67,7 +65,7 @@ public abstract class LocusViewTemplate extends BaseTest { GenomeLoc shardBounds = genomeLocParser.createGenomeLoc("chr1", 1, 5); Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(shardBounds)); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs(), new SampleDataSource()); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); @@ -82,7 +80,7 @@ public abstract class LocusViewTemplate extends BaseTest { SAMRecordIterator iterator = new SAMRecordIterator(read); Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs(),new SampleDataSource()); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); LocusView view = createView(dataProvider); @@ -96,7 +94,7 @@ public abstract class LocusViewTemplate extends BaseTest { SAMRecordIterator iterator = new SAMRecordIterator(read); Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs(), new SampleDataSource()); + WindowMaker windowMaker = new 
WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); LocusView view = createView(dataProvider); @@ -110,7 +108,7 @@ public abstract class LocusViewTemplate extends BaseTest { SAMRecordIterator iterator = new SAMRecordIterator(read); Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs(), new SampleDataSource()); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); LocusView view = createView(dataProvider); @@ -124,7 +122,7 @@ public abstract class LocusViewTemplate extends BaseTest { SAMRecordIterator iterator = new SAMRecordIterator(read); Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 5, 5))); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs(),new SampleDataSource()); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); LocusView view = createView(dataProvider); @@ -138,7 +136,7 @@ public abstract class LocusViewTemplate extends BaseTest { SAMRecordIterator iterator = new SAMRecordIterator(read); Shard shard = new 
MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 6, 15))); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs(), new SampleDataSource()); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); LocusView view = createView(dataProvider); @@ -152,7 +150,7 @@ public abstract class LocusViewTemplate extends BaseTest { SAMRecordIterator iterator = new SAMRecordIterator(read); Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs(),new SampleDataSource()); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); LocusView view = createView(dataProvider); @@ -167,7 +165,7 @@ public abstract class LocusViewTemplate extends BaseTest { SAMRecordIterator iterator = new SAMRecordIterator(read1, read2); Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs(),new SampleDataSource()); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, 
window.getLocus(), window, null, null); LocusView view = createView(dataProvider); @@ -186,7 +184,7 @@ public abstract class LocusViewTemplate extends BaseTest { SAMRecordIterator iterator = new SAMRecordIterator(read1, read2, read3, read4); Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs(),new SampleDataSource()); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); LocusView view = createView(dataProvider); @@ -205,7 +203,7 @@ public abstract class LocusViewTemplate extends BaseTest { SAMRecordIterator iterator = new SAMRecordIterator(read1, read2, read3, read4); Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs(),new SampleDataSource()); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); LocusView view = createView(dataProvider); @@ -226,7 +224,7 @@ public abstract class LocusViewTemplate extends BaseTest { SAMRecordIterator iterator = new SAMRecordIterator(read1, read2, read3, read4, read5, read6); Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); - WindowMaker windowMaker = new 
WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs(), new SampleDataSource()); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); LocusView view = createView(dataProvider); @@ -254,7 +252,7 @@ public abstract class LocusViewTemplate extends BaseTest { read07, read08, read09, read10, read11, read12); Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 6, 15))); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs(), new SampleDataSource()); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); LocusView view = createView(dataProvider); diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java index 2ecd75754..5ee373e4f 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java @@ -26,7 +26,6 @@ package org.broadinstitute.sting.gatk.datasources.reads; import com.google.caliper.Param; import net.sf.picard.filter.FilteringIterator; -import net.sf.picard.filter.SamRecordFilter; import net.sf.samtools.SAMFileReader; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.commandline.Tags; @@ -34,15 +33,12 @@ import 
org.broadinstitute.sting.gatk.DownsamplingMethod; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; -import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; -import org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter; import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.baq.BAQ; -import java.io.File; import java.util.Collections; import java.util.Iterator; @@ -88,12 +84,9 @@ public class DownsamplerBenchmark extends ReadProcessingBenchmark { (byte)0); GenomeLocParser genomeLocParser = new GenomeLocParser(reader.getFileHeader().getSequenceDictionary()); - SampleDataSource sampleDataSource = new SampleDataSource(); - sampleDataSource.addSamplesFromSAMHeader(reader.getFileHeader()); - // Filter unmapped reads. TODO: is this always strictly necessary? Who in the GATK normally filters these out? 
Iterator readIterator = new FilteringIterator(reader.iterator(),new UnmappedReadFilter()); - LocusIteratorByState locusIteratorByState = new LocusIteratorByState(readIterator,readProperties,genomeLocParser,sampleDataSource); + LocusIteratorByState locusIteratorByState = new LocusIteratorByState(readIterator,readProperties,genomeLocParser, LocusIteratorByState.sampleListForSAMWithoutReadGroups()); while(locusIteratorByState.hasNext()) { locusIteratorByState.next().getLocation(); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSourceUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSourceUnitTest.java deleted file mode 100644 index 59405c065..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSourceUnitTest.java +++ /dev/null @@ -1,241 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.sample; - -import net.sf.samtools.SAMFileHeader; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.testng.Assert; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.exceptions.StingException; - -import org.testng.annotations.Test; - -import java.io.File; -import java.util.*; - -/** - * Created by IntelliJ IDEA. 
- * User: brett - * Date: Sep 9, 2010 - * Time: 8:21:00 AM - */ -public class SampleDataSourceUnitTest extends BaseTest { - - // this empty header used to instantiate sampledatasource objects - private static SAMFileHeader header = new SAMFileHeader(); - - // all the test sample files are located here - private String sampleFilesDir = validationDataLocation + "samples/"; - - // make sure samples are created from the SAM file correctly - @Test() - public void loadSAMSamplesTest() { - SampleDataSource s = new SampleDataSource(header, null); - } - - // tests that a basic sample with relationships loads correctly - // Note that this is the only test for family relationships - we may want to expand this - @Test() - public void basicLoadSampleFileTest() { - File sampleFile = new File(sampleFilesDir + "basicSampleFile.yaml"); - SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile)); - Assert.assertTrue(s.sampleCount() == 5); - Sample sampleA = s.getSampleById("sampleA"); - Sample sampleB = s.getSampleById("sampleB"); - Assert.assertTrue(sampleB.getMother() == sampleA); - Assert.assertTrue(s.getChildren(sampleA).contains(sampleB)); - Set family = s.getFamily("family1"); - Assert.assertTrue(family.size() == 2); - Assert.assertTrue(family.contains(sampleA)); - Assert.assertTrue(family.contains(sampleB)); - } - - // but that file should fail if it has an extra character in it... - @Test(expectedExceptions=StingException.class) - public void loadInvalidSampleExtraCharText() { - File sampleFile = new File(sampleFilesDir + "invalidSyntaxExtraChar.yaml"); - SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile)); - } - - // ...or a typo... 
- @Test(expectedExceptions=StingException.class) - public void loadInvalidSampleTypoText() { - File sampleFile = new File(sampleFilesDir + "invalidSyntaxTypo.yaml"); - SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile)); - - } - - // ...or an extra unrecognized array - @Test(expectedExceptions=StingException.class) - public void loadInvalidSampleExtraArrayText() { - File sampleFile = new File(sampleFilesDir + "invalidSyntaxExtraArray.yaml"); - SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile)); - } - - // make sure aliases work - @Test(expectedExceptions=StingException.class) - public void sampleAliasText() { - File sampleFile = new File(sampleFilesDir + "basicSampleFileWithAlias.yaml"); - SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile)); - // this file has two samples, but one has an alias. let's make sure that checks out... - Assert.assertTrue(s.sampleCount() == 3); - Assert.assertTrue(s.getSampleById("sampleA") == s.getSampleById("sampleC")); - } - - // error is thrown if property is included that's not in properties array - @Test(expectedExceptions=StingException.class) - public void unallowedPropertySampleTest() { - File sampleFile = new File(sampleFilesDir + "basicSampleFileUnallowedProperty.yaml"); - SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile)); - } - - // same as above, with relationship - @Test(expectedExceptions=StingException.class) - public void unallowedRelationshipSampleTest() { - File sampleFile = new File(sampleFilesDir + "basicSampleFileUnallowedRelationship.yaml"); - SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile)); - } - - // two sample files - @Test() - public void twoSampleFilesTest() { - File sampleFile = new File(sampleFilesDir + "basicSampleFile.yaml"); - File secondFile = new File(sampleFilesDir + "basicSampleFileExt.yaml"); - ArrayList files = new ArrayList(); - files.add(sampleFile); - 
files.add(secondFile); - SampleDataSource s = new SampleDataSource(header, files); - Assert.assertTrue(s.getSampleById("sampleA").getProperty("propC").equals("valC")); - Assert.assertTrue(s.getSampleById("sampleA").getProperty("propA").equals("valA")); - } - - // two sample files, with contradictory properties - @Test(expectedExceptions=StingException.class) - public void twoContradictorySampleFilesTest() { - File sampleFile = new File(sampleFilesDir + "basicSampleFile.yaml"); - File secondFile = new File(sampleFilesDir + "basicSampleFileInvalidExt.yaml"); - ArrayList files = new ArrayList(); - files.add(sampleFile); - files.add(secondFile); - SampleDataSource s = new SampleDataSource(header, files); - } - - // three sample files - @Test() - public void threeSamplesTest() { - File sampleFile = new File(sampleFilesDir + "basicSampleFile.yaml"); - ArrayList files = new ArrayList(); - files.add(sampleFile); - files.add(new File(sampleFilesDir + "basicSampleFileExt.yaml")); - files.add(new File(sampleFilesDir + "basicSampleFileExt2.yaml")); - SampleDataSource s = new SampleDataSource(header, files); - Assert.assertTrue(s.sampleCount() == 6); - Assert.assertTrue(s.getSampleById("sampleE").getProperty("propC").equals("valC")); - Assert.assertTrue(s.getSampleById("sampleA").getProperty("propA").equals("valA")); - } - - /** - * testing getSamplesWithProperty - * in this file there are 5 samples - 2 with population "CEU", 1 with population "ABC", 1 with no population, - * and then the default null sample - */ - @Test() - public void getSamplesWithPropertyTest() { - File sampleFile = new File(sampleFilesDir + "sampleFileWithProperties.yaml"); - SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile)); - Assert.assertTrue(s.sampleCount() == 5); - Set ceuSamples = s.getSamplesWithProperty("population", "CEU"); - Assert.assertTrue(ceuSamples.size() == 2); - - Iterator i = ceuSamples.iterator(); - ArrayList sampleNames = new ArrayList(); - 
sampleNames.add(i.next().getId()); - sampleNames.add(i.next().getId()); - Assert.assertTrue(sampleNames.contains("sampleA")); - Assert.assertTrue(sampleNames.contains("sampleB")); - } - - // make sure we can import data types other than Strings - @Test() - public void sampleTestPropertyType() { - File sampleFile = new File(sampleFilesDir + "sampleFileOtherTypes.yaml"); - SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile)); - Sample sample = s.getSampleById("sampleA"); - Assert.assertTrue(sample.getProperty("a").getClass() == Integer.class); - Assert.assertTrue(sample.getProperty("b").getClass() == String.class); - Assert.assertTrue(sample.getProperty("c").getClass() == Double.class); - Assert.assertTrue(sample.getProperty("b").getClass() == String.class); - } - - /** - * check that getSamplesFromVariantContext works - * create a variant context with two sample names, and make sure the right samples are there - */ - @Test() - public void variantContextTest() { - SampleDataSource s = new SampleDataSource(header, null); - List alleleCollection = new ArrayList(); - Allele a1 = Allele.create("A", true); - alleleCollection.add(a1); - - Set genotypeCollection = new HashSet(); - genotypeCollection.add(new Genotype("NA123", alleleCollection)); - genotypeCollection.add(new Genotype("NA456", alleleCollection)); - - VariantContext v = new VariantContext("contextName", "chr1", 1, 1, alleleCollection, genotypeCollection); - - // make sure the set that's returned is the right size - HashSet set = (HashSet) s.getSamplesByVariantContext(v); - Assert.assertTrue(set.size() == 2); - - // make sure both samples are included - Iterator i = set.iterator(); - ArrayList sampleNames = new ArrayList(); - sampleNames.add(i.next().getId()); - sampleNames.add(i.next().getId()); - Assert.assertTrue(sampleNames.contains("NA123")); - Assert.assertTrue(sampleNames.contains("NA456")); - } - - /** - * checking subContextFromSampleProperty - */ - - /** - * check that 
subContextFromSampleProperty works - * create a variant context with four sample names, make sure that it filters correctly to 2 - */ - @Test() - public void subContextFromSamplePropertyTest() { - - File sampleFile = new File(sampleFilesDir + "sampleFileWithProperties.yaml"); - SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile)); - Assert.assertTrue(s.sampleCount() == 5); - - List alleleCollection = new ArrayList(); - Allele a1 = Allele.create("A", true); - alleleCollection.add(a1); - - Set genotypeCollection = new HashSet(); - genotypeCollection.add(new Genotype("NA123", alleleCollection)); - genotypeCollection.add(new Genotype("sampleA", alleleCollection)); - genotypeCollection.add(new Genotype("sampleB", alleleCollection)); - genotypeCollection.add(new Genotype("sampleC", alleleCollection)); - - VariantContext v = new VariantContext("contextName", "chr1", 1, 1, alleleCollection, genotypeCollection); - VariantContext subContext = s.subContextFromSampleProperty(v, "population", "CEU"); - - Assert.assertTrue(subContext.getSampleNames().contains("sampleA")); - Assert.assertTrue(subContext.getSampleNames().contains("sampleA")); - Assert.assertTrue(subContext.getSampleNames().size() == 2); - - } - - - // we create lots of single item lists... 
- private ArrayList makeFileList(File file) { - ArrayList a = new ArrayList(); - a.add(file); - return a; - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleUnitTest.java deleted file mode 100644 index 67e84cdd8..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleUnitTest.java +++ /dev/null @@ -1,64 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.sample; - -import org.testng.Assert; -import org.broadinstitute.sting.BaseTest; - -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - -/** - * Created by IntelliJ IDEA. - * User: brett - * Date: Sep 9, 2010 - * Time: 8:21:00 AM - */ -public class SampleUnitTest extends BaseTest { - - static Sample sampleA; - static Sample sampleA1; - static Sample sampleB; - static Sample sampleC; - - @BeforeClass - public void init() { - sampleA = new Sample("sampleA"); - sampleA.setProperty("uniqueProperty", "uniqueValue"); - sampleA1 = new Sample("sampleA"); - sampleA1.setProperty("uniqueProperty", "uniqueValue"); - sampleB = new Sample("sampleB"); - sampleC = new Sample("sampleC"); - sampleC.setProperty("population", "pop1"); - sampleC.setProperty("gender", Sample.Gender.MALE); - } - - /** - * Testing equality - */ - @Test() - public void equalsTest() { - Assert.assertTrue(sampleA.equals(sampleA1)); - Assert.assertFalse(sampleA == sampleA1); - Assert.assertFalse(sampleA.equals(sampleB)); - } - - /** - * And hash - */ - @Test() - public void basicHashTest() { - Assert.assertFalse(sampleA.hashCode() == sampleB.hashCode()); - Assert.assertTrue(sampleA.hashCode() == sampleA1.hashCode()); - } - - /** - * Now test the special getter methods - */ - @Test() - public void specialGettersTest() { - Assert.assertTrue(sampleC.getId().equals("sampleC")); - Assert.assertTrue(sampleC.getPopulation().equals("pop1")); - 
Assert.assertTrue(sampleC.isMale()); - Assert.assertFalse(sampleA.isMale()); // sample A doesn't have a gender, so this should be false - } - -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java index 32d3675b7..297a8501a 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java @@ -1,6 +1,5 @@ package org.broadinstitute.sting.gatk.iterators; -import net.sf.picard.filter.SamRecordFilter; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMFileReader; import net.sf.samtools.SAMRecord; @@ -12,7 +11,6 @@ import org.testng.Assert; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; -import org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -29,11 +27,8 @@ import java.util.*; * testing of the LocusIteratorByState */ public class LocusIteratorByStateUnitTest extends BaseTest { - - private final int MAX_READS = 10; private static SAMFileHeader header; private LocusIteratorByState li; - private GenomeLocParser genomeLocParser; @BeforeClass @@ -42,6 +37,11 @@ public class LocusIteratorByStateUnitTest extends BaseTest { genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); } + private final LocusIteratorByState makeLTBS(List reads, ReadProperties readAttributes) { + return new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()), + readAttributes, genomeLocParser, LocusIteratorByState.sampleListForSAMWithoutReadGroups()); + } + @Test 
public void testIndelBaseQualityFiltering() { final byte[] bases = new byte[] {'A','A','A','A','A','A','A','A','A','A'}; @@ -68,7 +68,7 @@ public class LocusIteratorByStateUnitTest extends BaseTest { List reads = Arrays.asList(before,during,after); // create the iterator by state with the fake reads and fake records - li = new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()),readAttributes,genomeLocParser, new SampleDataSource()); + li = makeLTBS(reads,readAttributes); boolean foundExtendedEventPileup = false; while (li.hasNext()) { @@ -120,7 +120,7 @@ public class LocusIteratorByStateUnitTest extends BaseTest { List reads = Arrays.asList(before,during,after); // create the iterator by state with the fake reads and fake records - li = new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()),readAttributes,genomeLocParser, new SampleDataSource()); + li = makeLTBS(reads,readAttributes); boolean foundExtendedEventPileup = false; while (li.hasNext()) { @@ -154,7 +154,7 @@ public class LocusIteratorByStateUnitTest extends BaseTest { List reads = Arrays.asList(indelOnlyRead); // create the iterator by state with the fake reads and fake records - li = new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()),readAttributes,genomeLocParser,new SampleDataSource()); + li = makeLTBS(reads, readAttributes); // Traditionally, reads that end with indels bleed into the pileup at the following locus. Verify that the next pileup contains this read // and considers it to be an indel-containing read. @@ -167,7 +167,7 @@ public class LocusIteratorByStateUnitTest extends BaseTest { // Turn on extended events, and make sure the event is found. 
JVMUtils.setFieldValue(JVMUtils.findField(ReadProperties.class,"generateExtendedEvents"),readAttributes,true); - li = new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()),readAttributes,genomeLocParser,new SampleDataSource()); + li = makeLTBS(reads, readAttributes); Assert.assertTrue(li.hasNext(),"LocusIteratorByState with extended events should contain exactly one pileup"); alignmentContext = li.next(); @@ -203,7 +203,7 @@ public class LocusIteratorByStateUnitTest extends BaseTest { List reads = Arrays.asList(leadingRead,indelOnlyRead,fullMatchAfterIndel); // create the iterator by state with the fake reads and fake records - li = new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()),createTestReadProperties(),genomeLocParser,new SampleDataSource()); + li = makeLTBS(reads, createTestReadProperties()); int currentLocus = firstLocus; int numAlignmentContextsFound = 0; @@ -260,7 +260,7 @@ public class LocusIteratorByStateUnitTest extends BaseTest { List reads = Arrays.asList(leadingRead,indelOnlyRead,fullMatchAfterIndel); // create the iterator by state with the fake reads and fake records - li = new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()),readAttributes,genomeLocParser,new SampleDataSource()); + li = makeLTBS(reads,readAttributes); Assert.assertTrue(li.hasNext(),"Missing first locus at " + firstLocus); AlignmentContext alignmentContext = li.next(); diff --git a/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java new file mode 100644 index 000000000..1601845cd --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java @@ -0,0 +1,353 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software 
without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.samples; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.StringReader; +import java.util.*; + +/** + * UnitTest for PedReader + * + * @author Mark DePristo + * @since 2011 + */ +public class PedReaderUnitTest extends BaseTest { + private static Logger logger = Logger.getLogger(PedReaderUnitTest.class); + + private class PedReaderTest extends TestDataProvider { + public String fileContents; + public List expectedSamples; + EnumSet missing; + + private PedReaderTest(final String name, final List expectedSamples, final String fileContents) { + super(PedReaderTest.class, name); + this.fileContents = fileContents; + this.expectedSamples = expectedSamples; + } + } + +// Family ID +// Individual ID +// Paternal ID +// Maternal ID +// Sex (1=male; 2=female; 
other=unknown) +// Phenotype +// +// -9 missing +// 0 missing +// 1 unaffected +// 2 affected + + @DataProvider(name = "readerTest") + public Object[][] createPEDFiles() { + new PedReaderTest("singleRecordMale", + Arrays.asList(new Sample("kid", "fam1", null, null, Gender.MALE, Affection.UNAFFECTED)), + "fam1 kid 0 0 1 1"); + + new PedReaderTest("singleRecordFemale", + Arrays.asList(new Sample("kid", "fam1", null, null, Gender.FEMALE, Affection.UNAFFECTED)), + "fam1 kid 0 0 2 1"); + + new PedReaderTest("singleRecordMissingGender", + Arrays.asList(new Sample("kid", "fam1", null, null, Gender.UNKNOWN, Affection.UNKNOWN)), + "fam1 kid 0 0 0 0"); + + // Affection + new PedReaderTest("singleRecordAffected", + Arrays.asList(new Sample("kid", "fam1", null, null, Gender.MALE, Affection.AFFECTED)), + "fam1 kid 0 0 1 2"); + + new PedReaderTest("singleRecordUnaffected", + Arrays.asList(new Sample("kid", "fam1", null, null, Gender.MALE, Affection.UNAFFECTED)), + "fam1 kid 0 0 1 1"); + + new PedReaderTest("singleRecordMissingAffection-9", + Arrays.asList(new Sample("kid", "fam1", null, null, Gender.MALE, Affection.UNKNOWN)), + "fam1 kid 0 0 1 -9"); + + new PedReaderTest("singleRecordMissingAffection0", + Arrays.asList(new Sample("kid", "fam1", null, null, Gender.MALE, Affection.UNKNOWN)), + "fam1 kid 0 0 1 0"); + + new PedReaderTest("multipleUnrelated", + Arrays.asList( + new Sample("s1", "fam1", null, null, Gender.MALE, Affection.UNAFFECTED), + new Sample("s2", "fam2", null, null, Gender.FEMALE, Affection.AFFECTED)), + String.format("%s%n%s", + "fam1 s1 0 0 1 1", + "fam2 s2 0 0 2 2")); + + new PedReaderTest("multipleUnrelatedExtraLine", + Arrays.asList( + new Sample("s1", "fam1", null, null, Gender.MALE, Affection.UNAFFECTED), + new Sample("s2", "fam2", null, null, Gender.FEMALE, Affection.AFFECTED)), + String.format("%s%n%s%n %n", // note extra newlines and whitespace + "fam1 s1 0 0 1 1", + "fam2 s2 0 0 2 2")); + + new PedReaderTest("explicitTrio", + Arrays.asList( + new 
Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED), + new Sample("dad", "fam1", null, null, Gender.MALE, Affection.UNAFFECTED), + new Sample("mom", "fam1", null, null, Gender.FEMALE, Affection.AFFECTED)), + String.format("%s%n%s%n%s", + "fam1 kid dad mom 1 2", + "fam1 dad 0 0 1 1", + "fam1 mom 0 0 2 2")); + + new PedReaderTest("implicitTrio", + Arrays.asList( + new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED), + new Sample("dad", "fam1", null, null, Gender.MALE, Affection.UNKNOWN), + new Sample("mom", "fam1", null, null, Gender.FEMALE, Affection.UNKNOWN)), + "fam1 kid dad mom 1 2"); + + new PedReaderTest("partialTrio", + Arrays.asList( + new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED), + new Sample("dad", "fam1", null, null, Gender.MALE, Affection.UNAFFECTED), + new Sample("mom", "fam1", null, null, Gender.FEMALE, Affection.UNKNOWN)), + String.format("%s%n%s", + "fam1 kid dad mom 1 2", + "fam1 dad 0 0 1 1")); + + new PedReaderTest("bigPedigree", + Arrays.asList( + new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED), + new Sample("dad", "fam1", "granddad1", "grandma1", Gender.MALE, Affection.UNAFFECTED), + new Sample("granddad1", "fam1", null, null, Gender.MALE, Affection.UNKNOWN), + new Sample("grandma1", "fam1", null, null, Gender.FEMALE, Affection.UNKNOWN), + new Sample("mom", "fam1", "granddad2", "grandma2", Gender.FEMALE, Affection.AFFECTED), + new Sample("granddad2", "fam1", null, null, Gender.MALE, Affection.UNKNOWN), + new Sample("grandma2", "fam1", null, null, Gender.FEMALE, Affection.UNKNOWN)), + String.format("%s%n%s%n%s", + "fam1 kid dad mom 1 2", + "fam1 dad granddad1 grandma1 1 1", + "fam1 mom granddad2 grandma2 2 2")); + + // Quantitative trait + new PedReaderTest("OtherPhenotype", + Arrays.asList( + new Sample("s1", "fam1", null, null, Gender.MALE, Affection.OTHER, "1"), + new Sample("s2", "fam2", null, null, Gender.FEMALE, Affection.OTHER, "10.0")), + 
String.format("%s%n%s", + "fam1 s1 0 0 1 1", + "fam2 s2 0 0 2 10.0")); + + new PedReaderTest("OtherPhenotypeWithMissing", + Arrays.asList( + new Sample("s1", "fam1", null, null, Gender.MALE, Affection.UNKNOWN, Sample.UNSET_QT), + new Sample("s2", "fam2", null, null, Gender.FEMALE, Affection.OTHER, "10.0")), + String.format("%s%n%s", + "fam1 s1 0 0 1 -9", + "fam2 s2 0 0 2 10.0")); + + new PedReaderTest("OtherPhenotypeOnlyInts", + Arrays.asList( + new Sample("s1", "fam1", null, null, Gender.MALE, Affection.OTHER, "1"), + new Sample("s2", "fam2", null, null, Gender.FEMALE, Affection.OTHER, "10")), + String.format("%s%n%s", + "fam1 s1 0 0 1 1", + "fam2 s2 0 0 2 10")); + + return PedReaderTest.getTests(PedReaderTest.class); + } + + private static final void runTest(PedReaderTest test, String myFileContents, EnumSet missing) { + logger.warn("Test " + test); + PedReader reader = new PedReader(); + SampleDB sampleDB = new SampleDB(); + List readSamples = reader.parse(myFileContents, missing, sampleDB); + Assert.assertEquals(new HashSet(test.expectedSamples), new HashSet(readSamples)); + } + + @Test(enabled = true, dataProvider = "readerTest") + public void testPedReader(PedReaderTest test) { + runTest(test, test.fileContents, EnumSet.noneOf(PedReader.MissingPedField.class)); + } + + @Test(enabled = true, dataProvider = "readerTest") + public void testPedReaderWithComments(PedReaderTest test) { + runTest(test, String.format("#comment%n%s", test.fileContents), EnumSet.noneOf(PedReader.MissingPedField.class)); + } + + @Test(enabled = true, dataProvider = "readerTest") + public void testPedReaderWithSemicolons(PedReaderTest test) { + runTest(test, + test.fileContents.replace(String.format("%n"), ";"), + EnumSet.noneOf(PedReader.MissingPedField.class)); + } + + // ----------------------------------------------------------------- + // missing format field tests + // ----------------------------------------------------------------- + + private class PedReaderTestMissing extends 
TestDataProvider { + public EnumSet missingDesc; + public EnumSet missingFields; + public final String fileContents; + public Sample expected; + + + private PedReaderTestMissing(final String name, final String fileContents, + EnumSet missingDesc, + EnumSet missingFields, + final Sample expected) { + super(PedReaderTestMissing.class, name); + this.fileContents = fileContents; + this.missingDesc = missingDesc; + this.missingFields = missingFields; + this.expected = expected; + } + } + + @DataProvider(name = "readerTestMissing") + public Object[][] createPEDFilesWithMissing() { + new PedReaderTestMissing("missingFam", + "fam1 kid dad mom 1 2", + EnumSet.of(PedReader.MissingPedField.NO_FAMILY_ID), + EnumSet.of(PedReader.Field.FAMILY_ID), + new Sample("kid", null, "dad", "mom", Gender.MALE, Affection.AFFECTED)); + + new PedReaderTestMissing("missingParents", + "fam1 kid dad mom 1 2", + EnumSet.of(PedReader.MissingPedField.NO_PARENTS), + EnumSet.of(PedReader.Field.PATERNAL_ID, PedReader.Field.MATERNAL_ID), + new Sample("kid", "fam1", null, null, Gender.MALE, Affection.AFFECTED)); + + new PedReaderTestMissing("missingSex", + "fam1 kid dad mom 1 2", + EnumSet.of(PedReader.MissingPedField.NO_SEX), + EnumSet.of(PedReader.Field.GENDER), + new Sample("kid", "fam1", "dad", "mom", Gender.UNKNOWN, Affection.AFFECTED)); + + new PedReaderTestMissing("missingPhenotype", + "fam1 kid dad mom 1 2", + EnumSet.of(PedReader.MissingPedField.NO_PHENOTYPE), + EnumSet.of(PedReader.Field.PHENOTYPE), + new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.UNKNOWN)); + + new PedReaderTestMissing("missingEverythingButGender", + "fam1 kid dad mom 1 2", + EnumSet.of(PedReader.MissingPedField.NO_PHENOTYPE, PedReader.MissingPedField.NO_PARENTS, PedReader.MissingPedField.NO_FAMILY_ID), + EnumSet.of(PedReader.Field.FAMILY_ID, PedReader.Field.PATERNAL_ID, PedReader.Field.MATERNAL_ID, PedReader.Field.PHENOTYPE), + new Sample("kid", null, null, null, Gender.MALE, Affection.UNKNOWN)); + + + return 
PedReaderTestMissing.getTests(PedReaderTestMissing.class); + } + + @Test(enabled = true, dataProvider = "readerTestMissing") + public void testPedReaderWithMissing(PedReaderTestMissing test) { + final String contents = sliceContents(test.missingFields, test.fileContents); + logger.warn("Test " + test); + PedReader reader = new PedReader(); + SampleDB sampleDB = new SampleDB(); + reader.parse(new StringReader(contents), test.missingDesc, sampleDB); + final Sample missingSample = sampleDB.getSample("kid"); + Assert.assertEquals(test.expected, missingSample, "Missing field value not expected value for " + test); + } + + private final static String sliceContents(EnumSet missingFieldsSet, String full) { + List parts = new ArrayList(Arrays.asList(full.split("\\s+"))); + final List missingFields = new ArrayList(missingFieldsSet); + Collections.reverse(missingFields); + for ( PedReader.Field field : missingFields ) + parts.remove(field.ordinal()); + return Utils.join("\t", parts); + } + + // ----------------------------------------------------------------- + // parsing tags + // ----------------------------------------------------------------- + + private class PedReaderTestTagParsing extends TestDataProvider { + public EnumSet expected; + public final List tags; + + private PedReaderTestTagParsing(final List tags, EnumSet missingDesc) { + super(PedReaderTestTagParsing.class); + this.tags = tags; + this.expected = missingDesc; + } + } + + @DataProvider(name = "readerTestTagParsing") + public Object[][] createReaderTestTagParsing() { + new PedReaderTestTagParsing( + Collections.emptyList(), + EnumSet.noneOf(PedReader.MissingPedField.class)); + + new PedReaderTestTagParsing( + Arrays.asList("NO_FAMILY_ID"), + EnumSet.of(PedReader.MissingPedField.NO_FAMILY_ID)); + + new PedReaderTestTagParsing( + Arrays.asList("NO_PARENTS"), + EnumSet.of(PedReader.MissingPedField.NO_PARENTS)); + + new PedReaderTestTagParsing( + Arrays.asList("NO_PHENOTYPE"), + 
EnumSet.of(PedReader.MissingPedField.NO_PHENOTYPE)); + + new PedReaderTestTagParsing( + Arrays.asList("NO_SEX"), + EnumSet.of(PedReader.MissingPedField.NO_SEX)); + + new PedReaderTestTagParsing( + Arrays.asList("NO_SEX", "NO_PHENOTYPE"), + EnumSet.of(PedReader.MissingPedField.NO_SEX, PedReader.MissingPedField.NO_PHENOTYPE)); + + new PedReaderTestTagParsing( + Arrays.asList("NO_SEX", "NO_PHENOTYPE", "NO_PARENTS"), + EnumSet.of(PedReader.MissingPedField.NO_SEX, PedReader.MissingPedField.NO_PHENOTYPE, PedReader.MissingPedField.NO_PARENTS)); + + return PedReaderTestTagParsing.getTests(PedReaderTestTagParsing.class); + } + + @Test(enabled = true, dataProvider = "readerTestTagParsing") + public void testPedReaderTagParsing(PedReaderTestTagParsing test) { + EnumSet parsed = PedReader.parseMissingFieldTags("test", test.tags); + Assert.assertEquals(test.expected, parsed, "Failed to properly parse tags " + test.tags); + } + + @Test(enabled = true, expectedExceptions = UserException.class) + public void testPedReaderTagParsing1() { + EnumSet parsed = PedReader.parseMissingFieldTags("test", Arrays.asList("XXX")); + } + + @Test(enabled = true, expectedExceptions = UserException.class) + public void testPedReaderTagParsing2() { + EnumSet parsed = PedReader.parseMissingFieldTags("test", Arrays.asList("NO_SEX", "XXX")); + } +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java new file mode 100644 index 000000000..d498ee61a --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java @@ -0,0 +1,157 @@ +package org.broadinstitute.sting.gatk.samples; + +import net.sf.samtools.SAMFileHeader; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.testng.Assert; +import 
org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.*; + +/** + * Created by IntelliJ IDEA. + * User: brett + * Date: Sep 9, 2010 + * Time: 8:21:00 AM + */ +public class SampleDBUnitTest extends BaseTest { + private static SampleDBBuilder builder; + // all the test sample files are located here + private File testPED = new File(testDir + "ceutrio.ped"); + + private static final Set testPEDSamples = new HashSet(Arrays.asList( + new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED), + new Sample("dad", "fam1", null, null, Gender.MALE, Affection.UNAFFECTED), + new Sample("mom", "fam1", null, null, Gender.FEMALE, Affection.AFFECTED))); + + private static final Set testSAMSamples = new HashSet(Arrays.asList( + new Sample("kid", null, null, null, Gender.UNKNOWN, Affection.UNKNOWN), + new Sample("mom", null, null, null, Gender.UNKNOWN, Affection.UNKNOWN), + new Sample("dad", null, null, null, Gender.UNKNOWN, Affection.UNKNOWN))); + + private static final String testPEDString = + String.format("%s%n%s%n%s", + "fam1 kid dad mom 1 2", + "fam1 dad 0 0 1 1", + "fam1 mom 0 0 2 2"); + + private static final String testPEDMultipleFamilies = + String.format("%s%n%s%n%s%n%s%n%s", + "fam1 kid dad mom 1 2", + "fam1 dad 0 0 1 1", + "fam1 mom 0 0 2 2", + "fam3 s1 d1 m1 2 2", + "fam2 s2 d2 m2 2 2"); + + private static final String testPEDStringInconsistentGender = + "fam1 kid 0 0 2 2"; + + private static final Set testPEDSamplesAsSet = + new HashSet(testPEDSamples); + + + @BeforeMethod + public void before() { + builder = new SampleDBBuilder(PedigreeValidationType.STRICT); + } + + @Test() + public void loadPEDFile() { + builder.addSamplesFromPedigreeFiles(Arrays.asList(testPED)); + SampleDB db = builder.getFinalSampleDB(); + Assert.assertEquals(testPEDSamplesAsSet, db.getSamples()); + } + + @Test() + public void loadPEDString() { + builder.addSamplesFromPedigreeStrings(Arrays.asList(testPEDString)); + 
SampleDB db = builder.getFinalSampleDB(); + Assert.assertEquals(testPEDSamplesAsSet, db.getSamples()); + } + + private static final void addSAMHeader() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10); + ArtificialSAMUtils.createEnumeratedReadGroups(header, Arrays.asList("1", "2", "3"), + Arrays.asList("kid", "mom", "dad")); + builder.addSamplesFromSAMHeader(header); + } + + @Test() + public void loadSAMHeader() { + addSAMHeader(); + SampleDB db = builder.getFinalSampleDB(); + Assert.assertEquals(testSAMSamples, db.getSamples()); + } + + @Test() + public void loadSAMHeaderPlusPED() { + addSAMHeader(); + builder.addSamplesFromPedigreeFiles(Arrays.asList(testPED)); + SampleDB db = builder.getFinalSampleDB(); + Assert.assertEquals(testPEDSamples, db.getSamples()); + } + + @Test() + public void loadDuplicateData() { + builder.addSamplesFromPedigreeFiles(Arrays.asList(testPED)); + builder.addSamplesFromPedigreeFiles(Arrays.asList(testPED)); + SampleDB db = builder.getFinalSampleDB(); + Assert.assertEquals(testPEDSamples, db.getSamples()); + } + + @Test(expectedExceptions = UserException.class) + public void loadNonExistentFile() { + builder.addSamplesFromPedigreeFiles(Arrays.asList(new File("non-existence-file.txt"))); + SampleDB db = builder.getFinalSampleDB(); + Assert.assertEquals(testSAMSamples, db.getSamples()); + } + + @Test(expectedExceptions = UserException.class) + public void loadInconsistentData() { + builder = new SampleDBBuilder(PedigreeValidationType.STRICT); + builder.addSamplesFromPedigreeFiles(Arrays.asList(testPED)); + builder.addSamplesFromPedigreeStrings(Arrays.asList(testPEDStringInconsistentGender)); + builder.getFinalSampleDB(); + } + + @Test(expectedExceptions = UserException.class) + public void sampleInSAMHeaderNotInSamplesDB() { + addSAMHeader(); + builder.addSamplesFromPedigreeStrings(Arrays.asList(testPEDStringInconsistentGender)); + builder.getFinalSampleDB(); + } + + @Test() + public void getFamilyIDs() { + 
builder.addSamplesFromPedigreeStrings(Arrays.asList(testPEDMultipleFamilies)); + SampleDB db = builder.getFinalSampleDB(); + Assert.assertEquals(db.getFamilyIDs(), new TreeSet(Arrays.asList("fam1", "fam2", "fam3"))); + } + + @Test() + public void getFamily() { + builder.addSamplesFromPedigreeStrings(Arrays.asList(testPEDMultipleFamilies)); + SampleDB db = builder.getFinalSampleDB(); + Assert.assertEquals(db.getFamily("fam1"), testPEDSamplesAsSet); + } + + @Test() + public void loadFamilyIDs() { + builder.addSamplesFromPedigreeStrings(Arrays.asList(testPEDMultipleFamilies)); + SampleDB db = builder.getFinalSampleDB(); + Map> families = db.getFamilies(); + Assert.assertEquals(families.size(), 3); + Assert.assertEquals(families.keySet(), new TreeSet(Arrays.asList("fam1", "fam2", "fam3"))); + + for ( final String famID : families.keySet() ) { + final Set fam = families.get(famID); + Assert.assertEquals(fam.size(), 3); + for ( final Sample sample : fam ) { + Assert.assertEquals(sample.getFamilyID(), famID); + } + } + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java new file mode 100644 index 000000000..3af40adbe --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java @@ -0,0 +1,64 @@ +package org.broadinstitute.sting.gatk.samples; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +/** + * + */ +public class SampleUnitTest extends BaseTest { + SampleDB db; + static Sample fam1A, fam1B, fam1C; + static Sample s1, s2; + static Sample trait1, trait2, trait3, trait4, trait5; + + @BeforeClass + public void init() { + db = new SampleDB(); + + fam1A = new Sample("1A", db, "fam1", "1B", "1C", Gender.UNKNOWN); + fam1B = new Sample("1B", db, "fam1", null, null, Gender.MALE); + fam1C = new Sample("1C", db, "fam1", null, 
null, Gender.FEMALE); + + s1 = new Sample("s1", db); + s2 = new Sample("s2", db); + + trait1 = new Sample("t1", db, Affection.AFFECTED, Sample.UNSET_QT); + trait2 = new Sample("t2", db, Affection.UNAFFECTED, Sample.UNSET_QT); + trait3 = new Sample("t3", db, Affection.UNKNOWN, Sample.UNSET_QT); + trait4 = new Sample("t4", db, Affection.OTHER, "1.0"); + trait5 = new Sample("t4", db, Affection.OTHER, "CEU"); + } + + /** + * Now basic getters + */ + @Test() + public void normalGettersTest() { + Assert.assertEquals("1A", fam1A.getID()); + Assert.assertEquals("fam1", fam1A.getFamilyID()); + Assert.assertEquals("1B", fam1A.getPaternalID()); + Assert.assertEquals("1C", fam1A.getMaternalID()); + Assert.assertEquals(null, fam1B.getPaternalID()); + Assert.assertEquals(null, fam1B.getMaternalID()); + + Assert.assertEquals(Affection.AFFECTED, trait1.getAffection()); + Assert.assertEquals(Sample.UNSET_QT, trait1.getOtherPhenotype()); + Assert.assertEquals(Affection.UNAFFECTED, trait2.getAffection()); + Assert.assertEquals(Sample.UNSET_QT, trait2.getOtherPhenotype()); + Assert.assertEquals(Affection.UNKNOWN, trait3.getAffection()); + Assert.assertEquals(Sample.UNSET_QT, trait3.getOtherPhenotype()); + Assert.assertEquals(Affection.OTHER, trait4.getAffection()); + Assert.assertEquals("1.0", trait4.getOtherPhenotype()); + Assert.assertEquals("CEU", trait5.getOtherPhenotype()); + } + + @Test() + public void testGenders() { + Assert.assertTrue(fam1A.getGender() == Gender.UNKNOWN); + Assert.assertTrue(fam1B.getGender() == Gender.MALE); + Assert.assertTrue(fam1C.getGender() == Gender.FEMALE); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java index 59ac1a41e..646fb5e77 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java +++ 
b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java @@ -57,11 +57,11 @@ public class DepthOfCoverageIntegrationTest extends WalkerTest { // now add the expected files that get generated spec.addAuxFile("423571e4c05e7934322172654ac6dbb7", baseOutputFile); spec.addAuxFile("9df5e7e07efeb34926c94a724714c219", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_cumulative_coverage_counts")); - spec.addAuxFile("b9a7748e5aec4dc06daed893c901c00d", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_cumulative_coverage_proportions")); + spec.addAuxFile("229b9b5bc2141c86dbc69c8acc9eba6a", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_cumulative_coverage_proportions")); spec.addAuxFile("9cd395f47b329b9dd00ad024fcac9929", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_interval_statistics")); - spec.addAuxFile("aec669d64d9dd652dd088a5341835ea5", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_interval_summary")); - spec.addAuxFile("f6dbd74d32a48abe71ce08d300bce983", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_statistics")); - spec.addAuxFile("e3a3467ed259ee3680f8d01980f525b7", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_summary")); + spec.addAuxFile("471c34ad2e4f7228efd20702d5941ba9", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_interval_summary")); + spec.addAuxFile("9667c77284c2c08e647b162d0e9652d4", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_statistics")); + spec.addAuxFile("5a96c75f96d6fa6ee617451d731dae37", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_summary")); spec.addAuxFile("b82846df660f0aac8429aec57c2a62d6", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_cumulative_coverage_counts")); spec.addAuxFile("d32a8c425fadcc4c048bd8b48d0f61e5", 
createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_cumulative_coverage_proportions")); spec.addAuxFile("7b9d0e93bf5b5313995be7010ef1f528", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_interval_statistics")); @@ -69,11 +69,11 @@ public class DepthOfCoverageIntegrationTest extends WalkerTest { spec.addAuxFile("e70952f241eebb9b5448f2e7cb288131", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_statistics")); spec.addAuxFile("054ed1e184f46d6a170dc9bf6524270c", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_summary")); spec.addAuxFile("d53431022f7387fe9ac47814ab1fcd88", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_cumulative_coverage_counts")); - spec.addAuxFile("650ee3714da7fbad7832c9d4ad49eb51", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_cumulative_coverage_proportions")); + spec.addAuxFile("a395dafde101971d2b9e5ddb6cd4b7d0", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_cumulative_coverage_proportions")); spec.addAuxFile("df0ba76e0e6082c0d29fcfd68efc6b77", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_statistics")); - spec.addAuxFile("7dcac2e8962c778081486332a4576dc3", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_summary")); - spec.addAuxFile("a50011571334f17e950ad3ed1149e350", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_statistics")); - spec.addAuxFile("6f3260504295695d765af639539585c9", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_summary")); + spec.addAuxFile("e013cb5b11b0321a81c8dbd7c1863787", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_summary")); + spec.addAuxFile("661160f571def8c323345b5859cfb9da", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_statistics")); + spec.addAuxFile("c95a7a6840334cadd0e520939615c77b", 
createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_summary")); execute("testBaseOutputNoFiltering",spec); } @@ -90,7 +90,7 @@ public class DepthOfCoverageIntegrationTest extends WalkerTest { spec.setOutputFileLocation(baseOutputFile); spec.addAuxFile("6ccd7d8970ba98cb95fe41636a070c1c",baseOutputFile); - spec.addAuxFile("0ee40f3e5091536c14e077b77557083a",createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_interval_summary")); + spec.addAuxFile("7d87783b3d98b928cac16d383ceca807",createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_interval_summary")); execute("testNoCoverageDueToFiltering",spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 488b3ccd9..07b2f0566 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -29,7 +29,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("e6639ea2dc81635c706e6c35921406d7")); + Arrays.asList("b27939251539439a382538e507e03507")); executeTest("test MultiSample Pilot1", spec); } @@ -280,7 +280,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec( baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation + "phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1, - 
Arrays.asList("4be308fd9e8167ebee677f62a7a753b7")); + Arrays.asList("37e891bf1ac40caec9ea228f39c27e44")); executeTest("test MultiSample 1000G Phase1 indels with complicated records emitting all sites", spec4); } diff --git a/public/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java index c14fea24c..cc0007439 100755 --- a/public/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java @@ -7,7 +7,6 @@ import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.testng.Assert; -import org.testng.annotations.BeforeClass; import org.testng.annotations.BeforeTest; import org.testng.annotations.Test; diff --git a/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java index d8695cf38..cd0de27f5 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java @@ -25,16 +25,14 @@ package org.broadinstitute.sting.utils.clipreads; -import net.sf.samtools.*; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; -import java.util.LinkedList; -import java.util.List; - /** * Created by IntelliJ IDEA. 
* User: roger diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java index 2fab1f287..379d79c84 100644 --- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java @@ -76,7 +76,7 @@ public class IntervalIntegrationTest extends WalkerTest { // our base file File baseOutputFile = createTempFile("testUnmappedReadInclusion",".bam"); spec.setOutputFileLocation(baseOutputFile); - spec.addAuxFile("99c266d777e2e167b8153c858c305fda",createTempFileFromBase(baseOutputFile.getAbsolutePath())); + spec.addAuxFile("748a38ed5eb0a043dfc7b82f0d1e8063",createTempFileFromBase(baseOutputFile.getAbsolutePath())); spec.addAuxFile("fadcdf88597b9609c5f2a17f4c6eb455", createTempFileFromBase(baseOutputFile.getAbsolutePath().substring(0,baseOutputFile.getAbsolutePath().indexOf(".bam"))+".bai")); executeTest("testUnmappedReadInclusion",spec); @@ -96,8 +96,8 @@ public class IntervalIntegrationTest extends WalkerTest { // our base file File baseOutputFile = createTempFile("testUnmappedReadExclusion",".bam"); spec.setOutputFileLocation(baseOutputFile); - spec.addAuxFile("8236f0b2df5a692e54751b08bc3836fa",createTempFileFromBase(baseOutputFile.getAbsolutePath())); - spec.addAuxFile("651b42456d31ba24e913297b71b32143", createTempFileFromBase(baseOutputFile.getAbsolutePath().substring(0,baseOutputFile.getAbsolutePath().indexOf(".bam"))+".bai")); + spec.addAuxFile("80887ba488e53dabd9596ff93070ae75",createTempFileFromBase(baseOutputFile.getAbsolutePath())); + spec.addAuxFile("b341d808ecc33217f37c0c0cde2a3e2f", createTempFileFromBase(baseOutputFile.getAbsolutePath().substring(0,baseOutputFile.getAbsolutePath().indexOf(".bam"))+".bai")); executeTest("testUnmappedReadExclusion",spec); } diff --git 
a/public/java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java index fb479ab47..b07da7cc8 100644 --- a/public/java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java @@ -28,7 +28,6 @@ import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMReadGroupRecord; import net.sf.samtools.SAMRecord; import org.testng.Assert; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.testng.annotations.Test; @@ -141,9 +140,9 @@ public class ReadBackedPileupUnitTest { ReadBackedPileupImpl sample2Pileup = new ReadBackedPileupImpl(null, Arrays.asList(read2,read4), Arrays.asList(1,1)); - Map sampleToPileupMap = new HashMap(); - sampleToPileupMap.put(new Sample(readGroupOne.getSample()),sample1Pileup); - sampleToPileupMap.put(new Sample(readGroupTwo.getSample()),sample2Pileup); + Map sampleToPileupMap = new HashMap(); + sampleToPileupMap.put(readGroupOne.getSample(),sample1Pileup); + sampleToPileupMap.put(readGroupTwo.getSample(),sample2Pileup); ReadBackedPileup compositePileup = new ReadBackedPileupImpl(null,sampleToPileupMap); @@ -164,13 +163,13 @@ public class ReadBackedPileupUnitTest { @Test public void testGetPileupForSample() { - Sample sample1 = new Sample("sample1"); - Sample sample2 = new Sample("sample2"); + String sample1 = "sample1"; + String sample2 = "sample2"; SAMReadGroupRecord readGroupOne = new SAMReadGroupRecord("rg1"); - readGroupOne.setSample(sample1.getId()); + readGroupOne.setSample(sample1); SAMReadGroupRecord readGroupTwo = new SAMReadGroupRecord("rg2"); - readGroupTwo.setSample(sample2.getId()); + readGroupTwo.setSample(sample2); SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1,1,1000); header.addReadGroup(readGroupOne); @@ 
-181,24 +180,20 @@ public class ReadBackedPileupUnitTest { SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header,"read2",0,1,10); read2.setAttribute("RG",readGroupTwo.getId()); - Map sampleToPileupMap = new HashMap(); + Map sampleToPileupMap = new HashMap(); sampleToPileupMap.put(sample1,new ReadBackedPileupImpl(null,Collections.singletonList(read1),0)); sampleToPileupMap.put(sample2,new ReadBackedPileupImpl(null,Collections.singletonList(read2),0)); ReadBackedPileup pileup = new ReadBackedPileupImpl(null,sampleToPileupMap); - ReadBackedPileup sample1Pileup = pileup.getPileupForSample(sample1); - Assert.assertEquals(sample1Pileup.size(),1,"Sample 1 pileup has wrong number of elements"); - Assert.assertEquals(sample1Pileup.getReads().get(0),read1,"Sample 1 pileup has incorrect read"); - - ReadBackedPileup sample2Pileup = pileup.getPileupForSampleName(sample2.getId()); + ReadBackedPileup sample2Pileup = pileup.getPileupForSample(sample2); Assert.assertEquals(sample2Pileup.size(),1,"Sample 2 pileup has wrong number of elements"); Assert.assertEquals(sample2Pileup.getReads().get(0),read2,"Sample 2 pileup has incorrect read"); - ReadBackedPileup missingSamplePileup = pileup.getPileupForSample(new Sample("missing")); + ReadBackedPileup missingSamplePileup = pileup.getPileupForSample("missing"); Assert.assertNull(missingSamplePileup,"Pileup for sample 'missing' should be null but isn't"); - missingSamplePileup = pileup.getPileupForSampleName("not here"); + missingSamplePileup = pileup.getPileupForSample("not here"); Assert.assertNull(missingSamplePileup,"Pileup for sample 'not here' should be null but isn't"); } } diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VCFJarClassLoadingUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VCFJarClassLoadingUnitTest.java new file mode 100644 index 000000000..50eebe179 --- /dev/null +++ 
b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VCFJarClassLoadingUnitTest.java @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.variantcontext; + +import org.testng.annotations.Test; + +import java.io.File; +import java.net.MalformedURLException; +import java.net.URI; +import java.net.URL; +import java.net.URLClassLoader; + +/** + * Test to ensure that, given only the VCF jar and its expected dependencies, core VCF classes will load. 
+ */ +public class VCFJarClassLoadingUnitTest { + @Test + public void testVCFJarClassLoading() throws ClassNotFoundException, MalformedURLException { + URI vcfURI = new File("dist/vcf.jar").toURI(); + URI tribbleURI = new File("lib/tribble-24.jar").toURI(); + + ClassLoader classLoader = new URLClassLoader(new URL[] {vcfURI.toURL(),tribbleURI.toURL()}, null); + classLoader.loadClass("org.broadinstitute.sting.utils.variantcontext.VariantContext"); + classLoader.loadClass("org.broadinstitute.sting.utils.codecs.vcf.VCFCodec"); + classLoader.loadClass("org.broadinstitute.sting.utils.codecs.vcf.VCF3Codec"); + classLoader.loadClass("org.broadinstitute.sting.utils.codecs.vcf.VCFWriter"); + classLoader.loadClass("org.broadinstitute.sting.utils.codecs.vcf.StandardVCFWriter"); + } +} diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/StandardVariantEvaluation.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/StandardVariantEvaluation.scala deleted file mode 100755 index d333e1dc0..000000000 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/StandardVariantEvaluation.scala +++ /dev/null @@ -1,202 +0,0 @@ -package org.broadinstitute.sting.queue.qscripts - -import org.broadinstitute.sting.queue.QScript -import org.broadinstitute.sting.queue.extensions.gatk.RodBind -import org.broadinstitute.sting.queue.extensions.gatk._ - -class StandardVariantEvaluation extends QScript { - // todo -- update to released version when things stabilize - @Argument(doc="gatkJarFile", required=false) - var gatkJarFile: File = new File("/home/radon01/depristo/dev/GenomeAnalysisTKFromLaptop/trunk/dist/GenomeAnalysisTK.jar") - - @Argument(shortName = "R", doc="B37 reference sequence: defaults to broad standard location", required=false) - var referenceFile: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta") - - @Argument(shortName = "intervals", doc="intervals to evaluate. 
Only supports evaluation on chromosome 20 now, as most evaluation data is there", required=false) - val TARGET_INTERVAL: String = "20" - - @Argument(shortName = "includeUnion", doc="If provided, we'll create a union of the evaluation data sets for evaluation", required=false) - val CREATE_UNION: Boolean = false - - @Argument(shortName = "dataDir", doc="Path to the standard evaluation data files", required=false) - val DATA_DIR = "/humgen/gsa-hpprojects/GATK/data/Comparisons/StandardForEvaluation/b37/" - - @Argument(shortName = "evalStandard1000GCalls", doc="If provided, we'll include some standard 1000G data for evaluation", required=false) - val EVAL_STANDARD_1000G_CALLS: Boolean = false - - val COMPS_DIR = DATA_DIR + "/comps/" - val EVALS_DIR = DATA_DIR + "/evals/" - - @Argument(shortName = "moreSNPsToEval", doc="Path to additional SNP call sets for evaluation", required=false) - val moreSNPsToEval: List[File] = Nil - - @Argument(shortName = "moreIndelsToEval", doc="Path to additional Indel call sets for evaluation", required=false) - val moreIndelsToEval: List[File] = Nil - - val VARIANT_TYPES: List[String] = List("indels", "snps") - val VARIANT_TYPE_VT: Map[String, List[org.broad.tribble.util.variantcontext.VariantContext.Type]] = Map( - "indels" -> List(org.broad.tribble.util.variantcontext.VariantContext.Type.INDEL, org.broad.tribble.util.variantcontext.VariantContext.Type.MIXED, org.broad.tribble.util.variantcontext.VariantContext.Type.NO_VARIATION), - "snps" -> List(org.broad.tribble.util.variantcontext.VariantContext.Type.SNP, org.broad.tribble.util.variantcontext.VariantContext.Type.NO_VARIATION) - ) - - val SITES_DIR: String = "sitesFiles" - - // path to b37 DBSNP - @Argument(shortName = "dbsnp", doc="Path to DBSNP **VCF** for evaluation", required=false) - val MY_DBSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_129_b37.leftAligned.vcf") - //val MY_DBSNP: File = new 
File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf"); - - class Comp(val name: String, val evalType: String, val filename: String, val MakeHomVar: Boolean = false) { - val originalFile = new File(COMPS_DIR + filename) - val file: File = if ( MakeHomVar ) swapExt(originalFile, ".vcf",".homvar.vcf") else originalFile - val sitesFile = new File(SITES_DIR + "/" + swapExt(file, ".vcf", ".sites.vcf").getName) - } - - class Eval(val name: String, val evalType: String, val filename: String, val overrideFile: File = null ) { - val file: File = if ( overrideFile != null ) overrideFile else new File(EVALS_DIR + "/" + filename) - } - - var COMPS: List[Comp] = Nil - def addComp(comp: Comp) { COMPS = comp :: COMPS } - - var EVALS: List[Eval] = Nil - def addEval(eval: Eval) { EVALS = eval :: EVALS } - def addEvalFromCMD(file: File, t: String) { addEval(new Eval(file.getName, t, null, file)) } - - trait UNIVERSAL_GATK_ARGS extends CommandLineGATK { - this.logging_level = "INFO"; - this.jarFile = gatkJarFile; - this.intervalsString = List(TARGET_INTERVAL); - this.reference_sequence = referenceFile; - this.memoryLimit = 2 - } - - def initializeStandardDataFiles() = { - // - // Standard evaluation files for indels - // - addComp(new Comp("NA12878.homvar.GATK", "indels", "Indels.NA12878_WGS.filtered_Q50.0_QD5.0_SB-1.0_HR18.vcf", true)) - addComp(new Comp("CG.38samples", "indels", "CG.Indels.leftAligned.b37.vcf")) - addComp(new Comp("NA12878.homvar.CG", "indels", "NA12878.CG.b37.indels.vcf", true)) - addComp(new Comp("g1k.pilot1.validation", "indels", "pilot1_indel_validation_2009.b37.vcf")) - addComp(new Comp("NA12878.hand_curated", "indels", "NA12878.validated.curated.polymorphic.indels.vcf")) - addComp(new Comp("NA12878.Mullikin", "indels", "NA12878.DIPline.NQScm.expanded.chr20.b37.minReads_2_or_gt2bp.vcf")) - - - // - // INDEL call sets - // - if ( EVAL_STANDARD_1000G_CALLS ) { - addEval(new Eval("dindel", "indels", "20110208.chr20.dindel2.EUR.sites.vcf")) - 
addEval(new Eval("si", "indels", "20101123.chr20.si.v2.EUR.sites.vcf")) - addEval(new Eval("gatk", "indels", "EUR.phase1.chr20.broad.filtered.indels.sites.vcf")) - } - - // - // Standard evaluation files for SNPs - // - addComp(new Comp("NA12878.homvar.GATK", "snps", "NA12878.HiSeq19.cut.vcf", true)) - addComp(new Comp("CG.38samples", "snps", "CG.38samples.b37.vcf")) - addComp(new Comp("NA12878.homvar.CG", "snps", "NA12878.CG.b37.snps.vcf", true)) - addComp(new Comp("HapMap3.3", "snps", "hapmap3.3.sites_r27_nr.b37_fwd.vcf")) - addComp(new Comp("OMNI.2.5M", "snps", "omni2.5.1212samples.b37.sites.chr20.monoAreAC0.vcf")) - addComp(new Comp("g1k.pilot1.validation", "snps", "1000G.snp.validation.b37.vcf")) - - // - // SNP call sets - // - if ( EVAL_STANDARD_1000G_CALLS ) { - addEval(new Eval("1000G.gatk.eurPlus.phase1", "snps", "EUR+.phase1.chr20.broad.recal.vrcut1p0.sites.vcf")) - addEval(new Eval("1000G.high_specificity.phase1", "snps", "ALL.phase1.chr20.projectConsensus.highSpecificity.snps.genotypes.sites.vcf")) - } - } - - def script = { - val sitesDir = new File(SITES_DIR) - if ( ! 
sitesDir.exists ) sitesDir.mkdirs() - - initializeStandardDataFiles(); - - // add additional files for evaluation, if necessary - moreSNPsToEval.foreach(addEvalFromCMD(_, "snps")) - moreIndelsToEval.foreach(addEvalFromCMD(_, "indels")) - - // - // create hom-var versions of key files - // - for ( comp <- COMPS ) - if ( comp.MakeHomVar ) - add(new SelectHomVars(comp.originalFile, comp.file)) - - for ( comp <- COMPS ) - add(new JustSites(comp.file, comp.sitesFile)) - - // - // Loop over evaluation types - // - for ( evalType <- VARIANT_TYPES ) { - var evalsOfType = EVALS.filter(_.evalType == evalType) - val compsOfType = COMPS.filter(_.evalType == evalType) - - if ( evalsOfType.size > 0 ) { - - // if desired and possible, create a union.X.vcf file - if ( CREATE_UNION && evalsOfType.size > 1 ) { - val union: File = new File("union.%s.vcf".format(evalType)) - add(new MyCombine(evalsOfType.map(_.file), union)); - evalsOfType = new Eval("union", evalType, null, union) :: evalsOfType - } - - // our root VE - val VE = new MyEval() - VE.VT = VARIANT_TYPE_VT(evalType) - VE.o = new File(evalType + ".eval") - - // add evals - for ( calls <- evalsOfType ) - VE.rodBind :+= RodBind("eval_" + calls.name, "VCF", calls.file) - - // add comps - //VE.rodBind :+= RodBind("dbsnp", "VCF", MY_DBSNP) - for ( comp <- compsOfType ) - VE.rodBind :+= RodBind("comp_" + comp.name, "VCF", comp.sitesFile) - - add(VE) - } - } - } - - /** - * Select homozygous non-reference sites from a single deep data set - */ - class SelectHomVars(@Input(doc="foo") vcf: File, @Output(doc="foo") out: File) extends SelectVariants with UNIVERSAL_GATK_ARGS { - this.rodBind :+= RodBind("variant", "VCF", vcf) - this.o = out - this.select ++= List("\"AC == 2\"") - } - - /** - * A simple union - */ - class MyCombine(@Input(doc="foo") vcfs: List[File], @Output(doc="foo") out: File) extends CombineVariants with UNIVERSAL_GATK_ARGS { - for ( vcf <- vcfs ) - this.rodBind :+= RodBind(vcf.getName, "VCF", vcf) - this.o = out - 
} - - /** - * A command line (cut) that removes all genotyping information from a file - */ - class JustSites(@Input(doc="foo") in: File, @Output(doc="foo") out: File) extends CommandLineFunction { - def commandLine = "cut -f 1-8 %s > %s".format(in, out) - } - - /** - * Base class for VariantEval used here - */ - class MyEval() extends VariantEval with UNIVERSAL_GATK_ARGS { - this.noST = true - this.evalModule :+= "ValidationReport" - } -} - diff --git a/public/testdata/ceutrio.ped b/public/testdata/ceutrio.ped new file mode 100644 index 000000000..1302e1a2d --- /dev/null +++ b/public/testdata/ceutrio.ped @@ -0,0 +1,3 @@ +fam1 kid dad mom 1 2 +fam1 dad 0 0 1 1 +fam1 mom 0 0 2 2