diff --git a/java/src/org/broadinstitute/sting/gatk/AbstractGenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/AbstractGenomeAnalysisEngine.java new file mode 100755 index 000000000..6351b6c68 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/AbstractGenomeAnalysisEngine.java @@ -0,0 +1,808 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk; + +import net.sf.picard.filter.SamRecordFilter; +import net.sf.picard.reference.ReferenceSequenceFile; +import net.sf.samtools.*; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.commandline.ArgumentSource; +import org.broadinstitute.sting.gatk.DownsamplingMethod; +import org.broadinstitute.sting.gatk.ReadMetrics; +import org.broadinstitute.sting.gatk.ReadProperties; +import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; +import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; +import org.broadinstitute.sting.gatk.datasources.sample.Sample; +import org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceDataSource; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMReaderID; +import org.broadinstitute.sting.gatk.filters.FilterManager; +import org.broadinstitute.sting.gatk.filters.ReadGroupBlackListFilter; +import org.broadinstitute.sting.gatk.io.stubs.Stub; +import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; +import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder; +import org.broadinstitute.sting.gatk.refdata.utils.RMDIntervalGenerator; +import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.interval.IntervalMergingRule; +import org.broadinstitute.sting.utils.interval.IntervalUtils; +import org.broadinstitute.sting.utils.text.XReadLines; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.*; + +/** + * Parses a GATKArgumentCollection into various gatk objects and then 
executes() some function. + */ +public abstract class AbstractGenomeAnalysisEngine { + + /** + * Accessor for sharded read data. + */ + private SAMDataSource readsDataSource = null; + + protected ReferenceDataSource getReferenceDataSource() { + return referenceDataSource; + } + + /** + * Accessor for sharded reference data. + */ + private ReferenceDataSource referenceDataSource = null; + + /** + * Accessor for sample metadata + */ + private SampleDataSource sampleDataSource = null; + + /** + * Accessor for sharded reference-ordered data. + */ + private List rodDataSources; + + // our argument collection + private GATKArgumentCollection argCollection; + + /** + * Collection of inputs used by the engine. + */ + private Map inputs = new HashMap(); + + /** + * Collection of intervals used by the engine. + */ + private GenomeLocSortedSet intervals = null; + + /** + * Collection of outputs used by the engine. + */ + private Collection> outputs = new ArrayList>(); + + /** + * List of tags associated with the given instantiation of the command-line argument. + */ + private final Map> tags = new IdentityHashMap>(); + + /** + * Collection of the filters applied to the input data. + */ + private Collection filters; + + /** + * our log, which we want to capture anything from this class + */ + private static Logger logger = Logger.getLogger(AbstractGenomeAnalysisEngine.class); + + /** + * Manage lists of filters. + */ + private final FilterManager filterManager = new FilterManager(); + + private Date startTime = null; // the start time for execution + + /** + * Actually run the engine. + * @return the value of this traversal. 
+ */ + public abstract Object execute(); + + protected abstract boolean flashbackData(); + protected abstract boolean includeReadsWithDeletionAtLoci(); + protected abstract boolean generateExtendedEvents(); + + /** + * Sets the start time when the execute() function was last called + * @param startTime the start time when the execute() function was last called + */ + protected void setStartTime(Date startTime) { + this.startTime = startTime; + } + + /** + * @return the start time when the execute() function was last called + */ + public Date getStartTime() { + return startTime; + } + + /** + * Setup the intervals to be processed + */ + protected void initializeIntervals() { + + // return null if no interval arguments at all + if ((argCollection.intervals == null) && (argCollection.excludeIntervals == null) && (argCollection.RODToInterval == null)) + return; + + else { + // if include argument isn't given, create new set of all possible intervals + GenomeLocSortedSet includeSortedSet = (argCollection.intervals == null && argCollection.RODToInterval == null ? 
+ GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getReference().getSequenceDictionary()) : + loadIntervals(argCollection.intervals, + argCollection.intervalMerging, + GenomeLocParser.mergeIntervalLocations(checkRODToIntervalArgument(),argCollection.intervalMerging))); + + // if no exclude arguments, can return parseIntervalArguments directly + if (argCollection.excludeIntervals == null) + intervals = includeSortedSet; + + // otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets + else { + GenomeLocSortedSet excludeSortedSet = loadIntervals(argCollection.excludeIntervals, argCollection.intervalMerging, null); + intervals = includeSortedSet.subtractRegions(excludeSortedSet); + + // logging messages only printed when exclude (-XL) arguments are given + long toPruneSize = includeSortedSet.coveredSize(); + long toExcludeSize = excludeSortedSet.coveredSize(); + long intervalSize = intervals.coveredSize(); + logger.info(String.format("Initial include intervals span %d loci; exclude intervals span %d loci", toPruneSize, toExcludeSize)); + logger.info(String.format("Excluding %d loci from original intervals (%.2f%% reduction)", + toPruneSize - intervalSize, (toPruneSize - intervalSize) / (0.01 * toPruneSize))); + } + + } + } + + /** + * Loads the intervals relevant to the current execution + * @param argList String representation of arguments; might include 'all', filenames, intervals in samtools + * notation, or a combination of the + * @param mergingRule Technique to use when merging interval data. + * @param additionalIntervals a list of additional intervals to add to the returned set. Can be null. + * @return A sorted, merged list of all intervals specified in this arg list. 
+ */ + private GenomeLocSortedSet loadIntervals(List argList, + IntervalMergingRule mergingRule, + List additionalIntervals) { + + return IntervalUtils.sortAndMergeIntervals(IntervalUtils.mergeListsBySetOperator(additionalIntervals, + IntervalUtils.parseIntervalArguments(argList), + argCollection.BTIMergeRule), + mergingRule); + } + + /** + * if we have a ROD specified as a 'rodToIntervalTrackName', convert its records to RODs + * @return ROD intervals as GenomeLocs + */ + private List checkRODToIntervalArgument() { + Map rodNames = RMDIntervalGenerator.getRMDTrackNames(rodDataSources); + // Do we have any RODs that overloaded as interval lists with the 'rodToIntervalTrackName' flag? + List ret = new ArrayList(); + if (rodNames != null && argCollection.RODToInterval != null) { + String rodName = argCollection.RODToInterval; + + // check to make sure we have a rod of that name + if (!rodNames.containsKey(rodName)) + throw new UserException.CommandLineException("--rodToIntervalTrackName (-BTI) was passed the name '"+rodName+"', which wasn't given as a ROD name in the -B option"); + + for (String str : rodNames.keySet()) + if (str.equals(rodName)) { + logger.info("Adding interval list from track (ROD) named " + rodName); + RMDIntervalGenerator intervalGenerator = new RMDIntervalGenerator(rodNames.get(str).getReferenceOrderedData()); + ret.addAll(intervalGenerator.toGenomeLocList()); + } + } + return ret; + } + + /** + * Add additional, externally managed IO streams for inputs. + * + * @param argumentSource Field into which to inject the value. + * @param value Instance to inject. + */ + public void addInput(ArgumentSource argumentSource, Object value) { + inputs.put(argumentSource, value); + } + + /** + * Add additional, externally managed IO streams for output. + * + * @param stub Instance to inject. 
+ */ + public void addOutput(Stub stub) { + outputs.add(stub); + } + + /** + * Adds an association between an object created by the + * command-line argument system and a freeform list of tags. + * @param key Object created by the command-line argument system. + * @param tags List of tags to use when reading arguments. + */ + public void addTags(Object key, List tags) { + this.tags.put(key,tags); + } + + /** + * Gets the tags associated with a given object. + * @param key Key for which to find a tag. + * @return List of tags associated with this key. + */ + public List getTags(Object key) { + if(!tags.containsKey(key)) + return Collections.emptyList(); + return tags.get(key); + } + + /** + * Gets a list of the filters to associate with the engine. Will NOT initialize the engine with these filters; + * the caller must handle that directly. + * @return A collection of available filters. + */ + protected Collection createFilters() { + Set filters = new HashSet(); + if (this.getArguments().readGroupBlackList != null && this.getArguments().readGroupBlackList.size() > 0) + filters.add(new ReadGroupBlackListFilter(this.getArguments().readGroupBlackList)); + for(String filterName: this.getArguments().readFilters) + filters.add(this.getFilterManager().createByName(filterName)); + return Collections.unmodifiableSet(filters); + } + + protected void initializeDataSources() { + logger.info("Strictness is " + argCollection.strictnessLevel); + + validateSuppliedReads(); + readsDataSource = createReadsDataSource(extractSourceInfo()); + + validateSuppliedReference(); + referenceDataSource = openReferenceSequenceFile(argCollection.referenceFile); + + sampleDataSource = new SampleDataSource(getSAMFileHeader(), argCollection.sampleFiles); + + if (argCollection.DBSNPFile != null) + bindConvenienceRods(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME, "dbsnp", argCollection.DBSNPFile); + + RMDTrackBuilder manager = new RMDTrackBuilder(); + + // set the sequence dictionary of all of Tribble tracks to 
the sequence dictionary of our reference + manager.setSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary()); + + List tracks = manager.getReferenceMetaDataSources(this,argCollection.RODBindings); + validateSuppliedReferenceOrderedData(tracks); + + // validate all the sequence dictionaries against the reference + validateSourcesAgainstReference(readsDataSource, referenceDataSource.getReference(), tracks); + + rodDataSources = getReferenceOrderedDataSources(tracks); + } + + /** + * Gets a unique identifier for the reader sourcing this read. + * @param read Read to examine. + * @return A unique identifier for the source file of this read. Exception if not found. + */ + public SAMReaderID getReaderIDForRead(final SAMRecord read) { + return getDataSource().getReaderID(read); + } + + /** + * Gets the source file for this read. + * @param id Unique identifier determining which input file to use. + * @return The source filename for this read. + */ + public File getSourceFileForReaderID(final SAMReaderID id) { + return getDataSource().getSAMFile(id); + } + + /** + * Returns sets of samples present in the (merged) input SAM stream, grouped by readers (i.e. underlying + * individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list + * returned by this method will contain 3 elements (one for each reader), with each element being a set of sample names + * found in the corresponding bam file. 
+ * + * @return Sets of samples in the merged input SAM stream, grouped by readers + */ + public List> getSamplesByReaders() { + List readers = getDataSource().getReaderIDs(); + + List> sample_sets = new ArrayList>(readers.size()); + + for (SAMReaderID r : readers) { + + Set samples = new HashSet(1); + sample_sets.add(samples); + + for (SAMReadGroupRecord g : getDataSource().getHeader(r).getReadGroups()) { + samples.add(g.getSample()); + } + } + + return sample_sets; + + } + + /** + * Returns sets of libraries present in the (merged) input SAM stream, grouped by readers (i.e. underlying + * individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list + * returned by this method will contain 3 elements (one for each reader), with each element being a set of library names + * found in the corresponding bam file. + * + * @return Sets of libraries present in the (merged) input SAM stream, grouped by readers + */ + public List> getLibrariesByReaders() { + + + List readers = getDataSource().getReaderIDs(); + + List> lib_sets = new ArrayList>(readers.size()); + + for (SAMReaderID r : readers) { + + Set libs = new HashSet(2); + lib_sets.add(libs); + + for (SAMReadGroupRecord g : getDataSource().getHeader(r).getReadGroups()) { + libs.add(g.getLibrary()); + } + } + + return lib_sets; + + } + + /** + * Returns a mapping from original input files to their (merged) read group ids + * + * @return the mapping + */ + public Map> getFileToReadGroupIdMapping() { + // populate the file -> read group mapping + Map> fileToReadGroupIdMap = new HashMap>(); + for (SAMReaderID id: getDataSource().getReaderIDs()) { + Set readGroups = new HashSet(5); + + for (SAMReadGroupRecord g : getDataSource().getHeader(id).getReadGroups()) { + if (getDataSource().hasReadGroupCollisions()) { + // Check if there were read group clashes. 
+ // If there were, use the SamFileHeaderMerger to translate from the + // original read group id to the read group id in the merged stream + readGroups.add(getDataSource().getReadGroupId(id,g.getReadGroupId())); + } else { + // otherwise, pass through the unmapped read groups since this is what Picard does as well + readGroups.add(g.getReadGroupId()); + } + } + + fileToReadGroupIdMap.put(getDataSource().getSAMFile(id),readGroups); + } + + return fileToReadGroupIdMap; + } + + /** + * **** UNLESS YOU HAVE GOOD REASON TO, DO NOT USE THIS METHOD; USE getFileToReadGroupIdMapping() INSTEAD **** + * + * Returns sets of (remapped) read groups in input SAM stream, grouped by readers (i.e. underlying + * individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list + * returned by this method will contain 3 elements (one for each reader), with each element being a set of remapped read groups + * (i.e. as seen by read.getReadGroup().getReadGroupId() in the merged stream) that come from the corresponding bam file. 
+ * + * @return sets of (merged) read group ids in order of input bams + */ + public List> getMergedReadGroupsByReaders() { + + + List readers = getDataSource().getReaderIDs(); + + List> rg_sets = new ArrayList>(readers.size()); + + for (SAMReaderID r : readers) { + + Set groups = new HashSet(5); + rg_sets.add(groups); + + for (SAMReadGroupRecord g : getDataSource().getHeader(r).getReadGroups()) { + if (getDataSource().hasReadGroupCollisions()) { // Check if there were read group clashes with hasGroupIdDuplicates and if so: + // use HeaderMerger to translate original read group id from the reader into the read group id in the + // merged stream, and save that remapped read group id to associate it with specific reader + groups.add(getDataSource().getReadGroupId(r, g.getReadGroupId())); + } else { + // otherwise, pass through the unmapped read groups since this is what Picard does as well + groups.add(g.getReadGroupId()); + } + } + } + + return rg_sets; + + } + + /** + * Bundles all the source information about the reads into a unified data structure. + * + * @return The reads object providing reads source info. 
+ */ + private ReadProperties extractSourceInfo() { + + DownsamplingMethod method = getDownsamplingMethod(); + + return new ReadProperties(unpackBAMFileList(argCollection.samFiles), + argCollection.strictnessLevel, + argCollection.readBufferSize, + method, + new ValidationExclusion(Arrays.asList(argCollection.unsafe)), + filters, + includeReadsWithDeletionAtLoci(), + generateExtendedEvents()); + } + + protected DownsamplingMethod getDownsamplingMethod() { + DownsamplingMethod method; + if(argCollection.getDownsamplingMethod() != null) + method = argCollection.getDownsamplingMethod(); + else + method = argCollection.getDefaultDownsamplingMethod(); + return method; + } + + protected void validateSuppliedReads() { + } + + protected void validateSuppliedReference() { + } + + protected void validateSuppliedReferenceOrderedData(List rods) { + } + + /** + * Now that all files are open, validate the sequence dictionaries of the reads vs. the reference vrs the reference ordered data (if available). + * + * @param reads Reads data source. + * @param reference Reference data source. + * @param tracks a collection of the reference ordered data tracks + */ + private void validateSourcesAgainstReference(SAMDataSource reads, ReferenceSequenceFile reference, Collection tracks) { + if ((reads.isEmpty() && (tracks == null || tracks.isEmpty())) || reference == null ) + return; + + // Compile a set of sequence names that exist in the reference file. + SAMSequenceDictionary referenceDictionary = reference.getSequenceDictionary(); + + if (!reads.isEmpty()) { + // Compile a set of sequence names that exist in the BAM files. + SAMSequenceDictionary readsDictionary = reads.getHeader().getSequenceDictionary(); + + Set readsSequenceNames = new TreeSet(); + for (SAMSequenceRecord dictionaryEntry : readsDictionary.getSequences()) + readsSequenceNames.add(dictionaryEntry.getSequenceName()); + + + if (readsSequenceNames.size() == 0) { + logger.info("Reads file is unmapped. 
Skipping validation against reference."); + return; + } + + // compare the reads to the reference + SequenceDictionaryUtils.validateDictionaries(logger, "reads", readsDictionary, "reference", referenceDictionary); + } + + // compare the tracks to the reference, if they have a sequence dictionary + for (RMDTrack track : tracks) { + SAMSequenceDictionary trackDict = track.getSequenceDictionary(); + + // hack: if the sequence dictionary is empty (as well as null which means it doesn't have a dictionary), skip validation + if (trackDict == null || trackDict.size() == 0) { + logger.info("Track " + track.getName() + " doesn't have a sequence dictionary built in, skipping dictionary validation"); + continue; + } + + Set trackSequences = new TreeSet(); + for (SAMSequenceRecord dictionaryEntry : trackDict.getSequences()) + trackSequences.add(dictionaryEntry.getSequenceName()); + SequenceDictionaryUtils.validateDictionaries(logger, track.getName(), trackDict, "reference", referenceDictionary); + } + + } + + + /** + * Convenience function that binds RODs using the old-style command line parser to the new style list for + * a uniform processing. + * + * @param name the name of the rod + * @param type its type + * @param file the file to load the rod from + */ + private void bindConvenienceRods(final String name, final String type, final String file) { + argCollection.RODBindings.add(Utils.join(",", new String[]{name, type, file})); + } + + /** + * Gets a data source for the given set of reads. + * + * @param reads the read source information + * @return A data source for the given set of reads. + */ + private SAMDataSource createReadsDataSource(ReadProperties reads) { + return new SAMDataSource(reads); + } + + /** + * Opens a reference sequence file paired with an index. + * + * @param refFile Handle to a reference sequence file. Non-null. + * @return A thread-safe file wrapper. 
+ */ + private ReferenceDataSource openReferenceSequenceFile(File refFile) { + ReferenceDataSource ref = new ReferenceDataSource(refFile); + GenomeLocParser.setupRefContigOrdering(ref.getReference()); + return ref; + } + + /** + * Open the reference-ordered data sources. + * + * @param rods the reference order data to execute using + * @return A list of reference-ordered data sources. + */ + private List getReferenceOrderedDataSources(List rods) { + List dataSources = new ArrayList(); + for (RMDTrack rod : rods) + dataSources.add(new ReferenceOrderedDataSource(rod, flashbackData())); + return dataSources; + } + + /** + * Returns the SAM File Header from the input reads' data source file + * @return the SAM File Header from the input reads' data source file + */ + public SAMFileHeader getSAMFileHeader() { + return readsDataSource.getHeader(); + } + + /** + * Returns the unmerged SAM file header for an individual reader. + * @param reader The reader. + * @return Header for that reader. + */ + public SAMFileHeader getSAMFileHeader(SAMReaderID reader) { + return readsDataSource.getHeader(reader); + } + + /** + * Returns data source object encapsulating all essential info and handlers used to traverse + * reads; header merger, individual file readers etc can be accessed through the returned data source object. + * + * @return the reads data source + */ + public SAMDataSource getDataSource() { + return this.readsDataSource; + } + + /** + * Sets the collection of GATK main application arguments. + * + * @param argCollection the GATK argument collection + */ + public void setArguments(GATKArgumentCollection argCollection) { + this.argCollection = argCollection; + } + + /** + * Gets the collection of GATK main application arguments. + * + * @return the GATK argument collection + */ + public GATKArgumentCollection getArguments() { + return this.argCollection; + } + + /** + * Get the list of intervals passed to the engine. + * @return List of intervals. 
+ */ + public GenomeLocSortedSet getIntervals() { + return this.intervals; + } + + /** + * Gets the list of filters employed by this engine. + * @return Collection of filters (actual instances) used by this engine. + */ + public Collection getFilters() { + return this.filters; + } + + /** + * Sets the list of filters employed by this engine. + * @param filters Collection of filters (actual instances) used by this engine. + */ + public void setFilters(Collection filters) { + this.filters = filters; + } + + /** + * Gets the filter manager for this engine. + * @return filter manager for this engine. + */ + protected FilterManager getFilterManager() { + return filterManager; + } + + /** + * Gets the input sources for this engine. + * @return input sources for this engine. + */ + protected Map getInputs() { + return inputs; + } + + /** + * Gets the output stubs for this engine. + * @return output stubs for this engine. + */ + protected Collection> getOutputs() { + return outputs; + } + + /** + * Returns data source objects encapsulating all rod data; + * individual rods can be accessed through the returned data source objects. + * + * @return the rods data sources + */ + public List getRodDataSources() { + return this.rodDataSources; + } + + /** + * Gets cumulative metrics about the entire run to this point. + * @return cumulative metrics about the entire run. + */ + public ReadMetrics getCumulativeMetrics() { + return readsDataSource == null ? null : readsDataSource.getCumulativeReadMetrics(); + } + + /** + * Unpack the bam files to be processed, given a list of files. 
That list of files can + * itself contain entries which are lists of other files to be read (note: you cannot have lists of lists of lists) + * + * @param inputFiles a list of files that represent either bam files themselves, or a file containing a list of bam files to process + * + * @return a flattened list of the bam files provided + */ + private List unpackBAMFileList( List inputFiles ) { + List unpackedReads = new ArrayList(); + for( File inputFile: inputFiles ) { + if (inputFile.getName().toLowerCase().endsWith(".list") ) { + try { + for(String fileName : new XReadLines(inputFile)) + unpackedReads.add(new SAMReaderID(new File(fileName),getTags(inputFile))); + } + catch( FileNotFoundException ex ) { + throw new UserException.CouldNotReadInputFile(inputFile, "Unable to find file while unpacking reads", ex); + } + } + else if(inputFile.getName().toLowerCase().endsWith(".bam")) { + unpackedReads.add( new SAMReaderID(inputFile,getTags(inputFile)) ); + } + else if(inputFile.getName().equals("-")) { + unpackedReads.add(new SAMReaderID(new File("/dev/stdin"),Collections.emptyList())); + } + else { + throw new UserException.CommandLineException(String.format("The GATK reads argument (-I) supports only BAM files with the .bam extension and lists of BAM files " + + "with the .list extension, but the file %s has neither extension. 
Please ensure that your BAM file or list " + + "of BAM files is in the correct format, update the extension, and try again.",inputFile.getName())); + } + } + return unpackedReads; + } + + /** + * Get a sample by its ID + * If an alias is passed in, return the main sample object + * @param id sample id + * @return sample Object with this ID + */ + public Sample getSampleById(String id) { + return sampleDataSource.getSampleById(id); + } + + /** + * Get the sample for a given read group + * Must first look up ID for read group + * @param readGroup of sample + * @return sample object with ID from the read group + */ + public Sample getSampleByReadGroup(SAMReadGroupRecord readGroup) { + return sampleDataSource.getSampleByReadGroup(readGroup); + } + + /** + * Get a sample for a given read + * Must first look up read group, and then sample ID for that read group + * @param read of sample + * @return sample object of this read + */ + public Sample getSampleByRead(SAMRecord read) { + return getSampleByReadGroup(read.getReadGroup()); + } + + /** + * Get number of sample objects + * @return size of samples map + */ + public int sampleCount() { + return sampleDataSource.sampleCount(); + } + + /** + * Return all samples with a given family ID + * Note that this isn't terribly efficient (linear) - it may be worth adding a new family ID data structure for this + * @param familyId family ID + * @return Samples with the given family ID + */ + public Set getFamily(String familyId) { + return sampleDataSource.getFamily(familyId); + } + + /** + * Returns all children of a given sample + * See note on the efficiency of getFamily() - since this depends on getFamily() it's also not efficient + * @param sample parent sample + * @return children of the given sample + */ + public Set getChildren(Sample sample) { + return sampleDataSource.getChildren(sample); + } + + /** + * Takes a list of sample names and returns their corresponding sample objects + * + * @param sampleNameList List of 
sample names + * @return Corresponding set of samples + */ + public Set getSamples(Collection sampleNameList) { + return sampleDataSource.getSamples(sampleNameList); + } + +} diff --git a/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java b/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java index 4a64d16b5..63542314b 100644 --- a/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java +++ b/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java @@ -74,7 +74,11 @@ public abstract class CommandLineExecutable extends CommandLineProgram { Walker walker = engine.getWalkerByName(getAnalysisName()); try { - Collection filters = engine.createFiltersForWalker(getArgumentCollection(),walker); + engine.setArguments(getArgumentCollection()); + engine.setWalker(walker); + + Collection filters = engine.createFilters(); + engine.setFilters(filters); // load the arguments into the walker / filters. // TODO: The fact that this extra load call exists here when all the parsing happens at the engine @@ -88,8 +92,7 @@ public abstract class CommandLineExecutable extends CommandLineProgram { argumentSources.add(filter); } - // set the analysis name in the argument collection - engine.execute(getArgumentCollection(), walker, filters); + engine.execute(); generateGATKRunReport(walker); } catch ( Exception e ) { generateGATKRunReport(walker, e); @@ -120,7 +123,7 @@ public abstract class CommandLineExecutable extends CommandLineProgram { /** * Convenience method for fully parameterized generateGATKRunReport when an exception has * not occurred - * + * * @param walker */ private void generateGATKRunReport(Walker walker) { @@ -160,9 +163,11 @@ public abstract class CommandLineExecutable extends CommandLineProgram { Collection argumentSources = new ArrayList(); Walker walker = engine.getWalkerByName(getAnalysisName()); + engine.setArguments(getArgumentCollection()); + engine.setWalker(walker); argumentSources.add(walker.getClass()); - 
Collection filters = engine.createFiltersForWalker(getArgumentCollection(),walker); + Collection filters = engine.createFilters(); for(SamRecordFilter filter: filters) argumentSources.add(filter.getClass()); diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 5ce8c6ae4..612044805 100755 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -27,113 +27,47 @@ package org.broadinstitute.sting.gatk; import net.sf.picard.filter.SamRecordFilter; import net.sf.picard.reference.ReferenceSequenceFile; -import net.sf.samtools.*; -import org.apache.log4j.Logger; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMSequenceRecord; +import org.broadinstitute.sting.commandline.ArgumentException; +import org.broadinstitute.sting.commandline.ArgumentSource; import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; - -import org.broadinstitute.sting.gatk.datasources.sample.Sample; -import org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource; -import org.broadinstitute.sting.gatk.datasources.sample.SampleFileParser; -import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder; -import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.interval.IntervalMergingRule; -import org.broadinstitute.sting.utils.interval.IntervalUtils; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.datasources.shards.MonolithicShardStrategy; import org.broadinstitute.sting.gatk.datasources.shards.Shard; import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategy; import 
org.broadinstitute.sting.gatk.datasources.shards.ShardStrategyFactory; -import org.broadinstitute.sting.gatk.datasources.simpleDataSources.*; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceDataSource; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource; import org.broadinstitute.sting.gatk.executive.MicroScheduler; -import org.broadinstitute.sting.gatk.filters.FilterManager; -import org.broadinstitute.sting.gatk.filters.ReadGroupBlackListFilter; import org.broadinstitute.sting.gatk.io.OutputTracker; -import org.broadinstitute.sting.gatk.io.stubs.*; +import org.broadinstitute.sting.gatk.io.stubs.Stub; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; -import org.broadinstitute.sting.gatk.refdata.utils.RMDIntervalGenerator; import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.text.XReadLines; -import org.broadinstitute.sting.commandline.ArgumentException; -import org.broadinstitute.sting.commandline.ArgumentSource; -import org.broadinstitute.sting.commandline.ArgumentTypeDescriptor; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; -import java.io.File; -import java.io.FileNotFoundException; import java.util.*; -public class GenomeAnalysisEngine { +/** + * A GenomeAnalysisEngine that runs a specified walker. + */ +public class GenomeAnalysisEngine extends AbstractGenomeAnalysisEngine { // our instance of this genome analysis toolkit; it's used by other classes to extract the traversal engine // TODO: public static without final tends to indicate we're thinking about this the wrong way public static GenomeAnalysisEngine instance; - /** - * Accessor for sharded read data. 
- */ - private SAMDataSource readsDataSource = null; - - /** - * Accessor for sharded reference data. - */ - private ReferenceDataSource referenceDataSource = null; - - /** - * Accessor for sample metadata - */ - private SampleDataSource sampleDataSource = null; - - /** - * Accessor for sharded reference-ordered data. - */ - private List rodDataSources; - - // our argument collection - private GATKArgumentCollection argCollection; - - /** - * Collection of inputs used by the walker. - */ - private Map inputs = new HashMap(); - - /** - * Collection of intervals used by the walker. - */ - private GenomeLocSortedSet intervals = null; - - /** - * Collection of outputs used by the walker. - */ - private Collection> outputs = new ArrayList>(); - - /** - * List of tags associated with the given instantiation of the command-line argument. - */ - private final Map> tags = new IdentityHashMap>(); - - /** - * Collection of the filters applied to the walker's input data. - */ - private Collection filters; - - /** - * our log, which we want to capture anything from this class - */ - private static Logger logger = Logger.getLogger(GenomeAnalysisEngine.class); - /** * our walker manager */ - private final WalkerManager walkerManager; + private final WalkerManager walkerManager = new WalkerManager(); - /** - * Manage lists of filters.
- */ - private final FilterManager filterManager; - private Date startTime = null; // the start time for execution + private Walker walker; /** * our constructor, where all the work is done @@ -143,57 +77,47 @@ public class GenomeAnalysisEngine { */ public GenomeAnalysisEngine() { // make sure our instance variable points to this analysis engine -// if ( instance != null ) -// throw new StingException("Instantiating GenomeAnalysisEngine but global instance variable isn't null, indicating that an instance has already been created: " + instance); - instance = this; - walkerManager = new WalkerManager(); - filterManager = new FilterManager(); + } + + public void setWalker(Walker walker) { + this.walker = walker; } /** * Actually run the GATK with the specified walker. * - * @param args the argument collection, where we get all our setup information from - * @param my_walker Walker to run over the dataset. Must not be null. * @return the value of this traversal. */ - public Object execute(GATKArgumentCollection args, Walker my_walker, Collection filters) { + public Object execute() { //HeapSizeMonitor monitor = new HeapSizeMonitor(); //monitor.start(); - startTime = new java.util.Date(); + setStartTime(new java.util.Date()); // validate our parameters - if (args == null) { + if (this.getArguments() == null) { throw new ReviewedStingException("The GATKArgumentCollection passed to GenomeAnalysisEngine can not be null."); } // validate our parameters - if (my_walker == null) + if (this.walker == null) throw new ReviewedStingException("The walker passed to GenomeAnalysisEngine can not be null."); - // save our argument parameter - this.argCollection = args; - this.filters = filters; - // Prepare the data for traversal. 
- initializeDataSources(my_walker, filters, argCollection); + initializeDataSources(); // our microscheduler, which is in charge of running everything - MicroScheduler microScheduler = createMicroscheduler(my_walker); + MicroScheduler microScheduler = createMicroscheduler(); // create the output streams " - initializeOutputStreams(my_walker, microScheduler.getOutputTracker()); + initializeOutputStreams(microScheduler.getOutputTracker()); initializeIntervals(); - ShardStrategy shardStrategy = getShardStrategy(my_walker, - microScheduler.getReference(), - intervals, - readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null); + ShardStrategy shardStrategy = getShardStrategy(microScheduler.getReference()); // execute the microscheduler, storing the results - Object result = microScheduler.execute(my_walker, shardStrategy); + Object result = microScheduler.execute(this.walker, shardStrategy); //monitor.stop(); //logger.info(String.format("Maximum heap size consumed: %d",monitor.getMaxMemoryUsed())); @@ -201,134 +125,6 @@ public class GenomeAnalysisEngine { return result; } - /** - * @return the start time when the execute() function was last called - */ - public Date getStartTime() { - return startTime; - } - - - /** - * Setup the intervals to be processed - */ - private void initializeIntervals() { - - // return null if no interval arguments at all - if ((argCollection.intervals == null) && (argCollection.excludeIntervals == null) && (argCollection.RODToInterval == null)) - return; - - else { - // if include argument isn't given, create new set of all possible intervals - GenomeLocSortedSet includeSortedSet = (argCollection.intervals == null && argCollection.RODToInterval == null ? 
- GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getReference().getSequenceDictionary()) : - loadIntervals(argCollection.intervals, - argCollection.intervalMerging, - GenomeLocParser.mergeIntervalLocations(checkRODToIntervalArgument(),argCollection.intervalMerging))); - - // if no exclude arguments, can return parseIntervalArguments directly - if (argCollection.excludeIntervals == null) - intervals = includeSortedSet; - - // otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets - else { - GenomeLocSortedSet excludeSortedSet = loadIntervals(argCollection.excludeIntervals, argCollection.intervalMerging, null); - intervals = includeSortedSet.subtractRegions(excludeSortedSet); - - // logging messages only printed when exclude (-XL) arguments are given - long toPruneSize = includeSortedSet.coveredSize(); - long toExcludeSize = excludeSortedSet.coveredSize(); - long intervalSize = intervals.coveredSize(); - logger.info(String.format("Initial include intervals span %d loci; exclude intervals span %d loci", toPruneSize, toExcludeSize)); - logger.info(String.format("Excluding %d loci from original intervals (%.2f%% reduction)", - toPruneSize - intervalSize, (toPruneSize - intervalSize) / (0.01 * toPruneSize))); - } - - } - } - - /** - * Loads the intervals relevant to the current execution - * @param argList String representation of arguments; might include 'all', filenames, intervals in samtools - * notation, or a combination of the - * @param mergingRule Technique to use when merging interval data. - * @param additionalIntervals a list of additional intervals to add to the returned set. Can be null. - * @return A sorted, merged list of all intervals specified in this arg list. 
- */ - private GenomeLocSortedSet loadIntervals(List argList, - IntervalMergingRule mergingRule, - List additionalIntervals) { - - return IntervalUtils.sortAndMergeIntervals(IntervalUtils.mergeListsBySetOperator(additionalIntervals, - IntervalUtils.parseIntervalArguments(argList), - argCollection.BTIMergeRule), - mergingRule); - } - - /** - * if we have a ROD specified as a 'rodToIntervalTrackName', convert its records to RODs - */ - private static List checkRODToIntervalArgument() { - Map rodNames = RMDIntervalGenerator.getRMDTrackNames(instance.rodDataSources); - // Do we have any RODs that overloaded as interval lists with the 'rodToIntervalTrackName' flag? - List ret = new ArrayList(); - if (rodNames != null && instance.argCollection.RODToInterval != null) { - String rodName = GenomeAnalysisEngine.instance.argCollection.RODToInterval; - - // check to make sure we have a rod of that name - if (!rodNames.containsKey(rodName)) - throw new UserException.CommandLineException("--rodToIntervalTrackName (-BTI) was passed the name '"+rodName+"', which wasn't given as a ROD name in the -B option"); - - for (String str : rodNames.keySet()) - if (str.equals(rodName)) { - logger.info("Adding interval list from track (ROD) named " + rodName); - RMDIntervalGenerator intervalGenerator = new RMDIntervalGenerator(rodNames.get(str).getReferenceOrderedData()); - ret.addAll(intervalGenerator.toGenomeLocList()); - } - } - return ret; - } - - /** - * Add additional, externally managed IO streams for walker input. - * - * @param argumentSource Field in the walker into which to inject the value. - * @param value Instance to inject. - */ - public void addInput(ArgumentSource argumentSource, Object value) { - inputs.put(argumentSource, value); - } - - /** - * Add additional, externally managed IO streams for walker output. - * - * @param stub Instance to inject. 
- */ - public void addOutput(Stub stub) { - outputs.add(stub); - } - - /** - * Adds an association between a object created by the - * command-line argument system and a freeform list of tags. - * @param key Object created by the command-line argument system. - * @param tags List of tags to use when reading arguments. - */ - public void addTags(Object key, List tags) { - this.tags.put(key,tags); - } - - /** - * Gets the tags associated with a given object. - * @param key Key for which to find a tag. - * @return List of tags associated with this key. - */ - public List getTags(Object key) { - if(!tags.containsKey(key)) - return Collections.emptyList(); - return tags.get(key); - } - /** * Retrieves an instance of the walker based on the walker name. * @@ -351,17 +147,13 @@ public class GenomeAnalysisEngine { /** * Gets a list of the filters to associate with the given walker. Will NOT initialize the engine with this filters; * the caller must handle that directly. - * @param args Existing argument collection, for compatibility with legacy command-line walkers. - * @param walker Walker to use when determining which filters to apply. * @return A collection of available filters. 
*/ - protected Collection createFiltersForWalker(GATKArgumentCollection args, Walker walker) { + @Override + protected Collection createFilters() { Set filters = new HashSet(); - filters.addAll(WalkerManager.getReadFilters(walker,filterManager)); - if (args.readGroupBlackList != null && args.readGroupBlackList.size() > 0) - filters.add(new ReadGroupBlackListFilter(args.readGroupBlackList)); - for(String filterName: args.readFilters) - filters.add(filterManager.createByName(filterName)); + filters.addAll(WalkerManager.getReadFilters(walker,this.getFilterManager())); + filters.addAll(super.createFilters()); return Collections.unmodifiableSet(filters); } @@ -372,231 +164,56 @@ public class GenomeAnalysisEngine { protected WalkerManager getWalkerManager() { return walkerManager; } - - private void initializeDataSources(Walker my_walker, Collection filters, GATKArgumentCollection argCollection) { - validateSuppliedReadsAgainstWalker(my_walker, argCollection); - logger.info("Strictness is " + argCollection.strictnessLevel); - readsDataSource = createReadsDataSource(extractSourceInfo(my_walker, filters, argCollection)); - - validateSuppliedReferenceAgainstWalker(my_walker, argCollection); - referenceDataSource = openReferenceSequenceFile(argCollection.referenceFile); - - initializeSampleDataSource(); - - if (argCollection.DBSNPFile != null) bindConvenienceRods(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME, "dbsnp", argCollection.DBSNPFile); - - RMDTrackBuilder manager = new RMDTrackBuilder(); - - // set the sequence dictionary of all of Tribble tracks to the sequence dictionary of our reference - manager.setSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary()); - - List tracks = manager.getReferenceMetaDataSources(this,argCollection.RODBindings); - validateSuppliedReferenceOrderedDataAgainstWalker(my_walker, tracks); - - // validate all the sequence dictionaries against the reference - validateSourcesAgainstReference(readsDataSource, 
referenceDataSource.getReference(), tracks); - - rodDataSources = getReferenceOrderedDataSources(my_walker, tracks); - } - - private void initializeSampleDataSource() { - this.sampleDataSource = new SampleDataSource(getSAMFileHeader(), argCollection.sampleFiles); - } - + /** * setup a microscheduler * - * @param my_walker our walker of type LocusWalker * @return a new microscheduler */ - private MicroScheduler createMicroscheduler(Walker my_walker) { + private MicroScheduler createMicroscheduler() { + Walker my_walker = this.walker; + // the mircoscheduler to return MicroScheduler microScheduler = null; // Temporarily require all walkers to have a reference, even if that reference is not conceptually necessary. if ((my_walker instanceof ReadWalker || my_walker instanceof DuplicateWalker || my_walker instanceof ReadPairWalker) && - argCollection.referenceFile == null) { + this.getArguments().referenceFile == null) { throw new UserException.CommandLineException("Read-based traversals require a reference file but none was given"); } - return MicroScheduler.create(this,my_walker,readsDataSource,referenceDataSource.getReference(),rodDataSources,argCollection.numberOfThreads); + return MicroScheduler.create(this,my_walker,this.getDataSource(),this.getReferenceDataSource().getReference(),this.getRodDataSources(),this.getArguments().numberOfThreads); } - /** - * Gets a unique identifier for the reader sourcing this read. - * @param read Read to examine. - * @return A unique identifier for the source file of this read. Exception if not found. - */ - public SAMReaderID getReaderIDForRead(final SAMRecord read) { - return getDataSource().getReaderID(read); - } - - /** - * Gets the source file for this read. - * @param id Unique identifier determining which input file to use. - * @return The source filename for this read. 
- */ - public File getSourceFileForReaderID(final SAMReaderID id) { - return getDataSource().getSAMFile(id); - } - - /** - * Returns sets of samples present in the (merged) input SAM stream, grouped by readers (i.e. underlying - * individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list - * returned by this method will contain 3 elements (one for each reader), with each element being a set of sample names - * found in the corresponding bam file. - * - * @return - */ - public List> getSamplesByReaders() { - List readers = getDataSource().getReaderIDs(); - - List> sample_sets = new ArrayList>(readers.size()); - - for (SAMReaderID r : readers) { - - Set samples = new HashSet(1); - sample_sets.add(samples); - - for (SAMReadGroupRecord g : getDataSource().getHeader(r).getReadGroups()) { - samples.add(g.getSample()); - } - } - - return sample_sets; - - } - - /** - * Returns sets of libraries present in the (merged) input SAM stream, grouped by readers (i.e. underlying - * individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list - * returned by this method will contain 3 elements (one for each reader), with each element being a set of library names - * found in the corresponding bam file. 
- * - * @return - */ - public List> getLibrariesByReaders() { - - - List readers = getDataSource().getReaderIDs(); - - List> lib_sets = new ArrayList>(readers.size()); - - for (SAMReaderID r : readers) { - - Set libs = new HashSet(2); - lib_sets.add(libs); - - for (SAMReadGroupRecord g : getDataSource().getHeader(r).getReadGroups()) { - libs.add(g.getLibrary()); - } - } - - return lib_sets; - - } - - /** - * Returns a mapping from original input files to their (merged) read group ids - * - * @return the mapping - */ - public Map> getFileToReadGroupIdMapping() { - // populate the file -> read group mapping - Map> fileToReadGroupIdMap = new HashMap>(); - for (SAMReaderID id: getDataSource().getReaderIDs()) { - Set readGroups = new HashSet(5); - - for (SAMReadGroupRecord g : getDataSource().getHeader(id).getReadGroups()) { - if (getDataSource().hasReadGroupCollisions()) { - // Check if there were read group clashes. - // If there were, use the SamFileHeaderMerger to translate from the - // original read group id to the read group id in the merged stream - readGroups.add(getDataSource().getReadGroupId(id,g.getReadGroupId())); - } else { - // otherwise, pass through the unmapped read groups since this is what Picard does as well - readGroups.add(g.getReadGroupId()); - } - } - - fileToReadGroupIdMap.put(getDataSource().getSAMFile(id),readGroups); - } - - return fileToReadGroupIdMap; - } - - /** - * **** UNLESS YOU HAVE GOOD REASON TO, DO NOT USE THIS METHOD; USE getFileToReadGroupIdMapping() INSTEAD **** - * - * Returns sets of (remapped) read groups in input SAM stream, grouped by readers (i.e. underlying - * individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list - * returned by this method will contain 3 elements (one for each reader), with each element being a set of remapped read groups - * (i.e. as seen by read.getReadGroup().getReadGroupId() in the merged stream) that come from the corresponding bam file. 
- * - * @return sets of (merged) read group ids in order of input bams - */ - public List> getMergedReadGroupsByReaders() { - - - List readers = getDataSource().getReaderIDs(); - - List> rg_sets = new ArrayList>(readers.size()); - - for (SAMReaderID r : readers) { - - Set groups = new HashSet(5); - rg_sets.add(groups); - - for (SAMReadGroupRecord g : getDataSource().getHeader(r).getReadGroups()) { - if (getDataSource().hasReadGroupCollisions()) { // Check if there were read group clashes with hasGroupIdDuplicates and if so: - // use HeaderMerger to translate original read group id from the reader into the read group id in the - // merged stream, and save that remapped read group id to associate it with specific reader - groups.add(getDataSource().getReadGroupId(r, g.getReadGroupId())); - } else { - // otherwise, pass through the unmapped read groups since this is what Picard does as well - groups.add(g.getReadGroupId()); - } - } - } - - return rg_sets; - - } - - /** - * Bundles all the source information about the reads into a unified data structure. - * - * @param walker The walker for which to extract info. - * @param argCollection The collection of arguments passed to the engine. - * @return The reads object providing reads source info. 
- */ - private ReadProperties extractSourceInfo(Walker walker, Collection filters, GATKArgumentCollection argCollection) { - - DownsamplingMethod method = null; + @Override + protected DownsamplingMethod getDownsamplingMethod() { + GATKArgumentCollection argCollection = this.getArguments(); + DownsamplingMethod method; if(argCollection.getDownsamplingMethod() != null) method = argCollection.getDownsamplingMethod(); else if(WalkerManager.getDownsamplingMethod(walker) != null) method = WalkerManager.getDownsamplingMethod(walker); else method = argCollection.getDefaultDownsamplingMethod(); + return method; + } - return new ReadProperties(unpackBAMFileList(argCollection.samFiles), - argCollection.strictnessLevel, - argCollection.readBufferSize, - method, - new ValidationExclusion(Arrays.asList(argCollection.unsafe)), - filters, - walker.includeReadsWithDeletionAtLoci(), - walker.generateExtendedEvents()); + @Override + protected boolean generateExtendedEvents() { + return walker.generateExtendedEvents(); + } + + @Override + protected boolean includeReadsWithDeletionAtLoci() { + return walker.includeReadsWithDeletionAtLoci(); } /** * Verifies that the supplied set of reads files mesh with what the walker says it requires. - * - * @param walker Walker to test. - * @param arguments Supplied reads files. */ - private void validateSuppliedReadsAgainstWalker(Walker walker, GATKArgumentCollection arguments) { + @Override + protected void validateSuppliedReads() { + GATKArgumentCollection arguments = this.getArguments(); // Check what the walker says is required against what was provided on the command line. if (WalkerManager.isRequired(walker, DataSource.READS) && (arguments.samFiles == null || arguments.samFiles.size() == 0)) throw new ArgumentException("Walker requires reads but none were provided."); @@ -608,11 +225,10 @@ public class GenomeAnalysisEngine { /** * Verifies that the supplied reference file mesh with what the walker says it requires. 
- * 611 - * @param walker Walker to test. - * @param arguments Supplied reads files. */ - private void validateSuppliedReferenceAgainstWalker(Walker walker, GATKArgumentCollection arguments) { + @Override + protected void validateSuppliedReference() { + GATKArgumentCollection arguments = this.getArguments(); // Check what the walker says is required against what was provided on the command line. // TODO: Temporarily disabling WalkerManager.isRequired check on the reference because the reference is always required. if (/*WalkerManager.isRequired(walker, DataSource.REFERENCE) &&*/ arguments.referenceFile == null) @@ -627,10 +243,10 @@ public class GenomeAnalysisEngine { * Verifies that all required reference-ordered data has been supplied, and any reference-ordered data that was not * 'allowed' is still present. * - * @param walker Walker to test. * @param rods Reference-ordered data to load. */ - private void validateSuppliedReferenceOrderedDataAgainstWalker(Walker walker, List rods) { + @Override + protected void validateSuppliedReferenceOrderedData(List rods) { // Check to make sure that all required metadata is present. List allRequired = WalkerManager.getRequiredMetaData(walker); for (RMD required : allRequired) { @@ -650,81 +266,17 @@ public class GenomeAnalysisEngine { } } - /** - * Now that all files are open, validate the sequence dictionaries of the reads vs. the reference vrs the reference ordered data (if available). - * - * @param reads Reads data source. - * @param reference Reference data source. - * @param tracks a collection of the reference ordered data tracks - */ - private void validateSourcesAgainstReference(SAMDataSource reads, ReferenceSequenceFile reference, Collection tracks) { - if ((reads.isEmpty() && (tracks == null || tracks.isEmpty())) || reference == null ) - return; - - // Compile a set of sequence names that exist in the reference file. 
- SAMSequenceDictionary referenceDictionary = reference.getSequenceDictionary(); - - if (!reads.isEmpty()) { - // Compile a set of sequence names that exist in the BAM files. - SAMSequenceDictionary readsDictionary = reads.getHeader().getSequenceDictionary(); - - Set readsSequenceNames = new TreeSet(); - for (SAMSequenceRecord dictionaryEntry : readsDictionary.getSequences()) - readsSequenceNames.add(dictionaryEntry.getSequenceName()); - - - if (readsSequenceNames.size() == 0) { - logger.info("Reads file is unmapped. Skipping validation against reference."); - return; - } - - // compare the reads to the reference - SequenceDictionaryUtils.validateDictionaries(logger, "reads", readsDictionary, "reference", referenceDictionary); - } - - // compare the tracks to the reference, if they have a sequence dictionary - for (RMDTrack track : tracks) { - SAMSequenceDictionary trackDict = track.getSequenceDictionary(); - - // hack: if the sequence dictionary is empty (as well as null which means it doesn't have a dictionary), skip validation - if (trackDict == null || trackDict.size() == 0) { - logger.info("Track " + track.getName() + " doesn't have a sequence dictionary built in, skipping dictionary validation"); - continue; - } - - Set trackSequences = new TreeSet(); - for (SAMSequenceRecord dictionaryEntry : trackDict.getSequences()) - trackSequences.add(dictionaryEntry.getSequenceName()); - SequenceDictionaryUtils.validateDictionaries(logger, track.getName(), trackDict, "reference", referenceDictionary); - } - - } - - - /** - * Convenience function that binds RODs using the old-style command line parser to the new style list for - * a uniform processing. 
- * - * @param name the name of the rod - * @param type its type - * @param file the file to load the rod from - */ - private void bindConvenienceRods(final String name, final String type, final String file) { - argCollection.RODBindings.add(Utils.join(",", new String[]{name, type, file})); - } - /** * Get the sharding strategy given a driving data source. * - * @param walker Walker for which to infer sharding strategy. * @param drivingDataSource Data on which to shard. - * @param intervals Intervals to use when limiting sharding. - * @return Sharding strategy for this driving data source. + * @return the sharding strategy */ - protected ShardStrategy getShardStrategy(Walker walker, - ReferenceSequenceFile drivingDataSource, - GenomeLocSortedSet intervals, - ValidationExclusion exclusions) { + protected ShardStrategy getShardStrategy(ReferenceSequenceFile drivingDataSource) { + GenomeLocSortedSet intervals = this.getIntervals(); + SAMDataSource readsDataSource = this.getDataSource(); + ValidationExclusion exclusions = (readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null); + ReferenceDataSource referenceDataSource = this.getReferenceDataSource(); // Use monolithic sharding if no index is present. Monolithic sharding is always required for the original // sharding system; it's required with the new sharding system only for locus walkers. if(readsDataSource != null && !readsDataSource.hasIndex() ) { @@ -815,227 +367,22 @@ public class GenomeAnalysisEngine { return shardStrategy; } - /** - * Gets a data source for the given set of reads. - * - * @param reads the read source information - * @return A data source for the given set of reads. - */ - private SAMDataSource createReadsDataSource(ReadProperties reads) { - return new SAMDataSource(reads); - } - - /** - * Opens a reference sequence file paired with an index. - * - * @param refFile Handle to a reference sequence file. Non-null. - * @return A thread-safe file wrapper. 
- */ - private ReferenceDataSource openReferenceSequenceFile(File refFile) { - ReferenceDataSource ref = new ReferenceDataSource(refFile); - GenomeLocParser.setupRefContigOrdering(ref.getReference()); - return ref; - } - - /** - * Open the reference-ordered data sources. - * - * @param rods the reference order data to execute using - * @return A list of reference-ordered data sources. - */ - private List getReferenceOrderedDataSources(Walker walker, List rods) { - List dataSources = new ArrayList(); - for (RMDTrack rod : rods) - dataSources.add(new ReferenceOrderedDataSource(walker, rod)); - return dataSources; + @Override + protected boolean flashbackData() { + return walker instanceof ReadWalker; } /** * Initialize the output streams as specified by the user. * - * @param walker the walker to initialize output streams for * @param outputTracker the tracker supplying the initialization data. */ - private void initializeOutputStreams(Walker walker, OutputTracker outputTracker) { - for (Map.Entry input : inputs.entrySet()) + private void initializeOutputStreams(OutputTracker outputTracker) { + for (Map.Entry input : getInputs().entrySet()) outputTracker.addInput(input.getKey(), input.getValue()); - for (Stub stub : outputs) + for (Stub stub : getOutputs()) outputTracker.addOutput(stub); outputTracker.prepareWalker(walker); } - - /** - * Returns the SAM File Header from the input reads' data source file - * @return the SAM File Header from the input reads' data source file - */ - public SAMFileHeader getSAMFileHeader() { - return readsDataSource.getHeader(); - } - - /** - * Returns the unmerged SAM file header for an individual reader. - * @param reader The reader. - * @return Header for that reader. 
- */ - public SAMFileHeader getSAMFileHeader(SAMReaderID reader) { - return readsDataSource.getHeader(reader); - } - - /** - * Returns data source object encapsulating all essential info and handlers used to traverse - * reads; header merger, individual file readers etc can be accessed through the returned data source object. - * - * @return the reads data source - */ - public SAMDataSource getDataSource() { - return this.readsDataSource; - } - - /** - * Gets the collection of GATK main application arguments for enhanced walker validation. - * - * @return the GATK argument collection - */ - public GATKArgumentCollection getArguments() { - return this.argCollection; - } - - /** - * Get the list of intervals passed to the engine. - * @return List of intervals. - */ - public GenomeLocSortedSet getIntervals() { - return this.intervals; - } - - /** - * Gets the list of filters employed by this walker. - * @return Collection of filters (actual instances) used by this walker. - */ - public Collection getFilters() { - return this.filters; - } - - /** - * Returns data source objects encapsulating all rod data; - * individual rods can be accessed through the returned data source objects. - * - * @return the rods data sources - */ - public List getRodDataSources() { - return this.rodDataSources; - } - - /** - * Gets cumulative metrics about the entire run to this point. - * @return cumulative metrics about the entire run. - */ - public ReadMetrics getCumulativeMetrics() { - return readsDataSource == null ? null : readsDataSource.getCumulativeReadMetrics(); - } - - /** - * Unpack the bam files to be processed, given a list of files. 
That list of files can - * itself contain entries which are lists of other files to be read (note: you cannot have lists of lists of lists) - * - * @param inputFiles a list of files that represent either bam files themselves, or a file containing a list of bam files to process - * - * @return a flattened list of the bam files provided - */ - private List unpackBAMFileList( List inputFiles ) { - List unpackedReads = new ArrayList(); - for( File inputFile: inputFiles ) { - if (inputFile.getName().toLowerCase().endsWith(".list") ) { - try { - for(String fileName : new XReadLines(inputFile)) - unpackedReads.add(new SAMReaderID(new File(fileName),getTags(inputFile))); - } - catch( FileNotFoundException ex ) { - throw new UserException.CouldNotReadInputFile(inputFile, "Unable to find file while unpacking reads", ex); - } - } - else if(inputFile.getName().toLowerCase().endsWith(".bam")) { - unpackedReads.add( new SAMReaderID(inputFile,getTags(inputFile)) ); - } - else if(inputFile.getName().equals("-")) { - unpackedReads.add(new SAMReaderID(new File("/dev/stdin"),Collections.emptyList())); - } - else { - throw new UserException.CommandLineException(String.format("The GATK reads argument (-I) supports only BAM files with the .bam extension and lists of BAM files " + - "with the .list extension, but the file %s has neither extension. 
Please ensure that your BAM file or list " + - "of BAM files is in the correct format, update the extension, and try again.",inputFile.getName())); - } - } - return unpackedReads; - } - - /** - * Get a sample by its ID - * If an alias is passed in, return the main sample object - * @param id - * @return sample Object with this ID - */ - public Sample getSampleById(String id) { - return sampleDataSource.getSampleById(id); - } - - /** - * Get the sample for a given read group - * Must first look up ID for read group - * @param readGroup of sample - * @return sample object with ID from the read group - */ - public Sample getSampleByReadGroup(SAMReadGroupRecord readGroup) { - return sampleDataSource.getSampleByReadGroup(readGroup); - } - - /** - * Get a sample for a given read - * Must first look up read group, and then sample ID for that read group - * @param read of sample - * @return sample object of this read - */ - public Sample getSampleByRead(SAMRecord read) { - return getSampleByReadGroup(read.getReadGroup()); - } - - /** - * Get number of sample objects - * @return size of samples map - */ - public int sampleCount() { - return sampleDataSource.sampleCount(); - } - - /** - * Return all samples with a given family ID - * Note that this isn't terribly efficient (linear) - it may be worth adding a new family ID data structure for this - * @param familyId - * @return - */ - public Set getFamily(String familyId) { - return sampleDataSource.getFamily(familyId); - } - - /** - * Returns all children of a given sample - * See note on the efficiency of getFamily() - since this depends on getFamily() it's also not efficient - * @param sample - * @return - */ - public Set getChildren(Sample sample) { - return sampleDataSource.getChildren(sample); - } - - /** - * Takes a list of sample names and returns their corresponding sample objects - * - * @param sampleNameList List of sample names - * @return Corresponding set of samples - */ - public Set getSamples(Collection 
sampleNameList) { - return sampleDataSource.getSamples(sampleNameList); - } - } diff --git a/java/src/org/broadinstitute/sting/gatk/ReadProperties.java b/java/src/org/broadinstitute/sting/gatk/ReadProperties.java index 6b9634c0b..8e9ec7b82 100755 --- a/java/src/org/broadinstitute/sting/gatk/ReadProperties.java +++ b/java/src/org/broadinstitute/sting/gatk/ReadProperties.java @@ -120,6 +120,7 @@ public class ReadProperties { * @param samFiles list of reads files. * @param strictness Stringency of reads file parsing. * @param readBufferSize Number of reads to hold in memory per BAM. + * @param downsamplingMethod Method for downsampling reads at a given locus. * @param exclusionList what safety checks we're willing to let slide * @param supplementalFilters additional filters to dynamically apply. * @param generateExtendedEvents if true, the engine will issue an extra call to walker's map() with @@ -129,7 +130,7 @@ public class ReadProperties { * will explicitly list reads with deletion over the current reference base; otherwise, only observed * bases will be seen in the pileups, and the deletions will be skipped silently. */ - ReadProperties( List samFiles, + public ReadProperties( List samFiles, SAMFileReader.ValidationStringency strictness, Integer readBufferSize, DownsamplingMethod downsamplingMethod, diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataSource.java index 4c0d3b4ac..b6373738a 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataSource.java @@ -47,12 +47,12 @@ public class ReferenceOrderedDataSource implements SimpleDataSource { * Create a new reference-ordered data source. 
* @param rod the reference ordered data */ - public ReferenceOrderedDataSource( Walker walker, RMDTrack rod) { + public ReferenceOrderedDataSource( RMDTrack rod, boolean flashbackData ) { this.rod = rod; if (rod.supportsQuery()) iteratorPool = new ReferenceOrderedQueryDataPool(new RMDTrackBuilder(),rod); else - iteratorPool = new ReferenceOrderedDataPool( walker, rod ); + iteratorPool = new ReferenceOrderedDataPool( rod, flashbackData ); } /** @@ -110,8 +110,8 @@ public class ReferenceOrderedDataSource implements SimpleDataSource { class ReferenceOrderedDataPool extends ResourcePool { private final RMDTrack rod; boolean flashbackData = false; - public ReferenceOrderedDataPool( Walker walker, RMDTrack rod ) { - if (walker instanceof ReadWalker) flashbackData = true; + public ReferenceOrderedDataPool( RMDTrack rod, boolean flashbackData ) { + this.flashbackData = flashbackData; this.rod = rod; } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilder.java b/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilder.java index e57e2b18b..20e6d6c09 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilder.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilder.java @@ -33,10 +33,10 @@ import org.broad.tribble.index.Index; import org.broad.tribble.index.IndexFactory; import org.broad.tribble.source.BasicFeatureSource; import org.broad.tribble.util.LittleEndianOutputStream; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackCreationException; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; +import org.broadinstitute.sting.gatk.AbstractGenomeAnalysisEngine; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.classloader.PluginManager; import 
org.broadinstitute.sting.utils.exceptions.UserException; @@ -367,7 +367,7 @@ public class RMDTrackBuilder extends PluginManager { * * @return a list of RMDTracks, one for each -B option */ - public List getReferenceMetaDataSources(GenomeAnalysisEngine engine, List bindings) { + public List getReferenceMetaDataSources(AbstractGenomeAnalysisEngine engine, List bindings) { initializeBindings(engine,bindings); // try and make the tracks given their requests return createRequestedTrackObjects(); @@ -378,7 +378,7 @@ public class RMDTrackBuilder extends PluginManager { * @param engine The engine, used to populate tags. * @param bindings the input to the GATK, as a list of strings passed in through the -B options */ - private void initializeBindings(GenomeAnalysisEngine engine,List bindings) { + private void initializeBindings(AbstractGenomeAnalysisEngine engine,List bindings) { // NOTE: Method acts as a static. Once the inputs have been passed once they are locked in. if (inputs.size() > 0 || bindings.size() == 0) return; diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java b/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java index 5cf701016..6b4ba7db6 100755 --- a/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java @@ -76,7 +76,7 @@ public class ReferenceOrderedViewUnitTest extends BaseTest { public void testSingleBinding() { File file = new File(testDir + "TabularDataTest.dat"); RMDTrack track = builder.createInstanceOfTrack(TableCodec.class,"tableTest",file); - ReferenceOrderedDataSource dataSource = new ReferenceOrderedDataSource(null,track); + ReferenceOrderedDataSource dataSource = new ReferenceOrderedDataSource(track,false); Shard shard = new 
MockLocusShard(Collections.singletonList(GenomeLocParser.createGenomeLoc("chrM",1,30))); @@ -100,9 +100,9 @@ public class ReferenceOrderedViewUnitTest extends BaseTest { RMDTrack track = builder.createInstanceOfTrack(TableCodec.class,"tableTest1",file); - ReferenceOrderedDataSource dataSource1 = new ReferenceOrderedDataSource(null,track); + ReferenceOrderedDataSource dataSource1 = new ReferenceOrderedDataSource(track,false); RMDTrack track2 = builder.createInstanceOfTrack(TableCodec.class,"tableTest2",file); - ReferenceOrderedDataSource dataSource2 = new ReferenceOrderedDataSource(null,track2); + ReferenceOrderedDataSource dataSource2 = new ReferenceOrderedDataSource(track2,false); Shard shard = new MockLocusShard(Collections.singletonList(GenomeLocParser.createGenomeLoc("chrM",1,30))); diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataPoolUnitTest.java b/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataPoolUnitTest.java index fe6460961..fb3d3b66e 100755 --- a/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataPoolUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataPoolUnitTest.java @@ -59,7 +59,7 @@ public class ReferenceOrderedDataPoolUnitTest extends BaseTest { @Test public void testCreateSingleIterator() { - ResourcePool iteratorPool = new ReferenceOrderedDataPool(null,rod); + ResourcePool iteratorPool = new ReferenceOrderedDataPool(rod, false); LocationAwareSeekableRODIterator iterator = (LocationAwareSeekableRODIterator)iteratorPool.iterator( new MappedStreamSegment(testSite1) ); Assert.assertEquals("Number of iterators in the pool is incorrect", 1, iteratorPool.numIterators()); @@ -80,7 +80,7 @@ public class ReferenceOrderedDataPoolUnitTest extends BaseTest { @Test public void testCreateMultipleIterators() { - ReferenceOrderedDataPool iteratorPool = new 
ReferenceOrderedDataPool(null,rod); + ReferenceOrderedDataPool iteratorPool = new ReferenceOrderedDataPool(rod, false); LocationAwareSeekableRODIterator iterator1 = iteratorPool.iterator( new MappedStreamSegment(testSite1) ); // Create a new iterator at position 2. @@ -130,7 +130,7 @@ public class ReferenceOrderedDataPoolUnitTest extends BaseTest { @Test public void testIteratorConservation() { - ReferenceOrderedDataPool iteratorPool = new ReferenceOrderedDataPool(null,rod); + ReferenceOrderedDataPool iteratorPool = new ReferenceOrderedDataPool(rod, false); LocationAwareSeekableRODIterator iterator = iteratorPool.iterator( new MappedStreamSegment(testSite1) ); Assert.assertEquals("Number of iterators in the pool is incorrect", 1, iteratorPool.numIterators()); @@ -165,7 +165,7 @@ public class ReferenceOrderedDataPoolUnitTest extends BaseTest { @Test public void testIteratorCreation() { - ReferenceOrderedDataPool iteratorPool = new ReferenceOrderedDataPool(null,rod); + ReferenceOrderedDataPool iteratorPool = new ReferenceOrderedDataPool(rod, false); LocationAwareSeekableRODIterator iterator = iteratorPool.iterator( new MappedStreamSegment(testSite3) ); Assert.assertEquals("Number of iterators in the pool is incorrect", 1, iteratorPool.numIterators());