From 09c7ea879d98a22d3146b009c25f44cd69a822bb Mon Sep 17 00:00:00 2001 From: hanna Date: Tue, 21 Dec 2010 02:09:46 +0000 Subject: [PATCH] Merging GenomeAnalysisEngine and AbstractGenomeAnalysisEngine back together. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4889 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/AbstractGenomeAnalysisEngine.java | 892 ------------------ .../sting/gatk/GenomeAnalysisEngine.java | 786 ++++++++++++++- .../sting/gatk/io/storage/StorageFactory.java | 2 - .../tracks/builders/RMDTrackBuilder.java | 8 +- 4 files changed, 778 insertions(+), 910 deletions(-) delete mode 100755 java/src/org/broadinstitute/sting/gatk/AbstractGenomeAnalysisEngine.java diff --git a/java/src/org/broadinstitute/sting/gatk/AbstractGenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/AbstractGenomeAnalysisEngine.java deleted file mode 100755 index dc0ea8ab6..000000000 --- a/java/src/org/broadinstitute/sting/gatk/AbstractGenomeAnalysisEngine.java +++ /dev/null @@ -1,892 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk; - -import net.sf.picard.filter.SamRecordFilter; -import net.sf.picard.reference.ReferenceSequenceFile; -import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.*; -import org.apache.log4j.Logger; -import org.broad.tribble.util.variantcontext.VariantContext; -import org.broadinstitute.sting.commandline.ArgumentSource; -import org.broadinstitute.sting.commandline.CommandLineUtils; -import org.broadinstitute.sting.commandline.ParsingEngine; -import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; -import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; -import org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource; -import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceDataSource; -import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource; -import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMReaderID; -import org.broadinstitute.sting.gatk.filters.FilterManager; -import org.broadinstitute.sting.gatk.filters.ReadGroupBlackListFilter; -import org.broadinstitute.sting.gatk.filters.SamRecordHeaderFilter; -import org.broadinstitute.sting.gatk.io.stubs.Stub; -import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; -import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder; -import org.broadinstitute.sting.gatk.refdata.utils.RMDIntervalGenerator; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.baq.BAQ; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.interval.IntervalMergingRule; -import org.broadinstitute.sting.utils.interval.IntervalUtils; -import org.broadinstitute.sting.utils.text.XReadLines; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.*; - -/** - * Parses a GATKArgumentCollection into various gatk objects and then executes() some function. - */ -public abstract class AbstractGenomeAnalysisEngine { - - /** - * The GATK command-line argument parsing code. - */ - private ParsingEngine parsingEngine; - - /** - * The genomeLocParser can create and parse GenomeLocs. - */ - private GenomeLocParser genomeLocParser; - - /** - * Accessor for sharded read data. - */ - private SAMDataSource readsDataSource = null; - - /** - * Needs to be - * @return - */ - public ReferenceDataSource getReferenceDataSource() { - return referenceDataSource; - } - - public GenomeLocParser getGenomeLocParser() { - return genomeLocParser; - } - - /** - * Accessor for sharded reference data. - */ - private ReferenceDataSource referenceDataSource = null; - - /** - * Accessor for sample metadata - */ - private SampleDataSource sampleDataSource = null; - - /** - * Accessor for sharded reference-ordered data. - */ - private List rodDataSources; - - // our argument collection - private GATKArgumentCollection argCollection; - - /** - * Collection of inputs used by the engine. - */ - private Map inputs = new HashMap(); - - /** - * Collection of intervals used by the engine. - */ - private GenomeLocSortedSet intervals = null; - - /** - * Collection of outputs used by the engine. - */ - private Collection> outputs = new ArrayList>(); - - /** - * Collection of the filters applied to the input data. - */ - private Collection filters; - - /** - * our log, which we want to capture anything from this class - */ - private static Logger logger = Logger.getLogger(AbstractGenomeAnalysisEngine.class); - - /** - * Manage lists of filters. - */ - private final FilterManager filterManager = new FilterManager(); - - private Date startTime = null; // the start time for execution - - public void setParser(ParsingEngine parsingEngine) { - this.parsingEngine = parsingEngine; - } - - /** - * Explicitly set the GenomeLocParser, for unit testing. - * @param genomeLocParser GenomeLocParser to use. - */ - public void setGenomeLocParser(GenomeLocParser genomeLocParser) { - this.genomeLocParser = genomeLocParser; - } - - /** - * Actually run the engine. - * @return the value of this traversal. - */ - public abstract Object execute(); - - protected abstract boolean flashbackData(); - protected abstract boolean includeReadsWithDeletionAtLoci(); - protected abstract boolean generateExtendedEvents(); - - /** - * Sets the start time when the execute() function was last called - * @param startTime the start time when the execute() function was last called - */ - protected void setStartTime(Date startTime) { - this.startTime = startTime; - } - - /** - * @return the start time when the execute() function was last called - */ - public Date getStartTime() { - return startTime; - } - - /** - * Setup the intervals to be processed - */ - protected void initializeIntervals() { - - // return if no interval arguments at all - if ((argCollection.intervals == null) && (argCollection.excludeIntervals == null) && (argCollection.RODToInterval == null)) - return; - - // if '-L all' was specified, verify that it was the only -L specified and return if so. - if(argCollection.intervals != null) { - for(String interval: argCollection.intervals) { - if(interval.trim().equals("all")) { - if(argCollection.intervals.size() > 1) - throw new UserException("'-L all' was specified along with other intervals or interval lists; the GATK cannot combine '-L all' with other intervals."); - - // '-L all' was specified and seems valid. Return. - return; - } - } - } - - // if include argument isn't given, create new set of all possible intervals - GenomeLocSortedSet includeSortedSet = (argCollection.intervals == null && argCollection.RODToInterval == null ? - GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getReference().getSequenceDictionary()) : - loadIntervals(argCollection.intervals, - argCollection.intervalMerging, - genomeLocParser.mergeIntervalLocations(checkRODToIntervalArgument(),argCollection.intervalMerging))); - - // if no exclude arguments, can return parseIntervalArguments directly - if (argCollection.excludeIntervals == null) - intervals = includeSortedSet; - - // otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets - else { - GenomeLocSortedSet excludeSortedSet = loadIntervals(argCollection.excludeIntervals, argCollection.intervalMerging, null); - intervals = includeSortedSet.subtractRegions(excludeSortedSet); - - // logging messages only printed when exclude (-XL) arguments are given - long toPruneSize = includeSortedSet.coveredSize(); - long toExcludeSize = excludeSortedSet.coveredSize(); - long intervalSize = intervals.coveredSize(); - logger.info(String.format("Initial include intervals span %d loci; exclude intervals span %d loci", toPruneSize, toExcludeSize)); - logger.info(String.format("Excluding %d loci from original intervals (%.2f%% reduction)", - toPruneSize - intervalSize, (toPruneSize - intervalSize) / (0.01 * toPruneSize))); - } - } - - /** - * Loads the intervals relevant to the current execution - * @param argList String representation of arguments; might include 'all', filenames, intervals in samtools - * notation, or a combination of the - * @param mergingRule Technique to use when merging interval data. - * @param additionalIntervals a list of additional intervals to add to the returned set. Can be null. - * @return A sorted, merged list of all intervals specified in this arg list. - */ - private GenomeLocSortedSet loadIntervals(List argList, - IntervalMergingRule mergingRule, - List additionalIntervals) { - - return IntervalUtils.sortAndMergeIntervals(genomeLocParser,IntervalUtils.mergeListsBySetOperator(additionalIntervals, - IntervalUtils.parseIntervalArguments(genomeLocParser,argList, - this.getArguments().unsafe != ValidationExclusion.TYPE.ALLOW_EMPTY_INTERVAL_LIST), - argCollection.BTIMergeRule), - mergingRule); - } - - /** - * if we have a ROD specified as a 'rodToIntervalTrackName', convert its records to RODs - * @return ROD intervals as GenomeLocs - */ - private List checkRODToIntervalArgument() { - Map rodNames = RMDIntervalGenerator.getRMDTrackNames(rodDataSources); - // Do we have any RODs that overloaded as interval lists with the 'rodToIntervalTrackName' flag? - List ret = new ArrayList(); - if (rodNames != null && argCollection.RODToInterval != null) { - String rodName = argCollection.RODToInterval; - - // check to make sure we have a rod of that name - if (!rodNames.containsKey(rodName)) - throw new UserException.CommandLineException("--rodToIntervalTrackName (-BTI) was passed the name '"+rodName+"', which wasn't given as a ROD name in the -B option"); - - for (String str : rodNames.keySet()) - if (str.equals(rodName)) { - logger.info("Adding interval list from track (ROD) named " + rodName); - RMDIntervalGenerator intervalGenerator = new RMDIntervalGenerator(rodNames.get(str).getReferenceOrderedData()); - ret.addAll(intervalGenerator.toGenomeLocList()); - } - } - return ret; - } - - /** - * Add additional, externally managed IO streams for inputs. - * - * @param argumentSource Field into which to inject the value. - * @param value Instance to inject. - */ - public void addInput(ArgumentSource argumentSource, Object value) { - inputs.put(argumentSource, value); - } - - /** - * Add additional, externally managed IO streams for output. - * - * @param stub Instance to inject. - */ - public void addOutput(Stub stub) { - outputs.add(stub); - } - - /** - * Gets the tags associated with a given object. - * @param key Key for which to find a tag. - * @return List of tags associated with this key. - */ - public List getTags(Object key) { - return parsingEngine.getTags(key); - } - - /** - * Gets a list of the filters to associate with the engine. Will NOT initialize the engine with this filters; - * the caller must handle that directly. - * @return A collection of available filters. - */ - public Collection createFilters() { - Set filters = new HashSet(); - if (this.getArguments().readGroupBlackList != null && this.getArguments().readGroupBlackList.size() > 0) - filters.add(new ReadGroupBlackListFilter(this.getArguments().readGroupBlackList)); - for(String filterName: this.getArguments().readFilters) - filters.add(this.getFilterManager().createByName(filterName)); - return Collections.unmodifiableSet(filters); - } - - protected void initializeDataSources() { - logger.info("Strictness is " + argCollection.strictnessLevel); - - // TODO -- REMOVE ME - BAQ.DEFAULT_GOP = argCollection.BAQGOP; - - validateSuppliedReference(); - referenceDataSource = openReferenceSequenceFile(argCollection.referenceFile); - - validateSuppliedReads(); - readsDataSource = createReadsDataSource(genomeLocParser, referenceDataSource.getReference()); - - sampleDataSource = new SampleDataSource(getSAMFileHeader(), argCollection.sampleFiles); - - for (SamRecordFilter filter : filters) - if (filter instanceof SamRecordHeaderFilter) - ((SamRecordHeaderFilter)filter).setHeader(this.getSAMFileHeader()); - - sampleDataSource = new SampleDataSource(getSAMFileHeader(), argCollection.sampleFiles); - - // set the sequence dictionary of all of Tribble tracks to the sequence dictionary of our reference - RMDTrackBuilder manager = new RMDTrackBuilder(referenceDataSource.getReference().getSequenceDictionary(),genomeLocParser,argCollection.unsafe); - List tracks = manager.getReferenceMetaDataSources(this,argCollection); - validateSuppliedReferenceOrderedData(tracks); - - // validate all the sequence dictionaries against the reference - validateSourcesAgainstReference(readsDataSource, referenceDataSource.getReference(), tracks, manager); - - rodDataSources = getReferenceOrderedDataSources(tracks); - } - - /** - * Gets a unique identifier for the reader sourcing this read. - * @param read Read to examine. - * @return A unique identifier for the source file of this read. Exception if not found. - */ - public SAMReaderID getReaderIDForRead(final SAMRecord read) { - return getReadsDataSource().getReaderID(read); - } - - /** - * Gets the source file for this read. - * @param id Unique identifier determining which input file to use. - * @return The source filename for this read. - */ - public File getSourceFileForReaderID(final SAMReaderID id) { - return getReadsDataSource().getSAMFile(id); - } - - /** - * Returns sets of samples present in the (merged) input SAM stream, grouped by readers (i.e. underlying - * individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list - * returned by this method will contain 3 elements (one for each reader), with each element being a set of sample names - * found in the corresponding bam file. - * - * @return Sets of samples in the merged input SAM stream, grouped by readers - */ - public List> getSamplesByReaders() { - List readers = getReadsDataSource().getReaderIDs(); - - List> sample_sets = new ArrayList>(readers.size()); - - for (SAMReaderID r : readers) { - - Set samples = new HashSet(1); - sample_sets.add(samples); - - for (SAMReadGroupRecord g : getReadsDataSource().getHeader(r).getReadGroups()) { - samples.add(g.getSample()); - } - } - - return sample_sets; - - } - - /** - * Returns sets of libraries present in the (merged) input SAM stream, grouped by readers (i.e. underlying - * individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list - * returned by this method will contain 3 elements (one for each reader), with each element being a set of library names - * found in the corresponding bam file. - * - * @return Sets of libraries present in the (merged) input SAM stream, grouped by readers - */ - public List> getLibrariesByReaders() { - - - List readers = getReadsDataSource().getReaderIDs(); - - List> lib_sets = new ArrayList>(readers.size()); - - for (SAMReaderID r : readers) { - - Set libs = new HashSet(2); - lib_sets.add(libs); - - for (SAMReadGroupRecord g : getReadsDataSource().getHeader(r).getReadGroups()) { - libs.add(g.getLibrary()); - } - } - - return lib_sets; - - } - - /** - * Returns a mapping from original input files to their (merged) read group ids - * - * @return the mapping - */ - public Map> getFileToReadGroupIdMapping() { - // populate the file -> read group mapping - Map> fileToReadGroupIdMap = new HashMap>(); - for (SAMReaderID id: getReadsDataSource().getReaderIDs()) { - Set readGroups = new HashSet(5); - - for (SAMReadGroupRecord g : getReadsDataSource().getHeader(id).getReadGroups()) { - if (getReadsDataSource().hasReadGroupCollisions()) { - // Check if there were read group clashes. - // If there were, use the SamFileHeaderMerger to translate from the - // original read group id to the read group id in the merged stream - readGroups.add(getReadsDataSource().getReadGroupId(id,g.getReadGroupId())); - } else { - // otherwise, pass through the unmapped read groups since this is what Picard does as well - readGroups.add(g.getReadGroupId()); - } - } - - fileToReadGroupIdMap.put(getReadsDataSource().getSAMFile(id),readGroups); - } - - return fileToReadGroupIdMap; - } - - /** - * **** UNLESS YOU HAVE GOOD REASON TO, DO NOT USE THIS METHOD; USE getFileToReadGroupIdMapping() INSTEAD **** - * - * Returns sets of (remapped) read groups in input SAM stream, grouped by readers (i.e. underlying - * individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list - * returned by this method will contain 3 elements (one for each reader), with each element being a set of remapped read groups - * (i.e. as seen by read.getReadGroup().getReadGroupId() in the merged stream) that come from the corresponding bam file. - * - * @return sets of (merged) read group ids in order of input bams - */ - public List> getMergedReadGroupsByReaders() { - - - List readers = getReadsDataSource().getReaderIDs(); - - List> rg_sets = new ArrayList>(readers.size()); - - for (SAMReaderID r : readers) { - - Set groups = new HashSet(5); - rg_sets.add(groups); - - for (SAMReadGroupRecord g : getReadsDataSource().getHeader(r).getReadGroups()) { - if (getReadsDataSource().hasReadGroupCollisions()) { // Check if there were read group clashes with hasGroupIdDuplicates and if so: - // use HeaderMerger to translate original read group id from the reader into the read group id in the - // merged stream, and save that remapped read group id to associate it with specific reader - groups.add(getReadsDataSource().getReadGroupId(r, g.getReadGroupId())); - } else { - // otherwise, pass through the unmapped read groups since this is what Picard does as well - groups.add(g.getReadGroupId()); - } - } - } - - return rg_sets; - - } - - protected DownsamplingMethod getDownsamplingMethod() { - DownsamplingMethod method; - if(argCollection.getDownsamplingMethod() != null) - method = argCollection.getDownsamplingMethod(); - else - method = argCollection.getDefaultDownsamplingMethod(); - return method; - } - - protected abstract void validateSuppliedReads(); - protected abstract void validateSuppliedReference(); - protected abstract void validateSuppliedReferenceOrderedData(List rods); - - /** - * Now that all files are open, validate the sequence dictionaries of the reads vs. the reference vrs the reference ordered data (if available). - * - * @param reads Reads data source. - * @param reference Reference data source. - * @param tracks a collection of the reference ordered data tracks - */ - private void validateSourcesAgainstReference(SAMDataSource reads, ReferenceSequenceFile reference, Collection tracks, RMDTrackBuilder manager) { - if ((reads.isEmpty() && (tracks == null || tracks.isEmpty())) || reference == null ) - return; - - // Compile a set of sequence names that exist in the reference file. - SAMSequenceDictionary referenceDictionary = reference.getSequenceDictionary(); - - if (!reads.isEmpty()) { - // Compile a set of sequence names that exist in the BAM files. - SAMSequenceDictionary readsDictionary = reads.getHeader().getSequenceDictionary(); - - Set readsSequenceNames = new TreeSet(); - for (SAMSequenceRecord dictionaryEntry : readsDictionary.getSequences()) - readsSequenceNames.add(dictionaryEntry.getSequenceName()); - - - if (readsSequenceNames.size() == 0) { - logger.info("Reads file is unmapped. Skipping validation against reference."); - return; - } - - // compare the reads to the reference - SequenceDictionaryUtils.validateDictionaries(logger, getArguments().unsafe, "reads", readsDictionary, "reference", referenceDictionary); - } - - // compare the tracks to the reference, if they have a sequence dictionary - for (RMDTrack track : tracks) - manager.validateTrackSequenceDictionary(track.getName(),track.getSequenceDictionary(),referenceDictionary); - } - - /** - * Gets a data source for the given set of reads. - * - * @return A data source for the given set of reads. - */ - private SAMDataSource createReadsDataSource(GenomeLocParser genomeLocParser, IndexedFastaSequenceFile refReader) { - DownsamplingMethod method = getDownsamplingMethod(); - - if ( getWalkerBAQApplicationTime() == BAQ.ApplicationTime.FORBIDDEN && argCollection.BAQMode != BAQ.CalculationMode.OFF) - throw new UserException.BadArgumentValue("baq", "Walker cannot accept BAQ'd base qualities, and yet BAQ mode " + argCollection.BAQMode + " was requested."); - - return new SAMDataSource( - unpackBAMFileList(argCollection.samFiles), - genomeLocParser, - argCollection.useOriginalBaseQualities, - argCollection.strictnessLevel, - argCollection.readBufferSize, - method, - new ValidationExclusion(Arrays.asList(argCollection.unsafe)), - filters, - includeReadsWithDeletionAtLoci(), - generateExtendedEvents(), - getWalkerBAQApplicationTime() == BAQ.ApplicationTime.ON_INPUT ? argCollection.BAQMode : BAQ.CalculationMode.OFF, - getWalkerBAQQualityMode(), - refReader); - } - - /** - * Overloaded to break abstraction. Thanks Khalid :-( - * @return - */ - public BAQ.QualityMode getWalkerBAQQualityMode() { return BAQ.QualityMode.DONT_MODIFY; } - public BAQ.ApplicationTime getWalkerBAQApplicationTime() { return BAQ.ApplicationTime.ON_INPUT; } - - /** - * Opens a reference sequence file paired with an index. - * - * @param refFile Handle to a reference sequence file. Non-null. - * @return A thread-safe file wrapper. - */ - private ReferenceDataSource openReferenceSequenceFile(File refFile) { - ReferenceDataSource ref = new ReferenceDataSource(refFile); - genomeLocParser = new GenomeLocParser(ref.getReference()); - return ref; - } - - /** - * Open the reference-ordered data sources. - * - * @param rods the reference order data to execute using - * @return A list of reference-ordered data sources. - */ - private List getReferenceOrderedDataSources(List rods) { - List dataSources = new ArrayList(); - for (RMDTrack rod : rods) - dataSources.add(new ReferenceOrderedDataSource(referenceDataSource.getReference().getSequenceDictionary(), - genomeLocParser, - argCollection.unsafe, - rod, - flashbackData())); - return dataSources; - } - - /** - * Returns the SAM File Header from the input reads' data source file - * @return the SAM File Header from the input reads' data source file - */ - public SAMFileHeader getSAMFileHeader() { - return readsDataSource.getHeader(); - } - - /** - * Returns the unmerged SAM file header for an individual reader. - * @param reader The reader. - * @return Header for that reader. - */ - public SAMFileHeader getSAMFileHeader(SAMReaderID reader) { - return readsDataSource.getHeader(reader); - } - - /** - * Returns data source object encapsulating all essential info and handlers used to traverse - * reads; header merger, individual file readers etc can be accessed through the returned data source object. - * - * @return the reads data source - */ - public SAMDataSource getReadsDataSource() { - return this.readsDataSource; - } - - - - /** - * Sets the collection of GATK main application arguments. - * - * @param argCollection the GATK argument collection - */ - public void setArguments(GATKArgumentCollection argCollection) { - this.argCollection = argCollection; - } - - /** - * Gets the collection of GATK main application arguments. - * - * @return the GATK argument collection - */ - public GATKArgumentCollection getArguments() { - return this.argCollection; - } - - /** - * Get the list of intervals passed to the engine. - * @return List of intervals. - */ - public GenomeLocSortedSet getIntervals() { - return this.intervals; - } - - /** - * Gets the list of filters employed by this engine. - * @return Collection of filters (actual instances) used by this engine. - */ - public Collection getFilters() { - return this.filters; - } - - /** - * Sets the list of filters employed by this engine. - * @param filters Collection of filters (actual instances) used by this engine. - */ - public void setFilters(Collection filters) { - this.filters = filters; - } - - /** - * Gets the filter manager for this engine. - * @return filter manager for this engine. - */ - protected FilterManager getFilterManager() { - return filterManager; - } - - /** - * Gets the input sources for this engine. - * @return input sources for this engine. - */ - protected Map getInputs() { - return inputs; - } - - /** - * Gets the output stubs for this engine. - * @return output stubs for this engine. - */ - protected Collection> getOutputs() { - return outputs; - } - - /** - * Returns data source objects encapsulating all rod data; - * individual rods can be accessed through the returned data source objects. - * - * @return the rods data sources - */ - public List getRodDataSources() { - return this.rodDataSources; - } - - /** - * Gets cumulative metrics about the entire run to this point. - * @return cumulative metrics about the entire run. - */ - public ReadMetrics getCumulativeMetrics() { - return readsDataSource == null ? null : readsDataSource.getCumulativeReadMetrics(); - } - - /** - * Unpack the bam files to be processed, given a list of files. That list of files can - * itself contain entries which are lists of other files to be read (note: you cannot have lists of lists of lists) - * - * @param inputFiles a list of files that represent either bam files themselves, or a file containing a list of bam files to process - * - * @return a flattened list of the bam files provided - */ - private List unpackBAMFileList( List inputFiles ) { - List unpackedReads = new ArrayList(); - for( File inputFile: inputFiles ) { - if (inputFile.getName().toLowerCase().endsWith(".list") ) { - try { - for(String fileName : new XReadLines(inputFile)) - unpackedReads.add(new SAMReaderID(new File(fileName),getTags(inputFile))); - } - catch( FileNotFoundException ex ) { - throw new UserException.CouldNotReadInputFile(inputFile, "Unable to find file while unpacking reads", ex); - } - } - else if(inputFile.getName().toLowerCase().endsWith(".bam")) { - unpackedReads.add( new SAMReaderID(inputFile,getTags(inputFile)) ); - } - else if(inputFile.getName().equals("-")) { - unpackedReads.add(new SAMReaderID(new File("/dev/stdin"),Collections.emptyList())); - } - else { - throw new UserException.CommandLineException(String.format("The GATK reads argument (-I) supports only BAM files with the .bam extension and lists of BAM files " + - "with the .list extension, but the file %s has neither extension. Please ensure that your BAM file or list " + - "of BAM files is in the correct format, update the extension, and try again.",inputFile.getName())); - } - } - return unpackedReads; - } - - public SampleDataSource getSampleMetadata() { - return this.sampleDataSource; - } - - /** - * Get a sample by its ID - * If an alias is passed in, return the main sample object - * @param id sample id - * @return sample Object with this ID - */ - public Sample getSampleById(String id) { - return sampleDataSource.getSampleById(id); - } - - /** - * Get the sample for a given read group - * Must first look up ID for read group - * @param readGroup of sample - * @return sample object with ID from the read group - */ - public Sample getSampleByReadGroup(SAMReadGroupRecord readGroup) { - return sampleDataSource.getSampleByReadGroup(readGroup); - } - - /** - * Get a sample for a given read - * Must first look up read group, and then sample ID for that read group - * @param read of sample - * @return sample object of this read - */ - public Sample getSampleByRead(SAMRecord read) { - return getSampleByReadGroup(read.getReadGroup()); - } - - /** - * Get number of sample objects - * @return size of samples map - */ - public int sampleCount() { - return sampleDataSource.sampleCount(); - } - - /** - * Return all samples with a given family ID - * Note that this isn't terribly efficient (linear) - it may be worth adding a new family ID data structure for this - * @param familyId family ID - * @return Samples with the given family ID - */ - public Set getFamily(String familyId) { - return sampleDataSource.getFamily(familyId); - } - - /** - * Returns all children of a given sample - * See note on the efficiency of getFamily() - since this depends on getFamily() it's also not efficient - * @param sample parent sample - * @return children of the given sample - */ - public Set getChildren(Sample sample) { - return sampleDataSource.getChildren(sample); - } - - /** - * Gets all the samples - * @return - */ - public Collection getSamples() { - return sampleDataSource.getSamples(); - } - - /** - * Takes a list of sample names and returns their corresponding sample objects - * - * @param sampleNameList List of sample names - * @return Corresponding set of samples - */ - public Set getSamples(Collection sampleNameList) { - return sampleDataSource.getSamples(sampleNameList); - } - - - /** - * Returns a set of samples that have any value (which could be null) for a given property - * @param key Property key - * @return Set of samples with the property - */ - public Set getSamplesWithProperty(String key) { - return sampleDataSource.getSamplesWithProperty(key); - } - - /** - * Returns a set of samples that have a property with a certain value - * Value must be a string for now - could add a similar method for matching any objects in the future - * - * @param key Property key - * @param value String property value - * @return Set of samples that match key and value - */ - public Set getSamplesWithProperty(String key, String value) { - return sampleDataSource.getSamplesWithProperty(key, value); - - } - - /** - * Returns a set of sample objects for the sample names in a variant context - * - * @param context Any variant context - * @return a set of the sample objects - */ - public Set getSamplesByVariantContext(VariantContext context) { - Set samples = new HashSet(); - for (String sampleName : context.getSampleNames()) { - samples.add(sampleDataSource.getOrCreateSample(sampleName)); - } - return samples; - } - - /** - * Returns all samples that were referenced in the SAM file - */ - public Set getSAMFileSamples() { - return sampleDataSource.getSAMFileSamples(); - } - - /** - * Return a subcontext restricted to samples with a given property key/value - * Gets the sample names from key/value and relies on VariantContext.subContextFromGenotypes for the filtering - * @param context VariantContext to filter - * @param key property key - * @param value property value (must be string) - * @return subcontext - */ - public VariantContext subContextFromSampleProperty(VariantContext context, String key, String value) { - return sampleDataSource.subContextFromSampleProperty(context, key, value); - } - - public Map getApproximateCommandLineArguments(Object... argumentProviders) { - return CommandLineUtils.getApproximateCommandLineArguments(parsingEngine,argumentProviders); - } - - public String createApproximateCommandLineArgumentString(Object... argumentProviders) { - return CommandLineUtils.createApproximateCommandLineArgumentString(parsingEngine,argumentProviders); - } -} diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 914cbcc05..78e5b21ed 100755 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -25,38 +25,114 @@ package org.broadinstitute.sting.gatk; import net.sf.picard.filter.SamRecordFilter; +import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.reference.ReferenceSequenceFile; -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMSequenceRecord; +import net.sf.samtools.*; +import org.apache.log4j.Logger; import org.broadinstitute.sting.commandline.ArgumentException; import org.broadinstitute.sting.commandline.ArgumentSource; import org.broad.tribble.util.variantcontext.VariantContext; +import org.broadinstitute.sting.commandline.CommandLineUtils; +import org.broadinstitute.sting.commandline.ParsingEngine; import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; +import org.broadinstitute.sting.gatk.datasources.sample.Sample; +import org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource; import org.broadinstitute.sting.gatk.datasources.shards.MonolithicShardStrategy; import org.broadinstitute.sting.gatk.datasources.shards.Shard; import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategy; import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategyFactory; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceDataSource; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMReaderID; import org.broadinstitute.sting.gatk.executive.MicroScheduler; +import org.broadinstitute.sting.gatk.filters.FilterManager; +import org.broadinstitute.sting.gatk.filters.ReadGroupBlackListFilter; +import org.broadinstitute.sting.gatk.filters.SamRecordHeaderFilter; import org.broadinstitute.sting.gatk.io.OutputTracker; import org.broadinstitute.sting.gatk.io.stubs.Stub; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; +import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder; +import org.broadinstitute.sting.gatk.refdata.utils.RMDIntervalGenerator; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.SequenceDictionaryUtils; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.interval.IntervalMergingRule; +import org.broadinstitute.sting.utils.interval.IntervalUtils; +import org.broadinstitute.sting.utils.text.XReadLines; +import java.io.File; +import java.io.FileNotFoundException; import java.util.*; /** * A GenomeAnalysisEngine that runs a specified walker. */ -public class GenomeAnalysisEngine extends AbstractGenomeAnalysisEngine { +public class GenomeAnalysisEngine { + /** + * our log, which we want to capture anything from this class + */ + private static Logger logger = Logger.getLogger(GenomeAnalysisEngine.class); + + /** + * The GATK command-line argument parsing code. + */ + private ParsingEngine parsingEngine; + + /** + * The genomeLocParser can create and parse GenomeLocs. + */ + private GenomeLocParser genomeLocParser; + + /** + * Accessor for sharded read data. + */ + private SAMDataSource readsDataSource = null; + + /** + * Accessor for sharded reference data. + */ + private ReferenceDataSource referenceDataSource = null; + + /** + * Accessor for sample metadata + */ + private SampleDataSource sampleDataSource = null; + + /** + * Accessor for sharded reference-ordered data. + */ + private List rodDataSources; + + // our argument collection + private GATKArgumentCollection argCollection; + + /** + * Collection of intervals used by the engine. + */ + private GenomeLocSortedSet intervals = null; + + /** + * Collection of inputs used by the engine. + */ + private Map inputs = new HashMap(); + + /** + * Collection of outputs used by the engine. + */ + private Collection> outputs = new ArrayList>(); + + /** + * Collection of the filters applied to the input data. + */ + private Collection filters; + /** * our walker manager */ @@ -135,11 +211,13 @@ public class GenomeAnalysisEngine extends AbstractGenomeAnalysisEngine { * the caller must handle that directly. * @return A collection of available filters. */ - @Override public Collection createFilters() { Set filters = new HashSet(); filters.addAll(WalkerManager.getReadFilters(walker,this.getFilterManager())); - filters.addAll(super.createFilters()); + if (this.getArguments().readGroupBlackList != null && this.getArguments().readGroupBlackList.size() > 0) + filters.add(new ReadGroupBlackListFilter(this.getArguments().readGroupBlackList)); + for(String filterName: this.getArguments().readFilters) + filters.add(this.getFilterManager().createByName(filterName)); return Collections.unmodifiableSet(filters); } @@ -171,7 +249,6 @@ public class GenomeAnalysisEngine extends AbstractGenomeAnalysisEngine { return MicroScheduler.create(this,my_walker,this.getReadsDataSource(),this.getReferenceDataSource().getReference(),this.getRodDataSources(),this.getArguments().numberOfThreads); } - @Override protected DownsamplingMethod getDownsamplingMethod() { GATKArgumentCollection argCollection = this.getArguments(); DownsamplingMethod method; @@ -187,12 +264,10 @@ public class GenomeAnalysisEngine extends AbstractGenomeAnalysisEngine { public BAQ.QualityMode getWalkerBAQQualityMode() { return WalkerManager.getBAQQualityMode(walker); } public BAQ.ApplicationTime getWalkerBAQApplicationTime() { return WalkerManager.getBAQApplicationTime(walker); } - @Override protected boolean generateExtendedEvents() { return walker.generateExtendedEvents(); } - @Override protected boolean includeReadsWithDeletionAtLoci() { return walker.includeReadsWithDeletionAtLoci(); } @@ -200,7 +275,6 @@ public class GenomeAnalysisEngine extends AbstractGenomeAnalysisEngine { /** * Verifies that the supplied set of reads files mesh with what the walker says it requires. */ - @Override protected void validateSuppliedReads() { GATKArgumentCollection arguments = this.getArguments(); // Check what the walker says is required against what was provided on the command line. @@ -215,7 +289,6 @@ public class GenomeAnalysisEngine extends AbstractGenomeAnalysisEngine { /** * Verifies that the supplied reference file mesh with what the walker says it requires. */ - @Override protected void validateSuppliedReference() { GATKArgumentCollection arguments = this.getArguments(); // Check what the walker says is required against what was provided on the command line. @@ -234,7 +307,6 @@ public class GenomeAnalysisEngine extends AbstractGenomeAnalysisEngine { * * @param rods Reference-ordered data to load. */ - @Override protected void validateSuppliedReferenceOrderedData(List rods) { // Check to make sure that all required metadata is present. List allRequired = WalkerManager.getRequiredMetaData(walker); @@ -370,7 +442,6 @@ public class GenomeAnalysisEngine extends AbstractGenomeAnalysisEngine { return shardStrategy; } - @Override protected boolean flashbackData() { return walker instanceof ReadWalker; } @@ -389,4 +460,695 @@ public class GenomeAnalysisEngine extends AbstractGenomeAnalysisEngine { outputTracker.prepareWalker(walker, getArguments().strictnessLevel); } + public ReferenceDataSource getReferenceDataSource() { + return referenceDataSource; + } + + public GenomeLocParser getGenomeLocParser() { + return genomeLocParser; + } + + /** + * Manage lists of filters. + */ + private final FilterManager filterManager = new FilterManager(); + + private Date startTime = null; // the start time for execution + + public void setParser(ParsingEngine parsingEngine) { + this.parsingEngine = parsingEngine; + } + + /** + * Explicitly set the GenomeLocParser, for unit testing. + * @param genomeLocParser GenomeLocParser to use. + */ + public void setGenomeLocParser(GenomeLocParser genomeLocParser) { + this.genomeLocParser = genomeLocParser; + } + + /** + * Sets the start time when the execute() function was last called + * @param startTime the start time when the execute() function was last called + */ + protected void setStartTime(Date startTime) { + this.startTime = startTime; + } + + /** + * @return the start time when the execute() function was last called + */ + public Date getStartTime() { + return startTime; + } + + /** + * Setup the intervals to be processed + */ + protected void initializeIntervals() { + + // return if no interval arguments at all + if ((argCollection.intervals == null) && (argCollection.excludeIntervals == null) && (argCollection.RODToInterval == null)) + return; + + // if '-L all' was specified, verify that it was the only -L specified and return if so. + if(argCollection.intervals != null) { + for(String interval: argCollection.intervals) { + if(interval.trim().equals("all")) { + if(argCollection.intervals.size() > 1) + throw new UserException("'-L all' was specified along with other intervals or interval lists; the GATK cannot combine '-L all' with other intervals."); + + // '-L all' was specified and seems valid. Return. + return; + } + } + } + + // if include argument isn't given, create new set of all possible intervals + GenomeLocSortedSet includeSortedSet = (argCollection.intervals == null && argCollection.RODToInterval == null ? + GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getReference().getSequenceDictionary()) : + loadIntervals(argCollection.intervals, + argCollection.intervalMerging, + genomeLocParser.mergeIntervalLocations(checkRODToIntervalArgument(),argCollection.intervalMerging))); + + // if no exclude arguments, can return parseIntervalArguments directly + if (argCollection.excludeIntervals == null) + intervals = includeSortedSet; + + // otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets + else { + GenomeLocSortedSet excludeSortedSet = loadIntervals(argCollection.excludeIntervals, argCollection.intervalMerging, null); + intervals = includeSortedSet.subtractRegions(excludeSortedSet); + + // logging messages only printed when exclude (-XL) arguments are given + long toPruneSize = includeSortedSet.coveredSize(); + long toExcludeSize = excludeSortedSet.coveredSize(); + long intervalSize = intervals.coveredSize(); + logger.info(String.format("Initial include intervals span %d loci; exclude intervals span %d loci", toPruneSize, toExcludeSize)); + logger.info(String.format("Excluding %d loci from original intervals (%.2f%% reduction)", + toPruneSize - intervalSize, (toPruneSize - intervalSize) / (0.01 * toPruneSize))); + } + } + + /** + * Loads the intervals relevant to the current execution + * @param argList String representation of arguments; might include 'all', filenames, intervals in samtools + * notation, or a combination of the + * @param mergingRule Technique to use when merging interval data. + * @param additionalIntervals a list of additional intervals to add to the returned set. Can be null. + * @return A sorted, merged list of all intervals specified in this arg list. + */ + private GenomeLocSortedSet loadIntervals(List argList, + IntervalMergingRule mergingRule, + List additionalIntervals) { + + return IntervalUtils.sortAndMergeIntervals(genomeLocParser,IntervalUtils.mergeListsBySetOperator(additionalIntervals, + IntervalUtils.parseIntervalArguments(genomeLocParser,argList, + this.getArguments().unsafe != ValidationExclusion.TYPE.ALLOW_EMPTY_INTERVAL_LIST), + argCollection.BTIMergeRule), + mergingRule); + } + + /** + * if we have a ROD specified as a 'rodToIntervalTrackName', convert its records to RODs + * @return ROD intervals as GenomeLocs + */ + private List checkRODToIntervalArgument() { + Map rodNames = RMDIntervalGenerator.getRMDTrackNames(rodDataSources); + // Do we have any RODs that overloaded as interval lists with the 'rodToIntervalTrackName' flag? + List ret = new ArrayList(); + if (rodNames != null && argCollection.RODToInterval != null) { + String rodName = argCollection.RODToInterval; + + // check to make sure we have a rod of that name + if (!rodNames.containsKey(rodName)) + throw new UserException.CommandLineException("--rodToIntervalTrackName (-BTI) was passed the name '"+rodName+"', which wasn't given as a ROD name in the -B option"); + + for (String str : rodNames.keySet()) + if (str.equals(rodName)) { + logger.info("Adding interval list from track (ROD) named " + rodName); + RMDIntervalGenerator intervalGenerator = new RMDIntervalGenerator(rodNames.get(str).getReferenceOrderedData()); + ret.addAll(intervalGenerator.toGenomeLocList()); + } + } + return ret; + } + + /** + * Add additional, externally managed IO streams for inputs. + * + * @param argumentSource Field into which to inject the value. + * @param value Instance to inject. + */ + public void addInput(ArgumentSource argumentSource, Object value) { + inputs.put(argumentSource, value); + } + + /** + * Add additional, externally managed IO streams for output. + * + * @param stub Instance to inject. + */ + public void addOutput(Stub stub) { + outputs.add(stub); + } + + /** + * Gets the tags associated with a given object. + * @param key Key for which to find a tag. + * @return List of tags associated with this key. + */ + public List getTags(Object key) { + return parsingEngine.getTags(key); + } + + protected void initializeDataSources() { + logger.info("Strictness is " + argCollection.strictnessLevel); + + // TODO -- REMOVE ME + BAQ.DEFAULT_GOP = argCollection.BAQGOP; + + validateSuppliedReference(); + referenceDataSource = openReferenceSequenceFile(argCollection.referenceFile); + + validateSuppliedReads(); + readsDataSource = createReadsDataSource(genomeLocParser, referenceDataSource.getReference()); + + sampleDataSource = new SampleDataSource(getSAMFileHeader(), argCollection.sampleFiles); + + for (SamRecordFilter filter : filters) + if (filter instanceof SamRecordHeaderFilter) + ((SamRecordHeaderFilter)filter).setHeader(this.getSAMFileHeader()); + + sampleDataSource = new SampleDataSource(getSAMFileHeader(), argCollection.sampleFiles); + + // set the sequence dictionary of all of Tribble tracks to the sequence dictionary of our reference + RMDTrackBuilder manager = new RMDTrackBuilder(referenceDataSource.getReference().getSequenceDictionary(),genomeLocParser,argCollection.unsafe); + List tracks = manager.getReferenceMetaDataSources(this,argCollection); + validateSuppliedReferenceOrderedData(tracks); + + // validate all the sequence dictionaries against the reference + validateSourcesAgainstReference(readsDataSource, referenceDataSource.getReference(), tracks, manager); + + rodDataSources = getReferenceOrderedDataSources(tracks); + } + + /** + * Gets a unique identifier for the reader sourcing this read. + * @param read Read to examine. + * @return A unique identifier for the source file of this read. Exception if not found. + */ + public SAMReaderID getReaderIDForRead(final SAMRecord read) { + return getReadsDataSource().getReaderID(read); + } + + /** + * Gets the source file for this read. + * @param id Unique identifier determining which input file to use. + * @return The source filename for this read. + */ + public File getSourceFileForReaderID(final SAMReaderID id) { + return getReadsDataSource().getSAMFile(id); + } + + /** + * Returns sets of samples present in the (merged) input SAM stream, grouped by readers (i.e. underlying + * individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list + * returned by this method will contain 3 elements (one for each reader), with each element being a set of sample names + * found in the corresponding bam file. + * + * @return Sets of samples in the merged input SAM stream, grouped by readers + */ + public List> getSamplesByReaders() { + List readers = getReadsDataSource().getReaderIDs(); + + List> sample_sets = new ArrayList>(readers.size()); + + for (SAMReaderID r : readers) { + + Set samples = new HashSet(1); + sample_sets.add(samples); + + for (SAMReadGroupRecord g : getReadsDataSource().getHeader(r).getReadGroups()) { + samples.add(g.getSample()); + } + } + + return sample_sets; + + } + + /** + * Returns sets of libraries present in the (merged) input SAM stream, grouped by readers (i.e. underlying + * individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list + * returned by this method will contain 3 elements (one for each reader), with each element being a set of library names + * found in the corresponding bam file. + * + * @return Sets of libraries present in the (merged) input SAM stream, grouped by readers + */ + public List> getLibrariesByReaders() { + + + List readers = getReadsDataSource().getReaderIDs(); + + List> lib_sets = new ArrayList>(readers.size()); + + for (SAMReaderID r : readers) { + + Set libs = new HashSet(2); + lib_sets.add(libs); + + for (SAMReadGroupRecord g : getReadsDataSource().getHeader(r).getReadGroups()) { + libs.add(g.getLibrary()); + } + } + + return lib_sets; + + } + + /** + * **** UNLESS YOU HAVE GOOD REASON TO, DO NOT USE THIS METHOD; USE getFileToReadGroupIdMapping() INSTEAD **** + * + * Returns sets of (remapped) read groups in input SAM stream, grouped by readers (i.e. underlying + * individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list + * returned by this method will contain 3 elements (one for each reader), with each element being a set of remapped read groups + * (i.e. as seen by read.getReadGroup().getReadGroupId() in the merged stream) that come from the corresponding bam file. + * + * @return sets of (merged) read group ids in order of input bams + */ + public List> getMergedReadGroupsByReaders() { + + + List readers = getReadsDataSource().getReaderIDs(); + + List> rg_sets = new ArrayList>(readers.size()); + + for (SAMReaderID r : readers) { + + Set groups = new HashSet(5); + rg_sets.add(groups); + + for (SAMReadGroupRecord g : getReadsDataSource().getHeader(r).getReadGroups()) { + if (getReadsDataSource().hasReadGroupCollisions()) { // Check if there were read group clashes with hasGroupIdDuplicates and if so: + // use HeaderMerger to translate original read group id from the reader into the read group id in the + // merged stream, and save that remapped read group id to associate it with specific reader + groups.add(getReadsDataSource().getReadGroupId(r, g.getReadGroupId())); + } else { + // otherwise, pass through the unmapped read groups since this is what Picard does as well + groups.add(g.getReadGroupId()); + } + } + } + + return rg_sets; + + } + + /** + * Now that all files are open, validate the sequence dictionaries of the reads vs. the reference vrs the reference ordered data (if available). + * + * @param reads Reads data source. + * @param reference Reference data source. + * @param tracks a collection of the reference ordered data tracks + */ + private void validateSourcesAgainstReference(SAMDataSource reads, ReferenceSequenceFile reference, Collection tracks, RMDTrackBuilder manager) { + if ((reads.isEmpty() && (tracks == null || tracks.isEmpty())) || reference == null ) + return; + + // Compile a set of sequence names that exist in the reference file. + SAMSequenceDictionary referenceDictionary = reference.getSequenceDictionary(); + + if (!reads.isEmpty()) { + // Compile a set of sequence names that exist in the BAM files. + SAMSequenceDictionary readsDictionary = reads.getHeader().getSequenceDictionary(); + + Set readsSequenceNames = new TreeSet(); + for (SAMSequenceRecord dictionaryEntry : readsDictionary.getSequences()) + readsSequenceNames.add(dictionaryEntry.getSequenceName()); + + + if (readsSequenceNames.size() == 0) { + logger.info("Reads file is unmapped. Skipping validation against reference."); + return; + } + + // compare the reads to the reference + SequenceDictionaryUtils.validateDictionaries(logger, getArguments().unsafe, "reads", readsDictionary, "reference", referenceDictionary); + } + + // compare the tracks to the reference, if they have a sequence dictionary + for (RMDTrack track : tracks) + manager.validateTrackSequenceDictionary(track.getName(),track.getSequenceDictionary(),referenceDictionary); + } + + /** + * Gets a data source for the given set of reads. + * + * @return A data source for the given set of reads. + */ + private SAMDataSource createReadsDataSource(GenomeLocParser genomeLocParser, IndexedFastaSequenceFile refReader) { + DownsamplingMethod method = getDownsamplingMethod(); + + if ( getWalkerBAQApplicationTime() == BAQ.ApplicationTime.FORBIDDEN && argCollection.BAQMode != BAQ.CalculationMode.OFF) + throw new UserException.BadArgumentValue("baq", "Walker cannot accept BAQ'd base qualities, and yet BAQ mode " + argCollection.BAQMode + " was requested."); + + return new SAMDataSource( + unpackBAMFileList(argCollection.samFiles), + genomeLocParser, + argCollection.useOriginalBaseQualities, + argCollection.strictnessLevel, + argCollection.readBufferSize, + method, + new ValidationExclusion(Arrays.asList(argCollection.unsafe)), + filters, + includeReadsWithDeletionAtLoci(), + generateExtendedEvents(), + getWalkerBAQApplicationTime() == BAQ.ApplicationTime.ON_INPUT ? argCollection.BAQMode : BAQ.CalculationMode.OFF, + getWalkerBAQQualityMode(), + refReader); + } + + /** + * Opens a reference sequence file paired with an index. + * + * @param refFile Handle to a reference sequence file. Non-null. + * @return A thread-safe file wrapper. + */ + private ReferenceDataSource openReferenceSequenceFile(File refFile) { + ReferenceDataSource ref = new ReferenceDataSource(refFile); + genomeLocParser = new GenomeLocParser(ref.getReference()); + return ref; + } + + /** + * Open the reference-ordered data sources. + * + * @param rods the reference order data to execute using + * @return A list of reference-ordered data sources. + */ + private List getReferenceOrderedDataSources(List rods) { + List dataSources = new ArrayList(); + for (RMDTrack rod : rods) + dataSources.add(new ReferenceOrderedDataSource(referenceDataSource.getReference().getSequenceDictionary(), + genomeLocParser, + argCollection.unsafe, + rod, + flashbackData())); + return dataSources; + } + + /** + * Returns the SAM File Header from the input reads' data source file + * @return the SAM File Header from the input reads' data source file + */ + public SAMFileHeader getSAMFileHeader() { + return readsDataSource.getHeader(); + } + + /** + * Returns the unmerged SAM file header for an individual reader. + * @param reader The reader. + * @return Header for that reader. + */ + public SAMFileHeader getSAMFileHeader(SAMReaderID reader) { + return readsDataSource.getHeader(reader); + } + + /** + * Returns data source object encapsulating all essential info and handlers used to traverse + * reads; header merger, individual file readers etc can be accessed through the returned data source object. + * + * @return the reads data source + */ + public SAMDataSource getReadsDataSource() { + return this.readsDataSource; + } + + + + /** + * Sets the collection of GATK main application arguments. + * + * @param argCollection the GATK argument collection + */ + public void setArguments(GATKArgumentCollection argCollection) { + this.argCollection = argCollection; + } + + /** + * Gets the collection of GATK main application arguments. + * + * @return the GATK argument collection + */ + public GATKArgumentCollection getArguments() { + return this.argCollection; + } + + /** + * Get the list of intervals passed to the engine. + * @return List of intervals. + */ + public GenomeLocSortedSet getIntervals() { + return this.intervals; + } + + /** + * Gets the list of filters employed by this engine. + * @return Collection of filters (actual instances) used by this engine. + */ + public Collection getFilters() { + return this.filters; + } + + /** + * Sets the list of filters employed by this engine. + * @param filters Collection of filters (actual instances) used by this engine. + */ + public void setFilters(Collection filters) { + this.filters = filters; + } + + /** + * Gets the filter manager for this engine. + * @return filter manager for this engine. + */ + protected FilterManager getFilterManager() { + return filterManager; + } + + /** + * Gets the input sources for this engine. + * @return input sources for this engine. + */ + protected Map getInputs() { + return inputs; + } + + /** + * Gets the output stubs for this engine. + * @return output stubs for this engine. + */ + protected Collection> getOutputs() { + return outputs; + } + + /** + * Returns data source objects encapsulating all rod data; + * individual rods can be accessed through the returned data source objects. + * + * @return the rods data sources + */ + public List getRodDataSources() { + return this.rodDataSources; + } + + /** + * Gets cumulative metrics about the entire run to this point. + * @return cumulative metrics about the entire run. + */ + public ReadMetrics getCumulativeMetrics() { + return readsDataSource == null ? null : readsDataSource.getCumulativeReadMetrics(); + } + + /** + * Unpack the bam files to be processed, given a list of files. That list of files can + * itself contain entries which are lists of other files to be read (note: you cannot have lists of lists of lists) + * + * @param inputFiles a list of files that represent either bam files themselves, or a file containing a list of bam files to process + * + * @return a flattened list of the bam files provided + */ + private List unpackBAMFileList( List inputFiles ) { + List unpackedReads = new ArrayList(); + for( File inputFile: inputFiles ) { + if (inputFile.getName().toLowerCase().endsWith(".list") ) { + try { + for(String fileName : new XReadLines(inputFile)) + unpackedReads.add(new SAMReaderID(new File(fileName),getTags(inputFile))); + } + catch( FileNotFoundException ex ) { + throw new UserException.CouldNotReadInputFile(inputFile, "Unable to find file while unpacking reads", ex); + } + } + else if(inputFile.getName().toLowerCase().endsWith(".bam")) { + unpackedReads.add( new SAMReaderID(inputFile,getTags(inputFile)) ); + } + else if(inputFile.getName().equals("-")) { + unpackedReads.add(new SAMReaderID(new File("/dev/stdin"),Collections.emptyList())); + } + else { + throw new UserException.CommandLineException(String.format("The GATK reads argument (-I) supports only BAM files with the .bam extension and lists of BAM files " + + "with the .list extension, but the file %s has neither extension. Please ensure that your BAM file or list " + + "of BAM files is in the correct format, update the extension, and try again.",inputFile.getName())); + } + } + return unpackedReads; + } + + public SampleDataSource getSampleMetadata() { + return this.sampleDataSource; + } + + /** + * Get a sample by its ID + * If an alias is passed in, return the main sample object + * @param id sample id + * @return sample Object with this ID + */ + public Sample getSampleById(String id) { + return sampleDataSource.getSampleById(id); + } + + /** + * Get the sample for a given read group + * Must first look up ID for read group + * @param readGroup of sample + * @return sample object with ID from the read group + */ + public Sample getSampleByReadGroup(SAMReadGroupRecord readGroup) { + return sampleDataSource.getSampleByReadGroup(readGroup); + } + + /** + * Get a sample for a given read + * Must first look up read group, and then sample ID for that read group + * @param read of sample + * @return sample object of this read + */ + public Sample getSampleByRead(SAMRecord read) { + return getSampleByReadGroup(read.getReadGroup()); + } + + /** + * Get number of sample objects + * @return size of samples map + */ + public int sampleCount() { + return sampleDataSource.sampleCount(); + } + + /** + * Return all samples with a given family ID + * Note that this isn't terribly efficient (linear) - it may be worth adding a new family ID data structure for this + * @param familyId family ID + * @return Samples with the given family ID + */ + public Set getFamily(String familyId) { + return sampleDataSource.getFamily(familyId); + } + + /** + * Returns all children of a given sample + * See note on the efficiency of getFamily() - since this depends on getFamily() it's also not efficient + * @param sample parent sample + * @return children of the given sample + */ + public Set getChildren(Sample sample) { + return sampleDataSource.getChildren(sample); + } + + /** + * Gets all the samples + * @return + */ + public Collection getSamples() { + return sampleDataSource.getSamples(); + } + + /** + * Takes a list of sample names and returns their corresponding sample objects + * + * @param sampleNameList List of sample names + * @return Corresponding set of samples + */ + public Set getSamples(Collection sampleNameList) { + return sampleDataSource.getSamples(sampleNameList); + } + + + /** + * Returns a set of samples that have any value (which could be null) for a given property + * @param key Property key + * @return Set of samples with the property + */ + public Set getSamplesWithProperty(String key) { + return sampleDataSource.getSamplesWithProperty(key); + } + + /** + * Returns a set of samples that have a property with a certain value + * Value must be a string for now - could add a similar method for matching any objects in the future + * + * @param key Property key + * @param value String property value + * @return Set of samples that match key and value + */ + public Set getSamplesWithProperty(String key, String value) { + return sampleDataSource.getSamplesWithProperty(key, value); + + } + + /** + * Returns a set of sample objects for the sample names in a variant context + * + * @param context Any variant context + * @return a set of the sample objects + */ + public Set getSamplesByVariantContext(VariantContext context) { + Set samples = new HashSet(); + for (String sampleName : context.getSampleNames()) { + samples.add(sampleDataSource.getOrCreateSample(sampleName)); + } + return samples; + } + + /** + * Returns all samples that were referenced in the SAM file + */ + public Set getSAMFileSamples() { + return sampleDataSource.getSAMFileSamples(); + } + + /** + * Return a subcontext restricted to samples with a given property key/value + * Gets the sample names from key/value and relies on VariantContext.subContextFromGenotypes for the filtering + * @param context VariantContext to filter + * @param key property key + * @param value property value (must be string) + * @return subcontext + */ + public VariantContext subContextFromSampleProperty(VariantContext context, String key, String value) { + return sampleDataSource.subContextFromSampleProperty(context, key, value); + } + + public Map getApproximateCommandLineArguments(Object... argumentProviders) { + return CommandLineUtils.getApproximateCommandLineArguments(parsingEngine,argumentProviders); + } + + public String createApproximateCommandLineArgumentString(Object... argumentProviders) { + return CommandLineUtils.createApproximateCommandLineArgumentString(parsingEngine,argumentProviders); + } + + } diff --git a/java/src/org/broadinstitute/sting/gatk/io/storage/StorageFactory.java b/java/src/org/broadinstitute/sting/gatk/io/storage/StorageFactory.java index 6e2aefca5..ee5c56524 100644 --- a/java/src/org/broadinstitute/sting/gatk/io/storage/StorageFactory.java +++ b/java/src/org/broadinstitute/sting/gatk/io/storage/StorageFactory.java @@ -25,8 +25,6 @@ package org.broadinstitute.sting.gatk.io.storage; -import org.broadinstitute.sting.gatk.AbstractGenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.io.stubs.Stub; import org.broadinstitute.sting.gatk.io.stubs.OutputStreamStub; import org.broadinstitute.sting.gatk.io.stubs.SAMFileWriterStub; diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilder.java b/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilder.java index 878450507..3420dc1fb 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilder.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilder.java @@ -33,13 +33,13 @@ import org.broad.tribble.index.Index; import org.broad.tribble.index.IndexFactory; import org.broad.tribble.source.BasicFeatureSource; import org.broad.tribble.util.LittleEndianOutputStream; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackCreationException; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; -import org.broadinstitute.sting.gatk.AbstractGenomeAnalysisEngine; import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.SequenceDictionaryUtils; @@ -391,14 +391,14 @@ public class RMDTrackBuilder extends PluginManager { * * @return a list of RMDTracks, one for each -B option */ - public List getReferenceMetaDataSources(AbstractGenomeAnalysisEngine engine, GATKArgumentCollection argCollection) { + public List getReferenceMetaDataSources(GenomeAnalysisEngine engine, GATKArgumentCollection argCollection) { initializeConvenienceBindings(engine,argCollection); initializeFullBindings(engine,argCollection); // try and make the tracks given their requests return createRequestedTrackObjects(); } - private void initializeConvenienceBindings(AbstractGenomeAnalysisEngine engine, GATKArgumentCollection argCollection) { + private void initializeConvenienceBindings(GenomeAnalysisEngine engine, GATKArgumentCollection argCollection) { if (argCollection.DBSNPFile != null) { if(argCollection.DBSNPFile.toLowerCase().contains("vcf")) throw new UserException("--DBSNP (-D) argument currently does not support VCF. To use dbSNP in VCF format, please use -B:dbsnp,vcf ."); @@ -411,7 +411,7 @@ public class RMDTrackBuilder extends PluginManager { * @param engine The engine, used to populate tags. * @param argCollection input arguments to the GATK. */ - private void initializeFullBindings(AbstractGenomeAnalysisEngine engine,GATKArgumentCollection argCollection) { + private void initializeFullBindings(GenomeAnalysisEngine engine,GATKArgumentCollection argCollection) { // NOTE: Method acts as a static. Once the inputs have been passed once they are locked in. if (argCollection.RODBindings.size() == 0) return;