Sliced the GAE in half, like cutting the Gordian knot, to avoid the constant merge conflicts.
The GAE half has all the walker specific code. The new "Abstract" GAE has the rest of the logic. More refactoring to come, with the end goal of having a tool that other java analysis programs (Queue, etc.) can use to read in genomic data. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4339 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
0c9fabb06f
commit
4ed9f437e9
|
|
@ -0,0 +1,808 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk;
|
||||
|
||||
import net.sf.picard.filter.SamRecordFilter;
|
||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||
import net.sf.samtools.*;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.commandline.ArgumentSource;
|
||||
import org.broadinstitute.sting.gatk.DownsamplingMethod;
|
||||
import org.broadinstitute.sting.gatk.ReadMetrics;
|
||||
import org.broadinstitute.sting.gatk.ReadProperties;
|
||||
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
||||
import org.broadinstitute.sting.gatk.datasources.sample.Sample;
|
||||
import org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceDataSource;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMReaderID;
|
||||
import org.broadinstitute.sting.gatk.filters.FilterManager;
|
||||
import org.broadinstitute.sting.gatk.filters.ReadGroupBlackListFilter;
|
||||
import org.broadinstitute.sting.gatk.io.stubs.Stub;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.RMDIntervalGenerator;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalUtils;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Parses a GATKArgumentCollection into various gatk objects and then executes() some function.
|
||||
*/
|
||||
public abstract class AbstractGenomeAnalysisEngine {
|
||||
|
||||
/**
|
||||
* Accessor for sharded read data.
|
||||
*/
|
||||
private SAMDataSource readsDataSource = null;
|
||||
|
||||
protected ReferenceDataSource getReferenceDataSource() {
|
||||
return referenceDataSource;
|
||||
}
|
||||
|
||||
/**
|
||||
* Accessor for sharded reference data.
|
||||
*/
|
||||
private ReferenceDataSource referenceDataSource = null;
|
||||
|
||||
/**
|
||||
* Accessor for sample metadata
|
||||
*/
|
||||
private SampleDataSource sampleDataSource = null;
|
||||
|
||||
/**
|
||||
* Accessor for sharded reference-ordered data.
|
||||
*/
|
||||
private List<ReferenceOrderedDataSource> rodDataSources;
|
||||
|
||||
// our argument collection
|
||||
private GATKArgumentCollection argCollection;
|
||||
|
||||
/**
|
||||
* Collection of inputs used by the engine.
|
||||
*/
|
||||
private Map<ArgumentSource, Object> inputs = new HashMap<ArgumentSource, Object>();
|
||||
|
||||
/**
|
||||
* Collection of intervals used by the engine.
|
||||
*/
|
||||
private GenomeLocSortedSet intervals = null;
|
||||
|
||||
/**
|
||||
* Collection of outputs used by the engine.
|
||||
*/
|
||||
private Collection<Stub<?>> outputs = new ArrayList<Stub<?>>();
|
||||
|
||||
/**
|
||||
* List of tags associated with the given instantiation of the command-line argument.
|
||||
*/
|
||||
private final Map<Object,List<String>> tags = new IdentityHashMap<Object,List<String>>();
|
||||
|
||||
/**
|
||||
* Collection of the filters applied to the input data.
|
||||
*/
|
||||
private Collection<SamRecordFilter> filters;
|
||||
|
||||
/**
|
||||
* our log, which we want to capture anything from this class
|
||||
*/
|
||||
private static Logger logger = Logger.getLogger(AbstractGenomeAnalysisEngine.class);
|
||||
|
||||
/**
|
||||
* Manage lists of filters.
|
||||
*/
|
||||
private final FilterManager filterManager = new FilterManager();
|
||||
|
||||
private Date startTime = null; // the start time for execution
|
||||
|
||||
/**
|
||||
* Actually run the engine.
|
||||
* @return the value of this traversal.
|
||||
*/
|
||||
public abstract Object execute();
|
||||
|
||||
protected abstract boolean flashbackData();
|
||||
protected abstract boolean includeReadsWithDeletionAtLoci();
|
||||
protected abstract boolean generateExtendedEvents();
|
||||
|
||||
/**
|
||||
* Sets the start time when the execute() function was last called
|
||||
* @param startTime the start time when the execute() function was last called
|
||||
*/
|
||||
protected void setStartTime(Date startTime) {
|
||||
this.startTime = startTime;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the start time when the execute() function was last called
|
||||
*/
|
||||
public Date getStartTime() {
|
||||
return startTime;
|
||||
}
|
||||
|
||||
/**
|
||||
* Setup the intervals to be processed
|
||||
*/
|
||||
protected void initializeIntervals() {
|
||||
|
||||
// return null if no interval arguments at all
|
||||
if ((argCollection.intervals == null) && (argCollection.excludeIntervals == null) && (argCollection.RODToInterval == null))
|
||||
return;
|
||||
|
||||
else {
|
||||
// if include argument isn't given, create new set of all possible intervals
|
||||
GenomeLocSortedSet includeSortedSet = (argCollection.intervals == null && argCollection.RODToInterval == null ?
|
||||
GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getReference().getSequenceDictionary()) :
|
||||
loadIntervals(argCollection.intervals,
|
||||
argCollection.intervalMerging,
|
||||
GenomeLocParser.mergeIntervalLocations(checkRODToIntervalArgument(),argCollection.intervalMerging)));
|
||||
|
||||
// if no exclude arguments, can return parseIntervalArguments directly
|
||||
if (argCollection.excludeIntervals == null)
|
||||
intervals = includeSortedSet;
|
||||
|
||||
// otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets
|
||||
else {
|
||||
GenomeLocSortedSet excludeSortedSet = loadIntervals(argCollection.excludeIntervals, argCollection.intervalMerging, null);
|
||||
intervals = includeSortedSet.subtractRegions(excludeSortedSet);
|
||||
|
||||
// logging messages only printed when exclude (-XL) arguments are given
|
||||
long toPruneSize = includeSortedSet.coveredSize();
|
||||
long toExcludeSize = excludeSortedSet.coveredSize();
|
||||
long intervalSize = intervals.coveredSize();
|
||||
logger.info(String.format("Initial include intervals span %d loci; exclude intervals span %d loci", toPruneSize, toExcludeSize));
|
||||
logger.info(String.format("Excluding %d loci from original intervals (%.2f%% reduction)",
|
||||
toPruneSize - intervalSize, (toPruneSize - intervalSize) / (0.01 * toPruneSize)));
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads the intervals relevant to the current execution
|
||||
* @param argList String representation of arguments; might include 'all', filenames, intervals in samtools
|
||||
* notation, or a combination of the
|
||||
* @param mergingRule Technique to use when merging interval data.
|
||||
* @param additionalIntervals a list of additional intervals to add to the returned set. Can be null.
|
||||
* @return A sorted, merged list of all intervals specified in this arg list.
|
||||
*/
|
||||
private GenomeLocSortedSet loadIntervals(List<String> argList,
|
||||
IntervalMergingRule mergingRule,
|
||||
List<GenomeLoc> additionalIntervals) {
|
||||
|
||||
return IntervalUtils.sortAndMergeIntervals(IntervalUtils.mergeListsBySetOperator(additionalIntervals,
|
||||
IntervalUtils.parseIntervalArguments(argList),
|
||||
argCollection.BTIMergeRule),
|
||||
mergingRule);
|
||||
}
|
||||
|
||||
/**
|
||||
* if we have a ROD specified as a 'rodToIntervalTrackName', convert its records to RODs
|
||||
* @return ROD intervals as GenomeLocs
|
||||
*/
|
||||
private List<GenomeLoc> checkRODToIntervalArgument() {
|
||||
Map<String, ReferenceOrderedDataSource> rodNames = RMDIntervalGenerator.getRMDTrackNames(rodDataSources);
|
||||
// Do we have any RODs that overloaded as interval lists with the 'rodToIntervalTrackName' flag?
|
||||
List<GenomeLoc> ret = new ArrayList<GenomeLoc>();
|
||||
if (rodNames != null && argCollection.RODToInterval != null) {
|
||||
String rodName = argCollection.RODToInterval;
|
||||
|
||||
// check to make sure we have a rod of that name
|
||||
if (!rodNames.containsKey(rodName))
|
||||
throw new UserException.CommandLineException("--rodToIntervalTrackName (-BTI) was passed the name '"+rodName+"', which wasn't given as a ROD name in the -B option");
|
||||
|
||||
for (String str : rodNames.keySet())
|
||||
if (str.equals(rodName)) {
|
||||
logger.info("Adding interval list from track (ROD) named " + rodName);
|
||||
RMDIntervalGenerator intervalGenerator = new RMDIntervalGenerator(rodNames.get(str).getReferenceOrderedData());
|
||||
ret.addAll(intervalGenerator.toGenomeLocList());
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add additional, externally managed IO streams for inputs.
|
||||
*
|
||||
* @param argumentSource Field into which to inject the value.
|
||||
* @param value Instance to inject.
|
||||
*/
|
||||
public void addInput(ArgumentSource argumentSource, Object value) {
|
||||
inputs.put(argumentSource, value);
|
||||
}
|
||||
|
||||
/**
|
||||
* Add additional, externally managed IO streams for output.
|
||||
*
|
||||
* @param stub Instance to inject.
|
||||
*/
|
||||
public void addOutput(Stub<?> stub) {
|
||||
outputs.add(stub);
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds an association between a object created by the
|
||||
* command-line argument system and a freeform list of tags.
|
||||
* @param key Object created by the command-line argument system.
|
||||
* @param tags List of tags to use when reading arguments.
|
||||
*/
|
||||
public void addTags(Object key, List<String> tags) {
|
||||
this.tags.put(key,tags);
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the tags associated with a given object.
|
||||
* @param key Key for which to find a tag.
|
||||
* @return List of tags associated with this key.
|
||||
*/
|
||||
public List<String> getTags(Object key) {
|
||||
if(!tags.containsKey(key))
|
||||
return Collections.emptyList();
|
||||
return tags.get(key);
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets a list of the filters to associate with the engine. Will NOT initialize the engine with this filters;
|
||||
* the caller must handle that directly.
|
||||
* @return A collection of available filters.
|
||||
*/
|
||||
protected Collection<SamRecordFilter> createFilters() {
|
||||
Set<SamRecordFilter> filters = new HashSet<SamRecordFilter>();
|
||||
if (this.getArguments().readGroupBlackList != null && this.getArguments().readGroupBlackList.size() > 0)
|
||||
filters.add(new ReadGroupBlackListFilter(this.getArguments().readGroupBlackList));
|
||||
for(String filterName: this.getArguments().readFilters)
|
||||
filters.add(this.getFilterManager().createByName(filterName));
|
||||
return Collections.unmodifiableSet(filters);
|
||||
}
|
||||
|
||||
/**
 * Initializes the engine's main data sources in a strict order: reads (BAM), reference,
 * sample metadata, then reference-ordered data (RODs), validating each against the reference.
 */
protected void initializeDataSources() {
    logger.info("Strictness is " + argCollection.strictnessLevel);

    // Reads are validated before the SAM data source is built from them.
    validateSuppliedReads();
    readsDataSource = createReadsDataSource(extractSourceInfo());

    validateSuppliedReference();
    referenceDataSource = openReferenceSequenceFile(argCollection.referenceFile);

    // Sample metadata comes from the merged SAM header plus any explicit sample files.
    sampleDataSource = new SampleDataSource(getSAMFileHeader(), argCollection.sampleFiles);

    // The legacy dbSNP argument is rewritten as a standard ROD binding so it flows
    // through the same machinery as every other track.
    if (argCollection.DBSNPFile != null)
        bindConvenienceRods(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME, "dbsnp", argCollection.DBSNPFile);

    RMDTrackBuilder manager = new RMDTrackBuilder();

    // set the sequence dictionary of all of Tribble tracks to the sequence dictionary of our reference
    manager.setSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary());

    List<RMDTrack> tracks = manager.getReferenceMetaDataSources(this,argCollection.RODBindings);
    validateSuppliedReferenceOrderedData(tracks);

    // validate all the sequence dictionaries against the reference
    validateSourcesAgainstReference(readsDataSource, referenceDataSource.getReference(), tracks);

    rodDataSources = getReferenceOrderedDataSources(tracks);
}
|
||||
|
||||
/**
|
||||
* Gets a unique identifier for the reader sourcing this read.
|
||||
* @param read Read to examine.
|
||||
* @return A unique identifier for the source file of this read. Exception if not found.
|
||||
*/
|
||||
public SAMReaderID getReaderIDForRead(final SAMRecord read) {
|
||||
return getDataSource().getReaderID(read);
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the source file for this read.
|
||||
* @param id Unique identifier determining which input file to use.
|
||||
* @return The source filename for this read.
|
||||
*/
|
||||
public File getSourceFileForReaderID(final SAMReaderID id) {
|
||||
return getDataSource().getSAMFile(id);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns sets of samples present in the (merged) input SAM stream, grouped by readers (i.e. underlying
|
||||
* individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list
|
||||
* returned by this method will contain 3 elements (one for each reader), with each element being a set of sample names
|
||||
* found in the corresponding bam file.
|
||||
*
|
||||
* @return Sets of samples in the merged input SAM stream, grouped by readers
|
||||
*/
|
||||
public List<Set<String>> getSamplesByReaders() {
|
||||
List<SAMReaderID> readers = getDataSource().getReaderIDs();
|
||||
|
||||
List<Set<String>> sample_sets = new ArrayList<Set<String>>(readers.size());
|
||||
|
||||
for (SAMReaderID r : readers) {
|
||||
|
||||
Set<String> samples = new HashSet<String>(1);
|
||||
sample_sets.add(samples);
|
||||
|
||||
for (SAMReadGroupRecord g : getDataSource().getHeader(r).getReadGroups()) {
|
||||
samples.add(g.getSample());
|
||||
}
|
||||
}
|
||||
|
||||
return sample_sets;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns sets of libraries present in the (merged) input SAM stream, grouped by readers (i.e. underlying
|
||||
* individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list
|
||||
* returned by this method will contain 3 elements (one for each reader), with each element being a set of library names
|
||||
* found in the corresponding bam file.
|
||||
*
|
||||
* @return Sets of libraries present in the (merged) input SAM stream, grouped by readers
|
||||
*/
|
||||
public List<Set<String>> getLibrariesByReaders() {
|
||||
|
||||
|
||||
List<SAMReaderID> readers = getDataSource().getReaderIDs();
|
||||
|
||||
List<Set<String>> lib_sets = new ArrayList<Set<String>>(readers.size());
|
||||
|
||||
for (SAMReaderID r : readers) {
|
||||
|
||||
Set<String> libs = new HashSet<String>(2);
|
||||
lib_sets.add(libs);
|
||||
|
||||
for (SAMReadGroupRecord g : getDataSource().getHeader(r).getReadGroups()) {
|
||||
libs.add(g.getLibrary());
|
||||
}
|
||||
}
|
||||
|
||||
return lib_sets;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a mapping from original input files to their (merged) read group ids
|
||||
*
|
||||
* @return the mapping
|
||||
*/
|
||||
public Map<File, Set<String>> getFileToReadGroupIdMapping() {
|
||||
// populate the file -> read group mapping
|
||||
Map<File, Set<String>> fileToReadGroupIdMap = new HashMap<File, Set<String>>();
|
||||
for (SAMReaderID id: getDataSource().getReaderIDs()) {
|
||||
Set<String> readGroups = new HashSet<String>(5);
|
||||
|
||||
for (SAMReadGroupRecord g : getDataSource().getHeader(id).getReadGroups()) {
|
||||
if (getDataSource().hasReadGroupCollisions()) {
|
||||
// Check if there were read group clashes.
|
||||
// If there were, use the SamFileHeaderMerger to translate from the
|
||||
// original read group id to the read group id in the merged stream
|
||||
readGroups.add(getDataSource().getReadGroupId(id,g.getReadGroupId()));
|
||||
} else {
|
||||
// otherwise, pass through the unmapped read groups since this is what Picard does as well
|
||||
readGroups.add(g.getReadGroupId());
|
||||
}
|
||||
}
|
||||
|
||||
fileToReadGroupIdMap.put(getDataSource().getSAMFile(id),readGroups);
|
||||
}
|
||||
|
||||
return fileToReadGroupIdMap;
|
||||
}
|
||||
|
||||
/**
|
||||
* **** UNLESS YOU HAVE GOOD REASON TO, DO NOT USE THIS METHOD; USE getFileToReadGroupIdMapping() INSTEAD ****
|
||||
*
|
||||
* Returns sets of (remapped) read groups in input SAM stream, grouped by readers (i.e. underlying
|
||||
* individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list
|
||||
* returned by this method will contain 3 elements (one for each reader), with each element being a set of remapped read groups
|
||||
* (i.e. as seen by read.getReadGroup().getReadGroupId() in the merged stream) that come from the corresponding bam file.
|
||||
*
|
||||
* @return sets of (merged) read group ids in order of input bams
|
||||
*/
|
||||
public List<Set<String>> getMergedReadGroupsByReaders() {
|
||||
|
||||
|
||||
List<SAMReaderID> readers = getDataSource().getReaderIDs();
|
||||
|
||||
List<Set<String>> rg_sets = new ArrayList<Set<String>>(readers.size());
|
||||
|
||||
for (SAMReaderID r : readers) {
|
||||
|
||||
Set<String> groups = new HashSet<String>(5);
|
||||
rg_sets.add(groups);
|
||||
|
||||
for (SAMReadGroupRecord g : getDataSource().getHeader(r).getReadGroups()) {
|
||||
if (getDataSource().hasReadGroupCollisions()) { // Check if there were read group clashes with hasGroupIdDuplicates and if so:
|
||||
// use HeaderMerger to translate original read group id from the reader into the read group id in the
|
||||
// merged stream, and save that remapped read group id to associate it with specific reader
|
||||
groups.add(getDataSource().getReadGroupId(r, g.getReadGroupId()));
|
||||
} else {
|
||||
// otherwise, pass through the unmapped read groups since this is what Picard does as well
|
||||
groups.add(g.getReadGroupId());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return rg_sets;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Bundles all the source information about the reads into a unified data structure.
|
||||
*
|
||||
* @return The reads object providing reads source info.
|
||||
*/
|
||||
private ReadProperties extractSourceInfo() {
|
||||
|
||||
DownsamplingMethod method = getDownsamplingMethod();
|
||||
|
||||
return new ReadProperties(unpackBAMFileList(argCollection.samFiles),
|
||||
argCollection.strictnessLevel,
|
||||
argCollection.readBufferSize,
|
||||
method,
|
||||
new ValidationExclusion(Arrays.asList(argCollection.unsafe)),
|
||||
filters,
|
||||
includeReadsWithDeletionAtLoci(),
|
||||
generateExtendedEvents());
|
||||
}
|
||||
|
||||
protected DownsamplingMethod getDownsamplingMethod() {
|
||||
DownsamplingMethod method;
|
||||
if(argCollection.getDownsamplingMethod() != null)
|
||||
method = argCollection.getDownsamplingMethod();
|
||||
else
|
||||
method = argCollection.getDefaultDownsamplingMethod();
|
||||
return method;
|
||||
}
|
||||
|
||||
/**
 * Hook allowing subclasses to verify the supplied reads before the reads data source
 * is created.  Default implementation accepts anything.
 */
protected void validateSuppliedReads() {
}

/**
 * Hook allowing subclasses to verify the supplied reference before it is opened.
 * Default implementation accepts anything.
 */
protected void validateSuppliedReference() {
}

/**
 * Hook allowing subclasses to verify the supplied reference-ordered data tracks.
 * Default implementation accepts anything.
 * @param rods the reference-ordered data tracks loaded from the command line.
 */
protected void validateSuppliedReferenceOrderedData(List<RMDTrack> rods) {
}
|
||||
|
||||
/**
|
||||
* Now that all files are open, validate the sequence dictionaries of the reads vs. the reference vrs the reference ordered data (if available).
|
||||
*
|
||||
* @param reads Reads data source.
|
||||
* @param reference Reference data source.
|
||||
* @param tracks a collection of the reference ordered data tracks
|
||||
*/
|
||||
private void validateSourcesAgainstReference(SAMDataSource reads, ReferenceSequenceFile reference, Collection<RMDTrack> tracks) {
|
||||
if ((reads.isEmpty() && (tracks == null || tracks.isEmpty())) || reference == null )
|
||||
return;
|
||||
|
||||
// Compile a set of sequence names that exist in the reference file.
|
||||
SAMSequenceDictionary referenceDictionary = reference.getSequenceDictionary();
|
||||
|
||||
if (!reads.isEmpty()) {
|
||||
// Compile a set of sequence names that exist in the BAM files.
|
||||
SAMSequenceDictionary readsDictionary = reads.getHeader().getSequenceDictionary();
|
||||
|
||||
Set<String> readsSequenceNames = new TreeSet<String>();
|
||||
for (SAMSequenceRecord dictionaryEntry : readsDictionary.getSequences())
|
||||
readsSequenceNames.add(dictionaryEntry.getSequenceName());
|
||||
|
||||
|
||||
if (readsSequenceNames.size() == 0) {
|
||||
logger.info("Reads file is unmapped. Skipping validation against reference.");
|
||||
return;
|
||||
}
|
||||
|
||||
// compare the reads to the reference
|
||||
SequenceDictionaryUtils.validateDictionaries(logger, "reads", readsDictionary, "reference", referenceDictionary);
|
||||
}
|
||||
|
||||
// compare the tracks to the reference, if they have a sequence dictionary
|
||||
for (RMDTrack track : tracks) {
|
||||
SAMSequenceDictionary trackDict = track.getSequenceDictionary();
|
||||
|
||||
// hack: if the sequence dictionary is empty (as well as null which means it doesn't have a dictionary), skip validation
|
||||
if (trackDict == null || trackDict.size() == 0) {
|
||||
logger.info("Track " + track.getName() + " doesn't have a sequence dictionary built in, skipping dictionary validation");
|
||||
continue;
|
||||
}
|
||||
|
||||
Set<String> trackSequences = new TreeSet<String>();
|
||||
for (SAMSequenceRecord dictionaryEntry : trackDict.getSequences())
|
||||
trackSequences.add(dictionaryEntry.getSequenceName());
|
||||
SequenceDictionaryUtils.validateDictionaries(logger, track.getName(), trackDict, "reference", referenceDictionary);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Convenience function that binds RODs using the old-style command line parser to the new style list for
|
||||
* a uniform processing.
|
||||
*
|
||||
* @param name the name of the rod
|
||||
* @param type its type
|
||||
* @param file the file to load the rod from
|
||||
*/
|
||||
private void bindConvenienceRods(final String name, final String type, final String file) {
|
||||
argCollection.RODBindings.add(Utils.join(",", new String[]{name, type, file}));
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets a data source for the given set of reads.
|
||||
*
|
||||
* @param reads the read source information
|
||||
* @return A data source for the given set of reads.
|
||||
*/
|
||||
private SAMDataSource createReadsDataSource(ReadProperties reads) {
|
||||
return new SAMDataSource(reads);
|
||||
}
|
||||
|
||||
/**
|
||||
* Opens a reference sequence file paired with an index.
|
||||
*
|
||||
* @param refFile Handle to a reference sequence file. Non-null.
|
||||
* @return A thread-safe file wrapper.
|
||||
*/
|
||||
private ReferenceDataSource openReferenceSequenceFile(File refFile) {
|
||||
ReferenceDataSource ref = new ReferenceDataSource(refFile);
|
||||
GenomeLocParser.setupRefContigOrdering(ref.getReference());
|
||||
return ref;
|
||||
}
|
||||
|
||||
/**
|
||||
* Open the reference-ordered data sources.
|
||||
*
|
||||
* @param rods the reference order data to execute using
|
||||
* @return A list of reference-ordered data sources.
|
||||
*/
|
||||
private List<ReferenceOrderedDataSource> getReferenceOrderedDataSources(List<RMDTrack> rods) {
|
||||
List<ReferenceOrderedDataSource> dataSources = new ArrayList<ReferenceOrderedDataSource>();
|
||||
for (RMDTrack rod : rods)
|
||||
dataSources.add(new ReferenceOrderedDataSource(rod, flashbackData()));
|
||||
return dataSources;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the SAM File Header from the input reads' data source file
|
||||
* @return the SAM File Header from the input reads' data source file
|
||||
*/
|
||||
public SAMFileHeader getSAMFileHeader() {
|
||||
return readsDataSource.getHeader();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the unmerged SAM file header for an individual reader.
|
||||
* @param reader The reader.
|
||||
* @return Header for that reader.
|
||||
*/
|
||||
public SAMFileHeader getSAMFileHeader(SAMReaderID reader) {
|
||||
return readsDataSource.getHeader(reader);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns data source object encapsulating all essential info and handlers used to traverse
|
||||
* reads; header merger, individual file readers etc can be accessed through the returned data source object.
|
||||
*
|
||||
* @return the reads data source
|
||||
*/
|
||||
public SAMDataSource getDataSource() {
|
||||
return this.readsDataSource;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the collection of GATK main application arguments.
|
||||
*
|
||||
* @param argCollection the GATK argument collection
|
||||
*/
|
||||
public void setArguments(GATKArgumentCollection argCollection) {
|
||||
this.argCollection = argCollection;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the collection of GATK main application arguments.
|
||||
*
|
||||
* @return the GATK argument collection
|
||||
*/
|
||||
public GATKArgumentCollection getArguments() {
|
||||
return this.argCollection;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the list of intervals passed to the engine.
|
||||
* @return List of intervals.
|
||||
*/
|
||||
public GenomeLocSortedSet getIntervals() {
|
||||
return this.intervals;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the list of filters employed by this engine.
|
||||
* @return Collection of filters (actual instances) used by this engine.
|
||||
*/
|
||||
public Collection<SamRecordFilter> getFilters() {
|
||||
return this.filters;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the list of filters employed by this engine.
|
||||
* @param filters Collection of filters (actual instances) used by this engine.
|
||||
*/
|
||||
public void setFilters(Collection<SamRecordFilter> filters) {
|
||||
this.filters = filters;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the filter manager for this engine.
|
||||
* @return filter manager for this engine.
|
||||
*/
|
||||
protected FilterManager getFilterManager() {
|
||||
return filterManager;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the input sources for this engine.
|
||||
* @return input sources for this engine.
|
||||
*/
|
||||
protected Map<ArgumentSource, Object> getInputs() {
|
||||
return inputs;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the output stubs for this engine.
|
||||
* @return output stubs for this engine.
|
||||
*/
|
||||
protected Collection<Stub<?>> getOutputs() {
|
||||
return outputs;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns data source objects encapsulating all rod data;
|
||||
* individual rods can be accessed through the returned data source objects.
|
||||
*
|
||||
* @return the rods data sources
|
||||
*/
|
||||
public List<ReferenceOrderedDataSource> getRodDataSources() {
|
||||
return this.rodDataSources;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets cumulative metrics about the entire run to this point.
|
||||
* @return cumulative metrics about the entire run.
|
||||
*/
|
||||
public ReadMetrics getCumulativeMetrics() {
|
||||
return readsDataSource == null ? null : readsDataSource.getCumulativeReadMetrics();
|
||||
}
|
||||
|
||||
/**
 * Unpack the bam files to be processed, given a list of files.  That list of files can
 * itself contain entries which are lists of other files to be read (note: you cannot have lists of lists of lists)
 *
 * @param inputFiles a list of files that represent either bam files themselves, or a file containing a list of bam files to process
 *
 * @return a flattened list of the bam files provided
 * @throws UserException.CouldNotReadInputFile if a .list file cannot be found
 * @throws UserException.CommandLineException if a file has neither a .bam nor a .list extension and is not "-"
 */
private List<SAMReaderID> unpackBAMFileList( List<File> inputFiles ) {
    List<SAMReaderID> unpackedReads = new ArrayList<SAMReaderID>();
    for( File inputFile: inputFiles ) {
        // A ".list" file (case-insensitive) is a text file naming one BAM per line.
        if (inputFile.getName().toLowerCase().endsWith(".list") ) {
            try {
                // NOTE(review): the XReadLines instance is never explicitly closed here —
                // confirm whether it holds an open file handle and needs cleanup.
                // Tags supplied for the list file are propagated to every BAM it names.
                for(String fileName : new XReadLines(inputFile))
                    unpackedReads.add(new SAMReaderID(new File(fileName),getTags(inputFile)));
            }
            catch( FileNotFoundException ex ) {
                throw new UserException.CouldNotReadInputFile(inputFile, "Unable to find file while unpacking reads", ex);
            }
        }
        // A ".bam" file (case-insensitive) is used directly.
        else if(inputFile.getName().toLowerCase().endsWith(".bam")) {
            unpackedReads.add( new SAMReaderID(inputFile,getTags(inputFile)) );
        }
        // "-" means read a BAM stream from standard input; no tags are attached.
        else if(inputFile.getName().equals("-")) {
            unpackedReads.add(new SAMReaderID(new File("/dev/stdin"),Collections.<String>emptyList()));
        }
        else {
            // Anything else is a user error: only .bam and .list are supported.
            throw new UserException.CommandLineException(String.format("The GATK reads argument (-I) supports only BAM files with the .bam extension and lists of BAM files " +
                    "with the .list extension, but the file %s has neither extension. Please ensure that your BAM file or list " +
                    "of BAM files is in the correct format, update the extension, and try again.",inputFile.getName()));
        }
    }
    return unpackedReads;
}
|
||||
|
||||
/**
|
||||
* Get a sample by its ID
|
||||
* If an alias is passed in, return the main sample object
|
||||
* @param id sample id
|
||||
* @return sample Object with this ID
|
||||
*/
|
||||
public Sample getSampleById(String id) {
|
||||
return sampleDataSource.getSampleById(id);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the sample for a given read group
|
||||
* Must first look up ID for read group
|
||||
* @param readGroup of sample
|
||||
* @return sample object with ID from the read group
|
||||
*/
|
||||
public Sample getSampleByReadGroup(SAMReadGroupRecord readGroup) {
|
||||
return sampleDataSource.getSampleByReadGroup(readGroup);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a sample for a given read
|
||||
* Must first look up read group, and then sample ID for that read group
|
||||
* @param read of sample
|
||||
* @return sample object of this read
|
||||
*/
|
||||
public Sample getSampleByRead(SAMRecord read) {
|
||||
return getSampleByReadGroup(read.getReadGroup());
|
||||
}
|
||||
|
||||
/**
|
||||
* Get number of sample objects
|
||||
* @return size of samples map
|
||||
*/
|
||||
public int sampleCount() {
|
||||
return sampleDataSource.sampleCount();
|
||||
}
|
||||
|
||||
/**
|
||||
* Return all samples with a given family ID
|
||||
* Note that this isn't terribly efficient (linear) - it may be worth adding a new family ID data structure for this
|
||||
* @param familyId family ID
|
||||
* @return Samples with the given family ID
|
||||
*/
|
||||
public Set<Sample> getFamily(String familyId) {
|
||||
return sampleDataSource.getFamily(familyId);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns all children of a given sample
|
||||
* See note on the efficiency of getFamily() - since this depends on getFamily() it's also not efficient
|
||||
* @param sample parent sample
|
||||
* @return children of the given sample
|
||||
*/
|
||||
public Set<Sample> getChildren(Sample sample) {
|
||||
return sampleDataSource.getChildren(sample);
|
||||
}
|
||||
|
||||
/**
|
||||
* Takes a list of sample names and returns their corresponding sample objects
|
||||
*
|
||||
* @param sampleNameList List of sample names
|
||||
* @return Corresponding set of samples
|
||||
*/
|
||||
public Set<Sample> getSamples(Collection<String> sampleNameList) {
|
||||
return sampleDataSource.getSamples(sampleNameList);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -74,7 +74,11 @@ public abstract class CommandLineExecutable extends CommandLineProgram {
|
|||
Walker<?,?> walker = engine.getWalkerByName(getAnalysisName());
|
||||
|
||||
try {
|
||||
Collection<SamRecordFilter> filters = engine.createFiltersForWalker(getArgumentCollection(),walker);
|
||||
engine.setArguments(getArgumentCollection());
|
||||
engine.setWalker(walker);
|
||||
|
||||
Collection<SamRecordFilter> filters = engine.createFilters();
|
||||
engine.setFilters(filters);
|
||||
|
||||
// load the arguments into the walker / filters.
|
||||
// TODO: The fact that this extra load call exists here when all the parsing happens at the engine
|
||||
|
|
@ -88,8 +92,7 @@ public abstract class CommandLineExecutable extends CommandLineProgram {
|
|||
argumentSources.add(filter);
|
||||
}
|
||||
|
||||
// set the analysis name in the argument collection
|
||||
engine.execute(getArgumentCollection(), walker, filters);
|
||||
engine.execute();
|
||||
generateGATKRunReport(walker);
|
||||
} catch ( Exception e ) {
|
||||
generateGATKRunReport(walker, e);
|
||||
|
|
@ -120,7 +123,7 @@ public abstract class CommandLineExecutable extends CommandLineProgram {
|
|||
/**
|
||||
* Convenience method for fully parameterized generateGATKRunReport when an exception has
|
||||
* not occurred
|
||||
*
|
||||
*
|
||||
* @param walker
|
||||
*/
|
||||
private void generateGATKRunReport(Walker<?,?> walker) {
|
||||
|
|
@ -160,9 +163,11 @@ public abstract class CommandLineExecutable extends CommandLineProgram {
|
|||
Collection<Class> argumentSources = new ArrayList<Class>();
|
||||
|
||||
Walker walker = engine.getWalkerByName(getAnalysisName());
|
||||
engine.setArguments(getArgumentCollection());
|
||||
engine.setWalker(walker);
|
||||
argumentSources.add(walker.getClass());
|
||||
|
||||
Collection<SamRecordFilter> filters = engine.createFiltersForWalker(getArgumentCollection(),walker);
|
||||
Collection<SamRecordFilter> filters = engine.createFilters();
|
||||
for(SamRecordFilter filter: filters)
|
||||
argumentSources.add(filter.getClass());
|
||||
|
||||
|
|
|
|||
|
|
@ -27,113 +27,47 @@ package org.broadinstitute.sting.gatk;
|
|||
|
||||
import net.sf.picard.filter.SamRecordFilter;
|
||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||
import net.sf.samtools.*;
|
||||
import org.apache.log4j.Logger;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import org.broadinstitute.sting.commandline.ArgumentException;
|
||||
import org.broadinstitute.sting.commandline.ArgumentSource;
|
||||
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
|
||||
|
||||
import org.broadinstitute.sting.gatk.datasources.sample.Sample;
|
||||
import org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource;
|
||||
import org.broadinstitute.sting.gatk.datasources.sample.SampleFileParser;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalUtils;
|
||||
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
||||
import org.broadinstitute.sting.gatk.datasources.shards.MonolithicShardStrategy;
|
||||
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
|
||||
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategy;
|
||||
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategyFactory;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.*;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceDataSource;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource;
|
||||
import org.broadinstitute.sting.gatk.executive.MicroScheduler;
|
||||
import org.broadinstitute.sting.gatk.filters.FilterManager;
|
||||
import org.broadinstitute.sting.gatk.filters.ReadGroupBlackListFilter;
|
||||
import org.broadinstitute.sting.gatk.io.OutputTracker;
|
||||
import org.broadinstitute.sting.gatk.io.stubs.*;
|
||||
import org.broadinstitute.sting.gatk.io.stubs.Stub;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.RMDIntervalGenerator;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
import org.broadinstitute.sting.commandline.ArgumentException;
|
||||
import org.broadinstitute.sting.commandline.ArgumentSource;
|
||||
import org.broadinstitute.sting.commandline.ArgumentTypeDescriptor;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.*;
|
||||
|
||||
public class GenomeAnalysisEngine {
|
||||
/**
|
||||
* A GenomeAnalysisEngine that runs a specified walker.
|
||||
*/
|
||||
public class GenomeAnalysisEngine extends AbstractGenomeAnalysisEngine {
|
||||
|
||||
// our instance of this genome analysis toolkit; it's used by other classes to extract the traversal engine
|
||||
// TODO: public static without final tends to indicate we're thinking about this the wrong way
|
||||
public static GenomeAnalysisEngine instance;
|
||||
|
||||
/**
|
||||
* Accessor for sharded read data.
|
||||
*/
|
||||
private SAMDataSource readsDataSource = null;
|
||||
|
||||
/**
|
||||
* Accessor for sharded reference data.
|
||||
*/
|
||||
private ReferenceDataSource referenceDataSource = null;
|
||||
|
||||
/**
|
||||
* Accessor for sample metadata
|
||||
*/
|
||||
private SampleDataSource sampleDataSource = null;
|
||||
|
||||
/**
|
||||
* Accessor for sharded reference-ordered data.
|
||||
*/
|
||||
private List<ReferenceOrderedDataSource> rodDataSources;
|
||||
|
||||
// our argument collection
|
||||
private GATKArgumentCollection argCollection;
|
||||
|
||||
/**
|
||||
* Collection of inputs used by the walker.
|
||||
*/
|
||||
private Map<ArgumentSource, Object> inputs = new HashMap<ArgumentSource, Object>();
|
||||
|
||||
/**
|
||||
* Collection of intervals used by the walker.
|
||||
*/
|
||||
private GenomeLocSortedSet intervals = null;
|
||||
|
||||
/**
|
||||
* Collection of outputs used by the walker.
|
||||
*/
|
||||
private Collection<Stub<?>> outputs = new ArrayList<Stub<?>>();
|
||||
|
||||
/**
|
||||
* List of tags associated with the given instantiation of the command-line argument.
|
||||
*/
|
||||
private final Map<Object,List<String>> tags = new IdentityHashMap<Object,List<String>>();
|
||||
|
||||
/**
|
||||
* Collection of the filters applied to the walker's input data.
|
||||
*/
|
||||
private Collection<SamRecordFilter> filters;
|
||||
|
||||
/**
|
||||
* our log, which we want to capture anything from this class
|
||||
*/
|
||||
private static Logger logger = Logger.getLogger(GenomeAnalysisEngine.class);
|
||||
|
||||
/**
|
||||
* our walker manager
|
||||
*/
|
||||
private final WalkerManager walkerManager;
|
||||
private final WalkerManager walkerManager = new WalkerManager();;
|
||||
|
||||
/**
|
||||
* Manage lists of filters.
|
||||
*/
|
||||
private final FilterManager filterManager;
|
||||
|
||||
private Date startTime = null; // the start time for execution
|
||||
private Walker<?, ?> walker;
|
||||
|
||||
/**
|
||||
* our constructor, where all the work is done
|
||||
|
|
@ -143,57 +77,47 @@ public class GenomeAnalysisEngine {
|
|||
*/
|
||||
public GenomeAnalysisEngine() {
|
||||
// make sure our instance variable points to this analysis engine
|
||||
// if ( instance != null )
|
||||
// throw new StingException("Instantiating GenomeAnalysisEngine but global instance variable isn't null, indicating that an instance has already been created: " + instance);
|
||||
|
||||
instance = this;
|
||||
walkerManager = new WalkerManager();
|
||||
filterManager = new FilterManager();
|
||||
}
|
||||
|
||||
public void setWalker(Walker<?, ?> walker) {
|
||||
this.walker = walker;
|
||||
}
|
||||
|
||||
/**
|
||||
* Actually run the GATK with the specified walker.
|
||||
*
|
||||
* @param args the argument collection, where we get all our setup information from
|
||||
* @param my_walker Walker to run over the dataset. Must not be null.
|
||||
* @return the value of this traversal.
|
||||
*/
|
||||
public Object execute(GATKArgumentCollection args, Walker<?, ?> my_walker, Collection<SamRecordFilter> filters) {
|
||||
public Object execute() {
|
||||
//HeapSizeMonitor monitor = new HeapSizeMonitor();
|
||||
//monitor.start();
|
||||
startTime = new java.util.Date();
|
||||
setStartTime(new java.util.Date());
|
||||
|
||||
// validate our parameters
|
||||
if (args == null) {
|
||||
if (this.getArguments() == null) {
|
||||
throw new ReviewedStingException("The GATKArgumentCollection passed to GenomeAnalysisEngine can not be null.");
|
||||
}
|
||||
|
||||
// validate our parameters
|
||||
if (my_walker == null)
|
||||
if (this.walker == null)
|
||||
throw new ReviewedStingException("The walker passed to GenomeAnalysisEngine can not be null.");
|
||||
|
||||
// save our argument parameter
|
||||
this.argCollection = args;
|
||||
this.filters = filters;
|
||||
|
||||
// Prepare the data for traversal.
|
||||
initializeDataSources(my_walker, filters, argCollection);
|
||||
initializeDataSources();
|
||||
|
||||
// our microscheduler, which is in charge of running everything
|
||||
MicroScheduler microScheduler = createMicroscheduler(my_walker);
|
||||
MicroScheduler microScheduler = createMicroscheduler();
|
||||
|
||||
// create the output streams "
|
||||
initializeOutputStreams(my_walker, microScheduler.getOutputTracker());
|
||||
initializeOutputStreams(microScheduler.getOutputTracker());
|
||||
|
||||
initializeIntervals();
|
||||
|
||||
ShardStrategy shardStrategy = getShardStrategy(my_walker,
|
||||
microScheduler.getReference(),
|
||||
intervals,
|
||||
readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null);
|
||||
ShardStrategy shardStrategy = getShardStrategy(microScheduler.getReference());
|
||||
|
||||
// execute the microscheduler, storing the results
|
||||
Object result = microScheduler.execute(my_walker, shardStrategy);
|
||||
Object result = microScheduler.execute(this.walker, shardStrategy);
|
||||
|
||||
//monitor.stop();
|
||||
//logger.info(String.format("Maximum heap size consumed: %d",monitor.getMaxMemoryUsed()));
|
||||
|
|
@ -201,134 +125,6 @@ public class GenomeAnalysisEngine {
|
|||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the start time when the execute() function was last called
|
||||
*/
|
||||
public Date getStartTime() {
|
||||
return startTime;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Setup the intervals to be processed
|
||||
*/
|
||||
private void initializeIntervals() {
|
||||
|
||||
// return null if no interval arguments at all
|
||||
if ((argCollection.intervals == null) && (argCollection.excludeIntervals == null) && (argCollection.RODToInterval == null))
|
||||
return;
|
||||
|
||||
else {
|
||||
// if include argument isn't given, create new set of all possible intervals
|
||||
GenomeLocSortedSet includeSortedSet = (argCollection.intervals == null && argCollection.RODToInterval == null ?
|
||||
GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getReference().getSequenceDictionary()) :
|
||||
loadIntervals(argCollection.intervals,
|
||||
argCollection.intervalMerging,
|
||||
GenomeLocParser.mergeIntervalLocations(checkRODToIntervalArgument(),argCollection.intervalMerging)));
|
||||
|
||||
// if no exclude arguments, can return parseIntervalArguments directly
|
||||
if (argCollection.excludeIntervals == null)
|
||||
intervals = includeSortedSet;
|
||||
|
||||
// otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets
|
||||
else {
|
||||
GenomeLocSortedSet excludeSortedSet = loadIntervals(argCollection.excludeIntervals, argCollection.intervalMerging, null);
|
||||
intervals = includeSortedSet.subtractRegions(excludeSortedSet);
|
||||
|
||||
// logging messages only printed when exclude (-XL) arguments are given
|
||||
long toPruneSize = includeSortedSet.coveredSize();
|
||||
long toExcludeSize = excludeSortedSet.coveredSize();
|
||||
long intervalSize = intervals.coveredSize();
|
||||
logger.info(String.format("Initial include intervals span %d loci; exclude intervals span %d loci", toPruneSize, toExcludeSize));
|
||||
logger.info(String.format("Excluding %d loci from original intervals (%.2f%% reduction)",
|
||||
toPruneSize - intervalSize, (toPruneSize - intervalSize) / (0.01 * toPruneSize)));
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads the intervals relevant to the current execution
|
||||
* @param argList String representation of arguments; might include 'all', filenames, intervals in samtools
|
||||
* notation, or a combination of the
|
||||
* @param mergingRule Technique to use when merging interval data.
|
||||
* @param additionalIntervals a list of additional intervals to add to the returned set. Can be null.
|
||||
* @return A sorted, merged list of all intervals specified in this arg list.
|
||||
*/
|
||||
private GenomeLocSortedSet loadIntervals(List<String> argList,
|
||||
IntervalMergingRule mergingRule,
|
||||
List<GenomeLoc> additionalIntervals) {
|
||||
|
||||
return IntervalUtils.sortAndMergeIntervals(IntervalUtils.mergeListsBySetOperator(additionalIntervals,
|
||||
IntervalUtils.parseIntervalArguments(argList),
|
||||
argCollection.BTIMergeRule),
|
||||
mergingRule);
|
||||
}
|
||||
|
||||
/**
|
||||
* if we have a ROD specified as a 'rodToIntervalTrackName', convert its records to RODs
|
||||
*/
|
||||
private static List<GenomeLoc> checkRODToIntervalArgument() {
|
||||
Map<String, ReferenceOrderedDataSource> rodNames = RMDIntervalGenerator.getRMDTrackNames(instance.rodDataSources);
|
||||
// Do we have any RODs that overloaded as interval lists with the 'rodToIntervalTrackName' flag?
|
||||
List<GenomeLoc> ret = new ArrayList<GenomeLoc>();
|
||||
if (rodNames != null && instance.argCollection.RODToInterval != null) {
|
||||
String rodName = GenomeAnalysisEngine.instance.argCollection.RODToInterval;
|
||||
|
||||
// check to make sure we have a rod of that name
|
||||
if (!rodNames.containsKey(rodName))
|
||||
throw new UserException.CommandLineException("--rodToIntervalTrackName (-BTI) was passed the name '"+rodName+"', which wasn't given as a ROD name in the -B option");
|
||||
|
||||
for (String str : rodNames.keySet())
|
||||
if (str.equals(rodName)) {
|
||||
logger.info("Adding interval list from track (ROD) named " + rodName);
|
||||
RMDIntervalGenerator intervalGenerator = new RMDIntervalGenerator(rodNames.get(str).getReferenceOrderedData());
|
||||
ret.addAll(intervalGenerator.toGenomeLocList());
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add additional, externally managed IO streams for walker input.
|
||||
*
|
||||
* @param argumentSource Field in the walker into which to inject the value.
|
||||
* @param value Instance to inject.
|
||||
*/
|
||||
public void addInput(ArgumentSource argumentSource, Object value) {
|
||||
inputs.put(argumentSource, value);
|
||||
}
|
||||
|
||||
/**
|
||||
* Add additional, externally managed IO streams for walker output.
|
||||
*
|
||||
* @param stub Instance to inject.
|
||||
*/
|
||||
public void addOutput(Stub<?> stub) {
|
||||
outputs.add(stub);
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds an association between a object created by the
|
||||
* command-line argument system and a freeform list of tags.
|
||||
* @param key Object created by the command-line argument system.
|
||||
* @param tags List of tags to use when reading arguments.
|
||||
*/
|
||||
public void addTags(Object key, List<String> tags) {
|
||||
this.tags.put(key,tags);
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the tags associated with a given object.
|
||||
* @param key Key for which to find a tag.
|
||||
* @return List of tags associated with this key.
|
||||
*/
|
||||
public List<String> getTags(Object key) {
|
||||
if(!tags.containsKey(key))
|
||||
return Collections.emptyList();
|
||||
return tags.get(key);
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves an instance of the walker based on the walker name.
|
||||
*
|
||||
|
|
@ -351,17 +147,13 @@ public class GenomeAnalysisEngine {
|
|||
/**
|
||||
* Gets a list of the filters to associate with the given walker. Will NOT initialize the engine with this filters;
|
||||
* the caller must handle that directly.
|
||||
* @param args Existing argument collection, for compatibility with legacy command-line walkers.
|
||||
* @param walker Walker to use when determining which filters to apply.
|
||||
* @return A collection of available filters.
|
||||
*/
|
||||
protected Collection<SamRecordFilter> createFiltersForWalker(GATKArgumentCollection args, Walker walker) {
|
||||
@Override
|
||||
protected Collection<SamRecordFilter> createFilters() {
|
||||
Set<SamRecordFilter> filters = new HashSet<SamRecordFilter>();
|
||||
filters.addAll(WalkerManager.getReadFilters(walker,filterManager));
|
||||
if (args.readGroupBlackList != null && args.readGroupBlackList.size() > 0)
|
||||
filters.add(new ReadGroupBlackListFilter(args.readGroupBlackList));
|
||||
for(String filterName: args.readFilters)
|
||||
filters.add(filterManager.createByName(filterName));
|
||||
filters.addAll(WalkerManager.getReadFilters(walker,this.getFilterManager()));
|
||||
filters.addAll(super.createFilters());
|
||||
return Collections.unmodifiableSet(filters);
|
||||
}
|
||||
|
||||
|
|
@ -372,231 +164,56 @@ public class GenomeAnalysisEngine {
|
|||
protected WalkerManager getWalkerManager() {
|
||||
return walkerManager;
|
||||
}
|
||||
|
||||
private void initializeDataSources(Walker my_walker, Collection<SamRecordFilter> filters, GATKArgumentCollection argCollection) {
|
||||
validateSuppliedReadsAgainstWalker(my_walker, argCollection);
|
||||
logger.info("Strictness is " + argCollection.strictnessLevel);
|
||||
readsDataSource = createReadsDataSource(extractSourceInfo(my_walker, filters, argCollection));
|
||||
|
||||
validateSuppliedReferenceAgainstWalker(my_walker, argCollection);
|
||||
referenceDataSource = openReferenceSequenceFile(argCollection.referenceFile);
|
||||
|
||||
initializeSampleDataSource();
|
||||
|
||||
if (argCollection.DBSNPFile != null) bindConvenienceRods(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME, "dbsnp", argCollection.DBSNPFile);
|
||||
|
||||
RMDTrackBuilder manager = new RMDTrackBuilder();
|
||||
|
||||
// set the sequence dictionary of all of Tribble tracks to the sequence dictionary of our reference
|
||||
manager.setSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary());
|
||||
|
||||
List<RMDTrack> tracks = manager.getReferenceMetaDataSources(this,argCollection.RODBindings);
|
||||
validateSuppliedReferenceOrderedDataAgainstWalker(my_walker, tracks);
|
||||
|
||||
// validate all the sequence dictionaries against the reference
|
||||
validateSourcesAgainstReference(readsDataSource, referenceDataSource.getReference(), tracks);
|
||||
|
||||
rodDataSources = getReferenceOrderedDataSources(my_walker, tracks);
|
||||
}
|
||||
|
||||
private void initializeSampleDataSource() {
|
||||
this.sampleDataSource = new SampleDataSource(getSAMFileHeader(), argCollection.sampleFiles);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* setup a microscheduler
|
||||
*
|
||||
* @param my_walker our walker of type LocusWalker
|
||||
* @return a new microscheduler
|
||||
*/
|
||||
private MicroScheduler createMicroscheduler(Walker my_walker) {
|
||||
private MicroScheduler createMicroscheduler() {
|
||||
Walker my_walker = this.walker;
|
||||
|
||||
// the mircoscheduler to return
|
||||
MicroScheduler microScheduler = null;
|
||||
|
||||
// Temporarily require all walkers to have a reference, even if that reference is not conceptually necessary.
|
||||
if ((my_walker instanceof ReadWalker || my_walker instanceof DuplicateWalker || my_walker instanceof ReadPairWalker) &&
|
||||
argCollection.referenceFile == null) {
|
||||
this.getArguments().referenceFile == null) {
|
||||
throw new UserException.CommandLineException("Read-based traversals require a reference file but none was given");
|
||||
}
|
||||
|
||||
return MicroScheduler.create(this,my_walker,readsDataSource,referenceDataSource.getReference(),rodDataSources,argCollection.numberOfThreads);
|
||||
return MicroScheduler.create(this,my_walker,this.getDataSource(),this.getReferenceDataSource().getReference(),this.getRodDataSources(),this.getArguments().numberOfThreads);
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets a unique identifier for the reader sourcing this read.
|
||||
* @param read Read to examine.
|
||||
* @return A unique identifier for the source file of this read. Exception if not found.
|
||||
*/
|
||||
public SAMReaderID getReaderIDForRead(final SAMRecord read) {
|
||||
return getDataSource().getReaderID(read);
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the source file for this read.
|
||||
* @param id Unique identifier determining which input file to use.
|
||||
* @return The source filename for this read.
|
||||
*/
|
||||
public File getSourceFileForReaderID(final SAMReaderID id) {
|
||||
return getDataSource().getSAMFile(id);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns sets of samples present in the (merged) input SAM stream, grouped by readers (i.e. underlying
|
||||
* individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list
|
||||
* returned by this method will contain 3 elements (one for each reader), with each element being a set of sample names
|
||||
* found in the corresponding bam file.
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public List<Set<String>> getSamplesByReaders() {
|
||||
List<SAMReaderID> readers = getDataSource().getReaderIDs();
|
||||
|
||||
List<Set<String>> sample_sets = new ArrayList<Set<String>>(readers.size());
|
||||
|
||||
for (SAMReaderID r : readers) {
|
||||
|
||||
Set<String> samples = new HashSet<String>(1);
|
||||
sample_sets.add(samples);
|
||||
|
||||
for (SAMReadGroupRecord g : getDataSource().getHeader(r).getReadGroups()) {
|
||||
samples.add(g.getSample());
|
||||
}
|
||||
}
|
||||
|
||||
return sample_sets;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns sets of libraries present in the (merged) input SAM stream, grouped by readers (i.e. underlying
|
||||
* individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list
|
||||
* returned by this method will contain 3 elements (one for each reader), with each element being a set of library names
|
||||
* found in the corresponding bam file.
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public List<Set<String>> getLibrariesByReaders() {
|
||||
|
||||
|
||||
List<SAMReaderID> readers = getDataSource().getReaderIDs();
|
||||
|
||||
List<Set<String>> lib_sets = new ArrayList<Set<String>>(readers.size());
|
||||
|
||||
for (SAMReaderID r : readers) {
|
||||
|
||||
Set<String> libs = new HashSet<String>(2);
|
||||
lib_sets.add(libs);
|
||||
|
||||
for (SAMReadGroupRecord g : getDataSource().getHeader(r).getReadGroups()) {
|
||||
libs.add(g.getLibrary());
|
||||
}
|
||||
}
|
||||
|
||||
return lib_sets;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a mapping from original input files to their (merged) read group ids
|
||||
*
|
||||
* @return the mapping
|
||||
*/
|
||||
public Map<File, Set<String>> getFileToReadGroupIdMapping() {
|
||||
// populate the file -> read group mapping
|
||||
Map<File, Set<String>> fileToReadGroupIdMap = new HashMap<File, Set<String>>();
|
||||
for (SAMReaderID id: getDataSource().getReaderIDs()) {
|
||||
Set<String> readGroups = new HashSet<String>(5);
|
||||
|
||||
for (SAMReadGroupRecord g : getDataSource().getHeader(id).getReadGroups()) {
|
||||
if (getDataSource().hasReadGroupCollisions()) {
|
||||
// Check if there were read group clashes.
|
||||
// If there were, use the SamFileHeaderMerger to translate from the
|
||||
// original read group id to the read group id in the merged stream
|
||||
readGroups.add(getDataSource().getReadGroupId(id,g.getReadGroupId()));
|
||||
} else {
|
||||
// otherwise, pass through the unmapped read groups since this is what Picard does as well
|
||||
readGroups.add(g.getReadGroupId());
|
||||
}
|
||||
}
|
||||
|
||||
fileToReadGroupIdMap.put(getDataSource().getSAMFile(id),readGroups);
|
||||
}
|
||||
|
||||
return fileToReadGroupIdMap;
|
||||
}
|
||||
|
||||
/**
|
||||
* **** UNLESS YOU HAVE GOOD REASON TO, DO NOT USE THIS METHOD; USE getFileToReadGroupIdMapping() INSTEAD ****
|
||||
*
|
||||
* Returns sets of (remapped) read groups in input SAM stream, grouped by readers (i.e. underlying
|
||||
* individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list
|
||||
* returned by this method will contain 3 elements (one for each reader), with each element being a set of remapped read groups
|
||||
* (i.e. as seen by read.getReadGroup().getReadGroupId() in the merged stream) that come from the corresponding bam file.
|
||||
*
|
||||
* @return sets of (merged) read group ids in order of input bams
|
||||
*/
|
||||
public List<Set<String>> getMergedReadGroupsByReaders() {
|
||||
|
||||
|
||||
List<SAMReaderID> readers = getDataSource().getReaderIDs();
|
||||
|
||||
List<Set<String>> rg_sets = new ArrayList<Set<String>>(readers.size());
|
||||
|
||||
for (SAMReaderID r : readers) {
|
||||
|
||||
Set<String> groups = new HashSet<String>(5);
|
||||
rg_sets.add(groups);
|
||||
|
||||
for (SAMReadGroupRecord g : getDataSource().getHeader(r).getReadGroups()) {
|
||||
if (getDataSource().hasReadGroupCollisions()) { // Check if there were read group clashes with hasGroupIdDuplicates and if so:
|
||||
// use HeaderMerger to translate original read group id from the reader into the read group id in the
|
||||
// merged stream, and save that remapped read group id to associate it with specific reader
|
||||
groups.add(getDataSource().getReadGroupId(r, g.getReadGroupId()));
|
||||
} else {
|
||||
// otherwise, pass through the unmapped read groups since this is what Picard does as well
|
||||
groups.add(g.getReadGroupId());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return rg_sets;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Bundles all the source information about the reads into a unified data structure.
|
||||
*
|
||||
* @param walker The walker for which to extract info.
|
||||
* @param argCollection The collection of arguments passed to the engine.
|
||||
* @return The reads object providing reads source info.
|
||||
*/
|
||||
private ReadProperties extractSourceInfo(Walker walker, Collection<SamRecordFilter> filters, GATKArgumentCollection argCollection) {
|
||||
|
||||
DownsamplingMethod method = null;
|
||||
@Override
|
||||
protected DownsamplingMethod getDownsamplingMethod() {
|
||||
GATKArgumentCollection argCollection = this.getArguments();
|
||||
DownsamplingMethod method;
|
||||
if(argCollection.getDownsamplingMethod() != null)
|
||||
method = argCollection.getDownsamplingMethod();
|
||||
else if(WalkerManager.getDownsamplingMethod(walker) != null)
|
||||
method = WalkerManager.getDownsamplingMethod(walker);
|
||||
else
|
||||
method = argCollection.getDefaultDownsamplingMethod();
|
||||
return method;
|
||||
}
|
||||
|
||||
return new ReadProperties(unpackBAMFileList(argCollection.samFiles),
|
||||
argCollection.strictnessLevel,
|
||||
argCollection.readBufferSize,
|
||||
method,
|
||||
new ValidationExclusion(Arrays.asList(argCollection.unsafe)),
|
||||
filters,
|
||||
walker.includeReadsWithDeletionAtLoci(),
|
||||
walker.generateExtendedEvents());
|
||||
@Override
|
||||
protected boolean generateExtendedEvents() {
|
||||
return walker.generateExtendedEvents();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean includeReadsWithDeletionAtLoci() {
|
||||
return walker.includeReadsWithDeletionAtLoci();
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that the supplied set of reads files mesh with what the walker says it requires.
|
||||
*
|
||||
* @param walker Walker to test.
|
||||
* @param arguments Supplied reads files.
|
||||
*/
|
||||
private void validateSuppliedReadsAgainstWalker(Walker walker, GATKArgumentCollection arguments) {
|
||||
@Override
|
||||
protected void validateSuppliedReads() {
|
||||
GATKArgumentCollection arguments = this.getArguments();
|
||||
// Check what the walker says is required against what was provided on the command line.
|
||||
if (WalkerManager.isRequired(walker, DataSource.READS) && (arguments.samFiles == null || arguments.samFiles.size() == 0))
|
||||
throw new ArgumentException("Walker requires reads but none were provided.");
|
||||
|
|
@ -608,11 +225,10 @@ public class GenomeAnalysisEngine {
|
|||
|
||||
/**
|
||||
* Verifies that the supplied reference file mesh with what the walker says it requires.
|
||||
* 611
|
||||
* @param walker Walker to test.
|
||||
* @param arguments Supplied reads files.
|
||||
*/
|
||||
private void validateSuppliedReferenceAgainstWalker(Walker walker, GATKArgumentCollection arguments) {
|
||||
@Override
|
||||
protected void validateSuppliedReference() {
|
||||
GATKArgumentCollection arguments = this.getArguments();
|
||||
// Check what the walker says is required against what was provided on the command line.
|
||||
// TODO: Temporarily disabling WalkerManager.isRequired check on the reference because the reference is always required.
|
||||
if (/*WalkerManager.isRequired(walker, DataSource.REFERENCE) &&*/ arguments.referenceFile == null)
|
||||
|
|
@ -627,10 +243,10 @@ public class GenomeAnalysisEngine {
|
|||
* Verifies that all required reference-ordered data has been supplied, and any reference-ordered data that was not
|
||||
* 'allowed' is still present.
|
||||
*
|
||||
* @param walker Walker to test.
|
||||
* @param rods Reference-ordered data to load.
|
||||
*/
|
||||
private void validateSuppliedReferenceOrderedDataAgainstWalker(Walker walker, List<RMDTrack> rods) {
|
||||
@Override
|
||||
protected void validateSuppliedReferenceOrderedData(List<RMDTrack> rods) {
|
||||
// Check to make sure that all required metadata is present.
|
||||
List<RMD> allRequired = WalkerManager.getRequiredMetaData(walker);
|
||||
for (RMD required : allRequired) {
|
||||
|
|
@ -650,81 +266,17 @@ public class GenomeAnalysisEngine {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Now that all files are open, validate the sequence dictionaries of the reads vs. the reference vrs the reference ordered data (if available).
|
||||
*
|
||||
* @param reads Reads data source.
|
||||
* @param reference Reference data source.
|
||||
* @param tracks a collection of the reference ordered data tracks
|
||||
*/
|
||||
private void validateSourcesAgainstReference(SAMDataSource reads, ReferenceSequenceFile reference, Collection<RMDTrack> tracks) {
|
||||
if ((reads.isEmpty() && (tracks == null || tracks.isEmpty())) || reference == null )
|
||||
return;
|
||||
|
||||
// Compile a set of sequence names that exist in the reference file.
|
||||
SAMSequenceDictionary referenceDictionary = reference.getSequenceDictionary();
|
||||
|
||||
if (!reads.isEmpty()) {
|
||||
// Compile a set of sequence names that exist in the BAM files.
|
||||
SAMSequenceDictionary readsDictionary = reads.getHeader().getSequenceDictionary();
|
||||
|
||||
Set<String> readsSequenceNames = new TreeSet<String>();
|
||||
for (SAMSequenceRecord dictionaryEntry : readsDictionary.getSequences())
|
||||
readsSequenceNames.add(dictionaryEntry.getSequenceName());
|
||||
|
||||
|
||||
if (readsSequenceNames.size() == 0) {
|
||||
logger.info("Reads file is unmapped. Skipping validation against reference.");
|
||||
return;
|
||||
}
|
||||
|
||||
// compare the reads to the reference
|
||||
SequenceDictionaryUtils.validateDictionaries(logger, "reads", readsDictionary, "reference", referenceDictionary);
|
||||
}
|
||||
|
||||
// compare the tracks to the reference, if they have a sequence dictionary
|
||||
for (RMDTrack track : tracks) {
|
||||
SAMSequenceDictionary trackDict = track.getSequenceDictionary();
|
||||
|
||||
// hack: if the sequence dictionary is empty (as well as null which means it doesn't have a dictionary), skip validation
|
||||
if (trackDict == null || trackDict.size() == 0) {
|
||||
logger.info("Track " + track.getName() + " doesn't have a sequence dictionary built in, skipping dictionary validation");
|
||||
continue;
|
||||
}
|
||||
|
||||
Set<String> trackSequences = new TreeSet<String>();
|
||||
for (SAMSequenceRecord dictionaryEntry : trackDict.getSequences())
|
||||
trackSequences.add(dictionaryEntry.getSequenceName());
|
||||
SequenceDictionaryUtils.validateDictionaries(logger, track.getName(), trackDict, "reference", referenceDictionary);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Convenience function that binds RODs using the old-style command line parser to the new style list for
|
||||
* a uniform processing.
|
||||
*
|
||||
* @param name the name of the rod
|
||||
* @param type its type
|
||||
* @param file the file to load the rod from
|
||||
*/
|
||||
private void bindConvenienceRods(final String name, final String type, final String file) {
|
||||
argCollection.RODBindings.add(Utils.join(",", new String[]{name, type, file}));
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the sharding strategy given a driving data source.
|
||||
*
|
||||
* @param walker Walker for which to infer sharding strategy.
|
||||
* @param drivingDataSource Data on which to shard.
|
||||
* @param intervals Intervals to use when limiting sharding.
|
||||
* @return Sharding strategy for this driving data source.
|
||||
* @return the sharding strategy
|
||||
*/
|
||||
protected ShardStrategy getShardStrategy(Walker walker,
|
||||
ReferenceSequenceFile drivingDataSource,
|
||||
GenomeLocSortedSet intervals,
|
||||
ValidationExclusion exclusions) {
|
||||
protected ShardStrategy getShardStrategy(ReferenceSequenceFile drivingDataSource) {
|
||||
GenomeLocSortedSet intervals = this.getIntervals();
|
||||
SAMDataSource readsDataSource = this.getDataSource();
|
||||
ValidationExclusion exclusions = (readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null);
|
||||
ReferenceDataSource referenceDataSource = this.getReferenceDataSource();
|
||||
// Use monolithic sharding if no index is present. Monolithic sharding is always required for the original
|
||||
// sharding system; it's required with the new sharding system only for locus walkers.
|
||||
if(readsDataSource != null && !readsDataSource.hasIndex() ) {
|
||||
|
|
@ -815,227 +367,22 @@ public class GenomeAnalysisEngine {
|
|||
return shardStrategy;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets a data source for the given set of reads.
|
||||
*
|
||||
* @param reads the read source information
|
||||
* @return A data source for the given set of reads.
|
||||
*/
|
||||
private SAMDataSource createReadsDataSource(ReadProperties reads) {
|
||||
return new SAMDataSource(reads);
|
||||
}
|
||||
|
||||
/**
|
||||
* Opens a reference sequence file paired with an index.
|
||||
*
|
||||
* @param refFile Handle to a reference sequence file. Non-null.
|
||||
* @return A thread-safe file wrapper.
|
||||
*/
|
||||
private ReferenceDataSource openReferenceSequenceFile(File refFile) {
|
||||
ReferenceDataSource ref = new ReferenceDataSource(refFile);
|
||||
GenomeLocParser.setupRefContigOrdering(ref.getReference());
|
||||
return ref;
|
||||
}
|
||||
|
||||
/**
|
||||
* Open the reference-ordered data sources.
|
||||
*
|
||||
* @param rods the reference order data to execute using
|
||||
* @return A list of reference-ordered data sources.
|
||||
*/
|
||||
private List<ReferenceOrderedDataSource> getReferenceOrderedDataSources(Walker walker, List<RMDTrack> rods) {
|
||||
List<ReferenceOrderedDataSource> dataSources = new ArrayList<ReferenceOrderedDataSource>();
|
||||
for (RMDTrack rod : rods)
|
||||
dataSources.add(new ReferenceOrderedDataSource(walker, rod));
|
||||
return dataSources;
|
||||
@Override
|
||||
protected boolean flashbackData() {
|
||||
return walker instanceof ReadWalker;
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize the output streams as specified by the user.
|
||||
*
|
||||
* @param walker the walker to initialize output streams for
|
||||
* @param outputTracker the tracker supplying the initialization data.
|
||||
*/
|
||||
private void initializeOutputStreams(Walker walker, OutputTracker outputTracker) {
|
||||
for (Map.Entry<ArgumentSource, Object> input : inputs.entrySet())
|
||||
private void initializeOutputStreams(OutputTracker outputTracker) {
|
||||
for (Map.Entry<ArgumentSource, Object> input : getInputs().entrySet())
|
||||
outputTracker.addInput(input.getKey(), input.getValue());
|
||||
for (Stub<?> stub : outputs)
|
||||
for (Stub<?> stub : getOutputs())
|
||||
outputTracker.addOutput(stub);
|
||||
|
||||
outputTracker.prepareWalker(walker);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the SAM File Header from the input reads' data source file
|
||||
* @return the SAM File Header from the input reads' data source file
|
||||
*/
|
||||
public SAMFileHeader getSAMFileHeader() {
|
||||
return readsDataSource.getHeader();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the unmerged SAM file header for an individual reader.
|
||||
* @param reader The reader.
|
||||
* @return Header for that reader.
|
||||
*/
|
||||
public SAMFileHeader getSAMFileHeader(SAMReaderID reader) {
|
||||
return readsDataSource.getHeader(reader);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns data source object encapsulating all essential info and handlers used to traverse
|
||||
* reads; header merger, individual file readers etc can be accessed through the returned data source object.
|
||||
*
|
||||
* @return the reads data source
|
||||
*/
|
||||
public SAMDataSource getDataSource() {
|
||||
return this.readsDataSource;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the collection of GATK main application arguments for enhanced walker validation.
|
||||
*
|
||||
* @return the GATK argument collection
|
||||
*/
|
||||
public GATKArgumentCollection getArguments() {
|
||||
return this.argCollection;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the list of intervals passed to the engine.
|
||||
* @return List of intervals.
|
||||
*/
|
||||
public GenomeLocSortedSet getIntervals() {
|
||||
return this.intervals;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the list of filters employed by this walker.
|
||||
* @return Collection of filters (actual instances) used by this walker.
|
||||
*/
|
||||
public Collection<SamRecordFilter> getFilters() {
|
||||
return this.filters;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns data source objects encapsulating all rod data;
|
||||
* individual rods can be accessed through the returned data source objects.
|
||||
*
|
||||
* @return the rods data sources
|
||||
*/
|
||||
public List<ReferenceOrderedDataSource> getRodDataSources() {
|
||||
return this.rodDataSources;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets cumulative metrics about the entire run to this point.
|
||||
* @return cumulative metrics about the entire run.
|
||||
*/
|
||||
public ReadMetrics getCumulativeMetrics() {
|
||||
return readsDataSource == null ? null : readsDataSource.getCumulativeReadMetrics();
|
||||
}
|
||||
|
||||
/**
|
||||
* Unpack the bam files to be processed, given a list of files. That list of files can
|
||||
* itself contain entries which are lists of other files to be read (note: you cannot have lists of lists of lists)
|
||||
*
|
||||
* @param inputFiles a list of files that represent either bam files themselves, or a file containing a list of bam files to process
|
||||
*
|
||||
* @return a flattened list of the bam files provided
|
||||
*/
|
||||
private List<SAMReaderID> unpackBAMFileList( List<File> inputFiles ) {
|
||||
List<SAMReaderID> unpackedReads = new ArrayList<SAMReaderID>();
|
||||
for( File inputFile: inputFiles ) {
|
||||
if (inputFile.getName().toLowerCase().endsWith(".list") ) {
|
||||
try {
|
||||
for(String fileName : new XReadLines(inputFile))
|
||||
unpackedReads.add(new SAMReaderID(new File(fileName),getTags(inputFile)));
|
||||
}
|
||||
catch( FileNotFoundException ex ) {
|
||||
throw new UserException.CouldNotReadInputFile(inputFile, "Unable to find file while unpacking reads", ex);
|
||||
}
|
||||
}
|
||||
else if(inputFile.getName().toLowerCase().endsWith(".bam")) {
|
||||
unpackedReads.add( new SAMReaderID(inputFile,getTags(inputFile)) );
|
||||
}
|
||||
else if(inputFile.getName().equals("-")) {
|
||||
unpackedReads.add(new SAMReaderID(new File("/dev/stdin"),Collections.<String>emptyList()));
|
||||
}
|
||||
else {
|
||||
throw new UserException.CommandLineException(String.format("The GATK reads argument (-I) supports only BAM files with the .bam extension and lists of BAM files " +
|
||||
"with the .list extension, but the file %s has neither extension. Please ensure that your BAM file or list " +
|
||||
"of BAM files is in the correct format, update the extension, and try again.",inputFile.getName()));
|
||||
}
|
||||
}
|
||||
return unpackedReads;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a sample by its ID
|
||||
* If an alias is passed in, return the main sample object
|
||||
* @param id
|
||||
* @return sample Object with this ID
|
||||
*/
|
||||
public Sample getSampleById(String id) {
|
||||
return sampleDataSource.getSampleById(id);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the sample for a given read group
|
||||
* Must first look up ID for read group
|
||||
* @param readGroup of sample
|
||||
* @return sample object with ID from the read group
|
||||
*/
|
||||
public Sample getSampleByReadGroup(SAMReadGroupRecord readGroup) {
|
||||
return sampleDataSource.getSampleByReadGroup(readGroup);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a sample for a given read
|
||||
* Must first look up read group, and then sample ID for that read group
|
||||
* @param read of sample
|
||||
* @return sample object of this read
|
||||
*/
|
||||
public Sample getSampleByRead(SAMRecord read) {
|
||||
return getSampleByReadGroup(read.getReadGroup());
|
||||
}
|
||||
|
||||
/**
|
||||
* Get number of sample objects
|
||||
* @return size of samples map
|
||||
*/
|
||||
public int sampleCount() {
|
||||
return sampleDataSource.sampleCount();
|
||||
}
|
||||
|
||||
/**
|
||||
* Return all samples with a given family ID
|
||||
* Note that this isn't terribly efficient (linear) - it may be worth adding a new family ID data structure for this
|
||||
* @param familyId
|
||||
* @return
|
||||
*/
|
||||
public Set<Sample> getFamily(String familyId) {
|
||||
return sampleDataSource.getFamily(familyId);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns all children of a given sample
|
||||
* See note on the efficiency of getFamily() - since this depends on getFamily() it's also not efficient
|
||||
* @param sample
|
||||
* @return
|
||||
*/
|
||||
public Set<Sample> getChildren(Sample sample) {
|
||||
return sampleDataSource.getChildren(sample);
|
||||
}
|
||||
|
||||
/**
|
||||
* Takes a list of sample names and returns their corresponding sample objects
|
||||
*
|
||||
* @param sampleNameList List of sample names
|
||||
* @return Corresponding set of samples
|
||||
*/
|
||||
public Set<Sample> getSamples(Collection<String> sampleNameList) {
|
||||
return sampleDataSource.getSamples(sampleNameList);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -120,6 +120,7 @@ public class ReadProperties {
|
|||
* @param samFiles list of reads files.
|
||||
* @param strictness Stringency of reads file parsing.
|
||||
* @param readBufferSize Number of reads to hold in memory per BAM.
|
||||
* @param downsamplingMethod Method for downsampling reads at a given locus.
|
||||
* @param exclusionList what safety checks we're willing to let slide
|
||||
* @param supplementalFilters additional filters to dynamically apply.
|
||||
* @param generateExtendedEvents if true, the engine will issue an extra call to walker's map() with
|
||||
|
|
@ -129,7 +130,7 @@ public class ReadProperties {
|
|||
* will explicitly list reads with deletion over the current reference base; otherwise, only observed
|
||||
* bases will be seen in the pileups, and the deletions will be skipped silently.
|
||||
*/
|
||||
ReadProperties( List<SAMReaderID> samFiles,
|
||||
public ReadProperties( List<SAMReaderID> samFiles,
|
||||
SAMFileReader.ValidationStringency strictness,
|
||||
Integer readBufferSize,
|
||||
DownsamplingMethod downsamplingMethod,
|
||||
|
|
|
|||
|
|
@ -47,12 +47,12 @@ public class ReferenceOrderedDataSource implements SimpleDataSource {
|
|||
* Create a new reference-ordered data source.
|
||||
* @param rod the reference ordered data
|
||||
*/
|
||||
public ReferenceOrderedDataSource( Walker walker, RMDTrack rod) {
|
||||
public ReferenceOrderedDataSource( RMDTrack rod, boolean flashbackData ) {
|
||||
this.rod = rod;
|
||||
if (rod.supportsQuery())
|
||||
iteratorPool = new ReferenceOrderedQueryDataPool(new RMDTrackBuilder(),rod);
|
||||
else
|
||||
iteratorPool = new ReferenceOrderedDataPool( walker, rod );
|
||||
iteratorPool = new ReferenceOrderedDataPool( rod, flashbackData );
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -110,8 +110,8 @@ public class ReferenceOrderedDataSource implements SimpleDataSource {
|
|||
class ReferenceOrderedDataPool extends ResourcePool<LocationAwareSeekableRODIterator, LocationAwareSeekableRODIterator> {
|
||||
private final RMDTrack rod;
|
||||
boolean flashbackData = false;
|
||||
public ReferenceOrderedDataPool( Walker walker, RMDTrack rod ) {
|
||||
if (walker instanceof ReadWalker) flashbackData = true;
|
||||
public ReferenceOrderedDataPool( RMDTrack rod, boolean flashbackData ) {
|
||||
this.flashbackData = flashbackData;
|
||||
this.rod = rod;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -33,10 +33,10 @@ import org.broad.tribble.index.Index;
|
|||
import org.broad.tribble.index.IndexFactory;
|
||||
import org.broad.tribble.source.BasicFeatureSource;
|
||||
import org.broad.tribble.util.LittleEndianOutputStream;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackCreationException;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
|
||||
import org.broadinstitute.sting.gatk.AbstractGenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
|
@ -367,7 +367,7 @@ public class RMDTrackBuilder extends PluginManager<FeatureCodec> {
|
|||
*
|
||||
* @return a list of RMDTracks, one for each -B option
|
||||
*/
|
||||
public List<RMDTrack> getReferenceMetaDataSources(GenomeAnalysisEngine engine, List<String> bindings) {
|
||||
public List<RMDTrack> getReferenceMetaDataSources(AbstractGenomeAnalysisEngine engine, List<String> bindings) {
|
||||
initializeBindings(engine,bindings);
|
||||
// try and make the tracks given their requests
|
||||
return createRequestedTrackObjects();
|
||||
|
|
@ -378,7 +378,7 @@ public class RMDTrackBuilder extends PluginManager<FeatureCodec> {
|
|||
* @param engine The engine, used to populate tags.
|
||||
* @param bindings the input to the GATK, as a list of strings passed in through the -B options
|
||||
*/
|
||||
private void initializeBindings(GenomeAnalysisEngine engine,List<String> bindings) {
|
||||
private void initializeBindings(AbstractGenomeAnalysisEngine engine,List<String> bindings) {
|
||||
// NOTE: Method acts as a static. Once the inputs have been passed once they are locked in.
|
||||
if (inputs.size() > 0 || bindings.size() == 0)
|
||||
return;
|
||||
|
|
|
|||
|
|
@ -76,7 +76,7 @@ public class ReferenceOrderedViewUnitTest extends BaseTest {
|
|||
public void testSingleBinding() {
|
||||
File file = new File(testDir + "TabularDataTest.dat");
|
||||
RMDTrack track = builder.createInstanceOfTrack(TableCodec.class,"tableTest",file);
|
||||
ReferenceOrderedDataSource dataSource = new ReferenceOrderedDataSource(null,track);
|
||||
ReferenceOrderedDataSource dataSource = new ReferenceOrderedDataSource(track,false);
|
||||
|
||||
Shard shard = new MockLocusShard(Collections.singletonList(GenomeLocParser.createGenomeLoc("chrM",1,30)));
|
||||
|
||||
|
|
@ -100,9 +100,9 @@ public class ReferenceOrderedViewUnitTest extends BaseTest {
|
|||
|
||||
|
||||
RMDTrack track = builder.createInstanceOfTrack(TableCodec.class,"tableTest1",file);
|
||||
ReferenceOrderedDataSource dataSource1 = new ReferenceOrderedDataSource(null,track);
|
||||
ReferenceOrderedDataSource dataSource1 = new ReferenceOrderedDataSource(track,false);
|
||||
RMDTrack track2 = builder.createInstanceOfTrack(TableCodec.class,"tableTest2",file);
|
||||
ReferenceOrderedDataSource dataSource2 = new ReferenceOrderedDataSource(null,track2);
|
||||
ReferenceOrderedDataSource dataSource2 = new ReferenceOrderedDataSource(track2,false);
|
||||
|
||||
|
||||
Shard shard = new MockLocusShard(Collections.singletonList(GenomeLocParser.createGenomeLoc("chrM",1,30)));
|
||||
|
|
|
|||
|
|
@ -59,7 +59,7 @@ public class ReferenceOrderedDataPoolUnitTest extends BaseTest {
|
|||
|
||||
@Test
|
||||
public void testCreateSingleIterator() {
|
||||
ResourcePool iteratorPool = new ReferenceOrderedDataPool(null,rod);
|
||||
ResourcePool iteratorPool = new ReferenceOrderedDataPool(rod, false);
|
||||
LocationAwareSeekableRODIterator iterator = (LocationAwareSeekableRODIterator)iteratorPool.iterator( new MappedStreamSegment(testSite1) );
|
||||
|
||||
Assert.assertEquals("Number of iterators in the pool is incorrect", 1, iteratorPool.numIterators());
|
||||
|
|
@ -80,7 +80,7 @@ public class ReferenceOrderedDataPoolUnitTest extends BaseTest {
|
|||
|
||||
@Test
|
||||
public void testCreateMultipleIterators() {
|
||||
ReferenceOrderedDataPool iteratorPool = new ReferenceOrderedDataPool(null,rod);
|
||||
ReferenceOrderedDataPool iteratorPool = new ReferenceOrderedDataPool(rod, false);
|
||||
LocationAwareSeekableRODIterator iterator1 = iteratorPool.iterator( new MappedStreamSegment(testSite1) );
|
||||
|
||||
// Create a new iterator at position 2.
|
||||
|
|
@ -130,7 +130,7 @@ public class ReferenceOrderedDataPoolUnitTest extends BaseTest {
|
|||
|
||||
@Test
|
||||
public void testIteratorConservation() {
|
||||
ReferenceOrderedDataPool iteratorPool = new ReferenceOrderedDataPool(null,rod);
|
||||
ReferenceOrderedDataPool iteratorPool = new ReferenceOrderedDataPool(rod, false);
|
||||
LocationAwareSeekableRODIterator iterator = iteratorPool.iterator( new MappedStreamSegment(testSite1) );
|
||||
|
||||
Assert.assertEquals("Number of iterators in the pool is incorrect", 1, iteratorPool.numIterators());
|
||||
|
|
@ -165,7 +165,7 @@ public class ReferenceOrderedDataPoolUnitTest extends BaseTest {
|
|||
|
||||
@Test
|
||||
public void testIteratorCreation() {
|
||||
ReferenceOrderedDataPool iteratorPool = new ReferenceOrderedDataPool(null,rod);
|
||||
ReferenceOrderedDataPool iteratorPool = new ReferenceOrderedDataPool(rod, false);
|
||||
LocationAwareSeekableRODIterator iterator = iteratorPool.iterator( new MappedStreamSegment(testSite3) );
|
||||
|
||||
Assert.assertEquals("Number of iterators in the pool is incorrect", 1, iteratorPool.numIterators());
|
||||
|
|
|
|||
Loading…
Reference in New Issue