/*
 * Copyright (c) 2010 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
 * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
2009-05-11 10:07:20 +08:00
package org.broadinstitute.sting.gatk ;
2009-07-10 07:59:53 +08:00
import net.sf.picard.filter.SamRecordFilter ;
2010-04-01 06:39:56 +08:00
import net.sf.picard.reference.ReferenceSequenceFile ;
2009-07-30 07:00:15 +08:00
import net.sf.samtools.* ;
2009-05-11 10:07:20 +08:00
import org.apache.log4j.Logger ;
2010-04-01 06:39:56 +08:00
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection ;
2010-05-03 14:02:35 +08:00
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper ;
2010-04-13 23:50:38 +08:00
import org.broadinstitute.sting.utils.interval.IntervalMergingRule ;
import org.broadinstitute.sting.utils.interval.IntervalUtils ;
2010-04-01 06:39:56 +08:00
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion ;
import org.broadinstitute.sting.gatk.datasources.shards.MonolithicShardStrategy ;
import org.broadinstitute.sting.gatk.datasources.shards.Shard ;
2009-07-30 00:11:45 +08:00
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategy ;
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategyFactory ;
2010-04-01 06:39:56 +08:00
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.* ;
2009-05-11 10:07:20 +08:00
import org.broadinstitute.sting.gatk.executive.MicroScheduler ;
2009-11-11 02:40:16 +08:00
import org.broadinstitute.sting.gatk.filters.FilterManager ;
2010-03-27 03:38:57 +08:00
import org.broadinstitute.sting.gatk.filters.ReadGroupBlackListFilter ;
2010-04-01 06:39:56 +08:00
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter ;
2009-08-23 08:56:02 +08:00
import org.broadinstitute.sting.gatk.io.OutputTracker ;
2010-04-01 06:39:56 +08:00
import org.broadinstitute.sting.gatk.io.stubs.Stub ;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack ;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackManager ;
2010-04-08 13:14:41 +08:00
import org.broadinstitute.sting.gatk.refdata.utils.RMDIntervalGenerator ;
2010-04-01 06:39:56 +08:00
import org.broadinstitute.sting.gatk.walkers.* ;
2009-06-06 07:34:37 +08:00
import org.broadinstitute.sting.utils.* ;
2010-04-20 07:00:08 +08:00
import org.broadinstitute.sting.commandline.ArgumentException ;
import org.broadinstitute.sting.commandline.ArgumentSource ;
2010-04-01 06:39:56 +08:00
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile ;
2009-05-11 10:07:20 +08:00
2009-07-10 05:57:00 +08:00
import java.io.File ;
2009-07-30 00:11:45 +08:00
import java.io.FileNotFoundException ;
import java.util.* ;
2009-05-11 10:07:20 +08:00
public class GenomeAnalysisEngine {
    // our instance of this genome analysis toolkit; it's used by other classes to extract the traversal engine
    // TODO: public static without final tends to indicate we're thinking about this the wrong way
    public static GenomeAnalysisEngine instance;

    /**
     * Accessor for sharded read data.
     */
    private SAMDataSource readsDataSource = null;

    /**
     * Accessor for sharded reference data.
     */
    private IndexedFastaSequenceFile referenceDataSource = null;

    /**
     * Accessor for sharded reference-ordered data.
     */
    private List<ReferenceOrderedDataSource> rodDataSources;

    // our argument collection, saved from the execute() call
    private GATKArgumentCollection argCollection;

    /**
     * Collection of inputs used by the walker.
     */
    private Map<ArgumentSource, Object> inputs = new HashMap<ArgumentSource, Object>();

    /**
     * Collection of intervals used by the walker.
     */
    private GenomeLocSortedSet intervals = null;

    /**
     * Collection of outputs used by the walker.
     */
    private Collection<Stub<?>> outputs = new ArrayList<Stub<?>>();

    /**
     * Collection of the filters applied to the walker's input data.
     */
    private Collection<SamRecordFilter> filters;

    /**
     * our log, which we want to capture anything from this class
     */
    private static Logger logger = Logger.getLogger(GenomeAnalysisEngine.class);

    /**
     * our walker manager
     */
    private final WalkerManager walkerManager;

    /**
     * Manage lists of filters.
     */
    private final FilterManager filterManager;
2009-05-11 10:07:20 +08:00
/ * *
* our constructor , where all the work is done
* < p / >
* legacy traversal types are sent to legacyTraversal function ; as we move more of the traversals to the
* new MicroScheduler class we ' ll be able to delete that function .
* /
2009-07-22 02:32:22 +08:00
public GenomeAnalysisEngine ( ) {
2009-07-10 07:59:53 +08:00
// make sure our instance variable points to this analysis engine
instance = this ;
2009-07-22 02:32:22 +08:00
walkerManager = new WalkerManager ( ) ;
2009-11-11 02:40:16 +08:00
filterManager = new FilterManager ( ) ;
2009-07-10 07:59:53 +08:00
}
2009-05-11 10:07:20 +08:00
2009-07-10 07:59:53 +08:00
/ * *
* Actually run the GATK with the specified walker .
2009-10-06 10:45:31 +08:00
*
2009-07-10 07:59:53 +08:00
* @param args the argument collection , where we get all our setup information from
* @param my_walker Walker to run over the dataset . Must not be null .
2009-09-30 06:23:19 +08:00
* @return the value of this traversal .
2009-07-10 07:59:53 +08:00
* /
2009-11-11 07:36:17 +08:00
public Object execute ( GATKArgumentCollection args , Walker < ? , ? > my_walker , Collection < SamRecordFilter > filters ) {
2010-05-18 05:00:44 +08:00
//HeapSizeMonitor monitor = new HeapSizeMonitor();
//monitor.start();
2009-05-11 10:07:20 +08:00
// validate our parameters
2009-07-10 07:59:53 +08:00
if ( args = = null ) {
2009-08-21 13:35:49 +08:00
throw new StingException ( "The GATKArgumentCollection passed to GenomeAnalysisEngine can not be null." ) ;
2009-05-11 10:07:20 +08:00
}
2010-04-01 20:47:48 +08:00
// validate our parameters
2009-07-10 07:59:53 +08:00
if ( my_walker = = null )
2009-08-21 13:35:49 +08:00
throw new StingException ( "The walker passed to GenomeAnalysisEngine can not be null." ) ;
2009-07-10 07:59:53 +08:00
2009-05-11 10:07:20 +08:00
// save our argument parameter
this . argCollection = args ;
2009-12-17 05:26:04 +08:00
this . filters = filters ;
2009-05-11 10:07:20 +08:00
2009-07-30 07:00:15 +08:00
// Prepare the data for traversal.
2009-11-11 07:36:17 +08:00
initializeDataSources ( my_walker , filters , argCollection ) ;
2009-05-20 07:26:17 +08:00
2009-05-11 10:07:20 +08:00
// our microscheduler, which is in charge of running everything
2009-07-30 07:00:15 +08:00
MicroScheduler microScheduler = createMicroscheduler ( my_walker ) ;
2009-05-11 10:07:20 +08:00
2009-07-30 00:11:45 +08:00
// create the output streams
2009-08-23 08:56:02 +08:00
initializeOutputStreams ( my_walker , microScheduler . getOutputTracker ( ) ) ;
2009-05-11 10:07:20 +08:00
2010-03-15 05:08:14 +08:00
initializeIntervals ( ) ;
ShardStrategy shardStrategy = getShardStrategy ( my_walker ,
2010-03-18 00:24:10 +08:00
microScheduler . getReference ( ) ,
intervals ,
argCollection . maximumEngineIterations ,
readsDataSource ! = null ? readsDataSource . getReadsInfo ( ) . getValidationExclusionList ( ) : null ) ;
2010-03-15 05:08:14 +08:00
// execute the microscheduler, storing the results
2010-05-18 05:00:44 +08:00
Object result = microScheduler . execute ( my_walker , shardStrategy , argCollection . maximumEngineIterations ) ;
//monitor.stop();
//logger.info(String.format("Maximum heap size consumed: %d",monitor.getMaxMemoryUsed()));
return result ;
2010-03-15 05:08:14 +08:00
}
/ * *
* Setup the intervals to be processed
* /
private void initializeIntervals ( ) {
2010-03-13 03:23:12 +08:00
2010-04-01 20:47:48 +08:00
// return null if no interval arguments at all
2010-04-08 13:14:41 +08:00
if ( ( argCollection . intervals = = null ) & & ( argCollection . excludeIntervals = = null ) & & ( argCollection . RODToInterval = = null ) )
2010-04-01 20:47:48 +08:00
return ;
else {
// if include argument isn't given, create new set of all possible intervals
2010-04-08 13:14:41 +08:00
GenomeLocSortedSet includeSortedSet = ( argCollection . intervals = = null & & argCollection . RODToInterval = = null ?
2010-04-01 20:47:48 +08:00
GenomeLocSortedSet . createSetFromSequenceDictionary ( this . referenceDataSource . getSequenceDictionary ( ) ) :
2010-04-16 00:31:43 +08:00
loadIntervals ( argCollection . intervals ,
argCollection . intervalMerging ,
GenomeLocParser . mergeIntervalLocations ( checkRODToIntervalArgument ( ) , argCollection . intervalMerging ) ) ) ;
2010-04-01 20:47:48 +08:00
// if no exclude arguments, can return parseIntervalArguments directly
if ( argCollection . excludeIntervals = = null )
intervals = includeSortedSet ;
// otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets
else {
2010-04-16 00:31:43 +08:00
GenomeLocSortedSet excludeSortedSet = loadIntervals ( argCollection . excludeIntervals , argCollection . intervalMerging , null ) ;
2010-04-13 23:50:38 +08:00
intervals = includeSortedSet . subtractRegions ( excludeSortedSet ) ;
2010-04-01 20:47:48 +08:00
// logging messages only printed when exclude (-XL) arguments are given
long toPruneSize = includeSortedSet . coveredSize ( ) ;
long toExcludeSize = excludeSortedSet . coveredSize ( ) ;
long intervalSize = intervals . coveredSize ( ) ;
logger . info ( String . format ( "Initial include intervals cover %d bases" , toPruneSize ) ) ;
logger . info ( String . format ( "Excluding %d bases from original intervals (%.2f%% reduction)" ,
toExcludeSize , ( toPruneSize - intervalSize ) / ( 0.01 * toPruneSize ) ) ) ;
}
2009-07-10 05:57:00 +08:00
}
2010-04-01 20:47:48 +08:00
}
/ * *
2010-04-16 00:31:43 +08:00
* Loads the intervals relevant to the current execution
2010-04-13 23:50:38 +08:00
* @param argList String representation of arguments ; might include ' all ' , filenames , intervals in samtools
* notation , or a combination of the
* @param mergingRule Technique to use when merging interval data .
2010-04-16 00:31:43 +08:00
* @param additionalIntervals a list of additional intervals to add to the returned set . Can be null .
2010-04-13 23:50:38 +08:00
* @return A sorted , merged list of all intervals specified in this arg list .
2010-04-01 20:47:48 +08:00
* /
2010-04-16 00:31:43 +08:00
private GenomeLocSortedSet loadIntervals ( List < String > argList , IntervalMergingRule mergingRule , List < GenomeLoc > additionalIntervals ) {
List < GenomeLoc > rawIntervals = ( additionalIntervals = = null ) ? new ArrayList < GenomeLoc > ( ) : additionalIntervals ; // running list of raw GenomeLocs
2010-04-13 23:50:38 +08:00
rawIntervals . addAll ( IntervalUtils . parseIntervalArguments ( argList ) ) ;
2010-04-08 13:14:41 +08:00
2010-04-01 20:47:48 +08:00
// redundant check => default no arguments is null, not empty list
if ( rawIntervals . size ( ) = = 0 )
return null ;
2010-05-07 06:37:25 +08:00
return IntervalUtils . sortAndMergeIntervals ( rawIntervals , mergingRule ) ;
2010-04-01 20:47:48 +08:00
}
2010-04-08 13:14:41 +08:00
/ * *
* if we have a ROD specified as a ' rodToIntervalTrackName ' , convert its records to RODs
* /
private static List < GenomeLoc > checkRODToIntervalArgument ( ) {
Map < String , ReferenceOrderedDataSource > rodNames = RMDIntervalGenerator . getRMDTrackNames ( instance . rodDataSources ) ;
// Do we have any RODs that overloaded as interval lists with the 'rodToIntervalTrackName' flag?
List < GenomeLoc > ret = new ArrayList < GenomeLoc > ( ) ;
if ( rodNames ! = null & & instance . argCollection . RODToInterval ! = null ) {
String rodName = GenomeAnalysisEngine . instance . argCollection . RODToInterval ;
// check to make sure we have a rod of that name
2010-04-16 14:17:31 +08:00
if ( ! rodNames . containsKey ( rodName ) )
2010-04-08 13:14:41 +08:00
throw new StingException ( "--rodToIntervalTrackName (-BTI) was pass the name '" + rodName + "', which wasn't given as a ROD name in the -B option" ) ;
for ( String str : rodNames . keySet ( ) )
2010-04-16 14:17:31 +08:00
if ( str . equals ( rodName ) ) {
2010-04-08 13:14:41 +08:00
RMDIntervalGenerator intervalGenerator = new RMDIntervalGenerator ( rodNames . get ( str ) . getReferenceOrderedData ( ) ) ;
ret . addAll ( intervalGenerator . toGenomeLocList ( ) ) ;
}
}
return ret ;
}
2009-07-30 00:11:45 +08:00
/ * *
2009-08-23 08:56:02 +08:00
* Add additional , externally managed IO streams for walker input .
2009-10-06 10:45:31 +08:00
*
2009-08-23 08:56:02 +08:00
* @param argumentSource Field in the walker into which to inject the value .
2009-10-06 10:45:31 +08:00
* @param value Instance to inject .
2009-07-30 00:11:45 +08:00
* /
2009-10-06 10:45:31 +08:00
public void addInput ( ArgumentSource argumentSource , Object value ) {
inputs . put ( argumentSource , value ) ;
2009-08-23 08:56:02 +08:00
}
/ * *
* Add additional , externally managed IO streams for walker output .
2009-10-06 10:45:31 +08:00
*
2009-08-23 08:56:02 +08:00
* @param stub Instance to inject .
* /
2009-10-06 10:45:31 +08:00
public void addOutput ( Stub < ? > stub ) {
2009-08-23 08:56:02 +08:00
outputs . add ( stub ) ;
2009-05-11 10:07:20 +08:00
}
2009-07-10 07:59:53 +08:00
/ * *
* Retrieves an instance of the walker based on the walker name .
2009-10-06 10:45:31 +08:00
*
2009-07-10 07:59:53 +08:00
* @param walkerName Name of the walker . Must not be null . If the walker cannot be instantiated , an exception will be thrown .
* @return An instance of the walker .
* /
2009-10-06 10:45:31 +08:00
public Walker < ? , ? > getWalkerByName ( String walkerName ) {
2009-11-11 02:40:16 +08:00
return walkerManager . createByName ( walkerName ) ;
}
/ * *
* Gets the name of a given walker type .
* @param walkerType Type of walker .
* @return Name of the walker .
* /
public String getWalkerName ( Class < Walker > walkerType ) {
return walkerManager . getName ( walkerType ) ;
2009-07-10 07:59:53 +08:00
}
2009-11-11 07:36:17 +08:00
/ * *
* Gets a list of the filters to associate with the given walker . Will NOT initialize the engine with this filters ;
* the caller must handle that directly .
2009-12-17 05:26:04 +08:00
* @param args Existing argument collection , for compatibility with legacy command - line walkers .
2009-11-11 07:36:17 +08:00
* @param walker Walker to use when determining which filters to apply .
* @return A collection of available filters .
* /
2009-12-17 05:26:04 +08:00
protected Collection < SamRecordFilter > createFiltersForWalker ( GATKArgumentCollection args , Walker walker ) {
2010-03-18 00:24:10 +08:00
Set < SamRecordFilter > filters = new HashSet < SamRecordFilter > ( ) ;
2009-11-11 07:36:17 +08:00
filters . addAll ( WalkerManager . getReadFilters ( walker , filterManager ) ) ;
if ( args . filterZeroMappingQualityReads ! = null & & args . filterZeroMappingQualityReads )
filters . add ( new ZeroMappingQualityReadFilter ( ) ) ;
2010-03-27 03:38:57 +08:00
if ( args . readGroupBlackList ! = null & & args . readGroupBlackList . size ( ) > 0 )
filters . add ( new ReadGroupBlackListFilter ( args . readGroupBlackList ) ) ;
2009-11-11 07:36:17 +08:00
for ( String filterName : args . readFilters )
filters . add ( filterManager . createByName ( filterName ) ) ;
return Collections . unmodifiableSet ( filters ) ;
}
2009-12-05 07:24:29 +08:00
/ * *
* Allow subclasses and others within this package direct access to the walker manager .
* @return The walker manager used by this package .
* /
protected WalkerManager getWalkerManager ( ) {
return walkerManager ;
}
2009-11-11 07:36:17 +08:00
private void initializeDataSources ( Walker my_walker , Collection < SamRecordFilter > filters , GATKArgumentCollection argCollection ) {
2009-10-06 10:45:31 +08:00
validateSuppliedReadsAgainstWalker ( my_walker , argCollection ) ;
2009-07-30 07:00:15 +08:00
logger . info ( "Strictness is " + argCollection . strictnessLevel ) ;
2009-11-11 07:36:17 +08:00
readsDataSource = createReadsDataSource ( extractSourceInfo ( my_walker , filters , argCollection ) ) ;
2009-07-30 07:00:15 +08:00
2009-10-06 10:45:31 +08:00
validateSuppliedReferenceAgainstWalker ( my_walker , argCollection ) ;
2009-07-30 07:00:15 +08:00
referenceDataSource = openReferenceSequenceFile ( argCollection . referenceFile ) ;
validateReadsAndReferenceAreCompatible ( readsDataSource , referenceDataSource ) ;
//
// please don't use these in the future, use the new syntax <- if we're not using these please remove them
//
2010-05-03 14:02:35 +08:00
if ( argCollection . DBSNPFile ! = null ) bindConvenienceRods ( DbSNPHelper . STANDARD_DBSNP_TRACK_NAME , "dbsnp" , argCollection . DBSNPFile ) ;
2009-07-30 07:00:15 +08:00
if ( argCollection . HAPMAPFile ! = null )
bindConvenienceRods ( "hapmap" , "HapMapAlleleFrequencies" , argCollection . HAPMAPFile ) ;
if ( argCollection . HAPMAPChipFile ! = null )
bindConvenienceRods ( "hapmap-chip" , "GFF" , argCollection . HAPMAPChipFile ) ;
// TODO: The ROD iterator currently does not understand multiple intervals file. Fix this by cleaning the ROD system.
if ( argCollection . intervals ! = null & & argCollection . intervals . size ( ) = = 1 ) {
bindConvenienceRods ( "interval" , "Intervals" , argCollection . intervals . get ( 0 ) . replaceAll ( "," , "" ) ) ;
}
2010-04-01 06:39:56 +08:00
RMDTrackManager manager = new RMDTrackManager ( ) ;
List < RMDTrack > tracks = manager . getReferenceMetaDataSources ( argCollection . RODBindings ) ;
validateSuppliedReferenceOrderedDataAgainstWalker ( my_walker , tracks ) ;
2009-07-30 07:00:15 +08:00
2010-04-01 06:39:56 +08:00
rodDataSources = getReferenceOrderedDataSources ( my_walker , tracks ) ;
2009-07-30 07:00:15 +08:00
}
2009-07-10 07:59:53 +08:00
2009-05-11 10:07:20 +08:00
/ * *
* setup a microscheduler
2009-10-06 10:45:31 +08:00
*
2009-05-11 10:07:20 +08:00
* @param my_walker our walker of type LocusWalker
* @return a new microscheduler
* /
2009-07-30 07:00:15 +08:00
private MicroScheduler createMicroscheduler ( Walker my_walker ) {
2009-05-11 10:07:20 +08:00
// the mircoscheduler to return
MicroScheduler microScheduler = null ;
2010-03-22 07:22:25 +08:00
// Temporarily require all walkers to have a reference, even if that reference is not conceptually necessary.
if ( ( my_walker instanceof ReadWalker | | my_walker instanceof DuplicateWalker | | my_walker instanceof ReadPairWalker ) & &
argCollection . referenceFile = = null ) {
Utils . scareUser ( String . format ( "Read-based traversals require a reference file but none was given" ) ) ;
2009-05-11 10:07:20 +08:00
}
2010-03-22 07:22:25 +08:00
return MicroScheduler . create ( this , my_walker , readsDataSource , referenceDataSource , rodDataSources , argCollection . numberOfThreads ) ;
2009-05-11 10:07:20 +08:00
}
2010-04-01 04:46:44 +08:00
/ * *
* Gets a unique identifier for the reader sourcing this read .
* @param read Read to examine .
* @return A unique identifier for the source file of this read . Exception if not found .
* /
public SAMReaderID getReaderIDForRead ( final SAMRecord read ) {
return getDataSource ( ) . getReaderID ( read ) ;
}
/ * *
* Gets the source file for this read .
* @param id Unique identifier determining which input file to use .
* @return The source filename for this read .
* /
public File getSourceFileForReaderID ( final SAMReaderID id ) {
return getDataSource ( ) . getSAMFile ( id ) ;
}
2009-10-06 10:45:31 +08:00
/ * *
* Returns sets of samples present in the ( merged ) input SAM stream , grouped by readers ( i . e . underlying
* individual bam files ) . For instance : if GATK is run with three input bam files ( three - I arguments ) , then the list
* returned by this method will contain 3 elements ( one for each reader ) , with each element being a set of sample names
2009-11-13 06:52:08 +08:00
* found in the corresponding bam file .
2009-10-06 10:45:31 +08:00
*
* @return
* /
public List < Set < String > > getSamplesByReaders ( ) {
2010-03-04 08:59:32 +08:00
List < SAMReaderID > readers = getDataSource ( ) . getReaderIDs ( ) ;
2009-10-06 10:45:31 +08:00
2010-01-15 11:35:55 +08:00
List < Set < String > > sample_sets = new ArrayList < Set < String > > ( readers . size ( ) ) ;
2009-10-06 10:45:31 +08:00
2010-03-04 08:59:32 +08:00
for ( SAMReaderID r : readers ) {
2009-10-06 10:45:31 +08:00
Set < String > samples = new HashSet < String > ( 1 ) ;
sample_sets . add ( samples ) ;
2010-03-04 08:59:32 +08:00
for ( SAMReadGroupRecord g : getDataSource ( ) . getHeader ( r ) . getReadGroups ( ) ) {
2009-10-06 10:45:31 +08:00
samples . add ( g . getSample ( ) ) ;
}
}
return sample_sets ;
}
/ * *
* Returns sets of libraries present in the ( merged ) input SAM stream , grouped by readers ( i . e . underlying
* individual bam files ) . For instance : if GATK is run with three input bam files ( three - I arguments ) , then the list
* returned by this method will contain 3 elements ( one for each reader ) , with each element being a set of library names
* found in the corresponding bam file .
*
* @return
* /
public List < Set < String > > getLibrariesByReaders ( ) {
2010-03-04 08:59:32 +08:00
List < SAMReaderID > readers = getDataSource ( ) . getReaderIDs ( ) ;
2009-10-06 10:45:31 +08:00
2010-01-15 11:35:55 +08:00
List < Set < String > > lib_sets = new ArrayList < Set < String > > ( readers . size ( ) ) ;
2009-10-06 10:45:31 +08:00
2010-03-04 08:59:32 +08:00
for ( SAMReaderID r : readers ) {
2009-10-06 10:45:31 +08:00
Set < String > libs = new HashSet < String > ( 2 ) ;
lib_sets . add ( libs ) ;
2010-03-04 08:59:32 +08:00
for ( SAMReadGroupRecord g : getDataSource ( ) . getHeader ( r ) . getReadGroups ( ) ) {
2009-10-06 10:45:31 +08:00
libs . add ( g . getLibrary ( ) ) ;
}
}
return lib_sets ;
}
/ * *
2010-02-04 12:12:49 +08:00
* Returns a mapping from original input files to their ( merged ) read group ids
*
* @return the mapping
* /
public Map < File , Set < String > > getFileToReadGroupIdMapping ( ) {
2010-02-07 12:21:04 +08:00
// populate the file -> read group mapping
Map < File , Set < String > > fileToReadGroupIdMap = new HashMap < File , Set < String > > ( ) ;
2010-03-04 08:59:32 +08:00
for ( SAMReaderID id : getDataSource ( ) . getReaderIDs ( ) ) {
2010-02-07 12:21:04 +08:00
Set < String > readGroups = new HashSet < String > ( 5 ) ;
2010-03-04 08:59:32 +08:00
for ( SAMReadGroupRecord g : getDataSource ( ) . getHeader ( id ) . getReadGroups ( ) ) {
2010-02-07 12:21:04 +08:00
if ( getDataSource ( ) . hasReadGroupCollisions ( ) ) {
// Check if there were read group clashes.
// If there were, use the SamFileHeaderMerger to translate from the
// original read group id to the read group id in the merged stream
2010-03-04 08:59:32 +08:00
readGroups . add ( getDataSource ( ) . getReadGroupId ( id , g . getReadGroupId ( ) ) ) ;
2010-02-07 12:21:04 +08:00
} else {
// otherwise, pass through the unmapped read groups since this is what Picard does as well
readGroups . add ( g . getReadGroupId ( ) ) ;
}
}
2010-03-04 08:59:32 +08:00
fileToReadGroupIdMap . put ( getDataSource ( ) . getSAMFile ( id ) , readGroups ) ;
2010-02-07 12:21:04 +08:00
}
return fileToReadGroupIdMap ;
2010-02-04 12:12:49 +08:00
}
/ * *
2010-03-18 00:24:10 +08:00
* * * * * UNLESS YOU HAVE GOOD REASON TO , DO NOT USE THIS METHOD ; USE getFileToReadGroupIdMapping ( ) INSTEAD * * * *
2010-02-04 12:12:49 +08:00
*
2009-10-06 10:45:31 +08:00
* Returns sets of ( remapped ) read groups in input SAM stream , grouped by readers ( i . e . underlying
* individual bam files ) . For instance : if GATK is run with three input bam files ( three - I arguments ) , then the list
* returned by this method will contain 3 elements ( one for each reader ) , with each element being a set of remapped read groups
* ( i . e . as seen by read . getReadGroup ( ) . getReadGroupId ( ) in the merged stream ) that come from the corresponding bam file .
*
2010-02-04 12:12:49 +08:00
* @return sets of ( merged ) read group ids in order of input bams
2009-10-06 10:45:31 +08:00
* /
public List < Set < String > > getMergedReadGroupsByReaders ( ) {
2010-03-04 08:59:32 +08:00
List < SAMReaderID > readers = getDataSource ( ) . getReaderIDs ( ) ;
2009-10-06 10:45:31 +08:00
2010-01-15 11:35:55 +08:00
List < Set < String > > rg_sets = new ArrayList < Set < String > > ( readers . size ( ) ) ;
2009-10-06 10:45:31 +08:00
2010-03-04 08:59:32 +08:00
for ( SAMReaderID r : readers ) {
2009-10-06 10:45:31 +08:00
Set < String > groups = new HashSet < String > ( 5 ) ;
rg_sets . add ( groups ) ;
2010-03-04 08:59:32 +08:00
for ( SAMReadGroupRecord g : getDataSource ( ) . getHeader ( r ) . getReadGroups ( ) ) {
2010-01-15 11:35:55 +08:00
if ( getDataSource ( ) . hasReadGroupCollisions ( ) ) { // Check if there were read group clashes with hasGroupIdDuplicates and if so:
2009-10-06 10:45:31 +08:00
// use HeaderMerger to translate original read group id from the reader into the read group id in the
// merged stream, and save that remapped read group id to associate it with specific reader
2010-01-15 11:35:55 +08:00
groups . add ( getDataSource ( ) . getReadGroupId ( r , g . getReadGroupId ( ) ) ) ;
2009-10-06 10:45:31 +08:00
} else {
// otherwise, pass through the unmapped read groups since this is what Picard does as well
groups . add ( g . getReadGroupId ( ) ) ;
}
}
}
return rg_sets ;
}
2009-06-10 21:39:32 +08:00
/ * *
* Bundles all the source information about the reads into a unified data structure .
2009-07-10 05:57:00 +08:00
*
2009-10-06 10:45:31 +08:00
* @param walker The walker for which to extract info .
2009-06-10 21:39:32 +08:00
* @param argCollection The collection of arguments passed to the engine .
* @return The reads object providing reads source info .
* /
2009-11-11 07:36:17 +08:00
private Reads extractSourceInfo ( Walker walker , Collection < SamRecordFilter > filters , GATKArgumentCollection argCollection ) {
2010-05-19 13:40:05 +08:00
DownsamplingMethod method = null ;
if ( argCollection . downsamplingType ! = DownsampleType . NONE )
method = new DownsamplingMethod ( argCollection . downsamplingType , argCollection . downsampleCoverage , argCollection . downsampleFraction ) ;
else if ( WalkerManager . getDownsamplingMethod ( walker ) ! = null )
method = WalkerManager . getDownsamplingMethod ( walker ) ;
else
method = new DownsamplingMethod ( DownsampleType . NONE , null , null ) ;
2009-10-06 10:45:31 +08:00
return new Reads ( argCollection . samFiles ,
argCollection . strictnessLevel ,
2010-05-19 13:40:05 +08:00
method ,
2010-01-15 08:14:35 +08:00
new ValidationExclusion ( Arrays . asList ( argCollection . unsafe ) ) ,
2009-10-06 10:45:31 +08:00
filters ,
argCollection . readMaxPileup ,
2009-12-29 03:52:44 +08:00
walker . includeReadsWithDeletionAtLoci ( ) ,
walker . generateExtendedEvents ( ) ) ;
2009-06-10 21:39:32 +08:00
}
2009-07-30 07:00:15 +08:00
/ * *
* Verifies that the supplied set of reads files mesh with what the walker says it requires .
2009-10-06 10:45:31 +08:00
*
* @param walker Walker to test .
2009-07-30 07:00:15 +08:00
* @param arguments Supplied reads files .
* /
2009-10-06 10:45:31 +08:00
private void validateSuppliedReadsAgainstWalker ( Walker walker , GATKArgumentCollection arguments ) {
2009-05-20 07:26:17 +08:00
// Check what the walker says is required against what was provided on the command line.
2009-07-10 05:57:00 +08:00
if ( WalkerManager . isRequired ( walker , DataSource . READS ) & & ( arguments . samFiles = = null | | arguments . samFiles . size ( ) = = 0 ) )
2009-07-30 07:00:15 +08:00
throw new ArgumentException ( "Walker requires reads but none were provided. If this is incorrect, alter the walker's @Requires annotation." ) ;
2009-05-20 07:26:17 +08:00
// Check what the walker says is allowed against what was provided on the command line.
2009-07-10 05:57:00 +08:00
if ( ( arguments . samFiles ! = null & & arguments . samFiles . size ( ) > 0 ) & & ! WalkerManager . isAllowed ( walker , DataSource . READS ) )
2009-07-30 07:00:15 +08:00
throw new ArgumentException ( "Walker does not allow reads but reads were provided. If this is incorrect, alter the walker's @Allows annotation" ) ;
}
/ * *
* Verifies that the supplied reference file mesh with what the walker says it requires .
2009-10-06 10:45:31 +08:00
*
* @param walker Walker to test .
2009-07-30 07:00:15 +08:00
* @param arguments Supplied reads files .
* /
2009-10-06 10:45:31 +08:00
private void validateSuppliedReferenceAgainstWalker ( Walker walker , GATKArgumentCollection arguments ) {
2009-07-30 07:00:15 +08:00
// Check what the walker says is required against what was provided on the command line.
2009-12-11 03:15:48 +08:00
// TODO: Temporarily disabling WalkerManager.isRequired check on the reference because the reference is always required.
if ( /*WalkerManager.isRequired(walker, DataSource.REFERENCE) &&*/ arguments . referenceFile = = null )
2009-07-30 07:00:15 +08:00
throw new ArgumentException ( "Walker requires a reference but none was provided. If this is incorrect, alter the walker's @Requires annotation." ) ;
// Check what the walker says is allowed against what was provided on the command line.
2009-07-10 05:57:00 +08:00
if ( arguments . referenceFile ! = null & & ! WalkerManager . isAllowed ( walker , DataSource . REFERENCE ) )
2009-07-30 07:00:15 +08:00
throw new ArgumentException ( "Walker does not allow a reference but one was provided. If this is incorrect, alter the walker's @Allows annotation" ) ;
}
2009-05-20 07:26:17 +08:00
2009-07-30 07:00:15 +08:00
/ * *
* Verifies that all required reference - ordered data has been supplied , and any reference - ordered data that was not
* ' allowed ' is still present .
2009-10-06 10:45:31 +08:00
*
2009-07-30 07:00:15 +08:00
* @param walker Walker to test .
2009-10-06 10:45:31 +08:00
* @param rods Reference - ordered data to load .
2009-07-30 07:00:15 +08:00
* /
2010-04-01 06:39:56 +08:00
private void validateSuppliedReferenceOrderedDataAgainstWalker ( Walker walker , List < RMDTrack > rods ) {
2009-05-20 07:26:17 +08:00
// Check to make sure that all required metadata is present.
List < RMD > allRequired = WalkerManager . getRequiredMetaData ( walker ) ;
2009-07-10 05:57:00 +08:00
for ( RMD required : allRequired ) {
2009-05-20 07:26:17 +08:00
boolean found = false ;
2010-04-01 06:39:56 +08:00
for ( RMDTrack rod : rods ) {
2009-07-10 05:57:00 +08:00
if ( rod . matches ( required . name ( ) , required . type ( ) ) )
2009-05-20 07:26:17 +08:00
found = true ;
}
2009-07-10 05:57:00 +08:00
if ( ! found )
throw new ArgumentException ( String . format ( "Unable to find reference metadata (%s,%s)" , required . name ( ) , required . type ( ) ) ) ;
2009-05-20 07:26:17 +08:00
}
// Check to see that no forbidden rods are present.
2010-04-01 06:39:56 +08:00
for ( RMDTrack rod : rods ) {
2009-07-10 05:57:00 +08:00
if ( ! WalkerManager . isAllowed ( walker , rod ) )
2009-10-16 23:39:08 +08:00
throw new ArgumentException ( String . format ( "Walker of type %s does not allow access to metadata: %s. If this is incorrect, change the @Allows metadata" , walker . getClass ( ) , rod . getName ( ) ) ) ;
2009-07-30 07:00:15 +08:00
}
}
/ * *
* Now that all files are open , validate the sequence dictionaries of the reads vs . the reference .
2009-10-06 10:45:31 +08:00
*
* @param reads Reads data source .
2009-07-30 07:00:15 +08:00
* @param reference Reference data source .
* /
2009-10-06 10:45:31 +08:00
private void validateReadsAndReferenceAreCompatible ( SAMDataSource reads , ReferenceSequenceFile reference ) {
if ( reads = = null | | reference = = null )
2009-07-30 07:00:15 +08:00
return ;
// Compile a set of sequence names that exist in the BAM files.
SAMSequenceDictionary readsDictionary = reads . getHeader ( ) . getSequenceDictionary ( ) ;
Set < String > readsSequenceNames = new TreeSet < String > ( ) ;
2009-10-06 10:45:31 +08:00
for ( SAMSequenceRecord dictionaryEntry : readsDictionary . getSequences ( ) )
2009-07-30 07:00:15 +08:00
readsSequenceNames . add ( dictionaryEntry . getSequenceName ( ) ) ;
// Compile a set of sequence names that exist in the reference file.
SAMSequenceDictionary referenceDictionary = reference . getSequenceDictionary ( ) ;
Set < String > referenceSequenceNames = new TreeSet < String > ( ) ;
2009-10-06 10:45:31 +08:00
for ( SAMSequenceRecord dictionaryEntry : referenceDictionary . getSequences ( ) )
2009-07-30 07:00:15 +08:00
referenceSequenceNames . add ( dictionaryEntry . getSequenceName ( ) ) ;
2009-10-06 10:45:31 +08:00
if ( readsSequenceNames . size ( ) = = 0 ) {
2009-07-30 07:00:15 +08:00
logger . info ( "Reads file is unmapped. Skipping validation against reference." ) ;
return ;
}
// If there's no overlap between reads and reference, data will be bogus. Throw an exception.
Set < String > intersectingSequenceNames = new HashSet < String > ( readsSequenceNames ) ;
intersectingSequenceNames . retainAll ( referenceSequenceNames ) ;
2009-10-06 10:45:31 +08:00
if ( intersectingSequenceNames . size ( ) = = 0 ) {
2009-07-30 07:00:15 +08:00
StringBuilder error = new StringBuilder ( ) ;
error . append ( "No overlap exists between sequence dictionary of the reads and the sequence dictionary of the reference. Perhaps you're using the wrong reference?\n" ) ;
error . append ( System . getProperty ( "line.separator" ) ) ;
error . append ( String . format ( "Reads contigs: %s%n" , prettyPrintSequenceRecords ( readsDictionary ) ) ) ;
error . append ( String . format ( "Reference contigs: %s%n" , prettyPrintSequenceRecords ( referenceDictionary ) ) ) ;
logger . error ( error . toString ( ) ) ;
Utils . scareUser ( "No overlap exists between sequence dictionary of the reads and the sequence dictionary of the reference." ) ;
}
// If the two datasets are not equal and neither is a strict subset of the other, warn the user.
2009-10-06 10:45:31 +08:00
if ( ! readsSequenceNames . equals ( referenceSequenceNames ) & &
! readsSequenceNames . containsAll ( referenceSequenceNames ) & &
! referenceSequenceNames . containsAll ( readsSequenceNames ) ) {
2009-07-30 07:00:15 +08:00
StringBuilder warning = new StringBuilder ( ) ;
warning . append ( "Limited overlap exists between sequence dictionary of the reads and the sequence dictionary of the reference. Perhaps you're using the wrong reference?\n" ) ;
warning . append ( System . getProperty ( "line.separator" ) ) ;
warning . append ( String . format ( "Reads contigs: %s%n" , prettyPrintSequenceRecords ( readsDictionary ) ) ) ;
warning . append ( String . format ( "Reference contigs: %s%n" , prettyPrintSequenceRecords ( referenceDictionary ) ) ) ;
logger . warn ( warning . toString ( ) ) ;
2009-05-20 07:26:17 +08:00
}
}
2009-10-06 10:45:31 +08:00
private String prettyPrintSequenceRecords ( SAMSequenceDictionary sequenceDictionary ) {
String [ ] sequenceRecordNames = new String [ sequenceDictionary . size ( ) ] ;
2009-07-30 07:00:15 +08:00
int sequenceRecordIndex = 0 ;
2009-10-06 10:45:31 +08:00
for ( SAMSequenceRecord sequenceRecord : sequenceDictionary . getSequences ( ) )
2009-07-30 07:00:15 +08:00
sequenceRecordNames [ sequenceRecordIndex + + ] = sequenceRecord . getSequenceName ( ) ;
return Arrays . deepToString ( sequenceRecordNames ) ;
}
2009-05-11 10:07:20 +08:00
/ * *
* Convenience function that binds RODs using the old - style command line parser to the new style list for
* a uniform processing .
*
2009-07-10 06:10:22 +08:00
* @param name the name of the rod
* @param type its type
* @param file the file to load the rod from
2009-05-11 10:07:20 +08:00
* /
private void bindConvenienceRods ( final String name , final String type , final String file ) {
argCollection . RODBindings . add ( Utils . join ( "," , new String [ ] { name , type , file } ) ) ;
}
2009-07-30 00:11:45 +08:00
/ * *
* Get the sharding strategy given a driving data source .
*
* @param walker Walker for which to infer sharding strategy .
* @param drivingDataSource Data on which to shard .
* @param intervals Intervals to use when limiting sharding .
* @param maxIterations the maximum number of iterations to run through
* @return Sharding strategy for this driving data source .
* /
protected ShardStrategy getShardStrategy ( Walker walker ,
ReferenceSequenceFile drivingDataSource ,
GenomeLocSortedSet intervals ,
2010-01-15 08:14:35 +08:00
Integer maxIterations ,
ValidationExclusion exclusions ) {
2010-03-22 07:22:25 +08:00
// Use monolithic sharding if no index is present. Monolithic sharding is always required for the original
// sharding system; it's required with the new sharding system only for locus walkers.
if ( readsDataSource ! = null & & ! readsDataSource . hasIndex ( ) & & ( argCollection . disableExperimentalSharding | | walker instanceof LocusWalker ) ) {
2010-01-15 08:14:35 +08:00
if ( ! exclusions . contains ( ValidationExclusion . TYPE . ALLOW_UNINDEXED_BAM ) | | intervals ! = null )
2009-12-17 05:55:42 +08:00
throw new StingException ( "The GATK cannot currently process unindexed BAM files" ) ;
Shard . ShardType shardType ;
2010-03-22 07:31:45 +08:00
if ( walker instanceof LocusWalker ) {
if ( readsDataSource ! = null & & readsDataSource . getSortOrder ( ) ! = SAMFileHeader . SortOrder . coordinate )
Utils . scareUser ( "Locus walkers can only walk over coordinate-sorted data. Please resort your input BAM file." ) ;
2009-12-17 05:55:42 +08:00
shardType = Shard . ShardType . LOCUS ;
2010-03-22 07:31:45 +08:00
}
2010-03-22 07:22:25 +08:00
else if ( walker instanceof ReadWalker | | walker instanceof DuplicateWalker | | walker instanceof ReadPairWalker )
2009-12-17 05:55:42 +08:00
shardType = Shard . ShardType . READ ;
else
throw new StingException ( "The GATK cannot currently process unindexed BAM files" ) ;
2010-05-18 05:04:00 +08:00
return new MonolithicShardStrategy ( shardType ) ;
2009-12-17 05:55:42 +08:00
}
2009-07-30 00:11:45 +08:00
ShardStrategy shardStrategy = null ;
ShardStrategyFactory . SHATTER_STRATEGY shardType ;
2009-09-13 03:13:15 +08:00
2009-12-17 05:55:42 +08:00
long SHARD_SIZE = 100000L ;
2009-07-30 00:11:45 +08:00
if ( walker instanceof LocusWalker ) {
2009-10-21 07:31:13 +08:00
if ( walker instanceof RodWalker ) SHARD_SIZE * = 1000 ;
2009-09-13 03:13:15 +08:00
2010-01-13 01:52:27 +08:00
if ( intervals ! = null & & ! intervals . isEmpty ( ) ) {
2010-03-22 07:22:25 +08:00
if ( readsDataSource ! = null & & readsDataSource . getSortOrder ( ) ! = SAMFileHeader . SortOrder . coordinate )
Utils . scareUser ( "Locus walkers can only walk over coordinate-sorted data. Please resort your input BAM file." ) ;
2009-07-30 00:11:45 +08:00
shardType = ( walker . isReduceByInterval ( ) ) ?
ShardStrategyFactory . SHATTER_STRATEGY . INTERVAL :
ShardStrategyFactory . SHATTER_STRATEGY . LINEAR ;
2010-01-15 11:35:55 +08:00
shardStrategy = ShardStrategyFactory . shatter ( readsDataSource ,
2010-03-08 05:01:25 +08:00
referenceDataSource ,
2010-03-17 11:32:45 +08:00
! argCollection . disableExperimentalSharding ? ShardStrategyFactory . SHATTER_STRATEGY . LOCUS_EXPERIMENTAL : shardType ,
2009-07-30 00:11:45 +08:00
drivingDataSource . getSequenceDictionary ( ) ,
SHARD_SIZE ,
intervals , maxIterations ) ;
} else
2010-03-01 09:58:44 +08:00
shardStrategy = ShardStrategyFactory . shatter ( readsDataSource ,
2010-03-08 05:01:25 +08:00
referenceDataSource ,
2010-03-17 11:32:45 +08:00
! argCollection . disableExperimentalSharding ? ShardStrategyFactory . SHATTER_STRATEGY . LOCUS_EXPERIMENTAL : ShardStrategyFactory . SHATTER_STRATEGY . LINEAR ,
2009-07-30 00:11:45 +08:00
drivingDataSource . getSequenceDictionary ( ) ,
SHARD_SIZE , maxIterations ) ;
} else if ( walker instanceof ReadWalker | |
walker instanceof DuplicateWalker ) {
2010-03-17 11:32:45 +08:00
if ( ! argCollection . disableExperimentalSharding )
2010-01-15 11:35:55 +08:00
shardType = ShardStrategyFactory . SHATTER_STRATEGY . READS_EXPERIMENTAL ;
else
shardType = ShardStrategyFactory . SHATTER_STRATEGY . READS ;
2009-07-30 00:11:45 +08:00
2010-01-13 01:52:27 +08:00
if ( intervals ! = null & & ! intervals . isEmpty ( ) ) {
2010-03-08 05:01:25 +08:00
shardStrategy = ShardStrategyFactory . shatter ( readsDataSource ,
referenceDataSource ,
shardType ,
2009-07-30 00:11:45 +08:00
drivingDataSource . getSequenceDictionary ( ) ,
SHARD_SIZE ,
intervals , maxIterations ) ;
} else {
2010-03-08 05:01:25 +08:00
shardStrategy = ShardStrategyFactory . shatter ( readsDataSource ,
referenceDataSource ,
shardType ,
2009-07-30 00:11:45 +08:00
drivingDataSource . getSequenceDictionary ( ) ,
SHARD_SIZE , maxIterations ) ;
}
2010-03-22 07:22:25 +08:00
} else if ( walker instanceof ReadPairWalker ) {
if ( argCollection . disableExperimentalSharding )
Utils . scareUser ( "Pairs traversal cannot be used in conjunction with the old sharding system." ) ;
if ( readsDataSource ! = null & & readsDataSource . getSortOrder ( ) ! = SAMFileHeader . SortOrder . queryname )
Utils . scareUser ( "Read pair walkers can only walk over query name-sorted data. Please resort your input BAM file." ) ;
if ( intervals ! = null & & ! intervals . isEmpty ( ) )
Utils . scareUser ( "Pairs traversal cannot be used in conjunction with intervals." ) ;
shardStrategy = ShardStrategyFactory . shatter ( readsDataSource ,
referenceDataSource ,
ShardStrategyFactory . SHATTER_STRATEGY . READS_EXPERIMENTAL ,
drivingDataSource . getSequenceDictionary ( ) ,
SHARD_SIZE , maxIterations ) ;
2009-07-30 00:11:45 +08:00
} else
throw new StingException ( "Unable to support walker of type" + walker . getClass ( ) . getName ( ) ) ;
return shardStrategy ;
}
/ * *
* Gets a data source for the given set of reads .
*
2009-10-06 10:45:31 +08:00
* @param reads the read source information
2009-07-30 00:11:45 +08:00
* @return A data source for the given set of reads .
* /
private SAMDataSource createReadsDataSource ( Reads reads ) {
// By reference traversals are happy with no reads. Make sure that case is handled.
if ( reads . getReadsFiles ( ) . size ( ) = = 0 )
return null ;
2010-01-15 11:35:55 +08:00
SAMDataSource dataSource = null ;
2010-03-17 11:32:45 +08:00
if ( ! argCollection . disableExperimentalSharding )
2010-01-15 11:35:55 +08:00
dataSource = new BlockDrivenSAMDataSource ( reads ) ;
else
dataSource = new IndexDrivenSAMDataSource ( reads ) ;
2009-07-30 00:11:45 +08:00
return dataSource ;
}
/ * *
* Opens a reference sequence file paired with an index .
*
* @param refFile Handle to a reference sequence file . Non - null .
* @return A thread - safe file wrapper .
* /
private IndexedFastaSequenceFile openReferenceSequenceFile ( File refFile ) {
IndexedFastaSequenceFile ref = null ;
try {
ref = new IndexedFastaSequenceFile ( refFile ) ;
}
catch ( FileNotFoundException ex ) {
throw new StingException ( "I/O error while opening fasta file: " + ex . getMessage ( ) , ex ) ;
}
GenomeLocParser . setupRefContigOrdering ( ref ) ;
return ref ;
}
/ * *
* Open the reference - ordered data sources .
*
* @param rods the reference order data to execute using
* @return A list of reference - ordered data sources .
* /
2010-04-01 06:39:56 +08:00
private List < ReferenceOrderedDataSource > getReferenceOrderedDataSources ( Walker walker , List < RMDTrack > rods ) {
2009-07-30 00:11:45 +08:00
List < ReferenceOrderedDataSource > dataSources = new ArrayList < ReferenceOrderedDataSource > ( ) ;
2010-04-01 06:39:56 +08:00
for ( RMDTrack rod : rods )
2010-03-11 06:54:58 +08:00
dataSources . add ( new ReferenceOrderedDataSource ( walker , rod ) ) ;
2009-07-30 00:11:45 +08:00
return dataSources ;
}
2009-05-11 10:07:20 +08:00
2009-07-10 06:10:22 +08:00
/ * *
* Initialize the output streams as specified by the user .
*
2009-10-06 10:45:31 +08:00
* @param walker the walker to initialize output streams for
2009-09-30 06:23:19 +08:00
* @param outputTracker the tracker supplying the initialization data .
2009-07-10 06:10:22 +08:00
* /
2009-08-23 08:56:02 +08:00
private void initializeOutputStreams ( Walker walker , OutputTracker outputTracker ) {
2009-10-06 10:45:31 +08:00
if ( argCollection . outErrFileName ! = null )
outputTracker . initializeCoreIO ( argCollection . outErrFileName , argCollection . outErrFileName ) ;
2009-07-30 00:11:45 +08:00
else
2009-10-06 10:45:31 +08:00
outputTracker . initializeCoreIO ( argCollection . outFileName , argCollection . errFileName ) ;
2009-05-11 10:07:20 +08:00
2009-10-06 10:45:31 +08:00
for ( Map . Entry < ArgumentSource , Object > input : inputs . entrySet ( ) )
outputTracker . addInput ( input . getKey ( ) , input . getValue ( ) ) ;
for ( Stub < ? > stub : outputs )
2009-08-23 08:56:02 +08:00
outputTracker . addOutput ( stub ) ;
outputTracker . prepareWalker ( walker ) ;
2009-05-11 10:07:20 +08:00
}
2009-11-25 00:16:44 +08:00
/ * *
* Returns the SAM File Header from the input reads ' data source file
* @return the SAM File Header from the input reads ' data source file
* /
2009-07-30 07:00:15 +08:00
public SAMFileHeader getSAMFileHeader ( ) {
return readsDataSource . getHeader ( ) ;
2009-05-11 10:07:20 +08:00
}
2009-05-14 21:55:52 +08:00
2010-03-04 08:59:32 +08:00
/ * *
* Returns the unmerged SAM file header for an individual reader .
* @param reader The reader .
* @return Header for that reader .
* /
public SAMFileHeader getSAMFileHeader ( SAMReaderID reader ) {
return readsDataSource . getHeader ( reader ) ;
}
2009-07-25 06:59:49 +08:00
/ * *
* Returns data source object encapsulating all essential info and handlers used to traverse
* reads ; header merger , individual file readers etc can be accessed through the returned data source object .
2009-10-06 10:45:31 +08:00
*
2009-11-09 12:25:39 +08:00
* @return the reads data source
2009-07-25 06:59:49 +08:00
* /
public SAMDataSource getDataSource ( ) {
2009-07-30 07:00:15 +08:00
return this . readsDataSource ;
2009-07-25 06:59:49 +08:00
}
2009-07-10 06:10:22 +08:00
/ * *
* Gets the collection of GATK main application arguments for enhanced walker validation .
*
* @return the GATK argument collection
* /
2009-05-14 21:55:52 +08:00
public GATKArgumentCollection getArguments ( ) {
return this . argCollection ;
}
2009-11-09 12:25:39 +08:00
2010-02-25 08:16:50 +08:00
/ * *
* Get the list of intervals passed to the engine .
* @return List of intervals .
* /
public GenomeLocSortedSet getIntervals ( ) {
return this . intervals ;
}
2009-12-17 05:26:04 +08:00
/ * *
* Gets the list of filters employed by this walker .
* @return Collection of filters ( actual instances ) used by this walker .
* /
public Collection < SamRecordFilter > getFilters ( ) {
return this . filters ;
}
2009-11-09 12:25:39 +08:00
/ * *
* Returns data source objects encapsulating all rod data ;
* individual rods can be accessed through the returned data source objects .
*
* @return the rods data sources
* /
public List < ReferenceOrderedDataSource > getRodDataSources ( ) {
return this . rodDataSources ;
}
2009-05-11 10:07:20 +08:00
}