2009-06-23 05:11:18 +08:00
/*
 * Copyright (c) 2012 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
 * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
2009-06-23 05:11:18 +08:00
2009-05-11 10:07:20 +08:00
package org.broadinstitute.sting.gatk ;
2012-09-09 08:17:15 +08:00
import com.google.java.contract.Ensures ;
2010-12-21 10:09:46 +08:00
import net.sf.picard.reference.IndexedFastaSequenceFile ;
2010-04-01 06:39:56 +08:00
import net.sf.picard.reference.ReferenceSequenceFile ;
2012-04-09 08:44:39 +08:00
import net.sf.samtools.SAMFileHeader ;
import net.sf.samtools.SAMRecord ;
import net.sf.samtools.SAMSequenceDictionary ;
2010-12-21 10:09:46 +08:00
import org.apache.log4j.Logger ;
2011-07-18 08:29:58 +08:00
import org.broadinstitute.sting.commandline.* ;
2010-04-01 06:39:56 +08:00
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection ;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion ;
2011-07-18 08:29:58 +08:00
import org.broadinstitute.sting.gatk.datasources.reads.* ;
2011-02-04 01:59:19 +08:00
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource ;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource ;
2012-05-24 21:17:11 +08:00
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod ;
2009-05-11 10:07:20 +08:00
import org.broadinstitute.sting.gatk.executive.MicroScheduler ;
2010-12-21 10:09:46 +08:00
import org.broadinstitute.sting.gatk.filters.FilterManager ;
2011-05-05 03:29:08 +08:00
import org.broadinstitute.sting.gatk.filters.ReadFilter ;
2010-12-21 10:09:46 +08:00
import org.broadinstitute.sting.gatk.filters.ReadGroupBlackListFilter ;
2009-08-23 08:56:02 +08:00
import org.broadinstitute.sting.gatk.io.OutputTracker ;
2010-09-24 07:28:55 +08:00
import org.broadinstitute.sting.gatk.io.stubs.Stub ;
2012-08-31 23:42:50 +08:00
import org.broadinstitute.sting.gatk.iterators.ReadTransformer ;
import org.broadinstitute.sting.gatk.iterators.ReadTransformersMode ;
2013-01-31 22:08:26 +08:00
import org.broadinstitute.sting.gatk.phonehome.GATKRunReport ;
/*
 * NOTE (from VCS history — kept as a comment so the file stays syntactically valid):
 * Sequence dictionary validation: detect problematic contig indexing differences.
 * The GATK engine does not behave correctly when contigs are indexed differently in the
 * reads sequence dictionaries vs. the reference sequence dictionary, and the
 * inconsistently-indexed contigs are included in the user's intervals. For example,
 * given the dictionaries:
 *     Reference dictionary = { chrM, chr1, chr2, ... }
 *     BAM dictionary       = { chr1, chr2, ... }
 * and the interval "-L chr1", the engine would fail to correctly retrieve the reads from
 * chr1, since chr1 has a different index in the two dictionaries. An exception is thrown
 * if there are contig index differences between the dictionaries for reads and reference,
 * AND the user's intervals include at least one of the mismatching contigs. The user can
 * disable this exception via -U ALLOW_SEQ_DICT_INCOMPATIBILITY. In all other cases,
 * dictionary validation behaves as before. (GSA-768)
 */
2013-02-22 04:31:16 +08:00
import org.broadinstitute.sting.gatk.refdata.tracks.IndexDictionaryUtils ;
2011-07-26 01:21:52 +08:00
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder ;
2010-12-23 03:00:17 +08:00
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet ;
2012-04-09 08:44:39 +08:00
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation ;
import org.broadinstitute.sting.gatk.samples.SampleDB ;
2011-10-04 05:41:13 +08:00
import org.broadinstitute.sting.gatk.samples.SampleDBBuilder ;
2010-04-01 06:39:56 +08:00
import org.broadinstitute.sting.gatk.walkers.* ;
2011-09-29 23:50:05 +08:00
import org.broadinstitute.sting.utils.* ;
2012-08-31 23:42:50 +08:00
import org.broadinstitute.sting.utils.classloader.PluginManager ;
2010-09-24 07:28:55 +08:00
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException ;
import org.broadinstitute.sting.utils.exceptions.UserException ;
2010-12-21 10:09:46 +08:00
import org.broadinstitute.sting.utils.interval.IntervalUtils ;
2013-01-09 05:23:29 +08:00
import org.broadinstitute.sting.utils.recalibration.BQSRArgumentSet ;
2012-08-23 21:59:37 +08:00
import org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor ;
2009-05-11 10:07:20 +08:00
2010-12-21 10:09:46 +08:00
import java.io.File ;
2009-07-30 00:11:45 +08:00
import java.util.* ;
2012-10-27 01:18:18 +08:00
import java.util.concurrent.TimeUnit ;
2009-05-11 10:07:20 +08:00
2013-03-16 04:41:14 +08:00
import static org.broadinstitute.sting.utils.DeprecatedToolChecks.getWalkerDeprecationInfo ;
import static org.broadinstitute.sting.utils.DeprecatedToolChecks.isDeprecatedWalker ;
2010-09-24 07:28:55 +08:00
/**
 * A GenomeAnalysisEngine that runs a specified walker.
 */
2010-12-21 10:09:46 +08:00
public class GenomeAnalysisEngine {
/ * *
* our log , which we want to capture anything from this class
* /
private static Logger logger = Logger . getLogger ( GenomeAnalysisEngine . class ) ;
2012-10-27 01:18:18 +08:00
public static final long NO_RUNTIME_LIMIT = - 1 ;
2010-12-21 10:09:46 +08:00
/ * *
* The GATK command - line argument parsing code .
* /
private ParsingEngine parsingEngine ;
/ * *
* The genomeLocParser can create and parse GenomeLocs .
* /
private GenomeLocParser genomeLocParser ;
/ * *
* Accessor for sharded read data .
* /
private SAMDataSource readsDataSource = null ;
/ * *
* Accessor for sharded reference data .
* /
private ReferenceDataSource referenceDataSource = null ;
/ * *
* Accessor for sample metadata
* /
2011-10-04 00:33:30 +08:00
private SampleDB sampleDB = null ;
2010-12-21 10:09:46 +08:00
/ * *
* Accessor for sharded reference - ordered data .
* /
private List < ReferenceOrderedDataSource > rodDataSources ;
// our argument collection
private GATKArgumentCollection argCollection ;
/ * *
* Collection of intervals used by the engine .
* /
private GenomeLocSortedSet intervals = null ;
2011-04-05 02:41:55 +08:00
/ * *
* Explicitly assign the interval set to use for this traversal ( for unit testing purposes )
* @param intervals set of intervals to use for this traversal
* /
public void setIntervals ( GenomeLocSortedSet intervals ) {
this . intervals = intervals ;
}
2010-12-21 10:09:46 +08:00
/ * *
* Collection of inputs used by the engine .
* /
private Map < ArgumentSource , Object > inputs = new HashMap < ArgumentSource , Object > ( ) ;
/ * *
* Collection of outputs used by the engine .
* /
private Collection < Stub < ? > > outputs = new ArrayList < Stub < ? > > ( ) ;
/ * *
* Collection of the filters applied to the input data .
* /
2011-05-05 03:29:08 +08:00
private Collection < ReadFilter > filters ;
2010-12-21 10:09:46 +08:00
2012-08-31 23:42:50 +08:00
/ * *
* Collection of the read transformers applied to the reads
* /
private List < ReadTransformer > readTransformers ;
2011-09-13 22:49:16 +08:00
/ * *
* Controls the allocation of threads between CPU vs IO .
* /
private ThreadAllocation threadAllocation ;
2012-09-05 02:50:06 +08:00
private ReadMetrics cumulativeMetrics = null ;
2011-01-18 05:23:09 +08:00
/ * *
* A currently hacky unique name for this GATK instance
* /
2011-04-08 01:03:48 +08:00
private String myName = "GATK_" + Math . abs ( getRandomGenerator ( ) . nextInt ( ) ) ;
2011-01-18 05:23:09 +08:00
2009-10-06 10:45:31 +08:00
/ * *
* our walker manager
* /
2010-09-25 10:49:30 +08:00
private final WalkerManager walkerManager = new WalkerManager ( ) ;
2009-11-11 02:40:16 +08:00
2010-09-24 07:28:55 +08:00
private Walker < ? , ? > walker ;
2010-08-29 06:53:32 +08:00
2010-09-24 07:28:55 +08:00
public void setWalker ( Walker < ? , ? > walker ) {
this . walker = walker ;
2009-07-10 07:59:53 +08:00
}
2009-05-11 10:07:20 +08:00
2010-12-23 03:00:17 +08:00
/ * *
* A processed collection of SAM reader identifiers .
* /
2010-12-31 12:52:22 +08:00
private Collection < SAMReaderID > samReaderIDs = Collections . emptyList ( ) ;
2010-12-23 03:00:17 +08:00
/ * *
* Set the SAM / BAM files over which to traverse .
* @param samReaderIDs Collection of ids to use during this traversal .
* /
public void setSAMFileIDs ( Collection < SAMReaderID > samReaderIDs ) {
this . samReaderIDs = samReaderIDs ;
}
/ * *
* Collection of reference metadata files over which to traverse .
* /
private Collection < RMDTriplet > referenceMetaDataFiles ;
2012-08-23 21:59:37 +08:00
/ * *
* The threading efficiency monitor we use in the GATK to monitor our efficiency .
*
* May be null if one isn ' t active , or hasn ' t be initialized yet
* /
private ThreadEfficiencyMonitor threadEfficiencyMonitor = null ;
2010-12-23 03:00:17 +08:00
/ * *
* Set the reference metadata files to use for this traversal .
* @param referenceMetaDataFiles Collection of files and descriptors over which to traverse .
* /
public void setReferenceMetaDataFiles ( Collection < RMDTriplet > referenceMetaDataFiles ) {
this . referenceMetaDataFiles = referenceMetaDataFiles ;
}
2011-04-08 01:03:48 +08:00
/ * *
* Static random number generator and seed .
* /
private static final long GATK_RANDOM_SEED = 47382911L ;
private static Random randomGenerator = new Random ( GATK_RANDOM_SEED ) ;
2011-04-30 03:29:08 +08:00
public static Random getRandomGenerator ( ) { return randomGenerator ; }
2011-05-31 22:06:37 +08:00
public static void resetRandomGenerator ( ) { randomGenerator . setSeed ( GATK_RANDOM_SEED ) ; }
public static void resetRandomGenerator ( long seed ) { randomGenerator . setSeed ( seed ) ; }
2012-02-06 02:09:03 +08:00
/ * *
2012-02-14 01:35:09 +08:00
* Base Quality Score Recalibration helper object
2012-02-06 02:09:03 +08:00
* /
2013-01-09 05:23:29 +08:00
private BQSRArgumentSet bqsrArgumentSet = null ;
public BQSRArgumentSet getBQSRArgumentSet ( ) { return bqsrArgumentSet ; }
public boolean hasBQSRArgumentSet ( ) { return bqsrArgumentSet ! = null ; }
public void setBaseRecalibration ( final GATKArgumentCollection args ) {
bqsrArgumentSet = new BQSRArgumentSet ( args ) ;
2012-07-17 22:52:43 +08:00
}
2009-07-10 07:59:53 +08:00
/ * *
* Actually run the GATK with the specified walker .
2009-10-06 10:45:31 +08:00
*
2009-09-30 06:23:19 +08:00
* @return the value of this traversal .
2009-07-10 07:59:53 +08:00
* /
2010-09-24 07:28:55 +08:00
public Object execute ( ) {
2013-01-31 22:08:26 +08:00
// first thing is to make sure the AWS keys can be decrypted
GATKRunReport . checkAWSAreValid ( ) ;
2010-05-18 05:00:44 +08:00
//HeapSizeMonitor monitor = new HeapSizeMonitor();
//monitor.start();
2010-09-24 07:28:55 +08:00
setStartTime ( new java . util . Date ( ) ) ;
2010-05-18 05:00:44 +08:00
2012-07-17 22:52:43 +08:00
final GATKArgumentCollection args = this . getArguments ( ) ;
2009-05-11 10:07:20 +08:00
// validate our parameters
2012-07-17 22:52:43 +08:00
if ( args = = null ) {
2010-09-12 23:07:38 +08:00
throw new ReviewedStingException ( "The GATKArgumentCollection passed to GenomeAnalysisEngine can not be null." ) ;
2009-05-11 10:07:20 +08:00
}
2010-04-01 20:47:48 +08:00
// validate our parameters
2010-09-24 07:28:55 +08:00
if ( this . walker = = null )
2010-09-12 23:07:38 +08:00
throw new ReviewedStingException ( "The walker passed to GenomeAnalysisEngine can not be null." ) ;
2009-07-10 07:59:53 +08:00
2012-07-17 22:52:43 +08:00
if ( args . nonDeterministicRandomSeed )
2011-04-30 03:29:08 +08:00
resetRandomGenerator ( System . currentTimeMillis ( ) ) ;
2012-02-06 02:09:03 +08:00
// if the use specified an input BQSR recalibration table then enable on the fly recalibration
2012-07-17 22:52:43 +08:00
if ( args . BQSR_RECAL_FILE ! = null )
2013-01-09 05:23:29 +08:00
setBaseRecalibration ( args ) ;
2012-02-06 02:09:03 +08:00
2011-09-13 22:49:16 +08:00
// Determine how the threads should be divided between CPU vs. IO.
determineThreadAllocation ( ) ;
2009-07-30 07:00:15 +08:00
// Prepare the data for traversal.
2010-09-24 07:28:55 +08:00
initializeDataSources ( ) ;
2009-05-20 07:26:17 +08:00
2011-01-13 01:32:27 +08:00
// initialize and validate the interval list
initializeIntervals ( ) ;
validateSuppliedIntervals ( ) ;
Sequence dictionary validation: detect problematic contig indexing differences
The GATK engine does not behave correctly when contigs are indexed
differently in the reads sequence dictionaries vs. the reference
sequence dictionary, and the inconsistently-indexed contigs are included
in the user's intervals. For example, given the dictionaries:
Reference dictionary = { chrM, chr1, chr2, ... }
BAM dictionary = { chr1, chr2, ... }
and the interval "-L chr1", the engine would fail to correctly retrieve
the reads from chr1, since chr1 has a different index in the two dictionaries.
With this patch, we throw an exception if there are contig index differences
between the dictionaries for reads and reference, AND the user's intervals
include at least one of the mismatching contigs.
The user can disable this exception via -U ALLOW_SEQ_DICT_INCOMPATIBILITY
In all other cases, dictionary validation behaves as before.
I also added comprehensive unit tests for the (previously-untested)
SequenceDictionaryUtils class.
GSA-768 #resolve
2013-02-22 04:31:16 +08:00
// check to make sure that all sequence dictionaries are compatible with the reference's sequence dictionary
validateDataSourcesAgainstReference ( readsDataSource , referenceDataSource . getReference ( ) , rodDataSources ) ;
// initialize sampleDB
initializeSampleDB ( ) ;
2009-05-11 10:07:20 +08:00
// our microscheduler, which is in charge of running everything
2010-09-24 07:28:55 +08:00
MicroScheduler microScheduler = createMicroscheduler ( ) ;
2012-08-23 21:59:37 +08:00
threadEfficiencyMonitor = microScheduler . getThreadEfficiencyMonitor ( ) ;
2009-05-11 10:07:20 +08:00
2011-01-05 11:07:11 +08:00
// create temp directories as necessary
initializeTempDirectory ( ) ;
2012-02-06 02:09:03 +08:00
// create the output streams
2010-09-24 07:28:55 +08:00
initializeOutputStreams ( microScheduler . getOutputTracker ( ) ) ;
2009-05-11 10:07:20 +08:00
2013-02-05 04:33:57 +08:00
logger . info ( "Creating shard strategy for " + readsDataSource . getReaderIDs ( ) . size ( ) + " BAM files" ) ;
2011-09-13 22:49:16 +08:00
Iterable < Shard > shardStrategy = getShardStrategy ( readsDataSource , microScheduler . getReference ( ) , intervals ) ;
2013-02-05 04:33:57 +08:00
logger . info ( "Done creating shard strategy" ) ;
2010-03-15 05:08:14 +08:00
// execute the microscheduler, storing the results
2011-10-15 00:06:41 +08:00
return microScheduler . execute ( this . walker , shardStrategy ) ;
2010-05-18 05:00:44 +08:00
//monitor.stop();
//logger.info(String.format("Maximum heap size consumed: %d",monitor.getMaxMemoryUsed()));
2011-10-15 00:06:41 +08:00
//return result;
2010-03-15 05:08:14 +08:00
}
2009-07-10 07:59:53 +08:00
/ * *
* Retrieves an instance of the walker based on the walker name .
2009-10-06 10:45:31 +08:00
*
2009-07-10 07:59:53 +08:00
* @param walkerName Name of the walker . Must not be null . If the walker cannot be instantiated , an exception will be thrown .
* @return An instance of the walker .
* /
2009-10-06 10:45:31 +08:00
public Walker < ? , ? > getWalkerByName ( String walkerName ) {
2012-07-26 02:11:03 +08:00
try {
return walkerManager . createByName ( walkerName ) ;
} catch ( UserException e ) {
2013-01-08 03:42:40 +08:00
if ( isDeprecatedWalker ( walkerName ) ) {
2013-03-16 04:41:14 +08:00
e = new UserException . DeprecatedWalker ( walkerName , getWalkerDeprecationInfo ( walkerName ) ) ;
2012-08-01 21:50:00 +08:00
}
2012-07-26 02:11:03 +08:00
throw e ;
}
2009-11-11 02:40:16 +08:00
}
/ * *
* Gets the name of a given walker type .
* @param walkerType Type of walker .
* @return Name of the walker .
* /
2010-05-21 03:02:02 +08:00
public String getWalkerName ( Class < ? extends Walker > walkerType ) {
2009-11-11 02:40:16 +08:00
return walkerManager . getName ( walkerType ) ;
2009-07-10 07:59:53 +08:00
}
2011-01-18 05:23:09 +08:00
public String getName ( ) {
return myName ;
}
2009-11-11 07:36:17 +08:00
/ * *
* Gets a list of the filters to associate with the given walker . Will NOT initialize the engine with this filters ;
* the caller must handle that directly .
* @return A collection of available filters .
* /
2011-05-05 03:29:08 +08:00
public Collection < ReadFilter > createFilters ( ) {
2011-10-07 09:51:40 +08:00
final List < ReadFilter > filters = WalkerManager . getReadFilters ( walker , this . getFilterManager ( ) ) ;
2010-12-21 10:09:46 +08:00
if ( this . getArguments ( ) . readGroupBlackList ! = null & & this . getArguments ( ) . readGroupBlackList . size ( ) > 0 )
filters . add ( new ReadGroupBlackListFilter ( this . getArguments ( ) . readGroupBlackList ) ) ;
2011-10-07 09:51:40 +08:00
for ( final String filterName : this . getArguments ( ) . readFilters )
2010-12-21 10:09:46 +08:00
filters . add ( this . getFilterManager ( ) . createByName ( filterName ) ) ;
2011-10-07 09:51:40 +08:00
return Collections . unmodifiableList ( filters ) ;
2009-11-11 07:36:17 +08:00
}
2012-08-31 23:42:50 +08:00
/ * *
* Returns a list of active , initialized read transformers
*
* @param walker the walker we need to apply read transformers too
* /
public void initializeReadTransformers ( final Walker walker ) {
2013-03-06 04:57:44 +08:00
// keep a list of the active read transformers sorted based on priority ordering
List < ReadTransformer > activeTransformers = new ArrayList < ReadTransformer > ( ) ;
2012-08-31 23:42:50 +08:00
final ReadTransformersMode overrideMode = WalkerManager . getWalkerAnnotation ( walker , ReadTransformersMode . class ) ;
final ReadTransformer . ApplicationTime overrideTime = overrideMode ! = null ? overrideMode . ApplicationTime ( ) : null ;
final PluginManager < ReadTransformer > pluginManager = new PluginManager < ReadTransformer > ( ReadTransformer . class ) ;
for ( final ReadTransformer transformer : pluginManager . createAllTypes ( ) ) {
transformer . initialize ( overrideTime , this , walker ) ;
if ( transformer . enabled ( ) )
activeTransformers . add ( transformer ) ;
}
setReadTransformers ( activeTransformers ) ;
}
public List < ReadTransformer > getReadTransformers ( ) {
return readTransformers ;
}
2013-03-06 04:57:44 +08:00
/ *
* Sanity checks that incompatible read transformers are not active together ( and throws an exception if they are ) .
*
* @param readTransformers the active read transformers
* /
protected void checkActiveReadTransformers ( final List < ReadTransformer > readTransformers ) {
if ( readTransformers = = null )
throw new IllegalArgumentException ( "read transformers cannot be null" ) ;
ReadTransformer sawMustBeFirst = null ;
ReadTransformer sawMustBeLast = null ;
for ( final ReadTransformer r : readTransformers ) {
if ( r . getOrderingConstraint ( ) = = ReadTransformer . OrderingConstraint . MUST_BE_FIRST ) {
if ( sawMustBeFirst ! = null )
throw new UserException . IncompatibleReadFiltersException ( sawMustBeFirst . toString ( ) , r . toString ( ) ) ;
sawMustBeFirst = r ;
} else if ( r . getOrderingConstraint ( ) = = ReadTransformer . OrderingConstraint . MUST_BE_LAST ) {
if ( sawMustBeLast ! = null )
throw new UserException . IncompatibleReadFiltersException ( sawMustBeLast . toString ( ) , r . toString ( ) ) ;
sawMustBeLast = r ;
}
}
}
protected void setReadTransformers ( final List < ReadTransformer > readTransformers ) {
2012-08-31 23:42:50 +08:00
if ( readTransformers = = null )
throw new ReviewedStingException ( "read transformers cannot be null" ) ;
2013-03-06 04:57:44 +08:00
// sort them in priority order
Collections . sort ( readTransformers , new ReadTransformer . ReadTransformerComparator ( ) ) ;
// make sure we don't have an invalid set of active read transformers
checkActiveReadTransformers ( readTransformers ) ;
2012-08-31 23:42:50 +08:00
this . readTransformers = readTransformers ;
}
2011-09-13 22:49:16 +08:00
/ * *
* Parse out the thread allocation from the given command - line argument .
* /
private void determineThreadAllocation ( ) {
2012-09-06 03:41:52 +08:00
if ( argCollection . numberOfDataThreads < 1 ) throw new UserException . BadArgumentValue ( "num_threads" , "cannot be less than 1, but saw " + argCollection . numberOfDataThreads ) ;
if ( argCollection . numberOfCPUThreadsPerDataThread < 1 ) throw new UserException . BadArgumentValue ( "num_cpu_threads" , "cannot be less than 1, but saw " + argCollection . numberOfCPUThreadsPerDataThread ) ;
if ( argCollection . numberOfIOThreads < 0 ) throw new UserException . BadArgumentValue ( "num_io_threads" , "cannot be less than 0, but saw " + argCollection . numberOfIOThreads ) ;
this . threadAllocation = new ThreadAllocation ( argCollection . numberOfDataThreads ,
argCollection . numberOfCPUThreadsPerDataThread ,
argCollection . numberOfIOThreads ,
2012-09-20 04:59:24 +08:00
argCollection . monitorThreadEfficiency ) ;
2011-09-13 22:49:16 +08:00
}
2012-09-06 03:41:52 +08:00
public int getTotalNumberOfThreads ( ) {
return this . threadAllocation = = null ? 1 : threadAllocation . getTotalNumThreads ( ) ;
}
2009-12-05 07:24:29 +08:00
/ * *
* Allow subclasses and others within this package direct access to the walker manager .
* @return The walker manager used by this package .
* /
protected WalkerManager getWalkerManager ( ) {
return walkerManager ;
}
2010-09-24 07:28:55 +08:00
2009-05-11 10:07:20 +08:00
/ * *
* setup a microscheduler
2009-10-06 10:45:31 +08:00
*
2009-05-11 10:07:20 +08:00
* @return a new microscheduler
* /
2010-09-24 07:28:55 +08:00
private MicroScheduler createMicroscheduler ( ) {
2010-03-22 07:22:25 +08:00
// Temporarily require all walkers to have a reference, even if that reference is not conceptually necessary.
2011-01-14 10:49:04 +08:00
if ( ( walker instanceof ReadWalker | | walker instanceof DuplicateWalker | | walker instanceof ReadPairWalker ) & &
2010-09-24 07:28:55 +08:00
this . getArguments ( ) . referenceFile = = null ) {
2010-09-12 22:02:43 +08:00
throw new UserException . CommandLineException ( "Read-based traversals require a reference file but none was given" ) ;
2009-05-11 10:07:20 +08:00
}
2011-09-13 22:49:16 +08:00
return MicroScheduler . create ( this , walker , this . getReadsDataSource ( ) , this . getReferenceDataSource ( ) . getReference ( ) , this . getRodDataSources ( ) , threadAllocation ) ;
2010-02-04 12:12:49 +08:00
}
2010-09-24 07:28:55 +08:00
protected DownsamplingMethod getDownsamplingMethod ( ) {
GATKArgumentCollection argCollection = this . getArguments ( ) ;
2012-11-27 01:44:48 +08:00
2012-05-24 21:17:11 +08:00
DownsamplingMethod commandLineMethod = argCollection . getDownsamplingMethod ( ) ;
2013-01-28 14:19:44 +08:00
DownsamplingMethod walkerMethod = WalkerManager . getDownsamplingMethod ( walker ) ;
DownsamplingMethod defaultMethod = DownsamplingMethod . getDefaultDownsamplingMethod ( walker ) ;
2012-05-24 21:17:11 +08:00
2012-11-27 01:44:48 +08:00
DownsamplingMethod method = commandLineMethod ! = null ? commandLineMethod : ( walkerMethod ! = null ? walkerMethod : defaultMethod ) ;
method . checkCompatibilityWithWalker ( walker ) ;
return method ;
2010-09-24 07:28:55 +08:00
}
2011-10-15 00:06:41 +08:00
protected void setDownsamplingMethod ( DownsamplingMethod method ) {
argCollection . setDownsamplingMethod ( method ) ;
}
2010-09-24 07:28:55 +08:00
protected boolean includeReadsWithDeletionAtLoci ( ) {
return walker . includeReadsWithDeletionAtLoci ( ) ;
2009-06-10 21:39:32 +08:00
}
2009-07-30 07:00:15 +08:00
/ * *
2011-03-31 06:23:24 +08:00
* Verifies that the supplied set of reads files mesh with what the walker says it requires ,
* and also makes sure that there were no duplicate SAM files specified on the command line .
2009-07-30 07:00:15 +08:00
* /
2010-09-24 07:28:55 +08:00
protected void validateSuppliedReads ( ) {
GATKArgumentCollection arguments = this . getArguments ( ) ;
2009-05-20 07:26:17 +08:00
// Check what the walker says is required against what was provided on the command line.
2009-07-10 05:57:00 +08:00
if ( WalkerManager . isRequired ( walker , DataSource . READS ) & & ( arguments . samFiles = = null | | arguments . samFiles . size ( ) = = 0 ) )
2010-09-23 20:08:27 +08:00
throw new ArgumentException ( "Walker requires reads but none were provided." ) ;
2009-05-20 07:26:17 +08:00
// Check what the walker says is allowed against what was provided on the command line.
2009-07-10 05:57:00 +08:00
if ( ( arguments . samFiles ! = null & & arguments . samFiles . size ( ) > 0 ) & & ! WalkerManager . isAllowed ( walker , DataSource . READS ) )
2010-09-23 20:08:27 +08:00
throw new ArgumentException ( "Walker does not allow reads but reads were provided." ) ;
2011-03-31 06:23:24 +08:00
// Make sure no SAM files were specified multiple times by the user.
checkForDuplicateSamFiles ( ) ;
}
/ * *
* Checks whether there are SAM files that appear multiple times in the fully unpacked list of
* SAM files ( samReaderIDs ) . If there are , throws an ArgumentException listing the files in question .
* /
protected void checkForDuplicateSamFiles ( ) {
Set < SAMReaderID > encounteredSamFiles = new HashSet < SAMReaderID > ( ) ;
Set < String > duplicateSamFiles = new LinkedHashSet < String > ( ) ;
for ( SAMReaderID samFile : samReaderIDs ) {
if ( encounteredSamFiles . contains ( samFile ) ) {
duplicateSamFiles . add ( samFile . getSamFilePath ( ) ) ;
}
else {
encounteredSamFiles . add ( samFile ) ;
}
}
if ( duplicateSamFiles . size ( ) > 0 ) {
throw new ArgumentException ( "The following BAM files appear multiple times in the list of input files: " +
duplicateSamFiles + " BAM files may be specified at most once." ) ;
}
2009-07-30 07:00:15 +08:00
}
/ * *
* Verifies that the supplied reference file mesh with what the walker says it requires .
* /
2010-09-24 07:28:55 +08:00
protected void validateSuppliedReference ( ) {
GATKArgumentCollection arguments = this . getArguments ( ) ;
2009-07-30 07:00:15 +08:00
// Check what the walker says is required against what was provided on the command line.
2009-12-11 03:15:48 +08:00
// TODO: Temporarily disabling WalkerManager.isRequired check on the reference because the reference is always required.
if ( /*WalkerManager.isRequired(walker, DataSource.REFERENCE) &&*/ arguments . referenceFile = = null )
2010-09-23 20:08:27 +08:00
throw new ArgumentException ( "Walker requires a reference but none was provided." ) ;
2009-07-30 07:00:15 +08:00
// Check what the walker says is allowed against what was provided on the command line.
2009-07-10 05:57:00 +08:00
if ( arguments . referenceFile ! = null & & ! WalkerManager . isAllowed ( walker , DataSource . REFERENCE ) )
2010-09-23 20:08:27 +08:00
throw new ArgumentException ( "Walker does not allow a reference but one was provided." ) ;
2009-07-30 07:00:15 +08:00
}
2009-05-20 07:26:17 +08:00
2010-12-15 02:24:18 +08:00
protected void validateSuppliedIntervals ( ) {
// Only read walkers support '-L unmapped' intervals. Trap and validate any other instances of -L unmapped.
if ( ! ( walker instanceof ReadWalker ) ) {
GenomeLocSortedSet intervals = getIntervals ( ) ;
if ( intervals ! = null & & getIntervals ( ) . contains ( GenomeLoc . UNMAPPED ) )
throw new ArgumentException ( "Interval list specifies unmapped region. Only read walkers may include the unmapped region." ) ;
}
2011-04-05 02:41:55 +08:00
// If intervals is non-null and empty at this point, it means that the list of intervals to process
// was filtered down to an empty set (eg., the user specified something like -L chr1 -XL chr1). Since
// this was very likely unintentional, the user should be informed of this. Note that this is different
2011-10-29 00:12:14 +08:00
// from the case where intervals == null, which indicates that there were no interval arguments.
2011-10-28 21:23:25 +08:00
if ( intervals ! = null & & intervals . isEmpty ( ) ) {
2011-10-29 00:12:14 +08:00
logger . warn ( "The given combination of -L and -XL options results in an empty set. No intervals to process." ) ;
2011-04-05 02:41:55 +08:00
}
2013-03-16 04:41:14 +08:00
// TODO: add a check for ActiveRegion walkers to prevent users from passing an entire contig/chromosome
2010-12-15 02:24:18 +08:00
}
2009-07-30 00:11:45 +08:00
/ * *
* Get the sharding strategy given a driving data source .
*
2011-10-15 00:06:41 +08:00
* @param readsDataSource readsDataSource
2009-07-30 00:11:45 +08:00
* @param drivingDataSource Data on which to shard .
2011-10-15 00:06:41 +08:00
* @param intervals intervals
2010-09-24 07:28:55 +08:00
* @return the sharding strategy
2009-07-30 00:11:45 +08:00
* /
2011-09-13 22:49:16 +08:00
protected Iterable < Shard > getShardStrategy ( SAMDataSource readsDataSource , ReferenceSequenceFile drivingDataSource , GenomeLocSortedSet intervals ) {
2010-09-24 07:28:55 +08:00
ValidationExclusion exclusions = ( readsDataSource ! = null ? readsDataSource . getReadsInfo ( ) . getValidationExclusionList ( ) : null ) ;
2012-09-13 01:00:29 +08:00
DownsamplingMethod downsamplingMethod = readsDataSource ! = null ? readsDataSource . getReadsInfo ( ) . getDownsamplingMethod ( ) : null ;
2010-09-24 07:28:55 +08:00
ReferenceDataSource referenceDataSource = this . getReferenceDataSource ( ) ;
2011-09-13 22:49:16 +08:00
// If reads are present, assume that accessing the reads is always the dominant factor and shard based on that supposition.
if ( ! readsDataSource . isEmpty ( ) ) {
if ( ! readsDataSource . hasIndex ( ) & & ! exclusions . contains ( ValidationExclusion . TYPE . ALLOW_UNINDEXED_BAM ) )
2011-01-15 05:32:53 +08:00
throw new UserException . CommandLineException ( "Cannot process the provided BAM file(s) because they were not indexed. The GATK does offer limited processing of unindexed BAMs in --unsafe mode, but this GATK feature is currently unsupported." ) ;
2011-09-13 22:49:16 +08:00
if ( ! readsDataSource . hasIndex ( ) & & intervals ! = null & & ! argCollection . allowIntervalsWithUnindexedBAM )
2011-01-14 10:49:04 +08:00
throw new UserException . CommandLineException ( "Cannot perform interval processing when reads are present but no index is available." ) ;
2009-12-17 05:55:42 +08:00
2012-01-20 11:05:08 +08:00
if ( walker instanceof LocusWalker ) {
2010-07-19 00:29:59 +08:00
if ( readsDataSource . getSortOrder ( ) ! = SAMFileHeader . SortOrder . coordinate )
2010-09-14 13:04:26 +08:00
throw new UserException . MissortedBAM ( SAMFileHeader . SortOrder . coordinate , "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately." ) ;
2011-09-13 22:49:16 +08:00
if ( intervals = = null )
2013-03-01 02:46:49 +08:00
return readsDataSource . createShardIteratorOverMappedReads ( new LocusShardBalancer ( ) ) ;
2011-09-13 22:49:16 +08:00
else
return readsDataSource . createShardIteratorOverIntervals ( intervals , new LocusShardBalancer ( ) ) ;
2012-01-20 11:05:08 +08:00
}
else if ( walker instanceof ActiveRegionWalker ) {
if ( readsDataSource . getSortOrder ( ) ! = SAMFileHeader . SortOrder . coordinate )
throw new UserException . MissortedBAM ( SAMFileHeader . SortOrder . coordinate , "Active region walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately." ) ;
if ( intervals = = null )
2013-03-01 02:46:49 +08:00
return readsDataSource . createShardIteratorOverMappedReads ( new LocusShardBalancer ( ) ) ;
2012-01-20 11:05:08 +08:00
else
return readsDataSource . createShardIteratorOverIntervals ( ( ( ActiveRegionWalker ) walker ) . extendIntervals ( intervals , this . genomeLocParser , this . getReferenceDataSource ( ) . getReference ( ) ) , new LocusShardBalancer ( ) ) ;
}
2011-09-13 22:49:16 +08:00
else if ( walker instanceof ReadWalker | | walker instanceof ReadPairWalker | | walker instanceof DuplicateWalker ) {
// Apply special validation to read pair walkers.
if ( walker instanceof ReadPairWalker ) {
if ( readsDataSource . getSortOrder ( ) ! = SAMFileHeader . SortOrder . queryname )
throw new UserException . MissortedBAM ( SAMFileHeader . SortOrder . queryname , "Read pair walkers are exceptions in that they cannot be run on coordinate-sorted BAMs but instead require query name-sorted files. You will need to resort your input BAM file in query name order to use this walker." ) ;
if ( intervals ! = null & & ! intervals . isEmpty ( ) )
throw new UserException . CommandLineException ( "Pairs traversal cannot be used in conjunction with intervals." ) ;
}
if ( intervals = = null )
2013-01-28 14:19:44 +08:00
return readsDataSource . createShardIteratorOverAllReads ( new ReadShardBalancer ( ) ) ;
2011-09-13 22:49:16 +08:00
else
2013-01-28 14:19:44 +08:00
return readsDataSource . createShardIteratorOverIntervals ( intervals , new ReadShardBalancer ( ) ) ;
2010-07-07 11:14:59 +08:00
}
2011-09-13 22:49:16 +08:00
else
throw new ReviewedStingException ( "Unable to determine walker type for walker " + walker . getClass ( ) . getName ( ) ) ;
}
else {
2011-12-23 09:54:35 +08:00
// TODO -- Determine what the ideal shard size should be here. Matt suggested that a multiple of 16K might work well
// TODO -- (because of how VCF indexes work), but my empirical experience has been simply that the larger the shard
// TODO -- size the more efficient the traversal (at least for RODWalkers). Keeping the previous values for now. [EB]
final int SHARD_SIZE = walker instanceof RodWalker ? 1000000 : 100000 ;
2011-09-13 22:49:16 +08:00
if ( intervals = = null )
return referenceDataSource . createShardsOverEntireReference ( readsDataSource , genomeLocParser , SHARD_SIZE ) ;
else
return referenceDataSource . createShardsOverIntervals ( readsDataSource , intervals , SHARD_SIZE ) ;
2009-12-17 05:55:42 +08:00
}
2009-07-30 00:11:45 +08:00
}
2010-09-24 07:28:55 +08:00
protected boolean flashbackData ( ) {
return walker instanceof ReadWalker ;
2009-07-30 00:11:45 +08:00
}
2009-05-11 10:07:20 +08:00
2011-01-05 11:07:11 +08:00
/ * *
* Create the temp directory if it doesn ' t exist .
* /
private void initializeTempDirectory ( ) {
File tempDir = new File ( System . getProperty ( "java.io.tmpdir" ) ) ;
2011-10-15 00:06:41 +08:00
if ( ! tempDir . exists ( ) & & ! tempDir . mkdirs ( ) )
throw new UserException . BadTmpDir ( "Unable to create directory" ) ;
2011-01-05 11:07:11 +08:00
}
2009-07-10 06:10:22 +08:00
/ * *
* Initialize the output streams as specified by the user .
*
2009-09-30 06:23:19 +08:00
* @param outputTracker the tracker supplying the initialization data .
2009-07-10 06:10:22 +08:00
* /
2010-09-24 07:28:55 +08:00
private void initializeOutputStreams ( OutputTracker outputTracker ) {
for ( Map . Entry < ArgumentSource , Object > input : getInputs ( ) . entrySet ( ) )
2009-10-06 10:45:31 +08:00
outputTracker . addInput ( input . getKey ( ) , input . getValue ( ) ) ;
2010-09-24 07:28:55 +08:00
for ( Stub < ? > stub : getOutputs ( ) )
2009-08-23 08:56:02 +08:00
outputTracker . addOutput ( stub ) ;
2010-09-25 10:49:30 +08:00
outputTracker . prepareWalker ( walker , getArguments ( ) . strictnessLevel ) ;
2009-05-11 10:07:20 +08:00
}
2010-09-28 10:16:25 +08:00
2010-12-21 10:09:46 +08:00
/**
 * @return the engine's reference data source.
 */
public ReferenceDataSource getReferenceDataSource() {
    return referenceDataSource;
}

/**
 * @return the GenomeLocParser associated with this engine's reference.
 */
public GenomeLocParser getGenomeLocParser() {
    return genomeLocParser;
}
/**
 * Manage lists of filters.
 */
private final FilterManager filterManager = new FilterManager();

// Wall-clock time recorded by setStartTime(); null until execute() has been called.
private Date startTime = null; // the start time for execution

/**
 * Sets the command-line parsing engine used by this engine (e.g. for tag lookup and
 * approximate command-line reconstruction).
 * @param parsingEngine the parsing engine to use.
 */
public void setParser(ParsingEngine parsingEngine) {
    this.parsingEngine = parsingEngine;
}

/**
 * Explicitly set the GenomeLocParser, for unit testing.
 * @param genomeLocParser GenomeLocParser to use.
 */
public void setGenomeLocParser(GenomeLocParser genomeLocParser) {
    this.genomeLocParser = genomeLocParser;
}

/**
 * Sets the start time when the execute() function was last called
 * @param startTime the start time when the execute() function was last called
 */
protected void setStartTime(Date startTime) {
    this.startTime = startTime;
}

/**
 * @return the start time when the execute() function was last called
 */
public Date getStartTime() {
    return startTime;
}
/**
 * Setup the intervals to be processed
 */
protected void initializeIntervals() {
    // Parse the user's interval arguments against the reference.  NOTE(review): getIntervals()
    // documents null as "no intervals in use", so this presumably yields null when no interval
    // arguments were supplied — confirm against IntervalUtils.parseIntervalArguments.
    intervals = IntervalUtils.parseIntervalArguments(this.referenceDataSource, argCollection.intervalArguments);
}
/**
 * Add additional, externally managed IO streams for inputs.
 *
 * @param argumentSource Field into which to inject the value.
 * @param value Instance to inject.
 */
public void addInput(ArgumentSource argumentSource, Object value) {
    inputs.put(argumentSource, value);
}

/**
 * Add additional, externally managed IO streams for output.
 *
 * @param stub Instance to inject.
 */
public void addOutput(Stub<?> stub) {
    outputs.add(stub);
}
2011-03-14 23:51:19 +08:00
/**
 * Returns the tag associated with a given command-line argument.
 * @param key Object for which to inspect the tag.
 * @return Tags object associated with the given key, or an empty Tag structure if none are present.
 */
public Tags getTags(Object key) {
    // Tag lookup is delegated to the parsing engine that processed the command line.
    return parsingEngine.getTags(key);
}
2010-12-21 10:09:46 +08:00
/**
 * Initializes the engine's major data sources in dependency order: reference first
 * (which also creates the GenomeLocParser), then reads, then reference-ordered data (rods).
 */
protected void initializeDataSources() {
    logger.info("Strictness is " + argCollection.strictnessLevel);

    // Reference must be set up before reads: createReadsDataSource() below requires
    // both genomeLocParser and the reference reader created by setReferenceDataSource().
    validateSuppliedReference();
    setReferenceDataSource(argCollection.referenceFile);

    validateSuppliedReads();
    // Read transformers must be resolved before the reads data source is built,
    // since they are passed into its constructor.
    initializeReadTransformers(walker);
    readsDataSource = createReadsDataSource(argCollection, genomeLocParser, referenceDataSource.getReference());

    // Give each read filter a chance to configure itself against this engine.
    for (ReadFilter filter : filters)
        filter.initialize(this);

    // set the sequence dictionary of all of Tribble tracks to the sequence dictionary of our reference
    rodDataSources = getReferenceOrderedDataSources(referenceMetaDataFiles, referenceDataSource.getReference().getSequenceDictionary(), genomeLocParser, argCollection.unsafe);
}
2013-04-08 03:25:52 +08:00
/**
 * Purely for testing purposes.  Do not use unless you absolutely positively know what you are doing (or
 * need to absolutely positively kill everyone in the room)
 * @param dataSource the reads data source to install in place of the one built by initializeDataSources().
 */
public void setReadsDataSource(final SAMDataSource dataSource) {
    this.readsDataSource = dataSource;
}
2011-10-04 05:41:13 +08:00
/**
 * Entry-point function to initialize the samples database from input data and pedigree arguments
 */
private void initializeSampleDB() {
    SampleDBBuilder sampleDBBuilder = new SampleDBBuilder(this, argCollection.pedigreeValidationType);

    // Samples are accumulated from every available source before the DB is finalized:
    // BAM header read groups, sample names seen in the ROD tracks, then explicit pedigrees.
    sampleDBBuilder.addSamplesFromSAMHeader(getSAMFileHeader());
    sampleDBBuilder.addSamplesFromSampleNames(SampleUtils.getUniqueSamplesFromRods(this));
    sampleDBBuilder.addSamplesFromPedigreeFiles(argCollection.pedigreeFiles);
    sampleDBBuilder.addSamplesFromPedigreeStrings(argCollection.pedigreeStrings);

    // getFinalSampleDB() presumably merges/validates the accumulated samples — confirm in SampleDBBuilder.
    sampleDB = sampleDBBuilder.getFinalSampleDB();
}
/**
 * Gets a unique identifier for the reader sourcing this read.
 * @param read Read to examine.
 * @return A unique identifier for the source file of this read.  Exception if not found.
 */
public SAMReaderID getReaderIDForRead(final SAMRecord read) {
    // Lookup is delegated to the reads data source, which tracks reader ownership per read.
    return getReadsDataSource().getReaderID(read);
}

/**
 * Gets the source file for this read.
 * @param id Unique identifier determining which input file to use.
 * @return The source filename for this read.
 */
public File getSourceFileForReaderID(final SAMReaderID id) {
    return getReadsDataSource().getSAMFile(id);
}
/ * *
* Now that all files are open , validate the sequence dictionaries of the reads vs . the reference vrs the reference ordered data ( if available ) .
*
* @param reads Reads data source .
* @param reference Reference data source .
2010-12-31 12:52:22 +08:00
* @param rods a collection of the reference ordered data tracks
2010-12-21 10:09:46 +08:00
* /
Sequence dictionary validation: detect problematic contig indexing differences
The GATK engine does not behave correctly when contigs are indexed
differently in the reads sequence dictionaries vs. the reference
sequence dictionary, and the inconsistently-indexed contigs are included
in the user's intervals. For example, given the dictionaries:
Reference dictionary = { chrM, chr1, chr2, ... }
BAM dictionary = { chr1, chr2, ... }
and the interval "-L chr1", the engine would fail to correctly retrieve
the reads from chr1, since chr1 has a different index in the two dictionaries.
With this patch, we throw an exception if there are contig index differences
between the dictionaries for reads and reference, AND the user's intervals
include at least one of the mismatching contigs.
The user can disable this exception via -U ALLOW_SEQ_DICT_INCOMPATIBILITY
In all other cases, dictionary validation behaves as before.
I also added comprehensive unit tests for the (previously-untested)
SequenceDictionaryUtils class.
GSA-768 #resolve
2013-02-22 04:31:16 +08:00
private void validateDataSourcesAgainstReference ( SAMDataSource reads , ReferenceSequenceFile reference , Collection < ReferenceOrderedDataSource > rods ) {
2010-12-31 12:52:22 +08:00
if ( ( reads . isEmpty ( ) & & ( rods = = null | | rods . isEmpty ( ) ) ) | | reference = = null )
2010-12-21 10:09:46 +08:00
return ;
// Compile a set of sequence names that exist in the reference file.
SAMSequenceDictionary referenceDictionary = reference . getSequenceDictionary ( ) ;
if ( ! reads . isEmpty ( ) ) {
// Compile a set of sequence names that exist in the BAM files.
SAMSequenceDictionary readsDictionary = reads . getHeader ( ) . getSequenceDictionary ( ) ;
2011-04-09 02:33:10 +08:00
if ( readsDictionary . size ( ) = = 0 ) {
2010-12-21 10:09:46 +08:00
logger . info ( "Reads file is unmapped. Skipping validation against reference." ) ;
return ;
}
// compare the reads to the reference
Sequence dictionary validation: detect problematic contig indexing differences
The GATK engine does not behave correctly when contigs are indexed
differently in the reads sequence dictionaries vs. the reference
sequence dictionary, and the inconsistently-indexed contigs are included
in the user's intervals. For example, given the dictionaries:
Reference dictionary = { chrM, chr1, chr2, ... }
BAM dictionary = { chr1, chr2, ... }
and the interval "-L chr1", the engine would fail to correctly retrieve
the reads from chr1, since chr1 has a different index in the two dictionaries.
With this patch, we throw an exception if there are contig index differences
between the dictionaries for reads and reference, AND the user's intervals
include at least one of the mismatching contigs.
The user can disable this exception via -U ALLOW_SEQ_DICT_INCOMPATIBILITY
In all other cases, dictionary validation behaves as before.
I also added comprehensive unit tests for the (previously-untested)
SequenceDictionaryUtils class.
GSA-768 #resolve
2013-02-22 04:31:16 +08:00
SequenceDictionaryUtils . validateDictionaries ( logger , getArguments ( ) . unsafe , "reads" , readsDictionary ,
"reference" , referenceDictionary , true , intervals ) ;
2010-12-21 10:09:46 +08:00
}
2010-12-31 12:52:22 +08:00
for ( ReferenceOrderedDataSource rod : rods )
Sequence dictionary validation: detect problematic contig indexing differences
The GATK engine does not behave correctly when contigs are indexed
differently in the reads sequence dictionaries vs. the reference
sequence dictionary, and the inconsistently-indexed contigs are included
in the user's intervals. For example, given the dictionaries:
Reference dictionary = { chrM, chr1, chr2, ... }
BAM dictionary = { chr1, chr2, ... }
and the interval "-L chr1", the engine would fail to correctly retrieve
the reads from chr1, since chr1 has a different index in the two dictionaries.
With this patch, we throw an exception if there are contig index differences
between the dictionaries for reads and reference, AND the user's intervals
include at least one of the mismatching contigs.
The user can disable this exception via -U ALLOW_SEQ_DICT_INCOMPATIBILITY
In all other cases, dictionary validation behaves as before.
I also added comprehensive unit tests for the (previously-untested)
SequenceDictionaryUtils class.
GSA-768 #resolve
2013-02-22 04:31:16 +08:00
IndexDictionaryUtils . validateTrackSequenceDictionary ( rod . getName ( ) , rod . getSequenceDictionary ( ) , referenceDictionary , getArguments ( ) . unsafe ) ;
2010-12-21 10:09:46 +08:00
}
/**
 * Gets a data source for the given set of reads.
 *
 * @param argCollection arguments
 * @param genomeLocParser parser
 * @param refReader reader (NOTE(review): not referenced in this method body — confirm whether
 *                  it is still needed by any caller or can be dropped)
 * @return A data source for the given set of reads.
 */
private SAMDataSource createReadsDataSource(GATKArgumentCollection argCollection, GenomeLocParser genomeLocParser, IndexedFastaSequenceFile refReader) {
    DownsamplingMethod downsamplingMethod = getDownsamplingMethod();

    // Synchronize the method back into the collection so that it shows up when
    // interrogating for the downsampling method during command line recreation.
    setDownsamplingMethod(downsamplingMethod);

    logger.info(downsamplingMethod);

    // -rpr (remove program records) and -kpr (keep program records) are mutually exclusive.
    if (argCollection.removeProgramRecords && argCollection.keepProgramRecords)
        throw new UserException.BadArgumentValue("rpr / kpr", "Cannot enable both options");

    // Program records are removed when requested on the command line or when the walker
    // carries the @RemoveProgramRecords annotation; an explicit -kpr overrides both.
    boolean removeProgramRecords = argCollection.removeProgramRecords || walker.getClass().isAnnotationPresent(RemoveProgramRecords.class);

    if (argCollection.keepProgramRecords)
        removeProgramRecords = false;

    // Only active region walkers request that reads be kept in LIBS.
    final boolean keepReadsInLIBS = walker instanceof ActiveRegionWalker;

    return new SAMDataSource(
            samReaderIDs,
            threadAllocation,
            argCollection.numberOfBAMFileHandles,
            genomeLocParser,
            argCollection.useOriginalBaseQualities,
            argCollection.strictnessLevel,
            argCollection.readBufferSize,
            downsamplingMethod,
            new ValidationExclusion(Arrays.asList(argCollection.unsafe)),
            filters,
            readTransformers,
            includeReadsWithDeletionAtLoci(),
            argCollection.defaultBaseQualities,
            removeProgramRecords,
            keepReadsInLIBS);
}
/**
 * Opens a reference sequence file paired with an index.  Only public for testing purposes
 *
 * @param refFile Handle to a reference sequence file.  Non-null.
 */
public void setReferenceDataSource(File refFile) {
    this.referenceDataSource = new ReferenceDataSource(refFile);
    // Rebuild the GenomeLocParser from the new reference so locations are created and
    // validated against this reference's sequence dictionary.
    genomeLocParser = new GenomeLocParser(referenceDataSource.getReference());
}
/ * *
* Open the reference - ordered data sources .
*
2010-12-31 12:52:22 +08:00
* @param referenceMetaDataFiles collection of RMD descriptors to load and validate .
* @param sequenceDictionary GATK - wide sequnce dictionary to use for validation .
* @param genomeLocParser to use when creating and validating GenomeLocs .
* @param validationExclusionType potentially indicate which validations to include / exclude .
*
2010-12-21 10:09:46 +08:00
* @return A list of reference - ordered data sources .
* /
2010-12-31 12:52:22 +08:00
private List < ReferenceOrderedDataSource > getReferenceOrderedDataSources ( Collection < RMDTriplet > referenceMetaDataFiles ,
SAMSequenceDictionary sequenceDictionary ,
GenomeLocParser genomeLocParser ,
ValidationExclusion . TYPE validationExclusionType ) {
2012-08-17 00:39:54 +08:00
final RMDTrackBuilder builder = new RMDTrackBuilder ( sequenceDictionary , genomeLocParser , validationExclusionType ) ;
2010-12-31 12:52:22 +08:00
2012-08-17 00:39:54 +08:00
final List < ReferenceOrderedDataSource > dataSources = new ArrayList < ReferenceOrderedDataSource > ( ) ;
2010-12-31 12:52:22 +08:00
for ( RMDTriplet fileDescriptor : referenceMetaDataFiles )
dataSources . add ( new ReferenceOrderedDataSource ( fileDescriptor ,
builder ,
sequenceDictionary ,
2010-12-21 10:09:46 +08:00
genomeLocParser ,
flashbackData ( ) ) ) ;
2010-12-31 12:52:22 +08:00
2010-12-21 10:09:46 +08:00
return dataSources ;
}
/**
 * Returns the SAM File Header from the input reads' data source file
 * @return the SAM File Header from the input reads' data source file
 */
public SAMFileHeader getSAMFileHeader() {
    // This is the merged header of the reads data source (see getSAMFileHeader(SAMReaderID)
    // for per-reader headers).
    return readsDataSource.getHeader();
}
2012-06-25 22:27:37 +08:00
public boolean lenientVCFProcessing ( ) {
return lenientVCFProcessing ( argCollection . unsafe ) ;
}
public static boolean lenientVCFProcessing ( final ValidationExclusion . TYPE val ) {
return val = = ValidationExclusion . TYPE . ALL
| | val = = ValidationExclusion . TYPE . LENIENT_VCF_PROCESSING ;
}
2010-12-21 10:09:46 +08:00
/ * *
* Returns the unmerged SAM file header for an individual reader .
* @param reader The reader .
2013-02-06 12:42:15 +08:00
* @return Header for that reader or null if not available .
2010-12-21 10:09:46 +08:00
* /
public SAMFileHeader getSAMFileHeader ( SAMReaderID reader ) {
2013-02-06 12:42:15 +08:00
return readsDataSource = = null ? null : readsDataSource . getHeader ( reader ) ;
2010-12-21 10:09:46 +08:00
}
2011-09-30 22:43:51 +08:00
/ * *
* Returns an ordered list of the unmerged SAM file headers known to this engine .
* @return list of header for each input SAM file , in command line order
* /
public List < SAMFileHeader > getSAMFileHeaders ( ) {
final List < SAMFileHeader > headers = new ArrayList < SAMFileHeader > ( ) ;
for ( final SAMReaderID id : getReadsDataSource ( ) . getReaderIDs ( ) ) {
headers . add ( getReadsDataSource ( ) . getHeader ( id ) ) ;
}
return headers ;
}
2011-09-20 22:53:18 +08:00
/**
 * Gets the master sequence dictionary for this GATK engine instance
 * @return a never-null dictionary listing all of the contigs known to this engine instance
 */
public SAMSequenceDictionary getMasterSequenceDictionary() {
    // The reference's dictionary is authoritative for the whole engine.
    return getReferenceDataSource().getReference().getSequenceDictionary();
}
2010-12-21 10:09:46 +08:00
/**
 * Returns data source object encapsulating all essential info and handlers used to traverse
 * reads; header merger, individual file readers etc can be accessed through the returned data source object.
 *
 * @return the reads data source
 */
public SAMDataSource getReadsDataSource() {
    return this.readsDataSource;
}

/**
 * Sets the collection of GATK main application arguments.
 *
 * @param argCollection the GATK argument collection
 */
public void setArguments(GATKArgumentCollection argCollection) {
    this.argCollection = argCollection;
}

/**
 * Gets the collection of GATK main application arguments.
 *
 * @return the GATK argument collection
 */
public GATKArgumentCollection getArguments() {
    return this.argCollection;
}

/**
 * Get the list of intervals passed to the engine.
 * @return List of intervals, or null if no intervals are in use
 */
public GenomeLocSortedSet getIntervals() {
    return this.intervals;
}
2012-09-09 08:17:15 +08:00
/ * *
* Get the list of regions of the genome being processed . If the user
* requested specific intervals , return those , otherwise return regions
* corresponding to the entire genome . Never returns null .
*
* @return a non - null set of intervals being processed
* /
@Ensures ( "result != null" )
public GenomeLocSortedSet getRegionsOfGenomeBeingProcessed ( ) {
if ( getIntervals ( ) = = null )
// if we don't have any intervals defined, create intervals from the reference itself
return GenomeLocSortedSet . createSetFromSequenceDictionary ( getReferenceDataSource ( ) . getReference ( ) . getSequenceDictionary ( ) ) ;
else
return getIntervals ( ) ;
}
2010-12-21 10:09:46 +08:00
/**
 * Gets the list of filters employed by this engine.
 * @return Collection of filters (actual instances) used by this engine.
 */
public Collection<ReadFilter> getFilters() {
    return this.filters;
}

/**
 * Sets the list of filters employed by this engine.
 * @param filters Collection of filters (actual instances) used by this engine.
 */
public void setFilters(Collection<ReadFilter> filters) {
    this.filters = filters;
}

/**
 * Gets the filter manager for this engine.
 * @return filter manager for this engine.
 */
protected FilterManager getFilterManager() {
    return filterManager;
}

/**
 * Gets the input sources for this engine.
 * @return input sources for this engine.
 */
protected Map<ArgumentSource, Object> getInputs() {
    return inputs;
}

/**
 * Gets the output stubs for this engine.
 * @return output stubs for this engine.
 */
protected Collection<Stub<?>> getOutputs() {
    return outputs;
}

/**
 * Returns data source objects encapsulating all rod data;
 * individual rods can be accessed through the returned data source objects.
 *
 * @return the rods data sources
 */
public List<ReferenceOrderedDataSource> getRodDataSources() {
    return this.rodDataSources;
}
/ * *
* Gets cumulative metrics about the entire run to this point .
2011-04-13 23:10:46 +08:00
* Returns a clone of this snapshot in time .
* @return cumulative metrics about the entire run at this point . ReadMetrics object is a unique instance and is
* owned by the caller ; the caller can do with the object what they wish .
2010-12-21 10:09:46 +08:00
* /
public ReadMetrics getCumulativeMetrics ( ) {
2012-09-05 02:50:06 +08:00
// todo -- probably shouldn't be lazy
if ( cumulativeMetrics = = null )
cumulativeMetrics = readsDataSource = = null ? new ReadMetrics ( ) : readsDataSource . getCumulativeReadMetrics ( ) ;
return cumulativeMetrics ;
2010-12-21 10:09:46 +08:00
}
2012-08-23 21:59:37 +08:00
/**
 * Return the global ThreadEfficiencyMonitor, if there is one
 *
 * @return the monitor, or null if none is active
 */
public ThreadEfficiencyMonitor getThreadEfficiencyMonitor() {
    return threadEfficiencyMonitor;
}
2011-09-29 22:34:51 +08:00
// -------------------------------------------------------------------------------------
//
//  code for working with Samples database
//
// -------------------------------------------------------------------------------------

/**
 * @return this engine's samples database.
 */
public SampleDB getSampleDB() {
    return this.sampleDB;
}

/**
 * Reconstructs an approximate map of command-line arguments from the given providers.
 * @param argumentProviders objects whose argument fields should be inspected.
 * @return map of argument name to value, as computed by CommandLineUtils.
 */
public Map<String, String> getApproximateCommandLineArguments(Object... argumentProviders) {
    return CommandLineUtils.getApproximateCommandLineArguments(parsingEngine, argumentProviders);
}

/**
 * Builds an approximate single-string rendering of the command line for the given providers.
 * @param argumentProviders objects whose argument fields should be inspected.
 * @return approximate command-line string, as computed by CommandLineUtils.
 */
public String createApproximateCommandLineArgumentString(Object... argumentProviders) {
    return CommandLineUtils.createApproximateCommandLineArgumentString(parsingEngine, argumentProviders);
}
2012-10-27 01:18:18 +08:00
/ * *
* Does the current runtime in unit exceed the runtime limit , if one has been provided ?
*
* @param runtime the runtime of this GATK instance in minutes
* @param unit the time unit of runtime
* @return false if not limit was requested or if runtime < = the limit , true otherwise
* /
public boolean exceedsRuntimeLimit ( final long runtime , final TimeUnit unit ) {
if ( runtime < 0 ) throw new IllegalArgumentException ( "runtime must be >= 0 but got " + runtime ) ;
if ( getArguments ( ) . maxRuntime = = NO_RUNTIME_LIMIT )
return false ;
else {
final long actualRuntimeNano = TimeUnit . NANOSECONDS . convert ( runtime , unit ) ;
final long maxRuntimeNano = getRuntimeLimitInNanoseconds ( ) ;
return actualRuntimeNano > maxRuntimeNano ;
}
}
/ * *
* @return the runtime limit in nanoseconds , or - 1 if no limit was specified
* /
public long getRuntimeLimitInNanoseconds ( ) {
if ( getArguments ( ) . maxRuntime = = NO_RUNTIME_LIMIT )
return - 1 ;
else
return TimeUnit . NANOSECONDS . convert ( getArguments ( ) . maxRuntime , getArguments ( ) . maxRuntimeUnits ) ;
}
2009-05-11 10:07:20 +08:00
}