2009-06-23 05:11:18 +08:00
/*
 * Copyright (c) 2010, The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
2009-05-11 10:07:20 +08:00
package org.broadinstitute.sting.gatk ;
2010-12-21 10:09:46 +08:00
import net.sf.picard.reference.IndexedFastaSequenceFile ;
2010-04-01 06:39:56 +08:00
import net.sf.picard.reference.ReferenceSequenceFile ;
2012-04-09 08:44:39 +08:00
import net.sf.samtools.SAMFileHeader ;
import net.sf.samtools.SAMRecord ;
import net.sf.samtools.SAMSequenceDictionary ;
2010-12-21 10:09:46 +08:00
import org.apache.log4j.Logger ;
2012-05-23 04:27:13 +08:00
import org.broad.tribble.readers.PositionalBufferedStream ;
2011-07-18 08:29:58 +08:00
import org.broadinstitute.sting.commandline.* ;
2010-04-01 06:39:56 +08:00
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection ;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion ;
2011-07-18 08:29:58 +08:00
import org.broadinstitute.sting.gatk.datasources.reads.* ;
2011-02-04 01:59:19 +08:00
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource ;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource ;
2009-05-11 10:07:20 +08:00
import org.broadinstitute.sting.gatk.executive.MicroScheduler ;
2010-12-21 10:09:46 +08:00
import org.broadinstitute.sting.gatk.filters.FilterManager ;
2011-05-05 03:29:08 +08:00
import org.broadinstitute.sting.gatk.filters.ReadFilter ;
2010-12-21 10:09:46 +08:00
import org.broadinstitute.sting.gatk.filters.ReadGroupBlackListFilter ;
2009-08-23 08:56:02 +08:00
import org.broadinstitute.sting.gatk.io.OutputTracker ;
2010-09-24 07:28:55 +08:00
import org.broadinstitute.sting.gatk.io.stubs.Stub ;
2011-07-26 01:21:52 +08:00
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder ;
2010-12-23 03:00:17 +08:00
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet ;
2012-04-09 08:44:39 +08:00
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation ;
import org.broadinstitute.sting.gatk.samples.SampleDB ;
2011-10-04 05:41:13 +08:00
import org.broadinstitute.sting.gatk.samples.SampleDBBuilder ;
2010-04-01 06:39:56 +08:00
import org.broadinstitute.sting.gatk.walkers.* ;
2011-09-29 23:50:05 +08:00
import org.broadinstitute.sting.utils.* ;
// Note on BAQ support: the single -baq argument can be NONE, CALCULATE_AS_NECESSARY, or
// RECALCULATE. Walkers control via the @BAQMode annotation how BAQ is applied (as a tag,
// by overwriting quality scores, or by returning only the BAQ-capped qualities) and when
// (ON_INPUT — the default, ON_OUTPUT, or HANDLED_BY_WALKER).
2010-12-07 04:55:52 +08:00
import org.broadinstitute.sting.utils.baq.BAQ ;
2012-07-18 00:23:40 +08:00
import org.broadinstitute.sting.utils.classloader.PluginManager ;
2012-05-23 04:27:13 +08:00
import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec ;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader ;
2012-06-27 13:15:22 +08:00
import org.broadinstitute.sting.utils.collections.Pair ;
2010-09-24 07:28:55 +08:00
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException ;
import org.broadinstitute.sting.utils.exceptions.UserException ;
2010-12-21 10:09:46 +08:00
import org.broadinstitute.sting.utils.interval.IntervalUtils ;
2012-02-06 02:09:03 +08:00
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration ;
2012-06-02 07:25:11 +08:00
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder ;
2009-05-11 10:07:20 +08:00
2010-12-21 10:09:46 +08:00
import java.io.File ;
2012-05-23 04:27:13 +08:00
import java.io.FileInputStream ;
import java.io.IOException ;
2009-07-30 00:11:45 +08:00
import java.util.* ;
2009-05-11 10:07:20 +08:00
2010-09-24 07:28:55 +08:00
/**
 * A GenomeAnalysisEngine that runs a specified walker.
 */
2010-12-21 10:09:46 +08:00
public class GenomeAnalysisEngine {
/ * *
* our log , which we want to capture anything from this class
* /
private static Logger logger = Logger . getLogger ( GenomeAnalysisEngine . class ) ;
/ * *
* The GATK command - line argument parsing code .
* /
private ParsingEngine parsingEngine ;
/ * *
* The genomeLocParser can create and parse GenomeLocs .
* /
private GenomeLocParser genomeLocParser ;
/ * *
* Accessor for sharded read data .
* /
private SAMDataSource readsDataSource = null ;
/ * *
* Accessor for sharded reference data .
* /
private ReferenceDataSource referenceDataSource = null ;
/ * *
* Accessor for sample metadata
* /
2011-10-04 00:33:30 +08:00
private SampleDB sampleDB = null ;
2010-12-21 10:09:46 +08:00
/ * *
* Accessor for sharded reference - ordered data .
* /
private List < ReferenceOrderedDataSource > rodDataSources ;
// our argument collection
private GATKArgumentCollection argCollection ;
/ * *
* Collection of intervals used by the engine .
* /
private GenomeLocSortedSet intervals = null ;
2011-04-05 02:41:55 +08:00
/ * *
* Explicitly assign the interval set to use for this traversal ( for unit testing purposes )
* @param intervals set of intervals to use for this traversal
* /
public void setIntervals ( GenomeLocSortedSet intervals ) {
this . intervals = intervals ;
}
2010-12-21 10:09:46 +08:00
/ * *
* Collection of inputs used by the engine .
* /
private Map < ArgumentSource , Object > inputs = new HashMap < ArgumentSource , Object > ( ) ;
/ * *
* Collection of outputs used by the engine .
* /
private Collection < Stub < ? > > outputs = new ArrayList < Stub < ? > > ( ) ;
/ * *
* Collection of the filters applied to the input data .
* /
2011-05-05 03:29:08 +08:00
private Collection < ReadFilter > filters ;
2010-12-21 10:09:46 +08:00
2011-09-13 22:49:16 +08:00
/ * *
* Controls the allocation of threads between CPU vs IO .
* /
private ThreadAllocation threadAllocation ;
2011-01-18 05:23:09 +08:00
/ * *
* A currently hacky unique name for this GATK instance
* /
2011-04-08 01:03:48 +08:00
private String myName = "GATK_" + Math . abs ( getRandomGenerator ( ) . nextInt ( ) ) ;
2011-01-18 05:23:09 +08:00
2009-10-06 10:45:31 +08:00
/ * *
* our walker manager
* /
2010-09-25 10:49:30 +08:00
private final WalkerManager walkerManager = new WalkerManager ( ) ;
2009-11-11 02:40:16 +08:00
2010-09-24 07:28:55 +08:00
private Walker < ? , ? > walker ;
2010-08-29 06:53:32 +08:00
2010-09-24 07:28:55 +08:00
public void setWalker ( Walker < ? , ? > walker ) {
this . walker = walker ;
2009-07-10 07:59:53 +08:00
}
2009-05-11 10:07:20 +08:00
2010-12-23 03:00:17 +08:00
/ * *
* A processed collection of SAM reader identifiers .
* /
2010-12-31 12:52:22 +08:00
private Collection < SAMReaderID > samReaderIDs = Collections . emptyList ( ) ;
2010-12-23 03:00:17 +08:00
/ * *
* Set the SAM / BAM files over which to traverse .
* @param samReaderIDs Collection of ids to use during this traversal .
* /
public void setSAMFileIDs ( Collection < SAMReaderID > samReaderIDs ) {
this . samReaderIDs = samReaderIDs ;
}
/ * *
* Collection of reference metadata files over which to traverse .
* /
private Collection < RMDTriplet > referenceMetaDataFiles ;
/ * *
* Set the reference metadata files to use for this traversal .
* @param referenceMetaDataFiles Collection of files and descriptors over which to traverse .
* /
public void setReferenceMetaDataFiles ( Collection < RMDTriplet > referenceMetaDataFiles ) {
this . referenceMetaDataFiles = referenceMetaDataFiles ;
}
2011-04-08 01:03:48 +08:00
/ * *
* Static random number generator and seed .
* /
private static final long GATK_RANDOM_SEED = 47382911L ;
private static Random randomGenerator = new Random ( GATK_RANDOM_SEED ) ;
2011-04-30 03:29:08 +08:00
public static Random getRandomGenerator ( ) { return randomGenerator ; }
2011-05-31 22:06:37 +08:00
public static void resetRandomGenerator ( ) { randomGenerator . setSeed ( GATK_RANDOM_SEED ) ; }
public static void resetRandomGenerator ( long seed ) { randomGenerator . setSeed ( seed ) ; }
2012-02-06 02:09:03 +08:00
/ * *
2012-02-14 01:35:09 +08:00
* Base Quality Score Recalibration helper object
2012-02-06 02:09:03 +08:00
* /
2012-02-14 01:35:09 +08:00
private BaseRecalibration baseRecalibration = null ;
public BaseRecalibration getBaseRecalibration ( ) { return baseRecalibration ; }
public boolean hasBaseRecalibration ( ) { return baseRecalibration ! = null ; }
2012-07-18 00:23:40 +08:00
public void setBaseRecalibration ( final File recalFile , final int quantizationLevels , final boolean disableIndelQuals , final int preserveQLessThan ) {
baseRecalibration = new BaseRecalibration ( recalFile , quantizationLevels , disableIndelQuals , preserveQLessThan , isGATKLite ( ) ) ;
2012-07-17 22:52:43 +08:00
}
/ * *
* Utility method to determine whether this is the lite version of the GATK
* /
public boolean isGATKLite ( ) {
2012-07-18 00:23:40 +08:00
if ( isLiteVersion = = null ) {
2012-07-18 01:37:11 +08:00
isLiteVersion = ! ( new PluginManager < Object > ( Object . class ) . exists ( DummyProtectedWalkerName ) ) ;
2012-07-18 00:23:40 +08:00
}
return isLiteVersion ;
2012-07-17 22:52:43 +08:00
}
2012-07-18 00:46:16 +08:00
private static final String DummyProtectedWalkerName = "DummyProtectedWalker" ;
2012-07-18 00:23:40 +08:00
private static Boolean isLiteVersion = null ;
2012-02-06 02:09:03 +08:00
2009-07-10 07:59:53 +08:00
/ * *
* Actually run the GATK with the specified walker .
2009-10-06 10:45:31 +08:00
*
2009-09-30 06:23:19 +08:00
* @return the value of this traversal .
2009-07-10 07:59:53 +08:00
* /
2010-09-24 07:28:55 +08:00
public Object execute ( ) {
2010-05-18 05:00:44 +08:00
//HeapSizeMonitor monitor = new HeapSizeMonitor();
//monitor.start();
2010-09-24 07:28:55 +08:00
setStartTime ( new java . util . Date ( ) ) ;
2010-05-18 05:00:44 +08:00
2012-07-17 22:52:43 +08:00
final GATKArgumentCollection args = this . getArguments ( ) ;
2009-05-11 10:07:20 +08:00
// validate our parameters
2012-07-17 22:52:43 +08:00
if ( args = = null ) {
2010-09-12 23:07:38 +08:00
throw new ReviewedStingException ( "The GATKArgumentCollection passed to GenomeAnalysisEngine can not be null." ) ;
2009-05-11 10:07:20 +08:00
}
2010-04-01 20:47:48 +08:00
// validate our parameters
2010-09-24 07:28:55 +08:00
if ( this . walker = = null )
2010-09-12 23:07:38 +08:00
throw new ReviewedStingException ( "The walker passed to GenomeAnalysisEngine can not be null." ) ;
2009-07-10 07:59:53 +08:00
2012-07-17 22:52:43 +08:00
if ( args . nonDeterministicRandomSeed )
2011-04-30 03:29:08 +08:00
resetRandomGenerator ( System . currentTimeMillis ( ) ) ;
2012-06-02 07:25:11 +08:00
// TODO -- REMOVE ME WHEN WE STOP BCF testing
2012-07-17 22:52:43 +08:00
if ( args . USE_SLOW_GENOTYPES )
2012-06-07 20:34:25 +08:00
GenotypeBuilder . MAKE_FAST_BY_DEFAULT = false ;
2012-06-02 07:25:11 +08:00
2012-02-06 02:09:03 +08:00
// if the use specified an input BQSR recalibration table then enable on the fly recalibration
2012-07-17 22:52:43 +08:00
if ( args . BQSR_RECAL_FILE ! = null )
2012-07-18 00:23:40 +08:00
setBaseRecalibration ( args . BQSR_RECAL_FILE , args . quantizationLevels , args . disableIndelQuals , args . PRESERVE_QSCORES_LESS_THAN ) ;
2012-02-06 02:09:03 +08:00
2011-09-13 22:49:16 +08:00
// Determine how the threads should be divided between CPU vs. IO.
determineThreadAllocation ( ) ;
2009-07-30 07:00:15 +08:00
// Prepare the data for traversal.
2010-09-24 07:28:55 +08:00
initializeDataSources ( ) ;
2009-05-20 07:26:17 +08:00
2011-10-06 06:00:58 +08:00
// initialize sampleDB
initializeSampleDB ( ) ;
2011-01-13 01:32:27 +08:00
// initialize and validate the interval list
initializeIntervals ( ) ;
validateSuppliedIntervals ( ) ;
2009-05-11 10:07:20 +08:00
// our microscheduler, which is in charge of running everything
2010-09-24 07:28:55 +08:00
MicroScheduler microScheduler = createMicroscheduler ( ) ;
2009-05-11 10:07:20 +08:00
2011-01-05 11:07:11 +08:00
// create temp directories as necessary
initializeTempDirectory ( ) ;
2012-02-06 02:09:03 +08:00
// create the output streams
2010-09-24 07:28:55 +08:00
initializeOutputStreams ( microScheduler . getOutputTracker ( ) ) ;
2009-05-11 10:07:20 +08:00
2011-09-13 22:49:16 +08:00
Iterable < Shard > shardStrategy = getShardStrategy ( readsDataSource , microScheduler . getReference ( ) , intervals ) ;
2010-03-15 05:08:14 +08:00
// execute the microscheduler, storing the results
2011-10-15 00:06:41 +08:00
return microScheduler . execute ( this . walker , shardStrategy ) ;
2010-05-18 05:00:44 +08:00
//monitor.stop();
//logger.info(String.format("Maximum heap size consumed: %d",monitor.getMaxMemoryUsed()));
2011-10-15 00:06:41 +08:00
//return result;
2010-03-15 05:08:14 +08:00
}
2009-07-10 07:59:53 +08:00
/ * *
* Retrieves an instance of the walker based on the walker name .
2009-10-06 10:45:31 +08:00
*
2009-07-10 07:59:53 +08:00
* @param walkerName Name of the walker . Must not be null . If the walker cannot be instantiated , an exception will be thrown .
* @return An instance of the walker .
* /
2009-10-06 10:45:31 +08:00
public Walker < ? , ? > getWalkerByName ( String walkerName ) {
2009-11-11 02:40:16 +08:00
return walkerManager . createByName ( walkerName ) ;
}
/ * *
* Gets the name of a given walker type .
* @param walkerType Type of walker .
* @return Name of the walker .
* /
2010-05-21 03:02:02 +08:00
public String getWalkerName ( Class < ? extends Walker > walkerType ) {
2009-11-11 02:40:16 +08:00
return walkerManager . getName ( walkerType ) ;
2009-07-10 07:59:53 +08:00
}
2011-01-18 05:23:09 +08:00
/** @return the (currently hacky) unique name of this GATK instance. */
public String getName() {
    return myName;
}
2009-11-11 07:36:17 +08:00
/ * *
* Gets a list of the filters to associate with the given walker . Will NOT initialize the engine with this filters ;
* the caller must handle that directly .
* @return A collection of available filters .
* /
2011-05-05 03:29:08 +08:00
public Collection < ReadFilter > createFilters ( ) {
2011-10-07 09:51:40 +08:00
final List < ReadFilter > filters = WalkerManager . getReadFilters ( walker , this . getFilterManager ( ) ) ;
2010-12-21 10:09:46 +08:00
if ( this . getArguments ( ) . readGroupBlackList ! = null & & this . getArguments ( ) . readGroupBlackList . size ( ) > 0 )
filters . add ( new ReadGroupBlackListFilter ( this . getArguments ( ) . readGroupBlackList ) ) ;
2011-10-07 09:51:40 +08:00
for ( final String filterName : this . getArguments ( ) . readFilters )
2010-12-21 10:09:46 +08:00
filters . add ( this . getFilterManager ( ) . createByName ( filterName ) ) ;
2011-10-07 09:51:40 +08:00
return Collections . unmodifiableList ( filters ) ;
2009-11-11 07:36:17 +08:00
}
2011-09-13 22:49:16 +08:00
/ * *
* Parse out the thread allocation from the given command - line argument .
* /
private void determineThreadAllocation ( ) {
Tags tags = parsingEngine . getTags ( argCollection . numberOfThreads ) ;
2011-12-01 02:13:16 +08:00
// TODO: Kill this complicated logic once Queue supports arbitrary tagged parameters.
Integer numCPUThreads = null ;
if ( tags . containsKey ( "cpu" ) & & argCollection . numberOfCPUThreads ! = null )
throw new UserException ( "Number of CPU threads specified both directly on the command-line and as a tag to the nt argument. Please specify only one or the other." ) ;
else if ( tags . containsKey ( "cpu" ) )
numCPUThreads = Integer . parseInt ( tags . getValue ( "cpu" ) ) ;
else if ( argCollection . numberOfCPUThreads ! = null )
numCPUThreads = argCollection . numberOfCPUThreads ;
Integer numIOThreads = null ;
if ( tags . containsKey ( "io" ) & & argCollection . numberOfIOThreads ! = null )
throw new UserException ( "Number of IO threads specified both directly on the command-line and as a tag to the nt argument. Please specify only one or the other." ) ;
else if ( tags . containsKey ( "io" ) )
numIOThreads = Integer . parseInt ( tags . getValue ( "io" ) ) ;
else if ( argCollection . numberOfIOThreads ! = null )
numIOThreads = argCollection . numberOfIOThreads ;
2011-09-13 22:49:16 +08:00
this . threadAllocation = new ThreadAllocation ( argCollection . numberOfThreads , numCPUThreads , numIOThreads ) ;
}
2009-12-05 07:24:29 +08:00
/ * *
* Allow subclasses and others within this package direct access to the walker manager .
* @return The walker manager used by this package .
* /
protected WalkerManager getWalkerManager ( ) {
return walkerManager ;
}
2010-09-24 07:28:55 +08:00
2009-05-11 10:07:20 +08:00
/ * *
* setup a microscheduler
2009-10-06 10:45:31 +08:00
*
2009-05-11 10:07:20 +08:00
* @return a new microscheduler
* /
2010-09-24 07:28:55 +08:00
private MicroScheduler createMicroscheduler ( ) {
2010-03-22 07:22:25 +08:00
// Temporarily require all walkers to have a reference, even if that reference is not conceptually necessary.
2011-01-14 10:49:04 +08:00
if ( ( walker instanceof ReadWalker | | walker instanceof DuplicateWalker | | walker instanceof ReadPairWalker ) & &
2010-09-24 07:28:55 +08:00
this . getArguments ( ) . referenceFile = = null ) {
2010-09-12 22:02:43 +08:00
throw new UserException . CommandLineException ( "Read-based traversals require a reference file but none was given" ) ;
2009-05-11 10:07:20 +08:00
}
2011-09-13 22:49:16 +08:00
return MicroScheduler . create ( this , walker , this . getReadsDataSource ( ) , this . getReferenceDataSource ( ) . getReference ( ) , this . getRodDataSources ( ) , threadAllocation ) ;
2010-02-04 12:12:49 +08:00
}
2010-09-24 07:28:55 +08:00
protected DownsamplingMethod getDownsamplingMethod ( ) {
GATKArgumentCollection argCollection = this . getArguments ( ) ;
DownsamplingMethod method ;
2010-08-27 05:38:03 +08:00
if ( argCollection . getDownsamplingMethod ( ) ! = null )
method = argCollection . getDownsamplingMethod ( ) ;
2010-05-19 13:40:05 +08:00
else if ( WalkerManager . getDownsamplingMethod ( walker ) ! = null )
method = WalkerManager . getDownsamplingMethod ( walker ) ;
else
2011-10-27 02:11:49 +08:00
method = GATKArgumentCollection . getDefaultDownsamplingMethod ( ) ;
2010-09-24 07:28:55 +08:00
return method ;
}
2011-10-15 00:06:41 +08:00
/** Record the downsampling method in the engine's argument collection. */
protected void setDownsamplingMethod(DownsamplingMethod method) {
    argCollection.setDownsamplingMethod(method);
}
BAQ calculation refactoring in the GATK. Single -baq argument can be NONE, CALCULATE_AS_NECESSARY, and RECALCULATE. Walkers can control bia the @BAQMode annotation how the BAQ calculation is applied. Can either be as a tag, by overwriting the qualities scores, or by only returning the baq-capped qualities scores. Additionally, walkers can be set up to have the BAQ applied to the incoming reads (ON_INPUT, the default), to output reads (ON_OUTPUT), or HANDLED_BY_WALKER, which means that calling into the BAQ system is the responsibility of the individual walker.
SAMFileWriterStub now supports BAQ writing as an internal feature. Several walkers have the @BAQMode applied to this, with parameters that I think are reasonable. Please look if you own these walkers, though
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4798 348d0f76-0448-11de-a6fe-93d51630548a
2010-12-07 04:55:52 +08:00
public BAQ . QualityMode getWalkerBAQQualityMode ( ) { return WalkerManager . getBAQQualityMode ( walker ) ; }
public BAQ . ApplicationTime getWalkerBAQApplicationTime ( ) { return WalkerManager . getBAQApplicationTime ( walker ) ; }
2010-09-24 07:28:55 +08:00
protected boolean includeReadsWithDeletionAtLoci ( ) {
return walker . includeReadsWithDeletionAtLoci ( ) ;
2009-06-10 21:39:32 +08:00
}
2009-07-30 07:00:15 +08:00
/ * *
2011-03-31 06:23:24 +08:00
* Verifies that the supplied set of reads files mesh with what the walker says it requires ,
* and also makes sure that there were no duplicate SAM files specified on the command line .
2009-07-30 07:00:15 +08:00
* /
2010-09-24 07:28:55 +08:00
protected void validateSuppliedReads ( ) {
GATKArgumentCollection arguments = this . getArguments ( ) ;
2009-05-20 07:26:17 +08:00
// Check what the walker says is required against what was provided on the command line.
2009-07-10 05:57:00 +08:00
if ( WalkerManager . isRequired ( walker , DataSource . READS ) & & ( arguments . samFiles = = null | | arguments . samFiles . size ( ) = = 0 ) )
2010-09-23 20:08:27 +08:00
throw new ArgumentException ( "Walker requires reads but none were provided." ) ;
2009-05-20 07:26:17 +08:00
// Check what the walker says is allowed against what was provided on the command line.
2009-07-10 05:57:00 +08:00
if ( ( arguments . samFiles ! = null & & arguments . samFiles . size ( ) > 0 ) & & ! WalkerManager . isAllowed ( walker , DataSource . READS ) )
2010-09-23 20:08:27 +08:00
throw new ArgumentException ( "Walker does not allow reads but reads were provided." ) ;
2011-03-31 06:23:24 +08:00
// Make sure no SAM files were specified multiple times by the user.
checkForDuplicateSamFiles ( ) ;
}
/ * *
* Checks whether there are SAM files that appear multiple times in the fully unpacked list of
* SAM files ( samReaderIDs ) . If there are , throws an ArgumentException listing the files in question .
* /
protected void checkForDuplicateSamFiles ( ) {
Set < SAMReaderID > encounteredSamFiles = new HashSet < SAMReaderID > ( ) ;
Set < String > duplicateSamFiles = new LinkedHashSet < String > ( ) ;
for ( SAMReaderID samFile : samReaderIDs ) {
if ( encounteredSamFiles . contains ( samFile ) ) {
duplicateSamFiles . add ( samFile . getSamFilePath ( ) ) ;
}
else {
encounteredSamFiles . add ( samFile ) ;
}
}
if ( duplicateSamFiles . size ( ) > 0 ) {
throw new ArgumentException ( "The following BAM files appear multiple times in the list of input files: " +
duplicateSamFiles + " BAM files may be specified at most once." ) ;
}
2009-07-30 07:00:15 +08:00
}
/ * *
* Verifies that the supplied reference file mesh with what the walker says it requires .
* /
2010-09-24 07:28:55 +08:00
protected void validateSuppliedReference ( ) {
GATKArgumentCollection arguments = this . getArguments ( ) ;
2009-07-30 07:00:15 +08:00
// Check what the walker says is required against what was provided on the command line.
2009-12-11 03:15:48 +08:00
// TODO: Temporarily disabling WalkerManager.isRequired check on the reference because the reference is always required.
if ( /*WalkerManager.isRequired(walker, DataSource.REFERENCE) &&*/ arguments . referenceFile = = null )
2010-09-23 20:08:27 +08:00
throw new ArgumentException ( "Walker requires a reference but none was provided." ) ;
2009-07-30 07:00:15 +08:00
// Check what the walker says is allowed against what was provided on the command line.
2009-07-10 05:57:00 +08:00
if ( arguments . referenceFile ! = null & & ! WalkerManager . isAllowed ( walker , DataSource . REFERENCE ) )
2010-09-23 20:08:27 +08:00
throw new ArgumentException ( "Walker does not allow a reference but one was provided." ) ;
2009-07-30 07:00:15 +08:00
}
2009-05-20 07:26:17 +08:00
2010-12-15 02:24:18 +08:00
protected void validateSuppliedIntervals ( ) {
// Only read walkers support '-L unmapped' intervals. Trap and validate any other instances of -L unmapped.
if ( ! ( walker instanceof ReadWalker ) ) {
GenomeLocSortedSet intervals = getIntervals ( ) ;
if ( intervals ! = null & & getIntervals ( ) . contains ( GenomeLoc . UNMAPPED ) )
throw new ArgumentException ( "Interval list specifies unmapped region. Only read walkers may include the unmapped region." ) ;
}
2011-04-05 02:41:55 +08:00
// If intervals is non-null and empty at this point, it means that the list of intervals to process
// was filtered down to an empty set (eg., the user specified something like -L chr1 -XL chr1). Since
// this was very likely unintentional, the user should be informed of this. Note that this is different
2011-10-29 00:12:14 +08:00
// from the case where intervals == null, which indicates that there were no interval arguments.
2011-10-28 21:23:25 +08:00
if ( intervals ! = null & & intervals . isEmpty ( ) ) {
2011-10-29 00:12:14 +08:00
logger . warn ( "The given combination of -L and -XL options results in an empty set. No intervals to process." ) ;
2011-04-05 02:41:55 +08:00
}
2010-12-15 02:24:18 +08:00
}
2009-07-30 00:11:45 +08:00
/ * *
* Get the sharding strategy given a driving data source .
*
2011-10-15 00:06:41 +08:00
* @param readsDataSource readsDataSource
2009-07-30 00:11:45 +08:00
* @param drivingDataSource Data on which to shard .
2011-10-15 00:06:41 +08:00
* @param intervals intervals
2010-09-24 07:28:55 +08:00
* @return the sharding strategy
2009-07-30 00:11:45 +08:00
* /
2011-09-13 22:49:16 +08:00
protected Iterable < Shard > getShardStrategy ( SAMDataSource readsDataSource , ReferenceSequenceFile drivingDataSource , GenomeLocSortedSet intervals ) {
2010-09-24 07:28:55 +08:00
ValidationExclusion exclusions = ( readsDataSource ! = null ? readsDataSource . getReadsInfo ( ) . getValidationExclusionList ( ) : null ) ;
ReferenceDataSource referenceDataSource = this . getReferenceDataSource ( ) ;
2011-09-13 22:49:16 +08:00
// If reads are present, assume that accessing the reads is always the dominant factor and shard based on that supposition.
if ( ! readsDataSource . isEmpty ( ) ) {
if ( ! readsDataSource . hasIndex ( ) & & ! exclusions . contains ( ValidationExclusion . TYPE . ALLOW_UNINDEXED_BAM ) )
2011-01-15 05:32:53 +08:00
throw new UserException . CommandLineException ( "Cannot process the provided BAM file(s) because they were not indexed. The GATK does offer limited processing of unindexed BAMs in --unsafe mode, but this GATK feature is currently unsupported." ) ;
2011-09-13 22:49:16 +08:00
if ( ! readsDataSource . hasIndex ( ) & & intervals ! = null & & ! argCollection . allowIntervalsWithUnindexedBAM )
2011-01-14 10:49:04 +08:00
throw new UserException . CommandLineException ( "Cannot perform interval processing when reads are present but no index is available." ) ;
2009-12-17 05:55:42 +08:00
2012-01-20 11:05:08 +08:00
if ( walker instanceof LocusWalker ) {
2010-07-19 00:29:59 +08:00
if ( readsDataSource . getSortOrder ( ) ! = SAMFileHeader . SortOrder . coordinate )
2010-09-14 13:04:26 +08:00
throw new UserException . MissortedBAM ( SAMFileHeader . SortOrder . coordinate , "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately." ) ;
2011-09-13 22:49:16 +08:00
if ( intervals = = null )
return readsDataSource . createShardIteratorOverMappedReads ( referenceDataSource . getReference ( ) . getSequenceDictionary ( ) , new LocusShardBalancer ( ) ) ;
else
return readsDataSource . createShardIteratorOverIntervals ( intervals , new LocusShardBalancer ( ) ) ;
2012-01-20 11:05:08 +08:00
}
else if ( walker instanceof ActiveRegionWalker ) {
if ( readsDataSource . getSortOrder ( ) ! = SAMFileHeader . SortOrder . coordinate )
throw new UserException . MissortedBAM ( SAMFileHeader . SortOrder . coordinate , "Active region walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately." ) ;
if ( intervals = = null )
return readsDataSource . createShardIteratorOverMappedReads ( referenceDataSource . getReference ( ) . getSequenceDictionary ( ) , new LocusShardBalancer ( ) ) ;
else
return readsDataSource . createShardIteratorOverIntervals ( ( ( ActiveRegionWalker ) walker ) . extendIntervals ( intervals , this . genomeLocParser , this . getReferenceDataSource ( ) . getReference ( ) ) , new LocusShardBalancer ( ) ) ;
}
2011-09-13 22:49:16 +08:00
else if ( walker instanceof ReadWalker | | walker instanceof ReadPairWalker | | walker instanceof DuplicateWalker ) {
// Apply special validation to read pair walkers.
if ( walker instanceof ReadPairWalker ) {
if ( readsDataSource . getSortOrder ( ) ! = SAMFileHeader . SortOrder . queryname )
throw new UserException . MissortedBAM ( SAMFileHeader . SortOrder . queryname , "Read pair walkers are exceptions in that they cannot be run on coordinate-sorted BAMs but instead require query name-sorted files. You will need to resort your input BAM file in query name order to use this walker." ) ;
if ( intervals ! = null & & ! intervals . isEmpty ( ) )
throw new UserException . CommandLineException ( "Pairs traversal cannot be used in conjunction with intervals." ) ;
}
if ( intervals = = null )
return readsDataSource . createShardIteratorOverAllReads ( new ReadShardBalancer ( ) ) ;
else
return readsDataSource . createShardIteratorOverIntervals ( intervals , new ReadShardBalancer ( ) ) ;
2010-07-07 11:14:59 +08:00
}
2011-09-13 22:49:16 +08:00
else
throw new ReviewedStingException ( "Unable to determine walker type for walker " + walker . getClass ( ) . getName ( ) ) ;
}
else {
2011-12-23 09:54:35 +08:00
// TODO -- Determine what the ideal shard size should be here. Matt suggested that a multiple of 16K might work well
// TODO -- (because of how VCF indexes work), but my empirical experience has been simply that the larger the shard
// TODO -- size the more efficient the traversal (at least for RODWalkers). Keeping the previous values for now. [EB]
final int SHARD_SIZE = walker instanceof RodWalker ? 1000000 : 100000 ;
2011-09-13 22:49:16 +08:00
if ( intervals = = null )
return referenceDataSource . createShardsOverEntireReference ( readsDataSource , genomeLocParser , SHARD_SIZE ) ;
else
return referenceDataSource . createShardsOverIntervals ( readsDataSource , intervals , SHARD_SIZE ) ;
2009-12-17 05:55:42 +08:00
}
2009-07-30 00:11:45 +08:00
}
2010-09-24 07:28:55 +08:00
protected boolean flashbackData ( ) {
return walker instanceof ReadWalker ;
2009-07-30 00:11:45 +08:00
}
2009-05-11 10:07:20 +08:00
2011-01-05 11:07:11 +08:00
/ * *
* Create the temp directory if it doesn ' t exist .
* /
private void initializeTempDirectory ( ) {
File tempDir = new File ( System . getProperty ( "java.io.tmpdir" ) ) ;
2011-10-15 00:06:41 +08:00
if ( ! tempDir . exists ( ) & & ! tempDir . mkdirs ( ) )
throw new UserException . BadTmpDir ( "Unable to create directory" ) ;
2011-01-05 11:07:11 +08:00
}
2009-07-10 06:10:22 +08:00
/ * *
* Initialize the output streams as specified by the user .
*
2009-09-30 06:23:19 +08:00
* @param outputTracker the tracker supplying the initialization data .
2009-07-10 06:10:22 +08:00
* /
2010-09-24 07:28:55 +08:00
private void initializeOutputStreams ( OutputTracker outputTracker ) {
for ( Map . Entry < ArgumentSource , Object > input : getInputs ( ) . entrySet ( ) )
2009-10-06 10:45:31 +08:00
outputTracker . addInput ( input . getKey ( ) , input . getValue ( ) ) ;
2010-09-24 07:28:55 +08:00
for ( Stub < ? > stub : getOutputs ( ) )
2009-08-23 08:56:02 +08:00
outputTracker . addOutput ( stub ) ;
2010-09-25 10:49:30 +08:00
outputTracker . prepareWalker ( walker , getArguments ( ) . strictnessLevel ) ;
2009-05-11 10:07:20 +08:00
}
2010-09-28 10:16:25 +08:00
2010-12-21 10:09:46 +08:00
/** @return the data source backing the reference sequence used by this engine */
public ReferenceDataSource getReferenceDataSource() {
    return this.referenceDataSource;
}
/** @return the GenomeLocParser associated with this engine's reference */
public GenomeLocParser getGenomeLocParser() {
    return this.genomeLocParser;
}
/**
 * Manage lists of filters.
 */
private final FilterManager filterManager = new FilterManager();

// Wall-clock time captured via setStartTime(); null until execute() is first called.
private Date startTime = null; // the start time for execution
/**
 * Sets the parsing engine backing this instance; used by getTags() to resolve
 * the tags associated with command-line arguments.
 *
 * @param parsingEngine the engine that parsed this instance's command line
 */
public void setParser(ParsingEngine parsingEngine) {
    this.parsingEngine = parsingEngine;
}
/**
 * Explicitly set the GenomeLocParser, for unit testing.
 * Note: in normal operation the parser is derived from the reference inside
 * setReferenceDataSource(), so this override is for tests only.
 *
 * @param genomeLocParser GenomeLocParser to use.
 */
public void setGenomeLocParser(GenomeLocParser genomeLocParser) {
    this.genomeLocParser = genomeLocParser;
}
/**
 * Sets the start time when the execute() function was last called.
 *
 * @param startTime the start time when the execute() function was last called
 */
protected void setStartTime(Date startTime) {
    this.startTime = startTime;
}
/**
 * @return the start time when the execute() function was last called, or null if
 *         execute() has not yet run
 */
public Date getStartTime() {
    return startTime;
}
/ * *
* Setup the intervals to be processed
* /
protected void initializeIntervals ( ) {
// return if no interval arguments at all
2011-10-27 02:11:49 +08:00
if ( argCollection . intervals = = null & & argCollection . excludeIntervals = = null )
2010-12-21 10:09:46 +08:00
return ;
2011-10-27 02:11:49 +08:00
// Note that the use of '-L all' is no longer supported.
2010-12-21 10:09:46 +08:00
// if include argument isn't given, create new set of all possible intervals
2012-06-27 13:15:22 +08:00
Pair < GenomeLocSortedSet , GenomeLocSortedSet > includeExcludePair = IntervalUtils . parseIntervalBindingsPair (
this . referenceDataSource ,
argCollection . intervals ,
argCollection . intervalSetRule , argCollection . intervalMerging , argCollection . intervalPadding ,
argCollection . excludeIntervals ) ;
GenomeLocSortedSet includeSortedSet = includeExcludePair . getFirst ( ) ;
GenomeLocSortedSet excludeSortedSet = includeExcludePair . getSecond ( ) ;
2010-12-21 10:09:46 +08:00
// if no exclude arguments, can return parseIntervalArguments directly
2012-06-27 13:15:22 +08:00
if ( excludeSortedSet = = null )
2010-12-21 10:09:46 +08:00
intervals = includeSortedSet ;
2011-10-27 02:11:49 +08:00
// otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets
2010-12-21 10:09:46 +08:00
else {
intervals = includeSortedSet . subtractRegions ( excludeSortedSet ) ;
// logging messages only printed when exclude (-XL) arguments are given
long toPruneSize = includeSortedSet . coveredSize ( ) ;
long toExcludeSize = excludeSortedSet . coveredSize ( ) ;
long intervalSize = intervals . coveredSize ( ) ;
logger . info ( String . format ( "Initial include intervals span %d loci; exclude intervals span %d loci" , toPruneSize , toExcludeSize ) ) ;
logger . info ( String . format ( "Excluding %d loci from original intervals (%.2f%% reduction)" ,
toPruneSize - intervalSize , ( toPruneSize - intervalSize ) / ( 0.01 * toPruneSize ) ) ) ;
}
}
/**
 * Add additional, externally managed IO streams for inputs.
 * Entries registered here are later handed to the OutputTracker during
 * initializeOutputStreams().
 *
 * @param argumentSource Field into which to inject the value.
 * @param value Instance to inject.
 */
public void addInput(ArgumentSource argumentSource, Object value) {
    inputs.put(argumentSource, value);
}
/**
 * Add additional, externally managed IO streams for output.
 * Stubs registered here are later handed to the OutputTracker during
 * initializeOutputStreams().
 *
 * @param stub Instance to inject.
 */
public void addOutput(Stub<?> stub) {
    outputs.add(stub);
}
2011-03-14 23:51:19 +08:00
/**
 * Returns the tag associated with a given command-line argument.
 * Delegates to the parsing engine installed via setParser().
 *
 * @param key Object for which to inspect the tag.
 * @return Tags object associated with the given key, or an empty Tag structure if none are present.
 */
public Tags getTags(Object key) {
    return parsingEngine.getTags(key);
}
2010-12-21 10:09:46 +08:00
/**
 * Initializes the engine's major data sources in dependency order: the reference is
 * opened first (which also creates the GenomeLocParser), then the reads data source
 * (which needs both), then the read filters, and finally the reference-ordered data
 * (ROD) tracks, which are validated against the reference's sequence dictionary.
 */
protected void initializeDataSources() {
    logger.info("Strictness is " + argCollection.strictnessLevel);

    // TODO -- REMOVE ME
    BAQ.DEFAULT_GOP = argCollection.BAQGOP;

    validateSuppliedReference();
    // Must precede createReadsDataSource(): it also constructs genomeLocParser.
    setReferenceDataSource(argCollection.referenceFile);

    validateSuppliedReads();
    readsDataSource = createReadsDataSource(argCollection, genomeLocParser, referenceDataSource.getReference());

    for (ReadFilter filter : filters)
        filter.initialize(this);

    // set the sequence dictionary of all of Tribble tracks to the sequence dictionary of our reference
    rodDataSources = getReferenceOrderedDataSources(referenceMetaDataFiles, referenceDataSource.getReference().getSequenceDictionary(), genomeLocParser, argCollection.unsafe);
}
/**
 * Entry-point function to initialize the samples database from input data and pedigree arguments.
 * Samples are accumulated from the SAM header, the unique sample names found in the ROD
 * tracks, and any pedigree files/strings, before the final (validated) SampleDB is built.
 */
private void initializeSampleDB() {
    SampleDBBuilder sampleDBBuilder = new SampleDBBuilder(this, argCollection.pedigreeValidationType);
    sampleDBBuilder.addSamplesFromSAMHeader(getSAMFileHeader());
    sampleDBBuilder.addSamplesFromSampleNames(SampleUtils.getUniqueSamplesFromRods(this));
    sampleDBBuilder.addSamplesFromPedigreeFiles(argCollection.pedigreeFiles);
    sampleDBBuilder.addSamplesFromPedigreeStrings(argCollection.pedigreeStrings);
    sampleDB = sampleDBBuilder.getFinalSampleDB();
}
/ * *
* Gets a unique identifier for the reader sourcing this read .
* @param read Read to examine .
* @return A unique identifier for the source file of this read . Exception if not found .
* /
public SAMReaderID getReaderIDForRead ( final SAMRecord read ) {
return getReadsDataSource ( ) . getReaderID ( read ) ;
}
/ * *
* Gets the source file for this read .
* @param id Unique identifier determining which input file to use .
* @return The source filename for this read .
* /
public File getSourceFileForReaderID ( final SAMReaderID id ) {
return getReadsDataSource ( ) . getSAMFile ( id ) ;
}
/**
 * Now that all files are open, validate the sequence dictionaries of the reads vs. the
 * reference vs. the reference ordered data (if available).
 *
 * @param reads Reads data source.
 * @param reference Reference data source.
 * @param rods a collection of the reference ordered data tracks
 * @param manager manager used to validate each ROD track's sequence dictionary
 */
private void validateSourcesAgainstReference(SAMDataSource reads, ReferenceSequenceFile reference, Collection<ReferenceOrderedDataSource> rods, RMDTrackBuilder manager) {
    // Nothing to validate when there is no reference, or neither reads nor ROD tracks exist.
    if ((reads.isEmpty() && (rods == null || rods.isEmpty())) || reference == null)
        return;

    // Compile a set of sequence names that exist in the reference file.
    SAMSequenceDictionary referenceDictionary = reference.getSequenceDictionary();

    if (!reads.isEmpty()) {
        // Compile a set of sequence names that exist in the BAM files.
        SAMSequenceDictionary readsDictionary = reads.getHeader().getSequenceDictionary();

        // An empty reads dictionary means an unmapped BAM; there is nothing to compare.
        if (readsDictionary.size() == 0) {
            logger.info("Reads file is unmapped. Skipping validation against reference.");
            return;
        }

        // compare the reads to the reference
        SequenceDictionaryUtils.validateDictionaries(logger, getArguments().unsafe, "reads", readsDictionary, "reference", referenceDictionary);
    }

    // Each ROD track is validated independently against the reference dictionary.
    for (ReferenceOrderedDataSource rod : rods)
        manager.validateTrackSequenceDictionary(rod.getName(), rod.getSequenceDictionary(), referenceDictionary);
}
/**
 * Gets a data source for the given set of reads.
 * Also resolves the effective downsampling method and applies the walker's BAQ policy:
 * BAQ is applied on input only when the walker requests ApplicationTime.ON_INPUT.
 *
 * @param argCollection arguments
 * @param genomeLocParser parser
 * @param refReader reader
 * @return A data source for the given set of reads.
 * @throws UserException.BadArgumentValue if BAQ was requested but the walker forbids it
 */
private SAMDataSource createReadsDataSource(GATKArgumentCollection argCollection, GenomeLocParser genomeLocParser, IndexedFastaSequenceFile refReader) {
    DownsamplingMethod method = getDownsamplingMethod();

    // Synchronize the method back into the collection so that it shows up when
    // interrogating for the downsample method during command line recreation.
    setDownsamplingMethod(method);

    // Fail fast if the user asked for BAQ but the walker cannot accept BAQ'd qualities.
    if (getWalkerBAQApplicationTime() == BAQ.ApplicationTime.FORBIDDEN && argCollection.BAQMode != BAQ.CalculationMode.OFF)
        throw new UserException.BadArgumentValue("baq", "Walker cannot accept BAQ'd base qualities, and yet BAQ mode " + argCollection.BAQMode + " was requested.");

    // NOTE: SAMDataSource takes these arguments positionally -- keep the order intact.
    return new SAMDataSource(
            samReaderIDs,
            threadAllocation,
            argCollection.numberOfBAMFileHandles,
            genomeLocParser,
            argCollection.useOriginalBaseQualities,
            argCollection.strictnessLevel,
            argCollection.readBufferSize,
            method,
            new ValidationExclusion(Arrays.asList(argCollection.unsafe)),
            filters,
            includeReadsWithDeletionAtLoci(),
            // BAQ is applied at read time only when the walker wants it on input.
            getWalkerBAQApplicationTime() == BAQ.ApplicationTime.ON_INPUT ? argCollection.BAQMode : BAQ.CalculationMode.OFF,
            getWalkerBAQQualityMode(),
            refReader,
            getBaseRecalibration(),
            argCollection.defaultBaseQualities);
}
/**
 * Opens a reference sequence file paired with an index. Only public for testing purposes.
 * Also (re)creates the engine's GenomeLocParser from the newly opened reference, so this
 * must run before any component that requires the parser.
 *
 * @param refFile Handle to a reference sequence file. Non-null.
 */
public void setReferenceDataSource(File refFile) {
    this.referenceDataSource = new ReferenceDataSource(refFile);
    genomeLocParser = new GenomeLocParser(referenceDataSource.getReference());
}
/ * *
* Open the reference - ordered data sources .
*
2010-12-31 12:52:22 +08:00
* @param referenceMetaDataFiles collection of RMD descriptors to load and validate .
* @param sequenceDictionary GATK - wide sequnce dictionary to use for validation .
* @param genomeLocParser to use when creating and validating GenomeLocs .
* @param validationExclusionType potentially indicate which validations to include / exclude .
*
2010-12-21 10:09:46 +08:00
* @return A list of reference - ordered data sources .
* /
2010-12-31 12:52:22 +08:00
private List < ReferenceOrderedDataSource > getReferenceOrderedDataSources ( Collection < RMDTriplet > referenceMetaDataFiles ,
SAMSequenceDictionary sequenceDictionary ,
GenomeLocParser genomeLocParser ,
ValidationExclusion . TYPE validationExclusionType ) {
2012-05-23 04:27:13 +08:00
VCFHeader header = null ;
if ( getArguments ( ) . repairVCFHeader ! = null ) {
try {
final PositionalBufferedStream pbs = new PositionalBufferedStream ( new FileInputStream ( getArguments ( ) . repairVCFHeader ) ) ;
header = ( VCFHeader ) new VCFCodec ( ) . readHeader ( pbs ) . getHeaderValue ( ) ;
pbs . close ( ) ;
} catch ( IOException e ) {
throw new UserException . CouldNotReadInputFile ( getArguments ( ) . repairVCFHeader , e ) ;
}
}
2012-06-25 22:27:37 +08:00
RMDTrackBuilder builder = new RMDTrackBuilder ( sequenceDictionary , genomeLocParser , header , validationExclusionType ) ;
2010-12-31 12:52:22 +08:00
2010-12-21 10:09:46 +08:00
List < ReferenceOrderedDataSource > dataSources = new ArrayList < ReferenceOrderedDataSource > ( ) ;
2010-12-31 12:52:22 +08:00
for ( RMDTriplet fileDescriptor : referenceMetaDataFiles )
dataSources . add ( new ReferenceOrderedDataSource ( fileDescriptor ,
builder ,
sequenceDictionary ,
2010-12-21 10:09:46 +08:00
genomeLocParser ,
flashbackData ( ) ) ) ;
2010-12-31 12:52:22 +08:00
// validation: check to make sure everything the walker needs is present, and that all sequence dictionaries match.
validateSourcesAgainstReference ( readsDataSource , referenceDataSource . getReference ( ) , dataSources , builder ) ;
2010-12-21 10:09:46 +08:00
return dataSources ;
}
/ * *
* Returns the SAM File Header from the input reads ' data source file
* @return the SAM File Header from the input reads ' data source file
* /
public SAMFileHeader getSAMFileHeader ( ) {
return readsDataSource . getHeader ( ) ;
}
2012-06-25 22:27:37 +08:00
/**
 * @return true if lenient VCF processing is enabled for this engine, as determined
 *         by the validation-exclusion setting in argCollection.unsafe
 */
public boolean lenientVCFProcessing() {
    return lenientVCFProcessing(argCollection.unsafe);
}
/**
 * @param val the validation-exclusion setting to inspect (may be null, yielding false)
 * @return true when {@code val} is ALL or LENIENT_VCF_PROCESSING
 */
public static boolean lenientVCFProcessing(final ValidationExclusion.TYPE val) {
    return val == ValidationExclusion.TYPE.ALL
            || val == ValidationExclusion.TYPE.LENIENT_VCF_PROCESSING;
}
2010-12-21 10:09:46 +08:00
/**
 * Returns the unmerged SAM file header for an individual reader.
 *
 * @param reader The reader.
 * @return Header for that reader.
 */
public SAMFileHeader getSAMFileHeader(SAMReaderID reader) {
    return readsDataSource.getHeader(reader);
}
2011-09-30 22:43:51 +08:00
/ * *
* Returns an ordered list of the unmerged SAM file headers known to this engine .
* @return list of header for each input SAM file , in command line order
* /
public List < SAMFileHeader > getSAMFileHeaders ( ) {
final List < SAMFileHeader > headers = new ArrayList < SAMFileHeader > ( ) ;
for ( final SAMReaderID id : getReadsDataSource ( ) . getReaderIDs ( ) ) {
headers . add ( getReadsDataSource ( ) . getHeader ( id ) ) ;
}
return headers ;
}
2011-09-20 22:53:18 +08:00
/ * *
* Gets the master sequence dictionary for this GATK engine instance
* @return a never - null dictionary listing all of the contigs known to this engine instance
* /
public SAMSequenceDictionary getMasterSequenceDictionary ( ) {
return getReferenceDataSource ( ) . getReference ( ) . getSequenceDictionary ( ) ;
}
2010-12-21 10:09:46 +08:00
/**
 * Returns data source object encapsulating all essential info and handlers used to traverse
 * reads; header merger, individual file readers etc can be accessed through the returned data source object.
 *
 * @return the reads data source
 */
public SAMDataSource getReadsDataSource() {
    return this.readsDataSource;
}
/**
 * Sets the collection of GATK main application arguments.
 *
 * @param argCollection the GATK argument collection
 */
public void setArguments(GATKArgumentCollection argCollection) {
    this.argCollection = argCollection;
}
/**
 * Gets the collection of GATK main application arguments.
 *
 * @return the GATK argument collection
 */
public GATKArgumentCollection getArguments() {
    return this.argCollection;
}
/**
 * Get the list of intervals passed to the engine.
 * Populated by initializeIntervals() from the include/exclude interval arguments.
 *
 * @return List of intervals, or null if no intervals are in use
 */
public GenomeLocSortedSet getIntervals() {
    return this.intervals;
}
/**
 * Gets the list of filters employed by this engine.
 * @return Collection of filters (actual instances) used by this engine.
 */
public Collection<ReadFilter> getFilters() {
    return this.filters;
}
/**
 * Sets the list of filters employed by this engine.
 * @param filters Collection of filters (actual instances) used by this engine.
 */
public void setFilters(Collection<ReadFilter> filters) {
    this.filters = filters;
}
/**
 * Gets the filter manager for this engine.
 * @return filter manager for this engine.
 */
protected FilterManager getFilterManager() {
    return filterManager;
}
/**
 * Gets the input sources for this engine, as registered via addInput().
 * @return input sources for this engine.
 */
protected Map<ArgumentSource, Object> getInputs() {
    return inputs;
}
/**
 * Gets the output stubs for this engine, as registered via addOutput().
 * @return output stubs for this engine.
 */
protected Collection<Stub<?>> getOutputs() {
    return outputs;
}
/**
 * Returns data source objects encapsulating all rod data;
 * individual rods can be accessed through the returned data source objects.
 *
 * @return the rods data sources
 */
public List<ReferenceOrderedDataSource> getRodDataSources() {
    return this.rodDataSources;
}
/**
 * Gets cumulative metrics about the entire run to this point.
 * Returns a clone of this snapshot in time.
 *
 * @return cumulative metrics about the entire run at this point, or null if the reads
 *         data source has not been initialized. ReadMetrics object is a unique instance
 *         and is owned by the caller; the caller can do with the object what they wish.
 */
public ReadMetrics getCumulativeMetrics() {
    return readsDataSource == null ? null : readsDataSource.getCumulativeReadMetrics();
}
2011-09-29 22:34:51 +08:00
// -------------------------------------------------------------------------------------
//
// code for working with Samples database
//
// -------------------------------------------------------------------------------------
2010-12-21 10:09:46 +08:00
2011-10-04 00:33:30 +08:00
/**
 * Gets the samples database built by initializeSampleDB().
 * @return the SampleDB for this engine
 */
public SampleDB getSampleDB() {
    return this.sampleDB;
}
/**
 * Reconstructs an approximate map of command-line arguments from the given providers,
 * using this engine's parsing engine.
 *
 * @param argumentProviders objects whose argument fields should be inspected
 * @return map of argument name to argument value, as reconstructed by CommandLineUtils
 */
public Map<String, String> getApproximateCommandLineArguments(Object... argumentProviders) {
    return CommandLineUtils.getApproximateCommandLineArguments(parsingEngine, argumentProviders);
}
/**
 * Reconstructs an approximate command-line string from the given providers,
 * using this engine's parsing engine.
 *
 * @param argumentProviders objects whose argument fields should be inspected
 * @return a single string approximating the original command line, as built by CommandLineUtils
 */
public String createApproximateCommandLineArgumentString(Object... argumentProviders) {
    return CommandLineUtils.createApproximateCommandLineArgumentString(parsingEngine, argumentProviders);
}
2009-05-11 10:07:20 +08:00
}