/*
 * Copyright (c) 2010, The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
2009-05-11 10:07:20 +08:00
package org.broadinstitute.sting.gatk ;
2010-12-21 10:09:46 +08:00
import net.sf.picard.reference.IndexedFastaSequenceFile ;
2010-04-01 06:39:56 +08:00
import net.sf.picard.reference.ReferenceSequenceFile ;
2010-12-21 10:09:46 +08:00
import net.sf.samtools.* ;
import org.apache.log4j.Logger ;
2011-07-18 08:29:58 +08:00
import org.broadinstitute.sting.commandline.* ;
2010-04-01 06:39:56 +08:00
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection ;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion ;
2011-07-18 08:29:58 +08:00
import org.broadinstitute.sting.gatk.datasources.reads.* ;
2011-02-04 01:59:19 +08:00
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource ;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource ;
2010-12-21 10:09:46 +08:00
import org.broadinstitute.sting.gatk.datasources.sample.Sample ;
import org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource ;
2009-05-11 10:07:20 +08:00
import org.broadinstitute.sting.gatk.executive.MicroScheduler ;
2010-12-21 10:09:46 +08:00
import org.broadinstitute.sting.gatk.filters.FilterManager ;
2011-05-05 03:29:08 +08:00
import org.broadinstitute.sting.gatk.filters.ReadFilter ;
2010-12-21 10:09:46 +08:00
import org.broadinstitute.sting.gatk.filters.ReadGroupBlackListFilter ;
2009-08-23 08:56:02 +08:00
import org.broadinstitute.sting.gatk.io.OutputTracker ;
2010-09-24 07:28:55 +08:00
import org.broadinstitute.sting.gatk.io.stubs.Stub ;
2010-04-01 06:39:56 +08:00
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack ;
2011-07-26 01:21:52 +08:00
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder ;
2010-12-21 10:09:46 +08:00
import org.broadinstitute.sting.gatk.refdata.utils.RMDIntervalGenerator ;
2010-12-23 03:00:17 +08:00
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet ;
2010-04-01 06:39:56 +08:00
import org.broadinstitute.sting.gatk.walkers.* ;
2010-09-24 07:28:55 +08:00
import org.broadinstitute.sting.utils.GenomeLoc ;
import org.broadinstitute.sting.utils.GenomeLocParser ;
import org.broadinstitute.sting.utils.GenomeLocSortedSet ;
2010-12-21 10:09:46 +08:00
import org.broadinstitute.sting.utils.SequenceDictionaryUtils ;
BAQ calculation refactoring in the GATK. Single -baq argument can be NONE, CALCULATE_AS_NECESSARY, and RECALCULATE. Walkers can control bia the @BAQMode annotation how the BAQ calculation is applied. Can either be as a tag, by overwriting the qualities scores, or by only returning the baq-capped qualities scores. Additionally, walkers can be set up to have the BAQ applied to the incoming reads (ON_INPUT, the default), to output reads (ON_OUTPUT), or HANDLED_BY_WALKER, which means that calling into the BAQ system is the responsibility of the individual walker.
SAMFileWriterStub now supports BAQ writing as an internal feature. Several walkers have the @BAQMode applied to this, with parameters that I think are reasonable. Please look if you own these walkers, though
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4798 348d0f76-0448-11de-a6fe-93d51630548a
2010-12-07 04:55:52 +08:00
import org.broadinstitute.sting.utils.baq.BAQ ;
2010-09-24 07:28:55 +08:00
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException ;
import org.broadinstitute.sting.utils.exceptions.UserException ;
2010-12-21 10:09:46 +08:00
import org.broadinstitute.sting.utils.interval.IntervalUtils ;
2011-07-18 08:29:58 +08:00
import org.broadinstitute.sting.utils.variantcontext.VariantContext ;
2009-05-11 10:07:20 +08:00
2010-12-21 10:09:46 +08:00
import java.io.File ;
2009-07-30 00:11:45 +08:00
import java.util.* ;
2009-05-11 10:07:20 +08:00
2010-09-24 07:28:55 +08:00
/**
 * A GenomeAnalysisEngine that runs a specified walker.
 */
2010-12-21 10:09:46 +08:00
public class GenomeAnalysisEngine {
    /**
     * Our log, which we want to capture anything from this class.
     */
    private static Logger logger = Logger.getLogger(GenomeAnalysisEngine.class);

    /**
     * The GATK command-line argument parsing code.
     */
    private ParsingEngine parsingEngine;

    /**
     * The genomeLocParser can create and parse GenomeLocs.
     */
    private GenomeLocParser genomeLocParser;

    /**
     * Accessor for sharded read data.
     */
    private SAMDataSource readsDataSource = null;

    /**
     * Accessor for sharded reference data.
     */
    private ReferenceDataSource referenceDataSource = null;

    /**
     * Accessor for sample metadata.
     */
    private SampleDataSource sampleDataSource = null;

    /**
     * Accessor for sharded reference-ordered data.
     */
    private List<ReferenceOrderedDataSource> rodDataSources;

    // Our argument collection, populated by the command-line front end.
    private GATKArgumentCollection argCollection;

    /**
     * Collection of intervals used by the engine; null means no interval restriction was supplied.
     */
    private GenomeLocSortedSet intervals = null;
2011-04-05 02:41:55 +08:00
/ * *
* Explicitly assign the interval set to use for this traversal ( for unit testing purposes )
* @param intervals set of intervals to use for this traversal
* /
public void setIntervals ( GenomeLocSortedSet intervals ) {
this . intervals = intervals ;
}
2010-12-21 10:09:46 +08:00
    /**
     * Collection of inputs used by the engine, keyed by the command-line argument source.
     */
    private Map<ArgumentSource, Object> inputs = new HashMap<ArgumentSource, Object>();

    /**
     * Collection of output stubs used by the engine.
     */
    private Collection<Stub<?>> outputs = new ArrayList<Stub<?>>();

    /**
     * Collection of the read filters applied to the input data.
     */
    private Collection<ReadFilter> filters;
2010-12-21 10:09:46 +08:00
2011-01-18 05:23:09 +08:00
    /**
     * A currently hacky unique name for this GATK instance.
     * NOTE(review): derived from the shared random generator, so uniqueness across
     * concurrent instances is only probabilistic -- confirm no caller requires true uniqueness.
     */
    private String myName = "GATK_" + Math.abs(getRandomGenerator().nextInt());
2011-01-18 05:23:09 +08:00
2009-10-06 10:45:31 +08:00
    /**
     * Our walker manager, used to discover and instantiate walkers by name or type.
     */
    private final WalkerManager walkerManager = new WalkerManager();

    /**
     * The walker this engine will run; must be set before execute() is called.
     */
    private Walker<?, ?> walker;

    /**
     * Set the walker for this engine to run during the traversal.
     *
     * @param walker the walker to run
     */
    public void setWalker(Walker<?, ?> walker) {
        this.walker = walker;
    }
2009-05-11 10:07:20 +08:00
2010-12-23 03:00:17 +08:00
    /**
     * A processed collection of SAM reader identifiers.
     */
    private Collection<SAMReaderID> samReaderIDs = Collections.emptyList();

    /**
     * Set the SAM/BAM files over which to traverse.
     *
     * @param samReaderIDs collection of ids to use during this traversal
     */
    public void setSAMFileIDs(Collection<SAMReaderID> samReaderIDs) {
        this.samReaderIDs = samReaderIDs;
    }

    /**
     * Collection of reference metadata files over which to traverse.
     */
    private Collection<RMDTriplet> referenceMetaDataFiles;

    /**
     * Set the reference metadata files to use for this traversal.
     *
     * @param referenceMetaDataFiles collection of files and descriptors over which to traverse
     */
    public void setReferenceMetaDataFiles(Collection<RMDTriplet> referenceMetaDataFiles) {
        this.referenceMetaDataFiles = referenceMetaDataFiles;
    }
2011-04-08 01:03:48 +08:00
    /**
     * Static random number generator and seed.  A fixed default seed keeps runs
     * reproducible unless the user opts into a non-deterministic seed.
     * NOTE(review): this single Random instance is shared process-wide; confirm
     * multi-threaded traversals do not require independent per-thread streams.
     */
    private static final long GATK_RANDOM_SEED = 47382911L;
    private static Random randomGenerator = new Random(GATK_RANDOM_SEED);

    /** @return the shared, engine-wide random number generator */
    public static Random getRandomGenerator() { return randomGenerator; }

    /** Reset the shared generator back to the default fixed seed. */
    public static void resetRandomGenerator() { randomGenerator.setSeed(GATK_RANDOM_SEED); }

    /** Reset the shared generator to the given seed. */
    public static void resetRandomGenerator(long seed) { randomGenerator.setSeed(seed); }
2009-07-10 07:59:53 +08:00
    /**
     * Actually run the GATK with the specified walker.
     *
     * Performs, in order: argument/walker validation, optional wall-clock re-seeding of the
     * random generator, data source initialization, interval initialization and validation,
     * microscheduler creation, temp-directory and output-stream setup, shard strategy
     * selection, and finally the scheduled traversal itself.
     *
     * @return the value of this traversal, as produced by the microscheduler
     */
    public Object execute() {
        //HeapSizeMonitor monitor = new HeapSizeMonitor();
        //monitor.start();
        setStartTime(new java.util.Date());

        // validate our parameters
        if (this.getArguments() == null) {
            throw new ReviewedStingException("The GATKArgumentCollection passed to GenomeAnalysisEngine can not be null.");
        }

        // validate our parameters
        if (this.walker == null)
            throw new ReviewedStingException("The walker passed to GenomeAnalysisEngine can not be null.");

        // user asked for a non-reproducible run: re-seed from the wall clock
        if (this.getArguments().nonDeterministicRandomSeed)
            resetRandomGenerator(System.currentTimeMillis());

        // Prepare the data for traversal.
        initializeDataSources();

        // initialize and validate the interval list
        initializeIntervals();
        validateSuppliedIntervals();

        // our microscheduler, which is in charge of running everything
        MicroScheduler microScheduler = createMicroscheduler();

        // create temp directories as necessary
        initializeTempDirectory();

        // create the output streams
        initializeOutputStreams(microScheduler.getOutputTracker());

        ShardStrategy shardStrategy = getShardStrategy(readsDataSource, microScheduler.getReference(), intervals);

        // execute the microscheduler, storing the results
        Object result = microScheduler.execute(this.walker, shardStrategy);

        //monitor.stop();
        //logger.info(String.format("Maximum heap size consumed: %d",monitor.getMaxMemoryUsed()));
        return result;
    }
2009-07-10 07:59:53 +08:00
/ * *
* Retrieves an instance of the walker based on the walker name .
2009-10-06 10:45:31 +08:00
*
2009-07-10 07:59:53 +08:00
* @param walkerName Name of the walker . Must not be null . If the walker cannot be instantiated , an exception will be thrown .
* @return An instance of the walker .
* /
2009-10-06 10:45:31 +08:00
public Walker < ? , ? > getWalkerByName ( String walkerName ) {
2009-11-11 02:40:16 +08:00
return walkerManager . createByName ( walkerName ) ;
}
/ * *
* Gets the name of a given walker type .
* @param walkerType Type of walker .
* @return Name of the walker .
* /
2010-05-21 03:02:02 +08:00
public String getWalkerName ( Class < ? extends Walker > walkerType ) {
2009-11-11 02:40:16 +08:00
return walkerManager . getName ( walkerType ) ;
2009-07-10 07:59:53 +08:00
}
2011-01-18 05:23:09 +08:00
public String getName ( ) {
return myName ;
}
2009-11-11 07:36:17 +08:00
/ * *
* Gets a list of the filters to associate with the given walker . Will NOT initialize the engine with this filters ;
* the caller must handle that directly .
* @return A collection of available filters .
* /
2011-05-05 03:29:08 +08:00
public Collection < ReadFilter > createFilters ( ) {
Set < ReadFilter > filters = new HashSet < ReadFilter > ( ) ;
2010-09-24 07:28:55 +08:00
filters . addAll ( WalkerManager . getReadFilters ( walker , this . getFilterManager ( ) ) ) ;
2010-12-21 10:09:46 +08:00
if ( this . getArguments ( ) . readGroupBlackList ! = null & & this . getArguments ( ) . readGroupBlackList . size ( ) > 0 )
filters . add ( new ReadGroupBlackListFilter ( this . getArguments ( ) . readGroupBlackList ) ) ;
for ( String filterName : this . getArguments ( ) . readFilters )
filters . add ( this . getFilterManager ( ) . createByName ( filterName ) ) ;
2009-11-11 07:36:17 +08:00
return Collections . unmodifiableSet ( filters ) ;
}
2009-12-05 07:24:29 +08:00
/ * *
* Allow subclasses and others within this package direct access to the walker manager .
* @return The walker manager used by this package .
* /
protected WalkerManager getWalkerManager ( ) {
return walkerManager ;
}
2010-09-24 07:28:55 +08:00
2009-05-11 10:07:20 +08:00
/ * *
* setup a microscheduler
2009-10-06 10:45:31 +08:00
*
2009-05-11 10:07:20 +08:00
* @return a new microscheduler
* /
2010-09-24 07:28:55 +08:00
private MicroScheduler createMicroscheduler ( ) {
2010-03-22 07:22:25 +08:00
// Temporarily require all walkers to have a reference, even if that reference is not conceptually necessary.
2011-01-14 10:49:04 +08:00
if ( ( walker instanceof ReadWalker | | walker instanceof DuplicateWalker | | walker instanceof ReadPairWalker ) & &
2010-09-24 07:28:55 +08:00
this . getArguments ( ) . referenceFile = = null ) {
2010-09-12 22:02:43 +08:00
throw new UserException . CommandLineException ( "Read-based traversals require a reference file but none was given" ) ;
2009-05-11 10:07:20 +08:00
}
2011-01-14 10:49:04 +08:00
return MicroScheduler . create ( this , walker , this . getReadsDataSource ( ) , this . getReferenceDataSource ( ) . getReference ( ) , this . getRodDataSources ( ) , this . getArguments ( ) . numberOfThreads ) ;
2010-02-04 12:12:49 +08:00
}
2010-09-24 07:28:55 +08:00
protected DownsamplingMethod getDownsamplingMethod ( ) {
GATKArgumentCollection argCollection = this . getArguments ( ) ;
DownsamplingMethod method ;
2010-08-27 05:38:03 +08:00
if ( argCollection . getDownsamplingMethod ( ) ! = null )
method = argCollection . getDownsamplingMethod ( ) ;
2010-05-19 13:40:05 +08:00
else if ( WalkerManager . getDownsamplingMethod ( walker ) ! = null )
method = WalkerManager . getDownsamplingMethod ( walker ) ;
else
2010-08-27 05:38:03 +08:00
method = argCollection . getDefaultDownsamplingMethod ( ) ;
2010-09-24 07:28:55 +08:00
return method ;
}
BAQ calculation refactoring in the GATK. Single -baq argument can be NONE, CALCULATE_AS_NECESSARY, and RECALCULATE. Walkers can control bia the @BAQMode annotation how the BAQ calculation is applied. Can either be as a tag, by overwriting the qualities scores, or by only returning the baq-capped qualities scores. Additionally, walkers can be set up to have the BAQ applied to the incoming reads (ON_INPUT, the default), to output reads (ON_OUTPUT), or HANDLED_BY_WALKER, which means that calling into the BAQ system is the responsibility of the individual walker.
SAMFileWriterStub now supports BAQ writing as an internal feature. Several walkers have the @BAQMode applied to this, with parameters that I think are reasonable. Please look if you own these walkers, though
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4798 348d0f76-0448-11de-a6fe-93d51630548a
2010-12-07 04:55:52 +08:00
public BAQ . QualityMode getWalkerBAQQualityMode ( ) { return WalkerManager . getBAQQualityMode ( walker ) ; }
public BAQ . ApplicationTime getWalkerBAQApplicationTime ( ) { return WalkerManager . getBAQApplicationTime ( walker ) ; }
2010-09-24 07:28:55 +08:00
protected boolean generateExtendedEvents ( ) {
return walker . generateExtendedEvents ( ) ;
}
2010-05-19 13:40:05 +08:00
2010-09-24 07:28:55 +08:00
protected boolean includeReadsWithDeletionAtLoci ( ) {
return walker . includeReadsWithDeletionAtLoci ( ) ;
2009-06-10 21:39:32 +08:00
}
2009-07-30 07:00:15 +08:00
/ * *
2011-03-31 06:23:24 +08:00
* Verifies that the supplied set of reads files mesh with what the walker says it requires ,
* and also makes sure that there were no duplicate SAM files specified on the command line .
2009-07-30 07:00:15 +08:00
* /
2010-09-24 07:28:55 +08:00
protected void validateSuppliedReads ( ) {
GATKArgumentCollection arguments = this . getArguments ( ) ;
2009-05-20 07:26:17 +08:00
// Check what the walker says is required against what was provided on the command line.
2009-07-10 05:57:00 +08:00
if ( WalkerManager . isRequired ( walker , DataSource . READS ) & & ( arguments . samFiles = = null | | arguments . samFiles . size ( ) = = 0 ) )
2010-09-23 20:08:27 +08:00
throw new ArgumentException ( "Walker requires reads but none were provided." ) ;
2009-05-20 07:26:17 +08:00
// Check what the walker says is allowed against what was provided on the command line.
2009-07-10 05:57:00 +08:00
if ( ( arguments . samFiles ! = null & & arguments . samFiles . size ( ) > 0 ) & & ! WalkerManager . isAllowed ( walker , DataSource . READS ) )
2010-09-23 20:08:27 +08:00
throw new ArgumentException ( "Walker does not allow reads but reads were provided." ) ;
2011-03-31 06:23:24 +08:00
// Make sure no SAM files were specified multiple times by the user.
checkForDuplicateSamFiles ( ) ;
}
/ * *
* Checks whether there are SAM files that appear multiple times in the fully unpacked list of
* SAM files ( samReaderIDs ) . If there are , throws an ArgumentException listing the files in question .
* /
protected void checkForDuplicateSamFiles ( ) {
Set < SAMReaderID > encounteredSamFiles = new HashSet < SAMReaderID > ( ) ;
Set < String > duplicateSamFiles = new LinkedHashSet < String > ( ) ;
for ( SAMReaderID samFile : samReaderIDs ) {
if ( encounteredSamFiles . contains ( samFile ) ) {
duplicateSamFiles . add ( samFile . getSamFilePath ( ) ) ;
}
else {
encounteredSamFiles . add ( samFile ) ;
}
}
if ( duplicateSamFiles . size ( ) > 0 ) {
throw new ArgumentException ( "The following BAM files appear multiple times in the list of input files: " +
duplicateSamFiles + " BAM files may be specified at most once." ) ;
}
2009-07-30 07:00:15 +08:00
}
/ * *
* Verifies that the supplied reference file mesh with what the walker says it requires .
* /
2010-09-24 07:28:55 +08:00
protected void validateSuppliedReference ( ) {
GATKArgumentCollection arguments = this . getArguments ( ) ;
2009-07-30 07:00:15 +08:00
// Check what the walker says is required against what was provided on the command line.
2009-12-11 03:15:48 +08:00
// TODO: Temporarily disabling WalkerManager.isRequired check on the reference because the reference is always required.
if ( /*WalkerManager.isRequired(walker, DataSource.REFERENCE) &&*/ arguments . referenceFile = = null )
2010-09-23 20:08:27 +08:00
throw new ArgumentException ( "Walker requires a reference but none was provided." ) ;
2009-07-30 07:00:15 +08:00
// Check what the walker says is allowed against what was provided on the command line.
2009-07-10 05:57:00 +08:00
if ( arguments . referenceFile ! = null & & ! WalkerManager . isAllowed ( walker , DataSource . REFERENCE ) )
2010-09-23 20:08:27 +08:00
throw new ArgumentException ( "Walker does not allow a reference but one was provided." ) ;
2009-07-30 07:00:15 +08:00
}
2009-05-20 07:26:17 +08:00
2010-12-15 02:24:18 +08:00
protected void validateSuppliedIntervals ( ) {
// Only read walkers support '-L unmapped' intervals. Trap and validate any other instances of -L unmapped.
if ( ! ( walker instanceof ReadWalker ) ) {
GenomeLocSortedSet intervals = getIntervals ( ) ;
if ( intervals ! = null & & getIntervals ( ) . contains ( GenomeLoc . UNMAPPED ) )
throw new ArgumentException ( "Interval list specifies unmapped region. Only read walkers may include the unmapped region." ) ;
}
2011-04-05 02:41:55 +08:00
// If intervals is non-null and empty at this point, it means that the list of intervals to process
// was filtered down to an empty set (eg., the user specified something like -L chr1 -XL chr1). Since
// this was very likely unintentional, the user should be informed of this. Note that this is different
// from the case where intervals == null, which indicates either that there were no interval arguments,
// or that -L all was specified.
if ( intervals ! = null & & intervals . isEmpty ( ) ) {
throw new ArgumentException ( "The given combination of -L and -XL options results in an empty set. " +
"No intervals to process." ) ;
}
2010-12-15 02:24:18 +08:00
}
2009-07-30 00:11:45 +08:00
    /**
     * Get the sharding strategy given a driving data source.
     *
     * Falls back to monolithic sharding when the BAM input has no index (only locus-,
     * read-, duplicate- and read-pair-walkers are supported there); otherwise selects a
     * shatter strategy and shard size based on the walker type and any supplied intervals.
     *
     * @param readsDataSource   the reads, possibly null for reads-free traversals
     * @param drivingDataSource data on which to shard (supplies the sequence dictionary)
     * @param intervals         interval restriction, or null for the whole genome
     * @return the sharding strategy
     */
    protected ShardStrategy getShardStrategy(SAMDataSource readsDataSource, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals) {
        ValidationExclusion exclusions = (readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null);
        ReferenceDataSource referenceDataSource = this.getReferenceDataSource();

        // Use monolithic sharding if no index is present.  Monolithic sharding is always required for the original
        // sharding system; it's required with the new sharding system only for locus walkers.
        if (readsDataSource != null && !readsDataSource.hasIndex()) {
            if (!exclusions.contains(ValidationExclusion.TYPE.ALLOW_UNINDEXED_BAM))
                throw new UserException.CommandLineException("Cannot process the provided BAM file(s) because they were not indexed. The GATK does offer limited processing of unindexed BAMs in --unsafe mode, but this GATK feature is currently unsupported.");

            if (intervals != null && !argCollection.allowIntervalsWithUnindexedBAM)
                throw new UserException.CommandLineException("Cannot perform interval processing when reads are present but no index is available.");

            Shard.ShardType shardType;
            if (walker instanceof LocusWalker) {
                // Locus traversal requires coordinate-sorted input.
                if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate)
                    throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately.");
                shardType = Shard.ShardType.LOCUS;
            }
            else if (walker instanceof ReadWalker || walker instanceof DuplicateWalker || walker instanceof ReadPairWalker)
                shardType = Shard.ShardType.READ;
            else
                throw new UserException.CommandLineException("The GATK cannot currently process unindexed BAM files");

            // Region to traverse: the supplied intervals, or every full contig in the dictionary.
            List<GenomeLoc> region;
            if (intervals != null)
                region = intervals.toList();
            else {
                region = new ArrayList<GenomeLoc>();
                for (SAMSequenceRecord sequenceRecord : drivingDataSource.getSequenceDictionary().getSequences())
                    region.add(getGenomeLocParser().createGenomeLoc(sequenceRecord.getSequenceName(), 1, sequenceRecord.getSequenceLength()));
            }

            return new MonolithicShardStrategy(getGenomeLocParser(), readsDataSource, shardType, region);
        }

        ShardStrategy shardStrategy = null;
        ShardStrategyFactory.SHATTER_STRATEGY shardType;

        long SHARD_SIZE = 100000L;

        if (walker instanceof LocusWalker) {
            // ROD-only walkers get much larger shards since they touch no reads.
            if (walker instanceof RodWalker) SHARD_SIZE *= 1000;

            if (intervals != null && !intervals.isEmpty()) {
                if (!readsDataSource.isEmpty() && readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate)
                    throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately.");

                shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
                        referenceDataSource.getReference(),
                        ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL,
                        drivingDataSource.getSequenceDictionary(),
                        SHARD_SIZE,
                        getGenomeLocParser(),
                        intervals);
            } else
                shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
                        referenceDataSource.getReference(),
                        ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL,
                        drivingDataSource.getSequenceDictionary(),
                        SHARD_SIZE, getGenomeLocParser());
        } else if (walker instanceof ReadWalker ||
                walker instanceof DuplicateWalker) {
            shardType = ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL;

            if (intervals != null && !intervals.isEmpty()) {
                shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
                        referenceDataSource.getReference(),
                        shardType,
                        drivingDataSource.getSequenceDictionary(),
                        SHARD_SIZE,
                        getGenomeLocParser(),
                        intervals);
            } else {
                shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
                        referenceDataSource.getReference(),
                        shardType,
                        drivingDataSource.getSequenceDictionary(),
                        SHARD_SIZE,
                        getGenomeLocParser());
            }
        } else if (walker instanceof ReadPairWalker) {
            // Pair traversal requires queryname sorting and is incompatible with intervals.
            if (readsDataSource != null && readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.queryname)
                throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.queryname, "Read pair walkers can only walk over query name-sorted data. Please resort your input BAM file.");
            if (intervals != null && !intervals.isEmpty())
                throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals.");

            shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
                    referenceDataSource.getReference(),
                    ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL,
                    drivingDataSource.getSequenceDictionary(),
                    SHARD_SIZE,
                    getGenomeLocParser());
        } else
            throw new ReviewedStingException("Unable to support walker of type" + walker.getClass().getName());

        return shardStrategy;
    }
2010-09-24 07:28:55 +08:00
protected boolean flashbackData ( ) {
return walker instanceof ReadWalker ;
2009-07-30 00:11:45 +08:00
}
2009-05-11 10:07:20 +08:00
2011-01-05 11:07:11 +08:00
/ * *
* Create the temp directory if it doesn ' t exist .
* /
private void initializeTempDirectory ( ) {
File tempDir = new File ( System . getProperty ( "java.io.tmpdir" ) ) ;
tempDir . mkdirs ( ) ;
}
2009-07-10 06:10:22 +08:00
/ * *
* Initialize the output streams as specified by the user .
*
2009-09-30 06:23:19 +08:00
* @param outputTracker the tracker supplying the initialization data .
2009-07-10 06:10:22 +08:00
* /
2010-09-24 07:28:55 +08:00
private void initializeOutputStreams ( OutputTracker outputTracker ) {
for ( Map . Entry < ArgumentSource , Object > input : getInputs ( ) . entrySet ( ) )
2009-10-06 10:45:31 +08:00
outputTracker . addInput ( input . getKey ( ) , input . getValue ( ) ) ;
2010-09-24 07:28:55 +08:00
for ( Stub < ? > stub : getOutputs ( ) )
2009-08-23 08:56:02 +08:00
outputTracker . addOutput ( stub ) ;
2010-09-25 10:49:30 +08:00
outputTracker . prepareWalker ( walker , getArguments ( ) . strictnessLevel ) ;
2009-05-11 10:07:20 +08:00
}
2010-09-28 10:16:25 +08:00
2010-12-21 10:09:46 +08:00
    /**
     * @return the reference data source backing this engine
     */
    public ReferenceDataSource getReferenceDataSource() {
        return referenceDataSource;
    }

    /**
     * @return the parser used to create and parse GenomeLocs
     */
    public GenomeLocParser getGenomeLocParser() {
        return genomeLocParser;
    }

    /**
     * Manage lists of filters.
     */
    private final FilterManager filterManager = new FilterManager();

    // The start time for execution, set when execute() begins.
    private Date startTime = null;

    /**
     * Supply the command-line parsing engine used to process this run's arguments.
     *
     * @param parsingEngine the parsing engine
     */
    public void setParser(ParsingEngine parsingEngine) {
        this.parsingEngine = parsingEngine;
    }
/ * *
* Explicitly set the GenomeLocParser , for unit testing .
* @param genomeLocParser GenomeLocParser to use .
* /
public void setGenomeLocParser ( GenomeLocParser genomeLocParser ) {
this . genomeLocParser = genomeLocParser ;
}
/ * *
* Sets the start time when the execute ( ) function was last called
* @param startTime the start time when the execute ( ) function was last called
* /
protected void setStartTime ( Date startTime ) {
this . startTime = startTime ;
}
/ * *
* @return the start time when the execute ( ) function was last called
* /
public Date getStartTime ( ) {
return startTime ;
}
/ * *
* Setup the intervals to be processed
* /
protected void initializeIntervals ( ) {
// return if no interval arguments at all
if ( ( argCollection . intervals = = null ) & & ( argCollection . excludeIntervals = = null ) & & ( argCollection . RODToInterval = = null ) )
return ;
// if '-L all' was specified, verify that it was the only -L specified and return if so.
if ( argCollection . intervals ! = null ) {
for ( String interval : argCollection . intervals ) {
if ( interval . trim ( ) . equals ( "all" ) ) {
if ( argCollection . intervals . size ( ) > 1 )
throw new UserException ( "'-L all' was specified along with other intervals or interval lists; the GATK cannot combine '-L all' with other intervals." ) ;
// '-L all' was specified and seems valid. Return.
return ;
}
}
}
// if include argument isn't given, create new set of all possible intervals
GenomeLocSortedSet includeSortedSet = ( argCollection . intervals = = null & & argCollection . RODToInterval = = null ?
2011-04-28 02:12:10 +08:00
GenomeLocSortedSet . createSetFromSequenceDictionary ( this . referenceDataSource . getReference ( ) . getSequenceDictionary ( ) ) :
Contracts for Java now write for GenomeLoc and GenomeLocParser. The semantics of GenomeLoc are now much clearer. It is no longer allowed to create invalid GenomeLocs -- you can only create them with well formed start, end, and contigs, with respect to the mater dictionary. Where one previously created an invalid GenomeLoc, and asked is this valid, you must now provide the raw arguments to helper functions to assess this. Providing bad arguments to GenomeLoc generates UserExceptions now. Added utilty functions contigIsInDictionary and indexIsInDictionary to help with this.
Refactored several Interval utilties from GenomeLocParser to IntervalUtils, as one might expect they go
Removed GenomeLoc.clone() method, as this was not correctly implemented, and actually unnecessary, as GenomeLocs are immutable. Several iterator classes have changed to remove their use of clone()
Removed misc. unnecessary imports
Disabled, temporarily, the validating pileup integration test, as it uses reads mapped to an different reference sequence for ecoli, and this now does not satisfy the contracts for GenomeLoc
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5827 348d0f76-0448-11de-a6fe-93d51630548a
2011-05-20 23:43:27 +08:00
loadIntervals ( argCollection . intervals , IntervalUtils . mergeIntervalLocations ( getRODIntervals ( ) , argCollection . intervalMerging ) ) ) ;
2010-12-21 10:09:46 +08:00
// if no exclude arguments, can return parseIntervalArguments directly
if ( argCollection . excludeIntervals = = null )
intervals = includeSortedSet ;
// otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets
else {
2011-04-28 02:12:10 +08:00
GenomeLocSortedSet excludeSortedSet = loadIntervals ( argCollection . excludeIntervals , null ) ;
2010-12-21 10:09:46 +08:00
intervals = includeSortedSet . subtractRegions ( excludeSortedSet ) ;
// logging messages only printed when exclude (-XL) arguments are given
long toPruneSize = includeSortedSet . coveredSize ( ) ;
long toExcludeSize = excludeSortedSet . coveredSize ( ) ;
long intervalSize = intervals . coveredSize ( ) ;
logger . info ( String . format ( "Initial include intervals span %d loci; exclude intervals span %d loci" , toPruneSize , toExcludeSize ) ) ;
logger . info ( String . format ( "Excluding %d loci from original intervals (%.2f%% reduction)" ,
toPruneSize - intervalSize , ( toPruneSize - intervalSize ) / ( 0.01 * toPruneSize ) ) ) ;
}
}
/ * *
* Loads the intervals relevant to the current execution
* @param argList String representation of arguments ; might include ' all ' , filenames , intervals in samtools
2011-04-28 02:12:10 +08:00
* notation , or a combination of the above
* @param rodIntervals a list of ROD intervals to add to the returned set . Can be empty or null .
2010-12-21 10:09:46 +08:00
* @return A sorted , merged list of all intervals specified in this arg list .
* /
2011-04-28 02:12:10 +08:00
protected GenomeLocSortedSet loadIntervals ( List < String > argList , List < GenomeLoc > rodIntervals ) {
2010-12-21 10:09:46 +08:00
2011-04-28 02:12:10 +08:00
boolean allowEmptyIntervalList = ( argCollection . unsafe = = ValidationExclusion . TYPE . ALLOW_EMPTY_INTERVAL_LIST | |
argCollection . unsafe = = ValidationExclusion . TYPE . ALL ) ;
List < GenomeLoc > nonRODIntervals = IntervalUtils . parseIntervalArguments ( genomeLocParser , argList , allowEmptyIntervalList ) ;
List < GenomeLoc > allIntervals = IntervalUtils . mergeListsBySetOperator ( rodIntervals , nonRODIntervals , argCollection . BTIMergeRule ) ;
return IntervalUtils . sortAndMergeIntervals ( genomeLocParser , allIntervals , argCollection . intervalMerging ) ;
2010-12-21 10:09:46 +08:00
}
/ * *
* if we have a ROD specified as a ' rodToIntervalTrackName ' , convert its records to RODs
* @return ROD intervals as GenomeLocs
* /
2011-04-28 02:12:10 +08:00
private List < GenomeLoc > getRODIntervals ( ) {
2010-12-21 10:09:46 +08:00
Map < String , ReferenceOrderedDataSource > rodNames = RMDIntervalGenerator . getRMDTrackNames ( rodDataSources ) ;
// Do we have any RODs that overloaded as interval lists with the 'rodToIntervalTrackName' flag?
List < GenomeLoc > ret = new ArrayList < GenomeLoc > ( ) ;
if ( rodNames ! = null & & argCollection . RODToInterval ! = null ) {
String rodName = argCollection . RODToInterval ;
// check to make sure we have a rod of that name
if ( ! rodNames . containsKey ( rodName ) )
throw new UserException . CommandLineException ( "--rodToIntervalTrackName (-BTI) was passed the name '" + rodName + "', which wasn't given as a ROD name in the -B option" ) ;
for ( String str : rodNames . keySet ( ) )
if ( str . equals ( rodName ) ) {
logger . info ( "Adding interval list from track (ROD) named " + rodName ) ;
2010-12-31 12:52:22 +08:00
RMDIntervalGenerator intervalGenerator = new RMDIntervalGenerator ( rodNames . get ( str ) ) ;
2010-12-21 10:09:46 +08:00
ret . addAll ( intervalGenerator . toGenomeLocList ( ) ) ;
}
}
return ret ;
}
/**
 * Add additional, externally managed IO streams for inputs.
 *
 * @param argumentSource Field into which to inject the value.
 * @param value Instance to inject.
 */
public void addInput(ArgumentSource argumentSource, Object value) {
    inputs.put(argumentSource, value);
}

/**
 * Add additional, externally managed IO streams for output.
 *
 * @param stub Instance to inject.
 */
public void addOutput(Stub<?> stub) {
    outputs.add(stub);
}
2011-03-14 23:51:19 +08:00
/**
 * Returns the tag associated with a given command-line argument.
 * @param key Object for which to inspect the tag.
 * @return Tags object associated with the given key, or an empty Tag structure if none are present.
 */
public Tags getTags(Object key) {
    return parsingEngine.getTags(key);
}
2010-12-21 10:09:46 +08:00
/**
 * Initializes the engine's data sources in dependency order: the reference first
 * (which also builds the GenomeLocParser), then the reads, the read filters,
 * the sample metadata, and finally the reference-ordered data (ROD) tracks.
 */
protected void initializeDataSources() {
    logger.info("Strictness is " + argCollection.strictnessLevel);

    // TODO -- REMOVE ME
    BAQ.DEFAULT_GOP = argCollection.BAQGOP;

    // Reference must come first: createReadsDataSource and the ROD sources below
    // depend on referenceDataSource and genomeLocParser set up here.
    validateSuppliedReference();
    setReferenceDataSource(argCollection.referenceFile);

    validateSuppliedReads();
    readsDataSource = createReadsDataSource(argCollection, genomeLocParser, referenceDataSource.getReference());

    // Give each read filter a chance to see the fully-configured engine.
    for (ReadFilter filter : filters)
        filter.initialize(this);

    sampleDataSource = new SampleDataSource(getSAMFileHeader(), argCollection.sampleFiles);

    // set the sequence dictionary of all of Tribble tracks to the sequence dictionary of our reference
    rodDataSources = getReferenceOrderedDataSources(referenceMetaDataFiles, referenceDataSource.getReference().getSequenceDictionary(), genomeLocParser, argCollection.unsafe);
}
/**
 * Gets a unique identifier for the reader sourcing this read.
 * @param read Read to examine.
 * @return A unique identifier for the source file of this read. Exception if not found.
 */
public SAMReaderID getReaderIDForRead(final SAMRecord read) {
    return getReadsDataSource().getReaderID(read);
}

/**
 * Gets the source file for this read.
 * @param id Unique identifier determining which input file to use.
 * @return The source filename for this read.
 */
public File getSourceFileForReaderID(final SAMReaderID id) {
    return getReadsDataSource().getSAMFile(id);
}
/ * *
* Returns sets of samples present in the ( merged ) input SAM stream , grouped by readers ( i . e . underlying
* individual bam files ) . For instance : if GATK is run with three input bam files ( three - I arguments ) , then the list
* returned by this method will contain 3 elements ( one for each reader ) , with each element being a set of sample names
* found in the corresponding bam file .
*
* @return Sets of samples in the merged input SAM stream , grouped by readers
* /
public List < Set < String > > getSamplesByReaders ( ) {
2010-12-23 03:00:17 +08:00
Collection < SAMReaderID > readers = getReadsDataSource ( ) . getReaderIDs ( ) ;
2010-12-21 10:09:46 +08:00
List < Set < String > > sample_sets = new ArrayList < Set < String > > ( readers . size ( ) ) ;
for ( SAMReaderID r : readers ) {
Set < String > samples = new HashSet < String > ( 1 ) ;
sample_sets . add ( samples ) ;
for ( SAMReadGroupRecord g : getReadsDataSource ( ) . getHeader ( r ) . getReadGroups ( ) ) {
samples . add ( g . getSample ( ) ) ;
}
}
return sample_sets ;
}
/ * *
* Returns sets of libraries present in the ( merged ) input SAM stream , grouped by readers ( i . e . underlying
* individual bam files ) . For instance : if GATK is run with three input bam files ( three - I arguments ) , then the list
* returned by this method will contain 3 elements ( one for each reader ) , with each element being a set of library names
* found in the corresponding bam file .
*
* @return Sets of libraries present in the ( merged ) input SAM stream , grouped by readers
* /
public List < Set < String > > getLibrariesByReaders ( ) {
2010-12-23 03:00:17 +08:00
Collection < SAMReaderID > readers = getReadsDataSource ( ) . getReaderIDs ( ) ;
2010-12-21 10:09:46 +08:00
List < Set < String > > lib_sets = new ArrayList < Set < String > > ( readers . size ( ) ) ;
for ( SAMReaderID r : readers ) {
Set < String > libs = new HashSet < String > ( 2 ) ;
lib_sets . add ( libs ) ;
for ( SAMReadGroupRecord g : getReadsDataSource ( ) . getHeader ( r ) . getReadGroups ( ) ) {
libs . add ( g . getLibrary ( ) ) ;
}
}
return lib_sets ;
}
/ * *
* * * * * UNLESS YOU HAVE GOOD REASON TO , DO NOT USE THIS METHOD ; USE getFileToReadGroupIdMapping ( ) INSTEAD * * * *
*
* Returns sets of ( remapped ) read groups in input SAM stream , grouped by readers ( i . e . underlying
* individual bam files ) . For instance : if GATK is run with three input bam files ( three - I arguments ) , then the list
* returned by this method will contain 3 elements ( one for each reader ) , with each element being a set of remapped read groups
* ( i . e . as seen by read . getReadGroup ( ) . getReadGroupId ( ) in the merged stream ) that come from the corresponding bam file .
*
* @return sets of ( merged ) read group ids in order of input bams
* /
public List < Set < String > > getMergedReadGroupsByReaders ( ) {
2010-12-23 03:00:17 +08:00
Collection < SAMReaderID > readers = getReadsDataSource ( ) . getReaderIDs ( ) ;
2010-12-21 10:09:46 +08:00
List < Set < String > > rg_sets = new ArrayList < Set < String > > ( readers . size ( ) ) ;
for ( SAMReaderID r : readers ) {
Set < String > groups = new HashSet < String > ( 5 ) ;
rg_sets . add ( groups ) ;
for ( SAMReadGroupRecord g : getReadsDataSource ( ) . getHeader ( r ) . getReadGroups ( ) ) {
if ( getReadsDataSource ( ) . hasReadGroupCollisions ( ) ) { // Check if there were read group clashes with hasGroupIdDuplicates and if so:
// use HeaderMerger to translate original read group id from the reader into the read group id in the
// merged stream, and save that remapped read group id to associate it with specific reader
groups . add ( getReadsDataSource ( ) . getReadGroupId ( r , g . getReadGroupId ( ) ) ) ;
} else {
// otherwise, pass through the unmapped read groups since this is what Picard does as well
groups . add ( g . getReadGroupId ( ) ) ;
}
}
}
return rg_sets ;
}
/ * *
* Now that all files are open , validate the sequence dictionaries of the reads vs . the reference vrs the reference ordered data ( if available ) .
*
* @param reads Reads data source .
* @param reference Reference data source .
2010-12-31 12:52:22 +08:00
* @param rods a collection of the reference ordered data tracks
2010-12-21 10:09:46 +08:00
* /
2010-12-31 12:52:22 +08:00
private void validateSourcesAgainstReference ( SAMDataSource reads , ReferenceSequenceFile reference , Collection < ReferenceOrderedDataSource > rods , RMDTrackBuilder manager ) {
if ( ( reads . isEmpty ( ) & & ( rods = = null | | rods . isEmpty ( ) ) ) | | reference = = null )
2010-12-21 10:09:46 +08:00
return ;
// Compile a set of sequence names that exist in the reference file.
SAMSequenceDictionary referenceDictionary = reference . getSequenceDictionary ( ) ;
if ( ! reads . isEmpty ( ) ) {
// Compile a set of sequence names that exist in the BAM files.
SAMSequenceDictionary readsDictionary = reads . getHeader ( ) . getSequenceDictionary ( ) ;
2011-04-09 02:33:10 +08:00
if ( readsDictionary . size ( ) = = 0 ) {
2010-12-21 10:09:46 +08:00
logger . info ( "Reads file is unmapped. Skipping validation against reference." ) ;
return ;
}
// compare the reads to the reference
SequenceDictionaryUtils . validateDictionaries ( logger , getArguments ( ) . unsafe , "reads" , readsDictionary , "reference" , referenceDictionary ) ;
}
2010-12-31 12:52:22 +08:00
for ( ReferenceOrderedDataSource rod : rods )
manager . validateTrackSequenceDictionary ( rod . getName ( ) , rod . getSequenceDictionary ( ) , referenceDictionary ) ;
2010-12-21 10:09:46 +08:00
}
/ * *
* Gets a data source for the given set of reads .
*
* @return A data source for the given set of reads .
* /
2011-03-17 05:48:47 +08:00
private SAMDataSource createReadsDataSource ( GATKArgumentCollection argCollection , GenomeLocParser genomeLocParser , IndexedFastaSequenceFile refReader ) {
2010-12-21 10:09:46 +08:00
DownsamplingMethod method = getDownsamplingMethod ( ) ;
if ( getWalkerBAQApplicationTime ( ) = = BAQ . ApplicationTime . FORBIDDEN & & argCollection . BAQMode ! = BAQ . CalculationMode . OFF )
throw new UserException . BadArgumentValue ( "baq" , "Walker cannot accept BAQ'd base qualities, and yet BAQ mode " + argCollection . BAQMode + " was requested." ) ;
2011-03-17 05:48:47 +08:00
SAMDataSource dataSource = new SAMDataSource (
2010-12-23 03:00:17 +08:00
samReaderIDs ,
2010-12-21 10:09:46 +08:00
genomeLocParser ,
argCollection . useOriginalBaseQualities ,
argCollection . strictnessLevel ,
argCollection . readBufferSize ,
method ,
new ValidationExclusion ( Arrays . asList ( argCollection . unsafe ) ) ,
filters ,
includeReadsWithDeletionAtLoci ( ) ,
generateExtendedEvents ( ) ,
getWalkerBAQApplicationTime ( ) = = BAQ . ApplicationTime . ON_INPUT ? argCollection . BAQMode : BAQ . CalculationMode . OFF ,
getWalkerBAQQualityMode ( ) ,
2011-01-06 06:25:08 +08:00
refReader ,
2011-04-15 22:22:07 +08:00
argCollection . defaultBaseQualities ,
! argCollection . disableLowMemorySharding ) ;
2011-03-17 05:48:47 +08:00
return dataSource ;
2010-12-21 10:09:46 +08:00
}
/**
 * Opens a reference sequence file paired with an index and rebuilds the engine's
 * GenomeLocParser from it. Only public for testing purposes.
 *
 * @param refFile Handle to a reference sequence file. Non-null.
 */
public void setReferenceDataSource(File refFile) {
    this.referenceDataSource = new ReferenceDataSource(refFile);
    // The parser must track the current reference's sequence dictionary.
    genomeLocParser = new GenomeLocParser(referenceDataSource.getReference());
}
/ * *
* Open the reference - ordered data sources .
*
2010-12-31 12:52:22 +08:00
* @param referenceMetaDataFiles collection of RMD descriptors to load and validate .
* @param sequenceDictionary GATK - wide sequnce dictionary to use for validation .
* @param genomeLocParser to use when creating and validating GenomeLocs .
* @param validationExclusionType potentially indicate which validations to include / exclude .
*
2010-12-21 10:09:46 +08:00
* @return A list of reference - ordered data sources .
* /
2010-12-31 12:52:22 +08:00
private List < ReferenceOrderedDataSource > getReferenceOrderedDataSources ( Collection < RMDTriplet > referenceMetaDataFiles ,
SAMSequenceDictionary sequenceDictionary ,
GenomeLocParser genomeLocParser ,
ValidationExclusion . TYPE validationExclusionType ) {
2011-01-15 02:52:21 +08:00
RMDTrackBuilder builder = new RMDTrackBuilder ( sequenceDictionary , genomeLocParser , validationExclusionType ) ;
2010-12-31 12:52:22 +08:00
2010-12-21 10:09:46 +08:00
List < ReferenceOrderedDataSource > dataSources = new ArrayList < ReferenceOrderedDataSource > ( ) ;
2010-12-31 12:52:22 +08:00
for ( RMDTriplet fileDescriptor : referenceMetaDataFiles )
dataSources . add ( new ReferenceOrderedDataSource ( fileDescriptor ,
builder ,
sequenceDictionary ,
2010-12-21 10:09:46 +08:00
genomeLocParser ,
flashbackData ( ) ) ) ;
2010-12-31 12:52:22 +08:00
// validation: check to make sure everything the walker needs is present, and that all sequence dictionaries match.
validateSourcesAgainstReference ( readsDataSource , referenceDataSource . getReference ( ) , dataSources , builder ) ;
2010-12-21 10:09:46 +08:00
return dataSources ;
}
/**
 * Returns the SAM File Header from the input reads' data source file
 * @return the SAM File Header from the input reads' data source file
 */
public SAMFileHeader getSAMFileHeader() {
    return readsDataSource.getHeader();
}

/**
 * Returns the unmerged SAM file header for an individual reader.
 * @param reader The reader.
 * @return Header for that reader.
 */
public SAMFileHeader getSAMFileHeader(SAMReaderID reader) {
    return readsDataSource.getHeader(reader);
}
2011-09-20 22:53:18 +08:00
/**
 * Gets the master sequence dictionary for this GATK engine instance
 * @return a never-null dictionary listing all of the contigs known to this engine instance
 */
public SAMSequenceDictionary getMasterSequenceDictionary() {
    return getReferenceDataSource().getReference().getSequenceDictionary();
}
2010-12-21 10:09:46 +08:00
/**
 * Returns data source object encapsulating all essential info and handlers used to traverse
 * reads; header merger, individual file readers etc can be accessed through the returned
 * data source object.
 *
 * @return the reads data source
 */
public SAMDataSource getReadsDataSource() {
    return this.readsDataSource;
}
/**
 * Sets the collection of GATK main application arguments.
 *
 * @param argCollection the GATK argument collection
 */
public void setArguments(GATKArgumentCollection argCollection) {
    this.argCollection = argCollection;
}

/**
 * Gets the collection of GATK main application arguments.
 *
 * @return the GATK argument collection
 */
public GATKArgumentCollection getArguments() {
    return this.argCollection;
}
/**
 * Get the list of intervals passed to the engine.
 * @return List of intervals, or null if no intervals are in use
 */
public GenomeLocSortedSet getIntervals() {
    return this.intervals;
}
/**
 * Gets the list of filters employed by this engine.
 * @return Collection of filters (actual instances) used by this engine.
 */
public Collection<ReadFilter> getFilters() {
    return this.filters;
}

/**
 * Sets the list of filters employed by this engine.
 * @param filters Collection of filters (actual instances) used by this engine.
 */
public void setFilters(Collection<ReadFilter> filters) {
    this.filters = filters;
}
/**
 * Gets the filter manager for this engine.
 * @return filter manager for this engine.
 */
protected FilterManager getFilterManager() {
    return filterManager;
}

/**
 * Gets the input sources for this engine.
 * @return input sources for this engine.
 */
protected Map<ArgumentSource, Object> getInputs() {
    return inputs;
}

/**
 * Gets the output stubs for this engine.
 * @return output stubs for this engine.
 */
protected Collection<Stub<?>> getOutputs() {
    return outputs;
}
/**
 * Returns data source objects encapsulating all rod data;
 * individual rods can be accessed through the returned data source objects.
 *
 * @return the rods data sources
 */
public List<ReferenceOrderedDataSource> getRodDataSources() {
    return this.rodDataSources;
}
/**
 * Gets cumulative metrics about the entire run to this point.
 * Returns a clone of this snapshot in time.
 * @return cumulative metrics about the entire run at this point, or null if the reads data
 *         source has not been initialized. ReadMetrics object is a unique instance and is
 *         owned by the caller; the caller can do with the object what they wish.
 */
public ReadMetrics getCumulativeMetrics() {
    return readsDataSource == null ? null : readsDataSource.getCumulativeReadMetrics();
}

/**
 * Gets the sample metadata data source for this engine.
 * @return the engine's SampleDataSource.
 */
public SampleDataSource getSampleMetadata() {
    return this.sampleDataSource;
}
/**
 * Get a sample by its ID
 * If an alias is passed in, return the main sample object
 * @param id sample id
 * @return sample Object with this ID
 */
public Sample getSampleById(String id) {
    return sampleDataSource.getSampleById(id);
}

/**
 * Get the sample for a given read group
 * Must first look up ID for read group
 * @param readGroup of sample
 * @return sample object with ID from the read group
 */
public Sample getSampleByReadGroup(SAMReadGroupRecord readGroup) {
    return sampleDataSource.getSampleByReadGroup(readGroup);
}

/**
 * Get a sample for a given read
 * Must first look up read group, and then sample ID for that read group
 * @param read of sample
 * @return sample object of this read
 */
public Sample getSampleByRead(SAMRecord read) {
    return getSampleByReadGroup(read.getReadGroup());
}

/**
 * Get number of sample objects
 * @return size of samples map
 */
public int sampleCount() {
    return sampleDataSource.sampleCount();
}
/**
 * Return all samples with a given family ID
 * Note that this isn't terribly efficient (linear) - it may be worth adding a new family ID data structure for this
 * @param familyId family ID
 * @return Samples with the given family ID
 */
public Set<Sample> getFamily(String familyId) {
    return sampleDataSource.getFamily(familyId);
}

/**
 * Returns all children of a given sample
 * See note on the efficiency of getFamily() - since this depends on getFamily() it's also not efficient
 * @param sample parent sample
 * @return children of the given sample
 */
public Set<Sample> getChildren(Sample sample) {
    return sampleDataSource.getChildren(sample);
}

/**
 * Gets all the samples known to the engine's sample data source.
 * @return collection of all samples.
 */
public Collection<Sample> getSamples() {
    return sampleDataSource.getSamples();
}

/**
 * Takes a list of sample names and returns their corresponding sample objects
 *
 * @param sampleNameList List of sample names
 * @return Corresponding set of samples
 */
public Set<Sample> getSamples(Collection<String> sampleNameList) {
    return sampleDataSource.getSamples(sampleNameList);
}

/**
 * Returns a set of samples that have any value (which could be null) for a given property
 * @param key Property key
 * @return Set of samples with the property
 */
public Set<Sample> getSamplesWithProperty(String key) {
    return sampleDataSource.getSamplesWithProperty(key);
}

/**
 * Returns a set of samples that have a property with a certain value
 * Value must be a string for now - could add a similar method for matching any objects in the future
 *
 * @param key Property key
 * @param value String property value
 * @return Set of samples that match key and value
 */
public Set<Sample> getSamplesWithProperty(String key, String value) {
    return sampleDataSource.getSamplesWithProperty(key, value);
}
/ * *
* Returns a set of sample objects for the sample names in a variant context
*
* @param context Any variant context
* @return a set of the sample objects
* /
public Set < Sample > getSamplesByVariantContext ( VariantContext context ) {
Set < Sample > samples = new HashSet < Sample > ( ) ;
for ( String sampleName : context . getSampleNames ( ) ) {
samples . add ( sampleDataSource . getOrCreateSample ( sampleName ) ) ;
}
return samples ;
}
/**
 * Returns all samples that were referenced in the SAM file
 */
public Set<Sample> getSAMFileSamples() {
    return sampleDataSource.getSAMFileSamples();
}

/**
 * Return a subcontext restricted to samples with a given property key/value
 * Gets the sample names from key/value and relies on VariantContext.subContextFromGenotypes for the filtering
 * @param context VariantContext to filter
 * @param key property key
 * @param value property value (must be string)
 * @return subcontext
 */
public VariantContext subContextFromSampleProperty(VariantContext context, String key, String value) {
    return sampleDataSource.subContextFromSampleProperty(context, key, value);
}

/**
 * Reconstructs an approximate command line from the engine's parsing engine.
 * @param argumentProviders objects whose argument values should be reported.
 * @return map of argument names to their string values.
 */
public Map<String, String> getApproximateCommandLineArguments(Object... argumentProviders) {
    return CommandLineUtils.getApproximateCommandLineArguments(parsingEngine, argumentProviders);
}

/**
 * Renders an approximate command line string from the engine's parsing engine.
 * @param argumentProviders objects whose argument values should be reported.
 * @return a single approximate command-line string.
 */
public String createApproximateCommandLineArgumentString(Object... argumentProviders) {
    return CommandLineUtils.createApproximateCommandLineArgumentString(parsingEngine, argumentProviders);
}
2009-05-11 10:07:20 +08:00
}