2009-06-23 05:11:18 +08:00
/*
2010-10-28 03:44:55 +08:00
* Copyright ( c ) 2010 , The Broad Institute
2010-04-20 23:26:32 +08:00
*
2009-06-23 05:11:18 +08:00
* Permission is hereby granted , free of charge , to any person
* obtaining a copy of this software and associated documentation
2010-04-20 23:26:32 +08:00
* files ( the "Software" ) , to deal in the Software without
2009-06-23 05:11:18 +08:00
* restriction , including without limitation the rights to use ,
* copy , modify , merge , publish , distribute , sublicense , and / or sell
* copies of the Software , and to permit persons to whom the
* Software is furnished to do so , subject to the following
* conditions :
2010-04-20 23:26:32 +08:00
*
2009-06-23 05:11:18 +08:00
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software .
2010-04-20 23:26:32 +08:00
* THE SOFTWARE IS PROVIDED "AS IS" , WITHOUT WARRANTY OF ANY KIND ,
2009-06-23 05:11:18 +08:00
* EXPRESS OR IMPLIED , INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY , FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT . IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM , DAMAGES OR OTHER LIABILITY ,
* WHETHER IN AN ACTION OF CONTRACT , TORT OR OTHERWISE , ARISING
2010-10-28 03:44:55 +08:00
* FROM , OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE .
2009-06-23 05:11:18 +08:00
 */
2009-05-11 10:07:20 +08:00
package org.broadinstitute.sting.gatk ;
2009-07-10 07:59:53 +08:00
import net.sf.picard.filter.SamRecordFilter ;
2010-12-21 10:09:46 +08:00
import net.sf.picard.reference.IndexedFastaSequenceFile ;
2010-04-01 06:39:56 +08:00
import net.sf.picard.reference.ReferenceSequenceFile ;
2010-12-21 10:09:46 +08:00
import net.sf.samtools.* ;
import org.apache.log4j.Logger ;
2010-09-24 07:28:55 +08:00
import org.broadinstitute.sting.commandline.ArgumentException ;
import org.broadinstitute.sting.commandline.ArgumentSource ;
2010-09-28 10:16:25 +08:00
import org.broad.tribble.util.variantcontext.VariantContext ;
2010-12-21 10:09:46 +08:00
import org.broadinstitute.sting.commandline.CommandLineUtils ;
import org.broadinstitute.sting.commandline.ParsingEngine ;
2011-03-14 23:51:19 +08:00
import org.broadinstitute.sting.commandline.Tags ;
2010-04-01 06:39:56 +08:00
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection ;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion ;
2011-02-04 01:59:19 +08:00
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID ;
import org.broadinstitute.sting.gatk.datasources.reads.Shard ;
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource ;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource ;
2010-12-21 10:09:46 +08:00
import org.broadinstitute.sting.gatk.datasources.sample.Sample ;
import org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource ;
2011-02-04 01:59:19 +08:00
import org.broadinstitute.sting.gatk.datasources.reads.MonolithicShardStrategy ;
import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy ;
import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategyFactory ;
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource ;
2009-05-11 10:07:20 +08:00
import org.broadinstitute.sting.gatk.executive.MicroScheduler ;
2010-12-21 10:09:46 +08:00
import org.broadinstitute.sting.gatk.filters.FilterManager ;
import org.broadinstitute.sting.gatk.filters.ReadGroupBlackListFilter ;
import org.broadinstitute.sting.gatk.filters.SamRecordHeaderFilter ;
2009-08-23 08:56:02 +08:00
import org.broadinstitute.sting.gatk.io.OutputTracker ;
2010-09-24 07:28:55 +08:00
import org.broadinstitute.sting.gatk.io.stubs.Stub ;
2010-04-01 06:39:56 +08:00
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack ;
2010-12-21 10:09:46 +08:00
import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder ;
import org.broadinstitute.sting.gatk.refdata.utils.RMDIntervalGenerator ;
2010-12-23 03:00:17 +08:00
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet ;
2010-04-01 06:39:56 +08:00
import org.broadinstitute.sting.gatk.walkers.* ;
2010-09-24 07:28:55 +08:00
import org.broadinstitute.sting.utils.GenomeLoc ;
import org.broadinstitute.sting.utils.GenomeLocParser ;
import org.broadinstitute.sting.utils.GenomeLocSortedSet ;
2010-12-21 10:09:46 +08:00
import org.broadinstitute.sting.utils.SequenceDictionaryUtils ;
BAQ calculation refactoring in the GATK. Single -baq argument can be NONE, CALCULATE_AS_NECESSARY, and RECALCULATE. Walkers can control bia the @BAQMode annotation how the BAQ calculation is applied. Can either be as a tag, by overwriting the qualities scores, or by only returning the baq-capped qualities scores. Additionally, walkers can be set up to have the BAQ applied to the incoming reads (ON_INPUT, the default), to output reads (ON_OUTPUT), or HANDLED_BY_WALKER, which means that calling into the BAQ system is the responsibility of the individual walker.
SAMFileWriterStub now supports BAQ writing as an internal feature. Several walkers have the @BAQMode applied to this, with parameters that I think are reasonable. Please look if you own these walkers, though
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4798 348d0f76-0448-11de-a6fe-93d51630548a
2010-12-07 04:55:52 +08:00
import org.broadinstitute.sting.utils.baq.BAQ ;
2010-09-24 07:28:55 +08:00
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException ;
import org.broadinstitute.sting.utils.exceptions.UserException ;
2010-12-21 10:09:46 +08:00
import org.broadinstitute.sting.utils.interval.IntervalMergingRule ;
import org.broadinstitute.sting.utils.interval.IntervalUtils ;
2009-05-11 10:07:20 +08:00
2010-12-21 10:09:46 +08:00
import java.io.File ;
2009-07-30 00:11:45 +08:00
import java.util.* ;
2009-05-11 10:07:20 +08:00
2010-09-24 07:28:55 +08:00
/**
 * A GenomeAnalysisEngine that runs a specified walker.
 */
2010-12-21 10:09:46 +08:00
public class GenomeAnalysisEngine {
/**
 * Our log, which we want to capture anything from this class.
 */
private static Logger logger = Logger.getLogger(GenomeAnalysisEngine.class);

/**
 * The GATK command-line argument parsing code.
 */
private ParsingEngine parsingEngine;

/**
 * The genomeLocParser can create and parse GenomeLocs.
 */
private GenomeLocParser genomeLocParser;

/**
 * Accessor for sharded read data.
 */
private SAMDataSource readsDataSource = null;

/**
 * Accessor for sharded reference data.
 */
private ReferenceDataSource referenceDataSource = null;

/**
 * Accessor for sample metadata.
 */
private SampleDataSource sampleDataSource = null;

/**
 * Accessor for sharded reference-ordered data.
 */
private List<ReferenceOrderedDataSource> rodDataSources;

// Our argument collection; populated before execute() is invoked.
private GATKArgumentCollection argCollection;

/**
 * Collection of intervals used by the engine.  null indicates that no interval
 * arguments were supplied (or that -L all was specified) -- see validateSuppliedIntervals().
 */
private GenomeLocSortedSet intervals = null;
2011-04-05 02:41:55 +08:00
/**
 * Explicitly assign the interval set to use for this traversal (for unit testing purposes).
 *
 * @param intervals set of intervals to use for this traversal
 */
public void setIntervals(GenomeLocSortedSet intervals) {
    this.intervals = intervals;
}
2010-12-21 10:09:46 +08:00
/ * *
* Collection of inputs used by the engine .
* /
private Map < ArgumentSource , Object > inputs = new HashMap < ArgumentSource , Object > ( ) ;
/ * *
* Collection of outputs used by the engine .
* /
private Collection < Stub < ? > > outputs = new ArrayList < Stub < ? > > ( ) ;
/ * *
* Collection of the filters applied to the input data .
* /
private Collection < SamRecordFilter > filters ;
2011-01-18 05:23:09 +08:00
/ * *
* A currently hacky unique name for this GATK instance
* /
private String myName = "GATK_" + Math . abs ( new Random ( ) . nextInt ( ) ) ;
2009-10-06 10:45:31 +08:00
/**
 * Our walker manager, used to look up and instantiate walkers.
 */
private final WalkerManager walkerManager = new WalkerManager();

// The walker this engine will run; must be assigned via setWalker() before execute().
private Walker<?, ?> walker;

/**
 * Set the walker for this engine to run.
 *
 * @param walker the walker to run during the traversal
 */
public void setWalker(Walker<?, ?> walker) {
    this.walker = walker;
}
2009-05-11 10:07:20 +08:00
2010-12-23 03:00:17 +08:00
/**
 * A processed collection of SAM reader identifiers.
 */
private Collection<SAMReaderID> samReaderIDs = Collections.emptyList();

/**
 * Set the SAM/BAM files over which to traverse.
 *
 * @param samReaderIDs Collection of ids to use during this traversal.
 */
public void setSAMFileIDs(Collection<SAMReaderID> samReaderIDs) {
    this.samReaderIDs = samReaderIDs;
}

/**
 * Collection of reference metadata files over which to traverse.
 */
private Collection<RMDTriplet> referenceMetaDataFiles;

/**
 * Set the reference metadata files to use for this traversal.
 *
 * @param referenceMetaDataFiles Collection of files and descriptors over which to traverse.
 */
public void setReferenceMetaDataFiles(Collection<RMDTriplet> referenceMetaDataFiles) {
    this.referenceMetaDataFiles = referenceMetaDataFiles;
}
2009-07-10 07:59:53 +08:00
/**
 * Actually run the GATK with the specified walker.
 *
 * Orchestrates a full run in a fixed order: record the start time, validate the
 * argument collection and walker, initialize data sources and intervals, build
 * the microscheduler, prepare the temp directory and output streams, compute the
 * sharding strategy, and finally execute the traversal.
 *
 * @return the value of this traversal.
 * @throws ReviewedStingException if the argument collection or the walker was never set.
 */
public Object execute() {
    //HeapSizeMonitor monitor = new HeapSizeMonitor();
    //monitor.start();
    setStartTime(new java.util.Date());

    // validate our parameters
    if (this.getArguments() == null) {
        throw new ReviewedStingException("The GATKArgumentCollection passed to GenomeAnalysisEngine can not be null.");
    }

    // validate our parameters
    if (this.walker == null)
        throw new ReviewedStingException("The walker passed to GenomeAnalysisEngine can not be null.");

    // Prepare the data for traversal.
    initializeDataSources();

    // initialize and validate the interval list
    initializeIntervals();
    validateSuppliedIntervals();

    // our microscheduler, which is in charge of running everything
    MicroScheduler microScheduler = createMicroscheduler();

    // create temp directories as necessary
    initializeTempDirectory();

    // create the output streams
    initializeOutputStreams(microScheduler.getOutputTracker());

    ShardStrategy shardStrategy = getShardStrategy(readsDataSource, microScheduler.getReference(), intervals);

    // execute the microscheduler, storing the results
    Object result = microScheduler.execute(this.walker, shardStrategy);

    //monitor.stop();
    //logger.info(String.format("Maximum heap size consumed: %d",monitor.getMaxMemoryUsed()));
    return result;
}
2009-07-10 07:59:53 +08:00
/**
 * Retrieves an instance of the walker based on the walker name.
 *
 * @param walkerName Name of the walker.  Must not be null.  If the walker cannot be instantiated, an exception will be thrown.
 * @return An instance of the walker.
 */
public Walker<?, ?> getWalkerByName(String walkerName) {
    return walkerManager.createByName(walkerName);
}

/**
 * Gets the name of a given walker type.
 *
 * @param walkerType Type of walker.
 * @return Name of the walker.
 */
public String getWalkerName(Class<? extends Walker> walkerType) {
    return walkerManager.getName(walkerType);
}

/**
 * @return the (currently hacky) unique name of this GATK instance.
 */
public String getName() {
    return myName;
}
2009-11-11 07:36:17 +08:00
/ * *
* Gets a list of the filters to associate with the given walker . Will NOT initialize the engine with this filters ;
* the caller must handle that directly .
* @return A collection of available filters .
* /
2010-09-25 10:49:30 +08:00
public Collection < SamRecordFilter > createFilters ( ) {
2010-03-18 00:24:10 +08:00
Set < SamRecordFilter > filters = new HashSet < SamRecordFilter > ( ) ;
2010-09-24 07:28:55 +08:00
filters . addAll ( WalkerManager . getReadFilters ( walker , this . getFilterManager ( ) ) ) ;
2010-12-21 10:09:46 +08:00
if ( this . getArguments ( ) . readGroupBlackList ! = null & & this . getArguments ( ) . readGroupBlackList . size ( ) > 0 )
filters . add ( new ReadGroupBlackListFilter ( this . getArguments ( ) . readGroupBlackList ) ) ;
for ( String filterName : this . getArguments ( ) . readFilters )
filters . add ( this . getFilterManager ( ) . createByName ( filterName ) ) ;
2009-11-11 07:36:17 +08:00
return Collections . unmodifiableSet ( filters ) ;
}
2009-12-05 07:24:29 +08:00
/**
 * Allow subclasses and others within this package direct access to the walker manager.
 *
 * @return The walker manager used by this package.
 */
protected WalkerManager getWalkerManager() {
    return walkerManager;
}
2010-09-24 07:28:55 +08:00
2009-05-11 10:07:20 +08:00
/**
 * Setup a microscheduler.
 *
 * @return a new microscheduler
 * @throws UserException.CommandLineException if a read-based walker was requested without a reference file.
 */
private MicroScheduler createMicroscheduler() {
    // Temporarily require all walkers to have a reference, even if that reference is not conceptually necessary.
    if ((walker instanceof ReadWalker || walker instanceof DuplicateWalker || walker instanceof ReadPairWalker) &&
            this.getArguments().referenceFile == null) {
        throw new UserException.CommandLineException("Read-based traversals require a reference file but none was given");
    }

    return MicroScheduler.create(this, walker, this.getReadsDataSource(), this.getReferenceDataSource().getReference(), this.getRodDataSources(), this.getArguments().numberOfThreads);
}
2010-09-24 07:28:55 +08:00
protected DownsamplingMethod getDownsamplingMethod ( ) {
GATKArgumentCollection argCollection = this . getArguments ( ) ;
DownsamplingMethod method ;
2010-08-27 05:38:03 +08:00
if ( argCollection . getDownsamplingMethod ( ) ! = null )
method = argCollection . getDownsamplingMethod ( ) ;
2010-05-19 13:40:05 +08:00
else if ( WalkerManager . getDownsamplingMethod ( walker ) ! = null )
method = WalkerManager . getDownsamplingMethod ( walker ) ;
else
2010-08-27 05:38:03 +08:00
method = argCollection . getDefaultDownsamplingMethod ( ) ;
2010-09-24 07:28:55 +08:00
return method ;
}
/**
 * @return the BAQ quality mode declared by the walker's @BAQMode annotation
 *         (how BAQ-adjusted qualities are applied: tag, overwrite, or return-only).
 */
public BAQ.QualityMode getWalkerBAQQualityMode() { return WalkerManager.getBAQQualityMode(walker); }

/**
 * @return the BAQ application time declared by the walker's @BAQMode annotation
 *         (on input reads, on output reads, or handled by the walker itself).
 */
public BAQ.ApplicationTime getWalkerBAQApplicationTime() { return WalkerManager.getBAQApplicationTime(walker); }
2010-09-24 07:28:55 +08:00
/**
 * @return whether extended events should be generated; delegates to the walker.
 */
protected boolean generateExtendedEvents() {
    return walker.generateExtendedEvents();
}

/**
 * @return whether reads with deletions at a locus should be included; delegates to the walker.
 */
protected boolean includeReadsWithDeletionAtLoci() {
    return walker.includeReadsWithDeletionAtLoci();
}
2009-07-30 07:00:15 +08:00
/**
 * Verifies that the supplied set of reads files mesh with what the walker says it requires,
 * and also makes sure that there were no duplicate SAM files specified on the command line.
 *
 * @throws ArgumentException if reads are required but missing, supplied but disallowed,
 *         or any SAM file was listed more than once.
 */
protected void validateSuppliedReads() {
    GATKArgumentCollection arguments = this.getArguments();

    // Check what the walker says is required against what was provided on the command line.
    if (WalkerManager.isRequired(walker, DataSource.READS) && (arguments.samFiles == null || arguments.samFiles.size() == 0))
        throw new ArgumentException("Walker requires reads but none were provided.");

    // Check what the walker says is allowed against what was provided on the command line.
    if ((arguments.samFiles != null && arguments.samFiles.size() > 0) && !WalkerManager.isAllowed(walker, DataSource.READS))
        throw new ArgumentException("Walker does not allow reads but reads were provided.");

    // Make sure no SAM files were specified multiple times by the user.
    checkForDuplicateSamFiles();
}
/ * *
* Checks whether there are SAM files that appear multiple times in the fully unpacked list of
* SAM files ( samReaderIDs ) . If there are , throws an ArgumentException listing the files in question .
* /
protected void checkForDuplicateSamFiles ( ) {
Set < SAMReaderID > encounteredSamFiles = new HashSet < SAMReaderID > ( ) ;
Set < String > duplicateSamFiles = new LinkedHashSet < String > ( ) ;
for ( SAMReaderID samFile : samReaderIDs ) {
if ( encounteredSamFiles . contains ( samFile ) ) {
duplicateSamFiles . add ( samFile . getSamFilePath ( ) ) ;
}
else {
encounteredSamFiles . add ( samFile ) ;
}
}
if ( duplicateSamFiles . size ( ) > 0 ) {
throw new ArgumentException ( "The following BAM files appear multiple times in the list of input files: " +
duplicateSamFiles + " BAM files may be specified at most once." ) ;
}
2009-07-30 07:00:15 +08:00
}
/**
 * Verifies that the supplied reference file meshes with what the walker says it requires.
 *
 * @throws ArgumentException if a reference is required but missing, or supplied but disallowed.
 */
protected void validateSuppliedReference() {
    GATKArgumentCollection arguments = this.getArguments();
    // Check what the walker says is required against what was provided on the command line.
    // TODO: Temporarily disabling WalkerManager.isRequired check on the reference because the reference is always required.
    if (/*WalkerManager.isRequired(walker, DataSource.REFERENCE) &&*/ arguments.referenceFile == null)
        throw new ArgumentException("Walker requires a reference but none was provided.");

    // Check what the walker says is allowed against what was provided on the command line.
    if (arguments.referenceFile != null && !WalkerManager.isAllowed(walker, DataSource.REFERENCE))
        throw new ArgumentException("Walker does not allow a reference but one was provided.");
}
2009-05-20 07:26:17 +08:00
2009-07-30 07:00:15 +08:00
/ * *
* Verifies that all required reference - ordered data has been supplied , and any reference - ordered data that was not
* ' allowed ' is still present .
2009-10-06 10:45:31 +08:00
*
* @param rods Reference - ordered data to load .
2009-07-30 07:00:15 +08:00
* /
2010-12-31 12:52:22 +08:00
protected void validateSuppliedReferenceOrderedData ( List < ReferenceOrderedDataSource > rods ) {
2009-05-20 07:26:17 +08:00
// Check to make sure that all required metadata is present.
List < RMD > allRequired = WalkerManager . getRequiredMetaData ( walker ) ;
2009-07-10 05:57:00 +08:00
for ( RMD required : allRequired ) {
2009-05-20 07:26:17 +08:00
boolean found = false ;
2010-12-31 12:52:22 +08:00
for ( ReferenceOrderedDataSource rod : rods ) {
2010-05-28 22:52:44 +08:00
if ( rod . matchesNameAndRecordType ( required . name ( ) , required . type ( ) ) )
2009-05-20 07:26:17 +08:00
found = true ;
}
2009-07-10 05:57:00 +08:00
if ( ! found )
Error message fixes for the following issues:
nvjpM4yOwQAu3fNGxi4oXLuVpKn6aAlf,1GL0OuXK2xKQfvbu34tWYgbojSVSLo0l,
ehEGBJOfgc4V7qj8W0Homf5ICuVK5Sm3,cZsreLm1CbY3aYKZhV7DOSvQNwur41zp,
GlrlyGEyP9kJDIRCQNFQp7BGJBXSzdDJ,hyz1uiHXr39ANmdZu9K1epOSX8EL3mDw,
q0n4EucZESCI4LZhQik306zD4VAuH2cb.
Messages:
camrhG5tHzlY9WUSEVpVZGkU1tyJqKb5,s0OX2g7nYRctJxyFoQCa6clac9IsjHyi,
THIAtjllvYNlnTmiMnJEIHd2Ju4gqQIO,jwVk3JYZJNHloW7HO4LeGxFexknqro0v,
BFNRGOGmGGJNNPZqgeF1ikTNFfskbyLc,...
Were fixed in 4392.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4428 348d0f76-0448-11de-a6fe-93d51630548a
2010-10-05 11:37:13 +08:00
throw new ArgumentException ( String . format ( "Walker requires reference metadata to be supplied named '%s' of type '%s', but this metadata was not provided. " +
"Please supply the specified metadata file." , required . name ( ) , required . type ( ) . getSimpleName ( ) ) ) ;
2009-05-20 07:26:17 +08:00
}
// Check to see that no forbidden rods are present.
2010-12-31 12:52:22 +08:00
for ( ReferenceOrderedDataSource rod : rods ) {
2009-07-10 05:57:00 +08:00
if ( ! WalkerManager . isAllowed ( walker , rod ) )
2010-09-23 20:08:27 +08:00
throw new ArgumentException ( String . format ( "Walker of type %s does not allow access to metadata: %s" , walker . getClass ( ) , rod . getName ( ) ) ) ;
2009-07-30 07:00:15 +08:00
}
}
2010-12-15 02:24:18 +08:00
/**
 * Validates the post-processed interval set before traversal begins.
 *
 * @throws ArgumentException if the unmapped region was requested for a non-read walker,
 *         or the -L/-XL combination produced an empty interval set.
 */
protected void validateSuppliedIntervals() {
    // Only read walkers support '-L unmapped' intervals.  Trap and validate any other instances of -L unmapped.
    if (!(walker instanceof ReadWalker)) {
        GenomeLocSortedSet intervals = getIntervals();
        if (intervals != null && getIntervals().contains(GenomeLoc.UNMAPPED))
            throw new ArgumentException("Interval list specifies unmapped region. Only read walkers may include the unmapped region.");
    }

    // If intervals is non-null and empty at this point, it means that the list of intervals to process
    // was filtered down to an empty set (eg., the user specified something like -L chr1 -XL chr1). Since
    // this was very likely unintentional, the user should be informed of this. Note that this is different
    // from the case where intervals == null, which indicates either that there were no interval arguments,
    // or that -L all was specified.
    if (intervals != null && intervals.isEmpty()) {
        throw new ArgumentException("The given combination of -L and -XL options results in an empty set. " +
                "No intervals to process.");
    }
}
2009-07-30 00:11:45 +08:00
/**
 * Get the sharding strategy given a driving data source.
 *
 * When the reads have no index, falls back to monolithic sharding (permitted only
 * with the ALLOW_UNINDEXED_BAM validation exclusion); otherwise selects a shatter
 * strategy based on the walker type (locus, read/duplicate, or read-pair).
 *
 * @param readsDataSource source of the reads; may be null.
 * @param drivingDataSource Data on which to shard.
 * @param intervals intervals to process, or null to cover the whole sequence dictionary.
 * @return the sharding strategy
 */
protected ShardStrategy getShardStrategy(SAMDataSource readsDataSource, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals) {
    ValidationExclusion exclusions = (readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null);
    ReferenceDataSource referenceDataSource = this.getReferenceDataSource();

    // Use monolithic sharding if no index is present. Monolithic sharding is always required for the original
    // sharding system; it's required with the new sharding system only for locus walkers.
    if (readsDataSource != null && !readsDataSource.hasIndex()) {
        if (!exclusions.contains(ValidationExclusion.TYPE.ALLOW_UNINDEXED_BAM))
            throw new UserException.CommandLineException("Cannot process the provided BAM file(s) because they were not indexed. The GATK does offer limited processing of unindexed BAMs in --unsafe mode, but this GATK feature is currently unsupported.");
        if (intervals != null && !argCollection.allowIntervalsWithUnindexedBAM)
            throw new UserException.CommandLineException("Cannot perform interval processing when reads are present but no index is available.");

        Shard.ShardType shardType;
        if (walker instanceof LocusWalker) {
            if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate)
                throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately.");
            shardType = Shard.ShardType.LOCUS;
        }
        else if (walker instanceof ReadWalker || walker instanceof DuplicateWalker || walker instanceof ReadPairWalker)
            shardType = Shard.ShardType.READ;
        else
            throw new UserException.CommandLineException("The GATK cannot currently process unindexed BAM files");

        // With no intervals supplied, cover every contig in the driving sequence dictionary.
        List<GenomeLoc> region;
        if (intervals != null)
            region = intervals.toList();
        else {
            region = new ArrayList<GenomeLoc>();
            for (SAMSequenceRecord sequenceRecord : drivingDataSource.getSequenceDictionary().getSequences())
                region.add(getGenomeLocParser().createGenomeLoc(sequenceRecord.getSequenceName(), 1, sequenceRecord.getSequenceLength()));
        }

        return new MonolithicShardStrategy(getGenomeLocParser(), readsDataSource, shardType, region);
    }

    ShardStrategy shardStrategy = null;
    ShardStrategyFactory.SHATTER_STRATEGY shardType;

    long SHARD_SIZE = 100000L;

    if (walker instanceof LocusWalker) {
        // RodWalkers read little read data per locus, so much larger shards are used.
        if (walker instanceof RodWalker) SHARD_SIZE *= 1000;

        if (intervals != null && !intervals.isEmpty()) {
            if (!readsDataSource.isEmpty() && readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate)
                throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately.");

            shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
                    referenceDataSource.getReference(),
                    ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL,
                    drivingDataSource.getSequenceDictionary(),
                    SHARD_SIZE,
                    getGenomeLocParser(),
                    intervals);
        } else
            shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
                    referenceDataSource.getReference(),
                    ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL,
                    drivingDataSource.getSequenceDictionary(),
                    SHARD_SIZE, getGenomeLocParser());
    } else if (walker instanceof ReadWalker ||
            walker instanceof DuplicateWalker) {
        shardType = ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL;

        if (intervals != null && !intervals.isEmpty()) {
            shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
                    referenceDataSource.getReference(),
                    shardType,
                    drivingDataSource.getSequenceDictionary(),
                    SHARD_SIZE,
                    getGenomeLocParser(),
                    intervals);
        } else {
            shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
                    referenceDataSource.getReference(),
                    shardType,
                    drivingDataSource.getSequenceDictionary(),
                    SHARD_SIZE,
                    getGenomeLocParser());
        }
    } else if (walker instanceof ReadPairWalker) {
        // Read-pair traversals require query name-sorted data and cannot be combined with intervals.
        if (readsDataSource != null && readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.queryname)
            throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.queryname, "Read pair walkers can only walk over query name-sorted data. Please resort your input BAM file.");
        if (intervals != null && !intervals.isEmpty())
            throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals.");

        shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
                referenceDataSource.getReference(),
                ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL,
                drivingDataSource.getSequenceDictionary(),
                SHARD_SIZE,
                getGenomeLocParser());
    } else
        throw new ReviewedStingException("Unable to support walker of type" + walker.getClass().getName());

    return shardStrategy;
}
2010-09-24 07:28:55 +08:00
/**
 * @return true when data "flashback" is needed; currently enabled exactly for read walkers.
 *         NOTE(review): the flashback semantics are defined by this method's callers,
 *         which are not visible in this file -- confirm before relying on this flag.
 */
protected boolean flashbackData() {
    return walker instanceof ReadWalker;
}
2009-05-11 10:07:20 +08:00
2011-01-05 11:07:11 +08:00
/**
 * Create the temp directory if it doesn't exist.
 *
 * NOTE(review): the boolean result of mkdirs() is ignored; if creation fails,
 * the failure will only surface later when temp files are written -- confirm
 * this best-effort behavior is intended.
 */
private void initializeTempDirectory() {
    File tempDir = new File(System.getProperty("java.io.tmpdir"));
    tempDir.mkdirs();
}
2009-07-10 06:10:22 +08:00
/**
 * Initialize the output streams as specified by the user.
 *
 * Registers every engine input and output stub with the tracker, then lets the
 * tracker prepare the walker using the requested validation strictness level.
 *
 * @param outputTracker the tracker supplying the initialization data.
 */
private void initializeOutputStreams(OutputTracker outputTracker) {
    for (Map.Entry<ArgumentSource, Object> input : getInputs().entrySet())
        outputTracker.addInput(input.getKey(), input.getValue());
    for (Stub<?> stub : getOutputs())
        outputTracker.addOutput(stub);

    outputTracker.prepareWalker(walker, getArguments().strictnessLevel);
}
2010-09-28 10:16:25 +08:00
2010-12-21 10:09:46 +08:00
public ReferenceDataSource getReferenceDataSource ( ) {
return referenceDataSource ;
}
public GenomeLocParser getGenomeLocParser ( ) {
return genomeLocParser ;
}
    /**
     * Manage lists of filters.
     */
    private final FilterManager filterManager = new FilterManager();

    // Start time of the most recent execute() call; null until execute() runs.
    private Date startTime = null; // the start time for execution
public void setParser ( ParsingEngine parsingEngine ) {
this . parsingEngine = parsingEngine ;
}
/ * *
* Explicitly set the GenomeLocParser , for unit testing .
* @param genomeLocParser GenomeLocParser to use .
* /
public void setGenomeLocParser ( GenomeLocParser genomeLocParser ) {
this . genomeLocParser = genomeLocParser ;
}
/ * *
* Sets the start time when the execute ( ) function was last called
* @param startTime the start time when the execute ( ) function was last called
* /
protected void setStartTime ( Date startTime ) {
this . startTime = startTime ;
}
/ * *
* @return the start time when the execute ( ) function was last called
* /
public Date getStartTime ( ) {
return startTime ;
}
/ * *
* Setup the intervals to be processed
* /
protected void initializeIntervals ( ) {
// return if no interval arguments at all
if ( ( argCollection . intervals = = null ) & & ( argCollection . excludeIntervals = = null ) & & ( argCollection . RODToInterval = = null ) )
return ;
// if '-L all' was specified, verify that it was the only -L specified and return if so.
if ( argCollection . intervals ! = null ) {
for ( String interval : argCollection . intervals ) {
if ( interval . trim ( ) . equals ( "all" ) ) {
if ( argCollection . intervals . size ( ) > 1 )
throw new UserException ( "'-L all' was specified along with other intervals or interval lists; the GATK cannot combine '-L all' with other intervals." ) ;
// '-L all' was specified and seems valid. Return.
return ;
}
}
}
// if include argument isn't given, create new set of all possible intervals
GenomeLocSortedSet includeSortedSet = ( argCollection . intervals = = null & & argCollection . RODToInterval = = null ?
GenomeLocSortedSet . createSetFromSequenceDictionary ( this . referenceDataSource . getReference ( ) . getSequenceDictionary ( ) ) :
loadIntervals ( argCollection . intervals ,
argCollection . intervalMerging ,
genomeLocParser . mergeIntervalLocations ( checkRODToIntervalArgument ( ) , argCollection . intervalMerging ) ) ) ;
// if no exclude arguments, can return parseIntervalArguments directly
if ( argCollection . excludeIntervals = = null )
intervals = includeSortedSet ;
// otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets
else {
GenomeLocSortedSet excludeSortedSet = loadIntervals ( argCollection . excludeIntervals , argCollection . intervalMerging , null ) ;
intervals = includeSortedSet . subtractRegions ( excludeSortedSet ) ;
// logging messages only printed when exclude (-XL) arguments are given
long toPruneSize = includeSortedSet . coveredSize ( ) ;
long toExcludeSize = excludeSortedSet . coveredSize ( ) ;
long intervalSize = intervals . coveredSize ( ) ;
logger . info ( String . format ( "Initial include intervals span %d loci; exclude intervals span %d loci" , toPruneSize , toExcludeSize ) ) ;
logger . info ( String . format ( "Excluding %d loci from original intervals (%.2f%% reduction)" ,
toPruneSize - intervalSize , ( toPruneSize - intervalSize ) / ( 0.01 * toPruneSize ) ) ) ;
}
}
    /**
     * Loads the intervals relevant to the current execution.
     * @param argList String representation of arguments; might include 'all', filenames, intervals in samtools
     *                notation, or a combination of the above.
     * @param mergingRule Technique to use when merging interval data.
     * @param additionalIntervals a list of additional intervals to add to the returned set.  Can be null.
     * @return A sorted, merged list of all intervals specified in this arg list.
     */
    private GenomeLocSortedSet loadIntervals(List<String> argList,
                                             IntervalMergingRule mergingRule,
                                             List<GenomeLoc> additionalIntervals) {
        // Parse the interval arguments (empty lists are rejected unless the
        // ALLOW_EMPTY_INTERVAL_LIST validation exclusion is active), combine them
        // with the additional (-BTI-derived) intervals using the BTI merge rule as
        // the set operator, then sort and merge under the requested merging rule.
        return IntervalUtils.sortAndMergeIntervals(genomeLocParser, IntervalUtils.mergeListsBySetOperator(additionalIntervals,
                IntervalUtils.parseIntervalArguments(genomeLocParser, argList,
                        this.getArguments().unsafe != ValidationExclusion.TYPE.ALLOW_EMPTY_INTERVAL_LIST),
                argCollection.BTIMergeRule),
                mergingRule);
    }
/ * *
* if we have a ROD specified as a ' rodToIntervalTrackName ' , convert its records to RODs
* @return ROD intervals as GenomeLocs
* /
private List < GenomeLoc > checkRODToIntervalArgument ( ) {
Map < String , ReferenceOrderedDataSource > rodNames = RMDIntervalGenerator . getRMDTrackNames ( rodDataSources ) ;
// Do we have any RODs that overloaded as interval lists with the 'rodToIntervalTrackName' flag?
List < GenomeLoc > ret = new ArrayList < GenomeLoc > ( ) ;
if ( rodNames ! = null & & argCollection . RODToInterval ! = null ) {
String rodName = argCollection . RODToInterval ;
// check to make sure we have a rod of that name
if ( ! rodNames . containsKey ( rodName ) )
throw new UserException . CommandLineException ( "--rodToIntervalTrackName (-BTI) was passed the name '" + rodName + "', which wasn't given as a ROD name in the -B option" ) ;
for ( String str : rodNames . keySet ( ) )
if ( str . equals ( rodName ) ) {
logger . info ( "Adding interval list from track (ROD) named " + rodName ) ;
2010-12-31 12:52:22 +08:00
RMDIntervalGenerator intervalGenerator = new RMDIntervalGenerator ( rodNames . get ( str ) ) ;
2010-12-21 10:09:46 +08:00
ret . addAll ( intervalGenerator . toGenomeLocList ( ) ) ;
}
}
return ret ;
}
/ * *
* Add additional , externally managed IO streams for inputs .
*
* @param argumentSource Field into which to inject the value .
* @param value Instance to inject .
* /
public void addInput ( ArgumentSource argumentSource , Object value ) {
inputs . put ( argumentSource , value ) ;
}
/ * *
* Add additional , externally managed IO streams for output .
*
* @param stub Instance to inject .
* /
public void addOutput ( Stub < ? > stub ) {
outputs . add ( stub ) ;
}
2011-03-14 23:51:19 +08:00
/ * *
* Returns the tag associated with a given command - line argument .
* @param key Object for which to inspect the tag .
* @return Tags object associated with the given key , or an empty Tag structure if none are present .
* /
public Tags getTags ( Object key ) {
return parsingEngine . getTags ( key ) ;
}
2010-12-21 10:09:46 +08:00
protected void initializeDataSources ( ) {
logger . info ( "Strictness is " + argCollection . strictnessLevel ) ;
// TODO -- REMOVE ME
BAQ . DEFAULT_GOP = argCollection . BAQGOP ;
validateSuppliedReference ( ) ;
2011-01-13 02:25:12 +08:00
setReferenceDataSource ( argCollection . referenceFile ) ;
2010-12-21 10:09:46 +08:00
validateSuppliedReads ( ) ;
2011-03-17 05:48:47 +08:00
readsDataSource = createReadsDataSource ( argCollection , genomeLocParser , referenceDataSource . getReference ( ) ) ;
2010-12-21 10:09:46 +08:00
sampleDataSource = new SampleDataSource ( getSAMFileHeader ( ) , argCollection . sampleFiles ) ;
for ( SamRecordFilter filter : filters )
if ( filter instanceof SamRecordHeaderFilter )
( ( SamRecordHeaderFilter ) filter ) . setHeader ( this . getSAMFileHeader ( ) ) ;
sampleDataSource = new SampleDataSource ( getSAMFileHeader ( ) , argCollection . sampleFiles ) ;
// set the sequence dictionary of all of Tribble tracks to the sequence dictionary of our reference
2010-12-31 12:52:22 +08:00
rodDataSources = getReferenceOrderedDataSources ( referenceMetaDataFiles , referenceDataSource . getReference ( ) . getSequenceDictionary ( ) , genomeLocParser , argCollection . unsafe ) ;
2010-12-21 10:09:46 +08:00
}
/ * *
* Gets a unique identifier for the reader sourcing this read .
* @param read Read to examine .
* @return A unique identifier for the source file of this read . Exception if not found .
* /
public SAMReaderID getReaderIDForRead ( final SAMRecord read ) {
return getReadsDataSource ( ) . getReaderID ( read ) ;
}
/ * *
* Gets the source file for this read .
* @param id Unique identifier determining which input file to use .
* @return The source filename for this read .
* /
public File getSourceFileForReaderID ( final SAMReaderID id ) {
return getReadsDataSource ( ) . getSAMFile ( id ) ;
}
/ * *
* Returns sets of samples present in the ( merged ) input SAM stream , grouped by readers ( i . e . underlying
* individual bam files ) . For instance : if GATK is run with three input bam files ( three - I arguments ) , then the list
* returned by this method will contain 3 elements ( one for each reader ) , with each element being a set of sample names
* found in the corresponding bam file .
*
* @return Sets of samples in the merged input SAM stream , grouped by readers
* /
public List < Set < String > > getSamplesByReaders ( ) {
2010-12-23 03:00:17 +08:00
Collection < SAMReaderID > readers = getReadsDataSource ( ) . getReaderIDs ( ) ;
2010-12-21 10:09:46 +08:00
List < Set < String > > sample_sets = new ArrayList < Set < String > > ( readers . size ( ) ) ;
for ( SAMReaderID r : readers ) {
Set < String > samples = new HashSet < String > ( 1 ) ;
sample_sets . add ( samples ) ;
for ( SAMReadGroupRecord g : getReadsDataSource ( ) . getHeader ( r ) . getReadGroups ( ) ) {
samples . add ( g . getSample ( ) ) ;
}
}
return sample_sets ;
}
/ * *
* Returns sets of libraries present in the ( merged ) input SAM stream , grouped by readers ( i . e . underlying
* individual bam files ) . For instance : if GATK is run with three input bam files ( three - I arguments ) , then the list
* returned by this method will contain 3 elements ( one for each reader ) , with each element being a set of library names
* found in the corresponding bam file .
*
* @return Sets of libraries present in the ( merged ) input SAM stream , grouped by readers
* /
public List < Set < String > > getLibrariesByReaders ( ) {
2010-12-23 03:00:17 +08:00
Collection < SAMReaderID > readers = getReadsDataSource ( ) . getReaderIDs ( ) ;
2010-12-21 10:09:46 +08:00
List < Set < String > > lib_sets = new ArrayList < Set < String > > ( readers . size ( ) ) ;
for ( SAMReaderID r : readers ) {
Set < String > libs = new HashSet < String > ( 2 ) ;
lib_sets . add ( libs ) ;
for ( SAMReadGroupRecord g : getReadsDataSource ( ) . getHeader ( r ) . getReadGroups ( ) ) {
libs . add ( g . getLibrary ( ) ) ;
}
}
return lib_sets ;
}
/ * *
* * * * * UNLESS YOU HAVE GOOD REASON TO , DO NOT USE THIS METHOD ; USE getFileToReadGroupIdMapping ( ) INSTEAD * * * *
*
* Returns sets of ( remapped ) read groups in input SAM stream , grouped by readers ( i . e . underlying
* individual bam files ) . For instance : if GATK is run with three input bam files ( three - I arguments ) , then the list
* returned by this method will contain 3 elements ( one for each reader ) , with each element being a set of remapped read groups
* ( i . e . as seen by read . getReadGroup ( ) . getReadGroupId ( ) in the merged stream ) that come from the corresponding bam file .
*
* @return sets of ( merged ) read group ids in order of input bams
* /
public List < Set < String > > getMergedReadGroupsByReaders ( ) {
2010-12-23 03:00:17 +08:00
Collection < SAMReaderID > readers = getReadsDataSource ( ) . getReaderIDs ( ) ;
2010-12-21 10:09:46 +08:00
List < Set < String > > rg_sets = new ArrayList < Set < String > > ( readers . size ( ) ) ;
for ( SAMReaderID r : readers ) {
Set < String > groups = new HashSet < String > ( 5 ) ;
rg_sets . add ( groups ) ;
for ( SAMReadGroupRecord g : getReadsDataSource ( ) . getHeader ( r ) . getReadGroups ( ) ) {
if ( getReadsDataSource ( ) . hasReadGroupCollisions ( ) ) { // Check if there were read group clashes with hasGroupIdDuplicates and if so:
// use HeaderMerger to translate original read group id from the reader into the read group id in the
// merged stream, and save that remapped read group id to associate it with specific reader
groups . add ( getReadsDataSource ( ) . getReadGroupId ( r , g . getReadGroupId ( ) ) ) ;
} else {
// otherwise, pass through the unmapped read groups since this is what Picard does as well
groups . add ( g . getReadGroupId ( ) ) ;
}
}
}
return rg_sets ;
}
    /**
     * Now that all files are open, validate the sequence dictionaries of the reads vs. the reference vs. the reference ordered data (if available).
     *
     * @param reads Reads data source.
     * @param reference Reference data source.
     * @param rods a collection of the reference ordered data tracks
     * @param manager track builder used to validate each ROD's sequence dictionary against the reference.
     */
    private void validateSourcesAgainstReference(SAMDataSource reads, ReferenceSequenceFile reference, Collection<ReferenceOrderedDataSource> rods, RMDTrackBuilder manager) {
        // Nothing to validate when there is no reference, or when neither reads nor RODs are present.
        if ((reads.isEmpty() && (rods == null || rods.isEmpty())) || reference == null)
            return;

        // Compile a set of sequence names that exist in the reference file.
        SAMSequenceDictionary referenceDictionary = reference.getSequenceDictionary();

        if (!reads.isEmpty()) {
            // Compile a set of sequence names that exist in the BAM files.
            SAMSequenceDictionary readsDictionary = reads.getHeader().getSequenceDictionary();

            Set<String> readsSequenceNames = new TreeSet<String>();
            for (SAMSequenceRecord dictionaryEntry : readsDictionary.getSequences())
                readsSequenceNames.add(dictionaryEntry.getSequenceName());

            // An empty reads dictionary means the input is unmapped; dictionary comparison would be meaningless.
            if (readsSequenceNames.size() == 0) {
                logger.info("Reads file is unmapped.  Skipping validation against reference.");
                return;
            }

            // compare the reads to the reference
            SequenceDictionaryUtils.validateDictionaries(logger, getArguments().unsafe, "reads", readsDictionary, "reference", referenceDictionary);
        }

        for (ReferenceOrderedDataSource rod : rods)
            manager.validateTrackSequenceDictionary(rod.getName(), rod.getSequenceDictionary(), referenceDictionary);
    }
    /**
     * Gets a data source for the given set of reads.
     *
     * @param argCollection engine argument collection supplying the read-related options below.
     * @param genomeLocParser parser handed to the data source for GenomeLoc creation.
     * @param refReader indexed reference reader passed to the data source
     *                  (presumably needed when BAQ is applied on input — confirm in SAMDataSource).
     * @return A data source for the given set of reads.
     */
    private SAMDataSource createReadsDataSource(GATKArgumentCollection argCollection, GenomeLocParser genomeLocParser, IndexedFastaSequenceFile refReader) {
        DownsamplingMethod method = getDownsamplingMethod();

        // Reject a BAQ request up front when the walker has declared it cannot accept BAQ'd qualities.
        if (getWalkerBAQApplicationTime() == BAQ.ApplicationTime.FORBIDDEN && argCollection.BAQMode != BAQ.CalculationMode.OFF)
            throw new UserException.BadArgumentValue("baq", "Walker cannot accept BAQ'd base qualities, and yet BAQ mode " + argCollection.BAQMode + " was requested.");

        SAMDataSource dataSource = new SAMDataSource(
                samReaderIDs,
                genomeLocParser,
                argCollection.useOriginalBaseQualities,
                argCollection.strictnessLevel,
                argCollection.readBufferSize,
                method,
                new ValidationExclusion(Arrays.asList(argCollection.unsafe)),
                filters,
                includeReadsWithDeletionAtLoci(),
                generateExtendedEvents(),
                // BAQ is applied on input only when the walker asks for it at that stage; otherwise disabled here.
                getWalkerBAQApplicationTime() == BAQ.ApplicationTime.ON_INPUT ? argCollection.BAQMode : BAQ.CalculationMode.OFF,
                getWalkerBAQQualityMode(),
                refReader,
                argCollection.defaultBaseQualities, argCollection.enableLowMemorySharding);

        return dataSource;
    }
    /**
     * Opens a reference sequence file paired with an index, storing it as the
     * engine-wide reference data source and initializing the GenomeLocParser
     * from it.  Only public for testing purposes.
     *
     * (The previous javadoc advertised "@return A thread-safe file wrapper",
     * but this method is void; the stale tag has been removed.)
     *
     * @param refFile Handle to a reference sequence file.  Non-null.
     */
    public void setReferenceDataSource(File refFile) {
        this.referenceDataSource = new ReferenceDataSource(refFile);
        genomeLocParser = new GenomeLocParser(referenceDataSource.getReference());
    }
/ * *
* Open the reference - ordered data sources .
*
2010-12-31 12:52:22 +08:00
* @param referenceMetaDataFiles collection of RMD descriptors to load and validate .
* @param sequenceDictionary GATK - wide sequnce dictionary to use for validation .
* @param genomeLocParser to use when creating and validating GenomeLocs .
* @param validationExclusionType potentially indicate which validations to include / exclude .
*
2010-12-21 10:09:46 +08:00
* @return A list of reference - ordered data sources .
* /
2010-12-31 12:52:22 +08:00
private List < ReferenceOrderedDataSource > getReferenceOrderedDataSources ( Collection < RMDTriplet > referenceMetaDataFiles ,
SAMSequenceDictionary sequenceDictionary ,
GenomeLocParser genomeLocParser ,
ValidationExclusion . TYPE validationExclusionType ) {
2011-01-15 02:52:21 +08:00
RMDTrackBuilder builder = new RMDTrackBuilder ( sequenceDictionary , genomeLocParser , validationExclusionType ) ;
2010-12-31 12:52:22 +08:00
// try and make the tracks given their requests
// create of live instances of the tracks
List < RMDTrack > tracks = new ArrayList < RMDTrack > ( ) ;
2010-12-21 10:09:46 +08:00
List < ReferenceOrderedDataSource > dataSources = new ArrayList < ReferenceOrderedDataSource > ( ) ;
2010-12-31 12:52:22 +08:00
for ( RMDTriplet fileDescriptor : referenceMetaDataFiles )
dataSources . add ( new ReferenceOrderedDataSource ( fileDescriptor ,
builder ,
sequenceDictionary ,
2010-12-21 10:09:46 +08:00
genomeLocParser ,
flashbackData ( ) ) ) ;
2010-12-31 12:52:22 +08:00
// validation: check to make sure everything the walker needs is present, and that all sequence dictionaries match.
validateSuppliedReferenceOrderedData ( dataSources ) ;
validateSourcesAgainstReference ( readsDataSource , referenceDataSource . getReference ( ) , dataSources , builder ) ;
2010-12-21 10:09:46 +08:00
return dataSources ;
}
/ * *
* Returns the SAM File Header from the input reads ' data source file
* @return the SAM File Header from the input reads ' data source file
* /
public SAMFileHeader getSAMFileHeader ( ) {
return readsDataSource . getHeader ( ) ;
}
/ * *
* Returns the unmerged SAM file header for an individual reader .
* @param reader The reader .
* @return Header for that reader .
* /
public SAMFileHeader getSAMFileHeader ( SAMReaderID reader ) {
return readsDataSource . getHeader ( reader ) ;
}
/ * *
* Returns data source object encapsulating all essential info and handlers used to traverse
* reads ; header merger , individual file readers etc can be accessed through the returned data source object .
*
* @return the reads data source
* /
public SAMDataSource getReadsDataSource ( ) {
return this . readsDataSource ;
}
/ * *
* Sets the collection of GATK main application arguments .
*
* @param argCollection the GATK argument collection
* /
public void setArguments ( GATKArgumentCollection argCollection ) {
this . argCollection = argCollection ;
}
/ * *
* Gets the collection of GATK main application arguments .
*
* @return the GATK argument collection
* /
public GATKArgumentCollection getArguments ( ) {
return this . argCollection ;
}
/ * *
* Get the list of intervals passed to the engine .
* @return List of intervals .
* /
public GenomeLocSortedSet getIntervals ( ) {
return this . intervals ;
}
/ * *
* Gets the list of filters employed by this engine .
* @return Collection of filters ( actual instances ) used by this engine .
* /
public Collection < SamRecordFilter > getFilters ( ) {
return this . filters ;
}
/ * *
* Sets the list of filters employed by this engine .
* @param filters Collection of filters ( actual instances ) used by this engine .
* /
public void setFilters ( Collection < SamRecordFilter > filters ) {
this . filters = filters ;
}
/ * *
* Gets the filter manager for this engine .
* @return filter manager for this engine .
* /
protected FilterManager getFilterManager ( ) {
return filterManager ;
}
/ * *
* Gets the input sources for this engine .
* @return input sources for this engine .
* /
protected Map < ArgumentSource , Object > getInputs ( ) {
return inputs ;
}
/ * *
* Gets the output stubs for this engine .
* @return output stubs for this engine .
* /
protected Collection < Stub < ? > > getOutputs ( ) {
return outputs ;
}
/ * *
* Returns data source objects encapsulating all rod data ;
* individual rods can be accessed through the returned data source objects .
*
* @return the rods data sources
* /
public List < ReferenceOrderedDataSource > getRodDataSources ( ) {
return this . rodDataSources ;
}
/ * *
* Gets cumulative metrics about the entire run to this point .
* @return cumulative metrics about the entire run .
* /
public ReadMetrics getCumulativeMetrics ( ) {
return readsDataSource = = null ? null : readsDataSource . getCumulativeReadMetrics ( ) ;
}
public SampleDataSource getSampleMetadata ( ) {
return this . sampleDataSource ;
}
/ * *
* Get a sample by its ID
* If an alias is passed in , return the main sample object
* @param id sample id
* @return sample Object with this ID
* /
public Sample getSampleById ( String id ) {
return sampleDataSource . getSampleById ( id ) ;
}
/ * *
* Get the sample for a given read group
* Must first look up ID for read group
* @param readGroup of sample
* @return sample object with ID from the read group
* /
public Sample getSampleByReadGroup ( SAMReadGroupRecord readGroup ) {
return sampleDataSource . getSampleByReadGroup ( readGroup ) ;
}
/ * *
* Get a sample for a given read
* Must first look up read group , and then sample ID for that read group
* @param read of sample
* @return sample object of this read
* /
public Sample getSampleByRead ( SAMRecord read ) {
return getSampleByReadGroup ( read . getReadGroup ( ) ) ;
}
/ * *
* Get number of sample objects
* @return size of samples map
* /
public int sampleCount ( ) {
return sampleDataSource . sampleCount ( ) ;
}
/ * *
* Return all samples with a given family ID
* Note that this isn ' t terribly efficient ( linear ) - it may be worth adding a new family ID data structure for this
* @param familyId family ID
* @return Samples with the given family ID
* /
public Set < Sample > getFamily ( String familyId ) {
return sampleDataSource . getFamily ( familyId ) ;
}
/ * *
* Returns all children of a given sample
* See note on the efficiency of getFamily ( ) - since this depends on getFamily ( ) it ' s also not efficient
* @param sample parent sample
* @return children of the given sample
* /
public Set < Sample > getChildren ( Sample sample ) {
return sampleDataSource . getChildren ( sample ) ;
}
    /**
     * Gets all the samples.
     * @return all samples known to the sample data source.
     */
    public Collection<Sample> getSamples() {
        return sampleDataSource.getSamples();
    }
/ * *
* Takes a list of sample names and returns their corresponding sample objects
*
* @param sampleNameList List of sample names
* @return Corresponding set of samples
* /
public Set < Sample > getSamples ( Collection < String > sampleNameList ) {
return sampleDataSource . getSamples ( sampleNameList ) ;
}
/ * *
* Returns a set of samples that have any value ( which could be null ) for a given property
* @param key Property key
* @return Set of samples with the property
* /
public Set < Sample > getSamplesWithProperty ( String key ) {
return sampleDataSource . getSamplesWithProperty ( key ) ;
}
/ * *
* Returns a set of samples that have a property with a certain value
* Value must be a string for now - could add a similar method for matching any objects in the future
*
* @param key Property key
* @param value String property value
* @return Set of samples that match key and value
* /
public Set < Sample > getSamplesWithProperty ( String key , String value ) {
return sampleDataSource . getSamplesWithProperty ( key , value ) ;
}
/ * *
* Returns a set of sample objects for the sample names in a variant context
*
* @param context Any variant context
* @return a set of the sample objects
* /
public Set < Sample > getSamplesByVariantContext ( VariantContext context ) {
Set < Sample > samples = new HashSet < Sample > ( ) ;
for ( String sampleName : context . getSampleNames ( ) ) {
samples . add ( sampleDataSource . getOrCreateSample ( sampleName ) ) ;
}
return samples ;
}
    /**
     * Returns all samples that were referenced in the SAM file.
     * @return the samples referenced in the SAM file, as reported by the sample data source.
     */
    public Set<Sample> getSAMFileSamples() {
        return sampleDataSource.getSAMFileSamples();
    }
/ * *
* Return a subcontext restricted to samples with a given property key / value
* Gets the sample names from key / value and relies on VariantContext . subContextFromGenotypes for the filtering
* @param context VariantContext to filter
* @param key property key
* @param value property value ( must be string )
* @return subcontext
* /
public VariantContext subContextFromSampleProperty ( VariantContext context , String key , String value ) {
return sampleDataSource . subContextFromSampleProperty ( context , key , value ) ;
}
    /**
     * Reconstructs an approximation of this run's command-line arguments by
     * delegating to CommandLineUtils with this engine's parsing engine.
     * @param argumentProviders objects whose argument fields should be inspected.
     * @return map of argument name to (approximate) string value.
     */
    public Map<String, String> getApproximateCommandLineArguments(Object... argumentProviders) {
        return CommandLineUtils.getApproximateCommandLineArguments(parsingEngine, argumentProviders);
    }
    /**
     * Builds a single approximate command-line string for this run by delegating
     * to CommandLineUtils with this engine's parsing engine.
     * @param argumentProviders objects whose argument fields should be inspected.
     * @return an approximate command-line string.
     */
    public String createApproximateCommandLineArgumentString(Object... argumentProviders) {
        return CommandLineUtils.createApproximateCommandLineArgumentString(parsingEngine, argumentProviders);
    }
2009-05-11 10:07:20 +08:00
}