2009-06-23 05:11:18 +08:00
/*
 * Copyright (c) 2009 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
2009-05-11 10:07:20 +08:00
package org.broadinstitute.sting.gatk ;
2009-05-29 04:13:01 +08:00
import net.sf.picard.reference.ReferenceSequenceFile ;
import net.sf.picard.reference.ReferenceSequenceFileFactory ;
2009-05-11 10:07:20 +08:00
import org.apache.log4j.Logger ;
import org.broadinstitute.sting.gatk.executive.MicroScheduler ;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData ;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum ;
2009-07-10 05:57:00 +08:00
import org.broadinstitute.sting.gatk.traversals.TraversalEngine ;
2009-05-11 10:07:20 +08:00
import org.broadinstitute.sting.gatk.walkers.* ;
2009-06-06 07:34:37 +08:00
import org.broadinstitute.sting.utils.* ;
2009-05-20 07:26:17 +08:00
import org.broadinstitute.sting.utils.cmdLine.ArgumentException ;
2009-05-11 10:07:20 +08:00
2009-07-10 05:57:00 +08:00
import java.io.File ;
2009-05-11 10:07:20 +08:00
import java.util.ArrayList ;
import java.util.List ;
public class GenomeAnalysisEngine {
// our instance of this genome analysis toolkit; it's used by other classes to extract the traversal engine
// TODO: public static without final tends to indicate we're thinking about this the wrong way
public static GenomeAnalysisEngine instance ;
// our traversal engine
private TraversalEngine engine = null ;
// the level of debugging we're using
public boolean DEBUGGING = false ;
// our argument collection
private final GATKArgumentCollection argCollection ;
/** Collection of output streams used by the walker. */
private OutputTracker outputTracker = null ;
/** our log, which we want to capture anything from this class */
private static Logger logger = Logger . getLogger ( GenomeAnalysisEngine . class ) ;
2009-06-05 22:41:42 +08:00
/** the return value from our walker */
private Object walkerReturn = null ;
2009-05-11 10:07:20 +08:00
/**
 * Our constructor, where all the work is done.
 * <p/>
 * In order, it: checks its inputs, publishes this engine through the static {@code instance} field, rewrites the
 * old-style convenience ROD arguments as ROD bindings, parses the bindings, validates the inputs against the
 * walker, sets up the output streams, creates the microscheduler, sets up contig ordering from the reference
 * (when one was given), initializes the traversal engine, and finally executes the walker, keeping its result
 * for {@link #getWalkerReturn()}.
 *
 * @param args      the argument collection, where we get all our setup information from
 * @param my_walker the walker we're running with
 */
public GenomeAnalysisEngine(GATKArgumentCollection args, Walker my_walker) {
    // neither input is optional
    if (args == null || my_walker == null) {
        throw new StingException("Neither the GATKArgumentCollection or the Walker passed to GenomeAnalysisEngine can be null.");
    }

    // remember the arguments and make the static instance point at this analysis engine
    this.argCollection = args;
    instance = this;

    //
    // please don't use these in the future, use the new syntax <- if we're not using these please remove them
    //
    if (args.DBSNPFile != null) {
        bindConvenienceRods("dbSNP", "dbsnp", args.DBSNPFile);
    }
    if (args.HAPMAPFile != null) {
        bindConvenienceRods("hapmap", "HapMapAlleleFrequencies", args.HAPMAPFile);
    }
    if (args.HAPMAPChipFile != null) {
        bindConvenienceRods("hapmap-chip", "GFF", args.HAPMAPChipFile);
    }

    // TODO: The ROD iterator currently does not understand multiple intervals file. Fix this by cleaning the ROD system.
    if (args.intervals != null && args.intervals.size() == 1) {
        bindConvenienceRods("interval", "Intervals", args.intervals.get(0).replaceAll(",", ""));
    }

    // parse the rod bindings into our reference ordered data collection
    final List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rodList =
            new ArrayList<ReferenceOrderedData<? extends ReferenceOrderedDatum>>();
    ReferenceOrderedData.parseBindings(logger, args.RODBindings, rodList);

    // make sure what was supplied is compatible with what the walker declares
    validateInputsAgainstWalker(my_walker, args, rodList);

    // create the output streams
    initializeOutputStreams(my_walker);

    // the microscheduler is in charge of running everything; creating it also assigns our traversal engine
    final MicroScheduler scheduler = createMicroscheduler(my_walker, rodList);

    // prepare the sort ordering w.r.t. the sequence dictionary
    if (args.referenceFile != null) {
        GenomeLocParser.setupRefContigOrdering(ReferenceSequenceFileFactory.getReferenceSequenceFile(args.referenceFile));
    }

    logger.info("Strictness is " + args.strictnessLevel);

    // perform the setup steps that are common to all the engines
    genericEngineSetup();

    // no intervals means the scheduler is handed a null location set
    final GenomeLocSortedSet locs = (args.intervals != null)
            ? GenomeLocSortedSet.createSetFromList(parseIntervalRegion(args.intervals))
            : null;

    // execute the microscheduler, storing the results
    walkerReturn = scheduler.execute(my_walker, locs, args.maximumEngineIterations);
}
/ * *
* setup a microscheduler
*
* @param my_walker our walker of type LocusWalker
* @param rods the reference order data
2009-07-10 05:57:00 +08:00
*
2009-05-11 10:07:20 +08:00
* @return a new microscheduler
* /
private MicroScheduler createMicroscheduler ( Walker my_walker , List < ReferenceOrderedData < ? extends ReferenceOrderedDatum > > rods ) {
// the mircoscheduler to return
MicroScheduler microScheduler = null ;
// we need to verify different parameter based on the walker type
2009-06-26 06:51:38 +08:00
if ( my_walker instanceof LocusWalker | | my_walker instanceof LocusWindowWalker ) {
2009-05-11 10:07:20 +08:00
// create the MicroScheduler
2009-06-10 21:39:32 +08:00
microScheduler = MicroScheduler . create ( my_walker , extractSourceInfoFromArguments ( argCollection ) , argCollection . referenceFile , rods , argCollection . numberOfThreads ) ;
2009-05-11 10:07:20 +08:00
engine = microScheduler . getTraversalEngine ( ) ;
2009-07-10 05:57:00 +08:00
} else if ( my_walker instanceof ReadWalker | | my_walker instanceof DuplicateWalker ) {
2009-05-11 10:07:20 +08:00
if ( argCollection . referenceFile = = null )
2009-06-23 05:11:18 +08:00
Utils . scareUser ( String . format ( "Read-based traversals require a reference file but none was given" ) ) ;
2009-06-10 21:39:32 +08:00
microScheduler = MicroScheduler . create ( my_walker , extractSourceInfoFromArguments ( argCollection ) , argCollection . referenceFile , rods , argCollection . numberOfThreads ) ;
2009-05-11 10:07:20 +08:00
engine = microScheduler . getTraversalEngine ( ) ;
2009-06-23 05:11:18 +08:00
} else {
2009-07-10 05:57:00 +08:00
Utils . scareUser ( String . format ( "Unable to create the appropriate TraversalEngine for analysis type " + argCollection . analysisName ) ) ;
2009-05-11 10:07:20 +08:00
}
return microScheduler ;
}
2009-07-10 05:57:00 +08:00
/** commands that get executed for each engine, regardless of the type */
2009-06-10 21:39:32 +08:00
private void genericEngineSetup ( ) {
Reads sourceInfo = extractSourceInfoFromArguments ( argCollection ) ;
2009-06-08 23:12:24 +08:00
engine . setMaxReads ( argCollection . maximumEngineIterations ) ;
2009-05-11 10:07:20 +08:00
engine . initialize ( ) ;
}
/ * *
* setup the interval regions , from either the interval file of the genome region string
*
* @return a list of genomeLoc representing the interval file
* /
2009-07-10 05:57:00 +08:00
public static List < GenomeLoc > parseIntervalRegion ( final List < String > intervals ) {
2009-06-26 04:44:23 +08:00
List < GenomeLoc > locs = new ArrayList < GenomeLoc > ( ) ;
2009-07-10 05:57:00 +08:00
for ( String interval : intervals ) {
2009-06-26 04:44:23 +08:00
if ( new File ( interval ) . exists ( ) ) {
locs . addAll ( GenomeLocParser . intervalFileToList ( interval ) ) ;
2009-05-12 23:33:55 +08:00
} else {
2009-06-26 04:44:23 +08:00
locs . addAll ( GenomeLocParser . parseGenomeLocs ( interval ) ) ;
2009-05-12 23:33:55 +08:00
}
2009-06-26 04:44:23 +08:00
2009-05-12 09:04:18 +08:00
}
2009-05-12 23:33:55 +08:00
return locs ;
2009-05-11 10:07:20 +08:00
}
2009-06-10 21:39:32 +08:00
/ * *
* Bundles all the source information about the reads into a unified data structure .
2009-07-10 05:57:00 +08:00
*
2009-06-10 21:39:32 +08:00
* @param argCollection The collection of arguments passed to the engine .
2009-07-10 05:57:00 +08:00
*
2009-06-10 21:39:32 +08:00
* @return The reads object providing reads source info .
* /
2009-07-10 05:57:00 +08:00
2009-06-10 21:39:32 +08:00
private Reads extractSourceInfoFromArguments ( GATKArgumentCollection argCollection ) {
return new Reads ( argCollection . samFiles ,
2009-07-09 04:26:16 +08:00
argCollection . strictnessLevel ,
2009-06-10 21:39:32 +08:00
argCollection . downsampleFraction ,
argCollection . downsampleCoverage ,
! argCollection . unsafe ,
argCollection . filterZeroMappingQualityReads ) ;
}
2009-05-20 07:26:17 +08:00
private void validateInputsAgainstWalker ( Walker walker ,
GATKArgumentCollection arguments ,
List < ReferenceOrderedData < ? extends ReferenceOrderedDatum > > rods ) {
String walkerName = WalkerManager . getWalkerName ( walker . getClass ( ) ) ;
// Check what the walker says is required against what was provided on the command line.
2009-07-10 05:57:00 +08:00
if ( WalkerManager . isRequired ( walker , DataSource . READS ) & & ( arguments . samFiles = = null | | arguments . samFiles . size ( ) = = 0 ) )
throw new ArgumentException ( String . format ( "Walker %s requires reads but none were provided. If this is incorrect, alter the walker's @Requires annotation." , walkerName ) ) ;
if ( WalkerManager . isRequired ( walker , DataSource . REFERENCE ) & & arguments . referenceFile = = null )
throw new ArgumentException ( String . format ( "Walker %s requires a reference but none was provided. If this is incorrect, alter the walker's @Requires annotation." , walkerName ) ) ;
2009-05-20 07:26:17 +08:00
// Check what the walker says is allowed against what was provided on the command line.
2009-07-10 05:57:00 +08:00
if ( ( arguments . samFiles ! = null & & arguments . samFiles . size ( ) > 0 ) & & ! WalkerManager . isAllowed ( walker , DataSource . READS ) )
throw new ArgumentException ( String . format ( "Walker %s does not allow reads but reads were provided. If this is incorrect, alter the walker's @Allows annotation" , walkerName ) ) ;
if ( arguments . referenceFile ! = null & & ! WalkerManager . isAllowed ( walker , DataSource . REFERENCE ) )
throw new ArgumentException ( String . format ( "Walker %s does not allow a reference but one was provided. If this is incorrect, alter the walker's @Allows annotation" , walkerName ) ) ;
2009-05-20 07:26:17 +08:00
// Check to make sure that all required metadata is present.
List < RMD > allRequired = WalkerManager . getRequiredMetaData ( walker ) ;
2009-07-10 05:57:00 +08:00
for ( RMD required : allRequired ) {
2009-05-20 07:26:17 +08:00
boolean found = false ;
2009-07-10 05:57:00 +08:00
for ( ReferenceOrderedData < ? extends ReferenceOrderedDatum > rod : rods ) {
if ( rod . matches ( required . name ( ) , required . type ( ) ) )
2009-05-20 07:26:17 +08:00
found = true ;
}
2009-07-10 05:57:00 +08:00
if ( ! found )
throw new ArgumentException ( String . format ( "Unable to find reference metadata (%s,%s)" , required . name ( ) , required . type ( ) ) ) ;
2009-05-20 07:26:17 +08:00
}
// Check to see that no forbidden rods are present.
2009-07-10 05:57:00 +08:00
for ( ReferenceOrderedData < ? extends ReferenceOrderedDatum > rod : rods ) {
if ( ! WalkerManager . isAllowed ( walker , rod ) )
throw new ArgumentException ( String . format ( "Walker does not allow access to metadata: %s. If this is correct, change the @Allows metadata" , rod . getName ( ) ) ) ;
2009-05-20 07:26:17 +08:00
}
}
2009-06-05 16:48:34 +08:00
/ * *
* Default to 5 ( based on research by Alec Wysoker )
*
* @return the BAM compression
* /
public int getBAMCompression ( ) {
return ( argCollection . BAMcompression = = null | |
2009-07-10 05:57:00 +08:00
argCollection . BAMcompression < 1 | |
2009-06-05 16:48:34 +08:00
argCollection . BAMcompression > 8 ) ? 5 : argCollection . BAMcompression ;
}
2009-05-11 10:07:20 +08:00
/ * *
* Convenience function that binds RODs using the old - style command line parser to the new style list for
* a uniform processing .
*
* @param name
* @param type
* @param file
* /
private void bindConvenienceRods ( final String name , final String type , final String file ) {
argCollection . RODBindings . add ( Utils . join ( "," , new String [ ] { name , type , file } ) ) ;
}
/** Initialize the output streams as specified by the user. */
2009-07-10 05:57:00 +08:00
private void initializeOutputStreams ( Walker walker ) {
2009-05-11 10:07:20 +08:00
outputTracker = ( argCollection . outErrFileName ! = null ) ? new OutputTracker ( argCollection . outErrFileName , argCollection . outErrFileName )
: new OutputTracker ( argCollection . outFileName , argCollection . errFileName ) ;
2009-05-12 09:04:18 +08:00
walker . initializeOutputStreams ( outputTracker ) ;
2009-05-11 10:07:20 +08:00
}
/ * *
* Gets the output tracker . Tracks data available to a given walker .
*
* @return The output tracker .
* /
public OutputTracker getOutputTracker ( ) {
return outputTracker ;
}
/**
 * Gets the traversal engine.
 *
 * @return the traversal engine taken from the microscheduler, or null if none has been assigned
 */
public TraversalEngine getEngine() {
    return engine;
}
2009-05-14 21:55:52 +08:00
2009-06-05 22:41:42 +08:00
/** Gets the collection of GATK main application arguments for enhanced walker validation. */
2009-05-14 21:55:52 +08:00
public GATKArgumentCollection getArguments ( ) {
return this . argCollection ;
}
2009-06-05 22:41:42 +08:00
/ * *
* Get ' s the return value of the walker
*
* @return an Object representing the return value of the walker
* /
public Object getWalkerReturn ( ) {
return walkerReturn ;
}
2009-05-11 10:07:20 +08:00
}