2009-06-23 05:11:18 +08:00
/ *
* Copyright ( c ) 2009 The Broad Institute
*
* Permission is hereby granted , free of charge , to any person
* obtaining a copy of this software and associated documentation
* files ( the "Software" ) , to deal in the Software without
* restriction , including without limitation the rights to use ,
* copy , modify , merge , publish , distribute , sublicense , and / or sell
* copies of the Software , and to permit persons to whom the
* Software is furnished to do so , subject to the following
* conditions :
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software .
*
* THE SOFTWARE IS PROVIDED "AS IS" , WITHOUT WARRANTY OF ANY KIND ,
* EXPRESS OR IMPLIED , INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY , FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT . IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM , DAMAGES OR OTHER LIABILITY ,
* WHETHER IN AN ACTION OF CONTRACT , TORT OR OTHERWISE , ARISING
* FROM , OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE .
* /
2009-05-11 10:07:20 +08:00
package org.broadinstitute.sting.gatk ;
2009-05-29 04:13:01 +08:00
import net.sf.picard.reference.ReferenceSequenceFile ;
import net.sf.picard.reference.ReferenceSequenceFileFactory ;
2009-05-11 10:07:20 +08:00
import net.sf.samtools.SAMFileReader ;
import net.sf.samtools.SAMFileReader.ValidationStringency ;
import org.apache.log4j.Logger ;
import org.broadinstitute.sting.gatk.executive.MicroScheduler ;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData ;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum ;
import org.broadinstitute.sting.gatk.traversals.* ;
import org.broadinstitute.sting.gatk.walkers.* ;
2009-06-06 07:34:37 +08:00
import org.broadinstitute.sting.utils.* ;
2009-05-20 07:26:17 +08:00
import org.broadinstitute.sting.utils.cmdLine.ArgumentException ;
2009-05-11 10:07:20 +08:00
import java.util.ArrayList ;
import java.util.List ;
2009-05-12 09:04:18 +08:00
import java.io.File ;
2009-05-11 10:07:20 +08:00
public class GenomeAnalysisEngine {
// our instance of this genome analysis toolkit; it's used by other classes to extract the traversal engine
// TODO: public static without final tends to indicate we're thinking about this the wrong way
public static GenomeAnalysisEngine instance ;
// our traversal engine
private TraversalEngine engine = null ;
// the level of debugging we're using
public boolean DEBUGGING = false ;
// our argument collection
private final GATKArgumentCollection argCollection ;
/** Collection of output streams used by the walker. */
private OutputTracker outputTracker = null ;
/** our log, which we want to capture anything from this class */
private static Logger logger = Logger . getLogger ( GenomeAnalysisEngine . class ) ;
2009-06-05 22:41:42 +08:00
/** the return value from our walker */
private Object walkerReturn = null ;
2009-05-11 10:07:20 +08:00
/ * *
* our constructor , where all the work is done
* < p / >
* legacy traversal types are sent to legacyTraversal function ; as we move more of the traversals to the
* new MicroScheduler class we ' ll be able to delete that function .
*
* @param args the argument collection , where we get all our setup information from
* @param my_walker the walker we ' re running with
* /
2009-05-12 06:05:58 +08:00
public GenomeAnalysisEngine ( GATKArgumentCollection args , Walker my_walker ) {
2009-05-11 10:07:20 +08:00
// validate our parameters
if ( args = = null | | my_walker = = null ) {
throw new StingException ( "Neither the GATKArgumentCollection or the Walker passed to GenomeAnalysisEngine can be null." ) ;
}
// save our argument parameter
this . argCollection = args ;
// make sure our instance variable points to this analysis engine
instance = this ;
// our reference ordered data collection
List < ReferenceOrderedData < ? extends ReferenceOrderedDatum > > rods = new ArrayList < ReferenceOrderedData < ? extends ReferenceOrderedDatum > > ( ) ;
//
// please don't use these in the future, use the new syntax <- if we're not using these please remove them
//
if ( argCollection . DBSNPFile ! = null ) bindConvenienceRods ( "dbSNP" , "dbsnp" , argCollection . DBSNPFile ) ;
if ( argCollection . HAPMAPFile ! = null )
bindConvenienceRods ( "hapmap" , "HapMapAlleleFrequencies" , argCollection . HAPMAPFile ) ;
if ( argCollection . HAPMAPChipFile ! = null )
bindConvenienceRods ( "hapmap-chip" , "GFF" , argCollection . HAPMAPChipFile ) ;
2009-06-06 07:34:37 +08:00
if ( argCollection . intervals ! = null )
2009-06-08 02:19:51 +08:00
bindConvenienceRods ( "interval" , "Intervals" , argCollection . intervals . replaceAll ( "," , "" ) ) ;
2009-05-11 10:07:20 +08:00
// parse out the rod bindings
ReferenceOrderedData . parseBindings ( logger , argCollection . RODBindings , rods ) ;
2009-05-20 07:26:17 +08:00
// Validate the walker inputs against the walker.
validateInputsAgainstWalker ( my_walker , argCollection , rods ) ;
2009-05-11 10:07:20 +08:00
// create the output streams
2009-05-12 09:04:18 +08:00
initializeOutputStreams ( my_walker ) ;
2009-05-11 10:07:20 +08:00
// our microscheduler, which is in charge of running everything
MicroScheduler microScheduler = null ;
// if we're a read or a locus walker, we use the new system. Right now we have complicated
// branching based on the input data, but this should disapear when all the traversals are switched over
2009-06-23 05:11:18 +08:00
if ( my_walker instanceof LocusWalker | | my_walker instanceof ReadWalker | | my_walker instanceof DuplicateWalker ) {
2009-05-11 10:07:20 +08:00
microScheduler = createMicroscheduler ( my_walker , rods ) ;
} else { // we have an old style traversal, once we're done return
legacyTraversal ( my_walker , rods ) ;
return ;
}
// Prepare the sort ordering w.r.t. the sequence dictionary
if ( argCollection . referenceFile ! = null ) {
final ReferenceSequenceFile refFile = ReferenceSequenceFileFactory . getReferenceSequenceFile ( argCollection . referenceFile ) ;
2009-06-22 22:39:41 +08:00
GenomeLocParser . setupRefContigOrdering ( refFile ) ;
2009-05-11 10:07:20 +08:00
}
// Determine the validation stringency. Default to ValidationStringency.STRICT.
ValidationStringency strictness = getValidationStringency ( ) ;
logger . info ( "Strictness is " + strictness ) ;
// perform validation steps that are common to all the engines
2009-06-10 21:39:32 +08:00
genericEngineSetup ( ) ;
2009-05-11 10:07:20 +08:00
// parse out any genomic location they've provided
2009-06-06 07:34:37 +08:00
//List<GenomeLoc> locationsList = setupIntervalRegion();
List < GenomeLoc > locationsList = engine . getLocations ( ) ;
2009-05-27 04:57:46 +08:00
GenomeLocSortedSet locs = null ;
if ( locationsList ! = null )
locs = GenomeLocSortedSet . createSetFromList ( locationsList ) ;
2009-05-11 10:07:20 +08:00
2009-06-05 22:41:42 +08:00
// excute the microscheduler, storing the results
2009-06-08 23:12:24 +08:00
walkerReturn = microScheduler . execute ( my_walker , locs , argCollection . maximumEngineIterations ) ;
2009-05-11 10:07:20 +08:00
}
/ * *
* this is to accomdate the older style traversals , that haven ' t been converted over to the new system . Putting them
* into their own function allows us to deviate in the two behaviors so the new style traversals aren ' t limited to what
* the old style does . As traversals are converted , this function should disappear .
*
2009-06-23 05:11:18 +08:00
* @param my_walker the walker to run
* @param rods the rod tracks to associate with
2009-05-11 10:07:20 +08:00
* /
private void legacyTraversal ( Walker my_walker , List < ReferenceOrderedData < ? extends ReferenceOrderedDatum > > rods ) {
if ( my_walker instanceof LocusWindowWalker ) {
this . engine = new TraverseByLocusWindows ( argCollection . samFiles , argCollection . referenceFile , rods ) ;
} else {
throw new RuntimeException ( "Unexpected walker type: " + my_walker ) ;
}
// Prepare the sort ordering w.r.t. the sequence dictionary
if ( argCollection . referenceFile ! = null ) {
final ReferenceSequenceFile refFile = ReferenceSequenceFileFactory . getReferenceSequenceFile ( argCollection . referenceFile ) ;
2009-06-22 22:39:41 +08:00
GenomeLocParser . setupRefContigOrdering ( refFile ) ;
2009-05-11 10:07:20 +08:00
}
// Determine the validation stringency. Default to ValidationStringency.STRICT.
ValidationStringency strictness = getValidationStringency ( ) ;
logger . info ( "Strictness is " + strictness ) ;
2009-06-10 21:39:32 +08:00
genericEngineSetup ( ) ;
2009-05-11 10:07:20 +08:00
2009-06-05 22:41:42 +08:00
// store the results of the walker run
walkerReturn = engine . traverse ( my_walker ) ;
2009-05-11 10:07:20 +08:00
}
/ * *
* setup a microscheduler
*
* @param my_walker our walker of type LocusWalker
* @param rods the reference order data
* @return a new microscheduler
* /
private MicroScheduler createMicroscheduler ( Walker my_walker , List < ReferenceOrderedData < ? extends ReferenceOrderedDatum > > rods ) {
// the mircoscheduler to return
MicroScheduler microScheduler = null ;
// we need to verify different parameter based on the walker type
if ( my_walker instanceof LocusWalker ) {
// create the MicroScheduler
2009-05-16 05:02:12 +08:00
if ( argCollection . walkAllLoci )
Utils . scareUser ( "Argument --all_loci is deprecated. Please annotate your walker with @By(DataSource.REFERENCE) to perform a by-reference traversal." ) ;
2009-06-10 21:39:32 +08:00
microScheduler = MicroScheduler . create ( my_walker , extractSourceInfoFromArguments ( argCollection ) , argCollection . referenceFile , rods , argCollection . numberOfThreads ) ;
2009-05-11 10:07:20 +08:00
engine = microScheduler . getTraversalEngine ( ) ;
}
2009-06-23 05:11:18 +08:00
else if ( my_walker instanceof ReadWalker | | my_walker instanceof DuplicateWalker ) {
2009-05-11 10:07:20 +08:00
if ( argCollection . referenceFile = = null )
2009-06-23 05:11:18 +08:00
Utils . scareUser ( String . format ( "Read-based traversals require a reference file but none was given" ) ) ;
2009-06-10 21:39:32 +08:00
microScheduler = MicroScheduler . create ( my_walker , extractSourceInfoFromArguments ( argCollection ) , argCollection . referenceFile , rods , argCollection . numberOfThreads ) ;
2009-05-11 10:07:20 +08:00
engine = microScheduler . getTraversalEngine ( ) ;
2009-06-23 05:11:18 +08:00
} else {
Utils . scareUser ( String . format ( "Unable to create the appropriate TraversalEngine for analysis type " + argCollection . analysisName ) ) ;
2009-05-11 10:07:20 +08:00
}
return microScheduler ;
}
/ * *
* commands that get executed for each engine , regardless of the type
* /
2009-06-10 21:39:32 +08:00
private void genericEngineSetup ( ) {
Reads sourceInfo = extractSourceInfoFromArguments ( argCollection ) ;
engine . setStrictness ( sourceInfo . getValidationStringency ( ) ) ;
2009-05-11 10:07:20 +08:00
2009-06-08 23:12:24 +08:00
engine . setMaxReads ( argCollection . maximumEngineIterations ) ;
2009-05-29 22:51:08 +08:00
engine . setFilterZeroMappingQualityReads ( argCollection . filterZeroMappingQualityReads ) ;
2009-05-11 10:07:20 +08:00
2009-05-27 04:57:46 +08:00
// we default interval files over the genome region string
2009-05-12 09:04:18 +08:00
if ( argCollection . intervals ! = null ) {
2009-06-06 07:34:37 +08:00
engine . setLocation ( parseIntervalRegion ( argCollection . intervals , false ) ) ;
2009-05-11 10:07:20 +08:00
}
2009-06-10 21:39:32 +08:00
engine . setReadFilters ( sourceInfo ) ;
2009-05-11 10:07:20 +08:00
engine . setThreadedIO ( argCollection . enabledThreadedIO ) ;
engine . setWalkOverAllSites ( argCollection . walkAllLoci ) ;
engine . initialize ( ) ;
}
/ * *
* setup the interval regions , from either the interval file of the genome region string
*
* @return a list of genomeLoc representing the interval file
* /
2009-06-06 07:34:37 +08:00
public static List < GenomeLoc > parseIntervalRegion ( final String intervalsString , boolean quiet ) {
2009-05-12 23:33:55 +08:00
List < GenomeLoc > locs = null ;
2009-06-06 07:34:37 +08:00
if ( intervalsString ! = null ) {
if ( new File ( intervalsString ) . exists ( ) ) {
if ( ! quiet ) logger . info ( "Intervals argument specifies a file. Loading intervals from file." ) ;
2009-06-22 22:39:41 +08:00
locs = GenomeLocParser . intervalFileToList ( intervalsString ) ;
2009-05-12 23:33:55 +08:00
} else {
2009-06-06 07:34:37 +08:00
if ( ! quiet ) logger . info ( "Intervals argument does not specify a file. Trying to parse it as a simple string." ) ;
2009-06-22 22:39:41 +08:00
locs = GenomeLocParser . parseGenomeLocs ( intervalsString ) ;
2009-05-12 23:33:55 +08:00
}
2009-05-12 09:04:18 +08:00
}
2009-05-12 23:33:55 +08:00
return locs ;
2009-05-11 10:07:20 +08:00
}
2009-06-10 21:39:32 +08:00
/ * *
* Bundles all the source information about the reads into a unified data structure .
* @param argCollection The collection of arguments passed to the engine .
* @return The reads object providing reads source info .
* /
private Reads extractSourceInfoFromArguments ( GATKArgumentCollection argCollection ) {
return new Reads ( argCollection . samFiles ,
getValidationStringency ( ) ,
argCollection . downsampleFraction ,
argCollection . downsampleCoverage ,
argCollection . maximumReadSorts ,
! argCollection . unsafe ,
argCollection . filterZeroMappingQualityReads ) ;
}
2009-05-20 07:26:17 +08:00
private void validateInputsAgainstWalker ( Walker walker ,
GATKArgumentCollection arguments ,
List < ReferenceOrderedData < ? extends ReferenceOrderedDatum > > rods ) {
String walkerName = WalkerManager . getWalkerName ( walker . getClass ( ) ) ;
// Check what the walker says is required against what was provided on the command line.
if ( WalkerManager . isRequired ( walker , DataSource . READS ) & & ( arguments . samFiles = = null | | arguments . samFiles . size ( ) = = 0 ) )
throw new ArgumentException ( String . format ( "Walker %s requires reads but none were provided. If this is incorrect, alter the walker's @Requires annotation." , walkerName ) ) ;
if ( WalkerManager . isRequired ( walker , DataSource . REFERENCE ) & & arguments . referenceFile = = null )
throw new ArgumentException ( String . format ( "Walker %s requires a reference but none was provided. If this is incorrect, alter the walker's @Requires annotation." , walkerName ) ) ;
// Check what the walker says is allowed against what was provided on the command line.
if ( ( arguments . samFiles ! = null & & arguments . samFiles . size ( ) > 0 ) & & ! WalkerManager . isAllowed ( walker , DataSource . READS ) )
throw new ArgumentException ( String . format ( "Walker %s does not allow reads but reads were provided. If this is incorrect, alter the walker's @Allows annotation" , walkerName ) ) ;
if ( arguments . referenceFile ! = null & & ! WalkerManager . isAllowed ( walker , DataSource . REFERENCE ) )
throw new ArgumentException ( String . format ( "Walker %s does not allow a reference but one was provided. If this is incorrect, alter the walker's @Allows annotation" , walkerName ) ) ;
// Check to make sure that all required metadata is present.
List < RMD > allRequired = WalkerManager . getRequiredMetaData ( walker ) ;
for ( RMD required : allRequired ) {
boolean found = false ;
for ( ReferenceOrderedData < ? extends ReferenceOrderedDatum > rod : rods ) {
if ( rod . matches ( required . name ( ) , required . type ( ) ) )
found = true ;
}
if ( ! found )
throw new ArgumentException ( String . format ( "Unable to find reference metadata (%s,%s)" , required . name ( ) , required . type ( ) ) ) ;
}
// Check to see that no forbidden rods are present.
for ( ReferenceOrderedData < ? extends ReferenceOrderedDatum > rod : rods ) {
if ( ! WalkerManager . isAllowed ( walker , rod ) )
throw new ArgumentException ( String . format ( "Walker does not allow access to metadata: %s. If this is correct, change the @Allows metadata" , rod . getName ( ) ) ) ;
}
}
2009-05-11 10:07:20 +08:00
/ * *
* Default to ValidationStringency . STRICT .
*
* @return the validation stringency
* /
private ValidationStringency getValidationStringency ( ) {
2009-06-10 21:39:32 +08:00
ValidationStringency strictness = ValidationStringency . SILENT ;
2009-05-11 10:07:20 +08:00
try {
2009-06-10 21:39:32 +08:00
strictness = Enum . valueOf ( ValidationStringency . class , argCollection . strictnessLevel . toUpperCase ( ) . trim ( ) ) ;
2009-05-11 10:07:20 +08:00
}
catch ( IllegalArgumentException ex ) {
2009-06-10 21:39:32 +08:00
logger . debug ( "Unable to parse strictness from command line. Assuming SILENT" ) ;
2009-05-11 10:07:20 +08:00
}
return strictness ;
}
2009-06-05 16:48:34 +08:00
/ * *
* Default to 5 ( based on research by Alec Wysoker )
*
* @return the BAM compression
* /
public int getBAMCompression ( ) {
return ( argCollection . BAMcompression = = null | |
argCollection . BAMcompression < 1 | |
argCollection . BAMcompression > 8 ) ? 5 : argCollection . BAMcompression ;
}
2009-05-11 10:07:20 +08:00
/ * *
* Convenience function that binds RODs using the old - style command line parser to the new style list for
* a uniform processing .
*
* @param name
* @param type
* @param file
* /
private void bindConvenienceRods ( final String name , final String type , final String file ) {
argCollection . RODBindings . add ( Utils . join ( "," , new String [ ] { name , type , file } ) ) ;
}
/** Initialize the output streams as specified by the user. */
2009-05-12 09:04:18 +08:00
private void initializeOutputStreams ( Walker walker ) {
2009-05-11 10:07:20 +08:00
outputTracker = ( argCollection . outErrFileName ! = null ) ? new OutputTracker ( argCollection . outErrFileName , argCollection . outErrFileName )
: new OutputTracker ( argCollection . outFileName , argCollection . errFileName ) ;
2009-05-12 09:04:18 +08:00
walker . initializeOutputStreams ( outputTracker ) ;
2009-05-11 10:07:20 +08:00
}
/ * *
* Gets the output tracker . Tracks data available to a given walker .
*
* @return The output tracker .
* /
public OutputTracker getOutputTracker ( ) {
return outputTracker ;
}
2009-05-13 02:52:42 +08:00
/ * *
* This function is deprecated in the new traversal engines , if you need to get the header
* please get the engine and then use the getHeader function
*
* @return
* /
@Deprecated
2009-05-11 10:07:20 +08:00
public SAMFileReader getSamReader ( ) {
return this . engine . getSamReader ( ) ;
}
public TraversalEngine getEngine ( ) {
return this . engine ;
}
2009-05-14 21:55:52 +08:00
2009-06-05 22:41:42 +08:00
/** Gets the collection of GATK main application arguments for enhanced walker validation. */
2009-05-14 21:55:52 +08:00
public GATKArgumentCollection getArguments ( ) {
return this . argCollection ;
}
2009-06-05 22:41:42 +08:00
/ * *
* Get ' s the return value of the walker
*
* @return an Object representing the return value of the walker
* /
public Object getWalkerReturn ( ) {
return walkerReturn ;
}
2009-05-11 10:07:20 +08:00
}