2009-06-23 05:11:18 +08:00
/*
 * Copyright (c) 2009 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
2009-04-27 01:42:00 +08:00
package org.broadinstitute.sting.gatk.executive ;
import org.apache.log4j.Logger ;
2009-06-23 05:11:18 +08:00
import org.broadinstitute.sting.gatk.datasources.shards.Shard ;
2009-06-12 02:13:22 +08:00
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategy ;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource ;
2009-06-23 05:11:18 +08:00
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource ;
2010-08-25 11:47:57 +08:00
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMReaderID ;
2009-06-26 06:51:38 +08:00
import org.broadinstitute.sting.gatk.traversals.* ;
2009-06-23 05:11:18 +08:00
import org.broadinstitute.sting.gatk.walkers.* ;
2009-08-23 08:56:02 +08:00
import org.broadinstitute.sting.gatk.io.OutputTracker ;
2010-02-25 08:16:50 +08:00
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator ;
import org.broadinstitute.sting.gatk.iterators.NullSAMIterator ;
2010-08-12 04:17:11 +08:00
import org.broadinstitute.sting.gatk.ReadProperties ;
2010-02-25 08:16:50 +08:00
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine ;
2010-08-12 04:17:11 +08:00
import org.broadinstitute.sting.gatk.ReadMetrics ;
2009-04-27 01:42:00 +08:00
2009-07-14 04:42:12 +08:00
import java.util.* ;
2009-06-23 05:11:18 +08:00
2010-07-01 12:40:31 +08:00
import net.sf.picard.reference.IndexedFastaSequenceFile ;
2010-09-12 22:02:43 +08:00
import org.broadinstitute.sting.utils.exceptions.UserException ;
2010-07-01 12:40:31 +08:00
2009-04-27 01:42:00 +08:00
/**
 * Created by IntelliJ IDEA.
 * User: mhanna
 * Date: Apr 26, 2009
 * Time: 12:37:23 PM
 * To change this template use File | Settings | File Templates.
 */
2009-04-27 07:08:12 +08:00
2009-05-28 02:24:31 +08:00
/** Shards and schedules data in manageable chunks. */
2009-04-27 01:42:00 +08:00
public abstract class MicroScheduler {
2009-04-27 01:46:52 +08:00
protected static Logger logger = Logger . getLogger ( MicroScheduler . class ) ;
2009-04-27 01:42:00 +08:00
2010-02-25 08:16:50 +08:00
/ * *
* The engine invoking this scheduler .
* /
protected final GenomeAnalysisEngine engine ;
2009-05-09 05:27:54 +08:00
protected final TraversalEngine traversalEngine ;
protected final IndexedFastaSequenceFile reference ;
2009-05-08 08:58:37 +08:00
2009-05-09 05:27:54 +08:00
private final SAMDataSource reads ;
2010-02-25 08:16:50 +08:00
protected final Collection < ReferenceOrderedDataSource > rods ;
2009-05-08 08:58:37 +08:00
2009-04-27 07:08:12 +08:00
/ * *
* MicroScheduler factory function . Create a microscheduler appropriate for reducing the
* selected walker .
2009-06-23 05:11:18 +08:00
*
* @param walker Which walker to use .
* @param reads the informations associated with the reads
2009-07-30 00:11:45 +08:00
* @param reference the reference file
2009-06-23 05:11:18 +08:00
* @param rods the rods to include in the traversal
2009-04-27 07:08:12 +08:00
* @param nThreadsToUse Number of threads to utilize .
2009-06-23 05:11:18 +08:00
*
2009-04-27 07:08:12 +08:00
* @return The best - fit microscheduler .
* /
2010-02-25 08:16:50 +08:00
public static MicroScheduler create ( GenomeAnalysisEngine engine , Walker walker , SAMDataSource reads , IndexedFastaSequenceFile reference , Collection < ReferenceOrderedDataSource > rods , int nThreadsToUse ) {
2009-05-28 02:24:31 +08:00
if ( walker instanceof TreeReducible & & nThreadsToUse > 1 ) {
2010-05-21 03:02:02 +08:00
if ( walker . isReduceByInterval ( ) )
2010-09-12 22:02:43 +08:00
throw new UserException . BadArgumentValue ( "nt" , String . format ( "The analysis %s aggregates results by interval. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option." , engine . getWalkerName ( walker . getClass ( ) ) ) ) ;
2010-06-04 02:14:33 +08:00
if ( walker instanceof ReadWalker )
2010-09-12 22:02:43 +08:00
throw new UserException . BadArgumentValue ( "nt" , String . format ( "The analysis %s is a read walker. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option." , engine . getWalkerName ( walker . getClass ( ) ) ) ) ;
2010-05-21 03:02:02 +08:00
logger . info ( String . format ( "Running the GATK in parallel mode with %d concurrent threads" , nThreadsToUse ) ) ;
2010-02-25 08:16:50 +08:00
return new HierarchicalMicroScheduler ( engine , walker , reads , reference , rods , nThreadsToUse ) ;
2009-05-28 02:24:31 +08:00
} else {
2010-05-21 03:02:02 +08:00
if ( nThreadsToUse > 1 )
2010-09-12 22:02:43 +08:00
throw new UserException . BadArgumentValue ( "nt" , String . format ( "The analysis %s currently does not support parallel execution. Please run your analysis without the -nt option." , engine . getWalkerName ( walker . getClass ( ) ) ) ) ;
2010-02-25 08:16:50 +08:00
return new LinearMicroScheduler ( engine , walker , reads , reference , rods ) ;
2009-04-27 07:08:12 +08:00
}
}
2009-04-27 01:42:00 +08:00
/ * *
* Create a microscheduler given the reads and reference .
2009-06-23 05:11:18 +08:00
*
* @param walker the walker to execute with
* @param reads The reads .
2009-07-30 00:11:45 +08:00
* @param reference The reference .
2009-06-23 05:11:18 +08:00
* @param rods the rods to include in the traversal
2009-04-27 01:42:00 +08:00
* /
2010-02-25 08:16:50 +08:00
protected MicroScheduler ( GenomeAnalysisEngine engine , Walker walker , SAMDataSource reads , IndexedFastaSequenceFile reference , Collection < ReferenceOrderedDataSource > rods ) {
this . engine = engine ;
2009-07-30 07:00:15 +08:00
this . reads = reads ;
this . reference = reference ;
this . rods = rods ;
2009-05-09 05:27:54 +08:00
if ( walker instanceof ReadWalker ) {
2009-07-10 05:57:00 +08:00
traversalEngine = new TraverseReads ( ) ;
2009-06-23 05:11:18 +08:00
} else if ( walker instanceof LocusWalker ) {
2009-07-10 05:57:00 +08:00
traversalEngine = new TraverseLoci ( ) ;
2009-06-23 05:11:18 +08:00
} else if ( walker instanceof DuplicateWalker ) {
2009-07-10 05:57:00 +08:00
traversalEngine = new TraverseDuplicates ( ) ;
2010-03-22 07:22:25 +08:00
} else if ( walker instanceof ReadPairWalker ) {
traversalEngine = new TraverseReadPairs ( ) ;
2009-06-23 05:11:18 +08:00
} else {
throw new UnsupportedOperationException ( "Unable to determine traversal type, the walker is an unknown type." ) ;
2009-07-30 07:00:15 +08:00
}
2009-07-30 00:11:45 +08:00
2010-09-25 10:49:30 +08:00
traversalEngine . initialize ( engine ) ;
2009-04-27 01:42:00 +08:00
}
/ * *
* Walks a walker over the given list of intervals .
2009-06-23 05:11:18 +08:00
*
* @param walker Computation to perform over dataset .
2009-07-30 00:11:45 +08:00
* @param shardStrategy A strategy for sharding the data .
2009-06-23 05:11:18 +08:00
*
2009-05-16 04:20:27 +08:00
* @return the return type of the walker
2009-04-27 01:42:00 +08:00
* /
2010-08-10 04:41:50 +08:00
public abstract Object execute ( Walker walker , ShardStrategy shardStrategy ) ;
2009-04-27 01:42:00 +08:00
2009-08-23 08:56:02 +08:00
/ * *
* Retrieves the object responsible for tracking and managing output .
* @return An output tracker , for loading data in and extracting results . Will not be null .
* /
public abstract OutputTracker getOutputTracker ( ) ;
2009-05-09 05:27:54 +08:00
/ * *
2010-02-25 08:16:50 +08:00
* Gets the an iterator over the given reads , which will iterate over the reads in the given shard .
* @param shard the shard to use when querying reads .
* @return an iterator over the reads specified in the shard .
2009-05-09 05:27:54 +08:00
* /
2010-02-25 08:16:50 +08:00
protected StingSAMIterator getReadIterator ( Shard shard ) {
2010-09-25 10:49:30 +08:00
return ( ! reads . isEmpty ( ) ) ? reads . seek ( shard ) : new NullSAMIterator ( ) ;
2009-05-09 05:27:54 +08:00
}
2009-07-07 06:50:22 +08:00
/ * *
* Print summary information for the analysis .
* @param sum The final reduce output .
* /
2010-08-12 04:17:11 +08:00
protected void printOnTraversalDone ( Object sum , ReadMetrics metrics ) {
traversalEngine . printOnTraversalDone ( metrics ) ;
2009-07-07 06:50:22 +08:00
}
2010-11-11 01:59:50 +08:00
/ * *
* Gets the engine that created this microscheduler .
* @return The engine owning this microscheduler .
* /
public GenomeAnalysisEngine getEngine ( ) { return engine ; }
2009-07-25 06:59:49 +08:00
/ * *
* Returns data source maintained by this scheduler
* @return
* /
public SAMDataSource getSAMDataSource ( ) { return reads ; }
2009-04-27 01:42:00 +08:00
2009-05-22 04:09:32 +08:00
/ * *
2009-07-30 00:11:45 +08:00
* Returns the reference maintained by this scheduler .
* @return The reference maintained by this scheduler .
2009-05-22 04:09:32 +08:00
* /
2009-07-30 00:11:45 +08:00
public IndexedFastaSequenceFile getReference ( ) { return reference ; }
2009-04-27 01:42:00 +08:00
}