2009-06-23 05:11:18 +08:00
/*
 * Copyright (c) 2009 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
2009-04-27 01:42:00 +08:00
package org.broadinstitute.sting.gatk.executive ;
import org.apache.log4j.Logger ;
2009-06-23 05:11:18 +08:00
import org.broadinstitute.sting.gatk.datasources.shards.Shard ;
2009-06-12 02:13:22 +08:00
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategy ;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource ;
2009-06-23 05:11:18 +08:00
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource ;
2009-06-26 06:51:38 +08:00
import org.broadinstitute.sting.gatk.traversals.* ;
2009-06-23 05:11:18 +08:00
import org.broadinstitute.sting.gatk.walkers.* ;
2009-08-23 08:56:02 +08:00
import org.broadinstitute.sting.gatk.io.OutputTracker ;
2010-02-25 08:16:50 +08:00
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator ;
import org.broadinstitute.sting.gatk.iterators.NullSAMIterator ;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine ;
2010-08-12 04:17:11 +08:00
import org.broadinstitute.sting.gatk.ReadMetrics ;
2009-04-27 01:42:00 +08:00
2011-01-25 00:45:07 +08:00
import java.io.FileNotFoundException ;
import java.io.FileOutputStream ;
import java.io.OutputStream ;
import java.io.PrintStream ;
2011-01-13 12:20:53 +08:00
import java.lang.management.ManagementFactory ;
2009-07-14 04:42:12 +08:00
import java.util.* ;
2009-06-23 05:11:18 +08:00
2010-07-01 12:40:31 +08:00
import net.sf.picard.reference.IndexedFastaSequenceFile ;
2011-01-18 05:23:09 +08:00
import org.broadinstitute.sting.utils.GenomeLoc ;
2011-01-13 12:20:53 +08:00
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException ;
2010-09-12 22:02:43 +08:00
import org.broadinstitute.sting.utils.exceptions.UserException ;
2011-01-26 21:45:40 +08:00
import org.broadinstitute.sting.utils.threading.* ;
2010-07-01 12:40:31 +08:00
2011-01-13 12:20:53 +08:00
import javax.management.JMException ;
import javax.management.MBeanServer ;
import javax.management.ObjectName ;
2009-04-27 01:42:00 +08:00
/**
 * Created by IntelliJ IDEA.
 * User: mhanna
 * Date: Apr 26, 2009
 * Time: 12:37:23 PM
 *
 * General base class for all scheduling algorithms.
 */
2009-04-27 07:08:12 +08:00
2009-05-28 02:24:31 +08:00
/** Shards and schedules data in manageable chunks. */
2011-01-13 12:20:53 +08:00
public abstract class MicroScheduler implements MicroSchedulerMBean {
2009-04-27 01:46:52 +08:00
protected static Logger logger = Logger . getLogger ( MicroScheduler . class ) ;
2009-04-27 01:42:00 +08:00
2011-01-13 12:20:53 +08:00
/ * *
* Counts the number of instances of the class that are currently alive .
* /
private static int instanceNumber = 0 ;
2010-02-25 08:16:50 +08:00
/ * *
* The engine invoking this scheduler .
* /
protected final GenomeAnalysisEngine engine ;
2009-05-09 05:27:54 +08:00
protected final TraversalEngine traversalEngine ;
protected final IndexedFastaSequenceFile reference ;
2009-05-08 08:58:37 +08:00
2009-05-09 05:27:54 +08:00
private final SAMDataSource reads ;
2010-02-25 08:16:50 +08:00
protected final Collection < ReferenceOrderedDataSource > rods ;
2009-05-08 08:58:37 +08:00
2011-01-13 12:20:53 +08:00
private final MBeanServer mBeanServer ;
private final ObjectName mBeanName ;
2011-01-20 20:58:13 +08:00
protected GenomeLocProcessingTracker processingTracker ;
2011-01-18 05:23:09 +08:00
2009-04-27 07:08:12 +08:00
/ * *
* MicroScheduler factory function . Create a microscheduler appropriate for reducing the
* selected walker .
2009-06-23 05:11:18 +08:00
*
* @param walker Which walker to use .
* @param reads the informations associated with the reads
2009-07-30 00:11:45 +08:00
* @param reference the reference file
2009-06-23 05:11:18 +08:00
* @param rods the rods to include in the traversal
2009-04-27 07:08:12 +08:00
* @param nThreadsToUse Number of threads to utilize .
2009-06-23 05:11:18 +08:00
*
2009-04-27 07:08:12 +08:00
* @return The best - fit microscheduler .
* /
2010-02-25 08:16:50 +08:00
public static MicroScheduler create ( GenomeAnalysisEngine engine , Walker walker , SAMDataSource reads , IndexedFastaSequenceFile reference , Collection < ReferenceOrderedDataSource > rods , int nThreadsToUse ) {
2011-01-20 20:58:13 +08:00
if ( engine . getArguments ( ) . processingTrackerFile ! = null ) {
if ( walker instanceof ReadWalker )
throw new UserException . BadArgumentValue ( "C" , String . format ( "Distributed GATK processing not enabled for read walkers" ) ) ;
}
2009-05-28 02:24:31 +08:00
if ( walker instanceof TreeReducible & & nThreadsToUse > 1 ) {
2010-05-21 03:02:02 +08:00
if ( walker . isReduceByInterval ( ) )
2010-09-12 22:02:43 +08:00
throw new UserException . BadArgumentValue ( "nt" , String . format ( "The analysis %s aggregates results by interval. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option." , engine . getWalkerName ( walker . getClass ( ) ) ) ) ;
2010-06-04 02:14:33 +08:00
if ( walker instanceof ReadWalker )
2010-09-12 22:02:43 +08:00
throw new UserException . BadArgumentValue ( "nt" , String . format ( "The analysis %s is a read walker. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option." , engine . getWalkerName ( walker . getClass ( ) ) ) ) ;
2010-05-21 03:02:02 +08:00
logger . info ( String . format ( "Running the GATK in parallel mode with %d concurrent threads" , nThreadsToUse ) ) ;
2010-02-25 08:16:50 +08:00
return new HierarchicalMicroScheduler ( engine , walker , reads , reference , rods , nThreadsToUse ) ;
2009-05-28 02:24:31 +08:00
} else {
2010-05-21 03:02:02 +08:00
if ( nThreadsToUse > 1 )
2010-09-12 22:02:43 +08:00
throw new UserException . BadArgumentValue ( "nt" , String . format ( "The analysis %s currently does not support parallel execution. Please run your analysis without the -nt option." , engine . getWalkerName ( walker . getClass ( ) ) ) ) ;
2010-02-25 08:16:50 +08:00
return new LinearMicroScheduler ( engine , walker , reads , reference , rods ) ;
2009-04-27 07:08:12 +08:00
}
}
2009-04-27 01:42:00 +08:00
/ * *
* Create a microscheduler given the reads and reference .
2009-06-23 05:11:18 +08:00
*
* @param walker the walker to execute with
* @param reads The reads .
2009-07-30 00:11:45 +08:00
* @param reference The reference .
2009-06-23 05:11:18 +08:00
* @param rods the rods to include in the traversal
2009-04-27 01:42:00 +08:00
* /
2010-02-25 08:16:50 +08:00
protected MicroScheduler ( GenomeAnalysisEngine engine , Walker walker , SAMDataSource reads , IndexedFastaSequenceFile reference , Collection < ReferenceOrderedDataSource > rods ) {
this . engine = engine ;
2009-07-30 07:00:15 +08:00
this . reads = reads ;
this . reference = reference ;
this . rods = rods ;
2009-05-09 05:27:54 +08:00
if ( walker instanceof ReadWalker ) {
2009-07-10 05:57:00 +08:00
traversalEngine = new TraverseReads ( ) ;
2009-06-23 05:11:18 +08:00
} else if ( walker instanceof LocusWalker ) {
2009-07-10 05:57:00 +08:00
traversalEngine = new TraverseLoci ( ) ;
2009-06-23 05:11:18 +08:00
} else if ( walker instanceof DuplicateWalker ) {
2009-07-10 05:57:00 +08:00
traversalEngine = new TraverseDuplicates ( ) ;
2010-03-22 07:22:25 +08:00
} else if ( walker instanceof ReadPairWalker ) {
traversalEngine = new TraverseReadPairs ( ) ;
2009-06-23 05:11:18 +08:00
} else {
throw new UnsupportedOperationException ( "Unable to determine traversal type, the walker is an unknown type." ) ;
2009-07-30 07:00:15 +08:00
}
2009-07-30 00:11:45 +08:00
2010-09-25 10:49:30 +08:00
traversalEngine . initialize ( engine ) ;
2011-01-13 12:20:53 +08:00
// JMX does not allow multiple instances with the same ObjectName to be registered with the same platform MXBean.
// To get around this limitation and since we have no job identifier at this point, register a simple counter that
// will count the number of instances of this object that have been created in this JVM.
int thisInstance = instanceNumber + + ;
mBeanServer = ManagementFactory . getPlatformMBeanServer ( ) ;
try {
mBeanName = new ObjectName ( "org.broadinstitute.sting.gatk.executive:type=MicroScheduler,instanceNumber=" + thisInstance ) ;
mBeanServer . registerMBean ( this , mBeanName ) ;
}
catch ( JMException ex ) {
throw new ReviewedStingException ( "Unable to register microscheduler with JMX" , ex ) ;
}
2011-01-18 05:23:09 +08:00
// create the processing tracker
if ( engine . getArguments ( ) . processingTrackerFile ! = null ) {
if ( engine . getArguments ( ) . restartProcessingTracker & & engine . getArguments ( ) . processingTrackerFile . exists ( ) ) {
engine . getArguments ( ) . processingTrackerFile . delete ( ) ;
logger . info ( "Deleting ProcessingTracker file " + engine . getArguments ( ) . processingTrackerFile ) ;
}
2011-01-25 00:45:07 +08:00
PrintStream statusStream = null ;
if ( engine . getArguments ( ) . processingTrackerStatusFile ! = null ) {
try {
statusStream = new PrintStream ( new FileOutputStream ( engine . getArguments ( ) . processingTrackerStatusFile ) ) ;
} catch ( FileNotFoundException e ) {
throw new UserException . CouldNotCreateOutputFile ( engine . getArguments ( ) . processingTrackerStatusFile , e ) ;
}
}
2011-01-26 21:45:40 +08:00
ClosableReentrantLock lock = new SharedFileThreadSafeLock ( engine . getArguments ( ) . processingTrackerFile , engine . getArguments ( ) . processTrackerID ) ;
processingTracker = new FileBackedGenomeLocProcessingTracker ( engine . getArguments ( ) . processingTrackerFile , engine . getGenomeLocParser ( ) , lock , statusStream ) ;
logger . info ( "Creating ProcessingTracker using shared file " + engine . getArguments ( ) . processingTrackerFile + " process.id = " + engine . getName ( ) + " CID = " + engine . getArguments ( ) . processTrackerID ) ;
2011-01-18 05:23:09 +08:00
} else {
2011-01-26 21:45:40 +08:00
processingTracker = new NoOpGenomeLocProcessingTracker ( ) ;
2011-01-18 05:23:09 +08:00
}
2009-04-27 01:42:00 +08:00
}
/ * *
* Walks a walker over the given list of intervals .
2009-06-23 05:11:18 +08:00
*
* @param walker Computation to perform over dataset .
2009-07-30 00:11:45 +08:00
* @param shardStrategy A strategy for sharding the data .
2009-06-23 05:11:18 +08:00
*
2009-05-16 04:20:27 +08:00
* @return the return type of the walker
2009-04-27 01:42:00 +08:00
* /
2010-08-10 04:41:50 +08:00
public abstract Object execute ( Walker walker , ShardStrategy shardStrategy ) ;
2009-04-27 01:42:00 +08:00
2009-08-23 08:56:02 +08:00
/ * *
* Retrieves the object responsible for tracking and managing output .
* @return An output tracker , for loading data in and extracting results . Will not be null .
* /
public abstract OutputTracker getOutputTracker ( ) ;
2009-05-09 05:27:54 +08:00
/ * *
2010-02-25 08:16:50 +08:00
* Gets the an iterator over the given reads , which will iterate over the reads in the given shard .
* @param shard the shard to use when querying reads .
* @return an iterator over the reads specified in the shard .
2009-05-09 05:27:54 +08:00
* /
2010-02-25 08:16:50 +08:00
protected StingSAMIterator getReadIterator ( Shard shard ) {
2010-09-25 10:49:30 +08:00
return ( ! reads . isEmpty ( ) ) ? reads . seek ( shard ) : new NullSAMIterator ( ) ;
2009-05-09 05:27:54 +08:00
}
2009-07-07 06:50:22 +08:00
/ * *
* Print summary information for the analysis .
* @param sum The final reduce output .
* /
2010-08-12 04:17:11 +08:00
protected void printOnTraversalDone ( Object sum , ReadMetrics metrics ) {
traversalEngine . printOnTraversalDone ( metrics ) ;
2009-07-07 06:50:22 +08:00
}
2010-11-11 01:59:50 +08:00
/ * *
* Gets the engine that created this microscheduler .
* @return The engine owning this microscheduler .
* /
public GenomeAnalysisEngine getEngine ( ) { return engine ; }
2009-07-25 06:59:49 +08:00
/ * *
* Returns data source maintained by this scheduler
* @return
* /
public SAMDataSource getSAMDataSource ( ) { return reads ; }
2009-04-27 01:42:00 +08:00
2009-05-22 04:09:32 +08:00
/ * *
2009-07-30 00:11:45 +08:00
* Returns the reference maintained by this scheduler .
* @return The reference maintained by this scheduler .
2009-05-22 04:09:32 +08:00
* /
2009-07-30 00:11:45 +08:00
public IndexedFastaSequenceFile getReference ( ) { return reference ; }
2011-01-13 12:20:53 +08:00
/ * *
* Gets the filename to which performance data is currently being written .
* @return Filename to which performance data is currently being written .
* /
public String getPerformanceLogFileName ( ) {
return traversalEngine . getPerformanceLogFileName ( ) ;
}
/ * *
* Set the filename of the log for performance . If set ,
* @param fileName filename to use when writing performance data .
* /
public void setPerformanceLogFileName ( String fileName ) {
traversalEngine . setPerformanceLogFileName ( fileName ) ;
}
/ * *
* Gets the frequency with which performance data is written .
* @return Frequency , in seconds , of performance log writes .
* /
public long getPerformanceProgressPrintFrequencySeconds ( ) {
return traversalEngine . getPerformanceProgressPrintFrequencySeconds ( ) ;
}
/ * *
* How often should the performance log message be written ?
* @param seconds number of seconds between messages indicating performance frequency .
* /
public void setPerformanceProgressPrintFrequencySeconds ( long seconds ) {
traversalEngine . setPerformanceProgressPrintFrequencySeconds ( seconds ) ;
}
/**
 * Unregisters this scheduler's MBean from the MBean server it was registered with.
 *
 * @throws ReviewedStingException if JMX reports a failure while unregistering
 */
protected void cleanup() {
    try {
        mBeanServer.unregisterMBean(mBeanName);
    } catch (JMException unregisterFailure) {
        throw new ReviewedStingException("Unable to unregister microscheduler with JMX", unregisterFailure);
    }
}
2009-04-27 01:42:00 +08:00
}