2009-04-27 01:42:00 +08:00
|
|
|
package org.broadinstitute.sting.gatk.executive;
|
|
|
|
|
|
2009-05-07 06:36:25 +08:00
|
|
|
import edu.mit.broad.picard.reference.ReferenceSequenceFile;
|
2009-04-27 01:42:00 +08:00
|
|
|
import org.apache.log4j.Logger;
|
|
|
|
|
import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategy;
|
|
|
|
|
import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategyFactory;
|
2009-05-09 05:27:54 +08:00
|
|
|
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
|
2009-04-27 01:42:00 +08:00
|
|
|
import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SAMDataSource;
|
2009-05-09 05:27:54 +08:00
|
|
|
import org.broadinstitute.sting.gatk.dataSources.providers.ShardDataProvider;
|
2009-04-27 01:42:00 +08:00
|
|
|
import org.broadinstitute.sting.gatk.traversals.TraversalEngine;
|
2009-05-09 05:27:54 +08:00
|
|
|
import org.broadinstitute.sting.gatk.traversals.TraverseByReads;
|
|
|
|
|
import org.broadinstitute.sting.gatk.traversals.TraverseLociByReference;
|
2009-04-27 07:08:12 +08:00
|
|
|
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
|
2009-05-07 06:36:25 +08:00
|
|
|
import org.broadinstitute.sting.gatk.walkers.Walker;
|
2009-05-09 05:27:54 +08:00
|
|
|
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
2009-05-07 07:26:21 +08:00
|
|
|
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
|
|
|
|
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
2009-04-27 01:42:00 +08:00
|
|
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
2009-05-09 05:27:54 +08:00
|
|
|
import org.broadinstitute.sting.utils.StingException;
|
2009-04-27 01:42:00 +08:00
|
|
|
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
|
|
|
|
|
|
|
|
|
import java.io.File;
|
2009-05-07 06:36:25 +08:00
|
|
|
import java.io.FileNotFoundException;
|
|
|
|
|
import java.util.List;
|
2009-04-27 01:42:00 +08:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Created by IntelliJ IDEA.
|
|
|
|
|
* User: mhanna
|
|
|
|
|
* Date: Apr 26, 2009
|
|
|
|
|
* Time: 12:37:23 PM
|
|
|
|
|
* To change this template use File | Settings | File Templates.
|
|
|
|
|
*/
|
2009-04-27 07:08:12 +08:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Shards and schedules data in manageable chunks.
|
|
|
|
|
*/
|
2009-04-27 01:42:00 +08:00
|
|
|
public abstract class MicroScheduler {
|
|
|
|
|
private static long SHARD_SIZE = 100000L;
|
|
|
|
|
|
2009-04-27 01:46:52 +08:00
|
|
|
protected static Logger logger = Logger.getLogger(MicroScheduler.class);
|
2009-04-27 01:42:00 +08:00
|
|
|
|
2009-05-09 05:27:54 +08:00
|
|
|
protected final TraversalEngine traversalEngine;
|
|
|
|
|
protected final IndexedFastaSequenceFile reference;
|
2009-05-08 08:58:37 +08:00
|
|
|
|
2009-05-09 05:27:54 +08:00
|
|
|
private final SAMDataSource reads;
|
2009-05-08 08:58:37 +08:00
|
|
|
|
2009-04-27 07:08:12 +08:00
|
|
|
/**
|
|
|
|
|
* MicroScheduler factory function. Create a microscheduler appropriate for reducing the
|
|
|
|
|
* selected walker.
|
|
|
|
|
* @param walker Which walker to use.
|
|
|
|
|
* @param nThreadsToUse Number of threads to utilize.
|
|
|
|
|
* @return The best-fit microscheduler.
|
|
|
|
|
*/
|
2009-05-07 07:26:21 +08:00
|
|
|
public static MicroScheduler create( Walker walker, List<File> reads, File ref, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods, int nThreadsToUse ) {
|
2009-04-27 07:08:12 +08:00
|
|
|
if( walker instanceof TreeReducible && nThreadsToUse > 1 ) {
|
|
|
|
|
logger.info("Creating hierarchical microscheduler");
|
2009-05-09 05:27:54 +08:00
|
|
|
return new HierarchicalMicroScheduler( walker, reads, ref, rods, nThreadsToUse );
|
2009-04-27 07:08:12 +08:00
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
|
logger.info("Creating linear microscheduler");
|
2009-05-09 05:27:54 +08:00
|
|
|
return new LinearMicroScheduler( walker, reads, ref, rods );
|
2009-04-27 07:08:12 +08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2009-04-27 01:42:00 +08:00
|
|
|
/**
|
|
|
|
|
* Create a microscheduler given the reads and reference.
|
|
|
|
|
* @param reads The reads.
|
|
|
|
|
* @param refFile File pointer to the reference.
|
|
|
|
|
*/
|
2009-05-09 05:27:54 +08:00
|
|
|
protected MicroScheduler( Walker walker, List<File> reads, File refFile, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods ) {
|
|
|
|
|
if (walker instanceof ReadWalker) {
|
|
|
|
|
traversalEngine = new TraverseByReads(reads, refFile, rods);
|
|
|
|
|
} else {
|
|
|
|
|
traversalEngine = new TraverseLociByReference(reads, refFile, rods);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
this.reads = getReadsDataSource( reads );
|
2009-04-27 01:42:00 +08:00
|
|
|
this.reference = openReferenceSequenceFile( refFile );
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* A temporary getter for the traversal engine. In the future, clients
|
|
|
|
|
* of the microscheduler shouldn't need to know anything about the traversal engine.
|
|
|
|
|
* @return The traversal engine.
|
|
|
|
|
*/
|
2009-05-08 08:58:37 +08:00
|
|
|
public TraversalEngine getTraversalEngine() {
|
|
|
|
|
return traversalEngine;
|
|
|
|
|
}
|
2009-04-27 01:42:00 +08:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Walks a walker over the given list of intervals.
|
|
|
|
|
* @param walker Computation to perform over dataset.
|
|
|
|
|
* @param intervals A list of intervals over which to walk. Null for whole dataset.
|
|
|
|
|
*/
|
|
|
|
|
public abstract void execute( Walker walker, List<GenomeLoc> intervals);
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Get the sharding strategy given a driving data source.
|
|
|
|
|
* @param drivingDataSource Data on which to shard.
|
|
|
|
|
* @param intervals Intervals to use when limiting sharding.
|
|
|
|
|
* @return Sharding strategy for this driving data source.
|
|
|
|
|
*/
|
|
|
|
|
protected ShardStrategy getShardStrategy( ReferenceSequenceFile drivingDataSource, List<GenomeLoc> intervals ) {
|
|
|
|
|
ShardStrategy shardStrategy = null;
|
|
|
|
|
if( intervals != null )
|
|
|
|
|
shardStrategy = ShardStrategyFactory.shatter( ShardStrategyFactory.SHATTER_STRATEGY.LINEAR,
|
|
|
|
|
drivingDataSource.getSequenceDictionary(),
|
|
|
|
|
SHARD_SIZE,
|
|
|
|
|
intervals );
|
|
|
|
|
else
|
|
|
|
|
shardStrategy = ShardStrategyFactory.shatter( ShardStrategyFactory.SHATTER_STRATEGY.LINEAR,
|
|
|
|
|
drivingDataSource.getSequenceDictionary(),
|
|
|
|
|
SHARD_SIZE );
|
|
|
|
|
|
|
|
|
|
return shardStrategy;
|
|
|
|
|
}
|
|
|
|
|
|
2009-05-09 05:27:54 +08:00
|
|
|
/**
|
|
|
|
|
* Gets an window into all the data that can be viewed as a single shard.
|
|
|
|
|
* @param shard The section of data to view.
|
|
|
|
|
* @return An accessor for all the data in this shard.
|
|
|
|
|
*/
|
|
|
|
|
protected ShardDataProvider getShardDataProvider( Shard shard ) {
|
|
|
|
|
return new ShardDataProvider( shard, reads, reference );
|
|
|
|
|
}
|
|
|
|
|
|
2009-04-27 01:42:00 +08:00
|
|
|
/**
|
|
|
|
|
* Gets a data source for the given set of reads.
|
|
|
|
|
* @return A data source for the given set of reads.
|
|
|
|
|
*/
|
2009-05-09 05:27:54 +08:00
|
|
|
private SAMDataSource getReadsDataSource( List<File> reads ) {
|
|
|
|
|
List<File> unpackedReads = null;
|
2009-04-27 01:42:00 +08:00
|
|
|
try {
|
2009-05-09 05:27:54 +08:00
|
|
|
unpackedReads = TraversalEngine.unpackReads(reads);
|
2009-04-27 01:42:00 +08:00
|
|
|
}
|
|
|
|
|
catch( FileNotFoundException ex ) {
|
2009-05-09 05:27:54 +08:00
|
|
|
throw new StingException( "Cannot unpack list of reads files", ex );
|
2009-04-27 01:42:00 +08:00
|
|
|
}
|
2009-05-08 08:58:37 +08:00
|
|
|
|
2009-05-09 05:27:54 +08:00
|
|
|
SAMDataSource dataSource = new SAMDataSource( unpackedReads );
|
|
|
|
|
|
2009-05-08 08:58:37 +08:00
|
|
|
// Side effect: initialize the traversal engine with reads data.
|
|
|
|
|
// TODO: Give users a dedicated way of getting the header so that the MicroScheduler
|
|
|
|
|
// doesn't have to bend over backward providing legacy getters and setters.
|
|
|
|
|
traversalEngine.setSAMHeader(dataSource.getHeader());
|
|
|
|
|
|
2009-04-27 01:42:00 +08:00
|
|
|
return dataSource;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Opens a reference sequence file paired with an index.
|
|
|
|
|
* @param refFile Handle to a reference sequence file. Non-null.
|
|
|
|
|
* @return A thread-safe file wrapper.
|
|
|
|
|
*/
|
|
|
|
|
private IndexedFastaSequenceFile openReferenceSequenceFile( File refFile ) {
|
|
|
|
|
IndexedFastaSequenceFile ref = null;
|
|
|
|
|
try {
|
|
|
|
|
ref = new IndexedFastaSequenceFile(refFile);
|
|
|
|
|
}
|
|
|
|
|
catch( FileNotFoundException ex ) {
|
|
|
|
|
throw new RuntimeException("File not found opening fasta file; please do this check before MicroManaging", ex);
|
|
|
|
|
}
|
|
|
|
|
GenomeLoc.setupRefContigOrdering(ref);
|
|
|
|
|
return ref;
|
|
|
|
|
}
|
|
|
|
|
}
|