2009-06-23 05:11:18 +08:00
|
|
|
/*
|
|
|
|
|
* Copyright (c) 2009 The Broad Institute
|
|
|
|
|
*
|
|
|
|
|
* Permission is hereby granted, free of charge, to any person
|
|
|
|
|
* obtaining a copy of this software and associated documentation
|
|
|
|
|
* files (the "Software"), to deal in the Software without
|
|
|
|
|
* restriction, including without limitation the rights to use,
|
|
|
|
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
|
|
|
* copies of the Software, and to permit persons to whom the
|
|
|
|
|
* Software is furnished to do so, subject to the following
|
|
|
|
|
* conditions:
|
|
|
|
|
*
|
|
|
|
|
* The above copyright notice and this permission notice shall be
|
|
|
|
|
* included in all copies or substantial portions of the Software.
|
|
|
|
|
*
|
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
|
|
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
|
|
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
|
|
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
|
|
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
|
|
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
|
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
|
|
|
* OTHER DEALINGS IN THE SOFTWARE.
|
|
|
|
|
*/
|
|
|
|
|
|
2009-04-27 01:42:00 +08:00
|
|
|
package org.broadinstitute.sting.gatk.executive;
|
|
|
|
|
|
|
|
|
|
import org.apache.log4j.Logger;
|
2009-06-23 05:11:18 +08:00
|
|
|
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
|
|
|
|
|
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
|
2009-06-12 02:13:22 +08:00
|
|
|
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategy;
|
|
|
|
|
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
2009-06-23 05:11:18 +08:00
|
|
|
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource;
|
2009-06-26 06:51:38 +08:00
|
|
|
import org.broadinstitute.sting.gatk.traversals.*;
|
2009-06-23 05:11:18 +08:00
|
|
|
import org.broadinstitute.sting.gatk.walkers.*;
|
2009-08-23 08:56:02 +08:00
|
|
|
import org.broadinstitute.sting.gatk.io.OutputTracker;
|
2009-04-27 01:42:00 +08:00
|
|
|
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
|
|
|
|
|
2009-07-14 04:42:12 +08:00
|
|
|
import java.util.*;
|
2009-06-23 05:11:18 +08:00
|
|
|
|
2009-04-27 01:42:00 +08:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Created by IntelliJ IDEA.
|
|
|
|
|
* User: mhanna
|
|
|
|
|
* Date: Apr 26, 2009
|
|
|
|
|
* Time: 12:37:23 PM
|
|
|
|
|
* To change this template use File | Settings | File Templates.
|
|
|
|
|
*/
|
2009-04-27 07:08:12 +08:00
|
|
|
|
2009-05-28 02:24:31 +08:00
|
|
|
/** Shards and schedules data in manageable chunks. */
|
2009-04-27 01:42:00 +08:00
|
|
|
public abstract class MicroScheduler {
|
2009-04-27 01:46:52 +08:00
|
|
|
protected static Logger logger = Logger.getLogger(MicroScheduler.class);
|
2009-04-27 01:42:00 +08:00
|
|
|
|
2009-05-09 05:27:54 +08:00
|
|
|
protected final TraversalEngine traversalEngine;
|
|
|
|
|
protected final IndexedFastaSequenceFile reference;
|
2009-05-08 08:58:37 +08:00
|
|
|
|
2009-05-09 05:27:54 +08:00
|
|
|
private final SAMDataSource reads;
|
2009-07-30 00:11:45 +08:00
|
|
|
private final Collection<ReferenceOrderedDataSource> rods;
|
2009-05-08 08:58:37 +08:00
|
|
|
|
2009-04-27 07:08:12 +08:00
|
|
|
/**
|
|
|
|
|
* MicroScheduler factory function. Create a microscheduler appropriate for reducing the
|
|
|
|
|
* selected walker.
|
2009-06-23 05:11:18 +08:00
|
|
|
*
|
|
|
|
|
* @param walker Which walker to use.
|
|
|
|
|
* @param reads the informations associated with the reads
|
2009-07-30 00:11:45 +08:00
|
|
|
* @param reference the reference file
|
2009-06-23 05:11:18 +08:00
|
|
|
* @param rods the rods to include in the traversal
|
2009-04-27 07:08:12 +08:00
|
|
|
* @param nThreadsToUse Number of threads to utilize.
|
2009-06-23 05:11:18 +08:00
|
|
|
*
|
2009-04-27 07:08:12 +08:00
|
|
|
* @return The best-fit microscheduler.
|
|
|
|
|
*/
|
2009-07-30 00:11:45 +08:00
|
|
|
public static MicroScheduler create(Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection<ReferenceOrderedDataSource> rods, int nThreadsToUse) {
|
2009-05-28 02:24:31 +08:00
|
|
|
if (walker instanceof TreeReducible && nThreadsToUse > 1) {
|
2009-04-27 07:08:12 +08:00
|
|
|
logger.info("Creating hierarchical microscheduler");
|
2009-07-30 00:11:45 +08:00
|
|
|
return new HierarchicalMicroScheduler(walker, reads, reference, rods, nThreadsToUse);
|
2009-05-28 02:24:31 +08:00
|
|
|
} else {
|
2009-04-27 07:08:12 +08:00
|
|
|
logger.info("Creating linear microscheduler");
|
2009-07-30 00:11:45 +08:00
|
|
|
return new LinearMicroScheduler(walker, reads, reference, rods);
|
2009-04-27 07:08:12 +08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2009-04-27 01:42:00 +08:00
|
|
|
/**
|
|
|
|
|
* Create a microscheduler given the reads and reference.
|
2009-06-23 05:11:18 +08:00
|
|
|
*
|
|
|
|
|
* @param walker the walker to execute with
|
|
|
|
|
* @param reads The reads.
|
2009-07-30 00:11:45 +08:00
|
|
|
* @param reference The reference.
|
2009-06-23 05:11:18 +08:00
|
|
|
* @param rods the rods to include in the traversal
|
2009-04-27 01:42:00 +08:00
|
|
|
*/
|
2009-07-30 00:11:45 +08:00
|
|
|
protected MicroScheduler(Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection<ReferenceOrderedDataSource> rods) {
|
2009-07-30 07:00:15 +08:00
|
|
|
this.reads = reads;
|
|
|
|
|
this.reference = reference;
|
|
|
|
|
this.rods = rods;
|
|
|
|
|
|
2009-05-09 05:27:54 +08:00
|
|
|
if (walker instanceof ReadWalker) {
|
2009-07-10 05:57:00 +08:00
|
|
|
traversalEngine = new TraverseReads();
|
2009-06-23 05:11:18 +08:00
|
|
|
} else if (walker instanceof LocusWalker) {
|
2009-07-10 05:57:00 +08:00
|
|
|
traversalEngine = new TraverseLoci();
|
2009-06-26 06:51:38 +08:00
|
|
|
} else if (walker instanceof LocusWindowWalker) {
|
2009-07-10 05:57:00 +08:00
|
|
|
traversalEngine = new TraverseLocusWindows();
|
2009-06-23 05:11:18 +08:00
|
|
|
} else if (walker instanceof DuplicateWalker) {
|
2009-07-10 05:57:00 +08:00
|
|
|
traversalEngine = new TraverseDuplicates();
|
2009-06-23 05:11:18 +08:00
|
|
|
} else {
|
|
|
|
|
throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type.");
|
2009-07-30 07:00:15 +08:00
|
|
|
}
|
2009-07-30 00:11:45 +08:00
|
|
|
|
|
|
|
|
traversalEngine.initialize();
|
2009-04-27 01:42:00 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Walks a walker over the given list of intervals.
|
2009-06-23 05:11:18 +08:00
|
|
|
*
|
|
|
|
|
* @param walker Computation to perform over dataset.
|
2009-07-30 00:11:45 +08:00
|
|
|
* @param shardStrategy A strategy for sharding the data.
|
2009-06-23 05:11:18 +08:00
|
|
|
*
|
2009-05-16 04:20:27 +08:00
|
|
|
* @return the return type of the walker
|
2009-04-27 01:42:00 +08:00
|
|
|
*/
|
2009-07-30 07:00:15 +08:00
|
|
|
public abstract Object execute(Walker walker, ShardStrategy shardStrategy, int iterations );
|
2009-04-27 01:42:00 +08:00
|
|
|
|
2009-08-23 08:56:02 +08:00
|
|
|
/**
|
|
|
|
|
* Retrieves the object responsible for tracking and managing output.
|
|
|
|
|
* @return An output tracker, for loading data in and extracting results. Will not be null.
|
|
|
|
|
*/
|
|
|
|
|
public abstract OutputTracker getOutputTracker();
|
|
|
|
|
|
2009-04-27 01:42:00 +08:00
|
|
|
|
2009-05-09 05:27:54 +08:00
|
|
|
/**
|
|
|
|
|
* Gets an window into all the data that can be viewed as a single shard.
|
2009-06-23 05:11:18 +08:00
|
|
|
*
|
2009-05-09 05:27:54 +08:00
|
|
|
* @param shard The section of data to view.
|
2009-06-23 05:11:18 +08:00
|
|
|
*
|
2009-05-09 05:27:54 +08:00
|
|
|
* @return An accessor for all the data in this shard.
|
|
|
|
|
*/
|
2009-05-28 02:24:31 +08:00
|
|
|
protected ShardDataProvider getShardDataProvider(Shard shard) {
|
|
|
|
|
return new ShardDataProvider(shard, reads, reference, rods);
|
2009-05-09 05:27:54 +08:00
|
|
|
}
|
|
|
|
|
|
2009-07-07 06:50:22 +08:00
|
|
|
/**
|
|
|
|
|
* Print summary information for the analysis.
|
|
|
|
|
* @param sum The final reduce output.
|
|
|
|
|
*/
|
|
|
|
|
protected void printOnTraversalDone(Object sum) {
|
2009-07-10 01:26:59 +08:00
|
|
|
// HACK: The microscheduler should be too dumb to know anything about the data
|
|
|
|
|
// it's actually processing; it should just funnel anything it receives
|
|
|
|
|
// to the traversal engine.
|
|
|
|
|
// TODO: Implement code to allow the datasources to print summary info of the
|
|
|
|
|
// data they've seen.
|
|
|
|
|
if( reads != null && reads.getViolationHistogram().getViolationCount() > 0 )
|
|
|
|
|
logger.warn(String.format("%n%s",reads.getViolationHistogram()));
|
|
|
|
|
|
2009-07-07 06:50:22 +08:00
|
|
|
traversalEngine.printOnTraversalDone(sum);
|
|
|
|
|
}
|
|
|
|
|
|
2009-07-25 06:59:49 +08:00
|
|
|
/**
|
|
|
|
|
* Returns data source maintained by this scheduler
|
|
|
|
|
* @return
|
|
|
|
|
*/
|
|
|
|
|
public SAMDataSource getSAMDataSource() { return reads; }
|
2009-04-27 01:42:00 +08:00
|
|
|
|
2009-05-22 04:09:32 +08:00
|
|
|
/**
|
2009-07-30 00:11:45 +08:00
|
|
|
* Returns the reference maintained by this scheduler.
|
|
|
|
|
* @return The reference maintained by this scheduler.
|
2009-05-22 04:09:32 +08:00
|
|
|
*/
|
2009-07-30 00:11:45 +08:00
|
|
|
public IndexedFastaSequenceFile getReference() { return reference; }
|
2009-04-27 01:42:00 +08:00
|
|
|
}
|