gatk-3.8/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java

/*
 * Copyright (c) 2009 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

package org.broadinstitute.sting.gatk.executive;

import net.sf.picard.reference.ReferenceSequenceFile;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategy;
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategyFactory;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.traversals.TraversalEngine;
import org.broadinstitute.sting.gatk.traversals.TraverseDuplicates;
import org.broadinstitute.sting.gatk.traversals.TraverseLoci;
import org.broadinstitute.sting.gatk.traversals.TraverseReads;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;

import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.List;


/**
 * Created by IntelliJ IDEA.
 * User: mhanna
 * Date: Apr 26, 2009
 * Time: 12:37:23 PM
 * To change this template use File | Settings | File Templates.
 */

/** Shards and schedules data in manageable chunks. */
public abstract class MicroScheduler {
    private static long SHARD_SIZE = 100000L;

    protected static Logger logger = Logger.getLogger(MicroScheduler.class);

    protected final TraversalEngine traversalEngine;
    protected final IndexedFastaSequenceFile reference;

    private final SAMDataSource reads;
    private final List<ReferenceOrderedDataSource> rods;

    /**
     * MicroScheduler factory function.  Create a microscheduler appropriate for reducing the
     * selected walker.
     *
     * @param walker        Which walker to use.
     * @param reads         the informations associated with the reads
     * @param ref           the reference file
     * @param rods          the rods to include in the traversal
     * @param nThreadsToUse Number of threads to utilize.
     *
     * @return The best-fit microscheduler.
     */
    public static MicroScheduler create(Walker walker, Reads reads, File ref, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods, int nThreadsToUse) {
        if (walker instanceof TreeReducible && nThreadsToUse > 1) {
            logger.info("Creating hierarchical microscheduler");
            return new HierarchicalMicroScheduler(walker, reads, ref, rods, nThreadsToUse);
        } else {
            logger.info("Creating linear microscheduler");
            return new LinearMicroScheduler(walker, reads, ref, rods);
        }
    }

    /**
     * Create a microscheduler given the reads and reference.
     *
     * @param walker  the walker to execute with
     * @param reads   The reads.
     * @param refFile File pointer to the reference.
     * @param rods    the rods to include in the traversal
     */
    protected MicroScheduler(Walker walker, Reads reads, File refFile, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
        if (walker instanceof ReadWalker) {
            traversalEngine = new TraverseReads(reads.getReadsFiles(), refFile, rods);
            this.reads = getReadsDataSource(reads, true);
        } else if (walker instanceof LocusWalker) {
            traversalEngine = new TraverseLoci(reads.getReadsFiles(), refFile, rods);
            this.reads = getReadsDataSource(reads, false);
        } else if (walker instanceof DuplicateWalker) {
            traversalEngine = new TraverseDuplicates(reads.getReadsFiles(), refFile, rods);
            this.reads = getReadsDataSource(reads, true);
        } else {
            throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type.");
        }


        this.reference = openReferenceSequenceFile(refFile);
        this.rods = getReferenceOrderedDataSources(rods);
    }

    /**
     * A temporary getter for the traversal engine.  In the future, clients
     * of the microscheduler shouldn't need to know anything about the traversal engine.
     *
     * @return The traversal engine.
     */
    public TraversalEngine getTraversalEngine() {
        return traversalEngine;
    }

    /**
     * Walks a walker over the given list of intervals.
     *
     * @param walker        Computation to perform over dataset.
     * @param intervals     A list of intervals over which to walk.  Null for whole dataset.
     * @param maxIterations the maximum number of iterations we're to perform
     *
     * @return the return type of the walker
     */
    public abstract Object execute(Walker walker, GenomeLocSortedSet intervals, Integer maxIterations);

    /**
     * Get the sharding strategy given a driving data source.
     *
     * @param walker            Walker for which to infer sharding strategy.
     * @param drivingDataSource Data on which to shard.
     * @param intervals         Intervals to use when limiting sharding.
     * @param maxIterations     the maximum number of iterations to run through
     *
     * @return Sharding strategy for this driving data source.
     */
    protected ShardStrategy getShardStrategy(Walker walker,
                                             ReferenceSequenceFile drivingDataSource,
                                             GenomeLocSortedSet intervals,
                                             Integer maxIterations) {
        ShardStrategy shardStrategy = null;
        ShardStrategyFactory.SHATTER_STRATEGY shardType;
        if (walker instanceof LocusWalker) {
            if (intervals != null) {
                shardType = (walker.isReduceByInterval()) ?
                        ShardStrategyFactory.SHATTER_STRATEGY.INTERVAL :
                        ShardStrategyFactory.SHATTER_STRATEGY.LINEAR;

                shardStrategy = ShardStrategyFactory.shatter(shardType,
                        drivingDataSource.getSequenceDictionary(),
                        SHARD_SIZE,
                        intervals, maxIterations);
            } else
                shardStrategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR,
                        drivingDataSource.getSequenceDictionary(),
                        SHARD_SIZE, maxIterations);

        } else if (walker instanceof ReadWalker ||
                walker instanceof DuplicateWalker) {

            shardType = ShardStrategyFactory.SHATTER_STRATEGY.READS;

            if (intervals != null) {
                shardStrategy = ShardStrategyFactory.shatter(shardType,
                        drivingDataSource.getSequenceDictionary(),
                        SHARD_SIZE,
                        intervals, maxIterations);
            } else {
                shardStrategy = ShardStrategyFactory.shatter(shardType,
                        drivingDataSource.getSequenceDictionary(),
                        SHARD_SIZE, maxIterations);
            }
        } else
            throw new StingException("Unable to support walker of type" + walker.getClass().getName());

        return shardStrategy;
    }

    /**
     * Gets an window into all the data that can be viewed as a single shard.
     *
     * @param shard The section of data to view.
     *
     * @return An accessor for all the data in this shard.
     */
    protected ShardDataProvider getShardDataProvider(Shard shard) {
        return new ShardDataProvider(shard, reads, reference, rods);
    }

    /**
     * Gets a data source for the given set of reads.
     *
     * @param reads   the read source information
     * @param byReads are we a by reads traversal, or not
     *
     * @return A data source for the given set of reads.
     */
    private SAMDataSource getReadsDataSource(Reads reads, boolean byReads) {
        // By reference traversals are happy with no reads.  Make sure that case is handled.
        if (reads.getReadsFiles().size() == 0)
            return null;

        SAMDataSource dataSource = new SAMDataSource(reads, byReads);

        // Side effect: initialize the traversal engine with reads data.
        // TODO: Give users a dedicated way of getting the header so that the MicroScheduler
        //       doesn't have to bend over backward providing legacy getters and setters.
        traversalEngine.setSAMHeader(dataSource.getHeader());

        return dataSource;
    }

    /**
     * Open the reference-ordered data sources.
     *
     * @param rods the reference order data to execute using
     *
     * @return A list of reference-ordered data sources.
     */
    private List<ReferenceOrderedDataSource> getReferenceOrderedDataSources(List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
        List<ReferenceOrderedDataSource> dataSources = new ArrayList<ReferenceOrderedDataSource>();
        for (ReferenceOrderedData<? extends ReferenceOrderedDatum> rod : rods)
            dataSources.add(new ReferenceOrderedDataSource(rod));
        return dataSources;
    }

    /**
     * Opens a reference sequence file paired with an index.
     *
     * @param refFile Handle to a reference sequence file.  Non-null.
     *
     * @return A thread-safe file wrapper.
     */
    private IndexedFastaSequenceFile openReferenceSequenceFile(File refFile) {
        IndexedFastaSequenceFile ref = null;
        try {
            ref = new IndexedFastaSequenceFile(refFile);
        }
        catch (FileNotFoundException ex) {
            throw new StingException("I/O error while opening fasta file: " + ex.getMessage(), ex);
        }
        GenomeLocParser.setupRefContigOrdering(ref);
        return ref;
    }
}