// gatk-3.8/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java

/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.executive;
import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategy;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource;
import org.broadinstitute.sting.gatk.traversals.*;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import java.util.*;
/**
 * @author mhanna
 * @since Apr 26, 2009
 */
/** Shards and schedules data in manageable chunks. */
public abstract class MicroScheduler {
    /** Shared logger for all microscheduler implementations. */
    protected static Logger logger = Logger.getLogger(MicroScheduler.class);

    /** Traversal engine matched to the walker type; performs the actual iteration over the data. */
    protected final TraversalEngine traversalEngine;

    /** Indexed reference sequence backing this traversal. */
    protected final IndexedFastaSequenceFile reference;

    /** Source of the aligned reads. */
    private final SAMDataSource reads;

    /** Reference-ordered data (ROD) sources to include in the traversal. */
    private final Collection<ReferenceOrderedDataSource> rods;

    /**
     * MicroScheduler factory function. Create a microscheduler appropriate for reducing the
     * selected walker.
     *
     * @param walker        Which walker to use.
     * @param reads         the informations associated with the reads
     * @param reference     the reference file
     * @param rods          the rods to include in the traversal
     * @param nThreadsToUse Number of threads to utilize.
     *
     * @return The best-fit microscheduler.
     */
    public static MicroScheduler create(Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection<ReferenceOrderedDataSource> rods, int nThreadsToUse) {
        // Hierarchical (multi-threaded) scheduling is only possible when partial results can
        // be merged, i.e. when the walker implements TreeReducible; otherwise fall back to
        // a single-threaded linear schedule.
        if (walker instanceof TreeReducible && nThreadsToUse > 1) {
            logger.info("Creating hierarchical microscheduler");
            return new HierarchicalMicroScheduler(walker, reads, reference, rods, nThreadsToUse);
        } else {
            logger.info("Creating linear microscheduler");
            return new LinearMicroScheduler(walker, reads, reference, rods);
        }
    }

    /**
     * Create a microscheduler given the reads and reference.
     *
     * @param walker    the walker to execute with
     * @param reads     The reads.
     * @param reference The reference.
     * @param rods      the rods to include in the traversal
     *
     * @throws UnsupportedOperationException if the walker is not one of the known walker types.
     */
    protected MicroScheduler(Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection<ReferenceOrderedDataSource> rods) {
        // Select the traversal engine matching the walker's type.
        if (walker instanceof ReadWalker) {
            traversalEngine = new TraverseReads();
        } else if (walker instanceof LocusWalker) {
            traversalEngine = new TraverseLoci();
        } else if (walker instanceof LocusWindowWalker) {
            traversalEngine = new TraverseLocusWindows();
        } else if (walker instanceof DuplicateWalker) {
            traversalEngine = new TraverseDuplicates();
        } else {
            throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type.");
        }

        this.reads = reads;
        this.reference = reference;
        this.rods = rods;

        validate(this.reads, this.reference);

        // Side effect: initialize the traversal engine with reads data.
        // TODO: Give users a dedicated way of getting the header so that the MicroScheduler
        // doesn't have to bend over backward providing legacy getters and setters.
        traversalEngine.setSAMHeader(reads.getHeader());
        traversalEngine.initialize();
    }

    /**
     * A temporary getter for the traversal engine. In the future, clients
     * of the microscheduler shouldn't need to know anything about the traversal engine.
     *
     * @return The traversal engine.
     */
    public TraversalEngine getTraversalEngine() {
        return traversalEngine;
    }

    /**
     * Walks a walker over the given list of intervals.
     *
     * @param walker        Computation to perform over dataset.
     * @param shardStrategy A strategy for sharding the data.
     *
     * @return the return type of the walker
     */
    public abstract Object execute(Walker walker, ShardStrategy shardStrategy);

    /**
     * Gets an window into all the data that can be viewed as a single shard.
     *
     * @param shard The section of data to view.
     *
     * @return An accessor for all the data in this shard.
     */
    protected ShardDataProvider getShardDataProvider(Shard shard) {
        return new ShardDataProvider(shard, reads, reference, rods);
    }

    /**
     * Print summary information for the analysis.
     *
     * @param sum The final reduce output.
     */
    protected void printOnTraversalDone(Object sum) {
        // HACK: The microscheduler should be too dumb to know anything about the data
        //       it's actually processing; it should just funnel anything it receives
        //       to the traversal engine.
        // TODO: Implement code to allow the datasources to print summary info of the
        //       data they've seen.
        if (reads != null && reads.getViolationHistogram().getViolationCount() > 0)
            logger.warn(String.format("%n%s", reads.getViolationHistogram()));
        traversalEngine.printOnTraversalDone(sum);
    }

    /**
     * Returns data source maintained by this scheduler
     *
     * @return The reads data source; may be null.
     */
    public SAMDataSource getSAMDataSource() { return reads; }

    /**
     * Returns the reference maintained by this scheduler.
     *
     * @return The reference maintained by this scheduler.
     */
    public IndexedFastaSequenceFile getReference() { return reference; }

    /**
     * Now that all files are open, validate the sequence dictionaries of the reads vs. the reference.
     * Errors out (via Utils.scareUser) when the dictionaries share no contigs at all; warns when the
     * overlap is only partial.
     * TODO: Doing this in the MicroScheduler is a bit late, but this is where data sources are initialized.
     * TODO: Move the initialization of data sources back to the GenomeAnalysisEngine.
     *
     * @param reads     Reads data source.
     * @param reference Reference data source.
     */
    private void validate(SAMDataSource reads, ReferenceSequenceFile reference) {
        // Nothing to cross-check if either source is absent.
        if (reads == null || reference == null)
            return;

        // Compile a set of sequence names that exist in the BAM files.
        SAMSequenceDictionary readsDictionary = reads.getHeader().getSequenceDictionary();
        Set<String> readsSequenceNames = new TreeSet<String>();
        for (SAMSequenceRecord dictionaryEntry : readsDictionary.getSequences())
            readsSequenceNames.add(dictionaryEntry.getSequenceName());

        // Compile a set of sequence names that exist in the reference file.
        SAMSequenceDictionary referenceDictionary = reference.getSequenceDictionary();
        Set<String> referenceSequenceNames = new TreeSet<String>();
        for (SAMSequenceRecord dictionaryEntry : referenceDictionary.getSequences())
            referenceSequenceNames.add(dictionaryEntry.getSequenceName());

        if (readsSequenceNames.isEmpty()) {
            logger.info("Reads file is unmapped. Skipping validation against reference.");
            return;
        }

        // If there's no overlap between reads and reference, data will be bogus. Throw an exception.
        Set<String> intersectingSequenceNames = new HashSet<String>(readsSequenceNames);
        intersectingSequenceNames.retainAll(referenceSequenceNames);
        if (intersectingSequenceNames.isEmpty()) {
            StringBuilder error = new StringBuilder();
            error.append("No overlap exists between sequence dictionary of the reads and the sequence dictionary of the reference. Perhaps you're using the wrong reference?\n");
            error.append(System.getProperty("line.separator"));
            error.append(String.format("Reads contigs: %s%n", prettyPrintSequenceRecords(readsDictionary)));
            error.append(String.format("Reference contigs: %s%n", prettyPrintSequenceRecords(referenceDictionary)));
            logger.error(error.toString());
            Utils.scareUser("No overlap exists between sequence dictionary of the reads and the sequence dictionary of the reference.");
        }

        // If the two datasets are not equal and neither is a strict subset of the other, warn the user.
        if (!readsSequenceNames.equals(referenceSequenceNames) &&
            !readsSequenceNames.containsAll(referenceSequenceNames) &&
            !referenceSequenceNames.containsAll(readsSequenceNames)) {
            StringBuilder warning = new StringBuilder();
            warning.append("Limited overlap exists between sequence dictionary of the reads and the sequence dictionary of the reference. Perhaps you're using the wrong reference?\n");
            warning.append(System.getProperty("line.separator"));
            warning.append(String.format("Reads contigs: %s%n", prettyPrintSequenceRecords(readsDictionary)));
            warning.append(String.format("Reference contigs: %s%n", prettyPrintSequenceRecords(referenceDictionary)));
            logger.warn(warning.toString());
        }
    }

    /**
     * Formats the sequence names in the given dictionary as a bracketed, comma-separated list.
     *
     * @param sequenceDictionary Dictionary whose sequence names should be listed.
     *
     * @return A human-readable listing of all sequence names, e.g. [chr1, chr2, ...].
     */
    private String prettyPrintSequenceRecords(SAMSequenceDictionary sequenceDictionary) {
        String[] sequenceRecordNames = new String[sequenceDictionary.size()];
        int sequenceRecordIndex = 0;
        for (SAMSequenceRecord sequenceRecord : sequenceDictionary.getSequences())
            sequenceRecordNames[sequenceRecordIndex++] = sequenceRecord.getSequenceName();
        // Arrays.toString is the right tool for a one-dimensional array;
        // deepToString is only needed for nested arrays and produces identical output here.
        return Arrays.toString(sequenceRecordNames);
    }
}