/*
 * Copyright (c) 2009 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
package org.broadinstitute.sting.gatk.executive;

import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategy;
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategyFactory;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.traversals.*;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;

import java.io.File;
import java.io.FileNotFoundException;
import java.util.*;
/**
 * Created by IntelliJ IDEA.
 * User: mhanna
 * Date: Apr 26, 2009
 * Time: 12:37:23 PM
 * To change this template use File | Settings | File Templates.
 */

/** Shards and schedules data in manageable chunks. */
public abstract class MicroScheduler {
2009-05-28 02:24:31 +08:00
private static long SHARD_SIZE = 100000L ;
2009-04-27 01:42:00 +08:00
2009-04-27 01:46:52 +08:00
protected static Logger logger = Logger . getLogger ( MicroScheduler . class ) ;
2009-04-27 01:42:00 +08:00
2009-05-09 05:27:54 +08:00
protected final TraversalEngine traversalEngine ;
protected final IndexedFastaSequenceFile reference ;
2009-05-08 08:58:37 +08:00
2009-05-09 05:27:54 +08:00
private final SAMDataSource reads ;
2009-05-22 04:09:32 +08:00
private final List < ReferenceOrderedDataSource > rods ;
2009-05-08 08:58:37 +08:00
2009-04-27 07:08:12 +08:00
/ * *
* MicroScheduler factory function . Create a microscheduler appropriate for reducing the
* selected walker .
2009-06-23 05:11:18 +08:00
*
* @param walker Which walker to use .
* @param reads the informations associated with the reads
* @param ref the reference file
* @param rods the rods to include in the traversal
2009-04-27 07:08:12 +08:00
* @param nThreadsToUse Number of threads to utilize .
2009-06-23 05:11:18 +08:00
*
2009-04-27 07:08:12 +08:00
* @return The best - fit microscheduler .
* /
2009-05-28 02:24:31 +08:00
public static MicroScheduler create ( Walker walker , Reads reads , File ref , List < ReferenceOrderedData < ? extends ReferenceOrderedDatum > > rods , int nThreadsToUse ) {
if ( walker instanceof TreeReducible & & nThreadsToUse > 1 ) {
2009-04-27 07:08:12 +08:00
logger . info ( "Creating hierarchical microscheduler" ) ;
2009-05-28 02:24:31 +08:00
return new HierarchicalMicroScheduler ( walker , reads , ref , rods , nThreadsToUse ) ;
} else {
2009-04-27 07:08:12 +08:00
logger . info ( "Creating linear microscheduler" ) ;
2009-05-28 02:24:31 +08:00
return new LinearMicroScheduler ( walker , reads , ref , rods ) ;
2009-04-27 07:08:12 +08:00
}
}
2009-04-27 01:42:00 +08:00
/ * *
* Create a microscheduler given the reads and reference .
2009-06-23 05:11:18 +08:00
*
* @param walker the walker to execute with
* @param reads The reads .
2009-04-27 01:42:00 +08:00
* @param refFile File pointer to the reference .
2009-06-23 05:11:18 +08:00
* @param rods the rods to include in the traversal
2009-04-27 01:42:00 +08:00
* /
2009-05-28 02:24:31 +08:00
protected MicroScheduler ( Walker walker , Reads reads , File refFile , List < ReferenceOrderedData < ? extends ReferenceOrderedDatum > > rods ) {
2009-05-09 05:27:54 +08:00
if ( walker instanceof ReadWalker ) {
2009-07-10 05:57:00 +08:00
traversalEngine = new TraverseReads ( ) ;
2009-06-23 05:11:18 +08:00
} else if ( walker instanceof LocusWalker ) {
2009-07-10 05:57:00 +08:00
traversalEngine = new TraverseLoci ( ) ;
2009-06-26 06:51:38 +08:00
} else if ( walker instanceof LocusWindowWalker ) {
2009-07-10 05:57:00 +08:00
traversalEngine = new TraverseLocusWindows ( ) ;
2009-06-23 05:11:18 +08:00
} else if ( walker instanceof DuplicateWalker ) {
2009-07-10 05:57:00 +08:00
traversalEngine = new TraverseDuplicates ( ) ;
2009-06-23 05:11:18 +08:00
} else {
throw new UnsupportedOperationException ( "Unable to determine traversal type, the walker is an unknown type." ) ;
2009-05-09 05:27:54 +08:00
}
2009-07-25 06:59:49 +08:00
this . reads = setupReadsDataSource ( reads ) ;
2009-05-28 02:24:31 +08:00
this . reference = openReferenceSequenceFile ( refFile ) ;
this . rods = getReferenceOrderedDataSources ( rods ) ;
2009-07-14 04:42:12 +08:00
validate ( this . reads , this . reference ) ;
2009-04-27 01:42:00 +08:00
}
/ * *
* A temporary getter for the traversal engine . In the future , clients
* of the microscheduler shouldn ' t need to know anything about the traversal engine .
2009-06-23 05:11:18 +08:00
*
2009-04-27 01:42:00 +08:00
* @return The traversal engine .
* /
2009-05-08 08:58:37 +08:00
public TraversalEngine getTraversalEngine ( ) {
return traversalEngine ;
}
2009-04-27 01:42:00 +08:00
/ * *
* Walks a walker over the given list of intervals .
2009-06-23 05:11:18 +08:00
*
* @param walker Computation to perform over dataset .
* @param intervals A list of intervals over which to walk . Null for whole dataset .
2009-06-08 23:12:24 +08:00
* @param maxIterations the maximum number of iterations we ' re to perform
2009-06-23 05:11:18 +08:00
*
2009-05-16 04:20:27 +08:00
* @return the return type of the walker
2009-04-27 01:42:00 +08:00
* /
2009-06-08 23:12:24 +08:00
public abstract Object execute ( Walker walker , GenomeLocSortedSet intervals , Integer maxIterations ) ;
2009-04-27 01:42:00 +08:00
/ * *
* Get the sharding strategy given a driving data source .
2009-06-23 05:11:18 +08:00
*
* @param walker Walker for which to infer sharding strategy .
2009-04-27 01:42:00 +08:00
* @param drivingDataSource Data on which to shard .
2009-06-23 05:11:18 +08:00
* @param intervals Intervals to use when limiting sharding .
* @param maxIterations the maximum number of iterations to run through
*
2009-04-27 01:42:00 +08:00
* @return Sharding strategy for this driving data source .
* /
2009-06-08 23:12:24 +08:00
protected ShardStrategy getShardStrategy ( Walker walker ,
ReferenceSequenceFile drivingDataSource ,
GenomeLocSortedSet intervals ,
Integer maxIterations ) {
2009-04-27 01:42:00 +08:00
ShardStrategy shardStrategy = null ;
2009-05-28 02:24:31 +08:00
ShardStrategyFactory . SHATTER_STRATEGY shardType ;
if ( walker instanceof LocusWalker ) {
if ( intervals ! = null ) {
shardType = ( walker . isReduceByInterval ( ) ) ?
2009-06-23 05:11:18 +08:00
ShardStrategyFactory . SHATTER_STRATEGY . INTERVAL :
ShardStrategyFactory . SHATTER_STRATEGY . LINEAR ;
2009-05-28 02:24:31 +08:00
shardStrategy = ShardStrategyFactory . shatter ( shardType ,
2009-06-23 05:11:18 +08:00
drivingDataSource . getSequenceDictionary ( ) ,
SHARD_SIZE ,
intervals , maxIterations ) ;
2009-05-28 02:24:31 +08:00
} else
shardStrategy = ShardStrategyFactory . shatter ( ShardStrategyFactory . SHATTER_STRATEGY . LINEAR ,
2009-06-23 05:11:18 +08:00
drivingDataSource . getSequenceDictionary ( ) ,
SHARD_SIZE , maxIterations ) ;
2009-05-28 02:24:31 +08:00
2009-06-23 05:11:18 +08:00
} else if ( walker instanceof ReadWalker | |
walker instanceof DuplicateWalker ) {
2009-05-28 02:24:31 +08:00
shardType = ShardStrategyFactory . SHATTER_STRATEGY . READS ;
if ( intervals ! = null ) {
shardStrategy = ShardStrategyFactory . shatter ( shardType ,
2009-06-23 05:11:18 +08:00
drivingDataSource . getSequenceDictionary ( ) ,
SHARD_SIZE ,
intervals , maxIterations ) ;
2009-05-28 02:24:31 +08:00
} else {
shardStrategy = ShardStrategyFactory . shatter ( shardType ,
2009-06-23 05:11:18 +08:00
drivingDataSource . getSequenceDictionary ( ) ,
SHARD_SIZE , maxIterations ) ;
2009-05-19 06:54:18 +08:00
}
2009-06-26 06:51:38 +08:00
} else if ( walker instanceof LocusWindowWalker ) {
if ( intervals = = null )
throw new StingException ( "Unable to shard: walker is of type LocusWindow, but no intervals were provided" ) ;
shardStrategy = ShardStrategyFactory . shatter ( ShardStrategyFactory . SHATTER_STRATEGY . INTERVAL ,
drivingDataSource . getSequenceDictionary ( ) ,
SHARD_SIZE ,
intervals , maxIterations ) ;
2009-05-28 02:24:31 +08:00
} else
2009-05-19 06:54:18 +08:00
throw new StingException ( "Unable to support walker of type" + walker . getClass ( ) . getName ( ) ) ;
2009-04-27 01:42:00 +08:00
2009-05-28 02:24:31 +08:00
return shardStrategy ;
2009-04-27 01:42:00 +08:00
}
2009-05-09 05:27:54 +08:00
/ * *
* Gets an window into all the data that can be viewed as a single shard .
2009-06-23 05:11:18 +08:00
*
2009-05-09 05:27:54 +08:00
* @param shard The section of data to view .
2009-06-23 05:11:18 +08:00
*
2009-05-09 05:27:54 +08:00
* @return An accessor for all the data in this shard .
* /
2009-05-28 02:24:31 +08:00
protected ShardDataProvider getShardDataProvider ( Shard shard ) {
return new ShardDataProvider ( shard , reads , reference , rods ) ;
2009-05-09 05:27:54 +08:00
}
2009-07-07 06:50:22 +08:00
/ * *
* Print summary information for the analysis .
* @param sum The final reduce output .
* /
protected void printOnTraversalDone ( Object sum ) {
2009-07-10 01:26:59 +08:00
// HACK: The microscheduler should be too dumb to know anything about the data
// it's actually processing; it should just funnel anything it receives
// to the traversal engine.
// TODO: Implement code to allow the datasources to print summary info of the
// data they've seen.
if ( reads ! = null & & reads . getViolationHistogram ( ) . getViolationCount ( ) > 0 )
logger . warn ( String . format ( "%n%s" , reads . getViolationHistogram ( ) ) ) ;
2009-07-07 06:50:22 +08:00
traversalEngine . printOnTraversalDone ( sum ) ;
}
2009-04-27 01:42:00 +08:00
/ * *
* Gets a data source for the given set of reads .
2009-06-23 05:11:18 +08:00
*
* @param reads the read source information
*
2009-04-27 01:42:00 +08:00
* @return A data source for the given set of reads .
* /
2009-07-25 06:59:49 +08:00
private SAMDataSource setupReadsDataSource ( Reads reads ) {
2009-05-20 07:26:17 +08:00
// By reference traversals are happy with no reads. Make sure that case is handled.
2009-05-28 02:24:31 +08:00
if ( reads . getReadsFiles ( ) . size ( ) = = 0 )
2009-05-20 07:26:17 +08:00
return null ;
2009-06-26 01:54:15 +08:00
SAMDataSource dataSource = new SAMDataSource ( reads ) ;
2009-05-09 05:27:54 +08:00
2009-05-08 08:58:37 +08:00
// Side effect: initialize the traversal engine with reads data.
// TODO: Give users a dedicated way of getting the header so that the MicroScheduler
// doesn't have to bend over backward providing legacy getters and setters.
traversalEngine . setSAMHeader ( dataSource . getHeader ( ) ) ;
2009-04-27 01:42:00 +08:00
return dataSource ;
}
2009-07-25 06:59:49 +08:00
/ * *
* Returns data source maintained by this scheduler
* @return
* /
public SAMDataSource getSAMDataSource ( ) { return reads ; }
2009-04-27 01:42:00 +08:00
2009-05-22 04:09:32 +08:00
/ * *
* Open the reference - ordered data sources .
2009-06-23 05:11:18 +08:00
*
* @param rods the reference order data to execute using
*
2009-05-22 04:09:32 +08:00
* @return A list of reference - ordered data sources .
* /
2009-05-28 02:24:31 +08:00
private List < ReferenceOrderedDataSource > getReferenceOrderedDataSources ( List < ReferenceOrderedData < ? extends ReferenceOrderedDatum > > rods ) {
2009-05-22 04:09:32 +08:00
List < ReferenceOrderedDataSource > dataSources = new ArrayList < ReferenceOrderedDataSource > ( ) ;
2009-05-28 02:24:31 +08:00
for ( ReferenceOrderedData < ? extends ReferenceOrderedDatum > rod : rods )
dataSources . add ( new ReferenceOrderedDataSource ( rod ) ) ;
2009-05-22 04:09:32 +08:00
return dataSources ;
}
2009-04-27 01:42:00 +08:00
/ * *
* Opens a reference sequence file paired with an index .
2009-06-23 05:11:18 +08:00
*
2009-04-27 01:42:00 +08:00
* @param refFile Handle to a reference sequence file . Non - null .
2009-06-23 05:11:18 +08:00
*
2009-04-27 01:42:00 +08:00
* @return A thread - safe file wrapper .
* /
2009-05-28 02:24:31 +08:00
private IndexedFastaSequenceFile openReferenceSequenceFile ( File refFile ) {
2009-04-27 01:42:00 +08:00
IndexedFastaSequenceFile ref = null ;
try {
ref = new IndexedFastaSequenceFile ( refFile ) ;
}
2009-05-28 02:24:31 +08:00
catch ( FileNotFoundException ex ) {
2009-06-01 23:34:38 +08:00
throw new StingException ( "I/O error while opening fasta file: " + ex . getMessage ( ) , ex ) ;
2009-04-27 01:42:00 +08:00
}
2009-06-22 22:39:41 +08:00
GenomeLocParser . setupRefContigOrdering ( ref ) ;
2009-04-27 01:42:00 +08:00
return ref ;
}
2009-07-14 04:42:12 +08:00
/ * *
* Now that all files are open , validate the sequence dictionaries of the reads vs . the reference .
* TODO : Doing this in the MicroScheduler is a bit late , but this is where data sources are initialized .
* TODO : Move the initialization of data sources back to the GenomeAnalysisEngine .
* @param reads Reads data source .
* @param reference Reference data source .
* /
private void validate ( SAMDataSource reads , ReferenceSequenceFile reference ) {
if ( reads = = null | | reference = = null )
return ;
// Compile a set of sequence names that exist in the BAM files.
SAMSequenceDictionary readsDictionary = reads . getHeader ( ) . getSequenceDictionary ( ) ;
Set < String > readsSequenceNames = new TreeSet < String > ( ) ;
for ( SAMSequenceRecord dictionaryEntry : readsDictionary . getSequences ( ) )
readsSequenceNames . add ( dictionaryEntry . getSequenceName ( ) ) ;
// Compile a set of sequence names that exist in the reference file.
SAMSequenceDictionary referenceDictionary = reference . getSequenceDictionary ( ) ;
Set < String > referenceSequenceNames = new TreeSet < String > ( ) ;
for ( SAMSequenceRecord dictionaryEntry : referenceDictionary . getSequences ( ) )
referenceSequenceNames . add ( dictionaryEntry . getSequenceName ( ) ) ;
2009-07-14 10:03:36 +08:00
if ( readsSequenceNames . size ( ) = = 0 ) {
logger . info ( "Reads file is unmapped. Skipping validation against reference." ) ;
return ;
}
2009-07-14 04:42:12 +08:00
// If there's no overlap between reads and reference, data will be bogus. Throw an exception.
Set < String > intersectingSequenceNames = new HashSet < String > ( readsSequenceNames ) ;
intersectingSequenceNames . retainAll ( referenceSequenceNames ) ;
if ( intersectingSequenceNames . size ( ) = = 0 ) {
StringBuilder error = new StringBuilder ( ) ;
error . append ( "No overlap exists between sequence dictionary of the reads and the sequence dictionary of the reference. Perhaps you're using the wrong reference?\n" ) ;
error . append ( System . getProperty ( "line.separator" ) ) ;
error . append ( String . format ( "Reads contigs: %s%n" , prettyPrintSequenceRecords ( readsDictionary ) ) ) ;
error . append ( String . format ( "Reference contigs: %s%n" , prettyPrintSequenceRecords ( referenceDictionary ) ) ) ;
logger . error ( error . toString ( ) ) ;
Utils . scareUser ( "No overlap exists between sequence dictionary of the reads and the sequence dictionary of the reference." ) ;
}
2009-07-15 00:21:22 +08:00
// If the two datasets are not equal and neither is a strict subset of the other, warn the user.
if ( ! readsSequenceNames . equals ( referenceSequenceNames ) & &
! readsSequenceNames . containsAll ( referenceSequenceNames ) & &
! referenceSequenceNames . containsAll ( readsSequenceNames ) ) {
2009-07-14 04:42:12 +08:00
StringBuilder warning = new StringBuilder ( ) ;
warning . append ( "Limited overlap exists between sequence dictionary of the reads and the sequence dictionary of the reference. Perhaps you're using the wrong reference?\n" ) ;
warning . append ( System . getProperty ( "line.separator" ) ) ;
warning . append ( String . format ( "Reads contigs: %s%n" , prettyPrintSequenceRecords ( readsDictionary ) ) ) ;
warning . append ( String . format ( "Reference contigs: %s%n" , prettyPrintSequenceRecords ( referenceDictionary ) ) ) ;
logger . warn ( warning . toString ( ) ) ;
}
}
private String prettyPrintSequenceRecords ( SAMSequenceDictionary sequenceDictionary ) {
String [ ] sequenceRecordNames = new String [ sequenceDictionary . size ( ) ] ;
int sequenceRecordIndex = 0 ;
for ( SAMSequenceRecord sequenceRecord : sequenceDictionary . getSequences ( ) )
sequenceRecordNames [ sequenceRecordIndex + + ] = sequenceRecord . getSequenceName ( ) ;
return Arrays . deepToString ( sequenceRecordNames ) ;
}
2009-04-27 01:42:00 +08:00
}