/*
 * Copyright (c) 2009 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
package org.broadinstitute.sting.gatk;

import net.sf.picard.filter.SamRecordFilter;
import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.picard.sam.SamFileHeaderMerger;
import net.sf.samtools.*;

import org.apache.log4j.Logger;

import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategy;
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategyFactory;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource;
import org.broadinstitute.sting.gatk.executive.MicroScheduler;
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
import org.broadinstitute.sting.gatk.io.OutputTracker;
import org.broadinstitute.sting.gatk.io.stubs.Stub;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.bed.BedParser;
import org.broadinstitute.sting.utils.cmdLine.ArgumentException;
import org.broadinstitute.sting.utils.cmdLine.ArgumentSource;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;

import java.io.File;
import java.io.FileNotFoundException;
import java.util.*;
2009-05-11 10:07:20 +08:00
public class GenomeAnalysisEngine {
// our instance of this genome analysis toolkit; it's used by other classes to extract the traversal engine
// TODO: public static without final tends to indicate we're thinking about this the wrong way
public static GenomeAnalysisEngine instance ;
2009-07-30 07:00:15 +08:00
/ * *
* Accessor for sharded read data .
* /
private SAMDataSource readsDataSource = null ;
/ * *
* Accessor for sharded reference data .
* /
private IndexedFastaSequenceFile referenceDataSource = null ;
/ * *
* Accessor for sharded reference - ordered data .
* /
private List < ReferenceOrderedDataSource > rodDataSources ;
2009-05-11 10:07:20 +08:00
// our argument collection
2009-07-10 07:59:53 +08:00
private GATKArgumentCollection argCollection ;
2009-05-11 10:07:20 +08:00
2009-10-06 10:45:31 +08:00
/ * *
* Collection of inputs used by the walker .
* /
private Map < ArgumentSource , Object > inputs = new HashMap < ArgumentSource , Object > ( ) ;
2009-08-23 08:56:02 +08:00
2009-10-06 10:45:31 +08:00
/ * *
* Collection of outputs used by the walker .
* /
2009-08-23 08:56:02 +08:00
private Collection < Stub < ? > > outputs = new ArrayList < Stub < ? > > ( ) ;
2009-05-11 10:07:20 +08:00
2009-10-06 10:45:31 +08:00
/ * *
* our log , which we want to capture anything from this class
* /
2009-05-11 10:07:20 +08:00
private static Logger logger = Logger . getLogger ( GenomeAnalysisEngine . class ) ;
2009-10-06 10:45:31 +08:00
/ * *
* our walker manager
* /
2009-07-10 07:59:53 +08:00
private final WalkerManager walkerManager ;
2009-05-11 10:07:20 +08:00
/ * *
* our constructor , where all the work is done
* < p / >
* legacy traversal types are sent to legacyTraversal function ; as we move more of the traversals to the
* new MicroScheduler class we ' ll be able to delete that function .
* /
2009-07-22 02:32:22 +08:00
public GenomeAnalysisEngine ( ) {
2009-07-10 07:59:53 +08:00
// make sure our instance variable points to this analysis engine
instance = this ;
2009-07-22 02:32:22 +08:00
walkerManager = new WalkerManager ( ) ;
2009-07-10 07:59:53 +08:00
}
2009-05-11 10:07:20 +08:00
2009-07-10 07:59:53 +08:00
/ * *
* Actually run the GATK with the specified walker .
2009-10-06 10:45:31 +08:00
*
2009-07-10 07:59:53 +08:00
* @param args the argument collection , where we get all our setup information from
* @param my_walker Walker to run over the dataset . Must not be null .
2009-09-30 06:23:19 +08:00
* @return the value of this traversal .
2009-07-10 07:59:53 +08:00
* /
2009-10-06 10:45:31 +08:00
public Object execute ( GATKArgumentCollection args , Walker < ? , ? > my_walker ) {
2009-05-11 10:07:20 +08:00
// validate our parameters
2009-07-10 07:59:53 +08:00
if ( args = = null ) {
2009-08-21 13:35:49 +08:00
throw new StingException ( "The GATKArgumentCollection passed to GenomeAnalysisEngine can not be null." ) ;
2009-05-11 10:07:20 +08:00
}
2009-07-10 07:59:53 +08:00
// validate our parameters
if ( my_walker = = null )
2009-08-21 13:35:49 +08:00
throw new StingException ( "The walker passed to GenomeAnalysisEngine can not be null." ) ;
2009-07-10 07:59:53 +08:00
2009-05-11 10:07:20 +08:00
// save our argument parameter
this . argCollection = args ;
2009-07-30 07:00:15 +08:00
// Prepare the data for traversal.
2009-10-06 10:45:31 +08:00
initializeDataSources ( my_walker , argCollection ) ;
2009-05-20 07:26:17 +08:00
2009-05-11 10:07:20 +08:00
// our microscheduler, which is in charge of running everything
2009-07-30 07:00:15 +08:00
MicroScheduler microScheduler = createMicroscheduler ( my_walker ) ;
2009-05-11 10:07:20 +08:00
2009-07-30 00:11:45 +08:00
// create the output streams
2009-08-23 08:56:02 +08:00
initializeOutputStreams ( my_walker , microScheduler . getOutputTracker ( ) ) ;
2009-05-11 10:07:20 +08:00
2009-05-27 04:57:46 +08:00
GenomeLocSortedSet locs = null ;
2009-07-10 05:57:00 +08:00
if ( argCollection . intervals ! = null ) {
locs = GenomeLocSortedSet . createSetFromList ( parseIntervalRegion ( argCollection . intervals ) ) ;
}
2009-07-30 00:11:45 +08:00
2009-07-30 07:00:15 +08:00
ShardStrategy shardStrategy = getShardStrategy ( my_walker , microScheduler . getReference ( ) , locs , argCollection . maximumEngineIterations ) ;
2009-07-30 00:11:45 +08:00
// execute the microscheduler, storing the results
2009-07-30 07:00:15 +08:00
return microScheduler . execute ( my_walker , shardStrategy , argCollection . maximumEngineIterations ) ;
2009-07-30 00:11:45 +08:00
}
/ * *
2009-08-23 08:56:02 +08:00
* Add additional , externally managed IO streams for walker input .
2009-10-06 10:45:31 +08:00
*
2009-08-23 08:56:02 +08:00
* @param argumentSource Field in the walker into which to inject the value .
2009-10-06 10:45:31 +08:00
* @param value Instance to inject .
2009-07-30 00:11:45 +08:00
* /
2009-10-06 10:45:31 +08:00
public void addInput ( ArgumentSource argumentSource , Object value ) {
inputs . put ( argumentSource , value ) ;
2009-08-23 08:56:02 +08:00
}
/ * *
* Add additional , externally managed IO streams for walker output .
2009-10-06 10:45:31 +08:00
*
2009-08-23 08:56:02 +08:00
* @param stub Instance to inject .
* /
2009-10-06 10:45:31 +08:00
public void addOutput ( Stub < ? > stub ) {
2009-08-23 08:56:02 +08:00
outputs . add ( stub ) ;
2009-05-11 10:07:20 +08:00
}
2009-07-22 06:23:28 +08:00
/ * *
* Gets a set of the names of all walkers that the GATK has discovered .
2009-10-06 10:45:31 +08:00
*
2009-07-22 06:23:28 +08:00
* @return A set of the names of all discovered walkers .
* /
public Set < String > getWalkerNames ( ) {
return walkerManager . getWalkerNames ( ) ;
}
2009-07-10 07:59:53 +08:00
/ * *
* Retrieves an instance of the walker based on the walker name .
2009-10-06 10:45:31 +08:00
*
2009-07-10 07:59:53 +08:00
* @param walkerName Name of the walker . Must not be null . If the walker cannot be instantiated , an exception will be thrown .
* @return An instance of the walker .
* /
2009-10-06 10:45:31 +08:00
public Walker < ? , ? > getWalkerByName ( String walkerName ) {
2009-07-22 06:23:28 +08:00
return walkerManager . createWalkerByName ( walkerName ) ;
2009-07-10 07:59:53 +08:00
}
2009-10-06 10:45:31 +08:00
private void initializeDataSources ( Walker my_walker , GATKArgumentCollection argCollection ) {
validateSuppliedReadsAgainstWalker ( my_walker , argCollection ) ;
2009-07-30 07:00:15 +08:00
logger . info ( "Strictness is " + argCollection . strictnessLevel ) ;
2009-10-06 10:45:31 +08:00
readsDataSource = createReadsDataSource ( extractSourceInfo ( my_walker , argCollection ) ) ;
2009-07-30 07:00:15 +08:00
2009-10-06 10:45:31 +08:00
validateSuppliedReferenceAgainstWalker ( my_walker , argCollection ) ;
2009-07-30 07:00:15 +08:00
referenceDataSource = openReferenceSequenceFile ( argCollection . referenceFile ) ;
validateReadsAndReferenceAreCompatible ( readsDataSource , referenceDataSource ) ;
// our reference ordered data collection
List < ReferenceOrderedData < ? extends ReferenceOrderedDatum > > rods = new ArrayList < ReferenceOrderedData < ? extends ReferenceOrderedDatum > > ( ) ;
//
// please don't use these in the future, use the new syntax <- if we're not using these please remove them
//
if ( argCollection . DBSNPFile ! = null ) bindConvenienceRods ( "dbSNP" , "dbsnp" , argCollection . DBSNPFile ) ;
if ( argCollection . HAPMAPFile ! = null )
bindConvenienceRods ( "hapmap" , "HapMapAlleleFrequencies" , argCollection . HAPMAPFile ) ;
if ( argCollection . HAPMAPChipFile ! = null )
bindConvenienceRods ( "hapmap-chip" , "GFF" , argCollection . HAPMAPChipFile ) ;
// TODO: The ROD iterator currently does not understand multiple intervals file. Fix this by cleaning the ROD system.
if ( argCollection . intervals ! = null & & argCollection . intervals . size ( ) = = 1 ) {
bindConvenienceRods ( "interval" , "Intervals" , argCollection . intervals . get ( 0 ) . replaceAll ( "," , "" ) ) ;
}
// parse out the rod bindings
2009-08-12 06:10:20 +08:00
ReferenceOrderedData . parseBindings ( argCollection . RODBindings , rods ) ;
2009-07-30 07:00:15 +08:00
2009-10-06 10:45:31 +08:00
validateSuppliedReferenceOrderedDataAgainstWalker ( my_walker , rods ) ;
2009-07-30 07:00:15 +08:00
rodDataSources = getReferenceOrderedDataSources ( rods ) ;
}
2009-07-10 07:59:53 +08:00
2009-05-11 10:07:20 +08:00
/ * *
* setup a microscheduler
2009-10-06 10:45:31 +08:00
*
2009-05-11 10:07:20 +08:00
* @param my_walker our walker of type LocusWalker
* @return a new microscheduler
* /
2009-07-30 07:00:15 +08:00
private MicroScheduler createMicroscheduler ( Walker my_walker ) {
2009-05-11 10:07:20 +08:00
// the mircoscheduler to return
MicroScheduler microScheduler = null ;
// we need to verify different parameter based on the walker type
2009-06-26 06:51:38 +08:00
if ( my_walker instanceof LocusWalker | | my_walker instanceof LocusWindowWalker ) {
2009-05-11 10:07:20 +08:00
// create the MicroScheduler
2009-07-30 00:11:45 +08:00
microScheduler = MicroScheduler . create ( my_walker , readsDataSource , referenceDataSource , rodDataSources , argCollection . numberOfThreads ) ;
2009-07-10 05:57:00 +08:00
} else if ( my_walker instanceof ReadWalker | | my_walker instanceof DuplicateWalker ) {
2009-05-11 10:07:20 +08:00
if ( argCollection . referenceFile = = null )
2009-06-23 05:11:18 +08:00
Utils . scareUser ( String . format ( "Read-based traversals require a reference file but none was given" ) ) ;
2009-07-30 00:11:45 +08:00
microScheduler = MicroScheduler . create ( my_walker , readsDataSource , referenceDataSource , rodDataSources , argCollection . numberOfThreads ) ;
2009-06-23 05:11:18 +08:00
} else {
2009-09-30 06:23:19 +08:00
Utils . scareUser ( String . format ( "Unable to create the appropriate TraversalEngine for analysis type %s" , WalkerManager . getWalkerName ( my_walker . getClass ( ) ) ) ) ;
2009-05-11 10:07:20 +08:00
}
return microScheduler ;
}
/ * *
* setup the interval regions , from either the interval file of the genome region string
*
2009-07-10 06:10:22 +08:00
* @param intervals the list of intervals to parse
2009-05-11 10:07:20 +08:00
* @return a list of genomeLoc representing the interval file
* /
2009-07-10 05:57:00 +08:00
public static List < GenomeLoc > parseIntervalRegion ( final List < String > intervals ) {
2009-06-26 04:44:23 +08:00
List < GenomeLoc > locs = new ArrayList < GenomeLoc > ( ) ;
2009-07-10 05:57:00 +08:00
for ( String interval : intervals ) {
2009-06-26 04:44:23 +08:00
if ( new File ( interval ) . exists ( ) ) {
2009-10-06 10:45:31 +08:00
// support for the bed style interval format
if ( interval . endsWith ( ".bed" ) | | interval . endsWith ( ".BED" ) ) {
2009-11-03 05:09:18 +08:00
Utils . warnUser ( "Bed files are zero based half open intervals, which are converted to one based closed intervals in the GATK. " +
"Be aware that all output information and intervals are one based closed intervals." ) ;
2009-10-06 10:45:31 +08:00
BedParser parser = new BedParser ( new File ( interval ) ) ;
locs . addAll ( parser . getSortedAndMergedLocations ( ) ) ;
} else {
locs . addAll ( GenomeLocParser . intervalFileToList ( interval ) ) ;
}
2009-05-12 23:33:55 +08:00
} else {
2009-06-26 04:44:23 +08:00
locs . addAll ( GenomeLocParser . parseGenomeLocs ( interval ) ) ;
2009-05-12 23:33:55 +08:00
}
2009-06-26 04:44:23 +08:00
2009-05-12 09:04:18 +08:00
}
2009-05-12 23:33:55 +08:00
return locs ;
2009-05-11 10:07:20 +08:00
}
2009-10-06 10:45:31 +08:00
/ * *
* Returns sets of samples present in the ( merged ) input SAM stream , grouped by readers ( i . e . underlying
* individual bam files ) . For instance : if GATK is run with three input bam files ( three - I arguments ) , then the list
* returned by this method will contain 3 elements ( one for each reader ) , with each element being a set of sample names
* found in the corresponding bam file .
*
* @return
* /
public List < Set < String > > getSamplesByReaders ( ) {
SamFileHeaderMerger hm = getDataSource ( ) . getHeaderMerger ( ) ;
List < Set < String > > sample_sets = new ArrayList < Set < String > > ( hm . getReaders ( ) . size ( ) ) ;
for ( SAMFileReader r : hm . getReaders ( ) ) {
Set < String > samples = new HashSet < String > ( 1 ) ;
sample_sets . add ( samples ) ;
for ( SAMReadGroupRecord g : r . getFileHeader ( ) . getReadGroups ( ) ) {
samples . add ( g . getSample ( ) ) ;
}
}
return sample_sets ;
}
/ * *
* Returns sets of libraries present in the ( merged ) input SAM stream , grouped by readers ( i . e . underlying
* individual bam files ) . For instance : if GATK is run with three input bam files ( three - I arguments ) , then the list
* returned by this method will contain 3 elements ( one for each reader ) , with each element being a set of library names
* found in the corresponding bam file .
*
* @return
* /
public List < Set < String > > getLibrariesByReaders ( ) {
SamFileHeaderMerger hm = getDataSource ( ) . getHeaderMerger ( ) ;
List < Set < String > > lib_sets = new ArrayList < Set < String > > ( hm . getReaders ( ) . size ( ) ) ;
for ( SAMFileReader r : hm . getReaders ( ) ) {
Set < String > libs = new HashSet < String > ( 2 ) ;
lib_sets . add ( libs ) ;
for ( SAMReadGroupRecord g : r . getFileHeader ( ) . getReadGroups ( ) ) {
libs . add ( g . getLibrary ( ) ) ;
}
}
return lib_sets ;
}
/ * *
* Returns sets of ( remapped ) read groups in input SAM stream , grouped by readers ( i . e . underlying
* individual bam files ) . For instance : if GATK is run with three input bam files ( three - I arguments ) , then the list
* returned by this method will contain 3 elements ( one for each reader ) , with each element being a set of remapped read groups
* ( i . e . as seen by read . getReadGroup ( ) . getReadGroupId ( ) in the merged stream ) that come from the corresponding bam file .
*
* @return
* /
public List < Set < String > > getMergedReadGroupsByReaders ( ) {
SamFileHeaderMerger hm = getDataSource ( ) . getHeaderMerger ( ) ;
List < Set < String > > rg_sets = new ArrayList < Set < String > > ( hm . getReaders ( ) . size ( ) ) ;
for ( SAMFileReader r : hm . getReaders ( ) ) {
Set < String > groups = new HashSet < String > ( 5 ) ;
rg_sets . add ( groups ) ;
for ( SAMReadGroupRecord g : r . getFileHeader ( ) . getReadGroups ( ) ) {
if ( hm . hasGroupIdDuplicates ( ) ) { // Check if there were read group clashes with hasGroupIdDuplicates and if so:
// use HeaderMerger to translate original read group id from the reader into the read group id in the
// merged stream, and save that remapped read group id to associate it with specific reader
groups . add ( hm . getReadGroupId ( r , g . getReadGroupId ( ) ) ) ;
} else {
// otherwise, pass through the unmapped read groups since this is what Picard does as well
groups . add ( g . getReadGroupId ( ) ) ;
}
}
}
return rg_sets ;
}
2009-06-10 21:39:32 +08:00
/ * *
* Bundles all the source information about the reads into a unified data structure .
2009-07-10 05:57:00 +08:00
*
2009-10-06 10:45:31 +08:00
* @param walker The walker for which to extract info .
2009-06-10 21:39:32 +08:00
* @param argCollection The collection of arguments passed to the engine .
* @return The reads object providing reads source info .
* /
2009-10-06 10:45:31 +08:00
private Reads extractSourceInfo ( Walker walker , GATKArgumentCollection argCollection ) {
2009-07-10 07:59:53 +08:00
List < SamRecordFilter > filters = new ArrayList < SamRecordFilter > ( ) ;
2009-10-06 10:45:31 +08:00
filters . addAll ( WalkerManager . getReadFilters ( walker ) ) ;
if ( argCollection . filterZeroMappingQualityReads ! = null & & argCollection . filterZeroMappingQualityReads )
filters . add ( new ZeroMappingQualityReadFilter ( ) ) ;
return new Reads ( argCollection . samFiles ,
argCollection . strictnessLevel ,
argCollection . downsampleFraction ,
argCollection . downsampleCoverage ,
! argCollection . unsafe ,
filters ,
argCollection . readMaxPileup ,
walker . includeReadsWithDeletionAtLoci ( ) ) ;
2009-06-10 21:39:32 +08:00
}
2009-07-30 07:00:15 +08:00
/ * *
* Verifies that the supplied set of reads files mesh with what the walker says it requires .
2009-10-06 10:45:31 +08:00
*
* @param walker Walker to test .
2009-07-30 07:00:15 +08:00
* @param arguments Supplied reads files .
* /
2009-10-06 10:45:31 +08:00
private void validateSuppliedReadsAgainstWalker ( Walker walker , GATKArgumentCollection arguments ) {
2009-05-20 07:26:17 +08:00
// Check what the walker says is required against what was provided on the command line.
2009-07-10 05:57:00 +08:00
if ( WalkerManager . isRequired ( walker , DataSource . READS ) & & ( arguments . samFiles = = null | | arguments . samFiles . size ( ) = = 0 ) )
2009-07-30 07:00:15 +08:00
throw new ArgumentException ( "Walker requires reads but none were provided. If this is incorrect, alter the walker's @Requires annotation." ) ;
2009-05-20 07:26:17 +08:00
// Check what the walker says is allowed against what was provided on the command line.
2009-07-10 05:57:00 +08:00
if ( ( arguments . samFiles ! = null & & arguments . samFiles . size ( ) > 0 ) & & ! WalkerManager . isAllowed ( walker , DataSource . READS ) )
2009-07-30 07:00:15 +08:00
throw new ArgumentException ( "Walker does not allow reads but reads were provided. If this is incorrect, alter the walker's @Allows annotation" ) ;
}
/ * *
* Verifies that the supplied reference file mesh with what the walker says it requires .
2009-10-06 10:45:31 +08:00
*
* @param walker Walker to test .
2009-07-30 07:00:15 +08:00
* @param arguments Supplied reads files .
* /
2009-10-06 10:45:31 +08:00
private void validateSuppliedReferenceAgainstWalker ( Walker walker , GATKArgumentCollection arguments ) {
2009-07-30 07:00:15 +08:00
// Check what the walker says is required against what was provided on the command line.
if ( WalkerManager . isRequired ( walker , DataSource . REFERENCE ) & & arguments . referenceFile = = null )
throw new ArgumentException ( "Walker requires a reference but none was provided. If this is incorrect, alter the walker's @Requires annotation." ) ;
// Check what the walker says is allowed against what was provided on the command line.
2009-07-10 05:57:00 +08:00
if ( arguments . referenceFile ! = null & & ! WalkerManager . isAllowed ( walker , DataSource . REFERENCE ) )
2009-07-30 07:00:15 +08:00
throw new ArgumentException ( "Walker does not allow a reference but one was provided. If this is incorrect, alter the walker's @Allows annotation" ) ;
}
2009-05-20 07:26:17 +08:00
2009-07-30 07:00:15 +08:00
/ * *
* Verifies that all required reference - ordered data has been supplied , and any reference - ordered data that was not
* ' allowed ' is still present .
2009-10-06 10:45:31 +08:00
*
2009-07-30 07:00:15 +08:00
* @param walker Walker to test .
2009-10-06 10:45:31 +08:00
* @param rods Reference - ordered data to load .
2009-07-30 07:00:15 +08:00
* /
2009-10-06 10:45:31 +08:00
private void validateSuppliedReferenceOrderedDataAgainstWalker ( Walker walker , List < ReferenceOrderedData < ? extends ReferenceOrderedDatum > > rods ) {
2009-05-20 07:26:17 +08:00
// Check to make sure that all required metadata is present.
List < RMD > allRequired = WalkerManager . getRequiredMetaData ( walker ) ;
2009-07-10 05:57:00 +08:00
for ( RMD required : allRequired ) {
2009-05-20 07:26:17 +08:00
boolean found = false ;
2009-07-10 05:57:00 +08:00
for ( ReferenceOrderedData < ? extends ReferenceOrderedDatum > rod : rods ) {
if ( rod . matches ( required . name ( ) , required . type ( ) ) )
2009-05-20 07:26:17 +08:00
found = true ;
}
2009-07-10 05:57:00 +08:00
if ( ! found )
throw new ArgumentException ( String . format ( "Unable to find reference metadata (%s,%s)" , required . name ( ) , required . type ( ) ) ) ;
2009-05-20 07:26:17 +08:00
}
// Check to see that no forbidden rods are present.
2009-07-10 05:57:00 +08:00
for ( ReferenceOrderedData < ? extends ReferenceOrderedDatum > rod : rods ) {
if ( ! WalkerManager . isAllowed ( walker , rod ) )
2009-10-16 23:39:08 +08:00
throw new ArgumentException ( String . format ( "Walker of type %s does not allow access to metadata: %s. If this is incorrect, change the @Allows metadata" , walker . getClass ( ) , rod . getName ( ) ) ) ;
2009-07-30 07:00:15 +08:00
}
}
/ * *
* Now that all files are open , validate the sequence dictionaries of the reads vs . the reference .
2009-10-06 10:45:31 +08:00
*
* @param reads Reads data source .
2009-07-30 07:00:15 +08:00
* @param reference Reference data source .
* /
2009-10-06 10:45:31 +08:00
private void validateReadsAndReferenceAreCompatible ( SAMDataSource reads , ReferenceSequenceFile reference ) {
if ( reads = = null | | reference = = null )
2009-07-30 07:00:15 +08:00
return ;
// Compile a set of sequence names that exist in the BAM files.
SAMSequenceDictionary readsDictionary = reads . getHeader ( ) . getSequenceDictionary ( ) ;
Set < String > readsSequenceNames = new TreeSet < String > ( ) ;
2009-10-06 10:45:31 +08:00
for ( SAMSequenceRecord dictionaryEntry : readsDictionary . getSequences ( ) )
2009-07-30 07:00:15 +08:00
readsSequenceNames . add ( dictionaryEntry . getSequenceName ( ) ) ;
// Compile a set of sequence names that exist in the reference file.
SAMSequenceDictionary referenceDictionary = reference . getSequenceDictionary ( ) ;
Set < String > referenceSequenceNames = new TreeSet < String > ( ) ;
2009-10-06 10:45:31 +08:00
for ( SAMSequenceRecord dictionaryEntry : referenceDictionary . getSequences ( ) )
2009-07-30 07:00:15 +08:00
referenceSequenceNames . add ( dictionaryEntry . getSequenceName ( ) ) ;
2009-10-06 10:45:31 +08:00
if ( readsSequenceNames . size ( ) = = 0 ) {
2009-07-30 07:00:15 +08:00
logger . info ( "Reads file is unmapped. Skipping validation against reference." ) ;
return ;
}
// If there's no overlap between reads and reference, data will be bogus. Throw an exception.
Set < String > intersectingSequenceNames = new HashSet < String > ( readsSequenceNames ) ;
intersectingSequenceNames . retainAll ( referenceSequenceNames ) ;
2009-10-06 10:45:31 +08:00
if ( intersectingSequenceNames . size ( ) = = 0 ) {
2009-07-30 07:00:15 +08:00
StringBuilder error = new StringBuilder ( ) ;
error . append ( "No overlap exists between sequence dictionary of the reads and the sequence dictionary of the reference. Perhaps you're using the wrong reference?\n" ) ;
error . append ( System . getProperty ( "line.separator" ) ) ;
error . append ( String . format ( "Reads contigs: %s%n" , prettyPrintSequenceRecords ( readsDictionary ) ) ) ;
error . append ( String . format ( "Reference contigs: %s%n" , prettyPrintSequenceRecords ( referenceDictionary ) ) ) ;
logger . error ( error . toString ( ) ) ;
Utils . scareUser ( "No overlap exists between sequence dictionary of the reads and the sequence dictionary of the reference." ) ;
}
// If the two datasets are not equal and neither is a strict subset of the other, warn the user.
2009-10-06 10:45:31 +08:00
if ( ! readsSequenceNames . equals ( referenceSequenceNames ) & &
! readsSequenceNames . containsAll ( referenceSequenceNames ) & &
! referenceSequenceNames . containsAll ( readsSequenceNames ) ) {
2009-07-30 07:00:15 +08:00
StringBuilder warning = new StringBuilder ( ) ;
warning . append ( "Limited overlap exists between sequence dictionary of the reads and the sequence dictionary of the reference. Perhaps you're using the wrong reference?\n" ) ;
warning . append ( System . getProperty ( "line.separator" ) ) ;
warning . append ( String . format ( "Reads contigs: %s%n" , prettyPrintSequenceRecords ( readsDictionary ) ) ) ;
warning . append ( String . format ( "Reference contigs: %s%n" , prettyPrintSequenceRecords ( referenceDictionary ) ) ) ;
logger . warn ( warning . toString ( ) ) ;
2009-05-20 07:26:17 +08:00
}
}
2009-10-06 10:45:31 +08:00
private String prettyPrintSequenceRecords ( SAMSequenceDictionary sequenceDictionary ) {
String [ ] sequenceRecordNames = new String [ sequenceDictionary . size ( ) ] ;
2009-07-30 07:00:15 +08:00
int sequenceRecordIndex = 0 ;
2009-10-06 10:45:31 +08:00
for ( SAMSequenceRecord sequenceRecord : sequenceDictionary . getSequences ( ) )
2009-07-30 07:00:15 +08:00
sequenceRecordNames [ sequenceRecordIndex + + ] = sequenceRecord . getSequenceName ( ) ;
return Arrays . deepToString ( sequenceRecordNames ) ;
}
2009-05-11 10:07:20 +08:00
/ * *
* Convenience function that binds RODs using the old - style command line parser to the new style list for
* a uniform processing .
*
2009-07-10 06:10:22 +08:00
* @param name the name of the rod
* @param type its type
* @param file the file to load the rod from
2009-05-11 10:07:20 +08:00
* /
private void bindConvenienceRods ( final String name , final String type , final String file ) {
argCollection . RODBindings . add ( Utils . join ( "," , new String [ ] { name , type , file } ) ) ;
}
2009-07-30 00:11:45 +08:00
/ * *
* Get the sharding strategy given a driving data source .
*
* @param walker Walker for which to infer sharding strategy .
* @param drivingDataSource Data on which to shard .
* @param intervals Intervals to use when limiting sharding .
* @param maxIterations the maximum number of iterations to run through
* @return Sharding strategy for this driving data source .
* /
protected ShardStrategy getShardStrategy ( Walker walker ,
ReferenceSequenceFile drivingDataSource ,
GenomeLocSortedSet intervals ,
Integer maxIterations ) {
2009-09-13 03:13:15 +08:00
long SHARD_SIZE = 100000L ;
2009-07-30 00:11:45 +08:00
ShardStrategy shardStrategy = null ;
ShardStrategyFactory . SHATTER_STRATEGY shardType ;
2009-09-13 03:13:15 +08:00
2009-07-30 00:11:45 +08:00
if ( walker instanceof LocusWalker ) {
2009-10-21 07:31:13 +08:00
if ( walker instanceof RodWalker ) SHARD_SIZE * = 1000 ;
2009-09-13 03:13:15 +08:00
2009-07-30 00:11:45 +08:00
if ( intervals ! = null ) {
shardType = ( walker . isReduceByInterval ( ) ) ?
ShardStrategyFactory . SHATTER_STRATEGY . INTERVAL :
ShardStrategyFactory . SHATTER_STRATEGY . LINEAR ;
shardStrategy = ShardStrategyFactory . shatter ( shardType ,
drivingDataSource . getSequenceDictionary ( ) ,
SHARD_SIZE ,
intervals , maxIterations ) ;
} else
shardStrategy = ShardStrategyFactory . shatter ( ShardStrategyFactory . SHATTER_STRATEGY . LINEAR ,
drivingDataSource . getSequenceDictionary ( ) ,
SHARD_SIZE , maxIterations ) ;
} else if ( walker instanceof ReadWalker | |
walker instanceof DuplicateWalker ) {
shardType = ShardStrategyFactory . SHATTER_STRATEGY . READS ;
if ( intervals ! = null ) {
shardStrategy = ShardStrategyFactory . shatter ( shardType ,
drivingDataSource . getSequenceDictionary ( ) ,
SHARD_SIZE ,
intervals , maxIterations ) ;
} else {
shardStrategy = ShardStrategyFactory . shatter ( shardType ,
drivingDataSource . getSequenceDictionary ( ) ,
SHARD_SIZE , maxIterations ) ;
}
} else if ( walker instanceof LocusWindowWalker ) {
2009-10-06 10:45:31 +08:00
if ( intervals = = null )
2009-07-30 00:11:45 +08:00
throw new StingException ( "Unable to shard: walker is of type LocusWindow, but no intervals were provided" ) ;
shardStrategy = ShardStrategyFactory . shatter ( ShardStrategyFactory . SHATTER_STRATEGY . INTERVAL ,
drivingDataSource . getSequenceDictionary ( ) ,
SHARD_SIZE ,
intervals , maxIterations ) ;
} else
throw new StingException ( "Unable to support walker of type" + walker . getClass ( ) . getName ( ) ) ;
return shardStrategy ;
}
/ * *
* Gets a data source for the given set of reads .
*
2009-10-06 10:45:31 +08:00
* @param reads the read source information
2009-07-30 00:11:45 +08:00
* @return A data source for the given set of reads .
* /
private SAMDataSource createReadsDataSource ( Reads reads ) {
// By reference traversals are happy with no reads. Make sure that case is handled.
if ( reads . getReadsFiles ( ) . size ( ) = = 0 )
return null ;
SAMDataSource dataSource = new SAMDataSource ( reads ) ;
return dataSource ;
}
/ * *
* Opens a reference sequence file paired with an index .
*
* @param refFile Handle to a reference sequence file . Non - null .
* @return A thread - safe file wrapper .
* /
private IndexedFastaSequenceFile openReferenceSequenceFile ( File refFile ) {
IndexedFastaSequenceFile ref = null ;
try {
ref = new IndexedFastaSequenceFile ( refFile ) ;
}
catch ( FileNotFoundException ex ) {
throw new StingException ( "I/O error while opening fasta file: " + ex . getMessage ( ) , ex ) ;
}
GenomeLocParser . setupRefContigOrdering ( ref ) ;
return ref ;
}
/ * *
* Open the reference - ordered data sources .
*
* @param rods the reference order data to execute using
* @return A list of reference - ordered data sources .
* /
private List < ReferenceOrderedDataSource > getReferenceOrderedDataSources ( List < ReferenceOrderedData < ? extends ReferenceOrderedDatum > > rods ) {
List < ReferenceOrderedDataSource > dataSources = new ArrayList < ReferenceOrderedDataSource > ( ) ;
for ( ReferenceOrderedData < ? extends ReferenceOrderedDatum > rod : rods )
dataSources . add ( new ReferenceOrderedDataSource ( rod ) ) ;
return dataSources ;
}
2009-05-11 10:07:20 +08:00
2009-07-10 06:10:22 +08:00
/ * *
* Initialize the output streams as specified by the user .
*
2009-10-06 10:45:31 +08:00
* @param walker the walker to initialize output streams for
2009-09-30 06:23:19 +08:00
* @param outputTracker the tracker supplying the initialization data .
2009-07-10 06:10:22 +08:00
* /
2009-08-23 08:56:02 +08:00
private void initializeOutputStreams ( Walker walker , OutputTracker outputTracker ) {
2009-10-06 10:45:31 +08:00
if ( argCollection . outErrFileName ! = null )
outputTracker . initializeCoreIO ( argCollection . outErrFileName , argCollection . outErrFileName ) ;
2009-07-30 00:11:45 +08:00
else
2009-10-06 10:45:31 +08:00
outputTracker . initializeCoreIO ( argCollection . outFileName , argCollection . errFileName ) ;
2009-05-11 10:07:20 +08:00
2009-10-06 10:45:31 +08:00
for ( Map . Entry < ArgumentSource , Object > input : inputs . entrySet ( ) )
outputTracker . addInput ( input . getKey ( ) , input . getValue ( ) ) ;
for ( Stub < ? > stub : outputs )
2009-08-23 08:56:02 +08:00
outputTracker . addOutput ( stub ) ;
outputTracker . prepareWalker ( walker ) ;
2009-05-11 10:07:20 +08:00
}
2009-07-30 07:00:15 +08:00
public SAMFileHeader getSAMFileHeader ( ) {
return readsDataSource . getHeader ( ) ;
2009-05-11 10:07:20 +08:00
}
2009-05-14 21:55:52 +08:00
2009-07-25 06:59:49 +08:00
/ * *
* Returns data source object encapsulating all essential info and handlers used to traverse
* reads ; header merger , individual file readers etc can be accessed through the returned data source object .
2009-10-06 10:45:31 +08:00
*
2009-07-25 06:59:49 +08:00
* @return
* /
public SAMDataSource getDataSource ( ) {
2009-07-30 07:00:15 +08:00
return this . readsDataSource ;
2009-07-25 06:59:49 +08:00
}
2009-07-10 06:10:22 +08:00
/ * *
* Gets the collection of GATK main application arguments for enhanced walker validation .
*
* @return the GATK argument collection
* /
2009-05-14 21:55:52 +08:00
public GATKArgumentCollection getArguments ( ) {
return this . argCollection ;
}
2009-05-11 10:07:20 +08:00
}