2009-03-27 23:40:45 +08:00
package org.broadinstitute.sting.gatk.traversals ;
2009-02-27 05:50:29 +08:00
2009-05-29 04:13:01 +08:00
import net.sf.picard.filter.SamRecordFilter ;
2009-05-29 22:51:08 +08:00
import net.sf.picard.filter.FilteringIterator ;
2009-05-29 04:13:01 +08:00
import net.sf.picard.reference.ReferenceSequence ;
2009-03-24 04:27:21 +08:00
import net.sf.samtools.SAMFileHeader ;
import net.sf.samtools.SAMFileReader ;
import net.sf.samtools.SAMFileReader.ValidationStringency ;
import net.sf.samtools.SAMRecord ;
2009-06-20 06:04:47 +08:00
import net.sf.samtools.SAMReadGroupRecord ;
2009-03-24 04:27:21 +08:00
import net.sf.samtools.util.RuntimeIOException ;
import org.apache.log4j.Logger ;
2009-03-16 06:21:48 +08:00
import org.broadinstitute.sting.gatk.iterators.* ;
2009-06-06 07:34:37 +08:00
import org.broadinstitute.sting.gatk.refdata.* ;
2009-03-27 23:40:45 +08:00
import org.broadinstitute.sting.gatk.walkers.Walker ;
2009-06-12 02:13:22 +08:00
import org.broadinstitute.sting.gatk.datasources.shards.Shard ;
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider ;
2009-05-16 05:02:12 +08:00
import org.broadinstitute.sting.gatk.Reads ;
2009-03-25 23:17:38 +08:00
import org.broadinstitute.sting.utils.* ;
2009-04-15 02:13:23 +08:00
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2 ;
2009-02-27 05:50:29 +08:00
2009-04-16 02:29:38 +08:00
import java.io.File ;
2009-04-25 03:40:21 +08:00
import java.util.* ;
2009-03-03 02:18:48 +08:00
2009-03-27 23:40:45 +08:00
public abstract class TraversalEngine {
2009-03-16 06:21:48 +08:00
// list of reference ordered data objects
2009-04-03 04:53:01 +08:00
protected List < ReferenceOrderedData < ? extends ReferenceOrderedDatum > > rods = null ;
2009-03-16 06:21:48 +08:00
// Iterator over rods
2009-06-06 07:34:37 +08:00
List < Pair < String , RODIterator < ? extends ReferenceOrderedDatum > > > rodIters ;
2009-03-01 04:47:48 +08:00
2009-03-12 05:43:31 +08:00
// How strict should we be with SAM/BAM parsing?
2009-03-27 23:40:45 +08:00
protected ValidationStringency strictness = ValidationStringency . STRICT ;
2009-02-27 05:50:29 +08:00
2009-03-12 05:43:31 +08:00
// Time in milliseconds since we initialized this engine
2009-03-27 23:40:45 +08:00
protected long startTime = - 1 ;
protected long lastProgressPrintTime = - 1 ; // When was the last time we printed our progress?
2009-03-12 05:43:31 +08:00
// How long can we go without printing some progress info?
2009-06-21 22:10:02 +08:00
protected long MAX_PROGRESS_PRINT_TIME = 30 * 1000 ; // 10 seconds in millisecs
protected long N_RECORDS_TO_PRINT = 1000000 ;
2009-03-12 05:43:31 +08:00
// Maximum number of reads to process before finishing
2009-03-27 23:40:45 +08:00
protected long maxReads = - 1 ;
2009-03-12 05:43:31 +08:00
// Name of the reads file, in BAM/SAM format
2009-04-25 03:40:21 +08:00
protected List < File > readsFiles = null ; // the name of the reads file
2009-03-27 23:40:45 +08:00
protected SAMFileReader samReader = null ;
2009-03-12 05:43:31 +08:00
// iterator over the sam records in the readsFile
2009-03-27 23:40:45 +08:00
protected Iterator < SAMRecord > samReadIter = null ;
2009-03-12 05:43:31 +08:00
2009-03-17 07:22:04 +08:00
// The verifying iterator, it does checking
2009-03-27 23:40:45 +08:00
protected VerifyingSamIterator verifyingSamReadIter = null ;
2009-03-17 07:22:04 +08:00
2009-03-12 05:43:31 +08:00
// The reference data -- filename, refSeqFile, and iterator
2009-03-27 23:40:45 +08:00
protected File refFileName = null ; // the name of the reference file
2009-03-23 03:56:54 +08:00
//private ReferenceSequenceFile refFile = null;
2009-03-27 23:40:45 +08:00
protected FastaSequenceFile2 refFile = null ; // todo: merge FastaSequenceFile2 into picard!
protected ReferenceIterator refIter = null ;
2009-02-27 05:50:29 +08:00
2009-03-12 05:43:31 +08:00
// Progress tracker for the sam file
2009-03-27 23:40:45 +08:00
protected FileProgressTracker samReadingTracker = null ;
protected boolean DEBUGGING = false ;
protected boolean beSafeP = true ;
protected boolean SORT_ON_FLY = false ;
2009-04-01 10:11:13 +08:00
protected boolean DOWNSAMPLE_BY_FRACTION = false ;
2009-04-02 04:27:06 +08:00
protected boolean DOWNSAMPLE_BY_COVERAGE = false ;
2009-03-27 23:40:45 +08:00
protected boolean FILTER_UNSORTED_READS = false ;
protected boolean walkOverAllSites = false ;
2009-04-01 10:11:13 +08:00
protected int maxOnFlySorts = 100000 ;
protected double downsamplingFraction = 1.0 ;
2009-04-02 04:27:06 +08:00
protected int downsamplingCoverage = 0 ;
2009-03-27 23:40:45 +08:00
protected boolean THREADED_IO = false ;
protected int THREADED_IO_BUFFER_SIZE = 10000 ;
2009-05-29 22:51:08 +08:00
protected boolean filterZeroMappingQualityReads = false ;
2009-04-24 04:34:52 +08:00
// the stored header
protected SAMFileHeader myHeader = null ;
2009-03-24 04:27:21 +08:00
/ * *
* our log , which we want to capture anything from this class
* /
2009-03-27 23:40:45 +08:00
protected static Logger logger = Logger . getLogger ( TraversalEngine . class ) ;
2009-03-24 04:27:21 +08:00
2009-03-12 05:43:31 +08:00
// Locations we are going to process during the traversal
2009-05-12 09:04:18 +08:00
private List < GenomeLoc > locs = null ;
2009-03-03 02:18:48 +08:00
2009-02-27 05:50:29 +08:00
// --------------------------------------------------------------------------------------------------------------
//
// Setting up the engine
//
// --------------------------------------------------------------------------------------------------------------
2009-03-12 05:43:31 +08:00
/ * *
* Creates a new , uninitialized TraversalEngine
*
* @param reads SAM / BAM file of reads
2009-03-24 04:27:21 +08:00
* @param ref Reference file in FASTA format , assumes a . dict file is also available
* @param rods Array of reference ordered data sets
2009-03-12 05:43:31 +08:00
* /
2009-04-25 03:40:21 +08:00
public TraversalEngine ( List < File > reads , File ref , List < ReferenceOrderedData < ? extends ReferenceOrderedDatum > > rods ) {
readsFiles = reads ;
2009-02-27 05:50:29 +08:00
refFileName = ref ;
2009-03-24 11:58:03 +08:00
this . rods = rods ;
2009-02-27 05:50:29 +08:00
}
2009-03-03 05:49:08 +08:00
2009-03-12 05:43:31 +08:00
// --------------------------------------------------------------------------------------------------------------
//
// Manipulating the underlying engine parameters
//
// --------------------------------------------------------------------------------------------------------------
//public void setRegion(final String reg) { regionStr = regionStr; }
//public void setTraversalType(final String type) { traversalType = type; }
2009-03-24 04:27:21 +08:00
public void setStrictness ( final ValidationStringency s ) {
strictness = s ;
}
public void setMaxReads ( final int maxReads ) {
this . maxReads = maxReads ;
}
2009-05-29 22:51:08 +08:00
public void setFilterZeroMappingQualityReads ( final boolean filterZeroMappingQualityReads ) {
this . filterZeroMappingQualityReads = filterZeroMappingQualityReads ;
}
2009-03-25 06:32:45 +08:00
public void setThreadedIO ( final boolean threadedIO ) {
this . THREADED_IO = threadedIO ;
}
2009-03-27 08:12:35 +08:00
public void setWalkOverAllSites ( final boolean walkOverAllSites ) {
this . walkOverAllSites = walkOverAllSites ;
}
2009-05-16 05:02:12 +08:00
private void setDebugging ( final boolean d ) {
2009-03-24 04:27:21 +08:00
DEBUGGING = d ;
}
2009-05-16 05:02:12 +08:00
private void setSafetyChecking ( final boolean beSafeP ) {
2009-03-24 04:27:21 +08:00
if ( ! beSafeP )
2009-03-24 04:46:55 +08:00
logger . warn ( "*** Turning off safety checking, I hope you know what you are doing. Errors will result in debugging assert failures and other inscrutable messages..." ) ;
2009-03-17 07:22:04 +08:00
this . beSafeP = beSafeP ;
}
2009-03-24 04:27:21 +08:00
2009-03-25 04:55:34 +08:00
public void setFilterUnsortedReads ( final boolean filterUnsorted ) {
if ( ! filterUnsorted )
logger . warn ( "*** Turning on filtering of out of order reads, I *really* hope you know what you are doing, as you are removing data..." ) ;
this . FILTER_UNSORTED_READS = filterUnsorted ;
}
2009-05-16 05:02:12 +08:00
private void setSortOnFly ( final int maxReadsToSort ) {
2009-04-01 10:11:13 +08:00
logger . info ( "Sorting read file on the fly: max reads allowed is " + maxReadsToSort ) ;
SORT_ON_FLY = true ;
maxOnFlySorts = maxReadsToSort ;
}
2009-05-16 05:02:12 +08:00
private void setSortOnFly ( ) { setSortOnFly ( 100000 ) ; }
2009-04-01 10:11:13 +08:00
2009-05-16 05:02:12 +08:00
private void setDownsampleByFraction ( final double fraction ) {
2009-04-01 10:11:13 +08:00
logger . info ( "Downsampling to approximately " + ( fraction * 100.0 ) + "% of filtered reads" ) ;
DOWNSAMPLE_BY_FRACTION = true ;
downsamplingFraction = fraction ;
2009-03-18 04:29:09 +08:00
}
2009-02-27 05:50:29 +08:00
2009-05-16 05:02:12 +08:00
private void setDownsampleByCoverage ( final int coverage ) {
2009-04-02 04:27:06 +08:00
logger . info ( "Downsampling to coverage " + coverage ) ;
DOWNSAMPLE_BY_COVERAGE = true ;
downsamplingCoverage = coverage ;
}
2009-05-16 05:02:12 +08:00
/ * *
* Sets up read filtering performed by the traversal engine .
* @param reads Read filters to instantiate as the traversal engine walks over the data .
* @deprecated Should only be used by old - style traversals .
* /
@Deprecated
public void setReadFilters ( Reads reads ) {
if ( reads . getDownsamplingFraction ( ) ! = null ) setDownsampleByFraction ( reads . getDownsamplingFraction ( ) ) ;
if ( reads . getDownsampleToCoverage ( ) ! = null ) setDownsampleByCoverage ( reads . getDownsampleToCoverage ( ) ) ;
if ( reads . getMaxOnTheFlySorts ( ) ! = null ) setSortOnFly ( reads . getMaxOnTheFlySorts ( ) ) ;
if ( reads . getSafetyChecking ( ) ! = null ) setSafetyChecking ( reads . getSafetyChecking ( ) ) ;
}
2009-03-03 02:18:48 +08:00
// --------------------------------------------------------------------------------------------------------------
//
// functions for dealing locations (areas of the genome we're traversing over)
//
// --------------------------------------------------------------------------------------------------------------
2009-03-12 05:43:31 +08:00
2009-04-24 04:34:52 +08:00
/ * *
* get the associated SAM header for our run
* @return the header if it ' s stored , null if not
* /
public SAMFileHeader getSAMHeader ( ) {
2009-05-15 01:18:51 +08:00
if ( myHeader = = null )
myHeader = samReader . getFileHeader ( ) ;
2009-04-24 04:34:52 +08:00
return myHeader ;
}
/ * *
* set ' s the SAM header for this traversal , which should
* be the merged header in the multiple BAM file case .
*
* @param myHeader the passed in header
* /
public void setSAMHeader ( SAMFileHeader myHeader ) {
this . myHeader = myHeader ;
}
2009-03-12 05:43:31 +08:00
/ * *
2009-05-12 09:04:18 +08:00
* Sets the intervals over which the traversal ( s ) should happen .
* @param locs
2009-03-22 00:07:32 +08:00
* /
2009-05-12 09:04:18 +08:00
public void setLocation ( final List < GenomeLoc > locs ) {
this . locs = locs ;
2009-03-22 00:07:32 +08:00
}
2009-03-03 02:18:48 +08:00
2009-03-26 00:01:58 +08:00
public boolean hasLocations ( ) {
2009-05-12 09:22:18 +08:00
return this . locs ! = null & & ! this . locs . isEmpty ( ) ;
2009-03-26 00:01:58 +08:00
}
2009-06-06 07:34:37 +08:00
public List < GenomeLoc > getLocations ( ) {
return this . locs ;
}
2009-02-27 05:50:29 +08:00
// --------------------------------------------------------------------------------------------------------------
//
2009-03-12 05:43:31 +08:00
// printing
2009-02-27 05:50:29 +08:00
//
// --------------------------------------------------------------------------------------------------------------
2009-03-24 04:27:21 +08:00
2009-03-10 22:59:42 +08:00
/ * *
* @param curTime ( current runtime , in millisecs )
* @return true if the maximum interval ( in millisecs ) has passed since the last printing
* /
2009-03-27 23:40:45 +08:00
protected boolean maxElapsedIntervalForPrinting ( final long curTime ) {
2009-03-10 22:59:42 +08:00
return ( curTime - this . lastProgressPrintTime ) > MAX_PROGRESS_PRINT_TIME ;
}
2009-03-12 05:43:31 +08:00
/ * *
* Forward request to printProgress
*
* @param type
* @param loc
* /
public void printProgress ( final String type , GenomeLoc loc ) {
2009-03-24 04:27:21 +08:00
printProgress ( false , type , loc ) ;
2009-03-12 05:43:31 +08:00
}
2009-02-27 05:50:29 +08:00
2009-03-12 05:43:31 +08:00
/ * *
* Utility routine that prints out process information ( including timing ) every N records or
* every M seconds , for N and M set in global variables .
*
* @param mustPrint If true , will print out info , regardless of nRecords or time interval
2009-03-24 04:27:21 +08:00
* @param type String to print out describing our atomic traversal type ( "read" , "locus" , etc )
* @param loc Current location
2009-03-12 05:43:31 +08:00
* /
2009-03-24 04:27:21 +08:00
public void printProgress ( boolean mustPrint , final String type , GenomeLoc loc ) {
2009-04-10 04:28:17 +08:00
final long nRecords = TraversalStatistics . nRecords ;
2009-03-10 22:59:42 +08:00
final long curTime = System . currentTimeMillis ( ) ;
final double elapsed = ( curTime - startTime ) / 1000.0 ;
2009-04-03 00:44:12 +08:00
//System.out.printf("Cur = %d, last print = %d, elapsed=%.2f, nRecords=%d, met=%b%n", curTime, lastProgressPrintTime, elapsed, nRecords, maxElapsedIntervalForPrinting(curTime));
2009-03-24 04:27:21 +08:00
2009-06-22 05:27:40 +08:00
if ( mustPrint | | nRecords = = 1 | | nRecords % N_RECORDS_TO_PRINT = = 0 | | maxElapsedIntervalForPrinting ( curTime ) ) {
2009-03-10 22:59:42 +08:00
this . lastProgressPrintTime = curTime ;
2009-02-27 05:50:29 +08:00
final double secsPer1MReads = ( elapsed * 1000000.0 ) / nRecords ;
2009-03-24 04:27:21 +08:00
if ( loc ! = null )
2009-03-24 04:46:55 +08:00
logger . info ( String . format ( "[PROGRESS] Traversed to %s, processing %,d %s in %.2f secs (%.2f secs per 1M %s)" , loc , nRecords , type , elapsed , secsPer1MReads , type ) ) ;
2009-03-03 05:51:25 +08:00
else
2009-03-24 04:46:55 +08:00
logger . info ( String . format ( "[PROGRESS] Traversed %,d %s in %.2f secs (%.2f secs per 1M %s)" , nRecords , type , elapsed , secsPer1MReads , type ) ) ;
2009-03-10 22:59:42 +08:00
2009-03-12 05:43:31 +08:00
// Currently samReadingTracker will print misleading info if we're not processing the whole file
2009-03-20 07:02:49 +08:00
// If an index is enabled, file read progress is meaningless because a linear
// traversal is not being performed. For now, don't bother printing progress.
// TODO: Create a sam indexed read tracker that tracks based on percentage through the query.
2009-03-24 04:27:21 +08:00
if ( samReadingTracker ! = null & & this . locs = = null )
2009-03-24 04:46:55 +08:00
logger . info ( String . format ( "[PROGRESS] -> %s" , samReadingTracker . progressMeter ( ) ) ) ;
2009-02-27 05:50:29 +08:00
}
}
2009-05-08 22:12:45 +08:00
/ * *
* A passthrough method so that subclasses can report which types of traversals they ' re using .
* TODO : Make this method abstract once all traversals support it .
* @param sum Result of the computation .
* @param < T > Type of the computation .
* /
public < T > void printOnTraversalDone ( T sum ) {
2009-05-09 05:27:54 +08:00
throw new UnsupportedOperationException ( "This method is a required override for new traversal engines. Please port your traversal engine to the new style." ) ;
2009-05-08 22:12:45 +08:00
}
2009-03-12 05:43:31 +08:00
/ * *
* Called after a traversal to print out information about the traversal process
*
* @param type String describing this type of traversal ( "loci" , "read" )
2009-03-24 04:27:21 +08:00
* @param sum The reduce result of the traversal
* @param < T > ReduceType of the traversal
2009-03-12 05:43:31 +08:00
* /
2009-05-08 22:12:45 +08:00
protected < T > void printOnTraversalDone ( final String type , T sum ) {
2009-03-24 04:27:21 +08:00
printProgress ( true , type , null ) ;
2009-03-27 21:27:04 +08:00
logger . info ( "Traversal reduce result is " + sum ) ;
2009-03-26 22:40:50 +08:00
final long curTime = System . currentTimeMillis ( ) ;
final double elapsed = ( curTime - startTime ) / 1000.0 ;
logger . info ( String . format ( "Total runtime %.2f secs, %.2f min, %.2f hours%n" , elapsed , elapsed / 60 , elapsed / 3600 ) ) ;
2009-04-10 04:28:17 +08:00
logger . info ( String . format ( "Traversal skipped %d reads out of %d total (%.2f%%)" ,
TraversalStatistics . nSkippedReads ,
TraversalStatistics . nReads ,
( TraversalStatistics . nSkippedReads * 100.0 ) / TraversalStatistics . nReads ) ) ;
logger . info ( String . format ( " -> %d unmapped reads" , TraversalStatistics . nUnmappedReads ) ) ;
2009-04-17 09:27:36 +08:00
logger . info ( String . format ( " -> %d duplicate reads" , TraversalStatistics . nDuplicates ) ) ;
2009-04-10 04:28:17 +08:00
logger . info ( String . format ( " -> %d non-primary reads" , TraversalStatistics . nNotPrimary ) ) ;
logger . info ( String . format ( " -> %d reads with bad alignments" , TraversalStatistics . nBadAlignments ) ) ;
logger . info ( String . format ( " -> %d reads with indels" , TraversalStatistics . nSkippedIndels ) ) ;
2009-03-12 05:43:31 +08:00
}
2009-02-27 05:50:29 +08:00
// --------------------------------------------------------------------------------------------------------------
//
2009-03-12 05:43:31 +08:00
// Initialization
2009-02-27 05:50:29 +08:00
//
// --------------------------------------------------------------------------------------------------------------
2009-03-12 05:43:31 +08:00
/ * *
* Initialize the traversal engine . After this point traversals can be run over the data
*
* @return true on success
* /
2009-03-25 06:32:45 +08:00
public boolean initialize ( ) {
2009-03-12 05:43:31 +08:00
lastProgressPrintTime = startTime = System . currentTimeMillis ( ) ;
2009-03-16 06:21:48 +08:00
initializeReference ( ) ;
// Initial the reference ordered data iterators
initializeRODs ( ) ;
return true ;
}
2009-06-05 06:37:51 +08:00
2009-05-15 00:52:18 +08:00
@Deprecated
2009-05-15 03:27:10 +08:00
protected StingSAMIterator wrapReadsIterator ( final Iterator < SAMRecord > rawIterator , final boolean enableVerification ) {
2009-05-16 05:02:12 +08:00
// Reads sourceInfo is gone by this point in the traversal engine. Stub in a null and rely on the iterator to
// throw an exception if reads info isn't present.
StingSAMIterator wrappedIterator = StingSAMIteratorAdapter . adapt ( null , rawIterator ) ;
wrappedIterator = applyDecoratingIterators ( enableVerification , wrappedIterator ) ;
2009-05-15 00:52:18 +08:00
if ( THREADED_IO ) {
logger . info ( String . format ( "Enabling threaded I/O with buffer of %d reads" , THREADED_IO_BUFFER_SIZE ) ) ;
2009-05-16 05:02:12 +08:00
wrappedIterator = StingSAMIteratorAdapter . adapt ( null , new ThreadedIterator < SAMRecord > ( wrappedIterator , THREADED_IO_BUFFER_SIZE ) ) ;
2009-05-15 00:52:18 +08:00
}
return wrappedIterator ;
}
2009-05-16 05:02:12 +08:00
/ * *
* Repackage instance variables and call static method .
* TODO : This method ' s days are numbered .
* @param enableVerification
* @param wrappedIterator
* @return
* /
protected StingSAMIterator applyDecoratingIterators ( final boolean enableVerification , final StingSAMIterator wrappedIterator ) {
return applyDecoratingIterators ( enableVerification ,
wrappedIterator ,
DOWNSAMPLE_BY_FRACTION ? downsamplingFraction : null ,
SORT_ON_FLY ? maxOnFlySorts : null ,
2009-05-29 22:51:08 +08:00
filterZeroMappingQualityReads ,
2009-05-16 05:02:12 +08:00
beSafeP ) ;
}
/ * *
* WARNING : In TraversalEngine for backward compatibility ONLY . Reads are not used as the data source , only as parameters
* for validation .
* /
public static StingSAMIterator applyDecoratingIterators ( boolean enableVerification ,
StingSAMIterator wrappedIterator ,
Double downsamplingFraction ,
Integer maxOnFlySorts ,
2009-05-29 22:51:08 +08:00
Boolean filterZeroMappingQualityReads ,
2009-05-16 05:02:12 +08:00
Boolean beSafeP ) {
2009-04-01 10:11:13 +08:00
// NOTE: this (and other filtering) should be done before on-the-fly sorting
// as there is no reason to sort something that we will end of throwing away
2009-05-16 05:02:12 +08:00
if ( downsamplingFraction ! = null )
2009-04-01 10:11:13 +08:00
wrappedIterator = new DownsampleIterator ( wrappedIterator , downsamplingFraction ) ;
2009-05-16 05:02:12 +08:00
if ( maxOnFlySorts ! = null )
2009-04-01 10:11:13 +08:00
wrappedIterator = new SortSamIterator ( wrappedIterator , maxOnFlySorts ) ;
2009-03-25 06:32:45 +08:00
2009-05-16 05:02:12 +08:00
if ( beSafeP ! = null & & beSafeP & & enableVerification )
2009-03-25 06:32:45 +08:00
wrappedIterator = new VerifyingSamIterator ( wrappedIterator ) ;
2009-05-29 22:51:08 +08:00
2009-06-01 23:50:44 +08:00
if ( filterZeroMappingQualityReads ! = null & & filterZeroMappingQualityReads )
2009-05-29 22:51:08 +08:00
wrappedIterator = StingSAMIteratorAdapter . adapt ( wrappedIterator . getSourceInfo ( ) ,
new FilteringIterator ( wrappedIterator , new ZeroMappingQualityReadFilterFunc ( ) ) ) ;
2009-03-25 06:32:45 +08:00
return wrappedIterator ;
}
2009-03-18 00:39:03 +08:00
2009-05-29 22:51:08 +08:00
private static class ZeroMappingQualityReadFilterFunc implements SamRecordFilter {
public boolean filterOut ( SAMRecord rec ) {
if ( rec . getMappingQuality ( ) = = 0 ) {
//System.out.printf("Filtering 0 mapping quality read %s%n", rec.format());
return true ;
} else {
return false ;
}
}
}
2009-04-25 03:40:21 +08:00
protected SAMFileReader initializeSAMFile ( File samFile ) {
2009-03-26 22:40:50 +08:00
// todo: fixme, this is a hack to try out dynamic merging
if ( samFile . toString ( ) . endsWith ( ".list" ) ) {
return null ;
// todo: omg, this is just scary, just it's just for testing purposes. fix with the new DataSource system
} else {
SAMFileReader samReader = new SAMFileReader ( samFile , true ) ;
samReader . setValidationStringency ( strictness ) ;
2009-03-18 00:39:03 +08:00
2009-03-26 22:40:50 +08:00
final SAMFileHeader header = samReader . getFileHeader ( ) ;
2009-04-25 05:39:44 +08:00
logger . debug ( String . format ( "Sort order is: " + header . getSortOrder ( ) ) ) ;
2009-03-18 00:39:03 +08:00
2009-06-20 06:04:47 +08:00
// Kludge filename into sam file header.
if ( samReader . getFileHeader ( ) . getReadGroups ( ) . size ( ) < 1 ) {
//logger.warn("Setting header in reader " + f.getName());
SAMReadGroupRecord rec = new SAMReadGroupRecord ( samFile . getName ( ) ) ;
rec . setLibrary ( samFile . getName ( ) ) ;
rec . setSample ( samFile . getName ( ) ) ;
samReader . getFileHeader ( ) . addReadGroup ( rec ) ;
}
2009-03-26 22:40:50 +08:00
return samReader ;
}
2009-03-12 05:43:31 +08:00
}
/ * *
* Prepare the reference for stream processing
* /
2009-03-16 06:21:48 +08:00
protected void initializeReference ( ) {
2009-03-24 04:27:21 +08:00
if ( refFileName ! = null ) {
2009-03-23 03:56:54 +08:00
//this.refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFileName);
this . refFile = new FastaSequenceFile2 ( refFileName ) ; // todo: replace when FastaSequenceFile2 is in picard
2009-02-27 05:50:29 +08:00
this . refIter = new ReferenceIterator ( this . refFile ) ;
2009-04-08 06:19:54 +08:00
if ( ! GenomeLoc . setupRefContigOrdering ( this . refFile ) ) {
// We couldn't process the reference contig ordering, fail since we need it
Utils . scareUser ( String . format ( "We couldn't load the contig dictionary associated with %s. At the current time we require this dictionary file to efficiently access the FASTA file. Please use /seq/software/picard/current/bin/CreateSequenceDictionary.jar to create a sequence dictionary for your file" , refFileName ) ) ;
}
2009-03-24 04:27:21 +08:00
}
2009-02-27 05:50:29 +08:00
}
2009-03-12 05:43:31 +08:00
/ * *
* Prepare the list of reference ordered data iterators for each of the rods
2009-03-24 04:27:21 +08:00
*
2009-03-12 05:43:31 +08:00
* @return A list of ROD iterators for getting data from each ROD
* /
2009-04-04 03:54:54 +08:00
protected void initializeRODs ( ) {
2009-03-12 04:58:01 +08:00
// set up reference ordered data
2009-06-06 07:34:37 +08:00
rodIters = new ArrayList < Pair < String , RODIterator < ? extends ReferenceOrderedDatum > > > ( ) ;
2009-04-03 04:53:01 +08:00
for ( ReferenceOrderedData < ? extends ReferenceOrderedDatum > data : rods ) {
2009-06-06 07:34:37 +08:00
rodIters . add ( new Pair < String , RODIterator < ? extends ReferenceOrderedDatum > > ( data . getName ( ) , data . iterator ( ) ) ) ;
2009-03-12 04:58:01 +08:00
}
}
2009-03-23 03:56:54 +08:00
/ * *
* An inappropriately placed testing of reading the reference
* /
2009-03-13 22:50:45 +08:00
protected void testReference ( ) {
while ( true ) {
ReferenceSequence ref = refFile . nextSequence ( ) ;
2009-03-24 04:46:55 +08:00
logger . debug ( String . format ( "%s %d %d" , ref . getName ( ) , ref . length ( ) , System . currentTimeMillis ( ) ) ) ;
2009-03-13 22:50:45 +08:00
printProgress ( true , "loci" , new GenomeLoc ( "foo" , 1 ) ) ;
}
}
2009-03-12 05:43:31 +08:00
2009-03-27 23:40:45 +08:00
// --------------------------------------------------------------------------------------------------------------
//
// shutdown
//
// --------------------------------------------------------------------------------------------------------------
public boolean shutdown ( ) {
// todo: actually shutdown the resources
2009-04-24 04:34:52 +08:00
2009-03-27 23:40:45 +08:00
return true ;
}
2009-03-12 05:43:31 +08:00
// --------------------------------------------------------------------------------------------------------------
//
// dealing with reference ordered data
//
// --------------------------------------------------------------------------------------------------------------
/ * *
* Builds a list of the reference ordered datum at loc from each of the iterators . This function
* assumes you are accessing the data in order . You can ' t use this function for random access . Each
* successive call moves you along the file , consuming all data before loc .
*
2009-03-24 04:27:21 +08:00
* @param loc The location to get the rods at
2009-03-12 05:43:31 +08:00
* @return A list of ReferenceOrderDatum at loc . ROD without a datum at loc will be null in the list
* /
2009-04-04 03:54:54 +08:00
protected RefMetaDataTracker getReferenceOrderedDataAtLocus ( final GenomeLoc loc ) {
RefMetaDataTracker tracks = new RefMetaDataTracker ( ) ;
2009-06-06 07:34:37 +08:00
for ( Pair < String , RODIterator < ? extends ReferenceOrderedDatum > > pair : rodIters ) {
2009-04-04 03:54:54 +08:00
String name = pair . getFirst ( ) ;
tracks . bind ( name , pair . getSecond ( ) . seekForward ( loc ) ) ;
2009-03-01 04:47:48 +08:00
}
2009-06-06 07:34:37 +08:00
2009-04-04 03:54:54 +08:00
return tracks ;
2009-03-01 04:47:48 +08:00
}
2009-03-27 23:40:45 +08:00
// --------------------------------------------------------------------------------------------------------------
//
// processing
//
// --------------------------------------------------------------------------------------------------------------
2009-03-03 02:18:48 +08:00
2009-03-27 23:40:45 +08:00
public < M , T > T traverse ( Walker < M , T > walker ) {
2009-04-25 05:39:44 +08:00
T sum = null ;
2009-04-25 09:37:17 +08:00
if ( hasLocations ( ) & & walker . isReduceByInterval ( ) ) {
2009-04-25 05:39:44 +08:00
logger . info ( "Doing reduce by interval processing" ) ;
if ( ! hasLocations ( ) )
Utils . scareUser ( "ReduceByInterval requested by no interval was provided" ) ;
List < Pair < GenomeLoc , T > > map = new ArrayList < Pair < GenomeLoc , T > > ( locs . size ( ) ) ;
for ( GenomeLoc loc : locs ) {
ArrayList < GenomeLoc > l = new ArrayList < GenomeLoc > ( ) ;
l . add ( loc ) ;
T intervalSum = traverse ( walker , l ) ;
sum = intervalSum ;
map . add ( new Pair < GenomeLoc , T > ( loc , intervalSum ) ) ;
}
walker . onTraversalDone ( map ) ;
} else {
2009-05-12 09:04:18 +08:00
List < GenomeLoc > l = new ArrayList < GenomeLoc > ( ) ;
2009-04-25 05:39:44 +08:00
if ( hasLocations ( ) )
l = locs ;
sum = traverse ( walker , l ) ;
}
2009-03-27 23:40:45 +08:00
2009-04-25 05:39:44 +08:00
printOnTraversalDone ( "elements" , sum ) ;
return sum ;
2009-03-27 23:40:45 +08:00
}
2009-05-12 09:04:18 +08:00
    /**
     * Interval-list traversal hook; this base implementation does nothing and
     * returns null. Old-style subclasses override this to perform the actual
     * traversal over the given locations.
     *
     * @param walker    the walker to apply
     * @param locations intervals to traverse (empty list for a whole-genome pass)
     * @return the reduce result, or null from this default implementation
     */
    public <M, T> T traverse(Walker<M, T> walker, List<GenomeLoc> locations) {
        return null;
    }
2009-04-24 04:34:52 +08:00
2009-05-09 05:27:54 +08:00
public < M , T > T traverse ( Walker < M , T > walker ,
Shard shard ,
ShardDataProvider dataProvider ,
T sum ) {
throw new UnsupportedOperationException ( "This method is a required override for new traversal engines. Please port your traversal engine to the new style." ) ;
}
2009-03-17 07:22:04 +08:00
public void verifySortOrder ( final boolean requiresSortedOrder ) {
2009-03-24 04:27:21 +08:00
if ( beSafeP & & ! SORT_ON_FLY & & samReader . getFileHeader ( ) . getSortOrder ( ) ! = SAMFileHeader . SortOrder . coordinate ) {
2009-03-17 07:22:04 +08:00
final String msg = "SAM file is not sorted in coordinate order (according to header) Walker type with given arguments requires a sorted file for correct processing" ;
2009-03-24 04:27:21 +08:00
if ( requiresSortedOrder | | strictness = = SAMFileReader . ValidationStringency . STRICT )
2009-03-17 07:22:04 +08:00
throw new RuntimeIOException ( msg ) ;
2009-03-24 04:27:21 +08:00
else if ( strictness = = SAMFileReader . ValidationStringency . LENIENT )
logger . warn ( msg ) ;
2009-03-17 07:22:04 +08:00
}
}
2009-05-13 02:52:42 +08:00
@Deprecated
2009-04-03 05:38:00 +08:00
public SAMFileReader getSamReader ( ) { return this . samReader ; }
2009-03-03 05:51:25 +08:00
}