2009-03-27 23:40:45 +08:00
package org.broadinstitute.sting.gatk.traversals ;
2009-02-27 05:50:29 +08:00
2009-03-24 04:27:21 +08:00
import edu.mit.broad.picard.filter.SamRecordFilter ;
2009-03-13 22:50:45 +08:00
import edu.mit.broad.picard.reference.ReferenceSequence ;
2009-03-26 22:40:50 +08:00
import edu.mit.broad.picard.sam.SamFileHeaderMerger ;
2009-03-24 04:27:21 +08:00
import net.sf.samtools.SAMFileHeader ;
import net.sf.samtools.SAMFileReader ;
import net.sf.samtools.SAMFileReader.ValidationStringency ;
import net.sf.samtools.SAMRecord ;
import net.sf.samtools.util.RuntimeIOException ;
import org.apache.log4j.Logger ;
2009-03-16 06:21:48 +08:00
import org.broadinstitute.sting.gatk.iterators.* ;
2009-04-16 02:29:38 +08:00
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker ;
2009-03-16 06:21:48 +08:00
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData ;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum ;
2009-03-27 23:40:45 +08:00
import org.broadinstitute.sting.gatk.walkers.Walker ;
2009-03-25 23:17:38 +08:00
import org.broadinstitute.sting.utils.* ;
2009-04-15 02:13:23 +08:00
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2 ;
2009-02-27 05:50:29 +08:00
2009-04-16 02:29:38 +08:00
import java.io.File ;
import java.io.FileNotFoundException ;
2009-04-25 03:40:21 +08:00
import java.util.* ;
2009-03-03 02:18:48 +08:00
2009-03-27 23:40:45 +08:00
public abstract class TraversalEngine {
2009-03-16 06:21:48 +08:00
// list of reference ordered data objects
2009-04-03 04:53:01 +08:00
protected List < ReferenceOrderedData < ? extends ReferenceOrderedDatum > > rods = null ;
2009-03-16 06:21:48 +08:00
// Iterator over rods
2009-04-04 03:54:54 +08:00
List < Pair < String , ReferenceOrderedData < ? extends ReferenceOrderedDatum > . RODIterator > > rodIters ;
2009-03-01 04:47:48 +08:00
2009-03-12 05:43:31 +08:00
// How strict should we be with SAM/BAM parsing?
2009-03-27 23:40:45 +08:00
protected ValidationStringency strictness = ValidationStringency . STRICT ;
2009-02-27 05:50:29 +08:00
2009-03-12 05:43:31 +08:00
// Time in milliseconds since we initialized this engine
2009-03-27 23:40:45 +08:00
protected long startTime = - 1 ;
protected long lastProgressPrintTime = - 1 ; // When was the last time we printed our progress?
2009-03-12 05:43:31 +08:00
// How long can we go without printing some progress info?
2009-03-27 23:40:45 +08:00
protected long MAX_PROGRESS_PRINT_TIME = 10 * 1000 ; // 10 seconds in millisecs
2009-03-12 05:43:31 +08:00
// Maximum number of reads to process before finishing
2009-03-27 23:40:45 +08:00
protected long maxReads = - 1 ;
2009-03-12 05:43:31 +08:00
// Name of the reads file, in BAM/SAM format
2009-04-25 03:40:21 +08:00
protected List < File > readsFiles = null ; // the name of the reads file
2009-03-27 23:40:45 +08:00
protected SAMFileReader samReader = null ;
2009-03-12 05:43:31 +08:00
// iterator over the sam records in the readsFile
2009-03-27 23:40:45 +08:00
protected Iterator < SAMRecord > samReadIter = null ;
2009-03-12 05:43:31 +08:00
2009-03-17 07:22:04 +08:00
// The verifying iterator, it does checking
2009-03-27 23:40:45 +08:00
protected VerifyingSamIterator verifyingSamReadIter = null ;
2009-03-17 07:22:04 +08:00
2009-03-12 05:43:31 +08:00
// The reference data -- filename, refSeqFile, and iterator
2009-03-27 23:40:45 +08:00
protected File refFileName = null ; // the name of the reference file
2009-03-23 03:56:54 +08:00
//private ReferenceSequenceFile refFile = null;
2009-03-27 23:40:45 +08:00
protected FastaSequenceFile2 refFile = null ; // todo: merge FastaSequenceFile2 into picard!
protected ReferenceIterator refIter = null ;
2009-02-27 05:50:29 +08:00
2009-03-12 05:43:31 +08:00
// Progress tracker for the sam file
2009-03-27 23:40:45 +08:00
protected FileProgressTracker samReadingTracker = null ;
protected boolean DEBUGGING = false ;
protected boolean beSafeP = true ;
protected boolean SORT_ON_FLY = false ;
2009-04-01 10:11:13 +08:00
protected boolean DOWNSAMPLE_BY_FRACTION = false ;
2009-04-02 04:27:06 +08:00
protected boolean DOWNSAMPLE_BY_COVERAGE = false ;
2009-03-27 23:40:45 +08:00
protected boolean FILTER_UNSORTED_READS = false ;
protected boolean walkOverAllSites = false ;
2009-04-01 10:11:13 +08:00
protected int maxOnFlySorts = 100000 ;
protected double downsamplingFraction = 1.0 ;
2009-04-02 04:27:06 +08:00
protected int downsamplingCoverage = 0 ;
2009-03-27 23:40:45 +08:00
protected long N_RECORDS_TO_PRINT = 100000 ;
protected boolean THREADED_IO = false ;
protected int THREADED_IO_BUFFER_SIZE = 10000 ;
2009-02-27 05:50:29 +08:00
2009-04-24 04:34:52 +08:00
// the stored header
protected SAMFileHeader myHeader = null ;
2009-03-24 04:27:21 +08:00
/ * *
* our log , which we want to capture anything from this class
* /
2009-03-27 23:40:45 +08:00
protected static Logger logger = Logger . getLogger ( TraversalEngine . class ) ;
2009-03-24 04:27:21 +08:00
2009-03-12 05:43:31 +08:00
// Locations we are going to process during the traversal
2009-03-29 04:37:27 +08:00
private ArrayList < GenomeLoc > locs = null ;
2009-03-03 02:18:48 +08:00
2009-02-27 05:50:29 +08:00
// --------------------------------------------------------------------------------------------------------------
//
// Setting up the engine
//
// --------------------------------------------------------------------------------------------------------------
2009-03-12 05:43:31 +08:00
/ * *
* Creates a new , uninitialized TraversalEngine
*
* @param reads SAM / BAM file of reads
2009-03-24 04:27:21 +08:00
* @param ref Reference file in FASTA format , assumes a . dict file is also available
* @param rods Array of reference ordered data sets
2009-03-12 05:43:31 +08:00
* /
2009-04-25 03:40:21 +08:00
public TraversalEngine ( List < File > reads , File ref , List < ReferenceOrderedData < ? extends ReferenceOrderedDatum > > rods ) {
readsFiles = reads ;
2009-02-27 05:50:29 +08:00
refFileName = ref ;
2009-03-24 11:58:03 +08:00
this . rods = rods ;
2009-02-27 05:50:29 +08:00
}
2009-03-03 05:49:08 +08:00
2009-03-12 05:43:31 +08:00
// --------------------------------------------------------------------------------------------------------------
//
// Manipulating the underlying engine parameters
//
// --------------------------------------------------------------------------------------------------------------
//public void setRegion(final String reg) { regionStr = regionStr; }
//public void setTraversalType(final String type) { traversalType = type; }
2009-03-24 04:27:21 +08:00
public void setStrictness ( final ValidationStringency s ) {
strictness = s ;
}
public void setMaxReads ( final int maxReads ) {
this . maxReads = maxReads ;
}
2009-03-25 06:32:45 +08:00
public void setThreadedIO ( final boolean threadedIO ) {
this . THREADED_IO = threadedIO ;
}
2009-03-27 08:12:35 +08:00
public void setWalkOverAllSites ( final boolean walkOverAllSites ) {
this . walkOverAllSites = walkOverAllSites ;
}
2009-03-24 04:27:21 +08:00
public void setDebugging ( final boolean d ) {
DEBUGGING = d ;
}
public void setSafetyChecking ( final boolean beSafeP ) {
if ( ! beSafeP )
2009-03-24 04:46:55 +08:00
logger . warn ( "*** Turning off safety checking, I hope you know what you are doing. Errors will result in debugging assert failures and other inscrutable messages..." ) ;
2009-03-17 07:22:04 +08:00
this . beSafeP = beSafeP ;
}
2009-03-24 04:27:21 +08:00
2009-03-25 04:55:34 +08:00
public void setFilterUnsortedReads ( final boolean filterUnsorted ) {
if ( ! filterUnsorted )
logger . warn ( "*** Turning on filtering of out of order reads, I *really* hope you know what you are doing, as you are removing data..." ) ;
this . FILTER_UNSORTED_READS = filterUnsorted ;
}
2009-04-01 10:11:13 +08:00
public void setSortOnFly ( final int maxReadsToSort ) {
logger . info ( "Sorting read file on the fly: max reads allowed is " + maxReadsToSort ) ;
SORT_ON_FLY = true ;
maxOnFlySorts = maxReadsToSort ;
}
public void setSortOnFly ( ) { setSortOnFly ( 100000 ) ; }
public void setDownsampleByFraction ( final double fraction ) {
logger . info ( "Downsampling to approximately " + ( fraction * 100.0 ) + "% of filtered reads" ) ;
DOWNSAMPLE_BY_FRACTION = true ;
downsamplingFraction = fraction ;
2009-03-18 04:29:09 +08:00
}
2009-02-27 05:50:29 +08:00
2009-04-02 04:27:06 +08:00
public void setDownsampleByCoverage ( final int coverage ) {
logger . info ( "Downsampling to coverage " + coverage ) ;
DOWNSAMPLE_BY_COVERAGE = true ;
downsamplingCoverage = coverage ;
}
2009-03-03 02:18:48 +08:00
// --------------------------------------------------------------------------------------------------------------
//
// functions for dealing locations (areas of the genome we're traversing over)
//
// --------------------------------------------------------------------------------------------------------------
2009-03-12 05:43:31 +08:00
2009-04-24 04:34:52 +08:00
/ * *
* get the associated SAM header for our run
* @return the header if it ' s stored , null if not
* /
public SAMFileHeader getSAMHeader ( ) {
return myHeader ;
}
/ * *
* set ' s the SAM header for this traversal , which should
* be the merged header in the multiple BAM file case .
*
* @param myHeader the passed in header
* /
public void setSAMHeader ( SAMFileHeader myHeader ) {
this . myHeader = myHeader ;
}
2009-03-12 05:43:31 +08:00
/ * *
* Parses the location string locStr and sets the traversal engine to only process
* regions specified by the location string . The string is of the form :
2009-03-24 04:27:21 +08:00
* Of the form : loc1 ; loc2 ; . . .
* Where each locN can be :
2009-04-04 02:24:08 +08:00
* ' chr2 ' , ' chr2 : 1000000 ' or ' chr2 : 1 , 000 , 000 - 2 , 000 , 000 '
2009-03-12 05:43:31 +08:00
*
* @param locStr
* /
2009-03-24 04:27:21 +08:00
public void setLocation ( final String locStr ) {
2009-03-29 04:37:27 +08:00
this . locs = GenomeLoc . parseGenomeLocs ( locStr ) ;
2009-03-03 02:18:48 +08:00
}
2009-03-22 00:07:32 +08:00
/ * *
* Read a file of genome locations to process .
* regions specified by the location string . The string is of the form :
2009-03-24 04:27:21 +08:00
* Of the form : loc1 ; loc2 ; . . .
* Where each locN can be :
2009-04-04 02:24:08 +08:00
* ' chr2 ' , ' chr2 : 1000000 ' or ' chr2 : 1 , 000 , 000 - 2 , 000 , 000 '
2009-03-22 00:07:32 +08:00
*
* @param file_name
* /
2009-03-24 04:27:21 +08:00
public void setLocationFromFile ( final String file_name ) {
2009-04-09 00:49:43 +08:00
2009-04-16 02:29:38 +08:00
this . locs = GenomeLoc . IntervalFileToList ( file_name ) ;
2009-03-22 00:07:32 +08:00
}
2009-03-03 02:18:48 +08:00
2009-03-26 00:01:58 +08:00
public boolean hasLocations ( ) {
return this . locs ! = null ;
}
2009-02-27 05:50:29 +08:00
// --------------------------------------------------------------------------------------------------------------
//
2009-03-12 05:43:31 +08:00
// printing
2009-02-27 05:50:29 +08:00
//
// --------------------------------------------------------------------------------------------------------------
2009-03-24 04:27:21 +08:00
2009-03-10 22:59:42 +08:00
/ * *
* @param curTime ( current runtime , in millisecs )
* @return true if the maximum interval ( in millisecs ) has passed since the last printing
* /
2009-03-27 23:40:45 +08:00
protected boolean maxElapsedIntervalForPrinting ( final long curTime ) {
2009-03-10 22:59:42 +08:00
return ( curTime - this . lastProgressPrintTime ) > MAX_PROGRESS_PRINT_TIME ;
}
2009-03-12 05:43:31 +08:00
/ * *
* Forward request to printProgress
*
* @param type
* @param loc
* /
public void printProgress ( final String type , GenomeLoc loc ) {
2009-03-24 04:27:21 +08:00
printProgress ( false , type , loc ) ;
2009-03-12 05:43:31 +08:00
}
2009-02-27 05:50:29 +08:00
2009-03-12 05:43:31 +08:00
/ * *
* Utility routine that prints out process information ( including timing ) every N records or
* every M seconds , for N and M set in global variables .
*
* @param mustPrint If true , will print out info , regardless of nRecords or time interval
2009-03-24 04:27:21 +08:00
* @param type String to print out describing our atomic traversal type ( "read" , "locus" , etc )
* @param loc Current location
2009-03-12 05:43:31 +08:00
* /
2009-03-24 04:27:21 +08:00
public void printProgress ( boolean mustPrint , final String type , GenomeLoc loc ) {
2009-04-10 04:28:17 +08:00
final long nRecords = TraversalStatistics . nRecords ;
2009-03-10 22:59:42 +08:00
final long curTime = System . currentTimeMillis ( ) ;
final double elapsed = ( curTime - startTime ) / 1000.0 ;
2009-04-03 00:44:12 +08:00
//System.out.printf("Cur = %d, last print = %d, elapsed=%.2f, nRecords=%d, met=%b%n", curTime, lastProgressPrintTime, elapsed, nRecords, maxElapsedIntervalForPrinting(curTime));
2009-03-24 04:27:21 +08:00
if ( mustPrint | | nRecords % N_RECORDS_TO_PRINT = = 0 | | maxElapsedIntervalForPrinting ( curTime ) ) {
2009-03-10 22:59:42 +08:00
this . lastProgressPrintTime = curTime ;
2009-02-27 05:50:29 +08:00
final double secsPer1MReads = ( elapsed * 1000000.0 ) / nRecords ;
2009-03-24 04:27:21 +08:00
if ( loc ! = null )
2009-03-24 04:46:55 +08:00
logger . info ( String . format ( "[PROGRESS] Traversed to %s, processing %,d %s in %.2f secs (%.2f secs per 1M %s)" , loc , nRecords , type , elapsed , secsPer1MReads , type ) ) ;
2009-03-03 05:51:25 +08:00
else
2009-03-24 04:46:55 +08:00
logger . info ( String . format ( "[PROGRESS] Traversed %,d %s in %.2f secs (%.2f secs per 1M %s)" , nRecords , type , elapsed , secsPer1MReads , type ) ) ;
2009-03-10 22:59:42 +08:00
2009-03-12 05:43:31 +08:00
// Currently samReadingTracker will print misleading info if we're not processing the whole file
2009-03-20 07:02:49 +08:00
// If an index is enabled, file read progress is meaningless because a linear
// traversal is not being performed. For now, don't bother printing progress.
// TODO: Create a sam indexed read tracker that tracks based on percentage through the query.
2009-03-24 04:27:21 +08:00
if ( samReadingTracker ! = null & & this . locs = = null )
2009-03-24 04:46:55 +08:00
logger . info ( String . format ( "[PROGRESS] -> %s" , samReadingTracker . progressMeter ( ) ) ) ;
2009-02-27 05:50:29 +08:00
}
}
2009-03-12 05:43:31 +08:00
/ * *
* Called after a traversal to print out information about the traversal process
*
* @param type String describing this type of traversal ( "loci" , "read" )
2009-03-24 04:27:21 +08:00
* @param sum The reduce result of the traversal
* @param < T > ReduceType of the traversal
2009-03-12 05:43:31 +08:00
* /
2009-04-10 04:28:17 +08:00
public < T > void printOnTraversalDone ( final String type , T sum ) {
2009-03-24 04:27:21 +08:00
printProgress ( true , type , null ) ;
2009-03-27 21:27:04 +08:00
logger . info ( "Traversal reduce result is " + sum ) ;
2009-03-26 22:40:50 +08:00
final long curTime = System . currentTimeMillis ( ) ;
final double elapsed = ( curTime - startTime ) / 1000.0 ;
logger . info ( String . format ( "Total runtime %.2f secs, %.2f min, %.2f hours%n" , elapsed , elapsed / 60 , elapsed / 3600 ) ) ;
2009-04-10 04:28:17 +08:00
logger . info ( String . format ( "Traversal skipped %d reads out of %d total (%.2f%%)" ,
TraversalStatistics . nSkippedReads ,
TraversalStatistics . nReads ,
( TraversalStatistics . nSkippedReads * 100.0 ) / TraversalStatistics . nReads ) ) ;
logger . info ( String . format ( " -> %d unmapped reads" , TraversalStatistics . nUnmappedReads ) ) ;
2009-04-17 09:27:36 +08:00
logger . info ( String . format ( " -> %d duplicate reads" , TraversalStatistics . nDuplicates ) ) ;
2009-04-10 04:28:17 +08:00
logger . info ( String . format ( " -> %d non-primary reads" , TraversalStatistics . nNotPrimary ) ) ;
logger . info ( String . format ( " -> %d reads with bad alignments" , TraversalStatistics . nBadAlignments ) ) ;
logger . info ( String . format ( " -> %d reads with indels" , TraversalStatistics . nSkippedIndels ) ) ;
2009-03-12 05:43:31 +08:00
}
2009-02-27 05:50:29 +08:00
// --------------------------------------------------------------------------------------------------------------
//
2009-03-12 05:43:31 +08:00
// Initialization
2009-02-27 05:50:29 +08:00
//
// --------------------------------------------------------------------------------------------------------------
2009-03-12 05:43:31 +08:00
/ * *
* Initialize the traversal engine . After this point traversals can be run over the data
*
* @return true on success
* /
2009-03-25 06:32:45 +08:00
public boolean initialize ( ) {
2009-03-12 05:43:31 +08:00
lastProgressPrintTime = startTime = System . currentTimeMillis ( ) ;
2009-03-16 06:21:48 +08:00
initializeReference ( ) ;
// Initial the reference ordered data iterators
initializeRODs ( ) ;
return true ;
}
2009-04-25 03:40:21 +08:00
/ * *
* Unpack the files to be processed , given a list of files . That list of files can
* itself contain lists of other files to be read .
* @param inputFiles
* @return
* /
public static List < File > unpackReads ( List < File > inputFiles ) throws FileNotFoundException {
List < File > unpackedReads = new ArrayList < File > ( ) ;
for ( File inputFile : inputFiles ) {
if ( inputFile . getName ( ) . endsWith ( ".list" ) ) {
for ( String fileName : new xReadLines ( inputFile ) )
unpackedReads . add ( new File ( fileName ) ) ;
}
else
unpackedReads . add ( inputFile ) ;
}
return unpackedReads ;
}
2009-03-27 23:40:45 +08:00
protected Iterator < SAMRecord > initializeReads ( ) {
2009-04-25 03:40:21 +08:00
List < File > allReadsFiles = null ;
try {
allReadsFiles = unpackReads ( readsFiles ) ;
}
catch ( FileNotFoundException ex ) {
throw new RuntimeException ( "Unable to unpack reads" , ex ) ;
}
if ( allReadsFiles . size ( ) = = 1 )
samReader = initializeSAMFile ( allReadsFiles . get ( 0 ) ) ;
else
samReader = null ;
2009-03-25 06:32:45 +08:00
return WrapReadsIterator ( getReadsIterator ( samReader ) , true ) ;
}
2009-03-18 00:39:03 +08:00
2009-03-27 23:40:45 +08:00
protected Iterator < SAMRecord > getReadsIterator ( final SAMFileReader samReader ) {
2009-03-25 06:32:45 +08:00
// If the file has an index, querying functions are available. Use them if possible...
2009-04-25 03:40:21 +08:00
if ( samReader = = null & & readsFiles . size ( ) > 0 ) {
2009-03-26 22:40:50 +08:00
SAMFileHeader . SortOrder SORT_ORDER = SAMFileHeader . SortOrder . coordinate ;
2009-04-25 03:40:21 +08:00
List < File > allReadsFiles = null ;
2009-03-26 22:40:50 +08:00
List < SAMFileReader > readers = new ArrayList < SAMFileReader > ( ) ;
try {
2009-04-25 03:40:21 +08:00
allReadsFiles = unpackReads ( readsFiles ) ;
2009-03-26 22:40:50 +08:00
}
2009-04-25 03:40:21 +08:00
catch ( FileNotFoundException ex ) {
throw new RuntimeException ( "Unable to unpack reads" , ex ) ;
2009-03-26 22:40:50 +08:00
}
2009-04-25 03:40:21 +08:00
for ( File readsFile : allReadsFiles ) {
SAMFileReader reader = initializeSAMFile ( readsFile ) ;
readers . add ( reader ) ;
}
SamFileHeaderMerger headerMerger = new SamFileHeaderMerger ( readers , SORT_ORDER ) ;
return new MergingSamRecordIterator2 ( headerMerger ) ;
}
else {
return samReader . iterator ( ) ;
2009-03-26 22:40:50 +08:00
}
2009-03-25 06:32:45 +08:00
}
2009-03-27 23:40:45 +08:00
protected Iterator < SAMRecord > WrapReadsIterator ( final Iterator < SAMRecord > rawIterator , final boolean enableVerification ) {
2009-03-25 06:32:45 +08:00
Iterator < SAMRecord > wrappedIterator = rawIterator ;
2009-03-18 00:39:03 +08:00
2009-04-01 10:11:13 +08:00
// NOTE: this (and other filtering) should be done before on-the-fly sorting
// as there is no reason to sort something that we will end of throwing away
if ( DOWNSAMPLE_BY_FRACTION )
wrappedIterator = new DownsampleIterator ( wrappedIterator , downsamplingFraction ) ;
2009-03-24 04:27:21 +08:00
if ( SORT_ON_FLY )
2009-04-01 10:11:13 +08:00
wrappedIterator = new SortSamIterator ( wrappedIterator , maxOnFlySorts ) ;
2009-03-25 06:32:45 +08:00
if ( beSafeP & & enableVerification )
wrappedIterator = new VerifyingSamIterator ( wrappedIterator ) ;
2009-03-24 04:27:21 +08:00
if ( THREADED_IO ) {
2009-03-24 04:46:55 +08:00
logger . info ( String . format ( "Enabling threaded I/O with buffer of %d reads" , THREADED_IO_BUFFER_SIZE ) ) ;
2009-03-25 06:32:45 +08:00
wrappedIterator = new ThreadedIterator < SAMRecord > ( wrappedIterator , THREADED_IO_BUFFER_SIZE ) ;
2009-03-18 00:39:03 +08:00
}
2009-03-25 06:32:45 +08:00
return wrappedIterator ;
}
2009-03-18 00:39:03 +08:00
2009-04-25 03:40:21 +08:00
protected SAMFileReader initializeSAMFile ( File samFile ) {
2009-03-26 22:40:50 +08:00
// todo: fixme, this is a hack to try out dynamic merging
if ( samFile . toString ( ) . endsWith ( ".list" ) ) {
return null ;
// todo: omg, this is just scary, just it's just for testing purposes. fix with the new DataSource system
} else {
SAMFileReader samReader = new SAMFileReader ( samFile , true ) ;
samReader . setValidationStringency ( strictness ) ;
2009-03-18 00:39:03 +08:00
2009-03-26 22:40:50 +08:00
final SAMFileHeader header = samReader . getFileHeader ( ) ;
2009-04-25 05:39:44 +08:00
logger . debug ( String . format ( "Sort order is: " + header . getSortOrder ( ) ) ) ;
2009-03-18 00:39:03 +08:00
2009-03-26 22:40:50 +08:00
return samReader ;
}
2009-03-12 05:43:31 +08:00
}
/ * *
* Prepare the reference for stream processing
* /
2009-03-16 06:21:48 +08:00
protected void initializeReference ( ) {
2009-03-24 04:27:21 +08:00
if ( refFileName ! = null ) {
2009-03-23 03:56:54 +08:00
//this.refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFileName);
this . refFile = new FastaSequenceFile2 ( refFileName ) ; // todo: replace when FastaSequenceFile2 is in picard
2009-02-27 05:50:29 +08:00
this . refIter = new ReferenceIterator ( this . refFile ) ;
2009-04-08 06:19:54 +08:00
if ( ! GenomeLoc . setupRefContigOrdering ( this . refFile ) ) {
// We couldn't process the reference contig ordering, fail since we need it
Utils . scareUser ( String . format ( "We couldn't load the contig dictionary associated with %s. At the current time we require this dictionary file to efficiently access the FASTA file. Please use /seq/software/picard/current/bin/CreateSequenceDictionary.jar to create a sequence dictionary for your file" , refFileName ) ) ;
}
2009-03-24 04:27:21 +08:00
}
2009-02-27 05:50:29 +08:00
}
2009-03-12 05:43:31 +08:00
/ * *
* Prepare the list of reference ordered data iterators for each of the rods
2009-03-24 04:27:21 +08:00
*
2009-03-12 05:43:31 +08:00
* @return A list of ROD iterators for getting data from each ROD
* /
2009-04-04 03:54:54 +08:00
protected void initializeRODs ( ) {
2009-03-12 04:58:01 +08:00
// set up reference ordered data
2009-04-04 03:54:54 +08:00
rodIters = new ArrayList < Pair < String , ReferenceOrderedData < ? extends ReferenceOrderedDatum > . RODIterator > > ( ) ;
2009-04-03 04:53:01 +08:00
for ( ReferenceOrderedData < ? extends ReferenceOrderedDatum > data : rods ) {
2009-04-04 03:54:54 +08:00
rodIters . add ( new Pair < String , ReferenceOrderedData < ? extends ReferenceOrderedDatum > . RODIterator > ( data . getName ( ) , data . iterator ( ) ) ) ;
2009-03-12 04:58:01 +08:00
}
}
2009-03-23 03:56:54 +08:00
/ * *
* An inappropriately placed testing of reading the reference
* /
2009-03-13 22:50:45 +08:00
protected void testReference ( ) {
while ( true ) {
ReferenceSequence ref = refFile . nextSequence ( ) ;
2009-03-24 04:46:55 +08:00
logger . debug ( String . format ( "%s %d %d" , ref . getName ( ) , ref . length ( ) , System . currentTimeMillis ( ) ) ) ;
2009-03-13 22:50:45 +08:00
printProgress ( true , "loci" , new GenomeLoc ( "foo" , 1 ) ) ;
}
}
2009-03-12 05:43:31 +08:00
2009-03-27 23:40:45 +08:00
// --------------------------------------------------------------------------------------------------------------
//
// shutdown
//
// --------------------------------------------------------------------------------------------------------------
public boolean shutdown ( ) {
// todo: actually shutdown the resources
2009-04-24 04:34:52 +08:00
2009-03-27 23:40:45 +08:00
return true ;
}
2009-03-12 05:43:31 +08:00
// --------------------------------------------------------------------------------------------------------------
//
// dealing with reference ordered data
//
// --------------------------------------------------------------------------------------------------------------
/ * *
* Builds a list of the reference ordered datum at loc from each of the iterators . This function
* assumes you are accessing the data in order . You can ' t use this function for random access . Each
* successive call moves you along the file , consuming all data before loc .
*
2009-03-24 04:27:21 +08:00
* @param loc The location to get the rods at
2009-03-12 05:43:31 +08:00
* @return A list of ReferenceOrderDatum at loc . ROD without a datum at loc will be null in the list
* /
2009-04-04 03:54:54 +08:00
protected RefMetaDataTracker getReferenceOrderedDataAtLocus ( final GenomeLoc loc ) {
RefMetaDataTracker tracks = new RefMetaDataTracker ( ) ;
for ( Pair < String , ReferenceOrderedData < ? extends ReferenceOrderedDatum > . RODIterator > pair : rodIters ) {
String name = pair . getFirst ( ) ;
tracks . bind ( name , pair . getSecond ( ) . seekForward ( loc ) ) ;
2009-03-01 04:47:48 +08:00
}
2009-04-04 03:54:54 +08:00
return tracks ;
2009-03-01 04:47:48 +08:00
}
2009-03-27 23:40:45 +08:00
// --------------------------------------------------------------------------------------------------------------
//
// processing
//
// --------------------------------------------------------------------------------------------------------------
2009-03-03 02:18:48 +08:00
2009-03-27 23:40:45 +08:00
public < M , T > T traverse ( Walker < M , T > walker ) {
2009-04-25 05:39:44 +08:00
T sum = null ;
2009-04-25 09:37:17 +08:00
if ( hasLocations ( ) & & walker . isReduceByInterval ( ) ) {
2009-04-25 05:39:44 +08:00
logger . info ( "Doing reduce by interval processing" ) ;
if ( ! hasLocations ( ) )
Utils . scareUser ( "ReduceByInterval requested by no interval was provided" ) ;
List < Pair < GenomeLoc , T > > map = new ArrayList < Pair < GenomeLoc , T > > ( locs . size ( ) ) ;
for ( GenomeLoc loc : locs ) {
ArrayList < GenomeLoc > l = new ArrayList < GenomeLoc > ( ) ;
l . add ( loc ) ;
T intervalSum = traverse ( walker , l ) ;
sum = intervalSum ;
map . add ( new Pair < GenomeLoc , T > ( loc , intervalSum ) ) ;
}
walker . onTraversalDone ( map ) ;
} else {
ArrayList < GenomeLoc > l = new ArrayList < GenomeLoc > ( ) ;
if ( hasLocations ( ) )
l = locs ;
sum = traverse ( walker , l ) ;
}
2009-03-27 23:40:45 +08:00
2009-04-25 05:39:44 +08:00
printOnTraversalDone ( "elements" , sum ) ;
return sum ;
2009-03-27 23:40:45 +08:00
}
2009-03-29 04:37:27 +08:00
public < M , T > T traverse ( Walker < M , T > walker , ArrayList < GenomeLoc > locations ) {
2009-03-27 23:40:45 +08:00
return null ;
}
2009-04-24 04:34:52 +08:00
2009-02-27 05:50:29 +08:00
// --------------------------------------------------------------------------------------------------------------
//
2009-03-03 02:18:48 +08:00
// traversal by loci functions
2009-02-27 05:50:29 +08:00
//
// --------------------------------------------------------------------------------------------------------------
2009-03-12 05:43:31 +08:00
/ * *
* Class to filter out un - handle - able reads from the stream . We currently are skipping
2009-04-24 12:33:35 +08:00
* unmapped reads , non - primary reads , unaligned reads , and duplicate reads .
2009-03-12 05:43:31 +08:00
* /
2009-04-18 00:41:56 +08:00
public static class locusStreamFilterFunc implements SamRecordFilter {
2009-03-25 04:55:34 +08:00
SAMRecord lastRead = null ;
2009-02-27 05:50:29 +08:00
public boolean filterOut ( SAMRecord rec ) {
boolean result = false ;
String why = "" ;
2009-03-24 04:27:21 +08:00
if ( rec . getReadUnmappedFlag ( ) ) {
2009-04-10 04:28:17 +08:00
TraversalStatistics . nUnmappedReads + + ;
2009-02-27 05:50:29 +08:00
result = true ;
why = "Unmapped" ;
2009-03-24 04:27:21 +08:00
} else if ( rec . getNotPrimaryAlignmentFlag ( ) ) {
2009-04-10 04:28:17 +08:00
TraversalStatistics . nNotPrimary + + ;
2009-02-27 05:50:29 +08:00
result = true ;
why = "Not Primary" ;
2009-03-24 04:27:21 +08:00
} else if ( rec . getAlignmentStart ( ) = = SAMRecord . NO_ALIGNMENT_START ) {
2009-04-10 04:28:17 +08:00
TraversalStatistics . nBadAlignments + + ;
2009-02-27 05:50:29 +08:00
result = true ;
why = "No alignment start" ;
2009-04-17 09:27:36 +08:00
} else if ( rec . getDuplicateReadFlag ( ) ) {
TraversalStatistics . nDuplicates + + ;
result = true ;
why = "Duplicate reads" ;
2009-04-10 04:28:17 +08:00
}
2009-03-25 04:55:34 +08:00
else {
2009-02-27 05:50:29 +08:00
result = false ;
}
2009-03-24 04:27:21 +08:00
if ( result ) {
2009-04-10 04:28:17 +08:00
TraversalStatistics . nSkippedReads + + ;
2009-03-24 04:46:55 +08:00
//System.out.printf(" [filter] %s => %b %s", rec.getReadName(), result, why);
2009-03-24 04:27:21 +08:00
} else {
2009-04-10 04:28:17 +08:00
TraversalStatistics . nReads + + ;
2009-02-27 05:50:29 +08:00
}
2009-03-24 04:27:21 +08:00
return result ;
2009-02-27 05:50:29 +08:00
}
}
2009-03-17 07:22:04 +08:00
public void verifySortOrder ( final boolean requiresSortedOrder ) {
2009-03-24 04:27:21 +08:00
if ( beSafeP & & ! SORT_ON_FLY & & samReader . getFileHeader ( ) . getSortOrder ( ) ! = SAMFileHeader . SortOrder . coordinate ) {
2009-03-17 07:22:04 +08:00
final String msg = "SAM file is not sorted in coordinate order (according to header) Walker type with given arguments requires a sorted file for correct processing" ;
2009-03-24 04:27:21 +08:00
if ( requiresSortedOrder | | strictness = = SAMFileReader . ValidationStringency . STRICT )
2009-03-17 07:22:04 +08:00
throw new RuntimeIOException ( msg ) ;
2009-03-24 04:27:21 +08:00
else if ( strictness = = SAMFileReader . ValidationStringency . LENIENT )
logger . warn ( msg ) ;
2009-03-17 07:22:04 +08:00
}
}
2009-04-03 05:38:00 +08:00
public SAMFileReader getSamReader ( ) { return this . samReader ; }
2009-03-03 05:51:25 +08:00
}