2009-03-27 23:40:45 +08:00
package org.broadinstitute.sting.gatk.traversals ;
2009-02-27 05:50:29 +08:00
import edu.mit.broad.picard.filter.FilteringIterator ;
2009-03-24 04:27:21 +08:00
import edu.mit.broad.picard.filter.SamRecordFilter ;
2009-03-13 22:50:45 +08:00
import edu.mit.broad.picard.reference.ReferenceSequence ;
2009-03-26 22:40:50 +08:00
import edu.mit.broad.picard.sam.SamFileHeaderMerger ;
2009-03-24 04:27:21 +08:00
import net.sf.functionalj.Function1 ;
import net.sf.functionalj.FunctionN ;
import net.sf.functionalj.Functions ;
import net.sf.functionalj.reflect.JdkStdReflect ;
import net.sf.functionalj.reflect.StdReflect ;
import net.sf.samtools.SAMFileHeader ;
import net.sf.samtools.SAMFileReader ;
import net.sf.samtools.SAMFileReader.ValidationStringency ;
import net.sf.samtools.SAMRecord ;
import net.sf.samtools.util.RuntimeIOException ;
2009-03-25 04:55:34 +08:00
import net.sf.samtools.util.CloseableIterator ;
2009-03-24 04:27:21 +08:00
import org.apache.log4j.Logger ;
2009-03-16 06:21:48 +08:00
import org.broadinstitute.sting.gatk.iterators.* ;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData ;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum ;
2009-03-24 04:27:21 +08:00
import org.broadinstitute.sting.gatk.walkers.LocusWalker ;
import org.broadinstitute.sting.gatk.walkers.ReadWalker ;
2009-03-27 23:40:45 +08:00
import org.broadinstitute.sting.gatk.walkers.Walker ;
import org.broadinstitute.sting.gatk.LocusContext ;
2009-03-25 23:17:38 +08:00
import org.broadinstitute.sting.utils.* ;
2009-02-27 05:50:29 +08:00
import java.io.* ;
2009-03-03 02:18:48 +08:00
import java.util.* ;
2009-03-27 23:40:45 +08:00
public abstract class TraversalEngine {
2009-03-16 06:21:48 +08:00
// list of reference ordered data objects
2009-04-03 04:53:01 +08:00
protected List < ReferenceOrderedData < ? extends ReferenceOrderedDatum > > rods = null ;
2009-03-16 06:21:48 +08:00
// Iterator over rods
2009-04-03 04:53:01 +08:00
List < ReferenceOrderedData < ? extends ReferenceOrderedDatum > . RODIterator > rodIters ;
2009-03-01 04:47:48 +08:00
2009-03-12 05:43:31 +08:00
// How strict should we be with SAM/BAM parsing?
2009-03-27 23:40:45 +08:00
protected ValidationStringency strictness = ValidationStringency . STRICT ;
2009-02-27 05:50:29 +08:00
2009-03-12 05:43:31 +08:00
// Time in milliseconds since we initialized this engine
2009-03-27 23:40:45 +08:00
protected long startTime = - 1 ;
protected long lastProgressPrintTime = - 1 ; // When was the last time we printed our progress?
2009-03-12 05:43:31 +08:00
// How long can we go without printing some progress info?
2009-03-27 23:40:45 +08:00
protected long MAX_PROGRESS_PRINT_TIME = 10 * 1000 ; // 10 seconds in millisecs
2009-03-12 05:43:31 +08:00
// Maximum number of reads to process before finishing
2009-03-27 23:40:45 +08:00
protected long maxReads = - 1 ;
2009-03-12 05:43:31 +08:00
// Name of the reads file, in BAM/SAM format
2009-03-27 23:40:45 +08:00
protected File readsFile = null ; // the name of the reads file
protected SAMFileReader samReader = null ;
2009-03-12 05:43:31 +08:00
// iterator over the sam records in the readsFile
2009-03-27 23:40:45 +08:00
protected Iterator < SAMRecord > samReadIter = null ;
2009-03-12 05:43:31 +08:00
2009-03-17 07:22:04 +08:00
// The verifying iterator, it does checking
2009-03-27 23:40:45 +08:00
protected VerifyingSamIterator verifyingSamReadIter = null ;
2009-03-17 07:22:04 +08:00
2009-03-12 05:43:31 +08:00
// The reference data -- filename, refSeqFile, and iterator
2009-03-27 23:40:45 +08:00
protected File refFileName = null ; // the name of the reference file
2009-03-23 03:56:54 +08:00
//private ReferenceSequenceFile refFile = null;
2009-03-27 23:40:45 +08:00
protected FastaSequenceFile2 refFile = null ; // todo: merge FastaSequenceFile2 into picard!
protected ReferenceIterator refIter = null ;
2009-02-27 05:50:29 +08:00
2009-03-12 05:43:31 +08:00
// Number of records (loci, reads) we've processed
2009-03-27 23:40:45 +08:00
protected long nRecords = 0 ;
2009-03-12 05:43:31 +08:00
// How many reads have we processed, along with those skipped for various reasons
2009-03-27 23:40:45 +08:00
protected int nReads = 0 ;
protected int nSkippedReads = 0 ;
protected int nUnmappedReads = 0 ;
protected int nNotPrimary = 0 ;
protected int nBadAlignments = 0 ;
protected int nSkippedIndels = 0 ;
2009-03-12 05:43:31 +08:00
// Progress tracker for the sam file
2009-03-27 23:40:45 +08:00
protected FileProgressTracker samReadingTracker = null ;
protected boolean DEBUGGING = false ;
protected boolean beSafeP = true ;
protected boolean SORT_ON_FLY = false ;
2009-04-01 10:11:13 +08:00
protected boolean DOWNSAMPLE_BY_FRACTION = false ;
2009-04-02 04:27:06 +08:00
protected boolean DOWNSAMPLE_BY_COVERAGE = false ;
2009-03-27 23:40:45 +08:00
protected boolean FILTER_UNSORTED_READS = false ;
protected boolean walkOverAllSites = false ;
2009-04-01 10:11:13 +08:00
protected int maxOnFlySorts = 100000 ;
protected double downsamplingFraction = 1.0 ;
2009-04-02 04:27:06 +08:00
protected int downsamplingCoverage = 0 ;
2009-03-27 23:40:45 +08:00
protected long N_RECORDS_TO_PRINT = 100000 ;
protected boolean THREADED_IO = false ;
protected int THREADED_IO_BUFFER_SIZE = 10000 ;
2009-02-27 05:50:29 +08:00
2009-03-24 04:27:21 +08:00
/ * *
* our log , which we want to capture anything from this class
* /
2009-03-27 23:40:45 +08:00
protected static Logger logger = Logger . getLogger ( TraversalEngine . class ) ;
2009-03-24 04:27:21 +08:00
2009-03-12 05:43:31 +08:00
// Locations we are going to process during the traversal
2009-03-29 04:37:27 +08:00
private ArrayList < GenomeLoc > locs = null ;
2009-03-03 02:18:48 +08:00
2009-02-27 05:50:29 +08:00
// --------------------------------------------------------------------------------------------------------------
//
// Setting up the engine
//
// --------------------------------------------------------------------------------------------------------------
2009-03-12 05:43:31 +08:00
/**
 * Creates a new, uninitialized TraversalEngine.
 *
 * @param reads SAM/BAM file of reads
 * @param ref   Reference file in FASTA format; assumes a .dict file is also available
 * @param rods  List of reference ordered data sets
 */
public TraversalEngine(File reads, File ref, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
    this.readsFile = reads;
    this.refFileName = ref;
    this.rods = rods;
}
2009-03-03 05:49:08 +08:00
2009-03-12 05:43:31 +08:00
// --------------------------------------------------------------------------------------------------------------
//
// Manipulating the underlying engine parameters
//
// --------------------------------------------------------------------------------------------------------------
//public void setRegion(final String reg) { regionStr = regionStr; }
//public void setTraversalType(final String type) { traversalType = type; }
2009-03-24 04:27:21 +08:00
public void setStrictness ( final ValidationStringency s ) {
strictness = s ;
}
public void setMaxReads ( final int maxReads ) {
this . maxReads = maxReads ;
}
2009-03-25 06:32:45 +08:00
public void setThreadedIO ( final boolean threadedIO ) {
this . THREADED_IO = threadedIO ;
}
2009-03-27 08:12:35 +08:00
public void setWalkOverAllSites ( final boolean walkOverAllSites ) {
this . walkOverAllSites = walkOverAllSites ;
}
2009-03-24 04:27:21 +08:00
public void setDebugging ( final boolean d ) {
DEBUGGING = d ;
}
public void setSafetyChecking ( final boolean beSafeP ) {
if ( ! beSafeP )
2009-03-24 04:46:55 +08:00
logger . warn ( "*** Turning off safety checking, I hope you know what you are doing. Errors will result in debugging assert failures and other inscrutable messages..." ) ;
2009-03-17 07:22:04 +08:00
this . beSafeP = beSafeP ;
}
2009-03-24 04:27:21 +08:00
2009-03-25 04:55:34 +08:00
/**
 * Enables/disables silently dropping reads that arrive out of coordinate order.
 * Enabling this removes data, so we warn when it is switched ON.
 *
 * @param filterUnsorted true to drop out-of-order reads instead of failing
 */
public void setFilterUnsortedReads(final boolean filterUnsorted) {
    // BUGFIX: the warning fired when filtering was turned OFF (!filterUnsorted),
    // but the message — and the data-loss risk — applies when it is turned ON.
    if (filterUnsorted)
        logger.warn("*** Turning on filtering of out of order reads, I *really* hope you know what you are doing, as you are removing data...");
    this.FILTER_UNSORTED_READS = filterUnsorted;
}
2009-04-01 10:11:13 +08:00
public void setSortOnFly ( final int maxReadsToSort ) {
logger . info ( "Sorting read file on the fly: max reads allowed is " + maxReadsToSort ) ;
SORT_ON_FLY = true ;
maxOnFlySorts = maxReadsToSort ;
}
public void setSortOnFly ( ) { setSortOnFly ( 100000 ) ; }
public void setDownsampleByFraction ( final double fraction ) {
logger . info ( "Downsampling to approximately " + ( fraction * 100.0 ) + "% of filtered reads" ) ;
DOWNSAMPLE_BY_FRACTION = true ;
downsamplingFraction = fraction ;
2009-03-18 04:29:09 +08:00
}
2009-02-27 05:50:29 +08:00
2009-04-02 04:27:06 +08:00
/**
 * Downsamples the read stream to a target per-locus coverage.
 *
 * @param coverage target coverage depth
 */
public void setDownsampleByCoverage(final int coverage) {
    logger.info("Downsampling to coverage " + coverage);
    DOWNSAMPLE_BY_COVERAGE = true;
    downsamplingCoverage = coverage;
}
2009-03-03 02:18:48 +08:00
// --------------------------------------------------------------------------------------------------------------
//
// functions for dealing locations (areas of the genome we're traversing over)
//
// --------------------------------------------------------------------------------------------------------------
2009-03-12 05:43:31 +08:00
/ * *
* Parses the location string locStr and sets the traversal engine to only process
* regions specified by the location string . The string is of the form :
2009-03-24 04:27:21 +08:00
* Of the form : loc1 ; loc2 ; . . .
* Where each locN can be :
* <EFBFBD> chr2 <EFBFBD> , <EFBFBD> chr2 : 1000000 <EFBFBD> or <EFBFBD> chr2 : 1 , 000 , 000 - 2 , 000 , 000 <EFBFBD>
2009-03-12 05:43:31 +08:00
*
* @param locStr
* /
2009-03-24 04:27:21 +08:00
public void setLocation ( final String locStr ) {
2009-03-29 04:37:27 +08:00
this . locs = GenomeLoc . parseGenomeLocs ( locStr ) ;
2009-03-03 02:18:48 +08:00
}
2009-03-22 00:07:32 +08:00
/ * *
* Read a file of genome locations to process .
* regions specified by the location string . The string is of the form :
2009-03-24 04:27:21 +08:00
* Of the form : loc1 ; loc2 ; . . .
* Where each locN can be :
* <EFBFBD> chr2 <EFBFBD> , <EFBFBD> chr2 : 1000000 <EFBFBD> or <EFBFBD> chr2 : 1 , 000 , 000 - 2 , 000 , 000 <EFBFBD>
2009-03-22 00:07:32 +08:00
*
* @param file_name
* /
2009-03-24 04:27:21 +08:00
public void setLocationFromFile ( final String file_name ) {
try {
2009-03-25 23:17:38 +08:00
xReadLines reader = new xReadLines ( new File ( file_name ) ) ;
List < String > lines = reader . readLines ( ) ;
reader . close ( ) ;
String locStr = Utils . join ( ";" , lines ) ;
logger . debug ( "locStr: " + locStr ) ;
2009-03-29 04:37:27 +08:00
setLocation ( locStr ) ;
2009-03-25 23:17:38 +08:00
} catch ( Exception e ) {
2009-03-22 00:07:32 +08:00
e . printStackTrace ( ) ;
2009-03-24 04:27:21 +08:00
System . exit ( - 1 ) ;
2009-03-22 00:07:32 +08:00
}
}
2009-03-03 02:18:48 +08:00
2009-03-26 00:01:58 +08:00
public boolean hasLocations ( ) {
return this . locs ! = null ;
}
2009-02-27 05:50:29 +08:00
// --------------------------------------------------------------------------------------------------------------
//
2009-03-12 05:43:31 +08:00
// printing
2009-02-27 05:50:29 +08:00
//
// --------------------------------------------------------------------------------------------------------------
2009-03-24 04:27:21 +08:00
2009-03-10 22:59:42 +08:00
/ * *
* @param curTime ( current runtime , in millisecs )
* @return true if the maximum interval ( in millisecs ) has passed since the last printing
* /
2009-03-27 23:40:45 +08:00
protected boolean maxElapsedIntervalForPrinting ( final long curTime ) {
2009-03-10 22:59:42 +08:00
return ( curTime - this . lastProgressPrintTime ) > MAX_PROGRESS_PRINT_TIME ;
}
2009-03-12 05:43:31 +08:00
/ * *
* Forward request to printProgress
*
* @param type
* @param loc
* /
public void printProgress ( final String type , GenomeLoc loc ) {
2009-03-24 04:27:21 +08:00
printProgress ( false , type , loc ) ;
2009-03-12 05:43:31 +08:00
}
2009-02-27 05:50:29 +08:00
2009-03-12 05:43:31 +08:00
/ * *
* Utility routine that prints out process information ( including timing ) every N records or
* every M seconds , for N and M set in global variables .
*
* @param mustPrint If true , will print out info , regardless of nRecords or time interval
2009-03-24 04:27:21 +08:00
* @param type String to print out describing our atomic traversal type ( "read" , "locus" , etc )
* @param loc Current location
2009-03-12 05:43:31 +08:00
* /
2009-03-24 04:27:21 +08:00
public void printProgress ( boolean mustPrint , final String type , GenomeLoc loc ) {
2009-02-27 05:50:29 +08:00
final long nRecords = this . nRecords ;
2009-03-10 22:59:42 +08:00
final long curTime = System . currentTimeMillis ( ) ;
final double elapsed = ( curTime - startTime ) / 1000.0 ;
2009-04-03 00:44:12 +08:00
//System.out.printf("Cur = %d, last print = %d, elapsed=%.2f, nRecords=%d, met=%b%n", curTime, lastProgressPrintTime, elapsed, nRecords, maxElapsedIntervalForPrinting(curTime));
2009-03-24 04:27:21 +08:00
if ( mustPrint | | nRecords % N_RECORDS_TO_PRINT = = 0 | | maxElapsedIntervalForPrinting ( curTime ) ) {
2009-03-10 22:59:42 +08:00
this . lastProgressPrintTime = curTime ;
2009-02-27 05:50:29 +08:00
final double secsPer1MReads = ( elapsed * 1000000.0 ) / nRecords ;
2009-03-24 04:27:21 +08:00
if ( loc ! = null )
2009-03-24 04:46:55 +08:00
logger . info ( String . format ( "[PROGRESS] Traversed to %s, processing %,d %s in %.2f secs (%.2f secs per 1M %s)" , loc , nRecords , type , elapsed , secsPer1MReads , type ) ) ;
2009-03-03 05:51:25 +08:00
else
2009-03-24 04:46:55 +08:00
logger . info ( String . format ( "[PROGRESS] Traversed %,d %s in %.2f secs (%.2f secs per 1M %s)" , nRecords , type , elapsed , secsPer1MReads , type ) ) ;
2009-03-10 22:59:42 +08:00
2009-03-12 05:43:31 +08:00
// Currently samReadingTracker will print misleading info if we're not processing the whole file
2009-03-20 07:02:49 +08:00
// If an index is enabled, file read progress is meaningless because a linear
// traversal is not being performed. For now, don't bother printing progress.
// TODO: Create a sam indexed read tracker that tracks based on percentage through the query.
2009-03-24 04:27:21 +08:00
if ( samReadingTracker ! = null & & this . locs = = null )
2009-03-24 04:46:55 +08:00
logger . info ( String . format ( "[PROGRESS] -> %s" , samReadingTracker . progressMeter ( ) ) ) ;
2009-02-27 05:50:29 +08:00
}
}
2009-03-12 05:43:31 +08:00
/ * *
* Called after a traversal to print out information about the traversal process
*
* @param type String describing this type of traversal ( "loci" , "read" )
2009-03-24 04:27:21 +08:00
* @param sum The reduce result of the traversal
* @param < T > ReduceType of the traversal
2009-03-12 05:43:31 +08:00
* /
2009-03-24 04:27:21 +08:00
protected < T > void printOnTraversalDone ( final String type , T sum ) {
printProgress ( true , type , null ) ;
2009-03-27 21:27:04 +08:00
logger . info ( "Traversal reduce result is " + sum ) ;
2009-03-26 22:40:50 +08:00
final long curTime = System . currentTimeMillis ( ) ;
final double elapsed = ( curTime - startTime ) / 1000.0 ;
logger . info ( String . format ( "Total runtime %.2f secs, %.2f min, %.2f hours%n" , elapsed , elapsed / 60 , elapsed / 3600 ) ) ;
2009-03-24 04:46:55 +08:00
logger . info ( String . format ( "Traversal skipped %d reads out of %d total (%.2f%%)" , nSkippedReads , nReads , ( nSkippedReads * 100.0 ) / nReads ) ) ;
logger . info ( String . format ( " -> %d unmapped reads" , nUnmappedReads ) ) ;
logger . info ( String . format ( " -> %d non-primary reads" , nNotPrimary ) ) ;
logger . info ( String . format ( " -> %d reads with bad alignments" , nBadAlignments ) ) ;
logger . info ( String . format ( " -> %d reads with indels" , nSkippedIndels ) ) ;
2009-03-12 05:43:31 +08:00
}
2009-02-27 05:50:29 +08:00
// --------------------------------------------------------------------------------------------------------------
//
2009-03-12 05:43:31 +08:00
// Initialization
2009-02-27 05:50:29 +08:00
//
// --------------------------------------------------------------------------------------------------------------
2009-03-12 05:43:31 +08:00
/ * *
* Initialize the traversal engine . After this point traversals can be run over the data
*
* @return true on success
* /
2009-03-25 06:32:45 +08:00
public boolean initialize ( ) {
2009-03-12 05:43:31 +08:00
lastProgressPrintTime = startTime = System . currentTimeMillis ( ) ;
2009-03-16 06:21:48 +08:00
initializeReference ( ) ;
// Initial the reference ordered data iterators
initializeRODs ( ) ;
return true ;
}
2009-03-27 23:40:45 +08:00
protected Iterator < SAMRecord > initializeReads ( ) {
2009-03-25 06:32:45 +08:00
samReader = initializeSAMFile ( readsFile ) ;
return WrapReadsIterator ( getReadsIterator ( samReader ) , true ) ;
}
2009-03-18 00:39:03 +08:00
2009-03-27 23:40:45 +08:00
protected Iterator < SAMRecord > getReadsIterator ( final SAMFileReader samReader ) {
2009-03-25 06:32:45 +08:00
// If the file has an index, querying functions are available. Use them if possible...
2009-03-26 22:40:50 +08:00
if ( samReader = = null & & readsFile . toString ( ) . endsWith ( ".list" ) ) {
SAMFileHeader . SortOrder SORT_ORDER = SAMFileHeader . SortOrder . coordinate ;
List < SAMFileReader > readers = new ArrayList < SAMFileReader > ( ) ;
try {
for ( String fileName : new xReadLines ( readsFile ) ) {
SAMFileReader reader = initializeSAMFile ( new File ( fileName ) ) ;
readers . add ( reader ) ;
}
SamFileHeaderMerger headerMerger = new SamFileHeaderMerger ( readers , SORT_ORDER ) ;
return new MergingSamRecordIterator2 ( headerMerger ) ;
}
catch ( FileNotFoundException e ) {
logger . fatal ( "Couldn't open file in sam file list: " + readsFile ) ;
}
}
2009-04-03 02:04:28 +08:00
//if (samReader.hasIndex()) {
// return new SamQueryIterator(samReader, locs);
//} else {
return samReader . iterator ( ) ;
//}
2009-03-25 06:32:45 +08:00
}
2009-03-27 23:40:45 +08:00
protected Iterator < SAMRecord > WrapReadsIterator ( final Iterator < SAMRecord > rawIterator , final boolean enableVerification ) {
2009-03-25 06:32:45 +08:00
Iterator < SAMRecord > wrappedIterator = rawIterator ;
2009-03-18 00:39:03 +08:00
2009-04-01 10:11:13 +08:00
// NOTE: this (and other filtering) should be done before on-the-fly sorting
// as there is no reason to sort something that we will end of throwing away
if ( DOWNSAMPLE_BY_FRACTION )
wrappedIterator = new DownsampleIterator ( wrappedIterator , downsamplingFraction ) ;
2009-03-24 04:27:21 +08:00
if ( SORT_ON_FLY )
2009-04-01 10:11:13 +08:00
wrappedIterator = new SortSamIterator ( wrappedIterator , maxOnFlySorts ) ;
2009-03-25 06:32:45 +08:00
if ( beSafeP & & enableVerification )
wrappedIterator = new VerifyingSamIterator ( wrappedIterator ) ;
2009-03-24 04:27:21 +08:00
if ( THREADED_IO ) {
2009-03-24 04:46:55 +08:00
logger . info ( String . format ( "Enabling threaded I/O with buffer of %d reads" , THREADED_IO_BUFFER_SIZE ) ) ;
2009-03-25 06:32:45 +08:00
wrappedIterator = new ThreadedIterator < SAMRecord > ( wrappedIterator , THREADED_IO_BUFFER_SIZE ) ;
2009-03-18 00:39:03 +08:00
}
2009-03-25 06:32:45 +08:00
return wrappedIterator ;
}
2009-03-18 00:39:03 +08:00
2009-03-27 23:40:45 +08:00
protected SAMFileReader initializeSAMFile ( final File samFile ) {
2009-03-26 22:40:50 +08:00
// todo: fixme, this is a hack to try out dynamic merging
if ( samFile . toString ( ) . endsWith ( ".list" ) ) {
return null ;
// todo: omg, this is just scary, just it's just for testing purposes. fix with the new DataSource system
} else {
SAMFileReader samReader = new SAMFileReader ( samFile , true ) ;
samReader . setValidationStringency ( strictness ) ;
2009-03-18 00:39:03 +08:00
2009-03-26 22:40:50 +08:00
final SAMFileHeader header = samReader . getFileHeader ( ) ;
logger . info ( String . format ( "Sort order is: " + header . getSortOrder ( ) ) ) ;
2009-03-18 00:39:03 +08:00
2009-03-26 22:40:50 +08:00
return samReader ;
}
2009-03-12 05:43:31 +08:00
}
/ * *
* Prepare the reference for stream processing
* /
2009-03-16 06:21:48 +08:00
protected void initializeReference ( ) {
2009-03-24 04:27:21 +08:00
if ( refFileName ! = null ) {
2009-03-23 03:56:54 +08:00
//this.refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFileName);
this . refFile = new FastaSequenceFile2 ( refFileName ) ; // todo: replace when FastaSequenceFile2 is in picard
2009-02-27 05:50:29 +08:00
this . refIter = new ReferenceIterator ( this . refFile ) ;
2009-03-29 04:37:27 +08:00
// if (!GenomeLoc.setupRefContigOrdering(this.refFile)) {
// // We couldn't process the reference contig ordering, fail since we need it
// logger.fatal(String.format("We couldn't load the contig dictionary associated with %s. At the current time we require this dictionary file to efficiently access the FASTA file. In the near future this program will automatically construct the dictionary for you and save it down.", refFileName));
// throw new RuntimeException("We couldn't load the contig dictionary associated with " + refFileName + ". At the current time we require this dictionary file to efficiently access the FASTA file. In the near future this program will automatically construct the dictionary for you and save it down.");
// }
2009-03-24 04:27:21 +08:00
}
2009-02-27 05:50:29 +08:00
}
2009-03-12 05:43:31 +08:00
/ * *
* Prepare the list of reference ordered data iterators for each of the rods
2009-03-24 04:27:21 +08:00
*
2009-03-12 05:43:31 +08:00
* @return A list of ROD iterators for getting data from each ROD
* /
2009-04-03 04:53:01 +08:00
protected List < ReferenceOrderedData < ? extends ReferenceOrderedDatum > . RODIterator > initializeRODs ( ) {
2009-03-12 04:58:01 +08:00
// set up reference ordered data
2009-04-03 04:53:01 +08:00
rodIters = new ArrayList < ReferenceOrderedData < ? extends ReferenceOrderedDatum > . RODIterator > ( ) ;
for ( ReferenceOrderedData < ? extends ReferenceOrderedDatum > data : rods ) {
2009-03-12 04:58:01 +08:00
rodIters . add ( data . iterator ( ) ) ;
}
return rodIters ;
}
2009-03-23 03:56:54 +08:00
/ * *
* An inappropriately placed testing of reading the reference
* /
2009-03-13 22:50:45 +08:00
protected void testReference ( ) {
while ( true ) {
ReferenceSequence ref = refFile . nextSequence ( ) ;
2009-03-24 04:46:55 +08:00
logger . debug ( String . format ( "%s %d %d" , ref . getName ( ) , ref . length ( ) , System . currentTimeMillis ( ) ) ) ;
2009-03-13 22:50:45 +08:00
printProgress ( true , "loci" , new GenomeLoc ( "foo" , 1 ) ) ;
}
}
2009-03-12 05:43:31 +08:00
2009-03-27 23:40:45 +08:00
// --------------------------------------------------------------------------------------------------------------
//
// shutdown
//
// --------------------------------------------------------------------------------------------------------------
/**
 * Shuts down the traversal engine.
 *
 * @return true on success
 */
public boolean shutdown() {
    // todo: actually shutdown the resources
    return true;
}
2009-03-12 05:43:31 +08:00
// --------------------------------------------------------------------------------------------------------------
//
// dealing with reference ordered data
//
// --------------------------------------------------------------------------------------------------------------
/ * *
* Builds a list of the reference ordered datum at loc from each of the iterators . This function
* assumes you are accessing the data in order . You can ' t use this function for random access . Each
* successive call moves you along the file , consuming all data before loc .
*
* @param rodIters Iterators to access the RODs
2009-03-24 04:27:21 +08:00
* @param loc The location to get the rods at
2009-03-12 05:43:31 +08:00
* @return A list of ReferenceOrderDatum at loc . ROD without a datum at loc will be null in the list
* /
2009-04-03 04:53:01 +08:00
protected List < ReferenceOrderedDatum > getReferenceOrderedDataAtLocus ( List < ReferenceOrderedData < ? extends ReferenceOrderedDatum > . RODIterator > rodIters ,
2009-03-24 04:27:21 +08:00
final GenomeLoc loc ) {
2009-03-01 04:47:48 +08:00
List < ReferenceOrderedDatum > data = new ArrayList < ReferenceOrderedDatum > ( ) ;
2009-04-03 04:53:01 +08:00
for ( ReferenceOrderedData < ? extends ReferenceOrderedDatum > . RODIterator iter : rodIters ) {
2009-03-03 02:18:48 +08:00
data . add ( iter . seekForward ( loc ) ) ;
2009-03-01 04:47:48 +08:00
}
return data ;
}
2009-03-27 23:40:45 +08:00
// --------------------------------------------------------------------------------------------------------------
//
// processing
//
// --------------------------------------------------------------------------------------------------------------
2009-03-03 02:18:48 +08:00
2009-03-27 23:40:45 +08:00
public < M , T > T traverse ( Walker < M , T > walker ) {
2009-03-29 04:37:27 +08:00
ArrayList < GenomeLoc > l = new ArrayList < GenomeLoc > ( ) ;
2009-03-27 23:40:45 +08:00
if ( hasLocations ( ) )
2009-03-29 04:37:27 +08:00
l = locs ;
2009-03-27 23:40:45 +08:00
return traverse ( walker , l ) ;
}
2009-03-29 04:37:27 +08:00
public < M , T > T traverse ( Walker < M , T > walker , ArrayList < GenomeLoc > locations ) {
2009-03-27 23:40:45 +08:00
return null ;
}
2009-02-27 05:50:29 +08:00
// --------------------------------------------------------------------------------------------------------------
//
2009-03-03 02:18:48 +08:00
// traversal by loci functions
2009-02-27 05:50:29 +08:00
//
// --------------------------------------------------------------------------------------------------------------
2009-03-12 05:43:31 +08:00
/ * *
* Class to filter out un - handle - able reads from the stream . We currently are skipping
* unmapped reads , non - primary reads , unaligned reads , and those with indels . We should
* really change this to handle indel containing reads .
* /
2009-02-27 05:50:29 +08:00
class locusStreamFilterFunc implements SamRecordFilter {
2009-03-25 04:55:34 +08:00
SAMRecord lastRead = null ;
2009-02-27 05:50:29 +08:00
public boolean filterOut ( SAMRecord rec ) {
boolean result = false ;
String why = "" ;
2009-03-24 04:27:21 +08:00
if ( rec . getReadUnmappedFlag ( ) ) {
2009-02-27 05:50:29 +08:00
nUnmappedReads + + ;
result = true ;
why = "Unmapped" ;
2009-03-24 04:27:21 +08:00
} else if ( rec . getNotPrimaryAlignmentFlag ( ) ) {
2009-02-27 05:50:29 +08:00
nNotPrimary + + ;
result = true ;
why = "Not Primary" ;
2009-03-24 04:27:21 +08:00
} else if ( rec . getAlignmentStart ( ) = = SAMRecord . NO_ALIGNMENT_START ) {
2009-02-27 05:50:29 +08:00
nBadAlignments + + ;
result = true ;
why = "No alignment start" ;
2009-03-25 04:55:34 +08:00
}
else {
2009-02-27 05:50:29 +08:00
result = false ;
}
2009-03-24 04:27:21 +08:00
if ( result ) {
2009-02-27 05:50:29 +08:00
nSkippedReads + + ;
2009-03-24 04:46:55 +08:00
//System.out.printf(" [filter] %s => %b %s", rec.getReadName(), result, why);
2009-03-24 04:27:21 +08:00
} else {
2009-02-27 05:50:29 +08:00
nReads + + ;
}
2009-03-24 04:27:21 +08:00
return result ;
2009-02-27 05:50:29 +08:00
}
}
2009-03-17 07:22:04 +08:00
public void verifySortOrder ( final boolean requiresSortedOrder ) {
2009-03-24 04:27:21 +08:00
if ( beSafeP & & ! SORT_ON_FLY & & samReader . getFileHeader ( ) . getSortOrder ( ) ! = SAMFileHeader . SortOrder . coordinate ) {
2009-03-17 07:22:04 +08:00
final String msg = "SAM file is not sorted in coordinate order (according to header) Walker type with given arguments requires a sorted file for correct processing" ;
2009-03-24 04:27:21 +08:00
if ( requiresSortedOrder | | strictness = = SAMFileReader . ValidationStringency . STRICT )
2009-03-17 07:22:04 +08:00
throw new RuntimeIOException ( msg ) ;
2009-03-24 04:27:21 +08:00
else if ( strictness = = SAMFileReader . ValidationStringency . LENIENT )
logger . warn ( msg ) ;
2009-03-17 07:22:04 +08:00
}
}
2009-03-03 05:51:25 +08:00
}