Pooling resources to lower memory consumption.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@962 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2009-06-10 13:39:32 +00:00
parent 87ba8b3451
commit dc6a9ca196
11 changed files with 406 additions and 309 deletions

View File

@ -130,7 +130,7 @@ public class GATKArgumentCollection {
@Element(required=false)
@Argument(fullName = "validation_strictness", shortName = "S", doc = "How strict should we be with validation (LENIENT|SILENT|STRICT)", required = false)
public String strictnessLevel = "strict";
public String strictnessLevel = "silent";
@Element(required=false)
@Argument(fullName = "unsafe", shortName = "U", doc = "If set, enables unsafe operations, nothing will be checked at runtime.", required = false)

View File

@ -8,8 +8,6 @@ import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.executive.MicroScheduler;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.RODIterator;
import org.broadinstitute.sting.gatk.refdata.IntervalRodIterator;
import org.broadinstitute.sting.gatk.traversals.*;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.*;
@ -112,7 +110,7 @@ public class GenomeAnalysisEngine {
logger.info("Strictness is " + strictness);
// perform validation steps that are common to all the engines
genericEngineSetup(strictness);
genericEngineSetup();
// parse out any genomic location they've provided
//List<GenomeLoc> locationsList = setupIntervalRegion();
@ -154,7 +152,7 @@ public class GenomeAnalysisEngine {
ValidationStringency strictness = getValidationStringency();
logger.info("Strictness is " + strictness);
genericEngineSetup(strictness);
genericEngineSetup();
// store the results of the walker run
walkerReturn = engine.traverse(my_walker);
@ -177,13 +175,13 @@ public class GenomeAnalysisEngine {
// create the MicroScheduler
if( argCollection.walkAllLoci )
Utils.scareUser("Argument --all_loci is deprecated. Please annotate your walker with @By(DataSource.REFERENCE) to perform a by-reference traversal.");
microScheduler = MicroScheduler.create(my_walker, new Reads(argCollection), argCollection.referenceFile, rods, argCollection.numberOfThreads);
microScheduler = MicroScheduler.create(my_walker, extractSourceInfoFromArguments(argCollection), argCollection.referenceFile, rods, argCollection.numberOfThreads);
engine = microScheduler.getTraversalEngine();
}
else if (my_walker instanceof ReadWalker) {
if (argCollection.referenceFile == null)
Utils.scareUser(String.format("Locus-based traversals require a reference file but none was given"));
microScheduler = MicroScheduler.create(my_walker, new Reads(argCollection), argCollection.referenceFile, rods, argCollection.numberOfThreads);
microScheduler = MicroScheduler.create(my_walker, extractSourceInfoFromArguments(argCollection), argCollection.referenceFile, rods, argCollection.numberOfThreads);
engine = microScheduler.getTraversalEngine();
}
@ -193,11 +191,11 @@ public class GenomeAnalysisEngine {
/**
* commands that get executed for each engine, regardless of the type
*
* @param strictness our current strictness level
*/
private void genericEngineSetup(ValidationStringency strictness) {
engine.setStrictness(strictness);
private void genericEngineSetup() {
Reads sourceInfo = extractSourceInfoFromArguments(argCollection);
engine.setStrictness(sourceInfo.getValidationStringency());
engine.setMaxReads(argCollection.maximumEngineIterations);
engine.setFilterZeroMappingQualityReads(argCollection.filterZeroMappingQualityReads);
@ -207,7 +205,7 @@ public class GenomeAnalysisEngine {
engine.setLocation(parseIntervalRegion(argCollection.intervals, false));
}
engine.setReadFilters(new Reads(argCollection));
engine.setReadFilters(sourceInfo);
engine.setThreadedIO(argCollection.enabledThreadedIO);
engine.setWalkOverAllSites(argCollection.walkAllLoci);
@ -233,6 +231,21 @@ public class GenomeAnalysisEngine {
return locs;
}
/**
* Bundles all the source information about the reads into a unified data structure.
* @param argCollection The collection of arguments passed to the engine.
* @return The reads object providing reads source info.
*/
private Reads extractSourceInfoFromArguments( GATKArgumentCollection argCollection ) {
return new Reads( argCollection.samFiles,
getValidationStringency(),
argCollection.downsampleFraction,
argCollection.downsampleCoverage,
argCollection.maximumReadSorts,
!argCollection.unsafe,
argCollection.filterZeroMappingQualityReads );
}
private void validateInputsAgainstWalker(Walker walker,
GATKArgumentCollection arguments,
List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
@ -275,12 +288,12 @@ public class GenomeAnalysisEngine {
* @return the validation stringency
*/
private ValidationStringency getValidationStringency() {
ValidationStringency strictness;
ValidationStringency strictness = ValidationStringency.SILENT;
try {
strictness = Enum.valueOf(ValidationStringency.class, argCollection.strictnessLevel);
strictness = Enum.valueOf(ValidationStringency.class, argCollection.strictnessLevel.toUpperCase().trim());
}
catch (IllegalArgumentException ex) {
strictness = ValidationStringency.STRICT;
logger.debug("Unable to parse strictness from command line. Assuming SILENT");
}
return strictness;
}

View File

@ -6,6 +6,8 @@ import org.broadinstitute.sting.utils.StingException;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.List;
import net.sf.samtools.SAMFileReader;
/**
* User: hanna
* Date: May 14, 2009
@ -25,7 +27,7 @@ import java.util.List;
*/
public class Reads {
private List<File> readsFiles = null;
private SAMFileReader.ValidationStringency validationStringency = SAMFileReader.ValidationStringency.STRICT;
private Double downsamplingFraction = null;
private Integer downsampleToCoverage = null;
private Integer maxOnFlySorts = null;
@ -40,6 +42,14 @@ public class Reads {
return readsFiles;
}
/**
* How strict should validation be?
* @return Stringency of validation.
*/
public SAMFileReader.ValidationStringency getValidationStringency() {
return validationStringency;
}
/**
* Get the fraction of reads to downsample.
* @return Downsample fraction.
@ -88,14 +98,27 @@ public class Reads {
* Extract the command-line arguments having to do with reads input
* files and store them in an easy-to-work-with package. Constructor
* is package protected.
* @param arguments GATK parsed command-line arguments.
* @param samFiles list of reads files.
* @param strictness Stringency of reads file parsing.
* @param downsampleFraction fraction of reads to downsample.
* @param downsampleCoverage downsampling per-locus.
* @param maxOnFlySorts how many sorts to perform on-the-fly.
* @param beSafe Whether to enable safety checking.
* @param filterZeroMappingQualityReads whether to filter zero mapping quality reads.
*/
Reads( GATKArgumentCollection arguments ) {
this.readsFiles = arguments.samFiles;
if (arguments.downsampleFraction != null) downsamplingFraction = arguments.downsampleFraction;
if (arguments.downsampleCoverage != null) downsampleToCoverage = arguments.downsampleCoverage;
if (arguments.maximumReadSorts != null) maxOnFlySorts = arguments.maximumReadSorts;
if (arguments.filterZeroMappingQualityReads != null) filterZeroMappingQualityReads = arguments.filterZeroMappingQualityReads;
beSafe = !arguments.unsafe;
Reads( List<File> samFiles,
SAMFileReader.ValidationStringency strictness,
Double downsampleFraction,
Integer downsampleCoverage,
Integer maxOnFlySorts,
Boolean beSafe,
Boolean filterZeroMappingQualityReads ) {
this.readsFiles = samFiles;
this.validationStringency = strictness;
this.downsamplingFraction = downsampleFraction;
this.downsampleToCoverage = downsampleCoverage;
this.maxOnFlySorts = maxOnFlySorts;
this.beSafe = beSafe;
this.filterZeroMappingQualityReads = filterZeroMappingQualityReads;
}
}

View File

@ -1,107 +0,0 @@
package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.refdata.RODIterator;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
import java.util.List;
import java.util.ArrayList;
/**
* User: hanna
* Date: May 21, 2009
* Time: 10:55:26 AM
* BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT
* Software and documentation are copyright 2005 by the Broad Institute.
* All rights are reserved.
*
* Users acknowledge that this software is supplied without any warranty or support.
* The Broad Institute is not responsible for its use, misuse, or
* functionality.
*/
/**
* A pool of open iterators. Currently highly specialized to RODs, but could theoretically be
* generalized to a pool of arbitrary seekable, closeable iterators. Not thread-safe.
*/
class IteratorPool {
private final ReferenceOrderedData<? extends ReferenceOrderedDatum> rod;
/**
* All iterators of this reference-ordered data.
*/
private List<RODIterator> allIterators = new ArrayList<RODIterator>();
/**
* All iterators that are not currently in service.
*/
private List<RODIterator> availableIterators = new ArrayList<RODIterator>();
/**
* Create a new iterator pool given the current ROD.
* @param rod Reference-ordered data.
*/
public IteratorPool( ReferenceOrderedData<? extends ReferenceOrderedDatum> rod ) {
this.rod = rod;
}
/**
* Get an iterator whose position is before the specified location. Create a new one if none exists.
* @param position Target position for the iterator.
* @return
*/
public RODIterator iterator( GenomeLoc position ) {
// Grab the first iterator in the list whose position is before the requested position.
RODIterator selectedIterator = null;
for( RODIterator iterator: availableIterators ) {
if( (iterator.position() == null && iterator.hasNext()) ||
(iterator.position() != null && iterator.position().isBefore(position)) ) {
selectedIterator = iterator;
break;
}
}
// No iterator found? Create another. It is expected that
// each iterator created will have its own file handle.
if( selectedIterator == null ) {
selectedIterator = rod.iterator();
allIterators.add(selectedIterator);
}
// Remove the iterator from the list of available iterators.
if( availableIterators.contains(selectedIterator) )
availableIterators.remove(selectedIterator);
return selectedIterator;
}
/**
* Close the given iterator, returning it to the pool.
* @param iterator Iterator to return to the pool.
*/
public void close( RODIterator iterator ) {
if( !allIterators.contains(iterator) )
throw new StingException("Iterator does not belong to the given pool.");
availableIterators.add(iterator);
}
/**
* Operating stats...get the number of total iterators. Package-protected
* for unit testing.
* @return An integer number of total iterators.
*/
int numIterators() {
return allIterators.size();
}
/**
* Operating stats...get the number of available iterators. Package-protected
* for unit testing.
* @return An integer number of available iterators.
*/
int numAvailableIterators() {
return availableIterators.size();
}
}

View File

@ -5,11 +5,9 @@ import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.refdata.RODIterator;
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
import java.util.List;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
/**
* User: hanna
* Date: May 21, 2009
@ -30,20 +28,20 @@ public class ReferenceOrderedDataSource implements SimpleDataSource {
/**
* The reference-ordered data itself.
*/
private final ReferenceOrderedData<? extends ReferenceOrderedDatum> rod;
private final ReferenceOrderedData rod;
/**
* A pool of iterators for navigating through the genome.
*/
private IteratorPool iteratorPool = null;
private ReferenceOrderedDataPool iteratorPool = null;
/**
* Create a new reference-ordered data source.
* @param rod
*/
public ReferenceOrderedDataSource( ReferenceOrderedData<? extends ReferenceOrderedDatum> rod) {
public ReferenceOrderedDataSource( ReferenceOrderedData rod) {
this.rod = rod;
this.iteratorPool = new IteratorPool( rod );
this.iteratorPool = new ReferenceOrderedDataPool( rod );
}
/**
@ -69,7 +67,59 @@ public class ReferenceOrderedDataSource implements SimpleDataSource {
* @param iterator Iterator to close.
*/
public void close( RODIterator iterator ) {
this.iteratorPool.close(iterator);
this.iteratorPool.release(iterator);
}
}
/**
* A pool of reference-ordered data iterators.
*/
class ReferenceOrderedDataPool extends ResourcePool<RODIterator,RODIterator> {
private final ReferenceOrderedData<? extends ReferenceOrderedDatum> rod;
public ReferenceOrderedDataPool( ReferenceOrderedData<? extends ReferenceOrderedDatum> rod ) {
this.rod = rod;
}
/**
* Create a new iterator from the existing reference-ordered data. This new iterator is expected
* to be completely independent of any other iterator.
* @param position @{inheritedDoc}
* @return The newly created resource.
*/
public RODIterator createNewResource( GenomeLoc position ) {
return rod.iterator();
}
/**
* Finds the best existing ROD iterator from the pool. In this case, the best existing ROD is defined as
* the first one encountered that is at or before the given position.
* @param position @{inheritedDoc}
* @param resources @{inheritedDoc}
* @return @{inheritedDoc}
*/
public RODIterator selectBestExistingResource( GenomeLoc position, List<RODIterator> resources ) {
for( RODIterator iterator: resources ) {
if( (iterator.position() == null && iterator.hasNext()) ||
(iterator.position() != null && iterator.position().isBefore(position)) )
return iterator;
}
return null;
}
/**
* In this case, the iterator is the resource. Pass it through.
*/
public RODIterator createIteratorFromResource( GenomeLoc position, RODIterator resource ) {
return resource;
}
/**
* Don't worry about closing the resource; let the file handles expire naturally for the moment.
*/
public void closeResource( RODIterator resource ) {
}
}

View File

@ -0,0 +1,152 @@
package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
import org.apache.log4j.Logger;
import java.util.List;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.HashMap;
import java.util.Map;
/**
* User: hanna
* Date: May 21, 2009
* Time: 10:55:26 AM
* BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT
* Software and documentation are copyright 2005 by the Broad Institute.
* All rights are reserved.
*
* Users acknowledge that this software is supplied without any warranty or support.
* The Broad Institute is not responsible for its use, misuse, or
* functionality.
*/
/**
* A pool of open resources, all of which can create a closeable iterator.
*/
abstract class ResourcePool <T,I extends Iterator> {
/**
* All iterators of this reference-ordered data.
*/
private List<T> allResources = new ArrayList<T>();
/**
* All iterators that are not currently in service.
*/
private List<T> availableResources = new ArrayList<T>();
/**
* Which iterators are assigned to which pools.
*/
private Map<I,T> resourceAssignments = new HashMap<I,T>();
/**
* Get an iterator whose position is before the specified location. Create a new one if none exists.
* @param position Target position for the iterator.
* @return An iterator that can traverse the selected region. Should be able to iterate concurrently with other
* iterators from tihs pool.
*/
public I iterator( GenomeLoc position ) {
// Grab the first iterator in the list whose position is before the requested position.
T selectedResource = null;
synchronized(this) {
selectedResource = selectBestExistingResource( position, availableResources );
// Remove the iterator from the list of available iterators.
if( selectedResource != null )
availableResources.remove(selectedResource);
}
// No iterator found? Create another. It is expected that
// each iterator created will have its own file handle.
if( selectedResource == null ) {
selectedResource = createNewResource(position);
addNewResource( selectedResource );
}
I iterator = createIteratorFromResource( position, selectedResource );
// Make a note of this assignment for proper releasing later.
resourceAssignments.put( iterator, selectedResource );
return iterator;
}
/**
* Release the lock on the given iterator, returning it to the pool.
* @param iterator Iterator to return to the pool.
*/
public void release( I iterator ) {
synchronized(this) {
// Find and remove the resource from the list of allocated resources.
T resource = resourceAssignments.get( iterator );
resourceAssignments.remove(resource);
// Return the resource to the pool.
if( !allResources.contains(resource) )
throw new StingException("Iterator does not belong to the given pool.");
availableResources.add(resource);
}
}
/**
* Add a resource to the list of available resources. Useful if derived classes
* want to seed the pool with a set of at a given time (like at initialization).
* @param resource The new resource to add.
*/
protected void addNewResource( T resource ) {
synchronized(this) {
allResources.add(resource);
}
}
/**
* If no appropriate resources are found in the pool, the system can create a new resource.
* Delegate the creation of the resource to the subclass.
* @param position Position for the new resource. This information may or may not inform the new resource.
* @return The new resource created.
*/
protected abstract T createNewResource( GenomeLoc position );
/**
* Find the most appropriate resource to acquire the specified data.
* @param position The data over which the resource is required.
* @param availableResources A list of candidate resources to evaluate.
* @return The best choice of the availableResources, or null if no resource meets the criteria.
*/
protected abstract T selectBestExistingResource( GenomeLoc position, List<T> availableResources );
/**
* Create an iterator over the specified resource.
* @param position The bounds of iteration. The first element of the iterator through the last element should all
* be in the range described by position.
* @return A new iterator over the given data.
*/
protected abstract I createIteratorFromResource( GenomeLoc position, T resource );
/**
* Retire this resource from service.
* @param resource The resource to retire.
*/
protected abstract void closeResource(T resource);
/**
* Operating stats...get the number of total iterators. Package-protected
* for unit testing.
* @return An integer number of total iterators.
*/
int numIterators() {
return allResources.size();
}
/**
* Operating stats...get the number of available iterators. Package-protected
* for unit testing.
* @return An integer number of available iterators.
*/
int numAvailableIterators() {
return availableResources.size();
}
}

View File

@ -1,6 +1,7 @@
package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
import net.sf.picard.sam.SamFileHeaderMerger;
import net.sf.picard.util.PeekableIterator;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMReadGroupRecord;
@ -18,6 +19,7 @@ import org.broadinstitute.sting.utils.StingException;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import java.util.Iterator;
/**
* User: aaron
@ -54,22 +56,8 @@ public class SAMDataSource implements SimpleDataSource {
private boolean intoUnmappedReads = false;
private int readsSeenAtLastPos = 0;
/**
* package protected getter and setter for the iterator generator
*
* @return
*/
IteratorGenerator getIterGen() {
return iterGen;
}
void setIterGen( IteratorGenerator iterGen ) {
this.iterGen = iterGen;
}
// where we get out iterators from
private IteratorGenerator iterGen;
// A pool of SAM iterators.
private SAMIteratorPool iteratorPool = null;
/**
* constructor, given sam files
@ -91,8 +79,15 @@ public class SAMDataSource implements SimpleDataSource {
throw new SimpleDataSourceLoadException("SAMDataSource: Unable to load file: " + smFile.getName());
}
}
iterGen = new MSR2IteratorGenerator(reads, byReads);
iteratorPool = new SAMIteratorPool(reads,byReads);
}
/**
* For unit testing, add a custom iterator pool.
* @param iteratorPool Custom mock iterator pool.
*/
void setResourcePool( SAMIteratorPool iteratorPool ) {
this.iteratorPool = iteratorPool;
}
@ -105,11 +100,8 @@ public class SAMDataSource implements SimpleDataSource {
*
* @return an iterator for that region
*/
public StingSAMIterator seekLocus( GenomeLoc location ) throws SimpleDataSourceLoadException {
// make a merging iterator for this record
CloseableIterator<SAMRecord> iter = iterGen.seek(location);
// return the iterator
return StingSAMIteratorAdapter.adapt(reads, iter);
public StingSAMIterator seekLocus(GenomeLoc location) throws SimpleDataSourceLoadException {
return iteratorPool.iterator(location);
}
/**
@ -153,7 +145,7 @@ public class SAMDataSource implements SimpleDataSource {
* @return SAM file header.
*/
public SAMFileHeader getHeader() {
return this.iterGen.getHeader();
return iteratorPool.getHeader();
}
@ -169,16 +161,16 @@ public class SAMDataSource implements SimpleDataSource {
private BoundedReadIterator seekRead( ReadShard shard ) throws SimpleDataSourceLoadException {
BoundedReadIterator bound = null;
CloseableIterator<SAMRecord> iter = null;
StingSAMIterator iter = null;
if (!intoUnmappedReads) {
if (lastReadPos == null) {
lastReadPos = new GenomeLoc(iterGen.getHeader().getSequenceDictionary().getSequence(0).getSequenceIndex(), 0, Integer.MAX_VALUE);
iter = iterGen.seek(lastReadPos);
lastReadPos = new GenomeLoc(getHeader().getSequenceDictionary().getSequence(0).getSequenceIndex(), 0, Integer.MAX_VALUE);
iter = iteratorPool.iterator(lastReadPos);
return InitialReadIterator(shard.getSize(), iter);
} else {
lastReadPos.setStop(-1);
iter = iterGen.seek(lastReadPos);
iter = iteratorPool.iterator(lastReadPos);
bound = fastMappedReadSeek(shard.getSize(), StingSAMIteratorAdapter.adapt(reads, iter));
}
}
@ -187,8 +179,8 @@ public class SAMDataSource implements SimpleDataSource {
if (iter != null) {
iter.close();
}
iter = iterGen.seek(null);
bound = toUnmappedReads(shard.getSize(), (PeekingStingIterator) iter);
iter = iteratorPool.iterator(null);
bound = toUnmappedReads(shard.getSize(), iter);
}
if (bound == null) {
shard.signalDone();
@ -219,39 +211,41 @@ public class SAMDataSource implements SimpleDataSource {
* @return the bounded iterator that you can use to get the intervaled reads from
* @throws SimpleDataSourceLoadException
*/
<U extends PeekingStingIterator> BoundedReadIterator toUnmappedReads( long readCount, U iter ) throws SimpleDataSourceLoadException {
BoundedReadIterator toUnmappedReads( long readCount, StingSAMIterator iter ) throws SimpleDataSourceLoadException {
PeekableIterator<SAMRecord> peekable = new PeekableIterator<SAMRecord>(iter);
int count = 0;
int cnt = 0;
SAMRecord d = null;
while (iter.hasNext()) {
d = iter.peek();
while (peekable.hasNext()) {
d = peekable.peek();
int x = d.getReferenceIndex();
if (x < 0)
// we have the magic read that starts the unmapped read segment!
break;
cnt++;
iter.next();
peekable.next();
}
// check to see what happened, did we run out of reads?
if (!iter.hasNext()) {
if (!peekable.hasNext()) {
return null;
}
// now walk until we've taken the unmapped read count
while (iter.hasNext() && count < this.readsTaken) {
iter.next();
while (peekable.hasNext() && count < this.readsTaken) {
peekable.next();
count++;
}
// check to see what happened, did we run out of reads?
if (!iter.hasNext()) {
if (!peekable.hasNext()) {
return null;
}
// we're not out of unmapped reads, so increment our read cout
this.readsTaken += readCount;
return new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), readCount);
return new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, peekable), readCount);
}
@ -293,7 +287,7 @@ public class SAMDataSource implements SimpleDataSource {
readsTaken = readCount;
readsSeenAtLastPos = 0;
lastReadPos.setStop(-1);
CloseableIterator<SAMRecord> ret = iterGen.seek(lastReadPos);
CloseableIterator<SAMRecord> ret = iteratorPool.iterator(lastReadPos);
return new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, ret), readCount);
}
}
@ -360,42 +354,81 @@ public class SAMDataSource implements SimpleDataSource {
return bound;
}
}
class SAMIteratorPool extends ResourcePool<SamFileHeaderMerger,StingSAMIterator> {
/**
* Source information about the reads.
*/
protected Reads reads;
/**
* Is this a by-reads traversal or a by-locus?
*/
protected boolean byReads;
/**
* File header for the combined file.
*/
protected SAMFileHeader header;
/**
* iterator generator
* <p/>
* This class generates iterators for the SAMDataSource. This class was introduced for testing purposes,
* since it became increasingly hard to test the SAM data source code. The class defines two abstraact
* methods:
* <p/>
* -seek( GenomeLoc ) which returns an iterator seeked to the genome loc, and if null is passed to the default
* location (which is implementation specific).
* <p/>
* -getHeader(), which returns a SAMFileHeader for the specified IteratorGenerator. I hope we can phase this
* method out, since it doesn't seem necessary, and it would be much cleaner with out it.
*/
abstract class IteratorGenerator {
/** our log, which we want to capture anything from this class */
protected static Logger logger = Logger.getLogger(SAMDataSource.class);
protected static Logger logger = Logger.getLogger(SAMIteratorPool.class);
public SAMIteratorPool( Reads reads, boolean byReads ) {
this.reads = reads;
this.byReads = byReads;
SamFileHeaderMerger merger = createNewResource( null );
this.header = merger.getMergedHeader();
// Add this resource to the pool.
this.addNewResource( merger );
}
/**
* seek to a location
*
* @param seekTo the genome loc to seek to
*
* @return StingSAMIterator
* Get the combined header for all files in the iterator pool.
*/
public abstract CloseableIterator<SAMRecord> seek( GenomeLoc seekTo );
public SAMFileHeader getHeader() {
return header;
}
/**
* get the merged header
*
* @return the merged header
*/
public abstract SAMFileHeader getHeader();
protected SamFileHeaderMerger selectBestExistingResource( GenomeLoc position, List<SamFileHeaderMerger> mergers) {
if( mergers.size() == 0 )
return null;
return mergers.get(0);
}
protected SamFileHeaderMerger createNewResource( GenomeLoc position ) {
return createHeaderMerger( reads, SAMFileHeader.SortOrder.coordinate );
}
protected StingSAMIterator createIteratorFromResource( GenomeLoc loc, SamFileHeaderMerger headerMerger ) {
final MergingSamRecordIterator2 iterator = new MergingSamRecordIterator2(headerMerger, reads);
if( loc != null ) {
if (byReads)
iterator.queryContained(loc.getContig(), (int) loc.getStart(), (int) loc.getStop());
else
iterator.queryOverlapping(loc.getContig(), (int) loc.getStart(), (int) loc.getStop());
}
return new StingSAMIterator() {
public Reads getSourceInfo() { return reads; }
public void close() {
iterator.close();
release(this);
}
public Iterator<SAMRecord> iterator() { return this; }
public boolean hasNext() { return iterator.hasNext(); }
public SAMRecord next() { return iterator.next(); }
public void remove() { throw new UnsupportedOperationException("Can't remove from a StingSAMIterator"); }
};
}
protected void closeResource( SamFileHeaderMerger resource ) {
for( SAMFileReader reader: resource.getReaders() )
reader.close();
}
/**
* Load a SAM/BAM, given an input file.
@ -404,7 +437,7 @@ abstract class IteratorGenerator {
*
* @return a SAMFileReader for the file, null if we're attempting to read a list
*/
protected static SAMFileReader initializeSAMFile( final File samFile, SAMFileReader.ValidationStringency strictness ) {
protected SAMFileReader initializeSAMFile( final File samFile, SAMFileReader.ValidationStringency strictness ) {
if (samFile.toString().endsWith(".list")) {
return null;
} else {
@ -425,11 +458,11 @@ abstract class IteratorGenerator {
* @return a list of SAMFileReaders that represent the stored file names
* @throws SimpleDataSourceLoadException if there's a problem loading the files
*/
protected static List<SAMFileReader> GetReaderList( Reads reads, SAMFileReader.ValidationStringency strictness ) throws SimpleDataSourceLoadException {
protected List<SAMFileReader> GetReaderList( Reads reads ) throws SimpleDataSourceLoadException {
// right now this is pretty damn heavy, it copies the file list into a reader list every time
List<SAMFileReader> lst = new ArrayList<SAMFileReader>();
for (File f : reads.getReadsFiles()) {
SAMFileReader reader = initializeSAMFile(f, strictness);
SAMFileReader reader = initializeSAMFile(f, reads.getValidationStringency());
if (reader.getFileHeader().getReadGroups().size() < 1) {
//logger.warn("Setting header in reader " + f.getName());
@ -453,60 +486,8 @@ abstract class IteratorGenerator {
*
* @return a SamFileHeaderMerger that includes the set of SAM files we were created with
*/
protected static SamFileHeaderMerger createHeaderMerger( Reads reads, SAMFileReader.ValidationStringency strictness, SAMFileHeader.SortOrder SORT_ORDER ) {
List<SAMFileReader> lst = GetReaderList(reads, strictness);
protected SamFileHeaderMerger createHeaderMerger( Reads reads, SAMFileHeader.SortOrder SORT_ORDER ) {
List<SAMFileReader> lst = GetReaderList(reads);
return new SamFileHeaderMerger(lst, SORT_ORDER, true);
}
}
/**
* MSR2IteratorGenerator
* <p/>
* generates a MerginsSAMIterator2, given a genomic location. The constructor takes the reads structure,
* and a flag indicating if we're dealing with reads or loci (to determine the correct query function).
*/
class MSR2IteratorGenerator extends IteratorGenerator {
/** our read pile */
private Reads reads;
private SamFileHeaderMerger header;
// How strict should we be with SAM/BAM parsing?
protected SAMFileReader.ValidationStringency strictness = SAMFileReader.ValidationStringency.SILENT;
// are we by reads or by loci
protected boolean byReads = true;
/** our SAM data files */
private final SAMFileHeader.SortOrder sortOrder = SAMFileHeader.SortOrder.coordinate;
public MSR2IteratorGenerator( Reads reads, boolean byReads ) {
this.reads = reads;
this.header = IteratorGenerator.createHeaderMerger(reads, strictness, sortOrder);
this.byReads = byReads;
}
public CloseableIterator<SAMRecord> seek( GenomeLoc seekTo ) {
SamFileHeaderMerger mg = createHeaderMerger(reads, strictness, sortOrder);
MergingSamRecordIterator2 iter = new MergingSamRecordIterator2(mg, reads);
if (seekTo != null) {
if (byReads)
iter.queryContained(seekTo.getContig(), (int) seekTo.getStart(), (int) seekTo.getStop());
else
iter.queryOverlapping(seekTo.getContig(), (int) seekTo.getStart(), (int) seekTo.getStop());
}
return iter;
}
/**
* get the merged header
*
* @return the merged header
*/
public SAMFileHeader getHeader() {
return header.getMergedHeader();
}
}
}

View File

@ -55,9 +55,6 @@ public class BoundedReadIterator implements StingSAMIterator {
// our unmapped read flag
private boolean doNotUseThatUnmappedReadPile = false;
// are we open
private boolean isOpen = false;
// the next read we've buffered
private SAMRecord record = null;
@ -67,10 +64,6 @@ public class BoundedReadIterator implements StingSAMIterator {
* @param readCount
*/
public BoundedReadIterator(StingSAMIterator iter, long readCount) {
if (iter != null) {
isOpen = true;
}
this.iterator = iter;
this.readCount = readCount;
}
@ -103,7 +96,7 @@ public class BoundedReadIterator implements StingSAMIterator {
* @return
*/
public boolean hasNext() {
if (isOpen && iterator.hasNext() && currentCount < readCount) {
if (iterator.hasNext() && currentCount < readCount) {
record = iterator.next();
++currentCount;
if (record.getAlignmentStart() == 0 && doNotUseThatUnmappedReadPile) {
@ -111,9 +104,6 @@ public class BoundedReadIterator implements StingSAMIterator {
}
return true;
} else {
if (isOpen) {
close();
}
return false;
}
}
@ -137,7 +127,6 @@ public class BoundedReadIterator implements StingSAMIterator {
* close the iterator
*/
public void close() {
isOpen = false;
iterator.close();
}

View File

@ -93,7 +93,6 @@ public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>,
}
final SAMRecordComparator comparator = getComparator();
for (final SAMFileReader reader : samHeaderMerger.getReaders()) {
//reader.close();
Iterator<SAMRecord> recordIter = reader.query(contig, start, stop, contained);
final ComparableSamRecordIterator iterator = new ComparableSamRecordIterator(reader, recordIter, comparator);
addIfNotEmpty(iterator);
@ -257,13 +256,12 @@ public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>,
/**
* closes all the file handles for the readers....DO THIS or you will run out of handles
* closes all open iterators....DO THIS or you will run out of handles
* with sharding.
*/
public void close() {
for (SAMFileReader reader : samHeaderMerger.getReaders()) {
reader.close();
}
for( ComparableSamRecordIterator iterator: pq )
iterator.close();
}

View File

@ -32,7 +32,7 @@ import java.io.FileNotFoundException;
* Test the contents and number of iterators in the pool.
*/
public class IteratorPoolTest extends BaseTest {
public class ReferenceOrderedDataPoolTest extends BaseTest {
private static File sequenceFile = new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta");
@ -56,7 +56,7 @@ public class IteratorPoolTest extends BaseTest {
@Test
public void testCreateSingleIterator() {
IteratorPool iteratorPool = new IteratorPool(rod);
ResourcePool iteratorPool = new ReferenceOrderedDataPool(rod);
RODIterator iterator = (RODIterator)iteratorPool.iterator( testSite1 );
Assert.assertEquals("Number of iterators in the pool is incorrect", 1, iteratorPool.numIterators());
@ -69,7 +69,7 @@ public class IteratorPoolTest extends BaseTest {
assertTrue(datum.get("COL2").equals("B"));
assertTrue(datum.get("COL3").equals("C"));
iteratorPool.close(iterator);
iteratorPool.release(iterator);
Assert.assertEquals("Number of iterators in the pool is incorrect", 1, iteratorPool.numIterators());
Assert.assertEquals("Number of available iterators in the pool is incorrect", 1, iteratorPool.numAvailableIterators());
@ -77,7 +77,7 @@ public class IteratorPoolTest extends BaseTest {
@Test
public void testCreateMultipleIterators() {
IteratorPool iteratorPool = new IteratorPool(rod);
ReferenceOrderedDataPool iteratorPool = new ReferenceOrderedDataPool(rod);
RODIterator iterator1 = (RODIterator)iteratorPool.iterator( testSite1 );
// Create a new iterator at position 2.
@ -114,12 +114,12 @@ public class IteratorPoolTest extends BaseTest {
assertTrue(datum.get("COL3").equals("E"));
// Cleanup, and make sure the number of iterators dies appropriately.
iteratorPool.close(iterator1);
iteratorPool.release(iterator1);
Assert.assertEquals("Number of iterators in the pool is incorrect", 2, iteratorPool.numIterators());
Assert.assertEquals("Number of available iterators in the pool is incorrect", 1, iteratorPool.numAvailableIterators());
iteratorPool.close(iterator2);
iteratorPool.release(iterator2);
Assert.assertEquals("Number of iterators in the pool is incorrect", 2, iteratorPool.numIterators());
Assert.assertEquals("Number of available iterators in the pool is incorrect", 2, iteratorPool.numAvailableIterators());
@ -127,7 +127,7 @@ public class IteratorPoolTest extends BaseTest {
@Test
public void testIteratorConservation() {
IteratorPool iteratorPool = new IteratorPool(rod);
ReferenceOrderedDataPool iteratorPool = new ReferenceOrderedDataPool(rod);
RODIterator iterator = (RODIterator)iteratorPool.iterator( testSite1 );
Assert.assertEquals("Number of iterators in the pool is incorrect", 1, iteratorPool.numIterators());
@ -139,7 +139,7 @@ public class IteratorPoolTest extends BaseTest {
assertTrue(datum.get("COL2").equals("B"));
assertTrue(datum.get("COL3").equals("C"));
iteratorPool.close(iterator);
iteratorPool.release(iterator);
// Create another iterator after the current iterator.
iterator = iteratorPool.iterator(testSite3);
@ -154,7 +154,7 @@ public class IteratorPoolTest extends BaseTest {
assertTrue(datum.get("COL2").equals("G"));
assertTrue(datum.get("COL3").equals("H"));
iteratorPool.close(iterator);
iteratorPool.release(iterator);
Assert.assertEquals("Number of iterators in the pool is incorrect", 1, iteratorPool.numIterators());
Assert.assertEquals("Number of available iterators in the pool is incorrect", 1, iteratorPool.numAvailableIterators());
@ -162,7 +162,7 @@ public class IteratorPoolTest extends BaseTest {
@Test
public void testIteratorCreation() {
IteratorPool iteratorPool = new IteratorPool(rod);
ReferenceOrderedDataPool iteratorPool = new ReferenceOrderedDataPool(rod);
RODIterator iterator = (RODIterator)iteratorPool.iterator( testSite3 );
Assert.assertEquals("Number of iterators in the pool is incorrect", 1, iteratorPool.numIterators());
@ -174,7 +174,7 @@ public class IteratorPoolTest extends BaseTest {
assertTrue(datum.get("COL2").equals("G"));
assertTrue(datum.get("COL3").equals("H"));
iteratorPool.close(iterator);
iteratorPool.release(iterator);
// Create another iterator after the current iterator.
iterator = iteratorPool.iterator(testSite1);
@ -189,7 +189,7 @@ public class IteratorPoolTest extends BaseTest {
assertTrue(datum.get("COL2").equals("B"));
assertTrue(datum.get("COL3").equals("C"));
iteratorPool.close(iterator);
iteratorPool.release(iterator);
Assert.assertEquals("Number of iterators in the pool is incorrect", 2, iteratorPool.numIterators());
Assert.assertEquals("Number of available iterators in the pool is incorrect", 2, iteratorPool.numAvailableIterators());

View File

@ -22,6 +22,7 @@ import org.junit.Test;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import java.util.Collections;
/**
*
@ -76,7 +77,7 @@ public class SAMByReadsTest extends BaseTest {
/** Test out that we can shard the file and iterate over every read */
@Test
public void testToUnmappedReads() {
ArtificialIteratorGenerator gen = new ArtificialIteratorGenerator(1,10,100,1000);
ArtificialResourcePool gen = new ArtificialResourcePool(1,10,100,1000);
GenomeLoc.setupRefContigOrdering(gen.getHeader().getSequenceDictionary());
try {
int unmappedReadsSeen = 0;
@ -109,7 +110,7 @@ public class SAMByReadsTest extends BaseTest {
/** Test out that we can shard the file and iterate over every read */
@Test
public void testShardingOfReadsSize14() {
ArtificialIteratorGenerator gen = new ArtificialIteratorGenerator(1,10,100,1000);
ArtificialResourcePool gen = new ArtificialResourcePool(1,10,100,1000);
GenomeLoc.setupRefContigOrdering(gen.getHeader().getSequenceDictionary());
targetReadCount = 14;
try {
@ -118,7 +119,7 @@ public class SAMByReadsTest extends BaseTest {
SAMDataSource data = new SAMDataSource(reads,true);
data.setIterGen(gen);
data.setResourcePool(gen);
shardStrategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.READS, gen.getHeader().getSequenceDictionary(), targetReadCount);
while (shardStrategy.hasNext()) {
@ -159,7 +160,7 @@ public class SAMByReadsTest extends BaseTest {
/** Test out that we can shard the file and iterate over every read */
@Test
public void testShardingOfReadsSize25() {
ArtificialIteratorGenerator gen = new ArtificialIteratorGenerator(1,10,100,1000);
ArtificialResourcePool gen = new ArtificialResourcePool(1,10,100,1000);
GenomeLoc.setupRefContigOrdering(gen.getHeader().getSequenceDictionary());
targetReadCount = 25;
try {
@ -168,7 +169,7 @@ public class SAMByReadsTest extends BaseTest {
SAMDataSource data = new SAMDataSource(reads,true);
data.setIterGen(gen);
data.setResourcePool(gen);
shardStrategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.READS, gen.getHeader().getSequenceDictionary(), targetReadCount);
while (shardStrategy.hasNext()) {
@ -212,28 +213,25 @@ public class SAMByReadsTest extends BaseTest {
/**
* use this to inject into SAMDataSource for testing
*/
class ArtificialIteratorGenerator extends IteratorGenerator {
class ArtificialResourcePool extends SAMIteratorPool {
// How strict should we be with SAM/BAM parsing?
protected SAMFileReader.ValidationStringency strictness = SAMFileReader.ValidationStringency.SILENT;
// the header
private SAMFileHeader header;
private int endingChr;
private int startingChr;
private int readCount;
private int readSize;
/** our SAM data files */
private final SAMFileHeader.SortOrder sortOrder = SAMFileHeader.SortOrder.coordinate;
public ArtificialIteratorGenerator( int startingChr, int endingChr, int readCount, int readSize) {
public ArtificialResourcePool( int startingChr, int endingChr, int readCount, int readSize) {
super( new Reads(Collections.<File>emptyList()),true );
header = ArtificialSamUtils.createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + readSize);
}
public CloseableIterator<SAMRecord> seek( GenomeLoc seekTo ) {
@Override
public StingSAMIterator iterator( GenomeLoc loc ) {
ArtificialSAMQueryIterator iter = ArtificialSamUtils.queryReadIterator(1, 10, 100, 1000);
if (seekTo != null) {
iter.queryContained(seekTo.getContig(), (int)seekTo.getStart(), (int)seekTo.getStop());
if (loc != null) {
iter.queryContained(loc.getContig(), (int)loc.getStart(), (int)loc.getStop());
}
return iter;
}
@ -244,6 +242,6 @@ class ArtificialIteratorGenerator extends IteratorGenerator {
* @return the merged header
*/
public SAMFileHeader getHeader() {
return this.header;
return this.header;
}
}