From dc6a9ca196f7aa082b9cd551a92fedc43bab5423 Mon Sep 17 00:00:00 2001 From: hanna Date: Wed, 10 Jun 2009 13:39:32 +0000 Subject: [PATCH] Pooling resources to lower memory consumption. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@962 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/GATKArgumentCollection.java | 2 +- .../sting/gatk/GenomeAnalysisEngine.java | 41 ++-- .../org/broadinstitute/sting/gatk/Reads.java | 41 +++- .../simpleDataSources/IteratorPool.java | 107 -------- .../ReferenceOrderedDataSource.java | 66 ++++- .../simpleDataSources/ResourcePool.java | 152 ++++++++++++ .../simpleDataSources/SAMDataSource.java | 231 ++++++++---------- .../gatk/iterators/BoundedReadIterator.java | 13 +- .../iterators/MergingSamRecordIterator2.java | 8 +- ...java => ReferenceOrderedDataPoolTest.java} | 24 +- .../simpleDataSources/SAMByReadsTest.java | 30 ++- 11 files changed, 406 insertions(+), 309 deletions(-) delete mode 100755 java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/IteratorPool.java create mode 100755 java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/ResourcePool.java rename java/test/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/{IteratorPoolTest.java => ReferenceOrderedDataPoolTest.java} (92%) diff --git a/java/src/org/broadinstitute/sting/gatk/GATKArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/GATKArgumentCollection.java index 90091e829..b2dbb7edd 100755 --- a/java/src/org/broadinstitute/sting/gatk/GATKArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/GATKArgumentCollection.java @@ -130,7 +130,7 @@ public class GATKArgumentCollection { @Element(required=false) @Argument(fullName = "validation_strictness", shortName = "S", doc = "How strict should we be with validation (LENIENT|SILENT|STRICT)", required = false) - public String strictnessLevel = "strict"; + public String strictnessLevel = "silent"; @Element(required=false) @Argument(fullName = "unsafe", shortName = "U", doc = "If set, enables unsafe operations, nothing will be checked at runtime.", required = false) diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 089f963d4..ccc680347 100755 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -8,8 +8,6 @@ import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.executive.MicroScheduler; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; -import org.broadinstitute.sting.gatk.refdata.RODIterator; -import org.broadinstitute.sting.gatk.refdata.IntervalRodIterator; import org.broadinstitute.sting.gatk.traversals.*; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.*; @@ -112,7 +110,7 @@ public class GenomeAnalysisEngine { logger.info("Strictness is " + strictness); // perform validation steps that are common to all the engines - genericEngineSetup(strictness); + genericEngineSetup(); // parse out any genomic location they've provided //List locationsList = setupIntervalRegion(); @@ -154,7 +152,7 @@ public class GenomeAnalysisEngine { ValidationStringency strictness = getValidationStringency(); logger.info("Strictness is " + strictness); - genericEngineSetup(strictness); + genericEngineSetup(); // store the results of the walker run walkerReturn = engine.traverse(my_walker); @@ -177,13 +175,13 @@ public class GenomeAnalysisEngine { // create the MicroScheduler if( argCollection.walkAllLoci ) Utils.scareUser("Argument --all_loci is deprecated. Please annotate your walker with @By(DataSource.REFERENCE) to perform a by-reference traversal."); - microScheduler = MicroScheduler.create(my_walker, new Reads(argCollection), argCollection.referenceFile, rods, argCollection.numberOfThreads); + microScheduler = MicroScheduler.create(my_walker, extractSourceInfoFromArguments(argCollection), argCollection.referenceFile, rods, argCollection.numberOfThreads); engine = microScheduler.getTraversalEngine(); } else if (my_walker instanceof ReadWalker) { if (argCollection.referenceFile == null) Utils.scareUser(String.format("Locus-based traversals require a reference file but none was given")); - microScheduler = MicroScheduler.create(my_walker, new Reads(argCollection), argCollection.referenceFile, rods, argCollection.numberOfThreads); + microScheduler = MicroScheduler.create(my_walker, extractSourceInfoFromArguments(argCollection), argCollection.referenceFile, rods, argCollection.numberOfThreads); engine = microScheduler.getTraversalEngine(); } @@ -193,11 +191,11 @@ public class GenomeAnalysisEngine { /** * commands that get executed for each engine, regardless of the type - * - * @param strictness our current strictness level */ - private void genericEngineSetup(ValidationStringency strictness) { - engine.setStrictness(strictness); + private void genericEngineSetup() { + Reads sourceInfo = extractSourceInfoFromArguments(argCollection); + + engine.setStrictness(sourceInfo.getValidationStringency()); engine.setMaxReads(argCollection.maximumEngineIterations); engine.setFilterZeroMappingQualityReads(argCollection.filterZeroMappingQualityReads); @@ -207,7 +205,7 @@ public class GenomeAnalysisEngine { engine.setLocation(parseIntervalRegion(argCollection.intervals, false)); } - engine.setReadFilters(new Reads(argCollection)); + engine.setReadFilters(sourceInfo); engine.setThreadedIO(argCollection.enabledThreadedIO); engine.setWalkOverAllSites(argCollection.walkAllLoci); @@ -233,6 +231,21 @@ public class GenomeAnalysisEngine { return locs; } + /** + * Bundles all the source information about the reads into a unified data structure. + * @param argCollection The collection of arguments passed to the engine. + * @return The reads object providing reads source info. + */ + private Reads extractSourceInfoFromArguments( GATKArgumentCollection argCollection ) { + return new Reads( argCollection.samFiles, + getValidationStringency(), + argCollection.downsampleFraction, + argCollection.downsampleCoverage, + argCollection.maximumReadSorts, + !argCollection.unsafe, + argCollection.filterZeroMappingQualityReads ); + } + private void validateInputsAgainstWalker(Walker walker, GATKArgumentCollection arguments, List> rods) { @@ -275,12 +288,12 @@ public class GenomeAnalysisEngine { * @return the validation stringency */ private ValidationStringency getValidationStringency() { - ValidationStringency strictness; + ValidationStringency strictness = ValidationStringency.SILENT; try { - strictness = Enum.valueOf(ValidationStringency.class, argCollection.strictnessLevel); + strictness = Enum.valueOf(ValidationStringency.class, argCollection.strictnessLevel.toUpperCase().trim()); } catch (IllegalArgumentException ex) { - strictness = ValidationStringency.STRICT; + logger.debug("Unable to parse strictness from command line. Assuming SILENT"); } return strictness; } diff --git a/java/src/org/broadinstitute/sting/gatk/Reads.java b/java/src/org/broadinstitute/sting/gatk/Reads.java index 83d836653..a996aedb8 100755 --- a/java/src/org/broadinstitute/sting/gatk/Reads.java +++ b/java/src/org/broadinstitute/sting/gatk/Reads.java @@ -6,6 +6,8 @@ import org.broadinstitute.sting.utils.StingException; import java.io.File; import java.io.FileNotFoundException; import java.util.List; + +import net.sf.samtools.SAMFileReader; /** * User: hanna * Date: May 14, 2009 @@ -25,7 +27,7 @@ import java.util.List; */ public class Reads { private List readsFiles = null; - + private SAMFileReader.ValidationStringency validationStringency = SAMFileReader.ValidationStringency.STRICT; private Double downsamplingFraction = null; private Integer downsampleToCoverage = null; private Integer maxOnFlySorts = null; @@ -40,6 +42,14 @@ public class Reads { return readsFiles; } + /** + * How strict should validation be? + * @return Stringency of validation. + */ + public SAMFileReader.ValidationStringency getValidationStringency() { + return validationStringency; + } + /** * Get the fraction of reads to downsample. * @return Downsample fraction. @@ -88,14 +98,27 @@ public class Reads { * Extract the command-line arguments having to do with reads input * files and store them in an easy-to-work-with package. Constructor * is package protected. - * @param arguments GATK parsed command-line arguments. + * @param samFiles list of reads files. + * @param strictness Stringency of reads file parsing. + * @param downsampleFraction fraction of reads to downsample. + * @param downsampleCoverage downsampling per-locus. + * @param maxOnFlySorts how many sorts to perform on-the-fly. + * @param beSafe Whether to enable safety checking. + * @param filterZeroMappingQualityReads whether to filter zero mapping quality reads. */ - Reads( GATKArgumentCollection arguments ) { - this.readsFiles = arguments.samFiles; - if (arguments.downsampleFraction != null) downsamplingFraction = arguments.downsampleFraction; - if (arguments.downsampleCoverage != null) downsampleToCoverage = arguments.downsampleCoverage; - if (arguments.maximumReadSorts != null) maxOnFlySorts = arguments.maximumReadSorts; - if (arguments.filterZeroMappingQualityReads != null) filterZeroMappingQualityReads = arguments.filterZeroMappingQualityReads; - beSafe = !arguments.unsafe; + Reads( List samFiles, + SAMFileReader.ValidationStringency strictness, + Double downsampleFraction, + Integer downsampleCoverage, + Integer maxOnFlySorts, + Boolean beSafe, + Boolean filterZeroMappingQualityReads ) { + this.readsFiles = samFiles; + this.validationStringency = strictness; + this.downsamplingFraction = downsampleFraction; + this.downsampleToCoverage = downsampleCoverage; + this.maxOnFlySorts = maxOnFlySorts; + this.beSafe = beSafe; + this.filterZeroMappingQualityReads = filterZeroMappingQualityReads; } } diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/IteratorPool.java b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/IteratorPool.java deleted file mode 100755 index dcfeb1382..000000000 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/IteratorPool.java +++ /dev/null @@ -1,107 +0,0 @@ -package org.broadinstitute.sting.gatk.dataSources.simpleDataSources; - -import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; -import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; -import org.broadinstitute.sting.gatk.refdata.RODIterator; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.StingException; - -import java.util.List; -import java.util.ArrayList; -/** - * User: hanna - * Date: May 21, 2009 - * Time: 10:55:26 AM - * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT - * Software and documentation are copyright 2005 by the Broad Institute. - * All rights are reserved. - * - * Users acknowledge that this software is supplied without any warranty or support. - * The Broad Institute is not responsible for its use, misuse, or - * functionality. - */ - -/** - * A pool of open iterators. Currently highly specialized to RODs, but could theoretically be - * generalized to a pool of arbitrary seekable, closeable iterators. Not thread-safe. - */ -class IteratorPool { - private final ReferenceOrderedData rod; - - /** - * All iterators of this reference-ordered data. - */ - private List allIterators = new ArrayList(); - - /** - * All iterators that are not currently in service. - */ - private List availableIterators = new ArrayList(); - - /** - * Create a new iterator pool given the current ROD. - * @param rod Reference-ordered data. - */ - public IteratorPool( ReferenceOrderedData rod ) { - this.rod = rod; - } - - /** - * Get an iterator whose position is before the specified location. Create a new one if none exists. - * @param position Target position for the iterator. - * @return - */ - public RODIterator iterator( GenomeLoc position ) { - // Grab the first iterator in the list whose position is before the requested position. - RODIterator selectedIterator = null; - for( RODIterator iterator: availableIterators ) { - if( (iterator.position() == null && iterator.hasNext()) || - (iterator.position() != null && iterator.position().isBefore(position)) ) { - selectedIterator = iterator; - break; - } - } - - // No iterator found? Create another. It is expected that - // each iterator created will have its own file handle. - if( selectedIterator == null ) { - selectedIterator = rod.iterator(); - allIterators.add(selectedIterator); - } - - // Remove the iterator from the list of available iterators. - if( availableIterators.contains(selectedIterator) ) - availableIterators.remove(selectedIterator); - - return selectedIterator; - } - - /** - * Close the given iterator, returning it to the pool. - * @param iterator Iterator to return to the pool. - */ - public void close( RODIterator iterator ) { - if( !allIterators.contains(iterator) ) - throw new StingException("Iterator does not belong to the given pool."); - availableIterators.add(iterator); - } - - /** - * Operating stats...get the number of total iterators. Package-protected - * for unit testing. - * @return An integer number of total iterators. - */ - int numIterators() { - return allIterators.size(); - } - - /** - * Operating stats...get the number of available iterators. Package-protected - * for unit testing. - * @return An integer number of available iterators. - */ - int numAvailableIterators() { - return availableIterators.size(); - } - -} diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/ReferenceOrderedDataSource.java b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/ReferenceOrderedDataSource.java index 8090ef4d5..c76d6c537 100755 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/ReferenceOrderedDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/ReferenceOrderedDataSource.java @@ -5,11 +5,9 @@ import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; import org.broadinstitute.sting.gatk.refdata.RODIterator; import org.broadinstitute.sting.gatk.dataSources.shards.Shard; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.StingException; -import java.util.List; -import java.util.ArrayList; import java.util.Iterator; +import java.util.List; /** * User: hanna * Date: May 21, 2009 @@ -30,20 +28,20 @@ public class ReferenceOrderedDataSource implements SimpleDataSource { /** * The reference-ordered data itself. */ - private final ReferenceOrderedData rod; + private final ReferenceOrderedData rod; /** * A pool of iterators for navigating through the genome. */ - private IteratorPool iteratorPool = null; + private ReferenceOrderedDataPool iteratorPool = null; /** * Create a new reference-ordered data source. * @param rod */ - public ReferenceOrderedDataSource( ReferenceOrderedData rod) { + public ReferenceOrderedDataSource( ReferenceOrderedData rod) { this.rod = rod; - this.iteratorPool = new IteratorPool( rod ); + this.iteratorPool = new ReferenceOrderedDataPool( rod ); } /** @@ -69,7 +67,59 @@ public class ReferenceOrderedDataSource implements SimpleDataSource { * @param iterator Iterator to close. */ public void close( RODIterator iterator ) { - this.iteratorPool.close(iterator); + this.iteratorPool.release(iterator); + } + +} + +/** + * A pool of reference-ordered data iterators. + */ +class ReferenceOrderedDataPool extends ResourcePool { + private final ReferenceOrderedData rod; + + public ReferenceOrderedDataPool( ReferenceOrderedData rod ) { + this.rod = rod; + } + + /** + * Create a new iterator from the existing reference-ordered data. This new iterator is expected + * to be completely independent of any other iterator. + * @param position @{inheritedDoc} + * @return The newly created resource. + */ + public RODIterator createNewResource( GenomeLoc position ) { + return rod.iterator(); + } + + /** + * Finds the best existing ROD iterator from the pool. In this case, the best existing ROD is defined as + * the first one encountered that is at or before the given position. + * @param position @{inheritedDoc} + * @param resources @{inheritedDoc} + * @return @{inheritedDoc} + */ + public RODIterator selectBestExistingResource( GenomeLoc position, List resources ) { + for( RODIterator iterator: resources ) { + if( (iterator.position() == null && iterator.hasNext()) || + (iterator.position() != null && iterator.position().isBefore(position)) ) + return iterator; + } + return null; + } + + /** + * In this case, the iterator is the resource. Pass it through. + */ + public RODIterator createIteratorFromResource( GenomeLoc position, RODIterator resource ) { + return resource; + } + + /** + * Don't worry about closing the resource; let the file handles expire naturally for the moment. + */ + public void closeResource( RODIterator resource ) { + } } diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/ResourcePool.java b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/ResourcePool.java new file mode 100755 index 000000000..12d93d24b --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/ResourcePool.java @@ -0,0 +1,152 @@ +package org.broadinstitute.sting.gatk.dataSources.simpleDataSources; + +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.StingException; +import org.apache.log4j.Logger; + +import java.util.List; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.HashMap; +import java.util.Map; +/** + * User: hanna + * Date: May 21, 2009 + * Time: 10:55:26 AM + * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT + * Software and documentation are copyright 2005 by the Broad Institute. + * All rights are reserved. + * + * Users acknowledge that this software is supplied without any warranty or support. + * The Broad Institute is not responsible for its use, misuse, or + * functionality. + */ + +/** + * A pool of open resources, all of which can create a closeable iterator. + */ +abstract class ResourcePool { + /** + * All iterators of this reference-ordered data. + */ + private List allResources = new ArrayList(); + + /** + * All iterators that are not currently in service. + */ + private List availableResources = new ArrayList(); + + /** + * Which iterators are assigned to which pools. + */ + private Map resourceAssignments = new HashMap(); + + /** + * Get an iterator whose position is before the specified location. Create a new one if none exists. + * @param position Target position for the iterator. + * @return An iterator that can traverse the selected region. Should be able to iterate concurrently with other + * iterators from tihs pool. + */ + public I iterator( GenomeLoc position ) { + // Grab the first iterator in the list whose position is before the requested position. + T selectedResource = null; + synchronized(this) { + selectedResource = selectBestExistingResource( position, availableResources ); + + // Remove the iterator from the list of available iterators. + if( selectedResource != null ) + availableResources.remove(selectedResource); + } + + // No iterator found? Create another. It is expected that + // each iterator created will have its own file handle. + if( selectedResource == null ) { + selectedResource = createNewResource(position); + addNewResource( selectedResource ); + } + + I iterator = createIteratorFromResource( position, selectedResource ); + + // Make a note of this assignment for proper releasing later. + resourceAssignments.put( iterator, selectedResource ); + + return iterator; + } + + /** + * Release the lock on the given iterator, returning it to the pool. + * @param iterator Iterator to return to the pool. + */ + public void release( I iterator ) { + synchronized(this) { + // Find and remove the resource from the list of allocated resources. + T resource = resourceAssignments.get( iterator ); + resourceAssignments.remove(resource); + + // Return the resource to the pool. + if( !allResources.contains(resource) ) + throw new StingException("Iterator does not belong to the given pool."); + availableResources.add(resource); + } + } + + /** + * Add a resource to the list of available resources. Useful if derived classes + * want to seed the pool with a set of at a given time (like at initialization). + * @param resource The new resource to add. + */ + protected void addNewResource( T resource ) { + synchronized(this) { + allResources.add(resource); + } + } + + /** + * If no appropriate resources are found in the pool, the system can create a new resource. + * Delegate the creation of the resource to the subclass. + * @param position Position for the new resource. This information may or may not inform the new resource. + * @return The new resource created. + */ + protected abstract T createNewResource( GenomeLoc position ); + + /** + * Find the most appropriate resource to acquire the specified data. + * @param position The data over which the resource is required. + * @param availableResources A list of candidate resources to evaluate. + * @return The best choice of the availableResources, or null if no resource meets the criteria. + */ + protected abstract T selectBestExistingResource( GenomeLoc position, List availableResources ); + + /** + * Create an iterator over the specified resource. + * @param position The bounds of iteration. The first element of the iterator through the last element should all + * be in the range described by position. + * @return A new iterator over the given data. + */ + protected abstract I createIteratorFromResource( GenomeLoc position, T resource ); + + /** + * Retire this resource from service. + * @param resource The resource to retire. + */ + protected abstract void closeResource(T resource); + + /** + * Operating stats...get the number of total iterators. Package-protected + * for unit testing. + * @return An integer number of total iterators. + */ + int numIterators() { + return allResources.size(); + } + + /** + * Operating stats...get the number of available iterators. Package-protected + * for unit testing. + * @return An integer number of available iterators. + */ + int numAvailableIterators() { + return availableResources.size(); + } + +} diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java index 34316167d..c0ff97eba 100755 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.gatk.dataSources.simpleDataSources; import net.sf.picard.sam.SamFileHeaderMerger; +import net.sf.picard.util.PeekableIterator; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMFileReader; import net.sf.samtools.SAMReadGroupRecord; @@ -18,6 +19,7 @@ import org.broadinstitute.sting.utils.StingException; import java.io.File; import java.util.ArrayList; import java.util.List; +import java.util.Iterator; /** * User: aaron @@ -54,22 +56,8 @@ public class SAMDataSource implements SimpleDataSource { private boolean intoUnmappedReads = false; private int readsSeenAtLastPos = 0; - - /** - * package protected getter and setter for the iterator generator - * - * @return - */ - IteratorGenerator getIterGen() { - return iterGen; - } - - void setIterGen( IteratorGenerator iterGen ) { - this.iterGen = iterGen; - } - - // where we get out iterators from - private IteratorGenerator iterGen; + // A pool of SAM iterators. + private SAMIteratorPool iteratorPool = null; /** * constructor, given sam files @@ -91,8 +79,15 @@ public class SAMDataSource implements SimpleDataSource { throw new SimpleDataSourceLoadException("SAMDataSource: Unable to load file: " + smFile.getName()); } } - iterGen = new MSR2IteratorGenerator(reads, byReads); + iteratorPool = new SAMIteratorPool(reads,byReads); + } + /** + * For unit testing, add a custom iterator pool. + * @param iteratorPool Custom mock iterator pool. + */ + void setResourcePool( SAMIteratorPool iteratorPool ) { + this.iteratorPool = iteratorPool; } @@ -105,11 +100,8 @@ public class SAMDataSource implements SimpleDataSource { * * @return an iterator for that region */ - public StingSAMIterator seekLocus( GenomeLoc location ) throws SimpleDataSourceLoadException { - // make a merging iterator for this record - CloseableIterator iter = iterGen.seek(location); - // return the iterator - return StingSAMIteratorAdapter.adapt(reads, iter); + public StingSAMIterator seekLocus(GenomeLoc location) throws SimpleDataSourceLoadException { + return iteratorPool.iterator(location); } /** @@ -153,7 +145,7 @@ public class SAMDataSource implements SimpleDataSource { * @return SAM file header. */ public SAMFileHeader getHeader() { - return this.iterGen.getHeader(); + return iteratorPool.getHeader(); } @@ -169,16 +161,16 @@ public class SAMDataSource implements SimpleDataSource { private BoundedReadIterator seekRead( ReadShard shard ) throws SimpleDataSourceLoadException { BoundedReadIterator bound = null; - CloseableIterator iter = null; + StingSAMIterator iter = null; if (!intoUnmappedReads) { if (lastReadPos == null) { - lastReadPos = new GenomeLoc(iterGen.getHeader().getSequenceDictionary().getSequence(0).getSequenceIndex(), 0, Integer.MAX_VALUE); - iter = iterGen.seek(lastReadPos); + lastReadPos = new GenomeLoc(getHeader().getSequenceDictionary().getSequence(0).getSequenceIndex(), 0, Integer.MAX_VALUE); + iter = iteratorPool.iterator(lastReadPos); return InitialReadIterator(shard.getSize(), iter); } else { lastReadPos.setStop(-1); - iter = iterGen.seek(lastReadPos); + iter = iteratorPool.iterator(lastReadPos); bound = fastMappedReadSeek(shard.getSize(), StingSAMIteratorAdapter.adapt(reads, iter)); } } @@ -187,8 +179,8 @@ public class SAMDataSource implements SimpleDataSource { if (iter != null) { iter.close(); } - iter = iterGen.seek(null); - bound = toUnmappedReads(shard.getSize(), (PeekingStingIterator) iter); + iter = iteratorPool.iterator(null); + bound = toUnmappedReads(shard.getSize(), iter); } if (bound == null) { shard.signalDone(); @@ -219,39 +211,41 @@ public class SAMDataSource implements SimpleDataSource { * @return the bounded iterator that you can use to get the intervaled reads from * @throws SimpleDataSourceLoadException */ - BoundedReadIterator toUnmappedReads( long readCount, U iter ) throws SimpleDataSourceLoadException { + BoundedReadIterator toUnmappedReads( long readCount, StingSAMIterator iter ) throws SimpleDataSourceLoadException { + PeekableIterator peekable = new PeekableIterator(iter); + int count = 0; int cnt = 0; SAMRecord d = null; - while (iter.hasNext()) { - d = iter.peek(); + while (peekable.hasNext()) { + d = peekable.peek(); int x = d.getReferenceIndex(); if (x < 0) // we have the magic read that starts the unmapped read segment! break; cnt++; - iter.next(); + peekable.next(); } // check to see what happened, did we run out of reads? - if (!iter.hasNext()) { + if (!peekable.hasNext()) { return null; } // now walk until we've taken the unmapped read count - while (iter.hasNext() && count < this.readsTaken) { - iter.next(); + while (peekable.hasNext() && count < this.readsTaken) { + peekable.next(); count++; } // check to see what happened, did we run out of reads? - if (!iter.hasNext()) { + if (!peekable.hasNext()) { return null; } // we're not out of unmapped reads, so increment our read cout this.readsTaken += readCount; - return new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), readCount); + return new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, peekable), readCount); } @@ -293,7 +287,7 @@ public class SAMDataSource implements SimpleDataSource { readsTaken = readCount; readsSeenAtLastPos = 0; lastReadPos.setStop(-1); - CloseableIterator ret = iterGen.seek(lastReadPos); + CloseableIterator ret = iteratorPool.iterator(lastReadPos); return new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, ret), readCount); } } @@ -360,42 +354,81 @@ public class SAMDataSource implements SimpleDataSource { return bound; } - } +class SAMIteratorPool extends ResourcePool { + /** + * Source information about the reads. + */ + protected Reads reads; + + /** + * Is this a by-reads traversal or a by-locus? + */ + protected boolean byReads; + + /** + * File header for the combined file. + */ + protected SAMFileHeader header; -/** - * iterator generator - *

- * This class generates iterators for the SAMDataSource. This class was introduced for testing purposes, - * since it became increasingly hard to test the SAM data source code. The class defines two abstraact - * methods: - *

- * -seek( GenomeLoc ) which returns an iterator seeked to the genome loc, and if null is passed to the default - * location (which is implementation specific). - *

- * -getHeader(), which returns a SAMFileHeader for the specified IteratorGenerator. I hope we can phase this - * method out, since it doesn't seem necessary, and it would be much cleaner with out it. - */ -abstract class IteratorGenerator { /** our log, which we want to capture anything from this class */ - protected static Logger logger = Logger.getLogger(SAMDataSource.class); + protected static Logger logger = Logger.getLogger(SAMIteratorPool.class); + + public SAMIteratorPool( Reads reads, boolean byReads ) { + this.reads = reads; + this.byReads = byReads; + + SamFileHeaderMerger merger = createNewResource( null ); + this.header = merger.getMergedHeader(); + // Add this resource to the pool. + this.addNewResource( merger ); + } /** - * seek to a location - * - * @param seekTo the genome loc to seek to - * - * @return StingSAMIterator + * Get the combined header for all files in the iterator pool. */ - public abstract CloseableIterator seek( GenomeLoc seekTo ); + public SAMFileHeader getHeader() { + return header; + } - /** - * get the merged header - * - * @return the merged header - */ - public abstract SAMFileHeader getHeader(); + protected SamFileHeaderMerger selectBestExistingResource( GenomeLoc position, List mergers) { + if( mergers.size() == 0 ) + return null; + return mergers.get(0); + } + + protected SamFileHeaderMerger createNewResource( GenomeLoc position ) { + return createHeaderMerger( reads, SAMFileHeader.SortOrder.coordinate ); + } + + protected StingSAMIterator createIteratorFromResource( GenomeLoc loc, SamFileHeaderMerger headerMerger ) { + final MergingSamRecordIterator2 iterator = new MergingSamRecordIterator2(headerMerger, reads); + + if( loc != null ) { + if (byReads) + iterator.queryContained(loc.getContig(), (int) loc.getStart(), (int) loc.getStop()); + else + iterator.queryOverlapping(loc.getContig(), (int) loc.getStart(), (int) loc.getStop()); + } + + return new StingSAMIterator() { + public Reads getSourceInfo() { return reads; } + public void close() { + iterator.close(); + release(this); + } + public Iterator iterator() { return this; } + public boolean hasNext() { return iterator.hasNext(); } + public SAMRecord next() { return iterator.next(); } + public void remove() { throw new UnsupportedOperationException("Can't remove from a StingSAMIterator"); } + }; + } + + protected void closeResource( SamFileHeaderMerger resource ) { + for( SAMFileReader reader: resource.getReaders() ) + reader.close(); + } /** * Load a SAM/BAM, given an input file. @@ -404,7 +437,7 @@ abstract class IteratorGenerator { * * @return a SAMFileReader for the file, null if we're attempting to read a list */ - protected static SAMFileReader initializeSAMFile( final File samFile, SAMFileReader.ValidationStringency strictness ) { + protected SAMFileReader initializeSAMFile( final File samFile, SAMFileReader.ValidationStringency strictness ) { if (samFile.toString().endsWith(".list")) { return null; } else { @@ -425,11 +458,11 @@ abstract class IteratorGenerator { * @return a list of SAMFileReaders that represent the stored file names * @throws SimpleDataSourceLoadException if there's a problem loading the files */ - protected static List GetReaderList( Reads reads, SAMFileReader.ValidationStringency strictness ) throws SimpleDataSourceLoadException { + protected List GetReaderList( Reads reads ) throws SimpleDataSourceLoadException { // right now this is pretty damn heavy, it copies the file list into a reader list every time List lst = new ArrayList(); for (File f : reads.getReadsFiles()) { - SAMFileReader reader = initializeSAMFile(f, strictness); + SAMFileReader reader = initializeSAMFile(f, reads.getValidationStringency()); if (reader.getFileHeader().getReadGroups().size() < 1) { //logger.warn("Setting header in reader " + f.getName()); @@ -453,60 +486,8 @@ abstract class IteratorGenerator { * * @return a SamFileHeaderMerger that includes the set of SAM files we were created with */ - protected static SamFileHeaderMerger createHeaderMerger( Reads reads, SAMFileReader.ValidationStringency strictness, SAMFileHeader.SortOrder SORT_ORDER ) { - List lst = GetReaderList(reads, strictness); + protected SamFileHeaderMerger createHeaderMerger( Reads reads, SAMFileHeader.SortOrder SORT_ORDER ) { + List lst = GetReaderList(reads); return new SamFileHeaderMerger(lst, SORT_ORDER, true); } -} - - -/** - * MSR2IteratorGenerator - *

- * generates a MerginsSAMIterator2, given a genomic location. The constructor takes the reads structure, - * and a flag indicating if we're dealing with reads or loci (to determine the correct query function). - */ -class MSR2IteratorGenerator extends IteratorGenerator { - /** our read pile */ - private Reads reads; - - private SamFileHeaderMerger header; - - // How strict should we be with SAM/BAM parsing? - protected SAMFileReader.ValidationStringency strictness = SAMFileReader.ValidationStringency.SILENT; - - // are we by reads or by loci - protected boolean byReads = true; - - /** our SAM data files */ - private final SAMFileHeader.SortOrder sortOrder = SAMFileHeader.SortOrder.coordinate; - - public MSR2IteratorGenerator( Reads reads, boolean byReads ) { - this.reads = reads; - this.header = IteratorGenerator.createHeaderMerger(reads, strictness, sortOrder); - this.byReads = byReads; - } - - public CloseableIterator seek( GenomeLoc seekTo ) { - SamFileHeaderMerger mg = createHeaderMerger(reads, strictness, sortOrder); - MergingSamRecordIterator2 iter = new MergingSamRecordIterator2(mg, reads); - if (seekTo != null) { - if (byReads) - iter.queryContained(seekTo.getContig(), (int) seekTo.getStart(), (int) seekTo.getStop()); - else - iter.queryOverlapping(seekTo.getContig(), (int) seekTo.getStart(), (int) seekTo.getStop()); - } - return iter; - } - - /** - * get the merged header - * - * @return the merged header - */ - public SAMFileHeader getHeader() { - return header.getMergedHeader(); - } -} - - +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/BoundedReadIterator.java b/java/src/org/broadinstitute/sting/gatk/iterators/BoundedReadIterator.java index eb5fb0a48..da7ea497c 100755 --- a/java/src/org/broadinstitute/sting/gatk/iterators/BoundedReadIterator.java +++ b/java/src/org/broadinstitute/sting/gatk/iterators/BoundedReadIterator.java @@ -55,9 +55,6 @@ public class BoundedReadIterator implements StingSAMIterator { // our unmapped read flag private boolean doNotUseThatUnmappedReadPile = false; - // are we open - private boolean isOpen = false; - // the next read we've buffered private SAMRecord record = null; @@ -67,10 +64,6 @@ public class BoundedReadIterator implements StingSAMIterator { * @param readCount */ public BoundedReadIterator(StingSAMIterator iter, long readCount) { - if (iter != null) { - isOpen = true; - - } this.iterator = iter; this.readCount = readCount; } @@ -103,7 +96,7 @@ public class BoundedReadIterator implements StingSAMIterator { * @return */ public boolean hasNext() { - if (isOpen && iterator.hasNext() && currentCount < readCount) { + if (iterator.hasNext() && currentCount < readCount) { record = iterator.next(); ++currentCount; if (record.getAlignmentStart() == 0 && doNotUseThatUnmappedReadPile) { @@ -111,9 +104,6 @@ public class BoundedReadIterator implements StingSAMIterator { } return true; } else { - if (isOpen) { - close(); - } return false; } } @@ -137,7 +127,6 @@ public class BoundedReadIterator implements StingSAMIterator { * close the iterator */ public void close() { - isOpen = false; iterator.close(); } diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/MergingSamRecordIterator2.java b/java/src/org/broadinstitute/sting/gatk/iterators/MergingSamRecordIterator2.java index 12b022d44..ca865eef1 100644 --- a/java/src/org/broadinstitute/sting/gatk/iterators/MergingSamRecordIterator2.java +++ b/java/src/org/broadinstitute/sting/gatk/iterators/MergingSamRecordIterator2.java @@ -93,7 +93,6 @@ public class MergingSamRecordIterator2 implements CloseableIterator, } final SAMRecordComparator comparator = getComparator(); for (final SAMFileReader reader : samHeaderMerger.getReaders()) { - //reader.close(); Iterator recordIter = reader.query(contig, start, stop, contained); final ComparableSamRecordIterator iterator = new ComparableSamRecordIterator(reader, recordIter, comparator); addIfNotEmpty(iterator); @@ -257,13 +256,12 @@ public class MergingSamRecordIterator2 implements CloseableIterator, /** - * closes all the file handles for the readers....DO THIS or you will run out of handles + * closes all open iterators....DO THIS or you will run out of handles * with sharding. */ public void close() { - for (SAMFileReader reader : samHeaderMerger.getReaders()) { - reader.close(); - } + for( ComparableSamRecordIterator iterator: pq ) + iterator.close(); } diff --git a/java/test/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/IteratorPoolTest.java b/java/test/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/ReferenceOrderedDataPoolTest.java similarity index 92% rename from java/test/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/IteratorPoolTest.java rename to java/test/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/ReferenceOrderedDataPoolTest.java index 8ff0cd345..5f174ab7b 100755 --- a/java/test/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/IteratorPoolTest.java +++ b/java/test/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/ReferenceOrderedDataPoolTest.java @@ -32,7 +32,7 @@ import java.io.FileNotFoundException; * Test the contents and number of iterators in the pool. */ -public class IteratorPoolTest extends BaseTest { +public class ReferenceOrderedDataPoolTest extends BaseTest { private static File sequenceFile = new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"); @@ -56,7 +56,7 @@ public class IteratorPoolTest extends BaseTest { @Test public void testCreateSingleIterator() { - IteratorPool iteratorPool = new IteratorPool(rod); + ResourcePool iteratorPool = new ReferenceOrderedDataPool(rod); RODIterator iterator = (RODIterator)iteratorPool.iterator( testSite1 ); Assert.assertEquals("Number of iterators in the pool is incorrect", 1, iteratorPool.numIterators()); @@ -69,7 +69,7 @@ public class IteratorPoolTest extends BaseTest { assertTrue(datum.get("COL2").equals("B")); assertTrue(datum.get("COL3").equals("C")); - iteratorPool.close(iterator); + iteratorPool.release(iterator); Assert.assertEquals("Number of iterators in the pool is incorrect", 1, iteratorPool.numIterators()); Assert.assertEquals("Number of available iterators in the pool is incorrect", 1, iteratorPool.numAvailableIterators()); @@ -77,7 +77,7 @@ public class IteratorPoolTest extends BaseTest { @Test public void testCreateMultipleIterators() { - IteratorPool iteratorPool = new IteratorPool(rod); + ReferenceOrderedDataPool iteratorPool = new ReferenceOrderedDataPool(rod); RODIterator iterator1 = (RODIterator)iteratorPool.iterator( testSite1 ); // Create a new iterator at position 2. @@ -114,12 +114,12 @@ public class IteratorPoolTest extends BaseTest { assertTrue(datum.get("COL3").equals("E")); // Cleanup, and make sure the number of iterators dies appropriately. - iteratorPool.close(iterator1); + iteratorPool.release(iterator1); Assert.assertEquals("Number of iterators in the pool is incorrect", 2, iteratorPool.numIterators()); Assert.assertEquals("Number of available iterators in the pool is incorrect", 1, iteratorPool.numAvailableIterators()); - iteratorPool.close(iterator2); + iteratorPool.release(iterator2); Assert.assertEquals("Number of iterators in the pool is incorrect", 2, iteratorPool.numIterators()); Assert.assertEquals("Number of available iterators in the pool is incorrect", 2, iteratorPool.numAvailableIterators()); @@ -127,7 +127,7 @@ public class IteratorPoolTest extends BaseTest { @Test public void testIteratorConservation() { - IteratorPool iteratorPool = new IteratorPool(rod); + ReferenceOrderedDataPool iteratorPool = new ReferenceOrderedDataPool(rod); RODIterator iterator = (RODIterator)iteratorPool.iterator( testSite1 ); Assert.assertEquals("Number of iterators in the pool is incorrect", 1, iteratorPool.numIterators()); @@ -139,7 +139,7 @@ public class IteratorPoolTest extends BaseTest { assertTrue(datum.get("COL2").equals("B")); assertTrue(datum.get("COL3").equals("C")); - iteratorPool.close(iterator); + iteratorPool.release(iterator); // Create another iterator after the current iterator. iterator = iteratorPool.iterator(testSite3); @@ -154,7 +154,7 @@ public class IteratorPoolTest extends BaseTest { assertTrue(datum.get("COL2").equals("G")); assertTrue(datum.get("COL3").equals("H")); - iteratorPool.close(iterator); + iteratorPool.release(iterator); Assert.assertEquals("Number of iterators in the pool is incorrect", 1, iteratorPool.numIterators()); Assert.assertEquals("Number of available iterators in the pool is incorrect", 1, iteratorPool.numAvailableIterators()); @@ -162,7 +162,7 @@ public class IteratorPoolTest extends BaseTest { @Test public void testIteratorCreation() { - IteratorPool iteratorPool = new IteratorPool(rod); + ReferenceOrderedDataPool iteratorPool = new ReferenceOrderedDataPool(rod); RODIterator iterator = (RODIterator)iteratorPool.iterator( testSite3 ); Assert.assertEquals("Number of iterators in the pool is incorrect", 1, iteratorPool.numIterators()); @@ -174,7 +174,7 @@ public class IteratorPoolTest extends BaseTest { assertTrue(datum.get("COL2").equals("G")); assertTrue(datum.get("COL3").equals("H")); - iteratorPool.close(iterator); + iteratorPool.release(iterator); // Create another iterator after the current iterator. iterator = iteratorPool.iterator(testSite1); @@ -189,7 +189,7 @@ public class IteratorPoolTest extends BaseTest { assertTrue(datum.get("COL2").equals("B")); assertTrue(datum.get("COL3").equals("C")); - iteratorPool.close(iterator); + iteratorPool.release(iterator); Assert.assertEquals("Number of iterators in the pool is incorrect", 2, iteratorPool.numIterators()); Assert.assertEquals("Number of available iterators in the pool is incorrect", 2, iteratorPool.numAvailableIterators()); diff --git a/java/test/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMByReadsTest.java b/java/test/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMByReadsTest.java index 2c6e456e3..053d22cb3 100755 --- a/java/test/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMByReadsTest.java +++ b/java/test/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMByReadsTest.java @@ -22,6 +22,7 @@ import org.junit.Test; import java.io.File; import java.util.ArrayList; import java.util.List; +import java.util.Collections; /** * @@ -76,7 +77,7 @@ public class SAMByReadsTest extends BaseTest { /** Test out that we can shard the file and iterate over every read */ @Test public void testToUnmappedReads() { - ArtificialIteratorGenerator gen = new ArtificialIteratorGenerator(1,10,100,1000); + ArtificialResourcePool gen = new ArtificialResourcePool(1,10,100,1000); GenomeLoc.setupRefContigOrdering(gen.getHeader().getSequenceDictionary()); try { int unmappedReadsSeen = 0; @@ -109,7 +110,7 @@ public class SAMByReadsTest extends BaseTest { /** Test out that we can shard the file and iterate over every read */ @Test public void testShardingOfReadsSize14() { - ArtificialIteratorGenerator gen = new ArtificialIteratorGenerator(1,10,100,1000); + ArtificialResourcePool gen = new ArtificialResourcePool(1,10,100,1000); GenomeLoc.setupRefContigOrdering(gen.getHeader().getSequenceDictionary()); targetReadCount = 14; try { @@ -118,7 +119,7 @@ public class SAMByReadsTest extends BaseTest { SAMDataSource data = new SAMDataSource(reads,true); - data.setIterGen(gen); + data.setResourcePool(gen); shardStrategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.READS, gen.getHeader().getSequenceDictionary(), targetReadCount); while (shardStrategy.hasNext()) { @@ -159,7 +160,7 @@ public class SAMByReadsTest extends BaseTest { /** Test out that we can shard the file and iterate over every read */ @Test public void testShardingOfReadsSize25() { - ArtificialIteratorGenerator gen = new ArtificialIteratorGenerator(1,10,100,1000); + ArtificialResourcePool gen = new ArtificialResourcePool(1,10,100,1000); GenomeLoc.setupRefContigOrdering(gen.getHeader().getSequenceDictionary()); targetReadCount = 25; try { @@ -168,7 +169,7 @@ public class SAMByReadsTest extends BaseTest { SAMDataSource data = new SAMDataSource(reads,true); - data.setIterGen(gen); + data.setResourcePool(gen); shardStrategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.READS, gen.getHeader().getSequenceDictionary(), targetReadCount); while (shardStrategy.hasNext()) { @@ -212,28 +213,25 @@ public class SAMByReadsTest extends BaseTest { /** * use this to inject into SAMDataSource for testing */ -class ArtificialIteratorGenerator extends IteratorGenerator { +class ArtificialResourcePool extends SAMIteratorPool { // How strict should we be with SAM/BAM parsing? protected SAMFileReader.ValidationStringency strictness = SAMFileReader.ValidationStringency.SILENT; // the header private SAMFileHeader header; - private int endingChr; - private int startingChr; - private int readCount; - private int readSize; - /** our SAM data files */ private final SAMFileHeader.SortOrder sortOrder = SAMFileHeader.SortOrder.coordinate; - public ArtificialIteratorGenerator( int startingChr, int endingChr, int readCount, int readSize) { + public ArtificialResourcePool( int startingChr, int endingChr, int readCount, int readSize) { + super( new Reads(Collections.emptyList()),true ); header = ArtificialSamUtils.createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + readSize); } - public CloseableIterator seek( GenomeLoc seekTo ) { + @Override + public StingSAMIterator iterator( GenomeLoc loc ) { ArtificialSAMQueryIterator iter = ArtificialSamUtils.queryReadIterator(1, 10, 100, 1000); - if (seekTo != null) { - iter.queryContained(seekTo.getContig(), (int)seekTo.getStart(), (int)seekTo.getStop()); + if (loc != null) { + iter.queryContained(loc.getContig(), (int)loc.getStart(), (int)loc.getStop()); } return iter; } @@ -244,6 +242,6 @@ class ArtificialIteratorGenerator extends IteratorGenerator { * @return the merged header */ public SAMFileHeader getHeader() { - return this.header; + return this.header; } } \ No newline at end of file