ReservoirDownsampler optimizations

-- Add an option to not always allocate ArrayLists of targetSampleSize, but rather the previous size + MARGIN.  This helps for LIBS, as most of the time we don't need nearly as much space as we allow
-- consumeFinalizedItems returns an empty list if the reservoir is empty, which is often true for our BAM files with low coverage
-- Allow empty sample lists for SamplePartitioner as these are used by the RefTraversals and other non-read based traversals

Make the reservoir downsampler use a linked list, rather than a fixed-size array list, in the expectFewOverflows case
This commit is contained in:
Mark DePristo 2013-01-13 20:43:10 -05:00
parent c7f0ca8ac5
commit 7eea6b8f92
2 changed files with 68 additions and 17 deletions

View File

@ -29,9 +29,7 @@ import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.ArrayList; import java.util.*;
import java.util.Collection;
import java.util.List;
/** /**
* Reservoir Downsampler: Selects n reads out of a stream whose size is not known in advance, with * Reservoir Downsampler: Selects n reads out of a stream whose size is not known in advance, with
@ -42,10 +40,25 @@ import java.util.List;
* @author David Roazen * @author David Roazen
*/ */
public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampler<T> { public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampler<T> {
private final int targetSampleSize;
private ArrayList<T> reservoir; /**
* if true, this downsampler will be optimized for the case
* where most of the time we won't fill up anything like the
* targetSampleSize elements. If this is false, we will allocate
* internal buffers to targetSampleSize initially, which minimizes
* the cost of allocation if we often use targetSampleSize or more
* elements.
*/
private final boolean expectFewOverflows;
private int targetSampleSize; /**
* At times this can be a linked list or an array list, depending on how we're accessing the
* data and whether or not we're expecting few overflows
*/
private List<T> reservoir;
private boolean isLinkedList;
private int totalReadsSeen; private int totalReadsSeen;
@ -56,17 +69,35 @@ public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampl
* *
* @param targetSampleSize Size of the reservoir used by this downsampler. Number of items retained * @param targetSampleSize Size of the reservoir used by this downsampler. Number of items retained
* after downsampling will be min(totalReads, targetSampleSize) * after downsampling will be min(totalReads, targetSampleSize)
* @param expectFewOverflows if true, this downsampler will be optimized for the case
* where most of the time we won't fill up anything like the
* targetSampleSize elements. If this is false, we will allocate
* internal buffers to targetSampleSize initially, which minimizes
* the cost of allocation if we often use targetSampleSize or more
* elements.
*/ */
public ReservoirDownsampler ( int targetSampleSize ) { public ReservoirDownsampler ( final int targetSampleSize, final boolean expectFewOverflows) {
if ( targetSampleSize <= 0 ) { if ( targetSampleSize <= 0 ) {
throw new ReviewedStingException("Cannot do reservoir downsampling with a sample size <= 0"); throw new ReviewedStingException("Cannot do reservoir downsampling with a sample size <= 0");
} }
this.targetSampleSize = targetSampleSize; this.targetSampleSize = targetSampleSize;
this.expectFewOverflows = expectFewOverflows;
clear(); clear();
reset(); reset();
} }
/**
* Construct a ReservoirDownsampler
*
* @param targetSampleSize Size of the reservoir used by this downsampler. Number of items retained
* after downsampling will be min(totalReads, targetSampleSize)
*/
public ReservoirDownsampler ( int targetSampleSize ) {
this(targetSampleSize, false);
}
public void submit ( T newRead ) { public void submit ( T newRead ) {
totalReadsSeen++; totalReadsSeen++;
@ -74,7 +105,12 @@ public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampl
reservoir.add(newRead); reservoir.add(newRead);
} }
else { else {
int randomSlot = GenomeAnalysisEngine.getRandomGenerator().nextInt(totalReadsSeen); if ( isLinkedList ) {
reservoir = new ArrayList<T>(reservoir);
isLinkedList = false;
}
final int randomSlot = GenomeAnalysisEngine.getRandomGenerator().nextInt(totalReadsSeen);
if ( randomSlot < targetSampleSize ) { if ( randomSlot < targetSampleSize ) {
reservoir.set(randomSlot, newRead); reservoir.set(randomSlot, newRead);
} }
@ -93,10 +129,15 @@ public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampl
} }
public List<T> consumeFinalizedItems() { public List<T> consumeFinalizedItems() {
// pass by reference rather than make a copy, for speed if ( reservoir.isEmpty() ) {
List<T> downsampledItems = reservoir; // if there's nothing here, don't bother allocating a new list
clear(); return Collections.emptyList();
return downsampledItems; } else {
// pass by reference rather than make a copy, for speed
List<T> downsampledItems = reservoir;
clear();
return downsampledItems;
}
} }
public boolean hasPendingItems() { public boolean hasPendingItems() {
@ -119,9 +160,18 @@ public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampl
// NO-OP // NO-OP
} }
/**
* Clear the data structures used to hold information
*/
public void clear() { public void clear() {
reservoir = new ArrayList<T>(targetSampleSize); // if we aren't expecting many overflows, allocate a linked list not an arraylist
totalReadsSeen = 0; // an internal stat used by the downsampling process, so not cleared by reset() below reservoir = expectFewOverflows ? new LinkedList<T>() : new ArrayList<T>(targetSampleSize);
// it's a linked list if we allocate one
isLinkedList = expectFewOverflows;
// an internal stat used by the downsampling process, so not cleared by reset() below
totalReadsSeen = 0;
} }
public void reset() { public void reset() {

View File

@ -62,16 +62,17 @@ class SamplePartitioner<T extends SAMRecord> {
* will throw an exception. Duplicates in the list of samples will be ignored * will throw an exception. Duplicates in the list of samples will be ignored
* *
* @param LIBSDownsamplingInfo do we want to downsample, and if so to what coverage? * @param LIBSDownsamplingInfo do we want to downsample, and if so to what coverage?
* @param samples the complete list of samples we're going to partition reads into * @param samples the complete list of samples we're going to partition reads into. Can be
* empty, but in that case this code cannot function properly if you
* attempt to add data to it.
*/ */
@Ensures({ @Ensures({
"readsBySample != null", "readsBySample != null",
"! readsBySample.isEmpty()",
"readsBySample.size() == new HashSet(samples).size()" "readsBySample.size() == new HashSet(samples).size()"
}) })
public SamplePartitioner(final LIBSDownsamplingInfo LIBSDownsamplingInfo, final List<String> samples) { public SamplePartitioner(final LIBSDownsamplingInfo LIBSDownsamplingInfo, final List<String> samples) {
if ( LIBSDownsamplingInfo == null ) throw new IllegalArgumentException("LIBSDownsamplingInfo cannot be null"); if ( LIBSDownsamplingInfo == null ) throw new IllegalArgumentException("LIBSDownsamplingInfo cannot be null");
if ( samples == null || samples.isEmpty() ) throw new IllegalArgumentException("samples must be a non-null, non-empty list but got " + samples); if ( samples == null ) throw new IllegalArgumentException("samples must be a non-null list");
readsBySample = new LinkedHashMap<String, Downsampler<T>>(samples.size()); readsBySample = new LinkedHashMap<String, Downsampler<T>>(samples.size());
for ( final String sample : samples ) { for ( final String sample : samples ) {
@ -89,7 +90,7 @@ class SamplePartitioner<T extends SAMRecord> {
@Ensures("result != null") @Ensures("result != null")
private Downsampler<T> createDownsampler(final LIBSDownsamplingInfo LIBSDownsamplingInfo) { private Downsampler<T> createDownsampler(final LIBSDownsamplingInfo LIBSDownsamplingInfo) {
return LIBSDownsamplingInfo.isPerformDownsampling() return LIBSDownsamplingInfo.isPerformDownsampling()
? new ReservoirDownsampler<T>(LIBSDownsamplingInfo.getToCoverage()) ? new ReservoirDownsampler<T>(LIBSDownsamplingInfo.getToCoverage(), true)
: new PassThroughDownsampler<T>(); : new PassThroughDownsampler<T>();
} }