ReservoirDownsampler optimizations

-- Add an option to avoid always allocating ArrayLists of targetSampleSize, and instead allocate the previous size + MARGIN. This helps for LIBS, as most of the time we don't need nearly as much space as we allow
-- consumeFinalizedItems returns an empty list if the reservoir is empty, which is often true for our BAM files with low coverage
-- Allow empty sample lists for SamplePartitioner as these are used by the RefTraversals and other non-read based traversals

Make the reservoir downsampler use a linked list, rather than a fixed-size array list, in the expectFewOverflows case
This commit is contained in:
Mark DePristo 2013-01-13 20:43:10 -05:00
parent c7f0ca8ac5
commit 7eea6b8f92
2 changed files with 68 additions and 17 deletions

View File

@ -29,9 +29,7 @@ import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.*;
/**
* Reservoir Downsampler: Selects n reads out of a stream whose size is not known in advance, with
@ -42,10 +40,25 @@ import java.util.List;
* @author David Roazen
*/
public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampler<T> {
private final int targetSampleSize;
private ArrayList<T> reservoir;
/**
* if true, this downsampler will be optimized for the case
* where most of the time we won't fill up anything like the
* targetSampleSize elements. If this is false, we will allocate
* internal buffers to targetSampleSize initially, which minimizes
* the cost of allocation if we often use targetSampleSize or more
* elements.
*/
private final boolean expectFewOverflows;
private int targetSampleSize;
/**
* At times this can be a linked list or an array list, depending on how we're accessing the
* data and whether or not we're expecting few overflows
*/
private List<T> reservoir;
private boolean isLinkedList;
private int totalReadsSeen;
@ -56,17 +69,35 @@ public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampl
*
* @param targetSampleSize Size of the reservoir used by this downsampler. Number of items retained
* after downsampling will be min(totalReads, targetSampleSize)
* @param expectFewOverflows if true, this downsampler will be optimized for the case
* where most of the time we won't fill up anything like the
* targetSampleSize elements. If this is false, we will allocate
* internal buffers to targetSampleSize initially, which minimizes
* the cost of allocation if we often use targetSampleSize or more
* elements.
*/
public ReservoirDownsampler ( int targetSampleSize ) {
public ReservoirDownsampler ( final int targetSampleSize, final boolean expectFewOverflows) {
if ( targetSampleSize <= 0 ) {
throw new ReviewedStingException("Cannot do reservoir downsampling with a sample size <= 0");
}
this.targetSampleSize = targetSampleSize;
this.expectFewOverflows = expectFewOverflows;
clear();
reset();
}
/**
* Construct a ReservoirDownsampler
*
* @param targetSampleSize Size of the reservoir used by this downsampler. Number of items retained
* after downsampling will be min(totalReads, targetSampleSize)
*/
public ReservoirDownsampler ( int targetSampleSize ) {
this(targetSampleSize, false);
}
public void submit ( T newRead ) {
totalReadsSeen++;
@ -74,7 +105,12 @@ public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampl
reservoir.add(newRead);
}
else {
int randomSlot = GenomeAnalysisEngine.getRandomGenerator().nextInt(totalReadsSeen);
if ( isLinkedList ) {
reservoir = new ArrayList<T>(reservoir);
isLinkedList = false;
}
final int randomSlot = GenomeAnalysisEngine.getRandomGenerator().nextInt(totalReadsSeen);
if ( randomSlot < targetSampleSize ) {
reservoir.set(randomSlot, newRead);
}
@ -93,10 +129,15 @@ public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampl
}
public List<T> consumeFinalizedItems() {
// pass by reference rather than make a copy, for speed
List<T> downsampledItems = reservoir;
clear();
return downsampledItems;
if ( reservoir.isEmpty() ) {
// if there's nothing here, don't bother allocating a new list at all
return Collections.emptyList();
} else {
// pass by reference rather than make a copy, for speed
List<T> downsampledItems = reservoir;
clear();
return downsampledItems;
}
}
public boolean hasPendingItems() {
@ -119,9 +160,18 @@ public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampl
// NO-OP
}
/**
* Clear the data structures used to hold information
*/
public void clear() {
reservoir = new ArrayList<T>(targetSampleSize);
totalReadsSeen = 0; // an internal stat used by the downsampling process, so not cleared by reset() below
// if we aren't expecting many overflows, allocate a linked list not an arraylist
reservoir = expectFewOverflows ? new LinkedList<T>() : new ArrayList<T>(targetSampleSize);
// it's a linked list if we allocate one
isLinkedList = expectFewOverflows;
// an internal stat used by the downsampling process, so not cleared by reset() below
totalReadsSeen = 0;
}
public void reset() {

View File

@ -62,16 +62,17 @@ class SamplePartitioner<T extends SAMRecord> {
* will throw an exception. Duplicates in the list of samples will be ignored
*
* @param LIBSDownsamplingInfo do we want to downsample, and if so to what coverage?
* @param samples the complete list of samples we're going to partition reads into
* @param samples the complete list of samples we're going to partition reads into. Can be
* empty, but in that case this code cannot function properly if you
* attempt to add data to it.
*/
@Ensures({
"readsBySample != null",
"! readsBySample.isEmpty()",
"readsBySample.size() == new HashSet(samples).size()"
})
public SamplePartitioner(final LIBSDownsamplingInfo LIBSDownsamplingInfo, final List<String> samples) {
if ( LIBSDownsamplingInfo == null ) throw new IllegalArgumentException("LIBSDownsamplingInfo cannot be null");
if ( samples == null || samples.isEmpty() ) throw new IllegalArgumentException("samples must be a non-null, non-empty list but got " + samples);
if ( samples == null ) throw new IllegalArgumentException("samples must be a non-null list");
readsBySample = new LinkedHashMap<String, Downsampler<T>>(samples.size());
for ( final String sample : samples ) {
@ -89,7 +90,7 @@ class SamplePartitioner<T extends SAMRecord> {
@Ensures("result != null")
private Downsampler<T> createDownsampler(final LIBSDownsamplingInfo LIBSDownsamplingInfo) {
return LIBSDownsamplingInfo.isPerformDownsampling()
? new ReservoirDownsampler<T>(LIBSDownsamplingInfo.getToCoverage())
? new ReservoirDownsampler<T>(LIBSDownsamplingInfo.getToCoverage(), true)
: new PassThroughDownsampler<T>();
}