ReservoirDownsampler optimizations
-- Add an option to not always allocate ArrayLists of targetSampleSize, but rather the previous size + MARGIN. This helps for LIBS, as most of the time we don't need nearly as much space as we allow -- consumeFinalizedItems returns an empty list if the reservoir is empty, which is often true for our BAM files with low coverage -- Allow empty sample lists for SamplePartitioner, as these are used by the RefTraversals and other non-read-based traversals -- Make the reservoir downsampler use a linked list, rather than a fixed-size array list, in the expectFewOverflows case
This commit is contained in:
parent
c7f0ca8ac5
commit
7eea6b8f92
|
|
@ -29,9 +29,7 @@ import net.sf.samtools.SAMRecord;
|
||||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.*;
|
||||||
import java.util.Collection;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Reservoir Downsampler: Selects n reads out of a stream whose size is not known in advance, with
|
* Reservoir Downsampler: Selects n reads out of a stream whose size is not known in advance, with
|
||||||
|
|
@ -42,10 +40,25 @@ import java.util.List;
|
||||||
* @author David Roazen
|
* @author David Roazen
|
||||||
*/
|
*/
|
||||||
public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampler<T> {
|
public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampler<T> {
|
||||||
|
private final int targetSampleSize;
|
||||||
|
|
||||||
private ArrayList<T> reservoir;
|
/**
|
||||||
|
* if true, this downsampler will be optimized for the case
|
||||||
|
* where most of the time we won't fill up anything like the
|
||||||
|
* targetSampleSize elements. If this is false, we will allocate
|
||||||
|
* internal buffers to targetSampleSize initially, which minimizes
|
||||||
|
* the cost of allocation if we often use targetSampleSize or more
|
||||||
|
* elements.
|
||||||
|
*/
|
||||||
|
private final boolean expectFewOverflows;
|
||||||
|
|
||||||
private int targetSampleSize;
|
/**
|
||||||
|
* At times this can be a linked list or an array list, depending on how we're accessing the
|
||||||
|
* data and whether or not we're expecting few overflows
|
||||||
|
*/
|
||||||
|
private List<T> reservoir;
|
||||||
|
|
||||||
|
private boolean isLinkedList;
|
||||||
|
|
||||||
private int totalReadsSeen;
|
private int totalReadsSeen;
|
||||||
|
|
||||||
|
|
@ -56,17 +69,35 @@ public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampl
|
||||||
*
|
*
|
||||||
* @param targetSampleSize Size of the reservoir used by this downsampler. Number of items retained
|
* @param targetSampleSize Size of the reservoir used by this downsampler. Number of items retained
|
||||||
* after downsampling will be min(totalReads, targetSampleSize)
|
* after downsampling will be min(totalReads, targetSampleSize)
|
||||||
|
* @param expectFewOverflows if true, this downsampler will be optimized for the case
|
||||||
|
* where most of the time we won't fill up anything like the
|
||||||
|
* targetSampleSize elements. If this is false, we will allocate
|
||||||
|
* internal buffers to targetSampleSize initially, which minimizes
|
||||||
|
* the cost of allocation if we often use targetSampleSize or more
|
||||||
|
* elements.
|
||||||
*/
|
*/
|
||||||
public ReservoirDownsampler ( int targetSampleSize ) {
|
public ReservoirDownsampler ( final int targetSampleSize, final boolean expectFewOverflows) {
|
||||||
if ( targetSampleSize <= 0 ) {
|
if ( targetSampleSize <= 0 ) {
|
||||||
throw new ReviewedStingException("Cannot do reservoir downsampling with a sample size <= 0");
|
throw new ReviewedStingException("Cannot do reservoir downsampling with a sample size <= 0");
|
||||||
}
|
}
|
||||||
|
|
||||||
this.targetSampleSize = targetSampleSize;
|
this.targetSampleSize = targetSampleSize;
|
||||||
|
this.expectFewOverflows = expectFewOverflows;
|
||||||
clear();
|
clear();
|
||||||
reset();
|
reset();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Construct a ReservoirDownsampler
|
||||||
|
*
|
||||||
|
* @param targetSampleSize Size of the reservoir used by this downsampler. Number of items retained
|
||||||
|
* after downsampling will be min(totalReads, targetSampleSize)
|
||||||
|
*/
|
||||||
|
public ReservoirDownsampler ( int targetSampleSize ) {
|
||||||
|
this(targetSampleSize, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public void submit ( T newRead ) {
|
public void submit ( T newRead ) {
|
||||||
totalReadsSeen++;
|
totalReadsSeen++;
|
||||||
|
|
||||||
|
|
@ -74,7 +105,12 @@ public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampl
|
||||||
reservoir.add(newRead);
|
reservoir.add(newRead);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
int randomSlot = GenomeAnalysisEngine.getRandomGenerator().nextInt(totalReadsSeen);
|
if ( isLinkedList ) {
|
||||||
|
reservoir = new ArrayList<T>(reservoir);
|
||||||
|
isLinkedList = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
final int randomSlot = GenomeAnalysisEngine.getRandomGenerator().nextInt(totalReadsSeen);
|
||||||
if ( randomSlot < targetSampleSize ) {
|
if ( randomSlot < targetSampleSize ) {
|
||||||
reservoir.set(randomSlot, newRead);
|
reservoir.set(randomSlot, newRead);
|
||||||
}
|
}
|
||||||
|
|
@ -93,10 +129,15 @@ public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampl
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<T> consumeFinalizedItems() {
|
public List<T> consumeFinalizedItems() {
|
||||||
// pass by reference rather than make a copy, for speed
|
if ( reservoir.isEmpty() ) {
|
||||||
List<T> downsampledItems = reservoir;
|
// if there's nothing here, don't bother allocating a new list at all
|
||||||
clear();
|
return Collections.emptyList();
|
||||||
return downsampledItems;
|
} else {
|
||||||
|
// pass by reference rather than make a copy, for speed
|
||||||
|
List<T> downsampledItems = reservoir;
|
||||||
|
clear();
|
||||||
|
return downsampledItems;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean hasPendingItems() {
|
public boolean hasPendingItems() {
|
||||||
|
|
@ -119,9 +160,18 @@ public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampl
|
||||||
// NO-OP
|
// NO-OP
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Clear the data structures used to hold information
|
||||||
|
*/
|
||||||
public void clear() {
|
public void clear() {
|
||||||
reservoir = new ArrayList<T>(targetSampleSize);
|
// if we aren't expecting many overflows, allocate a linked list not an arraylist
|
||||||
totalReadsSeen = 0; // an internal stat used by the downsampling process, so not cleared by reset() below
|
reservoir = expectFewOverflows ? new LinkedList<T>() : new ArrayList<T>(targetSampleSize);
|
||||||
|
|
||||||
|
// it's a linked list if we allocate one
|
||||||
|
isLinkedList = expectFewOverflows;
|
||||||
|
|
||||||
|
// an internal stat used by the downsampling process, so not cleared by reset() below
|
||||||
|
totalReadsSeen = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void reset() {
|
public void reset() {
|
||||||
|
|
|
||||||
|
|
@ -62,16 +62,17 @@ class SamplePartitioner<T extends SAMRecord> {
|
||||||
* will throw an exception. Duplicates in the list of samples will be ignored
|
* will throw an exception. Duplicates in the list of samples will be ignored
|
||||||
*
|
*
|
||||||
* @param LIBSDownsamplingInfo do we want to downsample, and if so to what coverage?
|
* @param LIBSDownsamplingInfo do we want to downsample, and if so to what coverage?
|
||||||
* @param samples the complete list of samples we're going to partition reads into
|
* @param samples the complete list of samples we're going to partition reads into. Can be
|
||||||
|
* empty, but in that case this code cannot function properly if you
|
||||||
|
* attempt to add data to it.
|
||||||
*/
|
*/
|
||||||
@Ensures({
|
@Ensures({
|
||||||
"readsBySample != null",
|
"readsBySample != null",
|
||||||
"! readsBySample.isEmpty()",
|
|
||||||
"readsBySample.size() == new HashSet(samples).size()"
|
"readsBySample.size() == new HashSet(samples).size()"
|
||||||
})
|
})
|
||||||
public SamplePartitioner(final LIBSDownsamplingInfo LIBSDownsamplingInfo, final List<String> samples) {
|
public SamplePartitioner(final LIBSDownsamplingInfo LIBSDownsamplingInfo, final List<String> samples) {
|
||||||
if ( LIBSDownsamplingInfo == null ) throw new IllegalArgumentException("LIBSDownsamplingInfo cannot be null");
|
if ( LIBSDownsamplingInfo == null ) throw new IllegalArgumentException("LIBSDownsamplingInfo cannot be null");
|
||||||
if ( samples == null || samples.isEmpty() ) throw new IllegalArgumentException("samples must be a non-null, non-empty list but got " + samples);
|
if ( samples == null ) throw new IllegalArgumentException("samples must be a non-null list");
|
||||||
|
|
||||||
readsBySample = new LinkedHashMap<String, Downsampler<T>>(samples.size());
|
readsBySample = new LinkedHashMap<String, Downsampler<T>>(samples.size());
|
||||||
for ( final String sample : samples ) {
|
for ( final String sample : samples ) {
|
||||||
|
|
@ -89,7 +90,7 @@ class SamplePartitioner<T extends SAMRecord> {
|
||||||
@Ensures("result != null")
|
@Ensures("result != null")
|
||||||
private Downsampler<T> createDownsampler(final LIBSDownsamplingInfo LIBSDownsamplingInfo) {
|
private Downsampler<T> createDownsampler(final LIBSDownsamplingInfo LIBSDownsamplingInfo) {
|
||||||
return LIBSDownsamplingInfo.isPerformDownsampling()
|
return LIBSDownsamplingInfo.isPerformDownsampling()
|
||||||
? new ReservoirDownsampler<T>(LIBSDownsamplingInfo.getToCoverage())
|
? new ReservoirDownsampler<T>(LIBSDownsamplingInfo.getToCoverage(), true)
|
||||||
: new PassThroughDownsampler<T>();
|
: new PassThroughDownsampler<T>();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue