2010-04-20 04:48:14 +08:00
|
|
|
package org.broadinstitute.sting.utils;
|
|
|
|
|
|
|
|
|
|
import net.sf.picard.util.PeekableIterator;
|
|
|
|
|
|
|
|
|
|
import java.util.*;
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Randomly downsample from a stream of elements. This algorithm is a direct,
|
|
|
|
|
* naive implementation of reservoir downsampling as described in "Random Downsampling
|
|
|
|
|
* with a Reservoir" (Vitter 1985). At time of writing, this paper is located here:
|
|
|
|
|
* http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.138.784&rep=rep1&type=pdf
|
2010-05-18 05:00:44 +08:00
|
|
|
|
2010-04-20 04:48:14 +08:00
|
|
|
* @author mhanna
|
|
|
|
|
* @version 0.1
|
|
|
|
|
*/
|
2010-05-18 05:00:44 +08:00
|
|
|
public class ReservoirDownsampler<T> implements Collection<T> {
|
2010-04-20 04:48:14 +08:00
|
|
|
/**
|
|
|
|
|
* Create a random number generator with a random, but reproducible, seed.
|
|
|
|
|
*/
|
|
|
|
|
private final Random random = new Random(47382911L);
|
|
|
|
|
|
|
|
|
|
/**
|
2010-05-18 05:00:44 +08:00
|
|
|
* The reservoir of elements tracked by this downsampler.
|
2010-04-20 04:48:14 +08:00
|
|
|
*/
|
2010-05-18 05:00:44 +08:00
|
|
|
private final ArrayList<T> reservoir;
|
2010-04-22 03:50:26 +08:00
|
|
|
|
2010-04-20 04:48:14 +08:00
|
|
|
/**
|
|
|
|
|
* What is the maximum number of reads that can be returned in a single batch.
|
|
|
|
|
*/
|
|
|
|
|
private final int maxElements;
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Create a new downsampler with the given source iterator and given comparator.
|
2010-04-22 03:50:26 +08:00
|
|
|
* @param maxElements What is the maximum number of reads that can be returned in any call of this
|
|
|
|
|
*/
|
2010-05-18 05:00:44 +08:00
|
|
|
public ReservoirDownsampler(final int maxElements) {
|
2010-04-20 04:48:14 +08:00
|
|
|
if(maxElements < 0)
|
|
|
|
|
throw new StingException("Unable to work with an negative size collection of elements");
|
2010-05-18 05:00:44 +08:00
|
|
|
this.reservoir = new ArrayList<T>(maxElements);
|
2010-04-20 04:48:14 +08:00
|
|
|
this.maxElements = maxElements;
|
|
|
|
|
}
|
|
|
|
|
|
2010-05-18 05:00:44 +08:00
|
|
|
@Override
|
|
|
|
|
public boolean add(T element) {
|
|
|
|
|
if(maxElements <= 0)
|
|
|
|
|
return false;
|
|
|
|
|
else if(reservoir.size() < maxElements) {
|
|
|
|
|
reservoir.add(element);
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
|
// Get a uniformly distributed int. If the chosen slot lives within the partition, replace the entry in that slot with the newest entry.
|
|
|
|
|
int slot = random.nextInt(maxElements);
|
|
|
|
|
if(slot >= 0 && slot < maxElements) {
|
|
|
|
|
reservoir.set(slot,element);
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public boolean addAll(Collection<? extends T> elements) {
|
|
|
|
|
boolean added = false;
|
|
|
|
|
for(T element: elements)
|
|
|
|
|
added |= add(element);
|
|
|
|
|
return added;
|
2010-04-20 04:48:14 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
2010-05-18 05:00:44 +08:00
|
|
|
* Returns the contents of this reservoir, downsampled to the given value. Note that the return value
|
|
|
|
|
* @return The downsampled contents of this reservoir.
|
2010-04-20 04:48:14 +08:00
|
|
|
*/
|
2010-05-18 05:00:44 +08:00
|
|
|
public Collection<T> getDownsampledContents() {
|
2010-05-19 13:40:05 +08:00
|
|
|
return (Collection<T>)reservoir.clone();
|
2010-05-18 05:00:44 +08:00
|
|
|
}
|
2010-04-20 04:48:14 +08:00
|
|
|
|
2010-05-18 05:00:44 +08:00
|
|
|
@Override
|
|
|
|
|
public void clear() {
|
|
|
|
|
reservoir.clear();
|
|
|
|
|
}
|
2010-04-20 04:48:14 +08:00
|
|
|
|
2010-05-18 05:00:44 +08:00
|
|
|
@Override
|
|
|
|
|
public boolean isEmpty() {
|
|
|
|
|
return reservoir.isEmpty();
|
|
|
|
|
}
|
2010-04-22 03:50:26 +08:00
|
|
|
|
2010-05-18 05:00:44 +08:00
|
|
|
@Override
|
|
|
|
|
public int size() {
|
|
|
|
|
return reservoir.size();
|
|
|
|
|
}
|
2010-04-20 04:48:14 +08:00
|
|
|
|
2010-05-18 05:00:44 +08:00
|
|
|
@Override
|
|
|
|
|
public Iterator<T> iterator() {
|
|
|
|
|
return reservoir.iterator();
|
|
|
|
|
}
|
2010-04-22 03:50:26 +08:00
|
|
|
|
2010-05-18 05:00:44 +08:00
|
|
|
@Override
|
|
|
|
|
public boolean contains(Object o) {
|
|
|
|
|
return reservoir.contains(o);
|
|
|
|
|
}
|
2010-04-20 04:48:14 +08:00
|
|
|
|
2010-05-18 05:00:44 +08:00
|
|
|
@Override
|
|
|
|
|
public boolean containsAll(Collection<?> elements) {
|
|
|
|
|
return reservoir.containsAll(elements);
|
2010-04-20 04:48:14 +08:00
|
|
|
}
|
|
|
|
|
|
2010-05-18 05:00:44 +08:00
|
|
|
@Override
|
|
|
|
|
public boolean retainAll(Collection<?> elements) {
|
|
|
|
|
return reservoir.retainAll(elements);
|
2010-04-22 03:50:26 +08:00
|
|
|
}
|
|
|
|
|
|
2010-05-18 05:00:44 +08:00
|
|
|
@Override
|
|
|
|
|
public boolean remove(Object o) {
|
|
|
|
|
return reservoir.remove(o);
|
2010-04-20 04:48:14 +08:00
|
|
|
}
|
2010-04-22 03:50:26 +08:00
|
|
|
|
2010-05-18 05:00:44 +08:00
|
|
|
@Override
|
|
|
|
|
public boolean removeAll(Collection<?> elements) {
|
|
|
|
|
return reservoir.removeAll(elements);
|
2010-04-22 03:50:26 +08:00
|
|
|
}
|
|
|
|
|
|
2010-05-18 05:00:44 +08:00
|
|
|
@Override
|
|
|
|
|
public Object[] toArray() {
|
|
|
|
|
Object[] contents = new Object[reservoir.size()];
|
|
|
|
|
reservoir.toArray(contents);
|
|
|
|
|
return contents;
|
|
|
|
|
}
|
2010-04-22 03:50:26 +08:00
|
|
|
|
2010-05-18 05:00:44 +08:00
|
|
|
@Override
|
|
|
|
|
public <T> T[] toArray(T[] array) {
|
|
|
|
|
return reservoir.toArray(array);
|
2010-04-22 03:50:26 +08:00
|
|
|
}
|
2010-04-20 04:48:14 +08:00
|
|
|
}
|