From c08936d6f47894f4073c9e4d2678a05fa6798c55 Mon Sep 17 00:00:00 2001 From: hanna Date: Mon, 19 Apr 2010 20:48:14 +0000 Subject: [PATCH] Added a reservoir downsampler which can sample elements in an iterator uniformly from a stream (see Vitter 1985). Thanks to Eric and Andrey for the pointer. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3197 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/utils/ReservoirDownsampler.java | 103 ++++++++++ .../utils/ReservoirDownsamplerUnitTest.java | 190 ++++++++++++++++++ .../utils/sam/AlignmentStartComparator.java | 22 ++ 3 files changed, 315 insertions(+) create mode 100644 java/src/org/broadinstitute/sting/utils/ReservoirDownsampler.java create mode 100644 java/test/org/broadinstitute/sting/utils/ReservoirDownsamplerUnitTest.java create mode 100644 java/test/org/broadinstitute/sting/utils/sam/AlignmentStartComparator.java diff --git a/java/src/org/broadinstitute/sting/utils/ReservoirDownsampler.java b/java/src/org/broadinstitute/sting/utils/ReservoirDownsampler.java new file mode 100644 index 000000000..a4a8e7f03 --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/ReservoirDownsampler.java @@ -0,0 +1,103 @@ +package org.broadinstitute.sting.utils; + +import net.sf.picard.util.PeekableIterator; + +import java.util.*; + +/** + * Randomly downsample from a stream of elements. This algorithm is a direct, + * naive implementation of reservoir downsampling as described in "Random Downsampling + * with a Reservoir" (Vitter 1985). At time of writing, this paper is located here: + * http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.138.784&rep=rep1&type=pdf + * + * Note that using the ReservoirDownsampler will leave the given iterator in an undefined + * state. Do not attempt to use the iterator (other than closing it) after the Downsampler + * completes. + * + * @author mhanna + * @version 0.1 + */ +public class ReservoirDownsampler implements Iterator> { + /** + * Create a random number generator with a random, but reproducible, seed. + */ + private final Random random = new Random(47382911L); + + /** + * The data source, wrapped in a peekable input stream. + */ + private final PeekableIterator iterator; + + /** + * Used to identify whether two elements are 'equal' in the eyes of the downsampler. + */ + private final Comparator comparator; + + /** + * What is the maximum number of reads that can be returned in a single batch. + */ + private final int maxElements; + + /** + * Create a new downsampler with the given source iterator and given comparator. + * @param iterator Source of the data stream. + * @param comparator Used to compare two records to see whether they're 'equal' at this position. + * @param maxElements What is the maximum number of reads that can be returned in any call of this + */ + public ReservoirDownsampler(final Iterator iterator, final Comparator comparator, final int maxElements) { + this.iterator = new PeekableIterator(iterator); + this.comparator = comparator; + if(maxElements < 0) + throw new StingException("Unable to work with an negative size collection of elements"); + this.maxElements = maxElements; + } + + public boolean hasNext() { + return iterator.hasNext(); + } + + /** + * Gets a collection of 'equal' elements, as judged by the comparator. If the number of equal elements + * is greater than the maximum, then the elements in the collection should be a truly random sampling. + * @return Collection of equal elements. + */ + public Collection next() { + if(!hasNext()) + throw new NoSuchElementException("No next element is present."); + + List batch = new ArrayList(maxElements); + int currentElement = 0; + + // Determine our basis of equality. + T first = iterator.next(); + if(maxElements > 0) + batch.add(first); + currentElement++; + + // Fill the reservoir + while(iterator.hasNext() && + currentElement < maxElements && + comparator.compare(first,iterator.peek()) == 0) { + batch.add(iterator.next()); + currentElement++; + } + + // Trim off remaining elements, randomly selecting them using the process as described by Vitter. + while(iterator.hasNext() && comparator.compare(first,iterator.peek()) == 0) { + T candidate = iterator.next(); + final int slot = random.nextInt(currentElement); + if(slot >= 0 && slot < maxElements) + batch.set(slot,candidate); + currentElement++; + } + + return batch; + } + + /** + * Unsupported; throws exception to that effect. + */ + public void remove() { + throw new UnsupportedOperationException("Cannot remove from a ReservoirDownsampler."); + } +} diff --git a/java/test/org/broadinstitute/sting/utils/ReservoirDownsamplerUnitTest.java b/java/test/org/broadinstitute/sting/utils/ReservoirDownsamplerUnitTest.java new file mode 100644 index 000000000..c166904e6 --- /dev/null +++ b/java/test/org/broadinstitute/sting/utils/ReservoirDownsamplerUnitTest.java @@ -0,0 +1,190 @@ +package org.broadinstitute.sting.utils; + +import org.junit.Test; +import org.broadinstitute.sting.utils.sam.AlignmentStartComparator; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMFileHeader; + +import java.util.*; + +import junit.framework.Assert; + +/** + * Basic tests to prove the integrity of the reservoir downsampler. + * At the moment, always run tests on SAM records as that's the task + * for which the downsampler was conceived. + * + * @author mhanna + * @version 0.1 + */ +public class ReservoirDownsamplerUnitTest { + private static final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1,1,200); + + + @Test + public void testEmptyIterator() { + ReservoirDownsampler downsampler = new ReservoirDownsampler(Collections.emptyList().iterator(), + new AlignmentStartComparator(),1); + Assert.assertFalse("Downsampler is not empty but should be.",downsampler.hasNext()); + } + + @Test + public void testOneElementWithPoolSizeOne() { + List reads = Collections.singletonList(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); + ReservoirDownsampler downsampler = new ReservoirDownsampler(reads.iterator(), + new AlignmentStartComparator(),1); + + Assert.assertTrue("Downsampler is empty but shouldn't be",downsampler.hasNext()); + Collection batchedReads = downsampler.next(); + Assert.assertEquals("Downsampler is returning the wrong number of reads",1,batchedReads.size()); + Assert.assertSame("Downsampler is returning an incorrect read",reads.get(0),batchedReads.iterator().next()); + } + + @Test + public void testOneElementWithPoolSizeGreaterThanOne() { + List reads = Collections.singletonList(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); + ReservoirDownsampler downsampler = new ReservoirDownsampler(reads.iterator(), + new AlignmentStartComparator(),5); + + Assert.assertTrue("Downsampler is empty but shouldn't be",downsampler.hasNext()); + Collection batchedReads = downsampler.next(); + Assert.assertEquals("Downsampler is returning the wrong number of reads",1,batchedReads.size()); + Assert.assertSame("Downsampler is returning an incorrect read",reads.get(0),batchedReads.iterator().next()); + + } + + @Test + public void testPoolFilledPartially() { + List reads = new ArrayList(); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read2",0,1,76)); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read3",0,1,76)); + ReservoirDownsampler downsampler = new ReservoirDownsampler(reads.iterator(), + new AlignmentStartComparator(),5); + + Assert.assertTrue("Downsampler is empty but shouldn't be",downsampler.hasNext()); + List batchedReads = new ArrayList(downsampler.next()); + Assert.assertEquals("Downsampler is returning the wrong number of reads",3,batchedReads.size()); + + Assert.assertSame("Downsampler read 1 is incorrect",reads.get(0),batchedReads.get(0)); + Assert.assertSame("Downsampler read 2 is incorrect",reads.get(1),batchedReads.get(1)); + Assert.assertSame("Downsampler read 3 is incorrect",reads.get(2),batchedReads.get(2)); + } + + @Test + public void testPoolFilledExactly() { + List reads = new ArrayList(); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read2",0,1,76)); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read3",0,1,76)); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read4",0,1,76)); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read5",0,1,76)); + ReservoirDownsampler downsampler = new ReservoirDownsampler(reads.iterator(), + new AlignmentStartComparator(),5); + + Assert.assertTrue("Downsampler is empty but shouldn't be",downsampler.hasNext()); + List batchedReads = new ArrayList(downsampler.next()); + Assert.assertEquals("Downsampler is returning the wrong number of reads",5,batchedReads.size()); + Assert.assertSame("Downsampler is returning an incorrect read",reads.get(0),batchedReads.iterator().next()); + + Assert.assertSame("Downsampler read 1 is incorrect",reads.get(0),batchedReads.get(0)); + Assert.assertSame("Downsampler read 2 is incorrect",reads.get(1),batchedReads.get(1)); + Assert.assertSame("Downsampler read 3 is incorrect",reads.get(2),batchedReads.get(2)); + Assert.assertSame("Downsampler read 4 is incorrect",reads.get(3),batchedReads.get(3)); + Assert.assertSame("Downsampler read 5 is incorrect",reads.get(4),batchedReads.get(4)); + } + + @Test + public void testLargerPileWithZeroElementPool() { + List reads = new ArrayList(); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read2",0,1,76)); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read3",0,1,76)); + ReservoirDownsampler downsampler = new ReservoirDownsampler(reads.iterator(), + new AlignmentStartComparator(),0); + + Assert.assertTrue("Downsampler is empty but shouldn't be",downsampler.hasNext()); + List batchedReads = new ArrayList(downsampler.next()); + Assert.assertEquals("Downsampler is returning the wrong number of reads",0,batchedReads.size()); + Assert.assertFalse("Downsampler is not empty but should be",downsampler.hasNext()); + } + + @Test + public void testLargerPileWithSingleElementPool() { + List reads = new ArrayList(); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read2",0,1,76)); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read3",0,1,76)); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read4",0,1,76)); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read5",0,1,76)); + ReservoirDownsampler downsampler = new ReservoirDownsampler(reads.iterator(), + new AlignmentStartComparator(),1); + + Assert.assertTrue("Downsampler is empty but shouldn't be",downsampler.hasNext()); + List batchedReads = new ArrayList(downsampler.next()); + Assert.assertEquals("Downsampler is returning the wrong number of reads",1,batchedReads.size()); + Assert.assertTrue("Downsampler is returning a bad read.",reads.contains(batchedReads.get(0))) ; + Assert.assertFalse("Downsampler is not empty but should be",downsampler.hasNext()); + } + + @Test + public void testFillingAcrossLoci() { + List reads = new ArrayList(); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read2",0,2,76)); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read3",0,2,76)); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read4",0,3,76)); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read5",0,3,76)); + ReservoirDownsampler downsampler = new ReservoirDownsampler(reads.iterator(), + new AlignmentStartComparator(),5); + + Assert.assertTrue("Downsampler is empty but shouldn't be",downsampler.hasNext()); + List batchedReads = new ArrayList(downsampler.next()); + Assert.assertEquals("Downsampler is returning the wrong number of reads",1,batchedReads.size()); + Assert.assertEquals("Downsampler is returning an incorrect read.",reads.get(0),batchedReads.get(0)); + + Assert.assertTrue("Downsampler is empty but shouldn't be",downsampler.hasNext()); + batchedReads = new ArrayList(downsampler.next()); + Assert.assertEquals("Downsampler is returning the wrong number of reads",2,batchedReads.size()); + Assert.assertEquals("Downsampler is returning an incorrect read.",reads.get(1),batchedReads.get(0)); + Assert.assertEquals("Downsampler is returning an incorrect read.",reads.get(2),batchedReads.get(1)); + + Assert.assertTrue("Downsampler is empty but shouldn't be",downsampler.hasNext()); + batchedReads = new ArrayList(downsampler.next()); + Assert.assertEquals("Downsampler is returning the wrong number of reads",2,batchedReads.size()); + Assert.assertEquals("Downsampler is returning an incorrect read.",reads.get(3),batchedReads.get(0)); + Assert.assertEquals("Downsampler is returning an incorrect read.",reads.get(4),batchedReads.get(1)); + + Assert.assertFalse("Downsampler is not empty but should be",downsampler.hasNext()); + } + + @Test + public void testDownsamplingAcrossLoci() { + List reads = new ArrayList(); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read2",0,2,76)); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read3",0,2,76)); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read4",0,3,76)); + reads.add(ArtificialSAMUtils.createArtificialRead(header,"read5",0,3,76)); + ReservoirDownsampler downsampler = new ReservoirDownsampler(reads.iterator(), + new AlignmentStartComparator(),1); + + Assert.assertTrue("Downsampler is empty but shouldn't be",downsampler.hasNext()); + List batchedReads = new ArrayList(downsampler.next()); + Assert.assertEquals("Downsampler is returning the wrong number of reads",1,batchedReads.size()); + Assert.assertEquals("Downsampler is returning an incorrect read.",reads.get(0),batchedReads.get(0)); + + Assert.assertTrue("Downsampler is empty but shouldn't be",downsampler.hasNext()); + batchedReads = new ArrayList(downsampler.next()); + Assert.assertEquals("Downsampler is returning the wrong number of reads",1,batchedReads.size()); + Assert.assertTrue("Downsampler is returning an incorrect read.",batchedReads.get(0).equals(reads.get(1)) || batchedReads.get(0).equals(reads.get(2))); + + Assert.assertTrue("Downsampler is empty but shouldn't be",downsampler.hasNext()); + batchedReads = new ArrayList(downsampler.next()); + Assert.assertEquals("Downsampler is returning the wrong number of reads",1,batchedReads.size()); + Assert.assertTrue("Downsampler is returning an incorrect read.",batchedReads.get(0).equals(reads.get(3)) || batchedReads.get(0).equals(reads.get(4))); + + Assert.assertFalse("Downsampler is not empty but should be",downsampler.hasNext()); + } +} diff --git a/java/test/org/broadinstitute/sting/utils/sam/AlignmentStartComparator.java b/java/test/org/broadinstitute/sting/utils/sam/AlignmentStartComparator.java new file mode 100644 index 000000000..6b0f6d036 --- /dev/null +++ b/java/test/org/broadinstitute/sting/utils/sam/AlignmentStartComparator.java @@ -0,0 +1,22 @@ +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.SAMRecord; + +import java.util.Comparator; + +/** + * Compares two SAMRecords only the basis on alignment start. Note that + * comparisons are performed ONLY on the basis of alignment start; any + * two SAM records with the same alignment start will be considered equal. + * + * Unmapped alignments will all be considered equal. + * + * @author mhanna + * @version 0.1 + */ +public class AlignmentStartComparator implements Comparator { + public int compare(SAMRecord lhs, SAMRecord rhs) { + // Note: no integer overflow here because alignment starts are >= 0. + return lhs.getAlignmentStart() - rhs.getAlignmentStart(); + } +}