From 0550b2779929b94ea792517bd2116e0af7a4bb80 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Wed, 13 Jun 2012 14:42:10 -0400 Subject: [PATCH 2/2] Make downsampler classes themselves generic (instead of just the Downsampler interface) This is in response to a request from Mauricio to make it easier to use the downsamplers with GATKSAMRecords (as opposed to SAMRecords) without having to do any cumbersome typecasting. Sadly, Java language limitations make this sort of solution the best choice. Thanks to Khalid for his feedback on this issue. Also: -added a unit test to verify GATKSAMRecord support with no typecasting required -added some unit tests for the FractionalDownsampler that Mauricio will/might be using -moved classes from private to public to better sync up with my local development branch for engine integration --- .../sting/gatk/downsampling/Downsampler.java | 76 ++++ .../DownsamplingReadsIterator.java | 98 +++++ .../downsampling/FractionalDownsampler.java | 94 +++++ .../downsampling/PositionalDownsampler.java | 259 +++++++++++++ .../gatk/downsampling/ReadsDownsampler.java | 40 ++ .../downsampling/ReservoirDownsampler.java | 106 ++++++ .../DownsamplingReadsIteratorUnitTest.java | 73 ++++ .../FractionalDownsamplerUnitTest.java | 65 ++++ .../PositionalDownsamplerUnitTest.java | 357 ++++++++++++++++++ 9 files changed, 1168 insertions(+) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/Downsampler.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIterator.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsampler.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/PositionalDownsampler.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsampler.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java create mode 100644 public/java/test/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIteratorUnitTest.java create mode 100644 public/java/test/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerUnitTest.java create mode 100644 public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionalDownsamplerUnitTest.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/Downsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/Downsampler.java new file mode 100644 index 000000000..5fb99b2bc --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/Downsampler.java @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import java.util.Collection; +import java.util.List; + +/** + * The basic downsampler API, with no reads-specific operations + * + * @author David Roazen + */ +public interface Downsampler { + + /* + * Submit one item to the downsampler for consideration . Some downsamplers will be able to determine + * immediately whether the item survives the downsampling process, while others will need to see + * more items before making that determination. + */ + public void submit( T item ); + + /* + * Submit a collection of items to the downsampler for consideration. + */ + public void submit( Collection items ); + + /* + * Are there items that have survived the downsampling process waiting to be retrieved? + */ + public boolean hasDownsampledItems(); + + /* + * Return (and remove) all items that have survived downsampling and are waiting to be retrieved. + */ + public List consumeDownsampledItems(); + + /* + * Are there items stored in this downsampler that it doesn't yet know whether they will + * ultimately survive the downsampling process? + */ + public boolean hasPendingItems(); + + /* + * Used to tell the downsampler that no more items will be submitted to it, and that it should + * finalize any pending items. + */ + public void signalEndOfInput(); + + /* + * Reset the downsampler to a clean state, devoid of any pending/downsampled items or tracked state + * information. + */ + public void clear(); +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIterator.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIterator.java new file mode 100644 index 000000000..bccc2e946 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIterator.java @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; + +import java.util.Collection; +import java.util.Iterator; +import java.util.NoSuchElementException; + + +/** + * StingSAMIterator wrapper around our generic reads downsampler interface + * + * @author David Roazen + */ +public class DownsamplingReadsIterator implements StingSAMIterator { + + private StingSAMIterator nestedSAMIterator; + private ReadsDownsampler downsampler; + private Collection downsampledReadsCache; + private Iterator downsampledReadsCacheIterator; + + public DownsamplingReadsIterator( StingSAMIterator iter, ReadsDownsampler downsampler ) { + nestedSAMIterator = iter; + this.downsampler = downsampler; + fillDownsampledReadsCache(); + } + + public boolean hasNext() { + if ( downsampledReadsCacheIterator.hasNext() ) { + return true; + } + else if ( ! nestedSAMIterator.hasNext() || ! fillDownsampledReadsCache() ) { + return false; + } + + return true; + } + + public SAMRecord next() { + if ( ! downsampledReadsCacheIterator.hasNext() && ! fillDownsampledReadsCache() ) { + throw new NoSuchElementException("next() called when there are no more items"); + } + + return downsampledReadsCacheIterator.next(); + } + + private boolean fillDownsampledReadsCache() { + while ( nestedSAMIterator.hasNext() && ! downsampler.hasDownsampledItems() ) { + downsampler.submit(nestedSAMIterator.next()); + } + + if ( ! nestedSAMIterator.hasNext() ) { + downsampler.signalEndOfInput(); + } + + downsampledReadsCache = downsampler.consumeDownsampledItems(); + downsampledReadsCacheIterator = downsampledReadsCache.iterator(); + + return downsampledReadsCacheIterator.hasNext(); + } + + public void remove() { + throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); + } + + public void close() { + nestedSAMIterator.close(); + } + + public Iterator iterator() { + return this; + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsampler.java new file mode 100644 index 000000000..d5d529c9f --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsampler.java @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +/** + * Fractional Downsampler: selects a specified fraction of the reads for inclusion + * + * @author David Roazen + */ +public class FractionalDownsampler implements ReadsDownsampler { + + private ArrayList selectedReads; + + private int cutoffForInclusion; + + private static final int RANDOM_POOL_SIZE = 10000; + + public FractionalDownsampler( double fraction ) { + if ( fraction < 0.0 || fraction > 1.0 ) { + throw new ReviewedStingException("Fraction of reads to include must be between 0.0 and 1.0, inclusive"); + } + + cutoffForInclusion = (int)(fraction * RANDOM_POOL_SIZE); + clear(); + } + + public void submit( T newRead ) { + if ( GenomeAnalysisEngine.getRandomGenerator().nextInt(10000) < cutoffForInclusion ) { + selectedReads.add(newRead); + } + } + + public void submit( Collection newReads ) { + for ( T read : newReads ) { + submit(read); + } + } + + public boolean hasDownsampledItems() { + return selectedReads.size() > 0; + } + + public List consumeDownsampledItems() { + List downsampledItems = selectedReads; + clear(); + return downsampledItems; + } + + public boolean hasPendingItems() { + return false; + } + + public void signalEndOfInput() { + // NO-OP + } + + public void clear() { + selectedReads = new ArrayList(); + } + + public boolean requiresCoordinateSortOrder() { + return false; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/PositionalDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/PositionalDownsampler.java new file mode 100644 index 000000000..f29c7728c --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/PositionalDownsampler.java @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.*; + +/** + * Positional Downsampler: When eliminating reads, try to do so evenly based on the alignment start positions + * + * @author David Roazen + */ +public class PositionalDownsampler implements ReadsDownsampler { + + private int targetCoverage; + + private ReservoirDownsampler reservoir; + + private int currentContigIndex; + + private int currentAlignmentStart; + + private LinkedList pendingReads; + + private ArrayList finalizedReads; + + public PositionalDownsampler ( int targetCoverage ) { + this.targetCoverage = targetCoverage; + clear(); + } + + public void submit ( T newRead ) { + if ( readIsPastCurrentPosition(newRead) ) { + updateAndDownsamplePendingReads(); + } + + reservoir.submit(newRead); + updateCurrentPosition(newRead); + } + + public void submit ( Collection newReads ) { + for ( T read : newReads ) { + submit(read); + } + } + + public boolean hasDownsampledItems() { + return finalizedReads.size() > 0; + } + + public List consumeDownsampledItems() { + List toReturn = finalizedReads; + finalizedReads = new ArrayList(); + return toReturn; + } + + public boolean hasPendingItems() { + return pendingReads.size() > 0; + } + + public void signalEndOfInput() { + updateAndDownsamplePendingReads(); + + for ( PositionalReadGrouping group : pendingReads ) { + group.finalizeAllActiveReads(); + finalizedReads.addAll(group.getFinalizedReads()); + } + + pendingReads.clear(); + } + + public void clear() { + reservoir = new ReservoirDownsampler(targetCoverage); + pendingReads = new LinkedList(); + finalizedReads = new ArrayList(); + } + + public boolean requiresCoordinateSortOrder() { + return true; + } + + private void updateCurrentPosition ( T read ) { + currentContigIndex = read.getReferenceIndex(); + currentAlignmentStart = read.getAlignmentStart(); + } + + private boolean readIsPastCurrentPosition ( T read ) { + return read.getReferenceIndex() != currentContigIndex || read.getAlignmentStart() > currentAlignmentStart; + } + + private void updateAndDownsamplePendingReads() { + finalizeOutOfScopeReads(); + + List oldLocusReads = reservoir.consumeDownsampledItems(); + pendingReads.add(new PositionalReadGrouping(oldLocusReads, currentContigIndex, currentAlignmentStart)); + + downsampleOverlappingGroups(); + } + + private void finalizeOutOfScopeReads() { + Iterator iter = pendingReads.iterator(); + boolean noPrecedingUnfinalizedGroups = true; + + while ( iter.hasNext() ) { + PositionalReadGrouping currentGroup = iter.next(); + currentGroup.finalizeActiveReadsBeforePosition(currentContigIndex, currentAlignmentStart); + + if ( currentGroup.isFinalized() && noPrecedingUnfinalizedGroups ) { + iter.remove(); + finalizedReads.addAll(currentGroup.getFinalizedReads()); + } + else { + noPrecedingUnfinalizedGroups = false; + } + } + } + + private void downsampleOverlappingGroups() { + int[] groupReadCounts = new int[pendingReads.size()]; + int totalCoverage = 0; + int numActiveGroups = 0; + int currentGroup = 0; + + for ( PositionalReadGrouping group : pendingReads ) { + groupReadCounts[currentGroup] = group.numActiveReads(); + totalCoverage += groupReadCounts[currentGroup]; + + if ( groupReadCounts[currentGroup] > 0 ) { + numActiveGroups++; + } + + currentGroup++; + } + + if ( totalCoverage <= targetCoverage ) { + return; + } + + int numReadsToRemove = Math.min(totalCoverage - targetCoverage, totalCoverage - numActiveGroups); + currentGroup = 0; + + while ( numReadsToRemove > 0 ) { + if ( groupReadCounts[currentGroup] > 1 ) { + groupReadCounts[currentGroup]--; + numReadsToRemove--; + } + + currentGroup = (currentGroup + 1) % groupReadCounts.length; + } + + currentGroup = 0; + for ( PositionalReadGrouping group : pendingReads ) { + if ( ! group.isFinalized() ) { + group.downsampleActiveReads(groupReadCounts[currentGroup]); + } + currentGroup++; + } + } + + private class PositionalReadGrouping { + private List activeReads; + private List finalizedReads; + + private int contig; + private int alignmentStart; + + public PositionalReadGrouping( Collection reads, int contig, int alignmentStart ) { + activeReads = new LinkedList(reads); + finalizedReads = new ArrayList(); + this.contig = contig; + this.alignmentStart = alignmentStart; + } + + public int numActiveReads() { + return activeReads.size(); + } + + public boolean isFinalized() { + return activeReads.size() == 0; + } + + public List getFinalizedReads() { + return finalizedReads; + } + + public void finalizeActiveReadsBeforePosition( int contig, int position ) { + if ( this.contig != contig ) { + finalizeAllActiveReads(); + return; + } + + Iterator iter = activeReads.iterator(); + + while ( iter.hasNext() ) { + T read = iter.next(); + if ( read.getAlignmentEnd() < position ) { + iter.remove(); + finalizedReads.add(read); + } + } + } + + public void finalizeAllActiveReads() { + finalizedReads.addAll(activeReads); + activeReads.clear(); + } + + public void downsampleActiveReads( int numReadsToKeep ) { + if ( numReadsToKeep > activeReads.size() || numReadsToKeep < 0 ) { + throw new ReviewedStingException(String.format("Cannot retain %d reads out of %d total reads", + numReadsToKeep, activeReads.size())); + } + + BitSet itemsToKeep = new BitSet(activeReads.size()); + for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(activeReads.size(), numReadsToKeep) ) { + itemsToKeep.set(selectedIndex); + } + + int currentIndex = 0; + Iterator iter = activeReads.iterator(); + + while ( iter.hasNext() ) { + T read = iter.next(); + + if ( ! itemsToKeep.get(currentIndex) ) { + iter.remove(); + } + + currentIndex++; + } + } + + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsampler.java new file mode 100644 index 000000000..f78aaf4bf --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsampler.java @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMRecord; + +/** + * An extension of the basic downsampler API with reads-specific operations + * + * @author David Roazen + */ +public interface ReadsDownsampler extends Downsampler { + + /* + * Does this downsampler require that reads be fed to it in coordinate order? + */ + public boolean requiresCoordinateSortOrder(); +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java new file mode 100644 index 000000000..cb40c7042 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +/** + * Reservoir Downsampler: Selects n reads out of a stream whose size is not known in advance, with + * every read in the stream having an equal chance of being selected for inclusion. + * + * An implementation of "Algorithm R" from the paper "Random Sampling with a Reservoir" (Jeffrey Scott Vitter, 1985) + * + * @author David Roazen + */ +public class ReservoirDownsampler implements ReadsDownsampler { + + private ArrayList reservoir; + + private int targetSampleSize; + + private int totalReadsSeen; + + public ReservoirDownsampler ( int targetSampleSize ) { + if ( targetSampleSize <= 0 ) { + throw new ReviewedStingException("Cannot do reservoir downsampling with a sample size <= 0"); + } + + this.targetSampleSize = targetSampleSize; + clear(); + } + + public void submit ( T newRead ) { + totalReadsSeen++; + + if ( totalReadsSeen <= targetSampleSize ) { + reservoir.add(newRead); + } + else { + int randomSlot = GenomeAnalysisEngine.getRandomGenerator().nextInt(totalReadsSeen); + if ( randomSlot < targetSampleSize ) { + reservoir.set(randomSlot, newRead); + } + } + } + + public void submit ( Collection newReads ) { + for ( T read : newReads ) { + submit(read); + } + } + + public boolean hasDownsampledItems() { + return reservoir.size() > 0; + } + + public List consumeDownsampledItems() { + List downsampledItems = reservoir; + clear(); + return downsampledItems; + } + + public boolean hasPendingItems() { + return false; + } + + public void signalEndOfInput() { + // NO-OP + } + + public void clear() { + reservoir = new ArrayList(targetSampleSize); + totalReadsSeen = 0; + } + + public boolean requiresCoordinateSortOrder() { + return false; + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIteratorUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIteratorUnitTest.java new file mode 100644 index 000000000..b0de78b97 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIteratorUnitTest.java @@ -0,0 +1,73 @@ +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Collection; + +public class DownsamplingReadsIteratorUnitTest { + + @Test + public void testDownsamplingIteratorWithPositionalDownsampling() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + Collection reads = new ArrayList(); + + reads.addAll(createStackOfIdenticalReads(3000, header, "foo", 0, 1, 100)); + reads.addAll(createStackOfIdenticalReads(3000, header, "foo", 0, 50, 100)); + + StingSAMIterator iter = new DownsamplingReadsIterator(StingSAMIteratorAdapter.adapt(reads.iterator()), new PositionalDownsampler(1000)); + + Assert.assertTrue(iter.hasNext()); + SAMRecord previous = iter.next(); + int count = 1; + + while ( iter.hasNext() ) { + SAMRecord current = iter.next(); + Assert.assertTrue(previous.getAlignmentStart() <= current.getAlignmentStart() || ! previous.getReferenceIndex().equals(current.getReferenceIndex())); + count++; + previous = current; + } + + Assert.assertEquals(count, 1000); + } + + @Test + public void testDownsamplingIteratorNoEffectiveDownsampling() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + Collection reads = new ArrayList(); + + reads.addAll(createStackOfIdenticalReads(300, header, "foo", 0, 1, 100)); + reads.addAll(createStackOfIdenticalReads(300, header, "foo", 0, 50, 100)); + + StingSAMIterator iter = new DownsamplingReadsIterator(StingSAMIteratorAdapter.adapt(reads.iterator()), new PositionalDownsampler(1000)); + + Assert.assertTrue(iter.hasNext()); + SAMRecord previous = iter.next(); + int count = 1; + + while ( iter.hasNext() ) { + SAMRecord current = iter.next(); + Assert.assertTrue(previous.getAlignmentStart() <= current.getAlignmentStart() || ! previous.getReferenceIndex().equals(current.getReferenceIndex())); + count++; + previous = current; + } + + Assert.assertEquals(count, 600); + } + + private ArrayList createStackOfIdenticalReads( int stackSize, SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) { + ArrayList stack = new ArrayList(stackSize); + for ( int i = 1; i <= stackSize; i++ ) { + stack.add(ArtificialSAMUtils.createArtificialRead(header, name, refIndex, alignmentStart, length)); + } + return stack; + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerUnitTest.java new file mode 100644 index 000000000..0f4bae555 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerUnitTest.java @@ -0,0 +1,65 @@ +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.testng.annotations.Test; +import org.testng.Assert; + +import java.util.ArrayList; +import java.util.List; + +public class FractionalDownsamplerUnitTest { + + @Test + public void test100PercentInclusion() { + FractionalDownsampler downsampler = new FractionalDownsampler(1.0); + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + downsampler.submit(createRandomReads(1000, header, "foo", 0, 100000, 500)); + downsampler.signalEndOfInput(); + + List downsampledReads = downsampler.consumeDownsampledItems(); + + Assert.assertTrue(downsampledReads.size() == 1000); + } + + @Test + public void test0PercentInclusion() { + FractionalDownsampler downsampler = new FractionalDownsampler(0.0); + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + downsampler.submit(createRandomReads(1000, header, "foo", 0, 100000, 500)); + downsampler.signalEndOfInput(); + + List downsampledReads = downsampler.consumeDownsampledItems(); + + Assert.assertTrue(downsampledReads.isEmpty()); + } + + @Test + public void test50PercentInclusion() { + FractionalDownsampler downsampler = new FractionalDownsampler(0.5); + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + downsampler.submit(createRandomReads(5000, header, "foo", 0, 100000, 500)); + downsampler.signalEndOfInput(); + + List downsampledReads = downsampler.consumeDownsampledItems(); + + Assert.assertTrue(downsampledReads.size() >= 2000 && downsampledReads.size() <= 3000); + } + + private List createRandomReads( int numReads, SAMFileHeader header, String name, int contigIndex, int maxAlignmentStart, int maxLength ) { + List reads = new ArrayList(numReads); + + for ( int i = 1; i <= numReads; i++ ) { + reads.add(ArtificialSAMUtils.createArtificialRead(header, name, contigIndex, + GenomeAnalysisEngine.getRandomGenerator().nextInt(maxAlignmentStart) + 1, + GenomeAnalysisEngine.getRandomGenerator().nextInt(maxLength) + 1)); + } + + return reads; + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionalDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionalDownsamplerUnitTest.java new file mode 100644 index 000000000..b1d8e45c9 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionalDownsamplerUnitTest.java @@ -0,0 +1,357 @@ +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.annotations.Test; +import org.testng.Assert; + +import java.util.*; + +// TODO: generalize these tests so that all possible arrangements of 1-4 stacks can be tested +public class PositionalDownsamplerUnitTest extends BaseTest { + + /** + * ------- + * ------- + * ------- + * ------- + * ------- + * ------- + */ + @Test + public void testThreeOverlappingIdenticalStacks() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + PositionalDownsampler downsampler = new PositionalDownsampler(1000); + + downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 100)); + Assert.assertFalse(downsampler.hasDownsampledItems()); + Assert.assertTrue(downsampler.hasPendingItems()); + + downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 25, 100)); + Assert.assertFalse(downsampler.hasDownsampledItems()); + Assert.assertTrue(downsampler.hasPendingItems()); + + downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 50, 100)); + Assert.assertFalse(downsampler.hasDownsampledItems()); + Assert.assertTrue(downsampler.hasPendingItems()); + + downsampler.signalEndOfInput(); + Assert.assertTrue(downsampler.hasDownsampledItems()); + Assert.assertFalse(downsampler.hasPendingItems()); + + List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems()); + + System.out.println("testThreeOverlappingIdenticalStacks: Downsampled Stack sizes: " + downsampledStackSizes); + + Assert.assertEquals(downsampledStackSizes.size(), 3); + Assert.assertTrue(downsampledStackSizes.get(0) <= 1000); + Assert.assertTrue(downsampledStackSizes.get(1) <= 1000); + Assert.assertTrue(downsampledStackSizes.get(2) <= 1000); + Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(1) + downsampledStackSizes.get(2) <= 1000); + } + + /** + * ------- + * ------- + * ------- + * ------- + * ------- + * ------- + */ + @Test + public void testThreeNonOverlappingIdenticalStacks() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + PositionalDownsampler downsampler = new PositionalDownsampler(1000); + + downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 100)); + Assert.assertFalse(downsampler.hasDownsampledItems()); + Assert.assertTrue(downsampler.hasPendingItems()); + + downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 201, 100)); + Assert.assertFalse(downsampler.hasDownsampledItems()); + Assert.assertTrue(downsampler.hasPendingItems()); + + downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 301, 100)); + Assert.assertTrue(downsampler.hasDownsampledItems()); + Assert.assertTrue(downsampler.hasPendingItems()); + + downsampler.signalEndOfInput(); + Assert.assertTrue(downsampler.hasDownsampledItems()); + Assert.assertFalse(downsampler.hasPendingItems()); + + List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems()); + + System.out.println("testThreeNonOverlappingIdenticalStacks: Downsampled Stack sizes: " + downsampledStackSizes); + + Assert.assertEquals(downsampledStackSizes.size(), 3); + Assert.assertTrue(downsampledStackSizes.get(0) == 1000); + Assert.assertTrue(downsampledStackSizes.get(1) == 1000); + Assert.assertTrue(downsampledStackSizes.get(2) == 1000); + } + + /** + * --- + * --- + * ------- + * ------- + * ------- + * ------- + */ + @Test + public void testThreeStacksWithShortStackAtBeginning() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + PositionalDownsampler downsampler = new PositionalDownsampler(1000); + + downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 25)); + Assert.assertFalse(downsampler.hasDownsampledItems()); + Assert.assertTrue(downsampler.hasPendingItems()); + + downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 20, 100)); + Assert.assertFalse(downsampler.hasDownsampledItems()); + Assert.assertTrue(downsampler.hasPendingItems()); + + downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 50, 100)); + Assert.assertFalse(downsampler.hasDownsampledItems()); + Assert.assertTrue(downsampler.hasPendingItems()); + + downsampler.signalEndOfInput(); + Assert.assertTrue(downsampler.hasDownsampledItems()); + Assert.assertFalse(downsampler.hasPendingItems()); + + List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems()); + + System.out.println("testThreeStacksWithShortStackAtBeginning: Downsampled Stack sizes: " + downsampledStackSizes); + + Assert.assertEquals(downsampledStackSizes.size(), 3); + Assert.assertTrue(downsampledStackSizes.get(0) <= 1000); + Assert.assertTrue(downsampledStackSizes.get(1) <= 1000); + Assert.assertTrue(downsampledStackSizes.get(2) <= 1000); + Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(1) <= 1000); + Assert.assertTrue(downsampledStackSizes.get(1) + downsampledStackSizes.get(2) <= 1000); + } + + /** + * ------- + * ------- + * --- + * --- + * ------- + * ------- + */ + @Test + public void testThreeStacksWithShortStackInMiddle() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + PositionalDownsampler downsampler = new PositionalDownsampler(1000); + + downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 100)); + Assert.assertFalse(downsampler.hasDownsampledItems()); + Assert.assertTrue(downsampler.hasPendingItems()); + + downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 25, 25)); + Assert.assertFalse(downsampler.hasDownsampledItems()); + Assert.assertTrue(downsampler.hasPendingItems()); + + downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 75, 100)); + Assert.assertFalse(downsampler.hasDownsampledItems()); + Assert.assertTrue(downsampler.hasPendingItems()); + + downsampler.signalEndOfInput(); + Assert.assertTrue(downsampler.hasDownsampledItems()); + Assert.assertFalse(downsampler.hasPendingItems()); + + List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems()); + + System.out.println("testThreeStacksWithShortStackInMiddle: Downsampled Stack sizes: " + downsampledStackSizes); + + Assert.assertEquals(downsampledStackSizes.size(), 3); + Assert.assertTrue(downsampledStackSizes.get(0) <= 1000); + Assert.assertTrue(downsampledStackSizes.get(1) <= 1000); + Assert.assertTrue(downsampledStackSizes.get(2) <= 1000); + Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(1) <= 1000); + Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(2) <= 1000); + } + + /** + * ------ + * ------ + * ------- + * ------- + * --- + * --- + */ + @Test + public void testThreeStacksWithShortStackAtEnd() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + PositionalDownsampler downsampler = new PositionalDownsampler(1000); + + downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 100)); + Assert.assertFalse(downsampler.hasDownsampledItems()); + Assert.assertTrue(downsampler.hasPendingItems()); + + downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 50, 100)); + Assert.assertFalse(downsampler.hasDownsampledItems()); + Assert.assertTrue(downsampler.hasPendingItems()); + + downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 135, 25)); + Assert.assertFalse(downsampler.hasDownsampledItems()); + Assert.assertTrue(downsampler.hasPendingItems()); + + downsampler.signalEndOfInput(); + Assert.assertTrue(downsampler.hasDownsampledItems()); + Assert.assertFalse(downsampler.hasPendingItems()); + + List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems()); + + System.out.println("testThreeStacksWithShortStackAtEnd: Downsampled Stack sizes: " + downsampledStackSizes); + + Assert.assertEquals(downsampledStackSizes.size(), 3); + Assert.assertTrue(downsampledStackSizes.get(0) <= 1000); + Assert.assertTrue(downsampledStackSizes.get(1) <= 1000); + Assert.assertTrue(downsampledStackSizes.get(2) <= 1000); + Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(1) <= 1000); + Assert.assertTrue(downsampledStackSizes.get(1) + downsampledStackSizes.get(2) <= 1000); + } + + /** + * ------- + * ---- + * ------- + * ---- + * ------- + * ------- + */ + @Test + public void testThreePartiallyOverlappingStacks() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + PositionalDownsampler downsampler = new PositionalDownsampler(1000); + + downsampler.submit(createStackOfVaryingReads(2000, header, "foo", 0, 1, 100, 50)); + Assert.assertFalse(downsampler.hasDownsampledItems()); + Assert.assertTrue(downsampler.hasPendingItems()); + + downsampler.submit(createStackOfVaryingReads(2000, header, "foo", 0, 75, 100, 50)); + Assert.assertFalse(downsampler.hasDownsampledItems()); + Assert.assertTrue(downsampler.hasPendingItems()); + + downsampler.submit(createStackOfIdenticalReads(2000, header, "foo", 0, 150, 100)); + Assert.assertFalse(downsampler.hasDownsampledItems()); + Assert.assertTrue(downsampler.hasPendingItems()); + + downsampler.signalEndOfInput(); + Assert.assertTrue(downsampler.hasDownsampledItems()); + Assert.assertFalse(downsampler.hasPendingItems()); + + List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems()); + + System.out.println("testThreePartiallyOverlappingStacks: Downsampled Stack sizes: " + downsampledStackSizes); + + Assert.assertEquals(downsampledStackSizes.size(), 3); + Assert.assertTrue(downsampledStackSizes.get(0) <= 1000); + Assert.assertTrue(downsampledStackSizes.get(1) <= 1000); + Assert.assertTrue(downsampledStackSizes.get(2) <= 1000); + + // TODO: need to examine per-base coverage here + } + + @Test + public void testNoDownsamplingRequired() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + PositionalDownsampler downsampler = new PositionalDownsampler(1000); + + downsampler.submit(createStackOfIdenticalReads(300, header, "foo", 0, 1, 100)); + Assert.assertFalse(downsampler.hasDownsampledItems()); + Assert.assertTrue(downsampler.hasPendingItems()); + + downsampler.submit(createStackOfIdenticalReads(300, header, "foo", 0, 25, 100)); + Assert.assertFalse(downsampler.hasDownsampledItems()); + Assert.assertTrue(downsampler.hasPendingItems()); + + downsampler.submit(createStackOfIdenticalReads(300, header, "foo", 0, 50, 100)); + Assert.assertFalse(downsampler.hasDownsampledItems()); + Assert.assertTrue(downsampler.hasPendingItems()); + + downsampler.signalEndOfInput(); + Assert.assertTrue(downsampler.hasDownsampledItems()); + Assert.assertFalse(downsampler.hasPendingItems()); + + List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems()); + + System.out.println("testNoDownsamplingRequired: Downsampled Stack sizes: " + downsampledStackSizes); + + Assert.assertEquals(downsampledStackSizes.size(), 3); + Assert.assertTrue(downsampledStackSizes.get(0) == 300); + Assert.assertTrue(downsampledStackSizes.get(1) == 300); + Assert.assertTrue(downsampledStackSizes.get(2) == 300); + } + + @Test + public void testGATKSAMRecordSupport() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + PositionalDownsampler downsampler = new PositionalDownsampler(1000); + + List reads = new ArrayList(); + for ( int i = 0; i < 10; i++ ) { + reads.add(ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 10, 20 * i + 10)); + } + + downsampler.submit(reads); + downsampler.signalEndOfInput(); + List downsampledReads = downsampler.consumeDownsampledItems(); + + Assert.assertTrue(downsampledReads.size() == 10); + } + + private ArrayList createStackOfIdenticalReads( int stackSize, SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) { + ArrayList stack = new ArrayList(stackSize); + for ( int i = 1; i <= stackSize; i++ ) { + stack.add(ArtificialSAMUtils.createArtificialRead(header, name, refIndex, alignmentStart, length)); + } + return stack; + } + + private ArrayList createStackOfVaryingReads( int stackSize, SAMFileHeader header, String name, int refIndex, int alignmentStart, int firstLength, int secondLength ) { + ArrayList stack = createStackOfIdenticalReads(stackSize / 2, header, name, refIndex, alignmentStart, firstLength); + stack.addAll(createStackOfIdenticalReads(stackSize / 2, header, name, refIndex, alignmentStart, secondLength)); + return stack; + } + + private List getDownsampledStackSizesAndVerifySortedness( List downsampledReads ) { + List stackSizes = new ArrayList(); + Iterator iter = downsampledReads.iterator(); + Assert.assertTrue(iter.hasNext()); + + SAMRecord previousRead = iter.next(); + int currentStackSize = 1; + + while ( iter.hasNext() ) { + SAMRecord currentRead = iter.next(); + + if ( ! currentRead.getReferenceIndex().equals(previousRead.getReferenceIndex()) || currentRead.getAlignmentStart() > previousRead.getAlignmentStart() ) { + stackSizes.add(currentStackSize); + currentStackSize = 1; + } + else if ( currentRead.getAlignmentStart() < previousRead.getAlignmentStart() ) { + Assert.fail(String.format("Reads are out of order: %s %s", previousRead, currentRead)); + } + else { + currentStackSize++; + } + + previousRead = currentRead; + } + + stackSizes.add(currentStackSize); + return stackSizes; + } +} +