diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/Downsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/Downsampler.java new file mode 100644 index 000000000..5fb99b2bc --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/Downsampler.java @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import java.util.Collection; +import java.util.List; + +/** + * The basic downsampler API, with no reads-specific operations + * + * @author David Roazen + */ +public interface Downsampler { + + /* + * Submit one item to the downsampler for consideration . Some downsamplers will be able to determine + * immediately whether the item survives the downsampling process, while others will need to see + * more items before making that determination. + */ + public void submit( T item ); + + /* + * Submit a collection of items to the downsampler for consideration. + */ + public void submit( Collection items ); + + /* + * Are there items that have survived the downsampling process waiting to be retrieved? + */ + public boolean hasDownsampledItems(); + + /* + * Return (and remove) all items that have survived downsampling and are waiting to be retrieved. + */ + public List consumeDownsampledItems(); + + /* + * Are there items stored in this downsampler that it doesn't yet know whether they will + * ultimately survive the downsampling process? + */ + public boolean hasPendingItems(); + + /* + * Used to tell the downsampler that no more items will be submitted to it, and that it should + * finalize any pending items. + */ + public void signalEndOfInput(); + + /* + * Reset the downsampler to a clean state, devoid of any pending/downsampled items or tracked state + * information. + */ + public void clear(); +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIterator.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIterator.java new file mode 100644 index 000000000..bccc2e946 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIterator.java @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; + +import java.util.Collection; +import java.util.Iterator; +import java.util.NoSuchElementException; + + +/** + * StingSAMIterator wrapper around our generic reads downsampler interface + * + * @author David Roazen + */ +public class DownsamplingReadsIterator implements StingSAMIterator { + + private StingSAMIterator nestedSAMIterator; + private ReadsDownsampler downsampler; + private Collection downsampledReadsCache; + private Iterator downsampledReadsCacheIterator; + + public DownsamplingReadsIterator( StingSAMIterator iter, ReadsDownsampler downsampler ) { + nestedSAMIterator = iter; + this.downsampler = downsampler; + fillDownsampledReadsCache(); + } + + public boolean hasNext() { + if ( downsampledReadsCacheIterator.hasNext() ) { + return true; + } + else if ( ! nestedSAMIterator.hasNext() || ! fillDownsampledReadsCache() ) { + return false; + } + + return true; + } + + public SAMRecord next() { + if ( ! downsampledReadsCacheIterator.hasNext() && ! fillDownsampledReadsCache() ) { + throw new NoSuchElementException("next() called when there are no more items"); + } + + return downsampledReadsCacheIterator.next(); + } + + private boolean fillDownsampledReadsCache() { + while ( nestedSAMIterator.hasNext() && ! downsampler.hasDownsampledItems() ) { + downsampler.submit(nestedSAMIterator.next()); + } + + if ( ! nestedSAMIterator.hasNext() ) { + downsampler.signalEndOfInput(); + } + + downsampledReadsCache = downsampler.consumeDownsampledItems(); + downsampledReadsCacheIterator = downsampledReadsCache.iterator(); + + return downsampledReadsCacheIterator.hasNext(); + } + + public void remove() { + throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); + } + + public void close() { + nestedSAMIterator.close(); + } + + public Iterator iterator() { + return this; + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsampler.java new file mode 100644 index 000000000..d5d529c9f --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsampler.java @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +/** + * Fractional Downsampler: selects a specified fraction of the reads for inclusion + * + * @author David Roazen + */ +public class FractionalDownsampler implements ReadsDownsampler { + + private ArrayList selectedReads; + + private int cutoffForInclusion; + + private static final int RANDOM_POOL_SIZE = 10000; + + public FractionalDownsampler( double fraction ) { + if ( fraction < 0.0 || fraction > 1.0 ) { + throw new ReviewedStingException("Fraction of reads to include must be between 0.0 and 1.0, inclusive"); + } + + cutoffForInclusion = (int)(fraction * RANDOM_POOL_SIZE); + clear(); + } + + public void submit( T newRead ) { + if ( GenomeAnalysisEngine.getRandomGenerator().nextInt(10000) < cutoffForInclusion ) { + selectedReads.add(newRead); + } + } + + public void submit( Collection newReads ) { + for ( T read : newReads ) { + submit(read); + } + } + + public boolean hasDownsampledItems() { + return selectedReads.size() > 0; + } + + public List consumeDownsampledItems() { + List downsampledItems = selectedReads; + clear(); + return downsampledItems; + } + + public boolean hasPendingItems() { + return false; + } + + public void signalEndOfInput() { + // NO-OP + } + + public void clear() { + selectedReads = new ArrayList(); + } + + public boolean requiresCoordinateSortOrder() { + return false; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/PositionalDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/PositionalDownsampler.java new file mode 100644 index 000000000..f29c7728c --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/PositionalDownsampler.java @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.*; + +/** + * Positional Downsampler: When eliminating reads, try to do so evenly based on the alignment start positions + * + * @author David Roazen + */ +public class PositionalDownsampler implements ReadsDownsampler { + + private int targetCoverage; + + private ReservoirDownsampler reservoir; + + private int currentContigIndex; + + private int currentAlignmentStart; + + private LinkedList pendingReads; + + private ArrayList finalizedReads; + + public PositionalDownsampler ( int targetCoverage ) { + this.targetCoverage = targetCoverage; + clear(); + } + + public void submit ( T newRead ) { + if ( readIsPastCurrentPosition(newRead) ) { + updateAndDownsamplePendingReads(); + } + + reservoir.submit(newRead); + updateCurrentPosition(newRead); + } + + public void submit ( Collection newReads ) { + for ( T read : newReads ) { + submit(read); + } + } + + public boolean hasDownsampledItems() { + return finalizedReads.size() > 0; + } + + public List consumeDownsampledItems() { + List toReturn = finalizedReads; + finalizedReads = new ArrayList(); + return toReturn; + } + + public boolean hasPendingItems() { + return pendingReads.size() > 0; + } + + public void signalEndOfInput() { + updateAndDownsamplePendingReads(); + + for ( PositionalReadGrouping group : pendingReads ) { + group.finalizeAllActiveReads(); + finalizedReads.addAll(group.getFinalizedReads()); + } + + pendingReads.clear(); + } + + public void clear() { + reservoir = new ReservoirDownsampler(targetCoverage); + pendingReads = new LinkedList(); + finalizedReads = new ArrayList(); + } + + public boolean requiresCoordinateSortOrder() { + return true; + } + + private void updateCurrentPosition ( T read ) { + currentContigIndex = read.getReferenceIndex(); + currentAlignmentStart = read.getAlignmentStart(); + } + + private boolean readIsPastCurrentPosition ( T read ) { + return read.getReferenceIndex() != currentContigIndex || read.getAlignmentStart() > currentAlignmentStart; + } + + private void updateAndDownsamplePendingReads() { + finalizeOutOfScopeReads(); + + List oldLocusReads = reservoir.consumeDownsampledItems(); + pendingReads.add(new PositionalReadGrouping(oldLocusReads, currentContigIndex, currentAlignmentStart)); + + downsampleOverlappingGroups(); + } + + private void finalizeOutOfScopeReads() { + Iterator iter = pendingReads.iterator(); + boolean noPrecedingUnfinalizedGroups = true; + + while ( iter.hasNext() ) { + PositionalReadGrouping currentGroup = iter.next(); + currentGroup.finalizeActiveReadsBeforePosition(currentContigIndex, currentAlignmentStart); + + if ( currentGroup.isFinalized() && noPrecedingUnfinalizedGroups ) { + iter.remove(); + finalizedReads.addAll(currentGroup.getFinalizedReads()); + } + else { + noPrecedingUnfinalizedGroups = false; + } + } + } + + private void downsampleOverlappingGroups() { + int[] groupReadCounts = new int[pendingReads.size()]; + int totalCoverage = 0; + int numActiveGroups = 0; + int currentGroup = 0; + + for ( PositionalReadGrouping group : pendingReads ) { + groupReadCounts[currentGroup] = group.numActiveReads(); + totalCoverage += groupReadCounts[currentGroup]; + + if ( groupReadCounts[currentGroup] > 0 ) { + numActiveGroups++; + } + + currentGroup++; + } + + if ( totalCoverage <= targetCoverage ) { + return; + } + + int numReadsToRemove = Math.min(totalCoverage - targetCoverage, totalCoverage - numActiveGroups); + currentGroup = 0; + + while ( numReadsToRemove > 0 ) { + if ( groupReadCounts[currentGroup] > 1 ) { + groupReadCounts[currentGroup]--; + numReadsToRemove--; + } + + currentGroup = (currentGroup + 1) % groupReadCounts.length; + } + + currentGroup = 0; + for ( PositionalReadGrouping group : pendingReads ) { + if ( ! group.isFinalized() ) { + group.downsampleActiveReads(groupReadCounts[currentGroup]); + } + currentGroup++; + } + } + + private class PositionalReadGrouping { + private List activeReads; + private List finalizedReads; + + private int contig; + private int alignmentStart; + + public PositionalReadGrouping( Collection reads, int contig, int alignmentStart ) { + activeReads = new LinkedList(reads); + finalizedReads = new ArrayList(); + this.contig = contig; + this.alignmentStart = alignmentStart; + } + + public int numActiveReads() { + return activeReads.size(); + } + + public boolean isFinalized() { + return activeReads.size() == 0; + } + + public List getFinalizedReads() { + return finalizedReads; + } + + public void finalizeActiveReadsBeforePosition( int contig, int position ) { + if ( this.contig != contig ) { + finalizeAllActiveReads(); + return; + } + + Iterator iter = activeReads.iterator(); + + while ( iter.hasNext() ) { + T read = iter.next(); + if ( read.getAlignmentEnd() < position ) { + iter.remove(); + finalizedReads.add(read); + } + } + } + + public void finalizeAllActiveReads() { + finalizedReads.addAll(activeReads); + activeReads.clear(); + } + + public void downsampleActiveReads( int numReadsToKeep ) { + if ( numReadsToKeep > activeReads.size() || numReadsToKeep < 0 ) { + throw new ReviewedStingException(String.format("Cannot retain %d reads out of %d total reads", + numReadsToKeep, activeReads.size())); + } + + BitSet itemsToKeep = new BitSet(activeReads.size()); + for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(activeReads.size(), numReadsToKeep) ) { + itemsToKeep.set(selectedIndex); + } + + int currentIndex = 0; + Iterator iter = activeReads.iterator(); + + while ( iter.hasNext() ) { + T read = iter.next(); + + if ( ! itemsToKeep.get(currentIndex) ) { + iter.remove(); + } + + currentIndex++; + } + } + + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsampler.java new file mode 100644 index 000000000..f78aaf4bf --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsampler.java @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMRecord; + +/** + * An extension of the basic downsampler API with reads-specific operations + * + * @author David Roazen + */ +public interface ReadsDownsampler extends Downsampler { + + /* + * Does this downsampler require that reads be fed to it in coordinate order? + */ + public boolean requiresCoordinateSortOrder(); +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java new file mode 100644 index 000000000..cb40c7042 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +/** + * Reservoir Downsampler: Selects n reads out of a stream whose size is not known in advance, with + * every read in the stream having an equal chance of being selected for inclusion. + * + * An implementation of "Algorithm R" from the paper "Random Sampling with a Reservoir" (Jeffrey Scott Vitter, 1985) + * + * @author David Roazen + */ +public class ReservoirDownsampler implements ReadsDownsampler { + + private ArrayList reservoir; + + private int targetSampleSize; + + private int totalReadsSeen; + + public ReservoirDownsampler ( int targetSampleSize ) { + if ( targetSampleSize <= 0 ) { + throw new ReviewedStingException("Cannot do reservoir downsampling with a sample size <= 0"); + } + + this.targetSampleSize = targetSampleSize; + clear(); + } + + public void submit ( T newRead ) { + totalReadsSeen++; + + if ( totalReadsSeen <= targetSampleSize ) { + reservoir.add(newRead); + } + else { + int randomSlot = GenomeAnalysisEngine.getRandomGenerator().nextInt(totalReadsSeen); + if ( randomSlot < targetSampleSize ) { + reservoir.set(randomSlot, newRead); + } + } + } + + public void submit ( Collection newReads ) { + for ( T read : newReads ) { + submit(read); + } + } + + public boolean hasDownsampledItems() { + return reservoir.size() > 0; + } + + public List consumeDownsampledItems() { + List downsampledItems = reservoir; + clear(); + return downsampledItems; + } + + public boolean hasPendingItems() { + return false; + } + + public void signalEndOfInput() { + // NO-OP + } + + public void clear() { + reservoir = new ArrayList(targetSampleSize); + totalReadsSeen = 0; + } + + public boolean requiresCoordinateSortOrder() { + return false; + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIteratorUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIteratorUnitTest.java new file mode 100644 index 000000000..b0de78b97 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIteratorUnitTest.java @@ -0,0 +1,73 @@ +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Collection; + +public class DownsamplingReadsIteratorUnitTest { + + @Test + public void testDownsamplingIteratorWithPositionalDownsampling() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + Collection reads = new ArrayList(); + + reads.addAll(createStackOfIdenticalReads(3000, header, "foo", 0, 1, 100)); + reads.addAll(createStackOfIdenticalReads(3000, header, "foo", 0, 50, 100)); + + StingSAMIterator iter = new DownsamplingReadsIterator(StingSAMIteratorAdapter.adapt(reads.iterator()), new PositionalDownsampler(1000)); + + Assert.assertTrue(iter.hasNext()); + SAMRecord previous = iter.next(); + int count = 1; + + while ( iter.hasNext() ) { + SAMRecord current = iter.next(); + Assert.assertTrue(previous.getAlignmentStart() <= current.getAlignmentStart() || ! previous.getReferenceIndex().equals(current.getReferenceIndex())); + count++; + previous = current; + } + + Assert.assertEquals(count, 1000); + } + + @Test + public void testDownsamplingIteratorNoEffectiveDownsampling() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + Collection reads = new ArrayList(); + + reads.addAll(createStackOfIdenticalReads(300, header, "foo", 0, 1, 100)); + reads.addAll(createStackOfIdenticalReads(300, header, "foo", 0, 50, 100)); + + StingSAMIterator iter = new DownsamplingReadsIterator(StingSAMIteratorAdapter.adapt(reads.iterator()), new PositionalDownsampler(1000)); + + Assert.assertTrue(iter.hasNext()); + SAMRecord previous = iter.next(); + int count = 1; + + while ( iter.hasNext() ) { + SAMRecord current = iter.next(); + Assert.assertTrue(previous.getAlignmentStart() <= current.getAlignmentStart() || ! previous.getReferenceIndex().equals(current.getReferenceIndex())); + count++; + previous = current; + } + + Assert.assertEquals(count, 600); + } + + private ArrayList createStackOfIdenticalReads( int stackSize, SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) { + ArrayList stack = new ArrayList(stackSize); + for ( int i = 1; i <= stackSize; i++ ) { + stack.add(ArtificialSAMUtils.createArtificialRead(header, name, refIndex, alignmentStart, length)); + } + return stack; + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerUnitTest.java new file mode 100644 index 000000000..0f4bae555 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerUnitTest.java @@ -0,0 +1,65 @@ +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.testng.annotations.Test; +import org.testng.Assert; + +import java.util.ArrayList; +import java.util.List; + +public class FractionalDownsamplerUnitTest { + + @Test + public void test100PercentInclusion() { + FractionalDownsampler downsampler = new FractionalDownsampler(1.0); + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + downsampler.submit(createRandomReads(1000, header, "foo", 0, 100000, 500)); + downsampler.signalEndOfInput(); + + List downsampledReads = downsampler.consumeDownsampledItems(); + + Assert.assertTrue(downsampledReads.size() == 1000); + } + + @Test + public void test0PercentInclusion() { + FractionalDownsampler downsampler = new FractionalDownsampler(0.0); + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + downsampler.submit(createRandomReads(1000, header, "foo", 0, 100000, 500)); + downsampler.signalEndOfInput(); + + List downsampledReads = downsampler.consumeDownsampledItems(); + + Assert.assertTrue(downsampledReads.isEmpty()); + } + + @Test + public void test50PercentInclusion() { + FractionalDownsampler downsampler = new FractionalDownsampler(0.5); + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + downsampler.submit(createRandomReads(5000, header, "foo", 0, 100000, 500)); + downsampler.signalEndOfInput(); + + List downsampledReads = downsampler.consumeDownsampledItems(); + + Assert.assertTrue(downsampledReads.size() >= 2000 && downsampledReads.size() <= 3000); + } + + private List createRandomReads( int numReads, SAMFileHeader header, String name, int contigIndex, int maxAlignmentStart, int maxLength ) { + List reads = new ArrayList(numReads); + + for ( int i = 1; i <= numReads; i++ ) { + reads.add(ArtificialSAMUtils.createArtificialRead(header, name, contigIndex, + GenomeAnalysisEngine.getRandomGenerator().nextInt(maxAlignmentStart) + 1, + GenomeAnalysisEngine.getRandomGenerator().nextInt(maxLength) + 1)); + } + + return reads; + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionalDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionalDownsamplerUnitTest.java new file mode 100644 index 000000000..b1d8e45c9 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionalDownsamplerUnitTest.java @@ -0,0 +1,357 @@ +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.annotations.Test; +import org.testng.Assert; + +import java.util.*; + +// TODO: generalize these tests so that all possible arrangements of 1-4 stacks can be tested +public class PositionalDownsamplerUnitTest extends BaseTest { + + /** + * ------- + * ------- + * ------- + * ------- + * ------- + * ------- + */ + @Test + public void testThreeOverlappingIdenticalStacks() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + PositionalDownsampler downsampler = new PositionalDownsampler(1000); + + downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 100)); + Assert.assertFalse(downsampler.hasDownsampledItems()); + Assert.assertTrue(downsampler.hasPendingItems()); + + downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 25, 100)); + Assert.assertFalse(downsampler.hasDownsampledItems()); + Assert.assertTrue(downsampler.hasPendingItems()); + + downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 50, 100)); + Assert.assertFalse(downsampler.hasDownsampledItems()); + Assert.assertTrue(downsampler.hasPendingItems()); + + downsampler.signalEndOfInput(); + Assert.assertTrue(downsampler.hasDownsampledItems()); + Assert.assertFalse(downsampler.hasPendingItems()); + + List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems()); + + System.out.println("testThreeOverlappingIdenticalStacks: Downsampled Stack sizes: " + downsampledStackSizes); + + Assert.assertEquals(downsampledStackSizes.size(), 3); + Assert.assertTrue(downsampledStackSizes.get(0) <= 1000); + Assert.assertTrue(downsampledStackSizes.get(1) <= 1000); + Assert.assertTrue(downsampledStackSizes.get(2) <= 1000); + Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(1) + downsampledStackSizes.get(2) <= 1000); + } + + /** + * ------- + * ------- + * ------- + * ------- + * ------- + * ------- + */ + @Test + public void testThreeNonOverlappingIdenticalStacks() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + PositionalDownsampler downsampler = new PositionalDownsampler(1000); + + downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 100)); + Assert.assertFalse(downsampler.hasDownsampledItems()); + Assert.assertTrue(downsampler.hasPendingItems()); + + downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 201, 100)); + Assert.assertFalse(downsampler.hasDownsampledItems()); + Assert.assertTrue(downsampler.hasPendingItems()); + + downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 301, 100)); + Assert.assertTrue(downsampler.hasDownsampledItems()); + Assert.assertTrue(downsampler.hasPendingItems()); + + downsampler.signalEndOfInput(); + Assert.assertTrue(downsampler.hasDownsampledItems()); + Assert.assertFalse(downsampler.hasPendingItems()); + + List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems()); + + System.out.println("testThreeNonOverlappingIdenticalStacks: Downsampled Stack sizes: " + downsampledStackSizes); + + Assert.assertEquals(downsampledStackSizes.size(), 3); + Assert.assertTrue(downsampledStackSizes.get(0) == 1000); + Assert.assertTrue(downsampledStackSizes.get(1) == 1000); + Assert.assertTrue(downsampledStackSizes.get(2) == 1000); + } + + /** + * --- + * --- + * ------- + * ------- + * ------- + * ------- + */ + @Test + public void testThreeStacksWithShortStackAtBeginning() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + PositionalDownsampler downsampler = new PositionalDownsampler(1000); + + downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 25)); + Assert.assertFalse(downsampler.hasDownsampledItems()); + Assert.assertTrue(downsampler.hasPendingItems()); + + downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 20, 100)); + Assert.assertFalse(downsampler.hasDownsampledItems()); + Assert.assertTrue(downsampler.hasPendingItems()); + + downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 50, 100)); + Assert.assertFalse(downsampler.hasDownsampledItems()); + Assert.assertTrue(downsampler.hasPendingItems()); + + downsampler.signalEndOfInput(); + Assert.assertTrue(downsampler.hasDownsampledItems()); + Assert.assertFalse(downsampler.hasPendingItems()); + + List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems()); + + System.out.println("testThreeStacksWithShortStackAtBeginning: Downsampled Stack sizes: " + downsampledStackSizes); + + Assert.assertEquals(downsampledStackSizes.size(), 3); + Assert.assertTrue(downsampledStackSizes.get(0) <= 1000); + Assert.assertTrue(downsampledStackSizes.get(1) <= 1000); + Assert.assertTrue(downsampledStackSizes.get(2) <= 1000); + Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(1) <= 1000); + Assert.assertTrue(downsampledStackSizes.get(1) + downsampledStackSizes.get(2) <= 1000); + } + + /** + * ------- + * ------- + * --- + * --- + * ------- + * ------- + */ + @Test + public void testThreeStacksWithShortStackInMiddle() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + PositionalDownsampler downsampler = new PositionalDownsampler(1000); + + downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 100)); + Assert.assertFalse(downsampler.hasDownsampledItems()); + Assert.assertTrue(downsampler.hasPendingItems()); + + downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 25, 25)); + Assert.assertFalse(downsampler.hasDownsampledItems()); + Assert.assertTrue(downsampler.hasPendingItems()); + + downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 75, 100)); + Assert.assertFalse(downsampler.hasDownsampledItems()); + Assert.assertTrue(downsampler.hasPendingItems()); + + downsampler.signalEndOfInput(); + Assert.assertTrue(downsampler.hasDownsampledItems()); + Assert.assertFalse(downsampler.hasPendingItems()); + + List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems()); + + System.out.println("testThreeStacksWithShortStackInMiddle: Downsampled Stack sizes: " + downsampledStackSizes); + + Assert.assertEquals(downsampledStackSizes.size(), 3); + Assert.assertTrue(downsampledStackSizes.get(0) <= 1000); + Assert.assertTrue(downsampledStackSizes.get(1) <= 1000); + Assert.assertTrue(downsampledStackSizes.get(2) <= 1000); + Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(1) <= 1000); + Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(2) <= 1000); + } + + /** + * ------ + * ------ + * ------- + * ------- + * --- + * --- + */ + @Test + public void testThreeStacksWithShortStackAtEnd() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + PositionalDownsampler downsampler = new PositionalDownsampler(1000); + + downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 100)); + Assert.assertFalse(downsampler.hasDownsampledItems()); + Assert.assertTrue(downsampler.hasPendingItems()); + + downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 50, 100)); + Assert.assertFalse(downsampler.hasDownsampledItems()); + Assert.assertTrue(downsampler.hasPendingItems()); + + downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 135, 25)); + Assert.assertFalse(downsampler.hasDownsampledItems()); + Assert.assertTrue(downsampler.hasPendingItems()); + + downsampler.signalEndOfInput(); + Assert.assertTrue(downsampler.hasDownsampledItems()); + Assert.assertFalse(downsampler.hasPendingItems()); + + List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems()); + + System.out.println("testThreeStacksWithShortStackAtEnd: Downsampled Stack sizes: " + downsampledStackSizes); + + Assert.assertEquals(downsampledStackSizes.size(), 3); + Assert.assertTrue(downsampledStackSizes.get(0) <= 1000); + Assert.assertTrue(downsampledStackSizes.get(1) <= 1000); + Assert.assertTrue(downsampledStackSizes.get(2) <= 1000); + Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(1) <= 1000); + Assert.assertTrue(downsampledStackSizes.get(1) + downsampledStackSizes.get(2) <= 1000); + } + + /** + * ------- + * ---- + * ------- + * ---- + * ------- + * ------- + */ + @Test + public void testThreePartiallyOverlappingStacks() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + PositionalDownsampler downsampler = new PositionalDownsampler(1000); + + downsampler.submit(createStackOfVaryingReads(2000, header, "foo", 0, 1, 100, 50)); + Assert.assertFalse(downsampler.hasDownsampledItems()); + Assert.assertTrue(downsampler.hasPendingItems()); + + downsampler.submit(createStackOfVaryingReads(2000, header, "foo", 0, 75, 100, 50)); + Assert.assertFalse(downsampler.hasDownsampledItems()); + Assert.assertTrue(downsampler.hasPendingItems()); + + downsampler.submit(createStackOfIdenticalReads(2000, header, "foo", 0, 150, 100)); + Assert.assertFalse(downsampler.hasDownsampledItems()); + Assert.assertTrue(downsampler.hasPendingItems()); + + downsampler.signalEndOfInput(); + Assert.assertTrue(downsampler.hasDownsampledItems()); + Assert.assertFalse(downsampler.hasPendingItems()); + + List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems()); + + System.out.println("testThreePartiallyOverlappingStacks: Downsampled Stack sizes: " + downsampledStackSizes); + + Assert.assertEquals(downsampledStackSizes.size(), 3); + Assert.assertTrue(downsampledStackSizes.get(0) <= 1000); + Assert.assertTrue(downsampledStackSizes.get(1) <= 1000); + Assert.assertTrue(downsampledStackSizes.get(2) <= 1000); + + // TODO: need to examine per-base coverage here + } + + @Test + public void testNoDownsamplingRequired() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + PositionalDownsampler downsampler = new PositionalDownsampler(1000); + + downsampler.submit(createStackOfIdenticalReads(300, header, "foo", 0, 1, 100)); + Assert.assertFalse(downsampler.hasDownsampledItems()); + Assert.assertTrue(downsampler.hasPendingItems()); + + downsampler.submit(createStackOfIdenticalReads(300, header, "foo", 0, 25, 100)); + Assert.assertFalse(downsampler.hasDownsampledItems()); + Assert.assertTrue(downsampler.hasPendingItems()); + + downsampler.submit(createStackOfIdenticalReads(300, header, "foo", 0, 50, 100)); + Assert.assertFalse(downsampler.hasDownsampledItems()); + Assert.assertTrue(downsampler.hasPendingItems()); + + downsampler.signalEndOfInput(); + Assert.assertTrue(downsampler.hasDownsampledItems()); + Assert.assertFalse(downsampler.hasPendingItems()); + + List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems()); + + System.out.println("testNoDownsamplingRequired: Downsampled Stack sizes: " + downsampledStackSizes); + + Assert.assertEquals(downsampledStackSizes.size(), 3); + Assert.assertTrue(downsampledStackSizes.get(0) == 300); + Assert.assertTrue(downsampledStackSizes.get(1) == 300); + Assert.assertTrue(downsampledStackSizes.get(2) == 300); + } + + @Test + public void testGATKSAMRecordSupport() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + PositionalDownsampler downsampler = new PositionalDownsampler(1000); + + List reads = new ArrayList(); + for ( int i = 0; i < 10; i++ ) { + reads.add(ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 10, 20 * i + 10)); + } + + downsampler.submit(reads); + downsampler.signalEndOfInput(); + List downsampledReads = downsampler.consumeDownsampledItems(); + + Assert.assertTrue(downsampledReads.size() == 10); + } + + private ArrayList createStackOfIdenticalReads( int stackSize, SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) { + ArrayList stack = new ArrayList(stackSize); + for ( int i = 1; i <= stackSize; i++ ) { + stack.add(ArtificialSAMUtils.createArtificialRead(header, name, refIndex, alignmentStart, length)); + } + return stack; + } + + private ArrayList createStackOfVaryingReads( int stackSize, SAMFileHeader header, String name, int refIndex, int alignmentStart, int firstLength, int secondLength ) { + ArrayList stack = createStackOfIdenticalReads(stackSize / 2, header, name, refIndex, alignmentStart, firstLength); + stack.addAll(createStackOfIdenticalReads(stackSize / 2, header, name, refIndex, alignmentStart, secondLength)); + return stack; + } + + private List getDownsampledStackSizesAndVerifySortedness( List downsampledReads ) { + List stackSizes = new ArrayList(); + Iterator iter = downsampledReads.iterator(); + Assert.assertTrue(iter.hasNext()); + + SAMRecord previousRead = iter.next(); + int currentStackSize = 1; + + while ( iter.hasNext() ) { + SAMRecord currentRead = iter.next(); + + if ( ! currentRead.getReferenceIndex().equals(previousRead.getReferenceIndex()) || currentRead.getAlignmentStart() > previousRead.getAlignmentStart() ) { + stackSizes.add(currentStackSize); + currentStackSize = 1; + } + else if ( currentRead.getAlignmentStart() < previousRead.getAlignmentStart() ) { + Assert.fail(String.format("Reads are out of order: %s %s", previousRead, currentRead)); + } + else { + currentStackSize++; + } + + previousRead = currentRead; + } + + stackSizes.add(currentStackSize); + return stackSizes; + } +} +