diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index 8d1fa4638..dc3d67283 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -125,7 +125,14 @@ public class GATKArgumentCollection { @Argument(fullName = "downsample_to_fraction", shortName = "dfrac", doc = "Fraction [0.0-1.0] of reads to downsample to", required = false) public Double downsampleFraction = null; - @Argument(fullName = "downsample_to_coverage", shortName = "dcov", doc = "Coverage [integer] to downsample to at any given locus; note that downsampled reads are randomly selected from all possible reads at a locus. For non-locus-based traversals (eg., ReadWalkers), this sets the maximum number of reads at each alignment start position.", required = false) + @Argument(fullName = "downsample_to_coverage", shortName = "dcov", + doc = "Coverage [integer] to downsample to. For locus-based traversals (eg., LocusWalkers and ActiveRegionWalkers)," + + "this controls the maximum depth of coverage at each locus. For non-locus-based traversals (eg., ReadWalkers), " + + "this controls the maximum number of reads sharing the same alignment start position. Note that the " + + "coverage target is an approximate goal that is not guaranteed to be met exactly: the GATK's approach " + + "to downsampling is based on even representation of reads from all alignment start positions, and the " + + "downsampling algorithm will under some circumstances retain slightly more coverage than requested.", + required = false) public Integer downsampleCoverage = null; /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/Downsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/Downsampler.java index 23b16cff2..466ade1ed 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/Downsampler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/Downsampler.java @@ -25,19 +25,27 @@ package org.broadinstitute.sting.gatk.downsampling; +import org.broadinstitute.sting.utils.locusiterator.AlignmentStateMachine; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + import java.util.Collection; import java.util.List; /** * The basic downsampler API, with no reads-specific operations. * - * Downsamplers that extend this interface rather than the ReadsDownsampler interface can handle + * Downsamplers that extend this class rather than the ReadsDownsampler class can handle * any kind of item, however they cannot be wrapped within a DownsamplingReadsIterator or a * PerSampleDownsamplingReadsIterator. * * @author David Roazen */ -public interface Downsampler { +public abstract class Downsampler { + + /** + * Number of items discarded by this downsampler since the last call to resetStats() + */ + protected int numDiscardedItems = 0; /** * Submit one item to the downsampler for consideration. Some downsamplers will be able to determine @@ -46,7 +54,7 @@ public interface Downsampler { * * @param item the individual item to submit to the downsampler for consideration */ - public void submit( T item ); + public abstract void submit( final T item ); /** * Submit a collection of items to the downsampler for consideration. Should be equivalent to calling @@ -54,21 +62,29 @@ public interface Downsampler { * * @param items the collection of items to submit to the downsampler for consideration */ - public void submit( Collection items ); + public void submit( final Collection items ) { + if ( items == null ) { + throw new IllegalArgumentException("submitted items must not be null"); + } + + for ( final T item : items ) { + submit(item); + } + } /** * Are there items that have survived the downsampling process waiting to be retrieved? * * @return true if this downsampler has > 0 finalized items, otherwise false */ - public boolean hasFinalizedItems(); + public abstract boolean hasFinalizedItems(); /** * Return (and *remove*) all items that have survived downsampling and are waiting to be retrieved. * * @return a list of all finalized items this downsampler contains, or an empty list if there are none */ - public List consumeFinalizedItems(); + public abstract List consumeFinalizedItems(); /** * Are there items stored in this downsampler that it doesn't yet know whether they will @@ -76,7 +92,7 @@ public interface Downsampler { * * @return true if this downsampler has > 0 pending items, otherwise false */ - public boolean hasPendingItems(); + public abstract boolean hasPendingItems(); /** * Peek at the first finalized item stored in this downsampler (or null if there are no finalized items) @@ -84,7 +100,7 @@ public interface Downsampler { * @return the first finalized item in this downsampler (the item is not removed from the downsampler by this call), * or null if there are none */ - public T peekFinalized(); + public abstract T peekFinalized(); /** * Peek at the first pending item stored in this downsampler (or null if there are no pending items) @@ -92,7 +108,7 @@ public interface Downsampler { * @return the first pending item stored in this downsampler (the item is not removed from the downsampler by this call), * or null if there are none */ - public T peekPending(); + public abstract T peekPending(); /** * Get the current number of items in this downsampler @@ -103,7 +119,7 @@ public interface Downsampler { * * @return a positive integer */ - public int size(); + public abstract int size(); /** * Returns the number of items discarded (so far) during the downsampling process @@ -111,21 +127,46 @@ public interface Downsampler { * @return the number of items that have been submitted to this downsampler and discarded in the process of * downsampling */ - public int getNumberOfDiscardedItems(); + public int getNumberOfDiscardedItems() { + return numDiscardedItems; + } /** * Used to tell the downsampler that no more items will be submitted to it, and that it should * finalize any pending items. */ - public void signalEndOfInput(); + public abstract void signalEndOfInput(); /** * Empty the downsampler of all finalized/pending items */ - public void clear(); + public abstract void clearItems(); /** * Reset stats in the downsampler such as the number of discarded items *without* clearing the downsampler of items */ - public void reset(); + public void resetStats() { + numDiscardedItems = 0; + } + + /** + * Indicates whether an item should be excluded from elimination during downsampling. By default, + * all items representing reduced reads are excluded from downsampling, but individual downsamplers + * may override if they are able to handle reduced reads correctly. Downsamplers should check + * the return value of this method before discarding an item. + * + * @param item The item to test + * @return true if the item should not be subject to elimination during downsampling, otherwise false + */ + protected boolean doNotDiscardItem( final Object item ) { + // Use getClass() rather than instanceof for performance reasons. Ugly but fast. + if ( item.getClass() == GATKSAMRecord.class ) { + return ((GATKSAMRecord)item).isReducedRead(); + } + else if ( item.getClass() == AlignmentStateMachine.class ) { + return ((AlignmentStateMachine)item).isReducedRead(); + } + + return false; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsampler.java index 1cede9c33..c40f8019e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsampler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsampler.java @@ -30,7 +30,6 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.ArrayList; -import java.util.Collection; import java.util.List; /** @@ -41,13 +40,11 @@ import java.util.List; * * @author David Roazen */ -public class FractionalDownsampler implements ReadsDownsampler { +public class FractionalDownsampler extends ReadsDownsampler { private ArrayList selectedReads; - private int cutoffForInclusion; - - private int numDiscardedItems; + private final int cutoffForInclusion; private static final int RANDOM_POOL_SIZE = 10000; @@ -57,18 +54,19 @@ public class FractionalDownsampler implements ReadsDownsamp * @param fraction Fraction of reads to preserve, between 0.0 (inclusive) and 1.0 (inclusive). * Actual number of reads preserved may differ randomly. */ - public FractionalDownsampler( double fraction ) { + public FractionalDownsampler( final double fraction ) { if ( fraction < 0.0 || fraction > 1.0 ) { throw new ReviewedStingException("Fraction of reads to include must be between 0.0 and 1.0, inclusive"); } cutoffForInclusion = (int)(fraction * RANDOM_POOL_SIZE); - clear(); - reset(); + clearItems(); + resetStats(); } - public void submit( T newRead ) { - if ( GenomeAnalysisEngine.getRandomGenerator().nextInt(10000) < cutoffForInclusion ) { + @Override + public void submit( final T newRead ) { + if ( GenomeAnalysisEngine.getRandomGenerator().nextInt(10000) < cutoffForInclusion || doNotDiscardItem(newRead) ) { selectedReads.add(newRead); } else { @@ -76,61 +74,56 @@ public class FractionalDownsampler implements ReadsDownsamp } } - public void submit( Collection newReads ) { - for ( T read : newReads ) { - submit(read); - } - } - + @Override public boolean hasFinalizedItems() { return selectedReads.size() > 0; } + @Override public List consumeFinalizedItems() { // pass by reference rather than make a copy, for speed List downsampledItems = selectedReads; - clear(); + clearItems(); return downsampledItems; } + @Override public boolean hasPendingItems() { return false; } + @Override public T peekFinalized() { return selectedReads.isEmpty() ? null : selectedReads.get(0); } + @Override public T peekPending() { return null; } - public int getNumberOfDiscardedItems() { - return numDiscardedItems; - } - @Override public int size() { return selectedReads.size(); } + @Override public void signalEndOfInput() { // NO-OP } - public void clear() { + @Override + public void clearItems() { selectedReads = new ArrayList(); } - public void reset() { - numDiscardedItems = 0; - } - + @Override public boolean requiresCoordinateSortOrder() { return false; } - public void signalNoMoreReadsBefore( T read ) { + @Override + public void signalNoMoreReadsBefore( final T read ) { // NO-OP } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java index 4ff729537..3ce4d09d6 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java @@ -46,16 +46,15 @@ import java.util.*; * * @author David Roazen */ -public class LevelingDownsampler, E> implements Downsampler { +public class LevelingDownsampler, E> extends Downsampler { private final int minElementsPerStack; + private final int targetSize; private List groups; private boolean groupsAreFinalized; - private int numDiscardedItems; - /** * Construct a LevelingDownsampler * @@ -65,7 +64,7 @@ public class LevelingDownsampler, E> implements Downsampler * this value -- if it does, items are removed from Lists evenly until the total size * is <= this value */ - public LevelingDownsampler( int targetSize ) { + public LevelingDownsampler( final int targetSize ) { this(targetSize, 1); } @@ -79,55 +78,58 @@ public class LevelingDownsampler, E> implements Downsampler * if a stack has only 3 elements and minElementsPerStack is 3, no matter what * we'll not reduce this stack below 3. */ - public LevelingDownsampler(final int targetSize, final int minElementsPerStack) { + public LevelingDownsampler( final int targetSize, final int minElementsPerStack ) { if ( targetSize < 0 ) throw new IllegalArgumentException("targetSize must be >= 0 but got " + targetSize); if ( minElementsPerStack < 0 ) throw new IllegalArgumentException("minElementsPerStack must be >= 0 but got " + minElementsPerStack); this.targetSize = targetSize; this.minElementsPerStack = minElementsPerStack; - clear(); - reset(); + clearItems(); + resetStats(); } - public void submit( T item ) { + @Override + public void submit( final T item ) { groups.add(item); } - public void submit( Collection items ){ + @Override + public void submit( final Collection items ){ groups.addAll(items); } + @Override public boolean hasFinalizedItems() { return groupsAreFinalized && groups.size() > 0; } + @Override public List consumeFinalizedItems() { if ( ! hasFinalizedItems() ) { return new ArrayList(); } // pass by reference rather than make a copy, for speed - List toReturn = groups; - clear(); + final List toReturn = groups; + clearItems(); return toReturn; } + @Override public boolean hasPendingItems() { return ! groupsAreFinalized && groups.size() > 0; } + @Override public T peekFinalized() { return hasFinalizedItems() ? groups.get(0) : null; } + @Override public T peekPending() { return hasPendingItems() ? groups.get(0) : null; } - public int getNumberOfDiscardedItems() { - return numDiscardedItems; - } - @Override public int size() { int s = 0; @@ -137,26 +139,24 @@ public class LevelingDownsampler, E> implements Downsampler return s; } + @Override public void signalEndOfInput() { levelGroups(); groupsAreFinalized = true; } - public void clear() { + @Override + public void clearItems() { groups = new ArrayList(); groupsAreFinalized = false; } - public void reset() { - numDiscardedItems = 0; - } - private void levelGroups() { + final int[] groupSizes = new int[groups.size()]; int totalSize = 0; - int[] groupSizes = new int[groups.size()]; int currentGroupIndex = 0; - for ( T group : groups ) { + for ( final T group : groups ) { groupSizes[currentGroupIndex] = group.size(); totalSize += groupSizes[currentGroupIndex]; currentGroupIndex++; @@ -191,20 +191,18 @@ public class LevelingDownsampler, E> implements Downsampler // Now we actually go through and reduce each group to its new count as specified in groupSizes currentGroupIndex = 0; - for ( T group : groups ) { + for ( final T group : groups ) { downsampleOneGroup(group, groupSizes[currentGroupIndex]); currentGroupIndex++; } } - private void downsampleOneGroup( T group, int numItemsToKeep ) { + private void downsampleOneGroup( final T group, final int numItemsToKeep ) { if ( numItemsToKeep >= group.size() ) { return; } - numDiscardedItems += group.size() - numItemsToKeep; - - BitSet itemsToKeep = new BitSet(group.size()); + final BitSet itemsToKeep = new BitSet(group.size()); for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(group.size(), numItemsToKeep) ) { itemsToKeep.set(selectedIndex); } @@ -213,12 +211,13 @@ public class LevelingDownsampler, E> implements Downsampler // If our group is a linked list, we can remove the desired items in a single O(n) pass with an iterator if ( group instanceof LinkedList ) { - Iterator iter = group.iterator(); + final Iterator iter = group.iterator(); while ( iter.hasNext() ) { - iter.next(); + final E item = iter.next(); - if ( ! itemsToKeep.get(currentIndex) ) { + if ( ! itemsToKeep.get(currentIndex) && ! doNotDiscardItem(item) ) { iter.remove(); + numDiscardedItems++; } currentIndex++; @@ -227,14 +226,15 @@ public class LevelingDownsampler, E> implements Downsampler // If it's not a linked list, it's more efficient to copy the desired items into a new list and back rather // than suffer O(n^2) of item shifting else { - List keptItems = new ArrayList(numItemsToKeep); + final List keptItems = new ArrayList(group.size()); - for ( E item : group ) { - if ( itemsToKeep.get(currentIndex) ) { + for ( final E item : group ) { + if ( itemsToKeep.get(currentIndex) || doNotDiscardItem(item) ) { keptItems.add(item); } currentIndex++; } + numDiscardedItems += group.size() - keptItems.size(); group.clear(); group.addAll(keptItems); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java index 3aaed6c73..1eabf5038 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java @@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk.downsampling; import net.sf.samtools.SAMRecord; -import java.util.Collection; import java.util.LinkedList; import java.util.List; @@ -39,25 +38,21 @@ import java.util.List; * * @author David Roazen */ -public class PassThroughDownsampler implements ReadsDownsampler { +public class PassThroughDownsampler extends ReadsDownsampler { private LinkedList selectedReads; public PassThroughDownsampler() { - clear(); + clearItems(); } + @Override public void submit( T newRead ) { // All reads pass-through, no reads get downsampled selectedReads.add(newRead); } - public void submit( Collection newReads ) { - for ( T read : newReads ) { - submit(read); - } - } - + @Override public boolean hasFinalizedItems() { return ! selectedReads.isEmpty(); } @@ -66,50 +61,50 @@ public class PassThroughDownsampler implements ReadsDownsam * Note that this list is a linked list and so doesn't support fast random access * @return */ + @Override public List consumeFinalizedItems() { // pass by reference rather than make a copy, for speed - List downsampledItems = selectedReads; - clear(); + final List downsampledItems = selectedReads; + clearItems(); return downsampledItems; } + @Override public boolean hasPendingItems() { return false; } + @Override public T peekFinalized() { return selectedReads.isEmpty() ? null : selectedReads.getFirst(); } + @Override public T peekPending() { return null; } - public int getNumberOfDiscardedItems() { - return 0; - } - @Override public int size() { return selectedReads.size(); } + @Override public void signalEndOfInput() { // NO-OP } - public void clear() { + @Override + public void clearItems() { selectedReads = new LinkedList(); } - public void reset() { - // NO-OP - } - + @Override public boolean requiresCoordinateSortOrder() { return false; } + @Override public void signalNoMoreReadsBefore( T read ) { // NO-OP } diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsampler.java index a878d7553..a8df014e5 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsampler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsampler.java @@ -32,14 +32,14 @@ import net.sf.samtools.SAMRecord; * * @author David Roazen */ -public interface ReadsDownsampler extends Downsampler { +public abstract class ReadsDownsampler extends Downsampler { /** * Does this downsampler require that reads be fed to it in coordinate order? * * @return true if reads must be submitted to this downsampler in coordinate order, otherwise false */ - public boolean requiresCoordinateSortOrder(); + public abstract boolean requiresCoordinateSortOrder(); /** * Tell this downsampler that no more reads located before the provided read (according to @@ -52,5 +52,5 @@ public interface ReadsDownsampler extends Downsampler { * @param read the downsampler will assume that no reads located before this read will ever * be submitted to it in the future */ - public void signalNoMoreReadsBefore( T read ); + public abstract void signalNoMoreReadsBefore( final T read ); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java index 0e6bbfcb6..ff085d17b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java @@ -39,7 +39,12 @@ import java.util.*; * * @author David Roazen */ -public class ReservoirDownsampler implements ReadsDownsampler { +public class ReservoirDownsampler extends ReadsDownsampler { + + /** + * size of our reservoir -- ie., the maximum number of reads from the stream that will be retained + * (not including any undiscardable items) + */ private final int targetSampleSize; /** @@ -58,17 +63,33 @@ public class ReservoirDownsampler implements ReadsDownsampl */ private List reservoir; + /** + * Certain items (eg., reduced reads) cannot be discarded at all during downsampling. We store + * these items separately so as not to impact the fair selection of items for inclusion in the + * reservoir. These items are returned (and cleared) along with any items in the reservoir in + * calls to consumeFinalizedItems(). + */ + private List undiscardableItems; + + /** + * Are we currently using a linked list for the reservoir? + */ private boolean isLinkedList; - private int totalReadsSeen; + /** + * Count of the number of reads seen that were actually eligible for discarding. Used by the reservoir downsampling + * algorithm to ensure that all discardable reads have an equal chance of making it into the reservoir. + */ + private int totalDiscardableReadsSeen; - private int numDiscardedItems; /** * Construct a ReservoirDownsampler * * @param targetSampleSize Size of the reservoir used by this downsampler. Number of items retained - * after downsampling will be min(totalReads, targetSampleSize) + * after downsampling will be min(totalDiscardableReads, targetSampleSize) + any + * undiscardable reads (eg., reduced reads). + * * @param expectFewOverflows if true, this downsampler will be optimized for the case * where most of the time we won't fill up anything like the * targetSampleSize elements. If this is false, we will allocate @@ -76,15 +97,15 @@ public class ReservoirDownsampler implements ReadsDownsampl * the cost of allocation if we often use targetSampleSize or more * elements. */ - public ReservoirDownsampler ( final int targetSampleSize, final boolean expectFewOverflows) { + public ReservoirDownsampler ( final int targetSampleSize, final boolean expectFewOverflows ) { if ( targetSampleSize <= 0 ) { throw new ReviewedStingException("Cannot do reservoir downsampling with a sample size <= 0"); } this.targetSampleSize = targetSampleSize; this.expectFewOverflows = expectFewOverflows; - clear(); - reset(); + clearItems(); + resetStats(); } /** @@ -93,15 +114,21 @@ public class ReservoirDownsampler implements ReadsDownsampl * @param targetSampleSize Size of the reservoir used by this downsampler. Number of items retained * after downsampling will be min(totalReads, targetSampleSize) */ - public ReservoirDownsampler ( int targetSampleSize ) { + public ReservoirDownsampler ( final int targetSampleSize ) { this(targetSampleSize, false); } + @Override + public void submit ( final T newRead ) { + if ( doNotDiscardItem(newRead) ) { + undiscardableItems.add(newRead); + return; + } - public void submit ( T newRead ) { - totalReadsSeen++; + // Only count reads that are actually eligible for discarding for the purposes of the reservoir downsampling algorithm + totalDiscardableReadsSeen++; - if ( totalReadsSeen <= targetSampleSize ) { + if ( totalDiscardableReadsSeen <= targetSampleSize ) { reservoir.add(newRead); } else { @@ -110,7 +137,7 @@ public class ReservoirDownsampler implements ReadsDownsampl isLinkedList = false; } - final int randomSlot = GenomeAnalysisEngine.getRandomGenerator().nextInt(totalReadsSeen); + final int randomSlot = GenomeAnalysisEngine.getRandomGenerator().nextInt(totalDiscardableReadsSeen); if ( randomSlot < targetSampleSize ) { reservoir.set(randomSlot, newRead); } @@ -118,49 +145,46 @@ public class ReservoirDownsampler implements ReadsDownsampl } } - public void submit ( Collection newReads ) { - for ( T read : newReads ) { - submit(read); - } - } - + @Override public boolean hasFinalizedItems() { - return reservoir.size() > 0; + return ! reservoir.isEmpty() || ! undiscardableItems.isEmpty(); } + @Override public List consumeFinalizedItems() { - if ( reservoir.isEmpty() ) { - // if there's nothing here, don't both allocating a new list completely + if ( ! hasFinalizedItems() ) { + // if there's nothing here, don't bother allocating a new list return Collections.emptyList(); } else { - // pass by reference rather than make a copy, for speed - List downsampledItems = reservoir; - clear(); + // pass reservoir by reference rather than make a copy, for speed + final List downsampledItems = reservoir; + downsampledItems.addAll(undiscardableItems); + clearItems(); return downsampledItems; } } + @Override public boolean hasPendingItems() { return false; } + @Override public T peekFinalized() { - return reservoir.isEmpty() ? null : reservoir.get(0); + return ! reservoir.isEmpty() ? reservoir.get(0) : (! undiscardableItems.isEmpty() ? undiscardableItems.get(0) : null); } + @Override public T peekPending() { return null; } - public int getNumberOfDiscardedItems() { - return numDiscardedItems; + @Override + public int size() { + return reservoir.size() + undiscardableItems.size(); } @Override - public int size() { - return reservoir.size(); - } - public void signalEndOfInput() { // NO-OP } @@ -168,25 +192,27 @@ public class ReservoirDownsampler implements ReadsDownsampl /** * Clear the data structures used to hold information */ - public void clear() { + @Override + public void clearItems() { // if we aren't expecting many overflows, allocate a linked list not an arraylist reservoir = expectFewOverflows ? new LinkedList() : new ArrayList(targetSampleSize); + // there's no possibility of overflow with the undiscardable items, so we always use a linked list for them + undiscardableItems = new LinkedList<>(); + // it's a linked list if we allocate one isLinkedList = expectFewOverflows; - // an internal stat used by the downsampling process, so not cleared by reset() below - totalReadsSeen = 0; - } - - public void reset() { - numDiscardedItems = 0; + // an internal stat used by the downsampling process, so not cleared by resetStats() below + totalDiscardableReadsSeen = 0; } + @Override public boolean requiresCoordinateSortOrder() { return false; } + @Override public void signalNoMoreReadsBefore( T read ) { // NO-OP } diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsampler.java index 7c6c043c2..897e2c05e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsampler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsampler.java @@ -35,11 +35,11 @@ import java.util.*; * * @author David Roazen */ -public class SimplePositionalDownsampler implements ReadsDownsampler { +public class SimplePositionalDownsampler extends ReadsDownsampler { - private int targetCoverage; + private final int targetCoverage; - private ReservoirDownsampler reservoir; + private final ReservoirDownsampler reservoir; private int currentContigIndex; @@ -51,97 +51,93 @@ public class SimplePositionalDownsampler implements ReadsDo private ArrayList finalizedReads; - private int numDiscardedItems; /** * Construct a SimplePositionalDownsampler * * @param targetCoverage Maximum number of reads that may share any given alignment start position */ - public SimplePositionalDownsampler( int targetCoverage ) { + public SimplePositionalDownsampler( final int targetCoverage ) { this.targetCoverage = targetCoverage; reservoir = new ReservoirDownsampler(targetCoverage); finalizedReads = new ArrayList(); - clear(); - reset(); + clearItems(); + resetStats(); } - public void submit( T newRead ) { + @Override + public void submit( final T newRead ) { updatePositionalState(newRead); if ( unmappedReadsReached ) { // don't downsample the unmapped reads at the end of the stream finalizedReads.add(newRead); } else { - int reservoirPreviouslyDiscardedItems = reservoir.getNumberOfDiscardedItems(); + final int reservoirPreviouslyDiscardedItems = reservoir.getNumberOfDiscardedItems(); + // our reservoir downsampler will call doNotDiscardItem() for us to exclude items from elimination as appropriate reservoir.submit(newRead); numDiscardedItems += reservoir.getNumberOfDiscardedItems() - reservoirPreviouslyDiscardedItems; } } - public void submit( Collection newReads ) { - for ( T read : newReads ) { - submit(read); - } - } - + @Override public boolean hasFinalizedItems() { return finalizedReads.size() > 0; } + @Override public List consumeFinalizedItems() { // pass by reference rather than make a copy, for speed - List toReturn = finalizedReads; + final List toReturn = finalizedReads; finalizedReads = new ArrayList(); return toReturn; } + @Override public boolean hasPendingItems() { return reservoir.hasFinalizedItems(); } + @Override public T peekFinalized() { return finalizedReads.isEmpty() ? null : finalizedReads.get(0); } + @Override public T peekPending() { return reservoir.peekFinalized(); } - public int getNumberOfDiscardedItems() { - return numDiscardedItems; - } - @Override public int size() { return finalizedReads.size() + reservoir.size(); } + @Override public void signalEndOfInput() { finalizeReservoir(); } - public void clear() { - reservoir.clear(); - reservoir.reset(); + @Override + public void clearItems() { + reservoir.clearItems(); + reservoir.resetStats(); finalizedReads.clear(); positionEstablished = false; unmappedReadsReached = false; } - public void reset() { - numDiscardedItems = 0; - } - + @Override public boolean requiresCoordinateSortOrder() { return true; } - public void signalNoMoreReadsBefore( T read ) { + @Override + public void signalNoMoreReadsBefore( final T read ) { updatePositionalState(read); } - private void updatePositionalState( T newRead ) { + private void updatePositionalState( final T newRead ) { if ( readIsPastCurrentPosition(newRead) ) { if ( reservoir.hasFinalizedItems() ) { finalizeReservoir(); @@ -155,13 +151,13 @@ public class SimplePositionalDownsampler implements ReadsDo } } - private void setCurrentPosition( T read ) { + private void setCurrentPosition( final T read ) { currentContigIndex = read.getReferenceIndex(); currentAlignmentStart = read.getAlignmentStart(); positionEstablished = true; } - private boolean readIsPastCurrentPosition( T read ) { + private boolean readIsPastCurrentPosition( final T read ) { return ! positionEstablished || read.getReferenceIndex() > currentContigIndex || read.getAlignmentStart() > currentAlignmentStart || @@ -170,6 +166,6 @@ public class SimplePositionalDownsampler implements ReadsDo private void finalizeReservoir() { finalizedReads.addAll(reservoir.consumeFinalizedItems()); - reservoir.reset(); + reservoir.resetStats(); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCache.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCache.java index 80da8f8eb..424bd489e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCache.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCache.java @@ -43,17 +43,42 @@ import java.util.List; * Time: 11:23 AM */ public class TAROrderedReadCache { - final int maxCapacity; - final Downsampler downsampler; + private final int maxCapacity; + private ArrayList undownsampledCache; + private Downsampler downsampler; + + private static final int UNDOWNSAMPLED_CACHE_MAX_INITIAL_SIZE = 10000; /** * Create a new empty ReadCache * @param maxCapacity the max capacity of the read cache. */ - public TAROrderedReadCache(int maxCapacity) { + public TAROrderedReadCache( final int maxCapacity ) { if ( maxCapacity < 0 ) throw new IllegalArgumentException("maxCapacity must be >= 0 but got " + maxCapacity); this.maxCapacity = maxCapacity; - this.downsampler = new ReservoirDownsampler(maxCapacity); + + // The one we're not currently using will always be null: + initializeUndownsampledCache(); + this.downsampler = null; + } + + /** + * Moves all reads over to the downsampler, causing it to be used from this point on. Should be called + * when the undownsampledCache fills up and we need to start discarding reads. Since the + * ReservoirDownsampler doesn't preserve relative ordering, pop operations become expensive + * after this point, as they require a O(n log n) sort. + */ + private void activateDownsampler() { + downsampler = new ReservoirDownsampler<>(maxCapacity, false); + downsampler.submit(undownsampledCache); + undownsampledCache = null; // preferable to the O(n) clear() method + } + + /** + * Allocate the undownsampled cache used when we have fewer than maxCapacity items + */ + private void initializeUndownsampledCache() { + undownsampledCache = new ArrayList<>(Math.min(maxCapacity + 1, UNDOWNSAMPLED_CACHE_MAX_INITIAL_SIZE)); } /** @@ -68,18 +93,31 @@ public class TAROrderedReadCache { * Add a single read to this cache. Assumed to be in sorted order w.r.t. the previously added reads * @param read a read to add */ - public void add(final GATKSAMRecord read) { + public void add( final GATKSAMRecord read ) { if ( read == null ) throw new IllegalArgumentException("Read cannot be null"); - downsampler.submit(read); + + if ( downsampler != null ) { + downsampler.submit(read); + } + else { + undownsampledCache.add(read); + + // No more room in the undownsampledCache? Time to start downsampling + if ( undownsampledCache.size() > maxCapacity ) { + activateDownsampler(); + } + } } /** * Add a collection of reads to this cache. Assumed to be in sorted order w.r.t. the previously added reads and each other * @param reads a collection of reads to add */ - public void addAll(final List reads) { + public void addAll( final List reads ) { if ( reads == null ) throw new IllegalArgumentException("Reads cannot be null"); - downsampler.submit(reads); + for ( final GATKSAMRecord read : reads ) { + add(read); + } } /** @@ -87,40 +125,44 @@ public class TAROrderedReadCache { * @return a positive integer */ public int size() { - return downsampler.size(); + return downsampler != null ? downsampler.size() : undownsampledCache.size(); } /** * How many reads were discarded since the last call to popCurrentReads - * @return + * + * @return number of items discarded during downsampling since last pop operation */ public int getNumDiscarded() { - return downsampler.getNumberOfDiscardedItems(); + return downsampler != null ? downsampler.getNumberOfDiscardedItems() : 0; } /** * Removes all reads currently in the cache, and returns them in sorted order (w.r.t. alignmentStart) * - * Flushes this cache, so after this call the cache will contain no reads and all downsampling stats will - * be reset. + * Flushes this cache, so after this call the cache will contain no reads, and we'll be in the same + * initial state as the constructor would put us in, with a non-null undownsampledCache and a null + * downsampler. * * @return a list of GATKSAMRecords in this cache */ public List popCurrentReads() { - final List maybeUnordered = downsampler.consumeFinalizedItems(); + final List poppedReads; - final List ordered; - if ( downsampler.getNumberOfDiscardedItems() == 0 ) { - // haven't discarded anything, so the reads are ordered properly - ordered = maybeUnordered; - } else { - // we need to sort these damn things: O(n log n) - ordered = new ArrayList(maybeUnordered); - Collections.sort(ordered, new AlignmentStartComparator()); + if ( downsampler == null ) { + poppedReads = undownsampledCache; // avoid making a copy here, since we're going to allocate a new cache + } + else { + // If we triggered the downsampler, we need to sort the reads before returning them, + // since the ReservoirDownsampler is not guaranteed to preserve relative ordering of items. + // After consuming the downsampled items in this call to popCurrentReads(), we switch back + // to using the undownsampledCache until we fill up again. + poppedReads = downsampler.consumeFinalizedItems(); // avoid making a copy here + Collections.sort(poppedReads, new AlignmentStartComparator()); + downsampler = null; } - // reset the downsampler stats so getNumberOfDiscardedItems is 0 - downsampler.reset(); - return ordered; + initializeUndownsampledCache(); + return poppedReads; } } diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachine.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachine.java index c4b566582..86f3500be 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachine.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachine.java @@ -123,6 +123,15 @@ public class AlignmentStateMachine { return getRead().getReferenceIndex(); } + /** + * Is our read a reduced read? + * + * @return true if the read we encapsulate is a reduced read, otherwise false + */ + public boolean isReducedRead() { + return read.isReducedRead(); + } + /** * Is this the left edge state? I.e., one that is before or after the current read? * @return true if this state is an edge state, false otherwise diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManager.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManager.java index 2caaf9d27..669e76adc 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManager.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManager.java @@ -167,7 +167,7 @@ final class PerSampleReadStateManager implements Iterable // use returned List directly rather than make a copy, for efficiency's sake readStatesByAlignmentStart = flattenByAlignmentStart(levelingDownsampler.consumeFinalizedItems()); - levelingDownsampler.reset(); + levelingDownsampler.resetStats(); } return nStatesAdded; diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/SamplePartitioner.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/SamplePartitioner.java index 49a8d10aa..9122beebb 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/SamplePartitioner.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/SamplePartitioner.java @@ -164,8 +164,8 @@ class SamplePartitioner { @Ensures("doneSubmittingReads == false") public void reset() { for ( final Downsampler downsampler : readsBySample.values() ) { - downsampler.clear(); - downsampler.reset(); + downsampler.clearItems(); + downsampler.resetStats(); } doneSubmittingReads = false; } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java index b8367a7df..055f8630b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java @@ -324,6 +324,31 @@ public class ArtificialSAMUtils { return Arrays.asList(left, right); } + /** + * Create an artificial reduced read based on the parameters. The cigar string will be *M, where * is the + * length of the read. The base counts specified in the baseCounts array will be stored fully encoded in + * the RR attribute. + * + * @param header the SAM header to associate the read with + * @param name the name of the read + * @param refIndex the reference index, i.e. what chromosome to associate it with + * @param alignmentStart where to start the alignment + * @param length the length of the read + * @param baseCounts reduced base counts to encode in the RR attribute; length must match the read length + * @return the artificial reduced read + */ + public static GATKSAMRecord createArtificialReducedRead( final SAMFileHeader header, + final String name, + final int refIndex, + final int alignmentStart, + final int length, + final int[] baseCounts ) { + final GATKSAMRecord read = createArtificialRead(header, name, refIndex, alignmentStart, length); + read.setReducedReadCounts(baseCounts); + read.setReducedReadCountsTag(); + return read; + } + /** * Create a collection of identical artificial reads based on the parameters. The cigar string for each * read will be *M, where * is the length of the read. diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerUnitTest.java index 6f18d794f..8f0eee069 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerUnitTest.java @@ -30,6 +30,7 @@ import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import org.testng.Assert; @@ -152,7 +153,39 @@ public class FractionalDownsamplerUnitTest extends BaseTest { Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), test.totalReads - downsampledReads.size()); - downsampler.reset(); + downsampler.resetStats(); Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); } + + @Test + public void testDoNotDiscardReducedReads() { + GenomeAnalysisEngine.resetRandomGenerator(); + final ReadsDownsampler downsampler = new FractionalDownsampler(0.0); + + final Collection reads = new ArrayList(); + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + final int[] baseCounts = { 10, 10, 10, 10, 10 }; + + for ( int i = 1; i <= 10; i++ ) { + reads.add(ArtificialSAMUtils.createArtificialReducedRead(header, "foo", 0, 1, 5, baseCounts)); + } + for ( int i = 1; i <= 5; i++ ) { + reads.add(ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 1, 5)); + } + + downsampler.submit(reads); + downsampler.signalEndOfInput(); + + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 5, "wrong number of items discarded by the downsampler"); + Assert.assertTrue(downsampler.hasFinalizedItems(), "downsampler should have finalized items but doesn't"); + Assert.assertEquals(downsampler.size(), 10, "downsampler size() reports wrong number of items"); + + final Collection readsReturned = downsampler.consumeFinalizedItems(); + + Assert.assertEquals(readsReturned.size(), 10, "wrong number of items returned by the downsampler"); + + for ( GATKSAMRecord readReturned : readsReturned ) { + Assert.assertTrue(readReturned.isReducedRead(), "non-reduced read survived the downsampling process, but shouldn't have"); + } + } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java index 972e51dcd..8cf0fd2a1 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java @@ -25,16 +25,17 @@ package org.broadinstitute.sting.gatk.downsampling; +import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.locusiterator.AlignmentStateMachine; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.annotations.Test; import org.testng.annotations.DataProvider; import org.testng.Assert; -import java.util.ArrayList; -import java.util.Collection; -import java.util.LinkedList; -import java.util.List; +import java.util.*; public class LevelingDownsamplerUnitTest extends BaseTest { @@ -158,9 +159,46 @@ public class LevelingDownsamplerUnitTest extends BaseTest { Assert.assertEquals(numItemsReportedDiscarded, numItemsActuallyDiscarded); - downsampler.reset(); + downsampler.resetStats(); Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); Assert.assertTrue(totalRemainingItems <= Math.max(test.targetSize, test.numStacks)); } + + @Test + public void testDoNotDiscardReducedReads() { + GenomeAnalysisEngine.resetRandomGenerator(); + final Downsampler> downsampler = new LevelingDownsampler, AlignmentStateMachine>(1); + + final Collection> groups = new LinkedList>(); + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + final int[] baseCounts = { 10, 10, 10, 10, 10 }; + + for ( int alignmentStart : Arrays.asList(1, 2, 3) ) { + final LinkedList group = new LinkedList(); + for ( int i = 1; i <= 10; i++ ) { + group.add(new AlignmentStateMachine(ArtificialSAMUtils.createArtificialReducedRead(header, "foo", 0, alignmentStart, 5, baseCounts))); + } + groups.add(group); + } + + downsampler.submit(groups); + downsampler.signalEndOfInput(); + + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0, "wrong number of items discarded by the downsampler"); + Assert.assertTrue(downsampler.hasFinalizedItems(), "downsampler should have finalized items but doesn't"); + Assert.assertEquals(downsampler.size(), 30, "downsampler size() reports wrong number of items"); + + final Collection> groupsReturned = downsampler.consumeFinalizedItems(); + + Assert.assertEquals(groupsReturned.size(), 3, "wrong number of groups returned by the downsampler"); + + for ( LinkedList group : groupsReturned ) { + Assert.assertEquals(group.size(), 10, "group has wrong size after downsampling"); + + for ( AlignmentStateMachine state : group ) { + Assert.assertTrue(state.isReducedRead()); + } + } + } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java index 022eb02d2..a50201efd 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java @@ -30,6 +30,7 @@ import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import org.testng.Assert; @@ -125,7 +126,49 @@ public class ReservoirDownsamplerUnitTest extends BaseTest { Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), test.expectedNumDiscardedItems); Assert.assertEquals(test.totalReads - downsampledReads.size(), test.expectedNumDiscardedItems); - downsampler.reset(); + downsampler.resetStats(); Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); } + + @Test + public void testDoNotDiscardReducedReads() { + GenomeAnalysisEngine.resetRandomGenerator(); + final ReadsDownsampler downsampler = new ReservoirDownsampler(1); + + final Collection reads = new ArrayList(); + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + final int[] baseCounts = { 10, 10, 10, 10, 10 }; + + for ( int i = 1; i <= 10; i++ ) { + reads.add(ArtificialSAMUtils.createArtificialReducedRead(header, "foo", 0, 1, 5, baseCounts)); + } + for ( int i = 1; i <= 5; i++ ) { + reads.add(ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 1, 5)); + } + + downsampler.submit(reads); + downsampler.signalEndOfInput(); + + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 4, "wrong number of items discarded by the downsampler"); + Assert.assertTrue(downsampler.hasFinalizedItems(), "downsampler should have finalized items but doesn't"); + Assert.assertEquals(downsampler.size(), 11, "downsampler size() reports wrong number of items"); + + final Collection readsReturned = downsampler.consumeFinalizedItems(); + + Assert.assertEquals(readsReturned.size(), 11, "wrong number of items returned by the downsampler"); + + int numReducedReadsReturned = 0; + int numNormalReadsReturned = 0; + for ( GATKSAMRecord readReturned : readsReturned ) { + if ( readReturned.isReducedRead() ) { + numReducedReadsReturned++; + } + else { + numNormalReadsReturned++; + } + } + + Assert.assertEquals(numReducedReadsReturned, 10, "wrong number of reduced reads returned by the downsampler"); + Assert.assertEquals(numNormalReadsReturned, 1, "wrong number of non-reduced reads returned by the downsampler"); + } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerUnitTest.java index c6b0dea29..bec0030d0 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerUnitTest.java @@ -177,7 +177,7 @@ public class SimplePositionalDownsamplerUnitTest extends BaseTest { Assert.assertEquals(numReadsActuallyEliminated, numReadsReportedEliminated); } - downsampler.reset(); + downsampler.resetStats(); Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); } @@ -328,4 +328,48 @@ public class SimplePositionalDownsamplerUnitTest extends BaseTest { Assert.assertEquals(downsampledReads.size(), 10); } + + @Test + public void testDoNotDiscardReducedReads() { + GenomeAnalysisEngine.resetRandomGenerator(); + final ReadsDownsampler downsampler = new SimplePositionalDownsampler(1); + + final Collection reads = new ArrayList(); + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + final int[] baseCounts = { 10, 10, 10, 10, 10 }; + + for ( int alignmentStart : Arrays.asList(1, 2, 3) ) { + for ( int i = 1; i <= 10; i++ ) { + reads.add(ArtificialSAMUtils.createArtificialReducedRead(header, "foo", 0, alignmentStart, 5, baseCounts)); + } + for ( int i = 1; i <= 5; i++ ) { + reads.add(ArtificialSAMUtils.createArtificialRead(header, "foo", 0, alignmentStart, 5)); + } + } + + downsampler.submit(reads); + downsampler.signalEndOfInput(); + + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 12, "wrong number of items discarded by the downsampler"); + Assert.assertTrue(downsampler.hasFinalizedItems(), "downsampler should have finalized items but doesn't"); + Assert.assertEquals(downsampler.size(), 33, "downsampler size() reports wrong number of items"); + + final Collection readsReturned = downsampler.consumeFinalizedItems(); + + Assert.assertEquals(readsReturned.size(), 33, "wrong number of items returned by the downsampler"); + + int numReducedReadsReturned = 0; + int numNormalReadsReturned = 0; + for ( GATKSAMRecord readReturned : readsReturned ) { + if ( readReturned.isReducedRead() ) { + numReducedReadsReturned++; + } + else { + numNormalReadsReturned++; + } + } + + Assert.assertEquals(numReducedReadsReturned, 30, "wrong number of reduced reads returned by the downsampler"); + Assert.assertEquals(numNormalReadsReturned, 3, "wrong number of non-reduced reads returned by the downsampler"); + } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCacheUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCacheUnitTest.java index f3e1ce44b..4d85997b3 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCacheUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCacheUnitTest.java @@ -26,9 +26,11 @@ package org.broadinstitute.sting.gatk.traversals; import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.sam.ArtificialBAMBuilder; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.Assert; import org.testng.annotations.BeforeClass; @@ -39,6 +41,7 @@ import java.io.File; import java.io.FileNotFoundException; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collection; import java.util.List; public class TAROrderedReadCacheUnitTest extends BaseTest { @@ -98,8 +101,53 @@ public class TAROrderedReadCacheUnitTest extends BaseTest { Assert.assertEquals(cache.getNumDiscarded(), 0, "should have reset stats"); Assert.assertEquals(cacheReads.size(), nExpectedToKeep, "should have 1 read for every read we expected to keep"); + verifySortednessOfReads(cacheReads); + } + + @Test + public void testReadCacheWithReducedReads() { + final List reads = new ArrayList(); + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + final int[] baseCounts = { 10, 10, 10, 10, 10 }; + + for ( int i = 1; i <= 100; i++ ) { + reads.add(ArtificialSAMUtils.createArtificialReducedRead(header, "foo", 0, i, 5, baseCounts)); + reads.add(ArtificialSAMUtils.createArtificialRead(header, "foo", 0, i, 5)); + } + + final TAROrderedReadCache cache = new TAROrderedReadCache(50); + + cache.addAll(reads); + + // Our cache should have kept all of the reduced reads (which are retained unconditionally and do not count + // towards the capacity limit), and discarded half of the 100 non-reduced reads due to the cache capacity + // limit of 50. + Assert.assertEquals(cache.size(), 150, "wrong number of reads in the cache at the end"); + Assert.assertEquals(cache.getNumDiscarded(), 50, "wrong number of reads discarded from the cache"); + + final List cacheReads = cache.popCurrentReads(); + + int numReducedReadsRetained = 0; + int numNormalReadsRetained = 0; + + for ( GATKSAMRecord read : cacheReads ) { + if ( read.isReducedRead() ) { + numReducedReadsRetained++; + } + else { + numNormalReadsRetained++; + } + } + + Assert.assertEquals(numReducedReadsRetained, 100, "wrong number of reduced reads retained in the cache"); + Assert.assertEquals(numNormalReadsRetained, 50, "wrong number of non-reduced reads retained in the cache"); + + verifySortednessOfReads(cacheReads); + } + + private void verifySortednessOfReads( final List reads) { int lastStart = -1; - for ( final GATKSAMRecord read : cacheReads ) { + for ( GATKSAMRecord read : reads ) { Assert.assertTrue(lastStart <= read.getAlignmentStart(), "Reads should be sorted but weren't. Found read with start " + read.getAlignmentStart() + " while last was " + lastStart); lastStart = read.getAlignmentStart(); }