Exclude reduced reads from elimination during downsampling
Problem: -Downsamplers were treating reduced reads the same as normal reads, with occasionally catastrophic results on variant calling when an entire reduced read happened to get eliminated. Solution: -Since reduced reads lack the information we need to do position-based downsampling on them, the best available option for now is simply to exempt all reduced reads from elimination during downsampling. Details: -Add generic capability of exempting items from elimination to the Downsampler interface via new doNotDiscardItem() method. Default inherited version of this method exempts all reduced reads (or objects encapsulating reduced reads) from elimination. -Switch from interfaces to abstract classes to facilitate this change, and do some minor refactoring of the Downsampler interface (push implementation of some methods into the abstract classes, improve names of the confusing clear() and reset() methods). -Rewrite TAROrderedReadCache. This class was incorrectly relying on the ReservoirDownsampler to preserve the relative ordering of items in some circumstances, which was behavior not guaranteed by the API and only happened to work due to implementation details which no longer apply. Restructured this class around the assumption that the ReservoirDownsampler will not preserve relative ordering at all. -Add disclaimer to description of -dcov argument explaining that coverage targets are approximate goals that will not always be precisely met. -Unit tests for all individual downsamplers to verify that reduced reads are exempted from elimination
This commit is contained in:
parent
b63cbd8cc9
commit
95b5f99feb
|
|
@ -125,7 +125,14 @@ public class GATKArgumentCollection {
|
|||
@Argument(fullName = "downsample_to_fraction", shortName = "dfrac", doc = "Fraction [0.0-1.0] of reads to downsample to", required = false)
|
||||
public Double downsampleFraction = null;
|
||||
|
||||
@Argument(fullName = "downsample_to_coverage", shortName = "dcov", doc = "Coverage [integer] to downsample to at any given locus; note that downsampled reads are randomly selected from all possible reads at a locus. For non-locus-based traversals (eg., ReadWalkers), this sets the maximum number of reads at each alignment start position.", required = false)
|
||||
@Argument(fullName = "downsample_to_coverage", shortName = "dcov",
|
||||
doc = "Coverage [integer] to downsample to. For locus-based traversals (eg., LocusWalkers and ActiveRegionWalkers)," +
|
||||
"this controls the maximum depth of coverage at each locus. For non-locus-based traversals (eg., ReadWalkers), " +
|
||||
"this controls the maximum number of reads sharing the same alignment start position. Note that the " +
|
||||
"coverage target is an approximate goal that is not guaranteed to be met exactly: the GATK's approach " +
|
||||
"to downsampling is based on even representation of reads from all alignment start positions, and the " +
|
||||
"downsampling algorithm will under some circumstances retain slightly more coverage than requested.",
|
||||
required = false)
|
||||
public Integer downsampleCoverage = null;
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -25,19 +25,27 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
import org.broadinstitute.sting.utils.locusiterator.AlignmentStateMachine;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* The basic downsampler API, with no reads-specific operations.
|
||||
*
|
||||
* Downsamplers that extend this interface rather than the ReadsDownsampler interface can handle
|
||||
* Downsamplers that extend this class rather than the ReadsDownsampler class can handle
|
||||
* any kind of item, however they cannot be wrapped within a DownsamplingReadsIterator or a
|
||||
* PerSampleDownsamplingReadsIterator.
|
||||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public interface Downsampler<T> {
|
||||
public abstract class Downsampler<T> {
|
||||
|
||||
/**
|
||||
* Number of items discarded by this downsampler since the last call to resetStats()
|
||||
*/
|
||||
protected int numDiscardedItems = 0;
|
||||
|
||||
/**
|
||||
* Submit one item to the downsampler for consideration. Some downsamplers will be able to determine
|
||||
|
|
@ -46,7 +54,7 @@ public interface Downsampler<T> {
|
|||
*
|
||||
* @param item the individual item to submit to the downsampler for consideration
|
||||
*/
|
||||
public void submit( T item );
|
||||
public abstract void submit( final T item );
|
||||
|
||||
/**
|
||||
* Submit a collection of items to the downsampler for consideration. Should be equivalent to calling
|
||||
|
|
@ -54,21 +62,29 @@ public interface Downsampler<T> {
|
|||
*
|
||||
* @param items the collection of items to submit to the downsampler for consideration
|
||||
*/
|
||||
public void submit( Collection<T> items );
|
||||
public void submit( final Collection<T> items ) {
|
||||
if ( items == null ) {
|
||||
throw new IllegalArgumentException("submitted items must not be null");
|
||||
}
|
||||
|
||||
for ( final T item : items ) {
|
||||
submit(item);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Are there items that have survived the downsampling process waiting to be retrieved?
|
||||
*
|
||||
* @return true if this downsampler has > 0 finalized items, otherwise false
|
||||
*/
|
||||
public boolean hasFinalizedItems();
|
||||
public abstract boolean hasFinalizedItems();
|
||||
|
||||
/**
|
||||
* Return (and *remove*) all items that have survived downsampling and are waiting to be retrieved.
|
||||
*
|
||||
* @return a list of all finalized items this downsampler contains, or an empty list if there are none
|
||||
*/
|
||||
public List<T> consumeFinalizedItems();
|
||||
public abstract List<T> consumeFinalizedItems();
|
||||
|
||||
/**
|
||||
* Are there items stored in this downsampler that it doesn't yet know whether they will
|
||||
|
|
@ -76,7 +92,7 @@ public interface Downsampler<T> {
|
|||
*
|
||||
* @return true if this downsampler has > 0 pending items, otherwise false
|
||||
*/
|
||||
public boolean hasPendingItems();
|
||||
public abstract boolean hasPendingItems();
|
||||
|
||||
/**
|
||||
* Peek at the first finalized item stored in this downsampler (or null if there are no finalized items)
|
||||
|
|
@ -84,7 +100,7 @@ public interface Downsampler<T> {
|
|||
* @return the first finalized item in this downsampler (the item is not removed from the downsampler by this call),
|
||||
* or null if there are none
|
||||
*/
|
||||
public T peekFinalized();
|
||||
public abstract T peekFinalized();
|
||||
|
||||
/**
|
||||
* Peek at the first pending item stored in this downsampler (or null if there are no pending items)
|
||||
|
|
@ -92,7 +108,7 @@ public interface Downsampler<T> {
|
|||
* @return the first pending item stored in this downsampler (the item is not removed from the downsampler by this call),
|
||||
* or null if there are none
|
||||
*/
|
||||
public T peekPending();
|
||||
public abstract T peekPending();
|
||||
|
||||
/**
|
||||
* Get the current number of items in this downsampler
|
||||
|
|
@ -103,7 +119,7 @@ public interface Downsampler<T> {
|
|||
*
|
||||
* @return a positive integer
|
||||
*/
|
||||
public int size();
|
||||
public abstract int size();
|
||||
|
||||
/**
|
||||
* Returns the number of items discarded (so far) during the downsampling process
|
||||
|
|
@ -111,21 +127,46 @@ public interface Downsampler<T> {
|
|||
* @return the number of items that have been submitted to this downsampler and discarded in the process of
|
||||
* downsampling
|
||||
*/
|
||||
public int getNumberOfDiscardedItems();
|
||||
public int getNumberOfDiscardedItems() {
|
||||
return numDiscardedItems;
|
||||
}
|
||||
|
||||
/**
|
||||
* Used to tell the downsampler that no more items will be submitted to it, and that it should
|
||||
* finalize any pending items.
|
||||
*/
|
||||
public void signalEndOfInput();
|
||||
public abstract void signalEndOfInput();
|
||||
|
||||
/**
|
||||
* Empty the downsampler of all finalized/pending items
|
||||
*/
|
||||
public void clear();
|
||||
public abstract void clearItems();
|
||||
|
||||
/**
|
||||
* Reset stats in the downsampler such as the number of discarded items *without* clearing the downsampler of items
|
||||
*/
|
||||
public void reset();
|
||||
public void resetStats() {
|
||||
numDiscardedItems = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Indicates whether an item should be excluded from elimination during downsampling. By default,
|
||||
* all items representing reduced reads are excluded from downsampling, but individual downsamplers
|
||||
* may override if they are able to handle reduced reads correctly. Downsamplers should check
|
||||
* the return value of this method before discarding an item.
|
||||
*
|
||||
* @param item The item to test
|
||||
* @return true if the item should not be subject to elimination during downsampling, otherwise false
|
||||
*/
|
||||
protected boolean doNotDiscardItem( final Object item ) {
|
||||
// Use getClass() rather than instanceof for performance reasons. Ugly but fast.
|
||||
if ( item.getClass() == GATKSAMRecord.class ) {
|
||||
return ((GATKSAMRecord)item).isReducedRead();
|
||||
}
|
||||
else if ( item.getClass() == AlignmentStateMachine.class ) {
|
||||
return ((AlignmentStateMachine)item).isReducedRead();
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -30,7 +30,6 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
|||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
|
|
@ -41,13 +40,11 @@ import java.util.List;
|
|||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public class FractionalDownsampler<T extends SAMRecord> implements ReadsDownsampler<T> {
|
||||
public class FractionalDownsampler<T extends SAMRecord> extends ReadsDownsampler<T> {
|
||||
|
||||
private ArrayList<T> selectedReads;
|
||||
|
||||
private int cutoffForInclusion;
|
||||
|
||||
private int numDiscardedItems;
|
||||
private final int cutoffForInclusion;
|
||||
|
||||
private static final int RANDOM_POOL_SIZE = 10000;
|
||||
|
||||
|
|
@ -57,18 +54,19 @@ public class FractionalDownsampler<T extends SAMRecord> implements ReadsDownsamp
|
|||
* @param fraction Fraction of reads to preserve, between 0.0 (inclusive) and 1.0 (inclusive).
|
||||
* Actual number of reads preserved may differ randomly.
|
||||
*/
|
||||
public FractionalDownsampler( double fraction ) {
|
||||
public FractionalDownsampler( final double fraction ) {
|
||||
if ( fraction < 0.0 || fraction > 1.0 ) {
|
||||
throw new ReviewedStingException("Fraction of reads to include must be between 0.0 and 1.0, inclusive");
|
||||
}
|
||||
|
||||
cutoffForInclusion = (int)(fraction * RANDOM_POOL_SIZE);
|
||||
clear();
|
||||
reset();
|
||||
clearItems();
|
||||
resetStats();
|
||||
}
|
||||
|
||||
public void submit( T newRead ) {
|
||||
if ( GenomeAnalysisEngine.getRandomGenerator().nextInt(10000) < cutoffForInclusion ) {
|
||||
@Override
|
||||
public void submit( final T newRead ) {
|
||||
if ( GenomeAnalysisEngine.getRandomGenerator().nextInt(10000) < cutoffForInclusion || doNotDiscardItem(newRead) ) {
|
||||
selectedReads.add(newRead);
|
||||
}
|
||||
else {
|
||||
|
|
@ -76,61 +74,56 @@ public class FractionalDownsampler<T extends SAMRecord> implements ReadsDownsamp
|
|||
}
|
||||
}
|
||||
|
||||
public void submit( Collection<T> newReads ) {
|
||||
for ( T read : newReads ) {
|
||||
submit(read);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasFinalizedItems() {
|
||||
return selectedReads.size() > 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<T> consumeFinalizedItems() {
|
||||
// pass by reference rather than make a copy, for speed
|
||||
List<T> downsampledItems = selectedReads;
|
||||
clear();
|
||||
clearItems();
|
||||
return downsampledItems;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasPendingItems() {
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public T peekFinalized() {
|
||||
return selectedReads.isEmpty() ? null : selectedReads.get(0);
|
||||
}
|
||||
|
||||
@Override
|
||||
public T peekPending() {
|
||||
return null;
|
||||
}
|
||||
|
||||
public int getNumberOfDiscardedItems() {
|
||||
return numDiscardedItems;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int size() {
|
||||
return selectedReads.size();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void signalEndOfInput() {
|
||||
// NO-OP
|
||||
}
|
||||
|
||||
public void clear() {
|
||||
@Override
|
||||
public void clearItems() {
|
||||
selectedReads = new ArrayList<T>();
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
numDiscardedItems = 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean requiresCoordinateSortOrder() {
|
||||
return false;
|
||||
}
|
||||
|
||||
public void signalNoMoreReadsBefore( T read ) {
|
||||
@Override
|
||||
public void signalNoMoreReadsBefore( final T read ) {
|
||||
// NO-OP
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -46,16 +46,15 @@ import java.util.*;
|
|||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public class LevelingDownsampler<T extends List<E>, E> implements Downsampler<T> {
|
||||
public class LevelingDownsampler<T extends List<E>, E> extends Downsampler<T> {
|
||||
private final int minElementsPerStack;
|
||||
|
||||
private final int targetSize;
|
||||
|
||||
private List<T> groups;
|
||||
|
||||
private boolean groupsAreFinalized;
|
||||
|
||||
private int numDiscardedItems;
|
||||
|
||||
/**
|
||||
* Construct a LevelingDownsampler
|
||||
*
|
||||
|
|
@ -65,7 +64,7 @@ public class LevelingDownsampler<T extends List<E>, E> implements Downsampler<T>
|
|||
* this value -- if it does, items are removed from Lists evenly until the total size
|
||||
* is <= this value
|
||||
*/
|
||||
public LevelingDownsampler( int targetSize ) {
|
||||
public LevelingDownsampler( final int targetSize ) {
|
||||
this(targetSize, 1);
|
||||
}
|
||||
|
||||
|
|
@ -79,55 +78,58 @@ public class LevelingDownsampler<T extends List<E>, E> implements Downsampler<T>
|
|||
* if a stack has only 3 elements and minElementsPerStack is 3, no matter what
|
||||
* we'll not reduce this stack below 3.
|
||||
*/
|
||||
public LevelingDownsampler(final int targetSize, final int minElementsPerStack) {
|
||||
public LevelingDownsampler( final int targetSize, final int minElementsPerStack ) {
|
||||
if ( targetSize < 0 ) throw new IllegalArgumentException("targetSize must be >= 0 but got " + targetSize);
|
||||
if ( minElementsPerStack < 0 ) throw new IllegalArgumentException("minElementsPerStack must be >= 0 but got " + minElementsPerStack);
|
||||
|
||||
this.targetSize = targetSize;
|
||||
this.minElementsPerStack = minElementsPerStack;
|
||||
clear();
|
||||
reset();
|
||||
clearItems();
|
||||
resetStats();
|
||||
}
|
||||
|
||||
public void submit( T item ) {
|
||||
@Override
|
||||
public void submit( final T item ) {
|
||||
groups.add(item);
|
||||
}
|
||||
|
||||
public void submit( Collection<T> items ){
|
||||
@Override
|
||||
public void submit( final Collection<T> items ){
|
||||
groups.addAll(items);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasFinalizedItems() {
|
||||
return groupsAreFinalized && groups.size() > 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<T> consumeFinalizedItems() {
|
||||
if ( ! hasFinalizedItems() ) {
|
||||
return new ArrayList<T>();
|
||||
}
|
||||
|
||||
// pass by reference rather than make a copy, for speed
|
||||
List<T> toReturn = groups;
|
||||
clear();
|
||||
final List<T> toReturn = groups;
|
||||
clearItems();
|
||||
return toReturn;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasPendingItems() {
|
||||
return ! groupsAreFinalized && groups.size() > 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public T peekFinalized() {
|
||||
return hasFinalizedItems() ? groups.get(0) : null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public T peekPending() {
|
||||
return hasPendingItems() ? groups.get(0) : null;
|
||||
}
|
||||
|
||||
public int getNumberOfDiscardedItems() {
|
||||
return numDiscardedItems;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int size() {
|
||||
int s = 0;
|
||||
|
|
@ -137,26 +139,24 @@ public class LevelingDownsampler<T extends List<E>, E> implements Downsampler<T>
|
|||
return s;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void signalEndOfInput() {
|
||||
levelGroups();
|
||||
groupsAreFinalized = true;
|
||||
}
|
||||
|
||||
public void clear() {
|
||||
@Override
|
||||
public void clearItems() {
|
||||
groups = new ArrayList<T>();
|
||||
groupsAreFinalized = false;
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
numDiscardedItems = 0;
|
||||
}
|
||||
|
||||
private void levelGroups() {
|
||||
final int[] groupSizes = new int[groups.size()];
|
||||
int totalSize = 0;
|
||||
int[] groupSizes = new int[groups.size()];
|
||||
int currentGroupIndex = 0;
|
||||
|
||||
for ( T group : groups ) {
|
||||
for ( final T group : groups ) {
|
||||
groupSizes[currentGroupIndex] = group.size();
|
||||
totalSize += groupSizes[currentGroupIndex];
|
||||
currentGroupIndex++;
|
||||
|
|
@ -191,20 +191,18 @@ public class LevelingDownsampler<T extends List<E>, E> implements Downsampler<T>
|
|||
|
||||
// Now we actually go through and reduce each group to its new count as specified in groupSizes
|
||||
currentGroupIndex = 0;
|
||||
for ( T group : groups ) {
|
||||
for ( final T group : groups ) {
|
||||
downsampleOneGroup(group, groupSizes[currentGroupIndex]);
|
||||
currentGroupIndex++;
|
||||
}
|
||||
}
|
||||
|
||||
private void downsampleOneGroup( T group, int numItemsToKeep ) {
|
||||
private void downsampleOneGroup( final T group, final int numItemsToKeep ) {
|
||||
if ( numItemsToKeep >= group.size() ) {
|
||||
return;
|
||||
}
|
||||
|
||||
numDiscardedItems += group.size() - numItemsToKeep;
|
||||
|
||||
BitSet itemsToKeep = new BitSet(group.size());
|
||||
final BitSet itemsToKeep = new BitSet(group.size());
|
||||
for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(group.size(), numItemsToKeep) ) {
|
||||
itemsToKeep.set(selectedIndex);
|
||||
}
|
||||
|
|
@ -213,12 +211,13 @@ public class LevelingDownsampler<T extends List<E>, E> implements Downsampler<T>
|
|||
|
||||
// If our group is a linked list, we can remove the desired items in a single O(n) pass with an iterator
|
||||
if ( group instanceof LinkedList ) {
|
||||
Iterator iter = group.iterator();
|
||||
final Iterator<E> iter = group.iterator();
|
||||
while ( iter.hasNext() ) {
|
||||
iter.next();
|
||||
final E item = iter.next();
|
||||
|
||||
if ( ! itemsToKeep.get(currentIndex) ) {
|
||||
if ( ! itemsToKeep.get(currentIndex) && ! doNotDiscardItem(item) ) {
|
||||
iter.remove();
|
||||
numDiscardedItems++;
|
||||
}
|
||||
|
||||
currentIndex++;
|
||||
|
|
@ -227,14 +226,15 @@ public class LevelingDownsampler<T extends List<E>, E> implements Downsampler<T>
|
|||
// If it's not a linked list, it's more efficient to copy the desired items into a new list and back rather
|
||||
// than suffer O(n^2) of item shifting
|
||||
else {
|
||||
List<E> keptItems = new ArrayList<E>(numItemsToKeep);
|
||||
final List<E> keptItems = new ArrayList<E>(group.size());
|
||||
|
||||
for ( E item : group ) {
|
||||
if ( itemsToKeep.get(currentIndex) ) {
|
||||
for ( final E item : group ) {
|
||||
if ( itemsToKeep.get(currentIndex) || doNotDiscardItem(item) ) {
|
||||
keptItems.add(item);
|
||||
}
|
||||
currentIndex++;
|
||||
}
|
||||
numDiscardedItems += group.size() - keptItems.size();
|
||||
group.clear();
|
||||
group.addAll(keptItems);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk.downsampling;
|
|||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
|
|
@ -39,25 +38,21 @@ import java.util.List;
|
|||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public class PassThroughDownsampler<T extends SAMRecord> implements ReadsDownsampler<T> {
|
||||
public class PassThroughDownsampler<T extends SAMRecord> extends ReadsDownsampler<T> {
|
||||
|
||||
private LinkedList<T> selectedReads;
|
||||
|
||||
public PassThroughDownsampler() {
|
||||
clear();
|
||||
clearItems();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void submit( T newRead ) {
|
||||
// All reads pass-through, no reads get downsampled
|
||||
selectedReads.add(newRead);
|
||||
}
|
||||
|
||||
public void submit( Collection<T> newReads ) {
|
||||
for ( T read : newReads ) {
|
||||
submit(read);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasFinalizedItems() {
|
||||
return ! selectedReads.isEmpty();
|
||||
}
|
||||
|
|
@ -66,50 +61,50 @@ public class PassThroughDownsampler<T extends SAMRecord> implements ReadsDownsam
|
|||
* Note that this list is a linked list and so doesn't support fast random access
|
||||
* @return
|
||||
*/
|
||||
@Override
|
||||
public List<T> consumeFinalizedItems() {
|
||||
// pass by reference rather than make a copy, for speed
|
||||
List<T> downsampledItems = selectedReads;
|
||||
clear();
|
||||
final List<T> downsampledItems = selectedReads;
|
||||
clearItems();
|
||||
return downsampledItems;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasPendingItems() {
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public T peekFinalized() {
|
||||
return selectedReads.isEmpty() ? null : selectedReads.getFirst();
|
||||
}
|
||||
|
||||
@Override
|
||||
public T peekPending() {
|
||||
return null;
|
||||
}
|
||||
|
||||
public int getNumberOfDiscardedItems() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int size() {
|
||||
return selectedReads.size();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void signalEndOfInput() {
|
||||
// NO-OP
|
||||
}
|
||||
|
||||
public void clear() {
|
||||
@Override
|
||||
public void clearItems() {
|
||||
selectedReads = new LinkedList<T>();
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
// NO-OP
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean requiresCoordinateSortOrder() {
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void signalNoMoreReadsBefore( T read ) {
|
||||
// NO-OP
|
||||
}
|
||||
|
|
|
|||
|
|
@ -32,14 +32,14 @@ import net.sf.samtools.SAMRecord;
|
|||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public interface ReadsDownsampler<T extends SAMRecord> extends Downsampler<T> {
|
||||
public abstract class ReadsDownsampler<T extends SAMRecord> extends Downsampler<T> {
|
||||
|
||||
/**
|
||||
* Does this downsampler require that reads be fed to it in coordinate order?
|
||||
*
|
||||
* @return true if reads must be submitted to this downsampler in coordinate order, otherwise false
|
||||
*/
|
||||
public boolean requiresCoordinateSortOrder();
|
||||
public abstract boolean requiresCoordinateSortOrder();
|
||||
|
||||
/**
|
||||
* Tell this downsampler that no more reads located before the provided read (according to
|
||||
|
|
@ -52,5 +52,5 @@ public interface ReadsDownsampler<T extends SAMRecord> extends Downsampler<T> {
|
|||
* @param read the downsampler will assume that no reads located before this read will ever
|
||||
* be submitted to it in the future
|
||||
*/
|
||||
public void signalNoMoreReadsBefore( T read );
|
||||
public abstract void signalNoMoreReadsBefore( final T read );
|
||||
}
|
||||
|
|
|
|||
|
|
@ -39,7 +39,12 @@ import java.util.*;
|
|||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampler<T> {
|
||||
public class ReservoirDownsampler<T extends SAMRecord> extends ReadsDownsampler<T> {
|
||||
|
||||
/**
|
||||
* size of our reservoir -- ie., the maximum number of reads from the stream that will be retained
|
||||
* (not including any undiscardable items)
|
||||
*/
|
||||
private final int targetSampleSize;
|
||||
|
||||
/**
|
||||
|
|
@ -58,17 +63,33 @@ public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampl
|
|||
*/
|
||||
private List<T> reservoir;
|
||||
|
||||
/**
|
||||
* Certain items (eg., reduced reads) cannot be discarded at all during downsampling. We store
|
||||
* these items separately so as not to impact the fair selection of items for inclusion in the
|
||||
* reservoir. These items are returned (and cleared) along with any items in the reservoir in
|
||||
* calls to consumeFinalizedItems().
|
||||
*/
|
||||
private List<T> undiscardableItems;
|
||||
|
||||
/**
|
||||
* Are we currently using a linked list for the reservoir?
|
||||
*/
|
||||
private boolean isLinkedList;
|
||||
|
||||
private int totalReadsSeen;
|
||||
/**
|
||||
* Count of the number of reads seen that were actually eligible for discarding. Used by the reservoir downsampling
|
||||
* algorithm to ensure that all discardable reads have an equal chance of making it into the reservoir.
|
||||
*/
|
||||
private int totalDiscardableReadsSeen;
|
||||
|
||||
private int numDiscardedItems;
|
||||
|
||||
/**
|
||||
* Construct a ReservoirDownsampler
|
||||
*
|
||||
* @param targetSampleSize Size of the reservoir used by this downsampler. Number of items retained
|
||||
* after downsampling will be min(totalReads, targetSampleSize)
|
||||
* after downsampling will be min(totalDiscardableReads, targetSampleSize) + any
|
||||
* undiscardable reads (eg., reduced reads).
|
||||
*
|
||||
* @param expectFewOverflows if true, this downsampler will be optimized for the case
|
||||
* where most of the time we won't fill up anything like the
|
||||
* targetSampleSize elements. If this is false, we will allocate
|
||||
|
|
@ -76,15 +97,15 @@ public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampl
|
|||
* the cost of allocation if we often use targetSampleSize or more
|
||||
* elements.
|
||||
*/
|
||||
public ReservoirDownsampler ( final int targetSampleSize, final boolean expectFewOverflows) {
|
||||
public ReservoirDownsampler ( final int targetSampleSize, final boolean expectFewOverflows ) {
|
||||
if ( targetSampleSize <= 0 ) {
|
||||
throw new ReviewedStingException("Cannot do reservoir downsampling with a sample size <= 0");
|
||||
}
|
||||
|
||||
this.targetSampleSize = targetSampleSize;
|
||||
this.expectFewOverflows = expectFewOverflows;
|
||||
clear();
|
||||
reset();
|
||||
clearItems();
|
||||
resetStats();
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -93,15 +114,21 @@ public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampl
|
|||
* @param targetSampleSize Size of the reservoir used by this downsampler. Number of items retained
|
||||
* after downsampling will be min(totalReads, targetSampleSize)
|
||||
*/
|
||||
public ReservoirDownsampler ( int targetSampleSize ) {
|
||||
public ReservoirDownsampler ( final int targetSampleSize ) {
|
||||
this(targetSampleSize, false);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void submit ( final T newRead ) {
|
||||
if ( doNotDiscardItem(newRead) ) {
|
||||
undiscardableItems.add(newRead);
|
||||
return;
|
||||
}
|
||||
|
||||
public void submit ( T newRead ) {
|
||||
totalReadsSeen++;
|
||||
// Only count reads that are actually eligible for discarding for the purposes of the reservoir downsampling algorithm
|
||||
totalDiscardableReadsSeen++;
|
||||
|
||||
if ( totalReadsSeen <= targetSampleSize ) {
|
||||
if ( totalDiscardableReadsSeen <= targetSampleSize ) {
|
||||
reservoir.add(newRead);
|
||||
}
|
||||
else {
|
||||
|
|
@ -110,7 +137,7 @@ public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampl
|
|||
isLinkedList = false;
|
||||
}
|
||||
|
||||
final int randomSlot = GenomeAnalysisEngine.getRandomGenerator().nextInt(totalReadsSeen);
|
||||
final int randomSlot = GenomeAnalysisEngine.getRandomGenerator().nextInt(totalDiscardableReadsSeen);
|
||||
if ( randomSlot < targetSampleSize ) {
|
||||
reservoir.set(randomSlot, newRead);
|
||||
}
|
||||
|
|
@ -118,49 +145,46 @@ public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampl
|
|||
}
|
||||
}
|
||||
|
||||
public void submit ( Collection<T> newReads ) {
|
||||
for ( T read : newReads ) {
|
||||
submit(read);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasFinalizedItems() {
|
||||
return reservoir.size() > 0;
|
||||
return ! reservoir.isEmpty() || ! undiscardableItems.isEmpty();
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<T> consumeFinalizedItems() {
|
||||
if ( reservoir.isEmpty() ) {
|
||||
// if there's nothing here, don't both allocating a new list completely
|
||||
if ( ! hasFinalizedItems() ) {
|
||||
// if there's nothing here, don't bother allocating a new list
|
||||
return Collections.emptyList();
|
||||
} else {
|
||||
// pass by reference rather than make a copy, for speed
|
||||
List<T> downsampledItems = reservoir;
|
||||
clear();
|
||||
// pass reservoir by reference rather than make a copy, for speed
|
||||
final List<T> downsampledItems = reservoir;
|
||||
downsampledItems.addAll(undiscardableItems);
|
||||
clearItems();
|
||||
return downsampledItems;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasPendingItems() {
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public T peekFinalized() {
|
||||
return reservoir.isEmpty() ? null : reservoir.get(0);
|
||||
return ! reservoir.isEmpty() ? reservoir.get(0) : (! undiscardableItems.isEmpty() ? undiscardableItems.get(0) : null);
|
||||
}
|
||||
|
||||
@Override
|
||||
public T peekPending() {
|
||||
return null;
|
||||
}
|
||||
|
||||
public int getNumberOfDiscardedItems() {
|
||||
return numDiscardedItems;
|
||||
@Override
|
||||
public int size() {
|
||||
return reservoir.size() + undiscardableItems.size();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int size() {
|
||||
return reservoir.size();
|
||||
}
|
||||
|
||||
public void signalEndOfInput() {
|
||||
// NO-OP
|
||||
}
|
||||
|
|
@ -168,25 +192,27 @@ public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampl
|
|||
/**
|
||||
* Clear the data structures used to hold information
|
||||
*/
|
||||
public void clear() {
|
||||
@Override
|
||||
public void clearItems() {
|
||||
// if we aren't expecting many overflows, allocate a linked list not an arraylist
|
||||
reservoir = expectFewOverflows ? new LinkedList<T>() : new ArrayList<T>(targetSampleSize);
|
||||
|
||||
// there's no possibility of overflow with the undiscardable items, so we always use a linked list for them
|
||||
undiscardableItems = new LinkedList<>();
|
||||
|
||||
// it's a linked list if we allocate one
|
||||
isLinkedList = expectFewOverflows;
|
||||
|
||||
// an internal stat used by the downsampling process, so not cleared by reset() below
|
||||
totalReadsSeen = 0;
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
numDiscardedItems = 0;
|
||||
// an internal stat used by the downsampling process, so not cleared by resetStats() below
|
||||
totalDiscardableReadsSeen = 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean requiresCoordinateSortOrder() {
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void signalNoMoreReadsBefore( T read ) {
|
||||
// NO-OP
|
||||
}
|
||||
|
|
|
|||
|
|
@ -35,11 +35,11 @@ import java.util.*;
|
|||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public class SimplePositionalDownsampler<T extends SAMRecord> implements ReadsDownsampler<T> {
|
||||
public class SimplePositionalDownsampler<T extends SAMRecord> extends ReadsDownsampler<T> {
|
||||
|
||||
private int targetCoverage;
|
||||
private final int targetCoverage;
|
||||
|
||||
private ReservoirDownsampler<T> reservoir;
|
||||
private final ReservoirDownsampler<T> reservoir;
|
||||
|
||||
private int currentContigIndex;
|
||||
|
||||
|
|
@ -51,97 +51,93 @@ public class SimplePositionalDownsampler<T extends SAMRecord> implements ReadsDo
|
|||
|
||||
private ArrayList<T> finalizedReads;
|
||||
|
||||
private int numDiscardedItems;
|
||||
|
||||
/**
|
||||
* Construct a SimplePositionalDownsampler
|
||||
*
|
||||
* @param targetCoverage Maximum number of reads that may share any given alignment start position
|
||||
*/
|
||||
public SimplePositionalDownsampler( int targetCoverage ) {
|
||||
public SimplePositionalDownsampler( final int targetCoverage ) {
|
||||
this.targetCoverage = targetCoverage;
|
||||
reservoir = new ReservoirDownsampler<T>(targetCoverage);
|
||||
finalizedReads = new ArrayList<T>();
|
||||
clear();
|
||||
reset();
|
||||
clearItems();
|
||||
resetStats();
|
||||
}
|
||||
|
||||
public void submit( T newRead ) {
|
||||
@Override
|
||||
public void submit( final T newRead ) {
|
||||
updatePositionalState(newRead);
|
||||
|
||||
if ( unmappedReadsReached ) { // don't downsample the unmapped reads at the end of the stream
|
||||
finalizedReads.add(newRead);
|
||||
}
|
||||
else {
|
||||
int reservoirPreviouslyDiscardedItems = reservoir.getNumberOfDiscardedItems();
|
||||
final int reservoirPreviouslyDiscardedItems = reservoir.getNumberOfDiscardedItems();
|
||||
// our reservoir downsampler will call doNotDiscardItem() for us to exclude items from elimination as appropriate
|
||||
reservoir.submit(newRead);
|
||||
numDiscardedItems += reservoir.getNumberOfDiscardedItems() - reservoirPreviouslyDiscardedItems;
|
||||
}
|
||||
}
|
||||
|
||||
public void submit( Collection<T> newReads ) {
|
||||
for ( T read : newReads ) {
|
||||
submit(read);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasFinalizedItems() {
|
||||
return finalizedReads.size() > 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<T> consumeFinalizedItems() {
|
||||
// pass by reference rather than make a copy, for speed
|
||||
List<T> toReturn = finalizedReads;
|
||||
final List<T> toReturn = finalizedReads;
|
||||
finalizedReads = new ArrayList<T>();
|
||||
return toReturn;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasPendingItems() {
|
||||
return reservoir.hasFinalizedItems();
|
||||
}
|
||||
|
||||
@Override
|
||||
public T peekFinalized() {
|
||||
return finalizedReads.isEmpty() ? null : finalizedReads.get(0);
|
||||
}
|
||||
|
||||
@Override
|
||||
public T peekPending() {
|
||||
return reservoir.peekFinalized();
|
||||
}
|
||||
|
||||
public int getNumberOfDiscardedItems() {
|
||||
return numDiscardedItems;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int size() {
|
||||
return finalizedReads.size() + reservoir.size();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void signalEndOfInput() {
|
||||
finalizeReservoir();
|
||||
}
|
||||
|
||||
public void clear() {
|
||||
reservoir.clear();
|
||||
reservoir.reset();
|
||||
@Override
|
||||
public void clearItems() {
|
||||
reservoir.clearItems();
|
||||
reservoir.resetStats();
|
||||
finalizedReads.clear();
|
||||
positionEstablished = false;
|
||||
unmappedReadsReached = false;
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
numDiscardedItems = 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean requiresCoordinateSortOrder() {
|
||||
return true;
|
||||
}
|
||||
|
||||
public void signalNoMoreReadsBefore( T read ) {
|
||||
@Override
|
||||
public void signalNoMoreReadsBefore( final T read ) {
|
||||
updatePositionalState(read);
|
||||
}
|
||||
|
||||
private void updatePositionalState( T newRead ) {
|
||||
private void updatePositionalState( final T newRead ) {
|
||||
if ( readIsPastCurrentPosition(newRead) ) {
|
||||
if ( reservoir.hasFinalizedItems() ) {
|
||||
finalizeReservoir();
|
||||
|
|
@ -155,13 +151,13 @@ public class SimplePositionalDownsampler<T extends SAMRecord> implements ReadsDo
|
|||
}
|
||||
}
|
||||
|
||||
private void setCurrentPosition( T read ) {
|
||||
private void setCurrentPosition( final T read ) {
|
||||
currentContigIndex = read.getReferenceIndex();
|
||||
currentAlignmentStart = read.getAlignmentStart();
|
||||
positionEstablished = true;
|
||||
}
|
||||
|
||||
private boolean readIsPastCurrentPosition( T read ) {
|
||||
private boolean readIsPastCurrentPosition( final T read ) {
|
||||
return ! positionEstablished ||
|
||||
read.getReferenceIndex() > currentContigIndex ||
|
||||
read.getAlignmentStart() > currentAlignmentStart ||
|
||||
|
|
@ -170,6 +166,6 @@ public class SimplePositionalDownsampler<T extends SAMRecord> implements ReadsDo
|
|||
|
||||
private void finalizeReservoir() {
|
||||
finalizedReads.addAll(reservoir.consumeFinalizedItems());
|
||||
reservoir.reset();
|
||||
reservoir.resetStats();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -43,17 +43,42 @@ import java.util.List;
|
|||
* Time: 11:23 AM
|
||||
*/
|
||||
public class TAROrderedReadCache {
|
||||
final int maxCapacity;
|
||||
final Downsampler<GATKSAMRecord> downsampler;
|
||||
private final int maxCapacity;
|
||||
private ArrayList<GATKSAMRecord> undownsampledCache;
|
||||
private Downsampler<GATKSAMRecord> downsampler;
|
||||
|
||||
private static final int UNDOWNSAMPLED_CACHE_MAX_INITIAL_SIZE = 10000;
|
||||
|
||||
/**
|
||||
* Create a new empty ReadCache
|
||||
* @param maxCapacity the max capacity of the read cache.
|
||||
*/
|
||||
public TAROrderedReadCache(int maxCapacity) {
|
||||
public TAROrderedReadCache( final int maxCapacity ) {
|
||||
if ( maxCapacity < 0 ) throw new IllegalArgumentException("maxCapacity must be >= 0 but got " + maxCapacity);
|
||||
this.maxCapacity = maxCapacity;
|
||||
this.downsampler = new ReservoirDownsampler<GATKSAMRecord>(maxCapacity);
|
||||
|
||||
// The one we're not currently using will always be null:
|
||||
initializeUndownsampledCache();
|
||||
this.downsampler = null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Moves all reads over to the downsampler, causing it to be used from this point on. Should be called
|
||||
* when the undownsampledCache fills up and we need to start discarding reads. Since the
|
||||
* ReservoirDownsampler doesn't preserve relative ordering, pop operations become expensive
|
||||
* after this point, as they require a O(n log n) sort.
|
||||
*/
|
||||
private void activateDownsampler() {
|
||||
downsampler = new ReservoirDownsampler<>(maxCapacity, false);
|
||||
downsampler.submit(undownsampledCache);
|
||||
undownsampledCache = null; // preferable to the O(n) clear() method
|
||||
}
|
||||
|
||||
/**
|
||||
* Allocate the undownsampled cache used when we have fewer than maxCapacity items
|
||||
*/
|
||||
private void initializeUndownsampledCache() {
|
||||
undownsampledCache = new ArrayList<>(Math.min(maxCapacity + 1, UNDOWNSAMPLED_CACHE_MAX_INITIAL_SIZE));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -68,18 +93,31 @@ public class TAROrderedReadCache {
|
|||
* Add a single read to this cache. Assumed to be in sorted order w.r.t. the previously added reads
|
||||
* @param read a read to add
|
||||
*/
|
||||
public void add(final GATKSAMRecord read) {
|
||||
public void add( final GATKSAMRecord read ) {
|
||||
if ( read == null ) throw new IllegalArgumentException("Read cannot be null");
|
||||
downsampler.submit(read);
|
||||
|
||||
if ( downsampler != null ) {
|
||||
downsampler.submit(read);
|
||||
}
|
||||
else {
|
||||
undownsampledCache.add(read);
|
||||
|
||||
// No more room in the undownsampledCache? Time to start downsampling
|
||||
if ( undownsampledCache.size() > maxCapacity ) {
|
||||
activateDownsampler();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a collection of reads to this cache. Assumed to be in sorted order w.r.t. the previously added reads and each other
|
||||
* @param reads a collection of reads to add
|
||||
*/
|
||||
public void addAll(final List<GATKSAMRecord> reads) {
|
||||
public void addAll( final List<GATKSAMRecord> reads ) {
|
||||
if ( reads == null ) throw new IllegalArgumentException("Reads cannot be null");
|
||||
downsampler.submit(reads);
|
||||
for ( final GATKSAMRecord read : reads ) {
|
||||
add(read);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -87,40 +125,44 @@ public class TAROrderedReadCache {
|
|||
* @return a positive integer
|
||||
*/
|
||||
public int size() {
|
||||
return downsampler.size();
|
||||
return downsampler != null ? downsampler.size() : undownsampledCache.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* How many reads were discarded since the last call to popCurrentReads
|
||||
* @return
|
||||
*
|
||||
* @return number of items discarded during downsampling since last pop operation
|
||||
*/
|
||||
public int getNumDiscarded() {
|
||||
return downsampler.getNumberOfDiscardedItems();
|
||||
return downsampler != null ? downsampler.getNumberOfDiscardedItems() : 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes all reads currently in the cache, and returns them in sorted order (w.r.t. alignmentStart)
|
||||
*
|
||||
* Flushes this cache, so after this call the cache will contain no reads and all downsampling stats will
|
||||
* be reset.
|
||||
* Flushes this cache, so after this call the cache will contain no reads, and we'll be in the same
|
||||
* initial state as the constructor would put us in, with a non-null undownsampledCache and a null
|
||||
* downsampler.
|
||||
*
|
||||
* @return a list of GATKSAMRecords in this cache
|
||||
*/
|
||||
public List<GATKSAMRecord> popCurrentReads() {
|
||||
final List<GATKSAMRecord> maybeUnordered = downsampler.consumeFinalizedItems();
|
||||
final List<GATKSAMRecord> poppedReads;
|
||||
|
||||
final List<GATKSAMRecord> ordered;
|
||||
if ( downsampler.getNumberOfDiscardedItems() == 0 ) {
|
||||
// haven't discarded anything, so the reads are ordered properly
|
||||
ordered = maybeUnordered;
|
||||
} else {
|
||||
// we need to sort these damn things: O(n log n)
|
||||
ordered = new ArrayList<GATKSAMRecord>(maybeUnordered);
|
||||
Collections.sort(ordered, new AlignmentStartComparator());
|
||||
if ( downsampler == null ) {
|
||||
poppedReads = undownsampledCache; // avoid making a copy here, since we're going to allocate a new cache
|
||||
}
|
||||
else {
|
||||
// If we triggered the downsampler, we need to sort the reads before returning them,
|
||||
// since the ReservoirDownsampler is not guaranteed to preserve relative ordering of items.
|
||||
// After consuming the downsampled items in this call to popCurrentReads(), we switch back
|
||||
// to using the undownsampledCache until we fill up again.
|
||||
poppedReads = downsampler.consumeFinalizedItems(); // avoid making a copy here
|
||||
Collections.sort(poppedReads, new AlignmentStartComparator());
|
||||
downsampler = null;
|
||||
}
|
||||
|
||||
// reset the downsampler stats so getNumberOfDiscardedItems is 0
|
||||
downsampler.reset();
|
||||
return ordered;
|
||||
initializeUndownsampledCache();
|
||||
return poppedReads;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -123,6 +123,15 @@ public class AlignmentStateMachine {
|
|||
return getRead().getReferenceIndex();
|
||||
}
|
||||
|
||||
/**
|
||||
* Is our read a reduced read?
|
||||
*
|
||||
* @return true if the read we encapsulate is a reduced read, otherwise false
|
||||
*/
|
||||
public boolean isReducedRead() {
|
||||
return read.isReducedRead();
|
||||
}
|
||||
|
||||
/**
|
||||
* Is this the left edge state? I.e., one that is before or after the current read?
|
||||
* @return true if this state is an edge state, false otherwise
|
||||
|
|
|
|||
|
|
@ -167,7 +167,7 @@ final class PerSampleReadStateManager implements Iterable<AlignmentStateMachine>
|
|||
|
||||
// use returned List directly rather than make a copy, for efficiency's sake
|
||||
readStatesByAlignmentStart = flattenByAlignmentStart(levelingDownsampler.consumeFinalizedItems());
|
||||
levelingDownsampler.reset();
|
||||
levelingDownsampler.resetStats();
|
||||
}
|
||||
|
||||
return nStatesAdded;
|
||||
|
|
|
|||
|
|
@ -164,8 +164,8 @@ class SamplePartitioner<T extends SAMRecord> {
|
|||
@Ensures("doneSubmittingReads == false")
|
||||
public void reset() {
|
||||
for ( final Downsampler<T> downsampler : readsBySample.values() ) {
|
||||
downsampler.clear();
|
||||
downsampler.reset();
|
||||
downsampler.clearItems();
|
||||
downsampler.resetStats();
|
||||
}
|
||||
doneSubmittingReads = false;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -324,6 +324,31 @@ public class ArtificialSAMUtils {
|
|||
return Arrays.asList(left, right);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create an artificial reduced read based on the parameters. The cigar string will be *M, where * is the
|
||||
* length of the read. The base counts specified in the baseCounts array will be stored fully encoded in
|
||||
* the RR attribute.
|
||||
*
|
||||
* @param header the SAM header to associate the read with
|
||||
* @param name the name of the read
|
||||
* @param refIndex the reference index, i.e. what chromosome to associate it with
|
||||
* @param alignmentStart where to start the alignment
|
||||
* @param length the length of the read
|
||||
* @param baseCounts reduced base counts to encode in the RR attribute; length must match the read length
|
||||
* @return the artificial reduced read
|
||||
*/
|
||||
public static GATKSAMRecord createArtificialReducedRead( final SAMFileHeader header,
|
||||
final String name,
|
||||
final int refIndex,
|
||||
final int alignmentStart,
|
||||
final int length,
|
||||
final int[] baseCounts ) {
|
||||
final GATKSAMRecord read = createArtificialRead(header, name, refIndex, alignmentStart, length);
|
||||
read.setReducedReadCounts(baseCounts);
|
||||
read.setReducedReadCountsTag();
|
||||
return read;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a collection of identical artificial reads based on the parameters. The cigar string for each
|
||||
* read will be *M, where * is the length of the read.
|
||||
|
|
|
|||
|
|
@ -30,6 +30,7 @@ import net.sf.samtools.SAMRecord;
|
|||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
import org.testng.Assert;
|
||||
|
|
@ -152,7 +153,39 @@ public class FractionalDownsamplerUnitTest extends BaseTest {
|
|||
|
||||
Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), test.totalReads - downsampledReads.size());
|
||||
|
||||
downsampler.reset();
|
||||
downsampler.resetStats();
|
||||
Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDoNotDiscardReducedReads() {
|
||||
GenomeAnalysisEngine.resetRandomGenerator();
|
||||
final ReadsDownsampler<GATKSAMRecord> downsampler = new FractionalDownsampler<GATKSAMRecord>(0.0);
|
||||
|
||||
final Collection<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>();
|
||||
final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
|
||||
final int[] baseCounts = { 10, 10, 10, 10, 10 };
|
||||
|
||||
for ( int i = 1; i <= 10; i++ ) {
|
||||
reads.add(ArtificialSAMUtils.createArtificialReducedRead(header, "foo", 0, 1, 5, baseCounts));
|
||||
}
|
||||
for ( int i = 1; i <= 5; i++ ) {
|
||||
reads.add(ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 1, 5));
|
||||
}
|
||||
|
||||
downsampler.submit(reads);
|
||||
downsampler.signalEndOfInput();
|
||||
|
||||
Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 5, "wrong number of items discarded by the downsampler");
|
||||
Assert.assertTrue(downsampler.hasFinalizedItems(), "downsampler should have finalized items but doesn't");
|
||||
Assert.assertEquals(downsampler.size(), 10, "downsampler size() reports wrong number of items");
|
||||
|
||||
final Collection<GATKSAMRecord> readsReturned = downsampler.consumeFinalizedItems();
|
||||
|
||||
Assert.assertEquals(readsReturned.size(), 10, "wrong number of items returned by the downsampler");
|
||||
|
||||
for ( GATKSAMRecord readReturned : readsReturned ) {
|
||||
Assert.assertTrue(readReturned.isReducedRead(), "non-reduced read survived the downsampling process, but shouldn't have");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -25,16 +25,17 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.locusiterator.AlignmentStateMachine;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.testng.annotations.Test;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.Assert;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.*;
|
||||
|
||||
public class LevelingDownsamplerUnitTest extends BaseTest {
|
||||
|
||||
|
|
@ -158,9 +159,46 @@ public class LevelingDownsamplerUnitTest extends BaseTest {
|
|||
|
||||
Assert.assertEquals(numItemsReportedDiscarded, numItemsActuallyDiscarded);
|
||||
|
||||
downsampler.reset();
|
||||
downsampler.resetStats();
|
||||
Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0);
|
||||
|
||||
Assert.assertTrue(totalRemainingItems <= Math.max(test.targetSize, test.numStacks));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDoNotDiscardReducedReads() {
|
||||
GenomeAnalysisEngine.resetRandomGenerator();
|
||||
final Downsampler<LinkedList<AlignmentStateMachine>> downsampler = new LevelingDownsampler<LinkedList<AlignmentStateMachine>, AlignmentStateMachine>(1);
|
||||
|
||||
final Collection<LinkedList<AlignmentStateMachine>> groups = new LinkedList<LinkedList<AlignmentStateMachine>>();
|
||||
final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
|
||||
final int[] baseCounts = { 10, 10, 10, 10, 10 };
|
||||
|
||||
for ( int alignmentStart : Arrays.asList(1, 2, 3) ) {
|
||||
final LinkedList<AlignmentStateMachine> group = new LinkedList<AlignmentStateMachine>();
|
||||
for ( int i = 1; i <= 10; i++ ) {
|
||||
group.add(new AlignmentStateMachine(ArtificialSAMUtils.createArtificialReducedRead(header, "foo", 0, alignmentStart, 5, baseCounts)));
|
||||
}
|
||||
groups.add(group);
|
||||
}
|
||||
|
||||
downsampler.submit(groups);
|
||||
downsampler.signalEndOfInput();
|
||||
|
||||
Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0, "wrong number of items discarded by the downsampler");
|
||||
Assert.assertTrue(downsampler.hasFinalizedItems(), "downsampler should have finalized items but doesn't");
|
||||
Assert.assertEquals(downsampler.size(), 30, "downsampler size() reports wrong number of items");
|
||||
|
||||
final Collection<LinkedList<AlignmentStateMachine>> groupsReturned = downsampler.consumeFinalizedItems();
|
||||
|
||||
Assert.assertEquals(groupsReturned.size(), 3, "wrong number of groups returned by the downsampler");
|
||||
|
||||
for ( LinkedList<AlignmentStateMachine> group : groupsReturned ) {
|
||||
Assert.assertEquals(group.size(), 10, "group has wrong size after downsampling");
|
||||
|
||||
for ( AlignmentStateMachine state : group ) {
|
||||
Assert.assertTrue(state.isReducedRead());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -30,6 +30,7 @@ import net.sf.samtools.SAMRecord;
|
|||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
import org.testng.Assert;
|
||||
|
|
@ -125,7 +126,49 @@ public class ReservoirDownsamplerUnitTest extends BaseTest {
|
|||
Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), test.expectedNumDiscardedItems);
|
||||
Assert.assertEquals(test.totalReads - downsampledReads.size(), test.expectedNumDiscardedItems);
|
||||
|
||||
downsampler.reset();
|
||||
downsampler.resetStats();
|
||||
Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDoNotDiscardReducedReads() {
|
||||
GenomeAnalysisEngine.resetRandomGenerator();
|
||||
final ReadsDownsampler<GATKSAMRecord> downsampler = new ReservoirDownsampler<GATKSAMRecord>(1);
|
||||
|
||||
final Collection<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>();
|
||||
final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
|
||||
final int[] baseCounts = { 10, 10, 10, 10, 10 };
|
||||
|
||||
for ( int i = 1; i <= 10; i++ ) {
|
||||
reads.add(ArtificialSAMUtils.createArtificialReducedRead(header, "foo", 0, 1, 5, baseCounts));
|
||||
}
|
||||
for ( int i = 1; i <= 5; i++ ) {
|
||||
reads.add(ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 1, 5));
|
||||
}
|
||||
|
||||
downsampler.submit(reads);
|
||||
downsampler.signalEndOfInput();
|
||||
|
||||
Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 4, "wrong number of items discarded by the downsampler");
|
||||
Assert.assertTrue(downsampler.hasFinalizedItems(), "downsampler should have finalized items but doesn't");
|
||||
Assert.assertEquals(downsampler.size(), 11, "downsampler size() reports wrong number of items");
|
||||
|
||||
final Collection<GATKSAMRecord> readsReturned = downsampler.consumeFinalizedItems();
|
||||
|
||||
Assert.assertEquals(readsReturned.size(), 11, "wrong number of items returned by the downsampler");
|
||||
|
||||
int numReducedReadsReturned = 0;
|
||||
int numNormalReadsReturned = 0;
|
||||
for ( GATKSAMRecord readReturned : readsReturned ) {
|
||||
if ( readReturned.isReducedRead() ) {
|
||||
numReducedReadsReturned++;
|
||||
}
|
||||
else {
|
||||
numNormalReadsReturned++;
|
||||
}
|
||||
}
|
||||
|
||||
Assert.assertEquals(numReducedReadsReturned, 10, "wrong number of reduced reads returned by the downsampler");
|
||||
Assert.assertEquals(numNormalReadsReturned, 1, "wrong number of non-reduced reads returned by the downsampler");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -177,7 +177,7 @@ public class SimplePositionalDownsamplerUnitTest extends BaseTest {
|
|||
Assert.assertEquals(numReadsActuallyEliminated, numReadsReportedEliminated);
|
||||
}
|
||||
|
||||
downsampler.reset();
|
||||
downsampler.resetStats();
|
||||
Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0);
|
||||
}
|
||||
|
||||
|
|
@ -328,4 +328,48 @@ public class SimplePositionalDownsamplerUnitTest extends BaseTest {
|
|||
|
||||
Assert.assertEquals(downsampledReads.size(), 10);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDoNotDiscardReducedReads() {
|
||||
GenomeAnalysisEngine.resetRandomGenerator();
|
||||
final ReadsDownsampler<GATKSAMRecord> downsampler = new SimplePositionalDownsampler<GATKSAMRecord>(1);
|
||||
|
||||
final Collection<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>();
|
||||
final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
|
||||
final int[] baseCounts = { 10, 10, 10, 10, 10 };
|
||||
|
||||
for ( int alignmentStart : Arrays.asList(1, 2, 3) ) {
|
||||
for ( int i = 1; i <= 10; i++ ) {
|
||||
reads.add(ArtificialSAMUtils.createArtificialReducedRead(header, "foo", 0, alignmentStart, 5, baseCounts));
|
||||
}
|
||||
for ( int i = 1; i <= 5; i++ ) {
|
||||
reads.add(ArtificialSAMUtils.createArtificialRead(header, "foo", 0, alignmentStart, 5));
|
||||
}
|
||||
}
|
||||
|
||||
downsampler.submit(reads);
|
||||
downsampler.signalEndOfInput();
|
||||
|
||||
Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 12, "wrong number of items discarded by the downsampler");
|
||||
Assert.assertTrue(downsampler.hasFinalizedItems(), "downsampler should have finalized items but doesn't");
|
||||
Assert.assertEquals(downsampler.size(), 33, "downsampler size() reports wrong number of items");
|
||||
|
||||
final Collection<GATKSAMRecord> readsReturned = downsampler.consumeFinalizedItems();
|
||||
|
||||
Assert.assertEquals(readsReturned.size(), 33, "wrong number of items returned by the downsampler");
|
||||
|
||||
int numReducedReadsReturned = 0;
|
||||
int numNormalReadsReturned = 0;
|
||||
for ( GATKSAMRecord readReturned : readsReturned ) {
|
||||
if ( readReturned.isReducedRead() ) {
|
||||
numReducedReadsReturned++;
|
||||
}
|
||||
else {
|
||||
numNormalReadsReturned++;
|
||||
}
|
||||
}
|
||||
|
||||
Assert.assertEquals(numReducedReadsReturned, 30, "wrong number of reduced reads returned by the downsampler");
|
||||
Assert.assertEquals(numNormalReadsReturned, 3, "wrong number of non-reduced reads returned by the downsampler");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -26,9 +26,11 @@
|
|||
package org.broadinstitute.sting.gatk.traversals;
|
||||
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialBAMBuilder;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
|
|
@ -39,6 +41,7 @@ import java.io.File;
|
|||
import java.io.FileNotFoundException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
|
||||
public class TAROrderedReadCacheUnitTest extends BaseTest {
|
||||
|
|
@ -98,8 +101,53 @@ public class TAROrderedReadCacheUnitTest extends BaseTest {
|
|||
Assert.assertEquals(cache.getNumDiscarded(), 0, "should have reset stats");
|
||||
Assert.assertEquals(cacheReads.size(), nExpectedToKeep, "should have 1 read for every read we expected to keep");
|
||||
|
||||
verifySortednessOfReads(cacheReads);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReadCacheWithReducedReads() {
|
||||
final List<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>();
|
||||
final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
|
||||
final int[] baseCounts = { 10, 10, 10, 10, 10 };
|
||||
|
||||
for ( int i = 1; i <= 100; i++ ) {
|
||||
reads.add(ArtificialSAMUtils.createArtificialReducedRead(header, "foo", 0, i, 5, baseCounts));
|
||||
reads.add(ArtificialSAMUtils.createArtificialRead(header, "foo", 0, i, 5));
|
||||
}
|
||||
|
||||
final TAROrderedReadCache cache = new TAROrderedReadCache(50);
|
||||
|
||||
cache.addAll(reads);
|
||||
|
||||
// Our cache should have kept all of the reduced reads (which are retained unconditionally and do not count
|
||||
// towards the capacity limit), and discarded half of the 100 non-reduced reads due to the cache capacity
|
||||
// limit of 50.
|
||||
Assert.assertEquals(cache.size(), 150, "wrong number of reads in the cache at the end");
|
||||
Assert.assertEquals(cache.getNumDiscarded(), 50, "wrong number of reads discarded from the cache");
|
||||
|
||||
final List<GATKSAMRecord> cacheReads = cache.popCurrentReads();
|
||||
|
||||
int numReducedReadsRetained = 0;
|
||||
int numNormalReadsRetained = 0;
|
||||
|
||||
for ( GATKSAMRecord read : cacheReads ) {
|
||||
if ( read.isReducedRead() ) {
|
||||
numReducedReadsRetained++;
|
||||
}
|
||||
else {
|
||||
numNormalReadsRetained++;
|
||||
}
|
||||
}
|
||||
|
||||
Assert.assertEquals(numReducedReadsRetained, 100, "wrong number of reduced reads retained in the cache");
|
||||
Assert.assertEquals(numNormalReadsRetained, 50, "wrong number of non-reduced reads retained in the cache");
|
||||
|
||||
verifySortednessOfReads(cacheReads);
|
||||
}
|
||||
|
||||
private void verifySortednessOfReads( final List<GATKSAMRecord> reads) {
|
||||
int lastStart = -1;
|
||||
for ( final GATKSAMRecord read : cacheReads ) {
|
||||
for ( GATKSAMRecord read : reads ) {
|
||||
Assert.assertTrue(lastStart <= read.getAlignmentStart(), "Reads should be sorted but weren't. Found read with start " + read.getAlignmentStart() + " while last was " + lastStart);
|
||||
lastStart = read.getAlignmentStart();
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue