Merge branch 'master' of ssh://gsa4.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable
This commit is contained in:
commit
cd2074b1dc
|
|
@ -0,0 +1,76 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2012, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.gatk.downsampling;
|
||||||
|
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
 * Core contract shared by every downsampler; deliberately free of any reads-specific operations.
 *
 * @param <T> the type of item being downsampled
 *
 * @author David Roazen
 */
public interface Downsampler<T> {

    /**
     * Offers a single item to the downsampler for consideration. Depending on the implementation,
     * the fate of the item may be decided immediately, or only once further items have been seen.
     *
     * @param item the item to consider
     */
    void submit( T item );

    /**
     * Offers a whole collection of items to the downsampler for consideration.
     *
     * @param items the items to consider
     */
    void submit( Collection<T> items );

    /**
     * Reports whether any items that survived downsampling are waiting to be retrieved.
     *
     * @return true if at least one surviving item can be consumed
     */
    boolean hasDownsampledItems();

    /**
     * Hands back, and removes from the downsampler, every item that has survived downsampling
     * and is waiting to be retrieved.
     *
     * @return the surviving items
     */
    List<T> consumeDownsampledItems();

    /**
     * Reports whether the downsampler is holding items whose ultimate survival is still undecided.
     *
     * @return true if undecided items are being held
     */
    boolean hasPendingItems();

    /**
     * Informs the downsampler that nothing further will be submitted, so that it can settle the
     * fate of whatever items are still pending.
     */
    void signalEndOfInput();

    /**
     * Returns the downsampler to a pristine state: no pending items, no downsampled items, and no
     * tracked state information.
     */
    void clear();
}
|
||||||
|
|
@ -0,0 +1,98 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2012, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.gatk.downsampling;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
||||||
|
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.NoSuchElementException;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* StingSAMIterator wrapper around our generic reads downsampler interface
|
||||||
|
*
|
||||||
|
* @author David Roazen
|
||||||
|
*/
|
||||||
|
public class DownsamplingReadsIterator implements StingSAMIterator {
|
||||||
|
|
||||||
|
private StingSAMIterator nestedSAMIterator;
|
||||||
|
private ReadsDownsampler<SAMRecord> downsampler;
|
||||||
|
private Collection<SAMRecord> downsampledReadsCache;
|
||||||
|
private Iterator<SAMRecord> downsampledReadsCacheIterator;
|
||||||
|
|
||||||
|
public DownsamplingReadsIterator( StingSAMIterator iter, ReadsDownsampler<SAMRecord> downsampler ) {
|
||||||
|
nestedSAMIterator = iter;
|
||||||
|
this.downsampler = downsampler;
|
||||||
|
fillDownsampledReadsCache();
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean hasNext() {
|
||||||
|
if ( downsampledReadsCacheIterator.hasNext() ) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
else if ( ! nestedSAMIterator.hasNext() || ! fillDownsampledReadsCache() ) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
public SAMRecord next() {
|
||||||
|
if ( ! downsampledReadsCacheIterator.hasNext() && ! fillDownsampledReadsCache() ) {
|
||||||
|
throw new NoSuchElementException("next() called when there are no more items");
|
||||||
|
}
|
||||||
|
|
||||||
|
return downsampledReadsCacheIterator.next();
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean fillDownsampledReadsCache() {
|
||||||
|
while ( nestedSAMIterator.hasNext() && ! downsampler.hasDownsampledItems() ) {
|
||||||
|
downsampler.submit(nestedSAMIterator.next());
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( ! nestedSAMIterator.hasNext() ) {
|
||||||
|
downsampler.signalEndOfInput();
|
||||||
|
}
|
||||||
|
|
||||||
|
downsampledReadsCache = downsampler.consumeDownsampledItems();
|
||||||
|
downsampledReadsCacheIterator = downsampledReadsCache.iterator();
|
||||||
|
|
||||||
|
return downsampledReadsCacheIterator.hasNext();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void remove() {
|
||||||
|
throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void close() {
|
||||||
|
nestedSAMIterator.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
public Iterator<SAMRecord> iterator() {
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,94 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2012, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.gatk.downsampling;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Fractional Downsampler: selects a specified fraction of the reads for inclusion
|
||||||
|
*
|
||||||
|
* @author David Roazen
|
||||||
|
*/
|
||||||
|
public class FractionalDownsampler<T extends SAMRecord> implements ReadsDownsampler<T> {
|
||||||
|
|
||||||
|
private ArrayList<T> selectedReads;
|
||||||
|
|
||||||
|
private int cutoffForInclusion;
|
||||||
|
|
||||||
|
private static final int RANDOM_POOL_SIZE = 10000;
|
||||||
|
|
||||||
|
public FractionalDownsampler( double fraction ) {
|
||||||
|
if ( fraction < 0.0 || fraction > 1.0 ) {
|
||||||
|
throw new ReviewedStingException("Fraction of reads to include must be between 0.0 and 1.0, inclusive");
|
||||||
|
}
|
||||||
|
|
||||||
|
cutoffForInclusion = (int)(fraction * RANDOM_POOL_SIZE);
|
||||||
|
clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void submit( T newRead ) {
|
||||||
|
if ( GenomeAnalysisEngine.getRandomGenerator().nextInt(10000) < cutoffForInclusion ) {
|
||||||
|
selectedReads.add(newRead);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void submit( Collection<T> newReads ) {
|
||||||
|
for ( T read : newReads ) {
|
||||||
|
submit(read);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean hasDownsampledItems() {
|
||||||
|
return selectedReads.size() > 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<T> consumeDownsampledItems() {
|
||||||
|
List<T> downsampledItems = selectedReads;
|
||||||
|
clear();
|
||||||
|
return downsampledItems;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean hasPendingItems() {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void signalEndOfInput() {
|
||||||
|
// NO-OP
|
||||||
|
}
|
||||||
|
|
||||||
|
public void clear() {
|
||||||
|
selectedReads = new ArrayList<T>();
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean requiresCoordinateSortOrder() {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,259 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2012, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.gatk.downsampling;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.utils.MathUtils;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Positional Downsampler: When eliminating reads, try to do so evenly based on the alignment start positions
|
||||||
|
*
|
||||||
|
* @author David Roazen
|
||||||
|
*/
|
||||||
|
public class PositionalDownsampler<T extends SAMRecord> implements ReadsDownsampler<T> {
|
||||||
|
|
||||||
|
private int targetCoverage;
|
||||||
|
|
||||||
|
private ReservoirDownsampler<T> reservoir;
|
||||||
|
|
||||||
|
private int currentContigIndex;
|
||||||
|
|
||||||
|
private int currentAlignmentStart;
|
||||||
|
|
||||||
|
private LinkedList<PositionalReadGrouping> pendingReads;
|
||||||
|
|
||||||
|
private ArrayList<T> finalizedReads;
|
||||||
|
|
||||||
|
public PositionalDownsampler ( int targetCoverage ) {
|
||||||
|
this.targetCoverage = targetCoverage;
|
||||||
|
clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void submit ( T newRead ) {
|
||||||
|
if ( readIsPastCurrentPosition(newRead) ) {
|
||||||
|
updateAndDownsamplePendingReads();
|
||||||
|
}
|
||||||
|
|
||||||
|
reservoir.submit(newRead);
|
||||||
|
updateCurrentPosition(newRead);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void submit ( Collection<T> newReads ) {
|
||||||
|
for ( T read : newReads ) {
|
||||||
|
submit(read);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean hasDownsampledItems() {
|
||||||
|
return finalizedReads.size() > 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<T> consumeDownsampledItems() {
|
||||||
|
List<T> toReturn = finalizedReads;
|
||||||
|
finalizedReads = new ArrayList<T>();
|
||||||
|
return toReturn;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean hasPendingItems() {
|
||||||
|
return pendingReads.size() > 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void signalEndOfInput() {
|
||||||
|
updateAndDownsamplePendingReads();
|
||||||
|
|
||||||
|
for ( PositionalReadGrouping group : pendingReads ) {
|
||||||
|
group.finalizeAllActiveReads();
|
||||||
|
finalizedReads.addAll(group.getFinalizedReads());
|
||||||
|
}
|
||||||
|
|
||||||
|
pendingReads.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void clear() {
|
||||||
|
reservoir = new ReservoirDownsampler<T>(targetCoverage);
|
||||||
|
pendingReads = new LinkedList<PositionalReadGrouping>();
|
||||||
|
finalizedReads = new ArrayList<T>();
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean requiresCoordinateSortOrder() {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void updateCurrentPosition ( T read ) {
|
||||||
|
currentContigIndex = read.getReferenceIndex();
|
||||||
|
currentAlignmentStart = read.getAlignmentStart();
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean readIsPastCurrentPosition ( T read ) {
|
||||||
|
return read.getReferenceIndex() != currentContigIndex || read.getAlignmentStart() > currentAlignmentStart;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void updateAndDownsamplePendingReads() {
|
||||||
|
finalizeOutOfScopeReads();
|
||||||
|
|
||||||
|
List<T> oldLocusReads = reservoir.consumeDownsampledItems();
|
||||||
|
pendingReads.add(new PositionalReadGrouping(oldLocusReads, currentContigIndex, currentAlignmentStart));
|
||||||
|
|
||||||
|
downsampleOverlappingGroups();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void finalizeOutOfScopeReads() {
|
||||||
|
Iterator<PositionalReadGrouping> iter = pendingReads.iterator();
|
||||||
|
boolean noPrecedingUnfinalizedGroups = true;
|
||||||
|
|
||||||
|
while ( iter.hasNext() ) {
|
||||||
|
PositionalReadGrouping currentGroup = iter.next();
|
||||||
|
currentGroup.finalizeActiveReadsBeforePosition(currentContigIndex, currentAlignmentStart);
|
||||||
|
|
||||||
|
if ( currentGroup.isFinalized() && noPrecedingUnfinalizedGroups ) {
|
||||||
|
iter.remove();
|
||||||
|
finalizedReads.addAll(currentGroup.getFinalizedReads());
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
noPrecedingUnfinalizedGroups = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void downsampleOverlappingGroups() {
|
||||||
|
int[] groupReadCounts = new int[pendingReads.size()];
|
||||||
|
int totalCoverage = 0;
|
||||||
|
int numActiveGroups = 0;
|
||||||
|
int currentGroup = 0;
|
||||||
|
|
||||||
|
for ( PositionalReadGrouping group : pendingReads ) {
|
||||||
|
groupReadCounts[currentGroup] = group.numActiveReads();
|
||||||
|
totalCoverage += groupReadCounts[currentGroup];
|
||||||
|
|
||||||
|
if ( groupReadCounts[currentGroup] > 0 ) {
|
||||||
|
numActiveGroups++;
|
||||||
|
}
|
||||||
|
|
||||||
|
currentGroup++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( totalCoverage <= targetCoverage ) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
int numReadsToRemove = Math.min(totalCoverage - targetCoverage, totalCoverage - numActiveGroups);
|
||||||
|
currentGroup = 0;
|
||||||
|
|
||||||
|
while ( numReadsToRemove > 0 ) {
|
||||||
|
if ( groupReadCounts[currentGroup] > 1 ) {
|
||||||
|
groupReadCounts[currentGroup]--;
|
||||||
|
numReadsToRemove--;
|
||||||
|
}
|
||||||
|
|
||||||
|
currentGroup = (currentGroup + 1) % groupReadCounts.length;
|
||||||
|
}
|
||||||
|
|
||||||
|
currentGroup = 0;
|
||||||
|
for ( PositionalReadGrouping group : pendingReads ) {
|
||||||
|
if ( ! group.isFinalized() ) {
|
||||||
|
group.downsampleActiveReads(groupReadCounts[currentGroup]);
|
||||||
|
}
|
||||||
|
currentGroup++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private class PositionalReadGrouping {
|
||||||
|
private List<T> activeReads;
|
||||||
|
private List<T> finalizedReads;
|
||||||
|
|
||||||
|
private int contig;
|
||||||
|
private int alignmentStart;
|
||||||
|
|
||||||
|
public PositionalReadGrouping( Collection<T> reads, int contig, int alignmentStart ) {
|
||||||
|
activeReads = new LinkedList<T>(reads);
|
||||||
|
finalizedReads = new ArrayList<T>();
|
||||||
|
this.contig = contig;
|
||||||
|
this.alignmentStart = alignmentStart;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int numActiveReads() {
|
||||||
|
return activeReads.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isFinalized() {
|
||||||
|
return activeReads.size() == 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<T> getFinalizedReads() {
|
||||||
|
return finalizedReads;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void finalizeActiveReadsBeforePosition( int contig, int position ) {
|
||||||
|
if ( this.contig != contig ) {
|
||||||
|
finalizeAllActiveReads();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
Iterator<T> iter = activeReads.iterator();
|
||||||
|
|
||||||
|
while ( iter.hasNext() ) {
|
||||||
|
T read = iter.next();
|
||||||
|
if ( read.getAlignmentEnd() < position ) {
|
||||||
|
iter.remove();
|
||||||
|
finalizedReads.add(read);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void finalizeAllActiveReads() {
|
||||||
|
finalizedReads.addAll(activeReads);
|
||||||
|
activeReads.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void downsampleActiveReads( int numReadsToKeep ) {
|
||||||
|
if ( numReadsToKeep > activeReads.size() || numReadsToKeep < 0 ) {
|
||||||
|
throw new ReviewedStingException(String.format("Cannot retain %d reads out of %d total reads",
|
||||||
|
numReadsToKeep, activeReads.size()));
|
||||||
|
}
|
||||||
|
|
||||||
|
BitSet itemsToKeep = new BitSet(activeReads.size());
|
||||||
|
for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(activeReads.size(), numReadsToKeep) ) {
|
||||||
|
itemsToKeep.set(selectedIndex);
|
||||||
|
}
|
||||||
|
|
||||||
|
int currentIndex = 0;
|
||||||
|
Iterator<T> iter = activeReads.iterator();
|
||||||
|
|
||||||
|
while ( iter.hasNext() ) {
|
||||||
|
T read = iter.next();
|
||||||
|
|
||||||
|
if ( ! itemsToKeep.get(currentIndex) ) {
|
||||||
|
iter.remove();
|
||||||
|
}
|
||||||
|
|
||||||
|
currentIndex++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,40 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2012, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.gatk.downsampling;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* An extension of the basic downsampler API with reads-specific operations
|
||||||
|
*
|
||||||
|
* @author David Roazen
|
||||||
|
*/
|
||||||
|
public interface ReadsDownsampler<T extends SAMRecord> extends Downsampler<T> {
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Does this downsampler require that reads be fed to it in coordinate order?
|
||||||
|
*/
|
||||||
|
public boolean requiresCoordinateSortOrder();
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,106 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2012, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.gatk.downsampling;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reservoir Downsampler: Selects n reads out of a stream whose size is not known in advance, with
|
||||||
|
* every read in the stream having an equal chance of being selected for inclusion.
|
||||||
|
*
|
||||||
|
* An implementation of "Algorithm R" from the paper "Random Sampling with a Reservoir" (Jeffrey Scott Vitter, 1985)
|
||||||
|
*
|
||||||
|
* @author David Roazen
|
||||||
|
*/
|
||||||
|
public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampler<T> {
|
||||||
|
|
||||||
|
private ArrayList<T> reservoir;
|
||||||
|
|
||||||
|
private int targetSampleSize;
|
||||||
|
|
||||||
|
private int totalReadsSeen;
|
||||||
|
|
||||||
|
public ReservoirDownsampler ( int targetSampleSize ) {
|
||||||
|
if ( targetSampleSize <= 0 ) {
|
||||||
|
throw new ReviewedStingException("Cannot do reservoir downsampling with a sample size <= 0");
|
||||||
|
}
|
||||||
|
|
||||||
|
this.targetSampleSize = targetSampleSize;
|
||||||
|
clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void submit ( T newRead ) {
|
||||||
|
totalReadsSeen++;
|
||||||
|
|
||||||
|
if ( totalReadsSeen <= targetSampleSize ) {
|
||||||
|
reservoir.add(newRead);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
int randomSlot = GenomeAnalysisEngine.getRandomGenerator().nextInt(totalReadsSeen);
|
||||||
|
if ( randomSlot < targetSampleSize ) {
|
||||||
|
reservoir.set(randomSlot, newRead);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void submit ( Collection<T> newReads ) {
|
||||||
|
for ( T read : newReads ) {
|
||||||
|
submit(read);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean hasDownsampledItems() {
|
||||||
|
return reservoir.size() > 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<T> consumeDownsampledItems() {
|
||||||
|
List<T> downsampledItems = reservoir;
|
||||||
|
clear();
|
||||||
|
return downsampledItems;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean hasPendingItems() {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void signalEndOfInput() {
|
||||||
|
// NO-OP
|
||||||
|
}
|
||||||
|
|
||||||
|
public void clear() {
|
||||||
|
reservoir = new ArrayList<T>(targetSampleSize);
|
||||||
|
totalReadsSeen = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean requiresCoordinateSortOrder() {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,73 @@
|
||||||
|
package org.broadinstitute.sting.gatk.downsampling;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMFileHeader;
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
||||||
|
import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter;
|
||||||
|
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||||
|
import org.testng.Assert;
|
||||||
|
import org.testng.annotations.Test;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collection;
|
||||||
|
|
||||||
|
public class DownsamplingReadsIteratorUnitTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testDownsamplingIteratorWithPositionalDownsampling() {
|
||||||
|
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
|
||||||
|
|
||||||
|
Collection<SAMRecord> reads = new ArrayList<SAMRecord>();
|
||||||
|
|
||||||
|
reads.addAll(createStackOfIdenticalReads(3000, header, "foo", 0, 1, 100));
|
||||||
|
reads.addAll(createStackOfIdenticalReads(3000, header, "foo", 0, 50, 100));
|
||||||
|
|
||||||
|
StingSAMIterator iter = new DownsamplingReadsIterator(StingSAMIteratorAdapter.adapt(reads.iterator()), new PositionalDownsampler<SAMRecord>(1000));
|
||||||
|
|
||||||
|
Assert.assertTrue(iter.hasNext());
|
||||||
|
SAMRecord previous = iter.next();
|
||||||
|
int count = 1;
|
||||||
|
|
||||||
|
while ( iter.hasNext() ) {
|
||||||
|
SAMRecord current = iter.next();
|
||||||
|
Assert.assertTrue(previous.getAlignmentStart() <= current.getAlignmentStart() || ! previous.getReferenceIndex().equals(current.getReferenceIndex()));
|
||||||
|
count++;
|
||||||
|
previous = current;
|
||||||
|
}
|
||||||
|
|
||||||
|
Assert.assertEquals(count, 1000);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testDownsamplingIteratorNoEffectiveDownsampling() {
|
||||||
|
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
|
||||||
|
|
||||||
|
Collection<SAMRecord> reads = new ArrayList<SAMRecord>();
|
||||||
|
|
||||||
|
reads.addAll(createStackOfIdenticalReads(300, header, "foo", 0, 1, 100));
|
||||||
|
reads.addAll(createStackOfIdenticalReads(300, header, "foo", 0, 50, 100));
|
||||||
|
|
||||||
|
StingSAMIterator iter = new DownsamplingReadsIterator(StingSAMIteratorAdapter.adapt(reads.iterator()), new PositionalDownsampler<SAMRecord>(1000));
|
||||||
|
|
||||||
|
Assert.assertTrue(iter.hasNext());
|
||||||
|
SAMRecord previous = iter.next();
|
||||||
|
int count = 1;
|
||||||
|
|
||||||
|
while ( iter.hasNext() ) {
|
||||||
|
SAMRecord current = iter.next();
|
||||||
|
Assert.assertTrue(previous.getAlignmentStart() <= current.getAlignmentStart() || ! previous.getReferenceIndex().equals(current.getReferenceIndex()));
|
||||||
|
count++;
|
||||||
|
previous = current;
|
||||||
|
}
|
||||||
|
|
||||||
|
Assert.assertEquals(count, 600);
|
||||||
|
}
|
||||||
|
|
||||||
|
private ArrayList<SAMRecord> createStackOfIdenticalReads( int stackSize, SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) {
|
||||||
|
ArrayList<SAMRecord> stack = new ArrayList<SAMRecord>(stackSize);
|
||||||
|
for ( int i = 1; i <= stackSize; i++ ) {
|
||||||
|
stack.add(ArtificialSAMUtils.createArtificialRead(header, name, refIndex, alignmentStart, length));
|
||||||
|
}
|
||||||
|
return stack;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,65 @@
|
||||||
|
package org.broadinstitute.sting.gatk.downsampling;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMFileHeader;
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||||
|
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||||
|
import org.testng.annotations.Test;
|
||||||
|
import org.testng.Assert;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
public class FractionalDownsamplerUnitTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void test100PercentInclusion() {
|
||||||
|
FractionalDownsampler<SAMRecord> downsampler = new FractionalDownsampler<SAMRecord>(1.0);
|
||||||
|
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
|
||||||
|
|
||||||
|
downsampler.submit(createRandomReads(1000, header, "foo", 0, 100000, 500));
|
||||||
|
downsampler.signalEndOfInput();
|
||||||
|
|
||||||
|
List<SAMRecord> downsampledReads = downsampler.consumeDownsampledItems();
|
||||||
|
|
||||||
|
Assert.assertTrue(downsampledReads.size() == 1000);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void test0PercentInclusion() {
|
||||||
|
FractionalDownsampler<SAMRecord> downsampler = new FractionalDownsampler<SAMRecord>(0.0);
|
||||||
|
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
|
||||||
|
|
||||||
|
downsampler.submit(createRandomReads(1000, header, "foo", 0, 100000, 500));
|
||||||
|
downsampler.signalEndOfInput();
|
||||||
|
|
||||||
|
List<SAMRecord> downsampledReads = downsampler.consumeDownsampledItems();
|
||||||
|
|
||||||
|
Assert.assertTrue(downsampledReads.isEmpty());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void test50PercentInclusion() {
|
||||||
|
FractionalDownsampler<SAMRecord> downsampler = new FractionalDownsampler<SAMRecord>(0.5);
|
||||||
|
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
|
||||||
|
|
||||||
|
downsampler.submit(createRandomReads(5000, header, "foo", 0, 100000, 500));
|
||||||
|
downsampler.signalEndOfInput();
|
||||||
|
|
||||||
|
List<SAMRecord> downsampledReads = downsampler.consumeDownsampledItems();
|
||||||
|
|
||||||
|
Assert.assertTrue(downsampledReads.size() >= 2000 && downsampledReads.size() <= 3000);
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<SAMRecord> createRandomReads( int numReads, SAMFileHeader header, String name, int contigIndex, int maxAlignmentStart, int maxLength ) {
|
||||||
|
List<SAMRecord> reads = new ArrayList<SAMRecord>(numReads);
|
||||||
|
|
||||||
|
for ( int i = 1; i <= numReads; i++ ) {
|
||||||
|
reads.add(ArtificialSAMUtils.createArtificialRead(header, name, contigIndex,
|
||||||
|
GenomeAnalysisEngine.getRandomGenerator().nextInt(maxAlignmentStart) + 1,
|
||||||
|
GenomeAnalysisEngine.getRandomGenerator().nextInt(maxLength) + 1));
|
||||||
|
}
|
||||||
|
|
||||||
|
return reads;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,357 @@
|
||||||
|
package org.broadinstitute.sting.gatk.downsampling;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMFileHeader;
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.BaseTest;
|
||||||
|
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||||
|
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||||
|
import org.testng.annotations.Test;
|
||||||
|
import org.testng.Assert;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
// TODO: generalize these tests so that all possible arrangements of 1-4 stacks can be tested
|
||||||
|
public class PositionalDownsamplerUnitTest extends BaseTest {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* -------
|
||||||
|
* -------
|
||||||
|
* -------
|
||||||
|
* -------
|
||||||
|
* -------
|
||||||
|
* -------
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
public void testThreeOverlappingIdenticalStacks() {
|
||||||
|
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
|
||||||
|
|
||||||
|
PositionalDownsampler<SAMRecord> downsampler = new PositionalDownsampler<SAMRecord>(1000);
|
||||||
|
|
||||||
|
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 100));
|
||||||
|
Assert.assertFalse(downsampler.hasDownsampledItems());
|
||||||
|
Assert.assertTrue(downsampler.hasPendingItems());
|
||||||
|
|
||||||
|
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 25, 100));
|
||||||
|
Assert.assertFalse(downsampler.hasDownsampledItems());
|
||||||
|
Assert.assertTrue(downsampler.hasPendingItems());
|
||||||
|
|
||||||
|
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 50, 100));
|
||||||
|
Assert.assertFalse(downsampler.hasDownsampledItems());
|
||||||
|
Assert.assertTrue(downsampler.hasPendingItems());
|
||||||
|
|
||||||
|
downsampler.signalEndOfInput();
|
||||||
|
Assert.assertTrue(downsampler.hasDownsampledItems());
|
||||||
|
Assert.assertFalse(downsampler.hasPendingItems());
|
||||||
|
|
||||||
|
List<Integer> downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems());
|
||||||
|
|
||||||
|
System.out.println("testThreeOverlappingIdenticalStacks: Downsampled Stack sizes: " + downsampledStackSizes);
|
||||||
|
|
||||||
|
Assert.assertEquals(downsampledStackSizes.size(), 3);
|
||||||
|
Assert.assertTrue(downsampledStackSizes.get(0) <= 1000);
|
||||||
|
Assert.assertTrue(downsampledStackSizes.get(1) <= 1000);
|
||||||
|
Assert.assertTrue(downsampledStackSizes.get(2) <= 1000);
|
||||||
|
Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(1) + downsampledStackSizes.get(2) <= 1000);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* -------
|
||||||
|
* -------
|
||||||
|
* -------
|
||||||
|
* -------
|
||||||
|
* -------
|
||||||
|
* -------
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
public void testThreeNonOverlappingIdenticalStacks() {
|
||||||
|
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
|
||||||
|
|
||||||
|
PositionalDownsampler<SAMRecord> downsampler = new PositionalDownsampler<SAMRecord>(1000);
|
||||||
|
|
||||||
|
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 100));
|
||||||
|
Assert.assertFalse(downsampler.hasDownsampledItems());
|
||||||
|
Assert.assertTrue(downsampler.hasPendingItems());
|
||||||
|
|
||||||
|
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 201, 100));
|
||||||
|
Assert.assertFalse(downsampler.hasDownsampledItems());
|
||||||
|
Assert.assertTrue(downsampler.hasPendingItems());
|
||||||
|
|
||||||
|
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 301, 100));
|
||||||
|
Assert.assertTrue(downsampler.hasDownsampledItems());
|
||||||
|
Assert.assertTrue(downsampler.hasPendingItems());
|
||||||
|
|
||||||
|
downsampler.signalEndOfInput();
|
||||||
|
Assert.assertTrue(downsampler.hasDownsampledItems());
|
||||||
|
Assert.assertFalse(downsampler.hasPendingItems());
|
||||||
|
|
||||||
|
List<Integer> downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems());
|
||||||
|
|
||||||
|
System.out.println("testThreeNonOverlappingIdenticalStacks: Downsampled Stack sizes: " + downsampledStackSizes);
|
||||||
|
|
||||||
|
Assert.assertEquals(downsampledStackSizes.size(), 3);
|
||||||
|
Assert.assertTrue(downsampledStackSizes.get(0) == 1000);
|
||||||
|
Assert.assertTrue(downsampledStackSizes.get(1) == 1000);
|
||||||
|
Assert.assertTrue(downsampledStackSizes.get(2) == 1000);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* ---
|
||||||
|
* ---
|
||||||
|
* -------
|
||||||
|
* -------
|
||||||
|
* -------
|
||||||
|
* -------
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
public void testThreeStacksWithShortStackAtBeginning() {
|
||||||
|
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
|
||||||
|
|
||||||
|
PositionalDownsampler<SAMRecord> downsampler = new PositionalDownsampler<SAMRecord>(1000);
|
||||||
|
|
||||||
|
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 25));
|
||||||
|
Assert.assertFalse(downsampler.hasDownsampledItems());
|
||||||
|
Assert.assertTrue(downsampler.hasPendingItems());
|
||||||
|
|
||||||
|
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 20, 100));
|
||||||
|
Assert.assertFalse(downsampler.hasDownsampledItems());
|
||||||
|
Assert.assertTrue(downsampler.hasPendingItems());
|
||||||
|
|
||||||
|
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 50, 100));
|
||||||
|
Assert.assertFalse(downsampler.hasDownsampledItems());
|
||||||
|
Assert.assertTrue(downsampler.hasPendingItems());
|
||||||
|
|
||||||
|
downsampler.signalEndOfInput();
|
||||||
|
Assert.assertTrue(downsampler.hasDownsampledItems());
|
||||||
|
Assert.assertFalse(downsampler.hasPendingItems());
|
||||||
|
|
||||||
|
List<Integer> downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems());
|
||||||
|
|
||||||
|
System.out.println("testThreeStacksWithShortStackAtBeginning: Downsampled Stack sizes: " + downsampledStackSizes);
|
||||||
|
|
||||||
|
Assert.assertEquals(downsampledStackSizes.size(), 3);
|
||||||
|
Assert.assertTrue(downsampledStackSizes.get(0) <= 1000);
|
||||||
|
Assert.assertTrue(downsampledStackSizes.get(1) <= 1000);
|
||||||
|
Assert.assertTrue(downsampledStackSizes.get(2) <= 1000);
|
||||||
|
Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(1) <= 1000);
|
||||||
|
Assert.assertTrue(downsampledStackSizes.get(1) + downsampledStackSizes.get(2) <= 1000);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* -------
|
||||||
|
* -------
|
||||||
|
* ---
|
||||||
|
* ---
|
||||||
|
* -------
|
||||||
|
* -------
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
public void testThreeStacksWithShortStackInMiddle() {
|
||||||
|
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
|
||||||
|
|
||||||
|
PositionalDownsampler<SAMRecord> downsampler = new PositionalDownsampler<SAMRecord>(1000);
|
||||||
|
|
||||||
|
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 100));
|
||||||
|
Assert.assertFalse(downsampler.hasDownsampledItems());
|
||||||
|
Assert.assertTrue(downsampler.hasPendingItems());
|
||||||
|
|
||||||
|
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 25, 25));
|
||||||
|
Assert.assertFalse(downsampler.hasDownsampledItems());
|
||||||
|
Assert.assertTrue(downsampler.hasPendingItems());
|
||||||
|
|
||||||
|
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 75, 100));
|
||||||
|
Assert.assertFalse(downsampler.hasDownsampledItems());
|
||||||
|
Assert.assertTrue(downsampler.hasPendingItems());
|
||||||
|
|
||||||
|
downsampler.signalEndOfInput();
|
||||||
|
Assert.assertTrue(downsampler.hasDownsampledItems());
|
||||||
|
Assert.assertFalse(downsampler.hasPendingItems());
|
||||||
|
|
||||||
|
List<Integer> downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems());
|
||||||
|
|
||||||
|
System.out.println("testThreeStacksWithShortStackInMiddle: Downsampled Stack sizes: " + downsampledStackSizes);
|
||||||
|
|
||||||
|
Assert.assertEquals(downsampledStackSizes.size(), 3);
|
||||||
|
Assert.assertTrue(downsampledStackSizes.get(0) <= 1000);
|
||||||
|
Assert.assertTrue(downsampledStackSizes.get(1) <= 1000);
|
||||||
|
Assert.assertTrue(downsampledStackSizes.get(2) <= 1000);
|
||||||
|
Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(1) <= 1000);
|
||||||
|
Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(2) <= 1000);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* ------
|
||||||
|
* ------
|
||||||
|
* -------
|
||||||
|
* -------
|
||||||
|
* ---
|
||||||
|
* ---
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
public void testThreeStacksWithShortStackAtEnd() {
|
||||||
|
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
|
||||||
|
|
||||||
|
PositionalDownsampler<SAMRecord> downsampler = new PositionalDownsampler<SAMRecord>(1000);
|
||||||
|
|
||||||
|
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 100));
|
||||||
|
Assert.assertFalse(downsampler.hasDownsampledItems());
|
||||||
|
Assert.assertTrue(downsampler.hasPendingItems());
|
||||||
|
|
||||||
|
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 50, 100));
|
||||||
|
Assert.assertFalse(downsampler.hasDownsampledItems());
|
||||||
|
Assert.assertTrue(downsampler.hasPendingItems());
|
||||||
|
|
||||||
|
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 135, 25));
|
||||||
|
Assert.assertFalse(downsampler.hasDownsampledItems());
|
||||||
|
Assert.assertTrue(downsampler.hasPendingItems());
|
||||||
|
|
||||||
|
downsampler.signalEndOfInput();
|
||||||
|
Assert.assertTrue(downsampler.hasDownsampledItems());
|
||||||
|
Assert.assertFalse(downsampler.hasPendingItems());
|
||||||
|
|
||||||
|
List<Integer> downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems());
|
||||||
|
|
||||||
|
System.out.println("testThreeStacksWithShortStackAtEnd: Downsampled Stack sizes: " + downsampledStackSizes);
|
||||||
|
|
||||||
|
Assert.assertEquals(downsampledStackSizes.size(), 3);
|
||||||
|
Assert.assertTrue(downsampledStackSizes.get(0) <= 1000);
|
||||||
|
Assert.assertTrue(downsampledStackSizes.get(1) <= 1000);
|
||||||
|
Assert.assertTrue(downsampledStackSizes.get(2) <= 1000);
|
||||||
|
Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(1) <= 1000);
|
||||||
|
Assert.assertTrue(downsampledStackSizes.get(1) + downsampledStackSizes.get(2) <= 1000);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Two mixed stacks (starts 1 and 75), each half 100-base and half 50-base reads, followed
 * by a stack of identical 100-base reads (start 150), 2000 reads each
 * (schematic, not to scale):
 *
 * -------
 * ----
 *      -------
 *      ----
 *           -------
 *           -------
 *
 * Nothing is expected to be finalized until end of input is signalled. Afterwards three
 * stacks must remain, each at most 1000 reads.
 */
@Test
public void testThreePartiallyOverlappingStacks() {
    final SAMFileHeader samHeader = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
    final PositionalDownsampler<SAMRecord> positionalDownsampler = new PositionalDownsampler<SAMRecord>(1000);

    // The first two stacks mix read lengths of 100 and 50
    final int[] mixedStackStarts = { 1, 75 };
    for ( int stackStart : mixedStackStarts ) {
        positionalDownsampler.submit(createStackOfVaryingReads(2000, samHeader, "foo", 0, stackStart, 100, 50));
        Assert.assertFalse(positionalDownsampler.hasDownsampledItems());
        Assert.assertTrue(positionalDownsampler.hasPendingItems());
    }

    // The last stack is uniform
    positionalDownsampler.submit(createStackOfIdenticalReads(2000, samHeader, "foo", 0, 150, 100));
    Assert.assertFalse(positionalDownsampler.hasDownsampledItems());
    Assert.assertTrue(positionalDownsampler.hasPendingItems());

    positionalDownsampler.signalEndOfInput();
    Assert.assertTrue(positionalDownsampler.hasDownsampledItems());
    Assert.assertFalse(positionalDownsampler.hasPendingItems());

    final List<Integer> sizes = getDownsampledStackSizesAndVerifySortedness(positionalDownsampler.consumeDownsampledItems());

    System.out.println("testThreePartiallyOverlappingStacks: Downsampled Stack sizes: " + sizes);

    Assert.assertEquals(sizes.size(), 3);
    for ( int size : sizes ) {
        Assert.assertTrue(size <= 1000);
    }

    // TODO: need to examine per-base coverage here
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testNoDownsamplingRequired() {
|
||||||
|
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
|
||||||
|
|
||||||
|
PositionalDownsampler<SAMRecord> downsampler = new PositionalDownsampler<SAMRecord>(1000);
|
||||||
|
|
||||||
|
downsampler.submit(createStackOfIdenticalReads(300, header, "foo", 0, 1, 100));
|
||||||
|
Assert.assertFalse(downsampler.hasDownsampledItems());
|
||||||
|
Assert.assertTrue(downsampler.hasPendingItems());
|
||||||
|
|
||||||
|
downsampler.submit(createStackOfIdenticalReads(300, header, "foo", 0, 25, 100));
|
||||||
|
Assert.assertFalse(downsampler.hasDownsampledItems());
|
||||||
|
Assert.assertTrue(downsampler.hasPendingItems());
|
||||||
|
|
||||||
|
downsampler.submit(createStackOfIdenticalReads(300, header, "foo", 0, 50, 100));
|
||||||
|
Assert.assertFalse(downsampler.hasDownsampledItems());
|
||||||
|
Assert.assertTrue(downsampler.hasPendingItems());
|
||||||
|
|
||||||
|
downsampler.signalEndOfInput();
|
||||||
|
Assert.assertTrue(downsampler.hasDownsampledItems());
|
||||||
|
Assert.assertFalse(downsampler.hasPendingItems());
|
||||||
|
|
||||||
|
List<Integer> downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems());
|
||||||
|
|
||||||
|
System.out.println("testNoDownsamplingRequired: Downsampled Stack sizes: " + downsampledStackSizes);
|
||||||
|
|
||||||
|
Assert.assertEquals(downsampledStackSizes.size(), 3);
|
||||||
|
Assert.assertTrue(downsampledStackSizes.get(0) == 300);
|
||||||
|
Assert.assertTrue(downsampledStackSizes.get(1) == 300);
|
||||||
|
Assert.assertTrue(downsampledStackSizes.get(2) == 300);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGATKSAMRecordSupport() {
|
||||||
|
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
|
||||||
|
PositionalDownsampler<GATKSAMRecord> downsampler = new PositionalDownsampler<GATKSAMRecord>(1000);
|
||||||
|
|
||||||
|
List<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>();
|
||||||
|
for ( int i = 0; i < 10; i++ ) {
|
||||||
|
reads.add(ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 10, 20 * i + 10));
|
||||||
|
}
|
||||||
|
|
||||||
|
downsampler.submit(reads);
|
||||||
|
downsampler.signalEndOfInput();
|
||||||
|
List<GATKSAMRecord> downsampledReads = downsampler.consumeDownsampledItems();
|
||||||
|
|
||||||
|
Assert.assertTrue(downsampledReads.size() == 10);
|
||||||
|
}
|
||||||
|
|
||||||
|
private ArrayList<SAMRecord> createStackOfIdenticalReads( int stackSize, SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) {
|
||||||
|
ArrayList<SAMRecord> stack = new ArrayList<SAMRecord>(stackSize);
|
||||||
|
for ( int i = 1; i <= stackSize; i++ ) {
|
||||||
|
stack.add(ArtificialSAMUtils.createArtificialRead(header, name, refIndex, alignmentStart, length));
|
||||||
|
}
|
||||||
|
return stack;
|
||||||
|
}
|
||||||
|
|
||||||
|
private ArrayList<SAMRecord> createStackOfVaryingReads( int stackSize, SAMFileHeader header, String name, int refIndex, int alignmentStart, int firstLength, int secondLength ) {
|
||||||
|
ArrayList<SAMRecord> stack = createStackOfIdenticalReads(stackSize / 2, header, name, refIndex, alignmentStart, firstLength);
|
||||||
|
stack.addAll(createStackOfIdenticalReads(stackSize / 2, header, name, refIndex, alignmentStart, secondLength));
|
||||||
|
return stack;
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<Integer> getDownsampledStackSizesAndVerifySortedness( List<SAMRecord> downsampledReads ) {
|
||||||
|
List<Integer> stackSizes = new ArrayList<Integer>();
|
||||||
|
Iterator<SAMRecord> iter = downsampledReads.iterator();
|
||||||
|
Assert.assertTrue(iter.hasNext());
|
||||||
|
|
||||||
|
SAMRecord previousRead = iter.next();
|
||||||
|
int currentStackSize = 1;
|
||||||
|
|
||||||
|
while ( iter.hasNext() ) {
|
||||||
|
SAMRecord currentRead = iter.next();
|
||||||
|
|
||||||
|
if ( ! currentRead.getReferenceIndex().equals(previousRead.getReferenceIndex()) || currentRead.getAlignmentStart() > previousRead.getAlignmentStart() ) {
|
||||||
|
stackSizes.add(currentStackSize);
|
||||||
|
currentStackSize = 1;
|
||||||
|
}
|
||||||
|
else if ( currentRead.getAlignmentStart() < previousRead.getAlignmentStart() ) {
|
||||||
|
Assert.fail(String.format("Reads are out of order: %s %s", previousRead, currentRead));
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
currentStackSize++;
|
||||||
|
}
|
||||||
|
|
||||||
|
previousRead = currentRead;
|
||||||
|
}
|
||||||
|
|
||||||
|
stackSizes.add(currentStackSize);
|
||||||
|
return stackSizes;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Loading…
Reference in New Issue