Merge branch 'master' of ssh://gsa4.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable
This commit is contained in:
commit
cd2074b1dc
|
|
@ -0,0 +1,76 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* The basic downsampler API, with no reads-specific operations
|
||||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public interface Downsampler<T> {
|
||||
|
||||
/*
|
||||
* Submit one item to the downsampler for consideration . Some downsamplers will be able to determine
|
||||
* immediately whether the item survives the downsampling process, while others will need to see
|
||||
* more items before making that determination.
|
||||
*/
|
||||
public void submit( T item );
|
||||
|
||||
/*
|
||||
* Submit a collection of items to the downsampler for consideration.
|
||||
*/
|
||||
public void submit( Collection<T> items );
|
||||
|
||||
/*
|
||||
* Are there items that have survived the downsampling process waiting to be retrieved?
|
||||
*/
|
||||
public boolean hasDownsampledItems();
|
||||
|
||||
/*
|
||||
* Return (and remove) all items that have survived downsampling and are waiting to be retrieved.
|
||||
*/
|
||||
public List<T> consumeDownsampledItems();
|
||||
|
||||
/*
|
||||
* Are there items stored in this downsampler that it doesn't yet know whether they will
|
||||
* ultimately survive the downsampling process?
|
||||
*/
|
||||
public boolean hasPendingItems();
|
||||
|
||||
/*
|
||||
* Used to tell the downsampler that no more items will be submitted to it, and that it should
|
||||
* finalize any pending items.
|
||||
*/
|
||||
public void signalEndOfInput();
|
||||
|
||||
/*
|
||||
* Reset the downsampler to a clean state, devoid of any pending/downsampled items or tracked state
|
||||
* information.
|
||||
*/
|
||||
public void clear();
|
||||
}
|
||||
|
|
@ -0,0 +1,98 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.Iterator;
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
|
||||
/**
|
||||
* StingSAMIterator wrapper around our generic reads downsampler interface
|
||||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public class DownsamplingReadsIterator implements StingSAMIterator {
|
||||
|
||||
private StingSAMIterator nestedSAMIterator;
|
||||
private ReadsDownsampler<SAMRecord> downsampler;
|
||||
private Collection<SAMRecord> downsampledReadsCache;
|
||||
private Iterator<SAMRecord> downsampledReadsCacheIterator;
|
||||
|
||||
public DownsamplingReadsIterator( StingSAMIterator iter, ReadsDownsampler<SAMRecord> downsampler ) {
|
||||
nestedSAMIterator = iter;
|
||||
this.downsampler = downsampler;
|
||||
fillDownsampledReadsCache();
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
if ( downsampledReadsCacheIterator.hasNext() ) {
|
||||
return true;
|
||||
}
|
||||
else if ( ! nestedSAMIterator.hasNext() || ! fillDownsampledReadsCache() ) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
public SAMRecord next() {
|
||||
if ( ! downsampledReadsCacheIterator.hasNext() && ! fillDownsampledReadsCache() ) {
|
||||
throw new NoSuchElementException("next() called when there are no more items");
|
||||
}
|
||||
|
||||
return downsampledReadsCacheIterator.next();
|
||||
}
|
||||
|
||||
private boolean fillDownsampledReadsCache() {
|
||||
while ( nestedSAMIterator.hasNext() && ! downsampler.hasDownsampledItems() ) {
|
||||
downsampler.submit(nestedSAMIterator.next());
|
||||
}
|
||||
|
||||
if ( ! nestedSAMIterator.hasNext() ) {
|
||||
downsampler.signalEndOfInput();
|
||||
}
|
||||
|
||||
downsampledReadsCache = downsampler.consumeDownsampledItems();
|
||||
downsampledReadsCacheIterator = downsampledReadsCache.iterator();
|
||||
|
||||
return downsampledReadsCacheIterator.hasNext();
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!");
|
||||
}
|
||||
|
||||
public void close() {
|
||||
nestedSAMIterator.close();
|
||||
}
|
||||
|
||||
public Iterator<SAMRecord> iterator() {
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,94 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Fractional Downsampler: selects a specified fraction of the reads for inclusion
|
||||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public class FractionalDownsampler<T extends SAMRecord> implements ReadsDownsampler<T> {
|
||||
|
||||
private ArrayList<T> selectedReads;
|
||||
|
||||
private int cutoffForInclusion;
|
||||
|
||||
private static final int RANDOM_POOL_SIZE = 10000;
|
||||
|
||||
public FractionalDownsampler( double fraction ) {
|
||||
if ( fraction < 0.0 || fraction > 1.0 ) {
|
||||
throw new ReviewedStingException("Fraction of reads to include must be between 0.0 and 1.0, inclusive");
|
||||
}
|
||||
|
||||
cutoffForInclusion = (int)(fraction * RANDOM_POOL_SIZE);
|
||||
clear();
|
||||
}
|
||||
|
||||
public void submit( T newRead ) {
|
||||
if ( GenomeAnalysisEngine.getRandomGenerator().nextInt(10000) < cutoffForInclusion ) {
|
||||
selectedReads.add(newRead);
|
||||
}
|
||||
}
|
||||
|
||||
public void submit( Collection<T> newReads ) {
|
||||
for ( T read : newReads ) {
|
||||
submit(read);
|
||||
}
|
||||
}
|
||||
|
||||
public boolean hasDownsampledItems() {
|
||||
return selectedReads.size() > 0;
|
||||
}
|
||||
|
||||
public List<T> consumeDownsampledItems() {
|
||||
List<T> downsampledItems = selectedReads;
|
||||
clear();
|
||||
return downsampledItems;
|
||||
}
|
||||
|
||||
public boolean hasPendingItems() {
|
||||
return false;
|
||||
}
|
||||
|
||||
public void signalEndOfInput() {
|
||||
// NO-OP
|
||||
}
|
||||
|
||||
public void clear() {
|
||||
selectedReads = new ArrayList<T>();
|
||||
}
|
||||
|
||||
public boolean requiresCoordinateSortOrder() {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,259 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Positional Downsampler: When eliminating reads, try to do so evenly based on the alignment start positions
|
||||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public class PositionalDownsampler<T extends SAMRecord> implements ReadsDownsampler<T> {
|
||||
|
||||
private int targetCoverage;
|
||||
|
||||
private ReservoirDownsampler<T> reservoir;
|
||||
|
||||
private int currentContigIndex;
|
||||
|
||||
private int currentAlignmentStart;
|
||||
|
||||
private LinkedList<PositionalReadGrouping> pendingReads;
|
||||
|
||||
private ArrayList<T> finalizedReads;
|
||||
|
||||
public PositionalDownsampler ( int targetCoverage ) {
|
||||
this.targetCoverage = targetCoverage;
|
||||
clear();
|
||||
}
|
||||
|
||||
public void submit ( T newRead ) {
|
||||
if ( readIsPastCurrentPosition(newRead) ) {
|
||||
updateAndDownsamplePendingReads();
|
||||
}
|
||||
|
||||
reservoir.submit(newRead);
|
||||
updateCurrentPosition(newRead);
|
||||
}
|
||||
|
||||
public void submit ( Collection<T> newReads ) {
|
||||
for ( T read : newReads ) {
|
||||
submit(read);
|
||||
}
|
||||
}
|
||||
|
||||
public boolean hasDownsampledItems() {
|
||||
return finalizedReads.size() > 0;
|
||||
}
|
||||
|
||||
public List<T> consumeDownsampledItems() {
|
||||
List<T> toReturn = finalizedReads;
|
||||
finalizedReads = new ArrayList<T>();
|
||||
return toReturn;
|
||||
}
|
||||
|
||||
public boolean hasPendingItems() {
|
||||
return pendingReads.size() > 0;
|
||||
}
|
||||
|
||||
public void signalEndOfInput() {
|
||||
updateAndDownsamplePendingReads();
|
||||
|
||||
for ( PositionalReadGrouping group : pendingReads ) {
|
||||
group.finalizeAllActiveReads();
|
||||
finalizedReads.addAll(group.getFinalizedReads());
|
||||
}
|
||||
|
||||
pendingReads.clear();
|
||||
}
|
||||
|
||||
public void clear() {
|
||||
reservoir = new ReservoirDownsampler<T>(targetCoverage);
|
||||
pendingReads = new LinkedList<PositionalReadGrouping>();
|
||||
finalizedReads = new ArrayList<T>();
|
||||
}
|
||||
|
||||
public boolean requiresCoordinateSortOrder() {
|
||||
return true;
|
||||
}
|
||||
|
||||
private void updateCurrentPosition ( T read ) {
|
||||
currentContigIndex = read.getReferenceIndex();
|
||||
currentAlignmentStart = read.getAlignmentStart();
|
||||
}
|
||||
|
||||
private boolean readIsPastCurrentPosition ( T read ) {
|
||||
return read.getReferenceIndex() != currentContigIndex || read.getAlignmentStart() > currentAlignmentStart;
|
||||
}
|
||||
|
||||
private void updateAndDownsamplePendingReads() {
|
||||
finalizeOutOfScopeReads();
|
||||
|
||||
List<T> oldLocusReads = reservoir.consumeDownsampledItems();
|
||||
pendingReads.add(new PositionalReadGrouping(oldLocusReads, currentContigIndex, currentAlignmentStart));
|
||||
|
||||
downsampleOverlappingGroups();
|
||||
}
|
||||
|
||||
private void finalizeOutOfScopeReads() {
|
||||
Iterator<PositionalReadGrouping> iter = pendingReads.iterator();
|
||||
boolean noPrecedingUnfinalizedGroups = true;
|
||||
|
||||
while ( iter.hasNext() ) {
|
||||
PositionalReadGrouping currentGroup = iter.next();
|
||||
currentGroup.finalizeActiveReadsBeforePosition(currentContigIndex, currentAlignmentStart);
|
||||
|
||||
if ( currentGroup.isFinalized() && noPrecedingUnfinalizedGroups ) {
|
||||
iter.remove();
|
||||
finalizedReads.addAll(currentGroup.getFinalizedReads());
|
||||
}
|
||||
else {
|
||||
noPrecedingUnfinalizedGroups = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void downsampleOverlappingGroups() {
|
||||
int[] groupReadCounts = new int[pendingReads.size()];
|
||||
int totalCoverage = 0;
|
||||
int numActiveGroups = 0;
|
||||
int currentGroup = 0;
|
||||
|
||||
for ( PositionalReadGrouping group : pendingReads ) {
|
||||
groupReadCounts[currentGroup] = group.numActiveReads();
|
||||
totalCoverage += groupReadCounts[currentGroup];
|
||||
|
||||
if ( groupReadCounts[currentGroup] > 0 ) {
|
||||
numActiveGroups++;
|
||||
}
|
||||
|
||||
currentGroup++;
|
||||
}
|
||||
|
||||
if ( totalCoverage <= targetCoverage ) {
|
||||
return;
|
||||
}
|
||||
|
||||
int numReadsToRemove = Math.min(totalCoverage - targetCoverage, totalCoverage - numActiveGroups);
|
||||
currentGroup = 0;
|
||||
|
||||
while ( numReadsToRemove > 0 ) {
|
||||
if ( groupReadCounts[currentGroup] > 1 ) {
|
||||
groupReadCounts[currentGroup]--;
|
||||
numReadsToRemove--;
|
||||
}
|
||||
|
||||
currentGroup = (currentGroup + 1) % groupReadCounts.length;
|
||||
}
|
||||
|
||||
currentGroup = 0;
|
||||
for ( PositionalReadGrouping group : pendingReads ) {
|
||||
if ( ! group.isFinalized() ) {
|
||||
group.downsampleActiveReads(groupReadCounts[currentGroup]);
|
||||
}
|
||||
currentGroup++;
|
||||
}
|
||||
}
|
||||
|
||||
private class PositionalReadGrouping {
|
||||
private List<T> activeReads;
|
||||
private List<T> finalizedReads;
|
||||
|
||||
private int contig;
|
||||
private int alignmentStart;
|
||||
|
||||
public PositionalReadGrouping( Collection<T> reads, int contig, int alignmentStart ) {
|
||||
activeReads = new LinkedList<T>(reads);
|
||||
finalizedReads = new ArrayList<T>();
|
||||
this.contig = contig;
|
||||
this.alignmentStart = alignmentStart;
|
||||
}
|
||||
|
||||
public int numActiveReads() {
|
||||
return activeReads.size();
|
||||
}
|
||||
|
||||
public boolean isFinalized() {
|
||||
return activeReads.size() == 0;
|
||||
}
|
||||
|
||||
public List<T> getFinalizedReads() {
|
||||
return finalizedReads;
|
||||
}
|
||||
|
||||
public void finalizeActiveReadsBeforePosition( int contig, int position ) {
|
||||
if ( this.contig != contig ) {
|
||||
finalizeAllActiveReads();
|
||||
return;
|
||||
}
|
||||
|
||||
Iterator<T> iter = activeReads.iterator();
|
||||
|
||||
while ( iter.hasNext() ) {
|
||||
T read = iter.next();
|
||||
if ( read.getAlignmentEnd() < position ) {
|
||||
iter.remove();
|
||||
finalizedReads.add(read);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void finalizeAllActiveReads() {
|
||||
finalizedReads.addAll(activeReads);
|
||||
activeReads.clear();
|
||||
}
|
||||
|
||||
public void downsampleActiveReads( int numReadsToKeep ) {
|
||||
if ( numReadsToKeep > activeReads.size() || numReadsToKeep < 0 ) {
|
||||
throw new ReviewedStingException(String.format("Cannot retain %d reads out of %d total reads",
|
||||
numReadsToKeep, activeReads.size()));
|
||||
}
|
||||
|
||||
BitSet itemsToKeep = new BitSet(activeReads.size());
|
||||
for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(activeReads.size(), numReadsToKeep) ) {
|
||||
itemsToKeep.set(selectedIndex);
|
||||
}
|
||||
|
||||
int currentIndex = 0;
|
||||
Iterator<T> iter = activeReads.iterator();
|
||||
|
||||
while ( iter.hasNext() ) {
|
||||
T read = iter.next();
|
||||
|
||||
if ( ! itemsToKeep.get(currentIndex) ) {
|
||||
iter.remove();
|
||||
}
|
||||
|
||||
currentIndex++;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,40 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
/**
|
||||
* An extension of the basic downsampler API with reads-specific operations
|
||||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public interface ReadsDownsampler<T extends SAMRecord> extends Downsampler<T> {
|
||||
|
||||
/*
|
||||
* Does this downsampler require that reads be fed to it in coordinate order?
|
||||
*/
|
||||
public boolean requiresCoordinateSortOrder();
|
||||
}
|
||||
|
|
@ -0,0 +1,106 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Reservoir Downsampler: Selects n reads out of a stream whose size is not known in advance, with
|
||||
* every read in the stream having an equal chance of being selected for inclusion.
|
||||
*
|
||||
* An implementation of "Algorithm R" from the paper "Random Sampling with a Reservoir" (Jeffrey Scott Vitter, 1985)
|
||||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampler<T> {
|
||||
|
||||
private ArrayList<T> reservoir;
|
||||
|
||||
private int targetSampleSize;
|
||||
|
||||
private int totalReadsSeen;
|
||||
|
||||
public ReservoirDownsampler ( int targetSampleSize ) {
|
||||
if ( targetSampleSize <= 0 ) {
|
||||
throw new ReviewedStingException("Cannot do reservoir downsampling with a sample size <= 0");
|
||||
}
|
||||
|
||||
this.targetSampleSize = targetSampleSize;
|
||||
clear();
|
||||
}
|
||||
|
||||
public void submit ( T newRead ) {
|
||||
totalReadsSeen++;
|
||||
|
||||
if ( totalReadsSeen <= targetSampleSize ) {
|
||||
reservoir.add(newRead);
|
||||
}
|
||||
else {
|
||||
int randomSlot = GenomeAnalysisEngine.getRandomGenerator().nextInt(totalReadsSeen);
|
||||
if ( randomSlot < targetSampleSize ) {
|
||||
reservoir.set(randomSlot, newRead);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void submit ( Collection<T> newReads ) {
|
||||
for ( T read : newReads ) {
|
||||
submit(read);
|
||||
}
|
||||
}
|
||||
|
||||
public boolean hasDownsampledItems() {
|
||||
return reservoir.size() > 0;
|
||||
}
|
||||
|
||||
public List<T> consumeDownsampledItems() {
|
||||
List<T> downsampledItems = reservoir;
|
||||
clear();
|
||||
return downsampledItems;
|
||||
}
|
||||
|
||||
public boolean hasPendingItems() {
|
||||
return false;
|
||||
}
|
||||
|
||||
public void signalEndOfInput() {
|
||||
// NO-OP
|
||||
}
|
||||
|
||||
public void clear() {
|
||||
reservoir = new ArrayList<T>(targetSampleSize);
|
||||
totalReadsSeen = 0;
|
||||
}
|
||||
|
||||
public boolean requiresCoordinateSortOrder() {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,73 @@
|
|||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
||||
import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
|
||||
public class DownsamplingReadsIteratorUnitTest {
|
||||
|
||||
@Test
|
||||
public void testDownsamplingIteratorWithPositionalDownsampling() {
|
||||
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
|
||||
|
||||
Collection<SAMRecord> reads = new ArrayList<SAMRecord>();
|
||||
|
||||
reads.addAll(createStackOfIdenticalReads(3000, header, "foo", 0, 1, 100));
|
||||
reads.addAll(createStackOfIdenticalReads(3000, header, "foo", 0, 50, 100));
|
||||
|
||||
StingSAMIterator iter = new DownsamplingReadsIterator(StingSAMIteratorAdapter.adapt(reads.iterator()), new PositionalDownsampler<SAMRecord>(1000));
|
||||
|
||||
Assert.assertTrue(iter.hasNext());
|
||||
SAMRecord previous = iter.next();
|
||||
int count = 1;
|
||||
|
||||
while ( iter.hasNext() ) {
|
||||
SAMRecord current = iter.next();
|
||||
Assert.assertTrue(previous.getAlignmentStart() <= current.getAlignmentStart() || ! previous.getReferenceIndex().equals(current.getReferenceIndex()));
|
||||
count++;
|
||||
previous = current;
|
||||
}
|
||||
|
||||
Assert.assertEquals(count, 1000);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDownsamplingIteratorNoEffectiveDownsampling() {
|
||||
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
|
||||
|
||||
Collection<SAMRecord> reads = new ArrayList<SAMRecord>();
|
||||
|
||||
reads.addAll(createStackOfIdenticalReads(300, header, "foo", 0, 1, 100));
|
||||
reads.addAll(createStackOfIdenticalReads(300, header, "foo", 0, 50, 100));
|
||||
|
||||
StingSAMIterator iter = new DownsamplingReadsIterator(StingSAMIteratorAdapter.adapt(reads.iterator()), new PositionalDownsampler<SAMRecord>(1000));
|
||||
|
||||
Assert.assertTrue(iter.hasNext());
|
||||
SAMRecord previous = iter.next();
|
||||
int count = 1;
|
||||
|
||||
while ( iter.hasNext() ) {
|
||||
SAMRecord current = iter.next();
|
||||
Assert.assertTrue(previous.getAlignmentStart() <= current.getAlignmentStart() || ! previous.getReferenceIndex().equals(current.getReferenceIndex()));
|
||||
count++;
|
||||
previous = current;
|
||||
}
|
||||
|
||||
Assert.assertEquals(count, 600);
|
||||
}
|
||||
|
||||
private ArrayList<SAMRecord> createStackOfIdenticalReads( int stackSize, SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) {
|
||||
ArrayList<SAMRecord> stack = new ArrayList<SAMRecord>(stackSize);
|
||||
for ( int i = 1; i <= stackSize; i++ ) {
|
||||
stack.add(ArtificialSAMUtils.createArtificialRead(header, name, refIndex, alignmentStart, length));
|
||||
}
|
||||
return stack;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,65 @@
|
|||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.testng.annotations.Test;
|
||||
import org.testng.Assert;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
public class FractionalDownsamplerUnitTest {
|
||||
|
||||
@Test
|
||||
public void test100PercentInclusion() {
|
||||
FractionalDownsampler<SAMRecord> downsampler = new FractionalDownsampler<SAMRecord>(1.0);
|
||||
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
|
||||
|
||||
downsampler.submit(createRandomReads(1000, header, "foo", 0, 100000, 500));
|
||||
downsampler.signalEndOfInput();
|
||||
|
||||
List<SAMRecord> downsampledReads = downsampler.consumeDownsampledItems();
|
||||
|
||||
Assert.assertTrue(downsampledReads.size() == 1000);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test0PercentInclusion() {
|
||||
FractionalDownsampler<SAMRecord> downsampler = new FractionalDownsampler<SAMRecord>(0.0);
|
||||
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
|
||||
|
||||
downsampler.submit(createRandomReads(1000, header, "foo", 0, 100000, 500));
|
||||
downsampler.signalEndOfInput();
|
||||
|
||||
List<SAMRecord> downsampledReads = downsampler.consumeDownsampledItems();
|
||||
|
||||
Assert.assertTrue(downsampledReads.isEmpty());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test50PercentInclusion() {
|
||||
FractionalDownsampler<SAMRecord> downsampler = new FractionalDownsampler<SAMRecord>(0.5);
|
||||
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
|
||||
|
||||
downsampler.submit(createRandomReads(5000, header, "foo", 0, 100000, 500));
|
||||
downsampler.signalEndOfInput();
|
||||
|
||||
List<SAMRecord> downsampledReads = downsampler.consumeDownsampledItems();
|
||||
|
||||
Assert.assertTrue(downsampledReads.size() >= 2000 && downsampledReads.size() <= 3000);
|
||||
}
|
||||
|
||||
private List<SAMRecord> createRandomReads( int numReads, SAMFileHeader header, String name, int contigIndex, int maxAlignmentStart, int maxLength ) {
|
||||
List<SAMRecord> reads = new ArrayList<SAMRecord>(numReads);
|
||||
|
||||
for ( int i = 1; i <= numReads; i++ ) {
|
||||
reads.add(ArtificialSAMUtils.createArtificialRead(header, name, contigIndex,
|
||||
GenomeAnalysisEngine.getRandomGenerator().nextInt(maxAlignmentStart) + 1,
|
||||
GenomeAnalysisEngine.getRandomGenerator().nextInt(maxLength) + 1));
|
||||
}
|
||||
|
||||
return reads;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,357 @@
|
|||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.testng.annotations.Test;
|
||||
import org.testng.Assert;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
// TODO: generalize these tests so that all possible arrangements of 1-4 stacks can be tested
|
||||
public class PositionalDownsamplerUnitTest extends BaseTest {
|
||||
|
||||
/**
|
||||
* -------
|
||||
* -------
|
||||
* -------
|
||||
* -------
|
||||
* -------
|
||||
* -------
|
||||
*/
|
||||
@Test
|
||||
public void testThreeOverlappingIdenticalStacks() {
|
||||
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
|
||||
|
||||
PositionalDownsampler<SAMRecord> downsampler = new PositionalDownsampler<SAMRecord>(1000);
|
||||
|
||||
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 100));
|
||||
Assert.assertFalse(downsampler.hasDownsampledItems());
|
||||
Assert.assertTrue(downsampler.hasPendingItems());
|
||||
|
||||
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 25, 100));
|
||||
Assert.assertFalse(downsampler.hasDownsampledItems());
|
||||
Assert.assertTrue(downsampler.hasPendingItems());
|
||||
|
||||
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 50, 100));
|
||||
Assert.assertFalse(downsampler.hasDownsampledItems());
|
||||
Assert.assertTrue(downsampler.hasPendingItems());
|
||||
|
||||
downsampler.signalEndOfInput();
|
||||
Assert.assertTrue(downsampler.hasDownsampledItems());
|
||||
Assert.assertFalse(downsampler.hasPendingItems());
|
||||
|
||||
List<Integer> downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems());
|
||||
|
||||
System.out.println("testThreeOverlappingIdenticalStacks: Downsampled Stack sizes: " + downsampledStackSizes);
|
||||
|
||||
Assert.assertEquals(downsampledStackSizes.size(), 3);
|
||||
Assert.assertTrue(downsampledStackSizes.get(0) <= 1000);
|
||||
Assert.assertTrue(downsampledStackSizes.get(1) <= 1000);
|
||||
Assert.assertTrue(downsampledStackSizes.get(2) <= 1000);
|
||||
Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(1) + downsampledStackSizes.get(2) <= 1000);
|
||||
}
|
||||
|
||||
/**
|
||||
* -------
|
||||
* -------
|
||||
* -------
|
||||
* -------
|
||||
* -------
|
||||
* -------
|
||||
*/
|
||||
@Test
|
||||
public void testThreeNonOverlappingIdenticalStacks() {
|
||||
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
|
||||
|
||||
PositionalDownsampler<SAMRecord> downsampler = new PositionalDownsampler<SAMRecord>(1000);
|
||||
|
||||
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 100));
|
||||
Assert.assertFalse(downsampler.hasDownsampledItems());
|
||||
Assert.assertTrue(downsampler.hasPendingItems());
|
||||
|
||||
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 201, 100));
|
||||
Assert.assertFalse(downsampler.hasDownsampledItems());
|
||||
Assert.assertTrue(downsampler.hasPendingItems());
|
||||
|
||||
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 301, 100));
|
||||
Assert.assertTrue(downsampler.hasDownsampledItems());
|
||||
Assert.assertTrue(downsampler.hasPendingItems());
|
||||
|
||||
downsampler.signalEndOfInput();
|
||||
Assert.assertTrue(downsampler.hasDownsampledItems());
|
||||
Assert.assertFalse(downsampler.hasPendingItems());
|
||||
|
||||
List<Integer> downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems());
|
||||
|
||||
System.out.println("testThreeNonOverlappingIdenticalStacks: Downsampled Stack sizes: " + downsampledStackSizes);
|
||||
|
||||
Assert.assertEquals(downsampledStackSizes.size(), 3);
|
||||
Assert.assertTrue(downsampledStackSizes.get(0) == 1000);
|
||||
Assert.assertTrue(downsampledStackSizes.get(1) == 1000);
|
||||
Assert.assertTrue(downsampledStackSizes.get(2) == 1000);
|
||||
}
|
||||
|
||||
/**
|
||||
* ---
|
||||
* ---
|
||||
* -------
|
||||
* -------
|
||||
* -------
|
||||
* -------
|
||||
*/
|
||||
@Test
|
||||
public void testThreeStacksWithShortStackAtBeginning() {
|
||||
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
|
||||
|
||||
PositionalDownsampler<SAMRecord> downsampler = new PositionalDownsampler<SAMRecord>(1000);
|
||||
|
||||
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 25));
|
||||
Assert.assertFalse(downsampler.hasDownsampledItems());
|
||||
Assert.assertTrue(downsampler.hasPendingItems());
|
||||
|
||||
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 20, 100));
|
||||
Assert.assertFalse(downsampler.hasDownsampledItems());
|
||||
Assert.assertTrue(downsampler.hasPendingItems());
|
||||
|
||||
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 50, 100));
|
||||
Assert.assertFalse(downsampler.hasDownsampledItems());
|
||||
Assert.assertTrue(downsampler.hasPendingItems());
|
||||
|
||||
downsampler.signalEndOfInput();
|
||||
Assert.assertTrue(downsampler.hasDownsampledItems());
|
||||
Assert.assertFalse(downsampler.hasPendingItems());
|
||||
|
||||
List<Integer> downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems());
|
||||
|
||||
System.out.println("testThreeStacksWithShortStackAtBeginning: Downsampled Stack sizes: " + downsampledStackSizes);
|
||||
|
||||
Assert.assertEquals(downsampledStackSizes.size(), 3);
|
||||
Assert.assertTrue(downsampledStackSizes.get(0) <= 1000);
|
||||
Assert.assertTrue(downsampledStackSizes.get(1) <= 1000);
|
||||
Assert.assertTrue(downsampledStackSizes.get(2) <= 1000);
|
||||
Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(1) <= 1000);
|
||||
Assert.assertTrue(downsampledStackSizes.get(1) + downsampledStackSizes.get(2) <= 1000);
|
||||
}
|
||||
|
||||
/**
|
||||
* -------
|
||||
* -------
|
||||
* ---
|
||||
* ---
|
||||
* -------
|
||||
* -------
|
||||
*/
|
||||
@Test
|
||||
public void testThreeStacksWithShortStackInMiddle() {
|
||||
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
|
||||
|
||||
PositionalDownsampler<SAMRecord> downsampler = new PositionalDownsampler<SAMRecord>(1000);
|
||||
|
||||
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 100));
|
||||
Assert.assertFalse(downsampler.hasDownsampledItems());
|
||||
Assert.assertTrue(downsampler.hasPendingItems());
|
||||
|
||||
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 25, 25));
|
||||
Assert.assertFalse(downsampler.hasDownsampledItems());
|
||||
Assert.assertTrue(downsampler.hasPendingItems());
|
||||
|
||||
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 75, 100));
|
||||
Assert.assertFalse(downsampler.hasDownsampledItems());
|
||||
Assert.assertTrue(downsampler.hasPendingItems());
|
||||
|
||||
downsampler.signalEndOfInput();
|
||||
Assert.assertTrue(downsampler.hasDownsampledItems());
|
||||
Assert.assertFalse(downsampler.hasPendingItems());
|
||||
|
||||
List<Integer> downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems());
|
||||
|
||||
System.out.println("testThreeStacksWithShortStackInMiddle: Downsampled Stack sizes: " + downsampledStackSizes);
|
||||
|
||||
Assert.assertEquals(downsampledStackSizes.size(), 3);
|
||||
Assert.assertTrue(downsampledStackSizes.get(0) <= 1000);
|
||||
Assert.assertTrue(downsampledStackSizes.get(1) <= 1000);
|
||||
Assert.assertTrue(downsampledStackSizes.get(2) <= 1000);
|
||||
Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(1) <= 1000);
|
||||
Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(2) <= 1000);
|
||||
}
|
||||
|
||||
/**
|
||||
* ------
|
||||
* ------
|
||||
* -------
|
||||
* -------
|
||||
* ---
|
||||
* ---
|
||||
*/
|
||||
@Test
|
||||
public void testThreeStacksWithShortStackAtEnd() {
|
||||
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
|
||||
|
||||
PositionalDownsampler<SAMRecord> downsampler = new PositionalDownsampler<SAMRecord>(1000);
|
||||
|
||||
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 100));
|
||||
Assert.assertFalse(downsampler.hasDownsampledItems());
|
||||
Assert.assertTrue(downsampler.hasPendingItems());
|
||||
|
||||
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 50, 100));
|
||||
Assert.assertFalse(downsampler.hasDownsampledItems());
|
||||
Assert.assertTrue(downsampler.hasPendingItems());
|
||||
|
||||
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 135, 25));
|
||||
Assert.assertFalse(downsampler.hasDownsampledItems());
|
||||
Assert.assertTrue(downsampler.hasPendingItems());
|
||||
|
||||
downsampler.signalEndOfInput();
|
||||
Assert.assertTrue(downsampler.hasDownsampledItems());
|
||||
Assert.assertFalse(downsampler.hasPendingItems());
|
||||
|
||||
List<Integer> downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems());
|
||||
|
||||
System.out.println("testThreeStacksWithShortStackAtEnd: Downsampled Stack sizes: " + downsampledStackSizes);
|
||||
|
||||
Assert.assertEquals(downsampledStackSizes.size(), 3);
|
||||
Assert.assertTrue(downsampledStackSizes.get(0) <= 1000);
|
||||
Assert.assertTrue(downsampledStackSizes.get(1) <= 1000);
|
||||
Assert.assertTrue(downsampledStackSizes.get(2) <= 1000);
|
||||
Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(1) <= 1000);
|
||||
Assert.assertTrue(downsampledStackSizes.get(1) + downsampledStackSizes.get(2) <= 1000);
|
||||
}
|
||||
|
||||
/**
|
||||
* -------
|
||||
* ----
|
||||
* -------
|
||||
* ----
|
||||
* -------
|
||||
* -------
|
||||
*/
|
||||
@Test
|
||||
public void testThreePartiallyOverlappingStacks() {
|
||||
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
|
||||
|
||||
PositionalDownsampler<SAMRecord> downsampler = new PositionalDownsampler<SAMRecord>(1000);
|
||||
|
||||
downsampler.submit(createStackOfVaryingReads(2000, header, "foo", 0, 1, 100, 50));
|
||||
Assert.assertFalse(downsampler.hasDownsampledItems());
|
||||
Assert.assertTrue(downsampler.hasPendingItems());
|
||||
|
||||
downsampler.submit(createStackOfVaryingReads(2000, header, "foo", 0, 75, 100, 50));
|
||||
Assert.assertFalse(downsampler.hasDownsampledItems());
|
||||
Assert.assertTrue(downsampler.hasPendingItems());
|
||||
|
||||
downsampler.submit(createStackOfIdenticalReads(2000, header, "foo", 0, 150, 100));
|
||||
Assert.assertFalse(downsampler.hasDownsampledItems());
|
||||
Assert.assertTrue(downsampler.hasPendingItems());
|
||||
|
||||
downsampler.signalEndOfInput();
|
||||
Assert.assertTrue(downsampler.hasDownsampledItems());
|
||||
Assert.assertFalse(downsampler.hasPendingItems());
|
||||
|
||||
List<Integer> downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems());
|
||||
|
||||
System.out.println("testThreePartiallyOverlappingStacks: Downsampled Stack sizes: " + downsampledStackSizes);
|
||||
|
||||
Assert.assertEquals(downsampledStackSizes.size(), 3);
|
||||
Assert.assertTrue(downsampledStackSizes.get(0) <= 1000);
|
||||
Assert.assertTrue(downsampledStackSizes.get(1) <= 1000);
|
||||
Assert.assertTrue(downsampledStackSizes.get(2) <= 1000);
|
||||
|
||||
// TODO: need to examine per-base coverage here
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNoDownsamplingRequired() {
|
||||
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
|
||||
|
||||
PositionalDownsampler<SAMRecord> downsampler = new PositionalDownsampler<SAMRecord>(1000);
|
||||
|
||||
downsampler.submit(createStackOfIdenticalReads(300, header, "foo", 0, 1, 100));
|
||||
Assert.assertFalse(downsampler.hasDownsampledItems());
|
||||
Assert.assertTrue(downsampler.hasPendingItems());
|
||||
|
||||
downsampler.submit(createStackOfIdenticalReads(300, header, "foo", 0, 25, 100));
|
||||
Assert.assertFalse(downsampler.hasDownsampledItems());
|
||||
Assert.assertTrue(downsampler.hasPendingItems());
|
||||
|
||||
downsampler.submit(createStackOfIdenticalReads(300, header, "foo", 0, 50, 100));
|
||||
Assert.assertFalse(downsampler.hasDownsampledItems());
|
||||
Assert.assertTrue(downsampler.hasPendingItems());
|
||||
|
||||
downsampler.signalEndOfInput();
|
||||
Assert.assertTrue(downsampler.hasDownsampledItems());
|
||||
Assert.assertFalse(downsampler.hasPendingItems());
|
||||
|
||||
List<Integer> downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems());
|
||||
|
||||
System.out.println("testNoDownsamplingRequired: Downsampled Stack sizes: " + downsampledStackSizes);
|
||||
|
||||
Assert.assertEquals(downsampledStackSizes.size(), 3);
|
||||
Assert.assertTrue(downsampledStackSizes.get(0) == 300);
|
||||
Assert.assertTrue(downsampledStackSizes.get(1) == 300);
|
||||
Assert.assertTrue(downsampledStackSizes.get(2) == 300);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGATKSAMRecordSupport() {
|
||||
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
|
||||
PositionalDownsampler<GATKSAMRecord> downsampler = new PositionalDownsampler<GATKSAMRecord>(1000);
|
||||
|
||||
List<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>();
|
||||
for ( int i = 0; i < 10; i++ ) {
|
||||
reads.add(ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 10, 20 * i + 10));
|
||||
}
|
||||
|
||||
downsampler.submit(reads);
|
||||
downsampler.signalEndOfInput();
|
||||
List<GATKSAMRecord> downsampledReads = downsampler.consumeDownsampledItems();
|
||||
|
||||
Assert.assertTrue(downsampledReads.size() == 10);
|
||||
}
|
||||
|
||||
private ArrayList<SAMRecord> createStackOfIdenticalReads( int stackSize, SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) {
|
||||
ArrayList<SAMRecord> stack = new ArrayList<SAMRecord>(stackSize);
|
||||
for ( int i = 1; i <= stackSize; i++ ) {
|
||||
stack.add(ArtificialSAMUtils.createArtificialRead(header, name, refIndex, alignmentStart, length));
|
||||
}
|
||||
return stack;
|
||||
}
|
||||
|
||||
private ArrayList<SAMRecord> createStackOfVaryingReads( int stackSize, SAMFileHeader header, String name, int refIndex, int alignmentStart, int firstLength, int secondLength ) {
|
||||
ArrayList<SAMRecord> stack = createStackOfIdenticalReads(stackSize / 2, header, name, refIndex, alignmentStart, firstLength);
|
||||
stack.addAll(createStackOfIdenticalReads(stackSize / 2, header, name, refIndex, alignmentStart, secondLength));
|
||||
return stack;
|
||||
}
|
||||
|
||||
private List<Integer> getDownsampledStackSizesAndVerifySortedness( List<SAMRecord> downsampledReads ) {
|
||||
List<Integer> stackSizes = new ArrayList<Integer>();
|
||||
Iterator<SAMRecord> iter = downsampledReads.iterator();
|
||||
Assert.assertTrue(iter.hasNext());
|
||||
|
||||
SAMRecord previousRead = iter.next();
|
||||
int currentStackSize = 1;
|
||||
|
||||
while ( iter.hasNext() ) {
|
||||
SAMRecord currentRead = iter.next();
|
||||
|
||||
if ( ! currentRead.getReferenceIndex().equals(previousRead.getReferenceIndex()) || currentRead.getAlignmentStart() > previousRead.getAlignmentStart() ) {
|
||||
stackSizes.add(currentStackSize);
|
||||
currentStackSize = 1;
|
||||
}
|
||||
else if ( currentRead.getAlignmentStart() < previousRead.getAlignmentStart() ) {
|
||||
Assert.fail(String.format("Reads are out of order: %s %s", previousRead, currentRead));
|
||||
}
|
||||
else {
|
||||
currentStackSize++;
|
||||
}
|
||||
|
||||
previousRead = currentRead;
|
||||
}
|
||||
|
||||
stackSizes.add(currentStackSize);
|
||||
return stackSizes;
|
||||
}
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue