Downsampling: experimental engine integration

- Off by default; the engine fork isolates the new code paths from the old ones,
so no integration tests change yet.

- The experimental implementation is currently BROKEN due to a serious issue
involving file spans. No one can or should use the experimental features
until I've patched this issue.

- There are temporarily two independent versions of LocusIteratorByState.
Anyone changing one version should port the change to the other (if possible),
and anyone adding unit tests for one version should add the same unit tests
for the other (again, if possible). This situation will hopefully be extremely
temporary, lasting only until the experimental implementation is proven.
This commit is contained in:
David Roazen 2012-05-24 09:17:11 -04:00
parent 6df6c1abd5
commit cb84a6473f
52 changed files with 4701 additions and 879 deletions

View File

@ -546,7 +546,7 @@ public class SlidingWindow {
FractionalDownsampler <GATKSAMRecord> downsampler = new FractionalDownsampler<GATKSAMRecord>(fraction);
downsampler.submit(allReads);
return downsampler.consumeDownsampledItems();
return downsampler.consumeFinalizedItems();
}

View File

@ -1,52 +0,0 @@
package org.broadinstitute.sting.gatk;
import org.broadinstitute.sting.utils.exceptions.UserException;
/**
* Describes the method for downsampling reads at a given locus.
*
* @author hanna
* @version 0.1
*/
public class DownsamplingMethod {
    /** Type of downsampling to perform. */
    public final DownsampleType type;

    /** Actual downsampling target is specified as an integer number of reads. */
    public final Integer toCoverage;

    /** Actual downsampling target is specified as a fraction of total available reads. */
    public final Double toFraction;

    /** Expresses no downsampling applied at all. */
    public static final DownsamplingMethod NONE = new DownsamplingMethod(DownsampleType.NONE, null, null);

    /**
     * Creates a downsampling method, validating the argument combination before
     * committing anything to the immutable fields.
     *
     * @param type       kind of downsampling to perform
     * @param toCoverage target coverage (mutually exclusive with toFraction)
     * @param toFraction target fraction of reads (mutually exclusive with toCoverage)
     */
    public DownsamplingMethod(DownsampleType type, Integer toCoverage, Double toFraction) {
        // Unless downsampling is disabled outright, at least one target must be supplied.
        final boolean noTargetSupplied = toFraction == null && toCoverage == null;
        if (type != DownsampleType.NONE && noTargetSupplied)
            throw new UserException.CommandLineException("Must specify either toFraction or toCoverage when downsampling.");

        // A coverage target and a fraction target are mutually exclusive.
        if (toFraction != null && toCoverage != null)
            throw new UserException.CommandLineException("Downsampling coverage and fraction are both specified. Please choose only one.");

        // The by-sample strategy does not support fractional targets.
        if (type == DownsampleType.BY_SAMPLE && toFraction != null)
            throw new UserException.CommandLineException("Cannot downsample to fraction with new EXPERIMENTAL_BY_SAMPLE method");

        this.type = type;
        this.toCoverage = toCoverage;
        this.toFraction = toFraction;
    }
}

View File

@ -36,6 +36,7 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.datasources.reads.*;
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
import org.broadinstitute.sting.gatk.executive.MicroScheduler;
import org.broadinstitute.sting.gatk.filters.FilterManager;
import org.broadinstitute.sting.gatk.filters.ReadFilter;
@ -441,14 +442,18 @@ public class GenomeAnalysisEngine {
protected DownsamplingMethod getDownsamplingMethod() {
GATKArgumentCollection argCollection = this.getArguments();
DownsamplingMethod method;
if(argCollection.getDownsamplingMethod() != null)
method = argCollection.getDownsamplingMethod();
else if(WalkerManager.getDownsamplingMethod(walker) != null)
method = WalkerManager.getDownsamplingMethod(walker);
else
method = GATKArgumentCollection.getDefaultDownsamplingMethod();
return method;
boolean useExperimentalDownsampling = argCollection.enableExperimentalDownsampling;
// until the file pointer bug with the experimental downsamplers is fixed, disallow running with experimental downsampling
if ( useExperimentalDownsampling ) {
throw new UserException("The experimental downsampling implementation is currently crippled by a file-pointer-related bug. Until this bug is fixed, it's not safe (or possible) for anyone to use the experimental implementation!");
}
DownsamplingMethod commandLineMethod = argCollection.getDownsamplingMethod();
DownsamplingMethod walkerMethod = WalkerManager.getDownsamplingMethod(walker, useExperimentalDownsampling);
DownsamplingMethod defaultMethod = DownsamplingMethod.getDefaultDownsamplingMethod(walker, useExperimentalDownsampling);
return commandLineMethod != null ? commandLineMethod : (walkerMethod != null ? walkerMethod : defaultMethod);
}
protected void setDownsamplingMethod(DownsamplingMethod method) {
@ -821,11 +826,13 @@ public class GenomeAnalysisEngine {
* @return A data source for the given set of reads.
*/
private SAMDataSource createReadsDataSource(GATKArgumentCollection argCollection, GenomeLocParser genomeLocParser, IndexedFastaSequenceFile refReader) {
DownsamplingMethod method = getDownsamplingMethod();
DownsamplingMethod downsamplingMethod = getDownsamplingMethod();
// Synchronize the method back into the collection so that it shows up when
// interrogating for the downsample method during command line recreation.
setDownsamplingMethod(method);
setDownsamplingMethod(downsamplingMethod);
logger.info(downsamplingMethod);
if (argCollection.removeProgramRecords && argCollection.keepProgramRecords)
throw new UserException.BadArgumentValue("rpr / kpr", "Cannot enable both options");
@ -843,7 +850,7 @@ public class GenomeAnalysisEngine {
argCollection.useOriginalBaseQualities,
argCollection.strictnessLevel,
argCollection.readBufferSize,
method,
downsamplingMethod,
new ValidationExclusion(Arrays.asList(argCollection.unsafe)),
filters,
readTransformers,

View File

@ -4,6 +4,7 @@ import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMFileReader;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
import org.broadinstitute.sting.gatk.filters.ReadFilter;
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;

View File

@ -27,6 +27,8 @@ package org.broadinstitute.sting.gatk;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
import org.broadinstitute.sting.gatk.filters.FilterManager;
import org.broadinstitute.sting.gatk.filters.ReadFilter;
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
@ -304,9 +306,10 @@ public class WalkerManager extends PluginManager<Walker> {
* downsampling method is specified on the command-line, the command-line version will
* be used instead.
* @param walkerClass The class of the walker to interrogate.
* @param useExperimentalDownsampling If true, use the experimental downsampling implementation
* @return The downsampling method, as specified by the walker. Null if none exists.
*/
public static DownsamplingMethod getDownsamplingMethod(Class<? extends Walker> walkerClass) {
public static DownsamplingMethod getDownsamplingMethod(Class<? extends Walker> walkerClass, boolean useExperimentalDownsampling) {
DownsamplingMethod downsamplingMethod = null;
if( walkerClass.isAnnotationPresent(Downsample.class) ) {
@ -314,7 +317,7 @@ public class WalkerManager extends PluginManager<Walker> {
DownsampleType type = downsampleParameters.by();
Integer toCoverage = downsampleParameters.toCoverage() >= 0 ? downsampleParameters.toCoverage() : null;
Double toFraction = downsampleParameters.toFraction() >= 0.0d ? downsampleParameters.toFraction() : null;
downsamplingMethod = new DownsamplingMethod(type,toCoverage,toFraction);
downsamplingMethod = new DownsamplingMethod(type,toCoverage,toFraction,useExperimentalDownsampling);
}
return downsamplingMethod;
@ -333,10 +336,11 @@ public class WalkerManager extends PluginManager<Walker> {
* downsampling method is specified on the command-line, the command-line version will
* be used instead.
* @param walker The walker to interrogate.
* @param useExperimentalDownsampling If true, use the experimental downsampling implementation
* @return The downsampling method, as specified by the walker. Null if none exists.
*/
public static DownsamplingMethod getDownsamplingMethod(Walker walker) {
return getDownsamplingMethod(walker.getClass());
public static DownsamplingMethod getDownsamplingMethod(Walker walker, boolean useExperimentalDownsampling) {
return getDownsamplingMethod(walker.getClass(), useExperimentalDownsampling);
}
/**

View File

@ -31,8 +31,8 @@ import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.IntervalBinding;
import org.broadinstitute.sting.gatk.DownsampleType;
import org.broadinstitute.sting.gatk.DownsamplingMethod;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
import org.broadinstitute.sting.gatk.phonehome.GATKRunReport;
import org.broadinstitute.sting.gatk.samples.PedigreeValidationType;
import org.broadinstitute.sting.utils.QualityUtils;
@ -140,15 +140,11 @@ public class GATKArgumentCollection {
@Argument(fullName = "nonDeterministicRandomSeed", shortName = "ndrs", doc = "Makes the GATK behave non deterministically, that is, the random numbers generated will be different in every run", required = false)
public boolean nonDeterministicRandomSeed = false;
/**
* The override mechanism in the GATK, by default, populates the command-line arguments, then
* the defaults from the walker annotations. Unfortunately, walker annotations should be trumped
* by a user explicitly specifying command-line arguments.
* TODO: Change the GATK so that walker defaults are loaded first, then command-line arguments.
*/
private static DownsampleType DEFAULT_DOWNSAMPLING_TYPE = DownsampleType.BY_SAMPLE;
private static int DEFAULT_DOWNSAMPLING_COVERAGE = 1000;
// --------------------------------------------------------------------------------------------------------------
//
// Downsampling Arguments
//
// --------------------------------------------------------------------------------------------------------------
@Argument(fullName = "downsampling_type", shortName="dt", doc="Type of reads downsampling to employ at a given locus. Reads will be selected randomly to be removed from the pile based on the method described here", required = false)
public DownsampleType downsamplingType = null;
@ -158,17 +154,20 @@ public class GATKArgumentCollection {
@Argument(fullName = "downsample_to_coverage", shortName = "dcov", doc = "Coverage [integer] to downsample to at any given locus; note that downsampled reads are randomly selected from all possible reads at a locus", required = false)
public Integer downsampleCoverage = null;
@Argument(fullName = "enable_experimental_downsampling", shortName = "enable_experimental_downsampling", doc = "Enable experimental engine-level downsampling", required = false)
@Hidden
public boolean enableExperimentalDownsampling = false;
/**
* Gets the downsampling method explicitly specified by the user. If the user didn't specify
* a default downsampling mechanism, return the default.
* @return The explicitly specified downsampling mechanism, or the default if none exists.
*/
public DownsamplingMethod getDownsamplingMethod() {
if(downsamplingType == null && downsampleFraction == null && downsampleCoverage == null)
if ( downsamplingType == null && downsampleFraction == null && downsampleCoverage == null )
return null;
if(downsamplingType == null && downsampleCoverage != null)
return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE,downsampleCoverage,null);
return new DownsamplingMethod(downsamplingType,downsampleCoverage,downsampleFraction);
return new DownsamplingMethod(downsamplingType, downsampleCoverage, downsampleFraction, enableExperimentalDownsampling);
}
/**
@ -178,9 +177,11 @@ public class GATKArgumentCollection {
public void setDownsamplingMethod(DownsamplingMethod method) {
if (method == null)
throw new IllegalArgumentException("method is null");
downsamplingType = method.type;
downsampleCoverage = method.toCoverage;
downsampleFraction = method.toFraction;
enableExperimentalDownsampling = method.useExperimentalDownsampling;
}
// --------------------------------------------------------------------------------------------------------------
@ -208,15 +209,6 @@ public class GATKArgumentCollection {
@Argument(fullName = "performanceLog", shortName="PF", doc="If provided, a GATK runtime performance log will be written to this file", required = false)
public File performanceLog = null;
/**
* Gets the default downsampling method, returned if the user didn't specify any downsampling
* method.
* @return The default downsampling mechanism, or null if none exists.
*/
public static DownsamplingMethod getDefaultDownsamplingMethod() {
return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE,DEFAULT_DOWNSAMPLING_COVERAGE,null);
}
@Argument(fullName="useOriginalQualities", shortName = "OQ", doc = "If set, use the original base quality scores from the OQ tag when present instead of the standard scores", required=false)
public Boolean useOriginalBaseQualities = false;

View File

@ -1,6 +1,6 @@
package org.broadinstitute.sting.gatk.datasources.providers;
import org.broadinstitute.sting.gatk.DownsampleType;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.ReadProperties;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.iterators.LocusIterator;
@ -135,8 +135,13 @@ public abstract class LocusView extends LocusIterator implements View {
// Cache the current and apply filtering.
AlignmentContext current = nextLocus;
if( sourceInfo.getDownsamplingMethod().type == DownsampleType.ALL_READS && sourceInfo.getDownsamplingMethod().toCoverage != null )
// The old ALL_READS downsampling implementation -- only use if we're not using the new experimental downsampling:
if( ! sourceInfo.getDownsamplingMethod().useExperimentalDownsampling &&
sourceInfo.getDownsamplingMethod().type == DownsampleType.ALL_READS && sourceInfo.getDownsamplingMethod().toCoverage != null ) {
current.downsampleToCoverage( sourceInfo.getDownsamplingMethod().toCoverage );
}
// Indicate that the next operation will need to advance.
nextLocus = null;

View File

@ -30,7 +30,9 @@ import net.sf.samtools.*;
import net.sf.samtools.util.CloseableIterator;
import net.sf.samtools.util.RuntimeIOException;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.DownsamplingMethod;
import org.broadinstitute.sting.gatk.downsampling.*;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
import org.broadinstitute.sting.gatk.ReadMetrics;
import org.broadinstitute.sting.gatk.ReadProperties;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
@ -152,6 +154,8 @@ public class SAMDataSource {
*/
private final ThreadAllocation threadAllocation;
private final boolean expandShardsForDownsampling;
/**
* Create a new SAM data source given the supplied read metadata.
* @param samFiles list of reads files.
@ -302,6 +306,11 @@ public class SAMDataSource {
includeReadsWithDeletionAtLoci,
defaultBaseQualities);
expandShardsForDownsampling = readProperties.getDownsamplingMethod() != null &&
readProperties.getDownsamplingMethod().useExperimentalDownsampling &&
readProperties.getDownsamplingMethod().type != DownsampleType.NONE &&
readProperties.getDownsamplingMethod().toCoverage != null;
// cache the read group id (original) -> read group id (merged)
// and read group id (merged) -> read group id (original) mappings.
for(SAMReaderID id: readerIDs) {
@ -457,6 +466,16 @@ public class SAMDataSource {
}
}
/**
* Are we expanding shards as necessary to prevent shard boundaries from occurring at improper places?
*
* @return true if we are using expanded shards, otherwise false
*/
public boolean usingExpandedShards() {
return expandShardsForDownsampling;
}
/**
* Fill the given buffering shard with reads.
* @param shard Shard to fill.
@ -484,6 +503,31 @@ public class SAMDataSource {
}
}
// If the reads are sorted in coordinate order, ensure that all reads
// having the same alignment start become part of the same shard, to allow
// downsampling to work better across shard boundaries. Note that because our
// read stream has already been fed through the positional downsampler, which
// ensures that at each alignment start position there are no more than dcov
// reads, we're in no danger of accidentally creating a disproportionately huge
// shard
if ( expandShardsForDownsampling && sortOrder == SAMFileHeader.SortOrder.coordinate ) {
while ( iterator.hasNext() ) {
SAMRecord additionalRead = iterator.next();
// Stop filling the shard as soon as we encounter a read having a different
// alignment start or contig from the last read added in the earlier loop
// above, or an unmapped read
if ( read == null ||
additionalRead.getReadUnmappedFlag() ||
! additionalRead.getReferenceIndex().equals(read.getReferenceIndex()) ||
additionalRead.getAlignmentStart() != read.getAlignmentStart() ) {
break;
}
shard.addRead(additionalRead);
noteFilePositionUpdate(positionUpdates, additionalRead);
}
}
// If the reads are sorted in queryname order, ensure that all reads
// having the same queryname become part of the same shard.
if(sortOrder == SAMFileHeader.SortOrder.queryname) {
@ -578,6 +622,7 @@ public class SAMDataSource {
iterator = new MalformedBAMErrorReformatingIterator(id.samFile, iterator);
if(shard.getGenomeLocs().size() > 0)
iterator = new IntervalOverlapFilteringIterator(iterator,shard.getGenomeLocs());
iteratorMap.put(readers.getReader(id), iterator);
}
@ -660,20 +705,25 @@ public class SAMDataSource {
List<ReadTransformer> readTransformers,
byte defaultBaseQualities) {
// *********************************************************************************** //
// * NOTE: ALL FILTERING SHOULD BE DONE BEFORE ANY ITERATORS THAT MODIFY THE READS! * //
// * (otherwise we will process something that we may end up throwing away) * //
// *********************************************************************************** //
// ************************************************************************************************ //
// * NOTE: ALL FILTERING/DOWNSAMPLING SHOULD BE DONE BEFORE ANY ITERATORS THAT MODIFY THE READS! * //
// * (otherwise we will process something that we may end up throwing away) * //
// ************************************************************************************************ //
if (downsamplingFraction != null)
wrappedIterator = new DownsampleIterator(wrappedIterator, downsamplingFraction);
wrappedIterator = StingSAMIteratorAdapter.adapt(new CountingFilteringIterator(readMetrics,wrappedIterator,supplementalFilters));
if ( readProperties.getDownsamplingMethod().useExperimentalDownsampling ) {
wrappedIterator = applyDownsamplingIterator(wrappedIterator);
}
// Use the old fractional downsampler only if we're not using experimental downsampling:
if ( ! readProperties.getDownsamplingMethod().useExperimentalDownsampling && downsamplingFraction != null )
wrappedIterator = new LegacyDownsampleIterator(wrappedIterator, downsamplingFraction);
// unless they've said not to validate read ordering (!noValidationOfReadOrder) and we've enabled verification,
// verify the read ordering by applying a sort order iterator
if (!noValidationOfReadOrder && enableVerification)
wrappedIterator = new VerifyingSamIterator(genomeLocParser,wrappedIterator);
wrappedIterator = StingSAMIteratorAdapter.adapt(new CountingFilteringIterator(readMetrics,wrappedIterator,supplementalFilters));
wrappedIterator = new VerifyingSamIterator(wrappedIterator);
if (useOriginalBaseQualities || defaultBaseQualities >= 0)
// only wrap if we are replacing the original qualities or using a default base quality
@ -688,6 +738,26 @@ public class SAMDataSource {
return wrappedIterator;
}
protected StingSAMIterator applyDownsamplingIterator( StingSAMIterator wrappedIterator ) {
if ( readProperties.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE ) {
ReadsDownsamplerFactory<SAMRecord> downsamplerFactory = readProperties.getDownsamplingMethod().toCoverage != null ?
new SimplePositionalDownsamplerFactory<SAMRecord>(readProperties.getDownsamplingMethod().toCoverage) :
new FractionalDownsamplerFactory<SAMRecord>(readProperties.getDownsamplingMethod().toFraction);
return new PerSampleDownsamplingReadsIterator(wrappedIterator, downsamplerFactory);
}
else if ( readProperties.getDownsamplingMethod().type == DownsampleType.ALL_READS ) {
ReadsDownsampler<SAMRecord> downsampler = readProperties.getDownsamplingMethod().toCoverage != null ?
new SimplePositionalDownsampler<SAMRecord>(readProperties.getDownsamplingMethod().toCoverage) :
new FractionalDownsampler<SAMRecord>(readProperties.getDownsamplingMethod().toFraction);
return new DownsamplingReadsIterator(wrappedIterator, downsampler);
}
return wrappedIterator;
}
private class SAMResourcePool {
/**
* How many entries can be cached in this resource pool?

View File

@ -1,4 +1,4 @@
package org.broadinstitute.sting.gatk;
package org.broadinstitute.sting.gatk.downsampling;
/**
* Type of downsampling method to invoke.

View File

@ -28,49 +28,92 @@ import java.util.Collection;
import java.util.List;
/**
* The basic downsampler API, with no reads-specific operations
* The basic downsampler API, with no reads-specific operations.
*
* Downsamplers that extend this interface rather than the ReadsDownsampler interface can handle
* any kind of item, however they cannot be wrapped within a DownsamplingReadsIterator or a
* PerSampleDownsamplingReadsIterator.
*
* @author David Roazen
*/
public interface Downsampler<T> {

    /**
     * Submit one item to the downsampler for consideration. Some downsamplers will be able to determine
     * immediately whether the item survives the downsampling process, while others will need to see
     * more items before making that determination.
     *
     * @param item the individual item to submit to the downsampler for consideration
     */
    public void submit( T item );

    /**
     * Submit a collection of items to the downsampler for consideration. Should be equivalent to calling
     * submit() on each individual item in the collection.
     *
     * @param items the collection of items to submit to the downsampler for consideration
     */
    public void submit( Collection<T> items );

    /**
     * Are there items that have survived the downsampling process waiting to be retrieved?
     *
     * @return true if this downsampler has > 0 finalized items, otherwise false
     * @deprecated pre-rename name retained only for the transition period; use {@link #hasFinalizedItems()} instead
     */
    @Deprecated
    public boolean hasDownsampledItems();

    /**
     * Are there items that have survived the downsampling process waiting to be retrieved?
     *
     * @return true if this downsampler has > 0 finalized items, otherwise false
     */
    public boolean hasFinalizedItems();

    /**
     * Return (and *remove*) all items that have survived downsampling and are waiting to be retrieved.
     *
     * @return a list of all finalized items this downsampler contains, or an empty list if there are none
     * @deprecated pre-rename name retained only for the transition period; use {@link #consumeFinalizedItems()} instead
     */
    @Deprecated
    public List<T> consumeDownsampledItems();

    /**
     * Return (and *remove*) all items that have survived downsampling and are waiting to be retrieved.
     *
     * @return a list of all finalized items this downsampler contains, or an empty list if there are none
     */
    public List<T> consumeFinalizedItems();

    /**
     * Are there items stored in this downsampler that it doesn't yet know whether they will
     * ultimately survive the downsampling process?
     *
     * @return true if this downsampler has > 0 pending items, otherwise false
     */
    public boolean hasPendingItems();

    /**
     * Peek at the first finalized item stored in this downsampler (or null if there are no finalized items)
     *
     * @return the first finalized item in this downsampler (the item is not removed from the downsampler by this call),
     *         or null if there are none
     */
    public T peekFinalized();

    /**
     * Peek at the first pending item stored in this downsampler (or null if there are no pending items)
     *
     * @return the first pending item stored in this downsampler (the item is not removed from the downsampler by this call),
     *         or null if there are none
     */
    public T peekPending();

    /**
     * Returns the number of items discarded (so far) during the downsampling process
     *
     * @return the number of items that have been submitted to this downsampler and discarded in the process of
     *         downsampling
     */
    public int getNumberOfDiscardedItems();

    /**
     * Used to tell the downsampler that no more items will be submitted to it, and that it should
     * finalize any pending items.
     */
    public void signalEndOfInput();

    /**
     * Empty the downsampler of all finalized/pending items
     */
    public void clear();

    /**
     * Reset stats in the downsampler such as the number of discarded items *without* clearing the downsampler of items
     */
    public void reset();
}

View File

@ -0,0 +1,153 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.exceptions.UserException;
/**
* Describes the method for downsampling reads at a given locus.
*/
public class DownsamplingMethod {
    /**
     * Type of downsampling to perform.
     */
    public final DownsampleType type;

    /**
     * Actual downsampling target is specified as an integer number of reads.
     */
    public final Integer toCoverage;

    /**
     * Actual downsampling target is specified as a fraction of total available reads.
     */
    public final Double toFraction;

    /**
     * Use the new experimental downsampling?
     */
    public final boolean useExperimentalDownsampling;

    /**
     * Expresses no downsampling applied at all.
     */
    public static final DownsamplingMethod NONE = new DownsamplingMethod(DownsampleType.NONE, null, null, false);

    /**
     * Default type to use if no type is specified.
     * NOTE(review): deliberately left non-final to match the original; confirm nothing mutates it before making it final.
     */
    public static DownsampleType DEFAULT_DOWNSAMPLING_TYPE = DownsampleType.BY_SAMPLE;

    /**
     * Default target coverage for locus-based traversals.
     */
    public static int DEFAULT_LOCUS_BASED_TRAVERSAL_DOWNSAMPLING_COVERAGE = 1000;

    /**
     * Creates and validates a downsampling method.
     *
     * @param type kind of downsampling to perform (DEFAULT_DOWNSAMPLING_TYPE is used if null)
     * @param toCoverage target coverage, mutually exclusive with toFraction (discarded when type is NONE)
     * @param toFraction target fraction of total reads, mutually exclusive with toCoverage (discarded when type is NONE)
     * @param useExperimentalDownsampling if true, use the experimental downsampling implementation
     */
    public DownsamplingMethod( DownsampleType type, Integer toCoverage, Double toFraction, boolean useExperimentalDownsampling ) {
        this.type = type != null ? type : DEFAULT_DOWNSAMPLING_TYPE;

        // When downsampling is disabled, discard any targets BEFORE committing them to the
        // final fields. (The previous version nulled out these locals only after the fields
        // had already been assigned -- a dead store that left stale targets in place.)
        if ( this.type == DownsampleType.NONE ) {
            toCoverage = null;
            toFraction = null;
        }

        this.toCoverage = toCoverage;
        this.toFraction = toFraction;
        this.useExperimentalDownsampling = useExperimentalDownsampling;

        validate();
    }

    /**
     * Sanity-checks the combination of downsampling settings stored in the fields,
     * throwing a UserException.CommandLineException on any invalid combination.
     */
    private void validate() {
        // Can't leave toFraction and toCoverage null unless type is NONE
        if ( type != DownsampleType.NONE && toFraction == null && toCoverage == null )
            throw new UserException.CommandLineException("Must specify either toFraction or toCoverage when downsampling.");

        // Fraction and coverage cannot both be specified.
        if ( toFraction != null && toCoverage != null )
            throw new UserException.CommandLineException("Downsampling coverage and fraction are both specified. Please choose only one.");

        // toCoverage must be > 0 when specified
        if ( toCoverage != null && toCoverage <= 0 ) {
            throw new UserException.CommandLineException("toCoverage must be > 0 when downsampling to coverage");
        }

        // toFraction must be >= 0.0 and <= 1.0 when specified
        if ( toFraction != null && (toFraction < 0.0 || toFraction > 1.0) ) {
            throw new UserException.CommandLineException("toFraction must be >= 0.0 and <= 1.0 when downsampling to a fraction of reads");
        }

        // Some restrictions only exist for the old downsampling implementation:
        if ( ! useExperimentalDownsampling ) {
            // By sample downsampling does not work with a fraction of reads in the old downsampling implementation
            if( type == DownsampleType.BY_SAMPLE && toFraction != null )
                throw new UserException.CommandLineException("Cannot downsample to fraction with the BY_SAMPLE method");
        }

        // Some restrictions only exist for the new downsampling implementation:
        if ( useExperimentalDownsampling ) {
            if ( type == DownsampleType.ALL_READS && toCoverage != null ) {
                throw new UserException.CommandLineException("Cannot downsample to coverage with the ALL_READS method in the experimental downsampling implementation");
            }
        }
    }

    /**
     * @return a human-readable summary of these downsampling settings, suitable for logging
     */
    @Override
    public String toString() {
        StringBuilder builder = new StringBuilder("Downsampling Settings: ");

        if ( type == DownsampleType.NONE ) {
            builder.append("No downsampling");
        }
        else {
            builder.append(String.format("Method: %s ", type));

            if ( toCoverage != null ) {
                builder.append(String.format("Target Coverage: %d ", toCoverage));
            }
            else {
                builder.append(String.format("Target Fraction: %.2f ", toFraction));
            }

            if ( useExperimentalDownsampling ) {
                builder.append("Using Experimental Downsampling");
            }
        }

        return builder.toString();
    }

    /**
     * Gets the downsampling method to apply when the user specified none: locus-based
     * traversals (LocusWalker/ActiveRegionWalker) get by-sample downsampling to a default
     * coverage; all other walker types get no downsampling.
     *
     * @param walker the walker being run
     * @param useExperimentalDownsampling if true, use the experimental downsampling implementation
     * @return the default DownsamplingMethod appropriate for the walker type
     */
    public static DownsamplingMethod getDefaultDownsamplingMethod( Walker walker, boolean useExperimentalDownsampling ) {
        if ( walker instanceof LocusWalker || walker instanceof ActiveRegionWalker ) {
            return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE, DEFAULT_LOCUS_BASED_TRAVERSAL_DOWNSAMPLING_COVERAGE,
                                          null, useExperimentalDownsampling);
        }
        else {
            return new DownsamplingMethod(DownsampleType.NONE, null, null, useExperimentalDownsampling);
        }
    }
}

View File

@ -33,7 +33,8 @@ import java.util.NoSuchElementException;
/**
* StingSAMIterator wrapper around our generic reads downsampler interface
* StingSAMIterator wrapper around our generic reads downsampler interface. Converts the push-style
* downsampler interface to a pull model.
*
* @author David Roazen
*/
@ -42,35 +43,50 @@ public class DownsamplingReadsIterator implements StingSAMIterator {
private StingSAMIterator nestedSAMIterator;
private ReadsDownsampler<SAMRecord> downsampler;
private Collection<SAMRecord> downsampledReadsCache;
private Iterator<SAMRecord> downsampledReadsCacheIterator;
private SAMRecord nextRead = null;
private Iterator<SAMRecord> downsampledReadsCacheIterator = null;
/**
* @param iter wrapped iterator from which this iterator will pull reads
* @param downsampler downsampler through which the reads will be fed
*/
public DownsamplingReadsIterator( StingSAMIterator iter, ReadsDownsampler<SAMRecord> downsampler ) {
nestedSAMIterator = iter;
this.downsampler = downsampler;
fillDownsampledReadsCache();
advanceToNextRead();
}
public boolean hasNext() {
if ( downsampledReadsCacheIterator.hasNext() ) {
return true;
}
else if ( ! nestedSAMIterator.hasNext() || ! fillDownsampledReadsCache() ) {
return false;
}
return true;
return nextRead != null;
}
public SAMRecord next() {
if ( ! downsampledReadsCacheIterator.hasNext() && ! fillDownsampledReadsCache() ) {
if ( nextRead == null ) {
throw new NoSuchElementException("next() called when there are no more items");
}
return downsampledReadsCacheIterator.next();
SAMRecord toReturn = nextRead;
advanceToNextRead();
return toReturn;
}
/**
 * Advance nextRead to the next downsampled read, refilling the downsampled-reads
 * cache from the wrapped iterator as needed. Leaves nextRead null once both the
 * underlying stream and the downsampler's output are exhausted.
 */
private void advanceToNextRead() {
    if ( ! readyToReleaseReads() && ! fillDownsampledReadsCache() ) {
        nextRead = null;  // no further reads will be produced by the downsampler
    }
    else {
        nextRead = downsampledReadsCacheIterator.next();
    }
}
/**
 * @return true if the cache of downsampled reads currently holds at least one read
 *         that can be handed out without consulting the underlying iterator
 */
private boolean readyToReleaseReads() {
    return downsampledReadsCacheIterator != null && downsampledReadsCacheIterator.hasNext();
}
private boolean fillDownsampledReadsCache() {
while ( nestedSAMIterator.hasNext() && ! downsampler.hasDownsampledItems() ) {
while ( nestedSAMIterator.hasNext() && ! downsampler.hasFinalizedItems() ) {
downsampler.submit(nestedSAMIterator.next());
}
@ -78,7 +94,8 @@ public class DownsamplingReadsIterator implements StingSAMIterator {
downsampler.signalEndOfInput();
}
downsampledReadsCache = downsampler.consumeDownsampledItems();
// use returned collection directly rather than make a copy, for speed
downsampledReadsCache = downsampler.consumeFinalizedItems();
downsampledReadsCacheIterator = downsampledReadsCache.iterator();
return downsampledReadsCacheIterator.hasNext();

View File

@ -33,7 +33,10 @@ import java.util.Collection;
import java.util.List;
/**
* Fractional Downsampler: selects a specified fraction of the reads for inclusion
* Fractional Downsampler: selects a specified fraction of the reads for inclusion.
*
* Since the selection is done randomly, the actual fraction of reads retained may be slightly
* more or less than the requested fraction, depending on the total number of reads submitted.
*
* @author David Roazen
*/
@ -43,8 +46,16 @@ public class FractionalDownsampler<T extends SAMRecord> implements ReadsDownsamp
private int cutoffForInclusion;
private int numDiscardedItems;
private static final int RANDOM_POOL_SIZE = 10000;
/**
* Construct a FractionalDownsampler
*
* @param fraction Fraction of reads to preserve, between 0.0 (inclusive) and 1.0 (inclusive).
* Actual number of reads preserved may differ randomly.
*/
public FractionalDownsampler( double fraction ) {
if ( fraction < 0.0 || fraction > 1.0 ) {
throw new ReviewedStingException("Fraction of reads to include must be between 0.0 and 1.0, inclusive");
@ -52,12 +63,16 @@ public class FractionalDownsampler<T extends SAMRecord> implements ReadsDownsamp
cutoffForInclusion = (int)(fraction * RANDOM_POOL_SIZE);
clear();
reset();
}
/**
 * Submit a single read, retaining it with probability cutoffForInclusion / RANDOM_POOL_SIZE
 * (i.e., approximately the requested fraction) and otherwise counting it as discarded.
 *
 * @param newRead the read to consider for inclusion
 */
public void submit( T newRead ) {
    // Draw from the same pool size used to compute cutoffForInclusion, rather than a
    // duplicated magic constant, so the inclusion probability is exactly fraction
    if ( GenomeAnalysisEngine.getRandomGenerator().nextInt(RANDOM_POOL_SIZE) < cutoffForInclusion ) {
        selectedReads.add(newRead);
    }
    else {
        numDiscardedItems++;
    }
}
public void submit( Collection<T> newReads ) {
@ -66,11 +81,12 @@ public class FractionalDownsampler<T extends SAMRecord> implements ReadsDownsamp
}
}
public boolean hasDownsampledItems() {
public boolean hasFinalizedItems() {
return selectedReads.size() > 0;
}
public List<T> consumeDownsampledItems() {
public List<T> consumeFinalizedItems() {
// pass by reference rather than make a copy, for speed
List<T> downsampledItems = selectedReads;
clear();
return downsampledItems;
@ -80,6 +96,18 @@ public class FractionalDownsampler<T extends SAMRecord> implements ReadsDownsamp
return false;
}
public T peekFinalized() {
return selectedReads.isEmpty() ? null : selectedReads.get(0);
}
public T peekPending() {
return null;
}
public int getNumberOfDiscardedItems() {
return numDiscardedItems;
}
public void signalEndOfInput() {
// NO-OP
}
@ -88,7 +116,15 @@ public class FractionalDownsampler<T extends SAMRecord> implements ReadsDownsamp
selectedReads = new ArrayList<T>();
}
public void reset() {
numDiscardedItems = 0;
}
public boolean requiresCoordinateSortOrder() {
return false;
}
public void signalNoMoreReadsBefore( T read ) {
// NO-OP
}
}

View File

@ -0,0 +1,45 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMRecord;
/**
* Factory for creating FractionalDownsamplers on demand
*
* @author David Roazen
*/
public class FractionalDownsamplerFactory<T extends SAMRecord> implements ReadsDownsamplerFactory<T> {

    // Fraction of reads each created downsampler will retain; immutable after construction
    private final double fraction;

    /**
     * @param fraction fraction of reads to preserve, between 0.0 and 1.0 inclusive
     *                 (validated by FractionalDownsampler when an instance is created)
     */
    public FractionalDownsamplerFactory( double fraction ) {
        this.fraction = fraction;
    }

    /**
     * @return a new FractionalDownsampler configured with this factory's fraction
     */
    public ReadsDownsampler<T> newInstance() {
        return new FractionalDownsampler<T>(fraction);
    }
}

View File

@ -0,0 +1,212 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import org.broadinstitute.sting.utils.MathUtils;
import java.util.*;
/**
* Leveling Downsampler: Given a set of Lists of arbitrary items and a target size, removes items from
* the Lists in an even fashion until the total size of all Lists is <= the target size. Leveling
* does not occur until all Lists have been submitted and signalEndOfInput() is called.
*
* The Lists should be LinkedLists for maximum efficiency during item removal, however other
* kinds of Lists are also accepted (albeit at a slight performance penalty).
*
* Since this downsampler extends the Downsampler interface rather than the ReadsDownsampler interface,
* the Lists need not contain reads. However this downsampler may not be wrapped within one of the
* DownsamplingReadsIterators
*
* @param <T> the List type representing the stacks to be leveled
* @param <E> the type of the elements of each List
*
* @author David Roazen
*/
public class LevelingDownsampler<T extends List<E>, E> implements Downsampler<T> {

    // Maximum total number of items (summed over all groups) allowed to survive leveling
    private final int targetSize;

    // The stacks (one List per group) submitted so far
    private List<T> groups;

    // True only after signalEndOfInput() has leveled the groups; items are not
    // finalized (releasable) until then
    private boolean groupsAreFinalized;

    // Running count of items removed by leveling; survives clear(), zeroed by reset()
    private int numDiscardedItems;

    /**
     * Construct a LevelingDownsampler
     *
     * @param targetSize the sum of the sizes of all individual Lists this downsampler is fed may not exceed
     *                   this value -- if it does, items are removed from Lists evenly until the total size
     *                   is <= this value
     */
    public LevelingDownsampler( int targetSize ) {
        this.targetSize = targetSize;
        clear();
        reset();
    }

    /**
     * Submit one group (List of items) for eventual leveling.
     *
     * @param item the group to submit
     */
    public void submit( T item ) {
        groups.add(item);
    }

    /**
     * Submit multiple groups for eventual leveling.
     *
     * @param items the groups to submit
     */
    public void submit( Collection<T> items ){
        groups.addAll(items);
    }

    /**
     * @return true if leveling has occurred (signalEndOfInput() was called) and there are groups to consume
     */
    public boolean hasFinalizedItems() {
        return groupsAreFinalized && groups.size() > 0;
    }

    /**
     * Remove and return all leveled groups, or an empty List if leveling has not yet occurred.
     *
     * @return the leveled groups; the internal storage is handed back directly (no copy) for speed
     */
    public List<T> consumeFinalizedItems() {
        if ( ! hasFinalizedItems() ) {
            return new ArrayList<T>();
        }

        // pass by reference rather than make a copy, for speed
        List<T> toReturn = groups;
        clear();
        return toReturn;
    }

    /**
     * @return true if groups have been submitted but not yet leveled
     */
    public boolean hasPendingItems() {
        return ! groupsAreFinalized && groups.size() > 0;
    }

    /**
     * @return the first finalized group without removing it, or null if none are finalized
     */
    public T peekFinalized() {
        return hasFinalizedItems() ? groups.get(0) : null;
    }

    /**
     * @return the first pending group without removing it, or null if none are pending
     */
    public T peekPending() {
        return hasPendingItems() ? groups.get(0) : null;
    }

    /**
     * @return the total number of items removed by leveling since the last reset()
     */
    public int getNumberOfDiscardedItems() {
        return numDiscardedItems;
    }

    /**
     * Level all submitted groups down to the target size and mark them as finalized.
     */
    public void signalEndOfInput() {
        levelGroups();
        groupsAreFinalized = true;
    }

    /**
     * Discard all submitted groups. Does NOT reset the discarded-item count (see reset()).
     */
    public void clear() {
        groups = new ArrayList<T>();
        groupsAreFinalized = false;
    }

    /**
     * Zero the discarded-item statistic.
     */
    public void reset() {
        numDiscardedItems = 0;
    }

    /**
     * Remove items from the groups in round-robin fashion until the total size is <= targetSize,
     * while never reducing any single group below one item.
     */
    private void levelGroups() {
        int totalSize = 0;
        int[] groupSizes = new int[groups.size()];

        int currentGroupIndex = 0;
        for ( T group : groups ) {
            groupSizes[currentGroupIndex] = group.size();
            totalSize += groupSizes[currentGroupIndex];
            currentGroupIndex++;
        }

        if ( totalSize <= targetSize ) {
            return;    // no need to eliminate any items
        }

        // We will try to remove exactly this many items, however we will refuse to allow any
        // one group to fall below size 1, and so might end up removing fewer items than this
        int numItemsToRemove = totalSize - targetSize;

        currentGroupIndex = 0;
        int numConsecutiveUnmodifiableGroups = 0;

        // Continue until we've either removed all the items we wanted to, or we can't
        // remove any more items without violating the constraint that all groups must
        // be left with at least one item
        while ( numItemsToRemove > 0 && numConsecutiveUnmodifiableGroups < groupSizes.length ) {
            if ( groupSizes[currentGroupIndex] > 1 ) {
                groupSizes[currentGroupIndex]--;
                numItemsToRemove--;
                numConsecutiveUnmodifiableGroups = 0;
            }
            else {
                numConsecutiveUnmodifiableGroups++;
            }

            currentGroupIndex = (currentGroupIndex + 1) % groupSizes.length;
        }

        // Now we actually go through and reduce each group to its new count as specified in groupSizes
        currentGroupIndex = 0;
        for ( T group : groups ) {
            downsampleOneGroup(group, groupSizes[currentGroupIndex]);
            currentGroupIndex++;
        }
    }

    /**
     * Randomly remove items from a single group until only numItemsToKeep remain.
     *
     * @param group the group to shrink in place
     * @param numItemsToKeep number of items to retain; no-op if >= the group's current size
     */
    private void downsampleOneGroup( T group, int numItemsToKeep ) {
        if ( numItemsToKeep >= group.size() ) {
            return;
        }

        numDiscardedItems += group.size() - numItemsToKeep;

        BitSet itemsToKeep = new BitSet(group.size());
        for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(group.size(), numItemsToKeep) ) {
            itemsToKeep.set(selectedIndex);
        }

        int currentIndex = 0;

        // If our group is a linked list, we can remove the desired items in a single O(n) pass with an iterator
        if ( group instanceof LinkedList ) {
            Iterator<E> iter = group.iterator();   // parameterized iterator (was a raw type)
            while ( iter.hasNext() ) {
                iter.next();

                if ( ! itemsToKeep.get(currentIndex) ) {
                    iter.remove();
                }

                currentIndex++;
            }
        }
        // If it's not a linked list, it's more efficient to copy the desired items into a new list and back rather
        // than suffer O(n^2) of item shifting
        else {
            List<E> keptItems = new ArrayList<E>(numItemsToKeep);

            for ( E item : group ) {
                if ( itemsToKeep.get(currentIndex) ) {
                    keptItems.add(item);
                }
                currentIndex++;
            }

            group.clear();
            group.addAll(keptItems);
        }
    }
}

View File

@ -0,0 +1,202 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMRecordComparator;
import net.sf.samtools.SAMRecordCoordinateComparator;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import java.util.*;
/**
* StingSAMIterator wrapper around our generic reads downsampler interface
* that downsamples reads for each sample independently, and then re-assembles
* the reads back into a single merged stream.
*
* @author David Roazen
*/
public class PerSampleDownsamplingReadsIterator implements StingSAMIterator {

    // Source of the read stream being downsampled; assumed coordinate-sorted
    // (the merge logic below relies on a sorted stream -- NOTE(review): confirm callers guarantee this)
    private StingSAMIterator nestedSAMIterator;

    // Creates one downsampler per sample encountered in the stream
    private ReadsDownsamplerFactory<SAMRecord> downsamplerFactory;

    // Sample name (null for reads with no read group) -> that sample's downsampler
    private Map<String, ReadsDownsampler<SAMRecord>> perSampleDownsamplers;

    // Finalized reads from all samples, merged back into coordinate order
    private PriorityQueue<SAMRecord> orderedDownsampledReadsCache;

    // The next read to hand out, or null when exhausted
    private SAMRecord nextRead = null;

    private SAMRecordComparator readComparator = new SAMRecordCoordinateComparator();

    // The earliest (by coordinate) read still pending inside ANY per-sample downsampler,
    // and the downsampler holding it. Cached reads may only be released if they do not
    // come after this read, otherwise the merged output could violate sort order.
    private SAMRecord earliestPendingRead = null;
    private ReadsDownsampler<SAMRecord> earliestPendingDownsampler = null;

    // Initial size of our cache of finalized reads
    private static final int DOWNSAMPLED_READS_INITIAL_CACHE_SIZE = 4096;

    // The number of positional changes that can occur in the read stream before all downsamplers
    // should be informed of the current position (guards against samples with relatively sparse reads
    // getting stuck in a pending state):
    private static final int DOWNSAMPLER_POSITIONAL_UPDATE_INTERVAL = 3;  // TODO: experiment with this value

    /**
     * @param iter wrapped iterator from which this iterator will pull reads
     * @param downsamplerFactory factory used to create new downsamplers as needed
     */
    public PerSampleDownsamplingReadsIterator( StingSAMIterator iter, ReadsDownsamplerFactory<SAMRecord> downsamplerFactory ) {
        nestedSAMIterator = iter;
        this.downsamplerFactory = downsamplerFactory;
        perSampleDownsamplers = new HashMap<String, ReadsDownsampler<SAMRecord>>();
        orderedDownsampledReadsCache = new PriorityQueue<SAMRecord>(DOWNSAMPLED_READS_INITIAL_CACHE_SIZE, readComparator);

        // Pre-load the first read so hasNext() is accurate from the start
        advanceToNextRead();
    }

    /**
     * @return true if at least one more downsampled read is available
     */
    public boolean hasNext() {
        return nextRead != null;
    }

    /**
     * @return the next downsampled read, in coordinate order across all samples
     * @throws NoSuchElementException if the stream is exhausted
     */
    public SAMRecord next() {
        if ( nextRead == null ) {
            throw new NoSuchElementException("next() called when there are no more items");
        }

        SAMRecord toReturn = nextRead;
        advanceToNextRead();

        return toReturn;
    }

    /**
     * Advance nextRead to the next releasable read, refilling the cache from the
     * wrapped iterator if necessary; sets nextRead to null on exhaustion.
     */
    private void advanceToNextRead() {
        if ( ! readyToReleaseReads() && ! fillDownsampledReadsCache() ) {
            nextRead = null;
        }
        else {
            nextRead = orderedDownsampledReadsCache.poll();
        }
    }

    /**
     * A cached read may be released only if it does not come after the earliest read
     * still pending in some per-sample downsampler (otherwise output sort order could break).
     *
     * @return true if the head of the cache can safely be released
     */
    private boolean readyToReleaseReads() {
        if ( orderedDownsampledReadsCache.isEmpty() ) {
            return false;
        }

        return earliestPendingRead == null ||
               readComparator.compare(orderedDownsampledReadsCache.peek(), earliestPendingRead) <= 0;
    }

    /**
     * Recompute earliestPendingRead/earliestPendingDownsampler after the given downsampler's
     * pending state may have changed (a read was submitted to it, or it was flushed).
     *
     * @param currentDownsampler the downsampler whose pending reads may have changed
     */
    private void updateEarliestPendingRead( ReadsDownsampler<SAMRecord> currentDownsampler ) {
        // If there is no recorded earliest pending read and this downsampler has pending items,
        // then this downsampler's first pending item becomes the new earliest pending read:
        if ( earliestPendingRead == null && currentDownsampler.hasPendingItems() ) {
            earliestPendingRead = currentDownsampler.peekPending();
            earliestPendingDownsampler = currentDownsampler;
        }
        // In all other cases, we only need to update the earliest pending read when the downsampler
        // associated with it experiences a change in its pending reads, since by assuming a sorted
        // read stream we're assured that each downsampler's earliest pending read will only increase
        // in genomic position over time.
        //
        // TODO: An occasional O(samples) linear search seems like a better option than keeping the downsamplers
        // TODO: sorted by earliest pending read, which would cost at least O(total_reads * (samples + log(samples))),
        // TODO: but need to verify this empirically.
        else if ( currentDownsampler == earliestPendingDownsampler &&
                  (! currentDownsampler.hasPendingItems() || readComparator.compare(currentDownsampler.peekPending(), earliestPendingRead) != 0) ) {

            // Linear search across all downsamplers for the new earliest pending read
            earliestPendingRead = null;
            earliestPendingDownsampler = null;

            for ( ReadsDownsampler<SAMRecord> perSampleDownsampler : perSampleDownsamplers.values() ) {
                if ( perSampleDownsampler.hasPendingItems() &&
                     (earliestPendingRead == null || readComparator.compare(perSampleDownsampler.peekPending(), earliestPendingRead) < 0) ) {

                    earliestPendingRead = perSampleDownsampler.peekPending();
                    earliestPendingDownsampler = perSampleDownsampler;
                }
            }
        }
    }

    /**
     * Pull reads from the wrapped iterator, routing each to its sample's downsampler
     * (creating downsamplers on demand), until a cached read becomes releasable or
     * the input is exhausted. Collects finalized reads into the ordered cache.
     *
     * @return true if the cache now has a releasable read
     */
    private boolean fillDownsampledReadsCache() {
        SAMRecord prevRead = null;
        int numPositionalChanges = 0;

        // Continue submitting reads to the per-sample downsamplers until the read at the top of the priority queue
        // can be released without violating global sort order
        while ( nestedSAMIterator.hasNext() && ! readyToReleaseReads() ) {
            SAMRecord read = nestedSAMIterator.next();
            String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null;

            ReadsDownsampler<SAMRecord> thisSampleDownsampler = perSampleDownsamplers.get(sampleName);
            if ( thisSampleDownsampler == null ) {
                thisSampleDownsampler = downsamplerFactory.newInstance();
                perSampleDownsamplers.put(sampleName, thisSampleDownsampler);
            }

            thisSampleDownsampler.submit(read);
            updateEarliestPendingRead(thisSampleDownsampler);

            if ( prevRead != null && prevRead.getAlignmentStart() != read.getAlignmentStart() ) {
                numPositionalChanges++;
            }

            // If the number of times we've changed position exceeds a certain threshold, inform all
            // downsamplers of the current position in the read stream. This is to prevent downsamplers
            // for samples with sparser reads than others from getting stuck too long in a pending state.
            if ( numPositionalChanges > DOWNSAMPLER_POSITIONAL_UPDATE_INTERVAL ) {
                for ( ReadsDownsampler<SAMRecord> perSampleDownsampler : perSampleDownsamplers.values() ) {
                    perSampleDownsampler.signalNoMoreReadsBefore(read);
                    updateEarliestPendingRead(perSampleDownsampler);
                }
            }

            prevRead = read;
        }

        // Input exhausted: flush every downsampler; nothing can be pending any longer
        if ( ! nestedSAMIterator.hasNext() ) {
            for ( ReadsDownsampler<SAMRecord> perSampleDownsampler : perSampleDownsamplers.values() ) {
                perSampleDownsampler.signalEndOfInput();
            }
            earliestPendingRead = null;
            earliestPendingDownsampler = null;
        }

        // Merge all newly-finalized reads into the coordinate-ordered cache
        for ( ReadsDownsampler<SAMRecord> perSampleDownsampler : perSampleDownsamplers.values() ) {
            if ( perSampleDownsampler.hasFinalizedItems() ) {
                orderedDownsampledReadsCache.addAll(perSampleDownsampler.consumeFinalizedItems());
            }
        }

        return readyToReleaseReads();
    }

    /**
     * Not supported: reads cannot be removed through this iterator.
     */
    public void remove() {
        throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!");
    }

    /**
     * Close the wrapped iterator and release its resources.
     */
    public void close() {
        nestedSAMIterator.close();
    }

    public Iterator<SAMRecord> iterator() {
        return this;
    }
}

View File

@ -1,259 +0,0 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.*;
/**
* Positional Downsampler: When eliminating reads, try to do so evenly based on the alignment start positions
*
* @author David Roazen
*/
public class PositionalDownsampler<T extends SAMRecord> implements ReadsDownsampler<T> {

    // Target number of overlapping reads per position after downsampling
    private int targetCoverage;

    // Collects the reads sharing the current alignment start, capping them at targetCoverage
    private ReservoirDownsampler<T> reservoir;

    // Position (contig index + alignment start) of the reads currently in the reservoir
    private int currentContigIndex;
    private int currentAlignmentStart;

    // Groupings of reads by alignment start that may still overlap future positions,
    // ordered by position (earliest first)
    private LinkedList<PositionalReadGrouping> pendingReads;

    // Reads whose downsampling fate is decided and which can be handed to the caller
    private ArrayList<T> finalizedReads;

    /**
     * @param targetCoverage maximum number of overlapping reads to retain at any position
     */
    public PositionalDownsampler ( int targetCoverage ) {
        this.targetCoverage = targetCoverage;
        clear();
    }

    /**
     * Submit one read. Requires coordinate-sorted submission: when the read starts past
     * the current position, the accumulated stack is flushed into the pending groups first.
     *
     * @param newRead the read to submit
     */
    public void submit ( T newRead ) {
        if ( readIsPastCurrentPosition(newRead) ) {
            updateAndDownsamplePendingReads();
        }

        reservoir.submit(newRead);
        updateCurrentPosition(newRead);
    }

    /**
     * Submit multiple reads in order.
     *
     * @param newReads reads to submit, in coordinate order
     */
    public void submit ( Collection<T> newReads ) {
        for ( T read : newReads ) {
            submit(read);
        }
    }

    public boolean hasDownsampledItems() {
        return finalizedReads.size() > 0;
    }

    /**
     * Remove and return all finalized reads. The internal list is handed back directly
     * (no copy) and a fresh list is allocated for future finalized reads.
     */
    public List<T> consumeDownsampledItems() {
        List<T> toReturn = finalizedReads;
        finalizedReads = new ArrayList<T>();
        return toReturn;
    }

    public boolean hasPendingItems() {
        return pendingReads.size() > 0;
    }

    /**
     * Flush the current reservoir, downsample the pending groups a final time, and
     * finalize every remaining pending read.
     */
    public void signalEndOfInput() {
        updateAndDownsamplePendingReads();
        for ( PositionalReadGrouping group : pendingReads ) {
            group.finalizeAllActiveReads();
            finalizedReads.addAll(group.getFinalizedReads());
        }
        pendingReads.clear();
    }

    /**
     * Discard all internal state (reservoir, pending groups, finalized reads).
     */
    public void clear() {
        reservoir = new ReservoirDownsampler<T>(targetCoverage);
        pendingReads = new LinkedList<PositionalReadGrouping>();
        finalizedReads = new ArrayList<T>();
    }

    public boolean requiresCoordinateSortOrder() {
        return true;
    }

    private void updateCurrentPosition ( T read ) {
        currentContigIndex = read.getReferenceIndex();
        currentAlignmentStart = read.getAlignmentStart();
    }

    // True if the read starts on a later contig or later position than the reads in the reservoir.
    // NOTE(review): before the first submit() the current position is the default (0,0), so the
    // very first read may trigger a flush of an empty reservoir -- harmless but worth confirming.
    private boolean readIsPastCurrentPosition ( T read ) {
        return read.getReferenceIndex() != currentContigIndex || read.getAlignmentStart() > currentAlignmentStart;
    }

    /**
     * Move the reservoir's reads into a new pending group at the current position,
     * finalize any groups that no longer overlap the current position, and re-level
     * the overlapping groups against the coverage target.
     */
    private void updateAndDownsamplePendingReads() {
        finalizeOutOfScopeReads();

        List<T> oldLocusReads = reservoir.consumeDownsampledItems();
        pendingReads.add(new PositionalReadGrouping(oldLocusReads, currentContigIndex, currentAlignmentStart));

        downsampleOverlappingGroups();
    }

    /**
     * Finalize reads that end before the current position. Fully-finalized groups are
     * removed (and their reads released) only when no earlier group is still unfinalized,
     * preserving output ordering.
     */
    private void finalizeOutOfScopeReads() {
        Iterator<PositionalReadGrouping> iter = pendingReads.iterator();
        boolean noPrecedingUnfinalizedGroups = true;

        while ( iter.hasNext() ) {
            PositionalReadGrouping currentGroup = iter.next();
            currentGroup.finalizeActiveReadsBeforePosition(currentContigIndex, currentAlignmentStart);

            if ( currentGroup.isFinalized() && noPrecedingUnfinalizedGroups ) {
                iter.remove();
                finalizedReads.addAll(currentGroup.getFinalizedReads());
            }
            else {
                noPrecedingUnfinalizedGroups = false;
            }
        }
    }

    /**
     * Evenly (round-robin) reduce the active reads of the pending groups until their
     * combined coverage is <= targetCoverage, never dropping a non-empty group below one read.
     */
    private void downsampleOverlappingGroups() {
        int[] groupReadCounts = new int[pendingReads.size()];
        int totalCoverage = 0;
        int numActiveGroups = 0;

        int currentGroup = 0;
        for ( PositionalReadGrouping group : pendingReads ) {
            groupReadCounts[currentGroup] = group.numActiveReads();
            totalCoverage += groupReadCounts[currentGroup];
            if ( groupReadCounts[currentGroup] > 0 ) {
                numActiveGroups++;
            }
            currentGroup++;
        }

        if ( totalCoverage <= targetCoverage ) {
            return;
        }

        // Cap the removals so every active group keeps at least one read; this also
        // guarantees the round-robin loop below terminates
        int numReadsToRemove = Math.min(totalCoverage - targetCoverage, totalCoverage - numActiveGroups);
        currentGroup = 0;

        while ( numReadsToRemove > 0 ) {
            if ( groupReadCounts[currentGroup] > 1 ) {
                groupReadCounts[currentGroup]--;
                numReadsToRemove--;
            }
            currentGroup = (currentGroup + 1) % groupReadCounts.length;
        }

        currentGroup = 0;
        for ( PositionalReadGrouping group : pendingReads ) {
            if ( ! group.isFinalized() ) {
                group.downsampleActiveReads(groupReadCounts[currentGroup]);
            }
            currentGroup++;
        }
    }

    /**
     * A stack of reads sharing one alignment start. Reads move from "active" (may still be
     * removed by downsampling) to "finalized" (fate decided) as the stream position advances.
     */
    private class PositionalReadGrouping {
        private List<T> activeReads;       // reads still subject to downsampling
        private List<T> finalizedReads;    // reads guaranteed to be kept
        private int contig;                // contig index of this group's alignment start
        private int alignmentStart;        // shared alignment start of all reads in the group

        /**
         * @param reads the reads at this position
         * @param contig contig index of the shared alignment start
         * @param alignmentStart the shared alignment start
         */
        public PositionalReadGrouping( Collection<T> reads, int contig, int alignmentStart ) {
            activeReads = new LinkedList<T>(reads);
            finalizedReads = new ArrayList<T>();
            this.contig = contig;
            this.alignmentStart = alignmentStart;
        }

        public int numActiveReads() {
            return activeReads.size();
        }

        public boolean isFinalized() {
            return activeReads.size() == 0;
        }

        public List<T> getFinalizedReads() {
            return finalizedReads;
        }

        /**
         * Finalize every active read that ends before the given position (all of them
         * if the position is on a different contig).
         *
         * @param contig contig index of the current stream position
         * @param position current alignment start in the stream
         */
        public void finalizeActiveReadsBeforePosition( int contig, int position ) {
            if ( this.contig != contig ) {
                finalizeAllActiveReads();
                return;
            }

            Iterator<T> iter = activeReads.iterator();
            while ( iter.hasNext() ) {
                T read = iter.next();
                if ( read.getAlignmentEnd() < position ) {
                    iter.remove();
                    finalizedReads.add(read);
                }
            }
        }

        public void finalizeAllActiveReads() {
            finalizedReads.addAll(activeReads);
            activeReads.clear();
        }

        /**
         * Randomly reduce the active reads down to numReadsToKeep; removed reads are discarded.
         *
         * @param numReadsToKeep number of active reads to retain; must be in [0, numActiveReads()]
         * @throws ReviewedStingException if numReadsToKeep is out of range
         */
        public void downsampleActiveReads( int numReadsToKeep ) {
            if ( numReadsToKeep > activeReads.size() || numReadsToKeep < 0 ) {
                throw new ReviewedStingException(String.format("Cannot retain %d reads out of %d total reads",
                                                 numReadsToKeep, activeReads.size()));
            }

            BitSet itemsToKeep = new BitSet(activeReads.size());
            for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(activeReads.size(), numReadsToKeep) ) {
                itemsToKeep.set(selectedIndex);
            }

            int currentIndex = 0;
            Iterator<T> iter = activeReads.iterator();
            while ( iter.hasNext() ) {
                T read = iter.next();
                if ( ! itemsToKeep.get(currentIndex) ) {
                    iter.remove();
                }
                currentIndex++;
            }
        }
    }
}

View File

@ -33,8 +33,23 @@ import net.sf.samtools.SAMRecord;
*/
public interface ReadsDownsampler<T extends SAMRecord> extends Downsampler<T> {
/*
/**
* Does this downsampler require that reads be fed to it in coordinate order?
*
* @return true if reads must be submitted to this downsampler in coordinate order, otherwise false
*/
public boolean requiresCoordinateSortOrder();
/**
* Tell this downsampler that no more reads located before the provided read (according to
* the sort order of the read stream) will be fed to it.
*
* Allows position-aware downsamplers to finalize pending reads earlier than they would
* otherwise be able to, particularly when doing per-sample downsampling and reads for
* certain samples are sparser than average.
*
* @param read the downsampler will assume that no reads located before this read will ever
* be submitted to it in the future
*/
public void signalNoMoreReadsBefore( T read );
}

View File

@ -0,0 +1,37 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMRecord;
/**
* A ReadsDownsamplerFactory can be used to create an arbitrary number of instances of a particular
* downsampler, all sharing the same construction parameters.
*
* @author David Roazen
*/
public interface ReadsDownsamplerFactory<T extends SAMRecord> {

    /**
     * Create a fresh downsampler instance using this factory's construction parameters.
     *
     * @return a newly-constructed ReadsDownsampler
     */
    public ReadsDownsampler<T> newInstance();
}

View File

@ -48,6 +48,14 @@ public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampl
private int totalReadsSeen;
private int numDiscardedItems;
/**
* Construct a ReservoirDownsampler
*
* @param targetSampleSize Size of the reservoir used by this downsampler. Number of items retained
* after downsampling will be min(totalReads, targetSampleSize)
*/
public ReservoirDownsampler ( int targetSampleSize ) {
if ( targetSampleSize <= 0 ) {
throw new ReviewedStingException("Cannot do reservoir downsampling with a sample size <= 0");
@ -55,6 +63,7 @@ public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampl
this.targetSampleSize = targetSampleSize;
clear();
reset();
}
public void submit ( T newRead ) {
@ -68,6 +77,7 @@ public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampl
if ( randomSlot < targetSampleSize ) {
reservoir.set(randomSlot, newRead);
}
numDiscardedItems++;
}
}
@ -77,11 +87,12 @@ public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampl
}
}
public boolean hasDownsampledItems() {
public boolean hasFinalizedItems() {
return reservoir.size() > 0;
}
public List<T> consumeDownsampledItems() {
public List<T> consumeFinalizedItems() {
// pass by reference rather than make a copy, for speed
List<T> downsampledItems = reservoir;
clear();
return downsampledItems;
@ -91,16 +102,36 @@ public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampl
return false;
}
public T peekFinalized() {
return reservoir.isEmpty() ? null : reservoir.get(0);
}
public T peekPending() {
return null;
}
public int getNumberOfDiscardedItems() {
return numDiscardedItems;
}
public void signalEndOfInput() {
// NO-OP
}
public void clear() {
reservoir = new ArrayList<T>(targetSampleSize);
totalReadsSeen = 0;
totalReadsSeen = 0; // an internal stat used by the downsampling process, so not cleared by reset() below
}
public void reset() {
numDiscardedItems = 0;
}
public boolean requiresCoordinateSortOrder() {
return false;
}
public void signalNoMoreReadsBefore( T read ) {
// NO-OP
}
}

View File

@ -0,0 +1,45 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMRecord;
/**
* Factory for creating ReservoirDownsamplers on demand
*
* @author David Roazen
*/
public class ReservoirDownsamplerFactory<T extends SAMRecord> implements ReadsDownsamplerFactory<T> {

    // Reservoir size for each created downsampler; immutable after construction
    private final int targetSampleSize;

    /**
     * @param targetSampleSize reservoir size for each downsampler this factory creates
     *                         (validated by ReservoirDownsampler when an instance is created)
     */
    public ReservoirDownsamplerFactory( int targetSampleSize ) {
        this.targetSampleSize = targetSampleSize;
    }

    /**
     * @return a new ReservoirDownsampler configured with this factory's target sample size
     */
    public ReadsDownsampler<T> newInstance() {
        return new ReservoirDownsampler<T>(targetSampleSize);
    }
}

View File

@ -0,0 +1,169 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMRecord;
import java.util.*;
/**
* Simple Positional Downsampler: Downsample each stack of reads at each alignment start to a size <= a target coverage
* using a Reservoir downsampler. Stores only O(target coverage) reads in memory at any given time.
*
* @author David Roazen
*/
public class SimplePositionalDownsampler<T extends SAMRecord> implements ReadsDownsampler<T> {

    // Maximum number of reads that may share a single alignment start position
    private int targetCoverage;

    // Reservoir used to downsample the stack of reads at the current position
    private ReservoirDownsampler<T> reservoir;

    // Contig index of the position currently being accumulated (meaningful only when positionEstablished)
    private int currentContigIndex;

    // Alignment start of the position currently being accumulated (meaningful only when positionEstablished)
    private int currentAlignmentStart;

    // True once at least one read has been seen and its position recorded
    private boolean positionEstablished;

    // True once the trailing block of unmapped reads has been reached; those reads bypass downsampling
    private boolean unmappedReadsReached;

    // Reads from already-completed positions, ready for the client to consume
    private ArrayList<T> finalizedReads;

    // Number of reads discarded by the reservoir since the last reset()
    private int numDiscardedItems;

    /**
     * Construct a SimplePositionalDownsampler
     *
     * @param targetCoverage Maximum number of reads that may share any given alignment start position
     */
    public SimplePositionalDownsampler( int targetCoverage ) {
        this.targetCoverage = targetCoverage;
        reservoir = new ReservoirDownsampler<T>(targetCoverage);
        finalizedReads = new ArrayList<T>();
        clear();
        reset();
    }

    /**
     * Submit one read. If the read starts a new position, the previous position's
     * reservoir is finalized first; unmapped reads at the end of the stream are
     * passed through without downsampling.
     */
    public void submit( T newRead ) {
        // Must run before the read is routed: finalizes the previous position
        // and detects the transition into the trailing unmapped block
        updatePositionalState(newRead);

        if ( unmappedReadsReached ) { // don't downsample the unmapped reads at the end of the stream
            finalizedReads.add(newRead);
        }
        else {
            // Attribute to this downsampler only the discards caused by this submission
            int reservoirPreviouslyDiscardedItems = reservoir.getNumberOfDiscardedItems();
            reservoir.submit(newRead);
            numDiscardedItems += reservoir.getNumberOfDiscardedItems() - reservoirPreviouslyDiscardedItems;
        }
    }

    // Submit a collection of reads, one at a time, preserving iteration order
    public void submit( Collection<T> newReads ) {
        for ( T read : newReads ) {
            submit(read);
        }
    }

    public boolean hasFinalizedItems() {
        return finalizedReads.size() > 0;
    }

    /**
     * Return (and relinquish ownership of) the finalized reads accumulated so far.
     */
    public List<T> consumeFinalizedItems() {
        // pass by reference rather than make a copy, for speed
        List<T> toReturn = finalizedReads;
        finalizedReads = new ArrayList<T>();
        return toReturn;
    }

    // Pending items are the reads still held in the current position's reservoir
    public boolean hasPendingItems() {
        return reservoir.hasFinalizedItems();
    }

    public T peekFinalized() {
        return finalizedReads.isEmpty() ? null : finalizedReads.get(0);
    }

    public T peekPending() {
        return reservoir.peekFinalized();
    }

    public int getNumberOfDiscardedItems() {
        return numDiscardedItems;
    }

    // End of input: whatever remains in the reservoir becomes finalized
    public void signalEndOfInput() {
        finalizeReservoir();
    }

    public void clear() {
        reservoir.clear();
        reservoir.reset();
        finalizedReads.clear();
        positionEstablished = false;
        unmappedReadsReached = false;
    }

    // reset() clears only the stats; clear() wipes the buffered reads and positional state
    public void reset() {
        numDiscardedItems = 0;
    }

    // Positional downsampling only makes sense over a coordinate-sorted stream
    public boolean requiresCoordinateSortOrder() {
        return true;
    }

    // A no-more-reads-before hint lets us finalize the current position early
    public void signalNoMoreReadsBefore( T read ) {
        updatePositionalState(read);
    }

    /**
     * If the given read is past the current position, finalize the reservoir for the
     * old position, record the new position, and detect the start of the trailing
     * unmapped-read block.
     */
    private void updatePositionalState( T newRead ) {
        if ( readIsPastCurrentPosition(newRead) ) {
            if ( reservoir.hasFinalizedItems() ) {
                finalizeReservoir();
            }

            setCurrentPosition(newRead);

            if ( newRead.getReadUnmappedFlag() ) {
                unmappedReadsReached = true;
            }
        }
    }

    private void setCurrentPosition( T read ) {
        currentContigIndex = read.getReferenceIndex();
        currentAlignmentStart = read.getAlignmentStart();
        positionEstablished = true;
    }

    // A read is "past" the current position if no position has been established yet,
    // it starts on a later contig or at a later alignment start, or it is the first
    // unmapped read encountered (which triggers the transition to pass-through mode).
    // NOTE(review): assumes unmapped reads arrive in a trailing block after all
    // mapped reads, as in a coordinate-sorted BAM — confirm for other inputs.
    private boolean readIsPastCurrentPosition( T read ) {
        return ! positionEstablished ||
               read.getReferenceIndex() > currentContigIndex ||
               read.getAlignmentStart() > currentAlignmentStart ||
               (read.getReadUnmappedFlag() && ! unmappedReadsReached);
    }

    // Move everything retained by the reservoir into the finalized list and
    // clear the reservoir's per-position stats for the next position
    private void finalizeReservoir() {
        finalizedReads.addAll(reservoir.consumeFinalizedItems());
        reservoir.reset();
    }
}

View File

@ -0,0 +1,45 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMRecord;
/**
 * Factory for creating SimplePositionalDownsamplers on demand
 *
 * @author David Roazen
 */
public class SimplePositionalDownsamplerFactory<T extends SAMRecord> implements ReadsDownsamplerFactory<T> {

    // Maximum number of reads per alignment start position in created downsamplers;
    // fixed at construction time, so declared final to make the factory immutable.
    private final int targetCoverage;

    /**
     * Construct a factory whose downsamplers limit each alignment start to targetCoverage reads.
     *
     * @param targetCoverage maximum number of reads sharing any given alignment start position
     */
    public SimplePositionalDownsamplerFactory( int targetCoverage ) {
        this.targetCoverage = targetCoverage;
    }

    /**
     * Create a new SimplePositionalDownsampler configured with this factory's target coverage.
     *
     * @return a freshly-constructed SimplePositionalDownsampler
     */
    public ReadsDownsampler<T> newInstance() {
        return new SimplePositionalDownsampler<T>(targetCoverage);
    }
}

View File

@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
import org.broadinstitute.sting.gatk.iterators.LocusIterator;
import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState;
import org.broadinstitute.sting.gatk.iterators.LocusIteratorByStateExperimental;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
@ -81,7 +82,13 @@ public class WindowMaker implements Iterable<WindowMaker.WindowMakerIterator>, I
public WindowMaker(Shard shard, GenomeLocParser genomeLocParser, StingSAMIterator iterator, List<GenomeLoc> intervals, Collection<String> sampleNames) {
this.sourceInfo = shard.getReadProperties();
this.readIterator = iterator;
this.sourceIterator = new PeekableIterator<AlignmentContext>(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser, sampleNames));
// Temporary: use the experimental version of LocusIteratorByState if experimental downsampling was requested:
this.sourceIterator = sourceInfo.getDownsamplingMethod().useExperimentalDownsampling ?
new PeekableIterator<AlignmentContext>(new LocusIteratorByStateExperimental(iterator,sourceInfo,genomeLocParser, sampleNames))
:
new PeekableIterator<AlignmentContext>(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser, sampleNames));
this.intervalIterator = intervals.size()>0 ? new PeekableIterator<GenomeLoc>(intervals.iterator()) : null;
}

View File

@ -6,13 +6,13 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import java.util.Iterator;
public class DownsampleIterator implements StingSAMIterator {
public class LegacyDownsampleIterator implements StingSAMIterator {
StingSAMIterator it;
int cutoff;
SAMRecord next;
public DownsampleIterator(StingSAMIterator it, double fraction) {
public LegacyDownsampleIterator(StingSAMIterator it, double fraction) {
this.it = it;
cutoff = (int)(fraction * 10000);
next = getNextRecord();

View File

@ -31,8 +31,8 @@ import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMRecord;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.DownsampleType;
import org.broadinstitute.sting.gatk.DownsamplingMethod;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
import org.broadinstitute.sting.gatk.ReadProperties;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.utils.GenomeLoc;

View File

@ -0,0 +1,649 @@
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.iterators;
import net.sf.picard.util.PeekableIterator;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMRecord;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.ReadProperties;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.downsampling.Downsampler;
import org.broadinstitute.sting.gatk.downsampling.LevelingDownsampler;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import java.util.*;
/**
 * Iterator that traverses a SAM File, accumulating information on a per-locus basis.
 *
 * Experimental fork of LocusIteratorByState that performs per-sample downsampling
 * through the new Downsampler framework (a LevelingDownsampler applied to the
 * per-alignment-start read state lists) rather than the legacy downsampling code.
 */
public class LocusIteratorByStateExperimental extends LocusIterator {
    /**
     * our log, which we want to capture anything from this class
     */
    // FIX: was Logger.getLogger(LocusIteratorByState.class), a copy/paste from the
    // class this one was forked from, which mis-attributed all log output.
    private static Logger logger = Logger.getLogger(LocusIteratorByStateExperimental.class);

    // -----------------------------------------------------------------------------------------------------------------
    //
    // member fields
    //
    // -----------------------------------------------------------------------------------------------------------------

    /**
     * Used to create new GenomeLocs.
     */
    private final GenomeLocParser genomeLocParser;

    // Ordered list of sample names whose reads this iterator manages
    private final ArrayList<String> samples;

    // Per-sample bookkeeping of in-flight read states
    private final ReadStateManager readStates;

    /**
     * Tracks a single read's traversal along the reference: its current offset into
     * the read bases, its offset from the alignment start on the genome, and its
     * position within the CIGAR.
     */
    protected static class SAMRecordState {
        SAMRecord read;
        int readOffset = -1;        // how far are we offset from the start of the read bases?
        int genomeOffset = -1;      // how far are we offset from the alignment start on the genome?

        Cigar cigar = null;
        int cigarOffset = -1;
        CigarElement curElement = null;
        int nCigarElements = 0;

        int cigarElementCounter = -1;   // how far are we into a single cigarElement

        // The logical model for generating extended events is as follows: the "record state" implements the traversal
        // along the reference; thus stepForwardOnGenome() returns on every and only on actual reference bases. This
        // can be a (mis)match or a deletion (in the latter case, we still return on every individual reference base the
        // deletion spans). In the extended events mode, the record state also remembers if there was an insertion, or
        // if the deletion just started *right before* the current reference base the record state is pointing to upon the return from
        // stepForwardOnGenome(). The next call to stepForwardOnGenome() will clear that memory (as we remember only extended
        // events immediately preceding the current reference base).

        public SAMRecordState(SAMRecord read) {
            this.read = read;
            cigar = read.getCigar();
            nCigarElements = cigar.numCigarElements();
            //System.out.printf("Creating a SAMRecordState: %s%n", this);
        }

        public SAMRecord getRead() {
            return read;
        }

        /**
         * What is our current offset in the read's bases that aligns us with the reference genome?
         *
         * @return the 0-based offset into the read's bases, or -1 before the first step
         */
        public int getReadOffset() {
            return readOffset;
        }

        /**
         * What is the current offset w.r.t. the alignment state that aligns us to the readOffset?
         *
         * @return the 0-based offset from the alignment start on the genome, or -1 before the first step
         */
        public int getGenomeOffset() {
            return genomeOffset;
        }

        // Absolute 1-based reference position currently pointed at by this state
        public int getGenomePosition() {
            return read.getAlignmentStart() + getGenomeOffset();
        }

        // Current reference position as a single-base GenomeLoc
        public GenomeLoc getLocation(GenomeLocParser genomeLocParser) {
            return genomeLocParser.createGenomeLoc(read.getReferenceName(), getGenomePosition());
        }

        public CigarOperator getCurrentCigarOperator() {
            return curElement.getOperator();
        }

        public String toString() {
            return String.format("%s ro=%d go=%d co=%d cec=%d %s", read.getReadName(), readOffset, genomeOffset, cigarOffset, cigarElementCounter, curElement);
        }

        // The next CIGAR element if the current one is exhausted, else the current element
        public CigarElement peekForwardOnGenome() {
            return ( cigarElementCounter + 1 > curElement.getLength() && cigarOffset + 1 < nCigarElements ? cigar.getCigarElement(cigarOffset + 1) : curElement );
        }

        // The previous CIGAR element if we are at the start of the current one, else the current element
        public CigarElement peekBackwardOnGenome() {
            return ( cigarElementCounter - 1 == 0 && cigarOffset - 1 > 0 ? cigar.getCigarElement(cigarOffset - 1) : curElement );
        }

        /**
         * Advance this state by one reference base, consuming CIGAR elements as needed.
         *
         * @return the CIGAR operator covering the new position, or null once we have
         *         stepped past the end of the read
         */
        public CigarOperator stepForwardOnGenome() {
            // we enter this method with readOffset = index of the last processed base on the read
            // (-1 if we did not process a single base yet); this can be last matching base, or last base of an insertion

            if (curElement == null || ++cigarElementCounter > curElement.getLength()) {
                cigarOffset++;
                if (cigarOffset < nCigarElements) {
                    curElement = cigar.getCigarElement(cigarOffset);
                    cigarElementCounter = 0;
                    // next line: guards against cigar elements of length 0; when new cigar element is retrieved,
                    // we reenter in order to re-check cigarElementCounter against curElement's length
                    return stepForwardOnGenome();
                } else {
                    if (curElement != null && curElement.getOperator() == CigarOperator.D)
                        throw new UserException.MalformedBAM(read, "read ends with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar");

                    // Reads that contain indels model the genomeOffset as the following base in the reference. Because
                    // we fall into this else block only when indels end the read, increment genomeOffset such that the
                    // current offset of this read is the next ref base after the end of the indel. This position will
                    // model a point on the reference somewhere after the end of the read.
                    genomeOffset++; // extended events need that. Logically, it's legal to advance the genomic offset here:
                    // we do step forward on the ref, and by returning null we also indicate that we are past the read end.

                    return null;
                }
            }

            boolean done = false;
            switch (curElement.getOperator()) {
                case H: // ignore hard clips
                case P: // ignore pads
                    cigarElementCounter = curElement.getLength();
                    break;
                case I: // insertion w.r.t. the reference
                case S: // soft clip
                    cigarElementCounter = curElement.getLength();
                    readOffset += curElement.getLength();
                    break;
                case D: // deletion w.r.t. the reference
                    if (readOffset < 0)             // we don't want reads starting with deletion, this is a malformed cigar string
                        throw new UserException.MalformedBAM(read, "read starts with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar");
                    // should be the same as N case
                    genomeOffset++;
                    done = true;
                    break;
                case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning)
                    genomeOffset++;
                    done = true;
                    break;
                case M:
                case EQ:
                case X:
                    readOffset++;
                    genomeOffset++;
                    done = true;
                    break;
                default:
                    throw new IllegalStateException("Case statement didn't deal with cigar op: " + curElement.getOperator());
            }

            return done ? curElement.getOperator() : stepForwardOnGenome();
        }
    }

    //final boolean DEBUG = false;
    //final boolean DEBUG2 = false && DEBUG;

    // Engine-supplied read source properties (downsampling settings, deletion handling, etc.)
    private ReadProperties readInfo;

    // Lazily-computed next element; null until lazyLoadNextAlignmentContext() runs
    private AlignmentContext nextAlignmentContext;

    // True when per-sample to-coverage downsampling should be applied via the LevelingDownsampler
    private boolean performLevelingDownsampling;

    // -----------------------------------------------------------------------------------------------------------------
    //
    // constructors and other basic operations
    //
    // -----------------------------------------------------------------------------------------------------------------

    /**
     * Construct the experimental locus iterator.
     *
     * @param samIterator      source of coordinate-sorted reads
     * @param readInformation  engine read properties (including the downsampling method)
     * @param genomeLocParser  used to create GenomeLocs for emitted contexts
     * @param samples          sample names present in the read stream; must be non-empty
     *                         whenever the read stream itself is non-empty
     */
    public LocusIteratorByStateExperimental(final Iterator<SAMRecord> samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, Collection<String> samples) {
        this.readInfo = readInformation;
        this.genomeLocParser = genomeLocParser;
        this.samples = new ArrayList<String>(samples);
        this.readStates = new ReadStateManager(samIterator);
        this.performLevelingDownsampling = readInfo.getDownsamplingMethod() != null &&
                                           readInfo.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE &&
                                           readInfo.getDownsamplingMethod().toCoverage != null;

        // currently the GATK expects this LocusIteratorByState to accept empty sample lists, when
        // there's no read data.  So we need to throw this error only when samIterator.hasNext() is true
        if (this.samples.isEmpty() && samIterator.hasNext()) {
            throw new IllegalArgumentException("samples list must not be empty");
        }
    }

    /**
     * For testing only.  Assumes that the incoming SAMRecords have no read groups, so creates a dummy sample list
     * for the system.
     */
    public final static Collection<String> sampleListForSAMWithoutReadGroups() {
        List<String> samples = new ArrayList<String>();
        samples.add(null);
        return samples;
    }

    public Iterator<AlignmentContext> iterator() {
        return this;
    }

    public void close() {
        //this.it.close();
    }

    public boolean hasNext() {
        lazyLoadNextAlignmentContext();
        return (nextAlignmentContext != null);
        //if ( DEBUG ) System.out.printf("hasNext() = %b%n", r);
    }

    // Location of the first active read state, or null when no reads are in flight
    private GenomeLoc getLocation() {
        return readStates.isEmpty() ? null : readStates.getFirst().getLocation(genomeLocParser);
    }

    // -----------------------------------------------------------------------------------------------------------------
    //
    // next() routine and associated collection operations
    //
    // -----------------------------------------------------------------------------------------------------------------

    public AlignmentContext next() {
        lazyLoadNextAlignmentContext();
        if (!hasNext())
            throw new NoSuchElementException("LocusIteratorByState: out of elements.");
        AlignmentContext currentAlignmentContext = nextAlignmentContext;
        nextAlignmentContext = null;
        return currentAlignmentContext;
    }

    /**
     * Creates the next alignment context from the given state.  Note that this is implemented as a lazy load method.
     * nextAlignmentContext MUST BE null in order for this method to advance to the next entry.
     */
    private void lazyLoadNextAlignmentContext() {
        while (nextAlignmentContext == null && readStates.hasNext()) {
            readStates.collectPendingReads();

            final GenomeLoc location = getLocation();
            final Map<String, ReadBackedPileupImpl> fullPileup = new HashMap<String, ReadBackedPileupImpl>();

            // TODO: How can you determine here whether the current pileup has been downsampled?
            boolean hasBeenSampled = false;

            // Build a per-sample pileup from each sample's read states at the current location
            for (final String sample : samples) {
                final Iterator<SAMRecordState> iterator = readStates.iterator(sample);
                final List<PileupElement> pile = new ArrayList<PileupElement>(readStates.size(sample));

                int size = 0;                                                           // number of elements in this sample's pileup
                int nDeletions = 0;                                                     // number of deletions in this sample's pileup
                int nMQ0Reads = 0;                                                      // number of MQ0 reads in this sample's pileup (warning: current implementation includes N bases that are MQ0)

                while (iterator.hasNext()) {
                    final SAMRecordState state = iterator.next();                       // state object with the read/offset information
                    final GATKSAMRecord read = (GATKSAMRecord) state.getRead();         // the actual read
                    final CigarOperator op = state.getCurrentCigarOperator();           // current cigar operator
                    final CigarElement nextElement = state.peekForwardOnGenome();       // next cigar element
                    final CigarElement lastElement = state.peekBackwardOnGenome();      // last cigar element
                    final boolean isSingleElementCigar = nextElement == lastElement;
                    final CigarOperator nextOp = nextElement.getOperator();             // next cigar operator
                    final CigarOperator lastOp = lastElement.getOperator();             // last cigar operator
                    int readOffset = state.getReadOffset();                             // the base offset on this read

                    final boolean isBeforeDeletion  = nextOp == CigarOperator.DELETION;
                    final boolean isAfterDeletion   = lastOp == CigarOperator.DELETION;
                    final boolean isBeforeInsertion = nextOp == CigarOperator.INSERTION;
                    final boolean isAfterInsertion  = lastOp == CigarOperator.INSERTION && !isSingleElementCigar;
                    final boolean isNextToSoftClip  = nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart());

                    int nextElementLength = nextElement.getLength();

                    if (op == CigarOperator.N)                                          // N's are never added to any pileup
                        continue;

                    if (op == CigarOperator.D) {
                        // TODO -- LIBS is totally busted for deletions so that reads with Ds right before Is in their CIGAR are broken; must fix
                        if (readInfo.includeReadsWithDeletionAtLoci()) {                // only add deletions to the pileup if we are authorized to do so
                            pile.add(new PileupElement(read, readOffset, true, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, null, nextOp == CigarOperator.D ? nextElementLength : -1));
                            size++;
                            nDeletions++;
                            if (read.getMappingQuality() == 0)
                                nMQ0Reads++;
                        }
                    }
                    else {
                        if (!filterBaseInRead(read, location.getStart())) {
                            String insertedBaseString = null;
                            if (nextOp == CigarOperator.I) {
                                final int insertionOffset = isSingleElementCigar ? 0 : 1;
                                // TODO -- someone please implement a better fix for the single element insertion CIGAR!
                                if (isSingleElementCigar)
                                    readOffset -= (nextElement.getLength() - 1);        // LIBS has passed over the insertion bases!
                                insertedBaseString = new String(Arrays.copyOfRange(read.getReadBases(), readOffset + insertionOffset, readOffset + insertionOffset + nextElement.getLength()));
                            }

                            pile.add(new PileupElement(read, readOffset, false, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, insertedBaseString, nextElementLength));
                            size++;
                            if (read.getMappingQuality() == 0)
                                nMQ0Reads++;
                        }
                    }
                }

                if (pile.size() != 0)                                                   // if this pileup added at least one base, add it to the full pileup
                    fullPileup.put(sample, new ReadBackedPileupImpl(location, pile, size, nDeletions, nMQ0Reads));
            }

            updateReadStates();                                                         // critical - must be called after we get the current state offsets and location

            if (!fullPileup.isEmpty())                                                  // if we got reads with non-D/N over the current position, we are done
                nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileupImpl(location, fullPileup), hasBeenSampled);
        }
    }

    // fast testing of position
    private boolean readIsPastCurrentPosition(SAMRecord read) {
        if (readStates.isEmpty())
            return false;
        else {
            SAMRecordState state = readStates.getFirst();
            SAMRecord ourRead = state.getRead();
            return read.getReferenceIndex() > ourRead.getReferenceIndex() || read.getAlignmentStart() > state.getGenomePosition();
        }
    }

    /**
     * Generic place to put per-base filters appropriate to LocusIteratorByState
     *
     * @param rec the read under consideration
     * @param pos the reference position of the current pileup
     * @return true if the base at pos should be excluded from the pileup
     */
    private static boolean filterBaseInRead(GATKSAMRecord rec, long pos) {
        return ReadUtils.isBaseInsideAdaptor(rec, pos);
    }

    // Step every in-flight read state forward one reference base, retiring exhausted reads
    private void updateReadStates() {
        for (final String sample : samples) {
            Iterator<SAMRecordState> it = readStates.iterator(sample);
            while (it.hasNext()) {
                SAMRecordState state = it.next();
                CigarOperator op = state.stepForwardOnGenome();
                if (op == null) {
                    // we discard the read only when we are past its end AND indel at the end of the read (if any) was
                    // already processed. Keeping the read state that returned null upon stepForwardOnGenome() is safe
                    // as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag.
                    it.remove();                                                        // we've stepped off the end of the object
                }
            }
        }
    }

    public void remove() {
        throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!");
    }

    /**
     * Manages the in-flight SAMRecordStates for all samples, pulling reads from the
     * source iterator, partitioning them by sample, and (optionally) applying
     * per-sample leveling downsampling as new alignment starts are added.
     */
    protected class ReadStateManager {
        private final PeekableIterator<SAMRecord> iterator;
        private final SamplePartitioner samplePartitioner;
        private final Map<String, PerSampleReadStateManager> readStatesBySample = new HashMap<String, PerSampleReadStateManager>();
        private int totalReadStates = 0;

        public ReadStateManager(Iterator<SAMRecord> source) {
            this.iterator = new PeekableIterator<SAMRecord>(source);

            for (final String sample : samples) {
                readStatesBySample.put(sample, new PerSampleReadStateManager());
            }

            samplePartitioner = new SamplePartitioner();
        }

        /**
         * Returns a iterator over all the reads associated with the given sample.  Note that remove() is implemented
         * for this iterator; if present, total read states will be decremented.
         *
         * @param sample The sample.
         * @return Iterator over the reads associated with that sample.
         */
        public Iterator<SAMRecordState> iterator(final String sample) {
            return new Iterator<SAMRecordState>() {
                private Iterator<SAMRecordState> wrappedIterator = readStatesBySample.get(sample).iterator();

                public boolean hasNext() {
                    return wrappedIterator.hasNext();
                }

                public SAMRecordState next() {
                    return wrappedIterator.next();
                }

                public void remove() {
                    wrappedIterator.remove();
                }
            };
        }

        public boolean isEmpty() {
            return totalReadStates == 0;
        }

        /**
         * Retrieves the total number of reads in the manager across all samples.
         *
         * @return Total number of reads over all samples.
         */
        public int size() {
            return totalReadStates;
        }

        /**
         * Retrieves the total number of reads in the manager in the given sample.
         *
         * @param sample The sample.
         * @return Total number of reads in the given sample.
         */
        public int size(final String sample) {
            return readStatesBySample.get(sample).size();
        }

        // First read state in sample order, or null when no states are in flight
        public SAMRecordState getFirst() {
            for (final String sample : samples) {
                PerSampleReadStateManager reads = readStatesBySample.get(sample);
                if (!reads.isEmpty())
                    return reads.peek();
            }
            return null;
        }

        public boolean hasNext() {
            return totalReadStates > 0 || iterator.hasNext();
        }

        // Pull all reads covering the current position from the source iterator and
        // distribute them (possibly downsampled) into the per-sample state managers
        public void collectPendingReads() {
            if (!iterator.hasNext())
                return;

            if (readStates.size() == 0) {
                // No position established yet: take every read sharing the first read's alignment start
                int firstContigIndex = iterator.peek().getReferenceIndex();
                int firstAlignmentStart = iterator.peek().getAlignmentStart();
                while (iterator.hasNext() && iterator.peek().getReferenceIndex() == firstContigIndex && iterator.peek().getAlignmentStart() == firstAlignmentStart) {
                    samplePartitioner.submitRead(iterator.next());
                }
            } else {
                // Fast fail in the case that the read is past the current position.
                if (readIsPastCurrentPosition(iterator.peek()))
                    return;

                while (iterator.hasNext() && !readIsPastCurrentPosition(iterator.peek())) {
                    samplePartitioner.submitRead(iterator.next());
                }
            }

            for (final String sample : samples) {
                Collection<SAMRecord> newReads = samplePartitioner.getReadsForSample(sample);

                PerSampleReadStateManager statesBySample = readStatesBySample.get(sample);
                addReadsToSample(statesBySample, newReads);
            }

            samplePartitioner.reset();
        }

        /**
         * Add reads with the given sample name to the given hanger entry.
         *
         * @param readStates The list of read states to add this collection of reads.
         * @param reads Reads to add.  Selected reads will be pulled from this source.
         */
        private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection<SAMRecord> reads) {
            if (reads.isEmpty())
                return;

            Collection<SAMRecordState> newReadStates = new LinkedList<SAMRecordState>();

            for (SAMRecord read : reads) {
                SAMRecordState state = new SAMRecordState(read);
                state.stepForwardOnGenome();
                newReadStates.add(state);
            }

            readStates.addStatesAtNextAlignmentStart(newReadStates);
        }

        /**
         * Holds one sample's read states grouped by alignment start, applying
         * leveling downsampling across the per-start groups when enabled.
         */
        protected class PerSampleReadStateManager implements Iterable<SAMRecordState> {
            private List<LinkedList<SAMRecordState>> readStatesByAlignmentStart = new LinkedList<LinkedList<SAMRecordState>>();
            private int thisSampleReadStates = 0;
            private Downsampler<LinkedList<SAMRecordState>> levelingDownsampler =
                    performLevelingDownsampling ?
                    new LevelingDownsampler<LinkedList<SAMRecordState>, SAMRecordState>(readInfo.getDownsamplingMethod().toCoverage) :
                    null;

            public void addStatesAtNextAlignmentStart(Collection<SAMRecordState> states) {
                if ( states.isEmpty() ) {
                    return;
                }

                readStatesByAlignmentStart.add(new LinkedList<SAMRecordState>(states));
                thisSampleReadStates += states.size();
                totalReadStates += states.size();

                if ( levelingDownsampler != null ) {
                    levelingDownsampler.submit(readStatesByAlignmentStart);
                    levelingDownsampler.signalEndOfInput();

                    thisSampleReadStates -= levelingDownsampler.getNumberOfDiscardedItems();
                    totalReadStates -= levelingDownsampler.getNumberOfDiscardedItems();

                    // use returned List directly rather than make a copy, for efficiency's sake
                    readStatesByAlignmentStart = levelingDownsampler.consumeFinalizedItems();
                    levelingDownsampler.reset();
                }
            }

            public boolean isEmpty() {
                return readStatesByAlignmentStart.isEmpty();
            }

            public SAMRecordState peek() {
                return isEmpty() ? null : readStatesByAlignmentStart.get(0).peek();
            }

            public int size() {
                return thisSampleReadStates;
            }

            // Iterate over all states in alignment-start order; remove() keeps the
            // per-sample and total counts in sync and drops empty per-start groups
            public Iterator<SAMRecordState> iterator() {
                return new Iterator<SAMRecordState>() {
                    private Iterator<LinkedList<SAMRecordState>> alignmentStartIterator = readStatesByAlignmentStart.iterator();
                    private LinkedList<SAMRecordState> currentPositionReadStates = null;
                    private Iterator<SAMRecordState> currentPositionReadStatesIterator = null;

                    public boolean hasNext() {
                        return alignmentStartIterator.hasNext() ||
                               (currentPositionReadStatesIterator != null && currentPositionReadStatesIterator.hasNext());
                    }

                    public SAMRecordState next() {
                        if ( currentPositionReadStatesIterator == null || ! currentPositionReadStatesIterator.hasNext() ) {
                            currentPositionReadStates = alignmentStartIterator.next();
                            currentPositionReadStatesIterator = currentPositionReadStates.iterator();
                        }

                        return currentPositionReadStatesIterator.next();
                    }

                    public void remove() {
                        currentPositionReadStatesIterator.remove();
                        thisSampleReadStates--;
                        totalReadStates--;

                        if ( currentPositionReadStates.isEmpty() ) {
                            alignmentStartIterator.remove();
                        }
                    }
                };
            }
        }
    }

    /**
     * Note: stores reads by sample ID string, not by sample object
     */
    private class SamplePartitioner {
        private Map<String, Collection<SAMRecord>> readsBySample;
        private long readsSeen = 0;

        public SamplePartitioner() {
            readsBySample = new HashMap<String, Collection<SAMRecord>>();
            for ( String sample : samples ) {
                readsBySample.put(sample, new ArrayList<SAMRecord>());
            }
        }

        // Route the read into its sample's bucket; reads whose sample is not in the
        // known sample list are silently dropped (but still counted in readsSeen)
        public void submitRead(SAMRecord read) {
            String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null;
            if (readsBySample.containsKey(sampleName))
                readsBySample.get(sampleName).add(read);

            readsSeen++;
        }

        public long getNumReadsSeen() {
            return readsSeen;
        }

        public Collection<SAMRecord> getReadsForSample(String sampleName) {
            if ( ! readsBySample.containsKey(sampleName) )
                throw new NoSuchElementException("Sample name not found");

            return readsBySample.get(sampleName);
        }

        // Empty every per-sample bucket in preparation for the next batch of reads
        public void reset() {
            for ( Collection<SAMRecord> perSampleReads : readsBySample.values() )
                perSampleReads.clear();

            readsSeen = 0;
        }
    }
}

View File

@ -10,13 +10,11 @@ import java.util.Iterator;
* Verifies that the incoming stream of reads is correctly sorted
*/
public class VerifyingSamIterator implements StingSAMIterator {
private GenomeLocParser genomeLocParser;
StingSAMIterator it;
SAMRecord last = null;
boolean checkOrderP = true;
public VerifyingSamIterator(GenomeLocParser genomeLocParser,StingSAMIterator it) {
this.genomeLocParser = genomeLocParser;
public VerifyingSamIterator(StingSAMIterator it) {
this.it = it;
}

View File

@ -1,6 +1,6 @@
package org.broadinstitute.sting.gatk.walkers;
import org.broadinstitute.sting.gatk.DownsampleType;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import java.lang.annotation.*;

View File

@ -30,7 +30,7 @@ import org.broadinstitute.sting.commandline.Advanced;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.DownsampleType;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;

View File

@ -27,7 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.DownsampleType;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;

View File

@ -75,6 +75,17 @@ public class MathUtils {
}
}
/**
 * Get a random int between min and max (inclusive) using the global GATK random number generator
 *
 * @param min lower bound of the range (inclusive)
 * @param max upper bound of the range (inclusive); must be >= min
 * @return a random int >= min and <= max
 * @throws IllegalArgumentException if min > max
 */
public static int randomIntegerInRange( int min, int max ) {
    if ( min > max ) {
        throw new IllegalArgumentException(String.format("randomIntegerInRange: min (%d) must be <= max (%d)", min, max));
    }
    // nextInt(bound) draws uniformly from [0, bound), so shifting by min covers [min, max].
    // NOTE: max - min + 1 can overflow int for extreme ranges (e.g. Integer.MIN_VALUE..Integer.MAX_VALUE),
    // in which case nextInt() will reject the non-positive bound.
    return GenomeAnalysisEngine.getRandomGenerator().nextInt(max - min + 1) + min;
}
// A fast implementation of the Math.round() method. This method does not perform
// under/overflow checking, so this shouldn't be used in the general case (but is fine
// if one is already make those checks before calling in to the rounding).
@ -1655,5 +1666,4 @@ public class MathUtils {
return result;
}
}

View File

@ -613,6 +613,8 @@ public abstract class AbstractReadBackedPileup<RBP extends AbstractReadBackedPil
/**
* Returns a pileup randomly downsampled to the desiredCoverage.
*
* TODO: delete this once the experimental downsampler stabilizes
*
* @param desiredCoverage
* @return
*/

View File

@ -0,0 +1,86 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.sam;
import net.sf.picard.sam.MergingSamRecordIterator;
import net.sf.picard.sam.SamFileHeaderMerger;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.*;
/**
 * Simple wrapper class that multiplexes multiple ArtificialSingleSampleReadStreams into a single stream of reads
 *
 * @author David Roazen
 */
public class ArtificialMultiSampleReadStream implements Iterable<SAMRecord> {
    // Per-sample streams whose reads are merged (coordinate-sorted) into one stream
    private Collection<ArtificialSingleSampleReadStream> perSampleArtificialReadStreams;
    // Merged iterator over all per-sample streams; (re)built by initialize()
    private MergingSamRecordIterator mergingIterator;

    /**
     * @param perSampleArtificialReadStreams the single-sample streams to multiplex; must be non-null and non-empty
     */
    public ArtificialMultiSampleReadStream( Collection<ArtificialSingleSampleReadStream> perSampleArtificialReadStreams ) {
        if ( perSampleArtificialReadStreams == null || perSampleArtificialReadStreams.isEmpty() ) {
            throw new ReviewedStingException("Can't create an ArtificialMultiSampleReadStream out of 0 ArtificialSingleSampleReadStreams");
        }
        this.perSampleArtificialReadStreams = perSampleArtificialReadStreams;
    }

    public Iterator<SAMRecord> iterator() {
        // lazy initialization to prevent reads from being created until they're needed
        // NOTE(review): initialize() has no "already initialized" guard, so every call
        // rebuilds the merged stream from scratch; each call therefore yields a fresh
        // (unconsumed) iterator -- confirm this regeneration is intended
        initialize();

        return mergingIterator;
    }

    public StingSAMIterator getStingSAMIterator() {
        // lazy initialization to prevent reads from being created until they're needed
        initialize();

        return StingSAMIteratorAdapter.adapt(mergingIterator);
    }

    /**
     * Materialize each per-sample stream into an in-memory ArtificialSAMFileReader,
     * then merge them all with a coordinate-sorted MergingSamRecordIterator.
     */
    private void initialize() {
        Collection<SAMFileReader> perSampleSAMReaders = new ArrayList<SAMFileReader>(perSampleArtificialReadStreams.size());
        Collection<SAMFileHeader> headers = new ArrayList<SAMFileHeader>(perSampleArtificialReadStreams.size());

        for ( ArtificialSingleSampleReadStream readStream : perSampleArtificialReadStreams ) {
            Collection<SAMRecord> thisStreamReads = readStream.makeReads();

            SAMFileReader reader = new ArtificialSAMFileReader(readStream.getHeader(),
                                                               thisStreamReads.toArray(new SAMRecord[thisStreamReads.size()]));
            perSampleSAMReaders.add(reader);
            headers.add(reader.getFileHeader());
        }

        SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate, headers, true);
        mergingIterator = new MergingSamRecordIterator(headerMerger, perSampleSAMReaders, true);
    }
}

View File

@ -40,8 +40,11 @@ public class ArtificialSAMFileReader extends SAMFileReader {
*/
private final List<SAMRecord> reads;
private SAMFileHeader customHeader = null;
/**
* Construct an artificial SAM file reader.
* @param sequenceDictionary sequence dictionary used to initialize our GenomeLocParser
* @param reads Reads to use as backing data source.
*/
public ArtificialSAMFileReader(SAMSequenceDictionary sequenceDictionary,SAMRecord... reads) {
@ -50,6 +53,30 @@ public class ArtificialSAMFileReader extends SAMFileReader {
this.reads = Arrays.asList(reads);
}
/**
 * Construct an artificial SAM file reader with the given SAM file header
 *
 * @param customHeader Header that should be returned by calls to getFileHeader() on this reader
 * @param reads Reads to use as backing data source.
 */
public ArtificialSAMFileReader( SAMFileHeader customHeader, SAMRecord... reads ) {
    // Back the superclass with an empty input stream; reads are served from the in-memory list
    super(createEmptyInputStream(),true);
    this.customHeader = customHeader;
    // Derive the GenomeLocParser from the custom header's sequence dictionary.
    // NOTE(review): genomeLocParser is declared outside this hunk -- confirm it is a
    // field of this class and not shadowed elsewhere
    this.genomeLocParser = new GenomeLocParser(customHeader.getSequenceDictionary());
    this.reads = Arrays.asList(reads);
}
/**
 * Return the custom header when one was supplied at construction time,
 * otherwise defer to the real SAMFileReader header.
 */
@Override
public SAMFileHeader getFileHeader() {
    return customHeader != null ? customHeader : super.getFileHeader();
}
/**
* {@inheritDoc}
*/

View File

@ -276,6 +276,30 @@ public class ArtificialSAMUtils {
return Arrays.asList(left, right);
}
/**
 * Create a collection of identical artificial reads based on the parameters. The cigar string for each
 * read will be *M, where * is the length of the read.
 *
 * Useful for testing things like positional downsampling where you care only about the position and
 * number of reads, and not the other attributes.
 *
 * @param stackSize number of identical reads to create
 * @param header the SAM header to associate each read with
 * @param name name associated with each read
 * @param refIndex the reference index, i.e. what chromosome to associate them with
 * @param alignmentStart where to start each alignment
 * @param length the length of each read
 *
 * @return a collection of stackSize reads all sharing the above properties
 */
public static Collection<GATKSAMRecord> createStackOfIdenticalArtificialReads( int stackSize, SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) {
    Collection<GATKSAMRecord> stack = new ArrayList<GATKSAMRecord>(stackSize);
    int remaining = stackSize;
    while ( remaining > 0 ) {
        stack.add(createArtificialRead(header, name, refIndex, alignmentStart, length));
        remaining--;
    }
    return stack;
}
/**
* create an iterator containing the specified read piles
*

View File

@ -0,0 +1,212 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.sam;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
/**
 * An artificial stream of reads from a single read group/sample with configurable characteristics
 * such as:
 *
 * -the number of contigs that the reads should be distributed across
 * -number of "stacks" of reads sharing the same alignment start position per contig
 * -the min/max number of reads in each stack (exact values chosen randomly from this range)
 * -the min/max distance between stack start positions (exact values chosen randomly from this range)
 * -the min/max length of each read (exact values chosen randomly from this range)
 * -the number of unmapped reads
 *
 * The cigar string for all reads will be *M, where * is the length of the read.
 *
 * @author David Roazen
 */
public class ArtificialSingleSampleReadStream implements Iterable<SAMRecord> {
    // Header every generated read is associated with; must contain readGroupID
    private SAMFileHeader header;
    // Read group to stamp on each read (via the RG tag)
    private String readGroupID;
    // Stream shape parameters; see class javadoc. All must be >= 0 (see validateStreamParameters()).
    private int numContigs;
    private int numStacksPerContig;
    private int minReadsPerStack;
    private int maxReadsPerStack;
    private int minDistanceBetweenStacks;
    private int maxDistanceBetweenStacks;
    private int minReadLength;
    private int maxReadLength;
    private int numUnmappedReads;

    // SAM tag used to attach the read group ID to each read
    private static final String READ_GROUP_TAG = "RG";

    /**
     * Construct and validate a new artificial read stream description. No reads are
     * generated until makeReads()/iterator() is called.
     *
     * @throws ReviewedStingException if the parameters are inconsistent (see validateStreamParameters())
     */
    public ArtificialSingleSampleReadStream( SAMFileHeader header,
                                             String readGroupID,
                                             int numContigs,
                                             int numStacksPerContig,
                                             int minReadsPerStack,
                                             int maxReadsPerStack,
                                             int minDistanceBetweenStacks,
                                             int maxDistanceBetweenStacks,
                                             int minReadLength,
                                             int maxReadLength,
                                             int numUnmappedReads ) {
        this.header = header;
        this.readGroupID = readGroupID;
        this.numContigs = numContigs;
        this.numStacksPerContig = numStacksPerContig;
        this.minReadsPerStack = minReadsPerStack;
        this.maxReadsPerStack = maxReadsPerStack;
        this.minDistanceBetweenStacks = minDistanceBetweenStacks;
        this.maxDistanceBetweenStacks = maxDistanceBetweenStacks;
        this.minReadLength = minReadLength;
        this.maxReadLength = maxReadLength;
        this.numUnmappedReads = numUnmappedReads;

        validateStreamParameters();
    }

    /**
     * Sanity-check the stream parameters: header/read group must exist, all counts
     * must be non-negative, each min must not exceed its max, and numContigs /
     * numStacksPerContig must be both zero or both non-zero.
     */
    private void validateStreamParameters() {
        if ( header == null || readGroupID == null ) {
            throw new ReviewedStingException("null SAMFileHeader or read group ID") ;
        }

        if ( header.getReadGroup(readGroupID) == null ) {
            throw new ReviewedStingException(String.format("Read group %s not found in SAMFileHeader", readGroupID));
        }

        if ( numContigs < 0 || numStacksPerContig < 0 || minReadsPerStack < 0 || maxReadsPerStack < 0 ||
             minDistanceBetweenStacks < 0 || maxDistanceBetweenStacks < 0 || minReadLength < 0 || maxReadLength < 0 ||
             numUnmappedReads < 0 ) {
            throw new ReviewedStingException("Read stream parameters must be >= 0");
        }

        if ( (numContigs == 0 && numStacksPerContig != 0) || (numContigs != 0 && numStacksPerContig == 0) ) {
            throw new ReviewedStingException("numContigs and numStacksPerContig must either both be > 0, or both be 0");
        }

        if ( minReadsPerStack > maxReadsPerStack ) {
            throw new ReviewedStingException("minReadsPerStack > maxReadsPerStack");
        }

        if ( minDistanceBetweenStacks > maxDistanceBetweenStacks ) {
            throw new ReviewedStingException("minDistanceBetweenStacks > maxDistanceBetweenStacks");
        }

        if ( minReadLength > maxReadLength ) {
            throw new ReviewedStingException("minReadLength > maxReadLength");
        }
    }

    // Each iterator() call generates a fresh batch of (randomized) reads
    public Iterator<SAMRecord> iterator() {
        return makeReads().iterator();
    }

    public StingSAMIterator getStingSAMIterator() {
        return StingSAMIteratorAdapter.adapt(iterator());
    }

    /**
     * Generate the reads. Stack sizes and inter-stack gaps are drawn from the global
     * GATK random number generator (via MathUtils.randomIntegerInRange), so the exact
     * output depends on RNG state at call time.
     */
    public Collection<SAMRecord> makeReads() {
        Collection<SAMRecord> reads = new ArrayList<SAMRecord>(numContigs * numStacksPerContig * maxReadsPerStack);

        for ( int contig = 0; contig < numContigs; contig++ ) {
            // Stacks start at position 1 on each contig and advance by a random gap
            int alignmentStart = 1;

            for ( int stack = 0; stack < numStacksPerContig; stack++ ) {
                reads.addAll(makeReadStack(contig, alignmentStart, MathUtils.randomIntegerInRange(minReadsPerStack, maxReadsPerStack)));
                alignmentStart += MathUtils.randomIntegerInRange(minDistanceBetweenStacks, maxDistanceBetweenStacks);
            }
        }

        if ( numUnmappedReads > 0 ) {
            // Unmapped reads are emitted as a single "stack" at the no-alignment sentinel
            // position. NOTE(review): presumably createArtificialRead() marks reads with
            // these sentinel coordinates as unmapped -- confirm
            reads.addAll(makeReadStack(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, SAMRecord.NO_ALIGNMENT_START, numUnmappedReads));
        }

        return reads;
    }

    /**
     * Create stackSize reads sharing the given contig and alignment start, each with a
     * random length in [minReadLength, maxReadLength], read name "foo", and the stream's
     * read group tag.
     */
    private Collection<SAMRecord> makeReadStack( int contig, int alignmentStart, int stackSize ) {
        Collection<SAMRecord> readStack = new ArrayList<SAMRecord>(stackSize);

        for ( int i = 0; i < stackSize; i++ ) {
            SAMRecord read = ArtificialSAMUtils.createArtificialRead(header,
                                                                     "foo",
                                                                     contig,
                                                                     alignmentStart,
                                                                     MathUtils.randomIntegerInRange(minReadLength, maxReadLength));
            read.setAttribute(READ_GROUP_TAG, readGroupID);
            readStack.add(read);
        }

        return readStack;
    }

    // Simple accessors for the stream's nominal properties (used by the stream analyzer)
    public SAMFileHeader getHeader() {
        return header;
    }

    public String getReadGroupID() {
        return readGroupID;
    }

    public int getNumContigs() {
        return numContigs;
    }

    public int getNumStacksPerContig() {
        return numStacksPerContig;
    }

    public int getMinReadsPerStack() {
        return minReadsPerStack;
    }

    public int getMaxReadsPerStack() {
        return maxReadsPerStack;
    }

    public int getMinDistanceBetweenStacks() {
        return minDistanceBetweenStacks;
    }

    public int getMaxDistanceBetweenStacks() {
        return maxDistanceBetweenStacks;
    }

    public int getMinReadLength() {
        return minReadLength;
    }

    public int getMaxReadLength() {
        return maxReadLength;
    }

    public int getNumUnmappedReads() {
        return numUnmappedReads;
    }
}

View File

@ -0,0 +1,281 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.sam;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.ArrayList;
import java.util.List;
/**
 * A class for analyzing and validating the read stream produced by an ArtificialSingleSampleReadStream.
 *
 * Collects various statistics about the stream of reads it's fed, and validates the stream
 * by checking whether the collected statistics match the nominal properties of the stream.
 *
 * Subclasses are expected to override the validate() method in order to check whether an artificial
 * read stream has been *transformed* in some way (eg., by downsampling or some other process), rather
 * than merely checking whether the stream matches its original properties.
 *
 * Usage is simple:
 *
 * ArtificialSingleSampleReadStreamAnalyzer analyzer = new ArtificialSingleSampleReadStreamAnalyzer(originalStream);
 * analyzer.analyze(originalOrTransformedStream);
 * analyzer.validate(); // override this method if you want to check whether the stream has been transformed
 *                      // in a certain way relative to the original stream
 *
 * @author David Roazen
 */
public class ArtificialSingleSampleReadStreamAnalyzer {
    // The stream whose nominal properties serve as the basis for validate()
    protected ArtificialSingleSampleReadStream originalStream;
    // Most recently seen read; null until the first update() call
    protected SAMRecord lastRead;
    protected int totalReads;
    // True while every read seen so far carried the expected read group/sample
    protected boolean allSamplesMatch;
    protected int numContigs;
    // One entry per finished contig: the number of stacks observed on that contig
    protected List<Integer> stacksPerContig;
    // Observed extremes; boxed Integers so that null can mean "not yet observed"
    protected Integer minReadsPerStack;
    protected Integer maxReadsPerStack;
    protected Integer minDistanceBetweenStacks;
    protected Integer maxDistanceBetweenStacks;
    protected Integer minReadLength;
    protected Integer maxReadLength;
    protected int numUnmappedReads;
    // In-progress counters for the contig/stack currently being traversed;
    // flushed into the stats above on contig/stack transitions and in finalizeStats()
    protected int currentContigNumStacks;
    protected int currentStackNumReads;

    /**
     * Construct a new read stream analyzer, providing an ArtificialSingleSampleReadStream that will
     * serve as the basis for comparison after the analysis is complete.
     *
     * @param originalStream the original ArtificialSingleSampleReadStream upon which the stream
     *                       that will be fed to the analyzer is based
     */
    public ArtificialSingleSampleReadStreamAnalyzer( ArtificialSingleSampleReadStream originalStream ) {
        this.originalStream = originalStream;
        reset();
    }

    /**
     * Reset all read stream statistics collected by this analyzer to prepare for a fresh run
     */
    public void reset() {
        lastRead = null;
        totalReads = 0;
        allSamplesMatch = true;
        numContigs = 0;
        stacksPerContig = new ArrayList<Integer>();
        minReadsPerStack = null;
        maxReadsPerStack = null;
        minDistanceBetweenStacks = null;
        maxDistanceBetweenStacks = null;
        minReadLength = null;
        maxReadLength = null;
        numUnmappedReads = 0;
        currentContigNumStacks = 0;
        currentStackNumReads = 0;
    }

    /**
     * Collect statistics on the stream of reads passed in
     *
     * @param stream the stream of reads to analyze
     */
    public void analyze( Iterable<SAMRecord> stream ) {
        for ( SAMRecord read : stream ) {
            update(read);
        }
        finalizeStats();
    }

    /**
     * Validate the stream by checking whether our collected statistics match the properties of the
     * original stream. Throws a ReviewedStingException if the stream is invalid.
     *
     * Override this method if you want to check whether the stream has been transformed in some
     * way relative to the original stream.
     */
    public void validate() {
        // Special case: a stream configured to produce no reads at all
        if ( (originalStream.getNumContigs() == 0 || originalStream.getNumStacksPerContig() == 0) && originalStream.getNumUnmappedReads() == 0 ) {
            if ( totalReads != 0 ) {
                throw new ReviewedStingException("got reads from the stream, but the stream was configured to have 0 reads");
            }
            return;  // no further validation needed for the 0-reads case
        }
        else if ( totalReads == 0 ) {
            throw new ReviewedStingException("got no reads from the stream, but the stream was configured to have > 0 reads");
        }

        if ( ! allSamplesMatch ) {
            throw new ReviewedStingException("some reads had the wrong sample");
        }

        if ( numContigs != originalStream.getNumContigs() ) {
            throw new ReviewedStingException("number of contigs not correct");
        }

        if ( stacksPerContig.size() != originalStream.getNumContigs() ) {
            throw new ReviewedStingException(String.format("bug in analyzer code: calculated sizes for %d contigs even though there were only %d contigs",
                                                           stacksPerContig.size(), originalStream.getNumContigs()));
        }

        for ( int contigStackCount : stacksPerContig ) {
            if ( contigStackCount != originalStream.getNumStacksPerContig() ) {
                throw new ReviewedStingException("contig had incorrect number of stacks");
            }
        }

        // Reads-per-stack stats only exist when the stream actually had stacks
        if ( originalStream.getNumStacksPerContig() > 0 ) {
            if ( minReadsPerStack < originalStream.getMinReadsPerStack() ) {
                throw new ReviewedStingException("stack had fewer than the minimum number of reads");
            }
            if ( maxReadsPerStack > originalStream.getMaxReadsPerStack() ) {
                throw new ReviewedStingException("stack had more than the maximum number of reads");
            }
        }
        else if ( minReadsPerStack != null || maxReadsPerStack != null ) {
            throw new ReviewedStingException("bug in analyzer code: reads per stack was calculated even though 0 stacks per contig was specified");
        }

        // Inter-stack distances only exist when a contig had at least 2 stacks
        if ( originalStream.getNumStacksPerContig() > 1 ) {
            if ( minDistanceBetweenStacks < originalStream.getMinDistanceBetweenStacks() ) {
                throw new ReviewedStingException("stacks were separated by less than the minimum distance");
            }
            if ( maxDistanceBetweenStacks > originalStream.getMaxDistanceBetweenStacks() ) {
                throw new ReviewedStingException("stacks were separated by more than the maximum distance");
            }
        }
        else if ( minDistanceBetweenStacks != null || maxDistanceBetweenStacks != null ) {
            throw new ReviewedStingException("bug in analyzer code: distance between stacks was calculated even though numStacksPerContig was <= 1");
        }

        if ( minReadLength < originalStream.getMinReadLength() ) {
            throw new ReviewedStingException("read was shorter than the minimum allowed length");
        }
        if ( maxReadLength > originalStream.getMaxReadLength() ) {
            throw new ReviewedStingException("read was longer than the maximum allowed length");
        }

        if ( numUnmappedReads != originalStream.getNumUnmappedReads() ) {
            throw new ReviewedStingException(String.format("wrong number of unmapped reads: requested %d but saw %d",
                                                           originalStream.getNumUnmappedReads(), numUnmappedReads));
        }

        if ( (originalStream.getNumContigs() == 0 || originalStream.getNumStacksPerContig() == 0) &&
             numUnmappedReads != totalReads ) {
            throw new ReviewedStingException("stream should have consisted only of unmapped reads, but saw some mapped reads");
        }
    }

    /**
     * Fold a single read into the running statistics. Relies on the stream being
     * ordered: mapped reads grouped by contig and stacked by alignment start, with
     * unmapped reads arriving last.
     */
    public void update( SAMRecord read ) {
        if ( read.getReadUnmappedFlag() ) {
            numUnmappedReads++;

            // First unmapped read after some mapped reads: close out the last mapped
            // contig/stack. processContigChange() increments numContigs as a side effect,
            // so compensate with numContigs-- (no new contig actually started).
            if ( numUnmappedReads == 1 && lastRead != null ) {
                processContigChange();
                numContigs--;
            }
        }
        else if ( lastRead == null ) {
            // Very first (mapped) read: open the first contig and stack
            numContigs = 1;
            currentContigNumStacks = 1;
            currentStackNumReads = 1;
        }
        else if ( ! read.getReferenceIndex().equals(lastRead.getReferenceIndex()) ) {
            processContigChange();
        }
        else if ( read.getAlignmentStart() != lastRead.getAlignmentStart() ) {
            processStackChangeWithinContig(read);
        }
        else {
            // Same contig, same alignment start: the current stack grows
            currentStackNumReads++;
        }

        updateReadLength(read.getReadLength());
        allSamplesMatch = allSamplesMatch && readHasCorrectSample(read);
        totalReads++;
        lastRead = read;
    }

    // Close out the finished contig's stats and open a new contig/stack
    private void processContigChange() {
        numContigs++;
        stacksPerContig.add(currentContigNumStacks);
        currentContigNumStacks = 1;
        updateReadsPerStack(currentStackNumReads);
        currentStackNumReads = 1;
    }

    // Close out the finished stack's stats and record the gap to the new stack
    private void processStackChangeWithinContig( SAMRecord read ) {
        currentContigNumStacks++;
        updateReadsPerStack(currentStackNumReads);
        currentStackNumReads = 1;
        updateDistanceBetweenStacks(read.getAlignmentStart() - lastRead.getAlignmentStart());
    }

    private void updateReadsPerStack( int stackReadCount ) {
        if ( minReadsPerStack == null || stackReadCount < minReadsPerStack ) {
            minReadsPerStack = stackReadCount;
        }
        if ( maxReadsPerStack == null || stackReadCount > maxReadsPerStack ) {
            maxReadsPerStack = stackReadCount;
        }
    }

    private void updateDistanceBetweenStacks( int stackDistance ) {
        if ( minDistanceBetweenStacks == null || stackDistance < minDistanceBetweenStacks ) {
            minDistanceBetweenStacks = stackDistance;
        }
        if ( maxDistanceBetweenStacks == null || stackDistance > maxDistanceBetweenStacks ) {
            maxDistanceBetweenStacks = stackDistance;
        }
    }

    private void updateReadLength( int readLength ) {
        if ( minReadLength == null || readLength < minReadLength ) {
            minReadLength = readLength;
        }
        if ( maxReadLength == null || readLength > maxReadLength ) {
            maxReadLength = readLength;
        }
    }

    // Checks the RG tag directly (matches the tag set by ArtificialSingleSampleReadStream)
    private boolean readHasCorrectSample( SAMRecord read ) {
        return originalStream.getReadGroupID().equals(read.getAttribute("RG"));
    }

    /**
     * Flush the in-progress contig/stack counters into the collected statistics.
     * Must be called after the last update(); a no-op when the stream was empty or
     * ended with unmapped reads (those were already flushed in update()).
     */
    public void finalizeStats() {
        if ( lastRead != null && ! lastRead.getReadUnmappedFlag() ) {
            stacksPerContig.add(currentContigNumStacks);
            updateReadsPerStack(currentStackNumReads);
        }
    }
}

View File

@ -29,7 +29,7 @@ import net.sf.picard.filter.FilteringIterator;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.commandline.Tags;
import org.broadinstitute.sting.gatk.DownsamplingMethod;
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
import org.broadinstitute.sting.gatk.ReadProperties;
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
@ -37,6 +37,7 @@ import org.broadinstitute.sting.gatk.filters.ReadFilter;
import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter;
import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState;
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
import org.broadinstitute.sting.gatk.walkers.qc.CountLoci;
import org.broadinstitute.sting.utils.GenomeLocParser;
import java.util.Collections;
@ -97,7 +98,7 @@ public class DownsamplerBenchmark extends ReadProcessingBenchmark {
},
PER_SAMPLE {
@Override
DownsamplingMethod create() { return GATKArgumentCollection.getDefaultDownsamplingMethod(); }
DownsamplingMethod create() { return DownsamplingMethod.getDefaultDownsamplingMethod(new CountLoci(), false); }
};
abstract DownsamplingMethod create();
}

View File

@ -25,36 +25,40 @@
package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMProgramRecord;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.*;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.commandline.Tags;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
import org.broadinstitute.sting.gatk.filters.ReadFilter;
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStream;
import org.testng.annotations.AfterMethod;
import org.testng.annotations.BeforeMethod;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import org.testng.Assert;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import static org.testng.Assert.*;
/**
* @author aaron
* @version 1.0
* @date Apr 8, 2009
* <p/>
* Class SAMDataSourceUnitTest
* <p/>
@ -66,6 +70,161 @@ public class SAMDataSourceUnitTest extends BaseTest {
private IndexedFastaSequenceFile seq;
private GenomeLocParser genomeLocParser;
/***********************************
* Tests for the fillShard() method
***********************************/
/**
* Tests to ensure that the fillShard() method does not place shard boundaries at inappropriate places,
* such as within an alignment start position
*/
private static class SAMDataSourceFillShardBoundaryTest extends TestDataProvider {
private int numContigs;
private int numStacksPerContig;
private int stackSize;
private int numUnmappedReads;
private DownsamplingMethod downsamplingMethod;
private SAMFileHeader header;
public SAMDataSourceFillShardBoundaryTest( int numContigs,
int numStacksPerContig,
int stackSize,
int numUnmappedReads,
int downsamplingTargetCoverage ) {
super(SAMDataSourceFillShardBoundaryTest.class);
this.numContigs = numContigs;
this.numStacksPerContig = numStacksPerContig;
this.stackSize = stackSize;
this.numUnmappedReads = numUnmappedReads;
this.downsamplingMethod = new DownsamplingMethod(DownsampleType.BY_SAMPLE, downsamplingTargetCoverage, null, true);
setName(String.format("%s: numContigs=%d numStacksPerContig=%d stackSize=%d numUnmappedReads=%d downsamplingTargetCoverage=%d",
getClass().getSimpleName(), numContigs, numStacksPerContig, stackSize, numUnmappedReads, downsamplingTargetCoverage));
}
/**
 * Executes this shard-boundary scenario: constructs a SAMDataSource over the artificial
 * BAM created by createTestBAM(), iterates across all shards, and verifies that
 * (a) no shard contains mapped reads from more than one contig, and
 * (b) reads sharing an alignment start position are never split across a shard boundary.
 */
public void run() {
    SAMDataSource readsSource = new SAMDataSource(Arrays.asList(createTestBAM()),
                                                  new ThreadAllocation(),
                                                  null,
                                                  new GenomeLocParser(header.getSequenceDictionary()),
                                                  false,
                                                  SAMFileReader.ValidationStringency.SILENT,
                                                  null,
                                                  downsamplingMethod,
                                                  new ValidationExclusion(),
                                                  new ArrayList<ReadFilter>(),
                                                  false);

    // The downsampling method supplied in the constructor should have triggered the
    // expanded-shard behavior that this scenario is exercising
    Assert.assertTrue(readsSource.usingExpandedShards());

    Iterable<Shard> shards = readsSource.createShardIteratorOverAllReads(new ReadShardBalancer());

    SAMRecord finalReadOfPreviousShard = null;

    for ( Shard shard : shards ) {
        SAMRecord previousRead = null;
        int contigsSeenInShard = 0;

        for ( SAMRecord currentRead : shard.iterator() ) {
            if ( previousRead == null ) {
                contigsSeenInShard = 1;
            }
            else if ( ! currentRead.getReadUnmappedFlag() && ! previousRead.getReferenceIndex().equals(currentRead.getReferenceIndex()) ) {
                contigsSeenInShard++;
            }

            // If the last read from the previous shard is not unmapped, we have to make sure
            // that no reads in this shard start at the same position
            if ( finalReadOfPreviousShard != null && ! finalReadOfPreviousShard.getReadUnmappedFlag() ) {
                Assert.assertFalse(finalReadOfPreviousShard.getReferenceIndex().equals(currentRead.getReferenceIndex()) &&
                                   finalReadOfPreviousShard.getAlignmentStart() == currentRead.getAlignmentStart(),
                                   String.format("Reads from alignment start position %d:%d are split across multiple shards",
                                                 currentRead.getReferenceIndex(), currentRead.getAlignmentStart()));
            }

            previousRead = currentRead;
        }

        // There should never be reads from more than 1 contig in a shard (ignoring unmapped reads)
        Assert.assertTrue(contigsSeenInShard == 1, "found a shard with reads from multiple contigs");

        finalReadOfPreviousShard = previousRead;
    }
}
/**
 * Builds an artificial single-sample read stream according to this scenario's parameters
 * (numContigs / numStacksPerContig / stackSize / numUnmappedReads), writes it to an
 * indexed temporary BAM file (deleted on JVM exit), and returns a reader ID for the file.
 *
 * Side effect: (re)initializes this.header with an artificial SAM header containing a
 * single read group "foo" for sample "testSample".
 *
 * @return a SAMReaderID pointing at the freshly-written temporary BAM file
 * @throws ReviewedStingException if the temporary file cannot be created
 */
private SAMReaderID createTestBAM() {
    header = ArtificialSAMUtils.createArtificialSamHeader(numContigs, 1, 100000);
    SAMReadGroupRecord readGroup = new SAMReadGroupRecord("foo");
    readGroup.setSample("testSample");
    header.addReadGroup(readGroup);

    // Uniform stacks: min reads per stack == max reads per stack == stackSize
    ArtificialSingleSampleReadStream artificialReads = new ArtificialSingleSampleReadStream(header,
                                                                                            "foo",
                                                                                            numContigs,
                                                                                            numStacksPerContig,
                                                                                            stackSize,
                                                                                            stackSize,
                                                                                            1,
                                                                                            100,
                                                                                            50,
                                                                                            150,
                                                                                            numUnmappedReads);

    File testBAMFile;
    try {
        testBAMFile = File.createTempFile("SAMDataSourceFillShardBoundaryTest", ".bam");
        testBAMFile.deleteOnExit();
    }
    catch ( IOException e ) {
        // Chain the underlying IOException as the cause instead of flattening it into the
        // message -- the original stack trace is essential for diagnosing temp-dir failures
        throw new ReviewedStingException(String.format("Failed to create temp bam file for test %s. %s", this, e.getMessage()), e);
    }

    SAMFileWriter bamWriter = new SAMFileWriterFactory().setCreateIndex(true).makeBAMWriter(header, true, testBAMFile);
    for ( SAMRecord read : artificialReads ) {
        bamWriter.addAlignment(read);
    }
    bamWriter.close();

    return new SAMReaderID(testBAMFile, new Tags());
}
}
@DataProvider(name = "SAMDataSourceFillShardTestDataProvider")
public Object[][] createSAMDataSourceFillShardBoundaryTests() {
    // Take downsampling out of the equation for these tests -- we are only interested in whether the
    // shard boundaries occur at the right places in the read stream, and removing downsampling as a
    // factor simplifies that task (note that we still need to provide a specific downsampling method with
    // experimental downsampling enabled to trigger the shard expansion behavior, for now)
    final int downsamplingTargetCoverage = ReadShard.MAX_READS * 10;

    final int[] stacksPerContigValues = { 1, 2, 4 };

    // Use crucial read shard boundary values as the stack sizes
    final int[] stackSizeValues = { ReadShard.MAX_READS / 2, ReadShard.MAX_READS / 2 + 10, ReadShard.MAX_READS,
                                    ReadShard.MAX_READS - 1, ReadShard.MAX_READS + 1, ReadShard.MAX_READS * 2 };

    final int[] unmappedReadCounts = { 0, ReadShard.MAX_READS / 2, ReadShard.MAX_READS * 2 };

    for ( int contigCount = 1; contigCount <= 3; contigCount++ ) {
        for ( int stacksPerContig : stacksPerContigValues ) {
            for ( int stackSize : stackSizeValues ) {
                for ( int unmappedReadCount : unmappedReadCounts ) {
                    // Instantiation alone registers the scenario with the TestDataProvider framework
                    new SAMDataSourceFillShardBoundaryTest(contigCount, stacksPerContig, stackSize, unmappedReadCount, downsamplingTargetCoverage);
                }
            }
        }
    }

    return SAMDataSourceFillShardBoundaryTest.getTests(SAMDataSourceFillShardBoundaryTest.class);
}
// TODO: re-enable these tests once the issues with filepointer ordering + the downsamplers are worked out
/**
 * Entry point for the shard-boundary scenarios produced by the data provider: logs which
 * scenario is running, then delegates to the scenario's own run() method for all of the
 * actual work and assertions. Currently disabled pending fixes to filepointer ordering
 * in the downsamplers.
 */
@Test(dataProvider = "SAMDataSourceFillShardTestDataProvider", enabled = false)
public void testSAMDataSourceFillShard( SAMDataSourceFillShardBoundaryTest test ) {
    logger.warn("Running test: " + test);
    test.run();
}
// TODO: the legacy tests below should really be replaced with a more comprehensive suite of tests for SAMDataSource
/**
* This function does the setup of our parser, before each method call.
* <p/>

View File

@ -1,73 +1,138 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMReadGroupRecord;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.testng.Assert;
import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStream;
import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStreamAnalyzer;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Arrays;
public class DownsamplingReadsIteratorUnitTest {
public class DownsamplingReadsIteratorUnitTest extends BaseTest {
@Test
public void testDownsamplingIteratorWithPositionalDownsampling() {
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
private static class DownsamplingReadsIteratorTest extends TestDataProvider {
private DownsamplingReadsIterator downsamplingIter;
private int targetCoverage;
private ArtificialSingleSampleReadStream stream;
private ArtificialSingleSampleReadStreamAnalyzer streamAnalyzer;
Collection<SAMRecord> reads = new ArrayList<SAMRecord>();
public DownsamplingReadsIteratorTest( ArtificialSingleSampleReadStream stream, int targetCoverage ) {
super(DownsamplingReadsIteratorTest.class);
reads.addAll(createStackOfIdenticalReads(3000, header, "foo", 0, 1, 100));
reads.addAll(createStackOfIdenticalReads(3000, header, "foo", 0, 50, 100));
this.stream = stream;
this.targetCoverage = targetCoverage;
StingSAMIterator iter = new DownsamplingReadsIterator(StingSAMIteratorAdapter.adapt(reads.iterator()), new PositionalDownsampler<SAMRecord>(1000));
Assert.assertTrue(iter.hasNext());
SAMRecord previous = iter.next();
int count = 1;
while ( iter.hasNext() ) {
SAMRecord current = iter.next();
Assert.assertTrue(previous.getAlignmentStart() <= current.getAlignmentStart() || ! previous.getReferenceIndex().equals(current.getReferenceIndex()));
count++;
previous = current;
setName(String.format("%s: targetCoverage=%d numContigs=%d stacksPerContig=%d readsPerStack=%d-%d distanceBetweenStacks=%d-%d readLength=%d-%d unmappedReads=%d",
getClass().getSimpleName(),
targetCoverage,
stream.getNumContigs(),
stream.getNumStacksPerContig(),
stream.getMinReadsPerStack(),
stream.getMaxReadsPerStack(),
stream.getMinDistanceBetweenStacks(),
stream.getMaxDistanceBetweenStacks(),
stream.getMinReadLength(),
stream.getMaxReadLength(),
stream.getNumUnmappedReads()));
}
Assert.assertEquals(count, 1000);
public void run() {
streamAnalyzer = new PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer(stream, targetCoverage);
downsamplingIter = new DownsamplingReadsIterator(stream.getStingSAMIterator(), new SimplePositionalDownsampler<SAMRecord>(targetCoverage));
streamAnalyzer.analyze(downsamplingIter);
// Check whether the observed properties of the downsampled stream are what they should be
streamAnalyzer.validate();
// Allow memory used by this test to be reclaimed
stream = null;
streamAnalyzer = null;
downsamplingIter = null;
}
}
@Test
public void testDownsamplingIteratorNoEffectiveDownsampling() {
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
@DataProvider(name = "DownsamplingReadsIteratorTestDataProvider")
public Object[][] createDownsamplingReadsIteratorTests() {
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(5, 1, 10000);
String readGroupID = "testReadGroup";
SAMReadGroupRecord readGroup = new SAMReadGroupRecord(readGroupID);
readGroup.setSample("testSample");
header.addReadGroup(readGroup);
Collection<SAMRecord> reads = new ArrayList<SAMRecord>();
// Values that don't vary across tests
int targetCoverage = 10;
int minReadLength = 50;
int maxReadLength = 100;
int minDistanceBetweenStacks = 1;
int maxDistanceBetweenStacks = maxReadLength + 1;
reads.addAll(createStackOfIdenticalReads(300, header, "foo", 0, 1, 100));
reads.addAll(createStackOfIdenticalReads(300, header, "foo", 0, 50, 100));
GenomeAnalysisEngine.resetRandomGenerator();
StingSAMIterator iter = new DownsamplingReadsIterator(StingSAMIteratorAdapter.adapt(reads.iterator()), new PositionalDownsampler<SAMRecord>(1000));
Assert.assertTrue(iter.hasNext());
SAMRecord previous = iter.next();
int count = 1;
while ( iter.hasNext() ) {
SAMRecord current = iter.next();
Assert.assertTrue(previous.getAlignmentStart() <= current.getAlignmentStart() || ! previous.getReferenceIndex().equals(current.getReferenceIndex()));
count++;
previous = current;
// brute force testing!
for ( int numContigs : Arrays.asList(1, 2, 5) ) {
for ( int stacksPerContig : Arrays.asList(1, 2, 10) ) {
for ( int minReadsPerStack : Arrays.asList(1, targetCoverage / 2, targetCoverage, targetCoverage - 1, targetCoverage + 1, targetCoverage * 2) ) {
for ( int maxReadsPerStack : Arrays.asList(1, targetCoverage / 2, targetCoverage, targetCoverage - 1, targetCoverage + 1, targetCoverage * 2) ) {
for ( int numUnmappedReads : Arrays.asList(0, 1, targetCoverage, targetCoverage * 2) ) {
// Only interested in sane read stream configurations here
if ( minReadsPerStack <= maxReadsPerStack ) {
new DownsamplingReadsIteratorTest(new ArtificialSingleSampleReadStream(header,
readGroupID,
numContigs,
stacksPerContig,
minReadsPerStack,
maxReadsPerStack,
minDistanceBetweenStacks,
maxDistanceBetweenStacks,
minReadLength,
maxReadLength,
numUnmappedReads),
targetCoverage);
}
}
}
}
}
}
Assert.assertEquals(count, 600);
return DownsamplingReadsIteratorTest.getTests(DownsamplingReadsIteratorTest.class);
}
private ArrayList<SAMRecord> createStackOfIdenticalReads( int stackSize, SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) {
ArrayList<SAMRecord> stack = new ArrayList<SAMRecord>(stackSize);
for ( int i = 1; i <= stackSize; i++ ) {
stack.add(ArtificialSAMUtils.createArtificialRead(header, name, refIndex, alignmentStart, length));
}
return stack;
@Test(dataProvider = "DownsamplingReadsIteratorTestDataProvider")
public void runDownsamplingReadsIteratorTest( DownsamplingReadsIteratorTest test ) {
logger.warn("Running test: " + test);
GenomeAnalysisEngine.resetRandomGenerator();
test.run();
}
}

View File

@ -1,65 +1,157 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import org.testng.Assert;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
public class FractionalDownsamplerUnitTest {
public class FractionalDownsamplerUnitTest extends BaseTest {
@Test
public void test100PercentInclusion() {
FractionalDownsampler<SAMRecord> downsampler = new FractionalDownsampler<SAMRecord>(1.0);
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
private static class FractionalDownsamplerTest extends TestDataProvider {
double fraction;
int totalReads;
int expectedMinNumReadsAfterDownsampling;
int expectedMaxNumReadsAfterDownsampling;
int expectedMinDiscardedItems;
int expectedMaxDiscardedItems;
downsampler.submit(createRandomReads(1000, header, "foo", 0, 100000, 500));
downsampler.signalEndOfInput();
private static final double EXPECTED_ACCURACY = 0.05; // should be accurate to within +/- this percent
List<SAMRecord> downsampledReads = downsampler.consumeDownsampledItems();
public FractionalDownsamplerTest( double fraction, int totalReads ) {
super(FractionalDownsamplerTest.class);
Assert.assertTrue(downsampledReads.size() == 1000);
}
this.fraction = fraction;
this.totalReads = totalReads;
@Test
public void test0PercentInclusion() {
FractionalDownsampler<SAMRecord> downsampler = new FractionalDownsampler<SAMRecord>(0.0);
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
calculateExpectations();
downsampler.submit(createRandomReads(1000, header, "foo", 0, 100000, 500));
downsampler.signalEndOfInput();
List<SAMRecord> downsampledReads = downsampler.consumeDownsampledItems();
Assert.assertTrue(downsampledReads.isEmpty());
}
@Test
public void test50PercentInclusion() {
FractionalDownsampler<SAMRecord> downsampler = new FractionalDownsampler<SAMRecord>(0.5);
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
downsampler.submit(createRandomReads(5000, header, "foo", 0, 100000, 500));
downsampler.signalEndOfInput();
List<SAMRecord> downsampledReads = downsampler.consumeDownsampledItems();
Assert.assertTrue(downsampledReads.size() >= 2000 && downsampledReads.size() <= 3000);
}
private List<SAMRecord> createRandomReads( int numReads, SAMFileHeader header, String name, int contigIndex, int maxAlignmentStart, int maxLength ) {
List<SAMRecord> reads = new ArrayList<SAMRecord>(numReads);
for ( int i = 1; i <= numReads; i++ ) {
reads.add(ArtificialSAMUtils.createArtificialRead(header, name, contigIndex,
GenomeAnalysisEngine.getRandomGenerator().nextInt(maxAlignmentStart) + 1,
GenomeAnalysisEngine.getRandomGenerator().nextInt(maxLength) + 1));
setName(String.format("%s: fraction=%.2f totalReads=%d expectedMinNumReadsAfterDownsampling=%d expectedMaxNumReadsAfterDownsampling=%d",
getClass().getSimpleName(), fraction, totalReads, expectedMinNumReadsAfterDownsampling, expectedMaxNumReadsAfterDownsampling));
}
return reads;
private void calculateExpectations() {
// Require an exact match in the 0% and 100% cases
if ( fraction == 0.0 ) {
expectedMinNumReadsAfterDownsampling = expectedMaxNumReadsAfterDownsampling = 0;
expectedMinDiscardedItems = expectedMaxDiscardedItems = totalReads;
}
else if ( fraction == 1.0 ) {
expectedMinNumReadsAfterDownsampling = expectedMaxNumReadsAfterDownsampling = totalReads;
expectedMinDiscardedItems = expectedMaxDiscardedItems = 0;
}
else {
expectedMinNumReadsAfterDownsampling = Math.max((int)((fraction - EXPECTED_ACCURACY) * totalReads), 0);
expectedMaxNumReadsAfterDownsampling = Math.min((int) ((fraction + EXPECTED_ACCURACY) * totalReads), totalReads);
expectedMinDiscardedItems = totalReads - expectedMaxNumReadsAfterDownsampling;
expectedMaxDiscardedItems = totalReads - expectedMinNumReadsAfterDownsampling;
}
}
public Collection<SAMRecord> createReads() {
Collection<SAMRecord> reads = new ArrayList<SAMRecord>(totalReads);
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
reads.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(totalReads, header, "foo", 0, 1, 100));
return reads;
}
}
@DataProvider(name = "FractionalDownsamplerTestDataProvider")
public Object[][] createFractionalDownsamplerTestData() {
for ( double fraction : Arrays.asList(0.0, 0.25, 0.5, 0.75, 1.0) ) {
for ( int totalReads : Arrays.asList(0, 1000, 10000) ) {
new FractionalDownsamplerTest(fraction, totalReads);
}
}
return FractionalDownsamplerTest.getTests(FractionalDownsamplerTest.class);
}
@Test(dataProvider = "FractionalDownsamplerTestDataProvider")
public void runFractionalDownsamplerTest( FractionalDownsamplerTest test ) {
logger.warn("Running test: " + test);
GenomeAnalysisEngine.resetRandomGenerator();
ReadsDownsampler<SAMRecord> downsampler = new FractionalDownsampler<SAMRecord>(test.fraction);
downsampler.submit(test.createReads());
if ( test.totalReads > 0 ) {
if ( test.fraction > FractionalDownsamplerTest.EXPECTED_ACCURACY ) {
Assert.assertTrue(downsampler.hasFinalizedItems());
Assert.assertTrue(downsampler.peekFinalized() != null);
}
Assert.assertFalse(downsampler.hasPendingItems());
Assert.assertTrue(downsampler.peekPending() == null);
}
else {
Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems());
Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null);
}
downsampler.signalEndOfInput();
if ( test.totalReads > 0 ) {
if ( test.fraction > FractionalDownsamplerTest.EXPECTED_ACCURACY ) {
Assert.assertTrue(downsampler.hasFinalizedItems());
Assert.assertTrue(downsampler.peekFinalized() != null);
}
Assert.assertFalse(downsampler.hasPendingItems());
Assert.assertTrue(downsampler.peekPending() == null);
}
else {
Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems());
Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null);
}
List<SAMRecord> downsampledReads = downsampler.consumeFinalizedItems();
Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems());
Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null);
Assert.assertTrue(downsampledReads.size() >= test.expectedMinNumReadsAfterDownsampling &&
downsampledReads.size() <= test.expectedMaxNumReadsAfterDownsampling);
Assert.assertTrue(downsampler.getNumberOfDiscardedItems() >= test.expectedMinDiscardedItems &&
downsampler.getNumberOfDiscardedItems() <= test.expectedMaxDiscardedItems);
Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), test.totalReads - downsampledReads.size());
downsampler.reset();
Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0);
}
}

View File

@ -0,0 +1,163 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.testng.annotations.Test;
import org.testng.annotations.DataProvider;
import org.testng.Assert;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;
/**
 * Unit tests for LevelingDownsampler: submits collections of uniformly-sized stacks of
 * items and verifies the downsampler's state-machine contract (pending vs. finalized
 * items before/after end-of-input), the per-stack leveled sizes, and the discarded-item
 * accounting.
 */
public class LevelingDownsamplerUnitTest extends BaseTest {

    /**
     * One test scenario: numStacks stacks, each containing exactly stackSize items, to be
     * leveled down to targetSize total items. Instantiating a scenario registers it with
     * the TestDataProvider framework.
     */
    private static class LevelingDownsamplerUniformStacksTest extends TestDataProvider {
        // Backing list implementation for each stack -- the downsampler is exercised with
        // both a random-access and a linked list
        public enum DataStructure { LINKED_LIST, ARRAY_LIST }

        int targetSize;              // total item count the downsampler is asked to level down to
        int numStacks;               // number of stacks submitted
        int stackSize;               // items per stack (uniform across all stacks)
        DataStructure dataStructure; // list implementation used for each stack
        int expectedSize;            // expected per-stack size after leveling (test allows +/- 1)

        public LevelingDownsamplerUniformStacksTest( int targetSize, int numStacks, int stackSize, DataStructure dataStructure ) {
            super(LevelingDownsamplerUniformStacksTest.class);

            this.targetSize = targetSize;
            this.numStacks = numStacks;
            this.stackSize = stackSize;
            this.dataStructure = dataStructure;
            expectedSize = calculateExpectedDownsampledStackSize();

            setName(String.format("%s: targetSize=%d numStacks=%d stackSize=%d dataStructure=%s expectedSize=%d",
                    getClass().getSimpleName(), targetSize, numStacks, stackSize, dataStructure, expectedSize));
        }

        /**
         * Builds the input for the downsampler: numStacks stacks of stackSize fresh Objects
         * each, using the list implementation selected by dataStructure.
         */
        public Collection<List<Object>> createStacks() {
            Collection<List<Object>> stacks = new ArrayList<List<Object>>();

            for ( int i = 1; i <= numStacks; i++ ) {
                List<Object> stack = dataStructure == DataStructure.LINKED_LIST ? new LinkedList<Object>() : new ArrayList<Object>();

                for ( int j = 1; j <= stackSize; j++ ) {
                    stack.add(new Object());
                }

                stacks.add(stack);
            }

            return stacks;
        }

        /**
         * Mirrors the leveling arithmetic for uniform stacks: the surplus over targetSize is
         * spread evenly across the stacks, with every stack keeping at least one item.
         * Returns 0 for the degenerate no-stacks case, and the full stackSize when there is
         * no surplus to remove.
         */
        private int calculateExpectedDownsampledStackSize() {
            int numItemsToRemove = numStacks * stackSize - targetSize;

            if ( numStacks == 0 ) {
                return 0;
            }
            else if ( numItemsToRemove <= 0 ) {
                return stackSize;
            }

            return Math.max(1, stackSize - (numItemsToRemove / numStacks));
        }
    }

    /**
     * Brute-force grid of scenarios: target sizes 1..10000 (x10 steps), 0..10 stacks,
     * stack sizes 1..1000 (x10 steps), crossed with both list implementations.
     */
    @DataProvider(name = "UniformStacksDataProvider")
    public Object[][] createUniformStacksTestData() {
        for ( int targetSize = 1; targetSize <= 10000; targetSize *= 10 ) {
            for ( int numStacks = 0; numStacks <= 10; numStacks++ ) {
                for ( int stackSize = 1; stackSize <= 1000; stackSize *= 10 ) {
                    for ( LevelingDownsamplerUniformStacksTest.DataStructure dataStructure : LevelingDownsamplerUniformStacksTest.DataStructure.values() ) {
                        new LevelingDownsamplerUniformStacksTest(targetSize, numStacks, stackSize, dataStructure);
                    }
                }
            }
        }

        return LevelingDownsamplerUniformStacksTest.getTests(LevelingDownsamplerUniformStacksTest.class);
    }

    /**
     * Runs one scenario end to end. Checks, in order:
     *  - after submit(): items are pending but not finalized (when any stacks were submitted);
     *  - after signalEndOfInput(): items are finalized and nothing is pending;
     *  - after consumeFinalizedItems(): the downsampler is empty, every surviving stack is
     *    within 1 of the expected leveled size, the discarded-item count matches the items
     *    actually removed, reset() zeroes the discard counter, and the total surviving item
     *    count does not exceed max(targetSize, numStacks).
     */
    @Test( dataProvider = "UniformStacksDataProvider" )
    public void testLevelingDownsamplerWithUniformStacks( LevelingDownsamplerUniformStacksTest test ) {
        logger.warn("Running test: " + test);

        GenomeAnalysisEngine.resetRandomGenerator();

        Downsampler<List<Object>> downsampler = new LevelingDownsampler<List<Object>, Object>(test.targetSize);

        downsampler.submit(test.createStacks());

        if ( test.numStacks > 0 ) {
            // Leveling cannot begin until end-of-input, so everything is still pending
            Assert.assertFalse(downsampler.hasFinalizedItems());
            Assert.assertTrue(downsampler.peekFinalized() == null);
            Assert.assertTrue(downsampler.hasPendingItems());
            Assert.assertTrue(downsampler.peekPending() != null);
        }
        else {
            Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems());
            Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null);
        }

        downsampler.signalEndOfInput();

        if ( test.numStacks > 0 ) {
            // End-of-input converts all pending items into finalized items
            Assert.assertTrue(downsampler.hasFinalizedItems());
            Assert.assertTrue(downsampler.peekFinalized() != null);
            Assert.assertFalse(downsampler.hasPendingItems());
            Assert.assertTrue(downsampler.peekPending() == null);
        }
        else {
            Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems());
            Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null);
        }

        List<List<Object>> downsampledStacks = downsampler.consumeFinalizedItems();

        // Consuming drains the downsampler completely
        Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems());
        Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null);

        // Leveling removes items from stacks but never removes a stack outright
        Assert.assertEquals(downsampledStacks.size(), test.numStacks);

        int totalRemainingItems = 0;
        for ( List<Object> stack : downsampledStacks ) {
            Assert.assertTrue(Math.abs(stack.size() - test.expectedSize) <= 1);
            totalRemainingItems += stack.size();
        }

        int numItemsReportedDiscarded = downsampler.getNumberOfDiscardedItems();
        int numItemsActuallyDiscarded = test.numStacks * test.stackSize - totalRemainingItems;
        Assert.assertEquals(numItemsReportedDiscarded, numItemsActuallyDiscarded);

        downsampler.reset();
        Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0);

        // Each stack keeps >= 1 item, so the floor is numStacks even when targetSize is smaller
        Assert.assertTrue(totalRemainingItems <= Math.max(test.targetSize, test.numStacks));
    }
}

View File

@ -0,0 +1,298 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMReadGroupRecord;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.gatk.iterators.VerifyingSamIterator;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.sam.ArtificialMultiSampleReadStream;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStream;
import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStreamAnalyzer;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.*;
public class PerSampleDownsamplingReadsIteratorUnitTest extends BaseTest {
/**
 * One test scenario for PerSampleDownsamplingReadsIterator: a merged multi-sample read
 * stream built from randomized per-sample artificial streams, pushed through the
 * downsampling iterator and validated per sample by stream analyzers. Instantiating a
 * scenario registers it with the TestDataProvider framework.
 */
private static class PerSampleDownsamplingReadsIteratorTest extends TestDataProvider {

    // TODO: tests should distinguish between variance across samples and variance within a sample

    // Range of distances between successive read stacks in a sample's stream
    private enum StreamDensity {
        SPARSE        (MAX_READ_LENGTH,     MAX_READ_LENGTH * 2),
        DENSE         (1,                   MIN_READ_LENGTH),
        MIXED         (1,                   MAX_READ_LENGTH * 2),
        UNIFORM_DENSE (1,                   1),
        UNIFORM_SPARSE(MAX_READ_LENGTH * 2, MAX_READ_LENGTH * 2);

        int minDistanceBetweenStacks;
        int maxDistanceBetweenStacks;

        StreamDensity( int minDistanceBetweenStacks, int maxDistanceBetweenStacks ) {
            this.minDistanceBetweenStacks = minDistanceBetweenStacks;
            this.maxDistanceBetweenStacks = maxDistanceBetweenStacks;
        }

        public String toString() {
            return String.format("StreamDensity:%d-%d", minDistanceBetweenStacks, maxDistanceBetweenStacks);
        }
    }

    // Range of reads per stack in a sample's stream
    private enum StreamStackDepth {
        NON_UNIFORM_LOW  (1,  5),
        NON_UNIFORM_HIGH (15, 20),
        NON_UNIFORM_MIXED(1,  20),
        UNIFORM_SINGLE   (1,  1),
        UNIFORM_LOW      (2,  2),
        UNIFORM_HIGH     (20, 20),
        UNIFORM_MEDIUM   (10, 10); // should set target coverage to this value for testing

        int minReadsPerStack;
        int maxReadsPerStack;

        StreamStackDepth( int minReadsPerStack, int maxReadsPerStack ) {
            this.minReadsPerStack = minReadsPerStack;
            this.maxReadsPerStack = maxReadsPerStack;
        }

        public boolean isUniform() {
            return minReadsPerStack == maxReadsPerStack;
        }

        public String toString() {
            return String.format("StreamStackDepth:%d-%d", minReadsPerStack, maxReadsPerStack);
        }
    }

    // Range of stacks per contig in a sample's stream
    private enum StreamStacksPerContig {
        UNIFORM(20, 20),
        NON_UNIFORM(1, 30);

        int minStacksPerContig;
        int maxStacksPerContig;

        StreamStacksPerContig( int minStacksPerContig, int maxStacksPerContig ) {
            this.minStacksPerContig = minStacksPerContig;
            this.maxStacksPerContig = maxStacksPerContig;
        }

        public boolean isUniform() {
            return minStacksPerContig == maxStacksPerContig;
        }

        public String toString() {
            return String.format("StreamStacksPerContig:%d-%d", minStacksPerContig, maxStacksPerContig);
        }
    }

    // Not interested in testing multiple ranges for the read lengths, as none of our current
    // downsamplers are affected by read length
    private static final int MIN_READ_LENGTH = 50;
    private static final int MAX_READ_LENGTH = 150;

    private ReadsDownsamplerFactory<SAMRecord> downsamplerFactory; // supplies a fresh downsampler per sample
    private int targetCoverage;                                    // per-position coverage target for downsampling
    private int numSamples;                                        // one read group / artificial stream per sample
    private int minContigs;                                        // each sample gets a random contig count in
    private int maxContigs;                                        //   [minContigs, maxContigs]
    private StreamDensity streamDensity;
    private StreamStackDepth streamStackDepth;
    private StreamStacksPerContig streamStacksPerContig;
    private double unmappedReadsFraction;                          // probability that a sample gets unmapped reads
    private int unmappedReadsCount;                                // how many unmapped reads such a sample gets
    private boolean verifySortedness;                              // wrap the iterator in a sort-order verifier

    private ArtificialMultiSampleReadStream mergedReadStream;      // all per-sample streams merged together
    private Map<String, ArtificialSingleSampleReadStream> perSampleArtificialReadStreams;  // keyed by sample name
    private Map<String, ArtificialSingleSampleReadStreamAnalyzer> perSampleStreamAnalyzers; // keyed by sample name
    private SAMFileHeader header;

    public PerSampleDownsamplingReadsIteratorTest( ReadsDownsamplerFactory<SAMRecord> downsamplerFactory,
                                                   int targetCoverage,
                                                   int numSamples,
                                                   int minContigs,
                                                   int maxContigs,
                                                   StreamDensity streamDensity,
                                                   StreamStackDepth streamStackDepth,
                                                   StreamStacksPerContig streamStacksPerContig,
                                                   double unmappedReadsFraction,
                                                   int unmappedReadsCount,
                                                   boolean verifySortedness ) {
        super(PerSampleDownsamplingReadsIteratorTest.class);

        this.downsamplerFactory = downsamplerFactory;
        this.targetCoverage = targetCoverage;
        this.numSamples = numSamples;
        this.minContigs = minContigs;
        this.maxContigs = maxContigs;
        this.streamDensity = streamDensity;
        this.streamStackDepth = streamStackDepth;
        this.streamStacksPerContig = streamStacksPerContig;
        this.unmappedReadsFraction = unmappedReadsFraction;
        this.unmappedReadsCount = unmappedReadsCount;
        this.verifySortedness = verifySortedness;

        // Build the header and per-sample streams eagerly so the scenario is self-contained
        header = createHeader();
        createReadStreams();

        setName(String.format("%s: targetCoverage=%d numSamples=%d minContigs=%d maxContigs=%d %s %s %s unmappedReadsFraction=%.2f unmappedReadsCount=%d verifySortedness=%b",
                getClass().getSimpleName(), targetCoverage, numSamples, minContigs, maxContigs, streamDensity, streamStackDepth, streamStacksPerContig, unmappedReadsFraction, unmappedReadsCount, verifySortedness));
    }

    /**
     * Creates an artificial SAM header with maxContigs contigs (each sized to fit the
     * densest possible stream, plus slack) and one enumerated read group per sample.
     */
    private SAMFileHeader createHeader() {
        SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(maxContigs, 1, (streamDensity.maxDistanceBetweenStacks + MAX_READ_LENGTH) * streamStacksPerContig.maxStacksPerContig + 100000);

        List<String> readGroups = new ArrayList<String>(numSamples);
        List<String> sampleNames = new ArrayList<String>(numSamples);

        for ( int i = 0; i < numSamples; i++ ) {
            readGroups.add("ReadGroup" + i);
            sampleNames.add("Sample" + i);
        }

        return ArtificialSAMUtils.createEnumeratedReadGroups(header, readGroups, sampleNames);
    }

    /**
     * For each read group in the header, builds a randomized single-sample read stream
     * (contig count, stacks per contig, and unmapped-read count drawn per sample) plus a
     * matching positionally-downsampled stream analyzer, then merges all streams into
     * mergedReadStream.
     */
    private void createReadStreams() {
        perSampleArtificialReadStreams = new HashMap<String, ArtificialSingleSampleReadStream>(numSamples);
        perSampleStreamAnalyzers = new HashMap<String, ArtificialSingleSampleReadStreamAnalyzer>(numSamples);

        for (SAMReadGroupRecord readGroup : header.getReadGroups() ) {
            String readGroupID = readGroup.getReadGroupId();
            String sampleName = readGroup.getSample();

            int thisSampleNumContigs = MathUtils.randomIntegerInRange(minContigs, maxContigs);
            int thisSampleStacksPerContig = MathUtils.randomIntegerInRange(streamStacksPerContig.minStacksPerContig, streamStacksPerContig.maxStacksPerContig);

            // All-or-nothing: a sample has either unmappedReadsCount unmapped reads or none
            int thisSampleNumUnmappedReads = GenomeAnalysisEngine.getRandomGenerator().nextDouble() < unmappedReadsFraction ? unmappedReadsCount : 0;

            ArtificialSingleSampleReadStream thisSampleStream = new ArtificialSingleSampleReadStream(header,
                                                                                                    readGroupID,
                                                                                                    thisSampleNumContigs,
                                                                                                    thisSampleStacksPerContig,
                                                                                                    streamStackDepth.minReadsPerStack,
                                                                                                    streamStackDepth.maxReadsPerStack,
                                                                                                    streamDensity.minDistanceBetweenStacks,
                                                                                                    streamDensity.maxDistanceBetweenStacks,
                                                                                                    MIN_READ_LENGTH,
                                                                                                    MAX_READ_LENGTH,
                                                                                                    thisSampleNumUnmappedReads);
            perSampleArtificialReadStreams.put(sampleName, thisSampleStream);
            perSampleStreamAnalyzers.put(sampleName, new PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer(thisSampleStream, targetCoverage));
        }

        mergedReadStream = new ArtificialMultiSampleReadStream(perSampleArtificialReadStreams.values());
    }

    /**
     * Executes the scenario: feeds the merged stream through a PerSampleDownsamplingReadsIterator
     * (optionally wrapped in a VerifyingSamIterator to check sort order), routes every emitted
     * read to its sample's stream analyzer, then finalizes and validates each sample's
     * downsampled stream individually.
     */
    public void run() {
        StingSAMIterator downsamplingIter = new PerSampleDownsamplingReadsIterator(mergedReadStream.getStingSAMIterator(), downsamplerFactory);

        if ( verifySortedness ) {
            downsamplingIter = new VerifyingSamIterator(downsamplingIter);
        }

        while ( downsamplingIter.hasNext() ) {
            SAMRecord read = downsamplingIter.next();
            String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null;

            ArtificialSingleSampleReadStreamAnalyzer analyzer = perSampleStreamAnalyzers.get(sampleName);
            if ( analyzer != null ) {
                analyzer.update(read);
            }
            else {
                throw new ReviewedStingException("bug: stream analyzer for sample " + sampleName + " not found");
            }
        }

        for ( Map.Entry<String, ArtificialSingleSampleReadStreamAnalyzer> analyzerEntry : perSampleStreamAnalyzers.entrySet() ) {
            ArtificialSingleSampleReadStreamAnalyzer analyzer = analyzerEntry.getValue();
            analyzer.finalizeStats();

            // Validate the downsampled read stream for each sample individually
            analyzer.validate();
        }

        // Allow memory used by this test to be reclaimed:
        mergedReadStream = null;
        perSampleArtificialReadStreams = null;
        perSampleStreamAnalyzers = null;
    }
}
@DataProvider(name = "PerSampleDownsamplingReadsIteratorTestDataProvider")
public Object[][] createPerSampleDownsamplingReadsIteratorTests() {
GenomeAnalysisEngine.resetRandomGenerator();
// Some values don't vary across tests
int targetCoverage = PerSampleDownsamplingReadsIteratorTest.StreamStackDepth.UNIFORM_MEDIUM.minReadsPerStack;
ReadsDownsamplerFactory<SAMRecord> downsamplerFactory = new SimplePositionalDownsamplerFactory<SAMRecord>(targetCoverage);
int maxContigs = 3;
boolean verifySortedness = true;
for ( int numSamples : Arrays.asList(1, 2, 10) ) {
for ( int minContigs = 1; minContigs <= maxContigs; minContigs++ ) {
for ( PerSampleDownsamplingReadsIteratorTest.StreamDensity streamDensity : PerSampleDownsamplingReadsIteratorTest.StreamDensity.values() ) {
for ( PerSampleDownsamplingReadsIteratorTest.StreamStackDepth streamStackDepth : PerSampleDownsamplingReadsIteratorTest.StreamStackDepth.values() ) {
for (PerSampleDownsamplingReadsIteratorTest.StreamStacksPerContig streamStacksPerContig : PerSampleDownsamplingReadsIteratorTest.StreamStacksPerContig.values() ) {
for ( double unmappedReadsFraction : Arrays.asList(0.0, 1.0, 0.5) ) {
for ( int unmappedReadsCount : Arrays.asList(1, 50) ) {
new PerSampleDownsamplingReadsIteratorTest(downsamplerFactory,
targetCoverage,
numSamples,
minContigs,
maxContigs,
streamDensity,
streamStackDepth,
streamStacksPerContig,
unmappedReadsFraction,
unmappedReadsCount,
verifySortedness);
}
}
}
}
}
}
}
return PerSampleDownsamplingReadsIteratorTest.getTests(PerSampleDownsamplingReadsIteratorTest.class);
}
@Test(dataProvider = "PerSampleDownsamplingReadsIteratorTestDataProvider")
public void runPerSampleDownsamplingReadsIteratorTest( PerSampleDownsamplingReadsIteratorTest test ) {
logger.warn("Running test: " + test);
GenomeAnalysisEngine.resetRandomGenerator();
test.run();
}
}

View File

@ -1,357 +0,0 @@
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.annotations.Test;
import org.testng.Assert;
import java.util.*;
// TODO: generalize these tests so that all possible arrangements of 1-4 stacks can be tested
/**
 * Unit tests for PositionalDownsampler. Each test submits one or more "stacks" of reads
 * (groups of reads sharing an alignment start) to a downsampler with a target coverage of
 * 1000, then verifies that the downsampled output remains coordinate-sorted and that the
 * resulting stack sizes (individually and, for overlapping stacks, in combination) respect
 * the coverage target.
 *
 * NOTE(review): this class uses the hasDownsampledItems()/consumeDownsampledItems() API,
 * while the other downsampler tests in this file use hasFinalizedItems()/consumeFinalizedItems()
 * — presumably PositionalDownsampler predates that rename; confirm against its declaration.
 */
public class PositionalDownsamplerUnitTest extends BaseTest {

    /**
     * Three identical stacks that all overlap:
     * -------
     * -------
     *   -------
     *   -------
     *     -------
     *     -------
     */
    @Test
    public void testThreeOverlappingIdenticalStacks() {
        SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
        PositionalDownsampler<SAMRecord> downsampler = new PositionalDownsampler<SAMRecord>(1000);

        // Stacks at positions 1, 25, and 50 (length 100) all overlap, so nothing can be
        // finalized until end-of-input
        downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 100));
        Assert.assertFalse(downsampler.hasDownsampledItems());
        Assert.assertTrue(downsampler.hasPendingItems());

        downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 25, 100));
        Assert.assertFalse(downsampler.hasDownsampledItems());
        Assert.assertTrue(downsampler.hasPendingItems());

        downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 50, 100));
        Assert.assertFalse(downsampler.hasDownsampledItems());
        Assert.assertTrue(downsampler.hasPendingItems());

        downsampler.signalEndOfInput();
        Assert.assertTrue(downsampler.hasDownsampledItems());
        Assert.assertFalse(downsampler.hasPendingItems());

        List<Integer> downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems());
        System.out.println("testThreeOverlappingIdenticalStacks: Downsampled Stack sizes: " + downsampledStackSizes);

        Assert.assertEquals(downsampledStackSizes.size(), 3);
        Assert.assertTrue(downsampledStackSizes.get(0) <= 1000);
        Assert.assertTrue(downsampledStackSizes.get(1) <= 1000);
        Assert.assertTrue(downsampledStackSizes.get(2) <= 1000);
        // All three stacks overlap, so their combined depth must also respect the target
        Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(1) + downsampledStackSizes.get(2) <= 1000);
    }

    /**
     * Three identical stacks with no overlap:
     * -------
     * -------
     *          -------
     *          -------
     *                   -------
     *                   -------
     */
    @Test
    public void testThreeNonOverlappingIdenticalStacks() {
        SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
        PositionalDownsampler<SAMRecord> downsampler = new PositionalDownsampler<SAMRecord>(1000);

        downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 100));
        Assert.assertFalse(downsampler.hasDownsampledItems());
        Assert.assertTrue(downsampler.hasPendingItems());

        downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 201, 100));
        Assert.assertFalse(downsampler.hasDownsampledItems());
        Assert.assertTrue(downsampler.hasPendingItems());

        // The third stack clears the first (non-overlapping) stack for release
        downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 301, 100));
        Assert.assertTrue(downsampler.hasDownsampledItems());
        Assert.assertTrue(downsampler.hasPendingItems());

        downsampler.signalEndOfInput();
        Assert.assertTrue(downsampler.hasDownsampledItems());
        Assert.assertFalse(downsampler.hasPendingItems());

        List<Integer> downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems());
        System.out.println("testThreeNonOverlappingIdenticalStacks: Downsampled Stack sizes: " + downsampledStackSizes);

        Assert.assertEquals(downsampledStackSizes.size(), 3);
        // Non-overlapping stacks can each be downsampled to exactly the target coverage
        Assert.assertTrue(downsampledStackSizes.get(0) == 1000);
        Assert.assertTrue(downsampledStackSizes.get(1) == 1000);
        Assert.assertTrue(downsampledStackSizes.get(2) == 1000);
    }

    /**
     * Short stack at the beginning, overlapping the second stack only:
     * ---
     * ---
     *   -------
     *   -------
     *       -------
     *       -------
     */
    @Test
    public void testThreeStacksWithShortStackAtBeginning() {
        SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
        PositionalDownsampler<SAMRecord> downsampler = new PositionalDownsampler<SAMRecord>(1000);

        downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 25));
        Assert.assertFalse(downsampler.hasDownsampledItems());
        Assert.assertTrue(downsampler.hasPendingItems());

        downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 20, 100));
        Assert.assertFalse(downsampler.hasDownsampledItems());
        Assert.assertTrue(downsampler.hasPendingItems());

        downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 50, 100));
        Assert.assertFalse(downsampler.hasDownsampledItems());
        Assert.assertTrue(downsampler.hasPendingItems());

        downsampler.signalEndOfInput();
        Assert.assertTrue(downsampler.hasDownsampledItems());
        Assert.assertFalse(downsampler.hasPendingItems());

        List<Integer> downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems());
        System.out.println("testThreeStacksWithShortStackAtBeginning: Downsampled Stack sizes: " + downsampledStackSizes);

        Assert.assertEquals(downsampledStackSizes.size(), 3);
        Assert.assertTrue(downsampledStackSizes.get(0) <= 1000);
        Assert.assertTrue(downsampledStackSizes.get(1) <= 1000);
        Assert.assertTrue(downsampledStackSizes.get(2) <= 1000);
        // Pairs of overlapping stacks must jointly respect the target
        Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(1) <= 1000);
        Assert.assertTrue(downsampledStackSizes.get(1) + downsampledStackSizes.get(2) <= 1000);
    }

    /**
     * Short stack in the middle, overlapping both neighbors:
     * -------
     * -------
     *   ---
     *   ---
     *     -------
     *     -------
     */
    @Test
    public void testThreeStacksWithShortStackInMiddle() {
        SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
        PositionalDownsampler<SAMRecord> downsampler = new PositionalDownsampler<SAMRecord>(1000);

        downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 100));
        Assert.assertFalse(downsampler.hasDownsampledItems());
        Assert.assertTrue(downsampler.hasPendingItems());

        downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 25, 25));
        Assert.assertFalse(downsampler.hasDownsampledItems());
        Assert.assertTrue(downsampler.hasPendingItems());

        downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 75, 100));
        Assert.assertFalse(downsampler.hasDownsampledItems());
        Assert.assertTrue(downsampler.hasPendingItems());

        downsampler.signalEndOfInput();
        Assert.assertTrue(downsampler.hasDownsampledItems());
        Assert.assertFalse(downsampler.hasPendingItems());

        List<Integer> downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems());
        System.out.println("testThreeStacksWithShortStackInMiddle: Downsampled Stack sizes: " + downsampledStackSizes);

        Assert.assertEquals(downsampledStackSizes.size(), 3);
        Assert.assertTrue(downsampledStackSizes.get(0) <= 1000);
        Assert.assertTrue(downsampledStackSizes.get(1) <= 1000);
        Assert.assertTrue(downsampledStackSizes.get(2) <= 1000);
        // The first stack overlaps both the short middle stack and the last stack
        Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(1) <= 1000);
        Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(2) <= 1000);
    }

    /**
     * Short stack at the end, overlapping the second stack only:
     * ------
     * ------
     *   -------
     *   -------
     *       ---
     *       ---
     */
    @Test
    public void testThreeStacksWithShortStackAtEnd() {
        SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
        PositionalDownsampler<SAMRecord> downsampler = new PositionalDownsampler<SAMRecord>(1000);

        downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 100));
        Assert.assertFalse(downsampler.hasDownsampledItems());
        Assert.assertTrue(downsampler.hasPendingItems());

        downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 50, 100));
        Assert.assertFalse(downsampler.hasDownsampledItems());
        Assert.assertTrue(downsampler.hasPendingItems());

        downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 135, 25));
        Assert.assertFalse(downsampler.hasDownsampledItems());
        Assert.assertTrue(downsampler.hasPendingItems());

        downsampler.signalEndOfInput();
        Assert.assertTrue(downsampler.hasDownsampledItems());
        Assert.assertFalse(downsampler.hasPendingItems());

        List<Integer> downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems());
        System.out.println("testThreeStacksWithShortStackAtEnd: Downsampled Stack sizes: " + downsampledStackSizes);

        Assert.assertEquals(downsampledStackSizes.size(), 3);
        Assert.assertTrue(downsampledStackSizes.get(0) <= 1000);
        Assert.assertTrue(downsampledStackSizes.get(1) <= 1000);
        Assert.assertTrue(downsampledStackSizes.get(2) <= 1000);
        // Adjacent overlapping pairs must jointly respect the target
        Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(1) <= 1000);
        Assert.assertTrue(downsampledStackSizes.get(1) + downsampledStackSizes.get(2) <= 1000);
    }

    /**
     * Mixed-length stacks that partially overlap:
     * -------
     * ----
     *   -------
     *   ----
     *     -------
     *     -------
     */
    @Test
    public void testThreePartiallyOverlappingStacks() {
        SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
        PositionalDownsampler<SAMRecord> downsampler = new PositionalDownsampler<SAMRecord>(1000);

        // First two stacks mix read lengths 100 and 50 (1000 of each)
        downsampler.submit(createStackOfVaryingReads(2000, header, "foo", 0, 1, 100, 50));
        Assert.assertFalse(downsampler.hasDownsampledItems());
        Assert.assertTrue(downsampler.hasPendingItems());

        downsampler.submit(createStackOfVaryingReads(2000, header, "foo", 0, 75, 100, 50));
        Assert.assertFalse(downsampler.hasDownsampledItems());
        Assert.assertTrue(downsampler.hasPendingItems());

        downsampler.submit(createStackOfIdenticalReads(2000, header, "foo", 0, 150, 100));
        Assert.assertFalse(downsampler.hasDownsampledItems());
        Assert.assertTrue(downsampler.hasPendingItems());

        downsampler.signalEndOfInput();
        Assert.assertTrue(downsampler.hasDownsampledItems());
        Assert.assertFalse(downsampler.hasPendingItems());

        List<Integer> downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems());
        System.out.println("testThreePartiallyOverlappingStacks: Downsampled Stack sizes: " + downsampledStackSizes);

        Assert.assertEquals(downsampledStackSizes.size(), 3);
        Assert.assertTrue(downsampledStackSizes.get(0) <= 1000);
        Assert.assertTrue(downsampledStackSizes.get(1) <= 1000);
        Assert.assertTrue(downsampledStackSizes.get(2) <= 1000);

        // TODO: need to examine per-base coverage here
    }

    /**
     * All stacks are already below the coverage target, so no reads should be removed.
     */
    @Test
    public void testNoDownsamplingRequired() {
        SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
        PositionalDownsampler<SAMRecord> downsampler = new PositionalDownsampler<SAMRecord>(1000);

        downsampler.submit(createStackOfIdenticalReads(300, header, "foo", 0, 1, 100));
        Assert.assertFalse(downsampler.hasDownsampledItems());
        Assert.assertTrue(downsampler.hasPendingItems());

        downsampler.submit(createStackOfIdenticalReads(300, header, "foo", 0, 25, 100));
        Assert.assertFalse(downsampler.hasDownsampledItems());
        Assert.assertTrue(downsampler.hasPendingItems());

        downsampler.submit(createStackOfIdenticalReads(300, header, "foo", 0, 50, 100));
        Assert.assertFalse(downsampler.hasDownsampledItems());
        Assert.assertTrue(downsampler.hasPendingItems());

        downsampler.signalEndOfInput();
        Assert.assertTrue(downsampler.hasDownsampledItems());
        Assert.assertFalse(downsampler.hasPendingItems());

        List<Integer> downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems());
        System.out.println("testNoDownsamplingRequired: Downsampled Stack sizes: " + downsampledStackSizes);

        Assert.assertEquals(downsampledStackSizes.size(), 3);
        // Every read survives: each stack of 300 is below the target of 1000
        Assert.assertTrue(downsampledStackSizes.get(0) == 300);
        Assert.assertTrue(downsampledStackSizes.get(1) == 300);
        Assert.assertTrue(downsampledStackSizes.get(2) == 300);
    }

    /**
     * The downsampler must also work with GATKSAMRecord (not just SAMRecord).
     */
    @Test
    public void testGATKSAMRecordSupport() {
        SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
        PositionalDownsampler<GATKSAMRecord> downsampler = new PositionalDownsampler<GATKSAMRecord>(1000);

        // 10 reads at the same start with varying lengths -- well under the target of 1000
        List<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>();
        for ( int i = 0; i < 10; i++ ) {
            reads.add(ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 10, 20 * i + 10));
        }

        downsampler.submit(reads);
        downsampler.signalEndOfInput();
        List<GATKSAMRecord> downsampledReads = downsampler.consumeDownsampledItems();

        Assert.assertTrue(downsampledReads.size() == 10);
    }

    /**
     * Creates stackSize reads that all share the same contig, alignment start, and length.
     */
    private ArrayList<SAMRecord> createStackOfIdenticalReads( int stackSize, SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) {
        ArrayList<SAMRecord> stack = new ArrayList<SAMRecord>(stackSize);
        for ( int i = 1; i <= stackSize; i++ ) {
            stack.add(ArtificialSAMUtils.createArtificialRead(header, name, refIndex, alignmentStart, length));
        }
        return stack;
    }

    /**
     * Creates a stack of stackSize reads at a single start position: the first half with
     * firstLength, the second half with secondLength.
     */
    private ArrayList<SAMRecord> createStackOfVaryingReads( int stackSize, SAMFileHeader header, String name, int refIndex, int alignmentStart, int firstLength, int secondLength ) {
        ArrayList<SAMRecord> stack = createStackOfIdenticalReads(stackSize / 2, header, name, refIndex, alignmentStart, firstLength);
        stack.addAll(createStackOfIdenticalReads(stackSize / 2, header, name, refIndex, alignmentStart, secondLength));
        return stack;
    }

    /**
     * Walks the downsampled reads, asserting that they remain coordinate-sorted within each
     * contig, and returns the size of each stack (run of reads with the same start position).
     * A change of contig or an increase in alignment start begins a new stack.
     */
    private List<Integer> getDownsampledStackSizesAndVerifySortedness( List<SAMRecord> downsampledReads ) {
        List<Integer> stackSizes = new ArrayList<Integer>();
        Iterator<SAMRecord> iter = downsampledReads.iterator();
        Assert.assertTrue(iter.hasNext());

        SAMRecord previousRead = iter.next();
        int currentStackSize = 1;

        while ( iter.hasNext() ) {
            SAMRecord currentRead = iter.next();
            // New contig or advancing start position => close the current stack
            if ( ! currentRead.getReferenceIndex().equals(previousRead.getReferenceIndex()) || currentRead.getAlignmentStart() > previousRead.getAlignmentStart() ) {
                stackSizes.add(currentStackSize);
                currentStackSize = 1;
            }
            // Same contig with a decreasing start position is a sort-order violation
            else if ( currentRead.getAlignmentStart() < previousRead.getAlignmentStart() ) {
                Assert.fail(String.format("Reads are out of order: %s %s", previousRead, currentRead));
            }
            else {
                currentStackSize++;
            }
            previousRead = currentRead;
        }

        stackSizes.add(currentStackSize);
        return stackSizes;
    }
}

View File

@ -0,0 +1,126 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStream;
import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStreamAnalyzer;
/**
 * Class for analyzing an artificial read stream that has been positionally downsampled, and verifying
 * that the downsampling was done correctly without changing the stream in unexpected ways.
 *
 * In addition to the base class's structural checks, this verifies that no stack exceeds the
 * coverage target, and that stacks shrank only as far as downsampling can legitimately shrink them.
 *
 * @author David Roazen
 */
public class PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer extends ArtificialSingleSampleReadStreamAnalyzer {
    // Coverage target the positional downsampler was configured with; fixed at construction
    private final int targetCoverage;

    /**
     * @param originalStream the (pre-downsampling) stream whose configured properties we validate against
     * @param targetCoverage the per-position coverage target used when downsampling the stream
     */
    public PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer( ArtificialSingleSampleReadStream originalStream, int targetCoverage ) {
        super(originalStream);
        this.targetCoverage = targetCoverage;
    }

    /**
     * Overridden validate() method that checks for the effects of positional downsampling in addition to checking
     * for whether the original properties of the stream not affected by downsampling have been preserved
     *
     * @throws ReviewedStingException if any validation check fails
     */
    @Override
    public void validate() {
        // A stream configured to be empty must have produced no reads, and vice versa
        if ( (originalStream.getNumContigs() == 0 || originalStream.getNumStacksPerContig() == 0) && originalStream.getNumUnmappedReads() == 0 ) {
            if ( totalReads != 0 ) {
                throw new ReviewedStingException("got reads from the stream, but the stream was configured to have 0 reads");
            }
            return; // no further validation needed for the 0-reads case
        }
        else if ( totalReads == 0 ) {
            throw new ReviewedStingException("got no reads from the stream, but the stream was configured to have > 0 reads");
        }

        if ( ! allSamplesMatch ) {
            throw new ReviewedStingException("some reads had the wrong sample");
        }

        // Downsampling must not add or remove whole contigs or stacks
        if ( numContigs != originalStream.getNumContigs() ) {
            throw new ReviewedStingException("number of contigs not correct");
        }

        if ( stacksPerContig.size() != originalStream.getNumContigs() ) {
            throw new ReviewedStingException(String.format("bug in analyzer code: calculated sizes for %d contigs even though there were only %d contigs",
                                                           stacksPerContig.size(), originalStream.getNumContigs()));
        }

        for ( int contigStackCount : stacksPerContig ) {
            if ( contigStackCount != originalStream.getNumStacksPerContig() ) {
                throw new ReviewedStingException("contig had incorrect number of stacks");
            }
        }

        if ( originalStream.getNumStacksPerContig() > 0 ) {
            // Check for the effects of positional downsampling: a stack can never exceed
            // targetCoverage, and can only fall below the original minimum stack size if
            // that minimum was itself above targetCoverage
            int stackMinimumAfterDownsampling = Math.min(targetCoverage, originalStream.getMinReadsPerStack());
            int stackMaximumAfterDownsampling = targetCoverage;

            if ( minReadsPerStack < stackMinimumAfterDownsampling ) {
                throw new ReviewedStingException("stack had fewer than the minimum number of reads after downsampling");
            }
            if ( maxReadsPerStack > stackMaximumAfterDownsampling ) {
                throw new ReviewedStingException("stack had more than the maximum number of reads after downsampling");
            }
        }
        else if ( minReadsPerStack != null || maxReadsPerStack != null ) {
            throw new ReviewedStingException("bug in analyzer code: reads per stack was calculated even though 0 stacks per contig was specified");
        }

        // Stack spacing is untouched by downsampling and must stay within the configured bounds
        if ( originalStream.getNumStacksPerContig() > 1 ) {
            if ( minDistanceBetweenStacks < originalStream.getMinDistanceBetweenStacks() ) {
                throw new ReviewedStingException("stacks were separated by less than the minimum distance");
            }
            if ( maxDistanceBetweenStacks > originalStream.getMaxDistanceBetweenStacks() ) {
                throw new ReviewedStingException("stacks were separated by more than the maximum distance");
            }
        }
        else if ( minDistanceBetweenStacks != null || maxDistanceBetweenStacks != null ) {
            throw new ReviewedStingException("bug in analyzer code: distance between stacks was calculated even though numStacksPerContig was <= 1");
        }

        // Read lengths are untouched by downsampling
        if ( minReadLength < originalStream.getMinReadLength() ) {
            throw new ReviewedStingException("read was shorter than the minimum allowed length");
        }
        if ( maxReadLength > originalStream.getMaxReadLength() ) {
            throw new ReviewedStingException("read was longer than the maximum allowed length");
        }

        // Unmapped reads are not positionally downsampled, so all of them must survive
        if ( numUnmappedReads != originalStream.getNumUnmappedReads() ) {
            throw new ReviewedStingException(String.format("wrong number of unmapped reads: requested %d but saw %d",
                                                           originalStream.getNumUnmappedReads(), numUnmappedReads));
        }

        if ( (originalStream.getNumContigs() == 0 || originalStream.getNumStacksPerContig() == 0) &&
             numUnmappedReads != totalReads ) {
            throw new ReviewedStingException("stream should have consisted only of unmapped reads, but saw some mapped reads");
        }
    }
}

View File

@ -0,0 +1,129 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import org.testng.Assert;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
/**
 * Unit tests for ReservoirDownsampler: a reservoir of size k fed n reads must keep exactly
 * min(k, n) of them, discard the rest, and report its finalized/pending state correctly
 * before and after end-of-input.
 */
public class ReservoirDownsamplerUnitTest extends BaseTest {

    private static class ReservoirDownsamplerTest extends TestDataProvider {
        // All test-case parameters are fixed at construction time
        final int reservoirSize;
        final int totalReads;
        final int expectedNumReadsAfterDownsampling;
        final int expectedNumDiscardedItems;

        public ReservoirDownsamplerTest( int reservoirSize, int totalReads ) {
            super(ReservoirDownsamplerTest.class);

            this.reservoirSize = reservoirSize;
            this.totalReads = totalReads;

            // A reservoir keeps at most reservoirSize items; everything beyond that is discarded
            expectedNumReadsAfterDownsampling = Math.min(reservoirSize, totalReads);
            expectedNumDiscardedItems = totalReads <= reservoirSize ? 0 : totalReads - reservoirSize;

            setName(String.format("%s: reservoirSize=%d totalReads=%d expectedNumReadsAfterDownsampling=%d expectedNumDiscardedItems=%d",
                    getClass().getSimpleName(), reservoirSize, totalReads, expectedNumReadsAfterDownsampling, expectedNumDiscardedItems));
        }

        /**
         * @return totalReads identical artificial reads on a single contig
         */
        public Collection<SAMRecord> createReads() {
            Collection<SAMRecord> reads = new ArrayList<SAMRecord>(totalReads);

            SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
            reads.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(totalReads, header, "foo", 0, 1, 100));

            return reads;
        }
    }

    @DataProvider(name = "ReservoirDownsamplerTestDataProvider")
    public Object[][] createReservoirDownsamplerTestData() {
        // Cross product of reservoir sizes and read counts (powers of 10), plus the 0-reads edge case
        for ( int reservoirSize = 1; reservoirSize <= 10000; reservoirSize *= 10 ) {
            new ReservoirDownsamplerTest(reservoirSize, 0);
            for ( int totalReads = 1; totalReads <= 10000; totalReads *= 10 ) {
                new ReservoirDownsamplerTest(reservoirSize, totalReads);
            }
        }

        return ReservoirDownsamplerTest.getTests(ReservoirDownsamplerTest.class);
    }

    @Test(dataProvider = "ReservoirDownsamplerTestDataProvider")
    public void testReservoirDownsampler( ReservoirDownsamplerTest test ) {
        logger.warn("Running test: " + test);

        GenomeAnalysisEngine.resetRandomGenerator();

        ReadsDownsampler<SAMRecord> downsampler = new ReservoirDownsampler<SAMRecord>(test.reservoirSize);

        downsampler.submit(test.createReads());

        // The reservoir finalizes everything it keeps immediately upon submission -- nothing is pending
        if ( test.totalReads > 0 ) {
            Assert.assertTrue(downsampler.hasFinalizedItems());
            Assert.assertNotNull(downsampler.peekFinalized());
            Assert.assertFalse(downsampler.hasPendingItems());
            Assert.assertNull(downsampler.peekPending());
        }
        else {
            Assert.assertFalse(downsampler.hasFinalizedItems());
            Assert.assertFalse(downsampler.hasPendingItems());
            Assert.assertNull(downsampler.peekFinalized());
            Assert.assertNull(downsampler.peekPending());
        }

        downsampler.signalEndOfInput();

        // End-of-input must not change the downsampler's externally-visible state
        if ( test.totalReads > 0 ) {
            Assert.assertTrue(downsampler.hasFinalizedItems());
            Assert.assertNotNull(downsampler.peekFinalized());
            Assert.assertFalse(downsampler.hasPendingItems());
            Assert.assertNull(downsampler.peekPending());
        }
        else {
            Assert.assertFalse(downsampler.hasFinalizedItems());
            Assert.assertFalse(downsampler.hasPendingItems());
            Assert.assertNull(downsampler.peekFinalized());
            Assert.assertNull(downsampler.peekPending());
        }

        List<SAMRecord> downsampledReads = downsampler.consumeFinalizedItems();

        // Consuming must empty the downsampler completely
        Assert.assertFalse(downsampler.hasFinalizedItems());
        Assert.assertFalse(downsampler.hasPendingItems());
        Assert.assertNull(downsampler.peekFinalized());
        Assert.assertNull(downsampler.peekPending());

        Assert.assertEquals(downsampledReads.size(), test.expectedNumReadsAfterDownsampling);
        Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), test.expectedNumDiscardedItems);
        Assert.assertEquals(test.totalReads - downsampledReads.size(), test.expectedNumDiscardedItems);

        // reset() must clear the discarded-items counter
        downsampler.reset();
        Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0);
    }
}

View File

@ -0,0 +1,330 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import org.testng.Assert;
import java.util.*;
/**
 * Unit tests for the SimplePositionalDownsampler, which caps the number of reads
 * retained at each alignment-start position to a fixed target coverage.
 *
 * Covers: expected per-stack sizes after downsampling, finalized/pending state
 * transitions around signalEndOfInput()/signalNoMoreReadsBefore(), unmapped-read
 * passthrough, discarded-item accounting, reset(), and GATKSAMRecord support.
 */
public class SimplePositionalDownsamplerUnitTest extends BaseTest {
/**
 * One test case: a sequence of read stacks (one stack per alignment start) plus a
 * target coverage, with the expected post-downsampling stack sizes precomputed.
 */
private static class SimplePositionalDownsamplerTest extends TestDataProvider {
// maximum number of reads the downsampler should keep at any single position
int targetCoverage;
// number of distinct alignment-start stacks in the input
int numStacks;
// input stack sizes, in positional order
List<Integer> stackSizes;
// expected stack sizes after downsampling: min(stackSize, targetCoverage) per stack
List<Integer> expectedStackSizes;
// when true, the second half of the stacks is placed on a second contig
boolean multipleContigs;
// sum of all input stack sizes, used to verify the discarded-item count
int totalInitialReads;
public SimplePositionalDownsamplerTest( int targetCoverage, List<Integer> stackSizes, boolean multipleContigs ) {
super(SimplePositionalDownsamplerTest.class);
this.targetCoverage = targetCoverage;
this.numStacks = stackSizes.size();
this.stackSizes = stackSizes;
this.multipleContigs = multipleContigs;
calculateExpectedDownsampledStackSizes();
totalInitialReads = 0;
for ( Integer stackSize : stackSizes ) {
totalInitialReads += stackSize;
}
setName(String.format("%s: targetCoverage=%d numStacks=%d stackSizes=%s expectedSizes=%s multipleContigs=%b",
getClass().getSimpleName(), targetCoverage, numStacks, stackSizes, expectedStackSizes, multipleContigs));
}
/**
 * Builds the artificial input reads: one stack of identical 100bp reads per
 * alignment start, with starts spaced 10bp apart (so consecutive stacks overlap).
 * When multipleContigs is set, stacks in the second half move to contig index 1.
 */
public Collection<SAMRecord> createReads() {
Collection<SAMRecord> reads = new ArrayList<SAMRecord>();
SAMFileHeader header = multipleContigs ?
ArtificialSAMUtils.createArtificialSamHeader(2, 1, 1000000) :
ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
int refIndex = 0;
int alignmentStart = 1;
int readLength = 100;
for ( int i = 0; i < numStacks; i++ ) {
// switch to the second contig once (at the halfway point) if requested
if ( multipleContigs && refIndex == 0 && i >= numStacks / 2 ) {
refIndex++;
}
reads.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(stackSizes.get(i), header, "foo",
refIndex, alignmentStart, readLength));
alignmentStart += 10;
}
return reads;
}
// Each stack is expected to be clipped to targetCoverage; smaller stacks pass through intact.
private void calculateExpectedDownsampledStackSizes() {
expectedStackSizes = new ArrayList<Integer>(numStacks);
for ( Integer stackSize : stackSizes ) {
int expectedSize = targetCoverage >= stackSize ? stackSize : targetCoverage;
expectedStackSizes.add(expectedSize);
}
}
}
/**
 * Generates test cases over a grid of target coverages (1..10000 by powers of 10),
 * 1 or 2 contigs, and 0..10 stacks with randomized sizes in [1, 2*targetCoverage].
 * Resets the engine's shared random generator first so the cases are reproducible.
 */
@DataProvider(name = "SimplePositionalDownsamplerTestDataProvider")
public Object[][] createSimplePositionalDownsamplerTestData() {
GenomeAnalysisEngine.resetRandomGenerator();
for ( int targetCoverage = 1; targetCoverage <= 10000; targetCoverage *= 10 ) {
for ( int contigs = 1; contigs <= 2; contigs++ ) {
for ( int numStacks = 0; numStacks <= 10; numStacks++ ) {
List<Integer> stackSizes = new ArrayList<Integer>(numStacks);
for ( int stack = 1; stack <= numStacks; stack++ ) {
stackSizes.add(GenomeAnalysisEngine.getRandomGenerator().nextInt(targetCoverage * 2) + 1);
}
// TestDataProvider subclasses register themselves on construction
new SimplePositionalDownsamplerTest(targetCoverage, stackSizes, contigs > 1);
}
}
}
return SimplePositionalDownsamplerTest.getTests(SimplePositionalDownsamplerTest.class);
}
/**
 * Core test: submits all reads, checks finalized/pending invariants before and after
 * signalEndOfInput(), then verifies the consumed stacks match the expected sizes and
 * that the discarded-item count matches the number of reads actually eliminated.
 * NOTE(review): method name has a typo ("Postional"); left as-is since renaming
 * would change the reported test name.
 */
@Test( dataProvider = "SimplePositionalDownsamplerTestDataProvider" )
public void testSimplePostionalDownsampler( SimplePositionalDownsamplerTest test ) {
logger.warn("Running test: " + test);
// reset the RNG so the downsampler's random selections are deterministic per test
GenomeAnalysisEngine.resetRandomGenerator();
ReadsDownsampler<SAMRecord> downsampler = new SimplePositionalDownsampler<SAMRecord>(test.targetCoverage);
downsampler.submit(test.createReads());
if ( test.numStacks > 1 ) {
// with multiple positions, all but the last stack should already be finalized
Assert.assertTrue(downsampler.hasFinalizedItems());
Assert.assertTrue(downsampler.peekFinalized() != null);
Assert.assertTrue(downsampler.hasPendingItems());
Assert.assertTrue(downsampler.peekPending() != null);
}
else if ( test.numStacks == 1 ) {
// a single stack stays pending until end-of-input is signalled
Assert.assertFalse(downsampler.hasFinalizedItems());
Assert.assertTrue(downsampler.peekFinalized() == null);
Assert.assertTrue(downsampler.hasPendingItems());
Assert.assertTrue(downsampler.peekPending() != null);
}
else {
// no input: nothing finalized, nothing pending
Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems());
Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null);
}
downsampler.signalEndOfInput();
if ( test.numStacks > 0 ) {
// end-of-input finalizes everything; nothing may remain pending
Assert.assertTrue(downsampler.hasFinalizedItems());
Assert.assertTrue(downsampler.peekFinalized() != null);
Assert.assertFalse(downsampler.hasPendingItems());
Assert.assertTrue(downsampler.peekPending() == null);
}
else {
Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems());
Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null);
}
List<SAMRecord> downsampledReads = downsampler.consumeFinalizedItems();
// consuming must empty the downsampler completely
Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems());
Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null);
if ( test.numStacks == 0 ) {
Assert.assertTrue(downsampledReads.isEmpty());
}
else {
List<Integer> downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampledReads);
Assert.assertEquals(downsampledStackSizes.size(), test.numStacks);
Assert.assertEquals(downsampledStackSizes, test.expectedStackSizes);
// reported discard count must agree with the observed reduction in reads
int numReadsActuallyEliminated = test.totalInitialReads - downsampledReads.size();
int numReadsReportedEliminated = downsampler.getNumberOfDiscardedItems();
Assert.assertEquals(numReadsActuallyEliminated, numReadsReportedEliminated);
}
// reset() must clear the discard counter
downsampler.reset();
Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0);
}
/**
 * Walks the downsampled reads, asserting they remain coordinate-sorted, and returns
 * the size of each run of reads sharing a (contig, alignment start) position.
 */
private List<Integer> getDownsampledStackSizesAndVerifySortedness( List<SAMRecord> downsampledReads ) {
List<Integer> stackSizes = new ArrayList<Integer>();
if ( downsampledReads.isEmpty() ) {
return stackSizes;
}
Iterator<SAMRecord> iter = downsampledReads.iterator();
Assert.assertTrue(iter.hasNext());
SAMRecord previousRead = iter.next();
int currentStackSize = 1;
while ( iter.hasNext() ) {
SAMRecord currentRead = iter.next();
if ( currentRead.getReferenceIndex() > previousRead.getReferenceIndex() || currentRead.getAlignmentStart() > previousRead.getAlignmentStart() ) {
// position advanced: close out the current stack
stackSizes.add(currentStackSize);
currentStackSize = 1;
}
else if ( currentRead.getReferenceIndex() < previousRead.getReferenceIndex() || currentRead.getAlignmentStart() < previousRead.getAlignmentStart() ) {
Assert.fail(String.format("Reads are out of order: %s %s", previousRead, currentRead));
}
else {
currentStackSize++;
}
previousRead = currentRead;
}
// account for the final stack
stackSizes.add(currentStackSize);
return stackSizes;
}
/**
 * signalNoMoreReadsBefore() with a read past the pending stack's position must
 * finalize that stack without discarding anything (stack size below target).
 */
@Test
public void testSimplePositionalDownsamplerSignalNoMoreReadsBefore() {
ReadsDownsampler<SAMRecord> downsampler = new SimplePositionalDownsampler<SAMRecord>(1000);
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
Collection<SAMRecord> readStack = new ArrayList<SAMRecord>();
readStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(50, header, "foo", 0, 1, 100));
downsampler.submit(readStack);
// nothing finalized yet: the stack could still grow
Assert.assertFalse(downsampler.hasFinalizedItems());
Assert.assertTrue(downsampler.peekFinalized() == null);
Assert.assertTrue(downsampler.hasPendingItems());
Assert.assertTrue(downsampler.peekPending() != null);
SAMRecord laterRead = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 2, 100);
downsampler.signalNoMoreReadsBefore(laterRead);
// the promise of no earlier reads finalizes the pending stack
Assert.assertTrue(downsampler.hasFinalizedItems());
Assert.assertTrue(downsampler.peekFinalized() != null);
Assert.assertFalse(downsampler.hasPendingItems());
Assert.assertTrue(downsampler.peekPending() == null);
List<SAMRecord> downsampledReads = downsampler.consumeFinalizedItems();
Assert.assertEquals(downsampledReads.size(), readStack.size());
}
/**
 * Unmapped reads must pass through untouched, even when their count exceeds the
 * target coverage.
 */
@Test
public void testBasicUnmappedReadsSupport() {
ReadsDownsampler<SAMRecord> downsampler = new SimplePositionalDownsampler<SAMRecord>(100);
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
Collection<SAMRecord> readStack = new ArrayList<SAMRecord>();
readStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(200, header, "foo", SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX,
SAMRecord.NO_ALIGNMENT_START, 100));
for ( SAMRecord read : readStack ) {
Assert.assertTrue(read.getReadUnmappedFlag());
}
downsampler.submit(readStack);
downsampler.signalEndOfInput();
List<SAMRecord> downsampledReads = downsampler.consumeFinalizedItems();
// Unmapped reads should not get downsampled at all by the SimplePositionalDownsampler
Assert.assertEquals(downsampledReads.size(), readStack.size());
for ( SAMRecord read: downsampledReads ) {
Assert.assertTrue(read.getReadUnmappedFlag());
}
}
/**
 * Mixed input: the mapped stack (200 reads) is downsampled to the target (100),
 * the unmapped stack (200 reads) passes through, and mapped reads come out first.
 */
@Test
public void testMixedMappedAndUnmappedReadsSupport() {
ReadsDownsampler<SAMRecord> downsampler = new SimplePositionalDownsampler<SAMRecord>(100);
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
Collection<SAMRecord> mappedReadStack = new ArrayList<SAMRecord>();
mappedReadStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(200, header, "foo", 0, 1, 100));
for ( SAMRecord read : mappedReadStack ) {
Assert.assertFalse(read.getReadUnmappedFlag());
}
Collection<SAMRecord> unmappedReadStack = new ArrayList<SAMRecord>();
unmappedReadStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(200, header, "foo", SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX,
SAMRecord.NO_ALIGNMENT_START, 100));
for ( SAMRecord read : unmappedReadStack ) {
Assert.assertTrue(read.getReadUnmappedFlag());
}
downsampler.submit(mappedReadStack);
downsampler.submit(unmappedReadStack);
downsampler.signalEndOfInput();
List<SAMRecord> downsampledReads = downsampler.consumeFinalizedItems();
// Unmapped reads should not get downsampled at all by the SimplePositionalDownsampler
Assert.assertEquals(downsampledReads.size(), 300);
Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 100);
// first 100 reads must be the surviving mapped reads, the rest unmapped
int count = 1;
for ( SAMRecord read: downsampledReads ) {
if ( count <= 100 ) {
Assert.assertFalse(read.getReadUnmappedFlag());
}
else {
Assert.assertTrue(read.getReadUnmappedFlag());
}
count++;
}
}
/**
 * Smoke test that the downsampler is usable with GATKSAMRecord as its generic type;
 * 10 reads at distinct positions against a target of 1000 should all survive.
 */
@Test
public void testGATKSAMRecordSupport() {
ReadsDownsampler<GATKSAMRecord> downsampler = new SimplePositionalDownsampler<GATKSAMRecord>(1000);
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
List<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>();
for ( int i = 0; i < 10; i++ ) {
reads.add(ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 10, 20 * i + 10));
}
downsampler.submit(reads);
downsampler.signalEndOfInput();
List<GATKSAMRecord> downsampledReads = downsampler.consumeFinalizedItems();
Assert.assertEquals(downsampledReads.size(), 10);
}
}

View File

@ -0,0 +1,546 @@
package org.broadinstitute.sting.gatk.iterators;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.CloseableIterator;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.ReadProperties;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
import org.broadinstitute.sting.gatk.filters.ReadFilter;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.*;
/**
* testing of the experimental version of LocusIteratorByState
*/
/**
 * testing of the experimental version of LocusIteratorByState
 *
 * Exercises pileup construction over artificial reads: =/X CIGAR operators,
 * insertions/deletions (including whole-indel reads such as "76I"), soft clips,
 * per-element adjacency flags, and the ReadStateManager/PerSampleReadStateManager
 * state-tracking machinery.
 */
public class LocusIteratorByStateExperimentalUnitTest extends BaseTest {
// single-contig header shared by all tests (1 contig of length 1000)
private static SAMFileHeader header;
// iterator under test, rebuilt per test via makeLTBS()
private LocusIteratorByStateExperimental li;
private GenomeLocParser genomeLocParser;
@BeforeClass
public void beforeClass() {
header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
genomeLocParser = new GenomeLocParser(header.getSequenceDictionary());
}
// Wraps the reads in a fake closeable iterator and builds the experimental LIBS,
// treating the input as a single sample without read groups.
private final LocusIteratorByStateExperimental makeLTBS(List<SAMRecord> reads, ReadProperties readAttributes) {
return new LocusIteratorByStateExperimental(new FakeCloseableIterator<SAMRecord>(reads.iterator()), readAttributes, genomeLocParser, LocusIteratorByStateExperimental.sampleListForSAMWithoutReadGroups());
}
private static ReadProperties createTestReadProperties() {
return createTestReadProperties(null);
}
/**
 * Builds a minimal ReadProperties with no readers, filters, or transformers and the
 * given downsampling method (null = none).
 * NOTE(review): references ReadTransformer, which is not among the visible imports
 * of this file chunk — presumably imported above; verify.
 */
private static ReadProperties createTestReadProperties( DownsamplingMethod downsamplingMethod ) {
return new ReadProperties(
Collections.<SAMReaderID>emptyList(),
new SAMFileHeader(),
false,
SAMFileReader.ValidationStringency.STRICT,
downsamplingMethod,
new ValidationExclusion(),
Collections.<ReadFilter>emptyList(),
Collections.<ReadTransformer>emptyList(),
false,
(byte) -1
);
}
// Minimal CloseableIterator adapter over a plain Iterator; close() is a no-op.
private static class FakeCloseableIterator<T> implements CloseableIterator<T> {
Iterator<T> iterator;
public FakeCloseableIterator(Iterator<T> it) {
iterator = it;
}
@Override
public void close() {
return;
}
@Override
public boolean hasNext() {
return iterator.hasNext();
}
@Override
public T next() {
return iterator.next();
}
@Override
public void remove() {
throw new UnsupportedOperationException("Don't remove!");
}
}
/**
 * Reads using sequence-match (=) and mismatch (X) CIGAR operators must contribute
 * to the pileup exactly like plain M reads: depth 4 at every covered locus.
 * NOTE(review): r2/r3 set 12 base qualities for 10-base reads — likely should be
 * 10 entries; confirm against SAMRecord validation behavior.
 */
@Test
public void testXandEQOperators() {
final byte[] bases1 = new byte[] {'A','A','A','A','A','A','A','A','A','A'};
final byte[] bases2 = new byte[] {'A','A','A','C','A','A','A','A','A','C'};
// create a test version of the Reads object
ReadProperties readAttributes = createTestReadProperties();
SAMRecord r1 = ArtificialSAMUtils.createArtificialRead(header,"r1",0,1,10);
r1.setReadBases(bases1);
r1.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20});
r1.setCigarString("10M");
SAMRecord r2 = ArtificialSAMUtils.createArtificialRead(header,"r2",0,1,10);
r2.setReadBases(bases2);
r2.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20,20,20});
r2.setCigarString("3=1X5=1X");
SAMRecord r3 = ArtificialSAMUtils.createArtificialRead(header,"r3",0,1,10);
r3.setReadBases(bases2);
r3.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20,20,20});
r3.setCigarString("3=1X5M1X");
SAMRecord r4 = ArtificialSAMUtils.createArtificialRead(header,"r4",0,1,10);
r4.setReadBases(bases2);
r4.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20});
r4.setCigarString("10M");
List<SAMRecord> reads = Arrays.asList(r1, r2, r3, r4);
// create the iterator by state with the fake reads and fake records
li = makeLTBS(reads,readAttributes);
while (li.hasNext()) {
AlignmentContext context = li.next();
ReadBackedPileup pileup = context.getBasePileup();
Assert.assertEquals(pileup.depthOfCoverage(), 4);
}
}
/**
 * A read with an internal insertion (4M2I6M) must surface exactly one
 * before-insertion pileup element reporting the inserted bases ("CT").
 */
@Test
public void testIndelsInRegularPileup() {
final byte[] bases = new byte[] {'A','A','A','A','A','A','A','A','A','A'};
final byte[] indelBases = new byte[] {'A','A','A','A','C','T','A','A','A','A','A','A'};
// create a test version of the Reads object
ReadProperties readAttributes = createTestReadProperties();
SAMRecord before = ArtificialSAMUtils.createArtificialRead(header,"before",0,1,10);
before.setReadBases(bases);
before.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20});
before.setCigarString("10M");
SAMRecord during = ArtificialSAMUtils.createArtificialRead(header,"during",0,2,10);
during.setReadBases(indelBases);
during.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20,20,20});
during.setCigarString("4M2I6M");
SAMRecord after = ArtificialSAMUtils.createArtificialRead(header,"after",0,3,10);
after.setReadBases(bases);
after.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20});
after.setCigarString("10M");
List<SAMRecord> reads = Arrays.asList(before, during, after);
// create the iterator by state with the fake reads and fake records
li = makeLTBS(reads,readAttributes);
boolean foundIndel = false;
while (li.hasNext()) {
AlignmentContext context = li.next();
ReadBackedPileup pileup = context.getBasePileup().getBaseFilteredPileup(10);
for (PileupElement p : pileup) {
if (p.isBeforeInsertion()) {
foundIndel = true;
Assert.assertEquals(p.getEventLength(), 2, "Wrong event length");
Assert.assertEquals(p.getEventBases(), "CT", "Inserted bases are incorrect");
break;
}
}
}
Assert.assertTrue(foundIndel,"Indel in pileup not found");
}
@Test
public void testWholeIndelReadInIsolation() {
final int firstLocus = 44367789;
// create a test version of the Reads object
ReadProperties readAttributes = createTestReadProperties();
SAMRecord indelOnlyRead = ArtificialSAMUtils.createArtificialRead(header, "indelOnly", 0, firstLocus, 76);
indelOnlyRead.setReadBases(Utils.dupBytes((byte)'A',76));
indelOnlyRead.setBaseQualities(Utils.dupBytes((byte) '@', 76));
indelOnlyRead.setCigarString("76I");
List<SAMRecord> reads = Arrays.asList(indelOnlyRead);
// create the iterator by state with the fake reads and fake records
li = makeLTBS(reads, readAttributes);
// Traditionally, reads that end with indels bleed into the pileup at the following locus. Verify that the next pileup contains this read
// and considers it to be an indel-containing read.
Assert.assertTrue(li.hasNext(),"Should have found a whole-indel read in the normal base pileup without extended events enabled");
AlignmentContext alignmentContext = li.next();
Assert.assertEquals(alignmentContext.getLocation().getStart(), firstLocus, "Base pileup is at incorrect location.");
ReadBackedPileup basePileup = alignmentContext.getBasePileup();
Assert.assertEquals(basePileup.getReads().size(),1,"Pileup is of incorrect size");
Assert.assertSame(basePileup.getReads().get(0), indelOnlyRead, "Read in pileup is incorrect");
}
/**
 * Test to make sure that reads supporting only an indel (example cigar string: 76I) do
 * not negatively influence the ordering of the pileup.
 */
@Test
public void testWholeIndelRead() {
final int firstLocus = 44367788, secondLocus = firstLocus + 1;
SAMRecord leadingRead = ArtificialSAMUtils.createArtificialRead(header,"leading",0,firstLocus,76);
leadingRead.setReadBases(Utils.dupBytes((byte)'A',76));
leadingRead.setBaseQualities(Utils.dupBytes((byte)'@',76));
leadingRead.setCigarString("1M75I");
SAMRecord indelOnlyRead = ArtificialSAMUtils.createArtificialRead(header,"indelOnly",0,secondLocus,76);
indelOnlyRead.setReadBases(Utils.dupBytes((byte) 'A', 76));
indelOnlyRead.setBaseQualities(Utils.dupBytes((byte)'@',76));
indelOnlyRead.setCigarString("76I");
SAMRecord fullMatchAfterIndel = ArtificialSAMUtils.createArtificialRead(header,"fullMatch",0,secondLocus,76);
fullMatchAfterIndel.setReadBases(Utils.dupBytes((byte)'A',76));
fullMatchAfterIndel.setBaseQualities(Utils.dupBytes((byte)'@',76));
fullMatchAfterIndel.setCigarString("75I1M");
List<SAMRecord> reads = Arrays.asList(leadingRead, indelOnlyRead, fullMatchAfterIndel);
// create the iterator by state with the fake reads and fake records
li = makeLTBS(reads, createTestReadProperties());
int currentLocus = firstLocus;
int numAlignmentContextsFound = 0;
while(li.hasNext()) {
AlignmentContext alignmentContext = li.next();
Assert.assertEquals(alignmentContext.getLocation().getStart(),currentLocus,"Current locus returned by alignment context is incorrect");
if(currentLocus == firstLocus) {
// only the leading read covers the first locus with an aligned base
List<GATKSAMRecord> readsAtLocus = alignmentContext.getBasePileup().getReads();
Assert.assertEquals(readsAtLocus.size(),1,"Wrong number of reads at locus " + currentLocus);
Assert.assertSame(readsAtLocus.get(0),leadingRead,"leadingRead absent from pileup at locus " + currentLocus);
}
else if(currentLocus == secondLocus) {
// both second-locus reads appear, in submission order
List<GATKSAMRecord> readsAtLocus = alignmentContext.getBasePileup().getReads();
Assert.assertEquals(readsAtLocus.size(),2,"Wrong number of reads at locus " + currentLocus);
Assert.assertSame(readsAtLocus.get(0),indelOnlyRead,"indelOnlyRead absent from pileup at locus " + currentLocus);
Assert.assertSame(readsAtLocus.get(1),fullMatchAfterIndel,"fullMatchAfterIndel absent from pileup at locus " + currentLocus);
}
currentLocus++;
numAlignmentContextsFound++;
}
Assert.assertEquals(numAlignmentContextsFound, 2, "Found incorrect number of alignment contexts");
}
/**
 * Test to make sure that reads supporting only an indel (example cigar string: 76I) are represented properly
 */
@Test
public void testWholeIndelReadRepresentedTest() {
final int firstLocus = 44367788, secondLocus = firstLocus + 1;
SAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header,"read1",0,secondLocus,1);
read1.setReadBases(Utils.dupBytes((byte) 'A', 1));
read1.setBaseQualities(Utils.dupBytes((byte) '@', 1));
read1.setCigarString("1I");
List<SAMRecord> reads = Arrays.asList(read1);
// create the iterator by state with the fake reads and fake records
li = makeLTBS(reads, createTestReadProperties());
while(li.hasNext()) {
AlignmentContext alignmentContext = li.next();
ReadBackedPileup p = alignmentContext.getBasePileup();
Assert.assertTrue(p.getNumberOfElements() == 1);
PileupElement pe = p.iterator().next();
Assert.assertTrue(pe.isBeforeInsertion());
Assert.assertFalse(pe.isAfterInsertion());
Assert.assertEquals(pe.getEventBases(), "A");
}
SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header,"read2",0,secondLocus,10);
read2.setReadBases(Utils.dupBytes((byte) 'A', 10));
read2.setBaseQualities(Utils.dupBytes((byte) '@', 10));
read2.setCigarString("10I");
reads = Arrays.asList(read2);
// create the iterator by state with the fake reads and fake records
li = makeLTBS(reads, createTestReadProperties());
while(li.hasNext()) {
AlignmentContext alignmentContext = li.next();
ReadBackedPileup p = alignmentContext.getBasePileup();
Assert.assertTrue(p.getNumberOfElements() == 1);
PileupElement pe = p.iterator().next();
Assert.assertTrue(pe.isBeforeInsertion());
Assert.assertFalse(pe.isAfterInsertion());
Assert.assertEquals(pe.getEventBases(), "AAAAAAAAAA");
}
}
////////////////////////////////////////////
// comprehensive LIBS/PileupElement tests //
////////////////////////////////////////////
// Bit flags describing the expected adjacency properties of each pileup element;
// combined with | in the expected-flags lists below.
private static final int IS_BEFORE_DELETED_BASE_FLAG = 1;
private static final int IS_BEFORE_DELETION_START_FLAG = 2;
private static final int IS_AFTER_DELETED_BASE_FLAG = 4;
private static final int IS_AFTER_DELETION_END_FLAG = 8;
private static final int IS_BEFORE_INSERTION_FLAG = 16;
private static final int IS_AFTER_INSERTION_FLAG = 32;
private static final int IS_NEXT_TO_SOFTCLIP_FLAG = 64;
/**
 * One LIBS case: a CIGAR plus the expected per-locus read offsets and flag
 * combinations, in pileup order.
 */
private static class LIBSTest {
final String cigar;
final int readLength;
final List<Integer> offsets;
final List<Integer> flags;
private LIBSTest(final String cigar, final int readLength, final List<Integer> offsets, final List<Integer> flags) {
this.cigar = cigar;
this.readLength = readLength;
this.offsets = offsets;
this.flags = flags;
}
}
@DataProvider(name = "LIBSTest")
public Object[][] createLIBSTestData() {
return new Object[][]{
{new LIBSTest("1I", 1, Arrays.asList(0), Arrays.asList(IS_BEFORE_INSERTION_FLAG))},
{new LIBSTest("10I", 10, Arrays.asList(0), Arrays.asList(IS_BEFORE_INSERTION_FLAG))},
{new LIBSTest("2M2I2M", 6, Arrays.asList(0,1,4,5), Arrays.asList(0,IS_BEFORE_INSERTION_FLAG,IS_AFTER_INSERTION_FLAG,0))},
{new LIBSTest("2M2I", 4, Arrays.asList(0,1), Arrays.asList(0,IS_BEFORE_INSERTION_FLAG))},
//TODO -- uncomment these when LIBS is fixed
//{new LIBSTest("2I2M", 4, Arrays.asList(2,3), Arrays.asList(IS_AFTER_INSERTION_FLAG,0))},
//{new LIBSTest("1I1M1D1M", 3, Arrays.asList(0,1), Arrays.asList(IS_AFTER_INSERTION_FLAG | IS_BEFORE_DELETION_START_FLAG | IS_BEFORE_DELETED_BASE_FLAG,IS_AFTER_DELETED_BASE_FLAG | IS_AFTER_DELETION_END_FLAG))},
//{new LIBSTest("1S1I1M", 3, Arrays.asList(2), Arrays.asList(IS_AFTER_INSERTION_FLAG))},
{new LIBSTest("1M2D2M", 3, Arrays.asList(0,1,2), Arrays.asList(IS_BEFORE_DELETION_START_FLAG | IS_BEFORE_DELETED_BASE_FLAG,IS_AFTER_DELETED_BASE_FLAG | IS_AFTER_DELETION_END_FLAG,0))},
{new LIBSTest("1S1M", 2, Arrays.asList(1), Arrays.asList(IS_NEXT_TO_SOFTCLIP_FLAG))},
{new LIBSTest("1M1S", 2, Arrays.asList(0), Arrays.asList(IS_NEXT_TO_SOFTCLIP_FLAG))},
{new LIBSTest("1S1M1I", 3, Arrays.asList(1), Arrays.asList(IS_BEFORE_INSERTION_FLAG | IS_NEXT_TO_SOFTCLIP_FLAG))}
};
}
/**
 * Drives LIBS over a single synthetic read and checks, locus by locus, that each
 * pileup element carries exactly the expected adjacency flags and read offset.
 */
@Test(dataProvider = "LIBSTest")
public void testLIBS(LIBSTest params) {
final int locus = 44367788;
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, locus, params.readLength);
read.setReadBases(Utils.dupBytes((byte) 'A', params.readLength));
read.setBaseQualities(Utils.dupBytes((byte) '@', params.readLength));
read.setCigarString(params.cigar);
// create the iterator by state with the fake reads and fake records
li = makeLTBS(Arrays.asList(read), createTestReadProperties());
int offset = 0;
while ( li.hasNext() ) {
AlignmentContext alignmentContext = li.next();
ReadBackedPileup p = alignmentContext.getBasePileup();
Assert.assertTrue(p.getNumberOfElements() == 1);
PileupElement pe = p.iterator().next();
final int flag = params.flags.get(offset);
// each predicate must agree with the corresponding expected flag bit
Assert.assertEquals(pe.isBeforeDeletedBase(), (flag & IS_BEFORE_DELETED_BASE_FLAG) != 0);
Assert.assertEquals(pe.isBeforeDeletionStart(), (flag & IS_BEFORE_DELETION_START_FLAG) != 0);
Assert.assertEquals(pe.isAfterDeletedBase(), (flag & IS_AFTER_DELETED_BASE_FLAG) != 0);
Assert.assertEquals(pe.isAfterDeletionEnd(), (flag & IS_AFTER_DELETION_END_FLAG) != 0);
Assert.assertEquals(pe.isBeforeInsertion(), (flag & IS_BEFORE_INSERTION_FLAG) != 0);
Assert.assertEquals(pe.isAfterInsertion(), (flag & IS_AFTER_INSERTION_FLAG) != 0);
Assert.assertEquals(pe.isNextToSoftClip(), (flag & IS_NEXT_TO_SOFTCLIP_FLAG) != 0);
Assert.assertEquals(pe.getOffset(), params.offsets.get(offset).intValue());
offset++;
}
}
////////////////////////////////////////////////
// End comprehensive LIBS/PileupElement tests //
////////////////////////////////////////////////
///////////////////////////////////////
// Read State Manager Tests //
///////////////////////////////////////
/**
 * One PerSampleReadStateManager case: given stack sizes per alignment start and a
 * removal interval, verifies that record states iterate back in insertion order and
 * that iterator.remove() deletes exactly the expected states.
 */
private class PerSampleReadStateManagerTest extends TestDataProvider {
private List<Integer> readCountsPerAlignmentStart;
// all reads, flattened across stacks in insertion order
private List<SAMRecord> reads;
private List<ArrayList<LocusIteratorByStateExperimental.SAMRecordState>> recordStatesByAlignmentStart;
// remove every Nth state during the first iteration pass; 0 = remove nothing
private int removalInterval;
public PerSampleReadStateManagerTest( List<Integer> readCountsPerAlignmentStart, int removalInterval ) {
super(PerSampleReadStateManagerTest.class);
this.readCountsPerAlignmentStart = readCountsPerAlignmentStart;
this.removalInterval = removalInterval;
reads = new ArrayList<SAMRecord>();
recordStatesByAlignmentStart = new ArrayList<ArrayList<LocusIteratorByStateExperimental.SAMRecordState>>();
setName(String.format("%s: readCountsPerAlignmentStart: %s removalInterval: %d",
getClass().getSimpleName(), readCountsPerAlignmentStart, removalInterval));
}
public void run() {
// inner-class chain: the managers under test are non-static inner classes,
// hence the libs.new / readStateManager.new instantiation syntax
LocusIteratorByStateExperimental libs = makeLTBS(new ArrayList<SAMRecord>(), createTestReadProperties());
LocusIteratorByStateExperimental.ReadStateManager readStateManager =
libs.new ReadStateManager(new ArrayList<SAMRecord>().iterator());
LocusIteratorByStateExperimental.ReadStateManager.PerSampleReadStateManager perSampleReadStateManager =
readStateManager.new PerSampleReadStateManager();
makeReads();
for ( ArrayList<LocusIteratorByStateExperimental.SAMRecordState> stackRecordStates : recordStatesByAlignmentStart ) {
perSampleReadStateManager.addStatesAtNextAlignmentStart(stackRecordStates);
}
// read state manager should have the right number of reads
Assert.assertEquals(reads.size(), perSampleReadStateManager.size());
Iterator<SAMRecord> originalReadsIterator = reads.iterator();
Iterator<LocusIteratorByStateExperimental.SAMRecordState> recordStateIterator = perSampleReadStateManager.iterator();
int recordStateCount = 0;
int numReadStatesRemoved = 0;
// Do a first-pass validation of the record state iteration by making sure we get back everything we
// put in, in the same order, doing any requested removals of read states along the way
while ( recordStateIterator.hasNext() ) {
LocusIteratorByStateExperimental.SAMRecordState readState = recordStateIterator.next();
recordStateCount++;
SAMRecord readFromPerSampleReadStateManager = readState.getRead();
Assert.assertTrue(originalReadsIterator.hasNext());
SAMRecord originalRead = originalReadsIterator.next();
// The read we get back should be literally the same read in memory as we put in
Assert.assertTrue(originalRead == readFromPerSampleReadStateManager);
// If requested, remove a read state every removalInterval states
if ( removalInterval > 0 && recordStateCount % removalInterval == 0 ) {
recordStateIterator.remove();
numReadStatesRemoved++;
}
}
Assert.assertFalse(originalReadsIterator.hasNext());
// If we removed any read states, do a second pass through the read states to make sure the right
// states were removed
if ( numReadStatesRemoved > 0 ) {
Assert.assertEquals(perSampleReadStateManager.size(), reads.size() - numReadStatesRemoved);
originalReadsIterator = reads.iterator();
recordStateIterator = perSampleReadStateManager.iterator();
int readCount = 0;
int readStateCount = 0;
// Match record states with the reads that should remain after removal
while ( recordStateIterator.hasNext() ) {
LocusIteratorByStateExperimental.SAMRecordState readState = recordStateIterator.next();
readStateCount++;
SAMRecord readFromPerSampleReadStateManager = readState.getRead();
Assert.assertTrue(originalReadsIterator.hasNext());
SAMRecord originalRead = originalReadsIterator.next();
readCount++;
if ( readCount % removalInterval == 0 ) {
originalRead = originalReadsIterator.next(); // advance to next read, since the previous one should have been discarded
readCount++;
}
// The read we get back should be literally the same read in memory as we put in (after accounting for removals)
Assert.assertTrue(originalRead == readFromPerSampleReadStateManager);
}
Assert.assertEquals(readStateCount, reads.size() - numReadStatesRemoved);
}
// Allow memory used by this test to be reclaimed
readCountsPerAlignmentStart = null;
reads = null;
recordStatesByAlignmentStart = null;
}
// Builds one stack of identical reads (random length 50-100) per alignment start
// and wraps each read in a SAMRecordState, preserving stack grouping.
private void makeReads() {
int alignmentStart = 1;
for ( int readsThisStack : readCountsPerAlignmentStart ) {
ArrayList<SAMRecord> stackReads = new ArrayList<SAMRecord>(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(readsThisStack, header, "foo", 0, alignmentStart, MathUtils.randomIntegerInRange(50, 100)));
ArrayList<LocusIteratorByStateExperimental.SAMRecordState> stackRecordStates = new ArrayList<LocusIteratorByStateExperimental.SAMRecordState>();
for ( SAMRecord read : stackReads ) {
stackRecordStates.add(new LocusIteratorByStateExperimental.SAMRecordState(read));
}
reads.addAll(stackReads);
recordStatesByAlignmentStart.add(stackRecordStates);
}
}
}
/**
 * Cross-product of representative stack-size configurations with removal intervals
 * 0 (no removal), 2, and 3.
 */
@DataProvider(name = "PerSampleReadStateManagerTestDataProvider")
public Object[][] createPerSampleReadStateManagerTests() {
for ( List<Integer> thisTestReadStateCounts : Arrays.asList( Arrays.asList(1),
Arrays.asList(2),
Arrays.asList(10),
Arrays.asList(1, 1),
Arrays.asList(2, 2),
Arrays.asList(10, 10),
Arrays.asList(1, 10),
Arrays.asList(10, 1),
Arrays.asList(1, 1, 1),
Arrays.asList(2, 2, 2),
Arrays.asList(10, 10, 10),
Arrays.asList(1, 1, 1, 1, 1, 1),
Arrays.asList(10, 10, 10, 10, 10, 10),
Arrays.asList(1, 2, 10, 1, 2, 10)
) ) {
for ( int removalInterval : Arrays.asList(0, 2, 3) ) {
new PerSampleReadStateManagerTest(thisTestReadStateCounts, removalInterval);
}
}
return PerSampleReadStateManagerTest.getTests(PerSampleReadStateManagerTest.class);
}
@Test(dataProvider = "PerSampleReadStateManagerTestDataProvider")
public void runPerSampleReadStateManagerTest( PerSampleReadStateManagerTest test ) {
logger.warn("Running test: " + test);
test.run();
}
}

View File

@ -28,14 +28,12 @@ import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
@ -48,7 +46,6 @@ import java.util.List;
*/
public class VerifyingSamIteratorUnitTest {
private SAMFileHeader samFileHeader;
private GenomeLocParser genomeLocParser;
@BeforeClass
public void init() {
@ -58,8 +55,6 @@ public class VerifyingSamIteratorUnitTest {
samFileHeader = new SAMFileHeader();
samFileHeader.setSequenceDictionary(sequenceDictionary);
genomeLocParser = new GenomeLocParser(sequenceDictionary);
}
@Test
@ -68,7 +63,7 @@ public class VerifyingSamIteratorUnitTest {
SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(samFileHeader,"read2",getContig(0).getSequenceIndex(),2,10);
List<SAMRecord> reads = Arrays.asList(read1,read2);
VerifyingSamIterator iterator = new VerifyingSamIterator(genomeLocParser,StingSAMIteratorAdapter.adapt(reads.iterator()));
VerifyingSamIterator iterator = new VerifyingSamIterator(StingSAMIteratorAdapter.adapt(reads.iterator()));
Assert.assertTrue(iterator.hasNext(),"Insufficient reads");
Assert.assertSame(iterator.next(),read1,"Incorrect read in read 1 position");
@ -83,7 +78,7 @@ public class VerifyingSamIteratorUnitTest {
SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(samFileHeader,"read2",getContig(1).getSequenceIndex(),1,10);
List<SAMRecord> reads = Arrays.asList(read1,read2);
VerifyingSamIterator iterator = new VerifyingSamIterator(genomeLocParser,StingSAMIteratorAdapter.adapt(reads.iterator()));
VerifyingSamIterator iterator = new VerifyingSamIterator(StingSAMIteratorAdapter.adapt(reads.iterator()));
Assert.assertTrue(iterator.hasNext(),"Insufficient reads");
Assert.assertSame(iterator.next(),read1,"Incorrect read in read 1 position");
@ -98,7 +93,7 @@ public class VerifyingSamIteratorUnitTest {
SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(samFileHeader,"read2",getContig(0).getSequenceIndex(),1,10);
List<SAMRecord> reads = Arrays.asList(read1,read2);
VerifyingSamIterator iterator = new VerifyingSamIterator(genomeLocParser,StingSAMIteratorAdapter.adapt(reads.iterator()));
VerifyingSamIterator iterator = new VerifyingSamIterator(StingSAMIteratorAdapter.adapt(reads.iterator()));
Assert.assertTrue(iterator.hasNext(),"Insufficient reads");
Assert.assertSame(iterator.next(),read1,"Incorrect read in read 1 position");
@ -116,7 +111,7 @@ public class VerifyingSamIteratorUnitTest {
read1.setReferenceIndex(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX);
List<SAMRecord> reads = Arrays.asList(read1,read2);
VerifyingSamIterator iterator = new VerifyingSamIterator(genomeLocParser,StingSAMIteratorAdapter.adapt(reads.iterator()));
VerifyingSamIterator iterator = new VerifyingSamIterator(StingSAMIteratorAdapter.adapt(reads.iterator()));
Assert.assertTrue(iterator.hasNext(),"Insufficient reads");
Assert.assertSame(iterator.next(),read1,"Incorrect read in read 1 position");

View File

@ -17,7 +17,7 @@ import java.util.*;
* @author mhanna
* @version 0.1
*/
public class ReservoirDownsamplerUnitTest {
public class LegacyReservoirDownsamplerUnitTest {
private static final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1,1,200);

View File

@ -0,0 +1,161 @@
package org.broadinstitute.sting.utils.sam;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMReadGroupRecord;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.testng.annotations.Test;
import org.testng.annotations.DataProvider;
import org.broadinstitute.sting.BaseTest;
/**
 * Unit tests for {@code ArtificialSingleSampleReadStream}.
 *
 * Generates a large matrix of stream configurations (contig count, stacks per contig,
 * reads per stack, inter-stack distance, read length, unmapped-read count), runs an
 * {@code ArtificialSingleSampleReadStreamAnalyzer} over each stream, and verifies that
 * the observed stream properties match the nominal configuration. Also verifies that
 * the stream constructor rejects invalid argument combinations.
 */
public class ArtificialSingleSampleReadStreamUnitTest extends BaseTest {

    /**
     * One positive test case: a configured stream plus the analyzer used to validate it.
     * Registered with TestDataProvider so it can be enumerated by the DataProvider below.
     */
    private static class ArtificialSingleSampleReadStreamTest extends TestDataProvider {
        private final ArtificialSingleSampleReadStream stream;
        private ArtificialSingleSampleReadStreamAnalyzer streamAnalyzer;

        public ArtificialSingleSampleReadStreamTest( ArtificialSingleSampleReadStream stream ) {
            super(ArtificialSingleSampleReadStreamTest.class);
            this.stream = stream;

            // Encode the full stream configuration in the test name so failures are self-describing
            setName(String.format("%s: numContigs=%d stacksPerContig=%d readsPerStack=%d-%d distanceBetweenStacks=%d-%d readLength=%d-%d unmappedReads=%d",
                    getClass().getSimpleName(),
                    stream.getNumContigs(),
                    stream.getNumStacksPerContig(),
                    stream.getMinReadsPerStack(),
                    stream.getMaxReadsPerStack(),
                    stream.getMinDistanceBetweenStacks(),
                    stream.getMaxDistanceBetweenStacks(),
                    stream.getMinReadLength(),
                    stream.getMaxReadLength(),
                    stream.getNumUnmappedReads()));
        }

        /**
         * Analyze the stream and validate that its observed properties match its
         * nominal (configured) properties.
         */
        public void run() {
            streamAnalyzer = new ArtificialSingleSampleReadStreamAnalyzer(stream);

            streamAnalyzer.analyze(stream);

            // Check whether the observed properties of the stream match its nominal properties
            streamAnalyzer.validate();
        }
    }

    /**
     * Brute-force enumeration of stream configurations. Only internally-consistent
     * ("sane") combinations are registered as tests; the insane combinations are
     * covered separately by the invalid-arguments test below.
     *
     * @return the registered positive test cases
     */
    @DataProvider(name = "ArtificialSingleSampleReadStreamTestDataProvider")
    public Object[][] createArtificialSingleSampleReadStreamTests() {
        SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(3, 1, 10000);
        String readGroupID = "testReadGroup";
        SAMReadGroupRecord readGroup = new SAMReadGroupRecord(readGroupID);
        readGroup.setSample("testSample");
        header.addReadGroup(readGroup);

        // Reset the RNG so the generated streams are reproducible across runs
        GenomeAnalysisEngine.resetRandomGenerator();

        // brute force testing!
        for ( int numContigs = 0; numContigs <= 2; numContigs++ ) {
            for ( int stacksPerContig = 0; stacksPerContig <= 2; stacksPerContig++ ) {
                for ( int minReadsPerStack = 1; minReadsPerStack <= 2; minReadsPerStack++ ) {
                    for ( int maxReadsPerStack = 1; maxReadsPerStack <= 3; maxReadsPerStack++ ) {
                        for ( int minDistanceBetweenStacks = 1; minDistanceBetweenStacks <= 2; minDistanceBetweenStacks++ ) {
                            for ( int maxDistanceBetweenStacks = 1; maxDistanceBetweenStacks <= 3; maxDistanceBetweenStacks++ ) {
                                for ( int minReadLength = 1; minReadLength <= 2; minReadLength++ ) {
                                    for ( int maxReadLength = 1; maxReadLength <= 3; maxReadLength++ ) {
                                        for ( int numUnmappedReads = 0; numUnmappedReads <= 2; numUnmappedReads++ ) {
                                            // Only test sane combinations here: each min <= its max,
                                            // and contigs/stacks are either both zero or both non-zero
                                            if ( minReadsPerStack <= maxReadsPerStack &&
                                                 minDistanceBetweenStacks <= maxDistanceBetweenStacks &&
                                                 minReadLength <= maxReadLength &&
                                                 ((numContigs > 0 && stacksPerContig > 0) || (numContigs == 0 && stacksPerContig == 0)) ) {
                                                new ArtificialSingleSampleReadStreamTest(new ArtificialSingleSampleReadStream(header,
                                                                                                                              readGroupID,
                                                                                                                              numContigs,
                                                                                                                              stacksPerContig,
                                                                                                                              minReadsPerStack,
                                                                                                                              maxReadsPerStack,
                                                                                                                              minDistanceBetweenStacks,
                                                                                                                              maxDistanceBetweenStacks,
                                                                                                                              minReadLength,
                                                                                                                              maxReadLength,
                                                                                                                              numUnmappedReads));
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }

        return ArtificialSingleSampleReadStreamTest.getTests(ArtificialSingleSampleReadStreamTest.class);
    }

    /**
     * Run one positive test case: analyze the configured stream and validate it.
     *
     * @param test the stream configuration under test
     */
    @Test(dataProvider = "ArtificialSingleSampleReadStreamTestDataProvider")
    public void testArtificialSingleSampleReadStream( ArtificialSingleSampleReadStreamTest test ) {
        logger.warn("Running test: " + test);

        // Reset the RNG per-test so each case sees a deterministic random sequence
        GenomeAnalysisEngine.resetRandomGenerator();
        test.run();
    }

    /**
     * Invalid constructor-argument combinations. Each row is a named case whose
     * construction is expected to throw a ReviewedStingException.
     *
     * @return the invalid-argument test rows
     */
    @DataProvider(name = "ArtificialSingleSampleReadStreamInvalidArgumentsTestDataProvider")
    public Object[][] createInvalidArgumentsTests() {
        SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(3, 1, 10000);
        String readGroupID = "testReadGroup";
        header.addReadGroup(new SAMReadGroupRecord(readGroupID));

        return new Object[][] {
            {"testNullHeader", null, readGroupID, 1, 1, 1, 2, 1, 2, 1, 2, 0},
            {"testNullReadGroup", header, null, 1, 1, 1, 2, 1, 2, 1, 2, 0},
            {"testInvalidReadGroup", header, "foo", 1, 1, 1, 2, 1, 2, 1, 2, 0},
            {"testInvalidNumContigs", header, readGroupID, -1, 1, 1, 2, 1, 2, 1, 2, 0},
            {"testInvalidNumStacksPerContig", header, readGroupID, 1, -1, 1, 2, 1, 2, 1, 2, 0},
            {"test0ContigsNon0StacksPerContig", header, readGroupID, 0, 1, 1, 2, 1, 2, 1, 2, 0},
            {"testNon0Contigs0StacksPerContig", header, readGroupID, 1, 0, 1, 2, 1, 2, 1, 2, 0},
            {"testInvalidMinReadsPerStack", header, readGroupID, 1, 1, -1, 2, 1, 2, 1, 2, 0},
            {"testInvalidMaxReadsPerStack", header, readGroupID, 1, 1, 1, -2, 1, 2, 1, 2, 0},
            {"testInvalidMinDistanceBetweenStacks", header, readGroupID, 1, 1, 1, 2, -1, 2, 1, 2, 0},
            {"testInvalidMaxDistanceBetweenStacks", header, readGroupID, 1, 1, 1, 2, 1, -2, 1, 2, 0},
            {"testInvalidMinReadLength", header, readGroupID, 1, 1, 1, 2, 1, 2, -1, 2, 0},
            {"testInvalidMaxReadLength", header, readGroupID, 1, 1, 1, 2, 1, 2, 1, -2, 0},
            {"testInvalidReadsPerStackRange", header, readGroupID, 1, 1, 2, 1, 1, 2, 1, 2, 0},
            {"testInvalidDistanceBetweenStacksRange", header, readGroupID, 1, 1, 1, 2, 2, 1, 1, 2, 0},
            {"testInvalidReadLengthRange", header, readGroupID, 1, 1, 1, 2, 1, 2, 2, 1, 0},
            {"testInvalidNumUnmappedReads", header, readGroupID, 1, 1, 1, 2, 1, 2, 1, 2, -1},
        };
    }

    /**
     * Verify that constructing a stream with invalid arguments throws.
     * The constructor invocation itself is the assertion; its result is
     * intentionally discarded (previously it was stored in an unused local).
     *
     * @param testName descriptive name of the invalid case, logged for diagnosis
     */
    @Test(dataProvider = "ArtificialSingleSampleReadStreamInvalidArgumentsTestDataProvider",
          expectedExceptions = ReviewedStingException.class)
    public void testInvalidArguments( String testName,
                                      SAMFileHeader header,
                                      String readGroupID,
                                      int numContigs,
                                      int numStacksPerContig,
                                      int minReadsPerStack,
                                      int maxReadsPerStack,
                                      int minDistanceBetweenStacks,
                                      int maxDistanceBetweenStacks,
                                      int minReadLength,
                                      int maxReadLength,
                                      int numUnmappedReads ) {
        logger.warn("Running test: " + testName);

        // Expected to throw ReviewedStingException during construction
        new ArtificialSingleSampleReadStream(header,
                                             readGroupID,
                                             numContigs,
                                             numStacksPerContig,
                                             minReadsPerStack,
                                             maxReadsPerStack,
                                             minDistanceBetweenStacks,
                                             maxDistanceBetweenStacks,
                                             minReadLength,
                                             maxReadLength,
                                             numUnmappedReads);
    }
}