auto-merging to latest version

This commit is contained in:
Ryan Poplin 2012-09-07 11:36:47 -04:00
commit 81b27f9db2
108 changed files with 6457 additions and 1408 deletions

View File

@ -577,6 +577,7 @@
docletpathref="doclet.classpath"
classpathref="external.dependencies"
classpath="${java.classes}"
maxmemory="2g"
additionalparam="-build-timestamp &quot;${build.timestamp}&quot; -absolute-version ${build.version} -out ${basedir}/${resource.path} -quiet">
<sourcefiles>
<union>
@ -780,6 +781,7 @@
docletpathref="doclet.classpath"
classpathref="external.dependencies"
classpath="${java.classes}"
maxmemory="2g"
additionalparam="${gatkdocs.include.hidden.arg} -private -build-timestamp &quot;${build.timestamp}&quot; -absolute-version ${build.version} -quiet"> <!-- -test to only do DocumentationTest walker -->
<sourcefiles>
<fileset refid="java.source.files"/>

View File

@ -546,7 +546,7 @@ public class SlidingWindow {
FractionalDownsampler <GATKSAMRecord> downsampler = new FractionalDownsampler<GATKSAMRecord>(fraction);
downsampler.submit(allReads);
return downsampler.consumeDownsampledItems();
return downsampler.consumeFinalizedItems();
}

View File

@ -112,31 +112,31 @@ public class CommandLineGATK extends CommandLineExecutable {
}
}
protected static final String PICARD_TEXT_SAM_FILE_ERROR_1 = "Cannot use index file with textual SAM file";
protected static final String PICARD_TEXT_SAM_FILE_ERROR_2 = "Cannot retrieve file pointers within SAM text files";
public static final String PICARD_TEXT_SAM_FILE_ERROR_1 = "Cannot use index file with textual SAM file";
public static final String PICARD_TEXT_SAM_FILE_ERROR_2 = "Cannot retrieve file pointers within SAM text files";
private static void checkForMaskedUserErrors(final Throwable t) {
final String message = t.getMessage();
if ( message == null )
return;
// we know what to do about the common "Too many open files" error
if ( message.indexOf("Too many open files") != -1 )
if ( message.contains("Too many open files") )
exitSystemWithUserError(new UserException.TooManyOpenFiles());
// malformed BAM looks like a SAM file
if ( message.indexOf(PICARD_TEXT_SAM_FILE_ERROR_1) != -1 ||
message.indexOf(PICARD_TEXT_SAM_FILE_ERROR_2) != -1 )
if ( message.contains(PICARD_TEXT_SAM_FILE_ERROR_1) ||
message.contains(PICARD_TEXT_SAM_FILE_ERROR_2) )
exitSystemWithSamError(t);
// can't close tribble index when writing
if ( message.indexOf("Unable to close index for") != -1 )
if ( message.contains("Unable to close index for") )
exitSystemWithUserError(new UserException(t.getCause() == null ? message : t.getCause().getMessage()));
// disk is full
if ( message.indexOf("No space left on device") != -1 )
exitSystemWithUserError(new UserException(t.getMessage()));
if ( t.getCause() != null && t.getCause().getMessage().indexOf("No space left on device") != -1 )
exitSystemWithUserError(new UserException(t.getCause().getMessage()));
if ( message.contains("No space left on device") )
exitSystemWithUserError(new UserException.NoSpaceOnDevice());
if ( t.getCause() != null && t.getCause().getMessage().contains("No space left on device") )
exitSystemWithUserError(new UserException.NoSpaceOnDevice());
}
/**

View File

@ -1,52 +0,0 @@
package org.broadinstitute.sting.gatk;
import org.broadinstitute.sting.utils.exceptions.UserException;
/**
* Describes the method for downsampling reads at a given locus.
*
* A method is a combination of a downsampling type and at most one target
* (either a coverage or a fraction). The combination is validated in the
* constructor; invalid combinations are rejected with a
* UserException.CommandLineException.
*
* @author hanna
* @version 0.1
*/
public class DownsamplingMethod {
/**
* Type of downsampling to perform.
*/
public final DownsampleType type;
/**
* Actual downsampling target is specified as an integer number of reads.
* May be null when no coverage target was given.
*/
public final Integer toCoverage;
/**
* Actual downsampling target is specified as a fraction of total available reads.
* May be null when no fraction target was given.
*/
public final Double toFraction;
/**
* Expresses no downsampling applied at all.
*/
public static final DownsamplingMethod NONE = new DownsamplingMethod(DownsampleType.NONE,null,null);
/**
* Creates a downsampling method after checking that the parameters are mutually consistent.
*
* @param type the type of downsampling to perform
* @param toCoverage target number of reads, or null if not downsampling to coverage
* @param toFraction target fraction of reads, or null if not downsampling to a fraction
* @throws UserException.CommandLineException if neither target is given for a type other than NONE,
* if both targets are given, or if a fraction is given for BY_SAMPLE
*/
public DownsamplingMethod(DownsampleType type, Integer toCoverage, Double toFraction) {
// Do some basic sanity checks on the downsampling parameters passed in.
// Can't leave both toFraction and toCoverage null unless the type is NONE.
if(type != DownsampleType.NONE && toFraction == null && toCoverage == null)
throw new UserException.CommandLineException("Must specify either toFraction or toCoverage when downsampling.");
// Fraction and coverage cannot both be specified.
if(toFraction != null && toCoverage != null)
throw new UserException.CommandLineException("Downsampling coverage and fraction are both specified. Please choose only one.");
// By-sample downsampling does not work with a fraction of reads.
if(type == DownsampleType.BY_SAMPLE && toFraction != null)
throw new UserException.CommandLineException("Cannot downsample to fraction with new EXPERIMENTAL_BY_SAMPLE method");
this.type = type;
this.toCoverage = toCoverage;
this.toFraction = toFraction;
}
}

View File

@ -36,6 +36,7 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.datasources.reads.*;
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
import org.broadinstitute.sting.gatk.executive.MicroScheduler;
import org.broadinstitute.sting.gatk.filters.FilterManager;
import org.broadinstitute.sting.gatk.filters.ReadFilter;
@ -143,6 +144,8 @@ public class GenomeAnalysisEngine {
*/
private ThreadAllocation threadAllocation;
private ReadMetrics cumulativeMetrics = null;
/**
* A currently hacky unique name for this GATK instance
*/
@ -398,28 +401,22 @@ public class GenomeAnalysisEngine {
* Parse out the thread allocation from the given command-line argument.
*/
private void determineThreadAllocation() {
Tags tags = parsingEngine.getTags(argCollection.numberOfThreads);
if ( argCollection.numberOfDataThreads < 1 ) throw new UserException.BadArgumentValue("num_threads", "cannot be less than 1, but saw " + argCollection.numberOfDataThreads);
if ( argCollection.numberOfCPUThreadsPerDataThread < 1 ) throw new UserException.BadArgumentValue("num_cpu_threads", "cannot be less than 1, but saw " + argCollection.numberOfCPUThreadsPerDataThread);
if ( argCollection.numberOfIOThreads < 0 ) throw new UserException.BadArgumentValue("num_io_threads", "cannot be less than 0, but saw " + argCollection.numberOfIOThreads);
// TODO: Kill this complicated logic once Queue supports arbitrary tagged parameters.
Integer numCPUThreads = null;
if(tags.containsKey("cpu") && argCollection.numberOfCPUThreads != null)
throw new UserException("Number of CPU threads specified both directly on the command-line and as a tag to the nt argument. Please specify only one or the other.");
else if(tags.containsKey("cpu"))
numCPUThreads = Integer.parseInt(tags.getValue("cpu"));
else if(argCollection.numberOfCPUThreads != null)
numCPUThreads = argCollection.numberOfCPUThreads;
Integer numIOThreads = null;
if(tags.containsKey("io") && argCollection.numberOfIOThreads != null)
throw new UserException("Number of IO threads specified both directly on the command-line and as a tag to the nt argument. Please specify only one or the other.");
else if(tags.containsKey("io"))
numIOThreads = Integer.parseInt(tags.getValue("io"));
else if(argCollection.numberOfIOThreads != null)
numIOThreads = argCollection.numberOfIOThreads;
this.threadAllocation = new ThreadAllocation(argCollection.numberOfThreads, numCPUThreads, numIOThreads, ! argCollection.disableEfficiencyMonitor);
this.threadAllocation = new ThreadAllocation(argCollection.numberOfDataThreads,
argCollection.numberOfCPUThreadsPerDataThread,
argCollection.numberOfIOThreads,
! argCollection.disableEfficiencyMonitor);
}
/**
* Gets the total number of threads reported by the current thread allocation.
*
* @return the thread allocation's total thread count, or 1 if no thread allocation has been set yet
*/
public int getTotalNumberOfThreads() {
return this.threadAllocation == null ? 1 : threadAllocation.getTotalNumThreads();
}
/**
* Allow subclasses and others within this package direct access to the walker manager.
* @return The walker manager used by this package.
@ -445,14 +442,18 @@ public class GenomeAnalysisEngine {
protected DownsamplingMethod getDownsamplingMethod() {
GATKArgumentCollection argCollection = this.getArguments();
DownsamplingMethod method;
if(argCollection.getDownsamplingMethod() != null)
method = argCollection.getDownsamplingMethod();
else if(WalkerManager.getDownsamplingMethod(walker) != null)
method = WalkerManager.getDownsamplingMethod(walker);
else
method = GATKArgumentCollection.getDefaultDownsamplingMethod();
return method;
boolean useExperimentalDownsampling = argCollection.enableExperimentalDownsampling;
// until the file pointer bug with the experimental downsamplers is fixed, disallow running with experimental downsampling
if ( useExperimentalDownsampling ) {
throw new UserException("The experimental downsampling implementation is currently crippled by a file-pointer-related bug. Until this bug is fixed, it's not safe (or possible) for anyone to use the experimental implementation!");
}
DownsamplingMethod commandLineMethod = argCollection.getDownsamplingMethod();
DownsamplingMethod walkerMethod = WalkerManager.getDownsamplingMethod(walker, useExperimentalDownsampling);
DownsamplingMethod defaultMethod = DownsamplingMethod.getDefaultDownsamplingMethod(walker, useExperimentalDownsampling);
return commandLineMethod != null ? commandLineMethod : (walkerMethod != null ? walkerMethod : defaultMethod);
}
protected void setDownsamplingMethod(DownsamplingMethod method) {
@ -825,11 +826,13 @@ public class GenomeAnalysisEngine {
* @return A data source for the given set of reads.
*/
private SAMDataSource createReadsDataSource(GATKArgumentCollection argCollection, GenomeLocParser genomeLocParser, IndexedFastaSequenceFile refReader) {
DownsamplingMethod method = getDownsamplingMethod();
DownsamplingMethod downsamplingMethod = getDownsamplingMethod();
// Synchronize the method back into the collection so that it shows up when
// interrogating for the downsample method during command line recreation.
setDownsamplingMethod(method);
setDownsamplingMethod(downsamplingMethod);
logger.info(downsamplingMethod);
if (argCollection.removeProgramRecords && argCollection.keepProgramRecords)
throw new UserException.BadArgumentValue("rpr / kpr", "Cannot enable both options");
@ -847,7 +850,7 @@ public class GenomeAnalysisEngine {
argCollection.useOriginalBaseQualities,
argCollection.strictnessLevel,
argCollection.readBufferSize,
method,
downsamplingMethod,
new ValidationExclusion(Arrays.asList(argCollection.unsafe)),
filters,
readTransformers,
@ -1035,7 +1038,10 @@ public class GenomeAnalysisEngine {
* owned by the caller; the caller can do with the object what they wish.
*/
public ReadMetrics getCumulativeMetrics() {
return readsDataSource == null ? null : readsDataSource.getCumulativeReadMetrics();
// todo -- probably shouldn't be lazy
if ( cumulativeMetrics == null )
cumulativeMetrics = readsDataSource == null ? new ReadMetrics() : readsDataSource.getCumulativeReadMetrics();
return cumulativeMetrics;
}
/**

View File

@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk;
import net.sf.picard.filter.SamRecordFilter;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.TreeMap;
@ -119,11 +118,18 @@ public class ReadMetrics implements Cloneable {
return nRecords;
}
/**
* Increments the number of 'iterations' (one call of filter/map/reduce sequence) completed
* by the given amount.
*
* @param by the number of iterations to add to the running record count
*/
public void incrementNumIterations(final long by) {
nRecords += by;
}
/**
* Increments the number of 'iterations' (one call of filter/map/reduce sequence) completed.
*/
public void incrementNumIterations() {
nRecords++;
incrementNumIterations(1);
}
public long getNumReadsSeen() {

View File

@ -4,6 +4,7 @@ import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMFileReader;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
import org.broadinstitute.sting.gatk.filters.ReadFilter;
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;

View File

@ -27,6 +27,8 @@ package org.broadinstitute.sting.gatk;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
import org.broadinstitute.sting.gatk.filters.FilterManager;
import org.broadinstitute.sting.gatk.filters.ReadFilter;
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
@ -304,9 +306,10 @@ public class WalkerManager extends PluginManager<Walker> {
* downsampling method is specified on the command-line, the command-line version will
* be used instead.
* @param walkerClass The class of the walker to interrogate.
* @param useExperimentalDownsampling If true, use the experimental downsampling implementation
* @return The downsampling method, as specified by the walker. Null if none exists.
*/
public static DownsamplingMethod getDownsamplingMethod(Class<? extends Walker> walkerClass) {
public static DownsamplingMethod getDownsamplingMethod(Class<? extends Walker> walkerClass, boolean useExperimentalDownsampling) {
DownsamplingMethod downsamplingMethod = null;
if( walkerClass.isAnnotationPresent(Downsample.class) ) {
@ -314,7 +317,7 @@ public class WalkerManager extends PluginManager<Walker> {
DownsampleType type = downsampleParameters.by();
Integer toCoverage = downsampleParameters.toCoverage() >= 0 ? downsampleParameters.toCoverage() : null;
Double toFraction = downsampleParameters.toFraction() >= 0.0d ? downsampleParameters.toFraction() : null;
downsamplingMethod = new DownsamplingMethod(type,toCoverage,toFraction);
downsamplingMethod = new DownsamplingMethod(type,toCoverage,toFraction,useExperimentalDownsampling);
}
return downsamplingMethod;
@ -333,10 +336,11 @@ public class WalkerManager extends PluginManager<Walker> {
* downsampling method is specified on the command-line, the command-line version will
* be used instead.
* @param walker The walker to interrogate.
* @param useExperimentalDownsampling If true, use the experimental downsampling implementation
* @return The downsampling method, as specified by the walker. Null if none exists.
*/
public static DownsamplingMethod getDownsamplingMethod(Walker walker) {
return getDownsamplingMethod(walker.getClass());
public static DownsamplingMethod getDownsamplingMethod(Walker walker, boolean useExperimentalDownsampling) {
return getDownsamplingMethod(walker.getClass(), useExperimentalDownsampling);
}
/**

View File

@ -31,8 +31,8 @@ import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.IntervalBinding;
import org.broadinstitute.sting.gatk.DownsampleType;
import org.broadinstitute.sting.gatk.DownsamplingMethod;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
import org.broadinstitute.sting.gatk.phonehome.GATKRunReport;
import org.broadinstitute.sting.gatk.samples.PedigreeValidationType;
import org.broadinstitute.sting.utils.QualityUtils;
@ -41,7 +41,9 @@ import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.utils.interval.IntervalSetRule;
import java.io.File;
import java.util.*;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
* @author aaron
@ -138,15 +140,11 @@ public class GATKArgumentCollection {
@Argument(fullName = "nonDeterministicRandomSeed", shortName = "ndrs", doc = "Makes the GATK behave non deterministically, that is, the random numbers generated will be different in every run", required = false)
public boolean nonDeterministicRandomSeed = false;
/**
* The override mechanism in the GATK, by default, populates the command-line arguments, then
* the defaults from the walker annotations. Unfortunately, walker annotations should be trumped
* by a user explicitly specifying command-line arguments.
* TODO: Change the GATK so that walker defaults are loaded first, then command-line arguments.
*/
private static DownsampleType DEFAULT_DOWNSAMPLING_TYPE = DownsampleType.BY_SAMPLE;
private static int DEFAULT_DOWNSAMPLING_COVERAGE = 1000;
// --------------------------------------------------------------------------------------------------------------
//
// Downsampling Arguments
//
// --------------------------------------------------------------------------------------------------------------
@Argument(fullName = "downsampling_type", shortName="dt", doc="Type of reads downsampling to employ at a given locus. Reads will be selected randomly to be removed from the pile based on the method described here", required = false)
public DownsampleType downsamplingType = null;
@ -156,17 +154,20 @@ public class GATKArgumentCollection {
@Argument(fullName = "downsample_to_coverage", shortName = "dcov", doc = "Coverage [integer] to downsample to at any given locus; note that downsampled reads are randomly selected from all possible reads at a locus", required = false)
public Integer downsampleCoverage = null;
@Argument(fullName = "enable_experimental_downsampling", shortName = "enable_experimental_downsampling", doc = "Enable experimental engine-level downsampling", required = false)
@Hidden
public boolean enableExperimentalDownsampling = false;
/**
* Gets the downsampling method explicitly specified by the user. If the user didn't specify
* any downsampling arguments, returns null.
* @return The explicitly specified downsampling mechanism, or null if none was specified.
*/
public DownsamplingMethod getDownsamplingMethod() {
if(downsamplingType == null && downsampleFraction == null && downsampleCoverage == null)
if ( downsamplingType == null && downsampleFraction == null && downsampleCoverage == null )
return null;
if(downsamplingType == null && downsampleCoverage != null)
return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE,downsampleCoverage,null);
return new DownsamplingMethod(downsamplingType,downsampleCoverage,downsampleFraction);
return new DownsamplingMethod(downsamplingType, downsampleCoverage, downsampleFraction, enableExperimentalDownsampling);
}
/**
@ -176,9 +177,11 @@ public class GATKArgumentCollection {
public void setDownsamplingMethod(DownsamplingMethod method) {
if (method == null)
throw new IllegalArgumentException("method is null");
downsamplingType = method.type;
downsampleCoverage = method.toCoverage;
downsampleFraction = method.toFraction;
enableExperimentalDownsampling = method.useExperimentalDownsampling;
}
// --------------------------------------------------------------------------------------------------------------
@ -197,17 +200,14 @@ public class GATKArgumentCollection {
// performance log arguments
//
// --------------------------------------------------------------------------------------------------------------
@Argument(fullName = "performanceLog", shortName="PF", doc="If provided, a GATK runtime performance log will be written to this file", required = false)
public File performanceLog = null;
/**
* Gets the default downsampling method, returned if the user didn't specify any downsampling
* method.
* @return The default downsampling mechanism, or null if none exists.
* The file name for the GATK performance log output, or null if you don't want to generate the
* detailed performance logging table. This table is suitable for importing into R or any
* other analysis software that can read tsv files
*/
public static DownsamplingMethod getDefaultDownsamplingMethod() {
return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE,DEFAULT_DOWNSAMPLING_COVERAGE,null);
}
@Argument(fullName = "performanceLog", shortName="PF", doc="If provided, a GATK runtime performance log will be written to this file", required = false)
public File performanceLog = null;
@Argument(fullName="useOriginalQualities", shortName = "OQ", doc = "If set, use the original base quality scores from the OQ tag when present instead of the standard scores", required=false)
public Boolean useOriginalBaseQualities = false;
@ -279,9 +279,32 @@ public class GATKArgumentCollection {
@Argument(fullName = "unsafe", shortName = "U", doc = "If set, enables unsafe operations: nothing will be checked at runtime. For expert users only who know what they are doing. We do not support usage of this argument.", required = false)
public ValidationExclusion.TYPE unsafe;
/** How many threads should be allocated to this analysis. */
@Argument(fullName = "num_threads", shortName = "nt", doc = "How many threads should be allocated to running this analysis.", required = false)
public Integer numberOfThreads = 1;
// --------------------------------------------------------------------------------------------------------------
//
// Multi-threading arguments
//
// --------------------------------------------------------------------------------------------------------------
/**
* How many data threads should be allocated to this analysis? Each data thread contains N CPU threads,
* and data threads act as completely data-parallel processing units, increasing the memory usage of GATK
* by M data threads. Data threads generally scale extremely effectively, up to 24 cores.
*/
@Argument(fullName = "num_threads", shortName = "nt", doc = "How many data threads should be allocated to running this analysis.", required = false)
public Integer numberOfDataThreads = 1;
/**
* How many CPU threads should be allocated per data thread? Each CPU thread operates the map
* cycle independently, but may run into earlier scaling problems with IO than data threads. Has
* the benefit of not requiring X times as much memory per thread as data threads do, but rather
* only a constant overhead.
*/
@Argument(fullName="num_cpu_threads_per_data_thread", shortName = "nct", doc="How many CPU threads should be allocated per data thread to running this analysis?", required = false)
public int numberOfCPUThreadsPerDataThread = 1;
@Argument(fullName="num_io_threads", shortName = "nit", doc="How many of the given threads should be allocated to IO", required = false)
@Hidden
public int numberOfIOThreads = 0;
/**
* By default the GATK monitors its own efficiency, but this can have an itsy-bitsy tiny
@ -291,17 +314,6 @@ public class GATKArgumentCollection {
@Argument(fullName = "disableThreadEfficiencyMonitor", shortName = "dtem", doc = "Disable GATK efficiency monitoring", required = false)
public Boolean disableEfficiencyMonitor = false;
/**
* The following two arguments (num_cpu_threads, num_io_threads are TEMPORARY since Queue cannot currently support arbitrary tagged data types.
* TODO: Kill this when I can do a tagged integer in Queue.
*/
@Argument(fullName="num_cpu_threads", shortName = "nct", doc="How many of the given threads should be allocated to the CPU", required = false)
@Hidden
public Integer numberOfCPUThreads = null;
@Argument(fullName="num_io_threads", shortName = "nit", doc="How many of the given threads should be allocated to IO", required = false)
@Hidden
public Integer numberOfIOThreads = null;
@Argument(fullName = "num_bam_file_handles", shortName = "bfh", doc="The total number of BAM file handles to keep open simultaneously", required=false)
public Integer numberOfBAMFileHandles = null;

View File

@ -1,6 +1,6 @@
package org.broadinstitute.sting.gatk.datasources.providers;
import org.broadinstitute.sting.gatk.DownsampleType;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.ReadProperties;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.iterators.LocusIterator;
@ -135,8 +135,13 @@ public abstract class LocusView extends LocusIterator implements View {
// Cache the current and apply filtering.
AlignmentContext current = nextLocus;
if( sourceInfo.getDownsamplingMethod().type == DownsampleType.ALL_READS && sourceInfo.getDownsamplingMethod().toCoverage != null )
// The old ALL_READS downsampling implementation -- only use if we're not using the new experimental downsampling:
if( ! sourceInfo.getDownsamplingMethod().useExperimentalDownsampling &&
sourceInfo.getDownsamplingMethod().type == DownsampleType.ALL_READS && sourceInfo.getDownsamplingMethod().toCoverage != null ) {
current.downsampleToCoverage( sourceInfo.getDownsamplingMethod().toCoverage );
}
// Indicate that the next operation will need to advance.
nextLocus = null;

View File

@ -30,7 +30,9 @@ import net.sf.samtools.*;
import net.sf.samtools.util.CloseableIterator;
import net.sf.samtools.util.RuntimeIOException;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.DownsamplingMethod;
import org.broadinstitute.sting.gatk.downsampling.*;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
import org.broadinstitute.sting.gatk.ReadMetrics;
import org.broadinstitute.sting.gatk.ReadProperties;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
@ -152,6 +154,8 @@ public class SAMDataSource {
*/
private final ThreadAllocation threadAllocation;
private final boolean expandShardsForDownsampling;
/**
* Create a new SAM data source given the supplied read metadata.
* @param samFiles list of reads files.
@ -302,6 +306,11 @@ public class SAMDataSource {
includeReadsWithDeletionAtLoci,
defaultBaseQualities);
expandShardsForDownsampling = readProperties.getDownsamplingMethod() != null &&
readProperties.getDownsamplingMethod().useExperimentalDownsampling &&
readProperties.getDownsamplingMethod().type != DownsampleType.NONE &&
readProperties.getDownsamplingMethod().toCoverage != null;
// cache the read group id (original) -> read group id (merged)
// and read group id (merged) -> read group id (original) mappings.
for(SAMReaderID id: readerIDs) {
@ -457,6 +466,16 @@ public class SAMDataSource {
}
}
/**
* Are we expanding shards as necessary to prevent shard boundaries from occurring at improper places?
*
* @return true if we are using expanded shards, otherwise false
*/
public boolean usingExpandedShards() {
return expandShardsForDownsampling;
}
/**
* Fill the given buffering shard with reads.
* @param shard Shard to fill.
@ -484,6 +503,31 @@ public class SAMDataSource {
}
}
// If the reads are sorted in coordinate order, ensure that all reads
// having the same alignment start become part of the same shard, to allow
// downsampling to work better across shard boundaries. Note that because our
// read stream has already been fed through the positional downsampler, which
// ensures that at each alignment start position there are no more than dcov
// reads, we're in no danger of accidentally creating a disproportionately huge
// shard
if ( expandShardsForDownsampling && sortOrder == SAMFileHeader.SortOrder.coordinate ) {
while ( iterator.hasNext() ) {
SAMRecord additionalRead = iterator.next();
// Stop filling the shard as soon as we encounter a read having a different
// alignment start or contig from the last read added in the earlier loop
// above, or an unmapped read
if ( read == null ||
additionalRead.getReadUnmappedFlag() ||
! additionalRead.getReferenceIndex().equals(read.getReferenceIndex()) ||
additionalRead.getAlignmentStart() != read.getAlignmentStart() ) {
break;
}
shard.addRead(additionalRead);
noteFilePositionUpdate(positionUpdates, additionalRead);
}
}
// If the reads are sorted in queryname order, ensure that all reads
// having the same queryname become part of the same shard.
if(sortOrder == SAMFileHeader.SortOrder.queryname) {
@ -578,6 +622,7 @@ public class SAMDataSource {
iterator = new MalformedBAMErrorReformatingIterator(id.samFile, iterator);
if(shard.getGenomeLocs().size() > 0)
iterator = new IntervalOverlapFilteringIterator(iterator,shard.getGenomeLocs());
iteratorMap.put(readers.getReader(id), iterator);
}
@ -660,20 +705,25 @@ public class SAMDataSource {
List<ReadTransformer> readTransformers,
byte defaultBaseQualities) {
// *********************************************************************************** //
// * NOTE: ALL FILTERING SHOULD BE DONE BEFORE ANY ITERATORS THAT MODIFY THE READS! * //
// * (otherwise we will process something that we may end up throwing away) * //
// *********************************************************************************** //
// ************************************************************************************************ //
// * NOTE: ALL FILTERING/DOWNSAMPLING SHOULD BE DONE BEFORE ANY ITERATORS THAT MODIFY THE READS! * //
// * (otherwise we will process something that we may end up throwing away) * //
// ************************************************************************************************ //
if (downsamplingFraction != null)
wrappedIterator = new DownsampleIterator(wrappedIterator, downsamplingFraction);
wrappedIterator = StingSAMIteratorAdapter.adapt(new CountingFilteringIterator(readMetrics,wrappedIterator,supplementalFilters));
if ( readProperties.getDownsamplingMethod().useExperimentalDownsampling ) {
wrappedIterator = applyDownsamplingIterator(wrappedIterator);
}
// Use the old fractional downsampler only if we're not using experimental downsampling:
if ( ! readProperties.getDownsamplingMethod().useExperimentalDownsampling && downsamplingFraction != null )
wrappedIterator = new LegacyDownsampleIterator(wrappedIterator, downsamplingFraction);
// unless they've said not to validate read ordering (!noValidationOfReadOrder) and we've enabled verification,
// verify the read ordering by applying a sort order iterator
if (!noValidationOfReadOrder && enableVerification)
wrappedIterator = new VerifyingSamIterator(genomeLocParser,wrappedIterator);
wrappedIterator = StingSAMIteratorAdapter.adapt(new CountingFilteringIterator(readMetrics,wrappedIterator,supplementalFilters));
wrappedIterator = new VerifyingSamIterator(wrappedIterator);
if (useOriginalBaseQualities || defaultBaseQualities >= 0)
// only wrap if we are replacing the original qualities or using a default base quality
@ -688,6 +738,26 @@ public class SAMDataSource {
return wrappedIterator;
}
/**
* Wraps the given iterator in a downsampling iterator chosen from the downsampling method
* stored in readProperties.
*
* For BY_SAMPLE, the iterator is wrapped in a PerSampleDownsamplingReadsIterator; for ALL_READS,
* in a DownsamplingReadsIterator. In both cases a positional downsampler is used when a coverage
* target is set, and a fractional downsampler otherwise. For any other type the iterator is
* returned unchanged.
*
* @param wrappedIterator the iterator to apply downsampling to
* @return the iterator wrapped with the selected downsampler, or wrappedIterator itself if the
* downsampling type is neither BY_SAMPLE nor ALL_READS
*/
protected StingSAMIterator applyDownsamplingIterator( StingSAMIterator wrappedIterator ) {
if ( readProperties.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE ) {
// Per-sample downsampling takes a factory rather than a single downsampler instance.
ReadsDownsamplerFactory<SAMRecord> downsamplerFactory = readProperties.getDownsamplingMethod().toCoverage != null ?
new SimplePositionalDownsamplerFactory<SAMRecord>(readProperties.getDownsamplingMethod().toCoverage) :
new FractionalDownsamplerFactory<SAMRecord>(readProperties.getDownsamplingMethod().toFraction);
return new PerSampleDownsamplingReadsIterator(wrappedIterator, downsamplerFactory);
}
else if ( readProperties.getDownsamplingMethod().type == DownsampleType.ALL_READS ) {
// A single downsampler is applied to the whole read stream.
ReadsDownsampler<SAMRecord> downsampler = readProperties.getDownsamplingMethod().toCoverage != null ?
new SimplePositionalDownsampler<SAMRecord>(readProperties.getDownsamplingMethod().toCoverage) :
new FractionalDownsampler<SAMRecord>(readProperties.getDownsamplingMethod().toFraction);
return new DownsamplingReadsIterator(wrappedIterator, downsampler);
}
// Any other downsampling type: no downsampling iterator is applied.
return wrappedIterator;
}
private class SAMResourcePool {
/**
* How many entries can be cached in this resource pool?

View File

@ -1,4 +1,4 @@
package org.broadinstitute.sting.gatk;
package org.broadinstitute.sting.gatk.downsampling;
/**
* Type of downsampling method to invoke.

View File

@ -28,49 +28,92 @@ import java.util.Collection;
import java.util.List;
/**
* The basic downsampler API, with no reads-specific operations
* The basic downsampler API, with no reads-specific operations.
*
* Downsamplers that extend this interface rather than the ReadsDownsampler interface can handle
* any kind of item, however they cannot be wrapped within a DownsamplingReadsIterator or a
* PerSampleDownsamplingReadsIterator.
*
* @author David Roazen
*/
public interface Downsampler<T> {
/*
* Submit one item to the downsampler for consideration . Some downsamplers will be able to determine
/**
* Submit one item to the downsampler for consideration. Some downsamplers will be able to determine
* immediately whether the item survives the downsampling process, while others will need to see
* more items before making that determination.
*
* @param item the individual item to submit to the downsampler for consideration
*/
public void submit( T item );
/*
* Submit a collection of items to the downsampler for consideration.
/**
* Submit a collection of items to the downsampler for consideration. Should be equivalent to calling
* submit() on each individual item in the collection.
*
* @param items the collection of items to submit to the downsampler for consideration
*/
public void submit( Collection<T> items );
/*
/**
* Are there items that have survived the downsampling process waiting to be retrieved?
*
* @return true if this downsampler has > 0 finalized items, otherwise false
*/
public boolean hasDownsampledItems();
public boolean hasFinalizedItems();
/*
* Return (and remove) all items that have survived downsampling and are waiting to be retrieved.
/**
* Return (and *remove*) all items that have survived downsampling and are waiting to be retrieved.
*
* @return a list of all finalized items this downsampler contains, or an empty list if there are none
*/
public List<T> consumeDownsampledItems();
public List<T> consumeFinalizedItems();
/*
/**
* Are there items stored in this downsampler that it doesn't yet know whether they will
* ultimately survive the downsampling process?
*
* @return true if this downsampler has > 0 pending items, otherwise false
*/
public boolean hasPendingItems();
/*
/**
* Peek at the first finalized item stored in this downsampler (or null if there are no finalized items)
*
* @return the first finalized item in this downsampler (the item is not removed from the downsampler by this call),
* or null if there are none
*/
public T peekFinalized();
/**
* Peek at the first pending item stored in this downsampler (or null if there are no pending items)
*
* @return the first pending item stored in this downsampler (the item is not removed from the downsampler by this call),
* or null if there are none
*/
public T peekPending();
/**
* Returns the number of items discarded (so far) during the downsampling process
*
* @return the number of items that have been submitted to this downsampler and discarded in the process of
* downsampling
*/
public int getNumberOfDiscardedItems();
/**
* Used to tell the downsampler that no more items will be submitted to it, and that it should
* finalize any pending items.
*/
public void signalEndOfInput();
/*
* Reset the downsampler to a clean state, devoid of any pending/downsampled items or tracked state
* information.
/**
* Empty the downsampler of all finalized/pending items
*/
public void clear();
/**
* Reset stats in the downsampler such as the number of discarded items *without* clearing the downsampler of items
*/
public void reset();
}

View File

@ -0,0 +1,153 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.exceptions.UserException;
/**
* Describes the method for downsampling reads at a given locus.
*/
public class DownsamplingMethod {
    /**
     * Type of downsampling to perform.
     */
    public final DownsampleType type;

    /**
     * Actual downsampling target is specified as an integer number of reads.
     * Always null when type is NONE.
     */
    public final Integer toCoverage;

    /**
     * Actual downsampling target is specified as a fraction of total available reads.
     * Always null when type is NONE.
     */
    public final Double toFraction;

    /**
     * Use the new experimental downsampling?
     */
    public final boolean useExperimentalDownsampling;

    /**
     * Expresses no downsampling applied at all.
     */
    public static final DownsamplingMethod NONE = new DownsamplingMethod(DownsampleType.NONE,null,null,false);

    /**
     * Default type to use if no type is specified
     */
    public static DownsampleType DEFAULT_DOWNSAMPLING_TYPE = DownsampleType.BY_SAMPLE;

    /**
     * Default target coverage for locus-based traversals
     */
    public static int DEFAULT_LOCUS_BASED_TRAVERSAL_DOWNSAMPLING_COVERAGE = 1000;

    /**
     * Construct a downsampling method description and validate it.
     *
     * @param type type of downsampling; if null, DEFAULT_DOWNSAMPLING_TYPE is used
     * @param toCoverage target coverage, or null if not downsampling to coverage (ignored when type is NONE)
     * @param toFraction target fraction of reads, or null if not downsampling to a fraction (ignored when type is NONE)
     * @param useExperimentalDownsampling whether the experimental downsampling implementation is in use
     * @throws UserException.CommandLineException if the combination of settings is invalid
     */
    public DownsamplingMethod( DownsampleType type, Integer toCoverage, Double toFraction, boolean useExperimentalDownsampling ) {
        this.type = type != null ? type : DEFAULT_DOWNSAMPLING_TYPE;
        this.useExperimentalDownsampling = useExperimentalDownsampling;

        // The targets are meaningless when no downsampling is requested, so drop them. This must
        // be done on the fields themselves: re-assigning the constructor parameters after the
        // fields have been set has no effect.
        if ( this.type == DownsampleType.NONE ) {
            this.toCoverage = null;
            this.toFraction = null;
        }
        else {
            this.toCoverage = toCoverage;
            this.toFraction = toFraction;
        }

        validate();
    }

    /**
     * Checks that the (already assigned) fields describe a coherent downsampling request.
     *
     * @throws UserException.CommandLineException on the first violated constraint
     */
    private void validate() {
        // Can't leave toFraction and toCoverage null unless type is NONE
        if ( type != DownsampleType.NONE && toFraction == null && toCoverage == null )
            throw new UserException.CommandLineException("Must specify either toFraction or toCoverage when downsampling.");

        // Fraction and coverage cannot both be specified.
        if ( toFraction != null && toCoverage != null )
            throw new UserException.CommandLineException("Downsampling coverage and fraction are both specified. Please choose only one.");

        // toCoverage must be > 0 when specified
        if ( toCoverage != null && toCoverage <= 0 ) {
            throw new UserException.CommandLineException("toCoverage must be > 0 when downsampling to coverage");
        }

        // toFraction must be >= 0.0 and <= 1.0 when specified
        if ( toFraction != null && (toFraction < 0.0 || toFraction > 1.0) ) {
            throw new UserException.CommandLineException("toFraction must be >= 0.0 and <= 1.0 when downsampling to a fraction of reads");
        }

        if ( ! useExperimentalDownsampling ) {
            // Restriction of the old implementation only:
            // by sample downsampling does not work with a fraction of reads
            if ( type == DownsampleType.BY_SAMPLE && toFraction != null )
                throw new UserException.CommandLineException("Cannot downsample to fraction with the BY_SAMPLE method");
        }
        else {
            // Restriction of the new (experimental) implementation only
            if ( type == DownsampleType.ALL_READS && toCoverage != null ) {
                throw new UserException.CommandLineException("Cannot downsample to coverage with the ALL_READS method in the experimental downsampling implementation");
            }
        }
    }

    /**
     * @return a human-readable summary of the downsampling settings
     */
    @Override
    public String toString() {
        StringBuilder builder = new StringBuilder("Downsampling Settings: ");

        if ( type == DownsampleType.NONE ) {
            builder.append("No downsampling");
        }
        else {
            builder.append(String.format("Method: %s ", type));

            // validate() guarantees exactly one of the two targets is set for non-NONE types
            if ( toCoverage != null ) {
                builder.append(String.format("Target Coverage: %d ", toCoverage));
            }
            else {
                builder.append(String.format("Target Fraction: %.2f ", toFraction));
            }

            if ( useExperimentalDownsampling ) {
                builder.append("Using Experimental Downsampling");
            }
        }

        return builder.toString();
    }

    /**
     * Returns the default downsampling method for the given walker: downsampling to the default
     * coverage using the default type for locus and active-region walkers, and no downsampling
     * for all other walkers.
     *
     * @param walker the walker about to be run
     * @param useExperimentalDownsampling whether the experimental downsampling implementation is in use
     * @return the default downsampling method for the walker
     */
    public static DownsamplingMethod getDefaultDownsamplingMethod( Walker walker, boolean useExperimentalDownsampling ) {
        if ( walker instanceof LocusWalker || walker instanceof ActiveRegionWalker ) {
            return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE, DEFAULT_LOCUS_BASED_TRAVERSAL_DOWNSAMPLING_COVERAGE,
                                          null, useExperimentalDownsampling);
        }
        else {
            return new DownsamplingMethod(DownsampleType.NONE, null, null, useExperimentalDownsampling);
        }
    }
}

View File

@ -33,7 +33,8 @@ import java.util.NoSuchElementException;
/**
* StingSAMIterator wrapper around our generic reads downsampler interface
* StingSAMIterator wrapper around our generic reads downsampler interface. Converts the push-style
* downsampler interface to a pull model.
*
* @author David Roazen
*/
@ -42,35 +43,50 @@ public class DownsamplingReadsIterator implements StingSAMIterator {
private StingSAMIterator nestedSAMIterator;
private ReadsDownsampler<SAMRecord> downsampler;
private Collection<SAMRecord> downsampledReadsCache;
private Iterator<SAMRecord> downsampledReadsCacheIterator;
private SAMRecord nextRead = null;
private Iterator<SAMRecord> downsampledReadsCacheIterator = null;
/**
* @param iter wrapped iterator from which this iterator will pull reads
* @param downsampler downsampler through which the reads will be fed
*/
public DownsamplingReadsIterator( StingSAMIterator iter, ReadsDownsampler<SAMRecord> downsampler ) {
nestedSAMIterator = iter;
this.downsampler = downsampler;
fillDownsampledReadsCache();
advanceToNextRead();
}
public boolean hasNext() {
if ( downsampledReadsCacheIterator.hasNext() ) {
return true;
}
else if ( ! nestedSAMIterator.hasNext() || ! fillDownsampledReadsCache() ) {
return false;
}
return true;
return nextRead != null;
}
public SAMRecord next() {
if ( ! downsampledReadsCacheIterator.hasNext() && ! fillDownsampledReadsCache() ) {
if ( nextRead == null ) {
throw new NoSuchElementException("next() called when there are no more items");
}
return downsampledReadsCacheIterator.next();
SAMRecord toReturn = nextRead;
advanceToNextRead();
return toReturn;
}
/**
 * Moves nextRead forward to the next downsampled read, refilling the cache of downsampled
 * reads if the current cache is exhausted. Sets nextRead to null when no reads remain.
 */
private void advanceToNextRead() {
    // Only attempt a refill when the current cache has nothing left to hand out
    final boolean readAvailable = readyToReleaseReads() || fillDownsampledReadsCache();
    nextRead = readAvailable ? downsampledReadsCacheIterator.next() : null;
}
/**
 * @return true if the cache of downsampled reads exists and still has reads to release
 */
private boolean readyToReleaseReads() {
    if ( downsampledReadsCacheIterator == null ) {
        return false;
    }
    return downsampledReadsCacheIterator.hasNext();
}
private boolean fillDownsampledReadsCache() {
while ( nestedSAMIterator.hasNext() && ! downsampler.hasDownsampledItems() ) {
while ( nestedSAMIterator.hasNext() && ! downsampler.hasFinalizedItems() ) {
downsampler.submit(nestedSAMIterator.next());
}
@ -78,7 +94,8 @@ public class DownsamplingReadsIterator implements StingSAMIterator {
downsampler.signalEndOfInput();
}
downsampledReadsCache = downsampler.consumeDownsampledItems();
// use returned collection directly rather than make a copy, for speed
downsampledReadsCache = downsampler.consumeFinalizedItems();
downsampledReadsCacheIterator = downsampledReadsCache.iterator();
return downsampledReadsCacheIterator.hasNext();

View File

@ -33,7 +33,10 @@ import java.util.Collection;
import java.util.List;
/**
* Fractional Downsampler: selects a specified fraction of the reads for inclusion
* Fractional Downsampler: selects a specified fraction of the reads for inclusion.
*
* Since the selection is done randomly, the actual fraction of reads retained may be slightly
* more or less than the requested fraction, depending on the total number of reads submitted.
*
* @author David Roazen
*/
@ -43,8 +46,16 @@ public class FractionalDownsampler<T extends SAMRecord> implements ReadsDownsamp
private int cutoffForInclusion;
private int numDiscardedItems;
private static final int RANDOM_POOL_SIZE = 10000;
/**
* Construct a FractionalDownsampler
*
* @param fraction Fraction of reads to preserve, between 0.0 (inclusive) and 1.0 (inclusive).
* Actual number of reads preserved may differ randomly.
*/
public FractionalDownsampler( double fraction ) {
if ( fraction < 0.0 || fraction > 1.0 ) {
throw new ReviewedStingException("Fraction of reads to include must be between 0.0 and 1.0, inclusive");
@ -52,12 +63,16 @@ public class FractionalDownsampler<T extends SAMRecord> implements ReadsDownsamp
cutoffForInclusion = (int)(fraction * RANDOM_POOL_SIZE);
clear();
reset();
}
/**
 * Submit one read for consideration. The read is kept if a random draw from
 * [0, RANDOM_POOL_SIZE) falls below the inclusion cutoff; otherwise it is discarded
 * and counted as such.
 *
 * @param newRead the read to submit
 */
public void submit( T newRead ) {
    // Draw from the same pool size that cutoffForInclusion was scaled by, so the two cannot drift apart
    if ( GenomeAnalysisEngine.getRandomGenerator().nextInt(RANDOM_POOL_SIZE) < cutoffForInclusion ) {
        selectedReads.add(newRead);
    }
    else {
        numDiscardedItems++;
    }
}
public void submit( Collection<T> newReads ) {
@ -66,11 +81,12 @@ public class FractionalDownsampler<T extends SAMRecord> implements ReadsDownsamp
}
}
public boolean hasDownsampledItems() {
public boolean hasFinalizedItems() {
return selectedReads.size() > 0;
}
public List<T> consumeDownsampledItems() {
public List<T> consumeFinalizedItems() {
// pass by reference rather than make a copy, for speed
List<T> downsampledItems = selectedReads;
clear();
return downsampledItems;
@ -80,6 +96,18 @@ public class FractionalDownsampler<T extends SAMRecord> implements ReadsDownsamp
return false;
}
/**
 * @return the first selected read without removing it, or null if no reads are currently selected
 */
public T peekFinalized() {
    if ( selectedReads.isEmpty() ) {
        return null;
    }
    return selectedReads.get(0);
}
/**
 * @return always null in this implementation
 */
public T peekPending() {
return null;
}
/**
 * @return the current value of the discarded-items counter (zeroed by reset())
 */
public int getNumberOfDiscardedItems() {
return numDiscardedItems;
}
/**
 * Does nothing in this implementation.
 */
public void signalEndOfInput() {
// NO-OP
}
@ -88,7 +116,15 @@ public class FractionalDownsampler<T extends SAMRecord> implements ReadsDownsamp
selectedReads = new ArrayList<T>();
}
/**
 * Zeroes the discarded-items counter. Does not touch the selected reads.
 */
public void reset() {
numDiscardedItems = 0;
}
/**
 * @return always false in this implementation
 */
public boolean requiresCoordinateSortOrder() {
return false;
}
/**
 * Does nothing in this implementation; the read argument is ignored.
 *
 * @param read unused
 */
public void signalNoMoreReadsBefore( T read ) {
// NO-OP
}
}

View File

@ -0,0 +1,45 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMRecord;
/**
* Factory for creating FractionalDownsamplers on demand
*
* @author David Roazen
*/
public class FractionalDownsamplerFactory<T extends SAMRecord> implements ReadsDownsamplerFactory<T> {

    /** Fraction handed to every FractionalDownsampler this factory creates. */
    private final double fraction;

    /**
     * @param fraction the fraction passed to each FractionalDownsampler created by this factory
     */
    public FractionalDownsamplerFactory( double fraction ) {
        this.fraction = fraction;
    }

    /**
     * @return a new FractionalDownsampler configured with this factory's fraction
     */
    public ReadsDownsampler<T> newInstance() {
        final ReadsDownsampler<T> freshDownsampler = new FractionalDownsampler<T>(fraction);
        return freshDownsampler;
    }
}

View File

@ -0,0 +1,212 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import org.broadinstitute.sting.utils.MathUtils;
import java.util.*;
/**
* Leveling Downsampler: Given a set of Lists of arbitrary items and a target size, removes items from
* the Lists in an even fashion until the total size of all Lists is <= the target size. Leveling
* does not occur until all Lists have been submitted and signalEndOfInput() is called.
*
* The Lists should be LinkedLists for maximum efficiency during item removal, however other
* kinds of Lists are also accepted (albeit at a slight performance penalty).
*
* Since this downsampler extends the Downsampler interface rather than the ReadsDownsampler interface,
* the Lists need not contain reads. However, this downsampler may not be wrapped within one of the
* DownsamplingReadsIterators.
*
* @param <T> the List type representing the stacks to be leveled
* @param <E> the type of the elements of each List
*
* @author David Roazen
*/
public class LevelingDownsampler<T extends List<E>, E> implements Downsampler<T> {

    /** Upper bound on the combined size of all groups after leveling. */
    private int targetSize;

    /** The groups (Lists) submitted so far; replaced by a fresh list on clear(). */
    private List<T> groups;

    /** True once signalEndOfInput() has leveled the current groups. */
    private boolean groupsAreFinalized;

    /** Running count of individual items removed from groups during leveling. */
    private int numDiscardedItems;

    /**
     * Construct a LevelingDownsampler
     *
     * @param targetSize the sum of the sizes of all individual Lists this downsampler is fed may not exceed
     *                   this value -- if it does, items are removed from Lists evenly until the total size
     *                   is <= this value
     */
    public LevelingDownsampler( int targetSize ) {
        this.targetSize = targetSize;
        clear();
        reset();
    }

    /**
     * Adds one group to the set of groups to be leveled.
     *
     * @param item the group to add
     */
    public void submit( T item ) {
        groups.add(item);
    }

    /**
     * Adds several groups to the set of groups to be leveled.
     *
     * @param items the groups to add
     */
    public void submit( Collection<T> items ){
        groups.addAll(items);
    }

    /**
     * @return true if leveling has been performed and there is at least one group to retrieve
     */
    public boolean hasFinalizedItems() {
        return groupsAreFinalized && groups.size() > 0;
    }

    /**
     * Returns (and removes) all leveled groups.
     *
     * @return the leveled groups, or an empty list if there are no finalized groups
     */
    public List<T> consumeFinalizedItems() {
        if ( ! hasFinalizedItems() ) {
            return new ArrayList<T>();
        }

        // pass by reference rather than make a copy, for speed
        List<T> toReturn = groups;
        clear();
        return toReturn;
    }

    /**
     * @return true if groups have been submitted but not yet leveled
     */
    public boolean hasPendingItems() {
        return ! groupsAreFinalized && groups.size() > 0;
    }

    /**
     * @return the first leveled group (not removed), or null if there are no finalized groups
     */
    public T peekFinalized() {
        return hasFinalizedItems() ? groups.get(0) : null;
    }

    /**
     * @return the first not-yet-leveled group (not removed), or null if there are no pending groups
     */
    public T peekPending() {
        return hasPendingItems() ? groups.get(0) : null;
    }

    /**
     * @return the number of individual items removed by leveling since the last reset()
     */
    public int getNumberOfDiscardedItems() {
        return numDiscardedItems;
    }

    /**
     * Levels all groups submitted so far and marks them as finalized.
     */
    public void signalEndOfInput() {
        levelGroups();
        groupsAreFinalized = true;
    }

    /**
     * Drops all groups (finalized or pending). The discarded-items count is left untouched.
     */
    public void clear() {
        groups = new ArrayList<T>();
        groupsAreFinalized = false;
    }

    /**
     * Zeroes the discarded-items count without touching the stored groups.
     */
    public void reset() {
        numDiscardedItems = 0;
    }

    /**
     * Computes a new size for every group by removing items round-robin, one per group per pass,
     * until the total reaches targetSize or every group is down to a single item, then shrinks
     * each group to its computed size.
     */
    private void levelGroups() {
        int totalSize = 0;
        int[] groupSizes = new int[groups.size()];
        int currentGroupIndex = 0;

        for ( T group : groups ) {
            groupSizes[currentGroupIndex] = group.size();
            totalSize += groupSizes[currentGroupIndex];
            currentGroupIndex++;
        }

        if ( totalSize <= targetSize ) {
            return;    // no need to eliminate any items
        }

        // We will try to remove exactly this many items, however we will refuse to allow any
        // one group to fall below size 1, and so might end up removing fewer items than this
        int numItemsToRemove = totalSize - targetSize;

        currentGroupIndex = 0;
        int numConsecutiveUnmodifiableGroups = 0;

        // Continue until we've either removed all the items we wanted to, or we can't
        // remove any more items without violating the constraint that all groups must
        // be left with at least one item
        while ( numItemsToRemove > 0 && numConsecutiveUnmodifiableGroups < groupSizes.length ) {
            if ( groupSizes[currentGroupIndex] > 1 ) {
                groupSizes[currentGroupIndex]--;
                numItemsToRemove--;
                numConsecutiveUnmodifiableGroups = 0;
            }
            else {
                numConsecutiveUnmodifiableGroups++;
            }

            currentGroupIndex = (currentGroupIndex + 1) % groupSizes.length;
        }

        // Now we actually go through and reduce each group to its new count as specified in groupSizes
        currentGroupIndex = 0;
        for ( T group : groups ) {
            downsampleOneGroup(group, groupSizes[currentGroupIndex]);
            currentGroupIndex++;
        }
    }

    /**
     * Shrinks one group in place to numItemsToKeep items, keeping a random selection of its
     * items in their original relative order, and records the number of items removed.
     *
     * @param group the group to shrink
     * @param numItemsToKeep the number of items to retain; no-op if >= the group's current size
     */
    private void downsampleOneGroup( T group, int numItemsToKeep ) {
        if ( numItemsToKeep >= group.size() ) {
            return;
        }

        numDiscardedItems += group.size() - numItemsToKeep;

        BitSet itemsToKeep = new BitSet(group.size());
        for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(group.size(), numItemsToKeep) ) {
            itemsToKeep.set(selectedIndex);
        }

        int currentIndex = 0;

        // If our group is a linked list, we can remove the desired items in a single O(n) pass with an iterator
        if ( group instanceof LinkedList ) {
            Iterator<E> iter = group.iterator();
            while ( iter.hasNext() ) {
                iter.next();

                if ( ! itemsToKeep.get(currentIndex) ) {
                    iter.remove();
                }

                currentIndex++;
            }
        }
        // If it's not a linked list, it's more efficient to copy the desired items into a new list and back rather
        // than suffer O(n^2) of item shifting
        else {
            List<E> keptItems = new ArrayList<E>(numItemsToKeep);

            for ( E item : group ) {
                if ( itemsToKeep.get(currentIndex) ) {
                    keptItems.add(item);
                }
                currentIndex++;
            }
            group.clear();
            group.addAll(keptItems);
        }
    }
}

View File

@ -0,0 +1,202 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMRecordComparator;
import net.sf.samtools.SAMRecordCoordinateComparator;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import java.util.*;
/**
* StingSAMIterator wrapper around our generic reads downsampler interface
* that downsamples reads for each sample independently, and then re-assembles
* the reads back into a single merged stream.
*
* @author David Roazen
*/
public class PerSampleDownsamplingReadsIterator implements StingSAMIterator {

    private StingSAMIterator nestedSAMIterator;
    private ReadsDownsamplerFactory<SAMRecord> downsamplerFactory;

    // One downsampler per sample name (reads without a read group share the null key)
    private Map<String, ReadsDownsampler<SAMRecord>> perSampleDownsamplers;

    // Finalized reads from all samples, ordered by readComparator
    private PriorityQueue<SAMRecord> orderedDownsampledReadsCache;

    // The read that the next call to next() will return, or null if there are no more reads
    private SAMRecord nextRead = null;
    private SAMRecordComparator readComparator = new SAMRecordCoordinateComparator();

    // Earliest read still pending in any downsampler, and the downsampler holding it.
    // No finalized read that sorts after this one may be released.
    private SAMRecord earliestPendingRead = null;
    private ReadsDownsampler<SAMRecord> earliestPendingDownsampler = null;

    // Initial size of our cache of finalized reads
    private static final int DOWNSAMPLED_READS_INITIAL_CACHE_SIZE = 4096;

    // The number of positional changes that can occur in the read stream before all downsamplers
    // should be informed of the current position (guards against samples with relatively sparse reads
    // getting stuck in a pending state):
    private static final int DOWNSAMPLER_POSITIONAL_UPDATE_INTERVAL = 3;   // TODO: experiment with this value

    /**
     * @param iter wrapped iterator from which this iterator will pull reads
     * @param downsamplerFactory factory used to create new downsamplers as needed
     */
    public PerSampleDownsamplingReadsIterator( StingSAMIterator iter, ReadsDownsamplerFactory<SAMRecord> downsamplerFactory ) {
        nestedSAMIterator = iter;
        this.downsamplerFactory = downsamplerFactory;
        perSampleDownsamplers = new HashMap<String, ReadsDownsampler<SAMRecord>>();
        orderedDownsampledReadsCache = new PriorityQueue<SAMRecord>(DOWNSAMPLED_READS_INITIAL_CACHE_SIZE, readComparator);

        advanceToNextRead();
    }

    /**
     * @return true if a further downsampled read is available
     */
    public boolean hasNext() {
        return nextRead != null;
    }

    /**
     * @return the next downsampled read
     * @throws NoSuchElementException if there are no more reads
     */
    public SAMRecord next() {
        if ( nextRead == null ) {
            throw new NoSuchElementException("next() called when there are no more items");
        }

        SAMRecord toReturn = nextRead;
        advanceToNextRead();

        return toReturn;
    }

    /**
     * Sets nextRead to the next releasable read, pulling more reads from the nested iterator
     * if nothing can currently be released; sets it to null when no reads remain.
     */
    private void advanceToNextRead() {
        if ( ! readyToReleaseReads() && ! fillDownsampledReadsCache() ) {
            nextRead = null;
        }
        else {
            nextRead = orderedDownsampledReadsCache.poll();
        }
    }

    /**
     * @return true if the read at the head of the finalized-reads cache does not sort after the
     *         earliest pending read (or there is no pending read), i.e. it can be released now
     */
    private boolean readyToReleaseReads() {
        if ( orderedDownsampledReadsCache.isEmpty() ) {
            return false;
        }

        return earliestPendingRead == null ||
               readComparator.compare(orderedDownsampledReadsCache.peek(), earliestPendingRead) <= 0;
    }

    /**
     * Moves any finalized reads from the given downsampler into the ordered cache, then refreshes
     * the record of the earliest pending read. Called after every operation that can change a
     * downsampler's state, so that finalized reads become releasable as soon as they exist.
     *
     * @param currentDownsampler the downsampler whose state may have just changed
     */
    private void processFinalizedAndPendingItems( ReadsDownsampler<SAMRecord> currentDownsampler ) {
        if ( currentDownsampler.hasFinalizedItems() ) {
            orderedDownsampledReadsCache.addAll(currentDownsampler.consumeFinalizedItems());
        }
        updateEarliestPendingRead(currentDownsampler);
    }

    /**
     * Updates earliestPendingRead / earliestPendingDownsampler after the given downsampler's
     * pending reads may have changed.
     *
     * @param currentDownsampler the downsampler whose pending reads may have just changed
     */
    private void updateEarliestPendingRead( ReadsDownsampler<SAMRecord> currentDownsampler ) {
        // If there is no recorded earliest pending read and this downsampler has pending items,
        // then this downsampler's first pending item becomes the new earliest pending read:
        if ( earliestPendingRead == null && currentDownsampler.hasPendingItems() ) {
            earliestPendingRead = currentDownsampler.peekPending();
            earliestPendingDownsampler = currentDownsampler;
        }
        // In all other cases, we only need to update the earliest pending read when the downsampler
        // associated with it experiences a change in its pending reads, since by assuming a sorted
        // read stream we're assured that each downsampler's earliest pending read will only increase
        // in genomic position over time.
        //
        // TODO: An occasional O(samples) linear search seems like a better option than keeping the downsamplers
        // TODO: sorted by earliest pending read, which would cost at least O(total_reads * (samples + log(samples))),
        // TODO: but need to verify this empirically.
        else if ( currentDownsampler == earliestPendingDownsampler &&
                  (! currentDownsampler.hasPendingItems() || readComparator.compare(currentDownsampler.peekPending(), earliestPendingRead) != 0) ) {

            earliestPendingRead = null;
            earliestPendingDownsampler = null;
            for ( ReadsDownsampler<SAMRecord> perSampleDownsampler : perSampleDownsamplers.values() ) {
                if ( perSampleDownsampler.hasPendingItems() &&
                     (earliestPendingRead == null || readComparator.compare(perSampleDownsampler.peekPending(), earliestPendingRead) < 0) ) {

                    earliestPendingRead = perSampleDownsampler.peekPending();
                    earliestPendingDownsampler = perSampleDownsampler;
                }
            }
        }
    }

    /**
     * Feeds reads from the nested iterator to the per-sample downsamplers until a finalized read
     * can be released in sorted order or the input is exhausted.
     *
     * @return true if a read is ready to be released afterwards
     */
    private boolean fillDownsampledReadsCache() {
        SAMRecord prevRead = null;
        int numPositionalChanges = 0;

        // Continue submitting reads to the per-sample downsamplers until the read at the top of the priority queue
        // can be released without violating global sort order
        while ( nestedSAMIterator.hasNext() && ! readyToReleaseReads() ) {
            SAMRecord read = nestedSAMIterator.next();
            String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null;

            ReadsDownsampler<SAMRecord> thisSampleDownsampler = perSampleDownsamplers.get(sampleName);
            if ( thisSampleDownsampler == null ) {
                thisSampleDownsampler = downsamplerFactory.newInstance();
                perSampleDownsamplers.put(sampleName, thisSampleDownsampler);
            }

            thisSampleDownsampler.submit(read);
            // Drain finalized reads right away: the loop condition inspects the cache, so leaving
            // them inside the downsampler until after the loop would force the loop to run to
            // end-of-input whenever the cache starts out empty.
            processFinalizedAndPendingItems(thisSampleDownsampler);

            if ( prevRead != null && prevRead.getAlignmentStart() != read.getAlignmentStart() ) {
                numPositionalChanges++;
            }

            // Once the number of positional changes exceeds the threshold, inform all downsamplers
            // of the current position in the read stream. This is to prevent downsamplers for
            // samples with sparser reads than others from getting stuck too long in a pending
            // state. The counter is then restarted so the broadcast happens periodically rather
            // than on every subsequent read.
            if ( numPositionalChanges > DOWNSAMPLER_POSITIONAL_UPDATE_INTERVAL ) {
                for ( ReadsDownsampler<SAMRecord> perSampleDownsampler : perSampleDownsamplers.values() ) {
                    perSampleDownsampler.signalNoMoreReadsBefore(read);
                    processFinalizedAndPendingItems(perSampleDownsampler);
                }
                numPositionalChanges = 0;
            }

            prevRead = read;
        }

        if ( ! nestedSAMIterator.hasNext() ) {
            for ( ReadsDownsampler<SAMRecord> perSampleDownsampler : perSampleDownsamplers.values() ) {
                perSampleDownsampler.signalEndOfInput();
            }
            earliestPendingRead = null;
            earliestPendingDownsampler = null;
        }

        // Collect anything finalized by the end-of-input signal (or otherwise still undrained)
        for ( ReadsDownsampler<SAMRecord> perSampleDownsampler : perSampleDownsamplers.values() ) {
            if ( perSampleDownsampler.hasFinalizedItems() ) {
                orderedDownsampledReadsCache.addAll(perSampleDownsampler.consumeFinalizedItems());
            }
        }

        return readyToReleaseReads();
    }

    /**
     * Not supported.
     *
     * @throws UnsupportedOperationException always
     */
    public void remove() {
        throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!");
    }

    /**
     * Closes the wrapped iterator.
     */
    public void close() {
        nestedSAMIterator.close();
    }

    /**
     * @return this iterator
     */
    public Iterator<SAMRecord> iterator() {
        return this;
    }
}

View File

@ -1,259 +0,0 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.*;
/**
* Positional Downsampler: When eliminating reads, try to do so evenly based on the alignment start positions
*
* @author David Roazen
*/
public class PositionalDownsampler<T extends SAMRecord> implements ReadsDownsampler<T> {

    /** Coverage target for overlapping pending reads; also used as the size of the per-position reservoir. */
    private final int targetCoverage;

    /** Collects (and caps at targetCoverage) the reads submitted at the current alignment start. */
    private ReservoirDownsampler<T> reservoir;

    /** Contig index of the most recently submitted read. */
    private int currentContigIndex;

    /** Alignment start of the most recently submitted read. */
    private int currentAlignmentStart;

    /** One grouping per past alignment start, oldest first, whose reads have not all been released yet. */
    private LinkedList<PositionalReadGrouping> pendingReads;

    /** Reads that are ready to be handed to the caller. */
    private ArrayList<T> finalizedReads;

    /**
     * Construct a PositionalDownsampler
     *
     * @param targetCoverage coverage target used when downsampling overlapping groups of reads;
     *                       also passed to the internal ReservoirDownsampler as its sample size
     */
    public PositionalDownsampler ( int targetCoverage ) {
        this.targetCoverage = targetCoverage;
        clear();
    }

    /**
     * Submit one read. When the read lies on a different contig or at a later alignment start
     * than the previous read, the reads gathered at the previous position are first moved out of
     * the reservoir into a new pending group and the pending groups are downsampled.
     *
     * @param newRead the read to add
     */
    public void submit ( T newRead ) {
        if ( readIsPastCurrentPosition(newRead) ) {
            updateAndDownsamplePendingReads();
        }

        reservoir.submit(newRead);
        updateCurrentPosition(newRead);
    }

    /**
     * Submit every read of the collection, one at a time, in iteration order.
     *
     * @param newReads the reads to add
     */
    public void submit ( Collection<T> newReads ) {
        for ( T read : newReads ) {
            submit(read);
        }
    }

    /**
     * @return true if at least one finalized read is waiting to be consumed
     */
    public boolean hasDownsampledItems() {
        return ! finalizedReads.isEmpty();
    }

    /**
     * Hand over all finalized reads and start a fresh, empty list of finalized reads.
     *
     * @return the list of finalized reads accumulated so far (the internal list itself, not a copy)
     */
    public List<T> consumeDownsampledItems() {
        List<T> toReturn = finalizedReads;
        finalizedReads = new ArrayList<T>();
        return toReturn;
    }

    /**
     * @return true if any read group is still queued, or if reads for the current position are
     *         still buffered in the reservoir
     */
    public boolean hasPendingItems() {
        // Reads sitting in the reservoir have been accepted but not yet finalized, so they are
        // pending as well; checking only the queue of groups would overlook them.
        return ! pendingReads.isEmpty() || reservoir.hasDownsampledItems();
    }

    /**
     * Flush everything: the reservoir contents become a final pending group, the pending groups
     * are downsampled one last time, and all surviving reads are moved to the finalized list.
     */
    public void signalEndOfInput() {
        updateAndDownsamplePendingReads();

        for ( PositionalReadGrouping group : pendingReads ) {
            group.finalizeAllActiveReads();
            finalizedReads.addAll(group.getFinalizedReads());
        }

        pendingReads.clear();
    }

    /**
     * Discard all buffered reads by replacing the reservoir, the pending queue and the finalized
     * list with new empty instances. The remembered contig/alignment start is left untouched.
     */
    public void clear() {
        reservoir = new ReservoirDownsampler<T>(targetCoverage);
        pendingReads = new LinkedList<PositionalReadGrouping>();
        finalizedReads = new ArrayList<T>();
    }

    /**
     * @return always true
     */
    public boolean requiresCoordinateSortOrder() {
        return true;
    }

    /** Remember the contig and alignment start of the given read as the current position. */
    private void updateCurrentPosition ( T read ) {
        currentContigIndex = read.getReferenceIndex();
        currentAlignmentStart = read.getAlignmentStart();
    }

    /** @return true if the read is on a different contig, or starts after the current alignment start */
    private boolean readIsPastCurrentPosition ( T read ) {
        return read.getReferenceIndex() != currentContigIndex || read.getAlignmentStart() > currentAlignmentStart;
    }

    /**
     * Close out the current position: release reads that are out of scope, wrap the reservoir
     * contents in a new pending group tagged with the current position, and downsample the
     * pending groups against the coverage target.
     */
    private void updateAndDownsamplePendingReads() {
        finalizeOutOfScopeReads();

        List<T> oldLocusReads = reservoir.consumeDownsampledItems();
        pendingReads.add(new PositionalReadGrouping(oldLocusReads, currentContigIndex, currentAlignmentStart));

        downsampleOverlappingGroups();
    }

    /**
     * Finalize, within every pending group, the reads that end before the current position (or
     * all of them if the group is on another contig), then move fully finalized groups from the
     * front of the queue into the finalized list.
     */
    private void finalizeOutOfScopeReads() {
        Iterator<PositionalReadGrouping> iter = pendingReads.iterator();
        boolean noPrecedingUnfinalizedGroups = true;

        while ( iter.hasNext() ) {
            PositionalReadGrouping currentGroup = iter.next();
            currentGroup.finalizeActiveReadsBeforePosition(currentContigIndex, currentAlignmentStart);

            // A finalized group is only released while no earlier group still has active reads,
            // so groups always leave the queue from the front.
            if ( currentGroup.isFinalized() && noPrecedingUnfinalizedGroups ) {
                iter.remove();
                finalizedReads.addAll(currentGroup.getFinalizedReads());
            }
            else {
                noPrecedingUnfinalizedGroups = false;
            }
        }
    }

    /**
     * If the active reads across all pending groups exceed the coverage target, decide how many
     * reads each group may keep and randomly thin each group down to that number.
     */
    private void downsampleOverlappingGroups() {
        int[] groupReadCounts = new int[pendingReads.size()];
        int totalCoverage = 0;
        int numActiveGroups = 0;
        int currentGroup = 0;

        // Tally the active reads per group and overall
        for ( PositionalReadGrouping group : pendingReads ) {
            groupReadCounts[currentGroup] = group.numActiveReads();
            totalCoverage += groupReadCounts[currentGroup];

            if ( groupReadCounts[currentGroup] > 0 ) {
                numActiveGroups++;
            }

            currentGroup++;
        }

        if ( totalCoverage <= targetCoverage ) {
            return;
        }

        // Never remove so many reads that an active group would be left with none: the budget is
        // capped so that each active group can retain one read.
        int numReadsToRemove = Math.min(totalCoverage - targetCoverage, totalCoverage - numActiveGroups);
        currentGroup = 0;

        // Take the removals one read at a time, cycling through the groups and skipping any
        // group that is already down to a single read (or has none).
        while ( numReadsToRemove > 0 ) {
            if ( groupReadCounts[currentGroup] > 1 ) {
                groupReadCounts[currentGroup]--;
                numReadsToRemove--;
            }

            currentGroup = (currentGroup + 1) % groupReadCounts.length;
        }

        // Apply the per-group quotas
        currentGroup = 0;
        for ( PositionalReadGrouping group : pendingReads ) {
            if ( ! group.isFinalized() ) {
                group.downsampleActiveReads(groupReadCounts[currentGroup]);
            }

            currentGroup++;
        }
    }

    /**
     * The reads that were gathered at one contig/alignment start, split into reads that may still
     * be removed by downsampling ("active") and reads that no longer can be ("finalized").
     */
    private class PositionalReadGrouping {
        private List<T> activeReads;
        private List<T> finalizedReads;

        private int contig;
        private int alignmentStart;

        /**
         * @param reads          reads of this group; copied into an internal list, all initially active
         * @param contig         contig index recorded for this group
         * @param alignmentStart alignment start recorded for this group
         */
        public PositionalReadGrouping( Collection<T> reads, int contig, int alignmentStart ) {
            activeReads = new LinkedList<T>(reads);
            finalizedReads = new ArrayList<T>();
            this.contig = contig;
            this.alignmentStart = alignmentStart;
        }

        /** @return the number of reads that are still active */
        public int numActiveReads() {
            return activeReads.size();
        }

        /** @return true if no active reads remain (this also holds for a group created empty) */
        public boolean isFinalized() {
            return activeReads.isEmpty();
        }

        /** @return the internal list of finalized reads (not a copy) */
        public List<T> getFinalizedReads() {
            return finalizedReads;
        }

        /**
         * Finalize every active read whose alignment end is before the given position. If the
         * given contig differs from this group's contig, all active reads are finalized.
         *
         * @param contig   contig index of the position
         * @param position position on that contig
         */
        public void finalizeActiveReadsBeforePosition( int contig, int position ) {
            if ( this.contig != contig ) {
                finalizeAllActiveReads();
                return;
            }

            Iterator<T> iter = activeReads.iterator();

            while ( iter.hasNext() ) {
                T read = iter.next();

                if ( read.getAlignmentEnd() < position ) {
                    iter.remove();
                    finalizedReads.add(read);
                }
            }
        }

        /** Move every active read to the finalized list. */
        public void finalizeAllActiveReads() {
            finalizedReads.addAll(activeReads);
            activeReads.clear();
        }

        /**
         * Randomly keep the requested number of active reads and drop the others.
         *
         * @param numReadsToKeep number of active reads to retain
         * @throws ReviewedStingException if numReadsToKeep is negative or exceeds the number of active reads
         */
        public void downsampleActiveReads( int numReadsToKeep ) {
            if ( numReadsToKeep > activeReads.size() || numReadsToKeep < 0 ) {
                throw new ReviewedStingException(String.format("Cannot retain %d reads out of %d total reads",
                                                               numReadsToKeep, activeReads.size()));
            }

            // Mark the indices of the reads that survive
            BitSet itemsToKeep = new BitSet(activeReads.size());
            for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(activeReads.size(), numReadsToKeep) ) {
                itemsToKeep.set(selectedIndex);
            }

            // Single pass over the list, removing every read whose index was not selected
            int currentIndex = 0;
            Iterator<T> iter = activeReads.iterator();

            while ( iter.hasNext() ) {
                iter.next();

                if ( ! itemsToKeep.get(currentIndex) ) {
                    iter.remove();
                }

                currentIndex++;
            }
        }
    }
}

View File

@ -33,8 +33,23 @@ import net.sf.samtools.SAMRecord;
*/
public interface ReadsDownsampler<T extends SAMRecord> extends Downsampler<T> {
/*
/**
* Does this downsampler require that reads be fed to it in coordinate order?
*
* @return true if reads must be submitted to this downsampler in coordinate order, otherwise false
*/
public boolean requiresCoordinateSortOrder();
/**
* Tell this downsampler that no more reads located before the provided read (according to
* the sort order of the read stream) will be fed to it.
*
* Allows position-aware downsamplers to finalize pending reads earlier than they would
* otherwise be able to, particularly when doing per-sample downsampling and reads for
* certain samples are sparser than average.
*
* @param read the downsampler will assume that no reads located before this read will ever
* be submitted to it in the future
*/
public void signalNoMoreReadsBefore( T read );
}

View File

@ -0,0 +1,37 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMRecord;
/**
* A ReadsDownsamplerFactory can be used to create an arbitrary number of instances of a particular
* downsampler, all sharing the same construction parameters.
*
* @author David Roazen
*/
public interface ReadsDownsamplerFactory<T extends SAMRecord> {

    /**
     * Create a downsampler.
     *
     * @return a ReadsDownsampler instance produced by this factory
     */
    ReadsDownsampler<T> newInstance();
}

View File

@ -48,6 +48,14 @@ public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampl
private int totalReadsSeen;
private int numDiscardedItems;
/**
* Construct a ReservoirDownsampler
*
* @param targetSampleSize Size of the reservoir used by this downsampler. Number of items retained
* after downsampling will be min(totalReads, targetSampleSize)
*/
public ReservoirDownsampler ( int targetSampleSize ) {
if ( targetSampleSize <= 0 ) {
throw new ReviewedStingException("Cannot do reservoir downsampling with a sample size <= 0");
@ -55,6 +63,7 @@ public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampl
this.targetSampleSize = targetSampleSize;
clear();
reset();
}
public void submit ( T newRead ) {
@ -68,6 +77,7 @@ public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampl
if ( randomSlot < targetSampleSize ) {
reservoir.set(randomSlot, newRead);
}
numDiscardedItems++;
}
}
@ -77,11 +87,12 @@ public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampl
}
}
public boolean hasDownsampledItems() {
public boolean hasFinalizedItems() {
return reservoir.size() > 0;
}
public List<T> consumeDownsampledItems() {
public List<T> consumeFinalizedItems() {
// pass by reference rather than make a copy, for speed
List<T> downsampledItems = reservoir;
clear();
return downsampledItems;
@ -91,16 +102,36 @@ public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampl
return false;
}
/**
 * Look at the first read held in the reservoir without removing it.
 *
 * @return the read at index 0 of the reservoir, or null if the reservoir is empty
 */
public T peekFinalized() {
    if ( reservoir.isEmpty() ) {
        return null;
    }
    return reservoir.get(0);
}
/**
 * @return always null
 */
public T peekPending() {
    return null;
}
/**
 * @return the current value of the discarded-items counter
 */
public int getNumberOfDiscardedItems() {
    return this.numDiscardedItems;
}
/**
 * Does nothing in this downsampler.
 */
public void signalEndOfInput() {
    // intentionally empty
}
public void clear() {
reservoir = new ArrayList<T>(targetSampleSize);
totalReadsSeen = 0;
totalReadsSeen = 0; // an internal stat used by the downsampling process, so not cleared by reset() below
}
/**
 * Set the discarded-items counter back to zero.
 */
public void reset() {
    this.numDiscardedItems = 0;
}
/**
 * @return always false
 */
public boolean requiresCoordinateSortOrder() {
    return false;
}
/**
 * Does nothing in this downsampler.
 *
 * @param read ignored
 */
public void signalNoMoreReadsBefore( T read ) {
    // intentionally empty
}
}

View File

@ -0,0 +1,45 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMRecord;
/**
* Factory for creating ReservoirDownsamplers on demand
*
* @author David Roazen
*/
public class ReservoirDownsamplerFactory<T extends SAMRecord> implements ReadsDownsamplerFactory<T> {

    /** Sample size handed to every ReservoirDownsampler this factory creates. */
    private final int targetSampleSize;

    /**
     * @param targetSampleSize sample size to pass to each ReservoirDownsampler created by newInstance()
     */
    public ReservoirDownsamplerFactory( int targetSampleSize ) {
        this.targetSampleSize = targetSampleSize;
    }

    /**
     * @return a new ReservoirDownsampler constructed with this factory's target sample size
     */
    public ReadsDownsampler<T> newInstance() {
        final ReadsDownsampler<T> created = new ReservoirDownsampler<T>(targetSampleSize);
        return created;
    }
}

View File

@ -0,0 +1,169 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMRecord;
import java.util.*;
/**
* Simple Positional Downsampler: Downsample each stack of reads at each alignment start to a size <= a target coverage
* using a Reservoir downsampler. Stores only O(target coverage) reads in memory at any given time.
*
* @author David Roazen
*/
public class SimplePositionalDownsampler<T extends SAMRecord> implements ReadsDownsampler<T> {

    private final int targetCoverage;

    /** Holds the reads seen at the current position until the position changes. */
    private final ReservoirDownsampler<T> reservoir;

    /** Contig index / alignment start of the position currently being accumulated. */
    private int currentContigIndex;
    private int currentAlignmentStart;

    /** False until a first position has been recorded since construction or the last clear(). */
    private boolean positionEstablished;

    /** Set once a read carrying the unmapped flag has advanced the position. */
    private boolean unmappedReadsReached;

    /** Reads released from the reservoir (or passed straight through) and ready to be consumed. */
    private ArrayList<T> finalizedReads;

    /** Running total of reads the reservoir reported as discarded. */
    private int numDiscardedItems;

    /**
     * Construct a SimplePositionalDownsampler
     *
     * @param targetCoverage Maximum number of reads that may share any given alignment start position
     */
    public SimplePositionalDownsampler( int targetCoverage ) {
        this.targetCoverage = targetCoverage;
        this.reservoir = new ReservoirDownsampler<T>(targetCoverage);
        this.finalizedReads = new ArrayList<T>();
        clear();
        reset();
    }

    /**
     * Submit one read. The positional state is updated first; the read is then either sent
     * through the reservoir or, once the unmapped reads have been reached, finalized directly.
     *
     * @param newRead the read to add
     */
    public void submit( T newRead ) {
        updatePositionalState(newRead);

        if ( ! unmappedReadsReached ) {
            // Track how many reads this single submission caused the reservoir to discard
            final int discardedBefore = reservoir.getNumberOfDiscardedItems();
            reservoir.submit(newRead);
            numDiscardedItems += reservoir.getNumberOfDiscardedItems() - discardedBefore;
            return;
        }

        // don't downsample the unmapped reads at the end of the stream
        finalizedReads.add(newRead);
    }

    /**
     * Submit each read of the collection individually, in iteration order.
     *
     * @param newReads the reads to add
     */
    public void submit( Collection<T> newReads ) {
        for ( T each : newReads ) {
            submit(each);
        }
    }

    /**
     * @return true if at least one finalized read is available
     */
    public boolean hasFinalizedItems() {
        return ! finalizedReads.isEmpty();
    }

    /**
     * Hand over the finalized reads and begin a new, empty finalized list.
     *
     * @return the finalized reads accumulated so far
     */
    public List<T> consumeFinalizedItems() {
        // the internal list itself is returned (no copy is made), for speed
        final List<T> handedOff = finalizedReads;
        finalizedReads = new ArrayList<T>();
        return handedOff;
    }

    /**
     * @return true if the reservoir currently holds any reads
     */
    public boolean hasPendingItems() {
        return reservoir.hasFinalizedItems();
    }

    /**
     * @return the first finalized read without removing it, or null if there is none
     */
    public T peekFinalized() {
        if ( finalizedReads.isEmpty() ) {
            return null;
        }
        return finalizedReads.get(0);
    }

    /**
     * @return whatever the reservoir reports as its first held read
     */
    public T peekPending() {
        return reservoir.peekFinalized();
    }

    /**
     * @return the number of discarded reads counted since the last reset()
     */
    public int getNumberOfDiscardedItems() {
        return numDiscardedItems;
    }

    /**
     * Move whatever the reservoir still holds into the finalized list.
     */
    public void signalEndOfInput() {
        finalizeReservoir();
    }

    /**
     * Empty the reservoir and the finalized list and forget the positional state.
     * The discarded-read counter of this object is not touched here; see reset().
     */
    public void clear() {
        positionEstablished = false;
        unmappedReadsReached = false;
        reservoir.clear();
        reservoir.reset();
        finalizedReads.clear();
    }

    /**
     * Zero the discarded-read counter.
     */
    public void reset() {
        numDiscardedItems = 0;
    }

    /**
     * @return always true
     */
    public boolean requiresCoordinateSortOrder() {
        return true;
    }

    /**
     * Advance the positional state as if the given read had arrived, without storing the read.
     *
     * @param read read whose position is used to update the positional state
     */
    public void signalNoMoreReadsBefore( T read ) {
        updatePositionalState(read);
    }

    /**
     * When the read lies past the current position: flush a non-empty reservoir to the finalized
     * list, adopt the read's position, and note whether the unmapped reads have been reached.
     */
    private void updatePositionalState( T newRead ) {
        if ( ! readIsPastCurrentPosition(newRead) ) {
            return;
        }

        if ( reservoir.hasFinalizedItems() ) {
            finalizeReservoir();
        }

        setCurrentPosition(newRead);

        if ( newRead.getReadUnmappedFlag() ) {
            unmappedReadsReached = true;
        }
    }

    /** Record the read's contig index and alignment start as the current position. */
    private void setCurrentPosition( T read ) {
        positionEstablished = true;
        currentContigIndex = read.getReferenceIndex();
        currentAlignmentStart = read.getAlignmentStart();
    }

    /**
     * @return true if no position has been recorded yet, if the read has a higher contig index or
     *         a later alignment start than the current position, or if it is the first read seen
     *         with the unmapped flag set
     */
    private boolean readIsPastCurrentPosition( T read ) {
        if ( ! positionEstablished ) {
            return true;
        }
        if ( read.getReferenceIndex() > currentContigIndex ) {
            return true;
        }
        if ( read.getAlignmentStart() > currentAlignmentStart ) {
            return true;
        }
        return read.getReadUnmappedFlag() && ! unmappedReadsReached;
    }

    /** Drain the reservoir into the finalized list and zero the reservoir's discard counter. */
    private void finalizeReservoir() {
        final List<T> drained = reservoir.consumeFinalizedItems();
        finalizedReads.addAll(drained);
        reservoir.reset();
    }
}

View File

@ -0,0 +1,45 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMRecord;
/**
* Factory for creating SimplePositionalDownsamplers on demand
*
* @author David Roazen
*/
public class SimplePositionalDownsamplerFactory<T extends SAMRecord> implements ReadsDownsamplerFactory<T> {

    /** Target coverage handed to every SimplePositionalDownsampler this factory creates. */
    private final int targetCoverage;

    /**
     * @param targetCoverage target coverage to pass to each SimplePositionalDownsampler created by newInstance()
     */
    public SimplePositionalDownsamplerFactory( int targetCoverage ) {
        this.targetCoverage = targetCoverage;
    }

    /**
     * @return a new SimplePositionalDownsampler constructed with this factory's target coverage
     */
    public ReadsDownsampler<T> newInstance() {
        final ReadsDownsampler<T> created = new SimplePositionalDownsampler<T>(targetCoverage);
        return created;
    }
}

View File

@ -8,6 +8,7 @@ import org.broadinstitute.sting.gatk.datasources.reads.Shard;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.io.OutputTracker;
import org.broadinstitute.sting.gatk.io.ThreadLocalOutputTracker;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@ -76,21 +77,21 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar
/**
* Create a new hierarchical microscheduler to process the given reads and reference.
*
* @param walker the walker used to process the dataset.
* @param reads Reads file(s) to process.
* @param reference Reference for driving the traversal.
* @param nThreadsToUse maximum number of threads to use to do the work
* @param walker the walker used to process the dataset.
* @param reads Reads file(s) to process.
* @param reference Reference for driving the traversal.
* @param threadAllocation How should we apply multi-threaded execution?
*/
protected HierarchicalMicroScheduler(final GenomeAnalysisEngine engine,
final Walker walker,
final SAMDataSource reads,
final IndexedFastaSequenceFile reference,
final Collection<ReferenceOrderedDataSource> rods,
final int nThreadsToUse,
final boolean monitorThreadPerformance ) {
super(engine, walker, reads, reference, rods, nThreadsToUse);
final ThreadAllocation threadAllocation) {
super(engine, walker, reads, reference, rods, threadAllocation);
if ( monitorThreadPerformance ) {
final int nThreadsToUse = threadAllocation.getNumDataThreads();
if ( threadAllocation.monitorThreadEfficiency() ) {
final EfficiencyMonitoringThreadFactory monitoringThreadFactory = new EfficiencyMonitoringThreadFactory(nThreadsToUse);
setThreadEfficiencyMonitor(monitoringThreadFactory);
this.threadPool = Executors.newFixedThreadPool(nThreadsToUse, monitoringThreadFactory);

View File

@ -10,6 +10,7 @@ import org.broadinstitute.sting.gatk.datasources.reads.Shard;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.io.DirectOutputTracker;
import org.broadinstitute.sting.gatk.io.OutputTracker;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.gatk.traversals.TraverseActiveRegions;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.SampleUtils;
@ -39,13 +40,11 @@ public class LinearMicroScheduler extends MicroScheduler {
final SAMDataSource reads,
final IndexedFastaSequenceFile reference,
final Collection<ReferenceOrderedDataSource> rods,
final int numThreads, // may be > 1 if are nanoScheduling
final boolean monitorThreadPerformance ) {
super(engine, walker, reads, reference, rods, numThreads);
final ThreadAllocation threadAllocation) {
super(engine, walker, reads, reference, rods, threadAllocation);
if ( monitorThreadPerformance )
if ( threadAllocation.monitorThreadEfficiency() )
setThreadEfficiencyMonitor(new ThreadEfficiencyMonitor());
}
/**
@ -60,11 +59,12 @@ public class LinearMicroScheduler extends MicroScheduler {
boolean done = walker.isDone();
int counter = 0;
traversalEngine.startTimersIfNecessary();
for (Shard shard : shardStrategy ) {
if ( done || shard == null ) // we ran out of shards that aren't owned
break;
traversalEngine.startTimersIfNecessary();
if(shard.getShardType() == Shard.ShardType.LOCUS) {
WindowMaker windowMaker = new WindowMaker(shard, engine.getGenomeLocParser(),
getReadIterator(shard), shard.getGenomeLocs(), SampleUtils.getSAMFileSamples(engine));

View File

@ -59,6 +59,8 @@ import java.util.Collection;
/** Shards and schedules data in manageable chunks. */
public abstract class MicroScheduler implements MicroSchedulerMBean {
// TODO -- remove me and retire non nano scheduled versions of traversals
private final static boolean USE_NANOSCHEDULER_FOR_EVERYTHING = true;
protected static final Logger logger = Logger.getLogger(MicroScheduler.class);
/**
@ -100,27 +102,36 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
* @return The best-fit microscheduler.
*/
public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection<ReferenceOrderedDataSource> rods, ThreadAllocation threadAllocation) {
if (threadAllocation.getNumCPUThreads() > 1) {
if ( threadAllocation.isRunningInParallelMode() ) {
// TODO -- remove me when we fix running NCT within HMS
if ( threadAllocation.getNumDataThreads() > 1 && threadAllocation.getNumCPUThreadsPerDataThread() > 1)
throw new UserException("Currently the GATK does not support running CPU threads within data threads, " +
"please specify only one of NT and NCT");
logger.info(String.format("Running the GATK in parallel mode with %d CPU thread(s) for each of %d data thread(s)",
threadAllocation.getNumCPUThreadsPerDataThread(), threadAllocation.getNumDataThreads()));
}
if ( threadAllocation.getNumDataThreads() > 1 ) {
if (walker.isReduceByInterval())
throw new UserException.BadArgumentValue("nt", String.format("The analysis %s aggregates results by interval. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass())));
logger.info(String.format("Running the GATK in parallel mode with %d concurrent threads",threadAllocation.getNumCPUThreads()));
if ( walker instanceof ReadWalker ) {
if ( ! (walker instanceof ThreadSafeMapReduce) ) badNT(engine, walker);
return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads(), threadAllocation.monitorThreadEfficiency());
if ( ! (walker instanceof TreeReducible) ) {
throw badNT("nt", engine, walker);
} else {
// TODO -- update test for when nano scheduling only is an option
if ( ! (walker instanceof TreeReducible) ) badNT(engine, walker);
return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads(), threadAllocation.monitorThreadEfficiency());
return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation);
}
} else {
return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads(), threadAllocation.monitorThreadEfficiency());
if ( threadAllocation.getNumCPUThreadsPerDataThread() > 1 && ! (walker instanceof NanoSchedulable) )
throw badNT("nct", engine, walker);
return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation);
}
}
private static void badNT(final GenomeAnalysisEngine engine, final Walker walker) {
throw new UserException.BadArgumentValue("nt", String.format("The analysis %s currently does not support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass())));
/**
 * Raises the user error reported when a walker cannot be run with the requested kind of
 * parallelism.
 *
 * This method always throws and never returns normally. It is declared to return a
 * UserException so that call sites can be written as <code>throw badNT(...)</code>.
 *
 * @param parallelArg name of the command-line argument the walker does not support; it is
 *                    reported as the offending argument and repeated in the message text
 * @param engine      engine used to look up the display name of the walker
 * @param walker      the walker that cannot run in the requested mode
 * @return never returns
 * @throws UserException.BadArgumentValue always
 */
private static UserException badNT(final String parallelArg, final GenomeAnalysisEngine engine, final Walker walker) {
    // Blame the argument that was actually rejected instead of always naming "nt"
    throw new UserException.BadArgumentValue(parallelArg,
            String.format("The analysis %s currently does not support parallel execution with %s. " +
                    "Please run your analysis without the %s option.", engine.getWalkerName(walker.getClass()), parallelArg, parallelArg));
}
/**
@ -130,23 +141,27 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
* @param reads The reads.
* @param reference The reference.
* @param rods the rods to include in the traversal
* @param numThreads the number of threads we are using in the underlying traversal
* @param threadAllocation the allocation of threads to use in the underlying traversal
*/
protected MicroScheduler(final GenomeAnalysisEngine engine,
final Walker walker,
final SAMDataSource reads,
final IndexedFastaSequenceFile reference,
final Collection<ReferenceOrderedDataSource> rods,
final int numThreads) {
final ThreadAllocation threadAllocation) {
this.engine = engine;
this.reads = reads;
this.reference = reference;
this.rods = rods;
if (walker instanceof ReadWalker) {
traversalEngine = numThreads > 1 ? new TraverseReadsNano(numThreads) : new TraverseReads();
traversalEngine = USE_NANOSCHEDULER_FOR_EVERYTHING || threadAllocation.getNumCPUThreadsPerDataThread() > 1
? new TraverseReadsNano(threadAllocation.getNumCPUThreadsPerDataThread())
: new TraverseReads();
} else if (walker instanceof LocusWalker) {
traversalEngine = new TraverseLoci();
traversalEngine = USE_NANOSCHEDULER_FOR_EVERYTHING || threadAllocation.getNumCPUThreadsPerDataThread() > 1
? new TraverseLociNano(threadAllocation.getNumCPUThreadsPerDataThread())
: new TraverseLociLinear();
} else if (walker instanceof DuplicateWalker) {
traversalEngine = new TraverseDuplicates();
} else if (walker instanceof ReadPairWalker) {

View File

@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
import org.broadinstitute.sting.gatk.iterators.LocusIterator;
import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState;
import org.broadinstitute.sting.gatk.iterators.LocusIteratorByStateExperimental;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
@ -81,7 +82,13 @@ public class WindowMaker implements Iterable<WindowMaker.WindowMakerIterator>, I
public WindowMaker(Shard shard, GenomeLocParser genomeLocParser, StingSAMIterator iterator, List<GenomeLoc> intervals, Collection<String> sampleNames) {
this.sourceInfo = shard.getReadProperties();
this.readIterator = iterator;
this.sourceIterator = new PeekableIterator<AlignmentContext>(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser, sampleNames));
// Temporary: use the experimental version of LocusIteratorByState if experimental downsampling was requested:
this.sourceIterator = sourceInfo.getDownsamplingMethod().useExperimentalDownsampling ?
new PeekableIterator<AlignmentContext>(new LocusIteratorByStateExperimental(iterator,sourceInfo,genomeLocParser, sampleNames))
:
new PeekableIterator<AlignmentContext>(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser, sampleNames));
this.intervalIterator = intervals.size()>0 ? new PeekableIterator<GenomeLoc>(intervals.iterator()) : null;
}

View File

@ -29,6 +29,7 @@ import com.google.common.base.Function;
import com.google.common.collect.Collections2;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.classloader.PluginManager;
import org.broadinstitute.sting.utils.help.GATKDocUtils;
import java.util.Collection;
import java.util.List;
@ -68,16 +69,29 @@ public class FilterManager extends PluginManager<ReadFilter> {
@Override
protected String formatErrorMessage(String pluginCategory, String pluginName) {
List<Class<? extends ReadFilter>> availableFilters = this.getPluginsImplementing(ReadFilter.class);
Collection<String> availableFilterNames = Collections2.transform(availableFilters, new Function<Class<? extends ReadFilter>,String>(){
@Override
public String apply(final Class<? extends ReadFilter> input) {
return getName(input);
}
});
return String.format("Read filter %s not found. Available read filters:%n%s.%n%n%s",pluginName,
Utils.join(String.format(", "),availableFilterNames),
return String.format("Read filter %s not found. Available read filters:%n%n%s%n%n%s",pluginName,
userFriendlyListofReadFilters(availableFilters),
"Please consult the GATK Documentation (http://www.broadinstitute.org/gatk/gatkdocs/) for more information.");
}
/**
 * Render the given read filters as a two-column text table: a header row followed by one row
 * per filter containing the filter's name and the value returned by
 * GATKDocUtils.helpLinksToGATKDocs for it. Names are right-aligned to a common width.
 *
 * @param filters the read filter classes to list
 * @return the formatted table, one line per row
 */
private String userFriendlyListofReadFilters(List<Class<? extends ReadFilter>> filters) {
    final String headName = "FilterName", headDoc = "Documentation";

    // Start from the header's own width so the name column is never narrower than its title
    // (and so that an empty filter list still yields a sensible width).
    int longestNameLength = headName.length();
    for ( Class<? extends ReadFilter> filter : filters ) {
        longestNameLength = Math.max(longestNameLength, this.getName(filter).length());
    }
    final String format = " %"+longestNameLength+"s %s%n";

    StringBuilder listBuilder = new StringBuilder();
    listBuilder.append(String.format(format,headName,headDoc));
    for ( Class<? extends ReadFilter> filter : filters ) {
        String helpLink = GATKDocUtils.helpLinksToGATKDocs(filter);
        String filterName = this.getName(filter);
        listBuilder.append(String.format(format,filterName,helpLink));
    }

    return listBuilder.toString();
}
}

View File

@ -32,9 +32,9 @@ import org.broadinstitute.sting.utils.classloader.JVMUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.writer.Options;
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriterFactory;
import java.io.File;
@ -269,7 +269,7 @@ public class VariantContextWriterStub implements Stub<VariantContextWriter>, Var
* @return
*/
public boolean alsoWriteBCFForTest() {
return engine.getArguments().numberOfThreads == 1 && // only works single threaded
return engine.getArguments().numberOfDataThreads == 1 && // only works single threaded
! isCompressed() && // for non-compressed outputs
getFile() != null && // that are going to disk
engine.getArguments().generateShadowBCF; // and we actually want to do it

View File

@ -6,13 +6,13 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import java.util.Iterator;
public class DownsampleIterator implements StingSAMIterator {
public class LegacyDownsampleIterator implements StingSAMIterator {
StingSAMIterator it;
int cutoff;
SAMRecord next;
public DownsampleIterator(StingSAMIterator it, double fraction) {
public LegacyDownsampleIterator(StingSAMIterator it, double fraction) {
this.it = it;
cutoff = (int)(fraction * 10000);
next = getNextRecord();

View File

@ -31,8 +31,8 @@ import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMRecord;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.DownsampleType;
import org.broadinstitute.sting.gatk.DownsamplingMethod;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
import org.broadinstitute.sting.gatk.ReadProperties;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.utils.GenomeLoc;

View File

@ -0,0 +1,649 @@
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.iterators;
import net.sf.picard.util.PeekableIterator;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMRecord;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.ReadProperties;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.downsampling.Downsampler;
import org.broadinstitute.sting.gatk.downsampling.LevelingDownsampler;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import java.util.*;
/**
* Iterator that traverses a SAM File, accumulating information on a per-locus basis
*/
public class LocusIteratorByStateExperimental extends LocusIterator {
/**
 * Logger for this class. Registered under this class's own name (it previously used
 * LocusIteratorByState.class) so that log output and per-class log configuration are
 * attributed to the experimental iterator rather than to the original one.
 */
private static final Logger logger = Logger.getLogger(LocusIteratorByStateExperimental.class);
// -----------------------------------------------------------------------------------------------------------------
//
// member fields
//
// -----------------------------------------------------------------------------------------------------------------
/**
* Used to create new GenomeLocs.
*/
private final GenomeLocParser genomeLocParser;
private final ArrayList<String> samples;
private final ReadStateManager readStates;
/**
* Tracks how far a single read has been walked along the reference, one CIGAR element at a time.
* A freshly constructed state has not been stepped yet: all offsets are -1 and curElement is null
* until stepForwardOnGenome() has been called at least once.
*/
protected static class SAMRecordState {
SAMRecord read;
int readOffset = -1; // how far are we offset from the start of the read bases?
int genomeOffset = -1; // how far are we offset from the alignment start on the genome?
Cigar cigar = null;
int cigarOffset = -1; // index of the current element within the cigar
CigarElement curElement = null; // the cigar element at cigarOffset
int nCigarElements = 0; // total number of elements in the cigar
int cigarElementCounter = -1; // how far are we into a single cigarElement
// The logical model is as follows: the "record state" implements the traversal along the reference; thus
// stepForwardOnGenome() returns on every and only on actual reference bases. This can be a (mis)match, a
// deletion or a reference skip (for the latter two we still return on every individual reference base the
// element spans). Insertions, soft clips, hard clips and pads are passed over inside a single call, so the
// state never rests on one of them between calls unless the read ends there.
/**
* Creates an un-stepped state for the given read, caching its cigar and the number of cigar elements.
*
* @param read the read to traverse
*/
public SAMRecordState(SAMRecord read) {
this.read = read;
cigar = read.getCigar();
nCigarElements = cigar.numCigarElements();
//System.out.printf("Creating a SAMRecordState: %s%n", this);
}
/**
* @return the read this state is traversing
*/
public SAMRecord getRead() {
return read;
}
/**
* What is our current offset in the read's bases that aligns us with the reference genome?
*
* @return the current read offset (-1 if no read base has been consumed yet)
*/
public int getReadOffset() {
return readOffset;
}
/**
* What is the current offset w.r.t. the alignment state that aligns us to the readOffset?
*
* @return the current offset from the read's alignment start (-1 before the first step)
*/
public int getGenomeOffset() {
return genomeOffset;
}
/**
* @return the read's alignment start plus the current genome offset
*/
public int getGenomePosition() {
return read.getAlignmentStart() + getGenomeOffset();
}
/**
* @param genomeLocParser parser used to build the location
* @return a GenomeLoc on the read's reference at getGenomePosition()
*/
public GenomeLoc getLocation(GenomeLocParser genomeLocParser) {
return genomeLocParser.createGenomeLoc(read.getReferenceName(), getGenomePosition());
}
/**
* @return the operator of the current cigar element; curElement is null until the first
* stepForwardOnGenome() call, so calling this earlier throws a NullPointerException
*/
public CigarOperator getCurrentCigarOperator() {
return curElement.getOperator();
}
public String toString() {
return String.format("%s ro=%d go=%d co=%d cec=%d %s", read.getReadName(), readOffset, genomeOffset, cigarOffset, cigarElementCounter, curElement);
}
/**
* Looks one step ahead without changing any state.
*
* @return the following cigar element if the counter is on the last position of the current element and
* another element exists; otherwise the current element
*/
public CigarElement peekForwardOnGenome() {
return ( cigarElementCounter + 1 > curElement.getLength() && cigarOffset + 1 < nCigarElements ? cigar.getCigarElement(cigarOffset + 1) : curElement );
}
/**
* Looks one step back without changing any state.
*
* NOTE(review): because of the "cigarOffset - 1 > 0" test the element at index 0 is never returned here,
* even when the current element is at index 1 -- confirm whether ">= 0" was intended before relying on it.
*
* @return the preceding cigar element if the counter is on the first position of the current element and
* the preceding element's index is greater than 0; otherwise the current element
*/
public CigarElement peekBackwardOnGenome() {
return ( cigarElementCounter - 1 == 0 && cigarOffset - 1 > 0 ? cigar.getCigarElement(cigarOffset - 1) : curElement );
}
/**
* Advances this state to the next reference base covered by the read.
*
* @return the operator (M, EQ, X, D or N) of the element the state now rests on, or null once the state
* has moved past the last cigar element
* @throws UserException.MalformedBAM if the cigar ends with a deletion, or a deletion is reached before
* any read base has been consumed
* @throws IllegalStateException if the current element has an operator the switch does not handle
*/
public CigarOperator stepForwardOnGenome() {
// we enter this method with readOffset = index of the last processed base on the read
// (-1 if we did not process a single base yet); this can be last matching base, or last base of an insertion
if (curElement == null || ++cigarElementCounter > curElement.getLength()) {
cigarOffset++;
if (cigarOffset < nCigarElements) {
curElement = cigar.getCigarElement(cigarOffset);
cigarElementCounter = 0;
// next line: guards against cigar elements of length 0; when new cigar element is retrieved,
// we reenter in order to re-check cigarElementCounter against curElement's length
return stepForwardOnGenome();
} else {
if (curElement != null && curElement.getOperator() == CigarOperator.D)
throw new UserException.MalformedBAM(read, "read ends with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar");
// Reads that contain indels model the genomeOffset as the following base in the reference. Because
// we fall into this else block only when indels end the read, increment genomeOffset such that the
// current offset of this read is the next ref base after the end of the indel. This position will
// model a point on the reference somewhere after the end of the read.
genomeOffset++; // extended events need that. Logically, it's legal to advance the genomic offset here:
// we do step forward on the ref, and by returning null we also indicate that we are past the read end.
return null;
}
}
// done stays false for elements that do not occupy a reference base; those are consumed whole and
// the method recurses (see the return statement) until a reference base or the read end is reached
boolean done = false;
switch (curElement.getOperator()) {
case H: // ignore hard clips
case P: // ignore pads
cigarElementCounter = curElement.getLength();
break;
case I: // insertion w.r.t. the reference
case S: // soft clip
cigarElementCounter = curElement.getLength();
readOffset += curElement.getLength();
break;
case D: // deletion w.r.t. the reference
if (readOffset < 0) // we don't want reads starting with deletion, this is a malformed cigar string
throw new UserException.MalformedBAM(read, "read starts with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar");
// should be the same as N case
genomeOffset++;
done = true;
break;
case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning)
genomeOffset++;
done = true;
break;
case M:
case EQ:
case X:
readOffset++;
genomeOffset++;
done = true;
break;
default:
throw new IllegalStateException("Case statement didn't deal with cigar op: " + curElement.getOperator());
}
return done ? curElement.getOperator() : stepForwardOnGenome();
}
}
//final boolean DEBUG = false;
//final boolean DEBUG2 = false && DEBUG;
private ReadProperties readInfo;
private AlignmentContext nextAlignmentContext;
private boolean performLevelingDownsampling;
// -----------------------------------------------------------------------------------------------------------------
//
// constructors and other basic operations
//
// -----------------------------------------------------------------------------------------------------------------
/**
 * Creates a locus iterator over the given stream of reads.
 *
 * @param samIterator     source of the reads to pile up
 * @param readInformation read properties; its downsampling method decides whether per-sample
 *                        leveling downsampling is performed
 * @param genomeLocParser used to create the GenomeLocs of the emitted alignment contexts
 * @param samples         names of the samples to build pileups for (copied defensively)
 * @throws IllegalArgumentException if the sample list is empty while samIterator still has reads
 */
public LocusIteratorByStateExperimental(final Iterator<SAMRecord> samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, Collection<String> samples) {
    this.readInfo = readInformation;
    this.genomeLocParser = genomeLocParser;
    this.samples = new ArrayList<String>(samples);

    // This flag MUST be assigned before the ReadStateManager is built: the manager's constructor
    // instantiates one PerSampleReadStateManager per sample, and that class decides in a field
    // initializer -- i.e. at instantiation time -- whether to create its leveling downsampler by
    // reading this flag. Assigning the flag afterwards (as was done previously) meant every
    // per-sample manager saw the default value false and downsampling was silently never applied.
    this.performLevelingDownsampling = readInfo.getDownsamplingMethod() != null &&
            readInfo.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE &&
            readInfo.getDownsamplingMethod().toCoverage != null;

    this.readStates = new ReadStateManager(samIterator);

    // currently the GATK expects this LocusIteratorByState to accept empty sample lists, when
    // there's no read data. So we need to throw this error only when samIterator.hasNext() is true
    if (this.samples.isEmpty() && samIterator.hasNext()) {
        throw new IllegalArgumentException("samples list must not be empty");
    }
}
/**
 * Test helper: builds the sample list to use when the incoming SAMRecords carry no read groups.
 *
 * @return a new, modifiable collection whose single element is null
 */
public static final Collection<String> sampleListForSAMWithoutReadGroups() {
    final List<String> dummySamples = new ArrayList<String>();
    final String noSample = null;
    dummySamples.add(noSample);
    return dummySamples;
}
/**
 * This object serves as its own iterator.
 *
 * @return this instance
 */
public Iterator<AlignmentContext> iterator() {
    final Iterator<AlignmentContext> self = this;
    return self;
}
/**
 * Currently a no-op: nothing is closed or released by this method.
 */
public void close() {
    // deliberately empty
}
/**
 * Loads the next alignment context if none is pending, then reports whether one is available.
 *
 * @return true if a subsequent call to next() will return an alignment context
 */
public boolean hasNext() {
    lazyLoadNextAlignmentContext();
    final boolean contextAvailable = nextAlignmentContext != null;
    return contextAvailable;
}
/**
 * @return the location of the first read state held by readStates, or null if there are no read states
 */
private GenomeLoc getLocation() {
    if (readStates.isEmpty()) {
        return null;
    }
    return readStates.getFirst().getLocation(genomeLocParser);
}
// -----------------------------------------------------------------------------------------------------------------
//
// next() routine and associated collection operations
//
// -----------------------------------------------------------------------------------------------------------------
/**
 * Hands out the pending alignment context and clears it so that the following one can be loaded.
 *
 * @return the next alignment context
 * @throws NoSuchElementException if no further alignment context can be produced
 */
public AlignmentContext next() {
    lazyLoadNextAlignmentContext();
    if (hasNext()) {
        final AlignmentContext pending = nextAlignmentContext;
        nextAlignmentContext = null;
        return pending;
    }
    throw new NoSuchElementException("LocusIteratorByState: out of elements.");
}
/**
* Creates the next alignment context from the given state. Note that this is implemented as a lazy load method.
* nextAlignmentContext MUST BE null in order for this method to advance to the next entry.
*
* Each pass of the loop pulls in pending reads, builds one pileup per sample at the current location and then
* steps every read state forward. The loop stops once a non-empty pileup has been stored in
* nextAlignmentContext, or once readStates reports that nothing is left.
*/
private void lazyLoadNextAlignmentContext() {
while (nextAlignmentContext == null && readStates.hasNext()) {
readStates.collectPendingReads();
final GenomeLoc location = getLocation();
final Map<String, ReadBackedPileupImpl> fullPileup = new HashMap<String, ReadBackedPileupImpl>();
// TODO: How can you determine here whether the current pileup has been downsampled?
// (until that is resolved, the flag below is never set to true inside this method)
boolean hasBeenSampled = false;
for (final String sample : samples) {
final Iterator<SAMRecordState> iterator = readStates.iterator(sample);
final List<PileupElement> pile = new ArrayList<PileupElement>(readStates.size(sample));
int size = 0; // number of elements in this sample's pileup
int nDeletions = 0; // number of deletions in this sample's pileup
int nMQ0Reads = 0; // number of MQ0 reads in this sample's pileup (warning: current implementation includes N bases that are MQ0)
while (iterator.hasNext()) {
final SAMRecordState state = iterator.next(); // state object with the read/offset information
final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read
final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator
final CigarElement nextElement = state.peekForwardOnGenome(); // next cigar element
final CigarElement lastElement = state.peekBackwardOnGenome(); // last cigar element
// both peeks fall back to the current element, so reference equality here means neither
// peek moved off the current element
final boolean isSingleElementCigar = nextElement == lastElement;
final CigarOperator nextOp = nextElement.getOperator(); // next cigar operator
final CigarOperator lastOp = lastElement.getOperator(); // last cigar operator
int readOffset = state.getReadOffset(); // the base offset on this read
final boolean isBeforeDeletion = nextOp == CigarOperator.DELETION;
final boolean isAfterDeletion = lastOp == CigarOperator.DELETION;
final boolean isBeforeInsertion = nextOp == CigarOperator.INSERTION;
final boolean isAfterInsertion = lastOp == CigarOperator.INSERTION && !isSingleElementCigar;
final boolean isNextToSoftClip = nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart());
int nextElementLength = nextElement.getLength();
if (op == CigarOperator.N) // N's are never added to any pileup
continue;
if (op == CigarOperator.D) {
// TODO -- LIBS is totally busted for deletions so that reads with Ds right before Is in their CIGAR are broken; must fix
if (readInfo.includeReadsWithDeletionAtLoci()) { // only add deletions to the pileup if we are authorized to do so
pile.add(new PileupElement(read, readOffset, true, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, null, nextOp == CigarOperator.D ? nextElementLength : -1));
size++;
nDeletions++;
if (read.getMappingQuality() == 0)
nMQ0Reads++;
}
}
else {
// bases for which filterBaseInRead() returns true are left out of the pileup
if (!filterBaseInRead(read, location.getStart())) {
String insertedBaseString = null;
if (nextOp == CigarOperator.I) {
final int insertionOffset = isSingleElementCigar ? 0 : 1;
// TODO -- someone please implement a better fix for the single element insertion CIGAR!
if (isSingleElementCigar)
readOffset -= (nextElement.getLength() - 1); // LIBS has passed over the insertion bases!
insertedBaseString = new String(Arrays.copyOfRange(read.getReadBases(), readOffset + insertionOffset, readOffset + insertionOffset + nextElement.getLength()));
}
pile.add(new PileupElement(read, readOffset, false, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, insertedBaseString, nextElementLength));
size++;
if (read.getMappingQuality() == 0)
nMQ0Reads++;
}
}
}
if (pile.size() != 0) // if this pileup added at least one base, add it to the full pileup
fullPileup.put(sample, new ReadBackedPileupImpl(location, pile, size, nDeletions, nMQ0Reads));
}
updateReadStates(); // critical - must be called after we get the current state offsets and location
if (!fullPileup.isEmpty()) // if we got reads with non-D/N over the current position, we are done
nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileupImpl(location, fullPileup), hasBeenSampled);
}
}
/**
 * Fast positional test against the first read state currently held.
 *
 * @param read the candidate read
 * @return false if no read states are held; otherwise true when the read lies on a higher reference
 *         index than the first held read, or starts after the first held state's genome position
 */
private boolean readIsPastCurrentPosition(SAMRecord read) {
    if (readStates.isEmpty()) {
        return false;
    }
    final SAMRecordState firstState = readStates.getFirst();
    final SAMRecord firstRead = firstState.getRead();
    if (read.getReferenceIndex() > firstRead.getReferenceIndex()) {
        return true;
    }
    return read.getAlignmentStart() > firstState.getGenomePosition();
}
/**
 * Generic place to put per-base filters appropriate to LocusIteratorByState. At present the only
 * filter applied is ReadUtils.isBaseInsideAdaptor.
 *
 * @param rec the read being examined
 * @param pos the position to test
 * @return true if the base at pos should be filtered out, i.e. ReadUtils.isBaseInsideAdaptor returned true
 */
private static boolean filterBaseInRead(GATKSAMRecord rec, long pos) {
    final boolean insideAdaptor = ReadUtils.isBaseInsideAdaptor(rec, pos);
    return insideAdaptor;
}
/**
 * Steps every held read state of every sample forward by one call to stepForwardOnGenome() and
 * drops the states for which that call returned null.
 */
private void updateReadStates() {
    for (final String sample : samples) {
        final Iterator<SAMRecordState> stateIterator = readStates.iterator(sample);
        while (stateIterator.hasNext()) {
            final SAMRecordState current = stateIterator.next();
            if (current.stepForwardOnGenome() == null) {
                // a null operator means the state has stepped off the end of its read
                stateIterator.remove();
            }
        }
    }
}
/**
 * Removal is not supported by this iterator.
 *
 * @throws UnsupportedOperationException always
 */
public void remove() {
    final String message = "Can not remove records from a SAM file via an iterator!";
    throw new UnsupportedOperationException(message);
}
protected class ReadStateManager {
private final PeekableIterator<SAMRecord> iterator;
private final SamplePartitioner samplePartitioner;
private final Map<String, PerSampleReadStateManager> readStatesBySample = new HashMap<String, PerSampleReadStateManager>();
private int totalReadStates = 0;
public ReadStateManager(Iterator<SAMRecord> source) {
this.iterator = new PeekableIterator<SAMRecord>(source);
for (final String sample : samples) {
readStatesBySample.put(sample, new PerSampleReadStateManager());
}
samplePartitioner = new SamplePartitioner();
}
/**
 * Returns an iterator over the read states held for one sample. Every operation, including
 * remove(), is forwarded to the iterator of that sample's PerSampleReadStateManager.
 *
 * @param sample The sample.
 * @return Iterator over the read states associated with that sample.
 */
public Iterator<SAMRecordState> iterator(final String sample) {
    final Iterator<SAMRecordState> delegate = readStatesBySample.get(sample).iterator();
    return new Iterator<SAMRecordState>() {
        public boolean hasNext() {
            return delegate.hasNext();
        }

        public SAMRecordState next() {
            return delegate.next();
        }

        public void remove() {
            delegate.remove();
        }
    };
}
/**
 * @return true if the running total of read states across all samples is zero
 */
public boolean isEmpty() {
    final boolean nothingHeld = (0 == totalReadStates);
    return nothingHeld;
}
/**
 * Reports how many read states are held across all samples.
 *
 * @return the running total of read states over all samples
 */
public int size() {
    final int total = totalReadStates;
    return total;
}
/**
 * Reports how many read states are held for a single sample.
 *
 * @param sample The sample.
 * @return the size reported by that sample's per-sample manager
 */
public int size(final String sample) {
    final PerSampleReadStateManager perSample = readStatesBySample.get(sample);
    return perSample.size();
}
/**
 * Walks the samples in list order and returns the head state of the first sample that holds any.
 *
 * @return the first read state of the first non-empty sample, or null if every sample is empty
 */
public SAMRecordState getFirst() {
    for (final String sample : samples) {
        final PerSampleReadStateManager perSample = readStatesBySample.get(sample);
        if (perSample.isEmpty()) {
            continue;
        }
        return perSample.peek();
    }
    return null;
}
/**
 * @return true if any read state is still held, or the underlying read iterator has more reads
 */
public boolean hasNext() {
    if (totalReadStates > 0) {
        return true;
    }
    return iterator.hasNext();
}
/**
 * Pulls reads off the underlying iterator and hands them, split by sample, to the per-sample
 * managers. When no read states are held, every read sharing the reference index and alignment
 * start of the next read is taken; otherwise reads are taken until one is past the current position.
 */
public void collectPendingReads() {
    if (!iterator.hasNext()) {
        return;
    }

    if (readStates.size() != 0) {
        // Fast fail in the case that the read is past the current position.
        if (readIsPastCurrentPosition(iterator.peek())) {
            return;
        }
        while (iterator.hasNext() && !readIsPastCurrentPosition(iterator.peek())) {
            samplePartitioner.submitRead(iterator.next());
        }
    } else {
        final int contigIndex = iterator.peek().getReferenceIndex();
        final int alignmentStart = iterator.peek().getAlignmentStart();
        while (iterator.hasNext()) {
            final SAMRecord candidate = iterator.peek();
            if (candidate.getReferenceIndex() != contigIndex || candidate.getAlignmentStart() != alignmentStart) {
                break;
            }
            samplePartitioner.submitRead(iterator.next());
        }
    }

    for (final String sample : samples) {
        final Collection<SAMRecord> pendingReads = samplePartitioner.getReadsForSample(sample);
        addReadsToSample(readStatesBySample.get(sample), pendingReads);
    }
    samplePartitioner.reset();
}
/**
 * Wraps each read in a SAMRecordState, steps it forward once, and hands the whole batch to the
 * given per-sample manager. Does nothing when there are no reads.
 *
 * @param readStates The per-sample manager that receives the new read states.
 * @param reads      Reads to add.
 */
private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection<SAMRecord> reads) {
    if (reads.isEmpty()) {
        return;
    }

    final Collection<SAMRecordState> batch = new LinkedList<SAMRecordState>();
    for (final SAMRecord record : reads) {
        final SAMRecordState recordState = new SAMRecordState(record);
        recordState.stepForwardOnGenome();
        batch.add(recordState);
    }
    readStates.addStatesAtNextAlignmentStart(batch);
}
/**
* Holds the read states of one sample as a list of batches, one batch per call to
* addStatesAtNextAlignmentStart(), and keeps this sample's count and the enclosing manager's
* total count in step with additions, downsampling discards and removals.
*/
protected class PerSampleReadStateManager implements Iterable<SAMRecordState> {
private List<LinkedList<SAMRecordState>> readStatesByAlignmentStart = new LinkedList<LinkedList<SAMRecordState>>();
private int thisSampleReadStates = 0;
// Decided once, when this object is instantiated: the initializer reads the outer iterator's
// performLevelingDownsampling and readInfo as they are at that moment. null means no downsampling.
private Downsampler<LinkedList<SAMRecordState>> levelingDownsampler =
performLevelingDownsampling ?
new LevelingDownsampler<LinkedList<SAMRecordState>, SAMRecordState>(readInfo.getDownsamplingMethod().toCoverage) :
null;
/**
* Appends a copy of the given states as a new batch and, if a leveling downsampler is present,
* runs all held batches through it and adopts its finalized result.
*
* @param states the read states to add; an empty collection is ignored
*/
public void addStatesAtNextAlignmentStart(Collection<SAMRecordState> states) {
if ( states.isEmpty() ) {
return;
}
readStatesByAlignmentStart.add(new LinkedList<SAMRecordState>(states));
thisSampleReadStates += states.size();
totalReadStates += states.size();
if ( levelingDownsampler != null ) {
levelingDownsampler.submit(readStatesByAlignmentStart);
levelingDownsampler.signalEndOfInput();
// both counters were already increased above, so take the discarded items back out
thisSampleReadStates -= levelingDownsampler.getNumberOfDiscardedItems();
totalReadStates -= levelingDownsampler.getNumberOfDiscardedItems();
// use returned List directly rather than make a copy, for efficiency's sake
readStatesByAlignmentStart = levelingDownsampler.consumeFinalizedItems();
levelingDownsampler.reset();
}
}
/**
* @return true if no batch is held
*/
public boolean isEmpty() {
return readStatesByAlignmentStart.isEmpty();
}
/**
* @return the head of the first batch, or null if no batch is held
*/
public SAMRecordState peek() {
return isEmpty() ? null : readStatesByAlignmentStart.get(0).peek();
}
/**
* @return the running count of read states held for this sample
*/
public int size() {
return thisSampleReadStates;
}
/**
* Iterates over all read states, batch by batch. remove() decrements both this sample's count and the
* enclosing manager's total, and also removes a batch from the outer list once it has become empty.
*
* @return iterator over every read state of this sample
*/
public Iterator<SAMRecordState> iterator() {
return new Iterator<SAMRecordState>() {
private Iterator<LinkedList<SAMRecordState>> alignmentStartIterator = readStatesByAlignmentStart.iterator();
private LinkedList<SAMRecordState> currentPositionReadStates = null;
private Iterator<SAMRecordState> currentPositionReadStatesIterator = null;
public boolean hasNext() {
return alignmentStartIterator.hasNext() ||
(currentPositionReadStatesIterator != null && currentPositionReadStatesIterator.hasNext());
}
public SAMRecordState next() {
// move on to the following batch when there is no current batch or it is exhausted
if ( currentPositionReadStatesIterator == null || ! currentPositionReadStatesIterator.hasNext() ) {
currentPositionReadStates = alignmentStartIterator.next();
currentPositionReadStatesIterator = currentPositionReadStates.iterator();
}
return currentPositionReadStatesIterator.next();
}
public void remove() {
currentPositionReadStatesIterator.remove();
thisSampleReadStates--;
totalReadStates--;
// drop the batch itself once its last state has been removed
if ( currentPositionReadStates.isEmpty() ) {
alignmentStartIterator.remove();
}
}
};
}
}
}
/**
 * Splits submitted reads into per-sample buckets, keyed by sample name string (not by sample
 * object). Buckets exist only for the samples of the enclosing iterator; reads belonging to any
 * other sample are counted but not stored.
 */
private class SamplePartitioner {
    private final Map<String, Collection<SAMRecord>> readsBySample = new HashMap<String, Collection<SAMRecord>>();
    private long readsSeen = 0;

    /**
     * Creates one empty bucket per sample of the enclosing iterator.
     */
    public SamplePartitioner() {
        for (final String sampleName : samples) {
            readsBySample.put(sampleName, new ArrayList<SAMRecord>());
        }
    }

    /**
     * Files the read under its read group's sample name (null when it has no read group).
     *
     * @param read the read to file
     */
    public void submitRead(SAMRecord read) {
        final String sampleName = read.getReadGroup() == null ? null : read.getReadGroup().getSample();
        final Collection<SAMRecord> bucket = readsBySample.get(sampleName);
        if (bucket != null) {
            bucket.add(read);
        }
        readsSeen++;
    }

    /**
     * @return number of reads submitted since construction or the last reset()
     */
    public long getNumReadsSeen() {
        return readsSeen;
    }

    /**
     * @param sampleName the sample whose bucket is wanted
     * @return the live bucket of reads for that sample
     * @throws NoSuchElementException if there is no bucket for the sample
     */
    public Collection<SAMRecord> getReadsForSample(String sampleName) {
        final Collection<SAMRecord> bucket = readsBySample.get(sampleName);
        if (bucket == null) {
            throw new NoSuchElementException("Sample name not found");
        }
        return bucket;
    }

    /**
     * Empties every bucket and zeroes the read counter.
     */
    public void reset() {
        for (final Collection<SAMRecord> bucket : readsBySample.values()) {
            bucket.clear();
        }
        readsSeen = 0;
    }
}
}

View File

@ -10,13 +10,11 @@ import java.util.Iterator;
* Verifies that the incoming stream of reads is correctly sorted
*/
public class VerifyingSamIterator implements StingSAMIterator {
private GenomeLocParser genomeLocParser;
StingSAMIterator it;
SAMRecord last = null;
boolean checkOrderP = true;
public VerifyingSamIterator(GenomeLocParser genomeLocParser,StingSAMIterator it) {
this.genomeLocParser = genomeLocParser;
public VerifyingSamIterator(StingSAMIterator it) {
this.it = it;
}

View File

@ -218,7 +218,7 @@ public class GATKRunReport {
// if there was an exception, capture it
this.mException = e == null ? null : new ExceptionToXML(e);
numThreads = engine.getArguments().numberOfThreads;
numThreads = engine.getTotalNumberOfThreads();
percentTimeRunning = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.USER_CPU);
percentTimeBlocking = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.BLOCKING);
percentTimeWaiting = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.WAITING);

View File

@ -24,7 +24,7 @@
package org.broadinstitute.sting.gatk.resourcemanagement;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
/**
* Models how threads are distributed between various components of the GATK.
@ -33,7 +33,12 @@ public class ThreadAllocation {
/**
* The number of CPU threads to be used by the GATK.
*/
private final int numCPUThreads;
private final int numDataThreads;
/**
* The number of CPU threads per data thread for GATK processing
*/
private final int numCPUThreadsPerDataThread;
/**
* Number of threads to devote exclusively to IO. Default is 0.
@ -45,8 +50,12 @@ public class ThreadAllocation {
*/
private final boolean monitorEfficiency;
public int getNumCPUThreads() {
return numCPUThreads;
public int getNumDataThreads() {
return numDataThreads;
}
public int getNumCPUThreadsPerDataThread() {
return numCPUThreadsPerDataThread;
}
public int getNumIOThreads() {
@ -57,47 +66,50 @@ public class ThreadAllocation {
return monitorEfficiency;
}
/**
* Are we running in parallel mode?
*
* @return true if any parallel processing is enabled
*/
public boolean isRunningInParallelMode() {
return getTotalNumThreads() > 1;
}
/**
* What is the total number of threads in use by the GATK?
*
* @return the sum of all thread allocations in this object
*/
public int getTotalNumThreads() {
return getNumDataThreads() * getNumCPUThreadsPerDataThread() + getNumIOThreads();
}
/**
* Construct the default thread allocation.
*/
public ThreadAllocation() {
this(1, null, null, false);
this(1, 1, 0, false);
}
/**
* Set up the thread allocation. Default allocation is 1 CPU thread, 0 IO threads.
* (0 IO threads means that no threads are devoted exclusively to IO; they're inline on the CPU thread).
* @param totalThreads Complete number of threads to allocate.
* @param numCPUThreads Total number of threads allocated to the traversal.
* @param numDataThreads Total number of threads allocated to the traversal.
* @param numCPUThreadsPerDataThread The number of CPU threads per data thread to allocate
* @param numIOThreads Total number of threads allocated exclusively to IO.
* @param monitorEfficiency should we monitor threading efficiency in the GATK?
*/
public ThreadAllocation(final int totalThreads, final Integer numCPUThreads, final Integer numIOThreads, final boolean monitorEfficiency) {
// If no allocation information is present, allocate all threads to CPU
if(numCPUThreads == null && numIOThreads == null) {
this.numCPUThreads = totalThreads;
this.numIOThreads = 0;
}
// If only CPU threads are specified, allocate remainder to IO (minimum 0 dedicated IO threads).
else if(numIOThreads == null) {
if(numCPUThreads > totalThreads)
throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of cpu threads (%d) is higher than the total threads",totalThreads,numCPUThreads));
this.numCPUThreads = numCPUThreads;
this.numIOThreads = totalThreads - numCPUThreads;
}
// If only IO threads are specified, allocate remainder to CPU (minimum 1 dedicated CPU thread).
else if(numCPUThreads == null) {
if(numIOThreads > totalThreads)
throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of io threads (%d) is higher than the total threads",totalThreads,numIOThreads));
this.numCPUThreads = Math.max(1,totalThreads-numIOThreads);
this.numIOThreads = numIOThreads;
}
else {
if(numCPUThreads + numIOThreads != totalThreads)
throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of cpu threads (%d) + the count of io threads (%d) does not match",totalThreads,numCPUThreads,numIOThreads));
this.numCPUThreads = numCPUThreads;
this.numIOThreads = numIOThreads;
}
public ThreadAllocation(final int numDataThreads,
final int numCPUThreadsPerDataThread,
final int numIOThreads,
final boolean monitorEfficiency) {
if ( numDataThreads < 1 ) throw new ReviewedStingException("numDataThreads cannot be less than 1, but saw " + numDataThreads);
if ( numCPUThreadsPerDataThread < 1 ) throw new ReviewedStingException("numCPUThreadsPerDataThread cannot be less than 1, but saw " + numCPUThreadsPerDataThread);
if ( numIOThreads < 0 ) throw new ReviewedStingException("numIOThreads cannot be less than 0, but saw " + numIOThreads);
this.numDataThreads = numDataThreads;
this.numCPUThreadsPerDataThread = numCPUThreadsPerDataThread;
this.numIOThreads = numIOThreads;
this.monitorEfficiency = monitorEfficiency;
}
}

View File

@ -44,24 +44,12 @@ import java.util.List;
import java.util.Map;
public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,ProviderType extends ShardDataProvider> {
/** our log, which we want to capture anything from this class */
protected static final Logger logger = Logger.getLogger(TraversalEngine.class);
// Time in milliseconds since we initialized this engine
private static final int HISTORY_WINDOW_SIZE = 50;
private static class ProcessingHistory {
double elapsedSeconds;
long unitsProcessed;
long bpProcessed;
GenomeLoc loc;
public ProcessingHistory(double elapsedSeconds, GenomeLoc loc, long unitsProcessed, long bpProcessed) {
this.elapsedSeconds = elapsedSeconds;
this.loc = loc;
this.unitsProcessed = unitsProcessed;
this.bpProcessed = bpProcessed;
}
}
/** lock object to ensure updates to history are consistent across threads */
private static final Object lock = new Object();
LinkedList<ProcessingHistory> history = new LinkedList<ProcessingHistory>();
@ -70,13 +58,12 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
private SimpleTimer timer = null;
// How long can we go without printing some progress info?
private static final int PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES = 1000;
private int printProgressCheckCounter = 0;
private long lastProgressPrintTime = -1; // When was the last time we printed progress log?
private long MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS = 30 * 1000; // in milliseconds
private long PROGRESS_PRINT_FREQUENCY = 10 * 1000; // in milliseconds
private final double TWO_HOURS_IN_SECONDS = 2.0 * 60.0 * 60.0;
private final double TWELVE_HOURS_IN_SECONDS = 12.0 * 60.0 * 60.0;
private final static long MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS = 30 * 1000; // in milliseconds
private final static double TWO_HOURS_IN_SECONDS = 2.0 * 60.0 * 60.0;
private final static double TWELVE_HOURS_IN_SECONDS = 12.0 * 60.0 * 60.0;
private long progressPrintFrequency = 10 * 1000; // in milliseconds
private boolean progressMeterInitialized = false;
// for performance log
@ -85,15 +72,12 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
private File performanceLogFile;
private PrintStream performanceLog = null;
private long lastPerformanceLogPrintTime = -1; // When was the last time we printed to the performance log?
private final long PERFORMANCE_LOG_PRINT_FREQUENCY = PROGRESS_PRINT_FREQUENCY; // in milliseconds
private final long PERFORMANCE_LOG_PRINT_FREQUENCY = progressPrintFrequency; // in milliseconds
/** Size, in bp, of the area we are processing. Updated once, during initialization, for performance reasons */
long targetSize = -1;
GenomeLocSortedSet targetIntervals = null;
/** our log, which we want to capture anything from this class */
protected static final Logger logger = Logger.getLogger(TraversalEngine.class);
protected GenomeAnalysisEngine engine;
// ----------------------------------------------------------------------------------------------------
@ -186,15 +170,35 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
return elapsed > printFreq && elapsed > MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS;
}
/**
* Update the cumulative traversal metrics according to the data in this shard
*
* @param shard a non-null shard
*/
public void updateCumulativeMetrics(final Shard shard) {
updateCumulativeMetrics(shard.getReadMetrics());
}
/**
* Update the cumulative traversal metrics according to the data in this shard
*
* @param singleTraverseMetrics read metrics object containing the information about a single shard's worth
* of data processing
*/
public void updateCumulativeMetrics(final ReadMetrics singleTraverseMetrics) {
engine.getCumulativeMetrics().incrementMetrics(singleTraverseMetrics);
}
/**
* Forward request to printProgress
*
* @param shard the given shard currently being processed.
* Assumes that one cycle has been completed
*
* @param loc the location
*/
public void printProgress(Shard shard, GenomeLoc loc) {
public void printProgress(final GenomeLoc loc) {
// A bypass is inserted here for unit testing.
printProgress(loc,shard.getReadMetrics(),false);
printProgress(loc, false);
}
/**
@ -202,15 +206,10 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
* every M seconds, for N and M set in global variables.
*
* @param loc Current location, can be null if you are at the end of the traversal
* @param metrics Data processed since the last cumulative
* @param mustPrint If true, will print out info, regardless of nRecords or time interval
*/
private void printProgress(GenomeLoc loc, ReadMetrics metrics, boolean mustPrint) {
if ( mustPrint || printProgressCheckCounter++ % PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES != 0 )
// don't do any work more often than PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES
return;
if(!progressMeterInitialized && mustPrint == false ) {
private synchronized void printProgress(final GenomeLoc loc, boolean mustPrint) {
if( ! progressMeterInitialized ) {
logger.info("[INITIALIZATION COMPLETE; TRAVERSAL STARTING]");
logger.info(String.format("%15s processed.%s runtime per.1M.%s completed total.runtime remaining",
"Location", getTraversalType(), getTraversalType()));
@ -218,40 +217,34 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
}
final long curTime = timer.currentTime();
boolean printProgress = mustPrint || maxElapsedIntervalForPrinting(curTime, lastProgressPrintTime, PROGRESS_PRINT_FREQUENCY);
boolean printProgress = mustPrint || maxElapsedIntervalForPrinting(curTime, lastProgressPrintTime, progressPrintFrequency);
boolean printLog = performanceLog != null && maxElapsedIntervalForPrinting(curTime, lastPerformanceLogPrintTime, PERFORMANCE_LOG_PRINT_FREQUENCY);
if ( printProgress || printLog ) {
// getting and appending metrics data actually turns out to be quite a heavyweight
// operation. Postpone it until after determining whether to print the log message.
ReadMetrics cumulativeMetrics = engine.getCumulativeMetrics() != null ? engine.getCumulativeMetrics() : new ReadMetrics();
if(metrics != null)
cumulativeMetrics.incrementMetrics(metrics);
final long nRecords = cumulativeMetrics.getNumIterations();
ProcessingHistory last = updateHistory(loc,cumulativeMetrics);
final ProcessingHistory last = updateHistory(loc, engine.getCumulativeMetrics());
final AutoFormattingTime elapsed = new AutoFormattingTime(last.elapsedSeconds);
final AutoFormattingTime bpRate = new AutoFormattingTime(secondsPerMillionBP(last));
final AutoFormattingTime unitRate = new AutoFormattingTime(secondsPerMillionElements(last));
final double fractionGenomeTargetCompleted = calculateFractionGenomeTargetCompleted(last);
final AutoFormattingTime bpRate = new AutoFormattingTime(last.secondsPerMillionBP());
final AutoFormattingTime unitRate = new AutoFormattingTime(last.secondsPerMillionElements());
final double fractionGenomeTargetCompleted = last.calculateFractionGenomeTargetCompleted(targetSize);
final AutoFormattingTime estTotalRuntime = new AutoFormattingTime(elapsed.getTimeInSeconds() / fractionGenomeTargetCompleted);
final AutoFormattingTime timeToCompletion = new AutoFormattingTime(estTotalRuntime.getTimeInSeconds() - elapsed.getTimeInSeconds());
final long nRecords = engine.getCumulativeMetrics().getNumIterations();
if ( printProgress ) {
lastProgressPrintTime = curTime;
// dynamically change the update rate so that short running jobs receive frequent updates while longer jobs receive fewer updates
if ( estTotalRuntime.getTimeInSeconds() > TWELVE_HOURS_IN_SECONDS )
PROGRESS_PRINT_FREQUENCY = 60 * 1000; // in milliseconds
progressPrintFrequency = 60 * 1000; // in milliseconds
else if ( estTotalRuntime.getTimeInSeconds() > TWO_HOURS_IN_SECONDS )
PROGRESS_PRINT_FREQUENCY = 30 * 1000; // in milliseconds
progressPrintFrequency = 30 * 1000; // in milliseconds
else
PROGRESS_PRINT_FREQUENCY = 10 * 1000; // in milliseconds
progressPrintFrequency = 10 * 1000; // in milliseconds
logger.info(String.format("%15s %5.2e %s %s %4.1f%% %s %s",
loc == null ? "done with mapped reads" : loc, nRecords*1.0, elapsed, unitRate,
final String posName = loc == null ? (mustPrint ? "done" : "unmapped reads") : String.format("%s:%d", loc.getContig(), loc.getStart());
logger.info(String.format("%15s %5.2e %s %s %5.1f%% %s %s",
posName, nRecords*1.0, elapsed, unitRate,
100*fractionGenomeTargetCompleted, estTotalRuntime, timeToCompletion));
}
@ -277,7 +270,7 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
* @param metrics information about what's been processed already
* @return
*/
private final ProcessingHistory updateHistory(GenomeLoc loc, ReadMetrics metrics) {
private ProcessingHistory updateHistory(GenomeLoc loc, ReadMetrics metrics) {
synchronized (lock) {
if ( history.size() > HISTORY_WINDOW_SIZE )
history.pop();
@ -290,26 +283,11 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
}
}
/** How long in seconds to process 1M traversal units? */
private final double secondsPerMillionElements(ProcessingHistory last) {
return (last.elapsedSeconds * 1000000.0) / Math.max(last.unitsProcessed, 1);
}
/** How long in seconds to process 1M bp on the genome? */
private final double secondsPerMillionBP(ProcessingHistory last) {
return (last.elapsedSeconds * 1000000.0) / Math.max(last.bpProcessed, 1);
}
/** What fraction of the target intervals have we covered? */
private final double calculateFractionGenomeTargetCompleted(ProcessingHistory last) {
return (1.0*last.bpProcessed) / targetSize;
}
/**
* Called after a traversal to print out information about the traversal process
*/
public void printOnTraversalDone() {
printProgress(null, null, true);
printProgress(null, true);
final double elapsed = timer == null ? 0 : timer.getElapsedTime();
@ -370,7 +348,7 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
* @return Frequency, in seconds, of performance log writes.
*/
public long getPerformanceProgressPrintFrequencySeconds() {
return PROGRESS_PRINT_FREQUENCY;
return progressPrintFrequency;
}
/**
@ -378,6 +356,35 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
* @param seconds number of seconds between messages indicating performance frequency.
*/
public void setPerformanceProgressPrintFrequencySeconds(long seconds) {
PROGRESS_PRINT_FREQUENCY = seconds;
progressPrintFrequency = seconds;
}
/**
 * One entry of the traversal's progress history: the elapsed time, the location, and the
 * number of traversal units and base pairs processed when the entry was recorded.
 */
private static class ProcessingHistory {
    double elapsedSeconds;
    long unitsProcessed;
    long bpProcessed;
    GenomeLoc loc;

    public ProcessingHistory(double elapsedSeconds, GenomeLoc loc, long unitsProcessed, long bpProcessed) {
        this.elapsedSeconds = elapsedSeconds;
        this.unitsProcessed = unitsProcessed;
        this.bpProcessed = bpProcessed;
        this.loc = loc;
    }

    /**
     * Scales the elapsed time to a rate per one million items.
     *
     * @param count number of items processed; values below 1 are treated as 1 so the
     *              division never sees a zero or negative denominator
     * @return seconds needed to process one million items at the observed rate
     */
    private double secondsPerMillion(final long count) {
        return (elapsedSeconds * 1000000.0) / Math.max(count, 1);
    }

    /** How long in seconds to process 1M traversal units? */
    private double secondsPerMillionElements() {
        return secondsPerMillion(unitsProcessed);
    }

    /** How long in seconds to process 1M bp on the genome? */
    private double secondsPerMillionBP() {
        return secondsPerMillion(bpProcessed);
    }

    /**
     * What fraction of the target intervals have we covered?
     *
     * @param targetSize size, in bp, of the area being processed
     * @return bpProcessed divided by targetSize, as a floating point value
     */
    private double calculateFractionGenomeTargetCompleted(final long targetSize) {
        final double processed = 1.0 * bpProcessed;
        return processed / targetSize;
    }
}
}

View File

@ -104,7 +104,8 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
prevLoc = location;
printProgress(dataProvider.getShard(), locus.getLocation());
updateCumulativeMetrics(dataProvider.getShard());
printProgress(locus.getLocation());
}
// Take the individual isActive calls and integrate them into contiguous active regions and

View File

@ -196,7 +196,8 @@ public class TraverseDuplicates<M,T> extends TraversalEngine<M,T,DuplicateWalker
sum = walker.reduce(x, sum);
}
printProgress(dataProvider.getShard(),site);
updateCumulativeMetrics(dataProvider.getShard());
printProgress(site);
done = walker.isDone();
}

View File

@ -3,9 +3,7 @@ package org.broadinstitute.sting.gatk.traversals;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.WalkerManager;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.providers.*;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.Walker;
@ -15,28 +13,42 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
/**
* A simple solution to iterating over all reference positions over a series of genomic locations.
*/
public class TraverseLoci<M,T> extends TraversalEngine<M,T,LocusWalker<M,T>,LocusShardDataProvider> {
public abstract class TraverseLociBase<M,T> extends TraversalEngine<M,T,LocusWalker<M,T>,LocusShardDataProvider> {
/**
* our log, which we want to capture anything from this class
*/
protected static final Logger logger = Logger.getLogger(TraversalEngine.class);
@Override
protected String getTraversalType() {
protected final String getTraversalType() {
return "sites";
}
/**
 * Simple holder for what a single traverse call produced: the number of iterations it
 * performed and the reduce value it ended with.
 *
 * @param <T> the reduce type
 */
protected static class TraverseResults<T> {
    final int numIterations;
    final T reduceResult;

    /**
     * @param numIterations number of iterations performed by the traverse call
     * @param reduceResult  reduce value after the traverse call
     */
    public TraverseResults(int numIterations, T reduceResult) {
        this.reduceResult = reduceResult;
        this.numIterations = numIterations;
    }
}
protected abstract TraverseResults<T> traverse( final LocusWalker<M,T> walker,
final LocusView locusView,
final LocusReferenceView referenceView,
final ReferenceOrderedView referenceOrderedDataView,
final T sum);
@Override
public T traverse( LocusWalker<M,T> walker,
LocusShardDataProvider dataProvider,
T sum) {
logger.debug(String.format("TraverseLoci.traverse: Shard is %s", dataProvider));
logger.debug(String.format("TraverseLociBase.traverse: Shard is %s", dataProvider));
LocusView locusView = getLocusView( walker, dataProvider );
boolean done = false;
final LocusView locusView = getLocusView( walker, dataProvider );
if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all
//ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider );
ReferenceOrderedView referenceOrderedDataView = null;
if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA )
@ -44,43 +56,24 @@ public class TraverseLoci<M,T> extends TraversalEngine<M,T,LocusWalker<M,T>,Locu
else
referenceOrderedDataView = (RodLocusView)locusView;
LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
// We keep processing while the next reference location is within the interval
while( locusView.hasNext() && ! done ) {
AlignmentContext locus = locusView.next();
GenomeLoc location = locus.getLocation();
dataProvider.getShard().getReadMetrics().incrementNumIterations();
// create reference context. Note that if we have a pileup of "extended events", the context will
// hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup).
ReferenceContext refContext = referenceView.getReferenceContext(location);
// Iterate forward to get all reference ordered data covering this location
final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext);
final boolean keepMeP = walker.filter(tracker, refContext, locus);
if (keepMeP) {
M x = walker.map(tracker, refContext, locus);
sum = walker.reduce(x, sum);
done = walker.isDone();
}
printProgress(dataProvider.getShard(),locus.getLocation());
}
final TraverseResults<T> result = traverse( walker, locusView, referenceView, referenceOrderedDataView, sum );
sum = result.reduceResult;
dataProvider.getShard().getReadMetrics().incrementNumIterations(result.numIterations);
updateCumulativeMetrics(dataProvider.getShard());
}
// We have a final map call to execute here to clean up the skipped based from the
// last position in the ROD to that in the interval
if ( WalkerManager.getWalkerDataSource(walker) == DataSource.REFERENCE_ORDERED_DATA && ! walker.isDone() ) {
// only do this if the walker isn't done!
RodLocusView rodLocusView = (RodLocusView)locusView;
long nSkipped = rodLocusView.getLastSkippedBases();
final RodLocusView rodLocusView = (RodLocusView)locusView;
final long nSkipped = rodLocusView.getLastSkippedBases();
if ( nSkipped > 0 ) {
GenomeLoc site = rodLocusView.getLocOneBeyondShard();
AlignmentContext ac = new AlignmentContext(site, new ReadBackedPileupImpl(site), nSkipped);
M x = walker.map(null, null, ac);
final GenomeLoc site = rodLocusView.getLocOneBeyondShard();
final AlignmentContext ac = new AlignmentContext(site, new ReadBackedPileupImpl(site), nSkipped);
final M x = walker.map(null, null, ac);
sum = walker.reduce(x, sum);
}
}
@ -90,14 +83,14 @@ public class TraverseLoci<M,T> extends TraversalEngine<M,T,LocusWalker<M,T>,Locu
/**
* Gets the best view of loci for this walker given the available data. The view will function as a 'trigger track'
* of sorts, providing a consistent interface so that TraverseLoci doesn't need to be reimplemented for any new datatype
* of sorts, providing a consistent interface so that TraverseLociBase doesn't need to be reimplemented for any new datatype
* that comes along.
* @param walker walker to interrogate.
* @param dataProvider Data with which to drive the locus view.
* @return A view of the locus data, where one iteration of the locus view maps to one iteration of the traversal.
*/
private LocusView getLocusView( Walker<M,T> walker, LocusShardDataProvider dataProvider ) {
DataSource dataSource = WalkerManager.getWalkerDataSource(walker);
final DataSource dataSource = WalkerManager.getWalkerDataSource(walker);
if( dataSource == DataSource.READS )
return new CoveredLocusView(dataProvider);
else if( dataSource == DataSource.REFERENCE ) //|| ! GenomeAnalysisEngine.instance.getArguments().enableRodWalkers )

View File

@ -0,0 +1,47 @@
package org.broadinstitute.sting.gatk.traversals;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.providers.LocusReferenceView;
import org.broadinstitute.sting.gatk.datasources.providers.LocusView;
import org.broadinstitute.sting.gatk.datasources.providers.ReferenceOrderedView;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
/**
 * Locus traversal that visits every site offered by the locus view in order, running the
 * walker's filter, map and reduce inline within a single loop.
 */
public class TraverseLociLinear<M,T> extends TraverseLociBase<M,T> {

    /**
     * Walks the locus view until it is exhausted or the walker reports that it is done.
     *
     * @param walker                   walker whose filter/map/reduce are applied at each site
     * @param locusView                source of the sites to visit
     * @param referenceView            supplies the reference context for each site
     * @param referenceOrderedDataView supplies the reference ordered data for each site
     * @param sum                      reduce value to start from
     * @return the number of sites pulled from the view together with the final reduce value
     */
    @Override
    protected TraverseResults<T> traverse(LocusWalker<M, T> walker, LocusView locusView, LocusReferenceView referenceView, ReferenceOrderedView referenceOrderedDataView, T sum) {
        int sitesVisited = 0;
        boolean walkerFinished = false;

        while ( true ) {
            // We keep processing while the next reference location is within the interval
            // and the walker has not asked to stop on a previous site
            if ( ! locusView.hasNext() || walkerFinished )
                break;

            sitesVisited++;
            final AlignmentContext site = locusView.next();
            final GenomeLoc siteLoc = site.getLocation();

            // create reference context. Note that if we have a pileup of "extended events", the context will
            // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup).
            final ReferenceContext reference = referenceView.getReferenceContext(siteLoc);

            // Iterate forward to get all reference ordered data covering this location
            final RefMetaDataTracker rods = referenceOrderedDataView.getReferenceOrderedDataAtLocus(site.getLocation(), reference);

            // filtered sites are neither mapped nor reduced, and do not re-query isDone()
            if ( walker.filter(rods, reference, site) ) {
                final M mapped = walker.map(rods, reference, site);
                sum = walker.reduce(mapped, sum);
                walkerFinished = walker.isDone();
            }

            printProgress(site.getLocation());
        }

        return new TraverseResults<T>(sitesVisited, sum);
    }
}

View File

@ -0,0 +1,205 @@
package org.broadinstitute.sting.gatk.traversals;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.providers.LocusReferenceView;
import org.broadinstitute.sting.gatk.datasources.providers.LocusView;
import org.broadinstitute.sting.gatk.datasources.providers.ReferenceOrderedView;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.nanoScheduler.NSMapFunction;
import org.broadinstitute.sting.utils.nanoScheduler.NSProgressFunction;
import org.broadinstitute.sting.utils.nanoScheduler.NSReduceFunction;
import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler;
import java.util.Iterator;
/**
 * A nano-scheduling version of the locus traversal.
 *
 * Rather than looping over sites inline, each site pulled from the locus view is packaged
 * as a MapData and handed to a {@link NanoScheduler}, which applies TraverseLociMap and
 * TraverseLociReduce to produce the final reduce value.
 */
public class TraverseLociNano<M,T> extends TraverseLociBase<M,T> {
    /** value passed to nanoScheduler.setDebug() at the start of every traverse call */
    private static final boolean DEBUG = false;

    /** buffer size handed to the NanoScheduler constructor */
    private static final int BUFFER_SIZE = 1000;

    final NanoScheduler<MapData, MapResult, T> nanoScheduler;

    /**
     * Creates the traversal together with its scheduler and registers the progress callback.
     *
     * @param nThreads thread count forwarded to the NanoScheduler constructor
     */
    public TraverseLociNano(int nThreads) {
        nanoScheduler = new NanoScheduler<MapData, MapResult, T>(BUFFER_SIZE, nThreads);
        nanoScheduler.setProgressFunction(new TraverseLociProgress());
    }

    /**
     * Runs the walker over every site in the locus view via the nano scheduler.
     *
     * @param walker                   walker whose filter/map/reduce are applied
     * @param locusView                source of the sites to visit
     * @param referenceView            supplies the reference context for each site
     * @param referenceOrderedDataView supplies the reference ordered data for each site
     * @param sum                      reduce value to start from
     * @return the number of sites pulled from the locus view together with the value returned
     *         by the scheduler
     */
    @Override
    protected TraverseResults<T> traverse(final LocusWalker<M, T> walker,
                                          final LocusView locusView,
                                          final LocusReferenceView referenceView,
                                          final ReferenceOrderedView referenceOrderedDataView,
                                          final T sum) {
        nanoScheduler.setDebug(DEBUG);
        final TraverseLociMap myMap = new TraverseLociMap(walker);
        final TraverseLociReduce myReduce = new TraverseLociReduce(walker);

        final MapDataIterator inputIterator = new MapDataIterator(locusView, referenceView, referenceOrderedDataView);
        final T result = nanoScheduler.execute(inputIterator, myMap, sum, myReduce);

        // numIterations is incremented by the iterator's next(), so it is read only after execute() returns
        return new TraverseResults<T>(inputIterator.numIterations, result);
    }

    /**
     * Create iterator that provides inputs for all map calls into MapData, to be provided
     * to NanoScheduler for Map/Reduce
     *
     * Declared static because it only touches the views passed to its constructor and never
     * the enclosing traversal instance.
     */
    private static class MapDataIterator implements Iterator<MapData> {
        final LocusView locusView;
        final LocusReferenceView referenceView;
        final ReferenceOrderedView referenceOrderedDataView;

        /** number of elements handed out by next() so far */
        int numIterations = 0;

        private MapDataIterator(LocusView locusView, LocusReferenceView referenceView, ReferenceOrderedView referenceOrderedDataView) {
            this.locusView = locusView;
            this.referenceView = referenceView;
            this.referenceOrderedDataView = referenceOrderedDataView;
        }

        @Override
        public boolean hasNext() {
            return locusView.hasNext();
        }

        @Override
        public MapData next() {
            final AlignmentContext locus = locusView.next();
            final GenomeLoc location = locus.getLocation();

            // create reference context. Note that if we have a pileup of "extended events", the context will
            // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup).
            final ReferenceContext refContext = referenceView.getReferenceContext(location);

            // Iterate forward to get all reference ordered data covering this location
            final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(location, refContext);

            numIterations++;
            return new MapData(locus, refContext, tracker);
        }

        @Override
        public void remove() {
            throw new UnsupportedOperationException("Cannot remove elements from MapDataIterator");
        }
    }

    /**
     * Shuts the nano scheduler down before delegating to the superclass's end-of-traversal reporting.
     */
    @Override
    public void printOnTraversalDone() {
        nanoScheduler.shutdown();
        super.printOnTraversalDone();
    }

    /**
     * The input data needed for each map call: the alignment context at the site, the
     * reference context, and the RODs
     *
     * Declared static because it is a plain value holder with no use for the enclosing instance.
     */
    private static class MapData {
        final AlignmentContext alignmentContext;
        final ReferenceContext refContext;
        final RefMetaDataTracker tracker;

        private MapData(final AlignmentContext alignmentContext, ReferenceContext refContext, RefMetaDataTracker tracker) {
            this.alignmentContext = alignmentContext;
            this.refContext = refContext;
            this.tracker = tracker;
        }

        @Override
        public String toString() {
            return "MapData " + alignmentContext.getLocation();
        }
    }

    /**
     * Contains the results of a map call, indicating whether the value should be reduced or
     * skipped (because the site was filtered or the walker was done)
     *
     * Stays an inner class because it carries a value of the enclosing class's type parameter M.
     */
    private class MapResult {
        final M value;
        final boolean reduceMe;

        /**
         * Create a MapResult with value that should be reduced
         *
         * @param value the value to reduce
         */
        private MapResult(final M value) {
            this.value = value;
            this.reduceMe = true;
        }

        /**
         * Create a MapResult that shouldn't be reduced
         */
        private MapResult() {
            this.value = null;
            this.reduceMe = false;
        }
    }

    /**
     * A shared instance (one per traversal object; the field is not static) that tells reduce
     * that the result of map should be skipped (filtered or done)
     */
    private final MapResult SKIP_REDUCE = new MapResult();

    /**
     * NSMapFunction for TraverseLoci meeting NanoScheduler interface requirements
     *
     * Applies walker.filter and walker.map to MapData, returning a MapResult object containing
     * the result, or SKIP_REDUCE when the walker is done or the site is filtered out
     */
    private class TraverseLociMap implements NSMapFunction<MapData, MapResult> {
        final LocusWalker<M,T> walker;

        private TraverseLociMap(LocusWalker<M, T> walker) {
            this.walker = walker;
        }

        @Override
        public MapResult apply(final MapData data) {
            if ( ! walker.isDone() ) {
                final boolean keepMeP = walker.filter(data.tracker, data.refContext, data.alignmentContext);
                if (keepMeP) {
                    final M x = walker.map(data.tracker, data.refContext, data.alignmentContext);
                    return new MapResult(x);
                }
            }
            return SKIP_REDUCE;
        }
    }

    /**
     * NSReduceFunction for TraverseLoci meeting NanoScheduler interface requirements
     *
     * Takes a MapResult object and applies the walkers reduce function to each map result, when applicable
     */
    private class TraverseLociReduce implements NSReduceFunction<MapResult, T> {
        final LocusWalker<M,T> walker;

        private TraverseLociReduce(LocusWalker<M, T> walker) {
            this.walker = walker;
        }

        @Override
        public T apply(MapResult one, T sum) {
            if ( one.reduceMe )
                // only run reduce on values that were actually mapped (not filtered, walker not done)
                return walker.reduce(one.value, sum);
            else
                return sum;
        }
    }

    /**
     * Progress callback registered with the nano scheduler: forwards the location of the last
     * processed MapData to printProgress, skipping elements that carry no alignment context
     */
    private class TraverseLociProgress implements NSProgressFunction<MapData> {
        @Override
        public void progress(MapData lastProcessedMap) {
            if (lastProcessedMap.alignmentContext != null)
                printProgress(lastProcessedMap.alignmentContext.getLocation());
        }
    }
}

View File

@ -65,7 +65,8 @@ public class TraverseReadPairs<M,T> extends TraversalEngine<M,T, ReadPairWalker<
pairs.clear();
pairs.add(read);
printProgress(dataProvider.getShard(),null);
updateCumulativeMetrics(dataProvider.getShard());
printProgress(null);
}
done = walker.isDone();

View File

@ -99,8 +99,11 @@ public class TraverseReads<M,T> extends TraversalEngine<M,T,ReadWalker<M,T>,Read
sum = walker.reduce(x, sum);
}
GenomeLoc locus = read.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX ? null : engine.getGenomeLocParser().createGenomeLoc(read.getReferenceName(),read.getAlignmentStart());
printProgress(dataProvider.getShard(),locus);
final GenomeLoc locus = read.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX ? null : engine.getGenomeLocParser().createGenomeLoc(read.getReferenceName(),read.getAlignmentStart());
updateCumulativeMetrics(dataProvider.getShard());
printProgress(locus);
done = walker.isDone();
}
return sum;

View File

@ -34,34 +34,34 @@ import org.broadinstitute.sting.gatk.datasources.providers.ReadView;
import org.broadinstitute.sting.gatk.datasources.reads.ReadShard;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.utils.nanoScheduler.MapFunction;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.nanoScheduler.NSMapFunction;
import org.broadinstitute.sting.utils.nanoScheduler.NSReduceFunction;
import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler;
import org.broadinstitute.sting.utils.nanoScheduler.ReduceFunction;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
/**
* @author aaron
* A nano-scheduling version of TraverseReads.
*
* Implements the traversal of a walker that accepts individual reads, the reference, and
* RODs per map call. Directly supports shared memory parallelism via NanoScheduler
*
* @author depristo
* @version 1.0
* @date Apr 24, 2009
* <p/>
* Class TraverseReads
* <p/>
* This class handles traversing by reads in the new shardable style
* @date 9/2/2012
*/
public class TraverseReadsNano<M,T> extends TraversalEngine<M,T,ReadWalker<M,T>,ReadShardDataProvider> {
/** our log, which we want to capture anything from this class */
protected static final Logger logger = Logger.getLogger(TraverseReadsNano.class);
private static final boolean DEBUG = false;
private static final int MIN_GROUP_SIZE = 100;
final NanoScheduler<MapData, M, T> nanoScheduler;
final NanoScheduler<MapData, MapResult, T> nanoScheduler;
public TraverseReadsNano(int nThreads) {
final int bufferSize = ReadShard.getReadBufferSize() + 1; // actually has 1 more than max
final int mapGroupSize = (int)Math.max(Math.ceil(bufferSize / 50.0 + 1), MIN_GROUP_SIZE);
nanoScheduler = new NanoScheduler<MapData, M, T>(bufferSize, mapGroupSize, nThreads);
nanoScheduler = new NanoScheduler<MapData, MapResult, T>(bufferSize, nThreads);
}
@Override
@ -89,19 +89,32 @@ public class TraverseReadsNano<M,T> extends TraversalEngine<M,T,ReadWalker<M,T>,
final TraverseReadsMap myMap = new TraverseReadsMap(walker);
final TraverseReadsReduce myReduce = new TraverseReadsReduce(walker);
T result = nanoScheduler.execute(aggregateMapData(dataProvider).iterator(), myMap, sum, myReduce);
// TODO -- how do we print progress?
//printProgress(dataProvider.getShard(), ???);
final List<MapData> aggregatedInputs = aggregateMapData(dataProvider);
final T result = nanoScheduler.execute(aggregatedInputs.iterator(), myMap, sum, myReduce);
final GATKSAMRecord lastRead = aggregatedInputs.get(aggregatedInputs.size() - 1).read;
final GenomeLoc locus = engine.getGenomeLocParser().createGenomeLoc(lastRead);
updateCumulativeMetrics(dataProvider.getShard());
printProgress(locus);
return result;
}
/**
* Aggregate all of the inputs for all map calls into MapData, to be provided
* to NanoScheduler for Map/Reduce
*
* @param dataProvider the source of our data
* @return a linked list of MapData objects holding the read, ref, and ROD info for every map/reduce
* should execute
*/
private List<MapData> aggregateMapData(final ReadShardDataProvider dataProvider) {
final ReadView reads = new ReadView(dataProvider);
final ReadReferenceView reference = new ReadReferenceView(dataProvider);
final ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider);
final List<MapData> mapData = new ArrayList<MapData>(); // TODO -- need size of reads
final List<MapData> mapData = new LinkedList<MapData>();
for ( final SAMRecord read : reads ) {
final ReferenceContext refContext = ! read.getReadUnmappedFlag()
? reference.getReferenceContext(read)
@ -127,19 +140,9 @@ public class TraverseReadsNano<M,T> extends TraversalEngine<M,T,ReadWalker<M,T>,
super.printOnTraversalDone();
}
private class TraverseReadsReduce implements ReduceFunction<M, T> {
final ReadWalker<M,T> walker;
private TraverseReadsReduce(ReadWalker<M, T> walker) {
this.walker = walker;
}
@Override
public T apply(M one, T sum) {
return walker.reduce(one, sum);
}
}
/**
* The input data needed for each map call. The read, the reference, and the RODs
*/
private class MapData {
final GATKSAMRecord read;
final ReferenceContext refContext;
@ -152,7 +155,43 @@ public class TraverseReadsNano<M,T> extends TraversalEngine<M,T,ReadWalker<M,T>,
}
}
private class TraverseReadsMap implements MapFunction<MapData, M> {
/**
 * Outcome of a single map call: either a value that should be handed to reduce, or a marker
 * saying the call produced nothing to reduce (the read was filtered or the walker was done)
 */
private class MapResult {
    final M value;
    final boolean reduceMe;

    /**
     * Shared initializer used by both public-facing constructors
     *
     * @param value    the mapped value, or null when there is nothing to reduce
     * @param reduceMe whether reduce should be applied to value
     */
    private MapResult(final M value, final boolean reduceMe) {
        this.value = value;
        this.reduceMe = reduceMe;
    }

    /**
     * Create a MapResult with value that should be reduced
     *
     * @param value the value to reduce
     */
    private MapResult(final M value) {
        this(value, true);
    }

    /**
     * Create a MapResult that shouldn't be reduced
     */
    private MapResult() {
        this(null, false);
    }
}
/**
 * A shared instance (one per traversal object; the field is not static) that tells reduce
 * that the result of map should be skipped (filtered or done)
 */
private final MapResult SKIP_REDUCE = new MapResult();
/**
* MapFunction for TraverseReads meeting NanoScheduler interface requirements
*
* Applies walker.map to MapData, returning a MapResult object containing the result
*/
private class TraverseReadsMap implements NSMapFunction<MapData, MapResult> {
final ReadWalker<M,T> walker;
private TraverseReadsMap(ReadWalker<M, T> walker) {
@ -160,15 +199,36 @@ public class TraverseReadsNano<M,T> extends TraversalEngine<M,T,ReadWalker<M,T>,
}
@Override
public M apply(final MapData data) {
public MapResult apply(final MapData data) {
if ( ! walker.isDone() ) {
final boolean keepMeP = walker.filter(data.refContext, data.read);
if (keepMeP) {
return walker.map(data.refContext, data.read, data.tracker);
}
if (keepMeP)
return new MapResult(walker.map(data.refContext, data.read, data.tracker));
}
return null; // TODO -- what should we return in the case where the walker is done or the read is filtered?
return SKIP_REDUCE;
}
}
/**
 * Reduce step for TraverseReadsNano in the form the NanoScheduler expects
 *
 * Folds a MapResult into the running sum with the walker's reduce function, leaving the sum
 * untouched for results flagged as not reducible
 */
private class TraverseReadsReduce implements NSReduceFunction<MapResult, T> {
    final ReadWalker<M,T> walker;

    private TraverseReadsReduce(ReadWalker<M, T> walker) {
        this.walker = walker;
    }

    @Override
    public T apply(MapResult one, T sum) {
        // results produced for filtered reads or a finished walker pass straight through
        if ( ! one.reduceMe )
            return sum;

        return walker.reduce(one.value, sum);
    }
}
}

View File

@ -1,6 +1,6 @@
package org.broadinstitute.sting.gatk.walkers;
import org.broadinstitute.sting.gatk.DownsampleType;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import java.lang.annotation.*;

View File

@ -45,7 +45,7 @@ import java.text.NumberFormat;
*/
@DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} )
@Requires({DataSource.READS})
public class FlagStat extends ReadWalker<FlagStat.FlagStatus, FlagStat.FlagStatus> implements ThreadSafeMapReduce {
public class FlagStat extends ReadWalker<FlagStat.FlagStatus, FlagStat.FlagStatus> implements NanoSchedulable {
@Output
PrintStream out;

View File

@ -27,5 +27,5 @@ package org.broadinstitute.sting.gatk.walkers;
* declare that their map function is thread-safe and so multiple
* map calls can be run in parallel in the same JVM instance.
*/
public interface ThreadSafeMapReduce {
public interface NanoSchedulable {
}

View File

@ -45,25 +45,14 @@ import java.util.Collections;
import java.util.List;
/**
* Prints the alignment in the pileup format. In the pileup format, each line represents a genomic position,
* consisting of chromosome name, coordinate, reference base, read bases, read qualities and alignment mapping
* qualities. Information on match, mismatch, indel, strand, mapping quality and start and end of a read are all
* encoded at the read base column. At this column, a dot stands for a match to the reference base on the forward strand,
* a comma for a match on the reverse strand, 'ACGTN' for a mismatch on the forward strand and 'acgtn' for a mismatch on the
* reverse strand.
*
* A pattern '\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion between this reference position and the next
* reference position. The length of the insertion is given by the integer in the pattern, followed by the inserted sequence.
* Similarly, a pattern '-[0-9]+[ACGTNacgtn]+' represents a deletion from the reference.
* Also at the read base column, a symbol '^' marks the start of a read segment which is a contiguous subsequence on the read
* separated by 'N/S/H' CIGAR operations. The ASCII of the character following '^' minus 33 gives the mapping quality.
* A symbol '$' marks the end of a read segment.
* Prints the alignment in something similar to the samtools pileup format. Each line represents a genomic position,
* consisting of chromosome name, coordinate, reference base, read bases, and read qualities.
*
* Associated command:
* samtools pileup [-f in.ref.fasta] [-t in.ref_list] [-l in.site_list] [-iscg] [-T theta] [-N nHap] [-r pairDiffRate] <in.alignment>
*/
@DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} )
public class Pileup extends LocusWalker<Integer, Integer> implements TreeReducible<Integer> {
public class Pileup extends LocusWalker<String, Integer> implements TreeReducible<Integer>, NanoSchedulable {
private static final String verboseDelimiter = "@"; // it's ugly to use "@" but it's literally the only usable character not allowed in read names
@ -81,27 +70,32 @@ public class Pileup extends LocusWalker<Integer, Integer> implements TreeReducib
@Input(fullName="metadata",shortName="metadata",doc="Add these ROD bindings to the output Pileup", required=false)
public List<RodBinding<Feature>> rods = Collections.emptyList();
public void initialize() {
}
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
String rods = getReferenceOrderedData( tracker );
@Override
public String map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
final String rods = getReferenceOrderedData( tracker );
ReadBackedPileup basePileup = context.getBasePileup();
out.printf("%s %s", basePileup.getPileupString((char)ref.getBase()), rods);
if ( SHOW_VERBOSE )
out.printf(" %s", createVerboseOutput(basePileup));
out.println();
return 1;
final StringBuilder s = new StringBuilder();
s.append(String.format("%s %s", basePileup.getPileupString((char)ref.getBase()), rods));
if ( SHOW_VERBOSE )
s.append(" ").append(createVerboseOutput(basePileup));
s.append("\n");
return s.toString();
}
// Given result of map function
@Override
public Integer reduceInit() { return 0; }
public Integer reduce(Integer value, Integer sum) {
return treeReduce(sum,value);
@Override
public Integer reduce(String value, Integer sum) {
out.print(value);
return sum + 1;
}
@Override
public Integer treeReduce(Integer lhs, Integer rhs) {
return lhs + rhs;
}

View File

@ -93,7 +93,7 @@ import java.util.*;
@ReadTransformersMode(ApplicationTime = ReadTransformer.ApplicationTime.HANDLED_IN_WALKER)
@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = ReadTransformer.ApplicationTime.HANDLED_IN_WALKER)
@Requires({DataSource.READS, DataSource.REFERENCE})
public class PrintReads extends ReadWalker<GATKSAMRecord, SAMFileWriter> implements ThreadSafeMapReduce {
public class PrintReads extends ReadWalker<GATKSAMRecord, SAMFileWriter> implements NanoSchedulable {
@Output(doc="Write output to this BAM filename instead of STDOUT", required = true)
SAMFileWriter out;
@ -228,7 +228,6 @@ public class PrintReads extends ReadWalker<GATKSAMRecord, SAMFileWriter> impleme
GATKSAMRecord workingRead = readIn;
for ( final ReadTransformer transformer : readTransformers ) {
if ( logger.isDebugEnabled() ) logger.debug("Applying transformer " + transformer + " to read " + readIn.getReadName());
workingRead = transformer.apply(workingRead);
}

View File

@ -13,7 +13,7 @@ package org.broadinstitute.sting.gatk.walkers;
* shards of the data can reduce with each other, and the composite result
* can be reduced with other composite results.
*/
public interface TreeReducible<ReduceType> extends ThreadSafeMapReduce {
public interface TreeReducible<ReduceType> {
/**
* A composite, 'reduce of reduces' function.
* @param lhs 'left-most' portion of data in the composite reduce.

View File

@ -33,6 +33,9 @@ public class AlleleBalanceBySample extends GenotypeAnnotation implements Experim
final Genotype g,
final GenotypeBuilder gb,
final PerReadAlleleLikelihoodMap alleleLikelihoodMap){
if ( stratifiedContext == null )
return;
Double ratio = annotateSNP(stratifiedContext, vc, g);
if (ratio == null)
return;

View File

@ -54,7 +54,7 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa
final Genotype g,
final GenotypeBuilder gb,
final PerReadAlleleLikelihoodMap alleleLikelihoodMap) {
if ( g == null || !g.isCalled() )
if ( g == null || !g.isCalled() || ( stratifiedContext == null && alleleLikelihoodMap == null) )
return;
if (alleleLikelihoodMap != null && !alleleLikelihoodMap.isEmpty())

View File

@ -55,7 +55,7 @@ public class MappingQualityZeroBySample extends GenotypeAnnotation {
final Genotype g,
final GenotypeBuilder gb,
final PerReadAlleleLikelihoodMap alleleLikelihoodMap){
if ( g == null || !g.isCalled() )
if ( g == null || !g.isCalled() || stratifiedContext == null )
return;
int mq0 = 0;

View File

@ -300,16 +300,12 @@ public class VariantAnnotatorEngine {
if (stratifiedPerReadAlleleLikelihoodMap != null)
perReadAlleleLikelihoodMap = stratifiedPerReadAlleleLikelihoodMap.get(genotype.getSampleName());
if ( context == null && perReadAlleleLikelihoodMap == null) {
// no likelihoods nor pileup available: just move on to next sample
genotypes.add(genotype);
} else {
final GenotypeBuilder gb = new GenotypeBuilder(genotype);
for ( final GenotypeAnnotation annotation : requestedGenotypeAnnotations ) {
annotation.annotate(tracker, walker, ref, context, vc, genotype, gb, perReadAlleleLikelihoodMap);
}
genotypes.add(gb.make());
final GenotypeBuilder gb = new GenotypeBuilder(genotype);
for ( final GenotypeAnnotation annotation : requestedGenotypeAnnotations ) {
annotation.annotate(tracker, walker, ref, context, vc, genotype, gb, perReadAlleleLikelihoodMap);
}
genotypes.add(gb.make());
}
return genotypes;

View File

@ -109,7 +109,7 @@ import java.util.ArrayList;
@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class}) // only look at covered loci, not every loci of the reference file
@Requires({DataSource.READS, DataSource.REFERENCE}) // filter out all reads with zero or unavailable mapping quality
@PartitionBy(PartitionType.LOCUS) // this walker requires both -I input.bam and -R reference.fasta
public class BaseRecalibrator extends LocusWalker<Long, Long> implements TreeReducible<Long> {
public class BaseRecalibrator extends LocusWalker<Long, Long> implements TreeReducible<Long>, NanoSchedulable {
@ArgumentCollection
private final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); // all the command line arguments for BQSR and it's covariates

View File

@ -30,7 +30,7 @@ import org.broadinstitute.sting.commandline.Advanced;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.DownsampleType;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;

View File

@ -27,7 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.DownsampleType;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
@ -125,7 +125,7 @@ import java.util.*;
// TODO -- When LocusIteratorByState gets cleaned up, we should enable multiple @By sources:
// TODO -- @By( {DataSource.READS, DataSource.REFERENCE_ORDERED_DATA} )
@Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=250)
public class UnifiedGenotyper extends LocusWalker<List<VariantCallContext>, UnifiedGenotyper.UGStatistics> implements TreeReducible<UnifiedGenotyper.UGStatistics>, AnnotatorCompatible {
public class UnifiedGenotyper extends LocusWalker<List<VariantCallContext>, UnifiedGenotyper.UGStatistics> implements TreeReducible<UnifiedGenotyper.UGStatistics>, AnnotatorCompatible, NanoSchedulable {
@ArgumentCollection
private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();

View File

@ -57,7 +57,7 @@ import java.util.TreeSet;
* is minimized across all the reads. In general, a large percent of regions requiring local realignment are due to the presence of an insertion
* or deletion (indels) in the individual's genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching
* the reference near the misalignment, which are easily mistaken as SNPs. Moreover, since read mapping algorithms operate on each read independently,
* it is impossible to place reads on the reference genome such at mismatches are minimized across all reads. Consequently, even when some reads are
* it is impossible to place reads on the reference genome such that mismatches are minimized across all reads. Consequently, even when some reads are
* correctly mapped with indels, reads covering the indel near just the start or end of the read are often incorrectly mapped with respect to the true indel,
* also requiring realignment. Local realignment serves to transform regions with misalignments due to indels into clean reads containing a consensus
* indel suitable for standard variant discovery approaches. Unlike most mappers, this walker uses the full alignment context to determine whether an
@ -69,7 +69,7 @@ import java.util.TreeSet;
* <li>Running the realigner over those intervals (see the IndelRealigner tool)</li>
* </ol>
* <p>
* An important note: the input bam(s), reference, and known indel file(s) should be the same ones to be used for the IndelRealigner step.
* An important note: the input BAM(s), reference, and known indel file(s) should be the same ones to be used for the IndelRealigner step.
* <p>
* Another important note: because reads produced from the 454 technology inherently contain false indels, the realigner will not currently work with them
* (or with reads from similar technologies). This tool also ignores MQ0 reads and reads with consecutive indel operators in the CIGAR string.

View File

@ -541,7 +541,7 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
//Get a Map of genotype likelihoods.
//In case of null, unavailable or no call, all likelihoods are 1/3.
private EnumMap<GenotypeType,Double> getLikelihoodsAsMapSafeNull(Genotype genotype){
if(genotype == null || !genotype.isCalled()){
if(genotype == null || !genotype.isCalled() || genotype.getLikelihoods() == null){
EnumMap<GenotypeType,Double> likelihoods = new EnumMap<GenotypeType, Double>(GenotypeType.class);
likelihoods.put(GenotypeType.HOM_REF,1.0/3.0);
likelihoods.put(GenotypeType.HET,1.0/3.0);

View File

@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.NanoSchedulable;
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
@ -40,7 +41,7 @@ import java.io.PrintStream;
*
*/
@DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} )
public class CountLoci extends LocusWalker<Integer, Long> implements TreeReducible<Long> {
public class CountLoci extends LocusWalker<Integer, Long> implements TreeReducible<Long>, NanoSchedulable {
@Output(doc="Write count to this file instead of STDOUT")
PrintStream out;

View File

@ -37,6 +37,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
import org.broadinstitute.sting.gatk.walkers.NanoSchedulable;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
import org.broadinstitute.sting.utils.GenomeLoc;
@ -73,7 +74,7 @@ import java.util.*;
*
*/
@DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} )
public class CountRODs extends RodWalker<CountRODs.Datum, Pair<ExpandingArrayList<Long>, Long>> implements TreeReducible<Pair<ExpandingArrayList<Long>, Long>> {
public class CountRODs extends RodWalker<CountRODs.Datum, Pair<ExpandingArrayList<Long>, Long>> implements TreeReducible<Pair<ExpandingArrayList<Long>, Long>>, NanoSchedulable {
@Output
public PrintStream out;

View File

@ -4,9 +4,9 @@ import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.NanoSchedulable;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.ThreadSafeMapReduce;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -41,7 +41,7 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
*/
@DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} )
@Requires({DataSource.READS, DataSource.REFERENCE})
public class CountReads extends ReadWalker<Integer, Integer> implements ThreadSafeMapReduce {
public class CountReads extends ReadWalker<Integer, Integer> implements NanoSchedulable {
public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker tracker) {
return 1;
}

View File

@ -24,6 +24,7 @@
package org.broadinstitute.sting.gatk.walkers.qc;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.gatk.CommandLineGATK;
@ -45,20 +46,23 @@ public class ErrorThrowing extends RodWalker<Integer,Integer> implements TreeRed
@Input(fullName="exception", shortName = "E", doc="Java class of exception to throw", required=true)
public String exceptionToThrow;
@Argument(fullName = "failMethod", shortName = "fail", doc = "Determines which method to fail in", required = false)
public FailMethod failMethod = FailMethod.MAP;
public enum FailMethod {
MAP,
REDUCE,
TREE_REDUCE
}
//
// Template code to allow us to build the walker, doesn't actually do anything
//
@Override
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( exceptionToThrow.equals("UserException") ) {
throw new UserException("UserException");
} else if ( exceptionToThrow.equals("NullPointerException") ) {
throw new NullPointerException();
} else if ( exceptionToThrow.equals("ReviewedStingException") ) {
throw new ReviewedStingException("ReviewedStingException");
} else {
throw new UserException.BadArgumentValue("exception", "exception isn't a recognized value " + exceptionToThrow);
}
if ( failMethod == FailMethod.MAP )
fail();
return 0;
}
@Override
@ -68,10 +72,32 @@ public class ErrorThrowing extends RodWalker<Integer,Integer> implements TreeRed
/**
 * Combines one map result with the running sum.
 *
 * When failMethod is FailMethod.REDUCE this calls fail() first, which throws the
 * exception selected by exceptionToThrow instead of returning.
 *
 * @param value the result of a single map call
 * @param sum the running total so far
 * @return value + sum
 */
@Override
public Integer reduce(Integer value, Integer sum) {
if ( failMethod == FailMethod.REDUCE )
fail();
return value + sum;
}
/**
 * Combines two partial reduce results.
 *
 * When failMethod is FailMethod.TREE_REDUCE this calls fail() first, which throws the
 * exception selected by exceptionToThrow instead of returning.
 *
 * @param lhs the left-hand partial result
 * @param rhs the right-hand partial result
 * @return lhs + rhs
 */
public Integer treeReduce(final Integer lhs, final Integer rhs) {
if ( failMethod == FailMethod.TREE_REDUCE )
fail();
return lhs + rhs;
}
/**
 * Throws the exception selected by the exceptionToThrow argument; this method never returns normally.
 *
 * Recognized values are UserException, NullPointerException, ReviewedStingException,
 * SamError1, SamError2 and NoSpace.  Any other value results in a
 * UserException.BadArgumentValue describing the unrecognized name.
 */
private void fail() {
    final String requested = exceptionToThrow;

    // every branch throws, so a flat sequence of guards is equivalent to an else-if chain
    if ( requested.equals("UserException") )
        throw new UserException("UserException");

    if ( requested.equals("NullPointerException") )
        throw new NullPointerException();

    if ( requested.equals("ReviewedStingException") )
        throw new ReviewedStingException("ReviewedStingException");

    if ( requested.equals("SamError1") )
        throw new RuntimeException(CommandLineGATK.PICARD_TEXT_SAM_FILE_ERROR_1);

    if ( requested.equals("SamError2") )
        throw new RuntimeException(CommandLineGATK.PICARD_TEXT_SAM_FILE_ERROR_2);

    if ( requested.equals("NoSpace") )
        throw new net.sf.samtools.util.RuntimeIOException(new java.io.IOException("No space left on device java.io.FileOutputStream.writeBytes(Native Method)"));

    // nothing matched: report the bad argument value back to the user
    throw new UserException.BadArgumentValue("exception", "exception isn't a recognized value " + exceptionToThrow);
}
}

View File

@ -75,6 +75,17 @@ public class MathUtils {
}
}
/**
 * Get a random int between min and max (inclusive) using the global GATK random number generator
 *
 * The width of the range is computed in long arithmetic so that extreme bounds cannot
 * silently wrap around into a bogus (but positive) bound for the generator.
 *
 * @param min lower bound of the range
 * @param max upper bound of the range
 * @return a random int >= min and <= max
 * @throws IllegalArgumentException if max is less than min, or if the range covers more
 *                                  than Integer.MAX_VALUE distinct values
 */
public static int randomIntegerInRange( int min, int max ) {
    final long rangeSize = (long) max - (long) min + 1L;
    if ( rangeSize < 1L || rangeSize > Integer.MAX_VALUE )
        throw new IllegalArgumentException("Invalid range [" + min + ", " + max + "]: max must be >= min and the range must contain at most " + Integer.MAX_VALUE + " values");

    // nextInt(bound) yields a value in [0, bound), so adding min gives a value in [min, max]
    return GenomeAnalysisEngine.getRandomGenerator().nextInt((int) rangeSize) + min;
}
// A fast implementation of the Math.round() method. This method does not perform
// under/overflow checking, so this shouldn't be used in the general case (but is fine
// if one is already making those checks before calling in to the rounding).
@ -1655,5 +1666,4 @@ public class MathUtils {
return result;
}
}

View File

@ -1,18 +1,42 @@
package org.broadinstitute.sting.utils;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import java.util.concurrent.TimeUnit;
/**
* A useful simple system for timing code. This code is not thread safe!
* A useful simple system for timing code with nano second resolution
*
* Note that this code is not thread-safe. If you have a single timer
* being started and stopped by multiple threads you will need to protect the
* calls to avoid meaningless results of having multiple starts and stops
* called sequentially.
*
* User: depristo
* Date: Dec 10, 2010
* Time: 9:07:44 AM
*/
public class SimpleTimer {
final private String name;
private long elapsed = 0l;
private long startTime = 0l;
boolean running = false;
protected static final double NANO_TO_SECOND_DOUBLE = 1.0 / TimeUnit.SECONDS.toNanos(1);
private final String name;
/**
* The elapsedTimeNano time in nanoSeconds of this timer. The elapsedTimeNano time is the
* sum of times between starts/restarts and stops.
*/
private long elapsedTimeNano = 0l;
/**
* The start time of the last start/restart in nanoSeconds
*/
private long startTimeNano = 0l;
/**
* Is this timer currently running (i.e., the last call was start/restart)
*/
private boolean running = false;
/**
* Creates an anonymous simple timer
@ -25,7 +49,8 @@ public class SimpleTimer {
* Creates a simple timer named name
* @param name of the timer, must not be null
*/
public SimpleTimer(String name) {
public SimpleTimer(final String name) {
if ( name == null ) throw new IllegalArgumentException("SimpleTimer name cannot be null");
this.name = name;
}
@ -37,27 +62,27 @@ public class SimpleTimer {
}
/**
* Starts the timer running, and sets the elapsed time to 0. This is equivalent to
* Starts the timer running, and sets the elapsedTimeNano time to 0. This is equivalent to
* resetting the time to have no history at all.
*
* @return this object, for programming convenience
*/
@Ensures("elapsedTimeNano == 0l")
public synchronized SimpleTimer start() {
elapsed = 0l;
restart();
return this;
elapsedTimeNano = 0l;
return restart();
}
/**
* Starts the timer running, without reseting the elapsed time. This function may be
* Starts the timer running, without resetting the elapsedTimeNano time. This function may be
* called without first calling start(). The only difference between start and restart
* is that start resets the elapsed time, while restart does not.
* is that start resets the elapsedTimeNano time, while restart does not.
*
* @return this object, for programming convenience
*/
public synchronized SimpleTimer restart() {
running = true;
startTime = currentTime();
startTimeNano = currentTimeNano();
return this;
}
@ -71,29 +96,53 @@ public class SimpleTimer {
/**
* @return A convenience function to obtain the current time in milliseconds from this timer
*/
public synchronized long currentTime() {
public long currentTime() {
return System.currentTimeMillis();
}
/**
* Stops the timer. Increases the elapsed time by difference between start and now. The
* timer must be running in order to call stop
* @return A convenience function to obtain the current time in nanoSeconds from this timer
*/
public long currentTimeNano() {
return System.nanoTime();
}
/**
* Stops the timer. Increases the elapsedTimeNano time by difference between start and now.
*
* It's ok to call stop on a timer that's not running. It has no effect on the timer.
*
* @return this object, for programming convenience
*/
@Requires("startTimeNano != 0l")
public synchronized SimpleTimer stop() {
running = false;
elapsed += currentTime() - startTime;
if ( running ) {
running = false;
elapsedTimeNano += currentTimeNano() - startTimeNano;
}
return this;
}
/**
* Returns the total elapsed time of all start/stops of this timer. If the timer is currently
* Returns the total elapsedTimeNano time of all start/stops of this timer. If the timer is currently
* running, includes the difference from currentTime() and the start as well
*
* @return this time, in seconds
*/
public synchronized double getElapsedTime() {
return (running ? (currentTime() - startTime + elapsed) : elapsed) / 1000.0;
return nanoToSecondsAsDouble(getElapsedTimeNano());
}
protected static double nanoToSecondsAsDouble(final long nano) {
return nano * NANO_TO_SECOND_DOUBLE;
}
/**
* @see #getElapsedTime() but returns the result in nanoseconds
*
* @return the elapsed time in nanoseconds
*/
public synchronized long getElapsedTimeNano() {
return running ? (currentTimeNano() - startTimeNano + elapsedTimeNano) : elapsedTimeNano;
}
}

View File

@ -27,6 +27,8 @@ package org.broadinstitute.sting.utils.classloader;
import ch.qos.logback.classic.Level;
import ch.qos.logback.classic.Logger;
import org.broadinstitute.sting.gatk.WalkerManager;
import org.broadinstitute.sting.gatk.filters.FilterManager;
import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
@ -276,8 +278,16 @@ public class PluginManager<PluginType> {
*/
public PluginType createByName(String pluginName) {
Class<? extends PluginType> plugin = pluginsByName.get(pluginName);
if( plugin == null )
throw new UserException(formatErrorMessage(pluginCategory,pluginName));
if( plugin == null ) {
String errorMessage = formatErrorMessage(pluginCategory,pluginName);
if ( this.getClass().isAssignableFrom(FilterManager.class) ) {
throw new UserException.MalformedReadFilterException(errorMessage);
} else if ( this.getClass().isAssignableFrom(WalkerManager.class) ) {
throw new UserException.MalformedWalkerArgumentsException(errorMessage);
} else {
throw new UserException.CommandLineException(errorMessage);
}
}
try {
return plugin.newInstance();
} catch (Exception e) {

View File

@ -63,6 +63,18 @@ public class UserException extends ReviewedStingException {
}
}
/**
 * Command line error describing a malformed read filter.
 *
 * The supplied message is prefixed with "Malformed read filter: ".
 */
public static class MalformedReadFilterException extends CommandLineException {
public MalformedReadFilterException(String message) {
super(String.format("Malformed read filter: %s",message));
}
}
/**
 * Command line error describing a malformed walker argument.
 *
 * The supplied message is prefixed with "Malformed walker argument: ".
 */
public static class MalformedWalkerArgumentsException extends CommandLineException {
public MalformedWalkerArgumentsException(String message) {
super(String.format("Malformed walker argument: %s",message));
}
}
public static class MalformedGenomeLoc extends UserException {
public MalformedGenomeLoc(String message, GenomeLoc loc) {
super(String.format("Badly formed genome loc: %s: %s", message, loc));
@ -129,6 +141,12 @@ public class UserException extends ReviewedStingException {
}
}
/**
 * User error reporting that a write failed because there is no space left on the device.
 *
 * Carries a fixed message and takes no arguments.
 */
public static class NoSpaceOnDevice extends UserException {
public NoSpaceOnDevice() {
super("There is no space left on the device, so writing failed");
}
}
public static class CouldNotReadInputFile extends UserException {
public CouldNotReadInputFile(String message, Exception e) {
super(String.format("Couldn't read file because %s caused by %s", message, getMessage(e)));

View File

@ -0,0 +1,82 @@
package org.broadinstitute.sting.utils.nanoScheduler;
import com.google.java.contract.Invariant;
/**
* Wrapper to hold data for a blocking queue, distinguishing an EOF marker from a real object
*
* The only way to tell in a consumer thread that a blocking queue has no more data ever
* coming down the pipe is to pass in a "poison" or EOF object. This class provides
* a generic capacity for that...
*
* The use case looks like this:
*
* BlockingQueue q
* producer:
* while ( x has items )
* q.put(new BlockingQueueValue(x))
* q.put(new BlockingQueueValue())
*
* Consumer:
* while ( true )
* value = q.take()
* if ( value.isLast() )
* break
* else
* do something useful with value
*
*
* User: depristo
* Date: 9/6/12
* Time: 3:08 PM
*/
@Invariant("! isLast || value == null")
class BlockingQueueValue<T> {
    /**
     * Set only on the EOF marker object; false for every wrapper that carries a real value
     */
    private final boolean isLast;

    /**
     * The wrapped value; always null on the EOF marker
     */
    private final T value;

    /**
     * Wrap a real value.  The resulting object is not the EOF marker.
     *
     * @param value the value to carry through the queue
     */
    BlockingQueueValue(final T value) {
        this.value = value;
        this.isLast = false;
    }

    /**
     * Create the EOF marker, which carries no value
     */
    BlockingQueueValue() {
        this.value = null;
        this.isLast = true;
    }

    /**
     * Tells a consumer whether this object is the EOF marker
     *
     * @return true for the EOF marker, false for a wrapper around a real value
     */
    public boolean isLast() {
        return isLast;
    }

    /**
     * Unwrap the value carried by this object
     *
     * @return the wrapped value
     * @throws IllegalStateException if this object is the EOF marker
     */
    public T getValue() {
        if ( ! isLast() )
            return value;
        throw new IllegalStateException("Cannot get value for last object");
    }
}

View File

@ -0,0 +1,45 @@
package org.broadinstitute.sting.utils.nanoScheduler;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
/**
 * Create a future that simply returns a given value
 *
 * The only standard way to create a future in java is via the ExecutorService interface.
 * If you have a data structure holding futures of value T, and you want to add a
 * value to it for some reason (to add a EOF marker, for instance) you can use this
 * class to create a dummy {@code Future<T>} that simply returns a value.
 *
 * A FutureValue is complete from the moment it is constructed: isDone() is always true,
 * get() never blocks, and the future can never be cancelled.
 *
 * @author depristo
 * @since 09/12
 */
class FutureValue<V> implements Future<V> {
    /**
     * The value handed back by every call to get()
     */
    final V value;

    /**
     * Create an already-completed future holding value
     *
     * @param value the value to return from get(); stored as given
     */
    FutureValue(final V value) {
        this.value = value;
    }

    /**
     * Cancellation always fails because this future has already completed.
     *
     * Returning false keeps this method consistent with isCancelled(), as the
     * Future contract requires.
     *
     * @param mayInterruptIfRunning ignored, as there is no running task
     * @return false, always
     */
    @Override public boolean cancel(boolean mayInterruptIfRunning) {
        return false;
    }

    /**
     * @return false, always: this future cannot be cancelled
     */
    @Override public boolean isCancelled() {
        return false;
    }

    /**
     * @return true, always: the value is available immediately
     */
    @Override public boolean isDone() {
        return true;
    }

    /**
     * @return the value supplied at construction, without blocking
     */
    @Override public V get() throws InterruptedException, ExecutionException {
        return value;
    }

    /**
     * Same as get(); the timeout is irrelevant because the value is already available
     *
     * @param timeout ignored
     * @param unit ignored
     * @return the value supplied at construction, without blocking
     */
    @Override public V get(long timeout, TimeUnit unit) throws InterruptedException, ExecutionException, TimeoutException {
        return get();
    }
}

View File

@ -0,0 +1,62 @@
package org.broadinstitute.sting.utils.nanoScheduler;
import org.broadinstitute.sting.utils.SimpleTimer;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.Iterator;
import java.util.concurrent.BlockingQueue;
/**
 * Producer Thread that reads input values from an inputReader and puts them into a BlockingQueue
 *
 * Each element returned by the iterator is wrapped in an InputValue and put on the
 * output queue.  Once the iterator is exhausted a single EOF InputValue is enqueued so
 * that the consumer knows no more data is coming.
 */
class InputProducer<InputType> implements Runnable {
    /**
     * The iterator we are using to get data from
     */
    final Iterator<InputType> inputReader;

    /**
     * Our timer (may be null) that we use to track our input costs
     */
    final SimpleTimer inputTimer;

    /**
     * Where we put our input values for consumption
     */
    final BlockingQueue<InputValue> outputQueue;

    /**
     * Create a new producer
     *
     * @param inputReader the source of input values, must not be null
     * @param inputTimer timer accumulating the time spent in inputReader.next(), or null to disable timing
     * @param outputQueue the queue receiving the wrapped inputs and the final EOF marker, must not be null
     * @throws IllegalArgumentException if inputReader or outputQueue is null
     */
    public InputProducer(final Iterator<InputType> inputReader,
                         final SimpleTimer inputTimer,
                         final BlockingQueue<InputValue> outputQueue) {
        if ( inputReader == null ) throw new IllegalArgumentException("inputReader cannot be null");
        if ( outputQueue == null ) throw new IllegalArgumentException("OutputQueue cannot be null");

        this.inputReader = inputReader;
        this.inputTimer = inputTimer;
        this.outputQueue = outputQueue;
    }

    /**
     * Drain the iterator into the output queue, then enqueue the EOF marker.
     *
     * Only the call to inputReader.next() is timed; time spent blocked on the queue is not.
     *
     * @throws ReviewedStingException if the thread is interrupted while waiting to put a value on the queue
     */
    @Override
    public void run() {
        try {
            while ( inputReader.hasNext() ) {
                if ( inputTimer != null ) inputTimer.restart();
                final InputType input = inputReader.next();
                if ( inputTimer != null ) inputTimer.stop();
                outputQueue.put(new InputValue(input));
            }

            // add the EOF object so our consumer knows we are done in all inputs
            outputQueue.put(new InputValue());
        } catch (InterruptedException ex) {
            // put() cleared the interrupt status when it threw; restore it so code further up
            // the stack can still observe that this thread was asked to stop
            Thread.currentThread().interrupt();
            throw new ReviewedStingException("got execution exception", ex);
        }
    }

    /**
     * Helper class that contains a read value suitable for EOF marking in a BlockingQueue
     */
    class InputValue extends BlockingQueueValue<InputType> {
        private InputValue(InputType datum) { super(datum); }
        private InputValue() { }
    }
}

View File

@ -0,0 +1,36 @@
package org.broadinstitute.sting.utils.nanoScheduler;
/**
 * The outcome of a single map job, wrapped so it can travel through a BlockingQueue
 * in a producer/consumer arrangement (including an EOF marker form)
 */
class MapResult<MapType> extends BlockingQueueValue<MapType> {
    /**
     * ID of the map job that produced this result; Integer.MAX_VALUE on the EOF marker
     */
    final int jobID;

    /**
     * Create a new MapResult holding value datum for the job with ID jobID
     *
     * @param datum the value produced by the map job
     * @param jobID the id of the map job (for correctness testing), must be >= 0
     * @throws IllegalArgumentException if jobID is negative
     */
    MapResult(final MapType datum, final int jobID) {
        super(datum);
        if ( jobID < 0 )
            throw new IllegalArgumentException("JobID must be >= 0");
        this.jobID = jobID;
    }

    /**
     * Create the EOF marker version of MapResult
     */
    MapResult() {
        // the implicit super() call builds the EOF marker
        jobID = Integer.MAX_VALUE;
    }

    /**
     * @return the job ID of the map job that produced this MapResult
     */
    public int getJobID() {
        return jobID;
    }
}

View File

@ -9,7 +9,7 @@ package org.broadinstitute.sting.utils.nanoScheduler;
* Date: 8/24/12
* Time: 9:49 AM
*/
public interface MapFunction<InputType, ResultType> {
public interface NSMapFunction<InputType, ResultType> {
/**
* Return function on input, returning a value of ResultType
* @param input

View File

@ -0,0 +1,12 @@
package org.broadinstitute.sting.utils.nanoScheduler;
/**
 * Callback interface for reporting progress, parameterized by the type of the map inputs
 *
 * User: depristo
 * Date: 9/4/12
 * Time: 2:10 PM
 */
public interface NSProgressFunction<InputType> {
    /**
     * Report progress
     *
     * @param lastMapInput presumably the most recent input handed to map -- confirm against the caller
     */
    void progress(InputType lastMapInput);
}

View File

@ -7,7 +7,7 @@ package org.broadinstitute.sting.utils.nanoScheduler;
* Date: 8/24/12
* Time: 9:49 AM
*/
public interface ReduceFunction<MapType, ReduceType> {
public interface NSReduceFunction<MapType, ReduceType> {
/**
* Combine one with sum into a new ReduceType
* @param one the result of a map call on an input element

View File

@ -3,13 +3,13 @@ package org.broadinstitute.sting.utils.nanoScheduler;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.AutoFormattingTime;
import org.broadinstitute.sting.utils.SimpleTimer;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.threading.NamedThreadFactory;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
import java.util.concurrent.*;
/**
@ -17,12 +17,12 @@ import java.util.concurrent.*;
*
* The overall framework works like this
*
* nano <- new Nanoschedule(bufferSize, numberOfMapElementsToProcessTogether, nThreads)
* nano <- new Nanoschedule(inputBufferSize, numberOfMapElementsToProcessTogether, nThreads)
* List[Input] outerData : outerDataLoop )
* result = nano.execute(outerData.iterator(), map, reduce)
*
* bufferSize determines how many elements from the input stream are read in one go by the
* nanoscheduler. The scheduler may hold up to bufferSize in memory at one time, as well
* inputBufferSize determines how many elements from the input stream are read in one go by the
* nanoscheduler. The scheduler may hold up to inputBufferSize in memory at one time, as well
* as up to inputBufferSize map results as well.
*
* numberOfMapElementsToProcessTogether determines how many input elements are processed
@ -45,42 +45,54 @@ import java.util.concurrent.*;
public class NanoScheduler<InputType, MapType, ReduceType> {
private final static Logger logger = Logger.getLogger(NanoScheduler.class);
private final static boolean ALLOW_SINGLE_THREAD_FASTPATH = true;
private final static boolean LOG_MAP_TIMES = false;
private final static boolean TIME_CALLS = true;
final int bufferSize;
final int mapGroupSize;
private final static int MAP_BUFFER_SIZE_SCALE_FACTOR = 100;
final int inputBufferSize;
final int mapBufferSize;
final int nThreads;
final ExecutorService executor;
final ExecutorService inputExecutor;
final ExecutorService reduceExecutor;
final ThreadPoolExecutor mapExecutor;
boolean shutdown = false;
boolean debug = false;
private NSProgressFunction<InputType> progressFunction = null;
final SimpleTimer outsideSchedulerTimer = TIME_CALLS ? new SimpleTimer("outside") : null;
final SimpleTimer inputTimer = TIME_CALLS ? new SimpleTimer("input") : null;
final SimpleTimer mapTimer = TIME_CALLS ? new SimpleTimer("map") : null;
final SimpleTimer reduceTimer = TIME_CALLS ? new SimpleTimer("reduce") : null;
/**
* Create a new nanoschedule with the desire characteristics requested by the argument
* Create a new nanoscheduler with the desired characteristics requested by the arguments
*
* @param bufferSize the number of input elements to read in each scheduling cycle.
* @param mapGroupSize How many inputs should be grouped together per map? If -1 we make a reasonable guess
* @param nThreads the number of threads to use to get work done, in addition to the thread calling execute
* @param inputBufferSize the number of input elements to read in each scheduling cycle.
* @param nThreads the number of threads to use to get work done, in addition to the
* thread calling execute
*/
public NanoScheduler(final int bufferSize,
final int mapGroupSize,
final int nThreads) {
if ( bufferSize < 1 ) throw new IllegalArgumentException("bufferSize must be >= 1, got " + bufferSize);
public NanoScheduler(final int inputBufferSize, final int nThreads) {
if ( inputBufferSize < 1 ) throw new IllegalArgumentException("inputBufferSize must be >= 1, got " + inputBufferSize);
if ( nThreads < 1 ) throw new IllegalArgumentException("nThreads must be >= 1, got " + nThreads);
if ( mapGroupSize > bufferSize ) throw new IllegalArgumentException("mapGroupSize " + mapGroupSize + " must be <= bufferSize " + bufferSize);
if ( mapGroupSize == 0 || mapGroupSize < -1 ) throw new IllegalArgumentException("mapGroupSize cannot be <= 0" + mapGroupSize);
this.bufferSize = bufferSize;
this.inputBufferSize = inputBufferSize;
this.mapBufferSize = inputBufferSize * MAP_BUFFER_SIZE_SCALE_FACTOR;
this.nThreads = nThreads;
if ( mapGroupSize == -1 ) {
this.mapGroupSize = (int)Math.ceil(this.bufferSize / (10.0*this.nThreads));
logger.info(String.format("Dynamically setting grouping size to %d based on buffer size %d and n threads %d",
this.mapGroupSize, this.bufferSize, this.nThreads));
if ( nThreads == 1 ) {
this.mapExecutor = null;
this.inputExecutor = this.reduceExecutor = null;
} else {
this.mapGroupSize = mapGroupSize;
this.mapExecutor = (ThreadPoolExecutor)Executors.newFixedThreadPool(nThreads-1, new NamedThreadFactory("NS-map-thread-%d"));
this.mapExecutor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());
this.inputExecutor = Executors.newSingleThreadExecutor(new NamedThreadFactory("NS-input-thread-%d"));
this.reduceExecutor = Executors.newSingleThreadExecutor(new NamedThreadFactory("NS-reduce-thread-%d"));
}
this.executor = nThreads == 1 ? null : Executors.newFixedThreadPool(nThreads);
// start timing the time spent outside of the nanoScheduler
outsideSchedulerTimer.start();
}
/**
@ -97,17 +109,8 @@ public class NanoScheduler<InputType, MapType, ReduceType> {
* @return
*/
@Ensures("result > 0")
public int getBufferSize() {
return bufferSize;
}
/**
* The grouping size used by this NanoScheduler
* @return
*/
@Ensures("result > 0")
public int getMapGroupSize() {
return mapGroupSize;
public int getInputBufferSize() {
return inputBufferSize;
}
/**
@ -116,12 +119,54 @@ public class NanoScheduler<InputType, MapType, ReduceType> {
* After this call, execute cannot be invoked without throwing an error
*/
public void shutdown() {
if ( executor != null ) {
final List<Runnable> remaining = executor.shutdownNow();
if ( ! remaining.isEmpty() )
throw new IllegalStateException("Remaining tasks found in the executor, unexpected behavior!");
outsideSchedulerTimer.stop();
if ( nThreads > 1 ) {
shutdownExecutor("inputExecutor", inputExecutor);
shutdownExecutor("mapExecutor", mapExecutor);
shutdownExecutor("reduceExecutor", reduceExecutor);
}
shutdown = true;
if (TIME_CALLS) {
printTimerInfo("Input time", inputTimer);
printTimerInfo("Map time", mapTimer);
printTimerInfo("Reduce time", reduceTimer);
printTimerInfo("Outside time", outsideSchedulerTimer);
}
}
/**
 * Immediately stops an executor service, insisting that it was still running before the
 * call and that it had no queued-but-unstarted tasks left when it was stopped.
 *
 * @param name label used to identify the executor in error messages
 * @param executorService the executor service to stop
 * @throws IllegalStateException if the executor was already shut down, or if shutdownNow()
 *                               reports tasks that never started
 */
@Requires({"name != null", "executorService != null"})
@Ensures("executorService.isShutdown()")
private void shutdownExecutor(final String name, final ExecutorService executorService) {
    final boolean alreadyStopped = executorService.isShutdown() || executorService.isTerminated();
    if ( alreadyStopped ) {
        throw new IllegalStateException("Executor service " + name + " is already shut down!");
    }

    // shutdownNow() hands back the tasks that were queued but never began executing
    final List<Runnable> neverStarted = executorService.shutdownNow();
    if ( neverStarted.isEmpty() ) {
        return;
    }
    throw new IllegalStateException(neverStarted.size() + " remaining tasks found in an executor " + name + ", unexpected behavior!");
}
/**
 * Print to logger.info the elapsed time of timer, together with the share of the combined
 * input + map + reduce + outside time that it represents.
 *
 * If the combined elapsed time is zero the share is reported as 0% rather than NaN.
 *
 * @param label the name of the timer to display. Should be human readable
 * @param timer the timer whose elapsed time we will display
 */
@Requires({"label != null", "timer != null"})
private void printTimerInfo(final String label, final SimpleTimer timer) {
    // the denominator is recomputed from all four timers on every call
    final double total = inputTimer.getElapsedTime() + mapTimer.getElapsedTime()
            + reduceTimer.getElapsedTime() + outsideSchedulerTimer.getElapsedTime();
    final double myTimeInSec = timer.getElapsedTime();
    // guard the division: 0.0 / 0.0 would otherwise print "NaN%"
    final double myTimePercent = total > 0.0 ? myTimeInSec / total * 100 : 0.0;
    logger.info(String.format("%s: %s (%5.2f%%)", label, new AutoFormattingTime(myTimeInSec), myTimePercent));
}
/**
@ -131,20 +176,45 @@ public class NanoScheduler<InputType, MapType, ReduceType> {
return shutdown;
}
/**
* @return are we displaying verbose debugging information about the scheduling?
*/
public boolean isDebug() {
return debug;
}
/**
 * Logs a String.format-style message at info level, prefixed with the id of the calling
 * thread, but only when verbose debugging is switched on.
 *
 * @param format the format argument suitable for String.format
 * @param args the arguments for String.format
 */
@Requires("format != null")
private void debugPrint(final String format, Object ... args) {
    if ( ! isDebug() )
        return;

    final long threadId = Thread.currentThread().getId();
    final String message = String.format(format, args);
    logger.info("Thread " + threadId + ":" + message);
}
/**
* Turn on/off verbose debugging
*
* @param debug true if we want verbose debugging
*/
public void setDebug(boolean debug) {
this.debug = debug;
}
/**
* Set the progress callback function to progressFunction
*
* The progress callback is invoked after each buffer size elements have been processed by map/reduce
*
* @param progressFunction a progress function to call, or null if you don't want any progress callback
*/
public void setProgressFunction(final NSProgressFunction<InputType> progressFunction) {
this.progressFunction = progressFunction;
}
/**
* Execute a map/reduce job with this nanoScheduler
*
@ -159,41 +229,73 @@ public class NanoScheduler<InputType, MapType, ReduceType> {
* It is safe to call this function repeatedly on a single nanoScheduler, at least until the
* shutdown method is called.
*
* @param inputReader
* @param map
* @param reduce
* @return
* Note that this function goes through a single threaded fast path if the number of threads
* is 1.
*
* @param inputReader an iterator providing us with the input data to nanoSchedule map/reduce over
* @param map the map function from input type -> map type, will be applied in parallel to each input
* @param reduce the reduce function from map type + reduce type -> reduce type to be applied in order to map results
* @return the last reduce value
*/
public ReduceType execute(final Iterator<InputType> inputReader,
final MapFunction<InputType, MapType> map,
final NSMapFunction<InputType, MapType> map,
final ReduceType initialValue,
final ReduceFunction<MapType, ReduceType> reduce) {
final NSReduceFunction<MapType, ReduceType> reduce) {
if ( isShutdown() ) throw new IllegalStateException("execute called on already shutdown NanoScheduler");
if ( inputReader == null ) throw new IllegalArgumentException("inputReader cannot be null");
if ( map == null ) throw new IllegalArgumentException("map function cannot be null");
if ( reduce == null ) throw new IllegalArgumentException("reduce function cannot be null");
outsideSchedulerTimer.stop();
ReduceType result;
if ( ALLOW_SINGLE_THREAD_FASTPATH && getnThreads() == 1 ) {
return executeSingleThreaded(inputReader, map, initialValue, reduce);
result = executeSingleThreaded(inputReader, map, initialValue, reduce);
} else {
return executeMultiThreaded(inputReader, map, initialValue, reduce);
result = executeMultiThreaded(inputReader, map, initialValue, reduce);
}
outsideSchedulerTimer.restart();
return result;
}
/**
* Simple efficient reference implementation for single threaded execution
* Simple efficient reference implementation for single threaded execution.
*
* @return the reduce result of this map/reduce job
*/
@Requires({"inputReader != null", "map != null", "reduce != null"})
private ReduceType executeSingleThreaded(final Iterator<InputType> inputReader,
final MapFunction<InputType, MapType> map,
final NSMapFunction<InputType, MapType> map,
final ReduceType initialValue,
final ReduceFunction<MapType, ReduceType> reduce) {
final NSReduceFunction<MapType, ReduceType> reduce) {
ReduceType sum = initialValue;
int i = 0;
// start timer to ensure that both hasNext and next are caught by the timer
if ( TIME_CALLS ) inputTimer.restart();
while ( inputReader.hasNext() ) {
final InputType input = inputReader.next();
if ( TIME_CALLS ) inputTimer.stop();
// map
if ( TIME_CALLS ) mapTimer.restart();
final long preMapTime = LOG_MAP_TIMES ? 0 : mapTimer.currentTimeNano();
final MapType mapValue = map.apply(input);
if ( LOG_MAP_TIMES ) logger.info("MAP TIME " + (mapTimer.currentTimeNano() - preMapTime));
if ( TIME_CALLS ) mapTimer.stop();
if ( i++ % inputBufferSize == 0 && progressFunction != null )
progressFunction.progress(input);
// reduce
if ( TIME_CALLS ) reduceTimer.restart();
sum = reduce.apply(mapValue, sum);
if ( TIME_CALLS ) reduceTimer.stop();
if ( TIME_CALLS ) inputTimer.restart();
}
return sum;
}
@ -202,101 +304,89 @@ public class NanoScheduler<InputType, MapType, ReduceType> {
*
* @return the reduce result of this map/reduce job
*/
@Requires({"inputReader != null", "map != null", "reduce != null"})
private ReduceType executeMultiThreaded(final Iterator<InputType> inputReader,
final MapFunction<InputType, MapType> map,
final NSMapFunction<InputType, MapType> map,
final ReduceType initialValue,
final ReduceFunction<MapType, ReduceType> reduce) {
final NSReduceFunction<MapType, ReduceType> reduce) {
debugPrint("Executing nanoScheduler");
ReduceType sum = initialValue;
while ( inputReader.hasNext() ) {
try {
// read in our input values
final List<InputType> inputs = readInputs(inputReader);
// send jobs for map
final Queue<Future<List<MapType>>> mapQueue = submitMapJobs(map, executor, inputs);
// a blocking queue that limits the number of input datum to the requested buffer size
final BlockingQueue<InputProducer<InputType>.InputValue> inputQueue
= new LinkedBlockingDeque<InputProducer<InputType>.InputValue>(inputBufferSize);
// send off the reduce job, and block until we get at least one reduce result
sum = reduceParallel(reduce, mapQueue, sum);
} catch (InterruptedException ex) {
throw new ReviewedStingException("got execution exception", ex);
} catch (ExecutionException ex) {
throw new ReviewedStingException("got execution exception", ex);
// a priority queue that stores up to mapBufferSize elements
// produced by completed map jobs.
final BlockingQueue<Future<MapResult<MapType>>> mapResultQueue =
new LinkedBlockingDeque<Future<MapResult<MapType>>>(mapBufferSize);
// Start running the input reader thread
inputExecutor.submit(new InputProducer<InputType>(inputReader, inputTimer, inputQueue));
// Start running the reducer thread
final ReducerThread<MapType, ReduceType> reducer
= new ReducerThread<MapType, ReduceType>(reduce, reduceTimer, initialValue, mapResultQueue);
final Future<ReduceType> reduceResult = reduceExecutor.submit(reducer);
try {
int numJobs = 0;
while ( true ) {
// block on input
final InputProducer<InputType>.InputValue inputEnqueueWrapped = inputQueue.take();
if ( ! inputEnqueueWrapped.isLast() ) {
// get the object itself
final InputType input = inputEnqueueWrapped.getValue();
// the next map call has jobID + 1
numJobs++;
// send job for map via the completion service
final CallableMap doMap = new CallableMap(map, numJobs, input);
final Future<MapResult<MapType>> mapJob = mapExecutor.submit(doMap);
mapResultQueue.put(mapJob);
debugPrint(" Done with cycle of map/reduce");
if ( numJobs % inputBufferSize == 0 && progressFunction != null )
progressFunction.progress(input);
} else {
mapResultQueue.put(new FutureValue<MapResult<MapType>>(new MapResult<MapType>()));
return reduceResult.get(); // wait for our result of reduce
}
}
} catch (InterruptedException ex) {
throw new ReviewedStingException("got execution exception", ex);
} catch (ExecutionException ex) {
throw new ReviewedStingException("got execution exception", ex);
}
return sum;
}
@Requires({"reduce != null", "! mapQueue.isEmpty()"})
private ReduceType reduceParallel(final ReduceFunction<MapType, ReduceType> reduce,
final Queue<Future<List<MapType>>> mapQueue,
final ReduceType initSum)
throws InterruptedException, ExecutionException {
ReduceType sum = initSum;
// while mapQueue has something in it to reduce
for ( final Future<List<MapType>> future : mapQueue ) {
for ( final MapType value : future.get() ) // block until we get the values for this task
sum = reduce.apply(value, sum);
}
return sum;
}
/**
* Read up to inputBufferSize elements from inputReader
*
* @return a queue of inputs read in, containing one or more values of InputType read in
*/
@Requires("inputReader.hasNext()")
@Ensures("!result.isEmpty()")
private List<InputType> readInputs(final Iterator<InputType> inputReader) {
int n = 0;
final List<InputType> inputs = new LinkedList<InputType>();
while ( inputReader.hasNext() && n < getBufferSize() ) {
final InputType input = inputReader.next();
inputs.add(input);
n++;
}
return inputs;
}
@Requires({"map != null", "! inputs.isEmpty()"})
private Queue<Future<List<MapType>>> submitMapJobs(final MapFunction<InputType, MapType> map,
final ExecutorService executor,
final List<InputType> inputs) {
final Queue<Future<List<MapType>>> mapQueue = new LinkedList<Future<List<MapType>>>();
for ( final List<InputType> subinputs : Utils.groupList(inputs, getMapGroupSize()) ) {
final CallableMap doMap = new CallableMap(map, subinputs);
final Future<List<MapType>> future = executor.submit(doMap);
mapQueue.add(future);
}
return mapQueue;
}
/**
* A simple callable version of the map function for use with the executor pool
*/
private class CallableMap implements Callable<List<MapType>> {
final List<InputType> inputs;
final MapFunction<InputType, MapType> map;
private class CallableMap implements Callable<MapResult<MapType>> {
final int id;
final InputType input;
final NSMapFunction<InputType, MapType> map;
@Requires({"map != null", "inputs.size() <= getMapGroupSize()"})
private CallableMap(final MapFunction<InputType, MapType> map, final List<InputType> inputs) {
this.inputs = inputs;
@Requires({"map != null"})
private CallableMap(final NSMapFunction<InputType, MapType> map,
final int id,
final InputType input) {
this.id = id;
this.input = input;
this.map = map;
}
@Ensures("result.size() == inputs.size()")
@Override public List<MapType> call() throws Exception {
final List<MapType> outputs = new LinkedList<MapType>();
for ( final InputType input : inputs )
outputs.add(map.apply(input));
debugPrint(" Processed %d elements with map", outputs.size());
return outputs;
@Override
public MapResult<MapType> call() {
if ( TIME_CALLS ) mapTimer.restart();
if ( debug ) debugPrint("\t\tmap " + input);
final MapType result = map.apply(input);
if ( TIME_CALLS ) mapTimer.stop();
return new MapResult<MapType>(result, id);
}
}
}

View File

@ -0,0 +1,65 @@
package org.broadinstitute.sting.utils.nanoScheduler;
import org.broadinstitute.sting.utils.SimpleTimer;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
/**
 * Callable that runs the reduce step of the map/reduce.
 *
 * It repeatedly takes the next map-result Future from mapResultQueue (blocking until one
 * is available), waits for that Future to complete, and then either folds the value into
 * the running sum with reduce(value, sum) or, when the result is the poison EOF marker
 * (isLast()), returns the accumulated sum.
 */
class ReducerThread<MapType, ReduceType> implements Callable<ReduceType> {
    final NSReduceFunction<MapType, ReduceType> reduce;
    final SimpleTimer reduceTimer;
    final BlockingQueue<Future<MapResult<MapType>>> mapResultQueue;

    /** The running reduce value; starts at the initial value passed to the constructor */
    ReduceType sum;

    /** The job id of the last map result reduced, used to check results arrive in order */
    int lastJobID = -1;

    /**
     * @param reduce the reduce function to apply to each map result; cannot be null
     * @param reduceTimer timer wrapped around each reduce call, or null for no timing
     * @param sum the initial reduce value
     * @param mapResultQueue the queue of pending map results to consume; cannot be null
     * @throws IllegalArgumentException if reduce or mapResultQueue is null
     */
    public ReducerThread(final NSReduceFunction<MapType, ReduceType> reduce,
                         final SimpleTimer reduceTimer,
                         final ReduceType sum,
                         final BlockingQueue<Future<MapResult<MapType>>> mapResultQueue) {
        if ( reduce == null ) throw new IllegalArgumentException("Reduce function cannot be null");
        if ( mapResultQueue == null ) throw new IllegalArgumentException("mapResultQueue cannot be null");

        this.reduce = reduce;
        this.reduceTimer = reduceTimer;
        this.sum = sum;
        this.mapResultQueue = mapResultQueue;
    }

    /**
     * Consume map results until the EOF marker arrives.
     *
     * @return the final reduce value
     * @throws IllegalStateException if a map result arrives with a job id lower than the previous one
     * @throws ReviewedStingException if a map job failed, or this thread was interrupted while waiting
     */
    @Override
    public ReduceType call() {
        try {
            while ( true ) {
                // blocks first on the queue, then on completion of the map job itself
                final MapResult<MapType> result = mapResultQueue.take().get();

                if ( result.isLast() ) {
                    // we are done, just return sum
                    return sum;
                }
                else if ( result.getJobID() < lastJobID ) {
                    // make sure the map results are coming in order
                    throw new IllegalStateException("BUG: last jobID " + lastJobID + " > current jobID " + result.getJobID());
                } else {
                    lastJobID = result.getJobID();
                    // apply reduce, keeping track of sum
                    if ( reduceTimer != null ) reduceTimer.restart();
                    sum = reduce.apply(result.getValue(), sum);
                    if ( reduceTimer != null ) reduceTimer.stop();
                }
            }
        } catch (ExecutionException ex) {
            throw new ReviewedStingException("got execution exception", ex);
        } catch (InterruptedException ex) {
            // restore the interrupt flag so code further up the stack can still observe it
            Thread.currentThread().interrupt();
            throw new ReviewedStingException("interrupted while waiting for a map result", ex);
        }
    }
}

View File

@ -613,6 +613,8 @@ public abstract class AbstractReadBackedPileup<RBP extends AbstractReadBackedPil
/**
* Returns a pileup randomly downsampled to the desiredCoverage.
*
* TODO: delete this once the experimental downsampler stabilizes
*
* @param desiredCoverage
* @return
*/

View File

@ -0,0 +1,86 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.sam;
import net.sf.picard.sam.MergingSamRecordIterator;
import net.sf.picard.sam.SamFileHeaderMerger;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.*;
/**
 * Thin wrapper that merges several ArtificialSingleSampleReadStreams into one stream of reads.
 *
 * Reads are only generated when an iterator is requested; every request regenerates the
 * per-sample reads and builds a new merging iterator.
 *
 * @author David Roazen
 */
public class ArtificialMultiSampleReadStream implements Iterable<SAMRecord> {

    private Collection<ArtificialSingleSampleReadStream> perSampleArtificialReadStreams;
    private MergingSamRecordIterator mergingIterator;

    /**
     * @param perSampleArtificialReadStreams the single-sample streams to multiplex; must hold at least one stream
     * @throws ReviewedStingException if the collection is null or empty
     */
    public ArtificialMultiSampleReadStream( Collection<ArtificialSingleSampleReadStream> perSampleArtificialReadStreams ) {
        final boolean nothingToMerge = perSampleArtificialReadStreams == null || perSampleArtificialReadStreams.isEmpty();
        if ( nothingToMerge ) {
            throw new ReviewedStingException("Can't create an ArtificialMultiSampleReadStream out of 0 ArtificialSingleSampleReadStreams");
        }

        this.perSampleArtificialReadStreams = perSampleArtificialReadStreams;
    }

    /**
     * @return an iterator over the merged reads of all per-sample streams
     */
    public Iterator<SAMRecord> iterator() {
        // reads are generated here, on demand, rather than at construction time
        initialize();
        return mergingIterator;
    }

    /**
     * @return the merged reads of all per-sample streams, adapted to a StingSAMIterator
     */
    public StingSAMIterator getStingSAMIterator() {
        // reads are generated here, on demand, rather than at construction time
        initialize();
        return StingSAMIteratorAdapter.adapt(mergingIterator);
    }

    /**
     * Generates the reads of every per-sample stream, wraps each set in an artificial SAM
     * reader, and stores a coordinate-sorted merging iterator over all of those readers.
     */
    private void initialize() {
        final int streamCount = perSampleArtificialReadStreams.size();
        final Collection<SAMFileReader> readers = new ArrayList<SAMFileReader>(streamCount);
        final Collection<SAMFileHeader> readerHeaders = new ArrayList<SAMFileHeader>(streamCount);

        for ( final ArtificialSingleSampleReadStream sampleStream : perSampleArtificialReadStreams ) {
            final Collection<SAMRecord> sampleReads = sampleStream.makeReads();
            final SAMRecord[] sampleReadArray = sampleReads.toArray(new SAMRecord[sampleReads.size()]);
            final SAMFileReader sampleReader = new ArtificialSAMFileReader(sampleStream.getHeader(), sampleReadArray);

            readers.add(sampleReader);
            readerHeaders.add(sampleReader.getFileHeader());
        }

        final SamFileHeaderMerger mergedHeader = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate, readerHeaders, true);
        mergingIterator = new MergingSamRecordIterator(mergedHeader, readers, true);
    }
}

View File

@ -40,8 +40,11 @@ public class ArtificialSAMFileReader extends SAMFileReader {
*/
private final List<SAMRecord> reads;
private SAMFileHeader customHeader = null;
/**
* Construct an artificial SAM file reader.
* @param sequenceDictionary sequence dictionary used to initialize our GenomeLocParser
* @param reads Reads to use as backing data source.
*/
public ArtificialSAMFileReader(SAMSequenceDictionary sequenceDictionary,SAMRecord... reads) {
@ -50,6 +53,30 @@ public class ArtificialSAMFileReader extends SAMFileReader {
this.reads = Arrays.asList(reads);
}
/**
 * Build an artificial SAM file reader that reports a caller-supplied SAM file header.
 *
 * The GenomeLocParser is created from the sequence dictionary of that header.
 *
 * @param customHeader Header that getFileHeader() on this reader should return
 * @param reads Reads to use as backing data source.
 */
public ArtificialSAMFileReader( SAMFileHeader customHeader, SAMRecord... reads ) {
    super(createEmptyInputStream(),true);
    this.reads = Arrays.asList(reads);
    this.genomeLocParser = new GenomeLocParser(customHeader.getSequenceDictionary());
    this.customHeader = customHeader;
}
/**
 * @return the custom header supplied at construction time if there is one, otherwise
 *         whatever the superclass reports
 */
@Override
public SAMFileHeader getFileHeader() {
    return customHeader != null ? customHeader : super.getFileHeader();
}
/**
* {@inheritDoc}
*/

View File

@ -276,6 +276,30 @@ public class ArtificialSAMUtils {
return Arrays.asList(left, right);
}
/**
 * Build a "stack" of artificial reads that are all created from the same parameters.
 * The cigar string for each read will be *M, where * is the length of the read.
 *
 * Handy for tests such as positional downsampling, where only the position and the
 * number of reads matter and the remaining read attributes are irrelevant.
 *
 * @param stackSize number of identical reads to create
 * @param header the SAM header to associate each read with
 * @param name name associated with each read
 * @param refIndex the reference index, i.e. what chromosome to associate them with
 * @param alignmentStart where to start each alignment
 * @param length the length of each read
 *
 * @return a collection of stackSize reads all sharing the above properties
 */
public static Collection<GATKSAMRecord> createStackOfIdenticalArtificialReads( int stackSize, SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) {
    final Collection<GATKSAMRecord> identicalReads = new ArrayList<GATKSAMRecord>(stackSize);

    for ( int created = 0; created < stackSize; created++ ) {
        final GATKSAMRecord read = createArtificialRead(header, name, refIndex, alignmentStart, length);
        identicalReads.add(read);
    }

    return identicalReads;
}
/**
* create an iterator containing the specified read piles
*

View File

@ -0,0 +1,212 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.sam;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
/**
 * Generates an artificial stream of reads that all belong to a single read group/sample.
 *
 * The shape of the stream is controlled by the constructor arguments:
 *
 * -the number of contigs the reads are spread across
 * -the number of "stacks" (reads sharing one alignment start position) placed on each contig
 * -the min/max number of reads in each stack (exact values chosen randomly from this range)
 * -the min/max distance between stack start positions (exact values chosen randomly from this range)
 * -the min/max length of each read (exact values chosen randomly from this range)
 * -the number of unmapped reads, which are appended after all mapped reads
 *
 * The cigar string for all reads will be *M, where * is the length of the read.
 *
 * Reads are regenerated on every call to makeReads(), iterator() or getStingSAMIterator().
 *
 * @author David Roazen
 */
public class ArtificialSingleSampleReadStream implements Iterable<SAMRecord> {
    private static final String READ_GROUP_TAG = "RG";

    private final SAMFileHeader header;
    private final String readGroupID;
    private final int numContigs;
    private final int numStacksPerContig;
    private final int minReadsPerStack;
    private final int maxReadsPerStack;
    private final int minDistanceBetweenStacks;
    private final int maxDistanceBetweenStacks;
    private final int minReadLength;
    private final int maxReadLength;
    private final int numUnmappedReads;

    /**
     * Stores the stream parameters and validates them.
     *
     * @throws ReviewedStingException if the header or read group ID is null, the read group
     *                                is not in the header, any numeric parameter is negative,
     *                                exactly one of numContigs/numStacksPerContig is 0, or
     *                                any min value exceeds its matching max value
     */
    public ArtificialSingleSampleReadStream( SAMFileHeader header,
                                             String readGroupID,
                                             int numContigs,
                                             int numStacksPerContig,
                                             int minReadsPerStack,
                                             int maxReadsPerStack,
                                             int minDistanceBetweenStacks,
                                             int maxDistanceBetweenStacks,
                                             int minReadLength,
                                             int maxReadLength,
                                             int numUnmappedReads ) {
        this.header = header;
        this.readGroupID = readGroupID;
        this.numContigs = numContigs;
        this.numStacksPerContig = numStacksPerContig;
        this.minReadsPerStack = minReadsPerStack;
        this.maxReadsPerStack = maxReadsPerStack;
        this.minDistanceBetweenStacks = minDistanceBetweenStacks;
        this.maxDistanceBetweenStacks = maxDistanceBetweenStacks;
        this.minReadLength = minReadLength;
        this.maxReadLength = maxReadLength;
        this.numUnmappedReads = numUnmappedReads;

        validateStreamParameters();
    }

    /**
     * Checks the stored parameters for consistency; the first violated rule is reported.
     */
    private void validateStreamParameters() {
        if ( header == null || readGroupID == null ) {
            throw new ReviewedStingException("null SAMFileHeader or read group ID") ;
        }

        if ( header.getReadGroup(readGroupID) == null ) {
            throw new ReviewedStingException(String.format("Read group %s not found in SAMFileHeader", readGroupID));
        }

        final int[] mustBeNonNegative = {
                numContigs, numStacksPerContig, minReadsPerStack, maxReadsPerStack,
                minDistanceBetweenStacks, maxDistanceBetweenStacks, minReadLength, maxReadLength,
                numUnmappedReads
        };
        for ( final int value : mustBeNonNegative ) {
            if ( value < 0 ) {
                throw new ReviewedStingException("Read stream parameters must be >= 0");
            }
        }

        // exactly one of the two being zero is inconsistent
        if ( (numContigs == 0) != (numStacksPerContig == 0) ) {
            throw new ReviewedStingException("numContigs and numStacksPerContig must either both be > 0, or both be 0");
        }

        failIf(minReadsPerStack > maxReadsPerStack, "minReadsPerStack > maxReadsPerStack");
        failIf(minDistanceBetweenStacks > maxDistanceBetweenStacks, "minDistanceBetweenStacks > maxDistanceBetweenStacks");
        failIf(minReadLength > maxReadLength, "minReadLength > maxReadLength");
    }

    /**
     * Throws a ReviewedStingException carrying message when violated is true.
     */
    private static void failIf( final boolean violated, final String message ) {
        if ( violated ) {
            throw new ReviewedStingException(message);
        }
    }

    /**
     * @return an iterator over a freshly generated set of reads
     */
    public Iterator<SAMRecord> iterator() {
        final Collection<SAMRecord> freshReads = makeReads();
        return freshReads.iterator();
    }

    /**
     * @return a freshly generated set of reads, adapted to a StingSAMIterator
     */
    public StingSAMIterator getStingSAMIterator() {
        return StingSAMIteratorAdapter.adapt(iterator());
    }

    /**
     * Generates the reads for this stream: for every contig, numStacksPerContig stacks
     * beginning at alignment start 1 and advancing by a random distance after each stack,
     * followed by one stack of numUnmappedReads unmapped reads (if any).
     *
     * @return the generated reads, mapped reads first in contig/stack order
     */
    public Collection<SAMRecord> makeReads() {
        final Collection<SAMRecord> allReads = new ArrayList<SAMRecord>(numContigs * numStacksPerContig * maxReadsPerStack);

        for ( int contigIndex = 0; contigIndex < numContigs; contigIndex++ ) {
            int stackStart = 1;

            for ( int stackIndex = 0; stackIndex < numStacksPerContig; stackIndex++ ) {
                final int readsInStack = MathUtils.randomIntegerInRange(minReadsPerStack, maxReadsPerStack);
                allReads.addAll(makeReadStack(contigIndex, stackStart, readsInStack));
                stackStart += MathUtils.randomIntegerInRange(minDistanceBetweenStacks, maxDistanceBetweenStacks);
            }
        }

        if ( numUnmappedReads > 0 ) {
            allReads.addAll(makeReadStack(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, SAMRecord.NO_ALIGNMENT_START, numUnmappedReads));
        }

        return allReads;
    }

    /**
     * Creates stackSize reads at the given contig and alignment start, each with a randomly
     * chosen length and tagged with this stream's read group.
     */
    private Collection<SAMRecord> makeReadStack( int contig, int alignmentStart, int stackSize ) {
        final Collection<SAMRecord> stackReads = new ArrayList<SAMRecord>(stackSize);

        int created = 0;
        while ( created < stackSize ) {
            final int readLength = MathUtils.randomIntegerInRange(minReadLength, maxReadLength);
            final SAMRecord newRead = ArtificialSAMUtils.createArtificialRead(header, "foo", contig, alignmentStart, readLength);
            newRead.setAttribute(READ_GROUP_TAG, readGroupID);
            stackReads.add(newRead);
            created++;
        }

        return stackReads;
    }

    /** @return the SAM file header the reads are created against */
    public SAMFileHeader getHeader() {
        return header;
    }

    /** @return the ID of the read group every read is tagged with */
    public String getReadGroupID() {
        return readGroupID;
    }

    /** @return the number of contigs reads are generated for */
    public int getNumContigs() {
        return numContigs;
    }

    /** @return the number of read stacks generated per contig */
    public int getNumStacksPerContig() {
        return numStacksPerContig;
    }

    /** @return the lower bound on the number of reads per stack */
    public int getMinReadsPerStack() {
        return minReadsPerStack;
    }

    /** @return the upper bound on the number of reads per stack */
    public int getMaxReadsPerStack() {
        return maxReadsPerStack;
    }

    /** @return the lower bound on the distance between consecutive stack starts */
    public int getMinDistanceBetweenStacks() {
        return minDistanceBetweenStacks;
    }

    /** @return the upper bound on the distance between consecutive stack starts */
    public int getMaxDistanceBetweenStacks() {
        return maxDistanceBetweenStacks;
    }

    /** @return the lower bound on read length */
    public int getMinReadLength() {
        return minReadLength;
    }

    /** @return the upper bound on read length */
    public int getMaxReadLength() {
        return maxReadLength;
    }

    /** @return the number of unmapped reads appended to the stream */
    public int getNumUnmappedReads() {
        return numUnmappedReads;
    }
}

View File

@ -0,0 +1,281 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.sam;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.ArrayList;
import java.util.List;
/**
* A class for analyzing and validating the read stream produced by an ArtificialSingleSampleReadStream.
*
* Collects various statistics about the stream of reads it's fed, and validates the stream
* by checking whether the collected statistics match the nominal properties of the stream.
*
* Subclasses are expected to override the validate() method in order to check whether an artificial
* read stream has been *transformed* in some way (e.g., by downsampling or some other process), rather
* than merely checking whether the stream matches its original properties.
*
* Usage is simple:
*
* ArtificialSingleSampleReadStreamAnalyzer analyzer = new ArtificialSingleSampleReadStreamAnalyzer(originalStream);
* analyzer.analyze(originalOrTransformedStream);
* analyzer.validate(); // override this method if you want to check whether the stream has been transformed
* // in a certain way relative to the original stream
*
* @author David Roazen
*/
public class ArtificialSingleSampleReadStreamAnalyzer {
    /** The stream whose nominal properties the collected statistics are validated against */
    protected ArtificialSingleSampleReadStream originalStream;

    /** The most recent read passed to update(), or null if no read has been seen since the last reset() */
    protected SAMRecord lastRead;

    /** Total number of reads (mapped and unmapped) seen so far */
    protected int totalReads;

    /** True as long as every read seen so far carried the original stream's read group ID */
    protected boolean allSamplesMatch;

    /** Number of distinct contigs observed among the mapped reads */
    protected int numContigs;

    /** Number of stacks observed on each completed contig, in the order the contigs were encountered */
    protected List<Integer> stacksPerContig;

    // The min/max statistics below are null until the first relevant observation has been recorded
    protected Integer minReadsPerStack;
    protected Integer maxReadsPerStack;
    protected Integer minDistanceBetweenStacks;
    protected Integer maxDistanceBetweenStacks;
    protected Integer minReadLength;
    protected Integer maxReadLength;

    /** Number of reads seen with the unmapped flag set */
    protected int numUnmappedReads;

    /** Number of stacks seen so far on the contig currently being processed */
    protected int currentContigNumStacks;

    /** Number of reads seen so far in the stack currently being processed */
    protected int currentStackNumReads;

    /**
     * Construct a new read stream analyzer, providing an ArtificialSingleSampleReadStream that will
     * serve as the basis for comparison after the analysis is complete.
     *
     * @param originalStream the original ArtificialSingleSampleReadStream upon which the stream
     *                       that will be fed to the analyzer is based
     */
    public ArtificialSingleSampleReadStreamAnalyzer( ArtificialSingleSampleReadStream originalStream ) {
        this.originalStream = originalStream;
        reset();
    }

    /**
     * Reset all read stream statistics collected by this analyzer to prepare for a fresh run
     */
    public void reset() {
        lastRead = null;
        totalReads = 0;
        allSamplesMatch = true;

        numContigs = 0;
        stacksPerContig = new ArrayList<Integer>();

        minReadsPerStack = null;
        maxReadsPerStack = null;
        minDistanceBetweenStacks = null;
        maxDistanceBetweenStacks = null;
        minReadLength = null;
        maxReadLength = null;

        numUnmappedReads = 0;
        currentContigNumStacks = 0;
        currentStackNumReads = 0;
    }

    /**
     * Collect statistics on the stream of reads passed in
     *
     * @param stream the stream of reads to analyze
     */
    public void analyze( Iterable<SAMRecord> stream ) {
        for ( SAMRecord read : stream ) {
            update(read);
        }

        finalizeStats();
    }

    /**
     * Validate the stream by checking whether our collected statistics match the properties of the
     * original stream. Throws a ReviewedStingException if the stream is invalid.
     *
     * Override this method if you want to check whether the stream has been transformed in some
     * way relative to the original stream.
     */
    public void validate() {
        if ( (originalStream.getNumContigs() == 0 || originalStream.getNumStacksPerContig() == 0) && originalStream.getNumUnmappedReads() == 0 ) {
            if ( totalReads != 0 ) {
                throw new ReviewedStingException("got reads from the stream, but the stream was configured to have 0 reads");
            }
            return;  // no further validation needed for the 0-reads case
        }
        else if ( totalReads == 0 ) {
            throw new ReviewedStingException("got no reads from the stream, but the stream was configured to have > 0 reads");
        }

        if ( ! allSamplesMatch ) {
            throw new ReviewedStingException("some reads had the wrong sample");
        }

        if ( numContigs != originalStream.getNumContigs() ) {
            throw new ReviewedStingException("number of contigs not correct");
        }

        if ( stacksPerContig.size() != originalStream.getNumContigs() ) {
            throw new ReviewedStingException(String.format("bug in analyzer code: calculated sizes for %d contigs even though there were only %d contigs",
                                                           stacksPerContig.size(), originalStream.getNumContigs()));
        }

        for ( int contigStackCount : stacksPerContig ) {
            if ( contigStackCount != originalStream.getNumStacksPerContig() ) {
                throw new ReviewedStingException("contig had incorrect number of stacks");
            }
        }

        // Per-stack and between-stack statistics are only ever recorded while processing mapped reads, so
        // they stay null for a stream with no contigs (e.g. one consisting solely of unmapped reads) even
        // if numStacksPerContig > 0. Only unbox them when mapped reads were actually expected; otherwise
        // the comparisons below would fail with a NullPointerException instead of a validation result.
        final boolean mappedReadsExpected = originalStream.getNumContigs() > 0 && originalStream.getNumStacksPerContig() > 0;

        if ( mappedReadsExpected ) {
            if ( minReadsPerStack < originalStream.getMinReadsPerStack() ) {
                throw new ReviewedStingException("stack had fewer than the minimum number of reads");
            }
            if ( maxReadsPerStack > originalStream.getMaxReadsPerStack() ) {
                throw new ReviewedStingException("stack had more than the maximum number of reads");
            }
        }
        else if ( minReadsPerStack != null || maxReadsPerStack != null ) {
            throw new ReviewedStingException("bug in analyzer code: reads per stack was calculated even though 0 stacks per contig was specified");
        }

        // A distance can only be measured when a contig holds at least two stacks
        if ( mappedReadsExpected && originalStream.getNumStacksPerContig() > 1 ) {
            if ( minDistanceBetweenStacks < originalStream.getMinDistanceBetweenStacks() ) {
                throw new ReviewedStingException("stacks were separated by less than the minimum distance");
            }
            if ( maxDistanceBetweenStacks > originalStream.getMaxDistanceBetweenStacks() ) {
                throw new ReviewedStingException("stacks were separated by more than the maximum distance");
            }
        }
        else if ( minDistanceBetweenStacks != null || maxDistanceBetweenStacks != null ) {
            throw new ReviewedStingException("bug in analyzer code: distance between stacks was calculated even though numStacksPerContig was <= 1");
        }

        // totalReads > 0 at this point, so the read length statistics have been recorded at least once
        if ( minReadLength < originalStream.getMinReadLength() ) {
            throw new ReviewedStingException("read was shorter than the minimum allowed length");
        }
        if ( maxReadLength > originalStream.getMaxReadLength() ) {
            throw new ReviewedStingException("read was longer than the maximum allowed length");
        }

        if ( numUnmappedReads != originalStream.getNumUnmappedReads() ) {
            throw new ReviewedStingException(String.format("wrong number of unmapped reads: requested %d but saw %d",
                                                           originalStream.getNumUnmappedReads(), numUnmappedReads));
        }

        if ( (originalStream.getNumContigs() == 0 || originalStream.getNumStacksPerContig() == 0) &&
             numUnmappedReads != totalReads ) {
            throw new ReviewedStingException("stream should have consisted only of unmapped reads, but saw some mapped reads");
        }
    }

    /**
     * Fold a single read into the running statistics. Reads are compared against the previously
     * seen read to detect contig changes and stack (alignment start) changes.
     *
     * @param read the next read from the stream being analyzed
     */
    public void update( SAMRecord read ) {
        if ( read.getReadUnmappedFlag() ) {
            numUnmappedReads++;

            // The first unmapped read that follows other reads closes out the last contig and stack
            if ( numUnmappedReads == 1 && lastRead != null ) {
                processContigChange();
                // processContigChange() counted a new contig, but the unmapped reads are not one
                numContigs--;
            }
        }
        else if ( lastRead == null ) {
            // Very first read of the stream: it opens the first contig and the first stack
            numContigs = 1;
            currentContigNumStacks = 1;
            currentStackNumReads = 1;
        }
        else if ( ! read.getReferenceIndex().equals(lastRead.getReferenceIndex()) ) {
            processContigChange();
        }
        else if ( read.getAlignmentStart() != lastRead.getAlignmentStart() ) {
            processStackChangeWithinContig(read);
        }
        else {
            currentStackNumReads++;
        }

        updateReadLength(read.getReadLength());
        allSamplesMatch = allSamplesMatch && readHasCorrectSample(read);
        totalReads++;

        lastRead = read;
    }

    /**
     * Record the statistics of the contig and stack that just ended, and start counting a new
     * contig whose first stack already contains one read.
     */
    private void processContigChange() {
        numContigs++;

        stacksPerContig.add(currentContigNumStacks);
        currentContigNumStacks = 1;

        updateReadsPerStack(currentStackNumReads);
        currentStackNumReads = 1;
    }

    /**
     * Record the statistics of the stack that just ended and start a new stack on the same contig.
     *
     * @param read the first read of the new stack; its distance from lastRead is recorded
     */
    private void processStackChangeWithinContig( SAMRecord read ) {
        currentContigNumStacks++;

        updateReadsPerStack(currentStackNumReads);
        currentStackNumReads = 1;

        updateDistanceBetweenStacks(read.getAlignmentStart() - lastRead.getAlignmentStart());
    }

    /** Widen the observed min/max reads-per-stack range to include stackReadCount */
    private void updateReadsPerStack( int stackReadCount ) {
        if ( minReadsPerStack == null || stackReadCount < minReadsPerStack ) {
            minReadsPerStack = stackReadCount;
        }
        if ( maxReadsPerStack == null || stackReadCount > maxReadsPerStack ) {
            maxReadsPerStack = stackReadCount;
        }
    }

    /** Widen the observed min/max distance-between-stacks range to include stackDistance */
    private void updateDistanceBetweenStacks( int stackDistance ) {
        if ( minDistanceBetweenStacks == null || stackDistance < minDistanceBetweenStacks ) {
            minDistanceBetweenStacks = stackDistance;
        }
        if ( maxDistanceBetweenStacks == null || stackDistance > maxDistanceBetweenStacks ) {
            maxDistanceBetweenStacks = stackDistance;
        }
    }

    /** Widen the observed min/max read length range to include readLength */
    private void updateReadLength( int readLength ) {
        if ( minReadLength == null || readLength < minReadLength ) {
            minReadLength = readLength;
        }
        if ( maxReadLength == null || readLength > maxReadLength ) {
            maxReadLength = readLength;
        }
    }

    /** @return true if the read's "RG" attribute equals the original stream's read group ID */
    private boolean readHasCorrectSample( SAMRecord read ) {
        return originalStream.getReadGroupID().equals(read.getAttribute("RG"));
    }

    /**
     * Record the statistics of the final contig and stack once the stream is exhausted.
     *
     * Nothing is done if the stream was empty or ended with an unmapped read: in the latter case
     * the last contig and stack were already closed out when the first unmapped read was seen.
     */
    public void finalizeStats() {
        if ( lastRead != null && ! lastRead.getReadUnmappedFlag() ) {
            stacksPerContig.add(currentContigNumStacks);
            updateReadsPerStack(currentStackNumReads);
        }
    }
}

View File

@ -0,0 +1,26 @@
package org.broadinstitute.sting.utils.threading;
import java.util.concurrent.ThreadFactory;
/**
* Thread factory that produces threads with a given name pattern
*
* User: depristo
* Date: 9/5/12
* Time: 9:22 PM
*
*/
public class NamedThreadFactory implements ThreadFactory {
    /**
     * Next thread number to hand out. Shared by every NamedThreadFactory instance, so it must
     * only be incremented through nextId(), which serializes access on the class lock.
     */
    static int id = 0;

    /** String.format() pattern used to build each thread name; receives the thread number as its only argument */
    final String format;

    /**
     * Create a factory that names its threads by applying format to a running thread number.
     *
     * @param format a String.format() pattern accepting a single int argument (e.g. "worker-%d")
     * @throws java.util.IllegalFormatException if format cannot be applied to an int
     */
    public NamedThreadFactory(String format) {
        this.format = format;
        String.format(format, id); // test the name
    }

    /**
     * Create a new (unstarted) thread for r whose name is format applied to the next thread number.
     *
     * @param r the task the new thread will run
     * @return the newly constructed thread
     */
    @Override
    public Thread newThread(Runnable r) {
        return new Thread(r, String.format(format, nextId()));
    }

    /**
     * Atomically fetch and increment the shared thread number. A plain id++ is a non-atomic
     * read-modify-write, so factories used from several threads could otherwise hand out the
     * same number (and therefore the same thread name) twice.
     */
    private static synchronized int nextId() {
        return id++;
    }
}

View File

@ -40,13 +40,13 @@ import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.variantcontext.VariantContextTestProvider;
import java.io.*;
import org.testng.Assert;
import org.testng.annotations.AfterSuite;
import org.testng.annotations.BeforeMethod;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.PrintStream;
import java.text.SimpleDateFormat;
import java.util.*;
@ -251,20 +251,43 @@ public class WalkerTest extends BaseTest {
return false;
}
protected Pair<List<File>, List<String>> executeTestParallel(final String name, WalkerTestSpec spec) {
return executeTest(name, spec, Arrays.asList(1, 4));
/**
 * Selects which thread-count argument(s) executeTestParallel() varies when re-running a test spec.
 */
public enum ParallelTestType {
// re-run the spec with the "-nt" thread counts
TREE_REDUCIBLE,
// re-run the spec with the "-nct" thread counts
NANO_SCHEDULED,
// re-run the spec with both the "-nt" and the "-nct" thread counts
BOTH
}
protected Pair<List<File>, List<String>> executeTest(final String name, WalkerTestSpec spec, List<Integer> parallelThreads) {
/**
 * Runs the given test spec under the thread counts appropriate for the requested kind of parallelism.
 *
 * @param name     base name of the test
 * @param spec     the walker test specification to execute
 * @param testType which thread-count argument(s) to vary: thread counts 1 and 4 are passed for
 *                 each selected kind, and an empty list for each kind not selected
 * @return whatever executeTest(name, spec, ntThreads, cpuThreads) returns
 */
protected Pair<List<File>, List<String>> executeTestParallel(final String name, WalkerTestSpec spec, ParallelTestType testType) {
    final boolean wantBoth = testType == ParallelTestType.BOTH;
    final boolean varyNt = wantBoth || testType == ParallelTestType.TREE_REDUCIBLE;
    final boolean varyNct = wantBoth || testType == ParallelTestType.NANO_SCHEDULED;

    final List<Integer> ntThreads;
    if ( varyNt ) {
        ntThreads = Arrays.asList(1, 4);
    }
    else {
        ntThreads = Collections.<Integer>emptyList();
    }

    final List<Integer> cntThreads;
    if ( varyNct ) {
        cntThreads = Arrays.asList(1, 4);
    }
    else {
        cntThreads = Collections.<Integer>emptyList();
    }

    return executeTest(name, spec, ntThreads, cntThreads);
}
/**
 * Convenience overload of executeTestParallel() that uses ParallelTestType.TREE_REDUCIBLE.
 *
 * @param name base name of the test
 * @param spec the walker test specification to execute
 * @return the result of the three-argument executeTestParallel()
 */
protected Pair<List<File>, List<String>> executeTestParallel(final String name, WalkerTestSpec spec) {
    final ParallelTestType defaultTestType = ParallelTestType.TREE_REDUCIBLE;
    return executeTestParallel(name, spec, defaultTestType);
}
protected Pair<List<File>, List<String>> executeTest(final String name, WalkerTestSpec spec, List<Integer> ntThreads, List<Integer> cpuThreads) {
String originalArgs = spec.args;
Pair<List<File>, List<String>> results = null;
for ( int nt : parallelThreads ) {
boolean ran1 = false;
for ( int nt : ntThreads ) {
String extra = nt == 1 ? "" : (" -nt " + nt);
ran1 = ran1 || nt == 1;
spec.args = originalArgs + extra;
results = executeTest(name + "-nt-" + nt, spec);
}
for ( int nct : cpuThreads ) {
if ( nct != 1 ) {
String extra = " -nct " + nct;
spec.args = originalArgs + extra;
results = executeTest(name + "-cnt-" + nct, spec);
}
}
return results;
}

View File

@ -0,0 +1,41 @@
package org.broadinstitute.sting.commandline;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.WalkerTest;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.testng.annotations.Test;
import org.testng.annotations.DataProvider;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: 8/31/12
* Time: 11:03 AM
* To change this template use File | Settings | File Templates.
*/
public class InvalidArgumentIntegrationTest extends WalkerTest {
    /** VCF passed as the --variant input in every test command line */
    private static final String callsB36 = BaseTest.validationDataLocation + "lowpass.N3.chr1.raw.vcf";

    /**
     * Builds a VariantsToTable test spec with one extra flag/argument pair appended, expecting
     * one output file and the given exception.
     *
     * @param flag              the extra command-line flag to append
     * @param arg               the value for the extra flag
     * @param expectedException the exception class passed to the WalkerTestSpec
     * @return the assembled test spec
     */
    private WalkerTest.WalkerTestSpec baseTest(String flag, String arg, Class expectedException) {
        final String commandLine = "-T VariantsToTable -M 10 --variant:vcf "
                + callsB36 + " -F POS,CHROM -R "
                + b36KGReference + " -o %s " + flag + " " + arg;

        return new WalkerTest.WalkerTestSpec(commandLine, 1, expectedException);
    }

    /** Runs the base command line with a read filter name of "TestUnknownReadFilter" */
    @Test
    public void testUnknownReadFilter() {
        final WalkerTest.WalkerTestSpec spec =
                baseTest("-rf", "TestUnknownReadFilter", UserException.MalformedReadFilterException.class);

        executeTest("UnknownReadFilter", spec);
    }

    /** Runs a command line whose -T argument is "UnknownWalkerName" */
    @Test
    public void testMalformedWalkerArgs() {
        final String commandLine = "-T UnknownWalkerName -M 10 --variant:vcf "
                + callsB36 + " -F POS,CHROM -R "
                + b36KGReference + " -o %s ";

        executeTest("MalformedWalkerArgs",
                    new WalkerTest.WalkerTestSpec(commandLine, 1, UserException.MalformedWalkerArgumentsException.class));
    }
}

View File

@ -29,7 +29,7 @@ import net.sf.picard.filter.FilteringIterator;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.commandline.Tags;
import org.broadinstitute.sting.gatk.DownsamplingMethod;
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
import org.broadinstitute.sting.gatk.ReadProperties;
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
@ -37,6 +37,7 @@ import org.broadinstitute.sting.gatk.filters.ReadFilter;
import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter;
import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState;
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
import org.broadinstitute.sting.gatk.walkers.qc.CountLoci;
import org.broadinstitute.sting.utils.GenomeLocParser;
import java.util.Collections;
@ -97,7 +98,7 @@ public class DownsamplerBenchmark extends ReadProcessingBenchmark {
},
PER_SAMPLE {
@Override
DownsamplingMethod create() { return GATKArgumentCollection.getDefaultDownsamplingMethod(); }
DownsamplingMethod create() { return DownsamplingMethod.getDefaultDownsamplingMethod(new CountLoci(), false); }
};
abstract DownsamplingMethod create();
}

View File

@ -25,36 +25,40 @@
package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMProgramRecord;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.*;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.commandline.Tags;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
import org.broadinstitute.sting.gatk.filters.ReadFilter;
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStream;
import org.testng.annotations.AfterMethod;
import org.testng.annotations.BeforeMethod;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import org.testng.Assert;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import static org.testng.Assert.*;
/**
* @author aaron
* @version 1.0
* @date Apr 8, 2009
* <p/>
* Class SAMDataSourceUnitTest
* <p/>
@ -66,6 +70,161 @@ public class SAMDataSourceUnitTest extends BaseTest {
private IndexedFastaSequenceFile seq;
private GenomeLocParser genomeLocParser;
/***********************************
* Tests for the fillShard() method
***********************************/
/**
* Tests to ensure that the fillShard() method does not place shard boundaries at inappropriate places,
* such as within an alignment start position
*/
private static class SAMDataSourceFillShardBoundaryTest extends TestDataProvider {
    private int numContigs;
    private int numStacksPerContig;
    private int stackSize;
    private int numUnmappedReads;
    private DownsamplingMethod downsamplingMethod;

    /** Header of the test BAM; only initialized once createTestBAM() has run */
    private SAMFileHeader header;

    /**
     * @param numContigs                 number of contigs in the artificial BAM
     * @param numStacksPerContig         number of read stacks per contig
     * @param stackSize                  exact number of reads in every stack
     * @param numUnmappedReads           number of unmapped reads in the artificial BAM
     * @param downsamplingTargetCoverage target coverage for the BY_SAMPLE downsampling method
     */
    public SAMDataSourceFillShardBoundaryTest( int numContigs,
                                               int numStacksPerContig,
                                               int stackSize,
                                               int numUnmappedReads,
                                               int downsamplingTargetCoverage ) {
        super(SAMDataSourceFillShardBoundaryTest.class);

        this.numContigs = numContigs;
        this.numStacksPerContig = numStacksPerContig;
        this.stackSize = stackSize;
        this.numUnmappedReads = numUnmappedReads;

        this.downsamplingMethod = new DownsamplingMethod(DownsampleType.BY_SAMPLE, downsamplingTargetCoverage, null, true);

        setName(String.format("%s: numContigs=%d numStacksPerContig=%d stackSize=%d numUnmappedReads=%d downsamplingTargetCoverage=%d",
                              getClass().getSimpleName(), numContigs, numStacksPerContig, stackSize, numUnmappedReads, downsamplingTargetCoverage));
    }

    /**
     * Writes the artificial BAM, shards it over all reads, and asserts that no shard mixes contigs
     * and that no alignment start position is split across two consecutive shards.
     */
    public void run() {
        // NOTE: createTestBAM() is the first constructor argument evaluated, and it initializes
        // the header field that the GenomeLocParser argument below depends on
        SAMDataSource dataSource = new SAMDataSource(Arrays.asList(createTestBAM()),
                                                     new ThreadAllocation(),
                                                     null,
                                                     new GenomeLocParser(header.getSequenceDictionary()),
                                                     false,
                                                     SAMFileReader.ValidationStringency.SILENT,
                                                     null,
                                                     downsamplingMethod,
                                                     new ValidationExclusion(),
                                                     new ArrayList<ReadFilter>(),
                                                     false);

        Assert.assertTrue(dataSource.usingExpandedShards());

        Iterable<Shard> shardIterator = dataSource.createShardIteratorOverAllReads(new ReadShardBalancer());

        SAMRecord readAtEndOfLastShard = null;

        for ( Shard shard : shardIterator ) {
            int numContigsThisShard = 0;
            SAMRecord lastRead = null;

            for ( SAMRecord read : shard.iterator() ) {
                if ( lastRead == null ) {
                    numContigsThisShard = 1;
                }
                else if ( ! read.getReadUnmappedFlag() && ! lastRead.getReferenceIndex().equals(read.getReferenceIndex()) ) {
                    numContigsThisShard++;
                }

                // If the last read from the previous shard is not unmapped, we have to make sure
                // that no reads in this shard start at the same position
                if ( readAtEndOfLastShard != null && ! readAtEndOfLastShard.getReadUnmappedFlag() ) {
                    Assert.assertFalse(readAtEndOfLastShard.getReferenceIndex().equals(read.getReferenceIndex()) &&
                                       readAtEndOfLastShard.getAlignmentStart() == read.getAlignmentStart(),
                                       String.format("Reads from alignment start position %d:%d are split across multiple shards",
                                                     read.getReferenceIndex(), read.getAlignmentStart()));
                }

                lastRead = read;
            }

            // There should never be reads from more than 1 contig in a shard (ignoring unmapped reads)
            Assert.assertTrue(numContigsThisShard == 1, "found a shard with reads from multiple contigs");

            readAtEndOfLastShard = lastRead;
        }
    }

    /**
     * Creates the artificial header and read stream for this test and writes the reads to a
     * temporary BAM file that is deleted when the JVM exits.
     *
     * @return a reader ID for the newly written temporary BAM
     */
    private SAMReaderID createTestBAM() {
        header = ArtificialSAMUtils.createArtificialSamHeader(numContigs, 1, 100000);
        SAMReadGroupRecord readGroup = new SAMReadGroupRecord("foo");
        readGroup.setSample("testSample");
        header.addReadGroup(readGroup);

        // min and max reads per stack are both stackSize, so every stack has exactly stackSize reads
        ArtificialSingleSampleReadStream artificialReads = new ArtificialSingleSampleReadStream(header,
                                                                                                "foo",
                                                                                                numContigs,
                                                                                                numStacksPerContig,
                                                                                                stackSize,
                                                                                                stackSize,
                                                                                                1,
                                                                                                100,
                                                                                                50,
                                                                                                150,
                                                                                                numUnmappedReads);

        File testBAMFile;
        try {
            testBAMFile = File.createTempFile("SAMDataSourceFillShardBoundaryTest", ".bam");
            testBAMFile.deleteOnExit();
        }
        catch ( IOException e ) {
            throw new ReviewedStingException(String.format("Failed to create temp bam file for test %s. %s", this, e.getMessage()));
        }

        SAMFileWriter bamWriter = new SAMFileWriterFactory().setCreateIndex(true).makeBAMWriter(header, true, testBAMFile);
        try {
            for ( SAMRecord read : artificialReads ) {
                bamWriter.addAlignment(read);
            }
        }
        finally {
            // Always release the writer, even if adding a read fails part-way through
            bamWriter.close();
        }

        return new SAMReaderID(testBAMFile, new Tags());
    }
}
@DataProvider(name = "SAMDataSourceFillShardTestDataProvider")
public Object[][] createSAMDataSourceFillShardBoundaryTests() {
    final int maxReads = ReadShard.MAX_READS;

    // Take downsampling out of the equation for these tests -- we are only interested in whether the
    // shard boundaries occur at the right places in the read stream, and removing downsampling as a
    // factor simplifies that task (note that we still need to provide a specific downsampling method with
    // experimental downsampling enabled to trigger the shard expansion behavior, for now)
    final int downsamplingTargetCoverage = maxReads * 10;

    final List<Integer> contigCounts = Arrays.asList(1, 2, 3);
    final List<Integer> stackCounts = Arrays.asList(1, 2, 4);
    // Use crucial read shard boundary values as the stack sizes
    final List<Integer> stackSizes = Arrays.asList(maxReads / 2, maxReads / 2 + 10, maxReads, maxReads - 1, maxReads + 1, maxReads * 2);
    final List<Integer> unmappedReadCounts = Arrays.asList(0, maxReads / 2, maxReads * 2);

    // Each constructed test is collected later through getTests(), so the instances are not kept here
    for ( int numContigs : contigCounts ) {
        for ( int numStacksPerContig : stackCounts ) {
            for ( int stackSize : stackSizes ) {
                for ( int numUnmappedReads : unmappedReadCounts ) {
                    new SAMDataSourceFillShardBoundaryTest(numContigs, numStacksPerContig, stackSize, numUnmappedReads, downsamplingTargetCoverage);
                }
            }
        }
    }

    return SAMDataSourceFillShardBoundaryTest.getTests(SAMDataSourceFillShardBoundaryTest.class);
}
// TODO: re-enable these tests once the issues with filepointer ordering + the downsamplers are worked out
@Test(dataProvider = "SAMDataSourceFillShardTestDataProvider", enabled = false)
public void testSAMDataSourceFillShard( SAMDataSourceFillShardBoundaryTest test ) {
    // Announce which parameter combination is about to run, then execute it
    final String announcement = "Running test: " + test;
    logger.warn(announcement);

    test.run();
}
// TODO: the legacy tests below should really be replaced with a more comprehensive suite of tests for SAMDataSource
/**
* This function does the setup of our parser, before each method call.
* <p/>

View File

@ -1,73 +1,138 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMReadGroupRecord;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.testng.Assert;
import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStream;
import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStreamAnalyzer;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Arrays;
public class DownsamplingReadsIteratorUnitTest {
public class DownsamplingReadsIteratorUnitTest extends BaseTest {
@Test
public void testDownsamplingIteratorWithPositionalDownsampling() {
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
private static class DownsamplingReadsIteratorTest extends TestDataProvider {
private DownsamplingReadsIterator downsamplingIter;
private int targetCoverage;
private ArtificialSingleSampleReadStream stream;
private ArtificialSingleSampleReadStreamAnalyzer streamAnalyzer;
Collection<SAMRecord> reads = new ArrayList<SAMRecord>();
public DownsamplingReadsIteratorTest( ArtificialSingleSampleReadStream stream, int targetCoverage ) {
super(DownsamplingReadsIteratorTest.class);
reads.addAll(createStackOfIdenticalReads(3000, header, "foo", 0, 1, 100));
reads.addAll(createStackOfIdenticalReads(3000, header, "foo", 0, 50, 100));
this.stream = stream;
this.targetCoverage = targetCoverage;
StingSAMIterator iter = new DownsamplingReadsIterator(StingSAMIteratorAdapter.adapt(reads.iterator()), new PositionalDownsampler<SAMRecord>(1000));
Assert.assertTrue(iter.hasNext());
SAMRecord previous = iter.next();
int count = 1;
while ( iter.hasNext() ) {
SAMRecord current = iter.next();
Assert.assertTrue(previous.getAlignmentStart() <= current.getAlignmentStart() || ! previous.getReferenceIndex().equals(current.getReferenceIndex()));
count++;
previous = current;
setName(String.format("%s: targetCoverage=%d numContigs=%d stacksPerContig=%d readsPerStack=%d-%d distanceBetweenStacks=%d-%d readLength=%d-%d unmappedReads=%d",
getClass().getSimpleName(),
targetCoverage,
stream.getNumContigs(),
stream.getNumStacksPerContig(),
stream.getMinReadsPerStack(),
stream.getMaxReadsPerStack(),
stream.getMinDistanceBetweenStacks(),
stream.getMaxDistanceBetweenStacks(),
stream.getMinReadLength(),
stream.getMaxReadLength(),
stream.getNumUnmappedReads()));
}
Assert.assertEquals(count, 1000);
public void run() {
streamAnalyzer = new PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer(stream, targetCoverage);
downsamplingIter = new DownsamplingReadsIterator(stream.getStingSAMIterator(), new SimplePositionalDownsampler<SAMRecord>(targetCoverage));
streamAnalyzer.analyze(downsamplingIter);
// Check whether the observed properties of the downsampled stream are what they should be
streamAnalyzer.validate();
// Allow memory used by this test to be reclaimed
stream = null;
streamAnalyzer = null;
downsamplingIter = null;
}
}
@Test
public void testDownsamplingIteratorNoEffectiveDownsampling() {
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
@DataProvider(name = "DownsamplingReadsIteratorTestDataProvider")
public Object[][] createDownsamplingReadsIteratorTests() {
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(5, 1, 10000);
String readGroupID = "testReadGroup";
SAMReadGroupRecord readGroup = new SAMReadGroupRecord(readGroupID);
readGroup.setSample("testSample");
header.addReadGroup(readGroup);
Collection<SAMRecord> reads = new ArrayList<SAMRecord>();
// Values that don't vary across tests
int targetCoverage = 10;
int minReadLength = 50;
int maxReadLength = 100;
int minDistanceBetweenStacks = 1;
int maxDistanceBetweenStacks = maxReadLength + 1;
reads.addAll(createStackOfIdenticalReads(300, header, "foo", 0, 1, 100));
reads.addAll(createStackOfIdenticalReads(300, header, "foo", 0, 50, 100));
GenomeAnalysisEngine.resetRandomGenerator();
StingSAMIterator iter = new DownsamplingReadsIterator(StingSAMIteratorAdapter.adapt(reads.iterator()), new PositionalDownsampler<SAMRecord>(1000));
Assert.assertTrue(iter.hasNext());
SAMRecord previous = iter.next();
int count = 1;
while ( iter.hasNext() ) {
SAMRecord current = iter.next();
Assert.assertTrue(previous.getAlignmentStart() <= current.getAlignmentStart() || ! previous.getReferenceIndex().equals(current.getReferenceIndex()));
count++;
previous = current;
// brute force testing!
for ( int numContigs : Arrays.asList(1, 2, 5) ) {
for ( int stacksPerContig : Arrays.asList(1, 2, 10) ) {
for ( int minReadsPerStack : Arrays.asList(1, targetCoverage / 2, targetCoverage, targetCoverage - 1, targetCoverage + 1, targetCoverage * 2) ) {
for ( int maxReadsPerStack : Arrays.asList(1, targetCoverage / 2, targetCoverage, targetCoverage - 1, targetCoverage + 1, targetCoverage * 2) ) {
for ( int numUnmappedReads : Arrays.asList(0, 1, targetCoverage, targetCoverage * 2) ) {
// Only interested in sane read stream configurations here
if ( minReadsPerStack <= maxReadsPerStack ) {
new DownsamplingReadsIteratorTest(new ArtificialSingleSampleReadStream(header,
readGroupID,
numContigs,
stacksPerContig,
minReadsPerStack,
maxReadsPerStack,
minDistanceBetweenStacks,
maxDistanceBetweenStacks,
minReadLength,
maxReadLength,
numUnmappedReads),
targetCoverage);
}
}
}
}
}
}
Assert.assertEquals(count, 600);
return DownsamplingReadsIteratorTest.getTests(DownsamplingReadsIteratorTest.class);
}
private ArrayList<SAMRecord> createStackOfIdenticalReads( int stackSize, SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) {
ArrayList<SAMRecord> stack = new ArrayList<SAMRecord>(stackSize);
for ( int i = 1; i <= stackSize; i++ ) {
stack.add(ArtificialSAMUtils.createArtificialRead(header, name, refIndex, alignmentStart, length));
}
return stack;
@Test(dataProvider = "DownsamplingReadsIteratorTestDataProvider")
public void runDownsamplingReadsIteratorTest( DownsamplingReadsIteratorTest test ) {
logger.warn("Running test: " + test);
GenomeAnalysisEngine.resetRandomGenerator();
test.run();
}
}

View File

@ -1,65 +1,157 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import org.testng.Assert;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
public class FractionalDownsamplerUnitTest {
public class FractionalDownsamplerUnitTest extends BaseTest {
@Test
public void test100PercentInclusion() {
FractionalDownsampler<SAMRecord> downsampler = new FractionalDownsampler<SAMRecord>(1.0);
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
private static class FractionalDownsamplerTest extends TestDataProvider {
double fraction;
int totalReads;
int expectedMinNumReadsAfterDownsampling;
int expectedMaxNumReadsAfterDownsampling;
int expectedMinDiscardedItems;
int expectedMaxDiscardedItems;
downsampler.submit(createRandomReads(1000, header, "foo", 0, 100000, 500));
downsampler.signalEndOfInput();
private static final double EXPECTED_ACCURACY = 0.05; // should be accurate to within +/- this percent
List<SAMRecord> downsampledReads = downsampler.consumeDownsampledItems();
public FractionalDownsamplerTest( double fraction, int totalReads ) {
super(FractionalDownsamplerTest.class);
Assert.assertTrue(downsampledReads.size() == 1000);
}
this.fraction = fraction;
this.totalReads = totalReads;
@Test
public void test0PercentInclusion() {
FractionalDownsampler<SAMRecord> downsampler = new FractionalDownsampler<SAMRecord>(0.0);
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
calculateExpectations();
downsampler.submit(createRandomReads(1000, header, "foo", 0, 100000, 500));
downsampler.signalEndOfInput();
List<SAMRecord> downsampledReads = downsampler.consumeDownsampledItems();
Assert.assertTrue(downsampledReads.isEmpty());
}
/**
 * Submits 5000 artificial reads to a downsampler constructed with a fraction of 0.5 and
 * checks that the number of reads it returns lies between 2000 and 3000 inclusive.
 */
@Test
public void test50PercentInclusion() {
    final FractionalDownsampler<SAMRecord> halfDownsampler = new FractionalDownsampler<SAMRecord>(0.5);
    final SAMFileHeader samHeader = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);

    halfDownsampler.submit(createRandomReads(5000, samHeader, "foo", 0, 100000, 500));
    halfDownsampler.signalEndOfInput();

    final List<SAMRecord> keptReads = halfDownsampler.consumeDownsampledItems();
    final int keptCount = keptReads.size();
    Assert.assertTrue(keptCount >= 2000 && keptCount <= 3000);
}
private List<SAMRecord> createRandomReads( int numReads, SAMFileHeader header, String name, int contigIndex, int maxAlignmentStart, int maxLength ) {
List<SAMRecord> reads = new ArrayList<SAMRecord>(numReads);
for ( int i = 1; i <= numReads; i++ ) {
reads.add(ArtificialSAMUtils.createArtificialRead(header, name, contigIndex,
GenomeAnalysisEngine.getRandomGenerator().nextInt(maxAlignmentStart) + 1,
GenomeAnalysisEngine.getRandomGenerator().nextInt(maxLength) + 1));
setName(String.format("%s: fraction=%.2f totalReads=%d expectedMinNumReadsAfterDownsampling=%d expectedMaxNumReadsAfterDownsampling=%d",
getClass().getSimpleName(), fraction, totalReads, expectedMinNumReadsAfterDownsampling, expectedMaxNumReadsAfterDownsampling));
}
return reads;
/**
 * Fills in the expected bounds on the number of reads kept and discarded for this case.
 *
 * A fraction of exactly 0.0 or 1.0 requires an exact result (nothing kept, or everything
 * kept). Any other fraction allows the kept count to range over
 * (fraction -/+ EXPECTED_ACCURACY) * totalReads, clamped to [0, totalReads]; the discard
 * bounds are the complements of the kept bounds.
 */
private void calculateExpectations() {
    // Require an exact match in the 0% and 100% cases
    if ( fraction == 0.0 ) {
        expectedMinNumReadsAfterDownsampling = 0;
        expectedMaxNumReadsAfterDownsampling = 0;
        expectedMinDiscardedItems = totalReads;
        expectedMaxDiscardedItems = totalReads;
        return;
    }

    if ( fraction == 1.0 ) {
        expectedMinNumReadsAfterDownsampling = totalReads;
        expectedMaxNumReadsAfterDownsampling = totalReads;
        expectedMinDiscardedItems = 0;
        expectedMaxDiscardedItems = 0;
        return;
    }

    // The casts truncate toward zero before the bounds are clamped.
    final int unclampedLowerBound = (int) ((fraction - EXPECTED_ACCURACY) * totalReads);
    final int unclampedUpperBound = (int) ((fraction + EXPECTED_ACCURACY) * totalReads);

    expectedMinNumReadsAfterDownsampling = Math.max(unclampedLowerBound, 0);
    expectedMaxNumReadsAfterDownsampling = Math.min(unclampedUpperBound, totalReads);

    expectedMinDiscardedItems = totalReads - expectedMaxNumReadsAfterDownsampling;
    expectedMaxDiscardedItems = totalReads - expectedMinNumReadsAfterDownsampling;
}
/**
 * Builds the input for this case: totalReads artificial reads named "foo", produced by a
 * single call to ArtificialSAMUtils.createStackOfIdenticalArtificialReads.
 *
 * @return a new collection holding the generated reads
 */
public Collection<SAMRecord> createReads() {
    final Collection<SAMRecord> identicalReads = new ArrayList<SAMRecord>(totalReads);
    final SAMFileHeader samHeader = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);

    identicalReads.addAll(
            ArtificialSAMUtils.createStackOfIdenticalArtificialReads(totalReads, samHeader, "foo", 0, 1, 100));

    return identicalReads;
}
}
/**
 * Creates one FractionalDownsamplerTest for every combination of the fractions
 * {0.0, 0.25, 0.5, 0.75, 1.0} and the read counts {0, 1000, 10000}.
 *
 * The instances are not stored here; the result comes from
 * FractionalDownsamplerTest.getTests(), so the constructor presumably registers each
 * case with the TestDataProvider base class (confirm against TestDataProvider).
 *
 * @return the data-provider rows returned by getTests()
 */
@DataProvider(name = "FractionalDownsamplerTestDataProvider")
public Object[][] createFractionalDownsamplerTestData() {
    final double[] fractions = { 0.0, 0.25, 0.5, 0.75, 1.0 };
    final int[] readCounts = { 0, 1000, 10000 };

    for ( final double fraction : fractions ) {
        for ( final int totalReads : readCounts ) {
            new FractionalDownsamplerTest(fraction, totalReads);
        }
    }

    return FractionalDownsamplerTest.getTests(FractionalDownsamplerTest.class);
}
/**
 * Runs one FractionalDownsamplerTest case end to end.
 *
 * The same finalized/pending state is checked twice, once right after the reads are
 * submitted and once after end of input is signalled. The finalized items are then
 * consumed, the downsampler is checked to be empty, and the kept and discarded counts
 * are compared with the bounds computed by the case. Finally the downsampler is reset
 * and its discard counter is checked to be zero.
 *
 * @param test the data-provider case to execute
 */
@Test(dataProvider = "FractionalDownsamplerTestDataProvider")
public void runFractionalDownsamplerTest( FractionalDownsamplerTest test ) {
    logger.warn("Running test: " + test);

    GenomeAnalysisEngine.resetRandomGenerator();

    ReadsDownsampler<SAMRecord> downsampler = new FractionalDownsampler<SAMRecord>(test.fraction);

    downsampler.submit(test.createReads());
    assertExpectedDownsamplerState(downsampler, test);

    downsampler.signalEndOfInput();
    assertExpectedDownsamplerState(downsampler, test);

    List<SAMRecord> downsampledReads = downsampler.consumeFinalizedItems();

    // Once the finalized items have been consumed nothing may remain in the downsampler.
    Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems());
    Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null);

    final int numKept = downsampledReads.size();
    Assert.assertTrue(numKept >= test.expectedMinNumReadsAfterDownsampling &&
                      numKept <= test.expectedMaxNumReadsAfterDownsampling);

    Assert.assertTrue(downsampler.getNumberOfDiscardedItems() >= test.expectedMinDiscardedItems &&
                      downsampler.getNumberOfDiscardedItems() <= test.expectedMaxDiscardedItems);

    // Kept plus discarded must account for every submitted read.
    Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), test.totalReads - numKept);

    downsampler.reset();
    Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0);
}

/**
 * Checks the finalized/pending state expected both before and after end of input.
 *
 * With no reads submitted, the downsampler must report no finalized and no pending items.
 * Otherwise it must report no pending items, and it must report finalized items only when
 * the fraction exceeds EXPECTED_ACCURACY (smaller fractions are not checked for finalized items).
 */
private static void assertExpectedDownsamplerState( ReadsDownsampler<SAMRecord> downsampler, FractionalDownsamplerTest test ) {
    if ( test.totalReads <= 0 ) {
        Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems());
        Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null);
        return;
    }

    if ( test.fraction > FractionalDownsamplerTest.EXPECTED_ACCURACY ) {
        Assert.assertTrue(downsampler.hasFinalizedItems());
        Assert.assertTrue(downsampler.peekFinalized() != null);
    }

    Assert.assertFalse(downsampler.hasPendingItems());
    Assert.assertTrue(downsampler.peekPending() == null);
}
}

View File

@ -0,0 +1,163 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.testng.annotations.Test;
import org.testng.annotations.DataProvider;
import org.testng.Assert;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;
/**
 * Unit tests for LevelingDownsampler, driven by groups of stacks that all start out
 * with the same number of items.
 */
public class LevelingDownsamplerUnitTest extends BaseTest {

    /**
     * One data-provider case: numStacks stacks of stackSize placeholder objects each,
     * backed by either LinkedList or ArrayList, given to a LevelingDownsampler that is
     * constructed with targetSize.
     */
    private static class LevelingDownsamplerUniformStacksTest extends TestDataProvider {
        public enum DataStructure { LINKED_LIST, ARRAY_LIST }

        int targetSize;
        int numStacks;
        int stackSize;
        DataStructure dataStructure;
        int expectedSize;

        public LevelingDownsamplerUniformStacksTest( int targetSize, int numStacks, int stackSize, DataStructure dataStructure ) {
            super(LevelingDownsamplerUniformStacksTest.class);

            this.targetSize = targetSize;
            this.numStacks = numStacks;
            this.stackSize = stackSize;
            this.dataStructure = dataStructure;
            this.expectedSize = calculateExpectedDownsampledStackSize();

            final String description = String.format("%s: targetSize=%d numStacks=%d stackSize=%d dataStructure=%s expectedSize=%d",
                    getClass().getSimpleName(), targetSize, numStacks, stackSize, dataStructure, expectedSize);
            setName(description);
        }

        /**
         * Builds the input for this case.
         *
         * @return numStacks freshly created stacks, each holding stackSize new objects
         */
        public Collection<List<Object>> createStacks() {
            final Collection<List<Object>> allStacks = new ArrayList<List<Object>>();
            for ( int stackIndex = 0; stackIndex < numStacks; stackIndex++ ) {
                allStacks.add(createFilledStack());
            }
            return allStacks;
        }

        /** Creates one stack of the configured list type and fills it with stackSize new objects. */
        private List<Object> createFilledStack() {
            final List<Object> stack;
            if ( dataStructure == DataStructure.LINKED_LIST ) {
                stack = new LinkedList<Object>();
            }
            else {
                stack = new ArrayList<Object>();
            }

            for ( int itemIndex = 0; itemIndex < stackSize; itemIndex++ ) {
                stack.add(new Object());
            }
            return stack;
        }

        /**
         * Computes the per-stack size the test expects after leveling: 0 when there are no
         * stacks, the untouched stackSize when the total does not exceed targetSize, and
         * otherwise stackSize minus an even share of the surplus, but never less than 1.
         */
        private int calculateExpectedDownsampledStackSize() {
            if ( numStacks == 0 ) {
                return 0;
            }

            final int surplusItems = numStacks * stackSize - targetSize;
            if ( surplusItems <= 0 ) {
                return stackSize;
            }

            final int removedPerStack = surplusItems / numStacks;
            return Math.max(1, stackSize - removedPerStack);
        }
    }

    /**
     * Creates a case for every combination of target size (1 to 10000 in powers of ten),
     * number of stacks (0 to 10), stack size (1 to 1000 in powers of ten) and backing
     * list type.
     */
    @DataProvider(name = "UniformStacksDataProvider")
    public Object[][] createUniformStacksTestData() {
        final int[] targetSizes = { 1, 10, 100, 1000, 10000 };
        final int[] stackSizes = { 1, 10, 100, 1000 };

        for ( final int targetSize : targetSizes ) {
            for ( int numStacks = 0; numStacks <= 10; numStacks++ ) {
                for ( final int stackSize : stackSizes ) {
                    for ( final LevelingDownsamplerUniformStacksTest.DataStructure dataStructure : LevelingDownsamplerUniformStacksTest.DataStructure.values() ) {
                        new LevelingDownsamplerUniformStacksTest(targetSize, numStacks, stackSize, dataStructure);
                    }
                }
            }
        }

        return LevelingDownsamplerUniformStacksTest.getTests(LevelingDownsamplerUniformStacksTest.class);
    }

    /**
     * Runs one uniform-stacks case: submitted stacks must be pending until end of input is
     * signalled and finalized afterwards; every returned stack must be within one item of
     * the expected size; the reported discard count must equal the number of items actually
     * missing; and the total remaining must not exceed max(targetSize, numStacks).
     *
     * @param test the data-provider case to execute
     */
    @Test( dataProvider = "UniformStacksDataProvider" )
    public void testLevelingDownsamplerWithUniformStacks( LevelingDownsamplerUniformStacksTest test ) {
        logger.warn("Running test: " + test);

        GenomeAnalysisEngine.resetRandomGenerator();

        Downsampler<List<Object>> downsampler = new LevelingDownsampler<List<Object>, Object>(test.targetSize);
        final boolean hasInput = test.numStacks > 0;

        downsampler.submit(test.createStacks());
        if ( hasInput ) {
            assertFinalizedAndPendingState(downsampler, false, true);
        }
        else {
            assertHoldsNothing(downsampler);
        }

        downsampler.signalEndOfInput();
        if ( hasInput ) {
            assertFinalizedAndPendingState(downsampler, true, false);
        }
        else {
            assertHoldsNothing(downsampler);
        }

        List<List<Object>> downsampledStacks = downsampler.consumeFinalizedItems();
        assertHoldsNothing(downsampler);

        Assert.assertEquals(downsampledStacks.size(), test.numStacks);

        int totalRemainingItems = 0;
        for ( List<Object> stack : downsampledStacks ) {
            final int remainingInStack = stack.size();
            Assert.assertTrue(Math.abs(remainingInStack - test.expectedSize) <= 1);
            totalRemainingItems += remainingInStack;
        }

        int numItemsReportedDiscarded = downsampler.getNumberOfDiscardedItems();
        int numItemsActuallyDiscarded = test.numStacks * test.stackSize - totalRemainingItems;
        Assert.assertEquals(numItemsReportedDiscarded, numItemsActuallyDiscarded);

        downsampler.reset();
        Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0);

        Assert.assertTrue(totalRemainingItems <= Math.max(test.targetSize, test.numStacks));
    }

    /** Asserts that the downsampler reports neither finalized nor pending items. */
    private static void assertHoldsNothing( Downsampler<List<Object>> downsampler ) {
        Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems());
        Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null);
    }

    /**
     * Asserts the presence or absence of finalized and pending items, checking both the
     * has...() flag and the corresponding peek...() result for each kind.
     */
    private static void assertFinalizedAndPendingState( Downsampler<List<Object>> downsampler, boolean expectFinalized, boolean expectPending ) {
        Assert.assertTrue(downsampler.hasFinalizedItems() == expectFinalized);
        Assert.assertTrue((downsampler.peekFinalized() != null) == expectFinalized);

        Assert.assertTrue(downsampler.hasPendingItems() == expectPending);
        Assert.assertTrue((downsampler.peekPending() != null) == expectPending);
    }
}

View File

@ -0,0 +1,298 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMReadGroupRecord;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.gatk.iterators.VerifyingSamIterator;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.sam.ArtificialMultiSampleReadStream;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStream;
import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStreamAnalyzer;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.*;
/**
 * Unit test for PerSampleDownsamplingReadsIterator. Each case builds one artificial read
 * stream per sample, merges them, runs the merged stream through the iterator, and
 * validates the output of each sample with that sample's own stream analyzer.
 */
public class PerSampleDownsamplingReadsIteratorUnitTest extends BaseTest {
/**
 * A single data-provider case. The constructor builds the header and all read streams,
 * so the random draws that shape the streams happen when the case is created, not when
 * it is run.
 */
private static class PerSampleDownsamplingReadsIteratorTest extends TestDataProvider {
// TODO: tests should distinguish between variance across samples and variance within a sample
/** Minimum and maximum distance between stacks, passed through to each artificial read stream. */
private enum StreamDensity {
SPARSE (MAX_READ_LENGTH, MAX_READ_LENGTH * 2),
DENSE (1, MIN_READ_LENGTH),
MIXED (1, MAX_READ_LENGTH * 2),
UNIFORM_DENSE (1, 1),
UNIFORM_SPARSE (MAX_READ_LENGTH * 2, MAX_READ_LENGTH * 2);
int minDistanceBetweenStacks;
int maxDistanceBetweenStacks;
StreamDensity( int minDistanceBetweenStacks, int maxDistanceBetweenStacks ) {
this.minDistanceBetweenStacks = minDistanceBetweenStacks;
this.maxDistanceBetweenStacks = maxDistanceBetweenStacks;
}
public String toString() {
return String.format("StreamDensity:%d-%d", minDistanceBetweenStacks, maxDistanceBetweenStacks);
}
}
/** Minimum and maximum number of reads per stack, passed through to each artificial read stream. */
private enum StreamStackDepth {
NON_UNIFORM_LOW (1, 5),
NON_UNIFORM_HIGH (15, 20),
NON_UNIFORM_MIXED (1, 20),
UNIFORM_SINGLE (1, 1),
UNIFORM_LOW (2, 2),
UNIFORM_HIGH (20, 20),
UNIFORM_MEDIUM (10, 10); // should set target coverage to this value for testing
int minReadsPerStack;
int maxReadsPerStack;
StreamStackDepth( int minReadsPerStack, int maxReadsPerStack ) {
this.minReadsPerStack = minReadsPerStack;
this.maxReadsPerStack = maxReadsPerStack;
}
/** @return true when the minimum and maximum reads per stack are equal */
public boolean isUniform() {
return minReadsPerStack == maxReadsPerStack;
}
public String toString() {
return String.format("StreamStackDepth:%d-%d", minReadsPerStack, maxReadsPerStack);
}
}
/** Range from which the number of stacks per contig is drawn for each sample. */
private enum StreamStacksPerContig {
UNIFORM(20, 20),
NON_UNIFORM(1, 30);
int minStacksPerContig;
int maxStacksPerContig;
StreamStacksPerContig( int minStacksPerContig, int maxStacksPerContig ) {
this.minStacksPerContig = minStacksPerContig;
this.maxStacksPerContig = maxStacksPerContig;
}
/** @return true when the minimum and maximum stacks per contig are equal */
public boolean isUniform() {
return minStacksPerContig == maxStacksPerContig;
}
public String toString() {
return String.format("StreamStacksPerContig:%d-%d", minStacksPerContig, maxStacksPerContig);
}
}
// Not interested in testing multiple ranges for the read lengths, as none of our current
// downsamplers are affected by read length
private static final int MIN_READ_LENGTH = 50;
private static final int MAX_READ_LENGTH = 150;
private ReadsDownsamplerFactory<SAMRecord> downsamplerFactory;
private int targetCoverage;
private int numSamples;
private int minContigs;
private int maxContigs;
private StreamDensity streamDensity;
private StreamStackDepth streamStackDepth;
private StreamStacksPerContig streamStacksPerContig;
// Probability with which a given sample receives unmappedReadsCount unmapped reads (otherwise it receives none).
private double unmappedReadsFraction;
private int unmappedReadsCount;
private boolean verifySortedness;
private ArtificialMultiSampleReadStream mergedReadStream;
// Both maps are keyed by sample name.
private Map<String, ArtificialSingleSampleReadStream> perSampleArtificialReadStreams;
private Map<String, ArtificialSingleSampleReadStreamAnalyzer> perSampleStreamAnalyzers;
private SAMFileHeader header;
/**
 * Stores the case parameters, then builds the header and the per-sample read streams
 * and sets the display name of the case.
 */
public PerSampleDownsamplingReadsIteratorTest( ReadsDownsamplerFactory<SAMRecord> downsamplerFactory,
int targetCoverage,
int numSamples,
int minContigs,
int maxContigs,
StreamDensity streamDensity,
StreamStackDepth streamStackDepth,
StreamStacksPerContig streamStacksPerContig,
double unmappedReadsFraction,
int unmappedReadsCount,
boolean verifySortedness ) {
super(PerSampleDownsamplingReadsIteratorTest.class);
this.downsamplerFactory = downsamplerFactory;
this.targetCoverage = targetCoverage;
this.numSamples = numSamples;
this.minContigs = minContigs;
this.maxContigs = maxContigs;
this.streamDensity = streamDensity;
this.streamStackDepth = streamStackDepth;
this.streamStacksPerContig = streamStacksPerContig;
this.unmappedReadsFraction = unmappedReadsFraction;
this.unmappedReadsCount = unmappedReadsCount;
this.verifySortedness = verifySortedness;
// The header must exist first: createReadStreams() iterates over its read groups.
header = createHeader();
createReadStreams();
setName(String.format("%s: targetCoverage=%d numSamples=%d minContigs=%d maxContigs=%d %s %s %s unmappedReadsFraction=%.2f unmappedReadsCount=%d verifySortedness=%b",
getClass().getSimpleName(), targetCoverage, numSamples, minContigs, maxContigs, streamDensity, streamStackDepth, streamStacksPerContig, unmappedReadsFraction, unmappedReadsCount, verifySortedness));
}
/**
 * Creates the artificial header and adds one read group ("ReadGroup" + i) with a matching
 * sample name ("Sample" + i) for each of the numSamples samples.
 * NOTE(review): the arguments to createArtificialSamHeader are presumably the contig count,
 * the starting contig and the contig length - confirm against ArtificialSAMUtils.
 */
private SAMFileHeader createHeader() {
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(maxContigs, 1, (streamDensity.maxDistanceBetweenStacks + MAX_READ_LENGTH) * streamStacksPerContig.maxStacksPerContig + 100000);
List<String> readGroups = new ArrayList<String>(numSamples);
List<String> sampleNames = new ArrayList<String>(numSamples);
for ( int i = 0; i < numSamples; i++ ) {
readGroups.add("ReadGroup" + i);
sampleNames.add("Sample" + i);
}
return ArtificialSAMUtils.createEnumeratedReadGroups(header, readGroups, sampleNames);
}
/**
 * Builds one artificial read stream and one analyzer per read group in the header, stores
 * both under the read group's sample name, and merges all streams into mergedReadStream.
 * For each sample the contig count, the stacks per contig and the unmapped-read decision
 * are drawn at random, in that order.
 */
private void createReadStreams() {
perSampleArtificialReadStreams = new HashMap<String, ArtificialSingleSampleReadStream>(numSamples);
perSampleStreamAnalyzers = new HashMap<String, ArtificialSingleSampleReadStreamAnalyzer>(numSamples);
for (SAMReadGroupRecord readGroup : header.getReadGroups() ) {
String readGroupID = readGroup.getReadGroupId();
String sampleName = readGroup.getSample();
int thisSampleNumContigs = MathUtils.randomIntegerInRange(minContigs, maxContigs);
int thisSampleStacksPerContig = MathUtils.randomIntegerInRange(streamStacksPerContig.minStacksPerContig, streamStacksPerContig.maxStacksPerContig);
// All-or-nothing: the sample gets either unmappedReadsCount unmapped reads or none.
int thisSampleNumUnmappedReads = GenomeAnalysisEngine.getRandomGenerator().nextDouble() < unmappedReadsFraction ? unmappedReadsCount : 0;
ArtificialSingleSampleReadStream thisSampleStream = new ArtificialSingleSampleReadStream(header,
readGroupID,
thisSampleNumContigs,
thisSampleStacksPerContig,
streamStackDepth.minReadsPerStack,
streamStackDepth.maxReadsPerStack,
streamDensity.minDistanceBetweenStacks,
streamDensity.maxDistanceBetweenStacks,
MIN_READ_LENGTH,
MAX_READ_LENGTH,
thisSampleNumUnmappedReads);
perSampleArtificialReadStreams.put(sampleName, thisSampleStream);
perSampleStreamAnalyzers.put(sampleName, new PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer(thisSampleStream, targetCoverage));
}
mergedReadStream = new ArtificialMultiSampleReadStream(perSampleArtificialReadStreams.values());
}
/**
 * Runs the merged stream through a PerSampleDownsamplingReadsIterator (wrapped in a
 * VerifyingSamIterator when verifySortedness is set), feeds every read to the analyzer
 * for its sample, then finalizes and validates each analyzer.
 * The stream and analyzer fields are cleared at the end, so a case cannot be run twice.
 *
 * @throws ReviewedStingException if a read's sample has no registered analyzer
 */
public void run() {
StingSAMIterator downsamplingIter = new PerSampleDownsamplingReadsIterator(mergedReadStream.getStingSAMIterator(), downsamplerFactory);
if ( verifySortedness ) {
downsamplingIter = new VerifyingSamIterator(downsamplingIter);
}
while ( downsamplingIter.hasNext() ) {
SAMRecord read = downsamplingIter.next();
// A read without a read group is looked up under a null sample name.
String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null;
ArtificialSingleSampleReadStreamAnalyzer analyzer = perSampleStreamAnalyzers.get(sampleName);
if ( analyzer != null ) {
analyzer.update(read);
}
else {
throw new ReviewedStingException("bug: stream analyzer for sample " + sampleName + " not found");
}
}
for ( Map.Entry<String, ArtificialSingleSampleReadStreamAnalyzer> analyzerEntry : perSampleStreamAnalyzers.entrySet() ) {
ArtificialSingleSampleReadStreamAnalyzer analyzer = analyzerEntry.getValue();
analyzer.finalizeStats();
// Validate the downsampled read stream for each sample individually
analyzer.validate();
}
// Allow memory used by this test to be reclaimed:
mergedReadStream = null;
perSampleArtificialReadStreams = null;
perSampleStreamAnalyzers = null;
}
}
/**
 * Resets the engine's random generator and then creates one case per combination of
 * sample count (1, 2, 10), minimum contig count (1 to maxContigs), stream density,
 * stack depth, stacks per contig, unmapped-read fraction (0.0, 1.0, 0.5) and
 * unmapped-read count (1, 50): 3 * 3 * 5 * 7 * 2 * 3 * 2 = 3780 cases.
 * The target coverage is taken from StreamStackDepth.UNIFORM_MEDIUM.
 */
@DataProvider(name = "PerSampleDownsamplingReadsIteratorTestDataProvider")
public Object[][] createPerSampleDownsamplingReadsIteratorTests() {
GenomeAnalysisEngine.resetRandomGenerator();
// Some values don't vary across tests
int targetCoverage = PerSampleDownsamplingReadsIteratorTest.StreamStackDepth.UNIFORM_MEDIUM.minReadsPerStack;
ReadsDownsamplerFactory<SAMRecord> downsamplerFactory = new SimplePositionalDownsamplerFactory<SAMRecord>(targetCoverage);
int maxContigs = 3;
boolean verifySortedness = true;
for ( int numSamples : Arrays.asList(1, 2, 10) ) {
for ( int minContigs = 1; minContigs <= maxContigs; minContigs++ ) {
for ( PerSampleDownsamplingReadsIteratorTest.StreamDensity streamDensity : PerSampleDownsamplingReadsIteratorTest.StreamDensity.values() ) {
for ( PerSampleDownsamplingReadsIteratorTest.StreamStackDepth streamStackDepth : PerSampleDownsamplingReadsIteratorTest.StreamStackDepth.values() ) {
for (PerSampleDownsamplingReadsIteratorTest.StreamStacksPerContig streamStacksPerContig : PerSampleDownsamplingReadsIteratorTest.StreamStacksPerContig.values() ) {
for ( double unmappedReadsFraction : Arrays.asList(0.0, 1.0, 0.5) ) {
for ( int unmappedReadsCount : Arrays.asList(1, 50) ) {
new PerSampleDownsamplingReadsIteratorTest(downsamplerFactory,
targetCoverage,
numSamples,
minContigs,
maxContigs,
streamDensity,
streamStackDepth,
streamStacksPerContig,
unmappedReadsFraction,
unmappedReadsCount,
verifySortedness);
}
}
}
}
}
}
}
return PerSampleDownsamplingReadsIteratorTest.getTests(PerSampleDownsamplingReadsIteratorTest.class);
}
/**
 * Logs the case, resets the engine's random generator, and runs the case.
 *
 * @param test the data-provider case to execute
 */
@Test(dataProvider = "PerSampleDownsamplingReadsIteratorTestDataProvider")
public void runPerSampleDownsamplingReadsIteratorTest( PerSampleDownsamplingReadsIteratorTest test ) {
logger.warn("Running test: " + test);
GenomeAnalysisEngine.resetRandomGenerator();
test.run();
}
}

View File

@ -1,357 +0,0 @@
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.annotations.Test;
import org.testng.Assert;
import java.util.*;
// TODO: generalize these tests so that all possible arrangements of 1-4 stacks can be tested
public class PositionalDownsamplerUnitTest extends BaseTest {
/**
 * Three stacks of 1500 identical 100-base reads starting at positions 1, 25 and 50,
 * so that every stack overlaps the other two (schematic, not to scale):
 *
 * -------
 * -------
 *   -------
 *   -------
 *     -------
 *     -------
 *
 * The test expects no downsampled items to be available until end of input is signalled.
 * Afterwards each stack, and the three stacks combined, must hold at most 1000 reads.
 */
@Test
public void testThreeOverlappingIdenticalStacks() {
    final SAMFileHeader samHeader = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
    final PositionalDownsampler<SAMRecord> positionalDownsampler = new PositionalDownsampler<SAMRecord>(1000);

    for ( final int stackStart : new int[] { 1, 25, 50 } ) {
        positionalDownsampler.submit(createStackOfIdenticalReads(1500, samHeader, "foo", 0, stackStart, 100));
        Assert.assertFalse(positionalDownsampler.hasDownsampledItems());
        Assert.assertTrue(positionalDownsampler.hasPendingItems());
    }

    positionalDownsampler.signalEndOfInput();
    Assert.assertTrue(positionalDownsampler.hasDownsampledItems());
    Assert.assertFalse(positionalDownsampler.hasPendingItems());

    final List<Integer> stackSizes = getDownsampledStackSizesAndVerifySortedness(positionalDownsampler.consumeDownsampledItems());
    System.out.println("testThreeOverlappingIdenticalStacks: Downsampled Stack sizes: " + stackSizes);

    Assert.assertEquals(stackSizes.size(), 3);

    int combinedSize = 0;
    for ( final int stackSize : stackSizes ) {
        Assert.assertTrue(stackSize <= 1000);
        combinedSize += stackSize;
    }
    Assert.assertTrue(combinedSize <= 1000);
}
/**
 * Three stacks of 1500 identical 100-base reads starting at positions 1, 201 and 301,
 * so that no stack overlaps another (schematic, not to scale):
 *
 * -------
 * -------
 *               -------
 *               -------
 *                      -------
 *                      -------
 *
 * The test expects downsampled items to become available once the third stack has been
 * submitted, and each stack to hold exactly 1000 reads after end of input.
 */
@Test
public void testThreeNonOverlappingIdenticalStacks() {
    final SAMFileHeader samHeader = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
    final PositionalDownsampler<SAMRecord> positionalDownsampler = new PositionalDownsampler<SAMRecord>(1000);

    // After each of the first two stacks nothing is expected to be available yet.
    for ( final int stackStart : new int[] { 1, 201 } ) {
        positionalDownsampler.submit(createStackOfIdenticalReads(1500, samHeader, "foo", 0, stackStart, 100));
        Assert.assertFalse(positionalDownsampler.hasDownsampledItems());
        Assert.assertTrue(positionalDownsampler.hasPendingItems());
    }

    // After the third stack both downsampled and pending items are expected.
    positionalDownsampler.submit(createStackOfIdenticalReads(1500, samHeader, "foo", 0, 301, 100));
    Assert.assertTrue(positionalDownsampler.hasDownsampledItems());
    Assert.assertTrue(positionalDownsampler.hasPendingItems());

    positionalDownsampler.signalEndOfInput();
    Assert.assertTrue(positionalDownsampler.hasDownsampledItems());
    Assert.assertFalse(positionalDownsampler.hasPendingItems());

    final List<Integer> stackSizes = getDownsampledStackSizesAndVerifySortedness(positionalDownsampler.consumeDownsampledItems());
    System.out.println("testThreeNonOverlappingIdenticalStacks: Downsampled Stack sizes: " + stackSizes);

    Assert.assertEquals(stackSizes.size(), 3);
    for ( final int stackSize : stackSizes ) {
        Assert.assertTrue(stackSize == 1000);
    }
}
/**
 * A stack of short (25-base) reads at position 1, followed by two stacks of 100-base
 * reads at positions 20 and 50 (schematic, not to scale):
 *
 * ---
 * ---
 *   -------
 *   -------
 *     -------
 *     -------
 *
 * After end of input each stack must hold at most 1000 reads, and so must the first
 * and second stacks together and the second and third stacks together.
 */
@Test
public void testThreeStacksWithShortStackAtBeginning() {
    final SAMFileHeader samHeader = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
    final PositionalDownsampler<SAMRecord> positionalDownsampler = new PositionalDownsampler<SAMRecord>(1000);

    // Each entry is { alignment start, read length } for one stack of 1500 reads.
    final int[][] stackLayouts = { { 1, 25 }, { 20, 100 }, { 50, 100 } };
    for ( final int[] layout : stackLayouts ) {
        positionalDownsampler.submit(createStackOfIdenticalReads(1500, samHeader, "foo", 0, layout[0], layout[1]));
        Assert.assertFalse(positionalDownsampler.hasDownsampledItems());
        Assert.assertTrue(positionalDownsampler.hasPendingItems());
    }

    positionalDownsampler.signalEndOfInput();
    Assert.assertTrue(positionalDownsampler.hasDownsampledItems());
    Assert.assertFalse(positionalDownsampler.hasPendingItems());

    final List<Integer> stackSizes = getDownsampledStackSizesAndVerifySortedness(positionalDownsampler.consumeDownsampledItems());
    System.out.println("testThreeStacksWithShortStackAtBeginning: Downsampled Stack sizes: " + stackSizes);

    Assert.assertEquals(stackSizes.size(), 3);
    for ( final int stackSize : stackSizes ) {
        Assert.assertTrue(stackSize <= 1000);
    }

    final int firstSize = stackSizes.get(0);
    final int secondSize = stackSizes.get(1);
    final int thirdSize = stackSizes.get(2);
    Assert.assertTrue(firstSize + secondSize <= 1000);
    Assert.assertTrue(secondSize + thirdSize <= 1000);
}
/**
 * A stack of 100-base reads at position 1, a stack of short (25-base) reads at
 * position 25, and a stack of 100-base reads at position 75 (schematic, not to scale):
 *
 * -------
 * -------
 *   ---
 *   ---
 *      -------
 *      -------
 *
 * After end of input each stack must hold at most 1000 reads, and so must the first
 * and second stacks together and the first and third stacks together.
 */
@Test
public void testThreeStacksWithShortStackInMiddle() {
    final SAMFileHeader samHeader = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
    final PositionalDownsampler<SAMRecord> positionalDownsampler = new PositionalDownsampler<SAMRecord>(1000);

    // Each entry is { alignment start, read length } for one stack of 1500 reads.
    final int[][] stackLayouts = { { 1, 100 }, { 25, 25 }, { 75, 100 } };
    for ( final int[] layout : stackLayouts ) {
        positionalDownsampler.submit(createStackOfIdenticalReads(1500, samHeader, "foo", 0, layout[0], layout[1]));
        Assert.assertFalse(positionalDownsampler.hasDownsampledItems());
        Assert.assertTrue(positionalDownsampler.hasPendingItems());
    }

    positionalDownsampler.signalEndOfInput();
    Assert.assertTrue(positionalDownsampler.hasDownsampledItems());
    Assert.assertFalse(positionalDownsampler.hasPendingItems());

    final List<Integer> stackSizes = getDownsampledStackSizesAndVerifySortedness(positionalDownsampler.consumeDownsampledItems());
    System.out.println("testThreeStacksWithShortStackInMiddle: Downsampled Stack sizes: " + stackSizes);

    Assert.assertEquals(stackSizes.size(), 3);
    for ( final int stackSize : stackSizes ) {
        Assert.assertTrue(stackSize <= 1000);
    }

    final int firstSize = stackSizes.get(0);
    final int secondSize = stackSizes.get(1);
    final int thirdSize = stackSizes.get(2);
    Assert.assertTrue(firstSize + secondSize <= 1000);
    Assert.assertTrue(firstSize + thirdSize <= 1000);
}
/**
 * Two stacks of 100-base reads at positions 1 and 50, followed by a stack of short
 * (25-base) reads at position 135 (schematic, not to scale):
 *
 * -------
 * -------
 *    -------
 *    -------
 *          ---
 *          ---
 *
 * After end of input each stack must hold at most 1000 reads, and so must the first
 * and second stacks together and the second and third stacks together.
 */
@Test
public void testThreeStacksWithShortStackAtEnd() {
    final SAMFileHeader samHeader = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
    final PositionalDownsampler<SAMRecord> positionalDownsampler = new PositionalDownsampler<SAMRecord>(1000);

    // Each entry is { alignment start, read length } for one stack of 1500 reads.
    final int[][] stackLayouts = { { 1, 100 }, { 50, 100 }, { 135, 25 } };
    for ( final int[] layout : stackLayouts ) {
        positionalDownsampler.submit(createStackOfIdenticalReads(1500, samHeader, "foo", 0, layout[0], layout[1]));
        Assert.assertFalse(positionalDownsampler.hasDownsampledItems());
        Assert.assertTrue(positionalDownsampler.hasPendingItems());
    }

    positionalDownsampler.signalEndOfInput();
    Assert.assertTrue(positionalDownsampler.hasDownsampledItems());
    Assert.assertFalse(positionalDownsampler.hasPendingItems());

    final List<Integer> stackSizes = getDownsampledStackSizesAndVerifySortedness(positionalDownsampler.consumeDownsampledItems());
    System.out.println("testThreeStacksWithShortStackAtEnd: Downsampled Stack sizes: " + stackSizes);

    Assert.assertEquals(stackSizes.size(), 3);
    for ( final int stackSize : stackSizes ) {
        Assert.assertTrue(stackSize <= 1000);
    }

    final int firstSize = stackSizes.get(0);
    final int secondSize = stackSizes.get(1);
    final int thirdSize = stackSizes.get(2);
    Assert.assertTrue(firstSize + secondSize <= 1000);
    Assert.assertTrue(secondSize + thirdSize <= 1000);
}
/**
 * Two stacks built by createStackOfVaryingReads at positions 1 and 75, followed by a
 * stack of identical 100-base reads at position 150 (schematic, not to scale):
 *
 * -------
 * ----
 *    -------
 *    ----
 *       -------
 *       -------
 *
 * NOTE(review): the last two arguments to createStackOfVaryingReads (100 and 50) are
 * presumably the two read lengths mixed within a stack - confirm against that helper.
 *
 * After end of input each of the three stacks must hold at most 1000 reads.
 */
@Test
public void testThreePartiallyOverlappingStacks() {
    final SAMFileHeader samHeader = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
    final PositionalDownsampler<SAMRecord> positionalDownsampler = new PositionalDownsampler<SAMRecord>(1000);

    for ( final int stackStart : new int[] { 1, 75 } ) {
        positionalDownsampler.submit(createStackOfVaryingReads(2000, samHeader, "foo", 0, stackStart, 100, 50));
        Assert.assertFalse(positionalDownsampler.hasDownsampledItems());
        Assert.assertTrue(positionalDownsampler.hasPendingItems());
    }

    positionalDownsampler.submit(createStackOfIdenticalReads(2000, samHeader, "foo", 0, 150, 100));
    Assert.assertFalse(positionalDownsampler.hasDownsampledItems());
    Assert.assertTrue(positionalDownsampler.hasPendingItems());

    positionalDownsampler.signalEndOfInput();
    Assert.assertTrue(positionalDownsampler.hasDownsampledItems());
    Assert.assertFalse(positionalDownsampler.hasPendingItems());

    final List<Integer> stackSizes = getDownsampledStackSizesAndVerifySortedness(positionalDownsampler.consumeDownsampledItems());
    System.out.println("testThreePartiallyOverlappingStacks: Downsampled Stack sizes: " + stackSizes);

    Assert.assertEquals(stackSizes.size(), 3);
    for ( final int stackSize : stackSizes ) {
        Assert.assertTrue(stackSize <= 1000);
    }

    // TODO: need to examine per-base coverage here
}
/**
 * Three overlapping stacks of 300 reads each, submitted to a downsampler whose target is 1000.
 * The test expects every stack to come back at its original size of 300.
 */
@Test
public void testNoDownsamplingRequired() {
    final SAMFileHeader samHeader = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
    final PositionalDownsampler<SAMRecord> positionalDownsampler = new PositionalDownsampler<SAMRecord>(1000);

    final int[] stackStarts = { 1, 25, 50 };
    for ( final int stackStart : stackStarts ) {
        positionalDownsampler.submit(createStackOfIdenticalReads(300, samHeader, "foo", 0, stackStart, 100));

        // While input is still arriving, everything submitted so far is expected to remain pending
        Assert.assertFalse(positionalDownsampler.hasDownsampledItems());
        Assert.assertTrue(positionalDownsampler.hasPendingItems());
    }

    positionalDownsampler.signalEndOfInput();
    Assert.assertTrue(positionalDownsampler.hasDownsampledItems());
    Assert.assertFalse(positionalDownsampler.hasPendingItems());

    final List<Integer> sizes = getDownsampledStackSizesAndVerifySortedness(positionalDownsampler.consumeDownsampledItems());
    System.out.println("testNoDownsamplingRequired: Downsampled Stack sizes: " + sizes);

    Assert.assertEquals(sizes.size(), 3);
    for ( int stack = 0; stack < 3; stack++ ) {
        Assert.assertTrue(sizes.get(stack).intValue() == 300);
    }
}
/**
 * Checks that the downsampler can be parameterized with GATKSAMRecord: ten reads of differing
 * lengths that share one start position are submitted, and all ten are expected back.
 */
@Test
public void testGATKSAMRecordSupport() {
    final SAMFileHeader samHeader = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
    final PositionalDownsampler<GATKSAMRecord> positionalDownsampler = new PositionalDownsampler<GATKSAMRecord>(1000);

    // Read lengths run 10, 30, 50, ..., 190
    final List<GATKSAMRecord> inputReads = new ArrayList<GATKSAMRecord>();
    for ( int readLength = 10; readLength < 210; readLength += 20 ) {
        inputReads.add(ArtificialSAMUtils.createArtificialRead(samHeader, "foo", 0, 10, readLength));
    }

    positionalDownsampler.submit(inputReads);
    positionalDownsampler.signalEndOfInput();

    final List<GATKSAMRecord> survivors = positionalDownsampler.consumeDownsampledItems();
    final boolean allReadsSurvived = survivors.size() == 10;
    Assert.assertTrue(allReadsSurvived);
}
/**
 * Builds a stack of artificial reads that all share the same name, reference index, alignment start
 * and length.
 *
 * @param stackSize number of reads to create
 * @param header header handed to the artificial read factory
 * @param name read name given to every read
 * @param refIndex reference index given to every read
 * @param alignmentStart alignment start given to every read
 * @param length length given to every read
 * @return a new list holding stackSize reads
 */
private ArrayList<SAMRecord> createStackOfIdenticalReads( int stackSize, SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) {
    final ArrayList<SAMRecord> identicalReads = new ArrayList<SAMRecord>(stackSize);

    for ( int remaining = stackSize; remaining > 0; remaining-- ) {
        final SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, name, refIndex, alignmentStart, length);
        identicalReads.add(read);
    }

    return identicalReads;
}
/**
 * Builds a stack of reads sharing one alignment start in which the first half of the reads have one
 * length and the second half another. Each half holds stackSize / 2 reads (integer division), so an
 * odd stackSize yields one read fewer than requested.
 *
 * @param stackSize nominal total number of reads
 * @param header header handed to the artificial read factory
 * @param name read name given to every read
 * @param refIndex reference index given to every read
 * @param alignmentStart alignment start given to every read
 * @param firstLength read length used for the first half of the stack
 * @param secondLength read length used for the second half of the stack
 * @return a list holding the first-length reads followed by the second-length reads
 */
private ArrayList<SAMRecord> createStackOfVaryingReads( int stackSize, SAMFileHeader header, String name, int refIndex, int alignmentStart, int firstLength, int secondLength ) {
    final int readsPerHalf = stackSize / 2;

    final ArrayList<SAMRecord> combined = createStackOfIdenticalReads(readsPerHalf, header, name, refIndex, alignmentStart, firstLength);
    final ArrayList<SAMRecord> secondHalf = createStackOfIdenticalReads(readsPerHalf, header, name, refIndex, alignmentStart, secondLength);
    combined.addAll(secondHalf);

    return combined;
}
/**
 * Walks the downsampled reads in order, checks that they are sorted by (reference index, alignment
 * start), and returns the size of each run of reads that share the same reference index and
 * alignment start.
 *
 * A read on an earlier reference index than its predecessor is reported as out of order; it is not
 * silently treated as the beginning of a new stack.
 *
 * @param downsampledReads reads produced by the downsampler; must be non-empty
 * @return the stack sizes, in the order the stacks were encountered
 */
private List<Integer> getDownsampledStackSizesAndVerifySortedness( List<SAMRecord> downsampledReads ) {
    List<Integer> stackSizes = new ArrayList<Integer>();
    Iterator<SAMRecord> iter = downsampledReads.iterator();
    Assert.assertTrue(iter.hasNext());

    SAMRecord previousRead = iter.next();
    int currentStackSize = 1;

    while ( iter.hasNext() ) {
        SAMRecord currentRead = iter.next();

        final int previousContig = previousRead.getReferenceIndex();
        final int currentContig = currentRead.getReferenceIndex();
        final int previousStart = previousRead.getAlignmentStart();
        final int currentStart = currentRead.getAlignmentStart();

        // Going back to an earlier contig, or to an earlier start on the same contig, breaks the sort order
        final boolean outOfOrder = currentContig < previousContig ||
                                   (currentContig == previousContig && currentStart < previousStart);

        if ( outOfOrder ) {
            Assert.fail(String.format("Reads are out of order: %s %s", previousRead, currentRead));
        }
        else if ( currentContig != previousContig || currentStart > previousStart ) {
            // A later contig or a later start position begins a new stack
            stackSizes.add(currentStackSize);
            currentStackSize = 1;
        }
        else {
            currentStackSize++;
        }

        previousRead = currentRead;
    }

    // The final stack is still open when the reads run out
    stackSizes.add(currentStackSize);
    return stackSizes;
}
}

View File

@ -0,0 +1,126 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStream;
import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStreamAnalyzer;
/**
 * Analyzer for an artificial single-sample read stream that has been through positional downsampling.
 * It verifies that stack sizes were limited to the target coverage and that the stream was not
 * otherwise altered in unexpected ways.
 *
 * @author David Roazen
 */
public class PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer extends ArtificialSingleSampleReadStreamAnalyzer {
    private int targetCoverage;

    /**
     * @param originalStream the stream as it was configured before downsampling
     * @param targetCoverage the coverage target used for downsampling; serves as the upper bound
     *                       on reads per stack during validation
     */
    public PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer( ArtificialSingleSampleReadStream originalStream, int targetCoverage ) {
        super(originalStream);
        this.targetCoverage = targetCoverage;
    }

    /**
     * Validates the analyzed stream against the original stream's configuration, using bounds on
     * reads per stack that account for positional downsampling. Properties that downsampling should
     * not touch (sample, contig and stack counts, stack spacing, read lengths, unmapped read count)
     * are checked against the original configuration directly.
     *
     * Throws a ReviewedStingException describing the first violated expectation.
     */
    @Override
    public void validate() {
        // True when the original stream was configured to produce no mapped reads at all
        final boolean noMappedReadsExpected = originalStream.getNumContigs() == 0 || originalStream.getNumStacksPerContig() == 0;

        if ( noMappedReadsExpected && originalStream.getNumUnmappedReads() == 0 ) {
            if ( totalReads != 0 ) {
                throw new ReviewedStingException("got reads from the stream, but the stream was configured to have 0 reads");
            }
            // no further validation needed for the 0-reads case
            return;
        }

        if ( totalReads == 0 ) {
            throw new ReviewedStingException("got no reads from the stream, but the stream was configured to have > 0 reads");
        }

        if ( ! allSamplesMatch ) {
            throw new ReviewedStingException("some reads had the wrong sample");
        }

        if ( numContigs != originalStream.getNumContigs() ) {
            throw new ReviewedStingException("number of contigs not correct");
        }

        if ( stacksPerContig.size() != originalStream.getNumContigs() ) {
            final String message = String.format("bug in analyzer code: calculated sizes for %d contigs even though there were only %d contigs",
                                                 stacksPerContig.size(), originalStream.getNumContigs());
            throw new ReviewedStingException(message);
        }

        for ( int stacksOnContig : stacksPerContig ) {
            if ( stacksOnContig != originalStream.getNumStacksPerContig() ) {
                throw new ReviewedStingException("contig had incorrect number of stacks");
            }
        }

        if ( originalStream.getNumStacksPerContig() > 0 ) {
            // Positional downsampling caps every stack at the target coverage, and a stack that was
            // already at or below the target must not have shrunk.
            int fewestReadsAllowed = Math.min(targetCoverage, originalStream.getMinReadsPerStack());
            int mostReadsAllowed = targetCoverage;

            if ( minReadsPerStack < fewestReadsAllowed ) {
                throw new ReviewedStingException("stack had fewer than the minimum number of reads after downsampling");
            }
            if ( maxReadsPerStack > mostReadsAllowed ) {
                throw new ReviewedStingException("stack had more than the maximum number of reads after downsampling");
            }
        }
        else if ( minReadsPerStack != null || maxReadsPerStack != null ) {
            throw new ReviewedStingException("bug in analyzer code: reads per stack was calculated even though 0 stacks per contig was specified");
        }

        if ( originalStream.getNumStacksPerContig() > 1 ) {
            // The spacing between stacks is expected to stay within the originally configured range
            if ( minDistanceBetweenStacks < originalStream.getMinDistanceBetweenStacks() ) {
                throw new ReviewedStingException("stacks were separated by less than the minimum distance");
            }
            if ( maxDistanceBetweenStacks > originalStream.getMaxDistanceBetweenStacks() ) {
                throw new ReviewedStingException("stacks were separated by more than the maximum distance");
            }
        }
        else if ( minDistanceBetweenStacks != null || maxDistanceBetweenStacks != null ) {
            throw new ReviewedStingException("bug in analyzer code: distance between stacks was calculated even though numStacksPerContig was <= 1");
        }

        if ( minReadLength < originalStream.getMinReadLength() ) {
            throw new ReviewedStingException("read was shorter than the minimum allowed length");
        }

        if ( maxReadLength > originalStream.getMaxReadLength() ) {
            throw new ReviewedStingException("read was longer than the maximum allowed length");
        }

        if ( numUnmappedReads != originalStream.getNumUnmappedReads() ) {
            final String message = String.format("wrong number of unmapped reads: requested %d but saw %d",
                                                 originalStream.getNumUnmappedReads(), numUnmappedReads);
            throw new ReviewedStingException(message);
        }

        if ( noMappedReadsExpected && numUnmappedReads != totalReads ) {
            throw new ReviewedStingException("stream should have consisted only of unmapped reads, but saw some mapped reads");
        }
    }
}

View File

@ -0,0 +1,129 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import org.testng.Assert;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
/**
 * Unit tests for ReservoirDownsampler, driven by a grid of reservoir sizes and input read counts.
 */
public class ReservoirDownsamplerUnitTest extends BaseTest {

    /**
     * One test case: a reservoir size, a number of input reads, and the number of reads expected
     * to be kept and discarded for that combination.
     */
    private static class ReservoirDownsamplerTest extends TestDataProvider {
        int reservoirSize;
        int totalReads;
        int expectedNumReadsAfterDownsampling;
        int expectedNumDiscardedItems;

        public ReservoirDownsamplerTest( int reservoirSize, int totalReads ) {
            super(ReservoirDownsamplerTest.class);

            this.reservoirSize = reservoirSize;
            this.totalReads = totalReads;

            // The reservoir keeps at most reservoirSize reads; anything beyond that is discarded
            this.expectedNumReadsAfterDownsampling = Math.min(reservoirSize, totalReads);
            this.expectedNumDiscardedItems = Math.max(0, totalReads - reservoirSize);

            setName(String.format("%s: reservoirSize=%d totalReads=%d expectedNumReadsAfterDownsampling=%d expectedNumDiscardedItems=%d",
                    getClass().getSimpleName(), reservoirSize, totalReads, expectedNumReadsAfterDownsampling, expectedNumDiscardedItems));
        }

        /**
         * @return totalReads identical artificial reads, freshly created on each call
         */
        public Collection<SAMRecord> createReads() {
            final SAMFileHeader samHeader = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
            return new ArrayList<SAMRecord>(
                    ArtificialSAMUtils.createStackOfIdenticalArtificialReads(totalReads, samHeader, "foo", 0, 1, 100));
        }
    }

    @DataProvider(name = "ReservoirDownsamplerTestDataProvider")
    public Object[][] createReservoirDownsamplerTestData() {
        final int[] reservoirSizes = { 1, 10, 100, 1000, 10000 };
        // The empty-input case comes first for each reservoir size, followed by the powers of ten
        final int[] readCounts = { 0, 1, 10, 100, 1000, 10000 };

        for ( final int reservoirSize : reservoirSizes ) {
            for ( final int readCount : readCounts ) {
                new ReservoirDownsamplerTest(reservoirSize, readCount);
            }
        }

        return ReservoirDownsamplerTest.getTests(ReservoirDownsamplerTest.class);
    }

    @Test(dataProvider = "ReservoirDownsamplerTestDataProvider")
    public void testReservoirDownsampler( ReservoirDownsamplerTest test ) {
        logger.warn("Running test: " + test);

        GenomeAnalysisEngine.resetRandomGenerator();

        final ReadsDownsampler<SAMRecord> reservoir = new ReservoirDownsampler<SAMRecord>(test.reservoirSize);
        reservoir.submit(test.createReads());

        // The expected state is the same immediately after submission and after end of input
        assertStateMatchesInput(reservoir, test.totalReads);
        reservoir.signalEndOfInput();
        assertStateMatchesInput(reservoir, test.totalReads);

        final List<SAMRecord> keptReads = reservoir.consumeFinalizedItems();
        assertHoldsNothing(reservoir);

        Assert.assertEquals(keptReads.size(), test.expectedNumReadsAfterDownsampling);
        Assert.assertEquals(reservoir.getNumberOfDiscardedItems(), test.expectedNumDiscardedItems);
        Assert.assertEquals(test.totalReads - keptReads.size(), test.expectedNumDiscardedItems);

        reservoir.reset();
        Assert.assertEquals(reservoir.getNumberOfDiscardedItems(), 0);
    }

    /**
     * Asserts that the downsampler holds finalized (and no pending) items when reads were submitted,
     * and holds nothing at all when no reads were submitted.
     */
    private void assertStateMatchesInput( ReadsDownsampler<SAMRecord> reservoir, int numSubmittedReads ) {
        if ( numSubmittedReads <= 0 ) {
            assertHoldsNothing(reservoir);
            return;
        }

        Assert.assertTrue(reservoir.hasFinalizedItems());
        Assert.assertNotNull(reservoir.peekFinalized());
        Assert.assertFalse(reservoir.hasPendingItems());
        Assert.assertNull(reservoir.peekPending());
    }

    /**
     * Asserts that the downsampler has neither finalized nor pending items.
     */
    private void assertHoldsNothing( ReadsDownsampler<SAMRecord> reservoir ) {
        Assert.assertFalse(reservoir.hasFinalizedItems() || reservoir.hasPendingItems());
        Assert.assertNull(reservoir.peekFinalized());
        Assert.assertNull(reservoir.peekPending());
    }
}

View File

@ -0,0 +1,330 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import org.testng.Assert;
import java.util.*;
/**
 * Unit tests for SimplePositionalDownsampler: a data-driven test over randomly sized read stacks on
 * one or two contigs, plus focused tests for signalNoMoreReadsBefore(), unmapped reads, and
 * GATKSAMRecord support.
 */
public class SimplePositionalDownsamplerUnitTest extends BaseTest {

    /**
     * One data-driven test case: a target coverage, the sizes of the stacks to submit, and whether
     * the stacks are split across two contigs.
     */
    private static class SimplePositionalDownsamplerTest extends TestDataProvider {
        int targetCoverage;
        int numStacks;
        List<Integer> stackSizes;
        List<Integer> expectedStackSizes;
        boolean multipleContigs;
        int totalInitialReads;

        public SimplePositionalDownsamplerTest( int targetCoverage, List<Integer> stackSizes, boolean multipleContigs ) {
            super(SimplePositionalDownsamplerTest.class);

            this.targetCoverage = targetCoverage;
            this.numStacks = stackSizes.size();
            this.stackSizes = stackSizes;
            this.multipleContigs = multipleContigs;

            calculateExpectedDownsampledStackSizes();

            totalInitialReads = 0;
            for ( Integer stackSize : stackSizes ) {
                totalInitialReads += stackSize;
            }

            setName(String.format("%s: targetCoverage=%d numStacks=%d stackSizes=%s expectedSizes=%s multipleContigs=%b",
                    getClass().getSimpleName(), targetCoverage, numStacks, stackSizes, expectedStackSizes, multipleContigs));
        }

        /**
         * Creates the input reads: one stack of identical reads per entry in stackSizes, with
         * consecutive stacks starting 10 bases apart. When multipleContigs is set, stacks from index
         * numStacks / 2 onward are placed on the second contig (reference index 1).
         */
        public Collection<SAMRecord> createReads() {
            Collection<SAMRecord> reads = new ArrayList<SAMRecord>();
            SAMFileHeader header = multipleContigs ?
                                   ArtificialSAMUtils.createArtificialSamHeader(2, 1, 1000000) :
                                   ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);

            int refIndex = 0;
            int alignmentStart = 1;
            int readLength = 100;

            for ( int i = 0; i < numStacks; i++ ) {
                // Switch to the second contig exactly once, at the halfway point
                if ( multipleContigs && refIndex == 0 && i >= numStacks / 2 ) {
                    refIndex++;
                }

                reads.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(stackSizes.get(i), header, "foo",
                                                                                      refIndex, alignmentStart, readLength));

                alignmentStart += 10;
            }

            return reads;
        }

        /**
         * Each stack is expected to come back with min(original stack size, targetCoverage) reads.
         */
        private void calculateExpectedDownsampledStackSizes() {
            expectedStackSizes = new ArrayList<Integer>(numStacks);

            for ( Integer stackSize : stackSizes ) {
                int expectedSize = targetCoverage >= stackSize ? stackSize : targetCoverage;
                expectedStackSizes.add(expectedSize);
            }
        }
    }

    @DataProvider(name = "SimplePositionalDownsamplerTestDataProvider")
    public Object[][] createSimplePositionalDownsamplerTestData() {
        // Reset the shared generator before drawing the random stack sizes below
        GenomeAnalysisEngine.resetRandomGenerator();

        for ( int targetCoverage = 1; targetCoverage <= 10000; targetCoverage *= 10 ) {
            for ( int contigs = 1; contigs <= 2; contigs++ ) {
                for ( int numStacks = 0; numStacks <= 10; numStacks++ ) {
                    List<Integer> stackSizes = new ArrayList<Integer>(numStacks);
                    for ( int stack = 1; stack <= numStacks; stack++ ) {
                        // Sizes range over [1, 2 * targetCoverage], so stacks fall on both sides of the target
                        stackSizes.add(GenomeAnalysisEngine.getRandomGenerator().nextInt(targetCoverage * 2) + 1);
                    }
                    new SimplePositionalDownsamplerTest(targetCoverage, stackSizes, contigs > 1);
                }
            }
        }

        return SimplePositionalDownsamplerTest.getTests(SimplePositionalDownsamplerTest.class);
    }

    @Test( dataProvider = "SimplePositionalDownsamplerTestDataProvider" )
    public void testSimplePostionalDownsampler( SimplePositionalDownsamplerTest test ) {
        logger.warn("Running test: " + test);

        GenomeAnalysisEngine.resetRandomGenerator();

        ReadsDownsampler<SAMRecord> downsampler = new SimplePositionalDownsampler<SAMRecord>(test.targetCoverage);

        downsampler.submit(test.createReads());

        // Before end of input: the test expects only the most recently submitted stack to be pending,
        // with every earlier stack already finalized
        if ( test.numStacks > 1 ) {
            Assert.assertTrue(downsampler.hasFinalizedItems());
            Assert.assertTrue(downsampler.peekFinalized() != null);
            Assert.assertTrue(downsampler.hasPendingItems());
            Assert.assertTrue(downsampler.peekPending() != null);
        }
        else if ( test.numStacks == 1 ) {
            Assert.assertFalse(downsampler.hasFinalizedItems());
            Assert.assertTrue(downsampler.peekFinalized() == null);
            Assert.assertTrue(downsampler.hasPendingItems());
            Assert.assertTrue(downsampler.peekPending() != null);
        }
        else {
            Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems());
            Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null);
        }

        downsampler.signalEndOfInput();

        // After end of input nothing may remain pending
        if ( test.numStacks > 0 ) {
            Assert.assertTrue(downsampler.hasFinalizedItems());
            Assert.assertTrue(downsampler.peekFinalized() != null);
            Assert.assertFalse(downsampler.hasPendingItems());
            Assert.assertTrue(downsampler.peekPending() == null);
        }
        else {
            Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems());
            Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null);
        }

        List<SAMRecord> downsampledReads = downsampler.consumeFinalizedItems();
        Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems());
        Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null);

        if ( test.numStacks == 0 ) {
            Assert.assertTrue(downsampledReads.isEmpty());
        }
        else {
            List<Integer> downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampledReads);

            Assert.assertEquals(downsampledStackSizes.size(), test.numStacks);
            Assert.assertEquals(downsampledStackSizes, test.expectedStackSizes);

            // The downsampler's own discard count must agree with what actually went missing
            int numReadsActuallyEliminated = test.totalInitialReads - downsampledReads.size();
            int numReadsReportedEliminated = downsampler.getNumberOfDiscardedItems();
            Assert.assertEquals(numReadsActuallyEliminated, numReadsReportedEliminated);
        }

        downsampler.reset();
        Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0);
    }

    /**
     * Walks the downsampled reads in order, checks that they are sorted by (reference index,
     * alignment start), and returns the size of each run of reads that share the same reference
     * index and alignment start.
     *
     * A read on an earlier reference index than its predecessor always fails the test, even when its
     * alignment start happens to be larger than the predecessor's.
     *
     * @param downsampledReads reads produced by the downsampler; may be empty
     * @return the stack sizes, in the order the stacks were encountered (empty for empty input)
     */
    private List<Integer> getDownsampledStackSizesAndVerifySortedness( List<SAMRecord> downsampledReads ) {
        List<Integer> stackSizes = new ArrayList<Integer>();

        if ( downsampledReads.isEmpty() ) {
            return stackSizes;
        }

        Iterator<SAMRecord> iter = downsampledReads.iterator();
        Assert.assertTrue(iter.hasNext());

        SAMRecord previousRead = iter.next();
        int currentStackSize = 1;

        while ( iter.hasNext() ) {
            SAMRecord currentRead = iter.next();

            final int previousContig = previousRead.getReferenceIndex();
            final int currentContig = currentRead.getReferenceIndex();
            final int previousStart = previousRead.getAlignmentStart();
            final int currentStart = currentRead.getAlignmentStart();

            // Going back to an earlier contig, or to an earlier start on the same contig, breaks the sort order
            final boolean outOfOrder = currentContig < previousContig ||
                                       (currentContig == previousContig && currentStart < previousStart);

            if ( outOfOrder ) {
                Assert.fail(String.format("Reads are out of order: %s %s", previousRead, currentRead));
            }
            else if ( currentContig > previousContig || currentStart > previousStart ) {
                // A later contig or a later start position begins a new stack
                stackSizes.add(currentStackSize);
                currentStackSize = 1;
            }
            else {
                currentStackSize++;
            }

            previousRead = currentRead;
        }

        // The final stack is still open when the reads run out
        stackSizes.add(currentStackSize);
        return stackSizes;
    }

    /**
     * A single pending stack must become finalized once the downsampler is told that no more reads
     * will arrive before a later position.
     */
    @Test
    public void testSimplePositionalDownsamplerSignalNoMoreReadsBefore() {
        ReadsDownsampler<SAMRecord> downsampler = new SimplePositionalDownsampler<SAMRecord>(1000);

        SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);

        Collection<SAMRecord> readStack = new ArrayList<SAMRecord>();
        readStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(50, header, "foo", 0, 1, 100));
        downsampler.submit(readStack);

        Assert.assertFalse(downsampler.hasFinalizedItems());
        Assert.assertTrue(downsampler.peekFinalized() == null);
        Assert.assertTrue(downsampler.hasPendingItems());
        Assert.assertTrue(downsampler.peekPending() != null);

        // This read starts one base after the submitted stack
        SAMRecord laterRead = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 2, 100);
        downsampler.signalNoMoreReadsBefore(laterRead);

        Assert.assertTrue(downsampler.hasFinalizedItems());
        Assert.assertTrue(downsampler.peekFinalized() != null);
        Assert.assertFalse(downsampler.hasPendingItems());
        Assert.assertTrue(downsampler.peekPending() == null);

        // 50 reads is below the target of 1000, so nothing should have been discarded
        List<SAMRecord> downsampledReads = downsampler.consumeFinalizedItems();

        Assert.assertEquals(downsampledReads.size(), readStack.size());
    }

    /**
     * A stream consisting only of unmapped reads must pass through without losing any reads.
     */
    @Test
    public void testBasicUnmappedReadsSupport() {
        ReadsDownsampler<SAMRecord> downsampler = new SimplePositionalDownsampler<SAMRecord>(100);

        SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);

        Collection<SAMRecord> readStack = new ArrayList<SAMRecord>();
        readStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(200, header, "foo", SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX,
                                                                                  SAMRecord.NO_ALIGNMENT_START, 100));
        // Sanity check on the fixture itself
        for ( SAMRecord read : readStack ) {
            Assert.assertTrue(read.getReadUnmappedFlag());
        }

        downsampler.submit(readStack);
        downsampler.signalEndOfInput();

        List<SAMRecord> downsampledReads = downsampler.consumeFinalizedItems();

        // Unmapped reads should not get downsampled at all by the SimplePositionalDownsampler
        Assert.assertEquals(downsampledReads.size(), readStack.size());

        for ( SAMRecord read: downsampledReads ) {
            Assert.assertTrue(read.getReadUnmappedFlag());
        }
    }

    /**
     * A mapped stack followed by unmapped reads: the mapped stack is expected to be reduced to the
     * target coverage while the unmapped reads all survive, with mapped reads emitted first.
     */
    @Test
    public void testMixedMappedAndUnmappedReadsSupport() {
        ReadsDownsampler<SAMRecord> downsampler = new SimplePositionalDownsampler<SAMRecord>(100);

        SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);

        Collection<SAMRecord> mappedReadStack = new ArrayList<SAMRecord>();
        mappedReadStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(200, header, "foo", 0, 1, 100));
        for ( SAMRecord read : mappedReadStack ) {
            Assert.assertFalse(read.getReadUnmappedFlag());
        }

        Collection<SAMRecord> unmappedReadStack = new ArrayList<SAMRecord>();
        unmappedReadStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(200, header, "foo", SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX,
                                                                                          SAMRecord.NO_ALIGNMENT_START, 100));
        for ( SAMRecord read : unmappedReadStack ) {
            Assert.assertTrue(read.getReadUnmappedFlag());
        }

        downsampler.submit(mappedReadStack);
        downsampler.submit(unmappedReadStack);
        downsampler.signalEndOfInput();

        List<SAMRecord> downsampledReads = downsampler.consumeFinalizedItems();

        // Unmapped reads should not get downsampled at all by the SimplePositionalDownsampler:
        // 100 surviving mapped reads + 200 unmapped reads = 300, with 100 mapped reads discarded
        Assert.assertEquals(downsampledReads.size(), 300);
        Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 100);

        // The first 100 reads out must be the mapped ones; everything after that must be unmapped
        int count = 1;
        for ( SAMRecord read: downsampledReads ) {
            if ( count <= 100 ) {
                Assert.assertFalse(read.getReadUnmappedFlag());
            }
            else {
                Assert.assertTrue(read.getReadUnmappedFlag());
            }

            count++;
        }
    }

    /**
     * Checks that the downsampler can be parameterized with GATKSAMRecord: ten reads of differing
     * lengths that share one start position are submitted, and all ten are expected back.
     */
    @Test
    public void testGATKSAMRecordSupport() {
        ReadsDownsampler<GATKSAMRecord> downsampler = new SimplePositionalDownsampler<GATKSAMRecord>(1000);

        SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);

        List<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>();
        for ( int i = 0; i < 10; i++ ) {
            reads.add(ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 10, 20 * i + 10));
        }

        downsampler.submit(reads);
        downsampler.signalEndOfInput();
        List<GATKSAMRecord> downsampledReads = downsampler.consumeFinalizedItems();

        Assert.assertEquals(downsampledReads.size(), 10);
    }
}

View File

@ -0,0 +1,546 @@
package org.broadinstitute.sting.gatk.iterators;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.CloseableIterator;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.ReadProperties;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
import org.broadinstitute.sting.gatk.filters.ReadFilter;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.*;
/**
* testing of the experimental version of LocusIteratorByState
*/
public class LocusIteratorByStateExperimentalUnitTest extends BaseTest {
private static SAMFileHeader header;
private LocusIteratorByStateExperimental li;
private GenomeLocParser genomeLocParser;
/**
 * Creates the artificial SAM header shared by the tests in this class, and a GenomeLocParser built
 * from that header's sequence dictionary.
 */
@BeforeClass
public void beforeClass() {
    final SAMFileHeader artificialHeader = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
    header = artificialHeader;
    genomeLocParser = new GenomeLocParser(artificialHeader.getSequenceDictionary());
}
/**
 * Builds a LocusIteratorByStateExperimental over an in-memory list of reads, using this class's
 * GenomeLocParser and the sample list for SAM input without read groups.
 *
 * @param reads the reads to iterate over, in the order they should be presented
 * @param readAttributes the read properties handed to the iterator
 * @return a new iterator positioned before the first read
 */
private final LocusIteratorByStateExperimental makeLTBS(List<SAMRecord> reads, ReadProperties readAttributes) {
    final FakeCloseableIterator<SAMRecord> readSource = new FakeCloseableIterator<SAMRecord>(reads.iterator());
    return new LocusIteratorByStateExperimental(
            readSource,
            readAttributes,
            genomeLocParser,
            LocusIteratorByStateExperimental.sampleListForSAMWithoutReadGroups());
}
/**
 * Convenience overload that creates test ReadProperties with a null downsampling method.
 */
private static ReadProperties createTestReadProperties() {
    final DownsamplingMethod noDownsamplingMethod = null;
    return createTestReadProperties(noDownsamplingMethod);
}
/**
 * Creates minimal ReadProperties for tests: no reader IDs, an empty header, STRICT validation
 * stringency, no read filters and no read transformers, combined with the given downsampling method.
 *
 * @param downsamplingMethod the downsampling method to embed in the properties; may be null
 * @return the newly constructed ReadProperties
 */
private static ReadProperties createTestReadProperties( DownsamplingMethod downsamplingMethod ) {
    final List<SAMReaderID> noReaderIDs = Collections.<SAMReaderID>emptyList();
    final SAMFileHeader emptyHeader = new SAMFileHeader();
    final ValidationExclusion exclusions = new ValidationExclusion();
    final List<ReadFilter> noFilters = Collections.<ReadFilter>emptyList();
    final List<ReadTransformer> noTransformers = Collections.<ReadTransformer>emptyList();

    // NOTE(review): the meaning of the two boolean flags and the trailing (byte) -1 is defined by the
    // ReadProperties constructor, which is not visible here; the values are passed through unchanged.
    return new ReadProperties(
            noReaderIDs,
            emptyHeader,
            false,
            SAMFileReader.ValidationStringency.STRICT,
            downsamplingMethod,
            exclusions,
            noFilters,
            noTransformers,
            false,
            (byte) -1
    );
}
/**
 * Adapts a plain Iterator to the CloseableIterator interface for use in tests. Iteration is
 * delegated to the wrapped iterator, close() does nothing, and remove() is not supported.
 */
private static class FakeCloseableIterator<T> implements CloseableIterator<T> {
    Iterator<T> iterator;

    public FakeCloseableIterator(Iterator<T> wrapped) {
        this.iterator = wrapped;
    }

    @Override
    public boolean hasNext() {
        return this.iterator.hasNext();
    }

    @Override
    public T next() {
        return this.iterator.next();
    }

    @Override
    public void remove() {
        throw new UnsupportedOperationException("Don't remove!");
    }

    @Override
    public void close() {
        // nothing to release
    }
}
/**
 * Four reads covering the same ten reference positions, expressed with different but equivalent
 * CIGAR operators (M, =, X and mixtures of them). Every pileup produced by the iterator must
 * contain all four reads, whichever operators were used.
 */
@Test
public void testXandEQOperators() {
    final byte[] bases1 = new byte[] {'A','A','A','A','A','A','A','A','A','A'};
    final byte[] bases2 = new byte[] {'A','A','A','C','A','A','A','A','A','C'};

    // One base quality per read base. All four reads are 10 bases long, so each gets 10 qualities
    // (r2 and r3 previously received 12, which did not match their 10 bases).
    final byte[] quals = new byte[bases1.length];
    Arrays.fill(quals, (byte) 20);

    // create a test version of the Reads object
    ReadProperties readAttributes = createTestReadProperties();

    // all matches, written with M
    SAMRecord r1 = ArtificialSAMUtils.createArtificialRead(header,"r1",0,1,10);
    r1.setReadBases(bases1);
    r1.setBaseQualities(quals.clone());
    r1.setCigarString("10M");

    // mismatches at read positions 4 and 10, written with = and X only
    SAMRecord r2 = ArtificialSAMUtils.createArtificialRead(header,"r2",0,1,10);
    r2.setReadBases(bases2);
    r2.setBaseQualities(quals.clone());
    r2.setCigarString("3=1X5=1X");

    // same bases, mixing =, X and M
    SAMRecord r3 = ArtificialSAMUtils.createArtificialRead(header,"r3",0,1,10);
    r3.setReadBases(bases2);
    r3.setBaseQualities(quals.clone());
    r3.setCigarString("3=1X5M1X");

    // same bases, written with M only
    SAMRecord r4 = ArtificialSAMUtils.createArtificialRead(header,"r4",0,1,10);
    r4.setReadBases(bases2);
    r4.setBaseQualities(quals.clone());
    r4.setCigarString("10M");

    List<SAMRecord> reads = Arrays.asList(r1, r2, r3, r4);

    // create the iterator by state with the fake reads and fake records
    li = makeLTBS(reads,readAttributes);

    while (li.hasNext()) {
        AlignmentContext context = li.next();
        ReadBackedPileup pileup = context.getBasePileup();
        Assert.assertEquals(pileup.depthOfCoverage(), 4);
    }
}
/**
 * Three overlapping reads, the middle one carrying a 2-base insertion ("CT"). The test requires
 * that at least one pileup element is flagged as sitting immediately before the insertion, and
 * that this element reports the correct event length and inserted bases.
 */
@Test
public void testIndelsInRegularPileup() {
    final byte[] plainBases = new byte[] {'A','A','A','A','A','A','A','A','A','A'};
    final byte[] basesWithInsertion = new byte[] {'A','A','A','A','C','T','A','A','A','A','A','A'};

    // One quality value of 20 per base: 10 for the plain reads, 12 for the read with the insertion
    final byte[] plainQuals = new byte[10];
    Arrays.fill(plainQuals, (byte) 20);
    final byte[] insertionQuals = new byte[12];
    Arrays.fill(insertionQuals, (byte) 20);

    // create a test version of the Reads object
    final ReadProperties properties = createTestReadProperties();

    final SAMRecord readBefore = ArtificialSAMUtils.createArtificialRead(header,"before",0,1,10);
    readBefore.setReadBases(plainBases);
    readBefore.setBaseQualities(plainQuals.clone());
    readBefore.setCigarString("10M");

    final SAMRecord readDuring = ArtificialSAMUtils.createArtificialRead(header,"during",0,2,10);
    readDuring.setReadBases(basesWithInsertion);
    readDuring.setBaseQualities(insertionQuals);
    readDuring.setCigarString("4M2I6M");

    final SAMRecord readAfter = ArtificialSAMUtils.createArtificialRead(header,"after",0,3,10);
    readAfter.setReadBases(plainBases);
    readAfter.setBaseQualities(plainQuals.clone());
    readAfter.setCigarString("10M");

    final List<SAMRecord> orderedReads = Arrays.asList(readBefore, readDuring, readAfter);

    // create the iterator by state with the fake reads and fake records
    li = makeLTBS(orderedReads, properties);

    boolean sawInsertion = false;
    while (li.hasNext()) {
        final AlignmentContext locus = li.next();
        final ReadBackedPileup filteredPileup = locus.getBasePileup().getBaseFilteredPileup(10);

        for (PileupElement element : filteredPileup) {
            if (!element.isBeforeInsertion()) {
                continue;
            }

            sawInsertion = true;
            Assert.assertEquals(element.getEventLength(), 2, "Wrong event length");
            Assert.assertEquals(element.getEventBases(), "CT", "Inserted bases are incorrect");
            // Only the first such element in each pileup is checked
            break;
        }
    }

    Assert.assertTrue(sawInsertion,"Indel in pileup not found");
}
/**
 * Checks that a read made up solely of an insertion (cigar 76I) still shows up in the base
 * pileup when it is the only read given to the iterator.
 */
@Test
public void testWholeIndelReadInIsolation() {
    final int firstLocus = 44367789;
    final int readLength = 76;

    // create a test version of the Reads object
    final ReadProperties readAttributes = createTestReadProperties();

    final SAMRecord indelOnlyRead = ArtificialSAMUtils.createArtificialRead(header, "indelOnly", 0, firstLocus, readLength);
    indelOnlyRead.setReadBases(Utils.dupBytes((byte)'A', readLength));
    indelOnlyRead.setBaseQualities(Utils.dupBytes((byte) '@', readLength));
    indelOnlyRead.setCigarString("76I");

    // create the iterator by state with the fake reads and fake records
    li = makeLTBS(Arrays.asList(indelOnlyRead), readAttributes);

    // Traditionally, reads that end with indels bleed into the pileup at the following locus.
    // Verify that the next pileup contains this read and considers it to be an indel-containing read.
    Assert.assertTrue(li.hasNext(),"Should have found a whole-indel read in the normal base pileup without extended events enabled");

    final AlignmentContext firstContext = li.next();
    Assert.assertEquals(firstContext.getLocation().getStart(), firstLocus, "Base pileup is at incorrect location.");

    final ReadBackedPileup basePileup = firstContext.getBasePileup();
    Assert.assertEquals(basePileup.getReads().size(),1,"Pileup is of incorrect size");
    Assert.assertSame(basePileup.getReads().get(0), indelOnlyRead, "Read in pileup is incorrect");
}
/**
 * Makes sure that a read consisting only of an insertion (example cigar string: 76I) does not
 * disturb the ordering of the pileup.
 *
 * Three reads are supplied: "leading" (1M75I) at the first locus, then "indelOnly" (76I) and
 * "fullMatch" (75I1M) at the second locus. Exactly two alignment contexts are expected, with the
 * reads appearing in that order.
 */
@Test
public void testWholeIndelRead() {
    final int firstLocus = 44367788;
    final int secondLocus = firstLocus + 1;

    final SAMRecord leadingRead = ArtificialSAMUtils.createArtificialRead(header,"leading",0,firstLocus,76);
    leadingRead.setReadBases(Utils.dupBytes((byte)'A',76));
    leadingRead.setBaseQualities(Utils.dupBytes((byte)'@',76));
    leadingRead.setCigarString("1M75I");

    final SAMRecord indelOnlyRead = ArtificialSAMUtils.createArtificialRead(header,"indelOnly",0,secondLocus,76);
    indelOnlyRead.setReadBases(Utils.dupBytes((byte) 'A', 76));
    indelOnlyRead.setBaseQualities(Utils.dupBytes((byte)'@',76));
    indelOnlyRead.setCigarString("76I");

    final SAMRecord fullMatchAfterIndel = ArtificialSAMUtils.createArtificialRead(header,"fullMatch",0,secondLocus,76);
    fullMatchAfterIndel.setReadBases(Utils.dupBytes((byte)'A',76));
    fullMatchAfterIndel.setBaseQualities(Utils.dupBytes((byte)'@',76));
    fullMatchAfterIndel.setCigarString("75I1M");

    final List<SAMRecord> reads = Arrays.asList(leadingRead, indelOnlyRead, fullMatchAfterIndel);

    // create the iterator by state with the fake reads and fake records
    li = makeLTBS(reads, createTestReadProperties());

    // the expected locus and the context counter advance together, once per alignment context
    int numAlignmentContextsFound = 0;
    for (int currentLocus = firstLocus; li.hasNext(); currentLocus++, numAlignmentContextsFound++) {
        final AlignmentContext alignmentContext = li.next();
        Assert.assertEquals(alignmentContext.getLocation().getStart(),currentLocus,"Current locus returned by alignment context is incorrect");

        if (currentLocus == firstLocus) {
            final List<GATKSAMRecord> readsAtLocus = alignmentContext.getBasePileup().getReads();
            Assert.assertEquals(readsAtLocus.size(),1,"Wrong number of reads at locus " + currentLocus);
            Assert.assertSame(readsAtLocus.get(0),leadingRead,"leadingRead absent from pileup at locus " + currentLocus);
        }
        else if (currentLocus == secondLocus) {
            final List<GATKSAMRecord> readsAtLocus = alignmentContext.getBasePileup().getReads();
            Assert.assertEquals(readsAtLocus.size(),2,"Wrong number of reads at locus " + currentLocus);
            Assert.assertSame(readsAtLocus.get(0),indelOnlyRead,"indelOnlyRead absent from pileup at locus " + currentLocus);
            Assert.assertSame(readsAtLocus.get(1),fullMatchAfterIndel,"fullMatchAfterIndel absent from pileup at locus " + currentLocus);
        }
    }

    Assert.assertEquals(numAlignmentContextsFound, 2, "Found incorrect number of alignment contexts");
}
/**
 * Test to make sure that reads supporting only an indel (example cigar string: 76I) are represented properly.
 *
 * Runs the same check for a 1-base and a 10-base insertion-only read: every pileup produced must
 * hold exactly one element, and that element must be flagged as before (not after) an insertion
 * and expose all of the read's bases as its event bases.
 */
@Test
public void testWholeIndelReadRepresentedTest() {
    final int locus = 44367788 + 1;

    assertWholeInsertionReadRepresented("read1", locus, "A");
    assertWholeInsertionReadRepresented("read2", locus, "AAAAAAAAAA");
}

/**
 * Builds a read of all-'A' bases whose cigar is a single insertion spanning the entire read, runs
 * it through the locus iterator on its own and checks how it is represented in each pileup.
 *
 * @param readName      name given to the artificial read
 * @param locus         alignment start of the artificial read
 * @param insertedBases event bases expected from the pileup element; its length is the read length
 */
private void assertWholeInsertionReadRepresented(final String readName, final int locus, final String insertedBases) {
    final int readLength = insertedBases.length();

    final SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, readName, 0, locus, readLength);
    read.setReadBases(Utils.dupBytes((byte) 'A', readLength));
    read.setBaseQualities(Utils.dupBytes((byte) '@', readLength));
    read.setCigarString(readLength + "I");

    final List<SAMRecord> reads = Arrays.asList(read);

    // create the iterator by state with the fake reads and fake records
    li = makeLTBS(reads, createTestReadProperties());

    while (li.hasNext()) {
        final AlignmentContext alignmentContext = li.next();
        final ReadBackedPileup p = alignmentContext.getBasePileup();
        // assertEquals (rather than assertTrue(x == 1)) reports the actual element count on failure
        Assert.assertEquals(p.getNumberOfElements(), 1);
        final PileupElement pe = p.iterator().next();
        Assert.assertTrue(pe.isBeforeInsertion());
        Assert.assertFalse(pe.isAfterInsertion());
        Assert.assertEquals(pe.getEventBases(), insertedBases);
    }
}
////////////////////////////////////////////
// comprehensive LIBS/PileupElement tests //
////////////////////////////////////////////
// Bit flags, one per PileupElement predicate checked by testLIBS. OR them together to describe
// which predicates are expected to be true at a single pileup position.
private static final int IS_BEFORE_DELETED_BASE_FLAG   = 1 << 0;
private static final int IS_BEFORE_DELETION_START_FLAG = 1 << 1;
private static final int IS_AFTER_DELETED_BASE_FLAG    = 1 << 2;
private static final int IS_AFTER_DELETION_END_FLAG    = 1 << 3;
private static final int IS_BEFORE_INSERTION_FLAG      = 1 << 4;
private static final int IS_AFTER_INSERTION_FLAG       = 1 << 5;
private static final int IS_NEXT_TO_SOFTCLIP_FLAG      = 1 << 6;
/**
 * Parameters for one data-provider-driven LIBS test case: the cigar and length of a single read,
 * plus, for each pileup position expected from the iterator, the expected read offset and the
 * expected combination of IS_*_FLAG bits.
 */
private static class LIBSTest {
    // cigar string assigned to the artificial read
    final String cigar;
    // number of bases (and base qualities) given to the artificial read
    final int readLength;
    // expected PileupElement offset at each successive pileup position
    final List<Integer> offsets;
    // expected IS_*_FLAG bit combination at each successive pileup position (parallel to offsets)
    final List<Integer> flags;

    private LIBSTest(final String cigar, final int readLength, final List<Integer> offsets, final List<Integer> flags) {
        this.cigar = cigar;
        this.readLength = readLength;
        this.offsets = offsets;
        this.flags = flags;
    }

    /**
     * Describes the test case so that it can be told apart from the others in test reports and logs
     * instead of showing up as an anonymous object identity.
     */
    @Override
    public String toString() {
        return String.format("LIBSTest: cigar=%s readLength=%d offsets=%s flags=%s", cigar, readLength, offsets, flags);
    }
}
/**
 * Supplies the cases for the "LIBSTest" data provider, one LIBSTest instance per row.
 *
 * @return one single-element row per LIBSTest case, in the order listed below
 */
@DataProvider(name = "LIBSTest")
public Object[][] createLIBSTestData() {
    final List<Object[]> rows = new ArrayList<Object[]>();

    // reads that are nothing but an insertion
    rows.add(new Object[]{new LIBSTest("1I", 1, Arrays.asList(0), Arrays.asList(IS_BEFORE_INSERTION_FLAG))});
    rows.add(new Object[]{new LIBSTest("10I", 10, Arrays.asList(0), Arrays.asList(IS_BEFORE_INSERTION_FLAG))});

    // insertions in the middle and at the end of a read
    rows.add(new Object[]{new LIBSTest("2M2I2M", 6, Arrays.asList(0,1,4,5), Arrays.asList(0,IS_BEFORE_INSERTION_FLAG,IS_AFTER_INSERTION_FLAG,0))});
    rows.add(new Object[]{new LIBSTest("2M2I", 4, Arrays.asList(0,1), Arrays.asList(0,IS_BEFORE_INSERTION_FLAG))});

    //TODO -- uncomment these when LIBS is fixed
    //{new LIBSTest("2I2M", 4, Arrays.asList(2,3), Arrays.asList(IS_AFTER_INSERTION_FLAG,0))},
    //{new LIBSTest("1I1M1D1M", 3, Arrays.asList(0,1), Arrays.asList(IS_AFTER_INSERTION_FLAG | IS_BEFORE_DELETION_START_FLAG | IS_BEFORE_DELETED_BASE_FLAG,IS_AFTER_DELETED_BASE_FLAG | IS_AFTER_DELETION_END_FLAG))},
    //{new LIBSTest("1S1I1M", 3, Arrays.asList(2), Arrays.asList(IS_AFTER_INSERTION_FLAG))},

    // deletion in the middle of a read
    rows.add(new Object[]{new LIBSTest("1M2D2M", 3, Arrays.asList(0,1,2), Arrays.asList(IS_BEFORE_DELETION_START_FLAG | IS_BEFORE_DELETED_BASE_FLAG,IS_AFTER_DELETED_BASE_FLAG | IS_AFTER_DELETION_END_FLAG,0))});

    // soft clips
    rows.add(new Object[]{new LIBSTest("1S1M", 2, Arrays.asList(1), Arrays.asList(IS_NEXT_TO_SOFTCLIP_FLAG))});
    rows.add(new Object[]{new LIBSTest("1M1S", 2, Arrays.asList(0), Arrays.asList(IS_NEXT_TO_SOFTCLIP_FLAG))});
    rows.add(new Object[]{new LIBSTest("1S1M1I", 3, Arrays.asList(1), Arrays.asList(IS_BEFORE_INSERTION_FLAG | IS_NEXT_TO_SOFTCLIP_FLAG))});

    return rows.toArray(new Object[rows.size()][]);
}
/**
 * Runs a single artificial read with the given cigar through the locus iterator and, for each
 * pileup position produced, compares the PileupElement's insertion / deletion / soft-clip
 * predicates and its read offset with the values expected by the test case.
 *
 * The test also fails if the iterator produces more or fewer pileup positions than the test
 * case lists, so a case cannot pass without every expected position having been checked.
 *
 * @param params cigar, read length and per-position expectations for this case
 */
@Test(dataProvider = "LIBSTest")
public void testLIBS(LIBSTest params) {
    final int locus = 44367788;

    SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, locus, params.readLength);
    read.setReadBases(Utils.dupBytes((byte) 'A', params.readLength));
    read.setBaseQualities(Utils.dupBytes((byte) '@', params.readLength));
    read.setCigarString(params.cigar);

    // create the iterator by state with the fake reads and fake records
    li = makeLTBS(Arrays.asList(read), createTestReadProperties());

    final int expectedPositions = params.offsets.size();
    final String where = " for cigar " + params.cigar + " at pileup position ";

    // index of the pileup position being checked; selects the expected offset and flags
    int offset = 0;
    while ( li.hasNext() ) {
        AlignmentContext alignmentContext = li.next();
        ReadBackedPileup p = alignmentContext.getBasePileup();

        // fail with a readable message instead of an IndexOutOfBoundsException from the lookups below
        Assert.assertTrue(offset < expectedPositions, "Iterator produced more than the " + expectedPositions + " expected pileup positions for cigar " + params.cigar);
        Assert.assertEquals(p.getNumberOfElements(), 1, "Wrong number of pileup elements" + where + offset);

        PileupElement pe = p.iterator().next();
        final int flag = params.flags.get(offset);

        Assert.assertEquals(pe.isBeforeDeletedBase(), (flag & IS_BEFORE_DELETED_BASE_FLAG) != 0, "isBeforeDeletedBase" + where + offset);
        Assert.assertEquals(pe.isBeforeDeletionStart(), (flag & IS_BEFORE_DELETION_START_FLAG) != 0, "isBeforeDeletionStart" + where + offset);
        Assert.assertEquals(pe.isAfterDeletedBase(), (flag & IS_AFTER_DELETED_BASE_FLAG) != 0, "isAfterDeletedBase" + where + offset);
        Assert.assertEquals(pe.isAfterDeletionEnd(), (flag & IS_AFTER_DELETION_END_FLAG) != 0, "isAfterDeletionEnd" + where + offset);
        Assert.assertEquals(pe.isBeforeInsertion(), (flag & IS_BEFORE_INSERTION_FLAG) != 0, "isBeforeInsertion" + where + offset);
        Assert.assertEquals(pe.isAfterInsertion(), (flag & IS_AFTER_INSERTION_FLAG) != 0, "isAfterInsertion" + where + offset);
        Assert.assertEquals(pe.isNextToSoftClip(), (flag & IS_NEXT_TO_SOFTCLIP_FLAG) != 0, "isNextToSoftClip" + where + offset);

        Assert.assertEquals(pe.getOffset(), params.offsets.get(offset).intValue(), "Wrong read offset" + where + offset);

        offset++;
    }

    // without this check the test would pass if the iterator stopped early (or produced nothing)
    Assert.assertEquals(offset, expectedPositions, "Wrong number of pileup positions visited for cigar " + params.cigar);
}
////////////////////////////////////////////////
// End comprehensive LIBS/PileupElement tests //
////////////////////////////////////////////////
///////////////////////////////////////
// Read State Manager Tests //
///////////////////////////////////////
/**
 * Data-provider test case for PerSampleReadStateManager.
 *
 * Builds one stack of identical artificial reads per entry of readCountsPerAlignmentStart (each
 * stack at its own alignment start), hands the corresponding record states to the manager one
 * stack at a time, and then checks that iterating over the manager returns the very same read
 * objects in the same order. When removalInterval is positive, every removalInterval-th state is
 * removed through the iterator during the first pass and a second pass verifies what remains.
 */
private class PerSampleReadStateManagerTest extends TestDataProvider {
    // number of reads to create in each stack, one entry per alignment start
    private List<Integer> readCountsPerAlignmentStart;
    // every read created by makeReads(), in the order its state is given to the manager
    private List<SAMRecord> reads;
    // record states grouped by stack, in the same order as readCountsPerAlignmentStart
    private List<ArrayList<LocusIteratorByStateExperimental.SAMRecordState>> recordStatesByAlignmentStart;
    // remove every removalInterval-th state during the first pass; values <= 0 disable removal
    private final int removalInterval;

    /**
     * @param readCountsPerAlignmentStart number of reads to stack at each successive alignment start
     * @param removalInterval             remove every n-th read state while iterating; 0 for no removal
     */
    public PerSampleReadStateManagerTest( List<Integer> readCountsPerAlignmentStart, int removalInterval ) {
        super(PerSampleReadStateManagerTest.class);

        this.readCountsPerAlignmentStart = readCountsPerAlignmentStart;
        this.removalInterval = removalInterval;

        reads = new ArrayList<SAMRecord>();
        recordStatesByAlignmentStart = new ArrayList<ArrayList<LocusIteratorByStateExperimental.SAMRecordState>>();

        setName(String.format("%s: readCountsPerAlignmentStart: %s removalInterval: %d",
                getClass().getSimpleName(), readCountsPerAlignmentStart, removalInterval));
    }

    /**
     * Executes the test case. The fields holding the test data are cleared at the end, so an
     * instance is meant to be run once.
     */
    public void run() {
        // The manager classes are inner classes, so an (empty) iterator is needed to host them
        LocusIteratorByStateExperimental libs = makeLTBS(new ArrayList<SAMRecord>(), createTestReadProperties());
        LocusIteratorByStateExperimental.ReadStateManager readStateManager =
                libs.new ReadStateManager(new ArrayList<SAMRecord>().iterator());
        LocusIteratorByStateExperimental.ReadStateManager.PerSampleReadStateManager perSampleReadStateManager =
                readStateManager.new PerSampleReadStateManager();

        makeReads();

        for ( ArrayList<LocusIteratorByStateExperimental.SAMRecordState> stackRecordStates : recordStatesByAlignmentStart ) {
            perSampleReadStateManager.addStatesAtNextAlignmentStart(stackRecordStates);
        }

        // read state manager should have the right number of reads (TestNG order: actual, expected)
        Assert.assertEquals(perSampleReadStateManager.size(), reads.size());

        Iterator<SAMRecord> originalReadsIterator = reads.iterator();
        Iterator<LocusIteratorByStateExperimental.SAMRecordState> recordStateIterator = perSampleReadStateManager.iterator();
        int recordStateCount = 0;
        int numReadStatesRemoved = 0;

        // Do a first-pass validation of the record state iteration by making sure we get back everything we
        // put in, in the same order, doing any requested removals of read states along the way
        while ( recordStateIterator.hasNext() ) {
            LocusIteratorByStateExperimental.SAMRecordState readState = recordStateIterator.next();
            recordStateCount++;
            SAMRecord readFromPerSampleReadStateManager = readState.getRead();

            Assert.assertTrue(originalReadsIterator.hasNext());
            SAMRecord originalRead = originalReadsIterator.next();

            // The read we get back should be literally the same read in memory as we put in
            Assert.assertSame(readFromPerSampleReadStateManager, originalRead);

            // If requested, remove a read state every removalInterval states
            if ( removalInterval > 0 && recordStateCount % removalInterval == 0 ) {
                recordStateIterator.remove();
                numReadStatesRemoved++;
            }
        }

        Assert.assertFalse(originalReadsIterator.hasNext());

        // If we removed any read states, do a second pass through the read states to make sure the right
        // states were removed
        if ( numReadStatesRemoved > 0 ) {
            Assert.assertEquals(perSampleReadStateManager.size(), reads.size() - numReadStatesRemoved);

            originalReadsIterator = reads.iterator();
            recordStateIterator = perSampleReadStateManager.iterator();
            int readCount = 0;
            int readStateCount = 0;

            // Match record states with the reads that should remain after removal
            while ( recordStateIterator.hasNext() ) {
                LocusIteratorByStateExperimental.SAMRecordState readState = recordStateIterator.next();
                readStateCount++;
                SAMRecord readFromPerSampleReadStateManager = readState.getRead();

                Assert.assertTrue(originalReadsIterator.hasNext());
                SAMRecord originalRead = originalReadsIterator.next();
                readCount++;

                if ( readCount % removalInterval == 0 ) {
                    originalRead = originalReadsIterator.next(); // advance to next read, since the previous one should have been discarded
                    readCount++;
                }

                // The read we get back should be literally the same read in memory as we put in (after accounting for removals)
                Assert.assertSame(readFromPerSampleReadStateManager, originalRead);
            }

            Assert.assertEquals(readStateCount, reads.size() - numReadStatesRemoved);
        }

        // Allow memory used by this test to be reclaimed
        readCountsPerAlignmentStart = null;
        reads = null;
        recordStatesByAlignmentStart = null;
    }

    /**
     * Creates one stack of identical reads per entry of readCountsPerAlignmentStart, each stack
     * starting one position after the previous one, and records both the reads and a record state
     * for every read.
     */
    private void makeReads() {
        int alignmentStart = 1;

        for ( int readsThisStack : readCountsPerAlignmentStart ) {
            ArrayList<SAMRecord> stackReads = new ArrayList<SAMRecord>(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(readsThisStack, header, "foo", 0, alignmentStart, MathUtils.randomIntegerInRange(50, 100)));
            ArrayList<LocusIteratorByStateExperimental.SAMRecordState> stackRecordStates = new ArrayList<LocusIteratorByStateExperimental.SAMRecordState>();

            for ( SAMRecord read : stackReads ) {
                stackRecordStates.add(new LocusIteratorByStateExperimental.SAMRecordState(read));
            }

            reads.addAll(stackReads);
            recordStatesByAlignmentStart.add(stackRecordStates);

            // each stack gets its own alignment start, as the name readCountsPerAlignmentStart implies
            alignmentStart++;
        }
    }
}
/**
 * Creates one PerSampleReadStateManagerTest for every combination of read-stack layout and
 * removal interval listed below, then returns the tests known for that class.
 *
 * @return the result of PerSampleReadStateManagerTest.getTests for the test-case class
 */
@DataProvider(name = "PerSampleReadStateManagerTestDataProvider")
public Object[][] createPerSampleReadStateManagerTests() {
    // each inner list gives the number of reads to create per alignment start
    final List<List<Integer>> stackLayouts = Arrays.asList(
            Arrays.asList(1),
            Arrays.asList(2),
            Arrays.asList(10),
            Arrays.asList(1, 1),
            Arrays.asList(2, 2),
            Arrays.asList(10, 10),
            Arrays.asList(1, 10),
            Arrays.asList(10, 1),
            Arrays.asList(1, 1, 1),
            Arrays.asList(2, 2, 2),
            Arrays.asList(10, 10, 10),
            Arrays.asList(1, 1, 1, 1, 1, 1),
            Arrays.asList(10, 10, 10, 10, 10, 10),
            Arrays.asList(1, 2, 10, 1, 2, 10)
    );

    // 0 means "never remove a read state"
    final int[] removalIntervals = {0, 2, 3};

    for ( final List<Integer> layout : stackLayouts ) {
        for ( final int interval : removalIntervals ) {
            // The instance is deliberately not kept here.
            // NOTE(review): presumably the TestDataProvider constructor registers it for getTests() -- confirm
            new PerSampleReadStateManagerTest(layout, interval);
        }
    }

    return PerSampleReadStateManagerTest.getTests(PerSampleReadStateManagerTest.class);
}
/**
 * Logs and runs one test case supplied by the "PerSampleReadStateManagerTestDataProvider" provider.
 *
 * @param test the test case to execute
 */
@Test(dataProvider = "PerSampleReadStateManagerTestDataProvider")
public void runPerSampleReadStateManagerTest( PerSampleReadStateManagerTest test ) {
    final String announcement = "Running test: " + test;
    logger.warn(announcement);

    test.run();
}
}

View File

@ -28,14 +28,12 @@ import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
@ -48,7 +46,6 @@ import java.util.List;
*/
public class VerifyingSamIteratorUnitTest {
private SAMFileHeader samFileHeader;
private GenomeLocParser genomeLocParser;
@BeforeClass
public void init() {
@ -58,8 +55,6 @@ public class VerifyingSamIteratorUnitTest {
samFileHeader = new SAMFileHeader();
samFileHeader.setSequenceDictionary(sequenceDictionary);
genomeLocParser = new GenomeLocParser(sequenceDictionary);
}
@Test
@ -68,7 +63,7 @@ public class VerifyingSamIteratorUnitTest {
SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(samFileHeader,"read2",getContig(0).getSequenceIndex(),2,10);
List<SAMRecord> reads = Arrays.asList(read1,read2);
VerifyingSamIterator iterator = new VerifyingSamIterator(genomeLocParser,StingSAMIteratorAdapter.adapt(reads.iterator()));
VerifyingSamIterator iterator = new VerifyingSamIterator(StingSAMIteratorAdapter.adapt(reads.iterator()));
Assert.assertTrue(iterator.hasNext(),"Insufficient reads");
Assert.assertSame(iterator.next(),read1,"Incorrect read in read 1 position");
@ -83,7 +78,7 @@ public class VerifyingSamIteratorUnitTest {
SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(samFileHeader,"read2",getContig(1).getSequenceIndex(),1,10);
List<SAMRecord> reads = Arrays.asList(read1,read2);
VerifyingSamIterator iterator = new VerifyingSamIterator(genomeLocParser,StingSAMIteratorAdapter.adapt(reads.iterator()));
VerifyingSamIterator iterator = new VerifyingSamIterator(StingSAMIteratorAdapter.adapt(reads.iterator()));
Assert.assertTrue(iterator.hasNext(),"Insufficient reads");
Assert.assertSame(iterator.next(),read1,"Incorrect read in read 1 position");
@ -98,7 +93,7 @@ public class VerifyingSamIteratorUnitTest {
SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(samFileHeader,"read2",getContig(0).getSequenceIndex(),1,10);
List<SAMRecord> reads = Arrays.asList(read1,read2);
VerifyingSamIterator iterator = new VerifyingSamIterator(genomeLocParser,StingSAMIteratorAdapter.adapt(reads.iterator()));
VerifyingSamIterator iterator = new VerifyingSamIterator(StingSAMIteratorAdapter.adapt(reads.iterator()));
Assert.assertTrue(iterator.hasNext(),"Insufficient reads");
Assert.assertSame(iterator.next(),read1,"Incorrect read in read 1 position");
@ -116,7 +111,7 @@ public class VerifyingSamIteratorUnitTest {
read1.setReferenceIndex(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX);
List<SAMRecord> reads = Arrays.asList(read1,read2);
VerifyingSamIterator iterator = new VerifyingSamIterator(genomeLocParser,StingSAMIteratorAdapter.adapt(reads.iterator()));
VerifyingSamIterator iterator = new VerifyingSamIterator(StingSAMIteratorAdapter.adapt(reads.iterator()));
Assert.assertTrue(iterator.hasNext(),"Insufficient reads");
Assert.assertSame(iterator.next(),read1,"Incorrect read in read 1 position");

Some files were not shown because too many files have changed in this diff Show More