diff --git a/build.xml b/build.xml index f681ddafa..0d1deba29 100644 --- a/build.xml +++ b/build.xml @@ -577,6 +577,7 @@ docletpathref="doclet.classpath" classpathref="external.dependencies" classpath="${java.classes}" + maxmemory="2g" additionalparam="-build-timestamp "${build.timestamp}" -absolute-version ${build.version} -out ${basedir}/${resource.path} -quiet"> @@ -780,6 +781,7 @@ docletpathref="doclet.classpath" classpathref="external.dependencies" classpath="${java.classes}" + maxmemory="2g" additionalparam="${gatkdocs.include.hidden.arg} -private -build-timestamp "${build.timestamp}" -absolute-version ${build.version} -quiet"> diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java index bdb9ef843..d2fc08c62 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java @@ -546,7 +546,7 @@ public class SlidingWindow { FractionalDownsampler downsampler = new FractionalDownsampler(fraction); downsampler.submit(allReads); - return downsampler.consumeDownsampledItems(); + return downsampler.consumeFinalizedItems(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java index 312d31727..ce57d1a7a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java +++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java @@ -112,31 +112,31 @@ public class CommandLineGATK extends CommandLineExecutable { } } - protected static final String PICARD_TEXT_SAM_FILE_ERROR_1 = "Cannot use index file with textual SAM file"; - protected static final String PICARD_TEXT_SAM_FILE_ERROR_2 = "Cannot retrieve file pointers within SAM text files"; + 
public static final String PICARD_TEXT_SAM_FILE_ERROR_1 = "Cannot use index file with textual SAM file"; + public static final String PICARD_TEXT_SAM_FILE_ERROR_2 = "Cannot retrieve file pointers within SAM text files"; private static void checkForMaskedUserErrors(final Throwable t) { final String message = t.getMessage(); if ( message == null ) return; // we know what to do about the common "Too many open files" error - if ( message.indexOf("Too many open files") != -1 ) + if ( message.contains("Too many open files") ) exitSystemWithUserError(new UserException.TooManyOpenFiles()); // malformed BAM looks like a SAM file - if ( message.indexOf(PICARD_TEXT_SAM_FILE_ERROR_1) != -1 || - message.indexOf(PICARD_TEXT_SAM_FILE_ERROR_2) != -1 ) + if ( message.contains(PICARD_TEXT_SAM_FILE_ERROR_1) || + message.contains(PICARD_TEXT_SAM_FILE_ERROR_2) ) exitSystemWithSamError(t); // can't close tribble index when writing - if ( message.indexOf("Unable to close index for") != -1 ) + if ( message.contains("Unable to close index for") ) exitSystemWithUserError(new UserException(t.getCause() == null ? 
message : t.getCause().getMessage())); // disk is full - if ( message.indexOf("No space left on device") != -1 ) - exitSystemWithUserError(new UserException(t.getMessage())); - if ( t.getCause() != null && t.getCause().getMessage().indexOf("No space left on device") != -1 ) - exitSystemWithUserError(new UserException(t.getCause().getMessage())); + if ( message.contains("No space left on device") ) + exitSystemWithUserError(new UserException.NoSpaceOnDevice()); + if ( t.getCause() != null && t.getCause().getMessage().contains("No space left on device") ) + exitSystemWithUserError(new UserException.NoSpaceOnDevice()); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/DownsamplingMethod.java b/public/java/src/org/broadinstitute/sting/gatk/DownsamplingMethod.java deleted file mode 100644 index 6d9e79156..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/DownsamplingMethod.java +++ /dev/null @@ -1,52 +0,0 @@ -package org.broadinstitute.sting.gatk; - -import org.broadinstitute.sting.utils.exceptions.UserException; - -/** - * Describes the method for downsampling reads at a given locus. - * - * @author hanna - * @version 0.1 - */ - -public class DownsamplingMethod { - /** - * Type of downsampling to perform. - */ - public final DownsampleType type; - - /** - * Actual downsampling target is specified as an integer number of reads. - */ - public final Integer toCoverage; - - /** - * Actual downsampling target is specified as a fraction of total available reads. - */ - public final Double toFraction; - - /** - * Expresses no downsampling applied at all. - */ - public static final DownsamplingMethod NONE = new DownsamplingMethod(DownsampleType.NONE,null,null); - - public DownsamplingMethod(DownsampleType type, Integer toCoverage, Double toFraction) { - // Do some basic sanity checks on the downsampling parameters passed in. - - // Can't leave toFraction and toCoverage null unless type is experimental naive duplicate eliminator. 
- if(type != DownsampleType.NONE && toFraction == null && toCoverage == null) - throw new UserException.CommandLineException("Must specify either toFraction or toCoverage when downsampling."); - - // Fraction and coverage cannot both be specified. - if(toFraction != null && toCoverage != null) - throw new UserException.CommandLineException("Downsampling coverage and fraction are both specified. Please choose only one."); - - // Experimental by sample downsampling does not work with a fraction of reads. - if(type == DownsampleType.BY_SAMPLE && toFraction != null) - throw new UserException.CommandLineException("Cannot downsample to fraction with new EXPERIMENTAL_BY_SAMPLE method"); - - this.type = type; - this.toCoverage = toCoverage; - this.toFraction = toFraction; - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index b9b5e452d..3ce8a92b7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -36,6 +36,7 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.datasources.reads.*; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.executive.MicroScheduler; import org.broadinstitute.sting.gatk.filters.FilterManager; import org.broadinstitute.sting.gatk.filters.ReadFilter; @@ -143,6 +144,8 @@ public class GenomeAnalysisEngine { */ private ThreadAllocation threadAllocation; + private ReadMetrics cumulativeMetrics = null; + /** * A currently hacky unique name for this GATK instance */ @@ -398,28 +401,22 @@ public class GenomeAnalysisEngine { * Parse out the thread allocation from 
the given command-line argument. */ private void determineThreadAllocation() { - Tags tags = parsingEngine.getTags(argCollection.numberOfThreads); + if ( argCollection.numberOfDataThreads < 1 ) throw new UserException.BadArgumentValue("num_threads", "cannot be less than 1, but saw " + argCollection.numberOfDataThreads); + if ( argCollection.numberOfCPUThreadsPerDataThread < 1 ) throw new UserException.BadArgumentValue("num_cpu_threads", "cannot be less than 1, but saw " + argCollection.numberOfCPUThreadsPerDataThread); + if ( argCollection.numberOfIOThreads < 0 ) throw new UserException.BadArgumentValue("num_io_threads", "cannot be less than 0, but saw " + argCollection.numberOfIOThreads); - // TODO: Kill this complicated logic once Queue supports arbitrary tagged parameters. - Integer numCPUThreads = null; - if(tags.containsKey("cpu") && argCollection.numberOfCPUThreads != null) - throw new UserException("Number of CPU threads specified both directly on the command-line and as a tag to the nt argument. Please specify only one or the other."); - else if(tags.containsKey("cpu")) - numCPUThreads = Integer.parseInt(tags.getValue("cpu")); - else if(argCollection.numberOfCPUThreads != null) - numCPUThreads = argCollection.numberOfCPUThreads; - - Integer numIOThreads = null; - if(tags.containsKey("io") && argCollection.numberOfIOThreads != null) - throw new UserException("Number of IO threads specified both directly on the command-line and as a tag to the nt argument. Please specify only one or the other."); - else if(tags.containsKey("io")) - numIOThreads = Integer.parseInt(tags.getValue("io")); - else if(argCollection.numberOfIOThreads != null) - numIOThreads = argCollection.numberOfIOThreads; - - this.threadAllocation = new ThreadAllocation(argCollection.numberOfThreads, numCPUThreads, numIOThreads, ! 
argCollection.disableEfficiencyMonitor); + this.threadAllocation = new ThreadAllocation(argCollection.numberOfDataThreads, + argCollection.numberOfCPUThreadsPerDataThread, + argCollection.numberOfIOThreads, + ! argCollection.disableEfficiencyMonitor); } + public int getTotalNumberOfThreads() { + return this.threadAllocation == null ? 1 : threadAllocation.getTotalNumThreads(); + } + + + /** * Allow subclasses and others within this package direct access to the walker manager. * @return The walker manager used by this package. @@ -445,14 +442,18 @@ public class GenomeAnalysisEngine { protected DownsamplingMethod getDownsamplingMethod() { GATKArgumentCollection argCollection = this.getArguments(); - DownsamplingMethod method; - if(argCollection.getDownsamplingMethod() != null) - method = argCollection.getDownsamplingMethod(); - else if(WalkerManager.getDownsamplingMethod(walker) != null) - method = WalkerManager.getDownsamplingMethod(walker); - else - method = GATKArgumentCollection.getDefaultDownsamplingMethod(); - return method; + boolean useExperimentalDownsampling = argCollection.enableExperimentalDownsampling; + + // until the file pointer bug with the experimental downsamplers is fixed, disallow running with experimental downsampling + if ( useExperimentalDownsampling ) { + throw new UserException("The experimental downsampling implementation is currently crippled by a file-pointer-related bug. Until this bug is fixed, it's not safe (or possible) for anyone to use the experimental implementation!"); + } + + DownsamplingMethod commandLineMethod = argCollection.getDownsamplingMethod(); + DownsamplingMethod walkerMethod = WalkerManager.getDownsamplingMethod(walker, useExperimentalDownsampling); + DownsamplingMethod defaultMethod = DownsamplingMethod.getDefaultDownsamplingMethod(walker, useExperimentalDownsampling); + + return commandLineMethod != null ? commandLineMethod : (walkerMethod != null ? 
walkerMethod : defaultMethod); } protected void setDownsamplingMethod(DownsamplingMethod method) { @@ -825,11 +826,13 @@ public class GenomeAnalysisEngine { * @return A data source for the given set of reads. */ private SAMDataSource createReadsDataSource(GATKArgumentCollection argCollection, GenomeLocParser genomeLocParser, IndexedFastaSequenceFile refReader) { - DownsamplingMethod method = getDownsamplingMethod(); + DownsamplingMethod downsamplingMethod = getDownsamplingMethod(); // Synchronize the method back into the collection so that it shows up when // interrogating for the downsample method during command line recreation. - setDownsamplingMethod(method); + setDownsamplingMethod(downsamplingMethod); + + logger.info(downsamplingMethod); if (argCollection.removeProgramRecords && argCollection.keepProgramRecords) throw new UserException.BadArgumentValue("rpr / kpr", "Cannot enable both options"); @@ -847,7 +850,7 @@ public class GenomeAnalysisEngine { argCollection.useOriginalBaseQualities, argCollection.strictnessLevel, argCollection.readBufferSize, - method, + downsamplingMethod, new ValidationExclusion(Arrays.asList(argCollection.unsafe)), filters, readTransformers, @@ -1035,7 +1038,10 @@ public class GenomeAnalysisEngine { * owned by the caller; the caller can do with the object what they wish. */ public ReadMetrics getCumulativeMetrics() { - return readsDataSource == null ? null : readsDataSource.getCumulativeReadMetrics(); + // todo -- probably shouldn't be lazy + if ( cumulativeMetrics == null ) + cumulativeMetrics = readsDataSource == null ? 
new ReadMetrics() : readsDataSource.getCumulativeReadMetrics(); + return cumulativeMetrics; } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/ReadMetrics.java b/public/java/src/org/broadinstitute/sting/gatk/ReadMetrics.java index ceaa30f01..bfea0b1e1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/ReadMetrics.java +++ b/public/java/src/org/broadinstitute/sting/gatk/ReadMetrics.java @@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk; import net.sf.picard.filter.SamRecordFilter; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import java.util.Collections; import java.util.HashMap; import java.util.Map; import java.util.TreeMap; @@ -119,11 +118,18 @@ public class ReadMetrics implements Cloneable { return nRecords; } + /** + * Increments the number of 'iterations' (one call of filter/map/reduce sequence) completed. + */ + public void incrementNumIterations(final long by) { + nRecords += by; + } + /** * Increments the number of 'iterations' (one call of filter/map/reduce sequence) completed. 
*/ public void incrementNumIterations() { - nRecords++; + incrementNumIterations(1); } public long getNumReadsSeen() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java b/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java index b2d4d202d..e1ada93cc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java +++ b/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java @@ -4,6 +4,7 @@ import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMFileReader; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.iterators.ReadTransformer; diff --git a/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java b/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java index ae59ce438..fbacbddc4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java @@ -27,6 +27,8 @@ package org.broadinstitute.sting.gatk; import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.filters.FilterManager; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.iterators.ReadTransformer; @@ -304,9 +306,10 @@ public class WalkerManager extends PluginManager { * downsampling method is specified on the command-line, the command-line version will * be used instead. * @param walkerClass The class of the walker to interrogate. 
+ * @param useExperimentalDownsampling If true, use the experimental downsampling implementation * @return The downsampling method, as specified by the walker. Null if none exists. */ - public static DownsamplingMethod getDownsamplingMethod(Class walkerClass) { + public static DownsamplingMethod getDownsamplingMethod(Class walkerClass, boolean useExperimentalDownsampling) { DownsamplingMethod downsamplingMethod = null; if( walkerClass.isAnnotationPresent(Downsample.class) ) { @@ -314,7 +317,7 @@ public class WalkerManager extends PluginManager { DownsampleType type = downsampleParameters.by(); Integer toCoverage = downsampleParameters.toCoverage() >= 0 ? downsampleParameters.toCoverage() : null; Double toFraction = downsampleParameters.toFraction() >= 0.0d ? downsampleParameters.toFraction() : null; - downsamplingMethod = new DownsamplingMethod(type,toCoverage,toFraction); + downsamplingMethod = new DownsamplingMethod(type,toCoverage,toFraction,useExperimentalDownsampling); } return downsamplingMethod; @@ -333,10 +336,11 @@ public class WalkerManager extends PluginManager { * downsampling method is specified on the command-line, the command-line version will * be used instead. * @param walker The walker to interrogate. + * @param useExperimentalDownsampling If true, use the experimental downsampling implementation * @return The downsampling method, as specified by the walker. Null if none exists. 
*/ - public static DownsamplingMethod getDownsamplingMethod(Walker walker) { - return getDownsamplingMethod(walker.getClass()); + public static DownsamplingMethod getDownsamplingMethod(Walker walker, boolean useExperimentalDownsampling) { + return getDownsamplingMethod(walker.getClass(), useExperimentalDownsampling); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index 72cb5e02f..44817379a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -31,8 +31,8 @@ import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.commandline.IntervalBinding; -import org.broadinstitute.sting.gatk.DownsampleType; -import org.broadinstitute.sting.gatk.DownsamplingMethod; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; import org.broadinstitute.sting.gatk.samples.PedigreeValidationType; import org.broadinstitute.sting.utils.QualityUtils; @@ -41,7 +41,9 @@ import org.broadinstitute.sting.utils.interval.IntervalMergingRule; import org.broadinstitute.sting.utils.interval.IntervalSetRule; import java.io.File; -import java.util.*; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; /** * @author aaron @@ -138,15 +140,11 @@ public class GATKArgumentCollection { @Argument(fullName = "nonDeterministicRandomSeed", shortName = "ndrs", doc = "Makes the GATK behave non deterministically, that is, the random numbers generated will be different in every run", required = false) public boolean 
nonDeterministicRandomSeed = false; - /** - * The override mechanism in the GATK, by default, populates the command-line arguments, then - * the defaults from the walker annotations. Unfortunately, walker annotations should be trumped - * by a user explicitly specifying command-line arguments. - * TODO: Change the GATK so that walker defaults are loaded first, then command-line arguments. - */ - private static DownsampleType DEFAULT_DOWNSAMPLING_TYPE = DownsampleType.BY_SAMPLE; - private static int DEFAULT_DOWNSAMPLING_COVERAGE = 1000; - + // -------------------------------------------------------------------------------------------------------------- + // + // Downsampling Arguments + // + // -------------------------------------------------------------------------------------------------------------- @Argument(fullName = "downsampling_type", shortName="dt", doc="Type of reads downsampling to employ at a given locus. Reads will be selected randomly to be removed from the pile based on the method described here", required = false) public DownsampleType downsamplingType = null; @@ -156,17 +154,20 @@ public class GATKArgumentCollection { @Argument(fullName = "downsample_to_coverage", shortName = "dcov", doc = "Coverage [integer] to downsample to at any given locus; note that downsampled reads are randomly selected from all possible reads at a locus", required = false) public Integer downsampleCoverage = null; + @Argument(fullName = "enable_experimental_downsampling", shortName = "enable_experimental_downsampling", doc = "Enable experimental engine-level downsampling", required = false) + @Hidden + public boolean enableExperimentalDownsampling = false; + /** * Gets the downsampling method explicitly specified by the user. If the user didn't specify * a default downsampling mechanism, return the default. * @return The explicitly specified downsampling mechanism, or the default if none exists. 
*/ public DownsamplingMethod getDownsamplingMethod() { - if(downsamplingType == null && downsampleFraction == null && downsampleCoverage == null) + if ( downsamplingType == null && downsampleFraction == null && downsampleCoverage == null ) return null; - if(downsamplingType == null && downsampleCoverage != null) - return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE,downsampleCoverage,null); - return new DownsamplingMethod(downsamplingType,downsampleCoverage,downsampleFraction); + + return new DownsamplingMethod(downsamplingType, downsampleCoverage, downsampleFraction, enableExperimentalDownsampling); } /** @@ -176,9 +177,11 @@ public class GATKArgumentCollection { public void setDownsamplingMethod(DownsamplingMethod method) { if (method == null) throw new IllegalArgumentException("method is null"); + downsamplingType = method.type; downsampleCoverage = method.toCoverage; downsampleFraction = method.toFraction; + enableExperimentalDownsampling = method.useExperimentalDownsampling; } // -------------------------------------------------------------------------------------------------------------- @@ -197,17 +200,14 @@ public class GATKArgumentCollection { // performance log arguments // // -------------------------------------------------------------------------------------------------------------- - @Argument(fullName = "performanceLog", shortName="PF", doc="If provided, a GATK runtime performance log will be written to this file", required = false) - public File performanceLog = null; /** - * Gets the default downsampling method, returned if the user didn't specify any downsampling - * method. - * @return The default downsampling mechanism, or null if none exists. + * The file name for the GATK performance log output, or null if you don't want to generate the + * detailed performance logging table. 
This table is suitable for importing into R or any + * other analysis software that can read tsv files */ - public static DownsamplingMethod getDefaultDownsamplingMethod() { - return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE,DEFAULT_DOWNSAMPLING_COVERAGE,null); - } + @Argument(fullName = "performanceLog", shortName="PF", doc="If provided, a GATK runtime performance log will be written to this file", required = false) + public File performanceLog = null; @Argument(fullName="useOriginalQualities", shortName = "OQ", doc = "If set, use the original base quality scores from the OQ tag when present instead of the standard scores", required=false) public Boolean useOriginalBaseQualities = false; @@ -279,9 +279,32 @@ public class GATKArgumentCollection { @Argument(fullName = "unsafe", shortName = "U", doc = "If set, enables unsafe operations: nothing will be checked at runtime. For expert users only who know what they are doing. We do not support usage of this argument.", required = false) public ValidationExclusion.TYPE unsafe; - /** How many threads should be allocated to this analysis. */ - @Argument(fullName = "num_threads", shortName = "nt", doc = "How many threads should be allocated to running this analysis.", required = false) - public Integer numberOfThreads = 1; + // -------------------------------------------------------------------------------------------------------------- + // + // Multi-threading arguments + // + // -------------------------------------------------------------------------------------------------------------- + + /** + * How many data threads should be allocated to this analysis? Data threads contains N cpu threads per + * data thread, and act as completely data parallel processing, increasing the memory usage of GATK + * by M data threads. 
Data threads generally scale extremely effectively, up to 24 cores + */ + @Argument(fullName = "num_threads", shortName = "nt", doc = "How many data threads should be allocated to running this analysis.", required = false) + public Integer numberOfDataThreads = 1; + + /** + * How many CPU threads should be allocated per data thread? Each CPU thread operates the map + * cycle independently, but may run into earlier scaling problems with IO than data threads. Has + * the benefit of not requiring X times as much memory per thread as data threads do, but rather + * only a constant overhead. + */ + @Argument(fullName="num_cpu_threads_per_data_thread", shortName = "nct", doc="How many CPU threads should be allocated per data thread to running this analysis?", required = false) + public int numberOfCPUThreadsPerDataThread = 1; + + @Argument(fullName="num_io_threads", shortName = "nit", doc="How many of the given threads should be allocated to IO", required = false) + @Hidden + public int numberOfIOThreads = 0; /** * By default the GATK monitors its own efficiency, but this can have a itsy-bitsy tiny @@ -291,17 +314,6 @@ public class GATKArgumentCollection { @Argument(fullName = "disableThreadEfficiencyMonitor", shortName = "dtem", doc = "Disable GATK efficiency monitoring", required = false) public Boolean disableEfficiencyMonitor = false; - /** - * The following two arguments (num_cpu_threads, num_io_threads are TEMPORARY since Queue cannot currently support arbitrary tagged data types. - * TODO: Kill this when I can do a tagged integer in Queue. 
- */ - @Argument(fullName="num_cpu_threads", shortName = "nct", doc="How many of the given threads should be allocated to the CPU", required = false) - @Hidden - public Integer numberOfCPUThreads = null; - @Argument(fullName="num_io_threads", shortName = "nit", doc="How many of the given threads should be allocated to IO", required = false) - @Hidden - public Integer numberOfIOThreads = null; - @Argument(fullName = "num_bam_file_handles", shortName = "bfh", doc="The total number of BAM file handles to keep open simultaneously", required=false) public Integer numberOfBAMFileHandles = null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java index a3ce6dd27..cd3403f2f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java @@ -1,6 +1,6 @@ package org.broadinstitute.sting.gatk.datasources.providers; -import org.broadinstitute.sting.gatk.DownsampleType; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.iterators.LocusIterator; @@ -135,8 +135,13 @@ public abstract class LocusView extends LocusIterator implements View { // Cache the current and apply filtering. AlignmentContext current = nextLocus; - if( sourceInfo.getDownsamplingMethod().type == DownsampleType.ALL_READS && sourceInfo.getDownsamplingMethod().toCoverage != null ) + + // The old ALL_READS downsampling implementation -- only use if we're not using the new experimental downsampling: + if( ! 
sourceInfo.getDownsamplingMethod().useExperimentalDownsampling && + sourceInfo.getDownsamplingMethod().type == DownsampleType.ALL_READS && sourceInfo.getDownsamplingMethod().toCoverage != null ) { + current.downsampleToCoverage( sourceInfo.getDownsamplingMethod().toCoverage ); + } // Indicate that the next operation will need to advance. nextLocus = null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index 7d027438b..437813f19 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -30,7 +30,9 @@ import net.sf.samtools.*; import net.sf.samtools.util.CloseableIterator; import net.sf.samtools.util.RuntimeIOException; import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.DownsamplingMethod; +import org.broadinstitute.sting.gatk.downsampling.*; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.ReadMetrics; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; @@ -152,6 +154,8 @@ public class SAMDataSource { */ private final ThreadAllocation threadAllocation; + private final boolean expandShardsForDownsampling; + /** * Create a new SAM data source given the supplied read metadata. * @param samFiles list of reads files. 
@@ -302,6 +306,11 @@ public class SAMDataSource { includeReadsWithDeletionAtLoci, defaultBaseQualities); + expandShardsForDownsampling = readProperties.getDownsamplingMethod() != null && + readProperties.getDownsamplingMethod().useExperimentalDownsampling && + readProperties.getDownsamplingMethod().type != DownsampleType.NONE && + readProperties.getDownsamplingMethod().toCoverage != null; + // cache the read group id (original) -> read group id (merged) // and read group id (merged) -> read group id (original) mappings. for(SAMReaderID id: readerIDs) { @@ -457,6 +466,16 @@ public class SAMDataSource { } } + /** + * Are we expanding shards as necessary to prevent shard boundaries from occurring at improper places? + * + * @return true if we are using expanded shards, otherwise false + */ + public boolean usingExpandedShards() { + return expandShardsForDownsampling; + } + + /** * Fill the given buffering shard with reads. * @param shard Shard to fill. @@ -484,6 +503,31 @@ public class SAMDataSource { } } + // If the reads are sorted in coordinate order, ensure that all reads + // having the same alignment start become part of the same shard, to allow + // downsampling to work better across shard boundaries. Note that because our + // read stream has already been fed through the positional downsampler, which + // ensures that at each alignment start position there are no more than dcov + // reads, we're in no danger of accidentally creating a disproportionately huge + // shard + if ( expandShardsForDownsampling && sortOrder == SAMFileHeader.SortOrder.coordinate ) { + while ( iterator.hasNext() ) { + SAMRecord additionalRead = iterator.next(); + + // Stop filling the shard as soon as we encounter a read having a different + // alignment start or contig from the last read added in the earlier loop + // above, or an unmapped read + if ( read == null || + additionalRead.getReadUnmappedFlag() || + ! 
additionalRead.getReferenceIndex().equals(read.getReferenceIndex()) || + additionalRead.getAlignmentStart() != read.getAlignmentStart() ) { + break; + } + shard.addRead(additionalRead); + noteFilePositionUpdate(positionUpdates, additionalRead); + } + } + // If the reads are sorted in queryname order, ensure that all reads // having the same queryname become part of the same shard. if(sortOrder == SAMFileHeader.SortOrder.queryname) { @@ -578,6 +622,7 @@ public class SAMDataSource { iterator = new MalformedBAMErrorReformatingIterator(id.samFile, iterator); if(shard.getGenomeLocs().size() > 0) iterator = new IntervalOverlapFilteringIterator(iterator,shard.getGenomeLocs()); + iteratorMap.put(readers.getReader(id), iterator); } @@ -660,20 +705,25 @@ public class SAMDataSource { List readTransformers, byte defaultBaseQualities) { - // *********************************************************************************** // - // * NOTE: ALL FILTERING SHOULD BE DONE BEFORE ANY ITERATORS THAT MODIFY THE READS! * // - // * (otherwise we will process something that we may end up throwing away) * // - // *********************************************************************************** // + // ************************************************************************************************ // + // * NOTE: ALL FILTERING/DOWNSAMPLING SHOULD BE DONE BEFORE ANY ITERATORS THAT MODIFY THE READS! 
* // + // * (otherwise we will process something that we may end up throwing away) * // + // ************************************************************************************************ // - if (downsamplingFraction != null) - wrappedIterator = new DownsampleIterator(wrappedIterator, downsamplingFraction); + wrappedIterator = StingSAMIteratorAdapter.adapt(new CountingFilteringIterator(readMetrics,wrappedIterator,supplementalFilters)); + + if ( readProperties.getDownsamplingMethod().useExperimentalDownsampling ) { + wrappedIterator = applyDownsamplingIterator(wrappedIterator); + } + + // Use the old fractional downsampler only if we're not using experimental downsampling: + if ( ! readProperties.getDownsamplingMethod().useExperimentalDownsampling && downsamplingFraction != null ) + wrappedIterator = new LegacyDownsampleIterator(wrappedIterator, downsamplingFraction); // unless they've said not to validate read ordering (!noValidationOfReadOrder) and we've enabled verification, // verify the read ordering by applying a sort order iterator if (!noValidationOfReadOrder && enableVerification) - wrappedIterator = new VerifyingSamIterator(genomeLocParser,wrappedIterator); - - wrappedIterator = StingSAMIteratorAdapter.adapt(new CountingFilteringIterator(readMetrics,wrappedIterator,supplementalFilters)); + wrappedIterator = new VerifyingSamIterator(wrappedIterator); if (useOriginalBaseQualities || defaultBaseQualities >= 0) // only wrap if we are replacing the original qualities or using a default base quality @@ -688,6 +738,26 @@ public class SAMDataSource { return wrappedIterator; } + protected StingSAMIterator applyDownsamplingIterator( StingSAMIterator wrappedIterator ) { + if ( readProperties.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE ) { + ReadsDownsamplerFactory downsamplerFactory = readProperties.getDownsamplingMethod().toCoverage != null ? 
+ new SimplePositionalDownsamplerFactory(readProperties.getDownsamplingMethod().toCoverage) : + new FractionalDownsamplerFactory(readProperties.getDownsamplingMethod().toFraction); + + return new PerSampleDownsamplingReadsIterator(wrappedIterator, downsamplerFactory); + } + else if ( readProperties.getDownsamplingMethod().type == DownsampleType.ALL_READS ) { + ReadsDownsampler downsampler = readProperties.getDownsamplingMethod().toCoverage != null ? + new SimplePositionalDownsampler(readProperties.getDownsamplingMethod().toCoverage) : + new FractionalDownsampler(readProperties.getDownsamplingMethod().toFraction); + + return new DownsamplingReadsIterator(wrappedIterator, downsampler); + } + + return wrappedIterator; + } + + private class SAMResourcePool { /** * How many entries can be cached in this resource pool? diff --git a/public/java/src/org/broadinstitute/sting/gatk/DownsampleType.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsampleType.java similarity index 75% rename from public/java/src/org/broadinstitute/sting/gatk/DownsampleType.java rename to public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsampleType.java index 3fabf6e0d..c3d17436a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/DownsampleType.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsampleType.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.gatk; +package org.broadinstitute.sting.gatk.downsampling; /** * Type of downsampling method to invoke. 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/Downsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/Downsampler.java index 5fb99b2bc..f5741af4e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/Downsampler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/Downsampler.java @@ -28,49 +28,92 @@ import java.util.Collection; import java.util.List; /** - * The basic downsampler API, with no reads-specific operations + * The basic downsampler API, with no reads-specific operations. + * + * Downsamplers that extend this interface rather than the ReadsDownsampler interface can handle + * any kind of item, however they cannot be wrapped within a DownsamplingReadsIterator or a + * PerSampleDownsamplingReadsIterator. * * @author David Roazen */ public interface Downsampler { - /* - * Submit one item to the downsampler for consideration . Some downsamplers will be able to determine + /** + * Submit one item to the downsampler for consideration. Some downsamplers will be able to determine * immediately whether the item survives the downsampling process, while others will need to see * more items before making that determination. + * + * @param item the individual item to submit to the downsampler for consideration */ public void submit( T item ); - /* - * Submit a collection of items to the downsampler for consideration. + /** + * Submit a collection of items to the downsampler for consideration. Should be equivalent to calling + * submit() on each individual item in the collection. + * + * @param items the collection of items to submit to the downsampler for consideration */ public void submit( Collection items ); - /* + /** * Are there items that have survived the downsampling process waiting to be retrieved? 
+ * + * @return true if this downsampler has > 0 finalized items, otherwise false */ - public boolean hasDownsampledItems(); + public boolean hasFinalizedItems(); - /* - * Return (and remove) all items that have survived downsampling and are waiting to be retrieved. + /** + * Return (and *remove*) all items that have survived downsampling and are waiting to be retrieved. + * + * @return a list of all finalized items this downsampler contains, or an empty list if there are none */ - public List consumeDownsampledItems(); + public List consumeFinalizedItems(); - /* + /** * Are there items stored in this downsampler that it doesn't yet know whether they will * ultimately survive the downsampling process? + * + * @return true if this downsampler has > 0 pending items, otherwise false */ public boolean hasPendingItems(); - /* + /** + * Peek at the first finalized item stored in this downsampler (or null if there are no finalized items) + * + * @return the first finalized item in this downsampler (the item is not removed from the downsampler by this call), + * or null if there are none + */ + public T peekFinalized(); + + /** + * Peek at the first pending item stored in this downsampler (or null if there are no pending items) + * + * @return the first pending item stored in this downsampler (the item is not removed from the downsampler by this call), + * or null if there are none + */ + public T peekPending(); + + /** + * Returns the number of items discarded (so far) during the downsampling process + * + * @return the number of items that have been submitted to this downsampler and discarded in the process of + * downsampling + */ + public int getNumberOfDiscardedItems(); + + /** * Used to tell the downsampler that no more items will be submitted to it, and that it should * finalize any pending items. */ public void signalEndOfInput(); - /* - * Reset the downsampler to a clean state, devoid of any pending/downsampled items or tracked state - * information. 
+ /** + * Empty the downsampler of all finalized/pending items */ public void clear(); + + /** + * Reset stats in the downsampler such as the number of discarded items *without* clearing the downsampler of items + */ + public void reset(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java new file mode 100644 index 000000000..ae1d98ce0 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.downsampling; + +import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.utils.exceptions.UserException; + +/** + * Describes the method for downsampling reads at a given locus. + */ + +public class DownsamplingMethod { + /** + * Type of downsampling to perform. + */ + public final DownsampleType type; + + /** + * Actual downsampling target is specified as an integer number of reads. + */ + public final Integer toCoverage; + + /** + * Actual downsampling target is specified as a fraction of total available reads. + */ + public final Double toFraction; + + /** + * Use the new experimental downsampling? + */ + public final boolean useExperimentalDownsampling; + + /** + * Expresses no downsampling applied at all. + */ + public static final DownsamplingMethod NONE = new DownsamplingMethod(DownsampleType.NONE,null,null,false); + + /** + * Default type to use if no type is specified + */ + public static DownsampleType DEFAULT_DOWNSAMPLING_TYPE = DownsampleType.BY_SAMPLE; + + /** + * Default target coverage for locus-based traversals + */ + public static int DEFAULT_LOCUS_BASED_TRAVERSAL_DOWNSAMPLING_COVERAGE = 1000; + + public DownsamplingMethod( DownsampleType type, Integer toCoverage, Double toFraction, boolean useExperimentalDownsampling ) { + this.type = type != null ? 
type : DEFAULT_DOWNSAMPLING_TYPE; + this.toCoverage = toCoverage; + this.toFraction = toFraction; + this.useExperimentalDownsampling = useExperimentalDownsampling; + + if ( type == DownsampleType.NONE ) { + toCoverage = null; + toFraction = null; + } + + validate(); + } + + private void validate() { + // Can't leave toFraction and toCoverage null unless type is NONE + if ( type != DownsampleType.NONE && toFraction == null && toCoverage == null ) + throw new UserException.CommandLineException("Must specify either toFraction or toCoverage when downsampling."); + + // Fraction and coverage cannot both be specified. + if ( toFraction != null && toCoverage != null ) + throw new UserException.CommandLineException("Downsampling coverage and fraction are both specified. Please choose only one."); + + // toCoverage must be > 0 when specified + if ( toCoverage != null && toCoverage <= 0 ) { + throw new UserException.CommandLineException("toCoverage must be > 0 when downsampling to coverage"); + } + + // toFraction must be >= 0.0 and <= 1.0 when specified + if ( toFraction != null && (toFraction < 0.0 || toFraction > 1.0) ) { + throw new UserException.CommandLineException("toFraction must be >= 0.0 and <= 1.0 when downsampling to a fraction of reads"); + } + + // Some restrictions only exist for the old downsampling implementation: + if ( ! 
useExperimentalDownsampling ) { + // By sample downsampling does not work with a fraction of reads in the old downsampling implementation + if( type == DownsampleType.BY_SAMPLE && toFraction != null ) + throw new UserException.CommandLineException("Cannot downsample to fraction with the BY_SAMPLE method"); + } + + // Some restrictions only exist for the new downsampling implementation: + if ( useExperimentalDownsampling ) { + if ( type == DownsampleType.ALL_READS && toCoverage != null ) { + throw new UserException.CommandLineException("Cannot downsample to coverage with the ALL_READS method in the experimental downsampling implementation"); + } + } + } + + public String toString() { + StringBuilder builder = new StringBuilder("Downsampling Settings: "); + + if ( type == DownsampleType.NONE ) { + builder.append("No downsampling"); + } + else { + builder.append(String.format("Method: %s ", type)); + + if ( toCoverage != null ) { + builder.append(String.format("Target Coverage: %d ", toCoverage)); + } + else { + builder.append(String.format("Target Fraction: %.2f ", toFraction)); + } + + if ( useExperimentalDownsampling ) { + builder.append("Using Experimental Downsampling"); + } + } + + return builder.toString(); + } + + public static DownsamplingMethod getDefaultDownsamplingMethod( Walker walker, boolean useExperimentalDownsampling ) { + if ( walker instanceof LocusWalker || walker instanceof ActiveRegionWalker ) { + return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE, DEFAULT_LOCUS_BASED_TRAVERSAL_DOWNSAMPLING_COVERAGE, + null, useExperimentalDownsampling); + } + else { + return new DownsamplingMethod(DownsampleType.NONE, null, null, useExperimentalDownsampling); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIterator.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIterator.java index bccc2e946..c8fbc829c 100644 --- 
a/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIterator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIterator.java @@ -33,7 +33,8 @@ import java.util.NoSuchElementException; /** - * StingSAMIterator wrapper around our generic reads downsampler interface + * StingSAMIterator wrapper around our generic reads downsampler interface. Converts the push-style + * downsampler interface to a pull model. * * @author David Roazen */ @@ -42,35 +43,50 @@ public class DownsamplingReadsIterator implements StingSAMIterator { private StingSAMIterator nestedSAMIterator; private ReadsDownsampler downsampler; private Collection downsampledReadsCache; - private Iterator downsampledReadsCacheIterator; + private SAMRecord nextRead = null; + private Iterator downsampledReadsCacheIterator = null; + /** + * @param iter wrapped iterator from which this iterator will pull reads + * @param downsampler downsampler through which the reads will be fed + */ public DownsamplingReadsIterator( StingSAMIterator iter, ReadsDownsampler downsampler ) { nestedSAMIterator = iter; this.downsampler = downsampler; - fillDownsampledReadsCache(); + + advanceToNextRead(); } public boolean hasNext() { - if ( downsampledReadsCacheIterator.hasNext() ) { - return true; - } - else if ( ! nestedSAMIterator.hasNext() || ! fillDownsampledReadsCache() ) { - return false; - } - - return true; + return nextRead != null; } public SAMRecord next() { - if ( ! downsampledReadsCacheIterator.hasNext() && ! fillDownsampledReadsCache() ) { + if ( nextRead == null ) { throw new NoSuchElementException("next() called when there are no more items"); } - return downsampledReadsCacheIterator.next(); + SAMRecord toReturn = nextRead; + advanceToNextRead(); + + return toReturn; + } + + private void advanceToNextRead() { + if ( ! readyToReleaseReads() && ! 
fillDownsampledReadsCache() ) { + nextRead = null; + } + else { + nextRead = downsampledReadsCacheIterator.next(); + } + } + + private boolean readyToReleaseReads() { + return downsampledReadsCacheIterator != null && downsampledReadsCacheIterator.hasNext(); } private boolean fillDownsampledReadsCache() { - while ( nestedSAMIterator.hasNext() && ! downsampler.hasDownsampledItems() ) { + while ( nestedSAMIterator.hasNext() && ! downsampler.hasFinalizedItems() ) { downsampler.submit(nestedSAMIterator.next()); } @@ -78,7 +94,8 @@ public class DownsamplingReadsIterator implements StingSAMIterator { downsampler.signalEndOfInput(); } - downsampledReadsCache = downsampler.consumeDownsampledItems(); + // use returned collection directly rather than make a copy, for speed + downsampledReadsCache = downsampler.consumeFinalizedItems(); downsampledReadsCacheIterator = downsampledReadsCache.iterator(); return downsampledReadsCacheIterator.hasNext(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsampler.java index d5d529c9f..8901ae525 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsampler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsampler.java @@ -33,7 +33,10 @@ import java.util.Collection; import java.util.List; /** - * Fractional Downsampler: selects a specified fraction of the reads for inclusion + * Fractional Downsampler: selects a specified fraction of the reads for inclusion. + * + * Since the selection is done randomly, the actual fraction of reads retained may be slightly + * more or less than the requested fraction, depending on the total number of reads submitted. 
* * @author David Roazen */ @@ -43,8 +46,16 @@ public class FractionalDownsampler implements ReadsDownsamp private int cutoffForInclusion; + private int numDiscardedItems; + private static final int RANDOM_POOL_SIZE = 10000; + /** + * Construct a FractionalDownsampler + * + * @param fraction Fraction of reads to preserve, between 0.0 (inclusive) and 1.0 (inclusive). + * Actual number of reads preserved may differ randomly. + */ public FractionalDownsampler( double fraction ) { if ( fraction < 0.0 || fraction > 1.0 ) { throw new ReviewedStingException("Fraction of reads to include must be between 0.0 and 1.0, inclusive"); @@ -52,12 +63,16 @@ public class FractionalDownsampler implements ReadsDownsamp cutoffForInclusion = (int)(fraction * RANDOM_POOL_SIZE); clear(); + reset(); } public void submit( T newRead ) { if ( GenomeAnalysisEngine.getRandomGenerator().nextInt(10000) < cutoffForInclusion ) { selectedReads.add(newRead); } + else { + numDiscardedItems++; + } } public void submit( Collection newReads ) { @@ -66,11 +81,12 @@ public class FractionalDownsampler implements ReadsDownsamp } } - public boolean hasDownsampledItems() { + public boolean hasFinalizedItems() { return selectedReads.size() > 0; } - public List consumeDownsampledItems() { + public List consumeFinalizedItems() { + // pass by reference rather than make a copy, for speed List downsampledItems = selectedReads; clear(); return downsampledItems; @@ -80,6 +96,18 @@ public class FractionalDownsampler implements ReadsDownsamp return false; } + public T peekFinalized() { + return selectedReads.isEmpty() ? 
null : selectedReads.get(0); + } + + public T peekPending() { + return null; + } + + public int getNumberOfDiscardedItems() { + return numDiscardedItems; + } + public void signalEndOfInput() { // NO-OP } @@ -88,7 +116,15 @@ public class FractionalDownsampler implements ReadsDownsamp selectedReads = new ArrayList(); } + public void reset() { + numDiscardedItems = 0; + } + public boolean requiresCoordinateSortOrder() { return false; } + + public void signalNoMoreReadsBefore( T read ) { + // NO-OP + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerFactory.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerFactory.java new file mode 100644 index 000000000..7a7c9e91e --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerFactory.java @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMRecord; + +/** + * Factory for creating FractionalDownsamplers on demand + * + * @author David Roazen + */ +public class FractionalDownsamplerFactory implements ReadsDownsamplerFactory { + + private double fraction; + + public FractionalDownsamplerFactory( double fraction ) { + this.fraction = fraction; + } + + public ReadsDownsampler newInstance() { + return new FractionalDownsampler(fraction); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java new file mode 100644 index 000000000..73d69140d --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import org.broadinstitute.sting.utils.MathUtils; + +import java.util.*; + +/** + * Leveling Downsampler: Given a set of Lists of arbitrary items and a target size, removes items from + * the Lists in an even fashion until the total size of all Lists is <= the target size. Leveling + * does not occur until all Lists have been submitted and signalEndOfInput() is called. + * + * The Lists should be LinkedLists for maximum efficiency during item removal, however other + * kinds of Lists are also accepted (albeit at a slight performance penalty). + * + * Since this downsampler extends the Downsampler interface rather than the ReadsDownsampler interface, + * the Lists need not contain reads. 
However this downsampler may not be wrapped within one of the + * DownsamplingReadsIterators + * + * @param the List type representing the stacks to be leveled + * @param the type of the elements of each List + * + * @author David Roazen + */ +public class LevelingDownsampler, E> implements Downsampler { + + private int targetSize; + + private List groups; + + private boolean groupsAreFinalized; + + private int numDiscardedItems; + + /** + * Construct a LevelingDownsampler + * + * @param targetSize the sum of the sizes of all individual Lists this downsampler is fed may not exceed + * this value -- if it does, items are removed from Lists evenly until the total size + * is <= this value + */ + public LevelingDownsampler( int targetSize ) { + this.targetSize = targetSize; + clear(); + reset(); + } + + public void submit( T item ) { + groups.add(item); + } + + public void submit( Collection items ){ + groups.addAll(items); + } + + public boolean hasFinalizedItems() { + return groupsAreFinalized && groups.size() > 0; + } + + public List consumeFinalizedItems() { + if ( ! hasFinalizedItems() ) { + return new ArrayList(); + } + + // pass by reference rather than make a copy, for speed + List toReturn = groups; + clear(); + return toReturn; + } + + public boolean hasPendingItems() { + return ! groupsAreFinalized && groups.size() > 0; + } + + public T peekFinalized() { + return hasFinalizedItems() ? groups.get(0) : null; + } + + public T peekPending() { + return hasPendingItems() ? 
groups.get(0) : null; + } + + public int getNumberOfDiscardedItems() { + return numDiscardedItems; + } + + public void signalEndOfInput() { + levelGroups(); + groupsAreFinalized = true; + } + + public void clear() { + groups = new ArrayList(); + groupsAreFinalized = false; + } + + public void reset() { + numDiscardedItems = 0; + } + + private void levelGroups() { + int totalSize = 0; + int[] groupSizes = new int[groups.size()]; + int currentGroupIndex = 0; + + for ( T group : groups ) { + groupSizes[currentGroupIndex] = group.size(); + totalSize += groupSizes[currentGroupIndex]; + currentGroupIndex++; + } + + if ( totalSize <= targetSize ) { + return; // no need to eliminate any items + } + + // We will try to remove exactly this many items, however we will refuse to allow any + // one group to fall below size 1, and so might end up removing fewer items than this + int numItemsToRemove = totalSize - targetSize; + + currentGroupIndex = 0; + int numConsecutiveUmodifiableGroups = 0; + + // Continue until we've either removed all the items we wanted to, or we can't + // remove any more items without violating the constraint that all groups must + // be left with at least one item + while ( numItemsToRemove > 0 && numConsecutiveUmodifiableGroups < groupSizes.length ) { + if ( groupSizes[currentGroupIndex] > 1 ) { + groupSizes[currentGroupIndex]--; + numItemsToRemove--; + numConsecutiveUmodifiableGroups = 0; + } + else { + numConsecutiveUmodifiableGroups++; + } + + currentGroupIndex = (currentGroupIndex + 1) % groupSizes.length; + } + + // Now we actually go through and reduce each group to its new count as specified in groupSizes + currentGroupIndex = 0; + for ( T group : groups ) { + downsampleOneGroup(group, groupSizes[currentGroupIndex]); + currentGroupIndex++; + } + } + + private void downsampleOneGroup( T group, int numItemsToKeep ) { + if ( numItemsToKeep >= group.size() ) { + return; + } + + numDiscardedItems += group.size() - numItemsToKeep; + + BitSet 
itemsToKeep = new BitSet(group.size()); + for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(group.size(), numItemsToKeep) ) { + itemsToKeep.set(selectedIndex); + } + + int currentIndex = 0; + + // If our group is a linked list, we can remove the desired items in a single O(n) pass with an iterator + if ( group instanceof LinkedList ) { + Iterator iter = group.iterator(); + while ( iter.hasNext() ) { + iter.next(); + + if ( ! itemsToKeep.get(currentIndex) ) { + iter.remove(); + } + + currentIndex++; + } + } + // If it's not a linked list, it's more efficient to copy the desired items into a new list and back rather + // than suffer O(n^2) of item shifting + else { + List keptItems = new ArrayList(numItemsToKeep); + + for ( E item : group ) { + if ( itemsToKeep.get(currentIndex) ) { + keptItems.add(item); + } + currentIndex++; + } + group.clear(); + group.addAll(keptItems); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIterator.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIterator.java new file mode 100644 index 000000000..8b2034460 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIterator.java @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMRecordComparator; +import net.sf.samtools.SAMRecordCoordinateComparator; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; + +import java.util.*; + + +/** + * StingSAMIterator wrapper around our generic reads downsampler interface + * that downsamples reads for each sample independently, and then re-assembles + * the reads back into a single merged stream. + * + * @author David Roazen + */ +public class PerSampleDownsamplingReadsIterator implements StingSAMIterator { + + private StingSAMIterator nestedSAMIterator; + private ReadsDownsamplerFactory downsamplerFactory; + private Map> perSampleDownsamplers; + private PriorityQueue orderedDownsampledReadsCache; + private SAMRecord nextRead = null; + private SAMRecordComparator readComparator = new SAMRecordCoordinateComparator(); + private SAMRecord earliestPendingRead = null; + private ReadsDownsampler earliestPendingDownsampler = null; + + // Initial size of our cache of finalized reads + private static final int DOWNSAMPLED_READS_INITIAL_CACHE_SIZE = 4096; + + // The number of positional changes that can occur in the read stream before all downsamplers + // should be informed of the current position (guards against samples with relatively sparse reads + // getting stuck in a pending state): + private static final int DOWNSAMPLER_POSITIONAL_UPDATE_INTERVAL = 3; // TODO: experiment with this value 
+ + /** + * @param iter wrapped iterator from which this iterator will pull reads + * @param downsamplerFactory factory used to create new downsamplers as needed + */ + public PerSampleDownsamplingReadsIterator( StingSAMIterator iter, ReadsDownsamplerFactory downsamplerFactory ) { + nestedSAMIterator = iter; + this.downsamplerFactory = downsamplerFactory; + perSampleDownsamplers = new HashMap>(); + orderedDownsampledReadsCache = new PriorityQueue(DOWNSAMPLED_READS_INITIAL_CACHE_SIZE, readComparator); + + advanceToNextRead(); + } + + public boolean hasNext() { + return nextRead != null; + } + + public SAMRecord next() { + if ( nextRead == null ) { + throw new NoSuchElementException("next() called when there are no more items"); + } + + SAMRecord toReturn = nextRead; + advanceToNextRead(); + + return toReturn; + } + + private void advanceToNextRead() { + if ( ! readyToReleaseReads() && ! fillDownsampledReadsCache() ) { + nextRead = null; + } + else { + nextRead = orderedDownsampledReadsCache.poll(); + } + } + + private boolean readyToReleaseReads() { + if ( orderedDownsampledReadsCache.isEmpty() ) { + return false; + } + + return earliestPendingRead == null || + readComparator.compare(orderedDownsampledReadsCache.peek(), earliestPendingRead) <= 0; + } + + private void updateEarliestPendingRead( ReadsDownsampler currentDownsampler ) { + // If there is no recorded earliest pending read and this downsampler has pending items, + // then this downsampler's first pending item becomes the new earliest pending read: + if ( earliestPendingRead == null && currentDownsampler.hasPendingItems() ) { + earliestPendingRead = currentDownsampler.peekPending(); + earliestPendingDownsampler = currentDownsampler; + } + // In all other cases, we only need to update the earliest pending read when the downsampler + // associated with it experiences a change in its pending reads, since by assuming a sorted + // read stream we're assured that each downsampler's earliest pending read will only 
increase + // in genomic position over time. + // + // TODO: An occasional O(samples) linear search seems like a better option than keeping the downsamplers + // TODO: sorted by earliest pending read, which would cost at least O(total_reads * (samples + log(samples))), + // TODO: but need to verify this empirically. + else if ( currentDownsampler == earliestPendingDownsampler && + (! currentDownsampler.hasPendingItems() || readComparator.compare(currentDownsampler.peekPending(), earliestPendingRead) != 0) ) { + + earliestPendingRead = null; + earliestPendingDownsampler = null; + for ( ReadsDownsampler perSampleDownsampler : perSampleDownsamplers.values() ) { + if ( perSampleDownsampler.hasPendingItems() && + (earliestPendingRead == null || readComparator.compare(perSampleDownsampler.peekPending(), earliestPendingRead) < 0) ) { + + earliestPendingRead = perSampleDownsampler.peekPending(); + earliestPendingDownsampler = perSampleDownsampler; + } + } + } + } + + private boolean fillDownsampledReadsCache() { + SAMRecord prevRead = null; + int numPositionalChanges = 0; + + // Continue submitting reads to the per-sample downsamplers until the read at the top of the priority queue + // can be released without violating global sort order + while ( nestedSAMIterator.hasNext() && ! readyToReleaseReads() ) { + SAMRecord read = nestedSAMIterator.next(); + String sampleName = read.getReadGroup() != null ? 
read.getReadGroup().getSample() : null; + + ReadsDownsampler thisSampleDownsampler = perSampleDownsamplers.get(sampleName); + if ( thisSampleDownsampler == null ) { + thisSampleDownsampler = downsamplerFactory.newInstance(); + perSampleDownsamplers.put(sampleName, thisSampleDownsampler); + } + + thisSampleDownsampler.submit(read); + updateEarliestPendingRead(thisSampleDownsampler); + + if ( prevRead != null && prevRead.getAlignmentStart() != read.getAlignmentStart() ) { + numPositionalChanges++; + } + + // If the number of times we've changed position exceeds a certain threshold, inform all + // downsamplers of the current position in the read stream. This is to prevent downsamplers + // for samples with sparser reads than others from getting stuck too long in a pending state. + if ( numPositionalChanges > DOWNSAMPLER_POSITIONAL_UPDATE_INTERVAL ) { + for ( ReadsDownsampler perSampleDownsampler : perSampleDownsamplers.values() ) { + perSampleDownsampler.signalNoMoreReadsBefore(read); + updateEarliestPendingRead(perSampleDownsampler); + } + } + + prevRead = read; + } + + if ( ! 
nestedSAMIterator.hasNext() ) { + for ( ReadsDownsampler perSampleDownsampler : perSampleDownsamplers.values() ) { + perSampleDownsampler.signalEndOfInput(); + } + earliestPendingRead = null; + earliestPendingDownsampler = null; + } + + for ( ReadsDownsampler perSampleDownsampler : perSampleDownsamplers.values() ) { + if ( perSampleDownsampler.hasFinalizedItems() ) { + orderedDownsampledReadsCache.addAll(perSampleDownsampler.consumeFinalizedItems()); + } + } + + return readyToReleaseReads(); + } + + public void remove() { + throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); + } + + public void close() { + nestedSAMIterator.close(); + } + + public Iterator iterator() { + return this; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/PositionalDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/PositionalDownsampler.java deleted file mode 100644 index f29c7728c..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/PositionalDownsampler.java +++ /dev/null @@ -1,259 +0,0 @@ -/* - * Copyright (c) 2012, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.downsampling; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.util.*; - -/** - * Positional Downsampler: When eliminating reads, try to do so evenly based on the alignment start positions - * - * @author David Roazen - */ -public class PositionalDownsampler implements ReadsDownsampler { - - private int targetCoverage; - - private ReservoirDownsampler reservoir; - - private int currentContigIndex; - - private int currentAlignmentStart; - - private LinkedList pendingReads; - - private ArrayList finalizedReads; - - public PositionalDownsampler ( int targetCoverage ) { - this.targetCoverage = targetCoverage; - clear(); - } - - public void submit ( T newRead ) { - if ( readIsPastCurrentPosition(newRead) ) { - updateAndDownsamplePendingReads(); - } - - reservoir.submit(newRead); - updateCurrentPosition(newRead); - } - - public void submit ( Collection newReads ) { - for ( T read : newReads ) { - submit(read); - } - } - - public boolean hasDownsampledItems() { - return finalizedReads.size() > 0; - } - - public List consumeDownsampledItems() { - List toReturn = finalizedReads; - finalizedReads = new ArrayList(); - return toReturn; - } - - public boolean hasPendingItems() { - return pendingReads.size() > 0; - } - - public void signalEndOfInput() { - updateAndDownsamplePendingReads(); - - for ( PositionalReadGrouping group : pendingReads ) { - group.finalizeAllActiveReads(); - finalizedReads.addAll(group.getFinalizedReads()); - } - - pendingReads.clear(); - } - - public void clear() { - reservoir = new 
ReservoirDownsampler(targetCoverage); - pendingReads = new LinkedList(); - finalizedReads = new ArrayList(); - } - - public boolean requiresCoordinateSortOrder() { - return true; - } - - private void updateCurrentPosition ( T read ) { - currentContigIndex = read.getReferenceIndex(); - currentAlignmentStart = read.getAlignmentStart(); - } - - private boolean readIsPastCurrentPosition ( T read ) { - return read.getReferenceIndex() != currentContigIndex || read.getAlignmentStart() > currentAlignmentStart; - } - - private void updateAndDownsamplePendingReads() { - finalizeOutOfScopeReads(); - - List oldLocusReads = reservoir.consumeDownsampledItems(); - pendingReads.add(new PositionalReadGrouping(oldLocusReads, currentContigIndex, currentAlignmentStart)); - - downsampleOverlappingGroups(); - } - - private void finalizeOutOfScopeReads() { - Iterator iter = pendingReads.iterator(); - boolean noPrecedingUnfinalizedGroups = true; - - while ( iter.hasNext() ) { - PositionalReadGrouping currentGroup = iter.next(); - currentGroup.finalizeActiveReadsBeforePosition(currentContigIndex, currentAlignmentStart); - - if ( currentGroup.isFinalized() && noPrecedingUnfinalizedGroups ) { - iter.remove(); - finalizedReads.addAll(currentGroup.getFinalizedReads()); - } - else { - noPrecedingUnfinalizedGroups = false; - } - } - } - - private void downsampleOverlappingGroups() { - int[] groupReadCounts = new int[pendingReads.size()]; - int totalCoverage = 0; - int numActiveGroups = 0; - int currentGroup = 0; - - for ( PositionalReadGrouping group : pendingReads ) { - groupReadCounts[currentGroup] = group.numActiveReads(); - totalCoverage += groupReadCounts[currentGroup]; - - if ( groupReadCounts[currentGroup] > 0 ) { - numActiveGroups++; - } - - currentGroup++; - } - - if ( totalCoverage <= targetCoverage ) { - return; - } - - int numReadsToRemove = Math.min(totalCoverage - targetCoverage, totalCoverage - numActiveGroups); - currentGroup = 0; - - while ( numReadsToRemove > 0 ) { - if ( 
groupReadCounts[currentGroup] > 1 ) { - groupReadCounts[currentGroup]--; - numReadsToRemove--; - } - - currentGroup = (currentGroup + 1) % groupReadCounts.length; - } - - currentGroup = 0; - for ( PositionalReadGrouping group : pendingReads ) { - if ( ! group.isFinalized() ) { - group.downsampleActiveReads(groupReadCounts[currentGroup]); - } - currentGroup++; - } - } - - private class PositionalReadGrouping { - private List activeReads; - private List finalizedReads; - - private int contig; - private int alignmentStart; - - public PositionalReadGrouping( Collection reads, int contig, int alignmentStart ) { - activeReads = new LinkedList(reads); - finalizedReads = new ArrayList(); - this.contig = contig; - this.alignmentStart = alignmentStart; - } - - public int numActiveReads() { - return activeReads.size(); - } - - public boolean isFinalized() { - return activeReads.size() == 0; - } - - public List getFinalizedReads() { - return finalizedReads; - } - - public void finalizeActiveReadsBeforePosition( int contig, int position ) { - if ( this.contig != contig ) { - finalizeAllActiveReads(); - return; - } - - Iterator iter = activeReads.iterator(); - - while ( iter.hasNext() ) { - T read = iter.next(); - if ( read.getAlignmentEnd() < position ) { - iter.remove(); - finalizedReads.add(read); - } - } - } - - public void finalizeAllActiveReads() { - finalizedReads.addAll(activeReads); - activeReads.clear(); - } - - public void downsampleActiveReads( int numReadsToKeep ) { - if ( numReadsToKeep > activeReads.size() || numReadsToKeep < 0 ) { - throw new ReviewedStingException(String.format("Cannot retain %d reads out of %d total reads", - numReadsToKeep, activeReads.size())); - } - - BitSet itemsToKeep = new BitSet(activeReads.size()); - for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(activeReads.size(), numReadsToKeep) ) { - itemsToKeep.set(selectedIndex); - } - - int currentIndex = 0; - Iterator iter = activeReads.iterator(); - - while ( 
iter.hasNext() ) { - T read = iter.next(); - - if ( ! itemsToKeep.get(currentIndex) ) { - iter.remove(); - } - - currentIndex++; - } - } - - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsampler.java index f78aaf4bf..3ff6f4454 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsampler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsampler.java @@ -33,8 +33,23 @@ import net.sf.samtools.SAMRecord; */ public interface ReadsDownsampler extends Downsampler { - /* + /** * Does this downsampler require that reads be fed to it in coordinate order? + * + * @return true if reads must be submitted to this downsampler in coordinate order, otherwise false */ public boolean requiresCoordinateSortOrder(); + + /** + * Tell this downsampler that no more reads located before the provided read (according to + * the sort order of the read stream) will be fed to it. + * + * Allows position-aware downsamplers to finalize pending reads earlier than they would + * otherwise be able to, particularly when doing per-sample downsampling and reads for + * certain samples are sparser than average. 
+ * + * @param read the downsampler will assume that no reads located before this read will ever + * be submitted to it in the future + */ + public void signalNoMoreReadsBefore( T read ); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsamplerFactory.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsamplerFactory.java new file mode 100644 index 000000000..2fa32497b --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsamplerFactory.java @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMRecord; + +/** + * A ReadsDownsamplerFactory can be used to create an arbitrary number of instances of a particular + * downsampler, all sharing the same construction parameters. 
+ * + * @author David Roazen + */ +public interface ReadsDownsamplerFactory { + public ReadsDownsampler newInstance(); +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java index cb40c7042..bab4734c4 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java @@ -48,6 +48,14 @@ public class ReservoirDownsampler implements ReadsDownsampl private int totalReadsSeen; + private int numDiscardedItems; + + /** + * Construct a ReservoirDownsampler + * + * @param targetSampleSize Size of the reservoir used by this downsampler. Number of items retained + * after downsampling will be min(totalReads, targetSampleSize) + */ public ReservoirDownsampler ( int targetSampleSize ) { if ( targetSampleSize <= 0 ) { throw new ReviewedStingException("Cannot do reservoir downsampling with a sample size <= 0"); @@ -55,6 +63,7 @@ public class ReservoirDownsampler implements ReadsDownsampl this.targetSampleSize = targetSampleSize; clear(); + reset(); } public void submit ( T newRead ) { @@ -68,6 +77,7 @@ public class ReservoirDownsampler implements ReadsDownsampl if ( randomSlot < targetSampleSize ) { reservoir.set(randomSlot, newRead); } + numDiscardedItems++; } } @@ -77,11 +87,12 @@ public class ReservoirDownsampler implements ReadsDownsampl } } - public boolean hasDownsampledItems() { + public boolean hasFinalizedItems() { return reservoir.size() > 0; } - public List consumeDownsampledItems() { + public List consumeFinalizedItems() { + // pass by reference rather than make a copy, for speed List downsampledItems = reservoir; clear(); return downsampledItems; @@ -91,16 +102,36 @@ public class ReservoirDownsampler implements ReadsDownsampl return false; } + public T peekFinalized() { + return reservoir.isEmpty() ? 
null : reservoir.get(0); + } + + public T peekPending() { + return null; + } + + public int getNumberOfDiscardedItems() { + return numDiscardedItems; + } + public void signalEndOfInput() { // NO-OP } public void clear() { reservoir = new ArrayList(targetSampleSize); - totalReadsSeen = 0; + totalReadsSeen = 0; // an internal stat used by the downsampling process, so not cleared by reset() below + } + + public void reset() { + numDiscardedItems = 0; } public boolean requiresCoordinateSortOrder() { return false; } + + public void signalNoMoreReadsBefore( T read ) { + // NO-OP + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerFactory.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerFactory.java new file mode 100644 index 000000000..040f0c788 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerFactory.java @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMRecord; + +/** + * Factory for creating ReservoirDownsamplers on demand + * + * @author David Roazen + */ +public class ReservoirDownsamplerFactory implements ReadsDownsamplerFactory { + + private int targetSampleSize; + + public ReservoirDownsamplerFactory( int targetSampleSize ) { + this.targetSampleSize = targetSampleSize; + } + + public ReadsDownsampler newInstance() { + return new ReservoirDownsampler(targetSampleSize); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsampler.java new file mode 100644 index 000000000..30affc2b3 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsampler.java @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMRecord; + +import java.util.*; + +/** + * Simple Positional Downsampler: Downsample each stack of reads at each alignment start to a size <= a target coverage + * using a Reservoir downsampler. Stores only O(target coverage) reads in memory at any given time. + * + * @author David Roazen + */ +public class SimplePositionalDownsampler implements ReadsDownsampler { + + private int targetCoverage; + + private ReservoirDownsampler reservoir; + + private int currentContigIndex; + + private int currentAlignmentStart; + + private boolean positionEstablished; + + private boolean unmappedReadsReached; + + private ArrayList finalizedReads; + + private int numDiscardedItems; + + /** + * Construct a SimplePositionalDownsampler + * + * @param targetCoverage Maximum number of reads that may share any given alignment start position + */ + public SimplePositionalDownsampler( int targetCoverage ) { + this.targetCoverage = targetCoverage; + reservoir = new ReservoirDownsampler(targetCoverage); + finalizedReads = new ArrayList(); + clear(); + reset(); + } + + public void submit( T newRead ) { + updatePositionalState(newRead); + + if ( unmappedReadsReached ) { // don't downsample the unmapped reads at the end of the stream + finalizedReads.add(newRead); + } + else { + int reservoirPreviouslyDiscardedItems = reservoir.getNumberOfDiscardedItems(); + reservoir.submit(newRead); + numDiscardedItems += 
reservoir.getNumberOfDiscardedItems() - reservoirPreviouslyDiscardedItems; + } + } + + public void submit( Collection newReads ) { + for ( T read : newReads ) { + submit(read); + } + } + + public boolean hasFinalizedItems() { + return finalizedReads.size() > 0; + } + + public List consumeFinalizedItems() { + // pass by reference rather than make a copy, for speed + List toReturn = finalizedReads; + finalizedReads = new ArrayList(); + return toReturn; + } + + public boolean hasPendingItems() { + return reservoir.hasFinalizedItems(); + } + + public T peekFinalized() { + return finalizedReads.isEmpty() ? null : finalizedReads.get(0); + } + + public T peekPending() { + return reservoir.peekFinalized(); + } + + public int getNumberOfDiscardedItems() { + return numDiscardedItems; + } + + public void signalEndOfInput() { + finalizeReservoir(); + } + + public void clear() { + reservoir.clear(); + reservoir.reset(); + finalizedReads.clear(); + positionEstablished = false; + unmappedReadsReached = false; + } + + public void reset() { + numDiscardedItems = 0; + } + + public boolean requiresCoordinateSortOrder() { + return true; + } + + public void signalNoMoreReadsBefore( T read ) { + updatePositionalState(read); + } + + private void updatePositionalState( T newRead ) { + if ( readIsPastCurrentPosition(newRead) ) { + if ( reservoir.hasFinalizedItems() ) { + finalizeReservoir(); + } + + setCurrentPosition(newRead); + + if ( newRead.getReadUnmappedFlag() ) { + unmappedReadsReached = true; + } + } + } + + private void setCurrentPosition( T read ) { + currentContigIndex = read.getReferenceIndex(); + currentAlignmentStart = read.getAlignmentStart(); + positionEstablished = true; + } + + private boolean readIsPastCurrentPosition( T read ) { + return ! positionEstablished || + read.getReferenceIndex() > currentContigIndex || + read.getAlignmentStart() > currentAlignmentStart || + (read.getReadUnmappedFlag() && ! 
unmappedReadsReached); + } + + private void finalizeReservoir() { + finalizedReads.addAll(reservoir.consumeFinalizedItems()); + reservoir.reset(); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerFactory.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerFactory.java new file mode 100644 index 000000000..fcc18b16b --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerFactory.java @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMRecord; + +/** + * Factory for creating SimplePositionalDownsamplers on demand + * + * @author David Roazen + */ +public class SimplePositionalDownsamplerFactory implements ReadsDownsamplerFactory { + + private int targetCoverage; + + public SimplePositionalDownsamplerFactory( int targetCoverage ) { + this.targetCoverage = targetCoverage; + } + + public ReadsDownsampler newInstance() { + return new SimplePositionalDownsampler(targetCoverage); + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java index 9198d210d..f1d2f7b5b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java @@ -8,6 +8,7 @@ import org.broadinstitute.sting.gatk.datasources.reads.Shard; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.io.OutputTracker; import org.broadinstitute.sting.gatk.io.ThreadLocalOutputTracker; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -76,21 +77,21 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar /** * Create a new hierarchical microscheduler to process the given reads and reference. * - * @param walker the walker used to process the dataset. - * @param reads Reads file(s) to process. - * @param reference Reference for driving the traversal. 
- * @param nThreadsToUse maximum number of threads to use to do the work + * @param walker the walker used to process the dataset. + * @param reads Reads file(s) to process. + * @param reference Reference for driving the traversal. + * @param threadAllocation How should we apply multi-threaded execution? */ protected HierarchicalMicroScheduler(final GenomeAnalysisEngine engine, final Walker walker, final SAMDataSource reads, final IndexedFastaSequenceFile reference, final Collection rods, - final int nThreadsToUse, - final boolean monitorThreadPerformance ) { - super(engine, walker, reads, reference, rods, nThreadsToUse); + final ThreadAllocation threadAllocation) { + super(engine, walker, reads, reference, rods, threadAllocation); - if ( monitorThreadPerformance ) { + final int nThreadsToUse = threadAllocation.getNumDataThreads(); + if ( threadAllocation.monitorThreadEfficiency() ) { final EfficiencyMonitoringThreadFactory monitoringThreadFactory = new EfficiencyMonitoringThreadFactory(nThreadsToUse); setThreadEfficiencyMonitor(monitoringThreadFactory); this.threadPool = Executors.newFixedThreadPool(nThreadsToUse, monitoringThreadFactory); diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index 5bcb16c94..ceb4a6f9b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -10,6 +10,7 @@ import org.broadinstitute.sting.gatk.datasources.reads.Shard; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.io.DirectOutputTracker; import org.broadinstitute.sting.gatk.io.OutputTracker; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.gatk.traversals.TraverseActiveRegions; import 
org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.SampleUtils; @@ -39,13 +40,11 @@ public class LinearMicroScheduler extends MicroScheduler { final SAMDataSource reads, final IndexedFastaSequenceFile reference, final Collection rods, - final int numThreads, // may be > 1 if are nanoScheduling - final boolean monitorThreadPerformance ) { - super(engine, walker, reads, reference, rods, numThreads); + final ThreadAllocation threadAllocation) { + super(engine, walker, reads, reference, rods, threadAllocation); - if ( monitorThreadPerformance ) + if ( threadAllocation.monitorThreadEfficiency() ) setThreadEfficiencyMonitor(new ThreadEfficiencyMonitor()); - } /** @@ -60,11 +59,12 @@ public class LinearMicroScheduler extends MicroScheduler { boolean done = walker.isDone(); int counter = 0; + + traversalEngine.startTimersIfNecessary(); for (Shard shard : shardStrategy ) { if ( done || shard == null ) // we ran out of shards that aren't owned break; - traversalEngine.startTimersIfNecessary(); if(shard.getShardType() == Shard.ShardType.LOCUS) { WindowMaker windowMaker = new WindowMaker(shard, engine.getGenomeLocParser(), getReadIterator(shard), shard.getGenomeLocs(), SampleUtils.getSAMFileSamples(engine)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 417a0982f..c6ef9acf1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -59,6 +59,8 @@ import java.util.Collection; /** Shards and schedules data in manageable chunks. 
*/ public abstract class MicroScheduler implements MicroSchedulerMBean { + // TODO -- remove me and retire non nano scheduled versions of traversals + private final static boolean USE_NANOSCHEDULER_FOR_EVERYTHING = true; protected static final Logger logger = Logger.getLogger(MicroScheduler.class); /** @@ -100,27 +102,36 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { * @return The best-fit microscheduler. */ public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods, ThreadAllocation threadAllocation) { - if (threadAllocation.getNumCPUThreads() > 1) { + if ( threadAllocation.isRunningInParallelMode() ) { + // TODO -- remove me when we fix running NCT within HMS + if ( threadAllocation.getNumDataThreads() > 1 && threadAllocation.getNumCPUThreadsPerDataThread() > 1) + throw new UserException("Currently the GATK does not support running CPU threads within data threads, " + + "please specify only one of NT and NCT"); + + logger.info(String.format("Running the GATK in parallel mode with %d CPU thread(s) for each of %d data thread(s)", + threadAllocation.getNumCPUThreadsPerDataThread(), threadAllocation.getNumDataThreads())); + } + + if ( threadAllocation.getNumDataThreads() > 1 ) { if (walker.isReduceByInterval()) throw new UserException.BadArgumentValue("nt", String.format("The analysis %s aggregates results by interval. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); - logger.info(String.format("Running the GATK in parallel mode with %d concurrent threads",threadAllocation.getNumCPUThreads())); - - if ( walker instanceof ReadWalker ) { - if ( ! 
(walker instanceof ThreadSafeMapReduce) ) badNT(engine, walker); - return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads(), threadAllocation.monitorThreadEfficiency()); + if ( ! (walker instanceof TreeReducible) ) { + throw badNT("nt", engine, walker); } else { - // TODO -- update test for when nano scheduling only is an option - if ( ! (walker instanceof TreeReducible) ) badNT(engine, walker); - return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads(), threadAllocation.monitorThreadEfficiency()); + return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation); } } else { - return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads(), threadAllocation.monitorThreadEfficiency()); + if ( threadAllocation.getNumCPUThreadsPerDataThread() > 1 && ! (walker instanceof NanoSchedulable) ) + throw badNT("nct", engine, walker); + return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation); } } - private static void badNT(final GenomeAnalysisEngine engine, final Walker walker) { - throw new UserException.BadArgumentValue("nt", String.format("The analysis %s currently does not support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); + private static UserException badNT(final String parallelArg, final GenomeAnalysisEngine engine, final Walker walker) { + throw new UserException.BadArgumentValue(parallelArg, // blame the option the caller named (nt or nct), never a fixed one + String.format("The analysis %s currently does not support parallel execution with %s. " + + "Please run your analysis without the %s option.", engine.getWalkerName(walker.getClass()), parallelArg, parallelArg)); } /** @@ -130,23 +141,27 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { * @param reads The reads. * @param reference The reference. 
* @param rods the rods to include in the traversal - * @param numThreads the number of threads we are using in the underlying traversal + * @param threadAllocation the allocation of threads to use in the underlying traversal */ protected MicroScheduler(final GenomeAnalysisEngine engine, final Walker walker, final SAMDataSource reads, final IndexedFastaSequenceFile reference, final Collection rods, - final int numThreads) { + final ThreadAllocation threadAllocation) { this.engine = engine; this.reads = reads; this.reference = reference; this.rods = rods; if (walker instanceof ReadWalker) { - traversalEngine = numThreads > 1 ? new TraverseReadsNano(numThreads) : new TraverseReads(); + traversalEngine = USE_NANOSCHEDULER_FOR_EVERYTHING || threadAllocation.getNumCPUThreadsPerDataThread() > 1 + ? new TraverseReadsNano(threadAllocation.getNumCPUThreadsPerDataThread()) + : new TraverseReads(); } else if (walker instanceof LocusWalker) { - traversalEngine = new TraverseLoci(); + traversalEngine = USE_NANOSCHEDULER_FOR_EVERYTHING || threadAllocation.getNumCPUThreadsPerDataThread() > 1 + ? 
new TraverseLociNano(threadAllocation.getNumCPUThreadsPerDataThread()) + : new TraverseLociLinear(); } else if (walker instanceof DuplicateWalker) { traversalEngine = new TraverseDuplicates(); } else if (walker instanceof ReadPairWalker) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java index da11d36dd..6c0dc9769 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java @@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.datasources.reads.Shard; import org.broadinstitute.sting.gatk.iterators.LocusIterator; import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState; +import org.broadinstitute.sting.gatk.iterators.LocusIteratorByStateExperimental; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -81,7 +82,13 @@ public class WindowMaker implements Iterable, I public WindowMaker(Shard shard, GenomeLocParser genomeLocParser, StingSAMIterator iterator, List intervals, Collection sampleNames) { this.sourceInfo = shard.getReadProperties(); this.readIterator = iterator; - this.sourceIterator = new PeekableIterator(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser, sampleNames)); + + // Temporary: use the experimental version of LocusIteratorByState if experimental downsampling was requested: + this.sourceIterator = sourceInfo.getDownsamplingMethod().useExperimentalDownsampling ? + new PeekableIterator(new LocusIteratorByStateExperimental(iterator,sourceInfo,genomeLocParser, sampleNames)) + : + new PeekableIterator(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser, sampleNames)); + this.intervalIterator = intervals.size()>0 ? 
new PeekableIterator(intervals.iterator()) : null; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java b/public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java index bddfa6a0d..5ca8a1779 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java @@ -29,6 +29,7 @@ import com.google.common.base.Function; import com.google.common.collect.Collections2; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.classloader.PluginManager; +import org.broadinstitute.sting.utils.help.GATKDocUtils; import java.util.Collection; import java.util.List; @@ -68,16 +69,29 @@ public class FilterManager extends PluginManager { @Override protected String formatErrorMessage(String pluginCategory, String pluginName) { List> availableFilters = this.getPluginsImplementing(ReadFilter.class); - Collection availableFilterNames = Collections2.transform(availableFilters, new Function,String>(){ - @Override - public String apply(final Class input) { - return getName(input); - } - }); - return String.format("Read filter %s not found. Available read filters:%n%s.%n%n%s",pluginName, - Utils.join(String.format(", "),availableFilterNames), + return String.format("Read filter %s not found. Available read filters:%n%n%s%n%n%s",pluginName, + userFriendlyListofReadFilters(availableFilters), "Please consult the GATK Documentation (http://www.broadinstitute.org/gatk/gatkdocs/) for more information."); } + + private String userFriendlyListofReadFilters(List> filters) { + final String headName = "FilterName", headDoc = "Documentation"; + int longestNameLength = -1; + for ( Class < ? 
extends ReadFilter> filter : filters ) { + longestNameLength = Math.max(longestNameLength,this.getName(filter).length()); + } + String format = " %"+longestNameLength+"s %s%n"; + + StringBuilder listBuilder = new StringBuilder(); + listBuilder.append(String.format(format,headName,headDoc)); + for ( Class filter : filters ) { + String helpLink = GATKDocUtils.helpLinksToGATKDocs(filter); + String filterName = this.getName(filter); + listBuilder.append(String.format(format,filterName,helpLink)); + } + + return listBuilder.toString(); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java index 260a7efda..ee1dc63e6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java @@ -32,9 +32,9 @@ import org.broadinstitute.sting.utils.classloader.JVMUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.writer.Options; import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriterFactory; import java.io.File; @@ -269,7 +269,7 @@ public class VariantContextWriterStub implements Stub, Var * @return */ public boolean alsoWriteBCFForTest() { - return engine.getArguments().numberOfThreads == 1 && // only works single threaded + return engine.getArguments().numberOfDataThreads == 1 && // only works single threaded ! 
isCompressed() && // for non-compressed outputs getFile() != null && // that are going to disk engine.getArguments().generateShadowBCF; // and we actually want to do it diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/DownsampleIterator.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LegacyDownsampleIterator.java similarity index 88% rename from public/java/src/org/broadinstitute/sting/gatk/iterators/DownsampleIterator.java rename to public/java/src/org/broadinstitute/sting/gatk/iterators/LegacyDownsampleIterator.java index 835748ff0..c0de06b49 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/DownsampleIterator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LegacyDownsampleIterator.java @@ -6,13 +6,13 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import java.util.Iterator; -public class DownsampleIterator implements StingSAMIterator { +public class LegacyDownsampleIterator implements StingSAMIterator { StingSAMIterator it; int cutoff; SAMRecord next; - public DownsampleIterator(StingSAMIterator it, double fraction) { + public LegacyDownsampleIterator(StingSAMIterator it, double fraction) { this.it = it; cutoff = (int)(fraction * 10000); next = getNextRecord(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java index 64f914064..46e84798a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java @@ -31,8 +31,8 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.DownsampleType; -import org.broadinstitute.sting.gatk.DownsamplingMethod; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; 
+import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.utils.GenomeLoc; diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimental.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimental.java new file mode 100755 index 000000000..557cbd009 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimental.java @@ -0,0 +1,649 @@ +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.iterators; + +import net.sf.picard.util.PeekableIterator; +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import net.sf.samtools.SAMRecord; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.ReadProperties; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; +import org.broadinstitute.sting.gatk.downsampling.Downsampler; +import org.broadinstitute.sting.gatk.downsampling.LevelingDownsampler; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; + +import java.util.*; + +/** + * Iterator that traverses a SAM File, accumulating information on a per-locus basis + */ +public class LocusIteratorByStateExperimental extends LocusIterator { + /** + * our log, which we want to capture anything from this class + */ + private static Logger logger = Logger.getLogger(LocusIteratorByStateExperimental.class); + + // ----------------------------------------------------------------------------------------------------------------- + // + // member fields + // + // ----------------------------------------------------------------------------------------------------------------- + + /** + * Used to create new GenomeLocs. 
+ */ + private final GenomeLocParser genomeLocParser; + private final ArrayList samples; + private final ReadStateManager readStates; + + protected static class SAMRecordState { + SAMRecord read; + int readOffset = -1; // how far are we offset from the start of the read bases? + int genomeOffset = -1; // how far are we offset from the alignment start on the genome? + + Cigar cigar = null; + int cigarOffset = -1; + CigarElement curElement = null; + int nCigarElements = 0; + + int cigarElementCounter = -1; // how far are we into a single cigarElement + + // The logical model for generating extended events is as follows: the "record state" implements the traversal + // along the reference; thus stepForwardOnGenome() returns on every and only on actual reference bases. This + // can be a (mis)match or a deletion (in the latter case, we still return on every individual reference base the + // deletion spans). In the extended events mode, the record state also remembers if there was an insertion, or + // if the deletion just started *right before* the current reference base the record state is pointing to upon the return from + // stepForwardOnGenome(). The next call to stepForwardOnGenome() will clear that memory (as we remember only extended + // events immediately preceding the current reference base). + + public SAMRecordState(SAMRecord read) { + this.read = read; + cigar = read.getCigar(); + nCigarElements = cigar.numCigarElements(); + + //System.out.printf("Creating a SAMRecordState: %s%n", this); + } + + public SAMRecord getRead() { + return read; + } + + /** + * What is our current offset in the read's bases that aligns us with the reference genome? + * + * @return + */ + public int getReadOffset() { + return readOffset; + } + + /** + * What is the current offset w.r.t. the alignment state that aligns us to the readOffset? 
+ * + * @return + */ + public int getGenomeOffset() { + return genomeOffset; + } + + public int getGenomePosition() { + return read.getAlignmentStart() + getGenomeOffset(); + } + + public GenomeLoc getLocation(GenomeLocParser genomeLocParser) { + return genomeLocParser.createGenomeLoc(read.getReferenceName(), getGenomePosition()); + } + + public CigarOperator getCurrentCigarOperator() { + return curElement.getOperator(); + } + + public String toString() { + return String.format("%s ro=%d go=%d co=%d cec=%d %s", read.getReadName(), readOffset, genomeOffset, cigarOffset, cigarElementCounter, curElement); + } + + public CigarElement peekForwardOnGenome() { + return ( cigarElementCounter + 1 > curElement.getLength() && cigarOffset + 1 < nCigarElements ? cigar.getCigarElement(cigarOffset + 1) : curElement ); + } + + public CigarElement peekBackwardOnGenome() { + return ( cigarElementCounter - 1 == 0 && cigarOffset - 1 > 0 ? cigar.getCigarElement(cigarOffset - 1) : curElement ); + } + + + public CigarOperator stepForwardOnGenome() { + // we enter this method with readOffset = index of the last processed base on the read + // (-1 if we did not process a single base yet); this can be last matching base, or last base of an insertion + + + if (curElement == null || ++cigarElementCounter > curElement.getLength()) { + cigarOffset++; + if (cigarOffset < nCigarElements) { + curElement = cigar.getCigarElement(cigarOffset); + cigarElementCounter = 0; + // next line: guards against cigar elements of length 0; when new cigar element is retrieved, + // we reenter in order to re-check cigarElementCounter against curElement's length + return stepForwardOnGenome(); + } else { + if (curElement != null && curElement.getOperator() == CigarOperator.D) + throw new UserException.MalformedBAM(read, "read ends with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. 
If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar"); + + // Reads that contain indels model the genomeOffset as the following base in the reference. Because + // we fall into this else block only when indels end the read, increment genomeOffset such that the + // current offset of this read is the next ref base after the end of the indel. This position will + // model a point on the reference somewhere after the end of the read. + genomeOffset++; // extended events need that. Logically, it's legal to advance the genomic offset here: + // we do step forward on the ref, and by returning null we also indicate that we are past the read end. + + return null; + } + } + + boolean done = false; + switch (curElement.getOperator()) { + case H: // ignore hard clips + case P: // ignore pads + cigarElementCounter = curElement.getLength(); + break; + case I: // insertion w.r.t. the reference + case S: // soft clip + cigarElementCounter = curElement.getLength(); + readOffset += curElement.getLength(); + break; + case D: // deletion w.r.t. the reference + if (readOffset < 0) // we don't want reads starting with deletion, this is a malformed cigar string + throw new UserException.MalformedBAM(read, "read starts with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar"); + // should be the same as N case + genomeOffset++; + done = true; + break; + case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning) + genomeOffset++; + done = true; + break; + case M: + case EQ: + case X: + readOffset++; + genomeOffset++; + done = true; + break; + default: + throw new IllegalStateException("Case statement didn't deal with cigar op: " + curElement.getOperator()); + } + + return done ? 
curElement.getOperator() : stepForwardOnGenome(); + } + } + + //final boolean DEBUG = false; + //final boolean DEBUG2 = false && DEBUG; + private ReadProperties readInfo; + private AlignmentContext nextAlignmentContext; + private boolean performLevelingDownsampling; + + // ----------------------------------------------------------------------------------------------------------------- + // + // constructors and other basic operations + // + // ----------------------------------------------------------------------------------------------------------------- + + public LocusIteratorByStateExperimental(final Iterator samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, Collection samples) { + this.readInfo = readInformation; + this.genomeLocParser = genomeLocParser; + this.samples = new ArrayList(samples); + // Set this flag before building the ReadStateManager: each PerSampleReadStateManager it creates reads the flag in a field initializer + this.performLevelingDownsampling = readInfo.getDownsamplingMethod() != null && + readInfo.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE && + readInfo.getDownsamplingMethod().toCoverage != null; + this.readStates = new ReadStateManager(samIterator); + + // currently the GATK expects this LocusIteratorByState to accept empty sample lists, when + // there's no read data. So we need to throw this error only when samIterator.hasNext() is true + if (this.samples.isEmpty() && samIterator.hasNext()) { + throw new IllegalArgumentException("samples list must not be empty"); + } + } + + /** + * For testing only. Assumes that the incoming SAMRecords have no read groups, so creates a dummy sample list + * for the system. 
+ */ + public final static Collection sampleListForSAMWithoutReadGroups() { + List samples = new ArrayList(); + samples.add(null); + return samples; + } + + public Iterator iterator() { + return this; + } + + public void close() { + //this.it.close(); + } + + public boolean hasNext() { + lazyLoadNextAlignmentContext(); + return (nextAlignmentContext != null); + //if ( DEBUG ) System.out.printf("hasNext() = %b%n", r); + } + + private GenomeLoc getLocation() { + return readStates.isEmpty() ? null : readStates.getFirst().getLocation(genomeLocParser); + } + + // ----------------------------------------------------------------------------------------------------------------- + // + // next() routine and associated collection operations + // + // ----------------------------------------------------------------------------------------------------------------- + public AlignmentContext next() { + lazyLoadNextAlignmentContext(); + if (!hasNext()) + throw new NoSuchElementException("LocusIteratorByState: out of elements."); + AlignmentContext currentAlignmentContext = nextAlignmentContext; + nextAlignmentContext = null; + return currentAlignmentContext; + } + + /** + * Creates the next alignment context from the given state. Note that this is implemented as a lazy load method. + * nextAlignmentContext MUST BE null in order for this method to advance to the next entry. + */ + private void lazyLoadNextAlignmentContext() { + while (nextAlignmentContext == null && readStates.hasNext()) { + readStates.collectPendingReads(); + + final GenomeLoc location = getLocation(); + final Map fullPileup = new HashMap(); + + // TODO: How can you determine here whether the current pileup has been downsampled? 
+ boolean hasBeenSampled = false; + + for (final String sample : samples) { + final Iterator iterator = readStates.iterator(sample); + final List pile = new ArrayList(readStates.size(sample)); + + int size = 0; // number of elements in this sample's pileup + int nDeletions = 0; // number of deletions in this sample's pileup + int nMQ0Reads = 0; // number of MQ0 reads in this sample's pileup (warning: current implementation includes N bases that are MQ0) + + while (iterator.hasNext()) { + final SAMRecordState state = iterator.next(); // state object with the read/offset information + final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read + final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator + final CigarElement nextElement = state.peekForwardOnGenome(); // next cigar element + final CigarElement lastElement = state.peekBackwardOnGenome(); // last cigar element + final boolean isSingleElementCigar = nextElement == lastElement; + final CigarOperator nextOp = nextElement.getOperator(); // next cigar operator + final CigarOperator lastOp = lastElement.getOperator(); // last cigar operator + int readOffset = state.getReadOffset(); // the base offset on this read + + final boolean isBeforeDeletion = nextOp == CigarOperator.DELETION; + final boolean isAfterDeletion = lastOp == CigarOperator.DELETION; + final boolean isBeforeInsertion = nextOp == CigarOperator.INSERTION; + final boolean isAfterInsertion = lastOp == CigarOperator.INSERTION && !isSingleElementCigar; + final boolean isNextToSoftClip = nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart()); + + int nextElementLength = nextElement.getLength(); + + if (op == CigarOperator.N) // N's are never added to any pileup + continue; + + if (op == CigarOperator.D) { + // TODO -- LIBS is totally busted for deletions so that reads with Ds right before Is in their CIGAR are broken; must fix + if 
(readInfo.includeReadsWithDeletionAtLoci()) { // only add deletions to the pileup if we are authorized to do so + pile.add(new PileupElement(read, readOffset, true, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, null, nextOp == CigarOperator.D ? nextElementLength : -1)); + size++; + nDeletions++; + if (read.getMappingQuality() == 0) + nMQ0Reads++; + } + } + else { + if (!filterBaseInRead(read, location.getStart())) { + String insertedBaseString = null; + if (nextOp == CigarOperator.I) { + final int insertionOffset = isSingleElementCigar ? 0 : 1; + // TODO -- someone please implement a better fix for the single element insertion CIGAR! + if (isSingleElementCigar) + readOffset -= (nextElement.getLength() - 1); // LIBS has passed over the insertion bases! + insertedBaseString = new String(Arrays.copyOfRange(read.getReadBases(), readOffset + insertionOffset, readOffset + insertionOffset + nextElement.getLength())); + } + + pile.add(new PileupElement(read, readOffset, false, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, insertedBaseString, nextElementLength)); + size++; + if (read.getMappingQuality() == 0) + nMQ0Reads++; + } + } + } + + if (pile.size() != 0) // if this pileup added at least one base, add it to the full pileup + fullPileup.put(sample, new ReadBackedPileupImpl(location, pile, size, nDeletions, nMQ0Reads)); + } + + updateReadStates(); // critical - must be called after we get the current state offsets and location + if (!fullPileup.isEmpty()) // if we got reads with non-D/N over the current position, we are done + nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileupImpl(location, fullPileup), hasBeenSampled); + } + } + + // fast testing of position + private boolean readIsPastCurrentPosition(SAMRecord read) { + if (readStates.isEmpty()) + return false; + else { + SAMRecordState state = readStates.getFirst(); + SAMRecord ourRead = state.getRead(); + 
return read.getReferenceIndex() > ourRead.getReferenceIndex() || read.getAlignmentStart() > state.getGenomePosition(); + } + } + + /** + * Generic place to put per-base filters appropriate to LocusIteratorByState + * + * @param rec + * @param pos + * @return + */ + private static boolean filterBaseInRead(GATKSAMRecord rec, long pos) { + return ReadUtils.isBaseInsideAdaptor(rec, pos); + } + + private void updateReadStates() { + for (final String sample : samples) { + Iterator it = readStates.iterator(sample); + while (it.hasNext()) { + SAMRecordState state = it.next(); + CigarOperator op = state.stepForwardOnGenome(); + if (op == null) { + // we discard the read only when we are past its end AND indel at the end of the read (if any) was + // already processed. Keeping the read state that returned null upon stepForwardOnGenome() is safe + // as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag. + it.remove(); // we've stepped off the end of the object + } + } + } + } + + public void remove() { + throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); + } + + protected class ReadStateManager { + private final PeekableIterator iterator; + private final SamplePartitioner samplePartitioner; + private final Map readStatesBySample = new HashMap(); + private int totalReadStates = 0; + + public ReadStateManager(Iterator source) { + this.iterator = new PeekableIterator(source); + + for (final String sample : samples) { + readStatesBySample.put(sample, new PerSampleReadStateManager()); + } + + samplePartitioner = new SamplePartitioner(); + } + + /** + * Returns a iterator over all the reads associated with the given sample. Note that remove() is implemented + * for this iterator; if present, total read states will be decremented. + * + * @param sample The sample. + * @return Iterator over the reads associated with that sample. 
+ */ + public Iterator iterator(final String sample) { + return new Iterator() { + private Iterator wrappedIterator = readStatesBySample.get(sample).iterator(); + + public boolean hasNext() { + return wrappedIterator.hasNext(); + } + + public SAMRecordState next() { + return wrappedIterator.next(); + } + + public void remove() { + wrappedIterator.remove(); + } + }; + } + + public boolean isEmpty() { + return totalReadStates == 0; + } + + /** + * Retrieves the total number of reads in the manager across all samples. + * + * @return Total number of reads over all samples. + */ + public int size() { + return totalReadStates; + } + + /** + * Retrieves the total number of reads in the manager in the given sample. + * + * @param sample The sample. + * @return Total number of reads in the given sample. + */ + public int size(final String sample) { + return readStatesBySample.get(sample).size(); + } + + public SAMRecordState getFirst() { + for (final String sample : samples) { + PerSampleReadStateManager reads = readStatesBySample.get(sample); + if (!reads.isEmpty()) + return reads.peek(); + } + return null; + } + + public boolean hasNext() { + return totalReadStates > 0 || iterator.hasNext(); + } + + public void collectPendingReads() { + if (!iterator.hasNext()) + return; + + if (readStates.size() == 0) { + int firstContigIndex = iterator.peek().getReferenceIndex(); + int firstAlignmentStart = iterator.peek().getAlignmentStart(); + while (iterator.hasNext() && iterator.peek().getReferenceIndex() == firstContigIndex && iterator.peek().getAlignmentStart() == firstAlignmentStart) { + samplePartitioner.submitRead(iterator.next()); + } + } else { + // Fast fail in the case that the read is past the current position. 
+ if (readIsPastCurrentPosition(iterator.peek())) + return; + + while (iterator.hasNext() && !readIsPastCurrentPosition(iterator.peek())) { + samplePartitioner.submitRead(iterator.next()); + } + } + + for (final String sample : samples) { + Collection newReads = samplePartitioner.getReadsForSample(sample); + PerSampleReadStateManager statesBySample = readStatesBySample.get(sample); + addReadsToSample(statesBySample, newReads); + } + + samplePartitioner.reset(); + } + + /** + * Add reads with the given sample name to the given hanger entry. + * + * @param readStates The list of read states to add this collection of reads. + * @param reads Reads to add. Selected reads will be pulled from this source. + */ + private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection reads) { + if (reads.isEmpty()) + return; + + Collection newReadStates = new LinkedList(); + + for (SAMRecord read : reads) { + SAMRecordState state = new SAMRecordState(read); + state.stepForwardOnGenome(); + newReadStates.add(state); + } + + readStates.addStatesAtNextAlignmentStart(newReadStates); + } + + protected class PerSampleReadStateManager implements Iterable { + private List> readStatesByAlignmentStart = new LinkedList>(); + private int thisSampleReadStates = 0; + private Downsampler> levelingDownsampler = + performLevelingDownsampling ? 
+ new LevelingDownsampler, SAMRecordState>(readInfo.getDownsamplingMethod().toCoverage) : + null; + + public void addStatesAtNextAlignmentStart(Collection states) { + if ( states.isEmpty() ) { + return; + } + + readStatesByAlignmentStart.add(new LinkedList(states)); + thisSampleReadStates += states.size(); + totalReadStates += states.size(); + + if ( levelingDownsampler != null ) { + levelingDownsampler.submit(readStatesByAlignmentStart); + levelingDownsampler.signalEndOfInput(); + + thisSampleReadStates -= levelingDownsampler.getNumberOfDiscardedItems(); + totalReadStates -= levelingDownsampler.getNumberOfDiscardedItems(); + + // use returned List directly rather than make a copy, for efficiency's sake + readStatesByAlignmentStart = levelingDownsampler.consumeFinalizedItems(); + levelingDownsampler.reset(); + } + } + + public boolean isEmpty() { + return readStatesByAlignmentStart.isEmpty(); + } + + public SAMRecordState peek() { + return isEmpty() ? null : readStatesByAlignmentStart.get(0).peek(); + } + + public int size() { + return thisSampleReadStates; + } + + public Iterator iterator() { + return new Iterator() { + private Iterator> alignmentStartIterator = readStatesByAlignmentStart.iterator(); + private LinkedList currentPositionReadStates = null; + private Iterator currentPositionReadStatesIterator = null; + + public boolean hasNext() { + return alignmentStartIterator.hasNext() || + (currentPositionReadStatesIterator != null && currentPositionReadStatesIterator.hasNext()); + } + + public SAMRecordState next() { + if ( currentPositionReadStatesIterator == null || ! 
currentPositionReadStatesIterator.hasNext() ) { + currentPositionReadStates = alignmentStartIterator.next(); + currentPositionReadStatesIterator = currentPositionReadStates.iterator(); + } + + return currentPositionReadStatesIterator.next(); + } + + public void remove() { + currentPositionReadStatesIterator.remove(); + thisSampleReadStates--; + totalReadStates--; + + if ( currentPositionReadStates.isEmpty() ) { + alignmentStartIterator.remove(); + } + } + }; + } + } + } + + /** + * Note: stores reads by sample ID string, not by sample object + */ + private class SamplePartitioner { + private Map> readsBySample; + private long readsSeen = 0; + + public SamplePartitioner() { + readsBySample = new HashMap>(); + + for ( String sample : samples ) { + readsBySample.put(sample, new ArrayList()); + } + } + + public void submitRead(SAMRecord read) { + String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; + if (readsBySample.containsKey(sampleName)) + readsBySample.get(sampleName).add(read); + readsSeen++; + } + + public long getNumReadsSeen() { + return readsSeen; + } + + public Collection getReadsForSample(String sampleName) { + if ( ! 
readsBySample.containsKey(sampleName) ) + throw new NoSuchElementException("Sample name not found"); + return readsBySample.get(sampleName); + } + + public void reset() { + for ( Collection perSampleReads : readsBySample.values() ) + perSampleReads.clear(); + readsSeen = 0; + } + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java index 3ffe95e8b..9578bba56 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java @@ -10,13 +10,11 @@ import java.util.Iterator; * Verifies that the incoming stream of reads is correctly sorted */ public class VerifyingSamIterator implements StingSAMIterator { - private GenomeLocParser genomeLocParser; StingSAMIterator it; SAMRecord last = null; boolean checkOrderP = true; - public VerifyingSamIterator(GenomeLocParser genomeLocParser,StingSAMIterator it) { - this.genomeLocParser = genomeLocParser; + public VerifyingSamIterator(StingSAMIterator it) { this.it = it; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java index 6f3f175a2..51fed470f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java @@ -218,7 +218,7 @@ public class GATKRunReport { // if there was an exception, capture it this.mException = e == null ? 
null : new ExceptionToXML(e); - numThreads = engine.getArguments().numberOfThreads; + numThreads = engine.getTotalNumberOfThreads(); percentTimeRunning = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.USER_CPU); percentTimeBlocking = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.BLOCKING); percentTimeWaiting = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.WAITING); diff --git a/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java b/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java index caae55ac5..c86f06c25 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java +++ b/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java @@ -24,7 +24,7 @@ package org.broadinstitute.sting.gatk.resourcemanagement; -import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; /** * Models how threads are distributed between various components of the GATK. @@ -33,7 +33,12 @@ public class ThreadAllocation { /** * The number of CPU threads to be used by the GATK. */ - private final int numCPUThreads; + private final int numDataThreads; + + /** + * The number of CPU threads per data thread for GATK processing + */ + private final int numCPUThreadsPerDataThread; /** * Number of threads to devote exclusively to IO. Default is 0. @@ -45,8 +50,12 @@ public class ThreadAllocation { */ private final boolean monitorEfficiency; - public int getNumCPUThreads() { - return numCPUThreads; + public int getNumDataThreads() { + return numDataThreads; + } + + public int getNumCPUThreadsPerDataThread() { + return numCPUThreadsPerDataThread; } public int getNumIOThreads() { @@ -57,47 +66,50 @@ public class ThreadAllocation { return monitorEfficiency; } + /** + * Are we running in parallel mode? 
+ * + * @return true if any parallel processing is enabled + */ + public boolean isRunningInParallelMode() { + return getTotalNumThreads() > 1; + } + + /** + * What is the total number of threads in use by the GATK? + * + * @return the sum of all thread allocations in this object + */ + public int getTotalNumThreads() { + return getNumDataThreads() * getNumCPUThreadsPerDataThread() + getNumIOThreads(); + } + /** * Construct the default thread allocation. */ public ThreadAllocation() { - this(1, null, null, false); + this(1, 1, 0, false); } /** * Set up the thread allocation. Default allocation is 1 CPU thread, 0 IO threads. * (0 IO threads means that no threads are devoted exclusively to IO; they're inline on the CPU thread). - * @param totalThreads Complete number of threads to allocate. - * @param numCPUThreads Total number of threads allocated to the traversal. + * @param numDataThreads Total number of threads allocated to the traversal. + * @param numCPUThreadsPerDataThread The number of CPU threads per data thread to allocate * @param numIOThreads Total number of threads allocated exclusively to IO. + * @param monitorEfficiency should we monitor threading efficiency in the GATK? */ - public ThreadAllocation(final int totalThreads, final Integer numCPUThreads, final Integer numIOThreads, final boolean monitorEfficiency) { - // If no allocation information is present, allocate all threads to CPU - if(numCPUThreads == null && numIOThreads == null) { - this.numCPUThreads = totalThreads; - this.numIOThreads = 0; - } - // If only CPU threads are specified, allocate remainder to IO (minimum 0 dedicated IO threads). - else if(numIOThreads == null) { - if(numCPUThreads > totalThreads) - throw new UserException(String.format("Invalid thread allocation. 
User requested %d threads in total, but the count of cpu threads (%d) is higher than the total threads",totalThreads,numCPUThreads)); - this.numCPUThreads = numCPUThreads; - this.numIOThreads = totalThreads - numCPUThreads; - } - // If only IO threads are specified, allocate remainder to CPU (minimum 1 dedicated CPU thread). - else if(numCPUThreads == null) { - if(numIOThreads > totalThreads) - throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of io threads (%d) is higher than the total threads",totalThreads,numIOThreads)); - this.numCPUThreads = Math.max(1,totalThreads-numIOThreads); - this.numIOThreads = numIOThreads; - } - else { - if(numCPUThreads + numIOThreads != totalThreads) - throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of cpu threads (%d) + the count of io threads (%d) does not match",totalThreads,numCPUThreads,numIOThreads)); - this.numCPUThreads = numCPUThreads; - this.numIOThreads = numIOThreads; - } + public ThreadAllocation(final int numDataThreads, + final int numCPUThreadsPerDataThread, + final int numIOThreads, + final boolean monitorEfficiency) { + if ( numDataThreads < 1 ) throw new ReviewedStingException("numDataThreads cannot be less than 1, but saw " + numDataThreads); + if ( numCPUThreadsPerDataThread < 1 ) throw new ReviewedStingException("numCPUThreadsPerDataThread cannot be less than 1, but saw " + numCPUThreadsPerDataThread); + if ( numIOThreads < 0 ) throw new ReviewedStingException("numIOThreads cannot be less than 0, but saw " + numIOThreads); + this.numDataThreads = numDataThreads; + this.numCPUThreadsPerDataThread = numCPUThreadsPerDataThread; + this.numIOThreads = numIOThreads; this.monitorEfficiency = monitorEfficiency; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java 
index abc71e549..8c617e4dc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -44,24 +44,12 @@ import java.util.List; import java.util.Map; public abstract class TraversalEngine,ProviderType extends ShardDataProvider> { + /** our log, which we want to capture anything from this class */ + protected static final Logger logger = Logger.getLogger(TraversalEngine.class); + // Time in milliseconds since we initialized this engine private static final int HISTORY_WINDOW_SIZE = 50; - private static class ProcessingHistory { - double elapsedSeconds; - long unitsProcessed; - long bpProcessed; - GenomeLoc loc; - - public ProcessingHistory(double elapsedSeconds, GenomeLoc loc, long unitsProcessed, long bpProcessed) { - this.elapsedSeconds = elapsedSeconds; - this.loc = loc; - this.unitsProcessed = unitsProcessed; - this.bpProcessed = bpProcessed; - } - - } - /** lock object to sure updates to history are consistent across threads */ private static final Object lock = new Object(); LinkedList history = new LinkedList(); @@ -70,13 +58,12 @@ public abstract class TraversalEngine,Provide private SimpleTimer timer = null; // How long can we go without printing some progress info? - private static final int PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES = 1000; - private int printProgressCheckCounter = 0; private long lastProgressPrintTime = -1; // When was the last time we printed progress log? 
- private long MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS = 30 * 1000; // in milliseconds - private long PROGRESS_PRINT_FREQUENCY = 10 * 1000; // in milliseconds - private final double TWO_HOURS_IN_SECONDS = 2.0 * 60.0 * 60.0; - private final double TWELVE_HOURS_IN_SECONDS = 12.0 * 60.0 * 60.0; + + private final static long MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS = 30 * 1000; // in milliseconds + private final static double TWO_HOURS_IN_SECONDS = 2.0 * 60.0 * 60.0; + private final static double TWELVE_HOURS_IN_SECONDS = 12.0 * 60.0 * 60.0; + private long progressPrintFrequency = 10 * 1000; // in milliseconds private boolean progressMeterInitialized = false; // for performance log @@ -85,15 +72,12 @@ public abstract class TraversalEngine,Provide private File performanceLogFile; private PrintStream performanceLog = null; private long lastPerformanceLogPrintTime = -1; // When was the last time we printed to the performance log? - private final long PERFORMANCE_LOG_PRINT_FREQUENCY = PROGRESS_PRINT_FREQUENCY; // in milliseconds + private final long PERFORMANCE_LOG_PRINT_FREQUENCY = progressPrintFrequency; // in milliseconds /** Size, in bp, of the area we are processing. 
Updated once in the system in initial for performance reasons */ long targetSize = -1; GenomeLocSortedSet targetIntervals = null; - /** our log, which we want to capture anything from this class */ - protected static final Logger logger = Logger.getLogger(TraversalEngine.class); - protected GenomeAnalysisEngine engine; // ---------------------------------------------------------------------------------------------------- @@ -186,15 +170,35 @@ public abstract class TraversalEngine,Provide return elapsed > printFreq && elapsed > MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS; } + /** + * Update the cumulative traversal metrics according to the data in this shard + * + * @param shard a non-null shard + */ + public void updateCumulativeMetrics(final Shard shard) { + updateCumulativeMetrics(shard.getReadMetrics()); + } + + /** + * Update the cumulative traversal metrics according to the data in this shard + * + * @param singleTraverseMetrics read metrics object containing the information about a single shard's worth + * of data processing + */ + public void updateCumulativeMetrics(final ReadMetrics singleTraverseMetrics) { + engine.getCumulativeMetrics().incrementMetrics(singleTraverseMetrics); + } + /** * Forward request to printProgress * - * @param shard the given shard currently being processed. + * Assumes that one cycle has been completed + * * @param loc the location */ - public void printProgress(Shard shard, GenomeLoc loc) { + public void printProgress(final GenomeLoc loc) { // A bypass is inserted here for unit testing. - printProgress(loc,shard.getReadMetrics(),false); + printProgress(loc, false); } /** @@ -202,15 +206,10 @@ public abstract class TraversalEngine,Provide * every M seconds, for N and M set in global variables. 
* * @param loc Current location, can be null if you are at the end of the traversal - * @param metrics Data processed since the last cumulative * @param mustPrint If true, will print out info, regardless of nRecords or time interval */ - private void printProgress(GenomeLoc loc, ReadMetrics metrics, boolean mustPrint) { - if ( mustPrint || printProgressCheckCounter++ % PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES != 0 ) - // don't do any work more often than PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES - return; - - if(!progressMeterInitialized && mustPrint == false ) { + private synchronized void printProgress(final GenomeLoc loc, boolean mustPrint) { + if( ! progressMeterInitialized ) { logger.info("[INITIALIZATION COMPLETE; TRAVERSAL STARTING]"); logger.info(String.format("%15s processed.%s runtime per.1M.%s completed total.runtime remaining", "Location", getTraversalType(), getTraversalType())); @@ -218,40 +217,34 @@ public abstract class TraversalEngine,Provide } final long curTime = timer.currentTime(); - boolean printProgress = mustPrint || maxElapsedIntervalForPrinting(curTime, lastProgressPrintTime, PROGRESS_PRINT_FREQUENCY); + boolean printProgress = mustPrint || maxElapsedIntervalForPrinting(curTime, lastProgressPrintTime, progressPrintFrequency); boolean printLog = performanceLog != null && maxElapsedIntervalForPrinting(curTime, lastPerformanceLogPrintTime, PERFORMANCE_LOG_PRINT_FREQUENCY); if ( printProgress || printLog ) { - // getting and appending metrics data actually turns out to be quite a heavyweight - // operation. Postpone it until after determining whether to print the log message. - ReadMetrics cumulativeMetrics = engine.getCumulativeMetrics() != null ? 
engine.getCumulativeMetrics() : new ReadMetrics(); - if(metrics != null) - cumulativeMetrics.incrementMetrics(metrics); - - final long nRecords = cumulativeMetrics.getNumIterations(); - - ProcessingHistory last = updateHistory(loc,cumulativeMetrics); + final ProcessingHistory last = updateHistory(loc, engine.getCumulativeMetrics()); final AutoFormattingTime elapsed = new AutoFormattingTime(last.elapsedSeconds); - final AutoFormattingTime bpRate = new AutoFormattingTime(secondsPerMillionBP(last)); - final AutoFormattingTime unitRate = new AutoFormattingTime(secondsPerMillionElements(last)); - final double fractionGenomeTargetCompleted = calculateFractionGenomeTargetCompleted(last); + final AutoFormattingTime bpRate = new AutoFormattingTime(last.secondsPerMillionBP()); + final AutoFormattingTime unitRate = new AutoFormattingTime(last.secondsPerMillionElements()); + final double fractionGenomeTargetCompleted = last.calculateFractionGenomeTargetCompleted(targetSize); final AutoFormattingTime estTotalRuntime = new AutoFormattingTime(elapsed.getTimeInSeconds() / fractionGenomeTargetCompleted); final AutoFormattingTime timeToCompletion = new AutoFormattingTime(estTotalRuntime.getTimeInSeconds() - elapsed.getTimeInSeconds()); + final long nRecords = engine.getCumulativeMetrics().getNumIterations(); if ( printProgress ) { lastProgressPrintTime = curTime; // dynamically change the update rate so that short running jobs receive frequent updates while longer jobs receive fewer updates if ( estTotalRuntime.getTimeInSeconds() > TWELVE_HOURS_IN_SECONDS ) - PROGRESS_PRINT_FREQUENCY = 60 * 1000; // in milliseconds + progressPrintFrequency = 60 * 1000; // in milliseconds else if ( estTotalRuntime.getTimeInSeconds() > TWO_HOURS_IN_SECONDS ) - PROGRESS_PRINT_FREQUENCY = 30 * 1000; // in milliseconds + progressPrintFrequency = 30 * 1000; // in milliseconds else - PROGRESS_PRINT_FREQUENCY = 10 * 1000; // in milliseconds + progressPrintFrequency = 10 * 1000; // in milliseconds - 
logger.info(String.format("%15s %5.2e %s %s %4.1f%% %s %s", - loc == null ? "done with mapped reads" : loc, nRecords*1.0, elapsed, unitRate, + final String posName = loc == null ? (mustPrint ? "done" : "unmapped reads") : String.format("%s:%d", loc.getContig(), loc.getStart()); + logger.info(String.format("%15s %5.2e %s %s %5.1f%% %s %s", + posName, nRecords*1.0, elapsed, unitRate, 100*fractionGenomeTargetCompleted, estTotalRuntime, timeToCompletion)); } @@ -277,7 +270,7 @@ public abstract class TraversalEngine,Provide * @param metrics information about what's been processed already * @return */ - private final ProcessingHistory updateHistory(GenomeLoc loc, ReadMetrics metrics) { + private ProcessingHistory updateHistory(GenomeLoc loc, ReadMetrics metrics) { synchronized (lock) { if ( history.size() > HISTORY_WINDOW_SIZE ) history.pop(); @@ -290,26 +283,11 @@ public abstract class TraversalEngine,Provide } } - /** How long in seconds to process 1M traversal units? */ - private final double secondsPerMillionElements(ProcessingHistory last) { - return (last.elapsedSeconds * 1000000.0) / Math.max(last.unitsProcessed, 1); - } - - /** How long in seconds to process 1M bp on the genome? */ - private final double secondsPerMillionBP(ProcessingHistory last) { - return (last.elapsedSeconds * 1000000.0) / Math.max(last.bpProcessed, 1); - } - - /** What fractoin of the target intervals have we covered? */ - private final double calculateFractionGenomeTargetCompleted(ProcessingHistory last) { - return (1.0*last.bpProcessed) / targetSize; - } - /** * Called after a traversal to print out information about the traversal process */ public void printOnTraversalDone() { - printProgress(null, null, true); + printProgress(null, true); final double elapsed = timer == null ? 0 : timer.getElapsedTime(); @@ -370,7 +348,7 @@ public abstract class TraversalEngine,Provide * @return Frequency, in seconds, of performance log writes. 
*/ public long getPerformanceProgressPrintFrequencySeconds() { - return PROGRESS_PRINT_FREQUENCY; + return progressPrintFrequency; } /** @@ -378,6 +356,35 @@ public abstract class TraversalEngine,Provide * @param seconds number of seconds between messages indicating performance frequency. */ public void setPerformanceProgressPrintFrequencySeconds(long seconds) { - PROGRESS_PRINT_FREQUENCY = seconds; + progressPrintFrequency = seconds; + } + + private static class ProcessingHistory { + double elapsedSeconds; + long unitsProcessed; + long bpProcessed; + GenomeLoc loc; + + public ProcessingHistory(double elapsedSeconds, GenomeLoc loc, long unitsProcessed, long bpProcessed) { + this.elapsedSeconds = elapsedSeconds; + this.loc = loc; + this.unitsProcessed = unitsProcessed; + this.bpProcessed = bpProcessed; + } + + /** How long in seconds to process 1M traversal units? */ + private double secondsPerMillionElements() { + return (elapsedSeconds * 1000000.0) / Math.max(unitsProcessed, 1); + } + + /** How long in seconds to process 1M bp on the genome? */ + private double secondsPerMillionBP() { + return (elapsedSeconds * 1000000.0) / Math.max(bpProcessed, 1); + } + + /** What fraction of the target intervals have we covered?
*/ + private double calculateFractionGenomeTargetCompleted(final long targetSize) { + return (1.0*bpProcessed) / targetSize; + } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index ecaa15fe9..bbd9346b3 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -104,7 +104,8 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine extends TraversalEngine,LocusShardDataProvider> { +public abstract class TraverseLociBase extends TraversalEngine,LocusShardDataProvider> { /** * our log, which we want to capture anything from this class */ protected static final Logger logger = Logger.getLogger(TraversalEngine.class); @Override - protected String getTraversalType() { + protected final String getTraversalType() { return "sites"; } + protected static class TraverseResults { + final int numIterations; + final T reduceResult; + + public TraverseResults(int numIterations, T reduceResult) { + this.numIterations = numIterations; + this.reduceResult = reduceResult; + } + } + + protected abstract TraverseResults traverse( final LocusWalker walker, + final LocusView locusView, + final LocusReferenceView referenceView, + final ReferenceOrderedView referenceOrderedDataView, + final T sum); + @Override public T traverse( LocusWalker walker, LocusShardDataProvider dataProvider, T sum) { - logger.debug(String.format("TraverseLoci.traverse: Shard is %s", dataProvider)); + logger.debug(String.format("TraverseLociBase.traverse: Shard is %s", dataProvider)); - LocusView locusView = getLocusView( walker, dataProvider ); - boolean done = false; + final LocusView locusView = getLocusView( walker, dataProvider ); if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing 
when there's nothing here at all - //ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider ); ReferenceOrderedView referenceOrderedDataView = null; if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA ) @@ -44,43 +56,24 @@ public class TraverseLoci extends TraversalEngine,Locu else referenceOrderedDataView = (RodLocusView)locusView; - LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); + final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); - // We keep processing while the next reference location is within the interval - while( locusView.hasNext() && ! done ) { - AlignmentContext locus = locusView.next(); - GenomeLoc location = locus.getLocation(); - - dataProvider.getShard().getReadMetrics().incrementNumIterations(); - - // create reference context. Note that if we have a pileup of "extended events", the context will - // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). 
- ReferenceContext refContext = referenceView.getReferenceContext(location); - - // Iterate forward to get all reference ordered data covering this location - final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); - - final boolean keepMeP = walker.filter(tracker, refContext, locus); - if (keepMeP) { - M x = walker.map(tracker, refContext, locus); - sum = walker.reduce(x, sum); - done = walker.isDone(); - } - - printProgress(dataProvider.getShard(),locus.getLocation()); - } + final TraverseResults result = traverse( walker, locusView, referenceView, referenceOrderedDataView, sum ); + sum = result.reduceResult; + dataProvider.getShard().getReadMetrics().incrementNumIterations(result.numIterations); + updateCumulativeMetrics(dataProvider.getShard()); } // We have a final map call to execute here to clean up the skipped based from the // last position in the ROD to that in the interval if ( WalkerManager.getWalkerDataSource(walker) == DataSource.REFERENCE_ORDERED_DATA && ! walker.isDone() ) { // only do this if the walker isn't done! - RodLocusView rodLocusView = (RodLocusView)locusView; - long nSkipped = rodLocusView.getLastSkippedBases(); + final RodLocusView rodLocusView = (RodLocusView)locusView; + final long nSkipped = rodLocusView.getLastSkippedBases(); if ( nSkipped > 0 ) { - GenomeLoc site = rodLocusView.getLocOneBeyondShard(); - AlignmentContext ac = new AlignmentContext(site, new ReadBackedPileupImpl(site), nSkipped); - M x = walker.map(null, null, ac); + final GenomeLoc site = rodLocusView.getLocOneBeyondShard(); + final AlignmentContext ac = new AlignmentContext(site, new ReadBackedPileupImpl(site), nSkipped); + final M x = walker.map(null, null, ac); sum = walker.reduce(x, sum); } } @@ -90,14 +83,14 @@ public class TraverseLoci extends TraversalEngine,Locu /** * Gets the best view of loci for this walker given the available data. 
The view will function as a 'trigger track' - * of sorts, providing a consistent interface so that TraverseLoci doesn't need to be reimplemented for any new datatype + * of sorts, providing a consistent interface so that TraverseLociBase doesn't need to be reimplemented for any new datatype * that comes along. * @param walker walker to interrogate. * @param dataProvider Data which which to drive the locus view. * @return A view of the locus data, where one iteration of the locus view maps to one iteration of the traversal. */ private LocusView getLocusView( Walker walker, LocusShardDataProvider dataProvider ) { - DataSource dataSource = WalkerManager.getWalkerDataSource(walker); + final DataSource dataSource = WalkerManager.getWalkerDataSource(walker); if( dataSource == DataSource.READS ) return new CoveredLocusView(dataProvider); else if( dataSource == DataSource.REFERENCE ) //|| ! GenomeAnalysisEngine.instance.getArguments().enableRodWalkers ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociLinear.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociLinear.java new file mode 100755 index 000000000..22381092f --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociLinear.java @@ -0,0 +1,47 @@ +package org.broadinstitute.sting.gatk.traversals; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.datasources.providers.LocusReferenceView; +import org.broadinstitute.sting.gatk.datasources.providers.LocusView; +import org.broadinstitute.sting.gatk.datasources.providers.ReferenceOrderedView; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.utils.GenomeLoc; + +/** + * A simple solution to iterating over all reference positions over a series of genomic 
locations. + */ +public class TraverseLociLinear extends TraverseLociBase { + + @Override + protected TraverseResults traverse(LocusWalker walker, LocusView locusView, LocusReferenceView referenceView, ReferenceOrderedView referenceOrderedDataView, T sum) { + // We keep processing while the next reference location is within the interval + boolean done = false; + int numIterations = 0; + + while( locusView.hasNext() && ! done ) { + numIterations++; + final AlignmentContext locus = locusView.next(); + final GenomeLoc location = locus.getLocation(); + + // create reference context. Note that if we have a pileup of "extended events", the context will + // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). + final ReferenceContext refContext = referenceView.getReferenceContext(location); + + // Iterate forward to get all reference ordered data covering this location + final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); + + final boolean keepMeP = walker.filter(tracker, refContext, locus); + if (keepMeP) { + final M x = walker.map(tracker, refContext, locus); + sum = walker.reduce(x, sum); + done = walker.isDone(); + } + + printProgress(locus.getLocation()); + } + + return new TraverseResults(numIterations, sum); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java new file mode 100755 index 000000000..e4e2254d0 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java @@ -0,0 +1,205 @@ +package org.broadinstitute.sting.gatk.traversals; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.datasources.providers.LocusReferenceView; +import 
org.broadinstitute.sting.gatk.datasources.providers.LocusView; +import org.broadinstitute.sting.gatk.datasources.providers.ReferenceOrderedView; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.nanoScheduler.NSMapFunction; +import org.broadinstitute.sting.utils.nanoScheduler.NSProgressFunction; +import org.broadinstitute.sting.utils.nanoScheduler.NSReduceFunction; +import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler; + +import java.util.Iterator; + +/** + * A simple solution to iterating over all reference positions over a series of genomic locations. + */ +public class TraverseLociNano extends TraverseLociBase { + /** our log, which we want to capture anything from this class */ + private static final boolean DEBUG = false; + private static final int BUFFER_SIZE = 1000; + + final NanoScheduler nanoScheduler; + + public TraverseLociNano(int nThreads) { + nanoScheduler = new NanoScheduler(BUFFER_SIZE, nThreads); + nanoScheduler.setProgressFunction(new TraverseLociProgress()); + } + + @Override + protected TraverseResults traverse(final LocusWalker walker, + final LocusView locusView, + final LocusReferenceView referenceView, + final ReferenceOrderedView referenceOrderedDataView, + final T sum) { + nanoScheduler.setDebug(DEBUG); + final TraverseLociMap myMap = new TraverseLociMap(walker); + final TraverseLociReduce myReduce = new TraverseLociReduce(walker); + + final MapDataIterator inputIterator = new MapDataIterator(locusView, referenceView, referenceOrderedDataView); + final T result = nanoScheduler.execute(inputIterator, myMap, sum, myReduce); + + return new TraverseResults(inputIterator.numIterations, result); + } + + /** + * Create iterator that provides inputs for all map calls into MapData, to be provided + * to NanoScheduler for Map/Reduce + */ + private class MapDataIterator 
implements Iterator { + final LocusView locusView; + final LocusReferenceView referenceView; + final ReferenceOrderedView referenceOrderedDataView; + int numIterations = 0; + + private MapDataIterator(LocusView locusView, LocusReferenceView referenceView, ReferenceOrderedView referenceOrderedDataView) { + this.locusView = locusView; + this.referenceView = referenceView; + this.referenceOrderedDataView = referenceOrderedDataView; + } + + @Override + public boolean hasNext() { + return locusView.hasNext(); + } + + @Override + public MapData next() { + final AlignmentContext locus = locusView.next(); + final GenomeLoc location = locus.getLocation(); + + //logger.info("Pulling data from MapDataIterator at " + location); + + // create reference context. Note that if we have a pileup of "extended events", the context will + // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). + final ReferenceContext refContext = referenceView.getReferenceContext(location); + + // Iterate forward to get all reference ordered data covering this location + final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(location, refContext); + + numIterations++; + return new MapData(locus, refContext, tracker); + } + + @Override + public void remove() { + throw new UnsupportedOperationException("Cannot remove elements from MapDataIterator"); + } + } + + @Override + public void printOnTraversalDone() { + nanoScheduler.shutdown(); + super.printOnTraversalDone(); + } + + /** + * The input data needed for each map call. 
The read, the reference, and the RODs + */ + private class MapData { + final AlignmentContext alignmentContext; + final ReferenceContext refContext; + final RefMetaDataTracker tracker; + + private MapData(final AlignmentContext alignmentContext, ReferenceContext refContext, RefMetaDataTracker tracker) { + this.alignmentContext = alignmentContext; + this.refContext = refContext; + this.tracker = tracker; + } + + @Override + public String toString() { + return "MapData " + alignmentContext.getLocation(); + } + } + + /** + * Contains the results of a map call, indicating whether the call was good, filtered, or done + */ + private class MapResult { + final M value; + final boolean reduceMe; + + /** + * Create a MapResult with value that should be reduced + * + * @param value the value to reduce + */ + private MapResult(final M value) { + this.value = value; + this.reduceMe = true; + } + + /** + * Create a MapResult that shouldn't be reduced + */ + private MapResult() { + this.value = null; + this.reduceMe = false; + } + } + + /** + * A static object that tells reduce that the result of map should be skipped (filtered or done) + */ + private final MapResult SKIP_REDUCE = new MapResult(); + + /** + * MapFunction for TraverseReads meeting NanoScheduler interface requirements + * + * Applies walker.map to MapData, returning a MapResult object containing the result + */ + private class TraverseLociMap implements NSMapFunction { + final LocusWalker walker; + + private TraverseLociMap(LocusWalker walker) { + this.walker = walker; + } + + @Override + public MapResult apply(final MapData data) { + if ( ! 
walker.isDone() ) { + final boolean keepMeP = walker.filter(data.tracker, data.refContext, data.alignmentContext); + if (keepMeP) { + final M x = walker.map(data.tracker, data.refContext, data.alignmentContext); + return new MapResult(x); + } + } + return SKIP_REDUCE; + } + } + + /** + * NSReduceFunction for TraverseReads meeting NanoScheduler interface requirements + * + * Takes a MapResult object and applies the walkers reduce function to each map result, when applicable + */ + private class TraverseLociReduce implements NSReduceFunction { + final LocusWalker walker; + + private TraverseLociReduce(LocusWalker walker) { + this.walker = walker; + } + + @Override + public T apply(MapResult one, T sum) { + if ( one.reduceMe ) + // only run reduce on values that aren't DONE or FAILED + return walker.reduce(one.value, sum); + else + return sum; + } + } + + private class TraverseLociProgress implements NSProgressFunction { + @Override + public void progress(MapData lastProcessedMap) { + if (lastProcessedMap.alignmentContext != null) + printProgress(lastProcessedMap.alignmentContext.getLocation()); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java index ebaac40af..9b076fce4 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java @@ -65,7 +65,8 @@ public class TraverseReadPairs extends TraversalEngine extends TraversalEngine,Read sum = walker.reduce(x, sum); } - GenomeLoc locus = read.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX ? null : engine.getGenomeLocParser().createGenomeLoc(read.getReferenceName(),read.getAlignmentStart()); - printProgress(dataProvider.getShard(),locus); + final GenomeLoc locus = read.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX ? 
null : engine.getGenomeLocParser().createGenomeLoc(read.getReferenceName(),read.getAlignmentStart()); + + updateCumulativeMetrics(dataProvider.getShard()); + printProgress(locus); + done = walker.isDone(); } return sum; diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java index b397cb8c0..b3a0a1390 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java @@ -34,34 +34,34 @@ import org.broadinstitute.sting.gatk.datasources.providers.ReadView; import org.broadinstitute.sting.gatk.datasources.reads.ReadShard; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.utils.nanoScheduler.MapFunction; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.nanoScheduler.NSMapFunction; +import org.broadinstitute.sting.utils.nanoScheduler.NSReduceFunction; import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler; -import org.broadinstitute.sting.utils.nanoScheduler.ReduceFunction; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import java.util.ArrayList; +import java.util.LinkedList; import java.util.List; /** - * @author aaron + * A nano-scheduling version of TraverseReads. + * + * Implements the traversal of a walker that accepts individual reads, the reference, and + * RODs per map call. Directly supports shared memory parallelism via NanoScheduler + * + * @author depristo * @version 1.0 - * @date Apr 24, 2009 - *

- * Class TraverseReads - *

- * This class handles traversing by reads in the new shardable style + * @date 9/2/2012 */ public class TraverseReadsNano extends TraversalEngine,ReadShardDataProvider> { /** our log, which we want to capture anything from this class */ protected static final Logger logger = Logger.getLogger(TraverseReadsNano.class); private static final boolean DEBUG = false; - private static final int MIN_GROUP_SIZE = 100; - final NanoScheduler nanoScheduler; + final NanoScheduler nanoScheduler; public TraverseReadsNano(int nThreads) { final int bufferSize = ReadShard.getReadBufferSize() + 1; // actually has 1 more than max - final int mapGroupSize = (int)Math.max(Math.ceil(bufferSize / 50.0 + 1), MIN_GROUP_SIZE); - nanoScheduler = new NanoScheduler(bufferSize, mapGroupSize, nThreads); + nanoScheduler = new NanoScheduler(bufferSize, nThreads); } @Override @@ -89,19 +89,32 @@ public class TraverseReadsNano extends TraversalEngine, final TraverseReadsMap myMap = new TraverseReadsMap(walker); final TraverseReadsReduce myReduce = new TraverseReadsReduce(walker); - T result = nanoScheduler.execute(aggregateMapData(dataProvider).iterator(), myMap, sum, myReduce); - // TODO -- how do we print progress? 
- //printProgress(dataProvider.getShard(), ???); + final List aggregatedInputs = aggregateMapData(dataProvider); + final T result = nanoScheduler.execute(aggregatedInputs.iterator(), myMap, sum, myReduce); + + final GATKSAMRecord lastRead = aggregatedInputs.get(aggregatedInputs.size() - 1).read; + final GenomeLoc locus = engine.getGenomeLocParser().createGenomeLoc(lastRead); + + updateCumulativeMetrics(dataProvider.getShard()); + printProgress(locus); return result; } + /** + * Aggregate all of the inputs for all map calls into MapData, to be provided + * to NanoScheduler for Map/Reduce + * + * @param dataProvider the source of our data + * @return a linked list of MapData objects holding the read, ref, and ROD info for every map/reduce + * should execute + */ private List aggregateMapData(final ReadShardDataProvider dataProvider) { final ReadView reads = new ReadView(dataProvider); final ReadReferenceView reference = new ReadReferenceView(dataProvider); final ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider); - final List mapData = new ArrayList(); // TODO -- need size of reads + final List mapData = new LinkedList(); for ( final SAMRecord read : reads ) { final ReferenceContext refContext = ! read.getReadUnmappedFlag() ? reference.getReferenceContext(read) @@ -127,19 +140,9 @@ public class TraverseReadsNano extends TraversalEngine, super.printOnTraversalDone(); } - private class TraverseReadsReduce implements ReduceFunction { - final ReadWalker walker; - - private TraverseReadsReduce(ReadWalker walker) { - this.walker = walker; - } - - @Override - public T apply(M one, T sum) { - return walker.reduce(one, sum); - } - } - + /** + * The input data needed for each map call. 
The read, the reference, and the RODs + */ private class MapData { final GATKSAMRecord read; final ReferenceContext refContext; @@ -152,7 +155,43 @@ public class TraverseReadsNano extends TraversalEngine, } } - private class TraverseReadsMap implements MapFunction { + /** + * Contains the results of a map call, indicating whether the call was good, filtered, or done + */ + private class MapResult { + final M value; + final boolean reduceMe; + + /** + * Create a MapResult with value that should be reduced + * + * @param value the value to reduce + */ + private MapResult(final M value) { + this.value = value; + this.reduceMe = true; + } + + /** + * Create a MapResult that shouldn't be reduced + */ + private MapResult() { + this.value = null; + this.reduceMe = false; + } + } + + /** + * A static object that tells reduce that the result of map should be skipped (filtered or done) + */ + private final MapResult SKIP_REDUCE = new MapResult(); + + /** + * MapFunction for TraverseReads meeting NanoScheduler interface requirements + * + * Applies walker.map to MapData, returning a MapResult object containing the result + */ + private class TraverseReadsMap implements NSMapFunction { final ReadWalker walker; private TraverseReadsMap(ReadWalker walker) { @@ -160,15 +199,36 @@ public class TraverseReadsNano extends TraversalEngine, } @Override - public M apply(final MapData data) { + public MapResult apply(final MapData data) { if ( ! walker.isDone() ) { final boolean keepMeP = walker.filter(data.refContext, data.read); - if (keepMeP) { - return walker.map(data.refContext, data.read, data.tracker); - } + if (keepMeP) + return new MapResult(walker.map(data.refContext, data.read, data.tracker)); } - return null; // TODO -- what should we return in the case where the walker is done or the read is filtered? 
+ return SKIP_REDUCE; + } + } + + /** + * NSReduceFunction for TraverseReads meeting NanoScheduler interface requirements + * + * Takes a MapResult object and applies the walkers reduce function to each map result, when applicable + */ + private class TraverseReadsReduce implements NSReduceFunction { + final ReadWalker walker; + + private TraverseReadsReduce(ReadWalker walker) { + this.walker = walker; + } + + @Override + public T apply(MapResult one, T sum) { + if ( one.reduceMe ) + // only run reduce on values that aren't DONE or FAILED + return walker.reduce(one.value, sum); + else + return sum; } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Downsample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/Downsample.java index d662b0092..de2cd836c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/Downsample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/Downsample.java @@ -1,6 +1,6 @@ package org.broadinstitute.sting.gatk.walkers; -import org.broadinstitute.sting.gatk.DownsampleType; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import java.lang.annotation.*; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/FlagStat.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/FlagStat.java index 14d14aca5..b4ef66aaf 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/FlagStat.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/FlagStat.java @@ -45,7 +45,7 @@ import java.text.NumberFormat; */ @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) @Requires({DataSource.READS}) -public class FlagStat extends ReadWalker implements ThreadSafeMapReduce { +public class FlagStat extends ReadWalker implements NanoSchedulable { @Output PrintStream out; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ThreadSafeMapReduce.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/NanoSchedulable.java similarity index 97% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/ThreadSafeMapReduce.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/NanoSchedulable.java index 1ce469f8c..731ce7e4e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ThreadSafeMapReduce.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/NanoSchedulable.java @@ -27,5 +27,5 @@ package org.broadinstitute.sting.gatk.walkers; * declare that their map function is thread-safe and so multiple * map calls can be run in parallel in the same JVM instance. */ -public interface ThreadSafeMapReduce { +public interface NanoSchedulable { } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Pileup.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/Pileup.java index 2a6ecdb8c..a3efea9f1 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/Pileup.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/Pileup.java @@ -45,25 +45,14 @@ import java.util.Collections; import java.util.List; /** - * Prints the alignment in the pileup format. In the pileup format, each line represents a genomic position, - * consisting of chromosome name, coordinate, reference base, read bases, read qualities and alignment mapping - * qualities. Information on match, mismatch, indel, strand, mapping quality and start and end of a read are all - * encoded at the read base column. At this column, a dot stands for a match to the reference base on the forward strand, - * a comma for a match on the reverse strand, 'ACGTN' for a mismatch on the forward strand and 'acgtn' for a mismatch on the - * reverse strand. - * - * A pattern '\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion between this reference position and the next - * reference position. The length of the insertion is given by the integer in the pattern, followed by the inserted sequence. 
- * Similarly, a pattern '-[0-9]+[ACGTNacgtn]+' represents a deletion from the reference. - * Also at the read base column, a symbol '^' marks the start of a read segment which is a contiguous subsequence on the read - * separated by 'N/S/H' CIGAR operations. The ASCII of the character following '^' minus 33 gives the mapping quality. - * A symbol '$' marks the end of a read segment. + * Prints the alignment in something similar to the samtools pileup format. Each line represents a genomic position, + * consisting of chromosome name, coordinate, reference base, read bases, and read qualities. * * Associated command: * samtools pileup [-f in.ref.fasta] [-t in.ref_list] [-l in.site_list] [-iscg] [-T theta] [-N nHap] [-r pairDiffRate] */ @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) -public class Pileup extends LocusWalker implements TreeReducible { +public class Pileup extends LocusWalker implements TreeReducible, NanoSchedulable { private static final String verboseDelimiter = "@"; // it's ugly to use "@" but it's literally the only usable character not allowed in read names @@ -81,27 +70,32 @@ public class Pileup extends LocusWalker implements TreeReducib @Input(fullName="metadata",shortName="metadata",doc="Add these ROD bindings to the output Pileup", required=false) public List> rods = Collections.emptyList(); - public void initialize() { - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - - String rods = getReferenceOrderedData( tracker ); + @Override + public String map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + final String rods = getReferenceOrderedData( tracker ); ReadBackedPileup basePileup = context.getBasePileup(); - out.printf("%s %s", basePileup.getPileupString((char)ref.getBase()), rods); - if ( SHOW_VERBOSE ) - out.printf(" %s", createVerboseOutput(basePileup)); - out.println(); - return 
1; + final StringBuilder s = new StringBuilder(); + s.append(String.format("%s %s", basePileup.getPileupString((char)ref.getBase()), rods)); + if ( SHOW_VERBOSE ) + s.append(" ").append(createVerboseOutput(basePileup)); + s.append("\n"); + + return s.toString(); } // Given result of map function + @Override public Integer reduceInit() { return 0; } - public Integer reduce(Integer value, Integer sum) { - return treeReduce(sum,value); + + @Override + public Integer reduce(String value, Integer sum) { + out.print(value); + return sum + 1; } + + @Override public Integer treeReduce(Integer lhs, Integer rhs) { return lhs + rhs; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java index a5d4b45b6..37176cbf9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java @@ -93,7 +93,7 @@ import java.util.*; @ReadTransformersMode(ApplicationTime = ReadTransformer.ApplicationTime.HANDLED_IN_WALKER) @BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = ReadTransformer.ApplicationTime.HANDLED_IN_WALKER) @Requires({DataSource.READS, DataSource.REFERENCE}) -public class PrintReads extends ReadWalker implements ThreadSafeMapReduce { +public class PrintReads extends ReadWalker implements NanoSchedulable { @Output(doc="Write output to this BAM filename instead of STDOUT", required = true) SAMFileWriter out; @@ -228,7 +228,6 @@ public class PrintReads extends ReadWalker impleme GATKSAMRecord workingRead = readIn; for ( final ReadTransformer transformer : readTransformers ) { - if ( logger.isDebugEnabled() ) logger.debug("Applying transformer " + transformer + " to read " + readIn.getReadName()); workingRead = transformer.apply(workingRead); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/TreeReducible.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/TreeReducible.java index 8621c0e9d..c950e07e4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/TreeReducible.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/TreeReducible.java @@ -13,7 +13,7 @@ package org.broadinstitute.sting.gatk.walkers; * shards of the data can reduce with each other, and the composite result * can be reduced with other composite results. */ -public interface TreeReducible extends ThreadSafeMapReduce { +public interface TreeReducible { /** * A composite, 'reduce of reduces' function. * @param lhs 'left-most' portion of data in the composite reduce. diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java index 0104f24d9..1e1f65333 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java @@ -33,6 +33,9 @@ public class AlleleBalanceBySample extends GenotypeAnnotation implements Experim final Genotype g, final GenotypeBuilder gb, final PerReadAlleleLikelihoodMap alleleLikelihoodMap){ + if ( stratifiedContext == null ) + return; + Double ratio = annotateSNP(stratifiedContext, vc, g); if (ratio == null) return; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java index 85387f7cf..ee9b51b56 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java @@ -54,7 +54,7 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa final Genotype g, final GenotypeBuilder gb, final 
PerReadAlleleLikelihoodMap alleleLikelihoodMap) { - if ( g == null || !g.isCalled() ) + if ( g == null || !g.isCalled() || ( stratifiedContext == null && alleleLikelihoodMap == null) ) return; if (alleleLikelihoodMap != null && !alleleLikelihoodMap.isEmpty()) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java index 354b798bb..44657a7e7 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java @@ -55,7 +55,7 @@ public class MappingQualityZeroBySample extends GenotypeAnnotation { final Genotype g, final GenotypeBuilder gb, final PerReadAlleleLikelihoodMap alleleLikelihoodMap){ - if ( g == null || !g.isCalled() ) + if ( g == null || !g.isCalled() || stratifiedContext == null ) return; int mq0 = 0; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index 22ec5468f..eae13e1b5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -300,16 +300,12 @@ public class VariantAnnotatorEngine { if (stratifiedPerReadAlleleLikelihoodMap != null) perReadAlleleLikelihoodMap = stratifiedPerReadAlleleLikelihoodMap.get(genotype.getSampleName()); - if ( context == null && perReadAlleleLikelihoodMap == null) { - // no likelihoods nor pileup available: just move on to next sample - genotypes.add(genotype); - } else { - final GenotypeBuilder gb = new GenotypeBuilder(genotype); - for ( final GenotypeAnnotation annotation : requestedGenotypeAnnotations ) { - annotation.annotate(tracker, walker, 
ref, context, vc, genotype, gb, perReadAlleleLikelihoodMap); - } - genotypes.add(gb.make()); + + final GenotypeBuilder gb = new GenotypeBuilder(genotype); + for ( final GenotypeAnnotation annotation : requestedGenotypeAnnotations ) { + annotation.annotate(tracker, walker, ref, context, vc, genotype, gb, perReadAlleleLikelihoodMap); } + genotypes.add(gb.make()); } return genotypes; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index 443b493be..43aa85a05 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -109,7 +109,7 @@ import java.util.ArrayList; @ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class}) // only look at covered loci, not every loci of the reference file @Requires({DataSource.READS, DataSource.REFERENCE}) // filter out all reads with zero or unavailable mapping quality @PartitionBy(PartitionType.LOCUS) // this walker requires both -I input.bam and -R reference.fasta -public class BaseRecalibrator extends LocusWalker implements TreeReducible { +public class BaseRecalibrator extends LocusWalker implements TreeReducible, NanoSchedulable { @ArgumentCollection private final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); // all the command line arguments for BQSR and it's covariates diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java index c5b043b7a..44b0d74ca 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java @@ -30,7 +30,7 @@ import 
org.broadinstitute.sting.commandline.Advanced; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.DownsampleType; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 93928a780..0d1997252 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -27,7 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.DownsampleType; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; @@ -125,7 +125,7 @@ import java.util.*; // TODO -- When LocusIteratorByState gets cleaned up, we should enable multiple @By sources: // TODO -- @By( {DataSource.READS, DataSource.REFERENCE_ORDERED_DATA} ) @Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=250) -public class UnifiedGenotyper extends LocusWalker, UnifiedGenotyper.UGStatistics> implements TreeReducible, AnnotatorCompatible { +public class UnifiedGenotyper extends LocusWalker, UnifiedGenotyper.UGStatistics> implements TreeReducible, AnnotatorCompatible, NanoSchedulable { @ArgumentCollection private 
UnifiedArgumentCollection UAC = new UnifiedArgumentCollection(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java index a52d57031..b14dc9cc9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java @@ -57,7 +57,7 @@ import java.util.TreeSet; * is minimized across all the reads. In general, a large percent of regions requiring local realignment are due to the presence of an insertion * or deletion (indels) in the individual's genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching * the reference near the misalignment, which are easily mistaken as SNPs. Moreover, since read mapping algorithms operate on each read independently, - * it is impossible to place reads on the reference genome such at mismatches are minimized across all reads. Consequently, even when some reads are + * it is impossible to place reads on the reference genome such that mismatches are minimized across all reads. Consequently, even when some reads are * correctly mapped with indels, reads covering the indel near just the start or end of the read are often incorrectly mapped with respect the true indel, * also requiring realignment. Local realignment serves to transform regions with misalignments due to indels into clean reads containing a consensus * indel suitable for standard variant discovery approaches. Unlike most mappers, this walker uses the full alignment context to determine whether an @@ -69,7 +69,7 @@ import java.util.TreeSet; *

  • Running the realigner over those intervals (see the IndelRealigner tool)
  • * *

    - * An important note: the input bam(s), reference, and known indel file(s) should be the same ones to be used for the IndelRealigner step. + * An important note: the input BAM(s), reference, and known indel file(s) should be the same ones to be used for the IndelRealigner step. *

    * Another important note: because reads produced from the 454 technology inherently contain false indels, the realigner will not currently work with them * (or with reads from similar technologies). This tool also ignores MQ0 reads and reads with consecutive indel operators in the CIGAR string. diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index bbd4bf92f..00acf854a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -541,7 +541,7 @@ public class PhaseByTransmission extends RodWalker, HashMa //Get a Map of genotype likelihoods. //In case of null, unavailable or no call, all likelihoods are 1/3. private EnumMap getLikelihoodsAsMapSafeNull(Genotype genotype){ - if(genotype == null || !genotype.isCalled()){ + if(genotype == null || !genotype.isCalled() || genotype.getLikelihoods() == null){ EnumMap likelihoods = new EnumMap(GenotypeType.class); likelihoods.put(GenotypeType.HOM_REF,1.0/3.0); likelihoods.put(GenotypeType.HET,1.0/3.0); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLoci.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLoci.java index bd10eab87..cd295f26e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLoci.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLoci.java @@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.walkers.NanoSchedulable; import org.broadinstitute.sting.gatk.walkers.TreeReducible; import 
org.broadinstitute.sting.utils.help.DocumentedGATKFeature; @@ -40,7 +41,7 @@ import java.io.PrintStream; * */ @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) -public class CountLoci extends LocusWalker implements TreeReducible { +public class CountLoci extends LocusWalker implements TreeReducible, NanoSchedulable { @Output(doc="Write count to this file instead of STDOUT") PrintStream out; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODs.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODs.java index 9915d617e..ab37a2322 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODs.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODs.java @@ -37,6 +37,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; +import org.broadinstitute.sting.gatk.walkers.NanoSchedulable; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.utils.GenomeLoc; @@ -73,7 +74,7 @@ import java.util.*; * */ @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) -public class CountRODs extends RodWalker, Long>> implements TreeReducible, Long>> { +public class CountRODs extends RodWalker, Long>> implements TreeReducible, Long>>, NanoSchedulable { @Output public PrintStream out; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java index 856ea77f5..301fa5b9b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java @@ -4,9 +4,9 @@ import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.NanoSchedulable; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.Requires; -import org.broadinstitute.sting.gatk.walkers.ThreadSafeMapReduce; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -41,7 +41,7 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; */ @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) @Requires({DataSource.READS, DataSource.REFERENCE}) -public class CountReads extends ReadWalker implements ThreadSafeMapReduce { +public class CountReads extends ReadWalker implements NanoSchedulable { public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker tracker) { return 1; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ErrorThrowing.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ErrorThrowing.java index a3df3bc13..12423595b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ErrorThrowing.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ErrorThrowing.java @@ -24,6 +24,7 @@ package org.broadinstitute.sting.gatk.walkers.qc; +import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.gatk.CommandLineGATK; @@ -45,20 +46,23 @@ public class ErrorThrowing extends RodWalker implements TreeRed @Input(fullName="exception", shortName = "E", doc="Java class of 
exception to throw", required=true) public String exceptionToThrow; + @Argument(fullName = "failMethod", shortName = "fail", doc = "Determines which method to fail in", required = false) + public FailMethod failMethod = FailMethod.MAP; + + public enum FailMethod { + MAP, + REDUCE, + TREE_REDUCE + } + // // Template code to allow us to build the walker, doesn't actually do anything // @Override public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( exceptionToThrow.equals("UserException") ) { - throw new UserException("UserException"); - } else if ( exceptionToThrow.equals("NullPointerException") ) { - throw new NullPointerException(); - } else if ( exceptionToThrow.equals("ReviewedStingException") ) { - throw new ReviewedStingException("ReviewedStingException"); - } else { - throw new UserException.BadArgumentValue("exception", "exception isn't a recognized value " + exceptionToThrow); - } + if ( failMethod == FailMethod.MAP ) + fail(); + return 0; } @Override @@ -68,10 +72,32 @@ public class ErrorThrowing extends RodWalker implements TreeRed @Override public Integer reduce(Integer value, Integer sum) { + if ( failMethod == FailMethod.REDUCE ) + fail(); return value + sum; } public Integer treeReduce(final Integer lhs, final Integer rhs) { + if ( failMethod == FailMethod.TREE_REDUCE ) + fail(); return lhs + rhs; } + + private void fail() { + if ( exceptionToThrow.equals("UserException") ) { + throw new UserException("UserException"); + } else if ( exceptionToThrow.equals("NullPointerException") ) { + throw new NullPointerException(); + } else if ( exceptionToThrow.equals("ReviewedStingException") ) { + throw new ReviewedStingException("ReviewedStingException"); + } else if ( exceptionToThrow.equals("SamError1") ) { + throw new RuntimeException(CommandLineGATK.PICARD_TEXT_SAM_FILE_ERROR_1); + } else if ( exceptionToThrow.equals("SamError2") ) { + throw new 
RuntimeException(CommandLineGATK.PICARD_TEXT_SAM_FILE_ERROR_2); + } else if ( exceptionToThrow.equals("NoSpace") ) { + throw new net.sf.samtools.util.RuntimeIOException(new java.io.IOException("No space left on device java.io.FileOutputStream.writeBytes(Native Method)")); + } else { + throw new UserException.BadArgumentValue("exception", "exception isn't a recognized value " + exceptionToThrow); + } + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 0c096ea73..759ec1cc6 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -75,6 +75,17 @@ public class MathUtils { } } + /** + * Get a random int between min and max (inclusive) using the global GATK random number generator + * + * @param min lower bound of the range + * @param max upper bound of the range + * @return a random int >= min and <= max + */ + public static int randomIntegerInRange( int min, int max ) { + return GenomeAnalysisEngine.getRandomGenerator().nextInt(max - min + 1) + min; + } + // A fast implementation of the Math.round() method. This method does not perform // under/overflow checking, so this shouldn't be used in the general case (but is fine // if one is already make those checks before calling in to the rounding). 
@@ -1655,5 +1666,4 @@ public class MathUtils { return result; } - } diff --git a/public/java/src/org/broadinstitute/sting/utils/SimpleTimer.java b/public/java/src/org/broadinstitute/sting/utils/SimpleTimer.java index 15d34a348..b3a9986c5 100644 --- a/public/java/src/org/broadinstitute/sting/utils/SimpleTimer.java +++ b/public/java/src/org/broadinstitute/sting/utils/SimpleTimer.java @@ -1,18 +1,42 @@ package org.broadinstitute.sting.utils; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; + +import java.util.concurrent.TimeUnit; + /** - * A useful simple system for timing code. This code is not thread safe! + * A useful simple system for timing code with nano second resolution + * + * Note that this code is not thread-safe. If you have a single timer + * being started and stopped by multiple threads you will need to protect the + * calls to avoid meaningless results of having multiple starts and stops + * called sequentially. * * User: depristo * Date: Dec 10, 2010 * Time: 9:07:44 AM */ public class SimpleTimer { - final private String name; - private long elapsed = 0l; - private long startTime = 0l; - boolean running = false; + protected static final double NANO_TO_SECOND_DOUBLE = 1.0 / TimeUnit.SECONDS.toNanos(1); + private final String name; + + /** + * The elapsedTimeNano time in nanoSeconds of this timer. The elapsedTimeNano time is the + * sum of times between starts/restarts and stops.
+ */ + private long elapsedTimeNano = 0l; + + /** + * The start time of the last start/restart in nanoSeconds + */ + private long startTimeNano = 0l; + + /** + * Is this timer currently running (i.e., the last call was start/restart) + */ + private boolean running = false; /** * Creates an anonymous simple timer @@ -25,7 +49,8 @@ public class SimpleTimer { * Creates a simple timer named name * @param name of the timer, must not be null */ - public SimpleTimer(String name) { + public SimpleTimer(final String name) { + if ( name == null ) throw new IllegalArgumentException("SimpleTimer name cannot be null"); this.name = name; } @@ -37,27 +62,27 @@ public class SimpleTimer { } /** - * Starts the timer running, and sets the elapsed time to 0. This is equivalent to + * Starts the timer running, and sets the elapsedTimeNano time to 0. This is equivalent to * resetting the time to have no history at all. * * @return this object, for programming convenience */ + @Ensures("elapsedTimeNano == 0l") public synchronized SimpleTimer start() { - elapsed = 0l; - restart(); - return this; + elapsedTimeNano = 0l; + return restart(); } /** - * Starts the timer running, without reseting the elapsed time. This function may be + * Starts the timer running, without resetting the elapsedTimeNano time. This function may be * called without first calling start(). The only difference between start and restart - * is that start resets the elapsed time, while restart does not. + * is that start resets the elapsedTimeNano time, while restart does not. 
* * @return this object, for programming convenience */ public synchronized SimpleTimer restart() { running = true; - startTime = currentTime(); + startTimeNano = currentTimeNano(); return this; } @@ -71,29 +96,53 @@ public class SimpleTimer { /** * @return A convenience function to obtain the current time in milliseconds from this timer */ - public synchronized long currentTime() { + public long currentTime() { return System.currentTimeMillis(); } /** - * Stops the timer. Increases the elapsed time by difference between start and now. The - * timer must be running in order to call stop + * @return A convenience function to obtain the current time in nanoSeconds from this timer + */ + public long currentTimeNano() { + return System.nanoTime(); + } + + /** + * Stops the timer. Increases the elapsedTimeNano time by difference between start and now. + * + * It's ok to call stop on a timer that's not running. It has no effect on the timer. * * @return this object, for programming convenience */ + @Requires("startTimeNano != 0l") public synchronized SimpleTimer stop() { - running = false; - elapsed += currentTime() - startTime; + if ( running ) { + running = false; + elapsedTimeNano += currentTimeNano() - startTimeNano; + } return this; } /** - * Returns the total elapsed time of all start/stops of this timer. If the timer is currently + * Returns the total elapsedTimeNano time of all start/stops of this timer. If the timer is currently * running, includes the difference from currentTime() and the start as well * * @return this time, in seconds */ public synchronized double getElapsedTime() { - return (running ? 
(currentTime() - startTime + elapsed) : elapsed) / 1000.0; + return nanoToSecondsAsDouble(getElapsedTimeNano()); + } + + protected static double nanoToSecondsAsDouble(final long nano) { + return nano * NANO_TO_SECOND_DOUBLE; + } + + /** + * @see #getElapsedTime() but returns the result in nanoseconds + * + * @return the elapsed time in nanoseconds + */ + public synchronized long getElapsedTimeNano() { + return running ? (currentTimeNano() - startTimeNano + elapsedTimeNano) : elapsedTimeNano; } } diff --git a/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java b/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java index 9f1b6db93..82fb6b8d6 100644 --- a/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java +++ b/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java @@ -27,6 +27,8 @@ package org.broadinstitute.sting.utils.classloader; import ch.qos.logback.classic.Level; import ch.qos.logback.classic.Logger; +import org.broadinstitute.sting.gatk.WalkerManager; +import org.broadinstitute.sting.gatk.filters.FilterManager; import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -276,8 +278,16 @@ public class PluginManager { */ public PluginType createByName(String pluginName) { Class plugin = pluginsByName.get(pluginName); - if( plugin == null ) - throw new UserException(formatErrorMessage(pluginCategory,pluginName)); + if( plugin == null ) { + String errorMessage = formatErrorMessage(pluginCategory,pluginName); + if ( this.getClass().isAssignableFrom(FilterManager.class) ) { + throw new UserException.MalformedReadFilterException(errorMessage); + } else if ( this.getClass().isAssignableFrom(WalkerManager.class) ) { + throw new UserException.MalformedWalkerArgumentsException(errorMessage); + } else { + throw 
new UserException.CommandLineException(errorMessage); + } + } try { return plugin.newInstance(); } catch (Exception e) { diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index 3130469e5..faafc611a 100755 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -63,6 +63,18 @@ public class UserException extends ReviewedStingException { } } + public static class MalformedReadFilterException extends CommandLineException { + public MalformedReadFilterException(String message) { + super(String.format("Malformed read filter: %s",message)); + } + } + + public static class MalformedWalkerArgumentsException extends CommandLineException { + public MalformedWalkerArgumentsException(String message) { + super(String.format("Malformed walker argument: %s",message)); + } + } + public static class MalformedGenomeLoc extends UserException { public MalformedGenomeLoc(String message, GenomeLoc loc) { super(String.format("Badly formed genome loc: %s: %s", message, loc)); @@ -129,6 +141,12 @@ public class UserException extends ReviewedStingException { } } + public static class NoSpaceOnDevice extends UserException { + public NoSpaceOnDevice() { + super("There is no space left on the device, so writing failed"); + } + } + public static class CouldNotReadInputFile extends UserException { public CouldNotReadInputFile(String message, Exception e) { super(String.format("Couldn't read file because %s caused by %s", message, getMessage(e))); diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/BlockingQueueValue.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/BlockingQueueValue.java new file mode 100644 index 000000000..2daa6c9eb --- /dev/null +++ 
b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/BlockingQueueValue.java @@ -0,0 +1,82 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +import com.google.java.contract.Invariant; + +/** + * Wrapper to hold data for a blocking queue, distinguishing an EOF marker from a real object + * + * The only way to tell in a consumer thread that a blocking queue has no more data ever + * coming down the pipe is to pass in a "poison" or EOF object. This class provides + * a generic capacity for that... + * + * The use case looks like this: + * + * BlockingQueue q + * producer: + * while ( x has items ) + * q.put(new BlockingQueueValue(x)) + * q.put(new BlockingQueueValue()) + * + * Consumer: + * while ( true ) + * value = q.take() + * if ( value.isLast() ) + * break + * else + * do something useful with value + * + * + * User: depristo + * Date: 9/6/12 + * Time: 3:08 PM + */ +@Invariant("! isLast || value == null") +class BlockingQueueValue { + /** + * True if this is the EOF marker object + */ + final private boolean isLast; + + /** + * Our value, if we aren't the EOF marker + */ + final private T value; + + /** + * Create a new BlockingQueueValue containing a real value, where last is false + * @param value + */ + BlockingQueueValue(final T value) { + isLast = false; + this.value = value; + } + + /** + * Create a new BlockingQueueValue that is the last item + */ + BlockingQueueValue() { + isLast = true; + this.value = null; + } + + /** + * Is this the EOF marker? 
+ * + * @return true if so, else false + */ + public boolean isLast() { + return isLast; + } + + /** + * Get the value held by this BlockingQueueValue + * + * @return the value + * @throws IllegalStateException if this is the last item + */ + public T getValue() { + if ( isLast() ) + throw new IllegalStateException("Cannot get value for last object"); + return value; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/FutureValue.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/FutureValue.java new file mode 100644 index 000000000..9508a15aa --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/FutureValue.java @@ -0,0 +1,45 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; + +/** + * Create a future that simply returns a given value + * + * The only standard way to create a future in java is via the ExecutorService interface. + * If you have a data structure holding futures of value T, and you want to add a + * value to it for some reason (to add a EOF marker, for instance) you can use this + * class to create a dummy Future that simply returns a value. 
+ * + * @author depristo + * @since 09/12 + */ +class FutureValue implements Future { + final V value; + + FutureValue(final V value) { + this.value = value; + } + + @Override public boolean cancel(boolean mayInterruptIfRunning) { + return true; + } + + @Override public boolean isCancelled() { + return false; + } + + @Override public boolean isDone() { + return true; + } + + @Override public V get() throws InterruptedException, ExecutionException { + return value; + } + + @Override public V get(long timeout, TimeUnit unit) throws InterruptedException, ExecutionException, TimeoutException { + return get(); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java new file mode 100644 index 000000000..29dddbc49 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java @@ -0,0 +1,62 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +import org.broadinstitute.sting.utils.SimpleTimer; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.Iterator; +import java.util.concurrent.BlockingQueue; + +/** + * Producer Thread that reads input values from an inputReads and puts them into a BlockingQueue + */ +class InputProducer implements Runnable { + /** + * The iterator we are using to get data from + */ + final Iterator inputReader; + + /** + * Our timer (may be null) that we use to track our input costs + */ + final SimpleTimer inputTimer; + + /** + * Where we put our input values for consumption + */ + final BlockingQueue outputQueue; + + public InputProducer(final Iterator inputReader, + final SimpleTimer inputTimer, + final BlockingQueue outputQueue) { + if ( inputReader == null ) throw new IllegalArgumentException("inputReader cannot be null"); + if ( outputQueue == null ) throw new IllegalArgumentException("OutputQueue cannot be null"); + + this.inputReader 
= inputReader; + this.inputTimer = inputTimer; + this.outputQueue = outputQueue; + } + + public void run() { + try { + while ( inputReader.hasNext() ) { + if ( inputTimer != null ) inputTimer.restart(); + final InputType input = inputReader.next(); + if ( inputTimer != null ) inputTimer.stop(); + outputQueue.put(new InputValue(input)); + } + + // add the EOF object so our consumer knows we are done in all inputs + outputQueue.put(new InputValue()); + } catch (InterruptedException ex) { + throw new ReviewedStingException("got execution exception", ex); + } + } + + /** + * Helper class that contains a read value suitable for EOF marking in a BlockingQueue + */ + class InputValue extends BlockingQueueValue { + private InputValue(InputType datum) { super(datum); } + private InputValue() { } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java new file mode 100644 index 000000000..3cc6fa786 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java @@ -0,0 +1,36 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +/** + * Holds the results of a map job suitable for producer/consumer threading + * via a BlockingQueue + */ +class MapResult extends BlockingQueueValue { + final int jobID; + + /** + * Create a new MapResult with value datum and job ID jobID + * + * @param datum the value produced by the map job + * @param jobID the id of the map job (for correctness testing) + */ + MapResult(final MapType datum, final int jobID) { + super(datum); + this.jobID = jobID; + if ( jobID < 0 ) throw new IllegalArgumentException("JobID must be >= 0"); + } + + /** + * Create the EOF marker version of MapResult + */ + MapResult() { + super(); + this.jobID = Integer.MAX_VALUE; + } + + /** + * @return the job ID of the map job that produced this MapResult + */ + public int getJobID() { + return jobID; + } +} diff --git
a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapFunction.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSMapFunction.java similarity index 87% rename from public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapFunction.java rename to public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSMapFunction.java index 440c263b7..cc5335051 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapFunction.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSMapFunction.java @@ -9,7 +9,7 @@ package org.broadinstitute.sting.utils.nanoScheduler; * Date: 8/24/12 * Time: 9:49 AM */ -public interface MapFunction { +public interface NSMapFunction { /** * Return function on input, returning a value of ResultType * @param input diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSProgressFunction.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSProgressFunction.java new file mode 100644 index 000000000..8b12c62c4 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSProgressFunction.java @@ -0,0 +1,12 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +/** + * Created with IntelliJ IDEA. + * User: depristo + * Date: 9/4/12 + * Time: 2:10 PM + * To change this template use File | Settings | File Templates. 
+ */ +public interface NSProgressFunction { + public void progress(final InputType lastMapInput); +} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReduceFunction.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSReduceFunction.java similarity index 89% rename from public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReduceFunction.java rename to public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSReduceFunction.java index 8f1b0eddd..879a33a1d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReduceFunction.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSReduceFunction.java @@ -7,7 +7,7 @@ package org.broadinstitute.sting.utils.nanoScheduler; * Date: 8/24/12 * Time: 9:49 AM */ -public interface ReduceFunction { +public interface NSReduceFunction { /** * Combine one with sum into a new ReduceType * @param one the result of a map call on an input element diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index 668c82524..664fb7b9b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -3,13 +3,13 @@ package org.broadinstitute.sting.utils.nanoScheduler; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.AutoFormattingTime; +import org.broadinstitute.sting.utils.SimpleTimer; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.threading.NamedThreadFactory; import java.util.Iterator; -import java.util.LinkedList; import java.util.List; -import java.util.Queue; import java.util.concurrent.*; /** @@ -17,12 
+17,12 @@ import java.util.concurrent.*; * * The overall framework works like this * - * nano <- new Nanoschedule(bufferSize, numberOfMapElementsToProcessTogether, nThreads) + * nano <- new Nanoschedule(inputBufferSize, numberOfMapElementsToProcessTogether, nThreads) * List[Input] outerData : outerDataLoop ) * result = nano.execute(outerData.iterator(), map, reduce) * - * bufferSize determines how many elements from the input stream are read in one go by the - * nanoscheduler. The scheduler may hold up to bufferSize in memory at one time, as well + * inputBufferSize determines how many elements from the input stream are read in one go by the + * nanoscheduler. The scheduler may hold up to inputBufferSize in memory at one time, as well * as up to inputBufferSize map results as well. * * numberOfMapElementsToProcessTogether determines how many input elements are processed @@ -45,42 +45,54 @@ import java.util.concurrent.*; public class NanoScheduler { private final static Logger logger = Logger.getLogger(NanoScheduler.class); private final static boolean ALLOW_SINGLE_THREAD_FASTPATH = true; + private final static boolean LOG_MAP_TIMES = false; + private final static boolean TIME_CALLS = true; - final int bufferSize; - final int mapGroupSize; + private final static int MAP_BUFFER_SIZE_SCALE_FACTOR = 100; + + final int inputBufferSize; + final int mapBufferSize; final int nThreads; - final ExecutorService executor; + final ExecutorService inputExecutor; + final ExecutorService reduceExecutor; + final ThreadPoolExecutor mapExecutor; + boolean shutdown = false; boolean debug = false; + private NSProgressFunction progressFunction = null; + + final SimpleTimer outsideSchedulerTimer = TIME_CALLS ? new SimpleTimer("outside") : null; + final SimpleTimer inputTimer = TIME_CALLS ? new SimpleTimer("input") : null; + final SimpleTimer mapTimer = TIME_CALLS ? new SimpleTimer("map") : null; + final SimpleTimer reduceTimer = TIME_CALLS ? 
new SimpleTimer("reduce") : null; /** - * Create a new nanoschedule with the desire characteristics requested by the argument + * Create a new nanoscheduler with the desired characteristics requested by the arguments * - * @param bufferSize the number of input elements to read in each scheduling cycle. - * @param mapGroupSize How many inputs should be grouped together per map? If -1 we make a reasonable guess - * @param nThreads the number of threads to use to get work done, in addition to the thread calling execute + * @param inputBufferSize the number of input elements to read in each scheduling cycle. + * @param nThreads the number of threads to use to get work done, in addition to the + * thread calling execute */ - public NanoScheduler(final int bufferSize, - final int mapGroupSize, - final int nThreads) { - if ( bufferSize < 1 ) throw new IllegalArgumentException("bufferSize must be >= 1, got " + bufferSize); + public NanoScheduler(final int inputBufferSize, final int nThreads) { + if ( inputBufferSize < 1 ) throw new IllegalArgumentException("inputBufferSize must be >= 1, got " + inputBufferSize); if ( nThreads < 1 ) throw new IllegalArgumentException("nThreads must be >= 1, got " + nThreads); - if ( mapGroupSize > bufferSize ) throw new IllegalArgumentException("mapGroupSize " + mapGroupSize + " must be <= bufferSize " + bufferSize); - if ( mapGroupSize == 0 || mapGroupSize < -1 ) throw new IllegalArgumentException("mapGroupSize cannot be <= 0" + mapGroupSize); - - this.bufferSize = bufferSize; + this.inputBufferSize = inputBufferSize; + this.mapBufferSize = inputBufferSize * MAP_BUFFER_SIZE_SCALE_FACTOR; this.nThreads = nThreads; - if ( mapGroupSize == -1 ) { - this.mapGroupSize = (int)Math.ceil(this.bufferSize / (10.0*this.nThreads)); - logger.info(String.format("Dynamically setting grouping size to %d based on buffer size %d and n threads %d", - this.mapGroupSize, this.bufferSize, this.nThreads)); + if ( nThreads == 1 ) { + this.mapExecutor = null; +
this.inputExecutor = this.reduceExecutor = null; } else { - this.mapGroupSize = mapGroupSize; + this.mapExecutor = (ThreadPoolExecutor)Executors.newFixedThreadPool(nThreads-1, new NamedThreadFactory("NS-map-thread-%d")); + this.mapExecutor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy()); + this.inputExecutor = Executors.newSingleThreadExecutor(new NamedThreadFactory("NS-input-thread-%d")); + this.reduceExecutor = Executors.newSingleThreadExecutor(new NamedThreadFactory("NS-reduce-thread-%d")); } - this.executor = nThreads == 1 ? null : Executors.newFixedThreadPool(nThreads); + // start timing the time spent outside of the nanoScheduler + outsideSchedulerTimer.start(); } /** @@ -97,17 +109,8 @@ public class NanoScheduler { * @return */ @Ensures("result > 0") - public int getBufferSize() { - return bufferSize; - } - - /** - * The grouping size used by this NanoScheduler - * @return - */ - @Ensures("result > 0") - public int getMapGroupSize() { - return mapGroupSize; + public int getInputBufferSize() { + return inputBufferSize; } /** @@ -116,12 +119,54 @@ public class NanoScheduler { * After this call, execute cannot be invoked without throwing an error */ public void shutdown() { - if ( executor != null ) { - final List remaining = executor.shutdownNow(); - if ( ! 
remaining.isEmpty() ) - throw new IllegalStateException("Remaining tasks found in the executor, unexpected behavior!"); + outsideSchedulerTimer.stop(); + + if ( nThreads > 1 ) { + shutdownExecutor("inputExecutor", inputExecutor); + shutdownExecutor("mapExecutor", mapExecutor); + shutdownExecutor("reduceExecutor", reduceExecutor); } shutdown = true; + + if (TIME_CALLS) { + printTimerInfo("Input time", inputTimer); + printTimerInfo("Map time", mapTimer); + printTimerInfo("Reduce time", reduceTimer); + printTimerInfo("Outside time", outsideSchedulerTimer); + } + } + + /** + * Helper function to cleanly shutdown an execution service, checking that the execution + * state is clean when it's done. + * + * @param name a string name for error messages for the executorService we are shutting down + * @param executorService the executorService to shut down + */ + @Requires({"name != null", "executorService != null"}) + @Ensures("executorService.isShutdown()") + private void shutdownExecutor(final String name, final ExecutorService executorService) { + if ( executorService.isShutdown() || executorService.isTerminated() ) + throw new IllegalStateException("Executor service " + name + " is already shut down!"); + + final List remaining = executorService.shutdownNow(); + if ( ! remaining.isEmpty() ) + throw new IllegalStateException(remaining.size() + " remaining tasks found in an executor " + name + ", unexpected behavior!"); + } + + /** + * Print to logger.info timing information from timer, with name label + * + * @param label the name of the timer to display. 
Should be human readable + * @param timer the timer whose elapsed time we will display + */ + @Requires({"label != null", "timer != null"}) + private void printTimerInfo(final String label, final SimpleTimer timer) { + final double total = inputTimer.getElapsedTime() + mapTimer.getElapsedTime() + + reduceTimer.getElapsedTime() + outsideSchedulerTimer.getElapsedTime(); + final double myTimeInSec = timer.getElapsedTime(); + final double myTimePercent = myTimeInSec / total * 100; + logger.info(String.format("%s: %s (%5.2f%%)", label, new AutoFormattingTime(myTimeInSec), myTimePercent)); } /** @@ -131,20 +176,45 @@ public class NanoScheduler { return shutdown; } + /** + * @return are we displaying verbose debugging information about the scheduling? + */ public boolean isDebug() { return debug; } + /** + * Helper function to display a String.formatted message if we are doing verbose debugging + * + * @param format the format argument suitable for String.format + * @param args the arguments for String.format + */ + @Requires("format != null") private void debugPrint(final String format, Object ... 
args) { if ( isDebug() ) logger.info("Thread " + Thread.currentThread().getId() + ":" + String.format(format, args)); } - + /** + * Turn on/off verbose debugging + * + * @param debug true if we want verbose debugging + */ public void setDebug(boolean debug) { this.debug = debug; } + /** + * Set the progress callback function to progressFunction + * + * The progress callback is invoked after each buffer size elements have been processed by map/reduce + * + * @param progressFunction a progress function to call, or null if you don't want any progress callback + */ + public void setProgressFunction(final NSProgressFunction progressFunction) { + this.progressFunction = progressFunction; + } + /** * Execute a map/reduce job with this nanoScheduler * @@ -159,41 +229,73 @@ public class NanoScheduler { * It is safe to call this function repeatedly on a single nanoScheduler, at least until the * shutdown method is called. * - * @param inputReader - * @param map - * @param reduce - * @return + * Note that this function goes through a single threaded fast path if the number of threads + * is 1. 
+ * + * @param inputReader an iterator providing us with the input data to nanoSchedule map/reduce over + * @param map the map function from input type -> map type, will be applied in parallel to each input + * @param reduce the reduce function from map type + reduce type -> reduce type to be applied in order to map results + * @return the last reduce value */ public ReduceType execute(final Iterator inputReader, - final MapFunction map, + final NSMapFunction map, final ReduceType initialValue, - final ReduceFunction reduce) { + final NSReduceFunction reduce) { if ( isShutdown() ) throw new IllegalStateException("execute called on already shutdown NanoScheduler"); if ( inputReader == null ) throw new IllegalArgumentException("inputReader cannot be null"); if ( map == null ) throw new IllegalArgumentException("map function cannot be null"); if ( reduce == null ) throw new IllegalArgumentException("reduce function cannot be null"); + outsideSchedulerTimer.stop(); + + ReduceType result; if ( ALLOW_SINGLE_THREAD_FASTPATH && getnThreads() == 1 ) { - return executeSingleThreaded(inputReader, map, initialValue, reduce); + result = executeSingleThreaded(inputReader, map, initialValue, reduce); } else { - return executeMultiThreaded(inputReader, map, initialValue, reduce); + result = executeMultiThreaded(inputReader, map, initialValue, reduce); } + + outsideSchedulerTimer.restart(); + return result; } /** - * Simple efficient reference implementation for single threaded execution + * Simple efficient reference implementation for single threaded execution. 
+ * * @return the reduce result of this map/reduce job */ + @Requires({"inputReader != null", "map != null", "reduce != null"}) private ReduceType executeSingleThreaded(final Iterator inputReader, - final MapFunction map, + final NSMapFunction map, final ReduceType initialValue, - final ReduceFunction reduce) { + final NSReduceFunction reduce) { ReduceType sum = initialValue; + int i = 0; + + // start timer to ensure that both hasNext and next are caught by the timer + if ( TIME_CALLS ) inputTimer.restart(); while ( inputReader.hasNext() ) { final InputType input = inputReader.next(); + if ( TIME_CALLS ) inputTimer.stop(); + + // map + if ( TIME_CALLS ) mapTimer.restart(); + final long preMapTime = LOG_MAP_TIMES ? 0 : mapTimer.currentTimeNano(); final MapType mapValue = map.apply(input); + if ( LOG_MAP_TIMES ) logger.info("MAP TIME " + (mapTimer.currentTimeNano() - preMapTime)); + if ( TIME_CALLS ) mapTimer.stop(); + + if ( i++ % inputBufferSize == 0 && progressFunction != null ) + progressFunction.progress(input); + + // reduce + if ( TIME_CALLS ) reduceTimer.restart(); sum = reduce.apply(mapValue, sum); + if ( TIME_CALLS ) reduceTimer.stop(); + + if ( TIME_CALLS ) inputTimer.restart(); } + return sum; } @@ -202,101 +304,89 @@ public class NanoScheduler { * * @return the reduce result of this map/reduce job */ + @Requires({"inputReader != null", "map != null", "reduce != null"}) private ReduceType executeMultiThreaded(final Iterator inputReader, - final MapFunction map, + final NSMapFunction map, final ReduceType initialValue, - final ReduceFunction reduce) { + final NSReduceFunction reduce) { debugPrint("Executing nanoScheduler"); - ReduceType sum = initialValue; - while ( inputReader.hasNext() ) { - try { - // read in our input values - final List inputs = readInputs(inputReader); - // send jobs for map - final Queue>> mapQueue = submitMapJobs(map, executor, inputs); + // a blocking queue that limits the number of input datum to the requested buffer size + final 
BlockingQueue.InputValue> inputQueue + = new LinkedBlockingDeque.InputValue>(inputBufferSize); - // send off the reduce job, and block until we get at least one reduce result - sum = reduceParallel(reduce, mapQueue, sum); - } catch (InterruptedException ex) { - throw new ReviewedStingException("got execution exception", ex); - } catch (ExecutionException ex) { - throw new ReviewedStingException("got execution exception", ex); + // a priority queue that stores up to mapBufferSize elements + // produced by completed map jobs. + final BlockingQueue>> mapResultQueue = + new LinkedBlockingDeque>>(mapBufferSize); + + // Start running the input reader thread + inputExecutor.submit(new InputProducer(inputReader, inputTimer, inputQueue)); + + // Start running the reducer thread + final ReducerThread reducer + = new ReducerThread(reduce, reduceTimer, initialValue, mapResultQueue); + final Future reduceResult = reduceExecutor.submit(reducer); + + try { + int numJobs = 0; + + while ( true ) { + // block on input + final InputProducer.InputValue inputEnqueueWrapped = inputQueue.take(); + + if ( ! 
inputEnqueueWrapped.isLast() ) { + // get the object itself + final InputType input = inputEnqueueWrapped.getValue(); + + // the next map call has jobID + 1 + numJobs++; + + // send job for map via the completion service + final CallableMap doMap = new CallableMap(map, numJobs, input); + final Future> mapJob = mapExecutor.submit(doMap); + mapResultQueue.put(mapJob); + + debugPrint(" Done with cycle of map/reduce"); + + if ( numJobs % inputBufferSize == 0 && progressFunction != null ) + progressFunction.progress(input); + } else { + mapResultQueue.put(new FutureValue>(new MapResult())); + return reduceResult.get(); // wait for our result of reduce + } } + } catch (InterruptedException ex) { + throw new ReviewedStingException("got execution exception", ex); + } catch (ExecutionException ex) { + throw new ReviewedStingException("got execution exception", ex); } - - return sum; - } - - @Requires({"reduce != null", "! mapQueue.isEmpty()"}) - private ReduceType reduceParallel(final ReduceFunction reduce, - final Queue>> mapQueue, - final ReduceType initSum) - throws InterruptedException, ExecutionException { - ReduceType sum = initSum; - - // while mapQueue has something in it to reduce - for ( final Future> future : mapQueue ) { - for ( final MapType value : future.get() ) // block until we get the values for this task - sum = reduce.apply(value, sum); - } - - return sum; - } - - /** - * Read up to inputBufferSize elements from inputReader - * - * @return a queue of inputs read in, containing one or more values of InputType read in - */ - @Requires("inputReader.hasNext()") - @Ensures("!result.isEmpty()") - private List readInputs(final Iterator inputReader) { - int n = 0; - final List inputs = new LinkedList(); - while ( inputReader.hasNext() && n < getBufferSize() ) { - final InputType input = inputReader.next(); - inputs.add(input); - n++; - } - return inputs; - } - - @Requires({"map != null", "! 
inputs.isEmpty()"}) - private Queue>> submitMapJobs(final MapFunction map, - final ExecutorService executor, - final List inputs) { - final Queue>> mapQueue = new LinkedList>>(); - - for ( final List subinputs : Utils.groupList(inputs, getMapGroupSize()) ) { - final CallableMap doMap = new CallableMap(map, subinputs); - final Future> future = executor.submit(doMap); - mapQueue.add(future); - } - - return mapQueue; } /** * A simple callable version of the map function for use with the executor pool */ - private class CallableMap implements Callable> { - final List inputs; - final MapFunction map; + private class CallableMap implements Callable> { + final int id; + final InputType input; + final NSMapFunction map; - @Requires({"map != null", "inputs.size() <= getMapGroupSize()"}) - private CallableMap(final MapFunction map, final List inputs) { - this.inputs = inputs; + @Requires({"map != null"}) + private CallableMap(final NSMapFunction map, + final int id, + final InputType input) { + this.id = id; + this.input = input; this.map = map; } - @Ensures("result.size() == inputs.size()") - @Override public List call() throws Exception { - final List outputs = new LinkedList(); - for ( final InputType input : inputs ) - outputs.add(map.apply(input)); - debugPrint(" Processed %d elements with map", outputs.size()); - return outputs; + @Override + public MapResult call() { + if ( TIME_CALLS ) mapTimer.restart(); + if ( debug ) debugPrint("\t\tmap " + input); + final MapType result = map.apply(input); + if ( TIME_CALLS ) mapTimer.stop(); + return new MapResult(result, id); } } } diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReducerThread.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReducerThread.java new file mode 100644 index 000000000..506e45453 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReducerThread.java @@ -0,0 +1,65 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + 
+import org.broadinstitute.sting.utils.SimpleTimer; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Future; + +/** + * Thread that runs the reduce of the map/reduce. + * + * This thread reads from mapResultsQueue until the poison EOF object arrives. At each + * stage is calls reduce(value, sum). The blocking mapResultQueue ensures that the + * queue waits until the mapResultQueue has a value to take. Then, it gets and waits + * until the map result Future has a value. + */ +class ReducerThread implements Callable { + final NSReduceFunction reduce; + final SimpleTimer reduceTimer; + final BlockingQueue>> mapResultQueue; + + ReduceType sum; + int lastJobID = -1; + + public ReducerThread(final NSReduceFunction reduce, + final SimpleTimer reduceTimer, + final ReduceType sum, + final BlockingQueue>> mapResultQueue) { + if ( reduce == null ) throw new IllegalArgumentException("Reduce function cannot be null"); + if ( mapResultQueue == null ) throw new IllegalArgumentException("mapResultQueue cannot be null"); + + this.reduce = reduce; + this.reduceTimer = reduceTimer; + this.sum = sum; + this.mapResultQueue = mapResultQueue; + } + + public ReduceType call() { + try { + while ( true ) { + final MapResult result = mapResultQueue.take().get(); + if ( result.isLast() ) { + // we are done, just return sum + return sum; + } + else if ( result.getJobID() < lastJobID ) { + // make sure the map results are coming in order + throw new IllegalStateException("BUG: last jobID " + lastJobID + " > current jobID " + result.getJobID()); + } else { + lastJobID = result.getJobID(); + // apply reduce, keeping track of sum + if ( reduceTimer != null ) reduceTimer.restart(); + sum = reduce.apply(result.getValue(), sum); + if ( reduceTimer != null ) reduceTimer.stop(); + } + } + } catch (ExecutionException 
ex) { + throw new ReviewedStingException("got execution exception", ex); + } catch (InterruptedException ex) { + throw new ReviewedStingException("got execution exception", ex); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java index 3d986f666..ed6fc46bb 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java @@ -613,6 +613,8 @@ public abstract class AbstractReadBackedPileup { + + private Collection perSampleArtificialReadStreams; + private MergingSamRecordIterator mergingIterator; + + public ArtificialMultiSampleReadStream( Collection perSampleArtificialReadStreams ) { + if ( perSampleArtificialReadStreams == null || perSampleArtificialReadStreams.isEmpty() ) { + throw new ReviewedStingException("Can't create an ArtificialMultiSampleReadStream out of 0 ArtificialSingleSampleReadStreams"); + } + + this.perSampleArtificialReadStreams = perSampleArtificialReadStreams; + } + + public Iterator iterator() { + // lazy initialization to prevent reads from being created until they're needed + initialize(); + + return mergingIterator; + } + + public StingSAMIterator getStingSAMIterator() { + // lazy initialization to prevent reads from being created until they're needed + initialize(); + + return StingSAMIteratorAdapter.adapt(mergingIterator); + } + + private void initialize() { + Collection perSampleSAMReaders = new ArrayList(perSampleArtificialReadStreams.size()); + Collection headers = new ArrayList(perSampleArtificialReadStreams.size()); + + for ( ArtificialSingleSampleReadStream readStream : perSampleArtificialReadStreams ) { + Collection thisStreamReads = readStream.makeReads(); + + SAMFileReader reader = new ArtificialSAMFileReader(readStream.getHeader(), + thisStreamReads.toArray(new 
SAMRecord[thisStreamReads.size()])); + perSampleSAMReaders.add(reader); + headers.add(reader.getFileHeader()); + } + + SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate, headers, true); + mergingIterator = new MergingSamRecordIterator(headerMerger, perSampleSAMReaders, true); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMFileReader.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMFileReader.java index adf60b16b..0b5fa391d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMFileReader.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMFileReader.java @@ -40,8 +40,11 @@ public class ArtificialSAMFileReader extends SAMFileReader { */ private final List reads; + private SAMFileHeader customHeader = null; + /** * Construct an artificial SAM file reader. + * @param sequenceDictionary sequence dictionary used to initialize our GenomeLocParser * @param reads Reads to use as backing data source. */ public ArtificialSAMFileReader(SAMSequenceDictionary sequenceDictionary,SAMRecord... reads) { @@ -50,6 +53,30 @@ public class ArtificialSAMFileReader extends SAMFileReader { this.reads = Arrays.asList(reads); } + /** + * Construct an artificial SAM file reader with the given SAM file header + * + * @param customHeader Header that should be returned by calls to getFileHeader() on this reader + * @param reads Reads to use as backing data source. + */ + public ArtificialSAMFileReader( SAMFileHeader customHeader, SAMRecord... 
reads ) { + super(createEmptyInputStream(),true); + + this.customHeader = customHeader; + this.genomeLocParser = new GenomeLocParser(customHeader.getSequenceDictionary()); + this.reads = Arrays.asList(reads); + } + + + @Override + public SAMFileHeader getFileHeader() { + if ( customHeader != null ) { + return customHeader; + } + + return super.getFileHeader(); + } + /** * @{inheritDoc} */ diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java index d0211db07..0859957a3 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java @@ -276,6 +276,30 @@ public class ArtificialSAMUtils { return Arrays.asList(left, right); } + /** + * Create a collection of identical artificial reads based on the parameters. The cigar string for each + * read will be *M, where * is the length of the read. + * + * Useful for testing things like positional downsampling where you care only about the position and + * number of reads, and not the other attributes. + * + * @param stackSize number of identical reads to create + * @param header the SAM header to associate each read with + * @param name name associated with each read + * @param refIndex the reference index, i.e. 
what chromosome to associate them with + * @param alignmentStart where to start each alignment + * @param length the length of each read + * + * @return a collection of stackSize reads all sharing the above properties + */ + public static Collection createStackOfIdenticalArtificialReads( int stackSize, SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) { + Collection stack = new ArrayList(stackSize); + for ( int i = 1; i <= stackSize; i++ ) { + stack.add(createArtificialRead(header, name, refIndex, alignmentStart, length)); + } + return stack; + } + /** * create an iterator containing the specified read piles * diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStream.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStream.java new file mode 100644 index 000000000..a9480692b --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStream.java @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Iterator; + +/** + * An artificial stream of reads from a single read group/sample with configurable characteristics + * such as: + * + * -the number of contigs that the reads should be distributed across + * -number of "stacks" of reads sharing the same alignment start position per contig + * -the min/max number of reads in each stack (exact values chosen randomly from this range) + * -the min/max distance between stack start positions (exact values chosen randomly from this range) + * -the min/max length of each read (exact values chosen randomly from this range) + * -the number of unmapped reads + * + * The cigar string for all reads will be *M, where * is the length of the read. 
+ * + * @author David Roazen + */ +public class ArtificialSingleSampleReadStream implements Iterable { + private SAMFileHeader header; + private String readGroupID; + private int numContigs; + private int numStacksPerContig; + private int minReadsPerStack; + private int maxReadsPerStack; + private int minDistanceBetweenStacks; + private int maxDistanceBetweenStacks; + private int minReadLength; + private int maxReadLength; + private int numUnmappedReads; + + private static final String READ_GROUP_TAG = "RG"; + + public ArtificialSingleSampleReadStream( SAMFileHeader header, + String readGroupID, + int numContigs, + int numStacksPerContig, + int minReadsPerStack, + int maxReadsPerStack, + int minDistanceBetweenStacks, + int maxDistanceBetweenStacks, + int minReadLength, + int maxReadLength, + int numUnmappedReads ) { + this.header = header; + this.readGroupID = readGroupID; + this.numContigs = numContigs; + this.numStacksPerContig = numStacksPerContig; + this.minReadsPerStack = minReadsPerStack; + this.maxReadsPerStack = maxReadsPerStack; + this.minDistanceBetweenStacks = minDistanceBetweenStacks; + this.maxDistanceBetweenStacks = maxDistanceBetweenStacks; + this.minReadLength = minReadLength; + this.maxReadLength = maxReadLength; + this.numUnmappedReads = numUnmappedReads; + + validateStreamParameters(); + } + + private void validateStreamParameters() { + if ( header == null || readGroupID == null ) { + throw new ReviewedStingException("null SAMFileHeader or read group ID") ; + } + + if ( header.getReadGroup(readGroupID) == null ) { + throw new ReviewedStingException(String.format("Read group %s not found in SAMFileHeader", readGroupID)); + } + + if ( numContigs < 0 || numStacksPerContig < 0 || minReadsPerStack < 0 || maxReadsPerStack < 0 || + minDistanceBetweenStacks < 0 || maxDistanceBetweenStacks < 0 || minReadLength < 0 || maxReadLength < 0 || + numUnmappedReads < 0 ) { + throw new ReviewedStingException("Read stream parameters must be >= 0"); + } + + if ( 
(numContigs == 0 && numStacksPerContig != 0) || (numContigs != 0 && numStacksPerContig == 0) ) { + throw new ReviewedStingException("numContigs and numStacksPerContig must either both be > 0, or both be 0"); + } + + if ( minReadsPerStack > maxReadsPerStack ) { + throw new ReviewedStingException("minReadsPerStack > maxReadsPerStack"); + } + + if ( minDistanceBetweenStacks > maxDistanceBetweenStacks ) { + throw new ReviewedStingException("minDistanceBetweenStacks > maxDistanceBetweenStacks"); + } + + if ( minReadLength > maxReadLength ) { + throw new ReviewedStingException("minReadLength > maxReadLength"); + } + } + + public Iterator iterator() { + return makeReads().iterator(); + } + + public StingSAMIterator getStingSAMIterator() { + return StingSAMIteratorAdapter.adapt(iterator()); + } + + public Collection makeReads() { + Collection reads = new ArrayList(numContigs * numStacksPerContig * maxReadsPerStack); + + for ( int contig = 0; contig < numContigs; contig++ ) { + int alignmentStart = 1; + + for ( int stack = 0; stack < numStacksPerContig; stack++ ) { + reads.addAll(makeReadStack(contig, alignmentStart, MathUtils.randomIntegerInRange(minReadsPerStack, maxReadsPerStack))); + alignmentStart += MathUtils.randomIntegerInRange(minDistanceBetweenStacks, maxDistanceBetweenStacks); + } + } + + if ( numUnmappedReads > 0 ) { + reads.addAll(makeReadStack(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, SAMRecord.NO_ALIGNMENT_START, numUnmappedReads)); + } + + return reads; + } + + private Collection makeReadStack( int contig, int alignmentStart, int stackSize ) { + Collection readStack = new ArrayList(stackSize); + + for ( int i = 0; i < stackSize; i++ ) { + SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, + "foo", + contig, + alignmentStart, + MathUtils.randomIntegerInRange(minReadLength, maxReadLength)); + read.setAttribute(READ_GROUP_TAG, readGroupID); + readStack.add(read); + } + + return readStack; + } + + public SAMFileHeader getHeader() { + return header; + 
} + + public String getReadGroupID() { + return readGroupID; + } + + public int getNumContigs() { + return numContigs; + } + + public int getNumStacksPerContig() { + return numStacksPerContig; + } + + public int getMinReadsPerStack() { + return minReadsPerStack; + } + + public int getMaxReadsPerStack() { + return maxReadsPerStack; + } + + public int getMinDistanceBetweenStacks() { + return minDistanceBetweenStacks; + } + + public int getMaxDistanceBetweenStacks() { + return maxDistanceBetweenStacks; + } + + public int getMinReadLength() { + return minReadLength; + } + + public int getMaxReadLength() { + return maxReadLength; + } + + public int getNumUnmappedReads() { + return numUnmappedReads; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamAnalyzer.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamAnalyzer.java new file mode 100644 index 000000000..a4d7c5146 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamAnalyzer.java @@ -0,0 +1,281 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.ArrayList; +import java.util.List; + +/** + * A class for analyzing and validating the read stream produced by an ArtificialSingleSampleReadStream. + * + * Collects various statistics about the stream of reads it's fed, and validates the stream + * by checking whether the collected statistics match the nominal properties of the stream. + * + * Subclasses are expected to override the validate() method in order to check whether an artificial + * read stream has been *transformed* in some way (eg., by downsampling or some other process), rather + * than merely checking whether the stream matches its original properties. 
+ * + * Usage is simple: + * + * ArtificialSingleSampleReadStreamAnalyzer analyzer = new ArtificialSingleSampleReadStreamAnalyzer(originalStream); + * analyzer.analyze(originalOrTransformedStream); + * analyzer.validate(); // override this method if you want to check whether the stream has been transformed + * // in a certain way relative to the original stream + * + * @author David Roazen + */ +public class ArtificialSingleSampleReadStreamAnalyzer { + protected ArtificialSingleSampleReadStream originalStream; + protected SAMRecord lastRead; + protected int totalReads; + protected boolean allSamplesMatch; + protected int numContigs; + protected List stacksPerContig; + protected Integer minReadsPerStack; + protected Integer maxReadsPerStack; + protected Integer minDistanceBetweenStacks; + protected Integer maxDistanceBetweenStacks; + protected Integer minReadLength; + protected Integer maxReadLength; + protected int numUnmappedReads; + + protected int currentContigNumStacks; + protected int currentStackNumReads; + + /** + * Construct a new read stream analyzer, providing an ArtificialSingleSampleReadStream that will + * serve as the basis for comparison after the analysis is complete. 
+ * + * @param originalStream the original ArtificialSingleSampleReadStream upon which the stream + * that will be fed to the analyzer is based + */ + public ArtificialSingleSampleReadStreamAnalyzer( ArtificialSingleSampleReadStream originalStream ) { + this.originalStream = originalStream; + reset(); + } + + /** + * Reset all read stream statistics collected by this analyzer to prepare for a fresh run + */ + public void reset() { + lastRead = null; + totalReads = 0; + allSamplesMatch = true; + numContigs = 0; + stacksPerContig = new ArrayList(); + minReadsPerStack = null; + maxReadsPerStack = null; + minDistanceBetweenStacks = null; + maxDistanceBetweenStacks = null; + minReadLength = null; + maxReadLength = null; + numUnmappedReads = 0; + currentContigNumStacks = 0; + currentStackNumReads = 0; + } + + /** + * Collect statistics on the stream of reads passed in + * + * @param stream the stream of reads to analyze + */ + public void analyze( Iterable stream ) { + for ( SAMRecord read : stream ) { + update(read); + } + finalizeStats(); + } + + /** + * Validate the stream by checking whether our collected statistics match the properties of the + * original stream. Throws a ReviewedStingException if the stream is invalid. + * + * Override this method if you want to check whether the stream has been transformed in some + * way relative to the original stream. + */ + public void validate() { + if ( (originalStream.getNumContigs() == 0 || originalStream.getNumStacksPerContig() == 0) && originalStream.getNumUnmappedReads() == 0 ) { + if ( totalReads != 0 ) { + throw new ReviewedStingException("got reads from the stream, but the stream was configured to have 0 reads"); + } + return; // no further validation needed for the 0-reads case + } + else if ( totalReads == 0 ) { + throw new ReviewedStingException("got no reads from the stream, but the stream was configured to have > 0 reads"); + } + + if ( ! 
allSamplesMatch ) { + throw new ReviewedStingException("some reads had the wrong sample"); + } + + if ( numContigs != originalStream.getNumContigs() ) { + throw new ReviewedStingException("number of contigs not correct"); + } + + if ( stacksPerContig.size() != originalStream.getNumContigs() ) { + throw new ReviewedStingException(String.format("bug in analyzer code: calculated sizes for %d contigs even though there were only %d contigs", + stacksPerContig.size(), originalStream.getNumContigs())); + } + + for ( int contigStackCount : stacksPerContig ) { + if ( contigStackCount != originalStream.getNumStacksPerContig() ) { + throw new ReviewedStingException("contig had incorrect number of stacks"); + } + } + + if ( originalStream.getNumStacksPerContig() > 0 ) { + if ( minReadsPerStack < originalStream.getMinReadsPerStack() ) { + throw new ReviewedStingException("stack had fewer than the minimum number of reads"); + } + if ( maxReadsPerStack > originalStream.getMaxReadsPerStack() ) { + throw new ReviewedStingException("stack had more than the maximum number of reads"); + } + } + else if ( minReadsPerStack != null || maxReadsPerStack != null ) { + throw new ReviewedStingException("bug in analyzer code: reads per stack was calculated even though 0 stacks per contig was specified"); + } + + if ( originalStream.getNumStacksPerContig() > 1 ) { + if ( minDistanceBetweenStacks < originalStream.getMinDistanceBetweenStacks() ) { + throw new ReviewedStingException("stacks were separated by less than the minimum distance"); + } + if ( maxDistanceBetweenStacks > originalStream.getMaxDistanceBetweenStacks() ) { + throw new ReviewedStingException("stacks were separated by more than the maximum distance"); + } + } + else if ( minDistanceBetweenStacks != null || maxDistanceBetweenStacks != null ) { + throw new ReviewedStingException("bug in analyzer code: distance between stacks was calculated even though numStacksPerContig was <= 1"); + } + + if ( minReadLength < 
originalStream.getMinReadLength() ) { + throw new ReviewedStingException("read was shorter than the minimum allowed length"); + } + if ( maxReadLength > originalStream.getMaxReadLength() ) { + throw new ReviewedStingException("read was longer than the maximum allowed length"); + } + + if ( numUnmappedReads != originalStream.getNumUnmappedReads() ) { + throw new ReviewedStingException(String.format("wrong number of unmapped reads: requested %d but saw %d", + originalStream.getNumUnmappedReads(), numUnmappedReads)); + } + + if ( (originalStream.getNumContigs() == 0 || originalStream.getNumStacksPerContig() == 0) && + numUnmappedReads != totalReads ) { + throw new ReviewedStingException("stream should have consisted only of unmapped reads, but saw some mapped reads"); + } + } + + public void update( SAMRecord read ) { + if ( read.getReadUnmappedFlag() ) { + numUnmappedReads++; + + if ( numUnmappedReads == 1 && lastRead != null ) { + processContigChange(); + numContigs--; + } + } + else if ( lastRead == null ) { + numContigs = 1; + currentContigNumStacks = 1; + currentStackNumReads = 1; + } + else if ( ! 
read.getReferenceIndex().equals(lastRead.getReferenceIndex()) ) { + processContigChange(); + } + else if ( read.getAlignmentStart() != lastRead.getAlignmentStart() ) { + processStackChangeWithinContig(read); + } + else { + currentStackNumReads++; + } + + updateReadLength(read.getReadLength()); + allSamplesMatch = allSamplesMatch && readHasCorrectSample(read); + totalReads++; + + lastRead = read; + } + + + private void processContigChange() { + numContigs++; + + stacksPerContig.add(currentContigNumStacks); + currentContigNumStacks = 1; + + updateReadsPerStack(currentStackNumReads); + currentStackNumReads = 1; + } + + private void processStackChangeWithinContig( SAMRecord read ) { + currentContigNumStacks++; + + updateReadsPerStack(currentStackNumReads); + currentStackNumReads = 1; + + updateDistanceBetweenStacks(read.getAlignmentStart() - lastRead.getAlignmentStart()); + } + + private void updateReadsPerStack( int stackReadCount ) { + if ( minReadsPerStack == null || stackReadCount < minReadsPerStack ) { + minReadsPerStack = stackReadCount; + } + if ( maxReadsPerStack == null || stackReadCount > maxReadsPerStack ) { + maxReadsPerStack = stackReadCount; + } + } + + private void updateDistanceBetweenStacks( int stackDistance ) { + if ( minDistanceBetweenStacks == null || stackDistance < minDistanceBetweenStacks ) { + minDistanceBetweenStacks = stackDistance; + } + if ( maxDistanceBetweenStacks == null || stackDistance > maxDistanceBetweenStacks ) { + maxDistanceBetweenStacks = stackDistance; + } + } + + private void updateReadLength( int readLength ) { + if ( minReadLength == null || readLength < minReadLength ) { + minReadLength = readLength; + } + if ( maxReadLength == null || readLength > maxReadLength ) { + maxReadLength = readLength; + } + } + + private boolean readHasCorrectSample( SAMRecord read ) { + return originalStream.getReadGroupID().equals(read.getAttribute("RG")); + } + + public void finalizeStats() { + if ( lastRead != null && ! 
lastRead.getReadUnmappedFlag() ) { + stacksPerContig.add(currentContigNumStacks); + updateReadsPerStack(currentStackNumReads); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/NamedThreadFactory.java b/public/java/src/org/broadinstitute/sting/utils/threading/NamedThreadFactory.java new file mode 100644 index 000000000..b25375b87 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/threading/NamedThreadFactory.java @@ -0,0 +1,26 @@ +package org.broadinstitute.sting.utils.threading; + +import java.util.concurrent.ThreadFactory; + +/** + * Thread factory that produces threads with a given name pattern + * + * User: depristo + * Date: 9/5/12 + * Time: 9:22 PM + * + */ +public class NamedThreadFactory implements ThreadFactory { + static int id = 0; + final String format; + + public NamedThreadFactory(String format) { + this.format = format; + String.format(format, id); // test the name + } + + @Override + public Thread newThread(Runnable r) { + return new Thread(r, String.format(format, id++)); + } +} diff --git a/public/java/test/org/broadinstitute/sting/WalkerTest.java b/public/java/test/org/broadinstitute/sting/WalkerTest.java index 7e38c00f3..fa9f9e8a7 100755 --- a/public/java/test/org/broadinstitute/sting/WalkerTest.java +++ b/public/java/test/org/broadinstitute/sting/WalkerTest.java @@ -40,13 +40,13 @@ import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.variantcontext.VariantContextTestProvider; - -import java.io.*; - import org.testng.Assert; import org.testng.annotations.AfterSuite; import org.testng.annotations.BeforeMethod; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.PrintStream; import java.text.SimpleDateFormat; import java.util.*; @@ -251,20 +251,43 @@ public class WalkerTest extends BaseTest { return 
false; } - protected Pair, List> executeTestParallel(final String name, WalkerTestSpec spec) { - return executeTest(name, spec, Arrays.asList(1, 4)); + public enum ParallelTestType { + TREE_REDUCIBLE, + NANO_SCHEDULED, + BOTH } - protected Pair, List> executeTest(final String name, WalkerTestSpec spec, List parallelThreads) { + protected Pair, List> executeTestParallel(final String name, WalkerTestSpec spec, ParallelTestType testType) { + final List ntThreads = testType == ParallelTestType.TREE_REDUCIBLE || testType == ParallelTestType.BOTH ? Arrays.asList(1, 4) : Collections.emptyList(); + final List cntThreads = testType == ParallelTestType.NANO_SCHEDULED || testType == ParallelTestType.BOTH ? Arrays.asList(1, 4) : Collections.emptyList(); + + return executeTest(name, spec, ntThreads, cntThreads); + } + + protected Pair, List> executeTestParallel(final String name, WalkerTestSpec spec) { + return executeTestParallel(name, spec, ParallelTestType.TREE_REDUCIBLE); + } + + protected Pair, List> executeTest(final String name, WalkerTestSpec spec, List ntThreads, List cpuThreads) { String originalArgs = spec.args; Pair, List> results = null; - for ( int nt : parallelThreads ) { + boolean ran1 = false; + for ( int nt : ntThreads ) { String extra = nt == 1 ? 
"" : (" -nt " + nt); + ran1 = ran1 || nt == 1; spec.args = originalArgs + extra; results = executeTest(name + "-nt-" + nt, spec); } + for ( int nct : cpuThreads ) { + if ( nct != 1 ) { + String extra = " -nct " + nct; + spec.args = originalArgs + extra; + results = executeTest(name + "-cnt-" + nct, spec); + } + } + return results; } diff --git a/public/java/test/org/broadinstitute/sting/commandline/InvalidArgumentIntegrationTest.java b/public/java/test/org/broadinstitute/sting/commandline/InvalidArgumentIntegrationTest.java new file mode 100644 index 000000000..924c6ec5a --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/commandline/InvalidArgumentIntegrationTest.java @@ -0,0 +1,41 @@ +package org.broadinstitute.sting.commandline; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.exceptions.UserException; + +import org.testng.annotations.Test; +import org.testng.annotations.DataProvider; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: 8/31/12 + * Time: 11:03 AM + * To change this template use File | Settings | File Templates. 
+ */ +public class InvalidArgumentIntegrationTest extends WalkerTest { + private static final String callsB36 = BaseTest.validationDataLocation + "lowpass.N3.chr1.raw.vcf"; + + private WalkerTest.WalkerTestSpec baseTest(String flag, String arg, Class exeption) { + return new WalkerTest.WalkerTestSpec("-T VariantsToTable -M 10 --variant:vcf " + + callsB36 + " -F POS,CHROM -R " + + b36KGReference + " -o %s " + flag + " " + arg, + 1, exeption); + + } + + @Test + public void testUnknownReadFilter() { + executeTest("UnknownReadFilter",baseTest("-rf","TestUnknownReadFilter", UserException.MalformedReadFilterException.class)); + } + + @Test + public void testMalformedWalkerArgs() { + executeTest("MalformedWalkerArgs", + new WalkerTest.WalkerTestSpec("-T UnknownWalkerName -M 10 --variant:vcf " + + callsB36 + " -F POS,CHROM -R " + + b36KGReference + " -o %s ", + 1, UserException.MalformedWalkerArgumentsException.class)); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java index 5aeb741ec..d2bfabacf 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java @@ -29,7 +29,7 @@ import net.sf.picard.filter.FilteringIterator; import net.sf.samtools.SAMFileReader; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.commandline.Tags; -import org.broadinstitute.sting.gatk.DownsamplingMethod; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; @@ -37,6 +37,7 @@ import org.broadinstitute.sting.gatk.filters.ReadFilter; import 
org.broadinstitute.sting.gatk.filters.UnmappedReadFilter; import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState; import org.broadinstitute.sting.gatk.iterators.ReadTransformer; +import org.broadinstitute.sting.gatk.walkers.qc.CountLoci; import org.broadinstitute.sting.utils.GenomeLocParser; import java.util.Collections; @@ -97,7 +98,7 @@ public class DownsamplerBenchmark extends ReadProcessingBenchmark { }, PER_SAMPLE { @Override - DownsamplingMethod create() { return GATKArgumentCollection.getDefaultDownsamplingMethod(); } + DownsamplingMethod create() { return DownsamplingMethod.getDefaultDownsamplingMethod(new CountLoci(), false); } }; abstract DownsamplingMethod create(); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java index 730b3f410..9df849940 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java @@ -25,36 +25,40 @@ package org.broadinstitute.sting.gatk.datasources.reads; import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.SAMFileReader; -import net.sf.samtools.SAMProgramRecord; -import net.sf.samtools.SAMRecord; +import net.sf.samtools.*; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import 
org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStream; import org.testng.annotations.AfterMethod; import org.testng.annotations.BeforeMethod; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import org.testng.Assert; import java.io.File; import java.io.FileNotFoundException; +import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import java.util.List; import static org.testng.Assert.*; /** - * @author aaron - * @version 1.0 - * @date Apr 8, 2009 *

    * Class SAMDataSourceUnitTest *

    @@ -66,6 +70,161 @@ public class SAMDataSourceUnitTest extends BaseTest { private IndexedFastaSequenceFile seq; private GenomeLocParser genomeLocParser; + + /*********************************** + * Tests for the fillShard() method + ***********************************/ + + /** + * Tests to ensure that the fillShard() method does not place shard boundaries at inappropriate places, + * such as within an alignment start position + */ + private static class SAMDataSourceFillShardBoundaryTest extends TestDataProvider { + private int numContigs; + private int numStacksPerContig; + private int stackSize; + private int numUnmappedReads; + private DownsamplingMethod downsamplingMethod; + + private SAMFileHeader header; + + public SAMDataSourceFillShardBoundaryTest( int numContigs, + int numStacksPerContig, + int stackSize, + int numUnmappedReads, + int downsamplingTargetCoverage ) { + super(SAMDataSourceFillShardBoundaryTest.class); + + this.numContigs = numContigs; + this.numStacksPerContig = numStacksPerContig; + this.stackSize = stackSize; + this.numUnmappedReads = numUnmappedReads; + + this.downsamplingMethod = new DownsamplingMethod(DownsampleType.BY_SAMPLE, downsamplingTargetCoverage, null, true); + + setName(String.format("%s: numContigs=%d numStacksPerContig=%d stackSize=%d numUnmappedReads=%d downsamplingTargetCoverage=%d", + getClass().getSimpleName(), numContigs, numStacksPerContig, stackSize, numUnmappedReads, downsamplingTargetCoverage)); + } + + public void run() { + SAMDataSource dataSource = new SAMDataSource(Arrays.asList(createTestBAM()), + new ThreadAllocation(), + null, + new GenomeLocParser(header.getSequenceDictionary()), + false, + SAMFileReader.ValidationStringency.SILENT, + null, + downsamplingMethod, + new ValidationExclusion(), + new ArrayList(), + false); + + Assert.assertTrue(dataSource.usingExpandedShards()); + + Iterable shardIterator = dataSource.createShardIteratorOverAllReads(new ReadShardBalancer()); + + SAMRecord readAtEndOfLastShard 
= null; + + for ( Shard shard : shardIterator ) { + int numContigsThisShard = 0; + SAMRecord lastRead = null; + + for ( SAMRecord read : shard.iterator() ) { + if ( lastRead == null ) { + numContigsThisShard = 1; + } + else if ( ! read.getReadUnmappedFlag() && ! lastRead.getReferenceIndex().equals(read.getReferenceIndex()) ) { + numContigsThisShard++; + } + + // If the last read from the previous shard is not unmapped, we have to make sure + // that no reads in this shard start at the same position + if ( readAtEndOfLastShard != null && ! readAtEndOfLastShard.getReadUnmappedFlag() ) { + Assert.assertFalse(readAtEndOfLastShard.getReferenceIndex().equals(read.getReferenceIndex()) && + readAtEndOfLastShard.getAlignmentStart() == read.getAlignmentStart(), + String.format("Reads from alignment start position %d:%d are split across multiple shards", + read.getReferenceIndex(), read.getAlignmentStart())); + } + + lastRead = read; + } + + // There should never be reads from more than 1 contig in a shard (ignoring unmapped reads) + Assert.assertTrue(numContigsThisShard == 1, "found a shard with reads from multiple contigs"); + + readAtEndOfLastShard = lastRead; + } + } + + private SAMReaderID createTestBAM() { + header = ArtificialSAMUtils.createArtificialSamHeader(numContigs, 1, 100000); + SAMReadGroupRecord readGroup = new SAMReadGroupRecord("foo"); + readGroup.setSample("testSample"); + header.addReadGroup(readGroup); + ArtificialSingleSampleReadStream artificialReads = new ArtificialSingleSampleReadStream(header, + "foo", + numContigs, + numStacksPerContig, + stackSize, + stackSize, + 1, + 100, + 50, + 150, + numUnmappedReads); + + File testBAMFile; + try { + testBAMFile = File.createTempFile("SAMDataSourceFillShardBoundaryTest", ".bam"); + testBAMFile.deleteOnExit(); + } + catch ( IOException e ) { + throw new ReviewedStingException(String.format("Failed to create temp bam file for test %s. 
%s", this, e.getMessage())); + } + + SAMFileWriter bamWriter = new SAMFileWriterFactory().setCreateIndex(true).makeBAMWriter(header, true, testBAMFile); + for ( SAMRecord read : artificialReads ) { + bamWriter.addAlignment(read); + } + bamWriter.close(); + + return new SAMReaderID(testBAMFile, new Tags()); + } + } + + @DataProvider(name = "SAMDataSourceFillShardTestDataProvider") + public Object[][] createSAMDataSourceFillShardBoundaryTests() { + // Take downsampling out of the equation for these tests -- we are only interested in whether the + // shard boundaries occur at the right places in the read stream, and removing downsampling as a + // factor simplifies that task (note that we still need to provide a specific downsampling method with + // experimental downsampling enabled to trigger the shard expansion behavior, for now) + int downsamplingTargetCoverage = ReadShard.MAX_READS * 10; + + for ( int numContigs = 1; numContigs <= 3; numContigs++ ) { + for ( int numStacksPerContig : Arrays.asList(1, 2, 4) ) { + // Use crucial read shard boundary values as the stack sizes + for ( int stackSize : Arrays.asList(ReadShard.MAX_READS / 2, ReadShard.MAX_READS / 2 + 10, ReadShard.MAX_READS, ReadShard.MAX_READS - 1, ReadShard.MAX_READS + 1, ReadShard.MAX_READS * 2) ) { + for ( int numUnmappedReads : Arrays.asList(0, ReadShard.MAX_READS / 2, ReadShard.MAX_READS * 2) ) { + new SAMDataSourceFillShardBoundaryTest(numContigs, numStacksPerContig, stackSize, numUnmappedReads, downsamplingTargetCoverage); + } + } + } + } + + return SAMDataSourceFillShardBoundaryTest.getTests(SAMDataSourceFillShardBoundaryTest.class); + } + + // TODO: re-enable these tests once the issues with filepointer ordering + the downsamplers are worked out + @Test(dataProvider = "SAMDataSourceFillShardTestDataProvider", enabled = false) + public void testSAMDataSourceFillShard( SAMDataSourceFillShardBoundaryTest test ) { + logger.warn("Running test: " + test); + + test.run(); + } + + + // TODO: the legacy 
tests below should really be replaced with a more comprehensive suite of tests for SAMDataSource + /** * This function does the setup of our parser, before each method call. *

    diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIteratorUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIteratorUnitTest.java index b0de78b97..b0a8ff065 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIteratorUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIteratorUnitTest.java @@ -1,73 +1,138 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + package org.broadinstitute.sting.gatk.downsampling; import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMReadGroupRecord; import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; -import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.testng.Assert; +import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStream; +import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStreamAnalyzer; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.util.ArrayList; -import java.util.Collection; +import java.util.Arrays; -public class DownsamplingReadsIteratorUnitTest { +public class DownsamplingReadsIteratorUnitTest extends BaseTest { - @Test - public void testDownsamplingIteratorWithPositionalDownsampling() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + private static class DownsamplingReadsIteratorTest extends TestDataProvider { + private DownsamplingReadsIterator downsamplingIter; + private int targetCoverage; + private ArtificialSingleSampleReadStream stream; + private ArtificialSingleSampleReadStreamAnalyzer streamAnalyzer; - Collection reads = new ArrayList(); + public DownsamplingReadsIteratorTest( ArtificialSingleSampleReadStream stream, int targetCoverage ) { + super(DownsamplingReadsIteratorTest.class); - reads.addAll(createStackOfIdenticalReads(3000, header, "foo", 0, 1, 100)); - reads.addAll(createStackOfIdenticalReads(3000, header, "foo", 0, 50, 100)); + this.stream = stream; + this.targetCoverage = targetCoverage; - StingSAMIterator iter = new DownsamplingReadsIterator(StingSAMIteratorAdapter.adapt(reads.iterator()), new PositionalDownsampler(1000)); - - Assert.assertTrue(iter.hasNext()); - SAMRecord 
previous = iter.next(); - int count = 1; - - while ( iter.hasNext() ) { - SAMRecord current = iter.next(); - Assert.assertTrue(previous.getAlignmentStart() <= current.getAlignmentStart() || ! previous.getReferenceIndex().equals(current.getReferenceIndex())); - count++; - previous = current; + setName(String.format("%s: targetCoverage=%d numContigs=%d stacksPerContig=%d readsPerStack=%d-%d distanceBetweenStacks=%d-%d readLength=%d-%d unmappedReads=%d", + getClass().getSimpleName(), + targetCoverage, + stream.getNumContigs(), + stream.getNumStacksPerContig(), + stream.getMinReadsPerStack(), + stream.getMaxReadsPerStack(), + stream.getMinDistanceBetweenStacks(), + stream.getMaxDistanceBetweenStacks(), + stream.getMinReadLength(), + stream.getMaxReadLength(), + stream.getNumUnmappedReads())); } - Assert.assertEquals(count, 1000); + public void run() { + streamAnalyzer = new PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer(stream, targetCoverage); + downsamplingIter = new DownsamplingReadsIterator(stream.getStingSAMIterator(), new SimplePositionalDownsampler(targetCoverage)); + + streamAnalyzer.analyze(downsamplingIter); + + // Check whether the observed properties of the downsampled stream are what they should be + streamAnalyzer.validate(); + + // Allow memory used by this test to be reclaimed + stream = null; + streamAnalyzer = null; + downsamplingIter = null; + } } - @Test - public void testDownsamplingIteratorNoEffectiveDownsampling() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + @DataProvider(name = "DownsamplingReadsIteratorTestDataProvider") + public Object[][] createDownsamplingReadsIteratorTests() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(5, 1, 10000); + String readGroupID = "testReadGroup"; + SAMReadGroupRecord readGroup = new SAMReadGroupRecord(readGroupID); + readGroup.setSample("testSample"); + header.addReadGroup(readGroup); - Collection reads = new ArrayList(); + // 
Values that don't vary across tests + int targetCoverage = 10; + int minReadLength = 50; + int maxReadLength = 100; + int minDistanceBetweenStacks = 1; + int maxDistanceBetweenStacks = maxReadLength + 1; - reads.addAll(createStackOfIdenticalReads(300, header, "foo", 0, 1, 100)); - reads.addAll(createStackOfIdenticalReads(300, header, "foo", 0, 50, 100)); + GenomeAnalysisEngine.resetRandomGenerator(); - StingSAMIterator iter = new DownsamplingReadsIterator(StingSAMIteratorAdapter.adapt(reads.iterator()), new PositionalDownsampler(1000)); - - Assert.assertTrue(iter.hasNext()); - SAMRecord previous = iter.next(); - int count = 1; - - while ( iter.hasNext() ) { - SAMRecord current = iter.next(); - Assert.assertTrue(previous.getAlignmentStart() <= current.getAlignmentStart() || ! previous.getReferenceIndex().equals(current.getReferenceIndex())); - count++; - previous = current; + // brute force testing! + for ( int numContigs : Arrays.asList(1, 2, 5) ) { + for ( int stacksPerContig : Arrays.asList(1, 2, 10) ) { + for ( int minReadsPerStack : Arrays.asList(1, targetCoverage / 2, targetCoverage, targetCoverage - 1, targetCoverage + 1, targetCoverage * 2) ) { + for ( int maxReadsPerStack : Arrays.asList(1, targetCoverage / 2, targetCoverage, targetCoverage - 1, targetCoverage + 1, targetCoverage * 2) ) { + for ( int numUnmappedReads : Arrays.asList(0, 1, targetCoverage, targetCoverage * 2) ) { + // Only interested in sane read stream configurations here + if ( minReadsPerStack <= maxReadsPerStack ) { + new DownsamplingReadsIteratorTest(new ArtificialSingleSampleReadStream(header, + readGroupID, + numContigs, + stacksPerContig, + minReadsPerStack, + maxReadsPerStack, + minDistanceBetweenStacks, + maxDistanceBetweenStacks, + minReadLength, + maxReadLength, + numUnmappedReads), + targetCoverage); + } + } + } + } + } } - Assert.assertEquals(count, 600); + return DownsamplingReadsIteratorTest.getTests(DownsamplingReadsIteratorTest.class); } - private ArrayList 
createStackOfIdenticalReads( int stackSize, SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) { - ArrayList stack = new ArrayList(stackSize); - for ( int i = 1; i <= stackSize; i++ ) { - stack.add(ArtificialSAMUtils.createArtificialRead(header, name, refIndex, alignmentStart, length)); - } - return stack; + @Test(dataProvider = "DownsamplingReadsIteratorTestDataProvider") + public void runDownsamplingReadsIteratorTest( DownsamplingReadsIteratorTest test ) { + logger.warn("Running test: " + test); + + GenomeAnalysisEngine.resetRandomGenerator(); + test.run(); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerUnitTest.java index 0f4bae555..3bf1096b1 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerUnitTest.java @@ -1,65 +1,157 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.gatk.downsampling; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import org.testng.Assert; import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; import java.util.List; -public class FractionalDownsamplerUnitTest { +public class FractionalDownsamplerUnitTest extends BaseTest { - @Test - public void test100PercentInclusion() { - FractionalDownsampler downsampler = new FractionalDownsampler(1.0); - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + private static class FractionalDownsamplerTest extends TestDataProvider { + double fraction; + int totalReads; + int expectedMinNumReadsAfterDownsampling; + int expectedMaxNumReadsAfterDownsampling; + int expectedMinDiscardedItems; + int expectedMaxDiscardedItems; - downsampler.submit(createRandomReads(1000, header, "foo", 0, 100000, 500)); - downsampler.signalEndOfInput(); + private static final double EXPECTED_ACCURACY = 0.05; // should be accurate to within +/- this percent - List downsampledReads = downsampler.consumeDownsampledItems(); + public FractionalDownsamplerTest( double fraction, int totalReads ) { + super(FractionalDownsamplerTest.class); - Assert.assertTrue(downsampledReads.size() == 1000); - } + this.fraction = fraction; + this.totalReads = totalReads; - @Test - public void test0PercentInclusion() { - FractionalDownsampler downsampler = new FractionalDownsampler(0.0); - 
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + calculateExpectations(); - downsampler.submit(createRandomReads(1000, header, "foo", 0, 100000, 500)); - downsampler.signalEndOfInput(); - - List downsampledReads = downsampler.consumeDownsampledItems(); - - Assert.assertTrue(downsampledReads.isEmpty()); - } - - @Test - public void test50PercentInclusion() { - FractionalDownsampler downsampler = new FractionalDownsampler(0.5); - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - downsampler.submit(createRandomReads(5000, header, "foo", 0, 100000, 500)); - downsampler.signalEndOfInput(); - - List downsampledReads = downsampler.consumeDownsampledItems(); - - Assert.assertTrue(downsampledReads.size() >= 2000 && downsampledReads.size() <= 3000); - } - - private List createRandomReads( int numReads, SAMFileHeader header, String name, int contigIndex, int maxAlignmentStart, int maxLength ) { - List reads = new ArrayList(numReads); - - for ( int i = 1; i <= numReads; i++ ) { - reads.add(ArtificialSAMUtils.createArtificialRead(header, name, contigIndex, - GenomeAnalysisEngine.getRandomGenerator().nextInt(maxAlignmentStart) + 1, - GenomeAnalysisEngine.getRandomGenerator().nextInt(maxLength) + 1)); + setName(String.format("%s: fraction=%.2f totalReads=%d expectedMinNumReadsAfterDownsampling=%d expectedMaxNumReadsAfterDownsampling=%d", + getClass().getSimpleName(), fraction, totalReads, expectedMinNumReadsAfterDownsampling, expectedMaxNumReadsAfterDownsampling)); } - return reads; + private void calculateExpectations() { + // Require an exact match in the 0% and 100% cases + if ( fraction == 0.0 ) { + expectedMinNumReadsAfterDownsampling = expectedMaxNumReadsAfterDownsampling = 0; + expectedMinDiscardedItems = expectedMaxDiscardedItems = totalReads; + } + else if ( fraction == 1.0 ) { + expectedMinNumReadsAfterDownsampling = expectedMaxNumReadsAfterDownsampling = totalReads; + expectedMinDiscardedItems = 
expectedMaxDiscardedItems = 0; + } + else { + expectedMinNumReadsAfterDownsampling = Math.max((int)((fraction - EXPECTED_ACCURACY) * totalReads), 0); + expectedMaxNumReadsAfterDownsampling = Math.min((int) ((fraction + EXPECTED_ACCURACY) * totalReads), totalReads); + expectedMinDiscardedItems = totalReads - expectedMaxNumReadsAfterDownsampling; + expectedMaxDiscardedItems = totalReads - expectedMinNumReadsAfterDownsampling; + } + } + + public Collection createReads() { + Collection reads = new ArrayList(totalReads); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + reads.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(totalReads, header, "foo", 0, 1, 100)); + + return reads; + } + } + + @DataProvider(name = "FractionalDownsamplerTestDataProvider") + public Object[][] createFractionalDownsamplerTestData() { + for ( double fraction : Arrays.asList(0.0, 0.25, 0.5, 0.75, 1.0) ) { + for ( int totalReads : Arrays.asList(0, 1000, 10000) ) { + new FractionalDownsamplerTest(fraction, totalReads); + } + } + + return FractionalDownsamplerTest.getTests(FractionalDownsamplerTest.class); + } + + @Test(dataProvider = "FractionalDownsamplerTestDataProvider") + public void runFractionalDownsamplerTest( FractionalDownsamplerTest test ) { + logger.warn("Running test: " + test); + + GenomeAnalysisEngine.resetRandomGenerator(); + + ReadsDownsampler downsampler = new FractionalDownsampler(test.fraction); + + downsampler.submit(test.createReads()); + + if ( test.totalReads > 0 ) { + if ( test.fraction > FractionalDownsamplerTest.EXPECTED_ACCURACY ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + } + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && 
downsampler.peekPending() == null); + } + + downsampler.signalEndOfInput(); + + if ( test.totalReads > 0 ) { + if ( test.fraction > FractionalDownsamplerTest.EXPECTED_ACCURACY ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + } + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + List downsampledReads = downsampler.consumeFinalizedItems(); + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + + Assert.assertTrue(downsampledReads.size() >= test.expectedMinNumReadsAfterDownsampling && + downsampledReads.size() <= test.expectedMaxNumReadsAfterDownsampling); + + Assert.assertTrue(downsampler.getNumberOfDiscardedItems() >= test.expectedMinDiscardedItems && + downsampler.getNumberOfDiscardedItems() <= test.expectedMaxDiscardedItems); + + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), test.totalReads - downsampledReads.size()); + + downsampler.reset(); + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java new file mode 100644 index 000000000..2717d014c --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to 
deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.testng.annotations.Test; +import org.testng.annotations.DataProvider; +import org.testng.Assert; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.LinkedList; +import java.util.List; + +public class LevelingDownsamplerUnitTest extends BaseTest { + + private static class LevelingDownsamplerUniformStacksTest extends TestDataProvider { + public enum DataStructure { LINKED_LIST, ARRAY_LIST } + + int targetSize; + int numStacks; + int stackSize; + DataStructure dataStructure; + int expectedSize; + + public LevelingDownsamplerUniformStacksTest( int targetSize, int numStacks, int stackSize, DataStructure dataStructure ) { + super(LevelingDownsamplerUniformStacksTest.class); + + this.targetSize = targetSize; + this.numStacks = numStacks; + this.stackSize = stackSize; + this.dataStructure = dataStructure; + expectedSize = 
calculateExpectedDownsampledStackSize(); + + setName(String.format("%s: targetSize=%d numStacks=%d stackSize=%d dataStructure=%s expectedSize=%d", + getClass().getSimpleName(), targetSize, numStacks, stackSize, dataStructure, expectedSize)); + } + + public Collection> createStacks() { + Collection> stacks = new ArrayList>(); + + for ( int i = 1; i <= numStacks; i++ ) { + List stack = dataStructure == DataStructure.LINKED_LIST ? new LinkedList() : new ArrayList(); + + for ( int j = 1; j <= stackSize; j++ ) { + stack.add(new Object()); + } + + stacks.add(stack); + } + + return stacks; + } + + private int calculateExpectedDownsampledStackSize() { + int numItemsToRemove = numStacks * stackSize - targetSize; + + if ( numStacks == 0 ) { + return 0; + } + else if ( numItemsToRemove <= 0 ) { + return stackSize; + } + + return Math.max(1, stackSize - (numItemsToRemove / numStacks)); + } + } + + @DataProvider(name = "UniformStacksDataProvider") + public Object[][] createUniformStacksTestData() { + for ( int targetSize = 1; targetSize <= 10000; targetSize *= 10 ) { + for ( int numStacks = 0; numStacks <= 10; numStacks++ ) { + for ( int stackSize = 1; stackSize <= 1000; stackSize *= 10 ) { + for ( LevelingDownsamplerUniformStacksTest.DataStructure dataStructure : LevelingDownsamplerUniformStacksTest.DataStructure.values() ) { + new LevelingDownsamplerUniformStacksTest(targetSize, numStacks, stackSize, dataStructure); + } + } + } + } + + return LevelingDownsamplerUniformStacksTest.getTests(LevelingDownsamplerUniformStacksTest.class); + } + + @Test( dataProvider = "UniformStacksDataProvider" ) + public void testLevelingDownsamplerWithUniformStacks( LevelingDownsamplerUniformStacksTest test ) { + logger.warn("Running test: " + test); + + GenomeAnalysisEngine.resetRandomGenerator(); + + Downsampler> downsampler = new LevelingDownsampler, Object>(test.targetSize); + + downsampler.submit(test.createStacks()); + + if ( test.numStacks > 0 ) { + 
Assert.assertFalse(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() == null); + Assert.assertTrue(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() != null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + downsampler.signalEndOfInput(); + + if ( test.numStacks > 0 ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + List> downsampledStacks = downsampler.consumeFinalizedItems(); + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + + Assert.assertEquals(downsampledStacks.size(), test.numStacks); + + int totalRemainingItems = 0; + for ( List stack : downsampledStacks ) { + Assert.assertTrue(Math.abs(stack.size() - test.expectedSize) <= 1); + totalRemainingItems += stack.size(); + } + + int numItemsReportedDiscarded = downsampler.getNumberOfDiscardedItems(); + int numItemsActuallyDiscarded = test.numStacks * test.stackSize - totalRemainingItems; + + Assert.assertEquals(numItemsReportedDiscarded, numItemsActuallyDiscarded); + + downsampler.reset(); + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); + + Assert.assertTrue(totalRemainingItems <= Math.max(test.targetSize, test.numStacks)); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIteratorUnitTest.java 
b/public/java/test/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIteratorUnitTest.java new file mode 100644 index 000000000..b9022900b --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIteratorUnitTest.java @@ -0,0 +1,298 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMReadGroupRecord; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.gatk.iterators.VerifyingSamIterator; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.sam.ArtificialMultiSampleReadStream; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStream; +import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStreamAnalyzer; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +public class PerSampleDownsamplingReadsIteratorUnitTest extends BaseTest { + + private static class PerSampleDownsamplingReadsIteratorTest extends TestDataProvider { + + // TODO: tests should distinguish between variance across samples and variance within a sample + + private enum StreamDensity { + SPARSE (MAX_READ_LENGTH, MAX_READ_LENGTH * 2), + DENSE (1, MIN_READ_LENGTH), + MIXED (1, MAX_READ_LENGTH * 2), + UNIFORM_DENSE (1, 1), + UNIFORM_SPARSE (MAX_READ_LENGTH * 2, MAX_READ_LENGTH * 2); + + int minDistanceBetweenStacks; + int maxDistanceBetweenStacks; + + StreamDensity( int minDistanceBetweenStacks, int maxDistanceBetweenStacks ) { + this.minDistanceBetweenStacks = minDistanceBetweenStacks; + this.maxDistanceBetweenStacks = maxDistanceBetweenStacks; + } + + public String toString() { + return String.format("StreamDensity:%d-%d", minDistanceBetweenStacks, maxDistanceBetweenStacks); + } + } + + private enum StreamStackDepth { + NON_UNIFORM_LOW (1, 5), + NON_UNIFORM_HIGH (15, 20), + NON_UNIFORM_MIXED (1, 20), + UNIFORM_SINGLE (1, 
1), + UNIFORM_LOW (2, 2), + UNIFORM_HIGH (20, 20), + UNIFORM_MEDIUM (10, 10); // should set target coverage to this value for testing + + int minReadsPerStack; + int maxReadsPerStack; + + StreamStackDepth( int minReadsPerStack, int maxReadsPerStack ) { + this.minReadsPerStack = minReadsPerStack; + this.maxReadsPerStack = maxReadsPerStack; + } + + public boolean isUniform() { + return minReadsPerStack == maxReadsPerStack; + } + + public String toString() { + return String.format("StreamStackDepth:%d-%d", minReadsPerStack, maxReadsPerStack); + } + } + + private enum StreamStacksPerContig { + UNIFORM(20, 20), + NON_UNIFORM(1, 30); + + int minStacksPerContig; + int maxStacksPerContig; + + StreamStacksPerContig( int minStacksPerContig, int maxStacksPerContig ) { + this.minStacksPerContig = minStacksPerContig; + this.maxStacksPerContig = maxStacksPerContig; + } + + public boolean isUniform() { + return minStacksPerContig == maxStacksPerContig; + } + + public String toString() { + return String.format("StreamStacksPerContig:%d-%d", minStacksPerContig, maxStacksPerContig); + } + } + + // Not interested in testing multiple ranges for the read lengths, as none of our current + // downsamplers are affected by read length + private static final int MIN_READ_LENGTH = 50; + private static final int MAX_READ_LENGTH = 150; + + private ReadsDownsamplerFactory downsamplerFactory; + private int targetCoverage; + private int numSamples; + private int minContigs; + private int maxContigs; + private StreamDensity streamDensity; + private StreamStackDepth streamStackDepth; + private StreamStacksPerContig streamStacksPerContig; + private double unmappedReadsFraction; + private int unmappedReadsCount; + private boolean verifySortedness; + + private ArtificialMultiSampleReadStream mergedReadStream; + private Map perSampleArtificialReadStreams; + private Map perSampleStreamAnalyzers; + private SAMFileHeader header; + + public PerSampleDownsamplingReadsIteratorTest( ReadsDownsamplerFactory 
downsamplerFactory, + int targetCoverage, + int numSamples, + int minContigs, + int maxContigs, + StreamDensity streamDensity, + StreamStackDepth streamStackDepth, + StreamStacksPerContig streamStacksPerContig, + double unmappedReadsFraction, + int unmappedReadsCount, + boolean verifySortedness ) { + super(PerSampleDownsamplingReadsIteratorTest.class); + + this.downsamplerFactory = downsamplerFactory; + this.targetCoverage = targetCoverage; + this.numSamples = numSamples; + this.minContigs = minContigs; + this.maxContigs = maxContigs; + this.streamDensity = streamDensity; + this.streamStackDepth = streamStackDepth; + this.streamStacksPerContig = streamStacksPerContig; + this.unmappedReadsFraction = unmappedReadsFraction; + this.unmappedReadsCount = unmappedReadsCount; + this.verifySortedness = verifySortedness; + + header = createHeader(); + createReadStreams(); + + setName(String.format("%s: targetCoverage=%d numSamples=%d minContigs=%d maxContigs=%d %s %s %s unmappedReadsFraction=%.2f unmappedReadsCount=%d verifySortedness=%b", + getClass().getSimpleName(), targetCoverage, numSamples, minContigs, maxContigs, streamDensity, streamStackDepth, streamStacksPerContig, unmappedReadsFraction, unmappedReadsCount, verifySortedness)); + } + + private SAMFileHeader createHeader() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(maxContigs, 1, (streamDensity.maxDistanceBetweenStacks + MAX_READ_LENGTH) * streamStacksPerContig.maxStacksPerContig + 100000); + List readGroups = new ArrayList(numSamples); + List sampleNames = new ArrayList(numSamples); + + for ( int i = 0; i < numSamples; i++ ) { + readGroups.add("ReadGroup" + i); + sampleNames.add("Sample" + i); + } + + return ArtificialSAMUtils.createEnumeratedReadGroups(header, readGroups, sampleNames); + } + + private void createReadStreams() { + perSampleArtificialReadStreams = new HashMap(numSamples); + perSampleStreamAnalyzers = new HashMap(numSamples); + + for (SAMReadGroupRecord readGroup : 
header.getReadGroups() ) { + String readGroupID = readGroup.getReadGroupId(); + String sampleName = readGroup.getSample(); + + int thisSampleNumContigs = MathUtils.randomIntegerInRange(minContigs, maxContigs); + int thisSampleStacksPerContig = MathUtils.randomIntegerInRange(streamStacksPerContig.minStacksPerContig, streamStacksPerContig.maxStacksPerContig); + + int thisSampleNumUnmappedReads = GenomeAnalysisEngine.getRandomGenerator().nextDouble() < unmappedReadsFraction ? unmappedReadsCount : 0; + + ArtificialSingleSampleReadStream thisSampleStream = new ArtificialSingleSampleReadStream(header, + readGroupID, + thisSampleNumContigs, + thisSampleStacksPerContig, + streamStackDepth.minReadsPerStack, + streamStackDepth.maxReadsPerStack, + streamDensity.minDistanceBetweenStacks, + streamDensity.maxDistanceBetweenStacks, + MIN_READ_LENGTH, + MAX_READ_LENGTH, + thisSampleNumUnmappedReads); + perSampleArtificialReadStreams.put(sampleName, thisSampleStream); + perSampleStreamAnalyzers.put(sampleName, new PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer(thisSampleStream, targetCoverage)); + } + + mergedReadStream = new ArtificialMultiSampleReadStream(perSampleArtificialReadStreams.values()); + } + + public void run() { + StingSAMIterator downsamplingIter = new PerSampleDownsamplingReadsIterator(mergedReadStream.getStingSAMIterator(), downsamplerFactory); + + if ( verifySortedness ) { + downsamplingIter = new VerifyingSamIterator(downsamplingIter); + } + + while ( downsamplingIter.hasNext() ) { + SAMRecord read = downsamplingIter.next(); + String sampleName = read.getReadGroup() != null ? 
read.getReadGroup().getSample() : null; + + ArtificialSingleSampleReadStreamAnalyzer analyzer = perSampleStreamAnalyzers.get(sampleName); + if ( analyzer != null ) { + analyzer.update(read); + } + else { + throw new ReviewedStingException("bug: stream analyzer for sample " + sampleName + " not found"); + } + } + + for ( Map.Entry analyzerEntry : perSampleStreamAnalyzers.entrySet() ) { + ArtificialSingleSampleReadStreamAnalyzer analyzer = analyzerEntry.getValue(); + analyzer.finalizeStats(); + + // Validate the downsampled read stream for each sample individually + analyzer.validate(); + } + + // Allow memory used by this test to be reclaimed: + mergedReadStream = null; + perSampleArtificialReadStreams = null; + perSampleStreamAnalyzers = null; + } + } + + @DataProvider(name = "PerSampleDownsamplingReadsIteratorTestDataProvider") + public Object[][] createPerSampleDownsamplingReadsIteratorTests() { + + GenomeAnalysisEngine.resetRandomGenerator(); + + // Some values don't vary across tests + int targetCoverage = PerSampleDownsamplingReadsIteratorTest.StreamStackDepth.UNIFORM_MEDIUM.minReadsPerStack; + ReadsDownsamplerFactory downsamplerFactory = new SimplePositionalDownsamplerFactory(targetCoverage); + int maxContigs = 3; + boolean verifySortedness = true; + + for ( int numSamples : Arrays.asList(1, 2, 10) ) { + for ( int minContigs = 1; minContigs <= maxContigs; minContigs++ ) { + for ( PerSampleDownsamplingReadsIteratorTest.StreamDensity streamDensity : PerSampleDownsamplingReadsIteratorTest.StreamDensity.values() ) { + for ( PerSampleDownsamplingReadsIteratorTest.StreamStackDepth streamStackDepth : PerSampleDownsamplingReadsIteratorTest.StreamStackDepth.values() ) { + for (PerSampleDownsamplingReadsIteratorTest.StreamStacksPerContig streamStacksPerContig : PerSampleDownsamplingReadsIteratorTest.StreamStacksPerContig.values() ) { + for ( double unmappedReadsFraction : Arrays.asList(0.0, 1.0, 0.5) ) { + for ( int unmappedReadsCount : Arrays.asList(1, 50) ) { + new 
PerSampleDownsamplingReadsIteratorTest(downsamplerFactory, + targetCoverage, + numSamples, + minContigs, + maxContigs, + streamDensity, + streamStackDepth, + streamStacksPerContig, + unmappedReadsFraction, + unmappedReadsCount, + verifySortedness); + } + } + } + } + } + } + } + + return PerSampleDownsamplingReadsIteratorTest.getTests(PerSampleDownsamplingReadsIteratorTest.class); + } + + @Test(dataProvider = "PerSampleDownsamplingReadsIteratorTestDataProvider") + public void runPerSampleDownsamplingReadsIteratorTest( PerSampleDownsamplingReadsIteratorTest test ) { + logger.warn("Running test: " + test); + + GenomeAnalysisEngine.resetRandomGenerator(); + test.run(); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionalDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionalDownsamplerUnitTest.java deleted file mode 100644 index b1d8e45c9..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionalDownsamplerUnitTest.java +++ /dev/null @@ -1,357 +0,0 @@ -package org.broadinstitute.sting.gatk.downsampling; - -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.testng.annotations.Test; -import org.testng.Assert; - -import java.util.*; - -// TODO: generalize these tests so that all possible arrangements of 1-4 stacks can be tested -public class PositionalDownsamplerUnitTest extends BaseTest { - - /** - * ------- - * ------- - * ------- - * ------- - * ------- - * ------- - */ - @Test - public void testThreeOverlappingIdenticalStacks() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - PositionalDownsampler downsampler = new PositionalDownsampler(1000); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 
100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 25, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 50, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.signalEndOfInput(); - Assert.assertTrue(downsampler.hasDownsampledItems()); - Assert.assertFalse(downsampler.hasPendingItems()); - - List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems()); - - System.out.println("testThreeOverlappingIdenticalStacks: Downsampled Stack sizes: " + downsampledStackSizes); - - Assert.assertEquals(downsampledStackSizes.size(), 3); - Assert.assertTrue(downsampledStackSizes.get(0) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(1) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(2) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(1) + downsampledStackSizes.get(2) <= 1000); - } - - /** - * ------- - * ------- - * ------- - * ------- - * ------- - * ------- - */ - @Test - public void testThreeNonOverlappingIdenticalStacks() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - PositionalDownsampler downsampler = new PositionalDownsampler(1000); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 201, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(1500, 
header, "foo", 0, 301, 100)); - Assert.assertTrue(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.signalEndOfInput(); - Assert.assertTrue(downsampler.hasDownsampledItems()); - Assert.assertFalse(downsampler.hasPendingItems()); - - List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems()); - - System.out.println("testThreeNonOverlappingIdenticalStacks: Downsampled Stack sizes: " + downsampledStackSizes); - - Assert.assertEquals(downsampledStackSizes.size(), 3); - Assert.assertTrue(downsampledStackSizes.get(0) == 1000); - Assert.assertTrue(downsampledStackSizes.get(1) == 1000); - Assert.assertTrue(downsampledStackSizes.get(2) == 1000); - } - - /** - * --- - * --- - * ------- - * ------- - * ------- - * ------- - */ - @Test - public void testThreeStacksWithShortStackAtBeginning() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - PositionalDownsampler downsampler = new PositionalDownsampler(1000); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 25)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 20, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 50, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.signalEndOfInput(); - Assert.assertTrue(downsampler.hasDownsampledItems()); - Assert.assertFalse(downsampler.hasPendingItems()); - - List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems()); - - System.out.println("testThreeStacksWithShortStackAtBeginning: Downsampled Stack sizes: " + 
downsampledStackSizes); - - Assert.assertEquals(downsampledStackSizes.size(), 3); - Assert.assertTrue(downsampledStackSizes.get(0) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(1) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(2) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(1) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(1) + downsampledStackSizes.get(2) <= 1000); - } - - /** - * ------- - * ------- - * --- - * --- - * ------- - * ------- - */ - @Test - public void testThreeStacksWithShortStackInMiddle() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - PositionalDownsampler downsampler = new PositionalDownsampler(1000); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 25, 25)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 75, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.signalEndOfInput(); - Assert.assertTrue(downsampler.hasDownsampledItems()); - Assert.assertFalse(downsampler.hasPendingItems()); - - List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems()); - - System.out.println("testThreeStacksWithShortStackInMiddle: Downsampled Stack sizes: " + downsampledStackSizes); - - Assert.assertEquals(downsampledStackSizes.size(), 3); - Assert.assertTrue(downsampledStackSizes.get(0) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(1) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(2) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(0) + 
downsampledStackSizes.get(1) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(2) <= 1000); - } - - /** - * ------ - * ------ - * ------- - * ------- - * --- - * --- - */ - @Test - public void testThreeStacksWithShortStackAtEnd() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - PositionalDownsampler downsampler = new PositionalDownsampler(1000); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 50, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 135, 25)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.signalEndOfInput(); - Assert.assertTrue(downsampler.hasDownsampledItems()); - Assert.assertFalse(downsampler.hasPendingItems()); - - List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems()); - - System.out.println("testThreeStacksWithShortStackAtEnd: Downsampled Stack sizes: " + downsampledStackSizes); - - Assert.assertEquals(downsampledStackSizes.size(), 3); - Assert.assertTrue(downsampledStackSizes.get(0) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(1) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(2) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(1) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(1) + downsampledStackSizes.get(2) <= 1000); - } - - /** - * ------- - * ---- - * ------- - * ---- - * ------- - * ------- - */ - @Test - public void testThreePartiallyOverlappingStacks() { - SAMFileHeader header = 
ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - PositionalDownsampler downsampler = new PositionalDownsampler(1000); - - downsampler.submit(createStackOfVaryingReads(2000, header, "foo", 0, 1, 100, 50)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfVaryingReads(2000, header, "foo", 0, 75, 100, 50)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(2000, header, "foo", 0, 150, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.signalEndOfInput(); - Assert.assertTrue(downsampler.hasDownsampledItems()); - Assert.assertFalse(downsampler.hasPendingItems()); - - List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems()); - - System.out.println("testThreePartiallyOverlappingStacks: Downsampled Stack sizes: " + downsampledStackSizes); - - Assert.assertEquals(downsampledStackSizes.size(), 3); - Assert.assertTrue(downsampledStackSizes.get(0) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(1) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(2) <= 1000); - - // TODO: need to examine per-base coverage here - } - - @Test - public void testNoDownsamplingRequired() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - PositionalDownsampler downsampler = new PositionalDownsampler(1000); - - downsampler.submit(createStackOfIdenticalReads(300, header, "foo", 0, 1, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(300, header, "foo", 0, 25, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - 
downsampler.submit(createStackOfIdenticalReads(300, header, "foo", 0, 50, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.signalEndOfInput(); - Assert.assertTrue(downsampler.hasDownsampledItems()); - Assert.assertFalse(downsampler.hasPendingItems()); - - List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems()); - - System.out.println("testNoDownsamplingRequired: Downsampled Stack sizes: " + downsampledStackSizes); - - Assert.assertEquals(downsampledStackSizes.size(), 3); - Assert.assertTrue(downsampledStackSizes.get(0) == 300); - Assert.assertTrue(downsampledStackSizes.get(1) == 300); - Assert.assertTrue(downsampledStackSizes.get(2) == 300); - } - - @Test - public void testGATKSAMRecordSupport() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - PositionalDownsampler downsampler = new PositionalDownsampler(1000); - - List reads = new ArrayList(); - for ( int i = 0; i < 10; i++ ) { - reads.add(ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 10, 20 * i + 10)); - } - - downsampler.submit(reads); - downsampler.signalEndOfInput(); - List downsampledReads = downsampler.consumeDownsampledItems(); - - Assert.assertTrue(downsampledReads.size() == 10); - } - - private ArrayList createStackOfIdenticalReads( int stackSize, SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) { - ArrayList stack = new ArrayList(stackSize); - for ( int i = 1; i <= stackSize; i++ ) { - stack.add(ArtificialSAMUtils.createArtificialRead(header, name, refIndex, alignmentStart, length)); - } - return stack; - } - - private ArrayList createStackOfVaryingReads( int stackSize, SAMFileHeader header, String name, int refIndex, int alignmentStart, int firstLength, int secondLength ) { - ArrayList stack = createStackOfIdenticalReads(stackSize / 2, header, name, refIndex, alignmentStart, 
firstLength); - stack.addAll(createStackOfIdenticalReads(stackSize / 2, header, name, refIndex, alignmentStart, secondLength)); - return stack; - } - - private List getDownsampledStackSizesAndVerifySortedness( List downsampledReads ) { - List stackSizes = new ArrayList(); - Iterator iter = downsampledReads.iterator(); - Assert.assertTrue(iter.hasNext()); - - SAMRecord previousRead = iter.next(); - int currentStackSize = 1; - - while ( iter.hasNext() ) { - SAMRecord currentRead = iter.next(); - - if ( ! currentRead.getReferenceIndex().equals(previousRead.getReferenceIndex()) || currentRead.getAlignmentStart() > previousRead.getAlignmentStart() ) { - stackSizes.add(currentStackSize); - currentStackSize = 1; - } - else if ( currentRead.getAlignmentStart() < previousRead.getAlignmentStart() ) { - Assert.fail(String.format("Reads are out of order: %s %s", previousRead, currentRead)); - } - else { - currentStackSize++; - } - - previousRead = currentRead; - } - - stackSizes.add(currentStackSize); - return stackSizes; - } -} - diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer.java new file mode 100644 index 000000000..9cbd0db8a --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer.java @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, 
subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStream; +import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStreamAnalyzer; + +/** + * Class for analyzing an artificial read stream that has been positionally downsampled, and verifying + * that the downsampling was done correctly without changing the stream in unexpected ways. 
+ * + * @author David Roazen + */ +public class PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer extends ArtificialSingleSampleReadStreamAnalyzer { + private int targetCoverage; + + public PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer( ArtificialSingleSampleReadStream originalStream, int targetCoverage ) { + super(originalStream); + this.targetCoverage = targetCoverage; + } + + /** + * Overridden validate() method that checks for the effects of positional downsampling in addition to checking + * for whether the original properties of the stream not affected by downsampling have been preserved + */ + @Override + public void validate() { + if ( (originalStream.getNumContigs() == 0 || originalStream.getNumStacksPerContig() == 0) && originalStream.getNumUnmappedReads() == 0 ) { + if ( totalReads != 0 ) { + throw new ReviewedStingException("got reads from the stream, but the stream was configured to have 0 reads"); + } + return; // no further validation needed for the 0-reads case + } + else if ( totalReads == 0 ) { + throw new ReviewedStingException("got no reads from the stream, but the stream was configured to have > 0 reads"); + } + + if ( ! 
allSamplesMatch ) { + throw new ReviewedStingException("some reads had the wrong sample"); + } + + if ( numContigs != originalStream.getNumContigs() ) { + throw new ReviewedStingException("number of contigs not correct"); + } + + if ( stacksPerContig.size() != originalStream.getNumContigs() ) { + throw new ReviewedStingException(String.format("bug in analyzer code: calculated sizes for %d contigs even though there were only %d contigs", + stacksPerContig.size(), originalStream.getNumContigs())); + } + + for ( int contigStackCount : stacksPerContig ) { + if ( contigStackCount != originalStream.getNumStacksPerContig() ) { + throw new ReviewedStingException("contig had incorrect number of stacks"); + } + } + + if ( originalStream.getNumStacksPerContig() > 0 ) { + + // Check for the effects of positional downsampling: + int stackMinimumAfterDownsampling = Math.min(targetCoverage, originalStream.getMinReadsPerStack()); + int stackMaximumAfterDownsampling = targetCoverage; + + if ( minReadsPerStack < stackMinimumAfterDownsampling ) { + throw new ReviewedStingException("stack had fewer than the minimum number of reads after downsampling"); + } + if ( maxReadsPerStack > stackMaximumAfterDownsampling ) { + throw new ReviewedStingException("stack had more than the maximum number of reads after downsampling"); + } + } + else if ( minReadsPerStack != null || maxReadsPerStack != null ) { + throw new ReviewedStingException("bug in analyzer code: reads per stack was calculated even though 0 stacks per contig was specified"); + } + + if ( originalStream.getNumStacksPerContig() > 1 ) { + if ( minDistanceBetweenStacks < originalStream.getMinDistanceBetweenStacks() ) { + throw new ReviewedStingException("stacks were separated by less than the minimum distance"); + } + if ( maxDistanceBetweenStacks > originalStream.getMaxDistanceBetweenStacks() ) { + throw new ReviewedStingException("stacks were separated by more than the maximum distance"); + } + } + else if ( 
minDistanceBetweenStacks != null || maxDistanceBetweenStacks != null ) { + throw new ReviewedStingException("bug in analyzer code: distance between stacks was calculated even though numStacksPerContig was <= 1"); + } + + if ( minReadLength < originalStream.getMinReadLength() ) { + throw new ReviewedStingException("read was shorter than the minimum allowed length"); + } + if ( maxReadLength > originalStream.getMaxReadLength() ) { + throw new ReviewedStingException("read was longer than the maximum allowed length"); + } + + if ( numUnmappedReads != originalStream.getNumUnmappedReads() ) { + throw new ReviewedStingException(String.format("wrong number of unmapped reads: requested %d but saw %d", + originalStream.getNumUnmappedReads(), numUnmappedReads)); + } + + if ( (originalStream.getNumContigs() == 0 || originalStream.getNumStacksPerContig() == 0) && + numUnmappedReads != totalReads ) { + throw new ReviewedStingException("stream should have consisted only of unmapped reads, but saw some mapped reads"); + } + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java new file mode 100644 index 000000000..75d0448c4 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; +import org.testng.Assert; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +public class ReservoirDownsamplerUnitTest extends BaseTest { + + private static class ReservoirDownsamplerTest extends TestDataProvider { + int reservoirSize; + int totalReads; + int expectedNumReadsAfterDownsampling; + int expectedNumDiscardedItems; + + public ReservoirDownsamplerTest( int reservoirSize, int totalReads ) { + super(ReservoirDownsamplerTest.class); + + this.reservoirSize = reservoirSize; + this.totalReads = totalReads; + + expectedNumReadsAfterDownsampling = Math.min(reservoirSize, totalReads); + expectedNumDiscardedItems = totalReads <= reservoirSize ? 
0 : totalReads - reservoirSize; + + setName(String.format("%s: reservoirSize=%d totalReads=%d expectedNumReadsAfterDownsampling=%d expectedNumDiscardedItems=%d", + getClass().getSimpleName(), reservoirSize, totalReads, expectedNumReadsAfterDownsampling, expectedNumDiscardedItems)); + } + + public Collection createReads() { + Collection reads = new ArrayList(totalReads); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + reads.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(totalReads, header, "foo", 0, 1, 100)); + + return reads; + } + } + + @DataProvider(name = "ReservoirDownsamplerTestDataProvider") + public Object[][] createReservoirDownsamplerTestData() { + for ( int reservoirSize = 1; reservoirSize <= 10000; reservoirSize *= 10 ) { + new ReservoirDownsamplerTest(reservoirSize, 0); + for ( int totalReads = 1; totalReads <= 10000; totalReads *= 10 ) { + new ReservoirDownsamplerTest(reservoirSize, totalReads); + } + } + + return ReservoirDownsamplerTest.getTests(ReservoirDownsamplerTest.class); + } + + @Test(dataProvider = "ReservoirDownsamplerTestDataProvider") + public void testReservoirDownsampler( ReservoirDownsamplerTest test ) { + logger.warn("Running test: " + test); + + GenomeAnalysisEngine.resetRandomGenerator(); + + ReadsDownsampler downsampler = new ReservoirDownsampler(test.reservoirSize); + + downsampler.submit(test.createReads()); + + if ( test.totalReads > 0 ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + downsampler.signalEndOfInput(); + + if ( test.totalReads > 0 ) { + 
Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + List downsampledReads = downsampler.consumeFinalizedItems(); + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + + Assert.assertEquals(downsampledReads.size(), test.expectedNumReadsAfterDownsampling); + + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), test.expectedNumDiscardedItems); + Assert.assertEquals(test.totalReads - downsampledReads.size(), test.expectedNumDiscardedItems); + + downsampler.reset(); + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerUnitTest.java new file mode 100644 index 000000000..5dc41b4a0 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerUnitTest.java @@ -0,0 +1,330 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above 
copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; +import org.testng.Assert; + +import java.util.*; + +public class SimplePositionalDownsamplerUnitTest extends BaseTest { + + private static class SimplePositionalDownsamplerTest extends TestDataProvider { + int targetCoverage; + int numStacks; + List stackSizes; + List expectedStackSizes; + boolean multipleContigs; + int totalInitialReads; + + public SimplePositionalDownsamplerTest( int targetCoverage, List stackSizes, boolean multipleContigs ) { + super(SimplePositionalDownsamplerTest.class); + + this.targetCoverage = targetCoverage; + this.numStacks = stackSizes.size(); + this.stackSizes = stackSizes; + this.multipleContigs = multipleContigs; + + calculateExpectedDownsampledStackSizes(); + + totalInitialReads = 0; + for ( Integer stackSize : stackSizes ) { + totalInitialReads += stackSize; + } + + setName(String.format("%s: targetCoverage=%d numStacks=%d stackSizes=%s expectedSizes=%s multipleContigs=%b", + 
getClass().getSimpleName(), targetCoverage, numStacks, stackSizes, expectedStackSizes, multipleContigs)); + } + + public Collection createReads() { + Collection reads = new ArrayList(); + SAMFileHeader header = multipleContigs ? + ArtificialSAMUtils.createArtificialSamHeader(2, 1, 1000000) : + ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + int refIndex = 0; + int alignmentStart = 1; + int readLength = 100; + + for ( int i = 0; i < numStacks; i++ ) { + if ( multipleContigs && refIndex == 0 && i >= numStacks / 2 ) { + refIndex++; + } + + reads.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(stackSizes.get(i), header, "foo", + refIndex, alignmentStart, readLength)); + + alignmentStart += 10; + } + + return reads; + } + + private void calculateExpectedDownsampledStackSizes() { + expectedStackSizes = new ArrayList(numStacks); + + for ( Integer stackSize : stackSizes ) { + int expectedSize = targetCoverage >= stackSize ? stackSize : targetCoverage; + expectedStackSizes.add(expectedSize); + } + } + } + + @DataProvider(name = "SimplePositionalDownsamplerTestDataProvider") + public Object[][] createSimplePositionalDownsamplerTestData() { + GenomeAnalysisEngine.resetRandomGenerator(); + + for ( int targetCoverage = 1; targetCoverage <= 10000; targetCoverage *= 10 ) { + for ( int contigs = 1; contigs <= 2; contigs++ ) { + for ( int numStacks = 0; numStacks <= 10; numStacks++ ) { + List stackSizes = new ArrayList(numStacks); + for ( int stack = 1; stack <= numStacks; stack++ ) { + stackSizes.add(GenomeAnalysisEngine.getRandomGenerator().nextInt(targetCoverage * 2) + 1); + } + new SimplePositionalDownsamplerTest(targetCoverage, stackSizes, contigs > 1); + } + } + } + + return SimplePositionalDownsamplerTest.getTests(SimplePositionalDownsamplerTest.class); + } + + @Test( dataProvider = "SimplePositionalDownsamplerTestDataProvider" ) + public void testSimplePostionalDownsampler( SimplePositionalDownsamplerTest test ) { + logger.warn("Running 
test: " + test); + + GenomeAnalysisEngine.resetRandomGenerator(); + + ReadsDownsampler downsampler = new SimplePositionalDownsampler(test.targetCoverage); + + downsampler.submit(test.createReads()); + + if ( test.numStacks > 1 ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertTrue(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() != null); + } + else if ( test.numStacks == 1 ) { + Assert.assertFalse(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() == null); + Assert.assertTrue(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() != null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + downsampler.signalEndOfInput(); + + if ( test.numStacks > 0 ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + List downsampledReads = downsampler.consumeFinalizedItems(); + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + + if ( test.numStacks == 0 ) { + Assert.assertTrue(downsampledReads.isEmpty()); + } + else { + List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampledReads); + + Assert.assertEquals(downsampledStackSizes.size(), test.numStacks); + Assert.assertEquals(downsampledStackSizes, test.expectedStackSizes); + + int 
numReadsActuallyEliminated = test.totalInitialReads - downsampledReads.size(); + int numReadsReportedEliminated = downsampler.getNumberOfDiscardedItems(); + Assert.assertEquals(numReadsActuallyEliminated, numReadsReportedEliminated); + } + + downsampler.reset(); + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); + } + + private List getDownsampledStackSizesAndVerifySortedness( List downsampledReads ) { + List stackSizes = new ArrayList(); + + if ( downsampledReads.isEmpty() ) { + return stackSizes; + } + + Iterator iter = downsampledReads.iterator(); + Assert.assertTrue(iter.hasNext()); + + SAMRecord previousRead = iter.next(); + int currentStackSize = 1; + + while ( iter.hasNext() ) { + SAMRecord currentRead = iter.next(); + + if ( currentRead.getReferenceIndex() > previousRead.getReferenceIndex() || currentRead.getAlignmentStart() > previousRead.getAlignmentStart() ) { + stackSizes.add(currentStackSize); + currentStackSize = 1; + } + else if ( currentRead.getReferenceIndex() < previousRead.getReferenceIndex() || currentRead.getAlignmentStart() < previousRead.getAlignmentStart() ) { + Assert.fail(String.format("Reads are out of order: %s %s", previousRead, currentRead)); + } + else { + currentStackSize++; + } + + previousRead = currentRead; + } + + stackSizes.add(currentStackSize); + return stackSizes; + } + + @Test + public void testSimplePositionalDownsamplerSignalNoMoreReadsBefore() { + ReadsDownsampler downsampler = new SimplePositionalDownsampler(1000); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + Collection readStack = new ArrayList(); + readStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(50, header, "foo", 0, 1, 100)); + downsampler.submit(readStack); + + Assert.assertFalse(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() == null); + Assert.assertTrue(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() != null); + + 
SAMRecord laterRead = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 2, 100); + downsampler.signalNoMoreReadsBefore(laterRead); + + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + + List downsampledReads = downsampler.consumeFinalizedItems(); + + Assert.assertEquals(downsampledReads.size(), readStack.size()); + } + + @Test + public void testBasicUnmappedReadsSupport() { + ReadsDownsampler downsampler = new SimplePositionalDownsampler(100); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + Collection readStack = new ArrayList(); + readStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(200, header, "foo", SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, + SAMRecord.NO_ALIGNMENT_START, 100)); + for ( SAMRecord read : readStack ) { + Assert.assertTrue(read.getReadUnmappedFlag()); + } + + downsampler.submit(readStack); + downsampler.signalEndOfInput(); + + List downsampledReads = downsampler.consumeFinalizedItems(); + + // Unmapped reads should not get downsampled at all by the SimplePositionalDownsampler + Assert.assertEquals(downsampledReads.size(), readStack.size()); + + for ( SAMRecord read: downsampledReads ) { + Assert.assertTrue(read.getReadUnmappedFlag()); + } + } + + @Test + public void testMixedMappedAndUnmappedReadsSupport() { + ReadsDownsampler downsampler = new SimplePositionalDownsampler(100); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + Collection mappedReadStack = new ArrayList(); + mappedReadStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(200, header, "foo", 0, 1, 100)); + for ( SAMRecord read : mappedReadStack ) { + Assert.assertFalse(read.getReadUnmappedFlag()); + } + + Collection unmappedReadStack = new ArrayList(); + 
unmappedReadStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(200, header, "foo", SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, + SAMRecord.NO_ALIGNMENT_START, 100)); + for ( SAMRecord read : unmappedReadStack ) { + Assert.assertTrue(read.getReadUnmappedFlag()); + } + + downsampler.submit(mappedReadStack); + downsampler.submit(unmappedReadStack); + downsampler.signalEndOfInput(); + + List downsampledReads = downsampler.consumeFinalizedItems(); + + // Unmapped reads should not get downsampled at all by the SimplePositionalDownsampler + Assert.assertEquals(downsampledReads.size(), 300); + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 100); + + int count = 1; + for ( SAMRecord read: downsampledReads ) { + if ( count <= 100 ) { + Assert.assertFalse(read.getReadUnmappedFlag()); + } + else { + Assert.assertTrue(read.getReadUnmappedFlag()); + } + + count++; + } + } + + @Test + public void testGATKSAMRecordSupport() { + ReadsDownsampler downsampler = new SimplePositionalDownsampler(1000); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + List reads = new ArrayList(); + for ( int i = 0; i < 10; i++ ) { + reads.add(ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 10, 20 * i + 10)); + } + + downsampler.submit(reads); + downsampler.signalEndOfInput(); + List downsampledReads = downsampler.consumeFinalizedItems(); + + Assert.assertEquals(downsampledReads.size(), 10); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimentalUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimentalUnitTest.java new file mode 100644 index 000000000..c148bcf84 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimentalUnitTest.java @@ -0,0 +1,546 @@ +package org.broadinstitute.sting.gatk.iterators; + +import net.sf.samtools.SAMFileHeader; +import 
net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.util.CloseableIterator; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.ReadProperties; +import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; +import org.broadinstitute.sting.gatk.filters.ReadFilter; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +/** + * testing of the experimental version of LocusIteratorByState + */ +public class LocusIteratorByStateExperimentalUnitTest extends BaseTest { + private static SAMFileHeader header; + private LocusIteratorByStateExperimental li; + private GenomeLocParser genomeLocParser; + + @BeforeClass + public void beforeClass() { + header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); + genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); + } + + private final LocusIteratorByStateExperimental makeLTBS(List reads, ReadProperties readAttributes) { + return new LocusIteratorByStateExperimental(new FakeCloseableIterator(reads.iterator()), readAttributes, genomeLocParser, LocusIteratorByStateExperimental.sampleListForSAMWithoutReadGroups()); + } + + private static ReadProperties createTestReadProperties() { + return createTestReadProperties(null); + 
} + + private static ReadProperties createTestReadProperties( DownsamplingMethod downsamplingMethod ) { + return new ReadProperties( + Collections.emptyList(), + new SAMFileHeader(), + false, + SAMFileReader.ValidationStringency.STRICT, + downsamplingMethod, + new ValidationExclusion(), + Collections.emptyList(), + Collections.emptyList(), + false, + (byte) -1 + ); + } + + private static class FakeCloseableIterator implements CloseableIterator { + Iterator iterator; + + public FakeCloseableIterator(Iterator it) { + iterator = it; + } + + @Override + public void close() { + return; + } + + @Override + public boolean hasNext() { + return iterator.hasNext(); + } + + @Override + public T next() { + return iterator.next(); + } + + @Override + public void remove() { + throw new UnsupportedOperationException("Don't remove!"); + } + } + + @Test + public void testXandEQOperators() { + final byte[] bases1 = new byte[] {'A','A','A','A','A','A','A','A','A','A'}; + final byte[] bases2 = new byte[] {'A','A','A','C','A','A','A','A','A','C'}; + + // create a test version of the Reads object + ReadProperties readAttributes = createTestReadProperties(); + + SAMRecord r1 = ArtificialSAMUtils.createArtificialRead(header,"r1",0,1,10); + r1.setReadBases(bases1); + r1.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20}); + r1.setCigarString("10M"); + + SAMRecord r2 = ArtificialSAMUtils.createArtificialRead(header,"r2",0,1,10); + r2.setReadBases(bases2); + r2.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20,20,20}); + r2.setCigarString("3=1X5=1X"); + + SAMRecord r3 = ArtificialSAMUtils.createArtificialRead(header,"r3",0,1,10); + r3.setReadBases(bases2); + r3.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20,20,20}); + r3.setCigarString("3=1X5M1X"); + + SAMRecord r4 = ArtificialSAMUtils.createArtificialRead(header,"r4",0,1,10); + r4.setReadBases(bases2); + r4.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20}); + r4.setCigarString("10M"); + + List reads 
= Arrays.asList(r1, r2, r3, r4); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads,readAttributes); + + while (li.hasNext()) { + AlignmentContext context = li.next(); + ReadBackedPileup pileup = context.getBasePileup(); + Assert.assertEquals(pileup.depthOfCoverage(), 4); + } + } + + @Test + public void testIndelsInRegularPileup() { + final byte[] bases = new byte[] {'A','A','A','A','A','A','A','A','A','A'}; + final byte[] indelBases = new byte[] {'A','A','A','A','C','T','A','A','A','A','A','A'}; + + // create a test version of the Reads object + ReadProperties readAttributes = createTestReadProperties(); + + SAMRecord before = ArtificialSAMUtils.createArtificialRead(header,"before",0,1,10); + before.setReadBases(bases); + before.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20}); + before.setCigarString("10M"); + + SAMRecord during = ArtificialSAMUtils.createArtificialRead(header,"during",0,2,10); + during.setReadBases(indelBases); + during.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20,20,20}); + during.setCigarString("4M2I6M"); + + SAMRecord after = ArtificialSAMUtils.createArtificialRead(header,"after",0,3,10); + after.setReadBases(bases); + after.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20}); + after.setCigarString("10M"); + + List reads = Arrays.asList(before, during, after); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads,readAttributes); + + boolean foundIndel = false; + while (li.hasNext()) { + AlignmentContext context = li.next(); + ReadBackedPileup pileup = context.getBasePileup().getBaseFilteredPileup(10); + for (PileupElement p : pileup) { + if (p.isBeforeInsertion()) { + foundIndel = true; + Assert.assertEquals(p.getEventLength(), 2, "Wrong event length"); + Assert.assertEquals(p.getEventBases(), "CT", "Inserted bases are incorrect"); + break; + } + } + + } + + Assert.assertTrue(foundIndel,"Indel in pileup not found"); + } + 
+ @Test + public void testWholeIndelReadInIsolation() { + final int firstLocus = 44367789; + + // create a test version of the Reads object + ReadProperties readAttributes = createTestReadProperties(); + + SAMRecord indelOnlyRead = ArtificialSAMUtils.createArtificialRead(header, "indelOnly", 0, firstLocus, 76); + indelOnlyRead.setReadBases(Utils.dupBytes((byte)'A',76)); + indelOnlyRead.setBaseQualities(Utils.dupBytes((byte) '@', 76)); + indelOnlyRead.setCigarString("76I"); + + List reads = Arrays.asList(indelOnlyRead); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads, readAttributes); + + // Traditionally, reads that end with indels bleed into the pileup at the following locus. Verify that the next pileup contains this read + // and considers it to be an indel-containing read. + Assert.assertTrue(li.hasNext(),"Should have found a whole-indel read in the normal base pileup without extended events enabled"); + AlignmentContext alignmentContext = li.next(); + Assert.assertEquals(alignmentContext.getLocation().getStart(), firstLocus, "Base pileup is at incorrect location."); + ReadBackedPileup basePileup = alignmentContext.getBasePileup(); + Assert.assertEquals(basePileup.getReads().size(),1,"Pileup is of incorrect size"); + Assert.assertSame(basePileup.getReads().get(0), indelOnlyRead, "Read in pileup is incorrect"); + } + + /** + * Test to make sure that reads supporting only an indel (example cigar string: 76I) do + * not negatively influence the ordering of the pileup. 
+ */ + @Test + public void testWholeIndelRead() { + final int firstLocus = 44367788, secondLocus = firstLocus + 1; + + SAMRecord leadingRead = ArtificialSAMUtils.createArtificialRead(header,"leading",0,firstLocus,76); + leadingRead.setReadBases(Utils.dupBytes((byte)'A',76)); + leadingRead.setBaseQualities(Utils.dupBytes((byte)'@',76)); + leadingRead.setCigarString("1M75I"); + + SAMRecord indelOnlyRead = ArtificialSAMUtils.createArtificialRead(header,"indelOnly",0,secondLocus,76); + indelOnlyRead.setReadBases(Utils.dupBytes((byte) 'A', 76)); + indelOnlyRead.setBaseQualities(Utils.dupBytes((byte)'@',76)); + indelOnlyRead.setCigarString("76I"); + + SAMRecord fullMatchAfterIndel = ArtificialSAMUtils.createArtificialRead(header,"fullMatch",0,secondLocus,76); + fullMatchAfterIndel.setReadBases(Utils.dupBytes((byte)'A',76)); + fullMatchAfterIndel.setBaseQualities(Utils.dupBytes((byte)'@',76)); + fullMatchAfterIndel.setCigarString("75I1M"); + + List reads = Arrays.asList(leadingRead, indelOnlyRead, fullMatchAfterIndel); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads, createTestReadProperties()); + int currentLocus = firstLocus; + int numAlignmentContextsFound = 0; + + while(li.hasNext()) { + AlignmentContext alignmentContext = li.next(); + Assert.assertEquals(alignmentContext.getLocation().getStart(),currentLocus,"Current locus returned by alignment context is incorrect"); + + if(currentLocus == firstLocus) { + List readsAtLocus = alignmentContext.getBasePileup().getReads(); + Assert.assertEquals(readsAtLocus.size(),1,"Wrong number of reads at locus " + currentLocus); + Assert.assertSame(readsAtLocus.get(0),leadingRead,"leadingRead absent from pileup at locus " + currentLocus); + } + else if(currentLocus == secondLocus) { + List readsAtLocus = alignmentContext.getBasePileup().getReads(); + Assert.assertEquals(readsAtLocus.size(),2,"Wrong number of reads at locus " + currentLocus); + 
Assert.assertSame(readsAtLocus.get(0),indelOnlyRead,"indelOnlyRead absent from pileup at locus " + currentLocus); + Assert.assertSame(readsAtLocus.get(1),fullMatchAfterIndel,"fullMatchAfterIndel absent from pileup at locus " + currentLocus); + } + + currentLocus++; + numAlignmentContextsFound++; + } + + Assert.assertEquals(numAlignmentContextsFound, 2, "Found incorrect number of alignment contexts"); + } + + /** + * Test to make sure that reads supporting only an indel (example cigar string: 76I) are represented properly + */ + @Test + public void testWholeIndelReadRepresentedTest() { + final int firstLocus = 44367788, secondLocus = firstLocus + 1; + + SAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header,"read1",0,secondLocus,1); + read1.setReadBases(Utils.dupBytes((byte) 'A', 1)); + read1.setBaseQualities(Utils.dupBytes((byte) '@', 1)); + read1.setCigarString("1I"); + + List reads = Arrays.asList(read1); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads, createTestReadProperties()); + + while(li.hasNext()) { + AlignmentContext alignmentContext = li.next(); + ReadBackedPileup p = alignmentContext.getBasePileup(); + Assert.assertTrue(p.getNumberOfElements() == 1); + PileupElement pe = p.iterator().next(); + Assert.assertTrue(pe.isBeforeInsertion()); + Assert.assertFalse(pe.isAfterInsertion()); + Assert.assertEquals(pe.getEventBases(), "A"); + } + + SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header,"read2",0,secondLocus,10); + read2.setReadBases(Utils.dupBytes((byte) 'A', 10)); + read2.setBaseQualities(Utils.dupBytes((byte) '@', 10)); + read2.setCigarString("10I"); + + reads = Arrays.asList(read2); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads, createTestReadProperties()); + + while(li.hasNext()) { + AlignmentContext alignmentContext = li.next(); + ReadBackedPileup p = alignmentContext.getBasePileup(); + Assert.assertTrue(p.getNumberOfElements() == 
1); + PileupElement pe = p.iterator().next(); + Assert.assertTrue(pe.isBeforeInsertion()); + Assert.assertFalse(pe.isAfterInsertion()); + Assert.assertEquals(pe.getEventBases(), "AAAAAAAAAA"); + } + } + + //////////////////////////////////////////// + // comprehensive LIBS/PileupElement tests // + //////////////////////////////////////////// + + private static final int IS_BEFORE_DELETED_BASE_FLAG = 1; + private static final int IS_BEFORE_DELETION_START_FLAG = 2; + private static final int IS_AFTER_DELETED_BASE_FLAG = 4; + private static final int IS_AFTER_DELETION_END_FLAG = 8; + private static final int IS_BEFORE_INSERTION_FLAG = 16; + private static final int IS_AFTER_INSERTION_FLAG = 32; + private static final int IS_NEXT_TO_SOFTCLIP_FLAG = 64; + + private static class LIBSTest { + + + final String cigar; + final int readLength; + final List offsets; + final List flags; + + private LIBSTest(final String cigar, final int readLength, final List offsets, final List flags) { + this.cigar = cigar; + this.readLength = readLength; + this.offsets = offsets; + this.flags = flags; + } + } + + @DataProvider(name = "LIBSTest") + public Object[][] createLIBSTestData() { + return new Object[][]{ + {new LIBSTest("1I", 1, Arrays.asList(0), Arrays.asList(IS_BEFORE_INSERTION_FLAG))}, + {new LIBSTest("10I", 10, Arrays.asList(0), Arrays.asList(IS_BEFORE_INSERTION_FLAG))}, + {new LIBSTest("2M2I2M", 6, Arrays.asList(0,1,4,5), Arrays.asList(0,IS_BEFORE_INSERTION_FLAG,IS_AFTER_INSERTION_FLAG,0))}, + {new LIBSTest("2M2I", 4, Arrays.asList(0,1), Arrays.asList(0,IS_BEFORE_INSERTION_FLAG))}, + //TODO -- uncomment these when LIBS is fixed + //{new LIBSTest("2I2M", 4, Arrays.asList(2,3), Arrays.asList(IS_AFTER_INSERTION_FLAG,0))}, + //{new LIBSTest("1I1M1D1M", 3, Arrays.asList(0,1), Arrays.asList(IS_AFTER_INSERTION_FLAG | IS_BEFORE_DELETION_START_FLAG | IS_BEFORE_DELETED_BASE_FLAG,IS_AFTER_DELETED_BASE_FLAG | IS_AFTER_DELETION_END_FLAG))}, + //{new LIBSTest("1S1I1M", 3, Arrays.asList(2), 
Arrays.asList(IS_AFTER_INSERTION_FLAG))}, + {new LIBSTest("1M2D2M", 3, Arrays.asList(0,1,2), Arrays.asList(IS_BEFORE_DELETION_START_FLAG | IS_BEFORE_DELETED_BASE_FLAG,IS_AFTER_DELETED_BASE_FLAG | IS_AFTER_DELETION_END_FLAG,0))}, + {new LIBSTest("1S1M", 2, Arrays.asList(1), Arrays.asList(IS_NEXT_TO_SOFTCLIP_FLAG))}, + {new LIBSTest("1M1S", 2, Arrays.asList(0), Arrays.asList(IS_NEXT_TO_SOFTCLIP_FLAG))}, + {new LIBSTest("1S1M1I", 3, Arrays.asList(1), Arrays.asList(IS_BEFORE_INSERTION_FLAG | IS_NEXT_TO_SOFTCLIP_FLAG))} + }; + }
+
+    /**
+     * Runs a single artificial read with the given cigar through the locus iterator and checks,
+     * locus by locus, the pileup element's indel/soft-clip accessors and read offset against
+     * the expected flag bitmask and offset at the same position in {@code params}.
+     *
+     * @param params the cigar, the read length, and the expected per-locus offsets and flags
+     */
+    @Test(dataProvider = "LIBSTest")
+    public void testLIBS(LIBSTest params) {
+        final int locus = 44367788;
+
+        SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, locus, params.readLength);
+        read.setReadBases(Utils.dupBytes((byte) 'A', params.readLength));
+        read.setBaseQualities(Utils.dupBytes((byte) '@', params.readLength));
+        read.setCigarString(params.cigar);
+
+        // create the iterator by state with the fake reads and fake records
+        li = makeLTBS(Arrays.asList(read), createTestReadProperties());
+
+        // index into params.offsets / params.flags of the locus currently being checked
+        int offset = 0;
+        while ( li.hasNext() ) {
+            AlignmentContext alignmentContext = li.next();
+            ReadBackedPileup p = alignmentContext.getBasePileup();
+            Assert.assertEquals(p.getNumberOfElements(), 1, "Wrong number of pileup elements for cigar " + params.cigar);
+            PileupElement pe = p.iterator().next();
+
+            // each accessor must agree with its bit in the expected mask for this locus
+            final int flag = params.flags.get(offset);
+            Assert.assertEquals(pe.isBeforeDeletedBase(), (flag & IS_BEFORE_DELETED_BASE_FLAG) != 0);
+            Assert.assertEquals(pe.isBeforeDeletionStart(), (flag & IS_BEFORE_DELETION_START_FLAG) != 0);
+            Assert.assertEquals(pe.isAfterDeletedBase(), (flag & IS_AFTER_DELETED_BASE_FLAG) != 0);
+            Assert.assertEquals(pe.isAfterDeletionEnd(), (flag & IS_AFTER_DELETION_END_FLAG) != 0);
+            Assert.assertEquals(pe.isBeforeInsertion(), (flag & IS_BEFORE_INSERTION_FLAG) != 0);
+            Assert.assertEquals(pe.isAfterInsertion(), (flag & IS_AFTER_INSERTION_FLAG) != 0);
+            Assert.assertEquals(pe.isNextToSoftClip(), (flag & IS_NEXT_TO_SOFTCLIP_FLAG) != 0);
+
+            Assert.assertEquals(pe.getOffset(), params.offsets.get(offset).intValue());
+
+            offset++;
+        }
+
+        // Without this check the test passes vacuously when the iterator yields fewer loci than expected
+        Assert.assertEquals(offset, params.flags.size(), "Wrong number of loci visited for cigar " + params.cigar);
+    }
+ + //////////////////////////////////////////////// + // End comprehensive LIBS/PileupElement tests // + //////////////////////////////////////////////// + + + /////////////////////////////////////// + // Read State Manager Tests // + /////////////////////////////////////// + + private class PerSampleReadStateManagerTest extends TestDataProvider { + private List readCountsPerAlignmentStart; + private List reads; + private List> recordStatesByAlignmentStart; + private int removalInterval; + + public PerSampleReadStateManagerTest( List readCountsPerAlignmentStart, int removalInterval ) { + super(PerSampleReadStateManagerTest.class); + + this.readCountsPerAlignmentStart = readCountsPerAlignmentStart; + this.removalInterval = removalInterval; + + reads = new ArrayList(); + recordStatesByAlignmentStart = new ArrayList>(); + + setName(String.format("%s: readCountsPerAlignmentStart: %s removalInterval: %d", + getClass().getSimpleName(), readCountsPerAlignmentStart, removalInterval)); + } + + public void run() { + LocusIteratorByStateExperimental libs = makeLTBS(new ArrayList(), createTestReadProperties()); + LocusIteratorByStateExperimental.ReadStateManager readStateManager = + libs.new ReadStateManager(new ArrayList().iterator()); + LocusIteratorByStateExperimental.ReadStateManager.PerSampleReadStateManager perSampleReadStateManager = + readStateManager.new PerSampleReadStateManager(); + + makeReads(); + + for ( ArrayList stackRecordStates : recordStatesByAlignmentStart ) { + perSampleReadStateManager.addStatesAtNextAlignmentStart(stackRecordStates); + } + + // read state manager should have the right number of reads + Assert.assertEquals(reads.size(), perSampleReadStateManager.size()); + + Iterator originalReadsIterator = reads.iterator(); + Iterator recordStateIterator = perSampleReadStateManager.iterator(); + int recordStateCount = 0; + int numReadStatesRemoved = 0; + + 
// Do a first-pass validation of the record state iteration by making sure we get back everything we + // put in, in the same order, doing any requested removals of read states along the way + while ( recordStateIterator.hasNext() ) { + LocusIteratorByStateExperimental.SAMRecordState readState = recordStateIterator.next(); + recordStateCount++; + SAMRecord readFromPerSampleReadStateManager = readState.getRead(); + + Assert.assertTrue(originalReadsIterator.hasNext()); + SAMRecord originalRead = originalReadsIterator.next(); + + // The read we get back should be literally the same read in memory as we put in + Assert.assertTrue(originalRead == readFromPerSampleReadStateManager); + + // If requested, remove a read state every removalInterval states + if ( removalInterval > 0 && recordStateCount % removalInterval == 0 ) { + recordStateIterator.remove(); + numReadStatesRemoved++; + } + } + + Assert.assertFalse(originalReadsIterator.hasNext()); + + // If we removed any read states, do a second pass through the read states to make sure the right + // states were removed + if ( numReadStatesRemoved > 0 ) { + Assert.assertEquals(perSampleReadStateManager.size(), reads.size() - numReadStatesRemoved); + + originalReadsIterator = reads.iterator(); + recordStateIterator = perSampleReadStateManager.iterator(); + int readCount = 0; + int readStateCount = 0; + + // Match record states with the reads that should remain after removal + while ( recordStateIterator.hasNext() ) { + LocusIteratorByStateExperimental.SAMRecordState readState = recordStateIterator.next(); + readStateCount++; + SAMRecord readFromPerSampleReadStateManager = readState.getRead(); + + Assert.assertTrue(originalReadsIterator.hasNext()); + + SAMRecord originalRead = originalReadsIterator.next(); + readCount++; + + if ( readCount % removalInterval == 0 ) { + originalRead = originalReadsIterator.next(); // advance to next read, since the previous one should have been discarded + readCount++; + } + + // The read we 
get back should be literally the same read in memory as we put in (after accounting for removals) + Assert.assertTrue(originalRead == readFromPerSampleReadStateManager); + } + + Assert.assertEquals(readStateCount, reads.size() - numReadStatesRemoved); + } + + // Allow memory used by this test to be reclaimed + readCountsPerAlignmentStart = null; + reads = null; + recordStatesByAlignmentStart = null; + }
+
+        /**
+         * Creates one stack of identical artificial reads for every entry of
+         * readCountsPerAlignmentStart (the entry is the stack's size), appends the reads to
+         * {@code reads}, and appends one list of freshly created read states per stack to
+         * {@code recordStatesByAlignmentStart}. Each stack is placed at its own alignment
+         * start, beginning at 1 and increasing by one per stack.
+         */
+        private void makeReads() {
+            int alignmentStart = 1;
+
+            for ( int readsThisStack : readCountsPerAlignmentStart ) {
+                // read length is drawn at random from [50, 100] once per stack
+                ArrayList<SAMRecord> stackReads = new ArrayList<SAMRecord>(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(readsThisStack, header, "foo", 0, alignmentStart, MathUtils.randomIntegerInRange(50, 100)));
+                ArrayList<LocusIteratorByStateExperimental.SAMRecordState> stackRecordStates = new ArrayList<LocusIteratorByStateExperimental.SAMRecordState>();
+
+                for ( SAMRecord read : stackReads ) {
+                    stackRecordStates.add(new LocusIteratorByStateExperimental.SAMRecordState(read));
+                }
+
+                reads.addAll(stackReads);
+                recordStatesByAlignmentStart.add(stackRecordStates);
+
+                // the next stack must start at a new position; previously every stack was built at position 1
+                alignmentStart++;
+            }
+        }
+ } + + @DataProvider(name = "PerSampleReadStateManagerTestDataProvider") + public Object[][] createPerSampleReadStateManagerTests() { + for ( List thisTestReadStateCounts : Arrays.asList( Arrays.asList(1), + Arrays.asList(2), + Arrays.asList(10), + Arrays.asList(1, 1), + Arrays.asList(2, 2), + Arrays.asList(10, 10), + Arrays.asList(1, 10), + Arrays.asList(10, 1), + Arrays.asList(1, 1, 1), + Arrays.asList(2, 2, 2), + Arrays.asList(10, 10, 10), + Arrays.asList(1, 1, 1, 1, 1, 1), + Arrays.asList(10, 10, 10, 10, 10, 10), + Arrays.asList(1, 2, 10, 1, 2, 10) + ) ) { + + for ( int removalInterval : Arrays.asList(0, 2, 3) ) { + new PerSampleReadStateManagerTest(thisTestReadStateCounts, removalInterval); + } + } + + return PerSampleReadStateManagerTest.getTests(PerSampleReadStateManagerTest.class); + } + + @Test(dataProvider = "PerSampleReadStateManagerTestDataProvider") + public void runPerSampleReadStateManagerTest( PerSampleReadStateManagerTest test ) { + logger.warn("Running test: " + test); + + 
test.run(); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/VerifyingSamIteratorUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/VerifyingSamIteratorUnitTest.java index 3b5d8d6b7..f0d7f83dc 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/iterators/VerifyingSamIteratorUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/iterators/VerifyingSamIteratorUnitTest.java @@ -28,14 +28,12 @@ import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.SAMSequenceRecord; -import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; -import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -48,7 +46,6 @@ import java.util.List; */ public class VerifyingSamIteratorUnitTest { private SAMFileHeader samFileHeader; - private GenomeLocParser genomeLocParser; @BeforeClass public void init() { @@ -58,8 +55,6 @@ public class VerifyingSamIteratorUnitTest { samFileHeader = new SAMFileHeader(); samFileHeader.setSequenceDictionary(sequenceDictionary); - - genomeLocParser = new GenomeLocParser(sequenceDictionary); } @Test @@ -68,7 +63,7 @@ public class VerifyingSamIteratorUnitTest { SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(samFileHeader,"read2",getContig(0).getSequenceIndex(),2,10); List reads = Arrays.asList(read1,read2); - VerifyingSamIterator iterator = new VerifyingSamIterator(genomeLocParser,StingSAMIteratorAdapter.adapt(reads.iterator())); + VerifyingSamIterator iterator = new VerifyingSamIterator(StingSAMIteratorAdapter.adapt(reads.iterator())); Assert.assertTrue(iterator.hasNext(),"Insufficient reads"); Assert.assertSame(iterator.next(),read1,"Incorrect read in read 1 
position"); @@ -83,7 +78,7 @@ public class VerifyingSamIteratorUnitTest { SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(samFileHeader,"read2",getContig(1).getSequenceIndex(),1,10); List reads = Arrays.asList(read1,read2); - VerifyingSamIterator iterator = new VerifyingSamIterator(genomeLocParser,StingSAMIteratorAdapter.adapt(reads.iterator())); + VerifyingSamIterator iterator = new VerifyingSamIterator(StingSAMIteratorAdapter.adapt(reads.iterator())); Assert.assertTrue(iterator.hasNext(),"Insufficient reads"); Assert.assertSame(iterator.next(),read1,"Incorrect read in read 1 position"); @@ -98,7 +93,7 @@ public class VerifyingSamIteratorUnitTest { SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(samFileHeader,"read2",getContig(0).getSequenceIndex(),1,10); List reads = Arrays.asList(read1,read2); - VerifyingSamIterator iterator = new VerifyingSamIterator(genomeLocParser,StingSAMIteratorAdapter.adapt(reads.iterator())); + VerifyingSamIterator iterator = new VerifyingSamIterator(StingSAMIteratorAdapter.adapt(reads.iterator())); Assert.assertTrue(iterator.hasNext(),"Insufficient reads"); Assert.assertSame(iterator.next(),read1,"Incorrect read in read 1 position"); @@ -116,7 +111,7 @@ public class VerifyingSamIteratorUnitTest { read1.setReferenceIndex(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX); List reads = Arrays.asList(read1,read2); - VerifyingSamIterator iterator = new VerifyingSamIterator(genomeLocParser,StingSAMIteratorAdapter.adapt(reads.iterator())); + VerifyingSamIterator iterator = new VerifyingSamIterator(StingSAMIteratorAdapter.adapt(reads.iterator())); Assert.assertTrue(iterator.hasNext(),"Insufficient reads"); Assert.assertSame(iterator.next(),read1,"Incorrect read in read 1 position"); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java index 667b325ed..e16ef3125 100644 --- 
a/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java @@ -16,13 +16,27 @@ public class PileupWalkerIntegrationTest extends WalkerTest { executeTest("Testing the standard (no-indel) pileup on three merged FHS pools with 27 deletions in 969 bases", spec); } + + + private final static String SingleReadAligningOffChromosome1MD5 = "4a45fe1f85aaa8c4158782f2b6dee2bd"; @Test public void testSingleReadAligningOffChromosome1() { String gatk_args = "-T Pileup " + " -I " + privateTestDir + "readOffb37contig1.bam" + " -R " + b37KGReference + " -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 1, Arrays.asList("4a45fe1f85aaa8c4158782f2b6dee2bd")); + WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 1, Arrays.asList(SingleReadAligningOffChromosome1MD5)); executeTest("Testing single read spanning off chromosome 1", spec); } + + @Test + public void testSingleReadAligningOffChromosome1NoIndex() { + String gatk_args = "-T Pileup " + + " -I " + privateTestDir + "readOffb37contig1.noIndex.bam" + + " -R " + b37KGReference + + " -U ALLOW_UNINDEXED_BAM" + + " -o %s"; + WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 1, Arrays.asList(SingleReadAligningOffChromosome1MD5)); + executeTest("Testing single read spanning off chromosome 1 unindexed", spec); + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/ReservoirDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/LegacyReservoirDownsamplerUnitTest.java similarity index 99% rename from public/java/test/org/broadinstitute/sting/utils/ReservoirDownsamplerUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/LegacyReservoirDownsamplerUnitTest.java index 0f19e2f90..5b052454a 100644 --- a/public/java/test/org/broadinstitute/sting/utils/ReservoirDownsamplerUnitTest.java +++ 
b/public/java/test/org/broadinstitute/sting/utils/LegacyReservoirDownsamplerUnitTest.java @@ -17,7 +17,7 @@ import java.util.*; * @author mhanna * @version 0.1 */ -public class ReservoirDownsamplerUnitTest { +public class LegacyReservoirDownsamplerUnitTest { private static final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1,1,200); diff --git a/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java index 7a2696b7b..7285c00ac 100755 --- a/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java @@ -1,12 +1,12 @@ package org.broadinstitute.sting.utils; import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.testng.Assert; -import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; -import java.io.File; +import java.util.Arrays; +import java.util.List; +import java.util.concurrent.TimeUnit; public class SimpleTimerUnitTest extends BaseTest { private final static String NAME = "unit.test.timer"; @@ -17,33 +17,88 @@ public class SimpleTimerUnitTest extends BaseTest { Assert.assertEquals(t.getName(), NAME, "Name is not the provided one"); Assert.assertFalse(t.isRunning(), "Initial state of the timer is running"); Assert.assertEquals(t.getElapsedTime(), 0.0, "New timer elapsed time should be 0"); + Assert.assertEquals(t.getElapsedTimeNano(), 0l, "New timer elapsed time nano should be 0"); t.start(); Assert.assertTrue(t.isRunning(), "Started timer isn't running"); Assert.assertTrue(t.getElapsedTime() >= 0.0, "Elapsed time should be >= 0"); + Assert.assertTrue(t.getElapsedTimeNano() >= 0.0, "Elapsed time nano should be >= 0"); + long n1 = t.getElapsedTimeNano(); double t1 = t.getElapsedTime(); idleLoop(); // idle loop to wait a tiny bit of time + long n2 = 
t.getElapsedTimeNano(); double t2 = t.getElapsedTime(); Assert.assertTrue(t2 >= t1, "T2 >= T1 for a running time"); + Assert.assertTrue(n2 >= n1, "T2 >= T1 nano for a running time"); t.stop(); Assert.assertFalse(t.isRunning(), "Stopped timer still running"); + long n3 = t.getElapsedTimeNano(); double t3 = t.getElapsedTime(); idleLoop(); // idle loop to wait a tiny bit of time double t4 = t.getElapsedTime(); + long n4 = t.getElapsedTimeNano(); Assert.assertTrue(t4 == t3, "Elapsed times for two calls of stop timer not the same"); + Assert.assertTrue(n4 == n3, "Elapsed times for two calls of stop timer not the same"); t.restart(); idleLoop(); // idle loop to wait a tiny bit of time double t5 = t.getElapsedTime(); + long n5 = t.getElapsedTimeNano(); Assert.assertTrue(t.isRunning(), "Restarted timer should be running"); idleLoop(); // idle loop to wait a tiny bit of time double t6 = t.getElapsedTime(); + long n6 = t.getElapsedTimeNano(); Assert.assertTrue(t5 >= t4, "Restarted timer elapsed time should be after elapsed time preceding the restart"); Assert.assertTrue(t6 >= t5, "Second elapsed time not after the first in restarted timer"); + Assert.assertTrue(n5 >= n4, "Restarted timer elapsed time nano should be after elapsed time preceding the restart"); + Assert.assertTrue(n6 >= n5, "Second elapsed time nano not after the first in restarted timer"); + + final List secondTimes = Arrays.asList(t1, t2, t3, t4, t5, t6); + final List nanoTimes = Arrays.asList(n1, n2, n3, n4, n5, n6); + for ( int i = 0; i < nanoTimes.size(); i++ ) + Assert.assertEquals( + SimpleTimer.nanoToSecondsAsDouble(nanoTimes.get(i)), + secondTimes.get(i), 1e-1, "Nanosecond and second timer disagree"); } - private final static void idleLoop() { + @Test + public void testNanoResolution() { + SimpleTimer t = new SimpleTimer(NAME); + + // test the nanosecond resolution + long n7 = t.currentTimeNano(); + int sum = 0; + for ( int i = 0; i < 100; i++) sum += i; + long n8 = t.currentTimeNano(); + final long 
delta = n8 - n7; + final long oneMilliInNano = TimeUnit.MILLISECONDS.toNanos(1); + logger.warn("nanoTime before nano operation " + n7); + logger.warn("nanoTime after nano operation of summing 100 ints " + n8 + ", sum = " + sum + " time delta " + delta + " vs. 1 millsecond in nano " + oneMilliInNano); + Assert.assertTrue(n8 > n7, "SimpleTimer doesn't appear to have nanoSecond resolution: n8 " + n8 + " <= n7 " + n7); + Assert.assertTrue(delta < oneMilliInNano, + "SimpleTimer doesn't appear to have nanoSecond resolution: time delta is " + delta + " vs 1 millisecond in nano " + oneMilliInNano); + }
+
+    /**
+     * Starts a timer, runs a 100-iteration empty loop, and checks that both the seconds and
+     * the nanosecond readings are positive and below fixed upper bounds (10 milliseconds for
+     * the seconds reading, maxTimeInMicro microseconds for the nanosecond reading).
+     */
+    @Test
+    public void testMeaningfulTimes() {
+        SimpleTimer t = new SimpleTimer(NAME);
+
+        t.start();
+        for ( int i = 0; i < 100; i++ ) ;
+        long nano = t.getElapsedTimeNano();
+        double secs = t.getElapsedTime();
+
+        Assert.assertTrue(secs > 0, "Seconds timer doesn't appear to count properly: elapsed time is " + secs);
+        Assert.assertTrue(secs < 0.01, "Fast operation said to take longer than 10 milliseconds: elapsed time in seconds " + secs);
+
+        Assert.assertTrue(nano > 0, "Nanosecond timer doesn't appear to count properly: elapsed time is " + nano);
+        final long maxTimeInMicro = 100;
+        // derive the bound from the constant so the limit and the failure message cannot drift apart
+        final long maxTimeInNano = TimeUnit.MICROSECONDS.toNanos(maxTimeInMicro);
+        Assert.assertTrue(nano < maxTimeInNano, "Fast operation said to take longer than " + maxTimeInMicro + " microseconds: elapsed time in nano " + nano + " micro " + TimeUnit.NANOSECONDS.toMicros(nano));
+    }
+ + private static void idleLoop() { for ( int i = 0; i < 100000; i++ ) ; // idle loop to wait a tiny bit of time } } \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java new file mode 100644 index 000000000..b3365c13c --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java @@ -0,0 +1,71 @@ +package 
org.broadinstitute.sting.utils.nanoScheduler;
+
+import org.broadinstitute.sting.BaseTest;
+import org.testng.Assert;
+import org.testng.annotations.DataProvider;
+import org.testng.annotations.Test;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.LinkedBlockingDeque;
+
+/**
+ * UnitTests for the InputProducer
+ *
+ * Feeds a list of consecutive integers through an InputProducer running on its own thread
+ * and checks what arrives on the bounded queue it writes to.
+ *
+ * User: depristo
+ * Date: 8/24/12
+ * Time: 11:25 AM
+ */
+public class InputProducerUnitTest extends BaseTest {
+    /**
+     * Builds the cross product of input sizes and queue capacities to test.
+     *
+     * @return one {@code { nElements, queueSize }} row per combination
+     */
+    @DataProvider(name = "InputProducerTest")
+    public Object[][] createInputProducerTest() {
+        List<Object[]> tests = new ArrayList<Object[]>();
+
+        for ( final int nElements : Arrays.asList(0, 1, 10, 100, 1000, 10000, 100000) ) {
+            for ( final int queueSize : Arrays.asList(1, 10, 100) ) {
+                tests.add(new Object[]{ nElements, queueSize });
+            }
+        }
+
+        return tests.toArray(new Object[][]{});
+    }
+
+    /**
+     * Submits an InputProducer over the integers 0..nElements-1 to a single-thread executor and
+     * drains its queue, checking that the queue never holds more than queueSize entries, that
+     * values arrive in increasing order without gaps, and that the last-marker arrives after
+     * exactly nElements values with nothing left in the queue.
+     *
+     * @param nElements number of integers to push through the producer
+     * @param queueSize capacity of the queue the producer writes to
+     * @throws InterruptedException if interrupted while waiting on the queue
+     */
+    @Test(enabled = true, dataProvider = "InputProducerTest", timeOut = NanoSchedulerUnitTest.NANO_SCHEDULE_MAX_RUNTIME)
+    public void testInputProducer(final int nElements, final int queueSize) throws InterruptedException {
+        final List<Integer> elements = new ArrayList<Integer>(nElements);
+        for ( int i = 0; i < nElements; i++ ) elements.add(i);
+
+        final LinkedBlockingDeque<InputProducer<Integer>.InputValue> readQueue =
+                new LinkedBlockingDeque<InputProducer<Integer>.InputValue>(queueSize);
+
+        final InputProducer<Integer> ip = new InputProducer<Integer>(elements.iterator(), null, readQueue);
+
+        final ExecutorService es = Executors.newSingleThreadExecutor();
+        try {
+            es.submit(ip);
+
+            int lastValue = -1;
+            int nRead = 0;
+            while ( true ) {
+                final int observedQueueSize = readQueue.size();
+                Assert.assertTrue(observedQueueSize <= queueSize,
+                        "Reader is enqueuing more elements " + observedQueueSize + " than allowed " + queueSize);
+
+                final InputProducer<Integer>.InputValue value = readQueue.take();
+                if ( value.isLast() ) {
+                    Assert.assertEquals(nRead, nElements, "Number of input values " + nRead + " not all that are expected " + nElements);
+                    Assert.assertEquals(readQueue.size(), 0, "Last queue element found but queue contains more values!");
+                    break;
+                } else {
+                    Assert.assertTrue(lastValue < value.getValue(), "Read values coming out of order!");
+                    final int expected = lastValue + 1;
+                    Assert.assertEquals((int)value.getValue(), expected, "Value observed " + value.getValue() + " not equal to the expected value " + expected);
+                    nRead++;
+                    lastValue = value.getValue();
+                }
+            }
+        } finally {
+            // release the worker thread even when an assertion fails; otherwise every
+            // data-provider row leaves a non-daemon thread behind
+            es.shutdownNow();
+        }
+    }
+}
diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java index 89506dcb1..47dcc1d5e 100644 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java @@ -1,11 +1,15 @@ package org.broadinstitute.sting.utils.nanoScheduler; +import org.apache.log4j.BasicConfigurator; import org.broadinstitute.sting.BaseTest; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; /** * UnitTests for the NanoScheduler @@ -18,11 +22,11 @@ import java.util.*; public class NanoSchedulerUnitTest extends 
BaseTest { } } + private static class ProgressCallback implements NSProgressFunction { + int callBacks = 0; + + @Override + public void progress(Integer lastMapInput) { + callBacks++; + } + } + + private static int sum2x(final int start, final int end) { int sum = 0; for ( int i = start; i < end; i++ ) @@ -39,18 +53,17 @@ public class NanoSchedulerUnitTest extends BaseTest { } private static class NanoSchedulerBasicTest extends TestDataProvider { - final int bufferSize, mapGroupSize, nThreads, start, end, expectedResult; + final int bufferSize, nThreads, start, end, expectedResult; - public NanoSchedulerBasicTest(final int bufferSize, final int mapGroupSize, final int nThreads, final int start, final int end) { + public NanoSchedulerBasicTest(final int bufferSize, final int nThreads, final int start, final int end) { super(NanoSchedulerBasicTest.class); this.bufferSize = bufferSize; - this.mapGroupSize = mapGroupSize; this.nThreads = nThreads; this.start = start; this.end = end; this.expectedResult = sum2x(start, end); - setName(String.format("%s nt=%d buf=%d mapGroupSize=%d start=%d end=%d sum=%d", - getClass().getSimpleName(), nThreads, bufferSize, mapGroupSize, start, end, expectedResult)); + setName(String.format("%s nt=%d buf=%d start=%d end=%d sum=%d", + getClass().getSimpleName(), nThreads, bufferSize, start, end, expectedResult)); } public Iterator makeReader() { @@ -60,6 +73,11 @@ public class NanoSchedulerUnitTest extends BaseTest { return ints.iterator(); } + public int nExpectedCallbacks() { + int nElements = Math.max(end - start, 0); + return nElements / bufferSize; + } + public Map2x makeMap() { return new Map2x(); } public Integer initReduce() { return 0; } public ReduceSum makeReduce() { return new ReduceSum(); } @@ -69,14 +87,10 @@ public class NanoSchedulerUnitTest extends BaseTest { @DataProvider(name = "NanoSchedulerBasicTest") public Object[][] createNanoSchedulerBasicTest() { for ( final int bufferSize : Arrays.asList(1, 10, 1000, 1000000) ) { 
- for ( final int mapGroupSize : Arrays.asList(-1, 1, 10, 100, 1000) ) { - if ( mapGroupSize <= bufferSize ) { - for ( final int nt : Arrays.asList(1, 2, 4) ) { - for ( final int start : Arrays.asList(0) ) { - for ( final int end : Arrays.asList(1, 2, 11, 10000, 100000) ) { - exampleTest = new NanoSchedulerBasicTest(bufferSize, mapGroupSize, nt, start, end); - } - } + for ( final int nt : Arrays.asList(1, 2, 4) ) { + for ( final int start : Arrays.asList(0) ) { + for ( final int end : Arrays.asList(0, 1, 2, 11, 10000, 100000) ) { + exampleTest = new NanoSchedulerBasicTest(bufferSize, nt, start, end); } } } @@ -101,25 +115,29 @@ public class NanoSchedulerUnitTest extends BaseTest { private void testNanoScheduler(final NanoSchedulerBasicTest test) throws InterruptedException { final NanoScheduler nanoScheduler = - new NanoScheduler(test.bufferSize, test.mapGroupSize, test.nThreads); + new NanoScheduler(test.bufferSize, test.nThreads); - Assert.assertEquals(nanoScheduler.getBufferSize(), test.bufferSize, "bufferSize argument"); - Assert.assertTrue(nanoScheduler.getMapGroupSize() >= test.mapGroupSize, "mapGroupSize argument"); + final ProgressCallback callback = new ProgressCallback(); + nanoScheduler.setProgressFunction(callback); + + Assert.assertEquals(nanoScheduler.getInputBufferSize(), test.bufferSize, "inputBufferSize argument"); Assert.assertEquals(nanoScheduler.getnThreads(), test.nThreads, "nThreads argument"); final Integer sum = nanoScheduler.execute(test.makeReader(), test.makeMap(), test.initReduce(), test.makeReduce()); Assert.assertNotNull(sum); Assert.assertEquals((int)sum, test.expectedResult, "NanoScheduler sum not the same as calculated directly"); + + Assert.assertTrue(callback.callBacks >= test.nExpectedCallbacks(), "Not enough callbacks detected. 
Expected at least " + test.nExpectedCallbacks() + " but saw only " + callback.callBacks); nanoScheduler.shutdown(); } @Test(enabled = true, dataProvider = "NanoSchedulerBasicTest", dependsOnMethods = "testMultiThreadedNanoScheduler", timeOut = NANO_SCHEDULE_MAX_RUNTIME) public void testNanoSchedulerInLoop(final NanoSchedulerBasicTest test) throws InterruptedException { - if ( test.bufferSize > 1 && (test.mapGroupSize > 1 || test.mapGroupSize == -1)) { + if ( test.bufferSize > 1) { logger.warn("Running " + test); final NanoScheduler nanoScheduler = - new NanoScheduler(test.bufferSize, test.mapGroupSize, test.nThreads); + new NanoScheduler(test.bufferSize, test.nThreads); // test reusing the scheduler for ( int i = 0; i < 10; i++ ) { @@ -134,7 +152,7 @@ public class NanoSchedulerUnitTest extends BaseTest { @Test(timeOut = NANO_SCHEDULE_MAX_RUNTIME) public void testShutdown() throws InterruptedException { - final NanoScheduler nanoScheduler = new NanoScheduler(1, 1, 2); + final NanoScheduler nanoScheduler = new NanoScheduler(1, 2); Assert.assertFalse(nanoScheduler.isShutdown(), "scheduler should be alive"); nanoScheduler.shutdown(); Assert.assertTrue(nanoScheduler.isShutdown(), "scheduler should be dead"); @@ -142,17 +160,23 @@ public class NanoSchedulerUnitTest extends BaseTest { @Test(expectedExceptions = IllegalStateException.class, timeOut = NANO_SCHEDULE_MAX_RUNTIME) public void testShutdownExecuteFailure() throws InterruptedException { - final NanoScheduler nanoScheduler = new NanoScheduler(1, 1, 2); + final NanoScheduler nanoScheduler = new NanoScheduler(1, 2); nanoScheduler.shutdown(); nanoScheduler.execute(exampleTest.makeReader(), exampleTest.makeMap(), exampleTest.initReduce(), exampleTest.makeReduce()); } public static void main(String [ ] args) { - final NanoSchedulerBasicTest test = new NanoSchedulerBasicTest(1000, 100, Integer.valueOf(args[0]), 0, Integer.valueOf(args[1])); + org.apache.log4j.Logger logger = org.apache.log4j.Logger.getRootLogger(); + 
BasicConfigurator.configure(); + logger.setLevel(org.apache.log4j.Level.DEBUG); + + final NanoSchedulerBasicTest test = new NanoSchedulerBasicTest(1000, Integer.valueOf(args[0]), 0, Integer.valueOf(args[1])); final NanoScheduler nanoScheduler = - new NanoScheduler(test.bufferSize, test.mapGroupSize, test.nThreads); + new NanoScheduler(test.bufferSize, test.nThreads); + nanoScheduler.setDebug(true); final Integer sum = nanoScheduler.execute(test.makeReader(), test.makeMap(), test.initReduce(), test.makeReduce()); System.out.printf("Sum = %d, expected =%d%n", sum, test.expectedResult); + nanoScheduler.shutdown(); } } diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerThreadUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerThreadUnitTest.java new file mode 100644 index 000000000..61d1330bc --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerThreadUnitTest.java @@ -0,0 +1,94 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.concurrent.*; + +/** + * UnitTests for the InputProducer + * + * User: depristo + * Date: 8/24/12 + * Time: 11:25 AM + * To change this template use File | Settings | File Templates. 
+ */ +public class ReducerThreadUnitTest extends BaseTest { + @DataProvider(name = "ReducerThreadTest") + public Object[][] createReducerThreadTest() { + List tests = new ArrayList(); + + for ( final int nElements : Arrays.asList(0, 1, 10, 100, 1000, 10000, 100000) ) { + tests.add(new Object[]{ nElements }); + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true, dataProvider = "ReducerThreadTest", timeOut = NanoSchedulerUnitTest.NANO_SCHEDULE_MAX_RUNTIME) + public void testReducerThreadTest(final int nElements) throws Exception { + List values = new ArrayList(nElements); + List jobIDs = new ArrayList(nElements); + for ( int i = 0; i < nElements; i++ ) { + values.add(i); + jobIDs.add(i); + } + + runTests(values, jobIDs); + } + + @Test(enabled = true, timeOut = NanoSchedulerUnitTest.NANO_SCHEDULE_MAX_RUNTIME, expectedExceptions = ExecutionException.class) + public void testReducerThreadTestByJobOrder() throws Exception { + runTests(Arrays.asList(0, 1, 2), Arrays.asList(1, 3, 2)); + } + + private void runTests( final List mapValues, final List jobIDs) throws Exception { + final LinkedBlockingDeque>> mapResultsQueue = + new LinkedBlockingDeque>>(mapValues.size()+1); + + for ( int i = 0; i < mapValues.size(); i++ ) { + final int value = mapValues.get(i); + final int jobID = jobIDs.get(i); + final MapResult mapResult = new MapResult(value, jobID); + mapResultsQueue.add(new FutureValue>(mapResult)); + } + mapResultsQueue.add(new FutureValue>(new MapResult())); + + final ReduceSumTest reduce = new ReduceSumTest(mapResultsQueue); + final ReducerThread thread + = new ReducerThread(reduce, null, 0, mapResultsQueue); + + final ExecutorService es = Executors.newSingleThreadExecutor(); + final Future value = es.submit(thread); + value.get(); + + Assert.assertEquals(reduce.nRead, mapValues.size()); + } + + public class ReduceSumTest implements NSReduceFunction { + final LinkedBlockingDeque>> mapResultsQueue; + int nRead = 0; + int lastValue = -1; + + public 
ReduceSumTest(LinkedBlockingDeque>> mapResultsQueue) { + this.mapResultsQueue = mapResultsQueue; + } + + @Override public Integer apply(Integer one, Integer sum) { + Assert.assertTrue(lastValue < one, "Reduce came in out of order. Prev " + lastValue + " cur " + one); + + Assert.assertTrue(lastValue < one, "Read values coming out of order!"); + final int expected = lastValue + 1; + Assert.assertEquals((int)one, expected, "Value observed " + one + " not equal to the expected value " + expected); + nRead++; + lastValue = expected; + + return one + sum; + } + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamUnitTest.java new file mode 100644 index 000000000..74626d031 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamUnitTest.java @@ -0,0 +1,161 @@ +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMReadGroupRecord; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.testng.annotations.Test; +import org.testng.annotations.DataProvider; + +import org.broadinstitute.sting.BaseTest; + +public class ArtificialSingleSampleReadStreamUnitTest extends BaseTest { + + private static class ArtificialSingleSampleReadStreamTest extends TestDataProvider { + private ArtificialSingleSampleReadStream stream; + private ArtificialSingleSampleReadStreamAnalyzer streamAnalyzer; + + public ArtificialSingleSampleReadStreamTest( ArtificialSingleSampleReadStream stream ) { + super(ArtificialSingleSampleReadStreamTest.class); + + this.stream = stream; + + setName(String.format("%s: numContigs=%d stacksPerContig=%d readsPerStack=%d-%d distanceBetweenStacks=%d-%d readLength=%d-%d unmappedReads=%d", + getClass().getSimpleName(), + 
stream.getNumContigs(), + stream.getNumStacksPerContig(), + stream.getMinReadsPerStack(), + stream.getMaxReadsPerStack(), + stream.getMinDistanceBetweenStacks(), + stream.getMaxDistanceBetweenStacks(), + stream.getMinReadLength(), + stream.getMaxReadLength(), + stream.getNumUnmappedReads())); + } + + public void run() { + streamAnalyzer= new ArtificialSingleSampleReadStreamAnalyzer(stream); + + streamAnalyzer.analyze(stream); + + // Check whether the observed properties of the stream match its nominal properties + streamAnalyzer.validate(); + } + } + + @DataProvider(name = "ArtificialSingleSampleReadStreamTestDataProvider") + public Object[][] createArtificialSingleSampleReadStreamTests() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(3, 1, 10000); + String readGroupID = "testReadGroup"; + SAMReadGroupRecord readGroup = new SAMReadGroupRecord(readGroupID); + readGroup.setSample("testSample"); + header.addReadGroup(readGroup); + + GenomeAnalysisEngine.resetRandomGenerator(); + + // brute force testing! 
+ for ( int numContigs = 0; numContigs <= 2; numContigs++ ) { + for ( int stacksPerContig = 0; stacksPerContig <= 2; stacksPerContig++ ) { + for ( int minReadsPerStack = 1; minReadsPerStack <= 2; minReadsPerStack++ ) { + for ( int maxReadsPerStack = 1; maxReadsPerStack <= 3; maxReadsPerStack++ ) { + for ( int minDistanceBetweenStacks = 1; minDistanceBetweenStacks <= 2; minDistanceBetweenStacks++ ) { + for ( int maxDistanceBetweenStacks = 1; maxDistanceBetweenStacks <= 3; maxDistanceBetweenStacks++ ) { + for ( int minReadLength = 1; minReadLength <= 2; minReadLength++ ) { + for ( int maxReadLength = 1; maxReadLength <= 3; maxReadLength++ ) { + for ( int numUnmappedReads = 0; numUnmappedReads <= 2; numUnmappedReads++ ) { + // Only test sane combinations here + if ( minReadsPerStack <= maxReadsPerStack && + minDistanceBetweenStacks <= maxDistanceBetweenStacks && + minReadLength <= maxReadLength && + ((numContigs > 0 && stacksPerContig > 0) || (numContigs == 0 && stacksPerContig == 0)) ) { + + new ArtificialSingleSampleReadStreamTest(new ArtificialSingleSampleReadStream(header, + readGroupID, + numContigs, + stacksPerContig, + minReadsPerStack, + maxReadsPerStack, + minDistanceBetweenStacks, + maxDistanceBetweenStacks, + minReadLength, + maxReadLength, + numUnmappedReads)); + } + } + } + } + } + } + } + } + } + } + + return ArtificialSingleSampleReadStreamTest.getTests(ArtificialSingleSampleReadStreamTest.class); + } + + @Test(dataProvider = "ArtificialSingleSampleReadStreamTestDataProvider") + public void testArtificialSingleSampleReadStream( ArtificialSingleSampleReadStreamTest test ) { + logger.warn("Running test: " + test); + + GenomeAnalysisEngine.resetRandomGenerator(); + test.run(); + } + + @DataProvider(name = "ArtificialSingleSampleReadStreamInvalidArgumentsTestDataProvider") + public Object[][] createInvalidArgumentsTests() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(3, 1, 10000); + String readGroupID = "testReadGroup"; + 
header.addReadGroup(new SAMReadGroupRecord(readGroupID)); + + return new Object[][] { + {"testNullHeader", null, readGroupID, 1, 1, 1, 2, 1, 2, 1, 2, 0}, + {"testNullReadGroup", header, null, 1, 1, 1, 2, 1, 2, 1, 2, 0}, + {"testInvalidReadGroup", header, "foo", 1, 1, 1, 2, 1, 2, 1, 2, 0}, + {"testInvalidNumContigs", header, readGroupID, -1, 1, 1, 2, 1, 2, 1, 2, 0}, + {"testInvalidNumStacksPerContig", header, readGroupID, 1, -1, 1, 2, 1, 2, 1, 2, 0}, + {"test0ContigsNon0StacksPerContig", header, readGroupID, 0, 1, 1, 2, 1, 2, 1, 2, 0}, + {"testNon0Contigs0StacksPerContig", header, readGroupID, 1, 0, 1, 2, 1, 2, 1, 2, 0}, + {"testInvalidMinReadsPerStack", header, readGroupID, 1, 1, -1, 2, 1, 2, 1, 2, 0}, + {"testInvalidMaxReadsPerStack", header, readGroupID, 1, 1, 1, -2, 1, 2, 1, 2, 0}, + {"testInvalidMinDistanceBetweenStacks", header, readGroupID, 1, 1, 1, 2, -1, 2, 1, 2, 0}, + {"testInvalidMaxDistanceBetweenStacks", header, readGroupID, 1, 1, 1, 2, 1, -2, 1, 2, 0}, + {"testInvalidMinReadLength", header, readGroupID, 1, 1, 1, 2, 1, 2, -1, 2, 0}, + {"testInvalidMaxReadLength", header, readGroupID, 1, 1, 1, 2, 1, 2, 1, -2, 0}, + {"testInvalidReadsPerStackRange", header, readGroupID, 1, 1, 2, 1, 1, 2, 1, 2, 0}, + {"testInvalidDistanceBetweenStacksRange", header, readGroupID, 1, 1, 1, 2, 2, 1, 1, 2, 0}, + {"testInvalidReadLengthRange", header, readGroupID, 1, 1, 1, 2, 1, 2, 2, 1, 0}, + {"testInvalidNumUnmappedReads", header, readGroupID, 1, 1, 1, 2, 1, 2, 1, 2, -1}, + }; + } + + @Test(dataProvider = "ArtificialSingleSampleReadStreamInvalidArgumentsTestDataProvider", + expectedExceptions = ReviewedStingException.class) + public void testInvalidArguments( String testName, + SAMFileHeader header, + String readGroupID, + int numContigs, + int numStacksPerContig, + int minReadsPerStack, + int maxReadsPerStack, + int minDistanceBetweenStacks, + int maxDistanceBetweenStacks, + int minReadLength, + int maxReadLength, + int numUnmappedReads ) { + + logger.warn("Running test: " + 
testName); + + ArtificialSingleSampleReadStream stream = new ArtificialSingleSampleReadStream(header, + readGroupID, + numContigs, + numStacksPerContig, + minReadsPerStack, + maxReadsPerStack, + minDistanceBetweenStacks, + maxDistanceBetweenStacks, + minReadLength, + maxReadLength, + numUnmappedReads); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java index 6544b9845..7381bebc4 100755 --- a/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java @@ -34,14 +34,17 @@ import org.testng.annotations.Test; import java.util.Arrays; import java.util.Collections; import java.util.List; -import java.util.concurrent.*; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; /** * Tests for the state monitoring thread factory. */ public class EfficiencyMonitoringThreadFactoryUnitTest extends BaseTest { // the duration of the tests -- 100 ms is tolerable given the number of tests we are doing - private final static long THREAD_TARGET_DURATION_IN_MILLISECOND = 1000; + private final static long THREAD_TARGET_DURATION_IN_MILLISECOND = 100000; private final static int MAX_THREADS = 4; final static Object GLOBAL_LOCK = new Object();