From 46edab6d6adc0e787a9f939a9f3b5e1b9e1bc212 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Mon, 26 Nov 2012 12:44:48 -0500 Subject: [PATCH] Use the new downsampling implementation by default -Switch back to the old implementation, if needed, with --use_legacy_downsampler -LocusIteratorByStateExperimental becomes the new LocusIteratorByState, and the original LocusIteratorByState becomes LegacyLocusIteratorByState -Similarly, the ExperimentalReadShardBalancer becomes the new ReadShardBalancer, with the old one renamed to LegacyReadShardBalancer -Performance improvements: locus traversals used to be 20% slower in the new downsampling implementation, now they are roughly the same speed. -Tests show a very high level of concordance with UG calls from the previous implementation, with some new calls and edge cases that still require more examination. -With the new implementation, can now use -dcov with ReadWalkers to set a limit on the max # of reads per alignment start position per sample. Appropriate value for ReadWalker dcov may be in the single digits for some tools, but this too requires more investigation. --- ...GenotyperGeneralPloidyIntegrationTest.java | 4 +- .../UnifiedGenotyperIntegrationTest.java | 10 +- .../sting/gatk/GenomeAnalysisEngine.java | 18 +- .../sting/gatk/WalkerManager.java | 32 +- .../arguments/GATKArgumentCollection.java | 11 +- .../gatk/datasources/providers/LocusView.java | 9 +- .../gatk/datasources/reads/BAMScheduler.java | 7 +- .../reads/ExperimentalReadShardBalancer.java | 228 -------- .../reads/LegacyReadShardBalancer.java | 129 +++++ .../datasources/reads/ReadShardBalancer.java | 187 ++++-- .../gatk/datasources/reads/SAMDataSource.java | 71 ++- .../gatk/downsampling/DownsamplingMethod.java | 48 +- .../downsampling/PassThroughDownsampler.java | 106 ++++ .../sting/gatk/executive/WindowMaker.java | 13 +- ...l.java => LegacyLocusIteratorByState.java} | 531 +++++++++++++---- .../gatk/iterators/LocusIteratorByState.java | 533 ++++-------------- ...r.java => LegacyReservoirDownsampler.java} | 6 +- .../reads/DownsamplerBenchmark.java | 5 +- ...st.java => ReadShardBalancerUnitTest.java} | 8 +- ...> LegacyLocusIteratorByStateUnitTest.java} | 438 +++++--------- .../LocusIteratorByStateUnitTest.java | 404 +++++++++---- .../traversals/TraverseReadsUnitTest.java | 4 +- .../LegacyReservoirDownsamplerUnitTest.java | 16 +- 23 files changed, 1490 insertions(+), 1328 deletions(-) delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancer.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LegacyReadShardBalancer.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java rename public/java/src/org/broadinstitute/sting/gatk/iterators/{LocusIteratorByStateExperimental.java => LegacyLocusIteratorByState.java} (61%) rename public/java/src/org/broadinstitute/sting/utils/{ReservoirDownsampler.java => LegacyReservoirDownsampler.java} (94%) rename public/java/test/org/broadinstitute/sting/gatk/datasources/reads/{ExperimentalReadShardBalancerUnitTest.java => ReadShardBalancerUnitTest.java} (97%) rename public/java/test/org/broadinstitute/sting/gatk/iterators/{LocusIteratorByStateExperimentalUnitTest.java => LegacyLocusIteratorByStateUnitTest.java} (50%) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java index 73bc8fba6..f26194e00 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java @@ -80,11 +80,11 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest { @Test(enabled = true) public void testMT_SNP_DISCOVERY_sp4() { - PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","dd568dc30be90135a3a8957a45a7321c"); + PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","3fc6f4d458313616727c60e49c0e852b"); } @Test(enabled = true) public void testMT_SNP_GGA_sp10() { - PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "bf793c43b635a931207170be8035b288"); + PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "1bebbc0f28bff6fd64736ccca8839df8"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 7459d131b..f2b2dfb7d 100755 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -62,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, - Arrays.asList("d20c7a143b899f0239bf64b652ad3edb")); + Arrays.asList("97df6c2a8d390d43b9bdf56c979d9b09")); executeTest("test Multiple SNP alleles", spec); } @@ -86,7 +86,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMismatchedPLs() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1, - Arrays.asList("fb204e821a24d03bd3a671b6e01c449a")); + Arrays.asList("935ee705ffe8cc6bf1d9efcceea271c8")); executeTest("test mismatched PLs", spec); } @@ -437,7 +437,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testNsInCigar() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "testWithNs.bam -o %s -L 8:141813600-141813700 -out_mode EMIT_ALL_SITES", 1, - Arrays.asList("32f18ba50406cd8c8069ba07f2f89558")); + Arrays.asList("4d36969d4f8f1094f1fb6e7e085c19f6")); executeTest("test calling on reads with Ns in CIGAR", spec); } @@ -451,13 +451,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("c1077662411164182c5f75478344f83d")); + Arrays.asList("092e42a712afb660ec79ff11c55933e2")); executeTest("test calling on a ReducedRead BAM", spec); } @Test public void testReducedBamSNPs() { - testReducedCalling("SNP", "dee6590e3b7079890bc3a9cb372c297e"); + testReducedCalling("SNP", "c0de74ab8f4f14eb3a2c5d55c200ac5f"); } @Test diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index b7000e0ee..1187039bb 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -445,13 +445,17 @@ public class GenomeAnalysisEngine { protected DownsamplingMethod getDownsamplingMethod() { GATKArgumentCollection argCollection = this.getArguments(); - boolean useExperimentalDownsampling = argCollection.enableExperimentalDownsampling; + + // Legacy downsampler can only be selected via the command line, not via walker annotations + boolean useLegacyDownsampler = argCollection.useLegacyDownsampler; DownsamplingMethod commandLineMethod = argCollection.getDownsamplingMethod(); - DownsamplingMethod walkerMethod = WalkerManager.getDownsamplingMethod(walker, useExperimentalDownsampling); - DownsamplingMethod defaultMethod = DownsamplingMethod.getDefaultDownsamplingMethod(walker, useExperimentalDownsampling); + DownsamplingMethod walkerMethod = WalkerManager.getDownsamplingMethod(walker, useLegacyDownsampler); + DownsamplingMethod defaultMethod = DownsamplingMethod.getDefaultDownsamplingMethod(walker, useLegacyDownsampler); - return commandLineMethod != null ? commandLineMethod : (walkerMethod != null ? walkerMethod : defaultMethod); + DownsamplingMethod method = commandLineMethod != null ? commandLineMethod : (walkerMethod != null ? walkerMethod : defaultMethod); + method.checkCompatibilityWithWalker(walker); + return method; } protected void setDownsamplingMethod(DownsamplingMethod method) { @@ -580,9 +584,9 @@ public class GenomeAnalysisEngine { throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals."); } - // Use the experimental ReadShardBalancer if experimental downsampling is enabled - ShardBalancer readShardBalancer = downsamplingMethod != null && downsamplingMethod.useExperimentalDownsampling ? - new ExperimentalReadShardBalancer() : + // Use the legacy ReadShardBalancer if legacy downsampling is enabled + ShardBalancer readShardBalancer = downsamplingMethod != null && downsamplingMethod.useLegacyDownsampler ? + new LegacyReadShardBalancer() : new ReadShardBalancer(); if(intervals == null) diff --git a/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java b/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java index 28b5f918d..1a9162e51 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java @@ -305,11 +305,23 @@ public class WalkerManager extends PluginManager { * Gets the type of downsampling method requested by the walker. If an alternative * downsampling method is specified on the command-line, the command-line version will * be used instead. - * @param walkerClass The class of the walker to interrogate. - * @param useExperimentalDownsampling If true, use the experimental downsampling implementation + * @param walker The walker to interrogate. + * @param useLegacyDownsampler If true, use the legacy downsampling implementation * @return The downsampling method, as specified by the walker. Null if none exists. */ - public static DownsamplingMethod getDownsamplingMethod(Class walkerClass, boolean useExperimentalDownsampling) { + public static DownsamplingMethod getDownsamplingMethod(Walker walker, boolean useLegacyDownsampler) { + return getDownsamplingMethod(walker.getClass(), useLegacyDownsampler); + } + + /** + * Gets the type of downsampling method requested by the walker. If an alternative + * downsampling method is specified on the command-line, the command-line version will + * be used instead. + * @param walkerClass The class of the walker to interrogate. + * @param useLegacyDownsampler If true, use the legacy downsampling implementation + * @return The downsampling method, as specified by the walker. Null if none exists. + */ + public static DownsamplingMethod getDownsamplingMethod(Class walkerClass, boolean useLegacyDownsampler) { DownsamplingMethod downsamplingMethod = null; if( walkerClass.isAnnotationPresent(Downsample.class) ) { @@ -317,7 +329,7 @@ public class WalkerManager extends PluginManager { DownsampleType type = downsampleParameters.by(); Integer toCoverage = downsampleParameters.toCoverage() >= 0 ? downsampleParameters.toCoverage() : null; Double toFraction = downsampleParameters.toFraction() >= 0.0d ? downsampleParameters.toFraction() : null; - downsamplingMethod = new DownsamplingMethod(type,toCoverage,toFraction,useExperimentalDownsampling); + downsamplingMethod = new DownsamplingMethod(type,toCoverage,toFraction,useLegacyDownsampler); } return downsamplingMethod; @@ -331,18 +343,6 @@ public class WalkerManager extends PluginManager { return walker.getClass().getAnnotation(BAQMode.class).ApplicationTime(); } - /** - * Gets the type of downsampling method requested by the walker. If an alternative - * downsampling method is specified on the command-line, the command-line version will - * be used instead. - * @param walker The walker to interrogate. - * @param useExperimentalDownsampling If true, use the experimental downsampling implementation - * @return The downsampling method, as specified by the walker. Null if none exists. - */ - public static DownsamplingMethod getDownsamplingMethod(Walker walker, boolean useExperimentalDownsampling) { - return getDownsamplingMethod(walker.getClass(), useExperimentalDownsampling); - } - /** * Create a name for this type of walker. * diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index d0f3e91e0..d9c7c9008 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -162,12 +162,11 @@ public class GATKArgumentCollection { @Argument(fullName = "downsample_to_fraction", shortName = "dfrac", doc = "Fraction [0.0-1.0] of reads to downsample to", required = false) public Double downsampleFraction = null; - @Argument(fullName = "downsample_to_coverage", shortName = "dcov", doc = "Coverage [integer] to downsample to at any given locus; note that downsampled reads are randomly selected from all possible reads at a locus", required = false) + @Argument(fullName = "downsample_to_coverage", shortName = "dcov", doc = "Coverage [integer] to downsample to at any given locus; note that downsampled reads are randomly selected from all possible reads at a locus. For non-locus-based traversals (eg., ReadWalkers), this sets the maximum number of reads at each alignment start position.", required = false) public Integer downsampleCoverage = null; - @Argument(fullName = "enable_experimental_downsampling", shortName = "enable_experimental_downsampling", doc = "Enable experimental engine-level downsampling", required = false) - @Hidden - public boolean enableExperimentalDownsampling = false; + @Argument(fullName = "use_legacy_downsampler", shortName = "use_legacy_downsampler", doc = "Use the legacy downsampling implementation instead of the newer, less-tested implementation", required = false) + public boolean useLegacyDownsampler = false; /** * Gets the downsampling method explicitly specified by the user. If the user didn't specify @@ -178,7 +177,7 @@ public class GATKArgumentCollection { if ( downsamplingType == null && downsampleFraction == null && downsampleCoverage == null ) return null; - return new DownsamplingMethod(downsamplingType, downsampleCoverage, downsampleFraction, enableExperimentalDownsampling); + return new DownsamplingMethod(downsamplingType, downsampleCoverage, downsampleFraction, useLegacyDownsampler); } /** @@ -192,7 +191,7 @@ public class GATKArgumentCollection { downsamplingType = method.type; downsampleCoverage = method.toCoverage; downsampleFraction = method.toFraction; - enableExperimentalDownsampling = method.useExperimentalDownsampling; + useLegacyDownsampler = method.useLegacyDownsampler; } // -------------------------------------------------------------------------------------------------------------- diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java index cd3403f2f..c12bce208 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java @@ -136,11 +136,12 @@ public abstract class LocusView extends LocusIterator implements View { // Cache the current and apply filtering. AlignmentContext current = nextLocus; - // The old ALL_READS downsampling implementation -- only use if we're not using the new experimental downsampling: - if( ! sourceInfo.getDownsamplingMethod().useExperimentalDownsampling && - sourceInfo.getDownsamplingMethod().type == DownsampleType.ALL_READS && sourceInfo.getDownsamplingMethod().toCoverage != null ) { + // The old ALL_READS downsampling implementation -- use only if legacy downsampling was requested: + if ( sourceInfo.getDownsamplingMethod().useLegacyDownsampler && + sourceInfo.getDownsamplingMethod().type == DownsampleType.ALL_READS && + sourceInfo.getDownsamplingMethod().toCoverage != null ) { - current.downsampleToCoverage( sourceInfo.getDownsamplingMethod().toCoverage ); + current.downsampleToCoverage(sourceInfo.getDownsamplingMethod().toCoverage); } // Indicate that the next operation will need to advance. diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java index 8ee7e0439..cb33c5ab8 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java @@ -134,12 +134,11 @@ public class BAMScheduler implements Iterator { // Only use the deprecated SAMDataSource.getCurrentPosition() if we're not using experimental downsampling // TODO: clean this up once the experimental downsampling engine fork collapses - if ( dataSource.getReadsInfo().getDownsamplingMethod() != null && dataSource.getReadsInfo().getDownsamplingMethod().useExperimentalDownsampling ) { - currentPosition = dataSource.getInitialReaderPositions(); + if ( dataSource.getReadsInfo().getDownsamplingMethod() != null && dataSource.getReadsInfo().getDownsamplingMethod().useLegacyDownsampler ) { + currentPosition = dataSource.getCurrentPosition(); } else { - currentPosition = dataSource.getCurrentPosition(); - + currentPosition = dataSource.getInitialReaderPositions(); } for(SAMReaderID reader: dataSource.getReaderIDs()) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancer.java deleted file mode 100644 index 0440c7eae..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancer.java +++ /dev/null @@ -1,228 +0,0 @@ -/* - * Copyright (c) 2012, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.datasources.reads; - -import net.sf.picard.util.PeekableIterator; -import net.sf.samtools.SAMRecord; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.util.*; - -/** - * Convert from an unbalanced iterator over FilePointers to a balanced iterator over Shards. - * - * When processing FilePointers, our strategy is to aggregate all FilePointers for each contig - * together into one monolithic FilePointer, create one persistent set of read iterators over - * that monolithic FilePointer, and repeatedly use that persistent set of read iterators to - * fill read shards with reads. - * - * This strategy has several important advantages: - * - * 1. We avoid issues with file span overlap. FilePointers that are more granular than a whole - * contig will have regions that overlap with other FilePointers on the same contig, due - * to the limited granularity of BAM index data. By creating only one FilePointer per contig, - * we avoid having to track how much of each file region we've visited (as we did in the - * former implementation), we avoid expensive non-sequential access patterns in the files, - * and we avoid having to repeatedly re-create our iterator chain for every small region - * of interest. - * - * 2. We avoid boundary issues with the engine-level downsampling. Since we create a single - * persistent set of read iterators (which include the downsampling iterator(s)) per contig, - * the downsampling process is never interrupted by FilePointer or Shard boundaries, and never - * loses crucial state information while downsampling within a contig. - * - * TODO: There is also at least one important disadvantage: - * - * 1. We load more BAM index data into memory at once, and this work is done upfront before processing - * the next contig, creating a delay before traversal of each contig. This delay may be - * compensated for by the gains listed in #1 above, and we may be no worse off overall in - * terms of total runtime, but we need to verify this empirically. - * - * @author David Roazen - */ -public class ExperimentalReadShardBalancer extends ShardBalancer { - - private static Logger logger = Logger.getLogger(ExperimentalReadShardBalancer.class); - - /** - * Convert iterators of file pointers into balanced iterators of shards. - * @return An iterator over balanced shards. - */ - public Iterator iterator() { - return new Iterator() { - /** - * The cached shard to be returned next. Prefetched in the peekable iterator style. - */ - private Shard nextShard = null; - - /** - * The file pointer currently being processed. - */ - private FilePointer currentContigFilePointer = null; - - /** - * Iterator over the reads from the current contig's file pointer. The same iterator will be - * used to fill all shards associated with a given file pointer - */ - private PeekableIterator currentContigReadsIterator = null; - - /** - * How many FilePointers have we pulled from the filePointers iterator? - */ - private int totalFilePointersConsumed = 0; - - /** - * Have we encountered a monolithic FilePointer? - */ - private boolean encounteredMonolithicFilePointer = false; - - - { - createNextContigFilePointer(); - advance(); - } - - public boolean hasNext() { - return nextShard != null; - } - - public Shard next() { - if ( ! hasNext() ) - throw new NoSuchElementException("No next read shard available"); - Shard currentShard = nextShard; - advance(); - return currentShard; - } - - private void advance() { - nextShard = null; - - // May need multiple iterations to fill the next shard if all reads in current file spans get filtered/downsampled away - while ( nextShard == null && currentContigFilePointer != null ) { - - // If we've exhausted the current file pointer of reads, move to the next file pointer (if there is one): - if ( currentContigReadsIterator != null && ! currentContigReadsIterator.hasNext() ) { - - // Close the old, exhausted chain of iterators to release resources - currentContigReadsIterator.close(); - - // Advance to the FilePointer for the next contig - createNextContigFilePointer(); - - // We'll need to create a fresh iterator for this file pointer when we create the first - // shard for it below. - currentContigReadsIterator = null; - } - - // At this point our currentContigReadsIterator may be null or non-null depending on whether or not - // this is our first shard for this file pointer. - if ( currentContigFilePointer != null ) { - Shard shard = new ReadShard(parser,readsDataSource, currentContigFilePointer.fileSpans, currentContigFilePointer.locations, currentContigFilePointer.isRegionUnmapped); - - // Create a new reads iterator only when we've just advanced to the file pointer for the next - // contig. It's essential that the iterators persist across all shards that share the same contig - // to allow the downsampling to work properly. - if ( currentContigReadsIterator == null ) { - currentContigReadsIterator = new PeekableIterator(readsDataSource.getIterator(shard)); - } - - if ( currentContigReadsIterator.hasNext() ) { - shard.fill(currentContigReadsIterator); - nextShard = shard; - } - } - } - } - - /** - * Aggregate all FilePointers for the next contig together into one monolithic FilePointer - * to avoid boundary issues with visiting the same file regions more than once (since more - * granular FilePointers will have regions that overlap with other nearby FilePointers due - * to the nature of BAM indices). - * - * By creating one persistent set of iterators per contig we also avoid boundary artifacts - * in the engine-level downsampling. - * - * TODO: This FilePointer aggregation should ideally be done at the BAMSchedule level for - * TODO: read traversals, as there's little point in the BAMSchedule emitting extremely - * TODO: granular FilePointers if we're just going to union them. The BAMSchedule should - * TODO: emit one FilePointer per contig for read traversals (but, crucially, NOT for - * TODO: locus traversals). - */ - private void createNextContigFilePointer() { - currentContigFilePointer = null; - List nextContigFilePointers = new ArrayList(); - - logger.info("Loading BAM index data for next contig"); - - while ( filePointers.hasNext() ) { - - // Make sure that if we see a monolithic FilePointer (representing all regions in all files) that - // it is the ONLY FilePointer we ever encounter - if ( encounteredMonolithicFilePointer ) { - throw new ReviewedStingException("Bug: encountered additional FilePointers after encountering a monolithic FilePointer"); - } - if ( filePointers.peek().isMonolithic() ) { - if ( totalFilePointersConsumed > 0 ) { - throw new ReviewedStingException("Bug: encountered additional FilePointers before encountering a monolithic FilePointer"); - } - encounteredMonolithicFilePointer = true; - logger.debug(String.format("Encountered monolithic FilePointer: %s", filePointers.peek())); - } - - // If this is the first FP we've seen, or we're dealing with mapped regions and the next FP is on the - // same contig as previous FPs, or all our FPs are unmapped, add the next FP to the list of FPs to merge - if ( nextContigFilePointers.isEmpty() || - (! nextContigFilePointers.get(0).isRegionUnmapped && ! filePointers.peek().isRegionUnmapped && - nextContigFilePointers.get(0).getContigIndex() == filePointers.peek().getContigIndex()) || - (nextContigFilePointers.get(0).isRegionUnmapped && filePointers.peek().isRegionUnmapped) ) { - - nextContigFilePointers.add(filePointers.next()); - totalFilePointersConsumed++; - } - else { - break; // next FilePointer is on a different contig or has different mapped/unmapped status, - // save it for next time - } - } - - if ( ! nextContigFilePointers.isEmpty() ) { - currentContigFilePointer = FilePointer.union(nextContigFilePointers, parser); - } - - if ( currentContigFilePointer != null ) { - logger.info("Done loading BAM index data for next contig"); - logger.debug(String.format("Next contig FilePointer: %s", currentContigFilePointer)); - } - } - - public void remove() { - throw new UnsupportedOperationException("Unable to remove from shard balancing iterator"); - } - }; - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LegacyReadShardBalancer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LegacyReadShardBalancer.java new file mode 100644 index 000000000..f5b4fba8e --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LegacyReadShardBalancer.java @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import net.sf.samtools.GATKBAMFileSpan; +import net.sf.samtools.SAMFileSpan; + +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import java.util.NoSuchElementException; + +/** + * Divide up large file pointers containing reads into more manageable subcomponents. + * + * TODO: delete this class once the experimental downsampling engine fork collapses + */ +public class LegacyReadShardBalancer extends ShardBalancer { + /** + * Convert iterators of file pointers into balanced iterators of shards. + * @return An iterator over balanced shards. + */ + public Iterator iterator() { + return new Iterator() { + /** + * The cached shard to be returned next. Prefetched in the peekable iterator style. + */ + private Shard nextShard = null; + + /** + * The file pointer currently being processed. + */ + private FilePointer currentFilePointer; + + /** + * Ending position of the last shard in the file. + */ + private Map position = readsDataSource.getCurrentPosition(); + + { + if(filePointers.hasNext()) + currentFilePointer = filePointers.next(); + advance(); + } + + public boolean hasNext() { + return nextShard != null; + } + + public Shard next() { + if(!hasNext()) + throw new NoSuchElementException("No next read shard available"); + Shard currentShard = nextShard; + advance(); + return currentShard; + } + + public void remove() { + throw new UnsupportedOperationException("Unable to remove from shard balancing iterator"); + } + + private void advance() { + Map shardPosition; + nextShard = null; + + Map selectedReaders = new HashMap(); + while(selectedReaders.size() == 0 && currentFilePointer != null) { + shardPosition = currentFilePointer.fileSpans; + + for(SAMReaderID id: shardPosition.keySet()) { + SAMFileSpan fileSpan = new GATKBAMFileSpan(shardPosition.get(id).removeContentsBefore(position.get(id))); + selectedReaders.put(id,fileSpan); + } + + if(!isEmpty(selectedReaders)) { + Shard shard = new ReadShard(parser,readsDataSource,selectedReaders,currentFilePointer.locations,currentFilePointer.isRegionUnmapped); + readsDataSource.fillShard(shard); + + if(!shard.isBufferEmpty()) { + nextShard = shard; + break; + } + } + + selectedReaders.clear(); + currentFilePointer = filePointers.hasNext() ? filePointers.next() : null; + } + + position = readsDataSource.getCurrentPosition(); + } + + /** + * Detects whether the list of file spans contain any read data. + * @param selectedSpans Mapping of readers to file spans. + * @return True if file spans are completely empty; false otherwise. + */ + private boolean isEmpty(Map selectedSpans) { + for(SAMFileSpan fileSpan: selectedSpans.values()) { + if(!fileSpan.isEmpty()) + return false; + } + return true; + } + }; + } + +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java index 18fafb95d..8cee535b0 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -24,20 +24,49 @@ package org.broadinstitute.sting.gatk.datasources.reads; -import net.sf.samtools.GATKBAMFileSpan; -import net.sf.samtools.SAMFileSpan; +import net.sf.picard.util.PeekableIterator; +import net.sf.samtools.SAMRecord; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import java.util.HashMap; -import java.util.Iterator; -import java.util.Map; -import java.util.NoSuchElementException; +import java.util.*; /** - * Divide up large file pointers containing reads into more manageable subcomponents. + * Convert from an unbalanced iterator over FilePointers to a balanced iterator over Shards. * - * TODO: delete this class once the experimental downsampling engine fork collapses + * When processing FilePointers, our strategy is to aggregate all FilePointers for each contig + * together into one monolithic FilePointer, create one persistent set of read iterators over + * that monolithic FilePointer, and repeatedly use that persistent set of read iterators to + * fill read shards with reads. + * + * This strategy has several important advantages: + * + * 1. We avoid issues with file span overlap. FilePointers that are more granular than a whole + * contig will have regions that overlap with other FilePointers on the same contig, due + * to the limited granularity of BAM index data. By creating only one FilePointer per contig, + * we avoid having to track how much of each file region we've visited (as we did in the + * former implementation), we avoid expensive non-sequential access patterns in the files, + * and we avoid having to repeatedly re-create our iterator chain for every small region + * of interest. + * + * 2. We avoid boundary issues with the engine-level downsampling. Since we create a single + * persistent set of read iterators (which include the downsampling iterator(s)) per contig, + * the downsampling process is never interrupted by FilePointer or Shard boundaries, and never + * loses crucial state information while downsampling within a contig. + * + * TODO: There is also at least one important disadvantage: + * + * 1. We load more BAM index data into memory at once, and this work is done upfront before processing + * the next contig, creating a delay before traversal of each contig. This delay may be + * compensated for by the gains listed in #1 above, and we may be no worse off overall in + * terms of total runtime, but we need to verify this empirically. + * + * @author David Roazen */ public class ReadShardBalancer extends ShardBalancer { + + private static Logger logger = Logger.getLogger(ReadShardBalancer.class); + /** * Convert iterators of file pointers into balanced iterators of shards. * @return An iterator over balanced shards. @@ -52,16 +81,27 @@ public class ReadShardBalancer extends ShardBalancer { /** * The file pointer currently being processed. */ - private FilePointer currentFilePointer; + private FilePointer currentContigFilePointer = null; /** - * Ending position of the last shard in the file. + * Iterator over the reads from the current contig's file pointer. The same iterator will be + * used to fill all shards associated with a given file pointer */ - private Map position = readsDataSource.getCurrentPosition(); + private PeekableIterator currentContigReadsIterator = null; + + /** + * How many FilePointers have we pulled from the filePointers iterator? + */ + private int totalFilePointersConsumed = 0; + + /** + * Have we encountered a monolithic FilePointer? + */ + private boolean encounteredMonolithicFilePointer = false; + { - if(filePointers.hasNext()) - currentFilePointer = filePointers.next(); + createNextContigFilePointer(); advance(); } @@ -70,58 +110,117 @@ public class ReadShardBalancer extends ShardBalancer { } public Shard next() { - if(!hasNext()) + if ( ! hasNext() ) throw new NoSuchElementException("No next read shard available"); Shard currentShard = nextShard; advance(); return currentShard; } - public void remove() { - throw new UnsupportedOperationException("Unable to remove from shard balancing iterator"); - } - private void advance() { - Map shardPosition; nextShard = null; - Map selectedReaders = new HashMap(); - while(selectedReaders.size() == 0 && currentFilePointer != null) { - shardPosition = currentFilePointer.fileSpans; + // May need multiple iterations to fill the next shard if all reads in current file spans get filtered/downsampled away + while ( nextShard == null && currentContigFilePointer != null ) { - for(SAMReaderID id: shardPosition.keySet()) { - SAMFileSpan fileSpan = new GATKBAMFileSpan(shardPosition.get(id).removeContentsBefore(position.get(id))); - selectedReaders.put(id,fileSpan); + // If we've exhausted the current file pointer of reads, move to the next file pointer (if there is one): + if ( currentContigReadsIterator != null && ! currentContigReadsIterator.hasNext() ) { + + // Close the old, exhausted chain of iterators to release resources + currentContigReadsIterator.close(); + + // Advance to the FilePointer for the next contig + createNextContigFilePointer(); + + // We'll need to create a fresh iterator for this file pointer when we create the first + // shard for it below. + currentContigReadsIterator = null; } - if(!isEmpty(selectedReaders)) { - Shard shard = new ReadShard(parser,readsDataSource,selectedReaders,currentFilePointer.locations,currentFilePointer.isRegionUnmapped); - readsDataSource.fillShard(shard); + // At this point our currentContigReadsIterator may be null or non-null depending on whether or not + // this is our first shard for this file pointer. + if ( currentContigFilePointer != null ) { + Shard shard = new ReadShard(parser,readsDataSource, currentContigFilePointer.fileSpans, currentContigFilePointer.locations, currentContigFilePointer.isRegionUnmapped); - if(!shard.isBufferEmpty()) { + // Create a new reads iterator only when we've just advanced to the file pointer for the next + // contig. It's essential that the iterators persist across all shards that share the same contig + // to allow the downsampling to work properly. + if ( currentContigReadsIterator == null ) { + currentContigReadsIterator = new PeekableIterator(readsDataSource.getIterator(shard)); + } + + if ( currentContigReadsIterator.hasNext() ) { + shard.fill(currentContigReadsIterator); nextShard = shard; - break; } } - - selectedReaders.clear(); - currentFilePointer = filePointers.hasNext() ? filePointers.next() : null; } - - position = readsDataSource.getCurrentPosition(); } /** - * Detects whether the list of file spans contain any read data. - * @param selectedSpans Mapping of readers to file spans. - * @return True if file spans are completely empty; false otherwise. + * Aggregate all FilePointers for the next contig together into one monolithic FilePointer + * to avoid boundary issues with visiting the same file regions more than once (since more + * granular FilePointers will have regions that overlap with other nearby FilePointers due + * to the nature of BAM indices). + * + * By creating one persistent set of iterators per contig we also avoid boundary artifacts + * in the engine-level downsampling. + * + * TODO: This FilePointer aggregation should ideally be done at the BAMSchedule level for + * TODO: read traversals, as there's little point in the BAMSchedule emitting extremely + * TODO: granular FilePointers if we're just going to union them. The BAMSchedule should + * TODO: emit one FilePointer per contig for read traversals (but, crucially, NOT for + * TODO: locus traversals). */ - private boolean isEmpty(Map selectedSpans) { - for(SAMFileSpan fileSpan: selectedSpans.values()) { - if(!fileSpan.isEmpty()) - return false; + private void createNextContigFilePointer() { + currentContigFilePointer = null; + List nextContigFilePointers = new ArrayList(); + + logger.info("Loading BAM index data for next contig"); + + while ( filePointers.hasNext() ) { + + // Make sure that if we see a monolithic FilePointer (representing all regions in all files) that + // it is the ONLY FilePointer we ever encounter + if ( encounteredMonolithicFilePointer ) { + throw new ReviewedStingException("Bug: encountered additional FilePointers after encountering a monolithic FilePointer"); + } + if ( filePointers.peek().isMonolithic() ) { + if ( totalFilePointersConsumed > 0 ) { + throw new ReviewedStingException("Bug: encountered additional FilePointers before encountering a monolithic FilePointer"); + } + encounteredMonolithicFilePointer = true; + logger.debug(String.format("Encountered monolithic FilePointer: %s", filePointers.peek())); + } + + // If this is the first FP we've seen, or we're dealing with mapped regions and the next FP is on the + // same contig as previous FPs, or all our FPs are unmapped, add the next FP to the list of FPs to merge + if ( nextContigFilePointers.isEmpty() || + (! nextContigFilePointers.get(0).isRegionUnmapped && ! filePointers.peek().isRegionUnmapped && + nextContigFilePointers.get(0).getContigIndex() == filePointers.peek().getContigIndex()) || + (nextContigFilePointers.get(0).isRegionUnmapped && filePointers.peek().isRegionUnmapped) ) { + + nextContigFilePointers.add(filePointers.next()); + totalFilePointersConsumed++; + } + else { + break; // next FilePointer is on a different contig or has different mapped/unmapped status, + // save it for next time + } } - return true; + + if ( ! nextContigFilePointers.isEmpty() ) { + currentContigFilePointer = FilePointer.union(nextContigFilePointers, parser); + } + + if ( currentContigFilePointer != null ) { + logger.info("Done loading BAM index data for next contig"); + logger.debug(String.format("Next contig FilePointer: %s", currentContigFilePointer)); + } + } + + public void remove() { + throw new UnsupportedOperationException("Unable to remove from shard balancing iterator"); } }; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index 88de3ac9b..e99814278 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -466,7 +466,7 @@ public class SAMDataSource { /** * Legacy method to fill the given buffering shard with reads. * - * Shard.fill() is used instead of this method when experimental downsampling is enabled + * Shard.fill() is used instead of this method unless legacy downsampling is enabled * * TODO: delete this method once the experimental downsampling engine fork collapses * @@ -638,7 +638,8 @@ public class SAMDataSource { readProperties.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION), readProperties.getSupplementalFilters(), readProperties.getReadTransformers(), - readProperties.defaultBaseQualities()); + readProperties.defaultBaseQualities(), + shard instanceof LocusShard); } private class BAMCodecIterator implements CloseableIterator { @@ -695,6 +696,7 @@ public class SAMDataSource { * @param noValidationOfReadOrder Another trigger for the verifying iterator? TODO: look into this. * @param supplementalFilters additional filters to apply to the reads. * @param defaultBaseQualities if the reads have incomplete quality scores, set them all to defaultBaseQuality. + * @param isLocusBasedTraversal true if we're dealing with a read stream from a LocusShard * @return An iterator wrapped with filters reflecting the passed-in parameters. Will not be null. */ protected StingSAMIterator applyDecoratingIterators(ReadMetrics readMetrics, @@ -705,7 +707,8 @@ public class SAMDataSource { Boolean noValidationOfReadOrder, Collection supplementalFilters, List readTransformers, - byte defaultBaseQualities) { + byte defaultBaseQualities, + boolean isLocusBasedTraversal ) { // ************************************************************************************************ // // * NOTE: ALL FILTERING/DOWNSAMPLING SHOULD BE DONE BEFORE ANY ITERATORS THAT MODIFY THE READS! * // @@ -714,12 +717,26 @@ public class SAMDataSource { wrappedIterator = StingSAMIteratorAdapter.adapt(new CountingFilteringIterator(readMetrics,wrappedIterator,supplementalFilters)); - if ( readProperties.getDownsamplingMethod().useExperimentalDownsampling ) { - wrappedIterator = applyDownsamplingIterator(wrappedIterator); + // If we're using the new downsampling implementation, apply downsampling iterators at this + // point in the read stream for most (but not all) cases + if ( ! readProperties.getDownsamplingMethod().useLegacyDownsampler ) { + + // For locus traversals where we're downsampling to coverage by sample, assume that the downsamplers + // will be invoked downstream from us in LocusIteratorByState. This improves performance by avoiding + // splitting/re-assembly of the read stream at this stage, and also allows for partial downsampling + // of individual reads. + boolean assumeDownstreamLIBSDownsampling = isLocusBasedTraversal && + readProperties.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE && + readProperties.getDownsamplingMethod().toCoverage != null; + + if ( ! assumeDownstreamLIBSDownsampling ) { + wrappedIterator = applyDownsamplingIterator(wrappedIterator); + } } - // Use the old fractional downsampler only if we're not using experimental downsampling: - if ( ! readProperties.getDownsamplingMethod().useExperimentalDownsampling && downsamplingFraction != null ) + // Use the old fractional downsampler only if we're using legacy downsampling: + // TODO: remove this statement (and associated classes) once the downsampling engine fork collapses + if ( readProperties.getDownsamplingMethod().useLegacyDownsampler && downsamplingFraction != null ) wrappedIterator = new LegacyDownsampleIterator(wrappedIterator, downsamplingFraction); // unless they've said not to validate read ordering (!noValidationOfReadOrder) and we've enabled verification, @@ -741,19 +758,37 @@ public class SAMDataSource { } protected StingSAMIterator applyDownsamplingIterator( StingSAMIterator wrappedIterator ) { - if ( readProperties.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE ) { - ReadsDownsamplerFactory downsamplerFactory = readProperties.getDownsamplingMethod().toCoverage != null ? - new SimplePositionalDownsamplerFactory(readProperties.getDownsamplingMethod().toCoverage) : - new FractionalDownsamplerFactory(readProperties.getDownsamplingMethod().toFraction); - - return new PerSampleDownsamplingReadsIterator(wrappedIterator, downsamplerFactory); + if ( readProperties.getDownsamplingMethod() == null || + readProperties.getDownsamplingMethod().type == DownsampleType.NONE ) { + return wrappedIterator; } - else if ( readProperties.getDownsamplingMethod().type == DownsampleType.ALL_READS ) { - ReadsDownsampler downsampler = readProperties.getDownsamplingMethod().toCoverage != null ? - new SimplePositionalDownsampler(readProperties.getDownsamplingMethod().toCoverage) : - new FractionalDownsampler(readProperties.getDownsamplingMethod().toFraction); - return new DownsamplingReadsIterator(wrappedIterator, downsampler); + if ( readProperties.getDownsamplingMethod().toFraction != null ) { + + // If we're downsampling to a fraction of reads, there's no point in paying the cost of + // splitting/re-assembling the read stream by sample to run the FractionalDownsampler on + // reads from each sample separately, since the result would be the same as running the + // FractionalDownsampler on the entire stream. So, ALWAYS use the DownsamplingReadsIterator + // rather than the PerSampleDownsamplingReadsIterator, even if BY_SAMPLE downsampling + // was requested. + + return new DownsamplingReadsIterator(wrappedIterator, + new FractionalDownsampler(readProperties.getDownsamplingMethod().toFraction)); + } + else if ( readProperties.getDownsamplingMethod().toCoverage != null ) { + + // If we're downsampling to coverage, we DO need to pay the cost of splitting/re-assembling + // the read stream to run the downsampler on the reads for each individual sample separately if + // BY_SAMPLE downsampling was requested. + + if ( readProperties.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE ) { + return new PerSampleDownsamplingReadsIterator(wrappedIterator, + new SimplePositionalDownsamplerFactory(readProperties.getDownsamplingMethod().toCoverage)); + } + else if ( readProperties.getDownsamplingMethod().type == DownsampleType.ALL_READS ) { + return new DownsamplingReadsIterator(wrappedIterator, + new SimplePositionalDownsampler(readProperties.getDownsamplingMethod().toCoverage)); + } } return wrappedIterator; diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java index ae1d98ce0..b3f636fd6 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java @@ -50,9 +50,9 @@ public class DownsamplingMethod { public final Double toFraction; /** - * Use the new experimental downsampling? + * Use the legacy downsampling implementation instead of the newer implementation? */ - public final boolean useExperimentalDownsampling; + public final boolean useLegacyDownsampler; /** * Expresses no downsampling applied at all. @@ -69,11 +69,11 @@ public class DownsamplingMethod { */ public static int DEFAULT_LOCUS_BASED_TRAVERSAL_DOWNSAMPLING_COVERAGE = 1000; - public DownsamplingMethod( DownsampleType type, Integer toCoverage, Double toFraction, boolean useExperimentalDownsampling ) { + public DownsamplingMethod( DownsampleType type, Integer toCoverage, Double toFraction, boolean useLegacyDownsampler ) { this.type = type != null ? type : DEFAULT_DOWNSAMPLING_TYPE; this.toCoverage = toCoverage; this.toFraction = toFraction; - this.useExperimentalDownsampling = useExperimentalDownsampling; + this.useLegacyDownsampler = useLegacyDownsampler; if ( type == DownsampleType.NONE ) { toCoverage = null; @@ -101,19 +101,19 @@ public class DownsamplingMethod { if ( toFraction != null && (toFraction < 0.0 || toFraction > 1.0) ) { throw new UserException.CommandLineException("toFraction must be >= 0.0 and <= 1.0 when downsampling to a fraction of reads"); } + } - // Some restrictions only exist for the old downsampling implementation: - if ( ! useExperimentalDownsampling ) { - // By sample downsampling does not work with a fraction of reads in the old downsampling implementation - if( type == DownsampleType.BY_SAMPLE && toFraction != null ) - throw new UserException.CommandLineException("Cannot downsample to fraction with the BY_SAMPLE method"); + public void checkCompatibilityWithWalker( Walker walker ) { + boolean isLocusTraversal = walker instanceof LocusWalker || walker instanceof ActiveRegionWalker; + + if ( ! isLocusTraversal && useLegacyDownsampler && toCoverage != null ) { + throw new UserException.CommandLineException("Downsampling to coverage for read-based traversals (eg., ReadWalkers) is not supported in the legacy downsampling implementation. " + + "The newer downsampling implementation does not have this limitation."); } - // Some restrictions only exist for the new downsampling implementation: - if ( useExperimentalDownsampling ) { - if ( type == DownsampleType.ALL_READS && toCoverage != null ) { - throw new UserException.CommandLineException("Cannot downsample to coverage with the ALL_READS method in the experimental downsampling implementation"); - } + if ( isLocusTraversal && ! useLegacyDownsampler && type == DownsampleType.ALL_READS && toCoverage != null ) { + throw new UserException.CommandLineException("Downsampling to coverage with the ALL_READS method for locus-based traversals (eg., LocusWalkers) is not yet supported in the new downsampling implementation (though it is supported for ReadWalkers). " + + "You can run with --use_legacy_downsampler for a broken and poorly-maintained implementation of ALL_READS to-coverage downsampling, but this is not recommended."); } } @@ -124,30 +124,34 @@ public class DownsamplingMethod { builder.append("No downsampling"); } else { - builder.append(String.format("Method: %s ", type)); + builder.append(String.format("Method: %s, ", type)); if ( toCoverage != null ) { - builder.append(String.format("Target Coverage: %d ", toCoverage)); + builder.append(String.format("Target Coverage: %d, ", toCoverage)); } else { - builder.append(String.format("Target Fraction: %.2f ", toFraction)); + builder.append(String.format("Target Fraction: %.2f, ", toFraction)); } - if ( useExperimentalDownsampling ) { - builder.append("Using Experimental Downsampling"); + if ( useLegacyDownsampler ) { + builder.append("Using the legacy downsampling implementation"); + } + else { + builder.append("Using the new downsampling implementation"); } } return builder.toString(); } - public static DownsamplingMethod getDefaultDownsamplingMethod( Walker walker, boolean useExperimentalDownsampling ) { + public static DownsamplingMethod getDefaultDownsamplingMethod( Walker walker, boolean useLegacyDownsampler ) { if ( walker instanceof LocusWalker || walker instanceof ActiveRegionWalker ) { return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE, DEFAULT_LOCUS_BASED_TRAVERSAL_DOWNSAMPLING_COVERAGE, - null, useExperimentalDownsampling); + null, useLegacyDownsampler); } else { - return new DownsamplingMethod(DownsampleType.NONE, null, null, useExperimentalDownsampling); + // Downsampling is off by default for non-locus-based traversals + return new DownsamplingMethod(DownsampleType.NONE, null, null, useLegacyDownsampler); } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java new file mode 100644 index 000000000..008ffde3b --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMRecord; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +/** + * Pass-Through Downsampler: Implementation of the ReadsDownsampler interface that does no + * downsampling whatsoever, and instead simply "passes-through" all the reads it's given. + * Useful for situations where you want to disable downsampling, but still need to use + * the downsampler interface. + * + * @author David Roazen + */ +public class PassThroughDownsampler implements ReadsDownsampler { + + private ArrayList selectedReads; + + public PassThroughDownsampler() { + clear(); + } + + public void submit( T newRead ) { + // All reads pass-through, no reads get downsampled + selectedReads.add(newRead); + } + + public void submit( Collection newReads ) { + for ( T read : newReads ) { + submit(read); + } + } + + public boolean hasFinalizedItems() { + return selectedReads.size() > 0; + } + + public List consumeFinalizedItems() { + // pass by reference rather than make a copy, for speed + List downsampledItems = selectedReads; + clear(); + return downsampledItems; + } + + public boolean hasPendingItems() { + return false; + } + + public T peekFinalized() { + return selectedReads.isEmpty() ? null : selectedReads.get(0); + } + + public T peekPending() { + return null; + } + + public int getNumberOfDiscardedItems() { + return 0; + } + + public void signalEndOfInput() { + // NO-OP + } + + public void clear() { + selectedReads = new ArrayList(); + } + + public void reset() { + // NO-OP + } + + public boolean requiresCoordinateSortOrder() { + return false; + } + + public void signalNoMoreReadsBefore( T read ) { + // NO-OP + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java index 6c0dc9769..735a4dce5 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java @@ -4,9 +4,9 @@ import net.sf.picard.util.PeekableIterator; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.datasources.reads.Shard; +import org.broadinstitute.sting.gatk.iterators.LegacyLocusIteratorByState; import org.broadinstitute.sting.gatk.iterators.LocusIterator; import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState; -import org.broadinstitute.sting.gatk.iterators.LocusIteratorByStateExperimental; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -83,17 +83,18 @@ public class WindowMaker implements Iterable, I this.sourceInfo = shard.getReadProperties(); this.readIterator = iterator; - // Temporary: use the experimental version of LocusIteratorByState if experimental downsampling was requested: - this.sourceIterator = sourceInfo.getDownsamplingMethod().useExperimentalDownsampling ? - new PeekableIterator(new LocusIteratorByStateExperimental(iterator,sourceInfo,genomeLocParser, sampleNames)) + // Use the legacy version of LocusIteratorByState if legacy downsampling was requested: + this.sourceIterator = sourceInfo.getDownsamplingMethod().useLegacyDownsampler ? + new PeekableIterator(new LegacyLocusIteratorByState(iterator,sourceInfo,genomeLocParser,sampleNames)) : - new PeekableIterator(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser, sampleNames)); + new PeekableIterator(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser,sampleNames)); + this.intervalIterator = intervals.size()>0 ? new PeekableIterator(intervals.iterator()) : null; } public WindowMaker(Shard shard, GenomeLocParser genomeLocParser, StingSAMIterator iterator, List intervals ) { - this(shard, genomeLocParser, iterator, intervals, LocusIteratorByState.sampleListForSAMWithoutReadGroups()); + this(shard, genomeLocParser, iterator, intervals, LegacyLocusIteratorByState.sampleListForSAMWithoutReadGroups()); } public Iterator iterator() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimental.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LegacyLocusIteratorByState.java similarity index 61% rename from public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimental.java rename to public/java/src/org/broadinstitute/sting/gatk/iterators/LegacyLocusIteratorByState.java index 557cbd009..5c833de4c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimental.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LegacyLocusIteratorByState.java @@ -31,14 +31,14 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.downsampling.DownsampleType; -import org.broadinstitute.sting.gatk.downsampling.Downsampler; -import org.broadinstitute.sting.gatk.downsampling.LevelingDownsampler; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.LegacyReservoirDownsampler; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; @@ -50,11 +50,11 @@ import java.util.*; /** * Iterator that traverses a SAM File, accumulating information on a per-locus basis */ -public class LocusIteratorByStateExperimental extends LocusIterator { +public class LegacyLocusIteratorByState extends LocusIterator { /** * our log, which we want to capture anything from this class */ - private static Logger logger = Logger.getLogger(LocusIteratorByState.class); + private static Logger logger = Logger.getLogger(LegacyLocusIteratorByState.class); // ----------------------------------------------------------------------------------------------------------------- // @@ -69,7 +69,7 @@ public class LocusIteratorByStateExperimental extends LocusIterator { private final ArrayList samples; private final ReadStateManager readStates; - protected static class SAMRecordState { + static private class SAMRecordState { SAMRecord read; int readOffset = -1; // how far are we offset from the start of the read bases? int genomeOffset = -1; // how far are we offset from the alignment start on the genome? @@ -213,7 +213,6 @@ public class LocusIteratorByStateExperimental extends LocusIterator { //final boolean DEBUG2 = false && DEBUG; private ReadProperties readInfo; private AlignmentContext nextAlignmentContext; - private boolean performLevelingDownsampling; // ----------------------------------------------------------------------------------------------------------------- // @@ -221,15 +220,11 @@ public class LocusIteratorByStateExperimental extends LocusIterator { // // ----------------------------------------------------------------------------------------------------------------- - public LocusIteratorByStateExperimental(final Iterator samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, Collection samples) { + public LegacyLocusIteratorByState(final Iterator samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, Collection samples) { this.readInfo = readInformation; this.genomeLocParser = genomeLocParser; this.samples = new ArrayList(samples); - this.readStates = new ReadStateManager(samIterator); - - this.performLevelingDownsampling = readInfo.getDownsamplingMethod() != null && - readInfo.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE && - readInfo.getDownsamplingMethod().toCoverage != null; + this.readStates = new ReadStateManager(samIterator, readInformation.getDownsamplingMethod()); // currently the GATK expects this LocusIteratorByState to accept empty sample lists, when // there's no read data. So we need to throw this error only when samIterator.hasNext() is true @@ -290,13 +285,11 @@ public class LocusIteratorByStateExperimental extends LocusIterator { final GenomeLoc location = getLocation(); final Map fullPileup = new HashMap(); - - // TODO: How can you determine here whether the current pileup has been downsampled? boolean hasBeenSampled = false; - for (final String sample : samples) { final Iterator iterator = readStates.iterator(sample); final List pile = new ArrayList(readStates.size(sample)); + hasBeenSampled |= location.getStart() <= readStates.getDownsamplingExtent(sample); int size = 0; // number of elements in this sample's pileup int nDeletions = 0; // number of deletions in this sample's pileup @@ -405,20 +398,34 @@ public class LocusIteratorByStateExperimental extends LocusIterator { throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); } - protected class ReadStateManager { + private class ReadStateManager { private final PeekableIterator iterator; + private final DownsamplingMethod downsamplingMethod; private final SamplePartitioner samplePartitioner; private final Map readStatesBySample = new HashMap(); + private final int targetCoverage; private int totalReadStates = 0; - public ReadStateManager(Iterator source) { + public ReadStateManager(Iterator source, DownsamplingMethod downsamplingMethod) { this.iterator = new PeekableIterator(source); - - for (final String sample : samples) { - readStatesBySample.put(sample, new PerSampleReadStateManager()); + this.downsamplingMethod = downsamplingMethod.type != null ? downsamplingMethod : DownsamplingMethod.NONE; + switch (this.downsamplingMethod.type) { + case BY_SAMPLE: + if (downsamplingMethod.toCoverage == null) + throw new UserException.BadArgumentValue("dcov", "Downsampling coverage (-dcov) must be specified when downsampling by sample"); + this.targetCoverage = downsamplingMethod.toCoverage; + break; + default: + this.targetCoverage = Integer.MAX_VALUE; } - samplePartitioner = new SamplePartitioner(); + Map readSelectors = new HashMap(); + for (final String sample : samples) { + readStatesBySample.put(sample, new PerSampleReadStateManager()); + readSelectors.put(sample, downsamplingMethod.type == DownsampleType.BY_SAMPLE ? new NRandomReadSelector(null, targetCoverage) : new AllReadsSelector()); + } + + samplePartitioner = new SamplePartitioner(readSelectors); } /** @@ -442,6 +449,7 @@ public class LocusIteratorByStateExperimental extends LocusIterator { public void remove() { wrappedIterator.remove(); + totalReadStates--; } }; } @@ -469,6 +477,17 @@ public class LocusIteratorByStateExperimental extends LocusIterator { return readStatesBySample.get(sample).size(); } + /** + * The extent of downsampling; basically, the furthest base out which has 'fallen + * victim' to the downsampler. + * + * @param sample Sample, downsampled independently. + * @return Integer stop of the furthest undownsampled region. + */ + public int getDownsamplingExtent(final String sample) { + return readStatesBySample.get(sample).getDownsamplingExtent(); + } + public SAMRecordState getFirst() { for (final String sample : samples) { PerSampleReadStateManager reads = readStatesBySample.get(sample); @@ -501,13 +520,61 @@ public class LocusIteratorByStateExperimental extends LocusIterator { samplePartitioner.submitRead(iterator.next()); } } + samplePartitioner.complete(); for (final String sample : samples) { - Collection newReads = samplePartitioner.getReadsForSample(sample); - PerSampleReadStateManager statesBySample = readStatesBySample.get(sample); - addReadsToSample(statesBySample, newReads); - } + ReadSelector aggregator = samplePartitioner.getSelectedReads(sample); + Collection newReads = new ArrayList(aggregator.getSelectedReads()); + + PerSampleReadStateManager statesBySample = readStatesBySample.get(sample); + int numReads = statesBySample.size(); + int downsamplingExtent = aggregator.getDownsamplingExtent(); + + if (numReads + newReads.size() <= targetCoverage || downsamplingMethod.type == DownsampleType.NONE) { + long readLimit = aggregator.getNumReadsSeen(); + addReadsToSample(statesBySample, newReads, readLimit); + statesBySample.specifyNewDownsamplingExtent(downsamplingExtent); + } else { + int[] counts = statesBySample.getCountsPerAlignmentStart(); + int[] updatedCounts = new int[counts.length]; + System.arraycopy(counts, 0, updatedCounts, 0, counts.length); + + boolean readPruned = true; + while (numReads + newReads.size() > targetCoverage && readPruned) { + readPruned = false; + for (int alignmentStart = updatedCounts.length - 1; numReads + newReads.size() > targetCoverage && alignmentStart >= 0; alignmentStart--) { + if (updatedCounts[alignmentStart] > 1) { + updatedCounts[alignmentStart]--; + numReads--; + readPruned = true; + } + } + } + + if (numReads == targetCoverage) { + updatedCounts[0]--; + numReads--; + } + + BitSet toPurge = new BitSet(readStates.size()); + int readOffset = 0; + + for (int i = 0; i < updatedCounts.length; i++) { + int n = counts[i]; + int k = updatedCounts[i]; + + for (Integer purgedElement : MathUtils.sampleIndicesWithoutReplacement(n, n - k)) + toPurge.set(readOffset + purgedElement); + + readOffset += counts[i]; + } + downsamplingExtent = Math.max(downsamplingExtent, statesBySample.purge(toPurge)); + + addReadsToSample(statesBySample, newReads, targetCoverage - numReads); + statesBySample.specifyNewDownsamplingExtent(downsamplingExtent); + } + } samplePartitioner.reset(); } @@ -516,134 +583,380 @@ public class LocusIteratorByStateExperimental extends LocusIterator { * * @param readStates The list of read states to add this collection of reads. * @param reads Reads to add. Selected reads will be pulled from this source. + * @param maxReads Maximum number of reads to add. */ - private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection reads) { + private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection reads, final long maxReads) { if (reads.isEmpty()) return; Collection newReadStates = new LinkedList(); - + int readCount = 0; for (SAMRecord read : reads) { - SAMRecordState state = new SAMRecordState(read); - state.stepForwardOnGenome(); - newReadStates.add(state); + if (readCount < maxReads) { + SAMRecordState state = new SAMRecordState(read); + state.stepForwardOnGenome(); + newReadStates.add(state); + readCount++; + } } - readStates.addStatesAtNextAlignmentStart(newReadStates); } - protected class PerSampleReadStateManager implements Iterable { - private List> readStatesByAlignmentStart = new LinkedList>(); - private int thisSampleReadStates = 0; - private Downsampler> levelingDownsampler = - performLevelingDownsampling ? - new LevelingDownsampler, SAMRecordState>(readInfo.getDownsamplingMethod().toCoverage) : - null; + private class PerSampleReadStateManager implements Iterable { + private final Queue readStates = new LinkedList(); + private final Deque readStateCounter = new LinkedList(); + private int downsamplingExtent = 0; public void addStatesAtNextAlignmentStart(Collection states) { - if ( states.isEmpty() ) { - return; - } - - readStatesByAlignmentStart.add(new LinkedList(states)); - thisSampleReadStates += states.size(); + readStates.addAll(states); + readStateCounter.add(new Counter(states.size())); totalReadStates += states.size(); - - if ( levelingDownsampler != null ) { - levelingDownsampler.submit(readStatesByAlignmentStart); - levelingDownsampler.signalEndOfInput(); - - thisSampleReadStates -= levelingDownsampler.getNumberOfDiscardedItems(); - totalReadStates -= levelingDownsampler.getNumberOfDiscardedItems(); - - // use returned List directly rather than make a copy, for efficiency's sake - readStatesByAlignmentStart = levelingDownsampler.consumeFinalizedItems(); - levelingDownsampler.reset(); - } } public boolean isEmpty() { - return readStatesByAlignmentStart.isEmpty(); + return readStates.isEmpty(); } public SAMRecordState peek() { - return isEmpty() ? null : readStatesByAlignmentStart.get(0).peek(); + return readStates.peek(); } public int size() { - return thisSampleReadStates; + return readStates.size(); + } + + public void specifyNewDownsamplingExtent(int downsamplingExtent) { + this.downsamplingExtent = Math.max(this.downsamplingExtent, downsamplingExtent); + } + + public int getDownsamplingExtent() { + return downsamplingExtent; + } + + public int[] getCountsPerAlignmentStart() { + int[] counts = new int[readStateCounter.size()]; + int index = 0; + for (Counter counter : readStateCounter) + counts[index++] = counter.getCount(); + return counts; } public Iterator iterator() { return new Iterator() { - private Iterator> alignmentStartIterator = readStatesByAlignmentStart.iterator(); - private LinkedList currentPositionReadStates = null; - private Iterator currentPositionReadStatesIterator = null; + private Iterator wrappedIterator = readStates.iterator(); public boolean hasNext() { - return alignmentStartIterator.hasNext() || - (currentPositionReadStatesIterator != null && currentPositionReadStatesIterator.hasNext()); + return wrappedIterator.hasNext(); } public SAMRecordState next() { - if ( currentPositionReadStatesIterator == null || ! currentPositionReadStatesIterator.hasNext() ) { - currentPositionReadStates = alignmentStartIterator.next(); - currentPositionReadStatesIterator = currentPositionReadStates.iterator(); - } - - return currentPositionReadStatesIterator.next(); + return wrappedIterator.next(); } public void remove() { - currentPositionReadStatesIterator.remove(); - thisSampleReadStates--; - totalReadStates--; - - if ( currentPositionReadStates.isEmpty() ) { - alignmentStartIterator.remove(); - } + wrappedIterator.remove(); + Counter counter = readStateCounter.peek(); + counter.decrement(); + if (counter.getCount() == 0) + readStateCounter.remove(); } }; } + + /** + * Purge the given elements from the bitset. If an element in the bitset is true, purge + * the corresponding read state. + * + * @param elements bits from the set to purge. + * @return the extent of the final downsampled read. + */ + public int purge(final BitSet elements) { + int downsamplingExtent = 0; + + if (elements.isEmpty() || readStates.isEmpty()) return downsamplingExtent; + + Iterator readStateIterator = readStates.iterator(); + + Iterator counterIterator = readStateCounter.iterator(); + Counter currentCounter = counterIterator.next(); + + int readIndex = 0; + long alignmentStartCounter = currentCounter.getCount(); + + int toPurge = elements.nextSetBit(0); + int removedCount = 0; + + while (readStateIterator.hasNext() && toPurge >= 0) { + SAMRecordState state = readStateIterator.next(); + downsamplingExtent = Math.max(downsamplingExtent, state.getRead().getAlignmentEnd()); + + if (readIndex == toPurge) { + readStateIterator.remove(); + currentCounter.decrement(); + if (currentCounter.getCount() == 0) + counterIterator.remove(); + removedCount++; + toPurge = elements.nextSetBit(toPurge + 1); + } + + readIndex++; + alignmentStartCounter--; + if (alignmentStartCounter == 0 && counterIterator.hasNext()) { + currentCounter = counterIterator.next(); + alignmentStartCounter = currentCounter.getCount(); + } + } + + totalReadStates -= removedCount; + + return downsamplingExtent; + } } } /** - * Note: stores reads by sample ID string, not by sample object + * Note: assuming that, whenever we downsample, we downsample to an integer capacity. */ - private class SamplePartitioner { - private Map> readsBySample; - private long readsSeen = 0; + static private class Counter { + private int count; - public SamplePartitioner() { - readsBySample = new HashMap>(); - - for ( String sample : samples ) { - readsBySample.put(sample, new ArrayList()); - } + public Counter(int count) { + this.count = count; } - public void submitRead(SAMRecord read) { - String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; - if (readsBySample.containsKey(sampleName)) - readsBySample.get(sampleName).add(read); - readsSeen++; + public int getCount() { + return count; } - public long getNumReadsSeen() { - return readsSeen; - } - - public Collection getReadsForSample(String sampleName) { - if ( ! readsBySample.containsKey(sampleName) ) - throw new NoSuchElementException("Sample name not found"); - return readsBySample.get(sampleName); - } - - public void reset() { - for ( Collection perSampleReads : readsBySample.values() ) - perSampleReads.clear(); - readsSeen = 0; + public void decrement() { + count--; } } -} \ No newline at end of file +} + +/** + * Selects reads passed to it based on a criteria decided through inheritance. + * TODO: This is a temporary abstraction until we can get rid of this downsampling implementation and the mrl option. Get rid of this. + */ +interface ReadSelector { + /** + * All previous selectors in the chain have allowed this read. Submit it to this selector for consideration. + * + * @param read the read to evaluate. + */ + public void submitRead(SAMRecord read); + + /** + * A previous selector has deemed this read unfit. Notify this selector so that this selector's counts are valid. + * + * @param read the read previously rejected. + */ + public void notifyReadRejected(SAMRecord read); + + /** + * Signal the selector that read additions are complete. + */ + public void complete(); + + /** + * Retrieve the number of reads seen by this selector so far. + * + * @return number of reads seen. + */ + public long getNumReadsSeen(); + + /** + * Return the number of reads accepted by this selector so far. + * + * @return number of reads selected. + */ + public long getNumReadsSelected(); + + /** + * Gets the locus at which the last of the downsampled reads selected by this selector ends. The value returned will be the + * last aligned position from this selection to which a downsampled read aligns -- in other words, if a read is thrown out at + * position 3 whose cigar string is 76M, the value of this parameter will be 78. + * + * @return If any read has been downsampled, this will return the last aligned base of the longest alignment. Else, 0. + */ + public int getDownsamplingExtent(); + + /** + * Get the reads selected by this selector. + * + * @return collection of reads selected by this selector. + */ + public Collection getSelectedReads(); + + /** + * Reset this collection to its pre-gathered state. + */ + public void reset(); +} + +/** + * Select every read passed in. + */ +class AllReadsSelector implements ReadSelector { + private Collection reads = new LinkedList(); + private long readsSeen = 0; + private int downsamplingExtent = 0; + + public void submitRead(SAMRecord read) { + reads.add(read); + readsSeen++; + } + + public void notifyReadRejected(SAMRecord read) { + readsSeen++; + downsamplingExtent = Math.max(downsamplingExtent, read.getAlignmentEnd()); + } + + public void complete() { + // NO-OP. + } + + public long getNumReadsSeen() { + return readsSeen; + } + + public long getNumReadsSelected() { + return readsSeen; + } + + public int getDownsamplingExtent() { + return downsamplingExtent; + } + + public Collection getSelectedReads() { + return reads; + } + + public void reset() { + reads.clear(); + readsSeen = 0; + downsamplingExtent = 0; + } +} + + +/** + * Select N reads randomly from the input stream. + */ +class NRandomReadSelector implements ReadSelector { + private final LegacyReservoirDownsampler reservoir; + private final ReadSelector chainedSelector; + private long readsSeen = 0; + private int downsamplingExtent = 0; + + public NRandomReadSelector(ReadSelector chainedSelector, long readLimit) { + this.reservoir = new LegacyReservoirDownsampler((int) readLimit); + this.chainedSelector = chainedSelector; + } + + public void submitRead(SAMRecord read) { + SAMRecord displaced = reservoir.add(read); + if (displaced != null && chainedSelector != null) { + chainedSelector.notifyReadRejected(read); + downsamplingExtent = Math.max(downsamplingExtent, read.getAlignmentEnd()); + } + readsSeen++; + } + + public void notifyReadRejected(SAMRecord read) { + readsSeen++; + } + + public void complete() { + for (SAMRecord read : reservoir.getDownsampledContents()) + chainedSelector.submitRead(read); + if (chainedSelector != null) + chainedSelector.complete(); + } + + + public long getNumReadsSeen() { + return readsSeen; + } + + public long getNumReadsSelected() { + return reservoir.size(); + } + + public int getDownsamplingExtent() { + return downsamplingExtent; + } + + public Collection getSelectedReads() { + return reservoir.getDownsampledContents(); + } + + public void reset() { + reservoir.clear(); + downsamplingExtent = 0; + if (chainedSelector != null) + chainedSelector.reset(); + } +} + +/** + * Note: stores reads by sample ID string, not by sample object + */ +class SamplePartitioner implements ReadSelector { + private final Map readsBySample; + private long readsSeen = 0; + + public SamplePartitioner(Map readSelectors) { + readsBySample = readSelectors; + } + + public void submitRead(SAMRecord read) { + String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; + if (readsBySample.containsKey(sampleName)) + readsBySample.get(sampleName).submitRead(read); + readsSeen++; + } + + public void notifyReadRejected(SAMRecord read) { + String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; + if (readsBySample.containsKey(sampleName)) + readsBySample.get(sampleName).notifyReadRejected(read); + readsSeen++; + } + + public void complete() { + // NO-OP. + } + + public long getNumReadsSeen() { + return readsSeen; + } + + public long getNumReadsSelected() { + return readsSeen; + } + + public int getDownsamplingExtent() { + int downsamplingExtent = 0; + for (ReadSelector storage : readsBySample.values()) + downsamplingExtent = Math.max(downsamplingExtent, storage.getDownsamplingExtent()); + return downsamplingExtent; + } + + public Collection getSelectedReads() { + throw new UnsupportedOperationException("Cannot directly get selected reads from a read partitioner."); + } + + public ReadSelector getSelectedReads(String sampleName) { + if (!readsBySample.containsKey(sampleName)) + throw new NoSuchElementException("Sample name not found"); + return readsBySample.get(sampleName); + } + + public void reset() { + for (ReadSelector storage : readsBySample.values()) + storage.reset(); + readsSeen = 0; + } + +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java index 46e84798a..4f8d7d3f2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java @@ -31,14 +31,11 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.downsampling.DownsampleType; -import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.downsampling.*; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.ReservoirDownsampler; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; @@ -54,7 +51,7 @@ public class LocusIteratorByState extends LocusIterator { /** * our log, which we want to capture anything from this class */ - private static Logger logger = Logger.getLogger(LocusIteratorByState.class); + private static Logger logger = Logger.getLogger(LegacyLocusIteratorByState.class); // ----------------------------------------------------------------------------------------------------------------- // @@ -69,7 +66,7 @@ public class LocusIteratorByState extends LocusIterator { private final ArrayList samples; private final ReadStateManager readStates; - static private class SAMRecordState { + protected static class SAMRecordState { SAMRecord read; int readOffset = -1; // how far are we offset from the start of the read bases? int genomeOffset = -1; // how far are we offset from the alignment start on the genome? @@ -213,6 +210,7 @@ public class LocusIteratorByState extends LocusIterator { //final boolean DEBUG2 = false && DEBUG; private ReadProperties readInfo; private AlignmentContext nextAlignmentContext; + private boolean performDownsampling; // ----------------------------------------------------------------------------------------------------------------- // @@ -224,7 +222,18 @@ public class LocusIteratorByState extends LocusIterator { this.readInfo = readInformation; this.genomeLocParser = genomeLocParser; this.samples = new ArrayList(samples); - this.readStates = new ReadStateManager(samIterator, readInformation.getDownsamplingMethod()); + + // LIBS will invoke the Reservoir and Leveling downsamplers on the read stream if we're + // downsampling to coverage by sample. SAMDataSource will have refrained from applying + // any downsamplers to the read stream in this case, in the expectation that LIBS will + // manage the downsampling. The reason for this is twofold: performance (don't have to + // split/re-assemble the read stream in SAMDataSource), and to enable partial downsampling + // of reads (eg., using half of a read, and throwing the rest away). + this.performDownsampling = readInfo.getDownsamplingMethod() != null && + readInfo.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE && + readInfo.getDownsamplingMethod().toCoverage != null; + + this.readStates = new ReadStateManager(samIterator); // currently the GATK expects this LocusIteratorByState to accept empty sample lists, when // there's no read data. So we need to throw this error only when samIterator.hasNext() is true @@ -285,11 +294,13 @@ public class LocusIteratorByState extends LocusIterator { final GenomeLoc location = getLocation(); final Map fullPileup = new HashMap(); + + // TODO: How can you determine here whether the current pileup has been downsampled? boolean hasBeenSampled = false; + for (final String sample : samples) { final Iterator iterator = readStates.iterator(sample); final List pile = new ArrayList(readStates.size(sample)); - hasBeenSampled |= location.getStart() <= readStates.getDownsamplingExtent(sample); int size = 0; // number of elements in this sample's pileup int nDeletions = 0; // number of deletions in this sample's pileup @@ -398,34 +409,20 @@ public class LocusIteratorByState extends LocusIterator { throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); } - private class ReadStateManager { + protected class ReadStateManager { private final PeekableIterator iterator; - private final DownsamplingMethod downsamplingMethod; private final SamplePartitioner samplePartitioner; private final Map readStatesBySample = new HashMap(); - private final int targetCoverage; private int totalReadStates = 0; - public ReadStateManager(Iterator source, DownsamplingMethod downsamplingMethod) { + public ReadStateManager(Iterator source) { this.iterator = new PeekableIterator(source); - this.downsamplingMethod = downsamplingMethod.type != null ? downsamplingMethod : DownsamplingMethod.NONE; - switch (this.downsamplingMethod.type) { - case BY_SAMPLE: - if (downsamplingMethod.toCoverage == null) - throw new UserException.BadArgumentValue("dcov", "Downsampling coverage (-dcov) must be specified when downsampling by sample"); - this.targetCoverage = downsamplingMethod.toCoverage; - break; - default: - this.targetCoverage = Integer.MAX_VALUE; - } - Map readSelectors = new HashMap(); for (final String sample : samples) { readStatesBySample.put(sample, new PerSampleReadStateManager()); - readSelectors.put(sample, downsamplingMethod.type == DownsampleType.BY_SAMPLE ? new NRandomReadSelector(null, targetCoverage) : new AllReadsSelector()); } - samplePartitioner = new SamplePartitioner(readSelectors); + samplePartitioner = new SamplePartitioner(performDownsampling); } /** @@ -449,7 +446,6 @@ public class LocusIteratorByState extends LocusIterator { public void remove() { wrappedIterator.remove(); - totalReadStates--; } }; } @@ -477,17 +473,6 @@ public class LocusIteratorByState extends LocusIterator { return readStatesBySample.get(sample).size(); } - /** - * The extent of downsampling; basically, the furthest base out which has 'fallen - * victim' to the downsampler. - * - * @param sample Sample, downsampled independently. - * @return Integer stop of the furthest undownsampled region. - */ - public int getDownsamplingExtent(final String sample) { - return readStatesBySample.get(sample).getDownsamplingExtent(); - } - public SAMRecordState getFirst() { for (final String sample : samples) { PerSampleReadStateManager reads = readStatesBySample.get(sample); @@ -520,61 +505,15 @@ public class LocusIteratorByState extends LocusIterator { samplePartitioner.submitRead(iterator.next()); } } - samplePartitioner.complete(); + + samplePartitioner.doneSubmittingReads(); for (final String sample : samples) { - ReadSelector aggregator = samplePartitioner.getSelectedReads(sample); - - Collection newReads = new ArrayList(aggregator.getSelectedReads()); - + Collection newReads = samplePartitioner.getReadsForSample(sample); PerSampleReadStateManager statesBySample = readStatesBySample.get(sample); - int numReads = statesBySample.size(); - int downsamplingExtent = aggregator.getDownsamplingExtent(); - - if (numReads + newReads.size() <= targetCoverage || downsamplingMethod.type == DownsampleType.NONE) { - long readLimit = aggregator.getNumReadsSeen(); - addReadsToSample(statesBySample, newReads, readLimit); - statesBySample.specifyNewDownsamplingExtent(downsamplingExtent); - } else { - int[] counts = statesBySample.getCountsPerAlignmentStart(); - int[] updatedCounts = new int[counts.length]; - System.arraycopy(counts, 0, updatedCounts, 0, counts.length); - - boolean readPruned = true; - while (numReads + newReads.size() > targetCoverage && readPruned) { - readPruned = false; - for (int alignmentStart = updatedCounts.length - 1; numReads + newReads.size() > targetCoverage && alignmentStart >= 0; alignmentStart--) { - if (updatedCounts[alignmentStart] > 1) { - updatedCounts[alignmentStart]--; - numReads--; - readPruned = true; - } - } - } - - if (numReads == targetCoverage) { - updatedCounts[0]--; - numReads--; - } - - BitSet toPurge = new BitSet(readStates.size()); - int readOffset = 0; - - for (int i = 0; i < updatedCounts.length; i++) { - int n = counts[i]; - int k = updatedCounts[i]; - - for (Integer purgedElement : MathUtils.sampleIndicesWithoutReplacement(n, n - k)) - toPurge.set(readOffset + purgedElement); - - readOffset += counts[i]; - } - downsamplingExtent = Math.max(downsamplingExtent, statesBySample.purge(toPurge)); - - addReadsToSample(statesBySample, newReads, targetCoverage - numReads); - statesBySample.specifyNewDownsamplingExtent(downsamplingExtent); - } + addReadsToSample(statesBySample, newReads); } + samplePartitioner.reset(); } @@ -583,380 +522,140 @@ public class LocusIteratorByState extends LocusIterator { * * @param readStates The list of read states to add this collection of reads. * @param reads Reads to add. Selected reads will be pulled from this source. - * @param maxReads Maximum number of reads to add. */ - private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection reads, final long maxReads) { + private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection reads) { if (reads.isEmpty()) return; Collection newReadStates = new LinkedList(); - int readCount = 0; + for (SAMRecord read : reads) { - if (readCount < maxReads) { - SAMRecordState state = new SAMRecordState(read); - state.stepForwardOnGenome(); - newReadStates.add(state); - readCount++; - } + SAMRecordState state = new SAMRecordState(read); + state.stepForwardOnGenome(); + newReadStates.add(state); } + readStates.addStatesAtNextAlignmentStart(newReadStates); } - private class PerSampleReadStateManager implements Iterable { - private final Queue readStates = new LinkedList(); - private final Deque readStateCounter = new LinkedList(); - private int downsamplingExtent = 0; + protected class PerSampleReadStateManager implements Iterable { + private List> readStatesByAlignmentStart = new LinkedList>(); + private int thisSampleReadStates = 0; + private Downsampler> levelingDownsampler = + performDownsampling ? + new LevelingDownsampler, SAMRecordState>(readInfo.getDownsamplingMethod().toCoverage) : + null; public void addStatesAtNextAlignmentStart(Collection states) { - readStates.addAll(states); - readStateCounter.add(new Counter(states.size())); + if ( states.isEmpty() ) { + return; + } + + readStatesByAlignmentStart.add(new LinkedList(states)); + thisSampleReadStates += states.size(); totalReadStates += states.size(); + + if ( levelingDownsampler != null ) { + levelingDownsampler.submit(readStatesByAlignmentStart); + levelingDownsampler.signalEndOfInput(); + + thisSampleReadStates -= levelingDownsampler.getNumberOfDiscardedItems(); + totalReadStates -= levelingDownsampler.getNumberOfDiscardedItems(); + + // use returned List directly rather than make a copy, for efficiency's sake + readStatesByAlignmentStart = levelingDownsampler.consumeFinalizedItems(); + levelingDownsampler.reset(); + } } public boolean isEmpty() { - return readStates.isEmpty(); + return readStatesByAlignmentStart.isEmpty(); } public SAMRecordState peek() { - return readStates.peek(); + return isEmpty() ? null : readStatesByAlignmentStart.get(0).peek(); } public int size() { - return readStates.size(); - } - - public void specifyNewDownsamplingExtent(int downsamplingExtent) { - this.downsamplingExtent = Math.max(this.downsamplingExtent, downsamplingExtent); - } - - public int getDownsamplingExtent() { - return downsamplingExtent; - } - - public int[] getCountsPerAlignmentStart() { - int[] counts = new int[readStateCounter.size()]; - int index = 0; - for (Counter counter : readStateCounter) - counts[index++] = counter.getCount(); - return counts; + return thisSampleReadStates; } public Iterator iterator() { return new Iterator() { - private Iterator wrappedIterator = readStates.iterator(); + private Iterator> alignmentStartIterator = readStatesByAlignmentStart.iterator(); + private LinkedList currentPositionReadStates = null; + private Iterator currentPositionReadStatesIterator = null; public boolean hasNext() { - return wrappedIterator.hasNext(); + return alignmentStartIterator.hasNext() || + (currentPositionReadStatesIterator != null && currentPositionReadStatesIterator.hasNext()); } public SAMRecordState next() { - return wrappedIterator.next(); + if ( currentPositionReadStatesIterator == null || ! currentPositionReadStatesIterator.hasNext() ) { + currentPositionReadStates = alignmentStartIterator.next(); + currentPositionReadStatesIterator = currentPositionReadStates.iterator(); + } + + return currentPositionReadStatesIterator.next(); } public void remove() { - wrappedIterator.remove(); - Counter counter = readStateCounter.peek(); - counter.decrement(); - if (counter.getCount() == 0) - readStateCounter.remove(); + currentPositionReadStatesIterator.remove(); + thisSampleReadStates--; + totalReadStates--; + + if ( currentPositionReadStates.isEmpty() ) { + alignmentStartIterator.remove(); + } } }; } + } + } - /** - * Purge the given elements from the bitset. If an element in the bitset is true, purge - * the corresponding read state. - * - * @param elements bits from the set to purge. - * @return the extent of the final downsampled read. - */ - public int purge(final BitSet elements) { - int downsamplingExtent = 0; + /** + * Divides reads by sample and (if requested) does a preliminary downsampling pass with a ReservoirDownsampler. + * + * Note: stores reads by sample ID string, not by sample object + */ + private class SamplePartitioner { + private Map> readsBySample; - if (elements.isEmpty() || readStates.isEmpty()) return downsamplingExtent; + public SamplePartitioner( boolean downsampleReads ) { + readsBySample = new HashMap>(); - Iterator readStateIterator = readStates.iterator(); + for ( String sample : samples ) { + readsBySample.put(sample, + downsampleReads ? new ReservoirDownsampler(readInfo.getDownsamplingMethod().toCoverage) : + new PassThroughDownsampler()); + } + } - Iterator counterIterator = readStateCounter.iterator(); - Counter currentCounter = counterIterator.next(); + public void submitRead(SAMRecord read) { + String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; + if (readsBySample.containsKey(sampleName)) + readsBySample.get(sampleName).submit(read); + } - int readIndex = 0; - long alignmentStartCounter = currentCounter.getCount(); + public void doneSubmittingReads() { + for ( Map.Entry> perSampleReads : readsBySample.entrySet() ) { + perSampleReads.getValue().signalEndOfInput(); + } + } - int toPurge = elements.nextSetBit(0); - int removedCount = 0; + public Collection getReadsForSample(String sampleName) { + if ( ! readsBySample.containsKey(sampleName) ) + throw new NoSuchElementException("Sample name not found"); - while (readStateIterator.hasNext() && toPurge >= 0) { - SAMRecordState state = readStateIterator.next(); - downsamplingExtent = Math.max(downsamplingExtent, state.getRead().getAlignmentEnd()); + return readsBySample.get(sampleName).consumeFinalizedItems(); + } - if (readIndex == toPurge) { - readStateIterator.remove(); - currentCounter.decrement(); - if (currentCounter.getCount() == 0) - counterIterator.remove(); - removedCount++; - toPurge = elements.nextSetBit(toPurge + 1); - } - - readIndex++; - alignmentStartCounter--; - if (alignmentStartCounter == 0 && counterIterator.hasNext()) { - currentCounter = counterIterator.next(); - alignmentStartCounter = currentCounter.getCount(); - } - } - - totalReadStates -= removedCount; - - return downsamplingExtent; + public void reset() { + for ( Map.Entry> perSampleReads : readsBySample.entrySet() ) { + perSampleReads.getValue().clear(); + perSampleReads.getValue().reset(); } } } - - /** - * Note: assuming that, whenever we downsample, we downsample to an integer capacity. - */ - static private class Counter { - private int count; - - public Counter(int count) { - this.count = count; - } - - public int getCount() { - return count; - } - - public void decrement() { - count--; - } - } -} - -/** - * Selects reads passed to it based on a criteria decided through inheritance. - * TODO: This is a temporary abstraction until we can get rid of this downsampling implementation and the mrl option. Get rid of this. - */ -interface ReadSelector { - /** - * All previous selectors in the chain have allowed this read. Submit it to this selector for consideration. - * - * @param read the read to evaluate. - */ - public void submitRead(SAMRecord read); - - /** - * A previous selector has deemed this read unfit. Notify this selector so that this selector's counts are valid. - * - * @param read the read previously rejected. - */ - public void notifyReadRejected(SAMRecord read); - - /** - * Signal the selector that read additions are complete. - */ - public void complete(); - - /** - * Retrieve the number of reads seen by this selector so far. - * - * @return number of reads seen. - */ - public long getNumReadsSeen(); - - /** - * Return the number of reads accepted by this selector so far. - * - * @return number of reads selected. - */ - public long getNumReadsSelected(); - - /** - * Gets the locus at which the last of the downsampled reads selected by this selector ends. The value returned will be the - * last aligned position from this selection to which a downsampled read aligns -- in other words, if a read is thrown out at - * position 3 whose cigar string is 76M, the value of this parameter will be 78. - * - * @return If any read has been downsampled, this will return the last aligned base of the longest alignment. Else, 0. - */ - public int getDownsamplingExtent(); - - /** - * Get the reads selected by this selector. - * - * @return collection of reads selected by this selector. - */ - public Collection getSelectedReads(); - - /** - * Reset this collection to its pre-gathered state. - */ - public void reset(); -} - -/** - * Select every read passed in. - */ -class AllReadsSelector implements ReadSelector { - private Collection reads = new LinkedList(); - private long readsSeen = 0; - private int downsamplingExtent = 0; - - public void submitRead(SAMRecord read) { - reads.add(read); - readsSeen++; - } - - public void notifyReadRejected(SAMRecord read) { - readsSeen++; - downsamplingExtent = Math.max(downsamplingExtent, read.getAlignmentEnd()); - } - - public void complete() { - // NO-OP. - } - - public long getNumReadsSeen() { - return readsSeen; - } - - public long getNumReadsSelected() { - return readsSeen; - } - - public int getDownsamplingExtent() { - return downsamplingExtent; - } - - public Collection getSelectedReads() { - return reads; - } - - public void reset() { - reads.clear(); - readsSeen = 0; - downsamplingExtent = 0; - } -} - - -/** - * Select N reads randomly from the input stream. - */ -class NRandomReadSelector implements ReadSelector { - private final ReservoirDownsampler reservoir; - private final ReadSelector chainedSelector; - private long readsSeen = 0; - private int downsamplingExtent = 0; - - public NRandomReadSelector(ReadSelector chainedSelector, long readLimit) { - this.reservoir = new ReservoirDownsampler((int) readLimit); - this.chainedSelector = chainedSelector; - } - - public void submitRead(SAMRecord read) { - SAMRecord displaced = reservoir.add(read); - if (displaced != null && chainedSelector != null) { - chainedSelector.notifyReadRejected(read); - downsamplingExtent = Math.max(downsamplingExtent, read.getAlignmentEnd()); - } - readsSeen++; - } - - public void notifyReadRejected(SAMRecord read) { - readsSeen++; - } - - public void complete() { - for (SAMRecord read : reservoir.getDownsampledContents()) - chainedSelector.submitRead(read); - if (chainedSelector != null) - chainedSelector.complete(); - } - - - public long getNumReadsSeen() { - return readsSeen; - } - - public long getNumReadsSelected() { - return reservoir.size(); - } - - public int getDownsamplingExtent() { - return downsamplingExtent; - } - - public Collection getSelectedReads() { - return reservoir.getDownsampledContents(); - } - - public void reset() { - reservoir.clear(); - downsamplingExtent = 0; - if (chainedSelector != null) - chainedSelector.reset(); - } -} - -/** - * Note: stores reads by sample ID string, not by sample object - */ -class SamplePartitioner implements ReadSelector { - private final Map readsBySample; - private long readsSeen = 0; - - public SamplePartitioner(Map readSelectors) { - readsBySample = readSelectors; - } - - public void submitRead(SAMRecord read) { - String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; - if (readsBySample.containsKey(sampleName)) - readsBySample.get(sampleName).submitRead(read); - readsSeen++; - } - - public void notifyReadRejected(SAMRecord read) { - String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; - if (readsBySample.containsKey(sampleName)) - readsBySample.get(sampleName).notifyReadRejected(read); - readsSeen++; - } - - public void complete() { - // NO-OP. - } - - public long getNumReadsSeen() { - return readsSeen; - } - - public long getNumReadsSelected() { - return readsSeen; - } - - public int getDownsamplingExtent() { - int downsamplingExtent = 0; - for (ReadSelector storage : readsBySample.values()) - downsamplingExtent = Math.max(downsamplingExtent, storage.getDownsamplingExtent()); - return downsamplingExtent; - } - - public Collection getSelectedReads() { - throw new UnsupportedOperationException("Cannot directly get selected reads from a read partitioner."); - } - - public ReadSelector getSelectedReads(String sampleName) { - if (!readsBySample.containsKey(sampleName)) - throw new NoSuchElementException("Sample name not found"); - return readsBySample.get(sampleName); - } - - public void reset() { - for (ReadSelector storage : readsBySample.values()) - storage.reset(); - readsSeen = 0; - } - -} +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/ReservoirDownsampler.java b/public/java/src/org/broadinstitute/sting/utils/LegacyReservoirDownsampler.java similarity index 94% rename from public/java/src/org/broadinstitute/sting/utils/ReservoirDownsampler.java rename to public/java/src/org/broadinstitute/sting/utils/LegacyReservoirDownsampler.java index a758df431..ba863310d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/ReservoirDownsampler.java +++ b/public/java/src/org/broadinstitute/sting/utils/LegacyReservoirDownsampler.java @@ -8,6 +8,8 @@ import java.util.Collection; import java.util.Iterator; /** + * THIS IMPLEMENTATION IS BROKEN AND WILL BE REMOVED ONCE THE DOWNSAMPLING ENGINE FORK COLLAPSES + * * Randomly downsample from a stream of elements. This algorithm is a direct, * naive implementation of reservoir downsampling as described in "Random Downsampling * with a Reservoir" (Vitter 1985). At time of writing, this paper is located here: @@ -16,7 +18,7 @@ import java.util.Iterator; * @author mhanna * @version 0.1 */ -public class ReservoirDownsampler { +public class LegacyReservoirDownsampler { /** * The reservoir of elements tracked by this downsampler. */ @@ -31,7 +33,7 @@ public class ReservoirDownsampler { * Create a new downsampler with the given source iterator and given comparator. * @param maxElements What is the maximum number of reads that can be returned in any call of this */ - public ReservoirDownsampler(final int maxElements) { + public LegacyReservoirDownsampler(final int maxElements) { if(maxElements < 0) throw new ReviewedStingException("Unable to work with an negative size collection of elements"); this.reservoir = new ArrayList(maxElements); diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java index 61c1c51b4..8186f69cf 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java @@ -32,11 +32,10 @@ import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.ReadProperties; -import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter; -import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState; +import org.broadinstitute.sting.gatk.iterators.LegacyLocusIteratorByState; import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.walkers.qc.CountLoci; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -85,7 +84,7 @@ public class DownsamplerBenchmark extends ReadProcessingBenchmark { GenomeLocParser genomeLocParser = new GenomeLocParser(reader.getFileHeader().getSequenceDictionary()); // Filter unmapped reads. TODO: is this always strictly necessary? Who in the GATK normally filters these out? Iterator readIterator = new FilteringIterator(reader.iterator(),new UnmappedReadFilter()); - LocusIteratorByState locusIteratorByState = new LocusIteratorByState(readIterator,readProperties,genomeLocParser, LocusIteratorByState.sampleListForSAMWithoutReadGroups()); + LegacyLocusIteratorByState locusIteratorByState = new LegacyLocusIteratorByState(readIterator,readProperties,genomeLocParser, LegacyLocusIteratorByState.sampleListForSAMWithoutReadGroups()); while(locusIteratorByState.hasNext()) { locusIteratorByState.next().getLocation(); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancerUnitTest.java similarity index 97% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancerUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancerUnitTest.java index 0807f36dc..73664141f 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancerUnitTest.java @@ -45,10 +45,10 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; -public class ExperimentalReadShardBalancerUnitTest extends BaseTest { +public class ReadShardBalancerUnitTest extends BaseTest { /** - * Tests to ensure that ExperimentalReadShardBalancer works as expected and does not place shard boundaries + * Tests to ensure that ReadShardBalancer works as expected and does not place shard boundaries * at inappropriate places, such as within an alignment start position */ private static class ExperimentalReadShardBalancerTest extends TestDataProvider { @@ -74,7 +74,7 @@ public class ExperimentalReadShardBalancerUnitTest extends BaseTest { this.stackSize = stackSize; this.numUnmappedReads = numUnmappedReads; - this.downsamplingMethod = new DownsamplingMethod(DownsampleType.BY_SAMPLE, downsamplingTargetCoverage, null, true); + this.downsamplingMethod = new DownsamplingMethod(DownsampleType.BY_SAMPLE, downsamplingTargetCoverage, null, false); this.expectedReadCount = Math.min(stackSize, downsamplingTargetCoverage) * numStacksPerContig * numContigs + numUnmappedReads; setName(String.format("%s: numContigs=%d numStacksPerContig=%d stackSize=%d numUnmappedReads=%d downsamplingTargetCoverage=%d", @@ -96,7 +96,7 @@ public class ExperimentalReadShardBalancerUnitTest extends BaseTest { new ArrayList(), false); - Iterable shardIterator = dataSource.createShardIteratorOverAllReads(new ExperimentalReadShardBalancer()); + Iterable shardIterator = dataSource.createShardIteratorOverAllReads(new ReadShardBalancer()); SAMRecord readAtEndOfLastShard = null; int totalReadsSeen = 0; diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimentalUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/LegacyLocusIteratorByStateUnitTest.java similarity index 50% rename from public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimentalUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/iterators/LegacyLocusIteratorByStateUnitTest.java index a49a602c6..44d182661 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimentalUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/iterators/LegacyLocusIteratorByStateUnitTest.java @@ -7,10 +7,8 @@ import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; -import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -21,14 +19,17 @@ import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.util.*; +import java.util.Arrays; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; /** - * testing of the experimental version of LocusIteratorByState + * testing of the LEGACY version of LocusIteratorByState */ -public class LocusIteratorByStateExperimentalUnitTest extends BaseTest { +public class LegacyLocusIteratorByStateUnitTest extends BaseTest { private static SAMFileHeader header; - private LocusIteratorByStateExperimental li; + private LegacyLocusIteratorByState li; private GenomeLocParser genomeLocParser; @BeforeClass @@ -37,8 +38,8 @@ public class LocusIteratorByStateExperimentalUnitTest extends BaseTest { genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); } - private LocusIteratorByStateExperimental makeLTBS(List reads, ReadProperties readAttributes) { - return new LocusIteratorByStateExperimental(new FakeCloseableIterator(reads.iterator()), readAttributes, genomeLocParser, LocusIteratorByStateExperimental.sampleListForSAMWithoutReadGroups()); + private LegacyLocusIteratorByState makeLTBS(List reads, ReadProperties readAttributes) { + return new LegacyLocusIteratorByState(new FakeCloseableIterator(reads.iterator()), readAttributes, genomeLocParser, LegacyLocusIteratorByState.sampleListForSAMWithoutReadGroups()); } @Test @@ -328,184 +329,14 @@ public class LocusIteratorByStateExperimentalUnitTest extends BaseTest { // End comprehensive LIBS/PileupElement tests // //////////////////////////////////////////////// - - /////////////////////////////////////// - // Read State Manager Tests // - /////////////////////////////////////// - - private class PerSampleReadStateManagerTest extends TestDataProvider { - private List readCountsPerAlignmentStart; - private List reads; - private List> recordStatesByAlignmentStart; - private int removalInterval; - - public PerSampleReadStateManagerTest( List readCountsPerAlignmentStart, int removalInterval ) { - super(PerSampleReadStateManagerTest.class); - - this.readCountsPerAlignmentStart = readCountsPerAlignmentStart; - this.removalInterval = removalInterval; - - reads = new ArrayList(); - recordStatesByAlignmentStart = new ArrayList>(); - - setName(String.format("%s: readCountsPerAlignmentStart: %s removalInterval: %d", - getClass().getSimpleName(), readCountsPerAlignmentStart, removalInterval)); - } - - public void run() { - LocusIteratorByStateExperimental libs = makeLTBS(new ArrayList(), createTestReadProperties()); - LocusIteratorByStateExperimental.ReadStateManager readStateManager = - libs.new ReadStateManager(new ArrayList().iterator()); - LocusIteratorByStateExperimental.ReadStateManager.PerSampleReadStateManager perSampleReadStateManager = - readStateManager.new PerSampleReadStateManager(); - - makeReads(); - - for ( ArrayList stackRecordStates : recordStatesByAlignmentStart ) { - perSampleReadStateManager.addStatesAtNextAlignmentStart(stackRecordStates); - } - - // read state manager should have the right number of reads - Assert.assertEquals(reads.size(), perSampleReadStateManager.size()); - - Iterator originalReadsIterator = reads.iterator(); - Iterator recordStateIterator = perSampleReadStateManager.iterator(); - int recordStateCount = 0; - int numReadStatesRemoved = 0; - - // Do a first-pass validation of the record state iteration by making sure we get back everything we - // put in, in the same order, doing any requested removals of read states along the way - while ( recordStateIterator.hasNext() ) { - LocusIteratorByStateExperimental.SAMRecordState readState = recordStateIterator.next(); - recordStateCount++; - SAMRecord readFromPerSampleReadStateManager = readState.getRead(); - - Assert.assertTrue(originalReadsIterator.hasNext()); - SAMRecord originalRead = originalReadsIterator.next(); - - // The read we get back should be literally the same read in memory as we put in - Assert.assertTrue(originalRead == readFromPerSampleReadStateManager); - - // If requested, remove a read state every removalInterval states - if ( removalInterval > 0 && recordStateCount % removalInterval == 0 ) { - recordStateIterator.remove(); - numReadStatesRemoved++; - } - } - - Assert.assertFalse(originalReadsIterator.hasNext()); - - // If we removed any read states, do a second pass through the read states to make sure the right - // states were removed - if ( numReadStatesRemoved > 0 ) { - Assert.assertEquals(perSampleReadStateManager.size(), reads.size() - numReadStatesRemoved); - - originalReadsIterator = reads.iterator(); - recordStateIterator = perSampleReadStateManager.iterator(); - int readCount = 0; - int readStateCount = 0; - - // Match record states with the reads that should remain after removal - while ( recordStateIterator.hasNext() ) { - LocusIteratorByStateExperimental.SAMRecordState readState = recordStateIterator.next(); - readStateCount++; - SAMRecord readFromPerSampleReadStateManager = readState.getRead(); - - Assert.assertTrue(originalReadsIterator.hasNext()); - - SAMRecord originalRead = originalReadsIterator.next(); - readCount++; - - if ( readCount % removalInterval == 0 ) { - originalRead = originalReadsIterator.next(); // advance to next read, since the previous one should have been discarded - readCount++; - } - - // The read we get back should be literally the same read in memory as we put in (after accounting for removals) - Assert.assertTrue(originalRead == readFromPerSampleReadStateManager); - } - - Assert.assertEquals(readStateCount, reads.size() - numReadStatesRemoved); - } - - // Allow memory used by this test to be reclaimed - readCountsPerAlignmentStart = null; - reads = null; - recordStatesByAlignmentStart = null; - } - - private void makeReads() { - int alignmentStart = 1; - - for ( int readsThisStack : readCountsPerAlignmentStart ) { - ArrayList stackReads = new ArrayList(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(readsThisStack, header, "foo", 0, alignmentStart, MathUtils.randomIntegerInRange(50, 100))); - ArrayList stackRecordStates = new ArrayList(); - - for ( SAMRecord read : stackReads ) { - stackRecordStates.add(new LocusIteratorByStateExperimental.SAMRecordState(read)); - } - - reads.addAll(stackReads); - recordStatesByAlignmentStart.add(stackRecordStates); - } - } - } - - @DataProvider(name = "PerSampleReadStateManagerTestDataProvider") - public Object[][] createPerSampleReadStateManagerTests() { - for ( List thisTestReadStateCounts : Arrays.asList( Arrays.asList(1), - Arrays.asList(2), - Arrays.asList(10), - Arrays.asList(1, 1), - Arrays.asList(2, 2), - Arrays.asList(10, 10), - Arrays.asList(1, 10), - Arrays.asList(10, 1), - Arrays.asList(1, 1, 1), - Arrays.asList(2, 2, 2), - Arrays.asList(10, 10, 10), - Arrays.asList(1, 1, 1, 1, 1, 1), - Arrays.asList(10, 10, 10, 10, 10, 10), - Arrays.asList(1, 2, 10, 1, 2, 10) - ) ) { - - for ( int removalInterval : Arrays.asList(0, 2, 3) ) { - new PerSampleReadStateManagerTest(thisTestReadStateCounts, removalInterval); - } - } - - return PerSampleReadStateManagerTest.getTests(PerSampleReadStateManagerTest.class); - } - - @Test(dataProvider = "PerSampleReadStateManagerTestDataProvider") - public void runPerSampleReadStateManagerTest( PerSampleReadStateManagerTest test ) { - logger.warn("Running test: " + test); - - test.run(); - } - - /////////////////////////////////////// - // End Read State Manager Tests // - /////////////////////////////////////// - - - - /////////////////////////////////////// - // Helper methods / classes // - /////////////////////////////////////// - private static ReadProperties createTestReadProperties() { - return createTestReadProperties(null); - } - - private static ReadProperties createTestReadProperties( DownsamplingMethod downsamplingMethod ) { return new ReadProperties( Collections.emptyList(), new SAMFileHeader(), SAMFileHeader.SortOrder.coordinate, false, SAMFileReader.ValidationStringency.STRICT, - downsamplingMethod, + null, new ValidationExclusion(), Collections.emptyList(), Collections.emptyList(), @@ -513,136 +344,137 @@ public class LocusIteratorByStateExperimentalUnitTest extends BaseTest { (byte) -1 ); } +} - private static class FakeCloseableIterator implements CloseableIterator { - Iterator iterator; +class FakeCloseableIterator implements CloseableIterator { + Iterator iterator; - public FakeCloseableIterator(Iterator it) { - iterator = it; - } - - @Override - public void close() {} - - @Override - public boolean hasNext() { - return iterator.hasNext(); - } - - @Override - public T next() { - return iterator.next(); - } - - @Override - public void remove() { - throw new UnsupportedOperationException("Don't remove!"); - } + public FakeCloseableIterator(Iterator it) { + iterator = it; } - private static final class LIBS_position { + @Override + public void close() {} - SAMRecord read; + @Override + public boolean hasNext() { + return iterator.hasNext(); + } - final int numOperators; - int currentOperatorIndex = 0; - int currentPositionOnOperator = 0; - int currentReadOffset = 0; + @Override + public T next() { + return iterator.next(); + } - boolean isBeforeDeletionStart = false; - boolean isBeforeDeletedBase = false; - boolean isAfterDeletionEnd = false; - boolean isAfterDeletedBase = false; - boolean isBeforeInsertion = false; - boolean isAfterInsertion = false; - boolean isNextToSoftClip = false; - - boolean sawMop = false; - - public LIBS_position(final SAMRecord read) { - this.read = read; - numOperators = read.getCigar().numCigarElements(); - } - - public int getCurrentReadOffset() { - return Math.max(0, currentReadOffset - 1); - } - - /** - * Steps forward on the genome. Returns false when done reading the read, true otherwise. - */ - public boolean stepForwardOnGenome() { - if ( currentOperatorIndex == numOperators ) - return false; - - CigarElement curElement = read.getCigar().getCigarElement(currentOperatorIndex); - if ( currentPositionOnOperator >= curElement.getLength() ) { - if ( ++currentOperatorIndex == numOperators ) - return false; - - curElement = read.getCigar().getCigarElement(currentOperatorIndex); - currentPositionOnOperator = 0; - } - - switch ( curElement.getOperator() ) { - case I: // insertion w.r.t. the reference - if ( !sawMop ) - break; - case S: // soft clip - currentReadOffset += curElement.getLength(); - case H: // hard clip - case P: // padding - currentOperatorIndex++; - return stepForwardOnGenome(); - - case D: // deletion w.r.t. the reference - case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning) - currentPositionOnOperator++; - break; - - case M: - case EQ: - case X: - sawMop = true; - currentReadOffset++; - currentPositionOnOperator++; - break; - default: - throw new IllegalStateException("No support for cigar op: " + curElement.getOperator()); - } - - final boolean isFirstOp = currentOperatorIndex == 0; - final boolean isLastOp = currentOperatorIndex == numOperators - 1; - final boolean isFirstBaseOfOp = currentPositionOnOperator == 1; - final boolean isLastBaseOfOp = currentPositionOnOperator == curElement.getLength(); - - isBeforeDeletionStart = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isLastOp, isLastBaseOfOp); - isBeforeDeletedBase = isBeforeDeletionStart || (!isLastBaseOfOp && curElement.getOperator() == CigarOperator.D); - isAfterDeletionEnd = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isFirstOp, isFirstBaseOfOp); - isAfterDeletedBase = isAfterDeletionEnd || (!isFirstBaseOfOp && curElement.getOperator() == CigarOperator.D); - isBeforeInsertion = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isLastOp, isLastBaseOfOp) - || (!sawMop && curElement.getOperator() == CigarOperator.I); - isAfterInsertion = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isFirstOp, isFirstBaseOfOp); - isNextToSoftClip = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isLastOp, isLastBaseOfOp) - || isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isFirstOp, isFirstBaseOfOp); - - return true; - } - - private static boolean isBeforeOp(final Cigar cigar, - final int currentOperatorIndex, - final CigarOperator op, - final boolean isLastOp, - final boolean isLastBaseOfOp) { - return !isLastOp && isLastBaseOfOp && cigar.getCigarElement(currentOperatorIndex+1).getOperator() == op; - } - - private static boolean isAfterOp(final Cigar cigar, - final int currentOperatorIndex, - final CigarOperator op, - final boolean isFirstOp, - final boolean isFirstBaseOfOp) { - return !isFirstOp && isFirstBaseOfOp && cigar.getCigarElement(currentOperatorIndex-1).getOperator() == op; - } + @Override + public void remove() { + throw new UnsupportedOperationException("Don't remove!"); + } +} + + +final class LIBS_position { + + SAMRecord read; + + final int numOperators; + int currentOperatorIndex = 0; + int currentPositionOnOperator = 0; + int currentReadOffset = 0; + + boolean isBeforeDeletionStart = false; + boolean isBeforeDeletedBase = false; + boolean isAfterDeletionEnd = false; + boolean isAfterDeletedBase = false; + boolean isBeforeInsertion = false; + boolean isAfterInsertion = false; + boolean isNextToSoftClip = false; + + boolean sawMop = false; + + public LIBS_position(final SAMRecord read) { + this.read = read; + numOperators = read.getCigar().numCigarElements(); + } + + public int getCurrentReadOffset() { + return Math.max(0, currentReadOffset - 1); + } + + /** + * Steps forward on the genome. Returns false when done reading the read, true otherwise. + */ + public boolean stepForwardOnGenome() { + if ( currentOperatorIndex == numOperators ) + return false; + + CigarElement curElement = read.getCigar().getCigarElement(currentOperatorIndex); + if ( currentPositionOnOperator >= curElement.getLength() ) { + if ( ++currentOperatorIndex == numOperators ) + return false; + + curElement = read.getCigar().getCigarElement(currentOperatorIndex); + currentPositionOnOperator = 0; + } + + switch ( curElement.getOperator() ) { + case I: // insertion w.r.t. the reference + if ( !sawMop ) + break; + case S: // soft clip + currentReadOffset += curElement.getLength(); + case H: // hard clip + case P: // padding + currentOperatorIndex++; + return stepForwardOnGenome(); + + case D: // deletion w.r.t. the reference + case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning) + currentPositionOnOperator++; + break; + + case M: + case EQ: + case X: + sawMop = true; + currentReadOffset++; + currentPositionOnOperator++; + break; + default: + throw new IllegalStateException("No support for cigar op: " + curElement.getOperator()); + } + + final boolean isFirstOp = currentOperatorIndex == 0; + final boolean isLastOp = currentOperatorIndex == numOperators - 1; + final boolean isFirstBaseOfOp = currentPositionOnOperator == 1; + final boolean isLastBaseOfOp = currentPositionOnOperator == curElement.getLength(); + + isBeforeDeletionStart = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isLastOp, isLastBaseOfOp); + isBeforeDeletedBase = isBeforeDeletionStart || (!isLastBaseOfOp && curElement.getOperator() == CigarOperator.D); + isAfterDeletionEnd = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isFirstOp, isFirstBaseOfOp); + isAfterDeletedBase = isAfterDeletionEnd || (!isFirstBaseOfOp && curElement.getOperator() == CigarOperator.D); + isBeforeInsertion = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isLastOp, isLastBaseOfOp) + || (!sawMop && curElement.getOperator() == CigarOperator.I); + isAfterInsertion = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isFirstOp, isFirstBaseOfOp); + isNextToSoftClip = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isLastOp, isLastBaseOfOp) + || isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isFirstOp, isFirstBaseOfOp); + + return true; + } + + private static boolean isBeforeOp(final Cigar cigar, + final int currentOperatorIndex, + final CigarOperator op, + final boolean isLastOp, + final boolean isLastBaseOfOp) { + return !isLastOp && isLastBaseOfOp && cigar.getCigarElement(currentOperatorIndex+1).getOperator() == op; + } + + private static boolean isAfterOp(final Cigar cigar, + final int currentOperatorIndex, + final CigarOperator op, + final boolean isFirstOp, + final boolean isFirstBaseOfOp) { + return !isFirstOp && isFirstBaseOfOp && cigar.getCigarElement(currentOperatorIndex-1).getOperator() == op; } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java index 83913fa76..4e4126ad4 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java @@ -7,8 +7,10 @@ import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -19,13 +21,10 @@ import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.util.Arrays; -import java.util.Collections; -import java.util.Iterator; -import java.util.List; +import java.util.*; /** - * testing of the LocusIteratorByState + * testing of the new (non-legacy) version of LocusIteratorByState */ public class LocusIteratorByStateUnitTest extends BaseTest { private static SAMFileHeader header; @@ -329,14 +328,184 @@ public class LocusIteratorByStateUnitTest extends BaseTest { // End comprehensive LIBS/PileupElement tests // //////////////////////////////////////////////// + + /////////////////////////////////////// + // Read State Manager Tests // + /////////////////////////////////////// + + private class PerSampleReadStateManagerTest extends TestDataProvider { + private List readCountsPerAlignmentStart; + private List reads; + private List> recordStatesByAlignmentStart; + private int removalInterval; + + public PerSampleReadStateManagerTest( List readCountsPerAlignmentStart, int removalInterval ) { + super(PerSampleReadStateManagerTest.class); + + this.readCountsPerAlignmentStart = readCountsPerAlignmentStart; + this.removalInterval = removalInterval; + + reads = new ArrayList(); + recordStatesByAlignmentStart = new ArrayList>(); + + setName(String.format("%s: readCountsPerAlignmentStart: %s removalInterval: %d", + getClass().getSimpleName(), readCountsPerAlignmentStart, removalInterval)); + } + + public void run() { + LocusIteratorByState libs = makeLTBS(new ArrayList(), createTestReadProperties()); + LocusIteratorByState.ReadStateManager readStateManager = + libs.new ReadStateManager(new ArrayList().iterator()); + LocusIteratorByState.ReadStateManager.PerSampleReadStateManager perSampleReadStateManager = + readStateManager.new PerSampleReadStateManager(); + + makeReads(); + + for ( ArrayList stackRecordStates : recordStatesByAlignmentStart ) { + perSampleReadStateManager.addStatesAtNextAlignmentStart(stackRecordStates); + } + + // read state manager should have the right number of reads + Assert.assertEquals(reads.size(), perSampleReadStateManager.size()); + + Iterator originalReadsIterator = reads.iterator(); + Iterator recordStateIterator = perSampleReadStateManager.iterator(); + int recordStateCount = 0; + int numReadStatesRemoved = 0; + + // Do a first-pass validation of the record state iteration by making sure we get back everything we + // put in, in the same order, doing any requested removals of read states along the way + while ( recordStateIterator.hasNext() ) { + LocusIteratorByState.SAMRecordState readState = recordStateIterator.next(); + recordStateCount++; + SAMRecord readFromPerSampleReadStateManager = readState.getRead(); + + Assert.assertTrue(originalReadsIterator.hasNext()); + SAMRecord originalRead = originalReadsIterator.next(); + + // The read we get back should be literally the same read in memory as we put in + Assert.assertTrue(originalRead == readFromPerSampleReadStateManager); + + // If requested, remove a read state every removalInterval states + if ( removalInterval > 0 && recordStateCount % removalInterval == 0 ) { + recordStateIterator.remove(); + numReadStatesRemoved++; + } + } + + Assert.assertFalse(originalReadsIterator.hasNext()); + + // If we removed any read states, do a second pass through the read states to make sure the right + // states were removed + if ( numReadStatesRemoved > 0 ) { + Assert.assertEquals(perSampleReadStateManager.size(), reads.size() - numReadStatesRemoved); + + originalReadsIterator = reads.iterator(); + recordStateIterator = perSampleReadStateManager.iterator(); + int readCount = 0; + int readStateCount = 0; + + // Match record states with the reads that should remain after removal + while ( recordStateIterator.hasNext() ) { + LocusIteratorByState.SAMRecordState readState = recordStateIterator.next(); + readStateCount++; + SAMRecord readFromPerSampleReadStateManager = readState.getRead(); + + Assert.assertTrue(originalReadsIterator.hasNext()); + + SAMRecord originalRead = originalReadsIterator.next(); + readCount++; + + if ( readCount % removalInterval == 0 ) { + originalRead = originalReadsIterator.next(); // advance to next read, since the previous one should have been discarded + readCount++; + } + + // The read we get back should be literally the same read in memory as we put in (after accounting for removals) + Assert.assertTrue(originalRead == readFromPerSampleReadStateManager); + } + + Assert.assertEquals(readStateCount, reads.size() - numReadStatesRemoved); + } + + // Allow memory used by this test to be reclaimed + readCountsPerAlignmentStart = null; + reads = null; + recordStatesByAlignmentStart = null; + } + + private void makeReads() { + int alignmentStart = 1; + + for ( int readsThisStack : readCountsPerAlignmentStart ) { + ArrayList stackReads = new ArrayList(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(readsThisStack, header, "foo", 0, alignmentStart, MathUtils.randomIntegerInRange(50, 100))); + ArrayList stackRecordStates = new ArrayList(); + + for ( SAMRecord read : stackReads ) { + stackRecordStates.add(new LocusIteratorByState.SAMRecordState(read)); + } + + reads.addAll(stackReads); + recordStatesByAlignmentStart.add(stackRecordStates); + } + } + } + + @DataProvider(name = "PerSampleReadStateManagerTestDataProvider") + public Object[][] createPerSampleReadStateManagerTests() { + for ( List thisTestReadStateCounts : Arrays.asList( Arrays.asList(1), + Arrays.asList(2), + Arrays.asList(10), + Arrays.asList(1, 1), + Arrays.asList(2, 2), + Arrays.asList(10, 10), + Arrays.asList(1, 10), + Arrays.asList(10, 1), + Arrays.asList(1, 1, 1), + Arrays.asList(2, 2, 2), + Arrays.asList(10, 10, 10), + Arrays.asList(1, 1, 1, 1, 1, 1), + Arrays.asList(10, 10, 10, 10, 10, 10), + Arrays.asList(1, 2, 10, 1, 2, 10) + ) ) { + + for ( int removalInterval : Arrays.asList(0, 2, 3) ) { + new PerSampleReadStateManagerTest(thisTestReadStateCounts, removalInterval); + } + } + + return PerSampleReadStateManagerTest.getTests(PerSampleReadStateManagerTest.class); + } + + @Test(dataProvider = "PerSampleReadStateManagerTestDataProvider") + public void runPerSampleReadStateManagerTest( PerSampleReadStateManagerTest test ) { + logger.warn("Running test: " + test); + + test.run(); + } + + /////////////////////////////////////// + // End Read State Manager Tests // + /////////////////////////////////////// + + + + /////////////////////////////////////// + // Helper methods / classes // + /////////////////////////////////////// + private static ReadProperties createTestReadProperties() { + return createTestReadProperties(null); + } + + private static ReadProperties createTestReadProperties( DownsamplingMethod downsamplingMethod ) { return new ReadProperties( Collections.emptyList(), new SAMFileHeader(), SAMFileHeader.SortOrder.coordinate, false, SAMFileReader.ValidationStringency.STRICT, - null, + downsamplingMethod, new ValidationExclusion(), Collections.emptyList(), Collections.emptyList(), @@ -344,137 +513,136 @@ public class LocusIteratorByStateUnitTest extends BaseTest { (byte) -1 ); } -} -class FakeCloseableIterator implements CloseableIterator { - Iterator iterator; + private static class FakeCloseableIterator implements CloseableIterator { + Iterator iterator; - public FakeCloseableIterator(Iterator it) { - iterator = it; + public FakeCloseableIterator(Iterator it) { + iterator = it; + } + + @Override + public void close() {} + + @Override + public boolean hasNext() { + return iterator.hasNext(); + } + + @Override + public T next() { + return iterator.next(); + } + + @Override + public void remove() { + throw new UnsupportedOperationException("Don't remove!"); + } } - @Override - public void close() {} + private static final class LIBS_position { - @Override - public boolean hasNext() { - return iterator.hasNext(); - } + SAMRecord read; - @Override - public T next() { - return iterator.next(); - } + final int numOperators; + int currentOperatorIndex = 0; + int currentPositionOnOperator = 0; + int currentReadOffset = 0; - @Override - public void remove() { - throw new UnsupportedOperationException("Don't remove!"); - } -} + boolean isBeforeDeletionStart = false; + boolean isBeforeDeletedBase = false; + boolean isAfterDeletionEnd = false; + boolean isAfterDeletedBase = false; + boolean isBeforeInsertion = false; + boolean isAfterInsertion = false; + boolean isNextToSoftClip = false; + boolean sawMop = false; -final class LIBS_position { + public LIBS_position(final SAMRecord read) { + this.read = read; + numOperators = read.getCigar().numCigarElements(); + } - SAMRecord read; + public int getCurrentReadOffset() { + return Math.max(0, currentReadOffset - 1); + } - final int numOperators; - int currentOperatorIndex = 0; - int currentPositionOnOperator = 0; - int currentReadOffset = 0; - - boolean isBeforeDeletionStart = false; - boolean isBeforeDeletedBase = false; - boolean isAfterDeletionEnd = false; - boolean isAfterDeletedBase = false; - boolean isBeforeInsertion = false; - boolean isAfterInsertion = false; - boolean isNextToSoftClip = false; - - boolean sawMop = false; - - public LIBS_position(final SAMRecord read) { - this.read = read; - numOperators = read.getCigar().numCigarElements(); - } - - public int getCurrentReadOffset() { - return Math.max(0, currentReadOffset - 1); - } - - /** - * Steps forward on the genome. Returns false when done reading the read, true otherwise. - */ - public boolean stepForwardOnGenome() { - if ( currentOperatorIndex == numOperators ) - return false; - - CigarElement curElement = read.getCigar().getCigarElement(currentOperatorIndex); - if ( currentPositionOnOperator >= curElement.getLength() ) { - if ( ++currentOperatorIndex == numOperators ) + /** + * Steps forward on the genome. Returns false when done reading the read, true otherwise. + */ + public boolean stepForwardOnGenome() { + if ( currentOperatorIndex == numOperators ) return false; - curElement = read.getCigar().getCigarElement(currentOperatorIndex); - currentPositionOnOperator = 0; - } + CigarElement curElement = read.getCigar().getCigarElement(currentOperatorIndex); + if ( currentPositionOnOperator >= curElement.getLength() ) { + if ( ++currentOperatorIndex == numOperators ) + return false; - switch ( curElement.getOperator() ) { - case I: // insertion w.r.t. the reference - if ( !sawMop ) + curElement = read.getCigar().getCigarElement(currentOperatorIndex); + currentPositionOnOperator = 0; + } + + switch ( curElement.getOperator() ) { + case I: // insertion w.r.t. the reference + if ( !sawMop ) + break; + case S: // soft clip + currentReadOffset += curElement.getLength(); + case H: // hard clip + case P: // padding + currentOperatorIndex++; + return stepForwardOnGenome(); + + case D: // deletion w.r.t. the reference + case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning) + currentPositionOnOperator++; break; - case S: // soft clip - currentReadOffset += curElement.getLength(); - case H: // hard clip - case P: // padding - currentOperatorIndex++; - return stepForwardOnGenome(); - case D: // deletion w.r.t. the reference - case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning) - currentPositionOnOperator++; - break; + case M: + case EQ: + case X: + sawMop = true; + currentReadOffset++; + currentPositionOnOperator++; + break; + default: + throw new IllegalStateException("No support for cigar op: " + curElement.getOperator()); + } - case M: - case EQ: - case X: - sawMop = true; - currentReadOffset++; - currentPositionOnOperator++; - break; - default: - throw new IllegalStateException("No support for cigar op: " + curElement.getOperator()); + final boolean isFirstOp = currentOperatorIndex == 0; + final boolean isLastOp = currentOperatorIndex == numOperators - 1; + final boolean isFirstBaseOfOp = currentPositionOnOperator == 1; + final boolean isLastBaseOfOp = currentPositionOnOperator == curElement.getLength(); + + isBeforeDeletionStart = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isLastOp, isLastBaseOfOp); + isBeforeDeletedBase = isBeforeDeletionStart || (!isLastBaseOfOp && curElement.getOperator() == CigarOperator.D); + isAfterDeletionEnd = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isFirstOp, isFirstBaseOfOp); + isAfterDeletedBase = isAfterDeletionEnd || (!isFirstBaseOfOp && curElement.getOperator() == CigarOperator.D); + isBeforeInsertion = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isLastOp, isLastBaseOfOp) + || (!sawMop && curElement.getOperator() == CigarOperator.I); + isAfterInsertion = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isFirstOp, isFirstBaseOfOp); + isNextToSoftClip = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isLastOp, isLastBaseOfOp) + || isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isFirstOp, isFirstBaseOfOp); + + return true; } - final boolean isFirstOp = currentOperatorIndex == 0; - final boolean isLastOp = currentOperatorIndex == numOperators - 1; - final boolean isFirstBaseOfOp = currentPositionOnOperator == 1; - final boolean isLastBaseOfOp = currentPositionOnOperator == curElement.getLength(); + private static boolean isBeforeOp(final Cigar cigar, + final int currentOperatorIndex, + final CigarOperator op, + final boolean isLastOp, + final boolean isLastBaseOfOp) { + return !isLastOp && isLastBaseOfOp && cigar.getCigarElement(currentOperatorIndex+1).getOperator() == op; + } - isBeforeDeletionStart = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isLastOp, isLastBaseOfOp); - isBeforeDeletedBase = isBeforeDeletionStart || (!isLastBaseOfOp && curElement.getOperator() == CigarOperator.D); - isAfterDeletionEnd = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isFirstOp, isFirstBaseOfOp); - isAfterDeletedBase = isAfterDeletionEnd || (!isFirstBaseOfOp && curElement.getOperator() == CigarOperator.D); - isBeforeInsertion = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isLastOp, isLastBaseOfOp) - || (!sawMop && curElement.getOperator() == CigarOperator.I); - isAfterInsertion = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isFirstOp, isFirstBaseOfOp); - isNextToSoftClip = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isLastOp, isLastBaseOfOp) - || isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isFirstOp, isFirstBaseOfOp); - - return true; - } - - private static boolean isBeforeOp(final Cigar cigar, - final int currentOperatorIndex, - final CigarOperator op, - final boolean isLastOp, - final boolean isLastBaseOfOp) { - return !isLastOp && isLastBaseOfOp && cigar.getCigarElement(currentOperatorIndex+1).getOperator() == op; - } - - private static boolean isAfterOp(final Cigar cigar, - final int currentOperatorIndex, - final CigarOperator op, - final boolean isFirstOp, - final boolean isFirstBaseOfOp) { - return !isFirstOp && isFirstBaseOfOp && cigar.getCigarElement(currentOperatorIndex-1).getOperator() == op; + private static boolean isAfterOp(final Cigar cigar, + final int currentOperatorIndex, + final CigarOperator op, + final boolean isFirstOp, + final boolean isFirstBaseOfOp) { + return !isFirstOp && isFirstBaseOfOp && cigar.getCigarElement(currentOperatorIndex-1).getOperator() == op; + } } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java index bf1fc9e65..d9af2ea7e 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java @@ -6,7 +6,7 @@ import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider; -import org.broadinstitute.sting.gatk.datasources.reads.ReadShardBalancer; +import org.broadinstitute.sting.gatk.datasources.reads.LegacyReadShardBalancer; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; import org.broadinstitute.sting.gatk.datasources.reads.Shard; @@ -114,7 +114,7 @@ public class TraverseReadsUnitTest extends BaseTest { @Test public void testUnmappedReadCount() { SAMDataSource dataSource = new SAMDataSource(bamList,new ThreadAllocation(),null,genomeLocParser); - Iterable shardStrategy = dataSource.createShardIteratorOverAllReads(new ReadShardBalancer()); + Iterable shardStrategy = dataSource.createShardIteratorOverAllReads(new LegacyReadShardBalancer()); countReadWalker.initialize(); Object accumulator = countReadWalker.reduceInit(); diff --git a/public/java/test/org/broadinstitute/sting/utils/LegacyReservoirDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/LegacyReservoirDownsamplerUnitTest.java index 5b052454a..47096c73a 100644 --- a/public/java/test/org/broadinstitute/sting/utils/LegacyReservoirDownsamplerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/LegacyReservoirDownsamplerUnitTest.java @@ -23,14 +23,14 @@ public class LegacyReservoirDownsamplerUnitTest { @Test public void testEmptyIterator() { - ReservoirDownsampler downsampler = new ReservoirDownsampler(1); + LegacyReservoirDownsampler downsampler = new LegacyReservoirDownsampler(1); Assert.assertTrue(downsampler.isEmpty(),"Downsampler is not empty but should be."); } @Test public void testOneElementWithPoolSizeOne() { List reads = Collections.singletonList(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); - ReservoirDownsampler downsampler = new ReservoirDownsampler(1); + LegacyReservoirDownsampler downsampler = new LegacyReservoirDownsampler(1); downsampler.addAll(reads); Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be"); @@ -42,7 +42,7 @@ public class LegacyReservoirDownsamplerUnitTest { @Test public void testOneElementWithPoolSizeGreaterThanOne() { List reads = Collections.singletonList(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); - ReservoirDownsampler downsampler = new ReservoirDownsampler(5); + LegacyReservoirDownsampler downsampler = new LegacyReservoirDownsampler(5); downsampler.addAll(reads); Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be"); @@ -58,7 +58,7 @@ public class LegacyReservoirDownsamplerUnitTest { reads.add(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); reads.add(ArtificialSAMUtils.createArtificialRead(header,"read2",0,1,76)); reads.add(ArtificialSAMUtils.createArtificialRead(header,"read3",0,1,76)); - ReservoirDownsampler downsampler = new ReservoirDownsampler(5); + LegacyReservoirDownsampler downsampler = new LegacyReservoirDownsampler(5); downsampler.addAll(reads); Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be"); @@ -78,7 +78,7 @@ public class LegacyReservoirDownsamplerUnitTest { reads.add(ArtificialSAMUtils.createArtificialRead(header,"read3",0,1,76)); reads.add(ArtificialSAMUtils.createArtificialRead(header,"read4",0,1,76)); reads.add(ArtificialSAMUtils.createArtificialRead(header,"read5",0,1,76)); - ReservoirDownsampler downsampler = new ReservoirDownsampler(5); + LegacyReservoirDownsampler downsampler = new LegacyReservoirDownsampler(5); downsampler.addAll(reads); Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be"); @@ -99,7 +99,7 @@ public class LegacyReservoirDownsamplerUnitTest { reads.add(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); reads.add(ArtificialSAMUtils.createArtificialRead(header,"read2",0,1,76)); reads.add(ArtificialSAMUtils.createArtificialRead(header,"read3",0,1,76)); - ReservoirDownsampler downsampler = new ReservoirDownsampler(0); + LegacyReservoirDownsampler downsampler = new LegacyReservoirDownsampler(0); downsampler.addAll(reads); Assert.assertTrue(downsampler.isEmpty(),"Downsampler isn't empty but should be"); @@ -115,7 +115,7 @@ public class LegacyReservoirDownsamplerUnitTest { reads.add(ArtificialSAMUtils.createArtificialRead(header,"read3",0,1,76)); reads.add(ArtificialSAMUtils.createArtificialRead(header,"read4",0,1,76)); reads.add(ArtificialSAMUtils.createArtificialRead(header,"read5",0,1,76)); - ReservoirDownsampler downsampler = new ReservoirDownsampler(1); + LegacyReservoirDownsampler downsampler = new LegacyReservoirDownsampler(1); downsampler.addAll(reads); Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be"); @@ -128,7 +128,7 @@ public class LegacyReservoirDownsamplerUnitTest { public void testFillingAcrossLoci() { List reads = new ArrayList(); reads.add(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); - ReservoirDownsampler downsampler = new ReservoirDownsampler(5); + LegacyReservoirDownsampler downsampler = new LegacyReservoirDownsampler(5); downsampler.addAll(reads); Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be");